In [16]:
from basepair.datasets import *
In [6]:
cat  /users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/dataspec.yaml
task_specs:
  Oct4:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.neg.bw
  Sox2:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair/data/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak
fasta_file: /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta
path: null
In [7]:
from basepair.cli.schemas import DataSpec
In [17]:
ds = DataSpec.load("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/dataspec.yaml")
In [64]:
ds.task_specs['Oct4'].peaks
In [65]:
train, valid, test = chip_exo_nexus(ds, 201, shuffle=False)
100%|██████████| 9396/9396 [00:00<00:00, 694027.90it/s]
2018-06-25 22:06:29,622 [INFO] extract sequence
2018-06-25 22:06:30,038 [INFO] extract counts
100%|██████████| 2/2 [00:18<00:00,  9.40s/it]
In [20]:
otrain, ovalid, otest = sox2_oct4_peaks_sox2()
In [46]:
np.mean(test[1]['profile/Oct4'] == otest[1]['oct4'])
Out[46]:
1.0
In [47]:
np.mean(test[1]['profile/Sox2'] == otest[1]['sox2'])
Out[47]:
1.0
In [48]:
from basepair.cli.evaluate import eval_profile
In [42]:
test[1]['profile/Oct4'].sum()
Out[42]:
542282.0
In [43]:
test[1]['profile/Sox2'].sum()
Out[43]:
268906.0
In [50]:
from basepair.config import get_data_dir, create_tf_session
from keras.models import load_model
from basepair.datasets import *
from basepair.preproc import transform_data
from basepair.plots import regression_eval
In [1]:
a=1
In [51]:
create_tf_session(1)
Out[51]:
<tensorflow.python.client.session.Session at 0x7f5822333ac8>
In [52]:
model = load_model("/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/exp/models/p-multi-task/seq_mutlitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,use_profile=True,use_counts=True,c_task_weight=10,lr=0.004.5.h5")
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-06-25 21:37:39,455 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-06-25 21:37:44,062 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [123]:
preds = model.predict(valid[0])
In [124]:
from basepair.math import softmax
In [125]:
eval_profile(valid[1]['profile/Sox2'], softmax(preds[0]), binsizes=[1, 2, 5, 10])
Out[125]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.418850 1 0.211562 0.007209 2374 0.009344
1 0.520032 2 0.349331 0.016715 2260 0.022433
2 0.680696 5 0.588823 0.061181 2091 0.088653
3 0.825783 10 0.743816 0.186907 1990 0.277635
In [126]:
eval_profile(valid[1]['profile/Oct4'], softmax(preds[1]), binsizes=[1, 2, 5, 10])
Out[126]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.197214 1 0.196674 0.002904 1309 0.002937
1 0.270291 2 0.325616 0.006772 1275 0.007018
2 0.392995 5 0.560163 0.025020 1229 0.027106
3 0.557236 10 0.730999 0.078357 1177 0.089163
In [60]:
from kipoi.utils import read_pickle
In [66]:
ntrain, nvalid, ntest = read_pickle("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/data.pkl")
In [67]:
preds = model.predict(nvalid[0])
In [77]:
pd.read_table("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/eval/profile_metrics.tsv").query('binsize==1 & split=="valid"')
Out[77]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task split
4 0.065714 1 0.196674 0.002904 1309 0.003295 Oct4 valid
6 0.159771 1 0.211562 0.007209 2374 0.008267 Sox2 valid
In [78]:
from basepair.cli.evaluate import load_data
In [80]:
ltrain, lvalid, ltest = load_data("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/")
In [89]:
np.mean(ntest[1]['profile/Oct4'] == ltest[1]['profile/Oct4'])
Out[89]:
1.0
In [91]:
np.mean(ntest[1]['profile/Sox2'] == ltest[1]['profile/Sox2'])
Out[91]:
1.0
In [92]:
np.mean(ntest[0] == ltest[0])
Out[92]:
1.0
In [100]:
print(ds.get_config_as_yaml())
task_specs:
  Oct4:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.neg.bw
  Sox2:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair/data/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak
fasta_file: /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta

In [112]:
y_pred = model.predict(valid[0])
y_true = valid[1]
task='Sox2'
# Counts
In [108]:
from basepair.cli.schemas import HParams
In [109]:
hp = HParams.load("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/hparams.yaml")
In [121]:
task
Out[121]:
'Sox2'
In [120]:
ds.task2idx(task, "profile")
Out[120]:
1
In [119]:
# Profile
yp = softmax(y_pred[ds.task2idx(task, "profile")])
eval_profile(y_true["profile/Sox2"], yp)
Out[119]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.159771 1 0.211562 0.007209 2374 0.008325
1 0.260119 2 0.349331 0.016715 2260 0.019524
2 0.399817 4 0.526987 0.043605 2143 0.053307
3 0.678802 10 0.743816 0.186907 1990 0.237694

Performance notes

  • adding 20k oct4 peaks
    • Oct4:
In [93]:
cat /users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/hparams.yaml
model:
  name: seq_multitask
  kwargs:
    filters: 21
    conv1_kernel_size: 21
    tconv_kernel_size: 25
    n_dil_layers: 6
    lr: 0.004
    c_task_weight: 100
data:
  peak_width: 201
  shuffle: true
  valid_chr:
  - chr2
  - chr3
  - chr4
  test_chr:
  - chr1
  - chr8
  - chr9
train:
  early_stop_patience: 5
  epochs: 200
  batch_size: 256
  balance_peak_classes: true
evaluate:
  pos_min_threshold: 0.05
  neg_max_threshold: 0.01
  required_min_pos_counts: 2.5
  binsizes:
  - 1
  - 10
modisco:
  sliding_window_size: 21
  flank_size: 10
  histogram_bins: 100
  percentiles_in_bandwidth: 10.0
  overlap_portion: 0.5
  min_cluster_size: 50
  threshold_for_counting_sign: 1.0
  weak_threshold_for_counting_sign: 0.7
  max_strand_distance: 0.2
path: hparams.yaml
In [86]:
test[2].head()
Out[86]:
chr end id start task
2 chr1 57780311 2 57780110 Sox2
5 chr8 67966723 5 67966522 Sox2
7 chr1 9955454 7 9955253 Sox2
11 chr9 113438719 11 113438518 Sox2
12 chr8 111452152 12 111451951 Sox2
In [87]:
ltest[2].head()
Out[87]:
chr end id start task
3 chr9 4283494 3 4283293 Sox2
9 chr8 80804382 9 80804181 Sox2
20 chr1 66961154 20 66960953 Sox2
30 chr8 108723188 30 108722987 Sox2
35 chr8 106289458 35 106289257 Sox2
In [68]:
eval_profile(nvalid[1]['profile/Sox2'], softmax(preds[0]), binsizes=[1, 2, 5, 10])
Out[68]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.418850 1 0.211562 0.007209 2374 0.008921
1 0.520032 2 0.349331 0.016715 2260 0.021437
2 0.680696 5 0.588823 0.061181 2091 0.086019
3 0.825783 10 0.743816 0.186907 1990 0.274584
In [69]:
eval_profile(nvalid[1]['profile/Oct4'], softmax(preds[1]), binsizes=[1, 2, 5, 10])
Out[69]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.197214 1 0.196674 0.002904 1309 0.003162
1 0.270291 2 0.325616 0.006772 1275 0.007423
2 0.392995 5 0.560163 0.025020 1229 0.028917
3 0.557236 10 0.730999 0.078357 1177 0.091288
In [ ]:
valid[]
In [41]:
otest[1]
Out[41]:
{'sox2': array([[[0., 0.],
         [0., 1.],
         [0., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]],
 
        [[0., 0.],
         [0., 0.],
         [0., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 1.]],
 
        [[0., 0.],
         [0., 0.],
         [1., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]],
 
        ...,
 
        [[1., 1.],
         [1., 1.],
         [1., 1.],
         ...,
         [0., 3.],
         [1., 5.],
         [1., 5.]],
 
        [[2., 2.],
         [1., 0.],
         [1., 0.],
         ...,
         [0., 6.],
         [0., 4.],
         [0., 2.]],
 
        [[0., 0.],
         [4., 0.],
         [3., 2.],
         ...,
         [3., 3.],
         [1., 2.],
         [0., 2.]]], dtype=float32), 'oct4': array([[[0., 0.],
         [3., 0.],
         [1., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]],
 
        [[0., 0.],
         [0., 0.],
         [0., 0.],
         ...,
         [1., 0.],
         [0., 0.],
         [0., 0.]],
 
        [[0., 0.],
         [0., 0.],
         [0., 1.],
         ...,
         [1., 0.],
         [1., 0.],
         [0., 0.]],
 
        ...,
 
        [[2., 3.],
         [1., 4.],
         [2., 4.],
         ...,
         [0., 2.],
         [0., 2.],
         [1., 1.]],
 
        [[0., 1.],
         [0., 2.],
         [3., 2.],
         ...,
         [1., 4.],
         [1., 7.],
         [1., 4.]],
 
        [[0., 0.],
         [2., 0.],
         [1., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]]], dtype=float32)}
In [28]:
test[2].head()
Out[28]:
chr end id start task
2 chr1 57780311 2 57780110 Sox2
5 chr8 67966723 5 67966522 Sox2
7 chr1 9955454 7 9955253 Sox2
11 chr9 113438719 11 113438518 Sox2
12 chr8 111452152 12 111451951 Sox2
In [29]:
otest[2].head()
Out[29]:
chr end oct4_neg oct4_pos sox2_neg sox2_pos start seq seq_id
2 chr1 57780311 [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 3.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ... [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 57780110 GAGAAAAATCATCTGGAATCCAGCTGAGAGTGAAAGGCGAGGCAAA... chr1:57780110-57780311
5 chr8 67966723 [0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ... [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 67966522 TAAACTCTCTTTCCCATATAGCCTCTCTCAGTTGCCCTTGCAATTT... chr8:67966522-67966723
7 chr1 9955454 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ... 9955253 ATAGAACCCATCCTTAGGGGAGTATCATGTGTACTTCATAGCCTGC... chr1:9955253-9955454
11 chr9 113438719 [0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... 113438518 CAGGTCTTACTTTCTGTCTCTGTAAGCTAACATAGGCCAATTGAGA... chr9:113438518-113438719
12 chr8 111452152 [0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 1.0, ... [0.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, ... [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ... [0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ... 111451951 TACTGTACAGAATTTAGCATCACAATACATTAGCAACAGATCAAAC... chr8:111451951-111452152