from basepair.datasets import *

cat  /users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/dataspec.yaml

task_specs:
  Oct4:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.neg.bw
  Sox2:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair/data/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak
fasta_file: /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta
path: null

from basepair.cli.schemas import DataSpec

ds = DataSpec.load("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/dataspec.yaml")

ds.task_specs['Oct4'].peaks

train, valid, test = chip_exo_nexus(ds, 201, shuffle=False)

100%|██████████| 9396/9396 [00:00<00:00, 694027.90it/s]
2018-06-25 22:06:29,622 [INFO] extract sequence
2018-06-25 22:06:30,038 [INFO] extract counts
100%|██████████| 2/2 [00:18<00:00,  9.40s/it]

otrain, ovalid, otest = sox2_oct4_peaks_sox2()

np.mean(test[1]['profile/Oct4'] == otest[1]['oct4'])

1.0

np.mean(test[1]['profile/Sox2'] == otest[1]['sox2'])

1.0

from basepair.cli.evaluate import eval_profile

test[1]['profile/Oct4'].sum()

542282.0

test[1]['profile/Sox2'].sum()

268906.0

from basepair.config import get_data_dir, create_tf_session
from keras.models import load_model
from basepair.datasets import *
from basepair.preproc import transform_data
from basepair.plots import regression_eval

a=1

create_tf_session(1)

<tensorflow.python.client.session.Session at 0x7f5822333ac8>

model = load_model("/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/exp/models/p-multi-task/seq_mutlitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,use_profile=True,use_counts=True,c_task_weight=10,lr=0.004.5.h5")

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

2018-06-25 21:37:39,455 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

2018-06-25 21:37:44,062 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

preds = model.predict(valid[0])

from basepair.math import softmax

eval_profile(valid[1]['profile/Sox2'], softmax(preds[0]), binsizes=[1, 2, 5, 10])

eval_profile(valid[1]['profile/Oct4'], softmax(preds[1]), binsizes=[1, 2, 5, 10])

from kipoi.utils import read_pickle

ntrain, nvalid, ntest = read_pickle("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/data.pkl")

preds = model.predict(nvalid[0])

pd.read_table("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/eval/profile_metrics.tsv").query('binsize==1 & split=="valid"')

from basepair.cli.evaluate import load_data

ltrain, lvalid, ltest = load_data("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/")

np.mean(ntest[1]['profile/Oct4'] == ltest[1]['profile/Oct4'])

1.0

np.mean(ntest[1]['profile/Sox2'] == ltest[1]['profile/Sox2'])

1.0

np.mean(ntest[0] == ltest[0])

1.0

print(ds.get_config_as_yaml())

task_specs:
  Oct4:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.neg.bw
  Sox2:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair/data/raw/chipnexus/scratch_jzeitlinger_collab/meme_chip_analysis/Sox2_123b_1_ppr.IDR0.05.filt.summit_centered_200bp.narrowPeak
fasta_file: /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta

y_pred = model.predict(valid[0])
y_true = valid[1]
task='Sox2'
# Counts

from basepair.cli.schemas import HParams

hp = HParams.load("/users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/hparams.yaml")

task

'Sox2'

ds.task2idx(task, "profile")

1

# Profile
yp = softmax(y_pred[ds.task2idx(task, "profile")])
eval_profile(y_true["profile/Sox2"], yp)

Performance notes¶

adding 20k oct4 peaks
- Oct4:

cat /users/avsec/workspace/basepair-workflow/models/old_sox2_peaks/hparams.yaml

model:
  name: seq_multitask
  kwargs:
    filters: 21
    conv1_kernel_size: 21
    tconv_kernel_size: 25
    n_dil_layers: 6
    lr: 0.004
    c_task_weight: 100
data:
  peak_width: 201
  shuffle: true
  valid_chr:
  - chr2
  - chr3
  - chr4
  test_chr:
  - chr1
  - chr8
  - chr9
train:
  early_stop_patience: 5
  epochs: 200
  batch_size: 256
  balance_peak_classes: true
evaluate:
  pos_min_threshold: 0.05
  neg_max_threshold: 0.01
  required_min_pos_counts: 2.5
  binsizes:
  - 1
  - 10
modisco:
  sliding_window_size: 21
  flank_size: 10
  histogram_bins: 100
  percentiles_in_bandwidth: 10.0
  overlap_portion: 0.5
  min_cluster_size: 50
  threshold_for_counting_sign: 1.0
  weak_threshold_for_counting_sign: 0.7
  max_strand_distance: 0.2
path: hparams.yaml

test[2].head()

ltest[2].head()

eval_profile(nvalid[1]['profile/Sox2'], softmax(preds[0]), binsizes=[1, 2, 5, 10])

eval_profile(nvalid[1]['profile/Oct4'], softmax(preds[1]), binsizes=[1, 2, 5, 10])

valid[]

otest[1]

{'sox2': array([[[0., 0.],
         [0., 1.],
         [0., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]],
 
        [[0., 0.],
         [0., 0.],
         [0., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 1.]],
 
        [[0., 0.],
         [0., 0.],
         [1., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]],
 
        ...,
 
        [[1., 1.],
         [1., 1.],
         [1., 1.],
         ...,
         [0., 3.],
         [1., 5.],
         [1., 5.]],
 
        [[2., 2.],
         [1., 0.],
         [1., 0.],
         ...,
         [0., 6.],
         [0., 4.],
         [0., 2.]],
 
        [[0., 0.],
         [4., 0.],
         [3., 2.],
         ...,
         [3., 3.],
         [1., 2.],
         [0., 2.]]], dtype=float32), 'oct4': array([[[0., 0.],
         [3., 0.],
         [1., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]],
 
        [[0., 0.],
         [0., 0.],
         [0., 0.],
         ...,
         [1., 0.],
         [0., 0.],
         [0., 0.]],
 
        [[0., 0.],
         [0., 0.],
         [0., 1.],
         ...,
         [1., 0.],
         [1., 0.],
         [0., 0.]],
 
        ...,
 
        [[2., 3.],
         [1., 4.],
         [2., 4.],
         ...,
         [0., 2.],
         [0., 2.],
         [1., 1.]],
 
        [[0., 1.],
         [0., 2.],
         [3., 2.],
         ...,
         [1., 4.],
         [1., 7.],
         [1., 4.]],
 
        [[0., 0.],
         [2., 0.],
         [1., 0.],
         ...,
         [0., 0.],
         [0., 0.],
         [0., 0.]]], dtype=float32)}

test[2].head()

otest[2].head()

	auprc	binsize	frac_ambigous	imbalance	n_positives	random_auprc
0	0.418850	1	0.211562	0.007209	2374	0.009344
1	0.520032	2	0.349331	0.016715	2260	0.022433
2	0.680696	5	0.588823	0.061181	2091	0.088653
3	0.825783	10	0.743816	0.186907	1990	0.277635

	auprc	binsize	frac_ambigous	imbalance	n_positives	random_auprc
0	0.197214	1	0.196674	0.002904	1309	0.002937
1	0.270291	2	0.325616	0.006772	1275	0.007018
2	0.392995	5	0.560163	0.025020	1229	0.027106
3	0.557236	10	0.730999	0.078357	1177	0.089163

	auprc	binsize	frac_ambigous	imbalance	n_positives	random_auprc	task	split
4	0.065714	1	0.196674	0.002904	1309	0.003295	Oct4	valid
6	0.159771	1	0.211562	0.007209	2374	0.008267	Sox2	valid

	auprc	binsize	frac_ambigous	imbalance	n_positives	random_auprc
0	0.159771	1	0.211562	0.007209	2374	0.008325
1	0.260119	2	0.349331	0.016715	2260	0.019524
2	0.399817	4	0.526987	0.043605	2143	0.053307
3	0.678802	10	0.743816	0.186907	1990	0.237694

	auprc	binsize	frac_ambigous	imbalance	n_positives	random_auprc
0	0.418850	1	0.211562	0.007209	2374	0.008921
1	0.520032	2	0.349331	0.016715	2260	0.021437
2	0.680696	5	0.588823	0.061181	2091	0.086019
3	0.825783	10	0.743816	0.186907	1990	0.274584

	chr	end	id	start	task
2	chr1	57780311	2	57780110	Sox2
5	chr8	67966723	5	67966522	Sox2
7	chr1	9955454	7	9955253	Sox2
11	chr9	113438719	11	113438518	Sox2
12	chr8	111452152	12	111451951	Sox2

	chr	end	id	start	task
3	chr9	4283494	3	4283293	Sox2
9	chr8	80804382	9	80804181	Sox2
20	chr1	66961154	20	66960953	Sox2
30	chr8	108723188	30	108722987	Sox2
35	chr8	106289458	35	106289257	Sox2

	chr	end	oct4_neg	oct4_pos	sox2_neg	sox2_pos	start	seq	seq_id
2	chr1	57780311	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 3.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ...	[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	57780110	GAGAAAAATCATCTGGAATCCAGCTGAGAGTGAAAGGCGAGGCAAA...	chr1:57780110-57780311
5	chr8	67966723	[0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, ...	[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, ...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	67966522	TAAACTCTCTTTCCCATATAGCCTCTCTCAGTTGCCCTTGCAATTT...	chr8:67966522-67966723
7	chr1	9955454	[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...	[0.0, 0.0, 0.0, 2.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...	9955253	ATAGAACCCATCCTTAGGGGAGTATCATGTGTACTTCATAGCCTGC...	chr1:9955253-9955454
11	chr9	113438719	[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 1.0, ...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	113438518	CAGGTCTTACTTTCTGTCTCTGTAAGCTAACATAGGCCAATTGAGA...	chr9:113438518-113438719
12	chr8	111452152	[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 2.0, 0.0, 1.0, ...	[0.0, 0.0, 0.0, 2.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...	[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...	[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ...	111451951	TACTGTACAGAATTTAGCATCACAATACATTAGCAACAGATCAAAC...	chr8:111451951-111452152