Goal¶

compare ChIP-seq to ChIP-nexus

Conclusions¶

check how many peaks were actually accessible
- Oct4: 15327/21841 (70%)
- Sox2: 7695/9396 (82%)
- Nanog: 9957/18017 (55%)
- Klf4: 33588/49174 (68%)

TODO¶

filter the evaluated regions to DNA accessible
[x] put gdrive to zapy

from basepair.config import valid_chr, test_chr

from basepair.imports import *

" ".join(test_chr)

" ".join(valid_chr)

'chr2 chr3 chr4'

Data¶

ChIP-seq¶

ls /srv/scratch/avsec/workspace/basepair/chip-seq/Oct4_12/out/peak/idr/optimal_set/

Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz
Oct4_12_ppr.IDR0.05.filt.12-col.bed.hammock.gz
Oct4_12_ppr.IDR0.05.filt.12-col.bed.hammock.gz.tbi
Oct4_12_ppr.IDR0.05.filt.narrowPeak
Oct4_12_ppr.IDR0.05.filt.narrowPeak.gz

ls /srv/scratch/avsec/workspace/basepair/chip-seq/Sox2_1/out/peak/idr/optimal_set/

Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz
Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.hammock.gz
Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.hammock.gz.tbi
Sox2_1_rep1-pr.IDR0.05.filt.narrowPeak
Sox2_1_rep1-pr.IDR0.05.filt.narrowPeak.gz

sox_chipseq = "/srv/scratch/avsec/workspace/basepair/chip-seq/Sox2_1/out/peak/idr/optimal_set/Sox2_1_rep1-pr.IDR0.05.filt.narrowPeak.gz"

oct_chipseq = "/srv/scratch/avsec/workspace/basepair/chip-seq/Oct4_12/out/peak/idr/optimal_set/Oct4_12_ppr.IDR0.05.filt.narrowPeak.gz"

!zcat {sox_chipseq} | wc -l

4073

!zcat {oct_chipseq} | wc -l

19401

ChIP-nexus¶

!zcat /srv/scratch/avsec/workspace/basepair/chip-seq/Sox2_1/out/peak/idr/optimal_set/Sox2_1_rep1-pr.IDR0.05.filt.narrowPeak.gz | wc -l

cat {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/dataspec.yml

task_specs:
  Oct4:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Oct4/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair-workflow/data/Oct4/peaks.bed.gz
  Sox2:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Sox2/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair-workflow/data/Sox2/peaks.bed.gz
  Nanog:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Nanog/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Nanog/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair-workflow/data/Nanog/peaks.bed.gz
  Klf4:
    pos_counts: /users/avsec/workspace/basepair-workflow/data/Klf4/5prime.counts.pos.bw
    neg_counts: /users/avsec/workspace/basepair-workflow/data/Klf4/5prime.counts.neg.bw
    peaks: /users/avsec/workspace/basepair-workflow/data/Klf4/peaks.bed.gz
  # DNase:
  #   pos_counts: /srv/scratch/avsec/workspace/basepair-workflow/data/DNase/signal/raw/merged.pos.bw
  #   neg_counts: /srv/scratch/avsec/workspace/basepair-workflow/data/DNase/signal/raw/merged.neg.bw
  #   # - for now, don't use DNase peaks (300k). Other four factors have 100k peaks altogether
  #   # peaks: /srv/scratch/avsec/workspace/basepair-workflow/data/DNase/raw/peaks/ENCFF426TTH.bed.gz
  #   ignore_strand: true
  #   bias_model: /users/avsec/workspace/basepair/data/processed/dnase/exp/models/background/models/w=1000/model.h5
fasta_file: /mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta

oct_chipnexus = "/users/avsec/workspace/basepair-workflow/data/Oct4/peaks.bed.gz"
sox_chipnexus = "/users/avsec/workspace/basepair-workflow/data/Sox2/peaks.bed.gz"

ls /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/idr/conservative_set/

Oct4_rep2-rep3.IDR0.05.filt.12-col.bed.gz
Oct4_rep2-rep3.IDR0.05.filt.12-col.bed.hammock.gz
Oct4_rep2-rep3.IDR0.05.filt.12-col.bed.hammock.gz.tbi
Oct4_rep2-rep3.IDR0.05.filt.narrowPeak.gz

!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/idr/conservative_set/Oct4_rep2-rep3.IDR0.05.filt.12-col.bed.gz | wc -l

17257

!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/pooled_rep/Oct4_2c_matched_barcode.filt_filtered_pooled.pval0.01.500K.narrowPeak.gz | wc -l

112693

!ls /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/

idr  overlap  pooled_pseudo_reps  pooled_rep  pseudo_reps  rep1  rep2  rep3

!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/pooled_rep/Oct4_2c_matched_barcode.filt_filtered_pooled.pval0.01.500K.narrowPeak.gz | head

chr2	98666215	98667389	Peak_1	1282324	.	26.17635	128232.40625	128222.96875	1015
chr9	35305138	35305590	Peak_2	267635	.	40.03848	26763.50195	26756.49023	304
chrX	143482930	143483183	Peak_3	220366	.	65.28529	22036.60547	22029.75195	114
chr2	98662591	98662904	Peak_4	86798	.	5.02023	8679.87402	8673.39551	173
chr9	3030061	3030307	Peak_5	85478	.	35.71801	8547.84375	8541.37012	127
chr9	24541944	24542232	Peak_6	77360	.	60.53094	7736.05127	7729.62988	144
chr3	122145223	122145986	Peak_7	70813	.	34.27969	7081.38135	7075.00146	340
chr9	3035729	3036450	Peak_8	63895	.	34.32119	6389.53320	6383.20557	139
chr6	103648981	103649334	Peak_9	63313	.	51.58100	6331.31006	6324.99512	206
chr2	98665055	98665267	Peak_10	51200	.	3.89864	5120.08154	5113.87207	82

gzip: stdout: Broken pipe

ls /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/idr/optimal_set

Oct4_ppr.IDR0.05.filt.12-col.bed.gz
Oct4_ppr.IDR0.05.filt.12-col.bed.hammock.gz
Oct4_ppr.IDR0.05.filt.12-col.bed.hammock.gz.tbi
Oct4_ppr.IDR0.05.filt.narrowPeak.gz
summits.bed.gz

!zcat /users/avsec/workspace/basepair-workflow/data/Oct4/peak/macs2/overlap/pooled_pseudo_reps/Oct4_ppr.naive_overlap.filt.narrowPeak.gz | wc -l

51200

oct_chipnexus_sorted = "/tmp/oct4.chipnexus.sorted.bed"
sox_chipnexus_sorted = "/tmp/sox2.chipnexus.sorted.bed"
oct_chipseq_sorted = "/tmp/oct4.chipseq.sorted.bed"
sox_chipseq_sorted = "/tmp/sox2.chipseq.sorted.bed"

!zcat {oct_chipnexus} | wc -l

21841

!zcat {sox_chipnexus} | wc -l

9396

!bedtools intersect -a {oct_chipnexus} -b {oct_chipseq} -wa -u | wc -l

13927

genome_file = "/mnt/data/pipeline_genome_data/mm10/mm10.chrom.sizes"

!bedtools sort -i {oct_chipseq}  > {oct_chipseq_sorted}
!bedtools sort -i {sox_chipseq}  > {sox_chipseq_sorted}
!bedtools sort -i {oct_chipnexus} | bedtools slop -b 100 -g {genome_file}> {oct_chipnexus_sorted}
!bedtools sort -i {sox_chipnexus} | bedtools slop -b 100 -g {genome_file}> {sox_chipnexus_sorted}

oct_chipseq_sorted

'/tmp/oct4.chipseq.sorted.bed'

!bedtools jaccard -a {oct_chipnexus_sorted} -b {oct_chipseq_sorted}

intersection	union-intersection	jaccard	n_intersections
0	4390041	0	0

!bedtools jaccard -a {sox_chipnexus_sorted} -b {oct_chipseq_sorted}

intersection	union-intersection	jaccard	n_intersections
0	3733502	0	0

13971 / 21841

0.6396685133464585

sox_chipseq_sorted

'/tmp/sox2.chipseq.sorted.bed'

!bedtools intersect -a {sox_chipnexus} -b {sox_chipseq} -wa -u | wc -l

3192

!bedtools intersect -a {sox_chipseq} -b {sox_chipnexus} -wa -u | wc -l

3180

3192/9396

0.3397190293742018

3192/4073

0.78369752025534

Conclusion¶

~60% of the peaks overlap with ChIP-seq and ChIP-nexus

Label the regions¶

which command to run?
- exclude ambiguous peaks
- create also a set using only accessible peaks

!bedtools intersect -e -f 0.5 -F 0.5

Evaluate¶

oct_sox_binary_intervals = "/srv/scratch/avsec/workspace/chipnexus/data/processed/chipseq/labels/oct4-sox2.Oct4.intervals_file.tsv.gz"

fasta_file = "/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta"

!zcat {oct_sox_binary_intervals} | head -n 2

chr18	47638541	47639541	0	0
chr2	32244581	32245581	0	0

gzip: stdout: Broken pipe

df = pd.read_csv(oct_sox_binary_intervals, sep='\t', header=None)
df.columns = ['chr', 'start', 'end', 'Oct4', 'Sox2']

len(df)

54504198

df['Oct4'].value_counts() / len(df['Oct4'])

 0    0.991043
-1    0.007338
 1    0.001619
Name: Oct4, dtype: float64

df['Sox2'].value_counts() / len(df)

 0    0.998827
-1    0.000837
 1    0.000336
Name: Sox2, dtype: float64

df['Oct4'].value_counts()

 0    54015986
-1      399946
 1       88266
Name: Oct4, dtype: int64

df['Sox2'].value_counts()

 0    54440285
-1       45610
 1       18303
Name: Sox2, dtype: int64

from basepair.datasets import *

tsv = TsvReader(oct_sox_binary_intervals, label_dtype=int, mask_ambigous=-1)

np.all(tsv.df.iloc[:, 3:] == -1, axis=1).sum()

24639

from basepair.config import test_chr

ds_seq = SeqClassification(oct_sox_binary_intervals, fasta_file, incl_chromosomes=test_chr)

ds_seq.tsv.df[0].unique()

array(['chr9', 'chr1', 'chr8'], dtype=object)

ds_seq.tsv.df.head()

len(ds_seq)

24639

ds_seq[0]

{'inputs': {'seq': array([[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 0., 1.],
         ...,
         [1., 0., 0., 0.],
         [1., 0., 0., 0.],
         [0., 0., 1., 0.]], dtype=float32)},
 'targets': array([-1, -1]),
 'metadata': {'ranges': GenomicRanges(chr='chr13', start=51203031, end=51204031, id='0', strand='*')}}

create_tf_session(0)

<tensorflow.python.client.session.Session at 0x7f0cfdaa2358>

model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")

# Load the model
model = load_model(model_dir / "model.h5")

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

2018-09-22 03:10:21,061 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead

WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

2018-09-22 03:10:27,284 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.

from kipoi.data_utils import numpy_collate_concat

lpreds = []
llabels = []
for inputs, targets in tqdm(ds_seq.batch_train_iter(cycle=False, num_workers=10, batch_size=256)):
    lpreds.append(model.predict(inputs))
    # TODO - discard the targets
    llabels.append(targets)
preds = numpy_collate_concat(lpreds)
labels = numpy_collate_concat(llabels)

34792it [49:49, 11.64it/s]

del llabels
del lpreds

a=1

oct4_total = preds[4].sum(axis=1)

plt.hist(oct4_total, 100);

import seaborn as sns

labels.max()

-1

auPR¶

dfl = pd.DataFrame(dict(label=labels[:,1], counts=sox2_counts.sum(axis=-1)))
auprc(dfl.label, dfl.counts)

0.05284194007081463

dfl = pd.DataFrame(dict(label=labels[:,0], counts=oct4_counts.sum(axis=-1)))
auprc(dfl.label, dfl.counts)

0.05224924797725083

Other¶

preds[0].shape

(8906667, 1000, 2)

sox2_profile = preds[1]
oct4_profile = preds[0]
oct4_counts = np.exp(preds[4])-1
sox2_counts = np.exp(preds[5])-1

del preds

dfl = pd.DataFrame(dict(label=labels[:,0], counts=oct4_counts.mean(axis=-1)*oct4_profile.max(axis=1).mean(axis=1)))
auprc(dfl.label, dfl.counts)

0.05664696951811246

np.mean(oct4_profile[:, 400:600].sum(axis=1) * oct4_counts, axis=1)

(8906667, 1000, 2)

# only sum counts for the central 200 bp
dfl = pd.DataFrame(dict(label=labels[:,0], counts=np.mean(oct4_profile[:, 400:600].sum(axis=1) * oct4_counts, axis=1)))
auprc(dfl.label, dfl.counts)

0.056669328054557926

dfl.groupby("label").mean()

auprc(labels[:,0], oct4_total)

0.05225863433787134

dfl.groupby("label").std()

dfl.label.value_counts()

0    8893004
1      13663
Name: label, dtype: int64

from concise.eval_metrics import auprc, auc

from plotnine import *

ggplot(aes(x="label", y="counts"), dfl) + geom_boxplot()

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/utils.py:281: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  ndistinct = ids.apply(len_unique, axis=0).as_matrix()
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py:4388: FutureWarning: Attribute 'is_copy' is deprecated and will be removed in a future version.
  object.__getattribute__(self, name)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py:4389: FutureWarning: Attribute 'is_copy' is deprecated and will be removed in a future version.
  return object.__setattr__(self, name, value)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/positions/position.py:188: FutureWarning: Method .as_matrix will be removed in a future version. Use .values instead.
  intervals = data[xminmax].drop_duplicates().as_matrix().flatten()

<ggplot: (-9223363324759093318)>

sns.boxplot()

preds[4].shape

(4544, 2)

	0	1	2
3	chr9	73381021	73382021
6	chr1	177272341	177273341
9	chr8	127376321	127377321
15	chr1	29076401	29077401
16	chr9	18611301	18612301

	counts
label
0	9.657067
1	10.329344

	counts
label
0	0.291717
1	0.599414