In [1]:
from collections import OrderedDict
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
imp_score = 'profile/wn'

motifs = OrderedDict([
    ("Oct4-Sox2", 'Oct4/m0_p0'),
    ("Oct4", 'Oct4/m0_p1'),
    # ("Strange-sym-motif", 'Oct4/m0_p5'),
    ("Sox2", 'Sox2/m0_p1'),
    ("Nanog", 'Nanog/m0_p1'),
    ("Zic3", 'Nanog/m0_p2'),
    ("Nanog-partner", 'Nanog/m0_p4'),
    ("Klf4", 'Klf4/m0_p0'),
])

Goal

  • plot the seqlets histogram
In [2]:
from basepair.imports import *
from basepair.exp.paper.config import *
Using TensorFlow backend.
In [45]:
paper_config()
In [81]:
fdir = Path(f'{ddir}/figures/modisco/{exp}/')
In [3]:
model_dir = models_dir / exp
In [4]:
ls {model_dir}
activity/                                     log/
config.gin                                    model.h5
config.gin.json                               note_params.json
deeplift/                                     null.deeplift.imp_score.h5
deeplift.imp_score.h5                         perturbation-analysis/
evaluation.valid.json                         preds.test.pkl
events.out.tfevents.1552394698.sh-112-11.int  seq_model.pkl
history.csv                                   train.smk-benchmark.tsv
imp_score_deeplift.smk-benchmark.tsv          wandb.json
In [38]:
ls {model_dir}
activity/                                     log/
config.gin                                    model.h5
config.gin.json                               note_params.json
deeplift/                                     null.deeplift.imp_score.h5
deeplift.imp_score.h5                         perturbation-analysis/
evaluation.valid.json                         preds.test.pkl
events.out.tfevents.1552394698.sh-112-11.int  seq_model.pkl
history.csv                                   train.smk-benchmark.tsv
imp_score_deeplift.smk-benchmark.tsv          wandb.json
In [16]:
import modisco
TF-MoDISco is using the TensorFlow backend.
In [61]:
window_fn = modisco.coordproducers.get_simple_window_sum_function(21)
In [9]:
isf = ImpScoreFile(model_dir / 'deeplift.imp_score.h5', default_imp_score='profile/wn')
In [62]:
contrib_scores = isf.get_contrib()
In [45]:
paper_config()
In [39]:
isf_null = ImpScoreFile(model_dir / 'null.deeplift.imp_score.h5', default_imp_score='profile/wn')
contrib_scores_null = isf_null.get_contrib()
In [10]:
ranges = isf.get_ranges()
In [11]:
ranges.head()
Out[11]:
chrom start end strand interval_from_task idx
0 chr9 3001633 3002633 . Oct4 0
1 chr3 122145077 122146077 . Oct4 1
2 chr13 21199761 21200761 . Oct4 2
3 chr8 20424345 20425345 . Oct4 3
4 chr6 47777726 47778726 . Oct4 4
In [12]:
oct4_range = ranges.interval_from_task == 'Oct4'
In [63]:
task = 'Oct4'
In [65]:
tasks
Out[65]:
['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [76]:
sl_d = dict()
sl_null_d = dict()
for i, task in enumerate(tasks):
    seqlet_scores = np.stack(window_fn(contrib_scores[task][ranges.interval_from_task == task]))
    sl_d[task] = np.ravel(seqlet_scores)
    seqlet_scores_null = np.stack(window_fn(contrib_scores_null[task]))
    sl_null_d[task] = np.ravel(seqlet_scores_null)
In [79]:
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(1, .3), sharex=True, sharey=True, gridspec_kw=dict(hspace=0, wspace=0))
for i, task in enumerate(tasks):
    axes[0, i].set_title(task)
    axes[0, i].hist(sl_d[task], 100, log=True);
    axes[1, i].hist(sl_null_d[task], 100, log=True);
axes[0, 0].set_ylabel("Signal")
axes[1, 0].set_ylabel("Null")
axes[1, 1].set_xlabel("Importance (21bp window)")
Out[79]:
Text(0.5, 0, 'Importance (21bp window)')
In [88]:
np.median(sl_d['Oct4'])
Out[88]:
0.0029651522636413574
In [87]:
plt.hist(sl_d['Oct4'], 100, log=False);
In [82]:
fig.savefig(fdir / 'seqlet-imp-histogram.21bp.pdf')