Goal

  • prepare code for meta-plots
  • add additional meta-plots to the report
In [5]:
from basepair.imports import *
import pybedtools
from genomelake.extractors import BigwigExtractor
from basepair.extractors import StrandedBigWigExtractor, bw_extract
from kipoiseq.transforms import ResizeInterval
from basepair.modisco.table import ModiscoData
from pybedtools import BedTool
from basepair.cli.modisco import load_ranges
from basepair.plot.heatmaps import heatmap_importance_profile, normalize

from basepair.plot.profiles import plot_stranded_profile
In [3]:
df = pd.read_csv(f"{ddir}/processed/chipnexus/external-data.tsv", sep='\t')
In [4]:
df
Out[4]:
assay rep strand path
0 DNase 1 None /srv/scratch/avsec/workspace/basepair-workflow...
1 DNase 2 None /srv/scratch/avsec/workspace/basepair-workflow...
2 H3K27ac 1 None /srv/scratch/avsec/workspace/chipnexus/data/ra...
3 H3K27ac 2 None /srv/scratch/avsec/workspace/chipnexus/data/ra...
4 H3K4me1 1 None /srv/scratch/avsec/workspace/chipnexus/data/ra...
5 H3K4me1 2 None /srv/scratch/avsec/workspace/chipnexus/data/ra...
6 Input 1 None /srv/scratch/avsec/workspace/chipnexus/data/ra...
7 PolII 1 None /srv/scratch/avsec/workspace/chipnexus/data/ra...
8 PolII 2 None /srv/scratch/avsec/workspace/chipnexus/data/ra...
In [33]:
import dask
from dask.diagnostics import ProgressBar
In [35]:
#from joblib import Parallel, delayed

class MultiAssayExtractor:
    def __init__(self, df, interval_transform, use_strand=True, n_jobs=1):
        self.n_jobs = n_jobs
        self.df = df
        self.interval_transform = interval_transform
        self.use_strand = use_strand
        
    def extract(self, intervals, progbar=False):
        with ProgressBar():
            with dask.config.set(num_workers=self.n_jobs, scheduler='multiprocessing'):
                extracted = [dask.delayed(bw_extract)(fname, intervals, 
                                                      self.interval_transform, 
                                                      self.use_strand)
                    for fname in self.df.path]
                extracted = dask.compute(*extracted)
        
        d = {}
        for assay in df.assay.unique():
            idx = df[df.assay == assay].index
            d[assay] = sum([x for i, x in enumerate(extracted) 
                            if i in idx])
        return d

Tasks

In [36]:
pattern = 'm0_p0'
In [37]:
mdir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/")
!ls {mdir}
centroid_seqlet_matches.csv  hparams.yaml    modisco.h5     plots
figures			     instances.parq  modisco.yaml   strand_distances.h5
figures.bak		     kwargs.json     modisco.yaml~
In [38]:
mr = ModiscoResult(mdir / 'modisco.h5')
mr.open()
In [40]:
extractor = MultiAssayExtractor(df, ResizeInterval(6000), use_strand=True, n_jobs=10)
In [53]:
center = extractor.interval_transform.width // 2
In [41]:
from basepair.modisco.utils import shorten_pattern, longer_pattern
In [42]:
len(mr.patterns())
Out[42]:
142
In [43]:
mr.plot_pattern(longer_pattern(pattern), 'seq');
In [44]:
from basepair.extractors import Interval
from basepair.plot.profiles import multiple_plot_stranded_profile
from basepair.plot.heatmaps import multiple_heatmap_importance_profile
In [116]:
cache = True
profiles = {}

Oct4-Sox2 1

In [117]:
pattern = "m0_p0"
In [128]:
mr.plot_pattern(longer_pattern(pattern), ['seq', 'contrib']);
In [119]:
intervals = mr.get_seqlet_intervals(longer_pattern(pattern))
In [120]:
%%time
if pattern not in profiles or not cache:
    profiles[pattern] = extractor.extract(intervals, progbar=True)
o = profiles[pattern]
[########################################] | 100% Completed | 15.6s
CPU times: user 8.57 s, sys: 10.6 s, total: 19.2 s
Wall time: 18.2 s
In [121]:
multiple_plot_stranded_profile(o);
In [122]:
sort_idx = np.argsort(-o['DNase'].sum(axis=1))
In [123]:
multiple_heatmap_importance_profile({k: normalize(v, 10,99) for k,v in o.items()}, 
                                    sort_idx=sort_idx, figsize=(25,25), tick_step=1000, aspect=1);
In [124]:
# DNase footprint
plt.plot(o['DNase'].mean(0)[(center -100):(center+100)]);
In [125]:
heatmap_importance_profile(normalize(o['DNase'][sort_idx[:1000], (center -100):(center+100)], pmin=50, pmax=99), figsize=(10,10))

Oct4-Sox2 2

In [129]:
pattern = "m3_p0"
In [130]:
mr.plot_pattern(longer_pattern(pattern), ['seq', 'contrib']);
In [131]:
intervals = mr.get_seqlet_intervals(longer_pattern(pattern))
In [132]:
%%time
if pattern not in profiles or not cache:
    profiles[pattern] = extractor.extract(intervals, progbar=True)
o = profiles[pattern]
[########################################] | 100% Completed |  8.7s
CPU times: user 5.25 s, sys: 8.71 s, total: 14 s
Wall time: 11.9 s
In [133]:
multiple_plot_stranded_profile(o);
In [134]:
sort_idx = np.argsort(-o['DNase'].sum(axis=1))
In [135]:
multiple_heatmap_importance_profile({k: normalize(v, 10,99) for k,v in o.items()}, 
                                    sort_idx=sort_idx, figsize=(25,25), tick_step=1000, aspect=1);
In [136]:
# DNase footprint
plt.plot(o['DNase'].mean(0)[(center -100):(center+100)]);
In [137]:
heatmap_importance_profile(normalize(o['DNase'][sort_idx[:1000], (center -100):(center+100)], pmin=50, pmax=99), figsize=(10,10))

Sox2

In [138]:
pattern = "m0_p1"
In [139]:
mr.plot_pattern(longer_pattern(pattern), ['seq', 'contrib']);
In [140]:
intervals = mr.get_seqlet_intervals(longer_pattern(pattern))
In [141]:
%%time
if pattern not in profiles or not cache:
    profiles[pattern] = extractor.extract(intervals, progbar=True)
o = profiles[pattern]
[########################################] | 100% Completed |  5.7s
CPU times: user 3.91 s, sys: 7.59 s, total: 11.5 s
Wall time: 9.76 s
In [142]:
multiple_plot_stranded_profile(o);
In [143]:
sort_idx = np.argsort(-o['DNase'].sum(axis=1))
In [144]:
multiple_heatmap_importance_profile({k: normalize(v, 10,99) for k,v in o.items()}, 
                                    sort_idx=sort_idx, figsize=(25,25), tick_step=1000, aspect=1);
In [145]:
# DNase footprint
plt.plot(o['DNase'].mean(0)[(center -100):(center+100)]);
In [146]:
heatmap_importance_profile(normalize(o['DNase'][sort_idx[:1000], (center -100):(center+100)], pmin=50, pmax=99), figsize=(10,10))

Nanog 1

In [147]:
pattern = "m0_p3"
In [148]:
mr.plot_pattern(longer_pattern(pattern), ['seq', 'contrib']);
In [149]:
intervals = mr.get_seqlet_intervals(longer_pattern(pattern))
In [150]:
%%time
if pattern not in profiles or not cache:
    profiles[pattern] = extractor.extract(intervals, progbar=True)
o = profiles[pattern]
[########################################] | 100% Completed |  2.2s
CPU times: user 1.56 s, sys: 6.17 s, total: 7.74 s
Wall time: 6.33 s
In [151]:
multiple_plot_stranded_profile(o);
In [152]:
sort_idx = np.argsort(-o['DNase'].sum(axis=1))
In [153]:
multiple_heatmap_importance_profile({k: normalize(v, 10,99) for k,v in o.items()}, 
                                    sort_idx=sort_idx, figsize=(25,25), tick_step=1000, aspect=1);
In [154]:
# DNase footprint
plt.plot(o['DNase'].mean(0)[(center -100):(center+100)]);
In [155]:
heatmap_importance_profile(normalize(o['DNase'][sort_idx[:1000], (center -100):(center+100)], pmin=50, pmax=99), figsize=(10,10))

Nanog 2

In [156]:
pattern = "m1_p0"
In [157]:
mr.plot_pattern(longer_pattern(pattern), ['seq', 'contrib']);
In [158]:
intervals = mr.get_seqlet_intervals(longer_pattern(pattern))
In [159]:
%%time
if pattern not in profiles or not cache:
    profiles[pattern] = extractor.extract(intervals, progbar=True)
o = profiles[pattern]
[########################################] | 100% Completed |  9.8s
CPU times: user 6.23 s, sys: 11.5 s, total: 17.7 s
Wall time: 14.7 s
In [160]:
multiple_plot_stranded_profile(o);
In [161]:
sort_idx = np.argsort(-o['DNase'].sum(axis=1))
In [162]:
multiple_heatmap_importance_profile({k: normalize(v, 10,99) for k,v in o.items()}, 
                                    sort_idx=sort_idx, figsize=(25,25), tick_step=1000, aspect=1);
In [163]:
# DNase footprint
plt.plot(o['DNase'].mean(0)[(center -100):(center+100)]);
In [164]:
heatmap_importance_profile(normalize(o['DNase'][sort_idx[:1000], (center -100):(center+100)], pmin=50, pmax=99), figsize=(10,10))

Klf4

In [165]:
pattern = "m2_p0"
In [166]:
mr.plot_pattern(longer_pattern(pattern), ['seq', 'contrib']);
In [167]:
intervals = mr.get_seqlet_intervals(longer_pattern(pattern))
In [168]:
%%time
if pattern not in profiles or not cache:
    profiles[pattern] = extractor.extract(intervals, progbar=True)
o = profiles[pattern]
[########################################] | 100% Completed | 21.4s
CPU times: user 11.1 s, sys: 15.8 s, total: 26.9 s
Wall time: 27.5 s
In [169]:
multiple_plot_stranded_profile(o);
In [170]:
sort_idx = np.argsort(-o['DNase'].sum(axis=1))
In [171]:
multiple_heatmap_importance_profile({k: normalize(v, 10,99) for k,v in o.items()}, 
                                    sort_idx=sort_idx, figsize=(25,25), tick_step=1000, aspect=1);
In [172]:
# DNase footprint
plt.plot(o['DNase'].mean(0)[(center -100):(center+100)]);
In [173]:
heatmap_importance_profile(normalize(o['DNase'][sort_idx[:1000], (center -100):(center+100)], pmin=50, pmax=99), figsize=(10,10))

Essrb

In [174]:
pattern = "m0_p2"
In [175]:
mr.plot_pattern(longer_pattern(pattern), ['seq', 'contrib']);
In [176]:
intervals = mr.get_seqlet_intervals(longer_pattern(pattern))
In [177]:
%%time
if pattern not in profiles or not cache:
    profiles[pattern] = extractor.extract(intervals, progbar=True)
o = profiles[pattern]
[########################################] | 100% Completed |  3.6s
CPU times: user 2.49 s, sys: 8.42 s, total: 10.9 s
Wall time: 10.4 s
In [178]:
multiple_plot_stranded_profile(o);
In [179]:
sort_idx = np.argsort(-o['DNase'].sum(axis=1))
In [180]:
multiple_heatmap_importance_profile({k: normalize(v, 10,99) for k,v in o.items()}, 
                                    sort_idx=sort_idx, figsize=(25,25), tick_step=1000, aspect=1);
In [181]:
# DNase footprint
plt.plot(o['DNase'].mean(0)[(center -100):(center+100)]);
In [182]:
heatmap_importance_profile(normalize(o['DNase'][sort_idx[:1000], (center -100):(center+100)], pmin=50, pmax=99), figsize=(10,10))

Store the meta plots

In [191]:
from basepair.utils import write_pkl
In [193]:
write_pkl(profiles, mdir / "pattern-meta-profiles.pkl")
In [196]:
!du -sh {mdir}/*.pkl
4.7G	/users/avsec/workspace/basepair/data/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile/pattern-meta-profiles.pkl