Goal

  • plot DNase metaplots for main modisco motifs

Open questions

  • [ ] are seqlets the best for showing meta-plots or shall we try with FIMO motifs?

Required files

  • files specified in {ddir}/processed/chipnexus/external-data.tsv
In [10]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from kipoiseq.transforms import ResizeInterval
hv.extension('bokeh')
In [2]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
In [7]:
# load file paths
df = pd.read_csv(f"{ddir}/processed/chipnexus/external-data.tsv", sep='\t')
df_dnase = df[df.assay.str.startswith("DNase")]
In [4]:
from basepair.extractors import MultiAssayExtractor
from genomelake.extractors import FastaExtractor

ds = DataSpec.load(model_dir / "dataspec.yaml")

seq_extractor = FastaExtractor(ds.fasta_file)
In [19]:
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()

DNase only plots

In [38]:
from basepair.extractors import StrandedBigWigExtractor, bw_extract

class MultiAssayExtractor:
    def __init__(self, df, interval_transform, use_strand=True, n_jobs=1):
        self.n_jobs = n_jobs
        self.df = df
        self.interval_transform = interval_transform
        self.use_strand = use_strand
        
    def extract(self, intervals, progbar=False):
        extracted = [bw_extract(fname, intervals, 
                                self.interval_transform, 
                                self.use_strand)
                    for fname in tqdm(self.df.path, disable=not progbar)]
        d = {}
        for assay in df.assay.unique():
            idx = df[df.assay == assay].index
            d[assay] = sum([x for i, x in enumerate(extracted) 
                            if i in idx])
        return d
In [35]:
extractor_dnase = MultiAssayExtractor(df_dnase, ResizeInterval(200), use_strand=True, n_jobs=1)
In [12]:
from basepair.exp.dnase.models import KmerBiasModel
# load the bias models
kmer_model = read_pkl(f"{ddir}/raw/dnase/6mer/IMR90_6mer.pickle")
km = KmerBiasModel({k: float(v['forward']) for k,v in kmer_model.items()})

# re-trained kmer model
km2 = read_pkl(f"{ddir}/processed/dnase/6mer/new_model.train-pos3.w1000.normalized.pkl")
In [13]:
def plot_dnase(o, dnase_bias, dnase_bias2):
    center_dnase=100
    fig, axes = plt.subplots(1, 5, figsize=(22, 2.5))
    axes[0].plot((o['DNase']).mean(0)[(center_dnase -100):(center_dnase+100)]);
    axes[0].set_title("DNase")
    axes[1].plot((o['DNase']*dnase_bias).mean(0)[(center_dnase -100):(center_dnase+100)]);
    axes[1].set_title("DNase * dnase_bias")
    axes[2].plot((o['DNase']*dnase_bias2).mean(0)[(center_dnase -100):(center_dnase+100)]);
    axes[2].set_title("DNase * dnase_bias2")
    axes[3].plot((o['DNase-HINT']*dnase_bias).mean(0)[(center_dnase -100):(center_dnase+100)]);
    axes[3].set_title("DNase-HINT")
    axes[4].plot((o['DNase-seqOutBias']*dnase_bias).mean(0)[(center_dnase -100):(center_dnase+100)]);
    axes[4].set_title("DNase-seqOutBias");
    plt.tight_layout()
    return fig

Run for patterns

In [15]:
cache = True
profiles = {}
In [16]:
pattern = "m0_p0"
In [36]:
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
    profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
100%|██████████| 4/4 [00:12<00:00,  2.80s/it]
CPU times: user 9.88 s, sys: 872 ms, total: 10.8 s
Wall time: 16.9 s
In [37]:
# DNase footprint
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2)
Out[37]:
In [173]:
pattern = "m0_p1"
In [176]:
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
    profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
CPU times: user 1.33 s, sys: 48 ms, total: 1.38 s
Wall time: 1.38 s
In [188]:
# DNase footprintb
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2);
In [194]:
pattern = "m0_p3"
In [195]:
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
    profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
[########################################] | 100% Completed |  4.9s
CPU times: user 856 ms, sys: 5.65 s, total: 6.51 s
Wall time: 11 s
In [196]:
# DNase footprint
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2);
In [191]:
pattern = "m2_p0"
In [192]:
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
    profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
[########################################] | 100% Completed |  1min 23.6s
CPU times: user 10.4 s, sys: 6.44 s, total: 16.9 s
Wall time: 1min 35s
In [193]:
# DNase footprint
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2);