{ddir}/processed/chipnexus/external-data.tsv# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from kipoiseq.transforms import ResizeInterval
hv.extension('bokeh')
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
# load file paths
df = pd.read_csv(f"{ddir}/processed/chipnexus/external-data.tsv", sep='\t')
df_dnase = df[df.assay.str.startswith("DNase")]
from basepair.extractors import MultiAssayExtractor
from genomelake.extractors import FastaExtractor
ds = DataSpec.load(model_dir / "dataspec.yaml")
seq_extractor = FastaExtractor(ds.fasta_file)
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
from basepair.extractors import StrandedBigWigExtractor, bw_extract
class MultiAssayExtractor:
def __init__(self, df, interval_transform, use_strand=True, n_jobs=1):
self.n_jobs = n_jobs
self.df = df
self.interval_transform = interval_transform
self.use_strand = use_strand
def extract(self, intervals, progbar=False):
extracted = [bw_extract(fname, intervals,
self.interval_transform,
self.use_strand)
for fname in tqdm(self.df.path, disable=not progbar)]
d = {}
for assay in df.assay.unique():
idx = df[df.assay == assay].index
d[assay] = sum([x for i, x in enumerate(extracted)
if i in idx])
return d
extractor_dnase = MultiAssayExtractor(df_dnase, ResizeInterval(200), use_strand=True, n_jobs=1)
from basepair.exp.dnase.models import KmerBiasModel
# load the bias models
kmer_model = read_pkl(f"{ddir}/raw/dnase/6mer/IMR90_6mer.pickle")
km = KmerBiasModel({k: float(v['forward']) for k,v in kmer_model.items()})
# re-trained kmer model
km2 = read_pkl(f"{ddir}/processed/dnase/6mer/new_model.train-pos3.w1000.normalized.pkl")
def plot_dnase(o, dnase_bias, dnase_bias2):
center_dnase=100
fig, axes = plt.subplots(1, 5, figsize=(22, 2.5))
axes[0].plot((o['DNase']).mean(0)[(center_dnase -100):(center_dnase+100)]);
axes[0].set_title("DNase")
axes[1].plot((o['DNase']*dnase_bias).mean(0)[(center_dnase -100):(center_dnase+100)]);
axes[1].set_title("DNase * dnase_bias")
axes[2].plot((o['DNase']*dnase_bias2).mean(0)[(center_dnase -100):(center_dnase+100)]);
axes[2].set_title("DNase * dnase_bias2")
axes[3].plot((o['DNase-HINT']*dnase_bias).mean(0)[(center_dnase -100):(center_dnase+100)]);
axes[3].set_title("DNase-HINT")
axes[4].plot((o['DNase-seqOutBias']*dnase_bias).mean(0)[(center_dnase -100):(center_dnase+100)]);
axes[4].set_title("DNase-seqOutBias");
plt.tight_layout()
return fig
cache = True
profiles = {}
pattern = "m0_p0"
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
# DNase footprint
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2)
pattern = "m0_p1"
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
# DNase footprintb
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2);
pattern = "m0_p3"
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
# DNase footprint
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2);
pattern = "m2_p0"
%%time
intervals = [extractor_dnase.interval_transform(interval) for interval in mr.get_seqlet_intervals(longer_pattern(pattern))]
if pattern not in profiles or not cache:
profiles[pattern] = extractor_dnase.extract(intervals, progbar=True)
o = profiles[pattern]
seqs = seq_extractor(intervals)
# DNase footprint
dnase_bias = km.predict_on_batch(seqs, pad=True)
dnase_bias2 = km2.predict_on_batch(seqs, pad=True)
plot_dnase(o, dnase_bias, dnase_bias2);