# common imports
from basepair.imports import *
from basepair.config import test_chr, valid_chr
# Pattern identifiers in ModiscoResult naming ("metacluster_<m>/pattern_<p>").
# NOTE(review): pattern1/pattern2 are never read below — `pattern` is assigned
# fresh literals later; these may be leftovers.
pattern1 = "metacluster_0/pattern_0"
pattern2 = "metacluster_0/pattern_1"
# Model / modisco output locations under the project data directory
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"
# Modisco
# TF-MoDISco result; open() presumably keeps the HDF5 handle open for the
# mr.* queries below — confirm against ModiscoResult
mr = ModiscoResult(modisco_dir / "modisco.h5")
mr.open()
tasks = mr.tasks()
# Data spec (provides ds.fasta_file, used below) and contribution-score HDF5
ds = DataSpec.load(model_dir / "dataspec.yaml")
d = HDF5Reader.load(model_dir / "grad.all.h5")
# Lookup table: example_idx (row order in grad.all.h5) -> peak_id
id_hash = pd.DataFrame({"peak_id": d['metadata']['interval_from_task'],
"example_idx": np.arange(d['metadata']['interval_from_task'].shape[0])})
# load the instances data frame
df = pd.read_csv(modisco_dir / "all.instances.tsv", sep='\t')
# Build "metacluster_<m>/pattern_<p>" ids matching ModiscoResult's naming
df['pattern_id'] = "metacluster_" + df.metacluster.astype(str) + "/pattern_" + df.pattern.astype(str)
# Midpoint of each seqlet (float; integer-coordinate midpoints are x.0/x.5)
df['seqlet_center'] = (df.seqlet_start + df.seqlet_end) / 2
patterns = df.pattern_id.unique().tolist()
pattern_pssms = {pattern: mr.get_pssm(*pattern.split("/")) for pattern in patterns}
# Mutates df in place; pattern_start/pattern_end are read right after, so it
# must add them — presumably the PSSM location trimmed by trim_frac; confirm
append_pattern_loc(df, pattern_pssms, trim_frac=0.08)
df['pattern_center'] = (df.pattern_start + df.pattern_end) / 2
# Attach peak_id via the example_idx lookup built above
df = df.merge(id_hash, on="example_idx")
from pysam import FastaFile
fa = FastaFile(ds.fasta_file)
# Keep only main chromosomes (drop scaffolds/patches, whose names contain "_")
main_chroms = [(chrom, length)
               for chrom, length in zip(fa.references, fa.lengths)
               if "_" not in chrom]
chrl = pd.DataFrame(main_chroms, columns=['chr', 'l'])
# Ratio of total main-chromosome length to the validation chromosomes' length
# — presumably a factor for extrapolating valid-set counts genome-wide; confirm
mfrac = chrl.l.sum() / chrl.loc[chrl.chr.isin(valid_chr), 'l'].sum()
mfrac
# Compare modisco's own per-pattern seqlet counts ("old") against the number
# of re-scanned instances passing percnormed_score > 0.7 ("new").
dfn = pd.DataFrame([dict(pattern_id=p, old=mr.n_seqlets(*p.split("/")))
                    for p in mr.patterns()])
dfm = (df[df.percnormed_score > 0.7]
       .groupby("pattern_id").size().reset_index()
       .merge(dfn, on='pattern_id')
       .rename(columns={0: "new"}))  # groupby().size() names its column 0
import seaborn as sns
import statsmodels.api as sm
# Regression through the origin (no constant added): new ~ beta * old
model = sm.OLS(dfm.new, dfm.old)
results = model.fit()
results.conf_int()  # confidence interval for beta
# Overall new/old ratio across all patterns
dfm.new.sum() / dfm.old.sum()
# FIX: pass x/y by keyword — seaborn >= 0.12 makes jointplot's x/y
# keyword-only (the first positional is `data`), so the positional form
# raises there; keywords work on all seaborn versions.
sns.jointplot(x="old", y="new", data=dfm, kind="reg")
from basepair.plot.vdom import get_signal
from basepair.modisco.utils import ic_scale
from concise.utils.plot import seqlogo_fig, seqlogo
from basepair.plot.profiles import extract_signal
def df2seqlets(df, width=None):
    """Convert a pd.DataFrame of pattern instances into Seqlet objects.

    Args:
      df: instance table with columns example_idx, pattern_start,
        pattern_end, pattern_id and revcomp.
      width: passed to Seqlet.resize() for every seqlet (also when None).

    Returns:
      list of Seqlet objects, one per row of `df`.
    """
    seqlets = []
    for row in df.itertuples():
        strand = "-" if row.revcomp else "+"
        seqlet = Seqlet(row.example_idx,
                        row.pattern_start,
                        row.pattern_end,
                        name=row.pattern_id,
                        strand=strand)
        seqlets.append(seqlet.resize(width))
    return seqlets
from basepair.plot.vdom import fig2vdom, vdm_heatmaps
# Boolean mask over all examples — everything included (no subsetting)
included_samples = np.ones(d['inputs'].shape[:1], dtype=bool)
tasks = mr.tasks()
# Cap on the number of seqlets shown per heatmap
top_n = 1000
# --- metacluster_0/pattern_1, high-confidence instances (0.7 < score < 1) ---
pattern = 'metacluster_0/pattern_1'
score_mask = (df.percnormed_score > 0.7) & (df.percnormed_score < 1)
dff = df[(df.pattern_id == pattern) & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, width=40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
# NOTE(review): p() is presumably the vdom <p> element from the star import
# (an empty placeholder figure) — confirm
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- metacluster_0/pattern_1, medium-confidence instances (0.5 < score < 1) ---
pattern = 'metacluster_0/pattern_1'
score_mask = (df.percnormed_score > 0.5) & (df.percnormed_score < 1)
dff = df[(df.pattern_id == pattern) & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, width=40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- low-confidence tail (0.2 < score < 0.3); reuses `pattern` set above ---
score_mask = (df.percnormed_score > 0.2) & (df.percnormed_score < 0.3)
dff = df[(df.pattern_id == pattern) & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, 40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- metacluster_0/pattern_0, central instances only (100 < start, end < 900),
#     high confidence (0.7 < score < 1) ---
pattern = 'metacluster_0/pattern_0'
central = (df.pattern_start > 100) & (df.pattern_end < 900)
score_mask = (df.percnormed_score > 0.7) & (df.percnormed_score < 1)
dff = df[(df.pattern_id == pattern) & central & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, 40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- central instances, medium confidence (0.5 < score < 1);
#     reuses `pattern` set above ---
central = (df.pattern_start > 100) & (df.pattern_end < 900)
score_mask = (df.percnormed_score > 0.5) & (df.percnormed_score < 1)
dff = df[(df.pattern_id == pattern) & central & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, 40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- central instances, low-confidence tail (0.2 < score < 0.3);
#     reuses `pattern` set above ---
central = (df.pattern_start > 100) & (df.pattern_end < 900)
score_mask = (df.percnormed_score > 0.2) & (df.percnormed_score < 0.3)
dff = df[(df.pattern_id == pattern) & central & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, 40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- metacluster_0/pattern_9, central instances only (100 < start, end < 900),
#     high confidence (0.7 < score < 1) ---
pattern = 'metacluster_0/pattern_9'
central = (df.pattern_start > 100) & (df.pattern_end < 900)
score_mask = (df.percnormed_score > 0.7) & (df.percnormed_score < 1)
dff = df[(df.pattern_id == pattern) & central & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, 40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- central instances, medium confidence (0.5 < score < 1);
#     reuses `pattern` set above ---
central = (df.pattern_start > 100) & (df.pattern_end < 900)
score_mask = (df.percnormed_score > 0.5) & (df.percnormed_score < 1)
dff = df[(df.pattern_id == pattern) & central & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, 40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)
# --- low-confidence tail (0.2 < score < 0.3), no position filter here;
#     reuses `pattern` set above ---
score_mask = (df.percnormed_score > 0.2) & (df.percnormed_score < 0.3)
dff = df[(df.pattern_id == pattern) & score_mask]
len(dff)  # number of seqlets (notebook cell output)
seqlets = df2seqlets(dff, 40)
display(fig2vdom(mr.plot_pssm(*pattern.split("/"))))  # original modisco PSSM
display(fig2vdom(seqlogo_fig(ic_scale(extract_signal(d['inputs'][included_samples], seqlets).mean(axis=0)))));  # logo inferred from instances
vdm_heatmaps(seqlets, d, included_samples, tasks, pattern,
             pssm_fig=p(), top_n=top_n, opened=False)