TODO - implement p1.aligned_distance(p2, which='seq'), where which can be 'seq', 'counts_profile', 'counts_total', 'contrib', 'hyp_contrib' or 'profile', using e.g. KL divergence
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/deeplift/profile/"
mr = ModiscoResult(modisco_dir / 'modisco.h5')
mr.open()
patterns = read_pkl(modisco_dir / 'patterns.pkl')
patterns[0].attrs.keys()
tasks = patterns[0].tasks()
tasks
dfa = mr.metacluster_activity()
dfa.columns = dfa.columns.str.replace("/weighted", "")
def get_metacluster(pname):
    return int(pname.split("/")[0].split("_")[1])
patterns = [p.add_attr('metacluster', get_metacluster(p.name)) for p in patterns]
patterns = [p.add_attr('metacluster_activity', dict(dfa.loc[p.attrs['metacluster']])) for p in patterns]
p1 = patterns[0].attrs['stacked_seqlet_imp'].aggregate()
p2 = patterns[1].attrs['stacked_seqlet_imp'].aggregate()
p1.plot("seq_ic")
p2.plot("seq_ic");
p1.aligned_distance_seq(p2)
p1.aligned_distance_profile(p2, pseudo_p=0)
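For intuition, a profile distance of this kind can be computed as a symmetrized KL divergence between position-normalized profiles, with a pseudocount (pseudo_p above) guarding against log(0). A minimal sketch on plain numpy arrays; kl_profile_distance is a hypothetical helper, not the basepair implementation:
def kl_profile_distance(prof1, prof2, pseudo_p=1e-3):
    """Symmetrized KL divergence between two count profiles (illustrative sketch)."""
    # add a pseudocount and normalize each profile to a probability distribution
    p = (prof1 + pseudo_p) / (prof1 + pseudo_p).sum()
    q = (prof2 + pseudo_p) / (prof2 + pseudo_p).sum()
    # average the two KL directions to make the distance symmetric
    return 0.5 * (np.sum(p * np.log(p / q)) + np.sum(q * np.log(q / p)))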
s1 = patterns[0].attrs['stacked_seqlet_imp']
s2 = patterns[1].attrs['stacked_seqlet_imp']
from basepair.modisco.core import StackedSeqletImp, shuffle_seqlets
s1s, s2s = shuffle_seqlets(s1, s2)
s1.aggregate().plot('seq_ic');
s2.aggregate().plot('seq_ic');
s1s.aggregate().plot('seq_ic');
s2s.aggregate().plot('seq_ic');
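Conceptually, shuffle_seqlets pools the seqlets of both patterns, permutes the pooled set, and splits it back into two groups of the original sizes, so any real difference between the patterns is destroyed. A minimal sketch of the idea on plain numpy arrays (the real function operates on StackedSeqletImp objects):
def shuffle_split(x1, x2, seed=None):
    """Pool two seqlet arrays along axis 0, permute, and split back to the original sizes."""
    rng = np.random.default_rng(seed)
    pooled = np.concatenate([x1, x2], axis=0)
    pooled = pooled[rng.permutation(len(pooled))]
    return pooled[:len(x1)], pooled[len(x1):]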
from basepair.utils import flatten
from basepair.modisco.core import StackedSeqletImp, shuffle_seqlets
def compare_patterns(p1, p2):
    d = {"seq": p1.aligned_distance_seq(p2),
         "profile": p1.aligned_distance_profile(p2)}
    return flatten(d, separator='/')
def null_dist(s12, len_s1):
    s1s, s2s = s12.shuffle().split(len_s1)
    return compare_patterns(s1s.aggregate(), s2s.aggregate())
def dist_pval(s1, s2, permute=100, parallel=None, verbose=True):
    if parallel is None:
        parallel = Parallel(n_jobs=20)
    s12 = s1.append(s2)
    len_s1 = len(s1)
    df_null = pd.DataFrame(parallel(delayed(null_dist)(s12, len_s1)
                                    for i in tqdm(range(permute), disable=not verbose)))
    alt = compare_patterns(s1.aggregate(), s2.aggregate())
    # what's the probability that we would observe the event or more extreme by chance?
    pvals = {t: np.mean(df_null[t] >= alt[t]) for t in alt}
    return pvals
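One caveat: with a finite number of permutations the estimator above can return a p-value of exactly 0, which overstates significance. The usual fix (Phipson & Smyth, 2010) counts the observed statistic as one of the permutations; a hedged variant of the p-value line, not what dist_pval does as written:
# smallest reportable p-value becomes 1 / (permute + 1) instead of 0
pvals = {t: (1 + np.sum(df_null[t] >= alt[t])) / (1 + len(df_null)) for t in alt}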
%tqdm_restart
%time dist_pval(s1, s2, 1000, Parallel(n_jobs=40))
dfp_bak = dfp  # back up the results table from a previous run
out = []
#with Parallel(n_jobs=20) as parallel: # re-use the pool of workers
for i in tqdm(range(len(patterns) - 1)):
    p1 = patterns[i]
    p2 = patterns[i + 1]
    s1 = p1.attrs['stacked_seqlet_imp']
    s2 = p2.attrs['stacked_seqlet_imp']
    c1 = p1.attrs['cluster']
    c2 = p2.attrs['cluster']
    out.append({"patterns": f"{shorten_pattern(p1.name)}-{shorten_pattern(p2.name)}",
                "clusters": f"{c1}-{c2}",
                "same_cluster": c1 == c2,
                **dist_pval(s1, s2, permute=10000, parallel=Parallel(n_jobs=40), verbose=False)})
dfp = pd.DataFrame(out)
dfp = dfp.set_index(['clusters', 'patterns'])
ls {modisco_dir}
dfp.to_csv(f"{modisco_dir}/motif_clustering/motif-similarity-test.csv")
same_cluster = dfp['same_cluster']
del dfp['same_cluster']
%matplotlib inline
TODO - Wilcoxon test on the different counts (see the sketch below)
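A minimal sketch of that test, assuming per-seqlet count contributions for two patterns are available as 1-D arrays; counts1 and counts2 are hypothetical placeholders, and since the two seqlet sets are unpaired, the rank-sum (Mann-Whitney U) variant applies rather than the paired Wilcoxon:
from scipy.stats import mannwhitneyu
# counts1, counts2: hypothetical per-seqlet count contributions for two patterns,
# extracted e.g. the same way as the 'imp' column computed below
stat, pval = mannwhitneyu(counts1, counts2, alternative='two-sided')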
dfp.head()
# Why is this thing still slow as hell?
# Does it copy a lot of the memory? -> what's stored in attrs?
# Try out aggregate(idx) -> is it any faster?
fig, ax = plt.subplots(figsize=(3, 25))
sns.heatmap(-np.log10(dfp+1e-5), ax=ax, cmap='Blues')
clusters = pd.Series([p.attrs['cluster'] for p in patterns])
cluster_idx = {c: clusters[clusters == c].index.values for c in clusters.unique()}
cluster_idx
pattern_groups = {c: [p for i,p in enumerate(patterns) if i in idx] for c,idx in cluster_idx.items()}
# Select the major pattern from each group by max n_seqlets
major_patterns = {c: cp[np.argmax([p.attrs['features']['n seqlets'] for p in cp])]
                  for c, cp in pattern_groups.items()}
for c, mp in major_patterns.items():
    mp.plot("seq_ic")
    plt.title(f"{mp.name} {c}")
named_cluster = {8: "Klf4", 4: "Esrrb", 0: "Oct4-Sox2", 1: "Sox2", 2: "Nanog"}
# slice(*p._trim_seq_ic_ij(trim_frac=0.08))
# 15:55
df = pd.concat([pd.DataFrame({"imp": p.attrs['stacked_seqlet_imp'].contrib[t][:, 15:55].mean(axis=(1, 2)),  # [:, slice(*p._trim_seq_ic_ij(trim_frac=0.08))]
                              "task": t,
                              "pattern": shorten_pattern(p.name),
                              "metacluster_act": p.attrs['metacluster_activity'][t],
                              "cluster": c})
                for t in tasks
                for c, cpatterns in pattern_groups.items()
                for p in cpatterns])
df['ma'] = df.metacluster_act.map({1: "+", 0: ".", -1: "-"})
df.pattern = pd.Categorical(df.pattern, categories=df.pattern.unique())
df.head()
cluster = 8
dodge_text = position_dodge(width=0.9)
data = df[df.cluster == cluster]
g = ggplot(aes(x='pattern', color='task', y='imp'), data=data) + \
    geom_violin(position='dodge') + \
    geom_text(aes(label='ma', y=data.imp.min()), position=dodge_text,
              data=data[['pattern', 'task', 'ma']].drop_duplicates()) + \
    theme_bw() + ggtitle(f"Cluster {cluster}")
g
df = pd.concat([pd.DataFrame({"imp": p.attrs['stacked_seqlet_imp'].contrib[t][:, slice(*major_patterns[c]._trim_seq_ic_ij(trim_frac=0.08))].mean(axis=(1, 2)),
                              "task": t,
                              "pattern": shorten_pattern(p.name),
                              "metacluster_act": p.attrs['metacluster_activity'][t],
                              "cluster": c})
                for t in tasks
                for c, cpatterns in pattern_groups.items()
                for p in cpatterns])
df['ma'] = df.metacluster_act.map({1: "+", 0: ".", -1: "-"})
df.pattern = pd.Categorical(df.pattern, categories=df.pattern.unique())
df.head()
plotnine.options.figure_size = (18, 3)
for cluster in df.cluster.unique():
    if cluster not in named_cluster:
        continue
    dodge_text = position_dodge(width=0.9)
    data = df[df.cluster == cluster]
    g = ggplot(aes(x='pattern', color='task', y='imp'), data=data) + \
        geom_violin(position='dodge') + \
        geom_text(aes(label='ma', y=data.imp.min()), position=dodge_text,
                  data=data[['pattern', 'task', 'ma']].drop_duplicates()) + \
        theme_bw() + ggtitle(f"Cluster {cluster} {named_cluster[cluster]}")
    g.draw()
Rerun with the fixed [:, 15:55] slice instead of the IC-trimmed window
# slice(*p._trim_seq_ic_ij(trim_frac=0.08))
# 15:55
df = pd.concat([pd.DataFrame({"imp": p.attrs['stacked_seqlet_imp'].contrib[t][:, 15:55].mean(axis=(1, 2)),  # [:, slice(*p._trim_seq_ic_ij(trim_frac=0.08))]
                              "task": t,
                              "pattern": shorten_pattern(p.name),
                              "metacluster_act": p.attrs['metacluster_activity'][t],
                              "cluster": c})
                for t in tasks
                for c, cpatterns in pattern_groups.items()
                for p in cpatterns])
df['ma'] = df.metacluster_act.map({1: "+", 0: ".", -1: "-"})
df.pattern = pd.Categorical(df.pattern, categories=df.pattern.unique())
df.head()
plotnine.options.figure_size = (18, 3)
for cluster in df.cluster.unique():
    if cluster not in named_cluster:
        continue
    dodge_text = position_dodge(width=0.9)
    data = df[df.cluster == cluster]
    g = ggplot(aes(x='pattern', color='task', y='imp'), data=data) + \
        geom_violin(position='dodge') + \
        geom_text(aes(label='ma', y=data.imp.min()), position=dodge_text,
                  data=data[['pattern', 'task', 'ma']].drop_duplicates()) + \
        theme_bw() + ggtitle(f"Cluster {cluster} {named_cluster[cluster]}")
    g.draw()
Experiment with dask
import dask
import dask.dataframe as dd
import dask.multiprocessing
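As a reminder of the pattern used below: dask.delayed wraps function calls into a lazy task graph and dask.compute evaluates it, always returning a tuple (hence the objects, = dask.compute(...) unpacking later). A self-contained toy example:
def square(x):
    return x ** 2
lazy = [dask.delayed(square)(i) for i in range(4)]
squares, = dask.compute(lazy, scheduler='processes', num_workers=2)
squares  # [0, 1, 4, 9]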
out = []
permute = 1000
for i in tqdm(range(len(patterns) - 1)):
    p1 = patterns[i]
    p2 = patterns[i + 1]
    s1 = p1.attrs['stacked_seqlet_imp']
    s2 = p2.attrs['stacked_seqlet_imp']
    c1 = p1.attrs['cluster']
    c2 = p2.attrs['cluster']
    s12 = dask.delayed(s1.append(s2))
    len_s1 = len(s1)
    null_objects = [dask.delayed(null_dist)(s12, len_s1) for _ in range(permute)]
    alt = compare_patterns(s1.aggregate(), s2.aggregate())
    out.append({"metadata": {"patterns": f"{shorten_pattern(p1.name)}-{shorten_pattern(p2.name)}",
                             "clusters": f"{c1}-{c2}",
                             "same_cluster": c1 == c2},
                "null_objects": null_objects,
                "alt": alt})
outl = []
from dask.diagnostics import ProgressBar
with ProgressBar():
    objects, = dask.compute(out, scheduler='processes', num_workers=40)
for d in objects:
    df_null = pd.DataFrame(d['null_objects'])
    alt = d['alt']
    pvals = {t: np.mean(df_null[t] >= alt[t]) for t in alt}
    outl.append({**d['metadata'], **pvals})
dfp = pd.DataFrame(outl)
out_comp = dask.compute(out)  # note: dask.compute returns a tuple
out_comp
# Aggregate
outl = []
objects, = dask.compute(out)  # unpack the tuple returned by dask.compute
for d in objects:
    df_null = pd.DataFrame(d['null_objects'])
    alt = d['alt']
    pvals = {t: np.mean(df_null[t] >= alt[t]) for t in alt}
    outl.append({**d['metadata'], **pvals})
dfp = pd.DataFrame(outl)