# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
%matplotlib inline
paper_config()
from basepair.cli.imp_score import ImpScoreFile
import basepair
import kipoi
from basepair.modisco.core import StackedSeqletImp, shuffle_seqlets
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
Decide which TF to take from where.
# Config
per_tf_modisco_dirs = {tf: Path(f"{model_dir}/modisco/by_peak_tasks/per-tf/{tf}/") for tf in ["Oct4", "Sox2", "Nanog", "Klf4"]}
per_tf_motifs = {"Oct4": "metacluster_0/pattern_0",
"Sox2": "metacluster_0/pattern_1",
"Nanog": "metacluster_0/pattern_0",
"Klf4": "metacluster_0/pattern_0",
}
tasks = ["Oct4", "Sox2", "Nanog", "Klf4"]
patterns = {}
stacked_seqlets = {}
stacked_seqlets_count = {}
for tf, pname in tqdm(per_tf_motifs.items()):
print(tf)
mr = ModiscoResult(per_tf_modisco_dirs[tf] / 'modisco.h5')
mr.open()
seqlets = mr._get_seqlets(per_tf_motifs[tf])
with kipoi.utils.cd(per_tf_modisco_dirs[tf]):
imp_scores = ImpScoreFile.from_modisco_dir(per_tf_modisco_dirs[tf])
p = mr.get_pattern(per_tf_motifs[tf])
# Extract all the importance scores
simp = imp_scores.extract(seqlets, profile_width=70)
# p.attrs['stacked_seqlet_imp'] = simp
stacked_seqlets[tf] = simp
simp_count = imp_scores.extract(seqlets, profile_width=70, pred_summary='count')
# p.attrs['stacked_seqlet_imp'] = simp
stacked_seqlets_count[tf] = simp_count
patterns[tf] = p
tasks = ["Oct4", "Sox2", "Nanog", "Klf4"]
dfa = pd.concat([pd.DataFrame({t: stacked_seqlets[tf].contrib[t][:,slice(*stacked_seqlets[tf].aggregate()._trim_seq_ic_ij(0.08))].mean(axis=(1,2))
for t in tasks}).assign(motif=tf)
for tf in tasks], axis=0)
dfm = dfa.melt(id_vars=['motif'], var_name='tf', value_name='importance')
dfasimp = pd.concat([pd.DataFrame({t: stacked_seqlets_count[tf].contrib[t][:,slice(*stacked_seqlets_count[tf].aggregate()._trim_seq_ic_ij(0.08))].mean(axis=(1,2))
for t in tasks}).assign(motif=tf)
for tf in tasks], axis=0)
dfmsimp = dfasimp.melt(id_vars=['motif'], var_name='tf', value_name='count_importance')
dfac = pd.concat([pd.DataFrame({t: stacked_seqlets[tf].profile[t].mean(axis=(1,2))
for t in tasks}).assign(motif=tf)
for tf in tasks], axis=0)
dfmc = dfac.melt(id_vars=['motif'], var_name='tf', value_name='counts')
dfmc['log10_counts'] = np.log10(dfmc['counts'])
for t in tasks:
patterns[t].trim_seq_ic(0.08).plot("seq_ic")
plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='importance', color='tf'), data=dfm) +
geom_jitter(alpha=0.01, shape='.', size=.5) +
scale_color_brewer('qual', palette=3) +
geom_boxplot() + theme_bw() +
facet_grid(".~motif"))
plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='count_importance', color='tf'), data=dfmsimp) +
geom_jitter(alpha=0.01, shape='.', size=.5) +
scale_color_brewer('qual', palette=3) +
geom_boxplot() + theme_bw() +
facet_grid(".~motif"))
plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='log10_counts', color='tf'), data=dfmc) +
geom_jitter(alpha=0.01, shape='.', size=.5) +
scale_color_brewer('qual', palette=3) +
geom_boxplot() + theme_bw() +
facet_grid(".~motif"))
for task in tasks:
g = sns.pairplot(dfa[dfa.motif == task][tasks], diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
g.fig.suptitle("motif = " + task)
for task in tasks:
g = sns.pairplot(dfasimp[dfasimp.motif == task][tasks], diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
g.fig.suptitle("motif = " + task)
for task in tasks:
g = sns.pairplot(np.log10(dfac[dfac.motif == task][tasks]), diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
g.fig.suptitle("motif = " + task)
for task in tasks:
g = sns.clustermap(dfa[dfa.motif == task][tasks], cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
g.fig.suptitle(task)
for task in tasks:
g = sns.clustermap(dfasimp[dfasimp.motif == task][tasks], cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
g.fig.suptitle(task)
for task in tasks:
g = sns.clustermap(np.log10(1+dfac[dfac.motif == task][tasks]), cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
g.fig.suptitle(task)