Tasks

  • [X] scatterplot / parallel components / boxplot the importance scores (count + profile) for all the 4 factors on per-TF run

Open tasks

  • [ ] add the count importance scores

Notes

  • For all seqlet locations for each TF I plotted:
    • importance scores at the trimmed motif region
    • counts at the profile (70bp)
In [3]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
Using TensorFlow backend.
In [4]:
%matplotlib inline
paper_config()
In [5]:
from basepair.cli.imp_score import ImpScoreFile
import basepair
import kipoi
from basepair.modisco.core import StackedSeqletImp, shuffle_seqlets
In [6]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")

Decide which TF to take from where.

In [7]:
# Config
per_tf_modisco_dirs = {tf: Path(f"{model_dir}/modisco/by_peak_tasks/per-tf/{tf}/") for tf in ["Oct4", "Sox2", "Nanog", "Klf4"]}
per_tf_motifs = {"Oct4": "metacluster_0/pattern_0",
                 "Sox2": "metacluster_0/pattern_1",
                 "Nanog": "metacluster_0/pattern_0",
                 "Klf4": "metacluster_0/pattern_0",
                }
In [16]:
tasks = ["Oct4", "Sox2", "Nanog", "Klf4"]
In [49]:
patterns = {}
stacked_seqlets = {}
stacked_seqlets_count = {}
for tf, pname in tqdm(per_tf_motifs.items()):
    print(tf)
    mr = ModiscoResult(per_tf_modisco_dirs[tf] / 'modisco.h5')
    mr.open()
    seqlets = mr._get_seqlets(per_tf_motifs[tf])
    with kipoi.utils.cd(per_tf_modisco_dirs[tf]):
        imp_scores = ImpScoreFile.from_modisco_dir(per_tf_modisco_dirs[tf])
    p = mr.get_pattern(per_tf_motifs[tf])
    # Extract all the importance scores
    simp = imp_scores.extract(seqlets, profile_width=70)
    # p.attrs['stacked_seqlet_imp'] = simp
    stacked_seqlets[tf] = simp
    simp_count = imp_scores.extract(seqlets, profile_width=70, pred_summary='count')
    # p.attrs['stacked_seqlet_imp'] = simp
    stacked_seqlets_count[tf] = simp_count
    patterns[tf] = p
  0%|          | 0/4 [00:00<?, ?it/s]
Oct4
 25%|██▌       | 1/4 [03:19<09:58, 199.60s/it]
Sox2
 50%|█████     | 2/4 [06:28<06:32, 196.38s/it]
Nanog
 75%|███████▌  | 3/4 [09:42<03:15, 195.74s/it]
Klf4
100%|██████████| 4/4 [13:07<00:00, 198.53s/it]
In [15]:
tasks = ["Oct4", "Sox2", "Nanog", "Klf4"]
In [51]:
dfa = pd.concat([pd.DataFrame({t: stacked_seqlets[tf].contrib[t][:,slice(*stacked_seqlets[tf].aggregate()._trim_seq_ic_ij(0.08))].mean(axis=(1,2)) 
                               for t in tasks}).assign(motif=tf)
                      for tf in tasks], axis=0)
dfm = dfa.melt(id_vars=['motif'], var_name='tf', value_name='importance')
In [52]:
dfasimp = pd.concat([pd.DataFrame({t: stacked_seqlets_count[tf].contrib[t][:,slice(*stacked_seqlets_count[tf].aggregate()._trim_seq_ic_ij(0.08))].mean(axis=(1,2)) 
                               for t in tasks}).assign(motif=tf)
                      for tf in tasks], axis=0)
dfmsimp = dfasimp.melt(id_vars=['motif'], var_name='tf', value_name='count_importance')
In [53]:
dfac = pd.concat([pd.DataFrame({t: stacked_seqlets[tf].profile[t].mean(axis=(1,2)) 
                               for t in tasks}).assign(motif=tf)
                      for tf in tasks], axis=0)
dfmc = dfac.melt(id_vars=['motif'], var_name='tf', value_name='counts')
dfmc['log10_counts'] = np.log10(dfmc['counts'])

Motifs

In [54]:
for t in tasks:
    patterns[t].trim_seq_ic(0.08).plot("seq_ic")

Boxplots

Profile importance

In [55]:
plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='importance', color='tf'), data=dfm) + 
 geom_jitter(alpha=0.01, shape='.', size=.5) +
 scale_color_brewer('qual', palette=3) +
 geom_boxplot() + theme_bw() + 
 facet_grid(".~motif"))
Out[55]:
<ggplot: (8782262194247)>

Count importance

In [61]:
plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='count_importance', color='tf'), data=dfmsimp) + 
 geom_jitter(alpha=0.01, shape='.', size=.5) +
 scale_color_brewer('qual', palette=3) +
 geom_boxplot() + theme_bw() + 
 facet_grid(".~motif"))
Out[61]:
<ggplot: (8782265904076)>

Profile counts (70bp)

In [62]:
plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='log10_counts', color='tf'), data=dfmc) + 
 geom_jitter(alpha=0.01, shape='.', size=.5) +
 scale_color_brewer('qual', palette=3) +
 geom_boxplot() + theme_bw() + 
 facet_grid(".~motif"))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:360: UserWarning: stat_boxplot : Removed 445 rows containing non-finite values.
  data = self.stat.compute_layer(data, params, layout)
Out[62]:
<ggplot: (-9223363254592690621)>

Pairwise scatterplots

Profile importance

In [64]:
for task in tasks:
    g = sns.pairplot(dfa[dfa.motif == task][tasks], diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
    g.fig.suptitle("motif = " + task)    

Count importance

In [65]:
for task in tasks:
    g = sns.pairplot(dfasimp[dfasimp.motif == task][tasks], diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
    g.fig.suptitle("motif = " + task)    

Profile counts (70bp)

In [66]:
for task in tasks:
    g = sns.pairplot(np.log10(dfac[dfac.motif == task][tasks]), diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
    g.fig.suptitle("motif = " + task)    

Heatmaps

Profile importance

In [ ]:
for task in tasks:
    g = sns.clustermap(dfa[dfa.motif == task][tasks], cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
    g.fig.suptitle(task) 

Count importance

In [ ]:
for task in tasks:
    g = sns.clustermap(dfasimp[dfasimp.motif == task][tasks], cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
    g.fig.suptitle(task) 

Profile counts (70bp)

In [ ]:
for task in tasks:
    g = sns.clustermap(np.log10(1+dfac[dfac.motif == task][tasks]), cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
    g.fig.suptitle(task)