Tasks¶

[X] scatterplot / parallel components / boxplot the importance scores (count + profile) for all the 4 factors on per-TF run

Open tasks¶

[ ] add the count importance scores

Notes¶

For all seqlet locations for each TF I plotted:
- importance scores at the trimmed motif region
- counts at the profile (70bp)

# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')

Using TensorFlow backend.

%matplotlib inline
paper_config()

from basepair.cli.imp_score import ImpScoreFile
import basepair
import kipoi
from basepair.modisco.core import StackedSeqletImp, shuffle_seqlets

# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")

Decide which TF to take from where.

# Config
per_tf_modisco_dirs = {tf: Path(f"{model_dir}/modisco/by_peak_tasks/per-tf/{tf}/") for tf in ["Oct4", "Sox2", "Nanog", "Klf4"]}
per_tf_motifs = {"Oct4": "metacluster_0/pattern_0",
                 "Sox2": "metacluster_0/pattern_1",
                 "Nanog": "metacluster_0/pattern_0",
                 "Klf4": "metacluster_0/pattern_0",
                }

tasks = ["Oct4", "Sox2", "Nanog", "Klf4"]

patterns = {}
stacked_seqlets = {}
stacked_seqlets_count = {}
for tf, pname in tqdm(per_tf_motifs.items()):
    print(tf)
    mr = ModiscoResult(per_tf_modisco_dirs[tf] / 'modisco.h5')
    mr.open()
    seqlets = mr._get_seqlets(per_tf_motifs[tf])
    with kipoi.utils.cd(per_tf_modisco_dirs[tf]):
        imp_scores = ImpScoreFile.from_modisco_dir(per_tf_modisco_dirs[tf])
    p = mr.get_pattern(per_tf_motifs[tf])
    # Extract all the importance scores
    simp = imp_scores.extract(seqlets, profile_width=70)
    # p.attrs['stacked_seqlet_imp'] = simp
    stacked_seqlets[tf] = simp
    simp_count = imp_scores.extract(seqlets, profile_width=70, pred_summary='count')
    # p.attrs['stacked_seqlet_imp'] = simp
    stacked_seqlets_count[tf] = simp_count
    patterns[tf] = p

  0%|          | 0/4 [00:00<?, ?it/s]

Oct4

 25%|██▌       | 1/4 [03:19<09:58, 199.60s/it]

Sox2

 50%|█████     | 2/4 [06:28<06:32, 196.38s/it]

Nanog

 75%|███████▌  | 3/4 [09:42<03:15, 195.74s/it]

Klf4

100%|██████████| 4/4 [13:07<00:00, 198.53s/it]

tasks = ["Oct4", "Sox2", "Nanog", "Klf4"]

dfa = pd.concat([pd.DataFrame({t: stacked_seqlets[tf].contrib[t][:,slice(*stacked_seqlets[tf].aggregate()._trim_seq_ic_ij(0.08))].mean(axis=(1,2)) 
                               for t in tasks}).assign(motif=tf)
                      for tf in tasks], axis=0)
dfm = dfa.melt(id_vars=['motif'], var_name='tf', value_name='importance')

dfasimp = pd.concat([pd.DataFrame({t: stacked_seqlets_count[tf].contrib[t][:,slice(*stacked_seqlets_count[tf].aggregate()._trim_seq_ic_ij(0.08))].mean(axis=(1,2)) 
                               for t in tasks}).assign(motif=tf)
                      for tf in tasks], axis=0)
dfmsimp = dfasimp.melt(id_vars=['motif'], var_name='tf', value_name='count_importance')

dfac = pd.concat([pd.DataFrame({t: stacked_seqlets[tf].profile[t].mean(axis=(1,2)) 
                               for t in tasks}).assign(motif=tf)
                      for tf in tasks], axis=0)
dfmc = dfac.melt(id_vars=['motif'], var_name='tf', value_name='counts')
dfmc['log10_counts'] = np.log10(dfmc['counts'])

Motifs¶

for t in tasks:
    patterns[t].trim_seq_ic(0.08).plot("seq_ic")

Boxplots¶

Profile importance¶

plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='importance', color='tf'), data=dfm) + 
 geom_jitter(alpha=0.01, shape='.', size=.5) +
 scale_color_brewer('qual', palette=3) +
 geom_boxplot() + theme_bw() + 
 facet_grid(".~motif"))

<ggplot: (8782262194247)>

Count importance¶

plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='count_importance', color='tf'), data=dfmsimp) + 
 geom_jitter(alpha=0.01, shape='.', size=.5) +
 scale_color_brewer('qual', palette=3) +
 geom_boxplot() + theme_bw() + 
 facet_grid(".~motif"))

<ggplot: (8782265904076)>

Profile counts (70bp)¶

plotnine.options.figure_size = (10, 3)
(ggplot(aes(x='tf', y='log10_counts', color='tf'), data=dfmc) + 
 geom_jitter(alpha=0.01, shape='.', size=.5) +
 scale_color_brewer('qual', palette=3) +
 geom_boxplot() + theme_bw() + 
 facet_grid(".~motif"))

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/layer.py:360: UserWarning: stat_boxplot : Removed 445 rows containing non-finite values.
  data = self.stat.compute_layer(data, params, layout)

<ggplot: (-9223363254592690621)>

Pairwise scatterplots¶

Profile importance¶

for task in tasks:
    g = sns.pairplot(dfa[dfa.motif == task][tasks], diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
    g.fig.suptitle("motif = " + task)

Count importance¶

for task in tasks:
    g = sns.pairplot(dfasimp[dfasimp.motif == task][tasks], diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
    g.fig.suptitle("motif = " + task)

Profile counts (70bp)¶

for task in tasks:
    g = sns.pairplot(np.log10(dfac[dfac.motif == task][tasks]), diag_kind="kde", plot_kws=dict(s=3, alpha=0.3, marker='.', linewidth=0), height=1.5)
    g.fig.suptitle("motif = " + task)

Heatmaps¶

Profile importance¶

for task in tasks:
    g = sns.clustermap(dfa[dfa.motif == task][tasks], cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
    g.fig.suptitle(task)

Count importance¶

for task in tasks:
    g = sns.clustermap(dfasimp[dfasimp.motif == task][tasks], cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
    g.fig.suptitle(task)

Profile counts (70bp)¶

for task in tasks:
    g = sns.clustermap(np.log10(1+dfac[dfac.motif == task][tasks]), cmap='Blues', col_cluster=False, figsize=get_figsize(.5, aspect=1))
    g.fig.suptitle(task)