In [1]:
import matplotlib
import matplotlib.pyplot as plt
plt.ioff()
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/")
from modisco_results_utils import *
from reports import *
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap

import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import argparse
from reports import SaveAllModiscoResults
import vdom.helpers as vdomh
from vdom.helpers import h1, p, img, div, b
import sys
sys.path.append('/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils')
from generalized_sequence_alignment import *
from plot_utils import *
import h5py 
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from util import figure_to_vdom_image

    # Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/oak/stanford/groups/akundaje/manyu/utils/fonts/")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/modisco_results_utils.py:32: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  font_manager.findSystemFonts(fontpaths="/oak/stanford/groups/akundaje/manyu/utils/fonts/")
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:47: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
In [2]:
# %load_ext autoreload
# %autoreload 2

Fetch Environment variables

In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf_dataset = os.environ["TF_DATASET"]
tf_true_name = os.environ["TF_TRUE_NAME"]
savedir = os.environ["SAVEDIR"]

Reports on TF motifs and subcluster diversity:

In [4]:
ob = SaveAllModiscoResults(tf=tf_dataset,dataset=dataset,savedir=savedir)
Loading shap scores 

Loading tfm object 

Motifs and subpattern diversity

In [5]:
ob.save_main_patterns()
Saving main patterns
Saving pattern: 0
Saving pattern: 1
Saving pattern: 2
Saving pattern: 3
Saving pattern: 4
Saving pattern: 5
In [6]:
ob.plot_motif_heterogeneity()

Pattern 1/6

/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/reports.py:642: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  emb_fig, ax = plt.subplots()
SubpatternSeqletsEmbeddingsCWMPFM
Agg.7989
01316
11212
21124
3931
4596
5569
6526
7477
8476
9432
10208
1172
1243
137

Pattern 2/6

SubpatternSeqletsEmbeddingsCWMPFM
Agg.263
074
153
247
339
429
512
65
74

Pattern 3/6

SubpatternSeqletsEmbeddingsCWMPFM
Agg.221
063
140
238
328
417
516
610
79

Pattern 4/6

SubpatternSeqletsEmbeddingsCWMPFM
Agg.159
048
137
230
323
416
55

Pattern 5/6

SubpatternSeqletsEmbeddingsCWMPFM
Agg.94
034
127
223
310

Pattern 6/6

SubpatternSeqletsEmbeddingsCWMPFM
Agg.84
031
131
216
36

Profile heatmaps and motif distribution around peak summits

In [7]:
ob.save_profiles_and_seqlet_histograms()
Pattern 1
Total Seqlets 7989
Pattern 2
Total Seqlets 263
Pattern 3
Total Seqlets 221
Pattern 4
Total Seqlets 159
Pattern 5
Total Seqlets 94
Pattern 6
Total Seqlets 84

Display B1H - aligned motifs:

In [8]:
def generate_b1h_aligned_table(dataset,tf_name,tf_true_name):
    

    plt.ioff()
    results_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
    motifs_saved_file = '{}/{}/{}/main_patterns/all_motifs.h5'.format(results_base,dataset,tf_name)
    filtered_motif_file = '{}/{}/{}/filtered_patterns_list.txt'.format(results_base,dataset,tf_name)
    assert(os.path.exists(filtered_motif_file))
    assert(os.path.exists(motifs_saved_file))
    
    alignments_dir = '{}/{}/{}/b1h_aligments'.format(results_base,dataset,tf_name)
    aligned_results_savefile  = '{}/b1h_alignment.h5'.format(alignments_dir)
    
    header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Pattern number:", style={"text-align": "center"}),
        vdomh.th("n-Seqlets", style={"text-align": "center"}),
        vdomh.th("Alignment", style={"text-align": "center"})

    )
)
    
    f = h5py.File(aligned_results_savefile,'r')
    g = h5py.File(motifs_saved_file,'r')
    
    table_rows = []
    for motif_idx in f.keys():
        b1h_aligned = f[motif_idx]['b1h_aligned'].value 
        pfm_aligned = f[motif_idx]['pfm_aligned'].value
        n_seqlets = g[motif_idx]['n_seqlets'].value
        fig,ax = plot_motifs_list([b1h_aligned,pfm_aligned],ic_scale=True)
        table_idx = int(motif_idx)+1
        table_rows.append(vdomh.tr(
            vdomh.td(str(table_idx)),
            vdomh.td(str(n_seqlets)),
            vdomh.td(figure_to_vdom_image(fig))
        ))
    motif_table = vdomh.table(header,vdomh.tbody(*table_rows))
    display(motif_table)
    
    f.close()
    g.close()
    
In [9]:
generate_b1h_aligned_table(dataset,tf_dataset,tf_true_name)
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:28: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:29: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:30: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
Pattern number:n-SeqletsAlignment
17989
2263
4159
In [ ]: