In [1]:
import matplotlib
import matplotlib.pyplot as plt
plt.ioff()
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/")
from modisco_results_utils import *
from reports import *
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap

import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import argparse
from reports import SaveAllModiscoResults
import vdom.helpers as vdomh
from vdom.helpers import h1, p, img, div, b
import sys
sys.path.append('/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils')
from generalized_sequence_alignment import *
from plot_utils import *
import h5py 
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from util import figure_to_vdom_image

    # Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/oak/stanford/groups/akundaje/manyu/utils/fonts/")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/modisco_results_utils.py:32: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  font_manager.findSystemFonts(fontpaths="/oak/stanford/groups/akundaje/manyu/utils/fonts/")
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:47: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
In [2]:
# %load_ext autoreload
# %autoreload 2

Fetch Environment variables

In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf_dataset = os.environ["TF_DATASET"]
tf_true_name = os.environ["TF_TRUE_NAME"]
savedir = os.environ["SAVEDIR"]

Reports on TF motifs and subcluster diversity:

In [4]:
ob = SaveAllModiscoResults(tf=tf_dataset,dataset=dataset,savedir=savedir)
Loading shap scores 

Loading tfm object 

Motifs and subpattern diversity

In [5]:
ob.save_main_patterns()
Saving main patterns
Saving pattern: 0
Saving pattern: 1
Saving pattern: 2
Saving pattern: 3
Saving pattern: 4
In [6]:
ob.plot_motif_heterogeneity()

Pattern 1/5

/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/reports.py:642: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  emb_fig, ax = plt.subplots()
SubpatternSeqletsEmbeddingsCWMPFM
Agg.12972
02933
12745
22372
32038
41969
5322
6302
790
865
964
1021
1113
1212
138
156
166
146

Pattern 2/5

SubpatternSeqletsEmbeddingsCWMPFM
Agg.87
027
126
215
315
44

Pattern 3/5

SubpatternSeqletsEmbeddingsCWMPFM
Agg.75
029
127
219

Pattern 4/5

SubpatternSeqletsEmbeddingsCWMPFM
Agg.58
020
114
213
311

Pattern 5/5

SubpatternSeqletsEmbeddingsCWMPFM
Agg.38
014
212
112

Profile heatmaps and motif distribution around peak summits

In [7]:
ob.save_profiles_and_seqlet_histograms()
Pattern 1
Total Seqlets 12972
Pattern 2
Total Seqlets 87
Pattern 3
Total Seqlets 75
Pattern 4
Total Seqlets 58
Pattern 5
Total Seqlets 38

Display B1H - aligned motifs:

In [8]:
def generate_b1h_aligned_table(dataset,tf_name,tf_true_name):
    

    plt.ioff()
    results_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
    motifs_saved_file = '{}/{}/{}/main_patterns/all_motifs.h5'.format(results_base,dataset,tf_name)
    filtered_motif_file = '{}/{}/{}/filtered_patterns_list.txt'.format(results_base,dataset,tf_name)
    assert(os.path.exists(filtered_motif_file))
    assert(os.path.exists(motifs_saved_file))
    
    alignments_dir = '{}/{}/{}/b1h_aligments'.format(results_base,dataset,tf_name)
    aligned_results_savefile  = '{}/b1h_alignment.h5'.format(alignments_dir)
    
    header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Pattern number:", style={"text-align": "center"}),
        vdomh.th("n-Seqlets", style={"text-align": "center"}),
        vdomh.th("Alignment", style={"text-align": "center"})

    )
)
    
    f = h5py.File(aligned_results_savefile,'r')
    g = h5py.File(motifs_saved_file,'r')
    
    table_rows = []
    for motif_idx in f.keys():
        b1h_aligned = f[motif_idx]['b1h_aligned'].value 
        pfm_aligned = f[motif_idx]['pfm_aligned'].value
        n_seqlets = g[motif_idx]['n_seqlets'].value
        fig,ax = plot_motifs_list([b1h_aligned,pfm_aligned],ic_scale=True)
        table_idx = int(motif_idx)+1
        table_rows.append(vdomh.tr(
            vdomh.td(str(table_idx)),
            vdomh.td(str(n_seqlets)),
            vdomh.td(figure_to_vdom_image(fig))
        ))
    motif_table = vdomh.table(header,vdomh.tbody(*table_rows))
    display(motif_table)
    
    f.close()
    g.close()
    
In [9]:
generate_b1h_aligned_table(dataset,tf_dataset,tf_true_name)
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:28: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:29: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:30: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
Pattern number:n-SeqletsAlignment
112972
In [ ]: