In [1]:
import matplotlib
import matplotlib.pyplot as plt
plt.ioff()
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/")
from modisco_results_utils import *
from reports import *
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap

import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import argparse
from reports import SaveAllModiscoResults
import vdom.helpers as vdomh
from vdom.helpers import h1, p, img, div, b
import sys
sys.path.append('/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils')
from generalized_sequence_alignment import *
from plot_utils import *
import h5py 
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from util import figure_to_vdom_image

    # Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/oak/stanford/groups/akundaje/manyu/utils/fonts/")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/modisco_results_utils.py:32: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  font_manager.findSystemFonts(fontpaths="/oak/stanford/groups/akundaje/manyu/utils/fonts/")
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:47: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
In [2]:
# %load_ext autoreload
# %autoreload 2

Fetch Environment variables

In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf_dataset = os.environ["TF_DATASET"]
tf_true_name = os.environ["TF_TRUE_NAME"]
savedir = os.environ["SAVEDIR"]

Reports on TF motifs and subcluster diversity:

In [4]:
ob = SaveAllModiscoResults(tf=tf_dataset,dataset=dataset,savedir=savedir)
Loading shap scores 

Loading tfm object 

Motifs and subpattern diversity

In [5]:
ob.save_main_patterns()
Saving main patterns
Saving pattern: 0
Saving pattern: 1
Saving pattern: 2
Saving pattern: 3
Saving pattern: 4
Saving pattern: 5
Saving pattern: 6
Saving pattern: 7
In [6]:
ob.plot_motif_heterogeneity()

Pattern 1/8

/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils/reports.py:642: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  emb_fig, ax = plt.subplots()
SubpatternSeqletsEmbeddingsCWMPFM
Agg.5081
0862
1776
2739
3693
4632
5557
6501
7296
825

Pattern 2/8

SubpatternSeqletsEmbeddingsCWMPFM
Agg.2405
0608
1591
2571
3570
459
53
63

Pattern 3/8

SubpatternSeqletsEmbeddingsCWMPFM
Agg.394
0113
1112
264
360
437
55
63

Pattern 4/8

SubpatternSeqletsEmbeddingsCWMPFM
Agg.83
033
118
316
216

Pattern 5/8

SubpatternSeqletsEmbeddingsCWMPFM
Agg.65
029
128
28

Pattern 6/8

SubpatternSeqletsEmbeddingsCWMPFM
Agg.49
017
114
213
35

Pattern 7/8

SubpatternSeqletsEmbeddingsCWMPFM
Agg.46
017
116
27
36

Pattern 8/8

SubpatternSeqletsEmbeddingsCWMPFM
Agg.42
017
112
27
36

Profile heatmaps and motif distribution around peak summits

In [7]:
ob.save_profiles_and_seqlet_histograms()
Pattern 1
Total Seqlets 5081
Pattern 2
Total Seqlets 2405
Pattern 3
Total Seqlets 394
Pattern 4
Total Seqlets 83
Pattern 5
Total Seqlets 65
Pattern 6
Total Seqlets 49
Pattern 7
Total Seqlets 46
Pattern 8
Total Seqlets 42

Display B1H - aligned motifs:

In [8]:
def generate_b1h_aligned_table(dataset,tf_name,tf_true_name):
    

    plt.ioff()
    results_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
    motifs_saved_file = '{}/{}/{}/main_patterns/all_motifs.h5'.format(results_base,dataset,tf_name)
    filtered_motif_file = '{}/{}/{}/filtered_patterns_list.txt'.format(results_base,dataset,tf_name)
    assert(os.path.exists(filtered_motif_file))
    assert(os.path.exists(motifs_saved_file))
    
    alignments_dir = '{}/{}/{}/b1h_aligments'.format(results_base,dataset,tf_name)
    aligned_results_savefile  = '{}/b1h_alignment.h5'.format(alignments_dir)
    
    header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Pattern number:", style={"text-align": "center"}),
        vdomh.th("n-Seqlets", style={"text-align": "center"}),
        vdomh.th("Alignment", style={"text-align": "center"})

    )
)
    
    f = h5py.File(aligned_results_savefile,'r')
    g = h5py.File(motifs_saved_file,'r')
    
    table_rows = []
    for motif_idx in f.keys():
        b1h_aligned = f[motif_idx]['b1h_aligned'].value 
        pfm_aligned = f[motif_idx]['pfm_aligned'].value
        n_seqlets = g[motif_idx]['n_seqlets'].value
        fig,ax = plot_motifs_list([b1h_aligned,pfm_aligned],ic_scale=True)
        table_idx = int(motif_idx)+1
        table_rows.append(vdomh.tr(
            vdomh.td(str(table_idx)),
            vdomh.td(str(n_seqlets)),
            vdomh.td(figure_to_vdom_image(fig))
        ))
    motif_table = vdomh.table(header,vdomh.tbody(*table_rows))
    display(motif_table)
    
    f.close()
    g.close()
    
In [9]:
generate_b1h_aligned_table(dataset,tf_dataset,tf_true_name)
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:28: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:29: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/oak/stanford/groups/akundaje/manyu/anaconda3/envs/basepairmodels_py3.6.8/lib/python3.6/site-packages/ipykernel_launcher.py:30: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
Pattern number:n-SeqletsAlignment
22405
3394
In [ ]: