Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR904JEY'
# dataset = 'ENCODE_new'
In [5]:
vdomh.h1('Running {} in dataset: {}'.format(tf,dataset))
Out[5]:

Running ENCSR370NFS in dataset: ENCODE_new

In [6]:
a = ReportGenerator(tf=tf,dataset=dataset)

Show motifs and submotifs and relevant embeddings

In [7]:
a.show_subcluster_table()

Pattern 1/38

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.919
0229
1208
2122
3103
468
565
659
740
817
94
102
112

Pattern 2/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.381
097
164
258
345
433
525
620
719
89
94
103
112
122

Pattern 3/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.285
062
147
243
337
434
532
612
77
86
95

Pattern 4/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.252
080
150
239
332
415
513
67
76
84
94
102

Pattern 5/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.258
056
144
234
330
421
519
611
79
86
95
105
114
124
133
143
152
162

Pattern 6/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.176
050
147
245
319
47
56
62

Pattern 7/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.167
041
140
231
328
414
57
63
73

Pattern 8/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.170
039
139
231
325
415
514
64
73

Pattern 9/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.161
049
148
213
312
412
511
68
76
82

Pattern 10/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.144
046
140
219
317
415
55
62

Pattern 11/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.147
050
144
234
314
43
52

Pattern 12/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.138
032
132
226
322
413
57
64
72

Pattern 13/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.102
024
122
222
320
46
56
62

Pattern 14/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.103
032
123
220
310
47
57
62
72

Pattern 15/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.101
027
120
215
314
411
58
66

Pattern 16/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.120
030
124
220
315
414
513
64

Pattern 17/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.91
031
122
217
310
46
53
62

Pattern 18/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.121
028
126
225
313
49
59
69
72

Pattern 19/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.84
026
121
220
310
43
52
62

Pattern 20/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.97
033
131
220
39
44

Pattern 21/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.114
028
125
221
312
410
56
66
74
82

Pattern 22/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.56
017
116
211
37
45

Pattern 23/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.70
017
116
210
310
49
56
62

Pattern 24/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.62
023
119
216
34

Pattern 25/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.108
023
123
221
316
49
56
65
75

Pattern 26/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.109
038
137
227
34
43

Pattern 27/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.68
023
117
216
312

Pattern 28/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.56
014
114
210
39
49

Pattern 29/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.61
015
114
211
311
410

Pattern 30/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.64
016
116
211
39
47
53
62

Pattern 31/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.85
024
116
214
310
49
57
63
72

Pattern 32/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.55
023
117
215

Pattern 33/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.47
025
118
24

Pattern 34/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.82
023
121
218
310
47
53

Pattern 35/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.71
039
128
22
32

Pattern 36/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.56
031
114
27
32
42

Pattern 37/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.67
028
120
211
34
44

Pattern 38/38

SubpatternSeqletsEmbeddingsCWMPFM
Agg.73
023
122
213
39
44
52

Show footprints and motif distributions within peaks

In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/38 Footprint

Pattern 1/38 Motif Distance Distribution

Pattern 2/38 Footprint

Pattern 2/38 Motif Distance Distribution

Pattern 3/38 Footprint

Pattern 3/38 Motif Distance Distribution

Pattern 4/38 Footprint

Pattern 4/38 Motif Distance Distribution

Pattern 5/38 Footprint

Pattern 5/38 Motif Distance Distribution

Pattern 6/38 Footprint

Pattern 6/38 Motif Distance Distribution

Pattern 7/38 Footprint

Pattern 7/38 Motif Distance Distribution

Pattern 8/38 Footprint

Pattern 8/38 Motif Distance Distribution

Pattern 9/38 Footprint

Pattern 9/38 Motif Distance Distribution

Pattern 10/38 Footprint

Pattern 10/38 Motif Distance Distribution

Pattern 11/38 Footprint

Pattern 11/38 Motif Distance Distribution

Pattern 12/38 Footprint

Pattern 12/38 Motif Distance Distribution

Pattern 13/38 Footprint

Pattern 13/38 Motif Distance Distribution

Pattern 14/38 Footprint

Pattern 14/38 Motif Distance Distribution

Pattern 15/38 Footprint

Pattern 15/38 Motif Distance Distribution

Pattern 16/38 Footprint

Pattern 16/38 Motif Distance Distribution

Pattern 17/38 Footprint

Pattern 17/38 Motif Distance Distribution

Pattern 18/38 Footprint

Pattern 18/38 Motif Distance Distribution

Pattern 19/38 Footprint

Pattern 19/38 Motif Distance Distribution

Pattern 20/38 Footprint

Pattern 20/38 Motif Distance Distribution

Pattern 21/38 Footprint

Pattern 21/38 Motif Distance Distribution

Pattern 22/38 Footprint

Pattern 22/38 Motif Distance Distribution

Pattern 23/38 Footprint

Pattern 23/38 Motif Distance Distribution

Pattern 24/38 Footprint

Pattern 24/38 Motif Distance Distribution

Pattern 25/38 Footprint

Pattern 25/38 Motif Distance Distribution

Pattern 26/38 Footprint

Pattern 26/38 Motif Distance Distribution

Pattern 27/38 Footprint

Pattern 27/38 Motif Distance Distribution

Pattern 28/38 Footprint

Pattern 28/38 Motif Distance Distribution

Pattern 29/38 Footprint

Pattern 29/38 Motif Distance Distribution

Pattern 30/38 Footprint

Pattern 30/38 Motif Distance Distribution

Pattern 31/38 Footprint

Pattern 31/38 Motif Distance Distribution

Pattern 32/38 Footprint

Pattern 32/38 Motif Distance Distribution

Pattern 33/38 Footprint

Pattern 33/38 Motif Distance Distribution

Pattern 34/38 Footprint

Pattern 34/38 Motif Distance Distribution

Pattern 35/38 Footprint

Pattern 35/38 Motif Distance Distribution

Pattern 36/38 Footprint

Pattern 36/38 Motif Distance Distribution

Pattern 37/38 Footprint

Pattern 37/38 Motif Distance Distribution

Pattern 38/38 Footprint

Pattern 38/38 Motif Distance Distribution

Show B1H alignments of main motifs

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
1919
2381
3285
4252
5258
6176
7167
8170
9161
10144
11147
12138
13102
14103
15101
16120
1791
18121
1984
2097
21114
2256
2370
2462
25108
26109
2768
2856
2961
3064
3185
3255
3347
3482
3571
3656
3767
3873
In [10]: