Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR904JEY'
# dataset = 'ENCODE_new'
In [5]:
vdomh.h1('Running {} in dataset: {}'.format(tf,dataset))
Out[5]:

Running ENCSR149ZBI in dataset: ENCODE_new

In [6]:
a = ReportGenerator(tf=tf,dataset=dataset)

Show motifs and submotifs and relevant embeddings

In [7]:
a.show_subcluster_table()

Pattern 1/35

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.1543
0239
1186
2185
3180
4172
5158
6121
793
891
955
1055
116
122

Pattern 2/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1216
0309
1182
2174
3108
495
592
687
767
849
935
108
114
123
133

Pattern 3/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.878
0237
1184
2142
3137
4127
548
63

Pattern 4/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.521
092
177
263
351
449
543
639
738
834
923
108
114

Pattern 5/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.445
065
164
256
349
446
543
643
736
830
913

Pattern 6/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.351
048
147
243
339
438
538
635
734
829

Pattern 7/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.311
060
156
252
348
436
532
627

Pattern 8/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.281
054
144
242
338
428
528
623
721
83

Pattern 9/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.251
053
140
234
331
424
523
621
718
85
92

Pattern 10/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.212
058
148
235
328
410
510
68
78
84
93

Pattern 11/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.211
047
143
230
323
422
518
616
712

Pattern 12/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.165
074
162
217
312

Pattern 13/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.157
035
123
222
320
419
516
613
79

Pattern 14/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.151
028
128
220
317
415
515
615
713

Pattern 15/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.125
022
122
221
319
416
515
68
72

Pattern 16/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.101
030
130
225
316

Pattern 17/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.150
046
126
225
324
420
59

Pattern 18/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.86
030
124
224
35
43

Pattern 19/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.74
025
119
215
315

Pattern 20/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.73
021
116
216
313
43
52
62

Pattern 21/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.83
022
122
215
315
45
52
62

Pattern 22/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.77
021
116
212
310
410
58

Pattern 23/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.74
015
114
212
311
411
511

Pattern 24/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.56
016
115
215
38
42

Pattern 25/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.52
018
118
216

Pattern 26/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.43
020
119
24

Pattern 27/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.52
017
114
214
37

Pattern 28/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.39
027
112

Pattern 29/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.40
018
110
29
33

Pattern 30/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.42
026
18
28

Pattern 31/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.38
017
116
25

Pattern 32/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.46
018
118
26
34

Pattern 33/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.41
019
113
29

Pattern 34/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.37
012
112
210
33

Pattern 35/35

SubpatternSeqletsEmbeddingsCWMPFM
Agg.42
015
110
29
38

Show footprints and motif distributions within peaks

In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/35 Footprint

Pattern 1/35 Motif Distance Distribution

Pattern 2/35 Footprint

Pattern 2/35 Motif Distance Distribution

Pattern 3/35 Footprint

Pattern 3/35 Motif Distance Distribution

Pattern 4/35 Footprint

Pattern 4/35 Motif Distance Distribution

Pattern 5/35 Footprint

Pattern 5/35 Motif Distance Distribution

Pattern 6/35 Footprint

Pattern 6/35 Motif Distance Distribution

Pattern 7/35 Footprint

Pattern 7/35 Motif Distance Distribution

Pattern 8/35 Footprint

Pattern 8/35 Motif Distance Distribution

Pattern 9/35 Footprint

Pattern 9/35 Motif Distance Distribution

Pattern 10/35 Footprint

Pattern 10/35 Motif Distance Distribution

Pattern 11/35 Footprint

Pattern 11/35 Motif Distance Distribution

Pattern 12/35 Footprint

Pattern 12/35 Motif Distance Distribution

Pattern 13/35 Footprint

Pattern 13/35 Motif Distance Distribution

Pattern 14/35 Footprint

Pattern 14/35 Motif Distance Distribution

Pattern 15/35 Footprint

Pattern 15/35 Motif Distance Distribution

Pattern 16/35 Footprint

Pattern 16/35 Motif Distance Distribution

Pattern 17/35 Footprint

Pattern 17/35 Motif Distance Distribution

Pattern 18/35 Footprint

Pattern 18/35 Motif Distance Distribution

Pattern 19/35 Footprint

Pattern 19/35 Motif Distance Distribution

Pattern 20/35 Footprint

Pattern 20/35 Motif Distance Distribution

Pattern 21/35 Footprint

Pattern 21/35 Motif Distance Distribution

Pattern 22/35 Footprint

Pattern 22/35 Motif Distance Distribution

Pattern 23/35 Footprint

Pattern 23/35 Motif Distance Distribution

Pattern 24/35 Footprint

Pattern 24/35 Motif Distance Distribution

Pattern 25/35 Footprint

Pattern 25/35 Motif Distance Distribution

Pattern 26/35 Footprint

Pattern 26/35 Motif Distance Distribution

Pattern 27/35 Footprint

Pattern 27/35 Motif Distance Distribution

Pattern 28/35 Footprint

Pattern 28/35 Motif Distance Distribution

Pattern 29/35 Footprint

Pattern 29/35 Motif Distance Distribution

Pattern 30/35 Footprint

Pattern 30/35 Motif Distance Distribution

Pattern 31/35 Footprint

Pattern 31/35 Motif Distance Distribution

Pattern 32/35 Footprint

Pattern 32/35 Motif Distance Distribution

Pattern 33/35 Footprint

Pattern 33/35 Motif Distance Distribution

Pattern 34/35 Footprint

Pattern 34/35 Motif Distance Distribution

Pattern 35/35 Footprint

Pattern 35/35 Motif Distance Distribution

Show B1H alignments of main motifs

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
11543
21216
3878
4521
5445
6351
7311
8281
9251
10212
11211
12165
13157
14151
15125
16101
17150
1886
1974
2073
2183
2277
2374
2456
2552
2643
2752
2839
2940
3042
3138
3246
3341
3437
3542
In [10]: