Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR904JEY'
# dataset = 'ENCODE_new'
In [5]:
vdomh.h1('Running {} in dataset: {}'.format(tf,dataset))
Out[5]:

Running ENCSR011PEI in dataset: ENCODE_new

In [6]:
a = ReportGenerator(tf=tf,dataset=dataset)

Show motifs and submotifs and relevant embeddings

In [7]:
a.show_subcluster_table()

Pattern 1/29

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.1205
0300
1207
2178
3130
498
596
693
764
839

Pattern 2/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1126
0210
1167
2152
3140
497
596
695
784
873
912

Pattern 3/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1026
0264
1167
2149
3141
4116
561
655
752
812
97
102

Pattern 4/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.895
0168
1114
2109
3106
494
587
679
761
841
936

Pattern 5/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.687
0116
1112
289
373
472
571
667
741
827
919

Pattern 6/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.624
093
188
277
374
473
544
640
734
834
925
1018
1114
1210

Pattern 7/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.534
0107
186
283
373
468
567
624
710
810
94
102

Pattern 8/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.536
083
182
269
359
457
544
642
740
830
926
104

Pattern 9/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.432
079
170
256
349
446
535
633
729
813
911
106
115

Pattern 10/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.408
099
183
276
363
443
540
64

Pattern 11/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.260
058
144
242
340
429
529
66
76
86

Pattern 12/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.258
055
155
246
327
419
516
615
715
84
92
102
112

Pattern 13/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.236
053
149
236
327
426
515
613
712
85

Pattern 14/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.273
059
156
256
332
419
518
617
79
87

Pattern 15/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.213
071
156
252
321
45
54
62
72

Pattern 16/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.198
052
145
231
329
423
516
62

Pattern 17/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.155
041
136
226
322
417
57
64
72

Pattern 18/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.149
045
130
225
322
416
59
62

Pattern 19/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.120
039
136
230
39
42
52
62

Pattern 20/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.124
051
138
214
36
45
54
64
72

Pattern 21/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.114
024
120
218
315
414
513
68
72

Pattern 22/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.112
022
121
219
317
416
513
62
72

Pattern 23/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.125
028
125
222
320
415
512
63

Pattern 24/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.99
023
118
216
316
415
511

Pattern 25/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.97
018
118
215
315
414
511
62
72
82

Pattern 26/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.66
024
121
213
38

Pattern 27/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.52
019
117
212
32
42

Pattern 28/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.79
025
117
212
312
47
56

Pattern 29/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.31
031

Show footprints and motif distributions within peaks

In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/29 Footprint

Pattern 1/29 Motif Distance Distribution

Pattern 2/29 Footprint

Pattern 2/29 Motif Distance Distribution

Pattern 3/29 Footprint

Pattern 3/29 Motif Distance Distribution

Pattern 4/29 Footprint

Pattern 4/29 Motif Distance Distribution

Pattern 5/29 Footprint

Pattern 5/29 Motif Distance Distribution

Pattern 6/29 Footprint

Pattern 6/29 Motif Distance Distribution

Pattern 7/29 Footprint

Pattern 7/29 Motif Distance Distribution

Pattern 8/29 Footprint

Pattern 8/29 Motif Distance Distribution

Pattern 9/29 Footprint

Pattern 9/29 Motif Distance Distribution

Pattern 10/29 Footprint

Pattern 10/29 Motif Distance Distribution

Pattern 11/29 Footprint

Pattern 11/29 Motif Distance Distribution

Pattern 12/29 Footprint

Pattern 12/29 Motif Distance Distribution

Pattern 13/29 Footprint

Pattern 13/29 Motif Distance Distribution

Pattern 14/29 Footprint

Pattern 14/29 Motif Distance Distribution

Pattern 15/29 Footprint

Pattern 15/29 Motif Distance Distribution

Pattern 16/29 Footprint

Pattern 16/29 Motif Distance Distribution

Pattern 17/29 Footprint

Pattern 17/29 Motif Distance Distribution

Pattern 18/29 Footprint

Pattern 18/29 Motif Distance Distribution

Pattern 19/29 Footprint

Pattern 19/29 Motif Distance Distribution

Pattern 20/29 Footprint

Pattern 20/29 Motif Distance Distribution

Pattern 21/29 Footprint

Pattern 21/29 Motif Distance Distribution

Pattern 22/29 Footprint

Pattern 22/29 Motif Distance Distribution

Pattern 23/29 Footprint

Pattern 23/29 Motif Distance Distribution

Pattern 24/29 Footprint

Pattern 24/29 Motif Distance Distribution

Pattern 25/29 Footprint

Pattern 25/29 Motif Distance Distribution

Pattern 26/29 Footprint

Pattern 26/29 Motif Distance Distribution

Pattern 27/29 Footprint

Pattern 27/29 Motif Distance Distribution

Pattern 28/29 Footprint

Pattern 28/29 Motif Distance Distribution

Pattern 29/29 Footprint

Pattern 29/29 Motif Distance Distribution

Show B1H alignments of main motifs

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
11205
21126
31026
4895
5687
6624
7534
8536
9432
10408
11260
12258
13236
14273
15213
16198
17155
18149
19120
20124
21114
22112
23125
2499
2597
2666
2752
2879
2931
In [10]: