Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR904JEY'
# dataset = 'ENCODE_new'
In [5]:
vdomh.h1('Running {} in dataset: {}'.format(tf,dataset))
Out[5]:

Running ENCSR257XVY in dataset: ENCODE_new

In [6]:
a = ReportGenerator(tf=tf,dataset=dataset)

Show motifs and submotifs and relevant embeddings

In [7]:
a.show_subcluster_table()

Pattern 1/29

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.1089
0163
1159
2156
3147
4136
5135
6122
765
84
92

Pattern 2/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.747
0155
1155
2117
388
487
570
636
720
812
93
102
112

Pattern 3/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.690
0204
1136
2123
365
463
527
624
716
815
97
104
112
122
132

Pattern 4/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.445
065
153
251
348
439
538
635
731
830
926
1023
113
123

Pattern 5/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.416
093
193
274
362
461
529
64

Pattern 6/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.384
078
151
245
340
437
533
627
726
826
913
106
112

Pattern 7/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.316
046
143
240
339
435
533
633
720
815
98
104

Pattern 8/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.267
044
142
239
338
433
531
621
715
84

Pattern 9/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.248
072
143
241
331
428
512
611
710

Pattern 10/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.257
044
142
237
336
427
527
625
710
89

Pattern 11/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.261
036
132
232
331
430
530
623
722
816
99

Pattern 12/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.272
048
143
237
334
431
522
617
716
811
910
103

Pattern 13/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.226
038
134
233
333
431
528
618
75
82
92
102

Pattern 14/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.201
061
159
231
324
414
56
66

Pattern 15/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.206
051
144
238
331
420
519
63

Pattern 16/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.182
034
132
230
326
417
514
68
77
85
95
104

Pattern 17/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.151
042
133
224
320
417
511
64

Pattern 18/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.133
031
127
223
320
419
58
65

Pattern 19/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.135
050
136
230
36
45
54
62
72

Pattern 20/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.96
022
121
220
317
414
52

Pattern 21/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.89
030
118
218
317
46

Pattern 22/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.89
045
133
29
32

Pattern 23/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.62
021
117
216
36
42

Pattern 24/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.76
018
114
212
312
411
54
63
72

Pattern 25/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.67
027
121
29
35
45

Pattern 26/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.55
022
118
215

Pattern 27/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.58
015
112
212
311
48

Pattern 28/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.51
018
118
28
34
43

Pattern 29/29

SubpatternSeqletsEmbeddingsCWMPFM
Agg.63
022
117
214
38
42

Show footprints and motif distributions within peaks

In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/29 Footprint

Pattern 1/29 Motif Distance Distribution

Pattern 2/29 Footprint

Pattern 2/29 Motif Distance Distribution

Pattern 3/29 Footprint

Pattern 3/29 Motif Distance Distribution

Pattern 4/29 Footprint

Pattern 4/29 Motif Distance Distribution

Pattern 5/29 Footprint

Pattern 5/29 Motif Distance Distribution

Pattern 6/29 Footprint

Pattern 6/29 Motif Distance Distribution

Pattern 7/29 Footprint

Pattern 7/29 Motif Distance Distribution

Pattern 8/29 Footprint

Pattern 8/29 Motif Distance Distribution

Pattern 9/29 Footprint

Pattern 9/29 Motif Distance Distribution

Pattern 10/29 Footprint

Pattern 10/29 Motif Distance Distribution

Pattern 11/29 Footprint

Pattern 11/29 Motif Distance Distribution

Pattern 12/29 Footprint

Pattern 12/29 Motif Distance Distribution

Pattern 13/29 Footprint

Pattern 13/29 Motif Distance Distribution

Pattern 14/29 Footprint

Pattern 14/29 Motif Distance Distribution

Pattern 15/29 Footprint

Pattern 15/29 Motif Distance Distribution

Pattern 16/29 Footprint

Pattern 16/29 Motif Distance Distribution

Pattern 17/29 Footprint

Pattern 17/29 Motif Distance Distribution

Pattern 18/29 Footprint

Pattern 18/29 Motif Distance Distribution

Pattern 19/29 Footprint

Pattern 19/29 Motif Distance Distribution

Pattern 20/29 Footprint

Pattern 20/29 Motif Distance Distribution

Pattern 21/29 Footprint

Pattern 21/29 Motif Distance Distribution

Pattern 22/29 Footprint

Pattern 22/29 Motif Distance Distribution

Pattern 23/29 Footprint

Pattern 23/29 Motif Distance Distribution

Pattern 24/29 Footprint

Pattern 24/29 Motif Distance Distribution

Pattern 25/29 Footprint

Pattern 25/29 Motif Distance Distribution

Pattern 26/29 Footprint

Pattern 26/29 Motif Distance Distribution

Pattern 27/29 Footprint

Pattern 27/29 Motif Distance Distribution

Pattern 28/29 Footprint

Pattern 28/29 Motif Distance Distribution

Pattern 29/29 Footprint

Pattern 29/29 Motif Distance Distribution

Show B1H alignments of main motifs

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
11089
2747
3690
4445
5416
6384
7316
8267
9248
10257
11261
12272
13226
14201
15206
16182
17151
18133
19135
2096
2189
2289
2362
2476
2567
2655
2758
2851
2963
In [10]: