Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/www/kundaje/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/www/kundaje/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR464KFG'
# dataset = 'ENCODE'
In [5]:
a = ReportGenerator(tf=tf,dataset=dataset)
In [6]:
# a.copy_to_mitra()
In [7]:
a.show_subcluster_table()

Pattern 1/30

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.1236
0262
1177
2176
3129
4127
5126
6112
763
858
94
102

Pattern 2/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1219
0282
1230
2213
3188
4183
571
638
714

Pattern 3/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1069
0170
1159
2121
3105
4100
578
664
758
857
952
1048
1139
1218

Pattern 4/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1022
0313
1188
2178
3139
4120
584

Pattern 5/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.992
0191
1140
2105
3104
4102
579
679
768
841
934
1033
1116

Pattern 6/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.934
0150
1132
2108
382
481
574
656
752
845
942
1040
1138
1234

Pattern 7/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.821
0180
195
293
360
458
552
652
750
845
944
1038
1133
1217
134

Pattern 8/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.760
0157
1124
291
371
469
547
646
746
844
932
1018
1115

Pattern 9/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.724
0154
191
274
361
460
557
644
744
839
939
1027
1121
127
136

Pattern 10/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.709
0130
199
289
366
459
550
649
745
834
934
1027
1127

Pattern 11/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.600
097
191
272
362
456
547
636
731
830
930
1027
1121

Pattern 12/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.593
090
178
269
364
454
554
640
739
835
932
1020
1118

Pattern 13/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.581
097
177
271
366
456
555
654
744
830
927
104

Pattern 14/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.569
0113
185
267
364
452
538
637
737
833
929
1014

Pattern 15/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.563
0107
178
272
370
444
541
637
736
831
921
1012
118
123
133

Pattern 16/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.533
0122
1111
2105
397
487
59
62

Pattern 17/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.485
083
165
261
355
451
543
640
738
827
912
1010

Pattern 18/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.462
079
164
250
344
442
539
636
734
827
925
1022

Pattern 19/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.445
0122
1118
291
364
450

Pattern 20/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.400
0127
175
264
361
419
517
613
712
88
94

Pattern 21/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.314
055
144
242
342
441
534
627
719
810

Pattern 22/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.263
048
142
238
330
423
522
621
718
813
98

Pattern 23/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.254
071
153
250
341
430
59

Pattern 24/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.245
041
135
234
331
429
526
621
713
813
92

Pattern 25/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.222
036
133
225
322
421
519
618
718
815
915

Pattern 26/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.183
063
141
234
317
414
514

Pattern 27/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.123
031
125
222
311
411
59
68
73
83

Pattern 28/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.109
034
131
215
310
47
56
66

Pattern 29/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.101
030
129
228
314

Pattern 30/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.44
021
116
27
In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/30 Footprint

Pattern 1/30 Motif Distance Distribution

Pattern 2/30 Footprint

Pattern 2/30 Motif Distance Distribution

Pattern 3/30 Footprint

Pattern 3/30 Motif Distance Distribution

Pattern 4/30 Footprint

Pattern 4/30 Motif Distance Distribution

Pattern 5/30 Footprint

Pattern 5/30 Motif Distance Distribution

Pattern 6/30 Footprint

Pattern 6/30 Motif Distance Distribution

Pattern 7/30 Footprint

Pattern 7/30 Motif Distance Distribution

Pattern 8/30 Footprint

Pattern 8/30 Motif Distance Distribution

Pattern 9/30 Footprint

Pattern 9/30 Motif Distance Distribution

Pattern 10/30 Footprint

Pattern 10/30 Motif Distance Distribution

Pattern 11/30 Footprint

Pattern 11/30 Motif Distance Distribution

Pattern 12/30 Footprint

Pattern 12/30 Motif Distance Distribution

Pattern 13/30 Footprint

Pattern 13/30 Motif Distance Distribution

Pattern 14/30 Footprint

Pattern 14/30 Motif Distance Distribution

Pattern 15/30 Footprint

Pattern 15/30 Motif Distance Distribution

Pattern 16/30 Footprint

Pattern 16/30 Motif Distance Distribution

Pattern 17/30 Footprint

Pattern 17/30 Motif Distance Distribution

Pattern 18/30 Footprint

Pattern 18/30 Motif Distance Distribution

Pattern 19/30 Footprint

Pattern 19/30 Motif Distance Distribution

Pattern 20/30 Footprint

Pattern 20/30 Motif Distance Distribution

Pattern 21/30 Footprint

Pattern 21/30 Motif Distance Distribution

Pattern 22/30 Footprint

Pattern 22/30 Motif Distance Distribution

Pattern 23/30 Footprint

Pattern 23/30 Motif Distance Distribution

Pattern 24/30 Footprint

Pattern 24/30 Motif Distance Distribution

Pattern 25/30 Footprint

Pattern 25/30 Motif Distance Distribution

Pattern 26/30 Footprint

Pattern 26/30 Motif Distance Distribution

Pattern 27/30 Footprint

Pattern 27/30 Motif Distance Distribution

Pattern 28/30 Footprint

Pattern 28/30 Motif Distance Distribution

Pattern 29/30 Footprint

Pattern 29/30 Motif Distance Distribution

Pattern 30/30 Footprint

Pattern 30/30 Motif Distance Distribution

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
11236
21219
31069
41022
5992
6934
7821
8760
9724
10709
11600
12593
13581
14569
15563
16533
17485
18462
19445
20400
21314
22263
23254
24245
25222
26183
27123
28109
29101
3044
In [10]:
 
In [10]: