Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR904JEY'
# dataset = 'ENCODE_new'
In [5]:
vdomh.h1('Running {} in dataset: {}'.format(tf,dataset))
Out[5]:

Running ENCSR550HCT in dataset: ENCODE_new

In [6]:
a = ReportGenerator(tf=tf,dataset=dataset)

Show motifs and submotifs and relevant embeddings

In [7]:
a.show_subcluster_table()

Pattern 1/31

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.1214
0274
1178
2163
3120
4107
593
677
761
851
935
1027
1120
128

Pattern 2/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1068
0182
1163
2154
3147
4135
5125
691
742
811
910
106
112

Pattern 3/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1081
0187
1136
2124
3112
4110
580
675
770
862
952
1039
1127
127

Pattern 4/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1045
0200
1191
2121
3108
4100
592
668
743
842
933
1030
1117

Pattern 5/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.955
0146
1115
2112
3106
492
592
673
756
847
939
1039
1138

Pattern 6/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.529
080
171
250
345
445
541
640
740
835
932
1025
1125

Pattern 7/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.446
057
154
247
342
440
535
635
735
834
933
1020
1114

Pattern 8/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.449
052
152
250
345
444
538
632
729
828
925
1024
1119
1211

Pattern 9/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.378
045
143
243
337
435
533
633
732
826
925
1020
116

Pattern 10/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.334
088
161
253
342
431
530
619
75
85

Pattern 11/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.207
031
127
225
321
421
519
618
717
815
911
102

Pattern 12/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.173
036
132
230
327
419
59
69
78
83

Pattern 13/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.180
036
132
226
325
423
519
617
72

Pattern 14/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.160
029
125
222
321
420
517
614
712

Pattern 15/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.165
028
127
224
324
422
521
619

Pattern 16/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.147
031
131
230
326
415
514

Pattern 17/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.123
033
131
225
314
410
55
63
72

Pattern 18/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.127
037
123
220
317
411
58
67
74

Pattern 19/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.106
025
123
219
318
416
55

Pattern 20/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.114
017
117
216
315
415
514
612
76
82

Pattern 21/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.100
023
122
222
315
410
55
63

Pattern 22/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.84
021
118
210
310
49
58
68

Pattern 23/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.57
014
113
211
310
49

Pattern 24/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.56
017
112
212
310
45

Pattern 25/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.78
020
119
214
313
49
53

Pattern 26/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.64
027
124
27
36

Pattern 27/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.48
016
112
211
39

Pattern 28/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.78
019
118
218
316
45
52

Pattern 29/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.57
031
119
27

Pattern 30/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.66
015
113
212
311
47
56
62

Pattern 31/31

SubpatternSeqletsEmbeddingsCWMPFM
Agg.46
020
119
27

Show footprints and motif distributions within peaks

In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/31 Footprint

Pattern 1/31 Motif Distance Distribution

Pattern 2/31 Footprint

Pattern 2/31 Motif Distance Distribution

Pattern 3/31 Footprint

Pattern 3/31 Motif Distance Distribution

Pattern 4/31 Footprint

Pattern 4/31 Motif Distance Distribution

Pattern 5/31 Footprint

Pattern 5/31 Motif Distance Distribution

Pattern 6/31 Footprint

Pattern 6/31 Motif Distance Distribution

Pattern 7/31 Footprint

Pattern 7/31 Motif Distance Distribution

Pattern 8/31 Footprint

Pattern 8/31 Motif Distance Distribution

Pattern 9/31 Footprint

Pattern 9/31 Motif Distance Distribution

Pattern 10/31 Footprint

Pattern 10/31 Motif Distance Distribution

Pattern 11/31 Footprint

Pattern 11/31 Motif Distance Distribution

Pattern 12/31 Footprint

Pattern 12/31 Motif Distance Distribution

Pattern 13/31 Footprint

Pattern 13/31 Motif Distance Distribution

Pattern 14/31 Footprint

Pattern 14/31 Motif Distance Distribution

Pattern 15/31 Footprint

Pattern 15/31 Motif Distance Distribution

Pattern 16/31 Footprint

Pattern 16/31 Motif Distance Distribution

Pattern 17/31 Footprint

Pattern 17/31 Motif Distance Distribution

Pattern 18/31 Footprint

Pattern 18/31 Motif Distance Distribution

Pattern 19/31 Footprint

Pattern 19/31 Motif Distance Distribution

Pattern 20/31 Footprint

Pattern 20/31 Motif Distance Distribution

Pattern 21/31 Footprint

Pattern 21/31 Motif Distance Distribution

Pattern 22/31 Footprint

Pattern 22/31 Motif Distance Distribution

Pattern 23/31 Footprint

Pattern 23/31 Motif Distance Distribution

Pattern 24/31 Footprint

Pattern 24/31 Motif Distance Distribution

Pattern 25/31 Footprint

Pattern 25/31 Motif Distance Distribution

Pattern 26/31 Footprint

Pattern 26/31 Motif Distance Distribution

Pattern 27/31 Footprint

Pattern 27/31 Motif Distance Distribution

Pattern 28/31 Footprint

Pattern 28/31 Motif Distance Distribution

Pattern 29/31 Footprint

Pattern 29/31 Motif Distance Distribution

Pattern 30/31 Footprint

Pattern 30/31 Motif Distance Distribution

Pattern 31/31 Footprint

Pattern 31/31 Motif Distance Distribution

Show B1H alignments of main motifs

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
11214
21068
31081
41045
5955
6529
7446
8449
9378
10334
11207
12173
13180
14160
15165
16147
17123
18127
19106
20114
21100
2284
2357
2456
2578
2664
2748
2878
2957
3066
3146
In [10]: