Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR904JEY'
# dataset = 'ENCODE_new'
In [5]:
vdomh.h1('Running {} in dataset: {}'.format(tf,dataset))
Out[5]:

Running ENCSR277BXW in dataset: ENCODE_new

In [6]:
a = ReportGenerator(tf=tf,dataset=dataset)

Show motifs and submotifs and relevant embeddings

In [7]:
a.show_subcluster_table()

Pattern 1/32

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.1255
0186
1177
2129
3114
4111
574
674
770
840
939
1038
1134
1228
1326
1424
1524
1621
1719
1819
198

Pattern 2/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.913
0198
1174
2170
3133
4129
557
646
76

Pattern 3/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.485
0131
197
290
372
453
531
69
72

Pattern 4/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.480
078
176
263
359
457
555
626
720
819
912
107
116
122

Pattern 5/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.456
085
182
278
376
472
543
68
76
84
92

Pattern 6/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.417
0126
193
280
351
425
525
614
73

Pattern 7/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.414
084
182
263
361
455
541
624
74

Pattern 8/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.371
086
168
262
360
457
531
65
72

Pattern 9/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.366
0105
197
248
334
427
518
616
77
86
96
102

Pattern 10/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.350
0153
1115
230
323
415
57
63
72
82

Pattern 11/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.340
066
164
257
351
433
530
626
77
83
93

Pattern 12/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.234
054
148
236
334
428
516
610
76
82

Pattern 13/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.219
055
153
238
332
428
59
62
72

Pattern 14/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.204
047
141
233
332
430
58
67
73
83

Pattern 15/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.222
059
141
238
335
433
516

Pattern 16/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.200
051
143
229
329
417
517
67
77

Pattern 17/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.184
055
151
248
320
44
52
62
72

Pattern 18/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.133
052
133
219
311
47
56
63
72

Pattern 19/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.128
050
129
226
310
45
54
64

Pattern 20/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.137
035
130
230
315
49
59
64
73
82

Pattern 21/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.104
035
122
217
310
49
57
64

Pattern 22/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.102
029
125
223
312
47
52
62
72

Pattern 23/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.87
034
130
216
33
42
52

Pattern 24/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.74
019
117
212
311
410
55

Pattern 25/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.53
025
118
28
32

Pattern 26/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.58
020
117
216
35

Pattern 27/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.57
021
117
210
39

Pattern 28/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.79
026
120
217
312
44

Pattern 29/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.44
011
19
28
38
48

Pattern 30/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.50
017
114
28
36
45

Pattern 31/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.47
020
112
210
35

Pattern 32/32

SubpatternSeqletsEmbeddingsCWMPFM
Agg.62
021
118
212
311

Show footprints and motif distributions within peaks

In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/32 Footprint

Pattern 1/32 Motif Distance Distribution

Pattern 2/32 Footprint

Pattern 2/32 Motif Distance Distribution

Pattern 3/32 Footprint

Pattern 3/32 Motif Distance Distribution

Pattern 4/32 Footprint

Pattern 4/32 Motif Distance Distribution

Pattern 5/32 Footprint

Pattern 5/32 Motif Distance Distribution

Pattern 6/32 Footprint

Pattern 6/32 Motif Distance Distribution

Pattern 7/32 Footprint

Pattern 7/32 Motif Distance Distribution

Pattern 8/32 Footprint

Pattern 8/32 Motif Distance Distribution

Pattern 9/32 Footprint

Pattern 9/32 Motif Distance Distribution

Pattern 10/32 Footprint

Pattern 10/32 Motif Distance Distribution

Pattern 11/32 Footprint

Pattern 11/32 Motif Distance Distribution

Pattern 12/32 Footprint

Pattern 12/32 Motif Distance Distribution

Pattern 13/32 Footprint

Pattern 13/32 Motif Distance Distribution

Pattern 14/32 Footprint

Pattern 14/32 Motif Distance Distribution

Pattern 15/32 Footprint

Pattern 15/32 Motif Distance Distribution

Pattern 16/32 Footprint

Pattern 16/32 Motif Distance Distribution

Pattern 17/32 Footprint

Pattern 17/32 Motif Distance Distribution

Pattern 18/32 Footprint

Pattern 18/32 Motif Distance Distribution

Pattern 19/32 Footprint

Pattern 19/32 Motif Distance Distribution

Pattern 20/32 Footprint

Pattern 20/32 Motif Distance Distribution

Pattern 21/32 Footprint

Pattern 21/32 Motif Distance Distribution

Pattern 22/32 Footprint

Pattern 22/32 Motif Distance Distribution

Pattern 23/32 Footprint

Pattern 23/32 Motif Distance Distribution

Pattern 24/32 Footprint

Pattern 24/32 Motif Distance Distribution

Pattern 25/32 Footprint

Pattern 25/32 Motif Distance Distribution

Pattern 26/32 Footprint

Pattern 26/32 Motif Distance Distribution

Pattern 27/32 Footprint

Pattern 27/32 Motif Distance Distribution

Pattern 28/32 Footprint

Pattern 28/32 Motif Distance Distribution

Pattern 29/32 Footprint

Pattern 29/32 Motif Distance Distribution

Pattern 30/32 Footprint

Pattern 30/32 Motif Distance Distribution

Pattern 31/32 Footprint

Pattern 31/32 Motif Distance Distribution

Pattern 32/32 Footprint

Pattern 32/32 Motif Distance Distribution

Show B1H alignments of main motifs

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
11255
2913
3485
4480
5456
6417
7414
8371
9366
10350
11340
12234
13219
14204
15222
16200
17184
18133
19128
20137
21104
22102
2387
2474
2553
2658
2757
2879
2944
3050
3147
3262
In [10]: