Purpose of the notebook is to generate reports via saved images

In [1]:
import os
import sys
sys.path.append("/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/utils/modisco_utils")
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/src/"))
sys.path.append(os.path.abspath("/oak/stanford/groups/akundaje/manyu/softwares/tfmodisco_tf_models/notebooks/reports/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import motif.moods as moods
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import io
import base64
import urllib
import deepdish as dd
import shutil
from vdom.helpers import h1, p, img, div, b

Main class to generate reports in HTML form using the saved images and h5 files

In [2]:
from shutil import copyfile
def get_read_handle(h5_file):
    '''Copy to temp with a hashed name and give the read handle'''
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    
    if os.path.exists(copy_file):
        os.remove(copy_file)
    copyfile(h5_file,copy_file)
    f = h5py.File(copy_file,'r')
    return(f)


def close_h5file_read_handle(h5_file,read_handle):
    '''Removes the copied h5 file in temp and the read handle to avoid bloating'''
    read_handle.close()
    name = h5_file.split('/')[-1]
    #copy_file = '/users/manyu/temp/{}_{}'.format(str(hash(h5_file)),name)
    copy_file = '/srv/scratch/manyu/tmp/{}_{}'.format(str(hash(h5_file)),name)
    if os.path.exists(copy_file):
        os.remove(copy_file)
    
    
    
  

    

class ReportGenerator():
    def __init__(self,tf,dataset):
        self.dataset = dataset
        self.tf = tf
        # Path to saved modisco results on /oak. Images and h5 files
        self.reports_base = '/oak/stanford/groups/akundaje/manyu/C2H2_ZNF_project/train_profile_models_2020/modisco_reports'
        self.results_dir = '{}/{}/{}'.format(self.reports_base,dataset,tf)
        
        ## Paths to the saved modisco report results to copy/copied on Mitra: Images and h5 files
        self.modisco_reports_results_mitra = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_report_saved_information'
        self.mitra_results_dataset_dir = '{}/{}'.format(self.modisco_reports_results_mitra,self.dataset)
        if not os.path.exists(self.mitra_results_dataset_dir):
            os.makedirs(self.mitra_results_dataset_dir)
        self.save_results_mitra_dir  = '{}/{}'.format(self.mitra_results_dataset_dir,self.tf)
        
        self.main_patterns_dir = '{}/main_patterns'.format(self.save_results_mitra_dir)
        self.main_patterns = '{}/all_motifs.h5'.format(self.main_patterns_dir)
        
        self.subpatterns_dir  = '{}/sub_patterns'.format(self.save_results_mitra_dir)
        self.subpatterns = '{}/all_motif_subclusters.h5'.format(self.subpatterns_dir)
        
        
        ## Paths to the modisco reports html on lab cluster
        self.modisco_reports_base = '/srv/www/kundaje/manyu/C2H2_ZNF_project/modisco_reports'
        self.modisco_reports_dataset_base = '{}/{}'.format(self.modisco_reports_base,self.dataset)
        if not os.path.exists(self.modisco_reports_dataset_base):
            os.makedirs(self.modisco_reports_dataset_base)
        self.modisco_report_filename = '{}/{}.html'.format(self.modisco_reports_dataset_base,self.tf)
        
        ## Path to webpages and weblinks to images(needs weblinks to load; not local links)
        self.mitra_web_base = 'http://mitra.stanford.edu/kundaje/manyu/C2H2_ZNF_project/'
        self.mitra_web_results_base = '{}/modisco_report_saved_information/{}/{}'.format(self.mitra_web_base,self.dataset,self.tf)
        self.mitra_web_results_main_patterns = '{}/main_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_sub_patterns = '{}/sub_patterns'.format(self.mitra_web_results_base)
        self.mitra_web_results_profiles = '{}/profiles_dir'.format(self.mitra_web_results_base)
        self.mitra_web_results_b1h_aln = '{}/b1h_aligments'.format(self.mitra_web_results_base)
        
        ##Do we need this here?
        #self.mitra_web_reports_base = '{}/modisco_reports'.format(self.mitra_web_base)
        
        
    
    def copy_to_mitra(self):
        assert(os.path.exists(self.results_dir))
        if os.path.exists(self.save_results_mitra_dir):
            shutil.rmtree(self.save_results_mitra_dir)
        shutil.copytree(self.results_dir,self.save_results_mitra_dir)
        print('Finished copying files to Mitra')
        permissions_cmd = 'chmod -R 777 {}'.format(self.save_results_mitra_dir)
        os.system(permissions_cmd)
        
        
        
    def show_subcluster_table(self):
      
        assert(os.path.exists(self.main_patterns))
        
        ## Opening a read handle for h5. Remmeber to close it also
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        try:
            sub_patterns_read_handle = get_read_handle(self.subpatterns)
        except Exception as e:
            print(e)

        num_patterns  = len(patterns_read_handle.keys())

     
        
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{}".format(str(pattern_i + 1), num_patterns)))
        
            

            # Sort this out:
            colgroup = vdomh.colgroup(
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "5%"}),
                vdomh.col(style={"width": "50%"}),
                vdomh.col(style={"width": "40%"})
            )
            header = vdomh.thead(
                vdomh.tr(
                    vdomh.th("Subpattern", style={"text-align": "center"}),
                    vdomh.th("Seqlets", style={"text-align": "center"}),
                    vdomh.th("Embeddings", style={"text-align": "center"}),
                    vdomh.th("CWM", style={"text-align": "center"}),
                    vdomh.th("PFM", style={"text-align": "center"})
                )
            )
            
            
            ## Fill in the paths and seqlets of the main pattern
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            emb_agg_path = '{}/{}_subcluster_agg.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i))
            pwm_path = '{}/{}_pfm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            cwm_path = '{}/{}_cwm_full.png'.format(self.mitra_web_results_main_patterns,str(pattern_i))
            
            
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_agg_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            )]

            
            total_sub_pats = len(sub_patterns_read_handle[str(pattern_i)].keys())
            for subpat_i in range(total_sub_pats):
                emb_path = '{}/{}_subcluster_{}.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
            
                total_seqlets = sub_patterns_read_handle[str(pattern_i)]['subcluster_{}'.format(str(subpat_i))]['n_seqlets'].value
                pwm_path = '{}/{}_subcluster_{}_pfm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                cwm_path = '{}/{}_subcluster_{}_cwm_trimmed.png'.format(self.mitra_web_results_sub_patterns,str(pattern_i),str(subpat_i))
                table_rows.append(vdomh.tr(
                vdomh.td('{}'.format(str(subpat_i))),
                vdomh.td(str(total_seqlets)),
                vdomh.td(img(src=emb_path)),
                vdomh.td(img(src=cwm_path)),
                vdomh.td(img(src=pwm_path))
            ))
            table = vdomh.table(header,vdomh.tbody(*table_rows))
            
            display(table)
            
        #Remove unnecessary temp files
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        close_h5file_read_handle(self.subpatterns,sub_patterns_read_handle)
    
    
    def show_footprints_and_peak_dist(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        
        num_patterns  = len(patterns_read_handle.keys())
        for pattern_i in range(num_patterns):
            
            display(vdomh.h3("Pattern {}/{} Footprint".format(str(pattern_i + 1), num_patterns)))
            footprint_plot = '{}/profiles_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            motif_distplot  = '{}/motif_distances_pattern_{}.png'.format(self.mitra_web_results_profiles,str(pattern_i))
            display(img(src=footprint_plot))
            display(vdomh.h3("Pattern {}/{} Motif Distance Distribution".format(str(pattern_i + 1), num_patterns)))
            display(img(src=motif_distplot))
    
    
    def show_b1h_alignments(self):
        try:
            patterns_read_handle = get_read_handle(self.main_patterns)
        except Exception as e:
            print(e)
        


        num_patterns  = len(patterns_read_handle.keys())
        header = vdomh.thead(
            vdomh.tr(
                vdomh.th("Pattern", style={"text-align": "center"}),
                vdomh.th("N-Seqlets", style={"text-align": "center"}),
                vdomh.th("Alignment", style={"text-align": "center"})

            )
        )

         
        display(vdomh.h2("B1H alignments of all patterns"))
        table_rows = []
        for pattern_i in range(num_patterns):
            
            
        
            total_seqlets = patterns_read_handle[str(pattern_i)]['n_seqlets'].value
            alignment_img = '{}/pattern_{}_aligned.png'.format(self.mitra_web_results_b1h_aln,str(pattern_i))
            

            table_rows.append(vdomh.tr(
            vdomh.td('{}'.format(str(pattern_i+1))),
            vdomh.td('{}'.format(str(total_seqlets))),
            vdomh.td(img(src=alignment_img)),

            ))
        close_h5file_read_handle(self.main_patterns,patterns_read_handle)
        table = vdomh.table(header,vdomh.tbody(*table_rows))
        display(table)
In [3]:
# Define parameters/fetch arguments
dataset = os.environ["DATASET"]
tf = os.environ["TF"]
In [4]:
# tf = 'ENCSR904JEY'
# dataset = 'ENCODE_new'
In [5]:
vdomh.h1('Running {} in dataset: {}'.format(tf,dataset))
Out[5]:

Running ENCSR701AQS in dataset: ENCODE_new

In [6]:
a = ReportGenerator(tf=tf,dataset=dataset)

Show motifs and submotifs and relevant embeddings

In [7]:
a.show_subcluster_table()

Pattern 1/30

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:127: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:146: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
SubpatternSeqletsEmbeddingsCWMPFM
Agg.1162
0252
1205
2197
3185
4164
5159

Pattern 2/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.1074
0124
1121
2114
3113
4106
593
678
764
862
956
1050
1148
1245

Pattern 3/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.571
078
175
265
351
447
544
644
743
843
940
1035
114
122

Pattern 4/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.528
078
169
266
360
452
552
649
740
840
922

Pattern 5/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.373
054
153
250
347
439
537
630
724
818
917
104

Pattern 6/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.322
080
179
266
352
431
511
63

Pattern 7/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.265
056
150
249
340
440
528
62

Pattern 8/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.260
049
136
231
328
427
525
622
722
820

Pattern 9/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.201
044
144
234
327
425
519
65
73

Pattern 10/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.197
068
167
228
326
48

Pattern 11/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.201
042
138
232
328
425
522
67
75
82

Pattern 12/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.169
050
138
228
323
419
511

Pattern 13/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.115
030
125
220
317
413
57
63

Pattern 14/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.105
026
124
215
315
413
512

Pattern 15/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.101
025
122
219
314
413
58

Pattern 16/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.103
032
125
221
313
412

Pattern 17/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.99
030
128
225
313
43

Pattern 18/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.93
020
120
218
316
412
54
63

Pattern 19/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.97
024
118
218
317
412
58

Pattern 20/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.79
029
128
222

Pattern 21/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.73
023
118
217
310
45

Pattern 22/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.80
028
126
220
34
42

Pattern 23/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.117
033
125
224
318
417

Pattern 24/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.52
021
115
29
37

Pattern 25/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.64
016
115
215
39
49

Pattern 26/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.60
023
116
216
33
42

Pattern 27/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.51
023
117
29
32

Pattern 28/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.55
018
116
211
310

Pattern 29/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.51
017
110
29
38
47

Pattern 30/30

SubpatternSeqletsEmbeddingsCWMPFM
Agg.45
018
117
210

Show footprints and motif distributions within peaks

In [8]:
a.show_footprints_and_peak_dist()

Pattern 1/30 Footprint

Pattern 1/30 Motif Distance Distribution

Pattern 2/30 Footprint

Pattern 2/30 Motif Distance Distribution

Pattern 3/30 Footprint

Pattern 3/30 Motif Distance Distribution

Pattern 4/30 Footprint

Pattern 4/30 Motif Distance Distribution

Pattern 5/30 Footprint

Pattern 5/30 Motif Distance Distribution

Pattern 6/30 Footprint

Pattern 6/30 Motif Distance Distribution

Pattern 7/30 Footprint

Pattern 7/30 Motif Distance Distribution

Pattern 8/30 Footprint

Pattern 8/30 Motif Distance Distribution

Pattern 9/30 Footprint

Pattern 9/30 Motif Distance Distribution

Pattern 10/30 Footprint

Pattern 10/30 Motif Distance Distribution

Pattern 11/30 Footprint

Pattern 11/30 Motif Distance Distribution

Pattern 12/30 Footprint

Pattern 12/30 Motif Distance Distribution

Pattern 13/30 Footprint

Pattern 13/30 Motif Distance Distribution

Pattern 14/30 Footprint

Pattern 14/30 Motif Distance Distribution

Pattern 15/30 Footprint

Pattern 15/30 Motif Distance Distribution

Pattern 16/30 Footprint

Pattern 16/30 Motif Distance Distribution

Pattern 17/30 Footprint

Pattern 17/30 Motif Distance Distribution

Pattern 18/30 Footprint

Pattern 18/30 Motif Distance Distribution

Pattern 19/30 Footprint

Pattern 19/30 Motif Distance Distribution

Pattern 20/30 Footprint

Pattern 20/30 Motif Distance Distribution

Pattern 21/30 Footprint

Pattern 21/30 Motif Distance Distribution

Pattern 22/30 Footprint

Pattern 22/30 Motif Distance Distribution

Pattern 23/30 Footprint

Pattern 23/30 Motif Distance Distribution

Pattern 24/30 Footprint

Pattern 24/30 Motif Distance Distribution

Pattern 25/30 Footprint

Pattern 25/30 Motif Distance Distribution

Pattern 26/30 Footprint

Pattern 26/30 Motif Distance Distribution

Pattern 27/30 Footprint

Pattern 27/30 Motif Distance Distribution

Pattern 28/30 Footprint

Pattern 28/30 Motif Distance Distribution

Pattern 29/30 Footprint

Pattern 29/30 Motif Distance Distribution

Pattern 30/30 Footprint

Pattern 30/30 Motif Distance Distribution

Show B1H alignments of main motifs

In [9]:
a.show_b1h_alignments()

B1H alignments of all patterns

/users/manyu/.local/lib/python3.6/site-packages/ipykernel_launcher.py:207: H5pyDeprecationWarning: dataset.value has been deprecated. Use dataset[()] instead.
PatternN-SeqletsAlignment
11162
21074
3571
4528
5373
6322
7265
8260
9201
10197
11201
12169
13115
14105
15101
16103
1799
1893
1997
2079
2173
2280
23117
2452
2564
2660
2751
2855
2951
3045
In [10]: