In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py

Generating modisco motifs

In [10]:
def load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot']):
    """Load DeepLIFT outputs from an HDF5 file into an OrderedDict.

    Parameters
    ----------
    deeplift_hdf : str
        Path to an HDF5 file containing 'deeplift_scores', 'inputs' and
        (optionally) 'metadata/range/*' datasets.
    keys : list of str
        Which entries to include in the result; any subset of
        {'one_hot', 'peaks', 'hyp_scores', 'scores'}.

    Returns
    -------
    OrderedDict
        Keyed by the requested names.
    """
    deeplift_data = OrderedDict()
    # Context manager so the file handle is always closed (the previous
    # version leaked the handle). Also: do NOT read 'shuffled_inputs' here --
    # it was loaded before but never used, wasting I/O and memory.
    with h5py.File(deeplift_hdf, "r") as fp:
        hyp_scores = fp['deeplift_scores'][:]
        one_hot = fp['inputs'][:]

        if 'one_hot' in keys:
            deeplift_data['one_hot'] = one_hot
        if 'peaks' in keys:
            # Rebuild the peak coordinate table from metadata/range/*.
            df = OrderedDict()
            for key in list(fp['metadata/range']):
                df[key] = fp['metadata/range/{}'.format(key)][:]
            df = pd.DataFrame(df)
            # 'chr' values are stored as bytes; decode to str.
            df.chr = np.array([c.decode('utf-8') for c in df.chr.values])
            deeplift_data['peaks'] = df

    if 'hyp_scores' in keys:
        deeplift_data['hyp_scores'] = hyp_scores
    if 'scores' in keys:
        # Contribution scores = hypothetical scores masked by the observed
        # bases. No need to sum before multiplying: taken care of in
        # deeplift (not in deepshap, though).
        deeplift_data['scores'] = np.multiply(hyp_scores, one_hot)

    return deeplift_data

# Load the late-timepoint fold-0 DeepLIFT run: hypothetical scores, actual
# (hyp * one-hot) contribution scores, and the one-hot encoded sequences.
deeplift_hdf = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/deeplift_out/summit.h5"
deeplift_data = load_deeplift_data(deeplift_hdf, keys=['hyp_scores', 'scores', 'one_hot'])
In [11]:
import modisco
# Null distribution for per-position importance scores, estimated by
# sampling 5000 values from a Laplace fit (TF-MoDISco's Laplace null).
null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000)
In [ ]:
# Run the full TF-MoDISco motif-discovery workflow on the late-timepoint
# ("late0") DeepLIFT scores loaded above. Window/flank sizes and the seqlet
# FDR are tuned slightly away from the defaults (see inline comments).
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
                    #Slight modifications from the default settings
                    sliding_window_size=15,
                    flank_size=5,
                    target_seqlet_fdr=0.15,
                    seqlets_to_patterns_factory=
                     modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
                        #Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery
                        # software like MEME to improve the TF-MoDISco clustering. To use the meme-based
                        # initialization, you would specify the initclusterer_factory as shown in the
                        # commented-out code below:
                        #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory(    
                        #    meme_command="meme", base_outdir="meme_out",            
                        #    max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1),
                        trim_to_window_size=15,
                        initial_flank_to_add=5,
                        kmer_len=5, num_gaps=1,
                        num_mismatches=0,
                        final_min_cluster_size=60)
                )(
                 task_names=["late0"],
                 contrib_scores={'late0': deeplift_data['scores']},
                 hypothetical_contribs={'late0': deeplift_data['hyp_scores'] },
                 one_hot=deeplift_data['one_hot'],
                 null_per_pos_scores = null_per_pos_scores)
MEMORY 6.15686144
On task late0
Computing windowed sums on original
Generating null dist
peak(mu)= 1.8009368794593642e-06
Computing threshold
In [ ]:
# Persist the TF-MoDISco results. Open explicitly in write mode and use a
# context manager: h5py.File(path) without a mode is deprecated (and defaults
# to read-only in modern h5py, which would make save_hdf5 fail), and the
# previous version never closed the file.
modisco_hdf = '/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out/results.h5'
with h5py.File(modisco_hdf, "w") as grp:
    tfmodisco_results.save_hdf5(grp)
In [10]:
from matlas.matches import DenovoModisco, DenovoHomer
from vdom.helpers import (b, summary, details)
from IPython.display import display
import numpy as np


def show_patterns_using_hoccomocco_db(sample_name, modiscodir,
                                      match_threshold=0.03,
                                      database_name="HOCOMOCO.nonredundant.annotated"):
    """Display a collapsible table of de-novo MoDISco patterns matched
    against the HOCOMOCO motif database.

    Parameters
    ----------
    sample_name : str
        Label shown in the collapsible summary header.
    modiscodir : str
        Directory containing the MoDISco output for this sample.
    match_threshold : float
        TomTom match threshold (default 0.03, previously hard-coded; now a
        parameter for consistency with display_denovo_patterns).
    database_name : str
        Name of the cached motif-match database to load.
    """
    ob = DenovoModisco(modiscodir)
    # One-time TomTom matching step, already run for this directory:
    # ob.fetch_tomtom_matches(
    #             meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
    #             database_name="HOCOMOCO.nonredundant.annotated",
    #             save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(modiscodir, "HOCOMOCO.nonredundant.annotated"))
    ob.load_matched_motifs(database_name=database_name)
    ob.get_motif_per_celltype(match_threshold=match_threshold, database_name=database_name)

    pattern_tab, pattern_dict = ob.visualize_pattern_table()
    display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
                        ' in ', b(sample_name),
                        ": #{}".format(len(pattern_dict)),
                       ), pattern_tab))
    return None



def display_denovo_patterns(sample_name, modiscodir, match_threshold=0.05):
    """Render a collapsible table of de-novo MoDISco patterns (CIS-BP matches).

    sample_name: label shown in the collapsible summary header.
    modiscodir: directory holding the MoDISco output for this sample.
    match_threshold: TomTom match threshold used when assigning motifs.
    """
    display(summary(b(sample_name)))

    modisco_obj = DenovoModisco(modiscodir)
    # TomTom matching is assumed to have been run already for this directory;
    # to re-run: modisco_obj.fetch_tomtom_matches(save_report=True,
    #     tomtom_dir="{0}/{1}_tomtomout".format(modiscodir, "CISBP_2.00"))
    modisco_obj.load_matched_motifs()
    modisco_obj.get_motif_per_celltype(match_threshold=match_threshold)

    pattern_tab, pattern_dict = modisco_obj.visualize_pattern_table()
    header = summary(
        'Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
        ' in ', b(sample_name),
        ": #{}".format(len(pattern_dict)),
    )
    display(details(header, pattern_tab))
    return None
In [ ]:
# NOTE(review): dead/broken cell -- the attribute name is truncated
# ("visualize_pattern_tabl") and `ob` is a local variable inside the display
# functions above, not defined at notebook scope. This raises NameError on a
# fresh Restart-&-Run-All; this cell should be deleted.
ob.visualize_pattern_tabl

Early timepoint

In [7]:
sample_name = 'early_fold0'

display_denovo_patterns(
    sample_name,
    modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"
)
early_fold0
Click here for Denovo Patterns by MoDISco in early_fold0: #23
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 3893 SequenceContrib ScoresHyp_Contrib Scores
Bach2, Jund, Nfe2l2, Fos, Fosb, Nfe2, Junb, Atf3, Fosl1, Jun,

Jdp2, Fosl2, Bach1, Batf, Mafk, Mafa, Mafb
metacluster_0/pattern_1 # seqlets: 714 SequenceContrib ScoresHyp_Contrib Scores
Trp73, Trp63, Trp53
metacluster_0/pattern_2 # seqlets: 593 SequenceContrib ScoresHyp_Contrib Scores
Klf4, Klf5, Klf8, Klf1, Klf12, Klf7, Sp4, Klf3, Klf6, Egr1,

Egr2, Sp3, Sp1, Sp2, Sall4, Zbtb17, Klf15
metacluster_0/pattern_3 # seqlets: 486 SequenceContrib ScoresHyp_Contrib Scores
Runx1, Cbfb, Runx2, Runx3
metacluster_0/pattern_4 # seqlets: 437 SequenceContrib ScoresHyp_Contrib Scores
Etv6, Etv4, Etv2, Gabpa, Ets1, Erg, Etv5, Fli1, XP_911724.4, Etv3,

Elf4, Elk3, Etv1, Elk1, Spi1, Elk4, Elf2, Elf3, Ehf, Spic, Spib,

Elf5, Spdef
metacluster_0/pattern_5 # seqlets: 380 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_6 # seqlets: 339 SequenceContrib ScoresHyp_Contrib Scores
Tfap2a, Tfap2c, Tfap2e, Tfap2b
metacluster_0/pattern_7 # seqlets: 274 SequenceContrib ScoresHyp_Contrib Scores
Atf2, Atf7, Jdp2, Creb5
metacluster_0/pattern_8 # seqlets: 209 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_9 # seqlets: 185 SequenceContrib ScoresHyp_Contrib Scores
Nfkb1, Rela, Nfkb2, Rel, Relb
metacluster_0/pattern_10 # seqlets: 178 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_11 # seqlets: 173 SequenceContrib ScoresHyp_Contrib Scores
Irf1, Irf9, Irf4, Irf7, Irf2, Prdm1, Irf3, Irf5, Stat2, Irf8,

Stat1
metacluster_0/pattern_12 # seqlets: 175 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_13 # seqlets: 160 SequenceContrib ScoresHyp_Contrib Scores
Nfia, Nfib, Nfic
metacluster_0/pattern_14 # seqlets: 174 SequenceContrib ScoresHyp_Contrib Scores
Tcf7l1, Ptf1a, Tcf3, Myod1, Nhlh2
metacluster_0/pattern_15 # seqlets: 171 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_16 # seqlets: 108 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_17 # seqlets: 100 SequenceContrib ScoresHyp_Contrib Scores
Grhl2
metacluster_0/pattern_18 # seqlets: 98 SequenceContrib ScoresHyp_Contrib Scores
Nfe2l2, Nfe2, Bach2
metacluster_0/pattern_19 # seqlets: 148 SequenceContrib ScoresHyp_Contrib Scores
Ctcf, Ctcfl
metacluster_0/pattern_20 # seqlets: 96 SequenceContrib ScoresHyp_Contrib Scores
Rfx1, Rfx4, Rfx2, Rfx7, Rfx3, Rfx6
metacluster_0/pattern_21 # seqlets: 89 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_22 # seqlets: 69 SequenceContrib ScoresHyp_Contrib Scores
Nfyc, Nfya, Nfyb, Pbx3, Foxi1
In [11]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #23
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 3893 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A, HCLUST-101_NFE2.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 # seqlets: 714 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_2 # seqlets: 593 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-184_KLF12.UNK.0.A
metacluster_0/pattern_3 # seqlets: 486 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-176_CBFB.UNK.0.A
metacluster_0/pattern_4 # seqlets: 437 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_5 # seqlets: 380 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_6 # seqlets: 339 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-169_TFAP2A.UNK.0.A
metacluster_0/pattern_7 # seqlets: 274 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-175_ATF1.UNK.0.A
metacluster_0/pattern_8 # seqlets: 209 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-106_ATF6.UNK.0.A
metacluster_0/pattern_9 # seqlets: 185 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-167_NFKB1.UNK.0.A, HCLUST-145_RELA.UNK.0.A
metacluster_0/pattern_10 # seqlets: 178 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_11 # seqlets: 173 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-10_IRF1.UNK.0.A, HCLUST-9_IRF7.UNK.0.A
metacluster_0/pattern_12 # seqlets: 175 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_13 # seqlets: 160 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-137_NFIA.UNK.0.A
metacluster_0/pattern_14 # seqlets: 174 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_15 # seqlets: 171 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_16 # seqlets: 108 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_17 # seqlets: 100 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-110_GRHL2.UNK.0.A
metacluster_0/pattern_18 # seqlets: 98 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-101_NFE2.UNK.0.A
metacluster_0/pattern_19 # seqlets: 148 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-149_CTCFL.UNK.0.A, HCLUST-134_INSM1.UNK.0.A
metacluster_0/pattern_20 # seqlets: 96 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-16_RFX1.UNK.0.A, HCLUST-15_RFX5.UNK.0.A
metacluster_0/pattern_21 # seqlets: 89 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_22 # seqlets: 69 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-173_NFYA.UNK.0.A, HCLUST-109_FOXI1.UNK.0.A

Late timepoint

In [12]:
sample_name = 'late_fold0'

display_denovo_patterns(
    sample_name,
    modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
)
late_fold0
Click here for Denovo Patterns by MoDISco in late_fold0: #18
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 2460 SequenceContrib ScoresHyp_Contrib Scores
Jund, Fos, Nfe2l2, Nfe2, Bach2, Fosb, Junb, Fosl1, Atf3, Jdp2,

Jun, Fosl2, Bach1, Batf, Mafk, Mafa
metacluster_0/pattern_1 # seqlets: 1372 SequenceContrib ScoresHyp_Contrib Scores
Klf4, Klf1, Klf8, Klf5, Klf12, Klf7, Sp4, Klf3, Klf6, Egr1,

Egr2
metacluster_0/pattern_2 # seqlets: 1217 SequenceContrib ScoresHyp_Contrib Scores
Cebpb, Cebpa, Cebpd, Dbp, Nfil3, Hlf, Cebpg, Ddit3, Atf4, Cebpe
metacluster_0/pattern_3 # seqlets: 817 SequenceContrib ScoresHyp_Contrib Scores
Ddit3, Atf4, Cebpg, Cebpb, Nfil3, Cebpd, Dbp, Cebpa
metacluster_0/pattern_4 # seqlets: 709 SequenceContrib ScoresHyp_Contrib Scores
Grhl2
metacluster_0/pattern_5 # seqlets: 654 SequenceContrib ScoresHyp_Contrib Scores
Trp73, Trp63, Trp53, Tcfcp2l1
metacluster_0/pattern_6 # seqlets: 475 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_7 # seqlets: 475 SequenceContrib ScoresHyp_Contrib Scores
Atf2, Atf7, Jdp2, Creb5
metacluster_0/pattern_8 # seqlets: 387 SequenceContrib ScoresHyp_Contrib Scores
Nfia
metacluster_0/pattern_9 # seqlets: 341 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_10 # seqlets: 331 SequenceContrib ScoresHyp_Contrib Scores
Tfap2c, Tfap2a, Tfap2e, Tfap2b
metacluster_0/pattern_11 # seqlets: 224 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_12 # seqlets: 192 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_13 # seqlets: 123 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_14 # seqlets: 84 SequenceContrib ScoresHyp_Contrib Scores
Mitf, Tfec, Srebf1, Mlx, Usf2, Tfe3, Bhlhe40, Tfeb, Arntl, Clock,

Usf1, Npas2, Srebf2, Six1
metacluster_0/pattern_15 # seqlets: 74 SequenceContrib ScoresHyp_Contrib Scores
Gata4, Gata3, Gata1, Gata2, Gata6, Gata5
metacluster_0/pattern_16 # seqlets: 90 SequenceContrib ScoresHyp_Contrib Scores
Etv4, Spi1, Zkscan5, Etv6, Elf2, Elk1, Ets1, Elf5, Ehf, Elf4,

Erg, Elk3, Fli1, Gabpa, Elf3, Etv3, XP_911724.4, Etv1, Elk4, Spib, Etv5,

Prdm1, Irf7, Spdef, Fev, Elf1, Zscan10, Ets2
metacluster_0/pattern_17 # seqlets: 69 SequenceContrib ScoresHyp_Contrib Scores
Mitf, Tfe3, Tfec, Usf2, Mlx, Tfeb, Srebf1, Arnt, Mycn, Arntl,

Creb3l2, Max, Myc, Npas2, Clock, Nkx2-6, Bhlhe40, Xbp1, Nkx2-4, Usf1, Mlxip
In [13]:
sample_name = 'late_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in late_fold0: #18
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 2460 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 # seqlets: 1372 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_2 # seqlets: 1217 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-174_CEBPA.UNK.0.A, HCLUST-133_CEBPD.UNK.0.A, HCLUST-105_ATF4.UNK.0.A
metacluster_0/pattern_3 # seqlets: 817 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-105_ATF4.UNK.0.A, HCLUST-133_CEBPD.UNK.0.A
metacluster_0/pattern_4 # seqlets: 709 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-110_GRHL2.UNK.0.A
metacluster_0/pattern_5 # seqlets: 654 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_6 # seqlets: 475 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_7 # seqlets: 475 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-175_ATF1.UNK.0.A
metacluster_0/pattern_8 # seqlets: 387 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-137_NFIA.UNK.0.A
metacluster_0/pattern_9 # seqlets: 341 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_10 # seqlets: 331 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-169_TFAP2A.UNK.0.A
metacluster_0/pattern_11 # seqlets: 224 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_12 # seqlets: 192 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_13 # seqlets: 123 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_14 # seqlets: 84 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-186_ARNTL.UNK.0.A
metacluster_0/pattern_15 # seqlets: 74 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-2_GATA4.UNK.0.A, HCLUST-0_GATA3.UNK.0.A
metacluster_0/pattern_16 # seqlets: 90 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-159_EHF.UNK.0.A, HCLUST-130_ERG.UNK.0.A
metacluster_0/pattern_17 # seqlets: 69 SequenceContrib ScoresHyp_Contrib Scores

Loading keras model

In [22]:
from matlas.model_test import getSkinModel
from matlas.model_test import setup_keras_session
setup_keras_session('4')
init_weights = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/weights_from_raw_tf.p"
model = getSkinModel(init_weights, 19, classification=False)
model_h5 = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/model.h5"
model.save(model_h5)
model.summary()
channels_last
compiling!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           (None, 1000, 4)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 1000, 300)         23100     
_________________________________________________________________
batch_normalization_11 (Batc (None, 1000, 300)         1200      
_________________________________________________________________
activation_14 (Activation)   (None, 1000, 300)         0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 333, 300)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 333, 200)          660200    
_________________________________________________________________
batch_normalization_12 (Batc (None, 333, 200)          800       
_________________________________________________________________
activation_15 (Activation)   (None, 333, 200)          0         
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 83, 200)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 83, 200)           280200    
_________________________________________________________________
batch_normalization_13 (Batc (None, 83, 200)           800       
_________________________________________________________________
activation_16 (Activation)   (None, 83, 200)           0         
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 20, 200)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1000)              4001000   
_________________________________________________________________
batch_normalization_14 (Batc (None, 1000)              4000      
_________________________________________________________________
activation_17 (Activation)   (None, 1000)              0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
batch_normalization_15 (Batc (None, 1000)              4000      
_________________________________________________________________
activation_18 (Activation)   (None, 1000)              0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 1000)              0         
_________________________________________________________________
final_dense19 (Dense)        (None, 19)                19019     
=================================================================
Total params: 5,995,319
Trainable params: 5,989,919
Non-trainable params: 5,400
_________________________________________________________________
In [11]:
# Pull labels, logits and one-hot sequences from the GGR early-timepoint
# motif-scan HDF5 for comparison against the rebuilt Keras model.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as h5:
    labels, logits, seqs = (h5[name][:] for name in ('labels', 'logits', 'sequence'))
labels.shape, logits.shape, seqs.shape
Out[11]:
((35024, 19), (35024, 19), (35024, 1, 1000, 4))
In [23]:
from matlas.generators import EmbeddingsGenerator
def get_predictions(cur_seqs, model):
    """Run `model` over `cur_seqs` in batches of 1000 and return predictions.

    cur_seqs: one-hot sequence array; first axis is examples
        (presumably (N, 1000, 4) here -- confirm against EmbeddingsGenerator).
    model: compiled Keras model.
    Returns the stacked prediction array from predict_generator.
    """
    e_generator = EmbeddingsGenerator(cur_seqs, batch_size=1000, num_rows=cur_seqs.shape[0])
    #batch = e_generator.get_batch(i)
    #e = model.predict_on_batch(batch[0])
    e = model.predict_generator(
                e_generator,
                max_queue_size=100,
                workers=1,
                use_multiprocessing=False,
                verbose=1
            )
    return e
# Sanity-check Keras predictions on the first 1000 sequences; squeeze drops
# the singleton axis from (1000, 1, 1000, 4) -> (1000, 1000, 4).
keras_op = get_predictions(np.squeeze(seqs[:1000]), model)
1/1 [==============================] - 1s 664ms/step
In [24]:
from matplotlib import pylab as plt

# Compare raw-TensorFlow logits against the rebuilt Keras model's
# predictions for task 0 on the first 1000 examples.
fig, ax = plt.subplots()
ax.scatter(logits[:1000, 0], keras_op[:1000, 0])
ax.set_xlabel('raw tensorflow prediction')
ax.set_ylabel('keras predictions')
Out[24]:
Text(0, 0.5, 'keras predictions')
In [25]:
import scipy.stats

# Quantify agreement between raw-TF logits and Keras predictions (task 0,
# first 1000 examples): linear (Pearson) and rank (Spearman) correlation.
pearson = scipy.stats.pearsonr(logits[:1000, 0], keras_op[:1000, 0])
spearman = scipy.stats.spearmanr(logits[:1000, 0], keras_op[:1000, 0])
print(pearson)
print(spearman)
(0.7495659591048396, 4.981212730424334e-181)
SpearmanrResult(correlation=0.7251773091773093, pvalue=6.437351695707496e-164)

Deeplifting the keras model

In [8]:
# Explicit import instead of `from matlas.deeplift_run import *`: star
# imports hide where names come from and pollute the notebook namespace.
# retrieve_func_from_model is the only name from that module used here.
from matlas.deeplift_run import retrieve_func_from_model

# Build DeepLIFT contribution-score functions from the saved Keras model.
# "rescale_conv_revealcancel_fc": Rescale rule on conv layers, RevealCancel
# on dense layers (the DeepLIFT GenomicsDefault pairing, per the log output).
contrib_funcs, input_layer_shape = retrieve_func_from_model(
    model_h5, 
    algorithm="rescale_conv_revealcancel_fc", 
    regression=True,
    sequential=False, 
    w0=None, w1=None, logger=None)
input_layer_shape
load data from labcluster
TF-MoDISco is using the TensorFlow backend.
nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer activation_4_0 the preceding linear layer is conv1d_8_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_5_0 the preceding linear layer is conv1d_9_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_6_0 the preceding linear layer is conv1d_10_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_7_0 the preceding linear layer is dense_1_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
For layer activation_8_0 the preceding linear layer is dense_2_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
Out[8]:
[None, 1000, 4]
In [9]:
#provide list of strings to run deeplift
# def read_ggr_active_sequences(ggr_h5):
#     with h5py.File(ggr_h5, "r") as fp:
#         seqs = fp['sequence.active.string'][:]
#     sequences = []
#     for seq in seqs:
#         sequences.append(seq[0].decode('utf-8'))
    
#     return sequences

# ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
# sequences = read_ggr_sequences(ggrfile)
# type(sequences), len(sequences) #sequences[0], sequences[1], seqs[0,0], seqs[1,0]
Out[9]:
(list, 35024)
In [3]:
def get_genome_coordinates(ggr_h5, bed_file):
    """Extract genomic intervals from a GGR scan HDF5 and write a gzipped BED.

    Parameters
    ----------
    ggr_h5 : str
        HDF5 file with an 'example_metadata' dataset of byte strings of the
        form "...features=<chrom>:<start>-<end>".
    bed_file : str
        Output path; written tab-separated, no header, gzip-compressed.

    Returns
    -------
    None
    """
    with h5py.File(ggr_h5, "r") as fp:
        regions = fp['example_metadata'][:]

    # Parse each region string once (the previous version re-split the same
    # string three times per record) and build the frame in one shot.
    records = []
    for region in regions[:, 0]:
        region = region.decode("utf-8")
        if region == '':
            continue
        coords = region.split("features=")[1]
        parts = coords.split(":")
        start_end = parts[1].split("-")
        records.append((parts[0], start_end[0], start_end[1]))
    df = pd.DataFrame(records, columns=['chrom', 'start', 'end'])
    df.to_csv(bed_file, header=False, index=False, sep="\t", compression="gzip")
    return None
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
bed_file = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/regions.bed.gz"
get_genome_coordinates(ggrfile, bed_file)
In [27]:
from matlas.model_layer import retrieve_sequences
# Fetch the genomic sequences for the exported BED intervals from the hg19
# reference fasta; flank_size=0 keeps the intervals exactly as written.
sequences, intervals_wo_flanks = retrieve_sequences(
    bed_file, 
    fasta_file="/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa", flank_size=0)
In [28]:
# Build dinucleotide-shuffled reference sequences for DeepLIFT: 10 shuffles
# per input sequence. Only the first 45 sequences are used here (test run),
# so input_data_list[0] ends up with 45 * 10 = 450 rows (see Out below).
num_refs_per_seq = 10
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.model_layer import one_hot_encode_along_col_axis
from matlas.dlutils import get_shuffled_seqs
input_data_list, input_references_list = get_shuffled_seqs(sequences[:45], num_refs_per_seq, shuffle_func=dinuc_shuffle,
                                                                one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
                                                                progress_update=10000)
input_data_list[0].shape, len(sequences[0])
# input_data_list = [np.expand_dims(input_data_list[0], axis=1)]
# input_references_list = [np.expand_dims(input_references_list[0], axis=1)]
One hot encoding sequences...
One hot encoding done...
Out[28]:
((450, 1000, 4), 1000)
In [11]:
from matlas.dlutils import get_given_seq_ref_function
# Wrap each contribution-score function so it accepts explicitly supplied
# (sequence, shuffled-reference) pairs rather than generating references.
shuffled_score_funcs = {}
for input_name, score_func in contrib_funcs.items():
    shuffled_score_funcs[input_name] = get_given_seq_ref_function(
        score_computation_function=score_func)
In [29]:
# Score all (sequence, shuffled-reference) pairs for task 0 in chunks of
# 10000 rows, then recover per-sequence arrays:
#   hyp_scores      -- hypothetical contributions per original sequence
#   one_hot         -- the original (unshuffled) sequences
#   shuffled_onehot -- references grouped (n_seqs, num_refs_per_seq, len, 4)
#   scores          -- hyp_scores masked by the observed bases
task_idx = 0
batch_size = 256
num_refs_per_seq = 10
for input_name, score_func in shuffled_score_funcs.items():
    n_rows = len(input_data_list[0])
    chunk_size = 10000
    n_chunks = int(np.ceil(1.0 * n_rows / chunk_size))
    # Collect chunk results and stack once at the end -- the previous
    # version np.vstack'ed inside the loop (quadratic copying).
    chunk_scores = []
    for si in range(n_chunks):
        lo = si * chunk_size
        hi = min((si + 1) * chunk_size, n_rows)
        chunk_scores.append(score_func(
            task_idx=int(task_idx),
            input_data_list=[input_data_list[0][lo:hi]],
            input_references_list=[input_references_list[0][lo:hi]],
            num_refs_per_seq=num_refs_per_seq,
            batch_size=batch_size,
            progress_update=10000))
    hyp_scores = np.vstack(chunk_scores)
    input_data_list[0] = np.squeeze(input_data_list[0])
    input_references_list[0] = np.squeeze(input_references_list[0])
    # Every num_refs_per_seq-th row is an original sequence. Use an explicit
    # np.arange index: the old `arr[[range(...)]]` form triggered numpy's
    # "non-tuple sequence for multidimensional indexing" FutureWarning
    # (visible in this cell's output) and will break in future numpy.
    one_hot = input_data_list[0][np.arange(0, len(input_data_list[0]), num_refs_per_seq)]
    shuffled_onehot = input_references_list[0].reshape((one_hot.shape[0], num_refs_per_seq,
                                                       input_references_list[0].shape[-2],  # seq_len
                                                       input_references_list[0].shape[-1]))  # alphabet
    scores = np.multiply(hyp_scores, one_hot)

hyp_scores.shape, one_hot.shape, scores.shape
Done 0
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:27: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
In [ ]:
# create_deeplift_h5(bed_file, score_hdf, hyp_scores, one_hot, shuffled_onehot)
In [8]:
# Load the precomputed GGR importance scores for comparison with ours:
# full-length weighted sequences and the trimmed "active" windows. The
# print shows every top-level dataset available in the file.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as h5:
    print(list(h5))
    scores_ggr = h5['sequence-weighted'][:]
    scores_ggr_active = h5['sequence-weighted.active'][:]
scores_ggr.shape, scores_ggr_active.shape
['ATAC_LABELS', 'ATAC_SIGNALS', 'ATAC_SIGNALS.NORM', 'CTCF_LABELS', 'CTCF_SIGNALS', 'CTCF_SIGNALS.NORM', 'DYNAMIC_MARK_LABELS', 'DYNAMIC_STATE_LABELS', 'H3K27ac_LABELS', 'H3K27ac_SIGNALS', 'H3K27ac_SIGNALS.NORM', 'H3K27me3_LABELS', 'H3K27me3_SIGNALS', 'H3K27me3_SIGNALS.NORM', 'H3K4me1_LABELS', 'H3K4me1_SIGNALS', 'H3K4me1_SIGNALS.NORM', 'KLF4_LABELS', 'POL2_LABELS', 'STABLE_MARK_LABELS', 'STABLE_STATE_LABELS', 'TP63_LABELS', 'TRAJ_LABELS', 'ZNF750_LABELS', 'example_metadata', 'gradients', 'labels', 'logits', 'logits.ci', 'logits.ci.thresh', 'logits.multimodel', 'logits.multimodel.norm', 'logits.norm', 'positive_importance_bp_sum', 'probs', 'pwm-scores.null.idx', 'sequence', 'sequence-weighted', 'sequence-weighted.active', 'sequence-weighted.active.ci', 'sequence-weighted.active.ci.thresh', 'sequence-weighted.active.pwm-scores.thresh', 'sequence-weighted.active.pwm-scores.thresh.max.idx', 'sequence-weighted.active.pwm-scores.thresh.max.val', 'sequence-weighted.active.pwm-scores.thresh.sum', 'sequence-weighted.thresholds', 'sequence.active', 'sequence.active.gc_fract', 'sequence.active.pwm-hits', 'sequence.active.pwm-hits.densities', 'sequence.active.pwm-hits.densities.max', 'sequence.active.pwm-scores.thresh', 'sequence.active.pwm-scores.thresh.sum', 'sequence.active.string']
Out[8]:
((35024, 10, 1000, 4), (35024, 10, 160, 4))
In [10]:
import modisco.visualization
from modisco.visualization import viz_sequence
# Plot GGR importance tracks for example 0, first slice of axis 1:
# positions 500-600 of the full-length scores, then the 160-bp active window.
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr_active[0,0], subticks_frequency=20)
-0.4860307276248932 1.3380632400512695
-0.0440836176276207 0.1500825583934784
In [5]:
# Shape check: (35024, 10, 1000, 4) per the output below -- axis 1 is
# presumably a model/reference ensemble dimension; confirm with the GGR docs.
scores_ggr.shape
Out[5]:
(35024, 10, 1000, 4)
In [36]:
import modisco.visualization
from modisco.visualization import viz_sequence

viz_sequence.plot_weights(scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(hyp_scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(one_hot[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr[0][500:600], subticks_frequency=20)
-0.0688343504909426 0.35437235310673715
-0.976342553505674 0.35437235310673715
0.0 1.0
In [ ]: