In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py

Generating modisco motifs

In [ ]:
def load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot']):
    """Load DeepLIFT outputs from an HDF5 file into an OrderedDict.

    Parameters
    ----------
    deeplift_hdf : str
        Path to an HDF5 file with datasets 'deeplift_scores', 'inputs'
        and (optionally used) 'metadata/range/*'.
    keys : list of str
        Which items to return; any of 'one_hot', 'peaks', 'hyp_scores',
        'scores'. (Default list is never mutated, so the mutable default
        is safe here.)

    Returns
    -------
    OrderedDict mapping each requested key to its array/DataFrame.
    """
    deeplift_data = OrderedDict()
    # Context manager fixes the original resource leak (file was never closed).
    with h5py.File(deeplift_hdf, "r") as fp:
        hyp_scores = fp['deeplift_scores'][:]
        one_hot = fp['inputs'][:]
        # NOTE: 'shuffled_inputs' was loaded unconditionally before but never
        # used by any returned key, so it is no longer read.

        if 'one_hot' in keys:
            deeplift_data['one_hot'] = one_hot
        if 'peaks' in keys:
            df = OrderedDict()
            for key in list(fp['metadata/range']):
                df[key] = fp['metadata/range/{}'.format(key)][:]
            df = pd.DataFrame(df)
            # HDF5 stores chrom names as bytes; decode to str for downstream use.
            df.chr = np.array([df.chr.values[i].decode('utf-8') for i in range(df.shape[0])])
            deeplift_data['peaks'] = df

        if 'hyp_scores' in keys:
            deeplift_data['hyp_scores'] = hyp_scores
        if 'scores' in keys:
            # Project hypothetical scores onto the actual bases.
            deeplift_data['scores'] = np.multiply(hyp_scores, one_hot)#no need to sum before mult, taken care of in deeplift, not in deepshap though

    return deeplift_data

# deeplift_hdf = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/deeplift_out/summit.h5"
# deeplift_data = load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot'])
# print(deeplift_data['scores'].shape, deeplift_data['one_hot'].shape)

# traj_no = -1
# if traj_no==8:
#     indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1,  traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# else:
#     indices = traj_lab[:, traj_no]==1

# if traj_no<0:
#     _score = deeplift_data['scores']
#     _hyp_score = deeplift_data['hyp_scores']
#     _one_hot = deeplift_data['one_hot']
# else:
#     _score = deeplift_data['scores'][indices,:,:]
#     _hyp_score = deeplift_data['hyp_scores'][indices,:,:]
#     _one_hot = deeplift_data['one_hot'][indices,:,:]
In [12]:
import modisco
# Empirical null for per-position importance scores: a Laplace distribution
# estimated from 5000 sampled positions. Passed to the TF-MoDISco workflow
# call later in this notebook.
null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000)
In [7]:
# Pull precomputed arrays from the GGR scanmotifs HDF5:
#   'sequence'          -> one-hot encoded inputs      (stored as ggr_data['one_hot'])
#   'sequence-weighted' -> contribution scores         (stored as ggr_data['scores'])
#   'gradients'         -> hypothetical contributions  (stored as ggr_data['hyp_scores'])
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
ggr_data = OrderedDict()
with h5py.File(ggrfile, "r") as fp:
    ggr_data['one_hot'] = fp['sequence'][:]
    ggr_data['scores'] = fp['sequence-weighted'][:]
    ggr_data['hyp_scores'] = fp['gradients'][:]
# Last expression displays shapes: (examples, tasks, seq_len, alphabet).
ggr_data['hyp_scores'].shape, ggr_data['scores'].shape, ggr_data['one_hot'].shape
(35024, 1000, 4) (35024, 1000, 4)
Out[7]:
((35024, 10, 1000, 4), (35024, 10, 1000, 4), (35024, 1, 1000, 4))
In [11]:
# Load the per-example trajectory label matrix (examples x trajectories);
# each column marks membership (==1) in one accessibility trajectory.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
#     print(list(fp))
    traj_lab = fp['TRAJ_LABELS'][:]
traj_lab.shape
# #0, 7, 8-10-11, and 9
# indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1,  traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# indices.shape, deeplift_data['scores'][indices,:,:].shape, np.sum(indices)
Out[11]:
(35024, 15)
In [ ]:
# Select the examples (rows) to feed into TF-MoDISco for one trajectory.
traj_no = 8
# Trajectory 8 is analyzed together with trajectories 10 and 11
# (the "8-10-11" grouping noted in the comment of the previous cell).
if traj_no == 8:
    indices = np.logical_or(
        np.logical_or(traj_lab[:, 8] == 1, traj_lab[:, 10] == 1),
        traj_lab[:, 11] == 1)
else:
    indices = traj_lab[:, traj_no] == 1

# A negative traj_no means "use all examples"; otherwise subset by trajectory.
# Task axis 0 is selected in either case.
row_sel = slice(None) if traj_no < 0 else indices
_score = ggr_data['scores'][row_sel, 0, :, :]
_hyp_score = ggr_data['hyp_scores'][row_sel, 0, :, :]
_one_hot = ggr_data['one_hot'][row_sel, 0, :, :]
_score.shape, _hyp_score.shape, _one_hot.shape
In [ ]:
# Run the full TF-MoDISco motif-discovery workflow on the subset selected
# above (single task "early0"). The Laplace null computed earlier supplies
# the per-position null scores.
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
                    #Slight modifications from the default settings
                    sliding_window_size=21,
                    flank_size=10,
                    target_seqlet_fdr=0.05,
                    
                    min_passing_windows_frac=0.03,
                    max_seqlets_per_metacluster=20000,
                    
                    seqlets_to_patterns_factory=
                     modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
                        #Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery
                        # software like MEME to improve the TF-MoDISco clustering. To use the meme-based
                        # initialization, you would specify the initclusterer_factory as shown in the
                        # commented-out code below:
                        #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory(    
                        #    meme_command="meme", base_outdir="meme_out",            
                        #    max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1),
                        
                        embedder_factory=(modisco.seqlet_embedding
                          .advanced_gapped_kmer.AdvancedGappedKmerEmbedderFactory(max_entries=500)),
                        
                        trim_to_window_size=30,
                        initial_flank_to_add=10,
                        #kmer_len=5, num_gaps=1,
                        #num_mismatches=0,
                        final_min_cluster_size=30)
                )(
                 task_names=["early0"],# "task1", "task2"],
                 contrib_scores={'early0': _score}, #[:,420:580,:]
                 hypothetical_contribs={'early0': _hyp_score }, #[:,420:580,:]
                 one_hot=_one_hot, #[:,420:580,:]
                 null_per_pos_scores = null_per_pos_scores)
In [ ]:
modisco_hdf = '/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_8_1k/results.hdf5'
# Open with an explicit mode ("a" matches h5py's legacy implicit default) and
# use a context manager so the handle is closed and results are flushed to
# disk — the original left the file open with a deprecated default mode.
with h5py.File(modisco_hdf, "a") as grp:
    tfmodisco_results.save_hdf5(grp)
In [34]:
print('..')
..
In [12]:
from matlas.matches import DenovoModisco, DenovoHomer
from vdom.helpers import (b, summary, details)
from IPython.display import display
import numpy as np


def display_denovo_patterns(sample_name, modiscodir, match_threshold=0.05):
    """Render the de-novo MoDISco patterns for one sample as a collapsible
    HTML table (via vdom), using the default motif database matches.

    Parameters
    ----------
    sample_name : label shown above and inside the pattern table.
    modiscodir : directory holding MoDISco results (tomtom matches assumed
        precomputed; re-run ob.fetch_tomtom_matches if they are missing).
    match_threshold : cutoff passed to get_motif_per_celltype.
    """
    display(summary(b(sample_name)))

    matcher = DenovoModisco(modiscodir)
    matcher.load_matched_motifs()
    matcher.get_motif_per_celltype(match_threshold=match_threshold)

    pattern_tab, pattern_dict = matcher.visualize_pattern_table()
    header = summary('Click here for ', b('Denovo Patterns'), ' by ',
                     b('{}'.format('MoDISco')), ' in ', b(sample_name),
                     ": #{}".format(len(pattern_dict)))
    display(details(header, pattern_tab))
    return None


def show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=0.01):
    """Display de-novo MoDISco patterns matched against the nonredundant,
    annotated HOCOMOCO database, as a collapsible HTML table.

    Parameters
    ----------
    sample_name : label shown in the table summary line.
    modiscodir : directory with MoDISco results; HOCOMOCO tomtom matches are
        assumed precomputed (see matcher.fetch_tomtom_matches to regenerate).
    match_threshold : q-value cutoff for per-celltype motif assignment.
    """
    database_name = "HOCOMOCO.nonredundant.annotated"
    matcher = DenovoModisco(modiscodir)
    matcher.load_matched_motifs(database_name=database_name)
    matcher.get_motif_per_celltype(match_threshold=match_threshold,
                                   match_criteria='q-value',
                                   database_name=database_name)

    pattern_tab, pattern_dict = matcher.visualize_pattern_table()
    header = summary('Click here for ', b('Denovo Patterns'), ' by ',
                     b('{}'.format('MoDISco')), ' in ', b(sample_name),
                     ": #{}".format(len(pattern_dict)))
    display(details(header, pattern_tab))
    return None

Early timepoint

With all peaks

using central 160bp

In [ ]:
# sample_name = 'early_fold0'

# display_denovo_patterns(
#     sample_name,
#     modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out_1k"
# )
In [18]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #9
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 753 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-141_BATF3.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-101_NFE2.UNK.0.A
metacluster_0/pattern_1 # seqlets: 489 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_2 # seqlets: 166 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A, HCLUST-141_BATF3.UNK.0.A
metacluster_0/pattern_3 # seqlets: 154 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_4 # seqlets: 120 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_5 # seqlets: 100 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-149_CTCFL.UNK.0.A, HCLUST-134_INSM1.UNK.0.A
metacluster_0/pattern_6 # seqlets: 89 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_7 # seqlets: 57 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-141_BATF3.UNK.0.A
metacluster_0/pattern_8 # seqlets: 45 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A, HCLUST-141_BATF3.UNK.0.A, HCLUST-88_ZFP28.UNK.0.A

using central 1kb

In [7]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_out_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=1e-15)

With time-point based peaks

using central 160bp

In [28]:
# sample_name = 'early_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [20]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_0"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #7
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 1413 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 # seqlets: 237 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A
metacluster_0/pattern_2 # seqlets: 202 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A, HCLUST-156_TEAD1.UNK.0.A, HCLUST-141_BATF3.UNK.0.A, HCLUST-101_NFE2.UNK.0.A
metacluster_0/pattern_3 # seqlets: 130 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 # seqlets: 142 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A
metacluster_0/pattern_5 # seqlets: 70 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_6 # seqlets: 55 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A
In [21]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_7"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #4
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 586 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 # seqlets: 153 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A
metacluster_0/pattern_2 # seqlets: 54 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_3 # seqlets: 57 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
In [22]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_8"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #7
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 1596 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 # seqlets: 238 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_2 # seqlets: 148 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_3 # seqlets: 61 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_4 # seqlets: 58 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-149_CTCFL.UNK.0.A, HCLUST-134_INSM1.UNK.0.A
metacluster_0/pattern_5 # seqlets: 54 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_6 # seqlets: 49 SequenceContrib ScoresHyp_Contrib Scores
In [23]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_9"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #5
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 412 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 # seqlets: 79 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-179_BACH1.UNK.0.A, HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A
metacluster_0/pattern_2 # seqlets: 48 SequenceContrib ScoresHyp_Contrib Scores
metacluster_0/pattern_3 # seqlets: 45 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 # seqlets: 40 SequenceContrib ScoresHyp_Contrib Scores

using central 1kb

In [8]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_0_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [9]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_7_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [10]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_8_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [11]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_9_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)

Late timepoint

In [37]:
# sample_name = 'late_fold0'

# display_denovo_patterns(
#     sample_name,
#     modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
# )
In [38]:
# sample_name = 'late_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)

Loading keras model

In [2]:
from matlas.model_test import getSkinModel
from matlas.model_test import setup_keras_session
# '4' presumably selects GPU device 4 — TODO confirm against matlas source.
setup_keras_session('4')
# Build the 19-output regression model from weights ported out of raw
# TensorFlow, save it as a Keras .h5 (used by the DeepLIFT cells below),
# and print the architecture.
init_weights = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/weights_from_raw_tf.p"
model = getSkinModel(init_weights, 19, classification=False)
model_h5 = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/model.h5"
model.save(model_h5)
model.summary()
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.
channels_last
compiling!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           (None, 1000, 4)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 300)         23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1000, 300)         1200      
_________________________________________________________________
activation_1 (Activation)    (None, 1000, 300)         0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 333, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 333, 200)          660200    
_________________________________________________________________
batch_normalization_2 (Batch (None, 333, 200)          800       
_________________________________________________________________
activation_2 (Activation)    (None, 333, 200)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 83, 200)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 83, 200)           280200    
_________________________________________________________________
batch_normalization_3 (Batch (None, 83, 200)           800       
_________________________________________________________________
activation_3 (Activation)    (None, 83, 200)           0         
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 20, 200)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              4001000   
_________________________________________________________________
batch_normalization_4 (Batch (None, 1000)              4000      
_________________________________________________________________
activation_4 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
batch_normalization_5 (Batch (None, 1000)              4000      
_________________________________________________________________
activation_5 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000)              0         
_________________________________________________________________
final_dense19 (Dense)        (None, 19)                19019     
=================================================================
Total params: 5,995,319
Trainable params: 5,989,919
Non-trainable params: 5,400
_________________________________________________________________
In [3]:
# Load labels, the original model's logits, and one-hot sequences from the
# GGR h5; logits are compared below against the Keras port's predictions.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    labels = fp['labels'][:]
    logits = fp['logits'][:]
    seqs = fp['sequence'][:]
labels.shape, logits.shape, seqs.shape
Out[3]:
((35024, 19), (35024, 19), (35024, 1, 1000, 4))
In [23]:
from matlas.generators import EmbeddingsGenerator
def get_predictions(cur_seqs, model):
    """Predict on a batch of one-hot sequences via an EmbeddingsGenerator.

    Parameters
    ----------
    cur_seqs : array whose first axis indexes examples (batched at 1000).
    model : compiled Keras model.

    Returns
    -------
    The stacked prediction array from model.predict_generator.
    """
    generator = EmbeddingsGenerator(cur_seqs, batch_size=1000,
                                    num_rows=cur_seqs.shape[0])
    predictions = model.predict_generator(generator,
                                          max_queue_size=100,
                                          workers=1,
                                          use_multiprocessing=False,
                                          verbose=1)
    return predictions
keras_op = get_predictions(np.squeeze(seqs[:1000]), model)
1/1 [==============================] - 1s 664ms/step
In [24]:
from matplotlib import pylab as plt
# plt.scatter(activations_all['activation_2/Relu:0'][:1000,0,0,0], cnv1[:1000,0,0,0])
# Sanity check of the Keras port: original TF logits vs Keras predictions
# for task 0 on the first 1000 examples.
plt.scatter(logits[:1000,0], keras_op[:1000,0])
plt.xlabel('raw tensorflow prediction')
plt.ylabel('keras predictions')
Out[24]:
Text(0, 0.5, 'keras predictions')
In [25]:
import scipy.stats
# Quantify agreement between TF logits and Keras predictions (task 0,
# first 1000 examples) with Pearson and Spearman correlations.
print(scipy.stats.pearsonr(logits[:1000,0], keras_op[:1000,0]))
print(scipy.stats.spearmanr(logits[:1000,0], keras_op[:1000,0]))
(0.7495659591048396, 4.981212730424334e-181)
SpearmanrResult(correlation=0.7251773091773093, pvalue=6.437351695707496e-164)

Deeplifting the keras model

In [4]:
# NOTE(review): wildcard import — retrieve_func_from_model comes from
# matlas.deeplift_run; an explicit import would be clearer.
from matlas.deeplift_run import *
# Build DeepLIFT contribution-score functions for the saved Keras model:
# per the log output, conv activations get Rescale and dense activations
# get RevealCancel under this algorithm setting.
contrib_funcs, input_layer_shape = retrieve_func_from_model(
    model_h5, 
    algorithm="rescale_conv_revealcancel_fc", 
    regression=True,
    sequential=False, 
    w0=None, w1=None, logger=None)
input_layer_shape
load data from labcluster
TF-MoDISco is using the TensorFlow backend.
nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer activation_1_0 the preceding linear layer is conv1d_1_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_2_0 the preceding linear layer is conv1d_2_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_3_0 the preceding linear layer is conv1d_3_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_4_0 the preceding linear layer is dense_1_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
For layer activation_5_0 the preceding linear layer is dense_2_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
Out[4]:
[None, 1000, 4]
In [9]:
#provide list of strings to run deeplift
# def read_ggr_active_sequences(ggr_h5):
#     with h5py.File(ggr_h5, "r") as fp:
#         seqs = fp['sequence.active.string'][:]
#     sequences = []
#     for seq in seqs:
#         sequences.append(seq[0].decode('utf-8'))
    
#     return sequences

# ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
# sequences = read_ggr_sequences(ggrfile)
# type(sequences), len(sequences) #sequences[0], sequences[1], seqs[0,0], seqs[1,0]
Out[9]:
(list, 35024)
In [3]:
def get_genome_coordinates(ggr_h5, bed_file):
    """Extract genomic intervals from an HDF5's 'example_metadata' dataset
    and write them as a gzipped, headerless, tab-separated BED file.

    Parameters
    ----------
    ggr_h5 : path to the scanmotifs HDF5; 'example_metadata' holds byte
        strings containing 'features=chrN:start-end' (assumed format —
        TODO confirm no region string contains extra ':' characters).
    bed_file : output path; written gzip-compressed.

    Returns
    -------
    None (writes bed_file as a side effect).
    """
    with h5py.File(ggr_h5, "r") as fp:
        regions = fp['example_metadata'][:]

    records = []
    for region in regions[:, 0]:
        region = region.decode("utf-8")
        if region == '':
            continue  # skip empty metadata rows
        # Parse the region once instead of re-splitting per field.
        locus = region.split("features=")[1]
        chrom = locus.split(":")[0]
        span = locus.split(":")[1]
        start = span.split("-")[0]
        end = span.split("-")[1]
        records.append((chrom, start, end))

    # Explicit column order: BED requires chrom/start/end; the original
    # relied on dict insertion order, which is fragile for a format file.
    df = pd.DataFrame(records, columns=['chrom', 'start', 'end'])
    df.to_csv(bed_file, header=False, index=False, sep="\t", compression="gzip")
    return None
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
bed_file = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/regions.bed.gz"
get_genome_coordinates(ggrfile, bed_file)
In [27]:
from matlas.model_layer import retrieve_sequences
# Fetch the genomic sequences for the BED intervals written above from the
# hg19 fasta, with no additional flank.
sequences, intervals_wo_flanks = retrieve_sequences(
    bed_file, 
    fasta_file="/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa", flank_size=0)
In [28]:
num_refs_per_seq = 10
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.model_layer import one_hot_encode_along_col_axis
from matlas.dlutils import get_shuffled_seqs
# Build one-hot inputs plus num_refs_per_seq dinucleotide-shuffled references
# per sequence; only the first 45 sequences are used here (45 * 10 = 450 rows
# in the returned arrays).
input_data_list, input_references_list = get_shuffled_seqs(sequences[:45], num_refs_per_seq, shuffle_func=dinuc_shuffle,
                                                                one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
                                                                progress_update=10000)
input_data_list[0].shape, len(sequences[0])
# input_data_list = [np.expand_dims(input_data_list[0], axis=1)]
# input_references_list = [np.expand_dims(input_references_list[0], axis=1)]
One hot encoding sequences...
One hot encoding done...
Out[28]:
((450, 1000, 4), 1000)
In [11]:
from matlas.dlutils import get_given_seq_ref_function
# Wrap each DeepLIFT contribution function so it scores against the supplied
# (presumably dinuc-shuffled) references — TODO confirm against dlutils.
shuffled_score_funcs = {input_name: get_given_seq_ref_function(score_computation_function=score_func)
                        for input_name, score_func in contrib_funcs.items()}
In [29]:
# Compute hypothetical DeepLIFT scores for task 0 in chunks, then derive
# per-base scores by projecting onto the one-hot sequences.
task_idx = 0
batch_size = 256
num_refs_per_seq = 10
for input_name, score_func in shuffled_score_funcs.items():
    # Chunk size renamed from `b`, which shadowed vdom's b() imported earlier.
    chunk_size = 10000
    n_rows = len(input_data_list[0])
    num_chunks = int(np.ceil(1.0 * n_rows / chunk_size))
    score_chunks = []
    for si in range(num_chunks):
        # Slicing clips at the array end, so no special last-chunk branch is
        # needed (the original duplicated the call for the final chunk).
        lo, hi = si * chunk_size, min((si + 1) * chunk_size, n_rows)
        score_chunks.append(score_func(
            task_idx=int(task_idx),
            input_data_list=[input_data_list[0][lo:hi]],
            input_references_list=[input_references_list[0][lo:hi]],
            num_refs_per_seq=num_refs_per_seq,
            batch_size=batch_size,
            progress_update=10000))
    # Single concatenate instead of repeated np.vstack (avoids quadratic copying).
    hyp_scores = np.concatenate(score_chunks, axis=0)

    input_data_list[0] = np.squeeze(input_data_list[0])
    input_references_list[0] = np.squeeze(input_references_list[0])
    # Every num_refs_per_seq-th row is an original sequence. Plain strided
    # slicing replaces the deprecated `arr[[range(...)]]` fancy indexing that
    # triggered the FutureWarning seen in the original output.
    one_hot = input_data_list[0][::num_refs_per_seq]
    shuffled_onehot = input_references_list[0].reshape(
        (one_hot.shape[0], num_refs_per_seq,
         input_references_list[0].shape[-2],   # seq_len
         input_references_list[0].shape[-1]))  # alphabet
    scores = np.multiply(hyp_scores, one_hot)

hyp_scores.shape, one_hot.shape, scores.shape
Done 0
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:27: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
In [ ]:
# create_deeplift_h5(bed_file, score_hdf, hyp_scores, one_hot, shuffled_onehot)
In [8]:
# List the datasets in the GGR h5 and load both the full-length importance
# scores and the center-trimmed 'active' scores for visual comparison below.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    print(list(fp))
    scores_ggr = fp['sequence-weighted'][:]
    scores_ggr_active = fp['sequence-weighted.active'][:]
scores_ggr.shape, scores_ggr_active.shape
['ATAC_LABELS', 'ATAC_SIGNALS', 'ATAC_SIGNALS.NORM', 'CTCF_LABELS', 'CTCF_SIGNALS', 'CTCF_SIGNALS.NORM', 'DYNAMIC_MARK_LABELS', 'DYNAMIC_STATE_LABELS', 'H3K27ac_LABELS', 'H3K27ac_SIGNALS', 'H3K27ac_SIGNALS.NORM', 'H3K27me3_LABELS', 'H3K27me3_SIGNALS', 'H3K27me3_SIGNALS.NORM', 'H3K4me1_LABELS', 'H3K4me1_SIGNALS', 'H3K4me1_SIGNALS.NORM', 'KLF4_LABELS', 'POL2_LABELS', 'STABLE_MARK_LABELS', 'STABLE_STATE_LABELS', 'TP63_LABELS', 'TRAJ_LABELS', 'ZNF750_LABELS', 'example_metadata', 'gradients', 'labels', 'logits', 'logits.ci', 'logits.ci.thresh', 'logits.multimodel', 'logits.multimodel.norm', 'logits.norm', 'positive_importance_bp_sum', 'probs', 'pwm-scores.null.idx', 'sequence', 'sequence-weighted', 'sequence-weighted.active', 'sequence-weighted.active.ci', 'sequence-weighted.active.ci.thresh', 'sequence-weighted.active.pwm-scores.thresh', 'sequence-weighted.active.pwm-scores.thresh.max.idx', 'sequence-weighted.active.pwm-scores.thresh.max.val', 'sequence-weighted.active.pwm-scores.thresh.sum', 'sequence-weighted.thresholds', 'sequence.active', 'sequence.active.gc_fract', 'sequence.active.pwm-hits', 'sequence.active.pwm-hits.densities', 'sequence.active.pwm-hits.densities.max', 'sequence.active.pwm-scores.thresh', 'sequence.active.pwm-scores.thresh.sum', 'sequence.active.string']
Out[8]:
((35024, 10, 1000, 4), (35024, 10, 160, 4))
In [10]:
import modisco.visualization
from modisco.visualization import viz_sequence
# Compare a central 100bp window of the full-length scores (example 0, task 0)
# against the corresponding precomputed 'active' (trimmed) scores.
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr_active[0,0], subticks_frequency=20)
-0.4860307276248932 1.3380632400512695
-0.0440836176276207 0.1500825583934784
In [5]:
scores_ggr.shape
Out[5]:
(35024, 10, 1000, 4)
In [36]:
import modisco.visualization
from modisco.visualization import viz_sequence

# Central 100bp of our DeepLIFT scores vs the precomputed GGR scores.
viz_sequence.plot_weights(scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(hyp_scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(one_hot[0][500:600], subticks_frequency=20)
# BUG FIX: scores_ggr is 4-D (examples, tasks, positions, alphabet), so
# scores_ggr[0][500:600] sliced the 10-long task axis and produced an empty
# array — index a single task first, as in the earlier viz cell.
viz_sequence.plot_weights(scores_ggr[0, 0][500:600], subticks_frequency=20)
-0.0688343504909426 0.35437235310673715
-0.976342553505674 0.35437235310673715
0.0 1.0
In [ ]: