In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py

Generating TF-MoDISco motifs

In [2]:
def load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot']):
    """Load DeepLIFT outputs from an HDF5 file into an OrderedDict.

    'scores' are the actual contribution scores: the hypothetical
    contributions multiplied element-wise with the one-hot sequence.
    No summing is needed before the multiplication; DeepLIFT already
    takes care of that (DeepSHAP does not).
    """
    deeplift_data = OrderedDict()
    with h5py.File(deeplift_hdf, "r") as fp:
        hyp_scores = fp['deeplift_scores'][:]
        one_hot = fp['inputs'][:]

        if 'one_hot' in keys:
            deeplift_data['one_hot'] = one_hot
        if 'peaks' in keys:
            # peak coordinates are stored per-column under metadata/range
            df = OrderedDict()
            for key in list(fp['metadata/range']):
                df[key] = fp['metadata/range/{}'.format(key)][:]
            df = pd.DataFrame(df)
            df['chr'] = np.array([v.decode('utf-8') for v in df.chr.values])
            deeplift_data['peaks'] = df
        if 'hyp_scores' in keys:
            deeplift_data['hyp_scores'] = hyp_scores
        if 'scores' in keys:
            deeplift_data['scores'] = np.multiply(hyp_scores, one_hot)

    return deeplift_data

# deeplift_hdf = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/deeplift_out/summit.h5"
# deeplift_data = load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot'])
# print(deeplift_data['scores'].shape, deeplift_data['one_hot'].shape)

# traj_no = -1
# if traj_no==8:
#     indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1,  traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# else:
#     indices = traj_lab[:, traj_no]==1

# if traj_no<0:
#     _score = deeplift_data['scores']
#     _hyp_score = deeplift_data['hyp_scores']
#     _one_hot = deeplift_data['one_hot']
# else:
#     _score = deeplift_data['scores'][indices,:,:]
#     _hyp_score = deeplift_data['hyp_scores'][indices,:,:]
#     _one_hot = deeplift_data['one_hot'][indices,:,:]
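To make the multiply-instead-of-sum point above concrete, here is a minimal toy sketch (illustrative shapes only, not the real HDF5 contents): multiplying hypothetical contributions by the one-hot sequence zeroes out the unobserved bases, and summing over the base axis then yields one contribution per position.

import numpy as np

hyp = np.random.randn(1, 4, 4)      # hypothetical contributions for every base
one_hot = np.eye(4)[None, :, :]     # toy one-hot sequence "ACGT", shape (1, 4, 4)

scores = hyp * one_hot              # actual contributions: nonzero only at observed bases
per_position = scores.sum(axis=-1)  # one contribution value per position, shape (1, 4)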
In [3]:
import modisco
null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000)
In [4]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
ggr_data = OrderedDict()
with h5py.File(ggrfile, "r") as fp:
    ggr_data['one_hot'] = fp['sequence'][:]          # one-hot sequences
    ggr_data['scores'] = fp['sequence-weighted'][:]  # actual contributions (input x gradient)
    ggr_data['hyp_scores'] = fp['gradients'][:]      # hypothetical contributions (gradients)
ggr_data['hyp_scores'].shape, ggr_data['scores'].shape, ggr_data['one_hot'].shape
Out[4]:
((35024, 10, 1000, 4), (35024, 10, 1000, 4), (35024, 1, 1000, 4))
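Note that scores and hyp_scores carry ten tasks on axis 1 while one_hot holds a single copy of each sequence, which broadcasts across the task axis. A hedged sanity check, assuming (as the directory name input_x_grad suggests) that sequence-weighted is the gradients multiplied by the sequence:

# one_hot (35024, 1, 1000, 4) broadcasts against hyp_scores (35024, 10, 1000, 4)
approx_scores = ggr_data['hyp_scores'] * ggr_data['one_hot']
print(np.allclose(approx_scores, ggr_data['scores']))  # expected True under the assumption above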
In [5]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    traj_lab = fp['TRAJ_LABELS'][:]  # binary membership matrix: one column per trajectory
traj_lab.shape
# trajectories of interest: 0, 7, the combined 8/10/11, and 9
# indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1, traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# indices.shape, deeplift_data['scores'][indices,:,:].shape, np.sum(indices)
Out[5]:
(35024, 15)
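Since traj_lab is a binary (35024, 15) membership matrix, a quick summary of how many peaks fall into each trajectory can be computed directly (a sketch; it assumes column j corresponds to trajectory j, matching the indexing used below):

per_traj_counts = traj_lab.sum(axis=0)  # peaks per trajectory, shape (15,)
print(dict(enumerate(per_traj_counts)))

# membership in the combined 8/10/11 trajectory, as used for traj_no == 8 below
combined = (traj_lab[:, [8, 10, 11]] == 1).any(axis=1)
print(combined.sum())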
In [6]:
traj_no = -1  # -1: run on all sequences; 8: the combined 8/10/11 trajectory
if traj_no == 8:
    indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1, traj_lab[:, 10]==1), traj_lab[:, 11]==1)
else:
    indices = traj_lab[:, traj_no]==1

if traj_no < 0:
    # use every sequence; index 0 on axis 1 selects the first task (the "early0" task below)
    _score = ggr_data['scores'][:,0,:,:]
    _hyp_score = ggr_data['hyp_scores'][:,0,:,:]
    _one_hot = ggr_data['one_hot'][:,0,:,:]
else:
    # restrict to sequences in the selected trajectory
    _score = ggr_data['scores'][indices,0,:,:]
    _hyp_score = ggr_data['hyp_scores'][indices,0,:,:]
    _one_hot = ggr_data['one_hot'][indices,0,:,:]
_score.shape, _hyp_score.shape, _one_hot.shape
Out[6]:
((35024, 1000, 4), (35024, 1000, 4), (35024, 1000, 4))
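The workflow call below passes only the central 160 bp of each 1000 bp region (positions 420:580), concentrating motif discovery around the peak summits. A small hypothetical helper that makes the cropping explicit (center_crop is illustrative, not part of modisco):

def center_crop(arr, width=160):
    # crop the position axis (axis 1) to the central `width` bp
    start = (arr.shape[1] - width) // 2
    return arr[:, start:start + width, :]

# equivalent to the explicit [:, 420:580, :] slices in the workflow call
assert center_crop(_score).shape == (_score.shape[0], 160, 4)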
In [7]:
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
                    # Slight modifications from the default settings
                    sliding_window_size=21,
                    flank_size=10,
                    #target_seqlet_fdr=0.05,

                    min_passing_windows_frac=0.03,
                    max_seqlets_per_metacluster=60000,

                    seqlets_to_patterns_factory=
                     modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
                        # Note: as of version 0.5.6.0, it's possible to use the results of motif
                        # discovery software like MEME to improve the TF-MoDISco clustering. To use
                        # the MEME-based initialization, specify the initclusterer_factory as shown
                        # in the commented-out code below:
                        #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory(
                        #    meme_command="meme", base_outdir="meme_out",
                        #    max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1),

                        embedder_factory=(modisco.seqlet_embedding
                          .advanced_gapped_kmer.AdvancedGappedKmerEmbedderFactory(max_entries=500)),

                        trim_to_window_size=30,
                        initial_flank_to_add=10,
                        #kmer_len=5, num_gaps=1,
                        #num_mismatches=0,
                        final_min_cluster_size=30)
                )(
                 task_names=["early0"],
                 # motif discovery is restricted to the central 160 bp (positions 420:580)
                 contrib_scores={'early0': _score[:, 420:580, :]},
                 hypothetical_contribs={'early0': _hyp_score[:, 420:580, :]},
                 one_hot=_one_hot[:, 420:580, :],
                 null_per_pos_scores=null_per_pos_scores)
MEMORY 12.083171328
On task early0
Computing windowed sums on original
Generating null dist
peak(mu)= 0.032145559787750244
Computing threshold
Subsampling!
For increasing = True , the minimum IR precision was 0.33385234625600296 occurring at 0.0 implying a frac_neg of 0.501168688922987
To be conservative, adjusted frac neg is 0.95
For increasing = False , the minimum IR precision was 0.40646257508671163 occurring at -5.0067901611328125e-06 implying a frac_neg of 0.6848137253452771
To be conservative, adjusted frac neg is 0.95
Thresholds from null dist were -3.5901737213134766  and  16.52375066280365 with frac passing 0.000257
Passing windows frac was 0.000257 , which is below  0.03 ; adjusting
Final raw thresholds are -6.971453695297242  and  6.971453695297242
Final transformed thresholds are -0.970051964367291  and  0.970051964367291
Got 13337 coords
After resolving overlaps, got 13337 seqlets
Across all tasks, the weakest transformed threshold used was: 0.969951964367291
MEMORY 12.29840384
13337 identified in total
min_metacluster_size_frac * len(seqlets) = 133 is more than min_metacluster_size=100.
Using it as a new min_metacluster_size
1 activity patterns with support >= 133 out of 2 possible patterns
Metacluster sizes:  [13337]
Idx to activities:  {0: '1'}
MEMORY 12.298973184
On metacluster 0
Metacluster size 13337
Relevant tasks:  ('early0',)
Relevant signs:  (1,)
TfModiscoSeqletsToPatternsFactory: seed=1234
(Round 1) num seqlets: 13337
(Round 1) Computing coarse affmat
MEMORY 12.298981376
Beginning embedding computation
MEMORY 12.298981376
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  44 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done 632 tasks      | elapsed:   15.9s
[Parallel(n_jobs=4)]: Done 1632 tasks      | elapsed:   36.1s
[Parallel(n_jobs=4)]: Done 3032 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 4832 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 7032 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 9632 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 12632 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 13337 out of 13337 | elapsed:  4.8min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:    7.5s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   18.3s
[Parallel(n_jobs=4)]: Done 1576 tasks      | elapsed:   32.6s
[Parallel(n_jobs=4)]: Done 2476 tasks      | elapsed:   52.2s
[Parallel(n_jobs=4)]: Done 3576 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 4876 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 6376 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 8076 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 9976 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done 12076 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 13330 out of 13337 | elapsed:  4.9min remaining:    0.2s
[Parallel(n_jobs=4)]: Done 13337 out of 13337 | elapsed:  4.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 13337 out of 13337 | elapsed:   40.6s finished
Constructing csr matrix...
csr matrix made in 3.4400315284729004 s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 13337 out of 13337 | elapsed:   41.2s finished
Constructing csr matrix...
csr matrix made in 3.366770029067993 s
Finished embedding computation in 683.91 s
MEMORY 13.642887168
Starting affinity matrix computations
MEMORY 13.642625024
Batching in slices of size 5031
100%|██████████| 3/3 [00:56<00:00, 18.82s/it]
Finished affinity matrix computations in 56.57 s
MEMORY 13.691904
(Round 1) Computed coarse affmat
MEMORY 13.424238592
(Round 1) Computing affinity matrix on nearest neighbors
MEMORY 13.424238592

Launching nearest neighbors affmat calculation job
MEMORY 13.469171712
Parallel runs completed
MEMORY 13.592117248
Job completed in: 211.87 s
MEMORY 13.591330816
Launching nearest neighbors affmat calculation job
MEMORY 13.591441408
Parallel runs completed
MEMORY 13.709467648
Job completed in: 213.24 s
MEMORY 13.709467648
(Round 1) Computed affinity matrix on nearest neighbors in 428.1 s
MEMORY 13.709467648
Filtered down to 9576 of 13337
(Round 1) Retained 9576 rows out of 13337 after filtering
MEMORY 13.709930496
(Round 1) Computing density adapted affmat
MEMORY 13.71564032
Symmetrizing nearest neighbors
Computing betas for density adaptation
Computing normalizing denominators
(Round 1) Computing clustering
MEMORY 14.079770624
Beginning preprocessing + Leiden
Affmat shape: 9576
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  7.9min
Quality: 0.5083578336158565
Quality: 0.5083583304441011
Quality: 0.5103465699485165
Quality: 0.5103466234092124
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  9.3min finished
Got 8 clusters after round 1
Counts:
{7: 420, 5: 904, 0: 2500, 4: 1156, 2: 1279, 3: 1221, 6: 603, 1: 1493}
MEMORY 14.11868672
(Round 1) Aggregating seqlets in each cluster
MEMORY 14.11868672
Aggregating for cluster 0 with 2500 seqlets
MEMORY 14.11868672
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 8 seqlets that went over the sequence edge during flank expansion
Skipped 12 seqlets that went over the sequence edge during flank expansion
Skipped 9 seqlets that went over the sequence edge during flank expansion
Skipped 696 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 1772
Skipped 363 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 1 with 1493 seqlets
MEMORY 14.118449152
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 5 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 17 seqlets that went over the sequence edge during flank expansion
Skipped 20 seqlets that went over the sequence edge during flank expansion
Skipped 453 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 994
Skipped 248 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 2 with 1279 seqlets
MEMORY 14.118449152
Skipped 4 seqlets that went over the sequence edge during flank expansion
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 6 seqlets that went over the sequence edge during flank expansion
Skipped 293 seqlets that went over sequence edge during flank expansion
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 968
Skipped 185 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 3 with 1221 seqlets
MEMORY 14.118449152
Skipped 7 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 312 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 899
Skipped 198 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 4 with 1156 seqlets
MEMORY 14.118449152
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 6 seqlets that went over the sequence edge during flank expansion
Skipped 19 seqlets that went over the sequence edge during flank expansion
Skipped 312 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 815
Skipped 167 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 5 with 904 seqlets
MEMORY 14.118449152
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 208 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 694
Skipped 133 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 6 with 603 seqlets
MEMORY 14.118449152
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 6 seqlets that went over the sequence edge during flank expansion
Skipped 153 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 443
Skipped 101 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 7 with 420 seqlets
MEMORY 14.118449152
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 9 seqlets that went over the sequence edge during flank expansion
Skipped 92 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 318
Skipped 56 seqlets that went over the sequence edge during flank expansion
(Round 2) num seqlets: 5452
(Round 2) Computing coarse affmat
MEMORY 14.118449152
Beginning embedding computation
MEMORY 14.118449152
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 728 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done 1728 tasks      | elapsed:   27.6s
[Parallel(n_jobs=4)]: Done 3128 tasks      | elapsed:   53.4s
[Parallel(n_jobs=4)]: Done 4928 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 5452 out of 5452 | elapsed:  1.6min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 728 tasks      | elapsed:   11.8s
[Parallel(n_jobs=4)]: Done 1728 tasks      | elapsed:   28.0s
[Parallel(n_jobs=4)]: Done 3128 tasks      | elapsed:   53.4s
[Parallel(n_jobs=4)]: Done 4928 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 5452 out of 5452 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5452 out of 5452 | elapsed:   17.2s finished
Constructing csr matrix...
csr matrix made in 1.3914871215820312 s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5452 out of 5452 | elapsed:   16.2s finished
Constructing csr matrix...
csr matrix made in 1.3504602909088135 s
Finished embedding computation in 236.22 s
MEMORY 14.219665408
Starting affinity matrix computations
MEMORY 14.219665408
Batching in slices of size 5452
100%|██████████| 1/1 [00:09<00:00,  9.55s/it]
Finished affinity matrix computations in 9.59 s
MEMORY 14.219403264
(Round 2) Computed coarse affmat
MEMORY 14.21651968
(Round 2) Computing affinity matrix on nearest neighbors
MEMORY 14.21651968

Launching nearest neighbors affmat calculation job
MEMORY 14.21651968
Parallel runs completed
MEMORY 14.222241792
Job completed in: 131.17 s
MEMORY 14.222241792
Launching nearest neighbors affmat calculation job
MEMORY 14.222241792
Parallel runs completed
MEMORY 14.23163392
Job completed in: 130.77 s
MEMORY 14.2303232
(Round 2) Computed affinity matrix on nearest neighbors in 263.31 s
MEMORY 14.2303232
Not applying filtering for rounds above first round
MEMORY 14.2303232
(Round 2) Computing density adapted affmat
MEMORY 14.2303232
Symmetrizing nearest neighbors
Computing betas for density adaptation
Computing normalizing denominators
(Round 2) Computing clustering
MEMORY 14.278803456
Beginning preprocessing + Leiden
Affmat shape: 5452
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  6.1min
Quality: 0.39213340121284673
Quality: 0.3921335186693394
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  7.3min finished
Got 5 clusters after round 2
Counts:
{0: 1642, 4: 609, 3: 836, 1: 1326, 2: 1039}
MEMORY 14.24715776
(Round 2) Aggregating seqlets in each cluster
MEMORY 14.24715776
Aggregating for cluster 0 with 1642 seqlets
MEMORY 14.24715776
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 62 seqlets that went over sequence edge during flank expansion
Skipped 3 due to duplicates
Trimming eliminated 0 seqlets out of 1574
Skipped 370 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 1 with 1326 seqlets
MEMORY 14.24715776
Skipped 18 seqlets that went over the sequence edge during flank expansion
Skipped 68 seqlets that went over sequence edge during flank expansion
Skipped 2 due to duplicates
Trimming eliminated 0 seqlets out of 1238
Skipped 354 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 2 with 1039 seqlets
MEMORY 14.24715776
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 34 seqlets that went over the sequence edge during flank expansion
Skipped 92 seqlets that went over sequence edge during flank expansion
Skipped 3 due to duplicates
Trimming eliminated 0 seqlets out of 905
Skipped 313 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 3 with 836 seqlets
MEMORY 14.24715776
Skipped 84 seqlets that went over the sequence edge during flank expansion
Skipped 23 seqlets that went over sequence edge during flank expansion
Skipped 4 due to duplicates
Trimming eliminated 0 seqlets out of 725
Skipped 208 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 4 with 609 seqlets
MEMORY 14.24715776
Skipped 12 seqlets that went over the sequence edge during flank expansion
Skipped 4 seqlets that went over sequence edge during flank expansion
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 592
Skipped 134 seqlets that went over the sequence edge during flank expansion
Got 5 clusters
Splitting into subclusters...
MEMORY 14.24715776
Inspecting pattern 0 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 1204 out of 1204 | elapsed:    3.9s finished
[t-SNE] Computed conditional probabilities for sample 1000 / 1204
[t-SNE] Computed conditional probabilities for sample 1204 / 1204
[t-SNE] Mean sigma: 0.250850
Beginning preprocessing + Leiden
Affmat shape: 1204
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   41.7s
Quality: 0.4972925561428032
Quality: 0.4975173712950307
Quality: 0.49762667951665335
Quality: 0.49772344157505893
Quality: 0.4978018132385952
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   49.0s finished
Got subclusters: Counter({0: 245, 1: 146, 2: 142, 3: 123, 4: 94, 5: 89, 6: 84, 7: 78, 8: 64, 9: 39, 10: 30, 11: 26, 12: 22, 13: 15, 14: 5, 15: 2})
On merging iteration 1
Numbers for each pattern pre-subsample: [245, 146, 142, 123, 94, 89, 84, 78, 64, 39, 30, 26, 22, 15, 5, 2]
Numbers after subsampling: [245, 146, 142, 123, 94, 89, 84, 78, 64, 39, 30, 26, 22, 15, 5, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 8.600388765335083 s
Computing sims for pattern 1
Computed sims for pattern 1 in 5.07261323928833 s
Computing sims for pattern 2
Computed sims for pattern 2 in 4.779524564743042 s
Computing sims for pattern 3
Computed sims for pattern 3 in 3.440063238143921 s
Computing sims for pattern 4
Computed sims for pattern 4 in 3.0409185886383057 s
Computing sims for pattern 5
Computed sims for pattern 5 in 2.8541126251220703 s
Computing sims for pattern 6
Computed sims for pattern 6 in 3.9525725841522217 s
Computing sims for pattern 7
Computed sims for pattern 7 in 3.663008213043213 s
Computing sims for pattern 8
Computed sims for pattern 8 in 2.6679537296295166 s
Computing sims for pattern 9
Computed sims for pattern 9 in 2.4023735523223877 s
Computing sims for pattern 10
Computed sims for pattern 10 in 1.838848352432251 s
Computing sims for pattern 11
Computed sims for pattern 11 in 1.6040759086608887 s
Computing sims for pattern 12
Computed sims for pattern 12 in 1.5169079303741455 s
Computing sims for pattern 13
Computed sims for pattern 13 in 1.3280768394470215 s
Computing sims for pattern 14
Computed sims for pattern 14 in 0.7087159156799316 s
Computing sims for pattern 15
Computed sims for pattern 15 in 0.6229310035705566 s
Cluster sizes
[245 146 142 123  94  89  84  78  64  39  30  26  22  15   5   2]
Cross-contamination matrix:
[[1.   0.96 0.31 0.63 1.   0.31 0.46 0.63 0.44 0.39 0.48 0.48 0.3  0.41
  0.22 0.33]
 [0.94 1.   0.33 0.61 1.   0.33 0.48 0.62 0.45 0.38 0.49 0.45 0.26 0.38
  0.23 0.24]
 [0.55 0.62 1.   0.47 0.6  0.3  0.44 0.47 0.53 0.43 0.43 0.59 0.38 0.4
  0.39 0.34]
 [0.98 0.99 0.5  1.   1.   0.55 0.8  0.76 0.75 0.58 0.69 0.64 0.54 0.67
  0.44 0.53]
 [0.71 0.77 0.19 0.43 1.   0.19 0.3  0.51 0.3  0.25 0.31 0.34 0.18 0.29
  0.13 0.17]
 [0.36 0.4  0.19 0.32 0.39 1.   0.27 0.26 0.32 0.22 0.34 0.27 0.26 0.22
  0.17 0.19]
 [1.   1.   0.63 1.   1.   0.65 1.   0.81 0.81 0.65 0.77 0.69 0.62 0.75
  0.52 0.51]
 [0.75 0.76 0.37 0.58 0.86 0.34 0.46 1.   0.47 0.43 0.48 0.47 0.39 0.47
  0.34 0.38]
 [0.85 0.89 0.6  0.81 0.92 0.58 0.66 0.69 1.   0.55 0.68 0.73 0.49 0.58
  0.51 0.62]
 [0.96 1.   0.71 0.84 1.   0.62 0.74 0.83 0.79 1.   0.78 0.73 0.68 0.71
  0.52 0.67]
 [0.78 0.82 0.38 0.65 0.84 0.52 0.51 0.59 0.57 0.45 1.   0.51 0.47 0.58
  0.66 0.27]
 [0.76 0.77 0.58 0.6  0.82 0.44 0.47 0.59 0.63 0.46 0.53 1.   0.42 0.49
  0.35 0.33]
 [0.79 0.82 0.61 0.74 0.84 0.64 0.64 0.71 0.66 0.64 0.73 0.64 1.   0.65
  0.5  0.57]
 [0.51 0.52 0.25 0.44 0.57 0.23 0.35 0.41 0.33 0.29 0.4  0.33 0.27 1.
  0.17 0.19]
 [0.05 0.05 0.03 0.04 0.05 0.01 0.02 0.04 0.04 0.01 0.13 0.02 0.02 0.01
  1.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.99 0.88 0.97 0.98 0.86 0.96 0.95 0.94 0.94 0.94 0.93 0.92 0.9
  0.79 0.76]
 [0.99 1.   0.9  0.97 0.99 0.87 0.96 0.96 0.95 0.95 0.95 0.93 0.92 0.9
  0.81 0.74]
 [0.88 0.9  1.   0.87 0.88 0.77 0.86 0.86 0.88 0.88 0.84 0.89 0.84 0.8
  0.74 0.67]
 [0.97 0.97 0.87 1.   0.97 0.85 0.98 0.94 0.95 0.93 0.92 0.91 0.9  0.89
  0.78 0.74]
 [0.98 0.99 0.88 0.97 1.   0.86 0.95 0.96 0.95 0.95 0.93 0.93 0.92 0.9
  0.8  0.74]
 [0.86 0.87 0.77 0.85 0.86 1.   0.84 0.82 0.85 0.82 0.86 0.81 0.81 0.77
  0.7  0.67]
 [0.96 0.96 0.86 0.98 0.95 0.84 1.   0.92 0.93 0.91 0.91 0.89 0.88 0.87
  0.78 0.71]
 [0.95 0.96 0.86 0.94 0.96 0.82 0.92 1.   0.92 0.92 0.91 0.9  0.9  0.87
  0.78 0.73]
 [0.94 0.95 0.88 0.95 0.95 0.85 0.93 0.92 1.   0.91 0.9  0.93 0.88 0.86
  0.79 0.74]
 [0.94 0.95 0.88 0.93 0.95 0.82 0.91 0.92 0.91 1.   0.9  0.88 0.89 0.85
  0.77 0.7 ]
 [0.94 0.95 0.84 0.92 0.93 0.86 0.91 0.91 0.9  0.9  1.   0.88 0.88 0.87
  0.85 0.69]
 [0.93 0.93 0.89 0.91 0.93 0.81 0.89 0.9  0.93 0.88 0.88 1.   0.87 0.84
  0.76 0.71]
 [0.92 0.92 0.84 0.9  0.92 0.81 0.88 0.9  0.88 0.89 0.88 0.87 1.   0.82
  0.74 0.73]
 [0.9  0.9  0.8  0.89 0.9  0.77 0.87 0.87 0.86 0.85 0.87 0.84 0.82 1.
  0.71 0.65]
 [0.79 0.81 0.74 0.78 0.8  0.7  0.78 0.78 0.79 0.77 0.85 0.76 0.74 0.71
  1.   0.63]
 [0.76 0.74 0.67 0.74 0.74 0.67 0.71 0.73 0.74 0.7  0.69 0.71 0.73 0.65
  0.63 1.  ]]
Collapsing 1 & 4 with crosscontam 0.7665061942222795 and sim 0.9900897626871592
Collapsing 0 & 1 with crosscontam 0.9361093710137949 and sim 0.9894096619633175
Collapsing 0 & 4 with crosscontam 0.7133592356064393 and sim 0.9766816955409298
Collapsing 3 & 6 with crosscontam 0.8044190717347908 and sim 0.9755247623793726
Collapsing 1 & 3 with crosscontam 0.6093026824689312 and sim 0.9746891916469241
Collapsing 0 & 3 with crosscontam 0.6329675516126341 and sim 0.9746452167704078
Collapsing 3 & 4 with crosscontam 0.4263454085279128 and sim 0.9673012546585659
Collapsing 4 & 7 with crosscontam 0.5067821911897885 and sim 0.959819628157424
Collapsing 1 & 6 with crosscontam 0.4842867682242311 and sim 0.9584543407261987
Collapsing 1 & 7 with crosscontam 0.6183413116940444 and sim 0.956550193728559
Collapsing 0 & 6 with crosscontam 0.4642755087874384 and sim 0.9554151740551935
Collapsing 3 & 8 with crosscontam 0.7474374847853178 and sim 0.9537181086019021
Collapsing 1 & 9 with crosscontam 0.38425092226211444 and sim 0.9529551723329337
Collapsing 1 & 8 with crosscontam 0.45271300402681636 and sim 0.9522150869510482
Collapsing 0 & 7 with crosscontam 0.6327198204306552 and sim 0.9513971044563515
Collapsing 4 & 6 with crosscontam 0.3036860352532218 and sim 0.9505662130871853
Collapsing 4 & 9 with crosscontam 0.2540041643852937 and sim 0.9459149640038144
Collapsing 4 & 8 with crosscontam 0.2956590633819096 and sim 0.9450704457675476
Collapsing 1 & 10 with crosscontam 0.4874420655138465 and sim 0.9450321415380756
Collapsing 0 & 9 with crosscontam 0.38632210240325904 and sim 0.9437877057118603
Collapsing 0 & 8 with crosscontam 0.4425479817083018 and sim 0.9434343063440589
Collapsing 0 & 10 with crosscontam 0.47862035716410656 and sim 0.9383834514673268
Collapsing 3 & 7 with crosscontam 0.5755438728210172 and sim 0.9382284085464258
Collapsing 4 & 10 with crosscontam 0.31446725035236267 and sim 0.9343311918149648
Collapsing 3 & 9 with crosscontam 0.5795219826743843 and sim 0.9307495663766271
Collapsing 0 & 11 with crosscontam 0.4836029990120547 and sim 0.9279448887720827
Collapsing 1 & 11 with crosscontam 0.4544541359770511 and sim 0.9269371458772888
Collapsing 4 & 11 with crosscontam 0.34372793489511366 and sim 0.9260732924035675
Collapsing 6 & 8 with crosscontam 0.6648143586658297 and sim 0.9258338864330047
Collapsing 8 & 11 with crosscontam 0.6250711197086936 and sim 0.9255085722969743
Collapsing 7 & 9 with crosscontam 0.42779954189451996 and sim 0.9242397868553989
Collapsing 1 & 12 with crosscontam 0.25890718260267676 and sim 0.9229117149781816
Collapsing 3 & 10 with crosscontam 0.6478181270701595 and sim 0.9222385949624371
Collapsing 7 & 8 with crosscontam 0.4739537237225844 and sim 0.9187039153473129
Collapsing 6 & 7 with crosscontam 0.45802065747825127 and sim 0.9183018518182773
Collapsing 0 & 12 with crosscontam 0.3009284535896066 and sim 0.9163153573835731
Collapsing 6 & 9 with crosscontam 0.6509395272150376 and sim 0.9127546279300751
Collapsing 3 & 11 with crosscontam 0.6028490439660883 and sim 0.911003797946603
Collapsing 8 & 9 with crosscontam 0.5548889942658253 and sim 0.9104594612909978
Collapsing 7 & 10 with crosscontam 0.47763785633608125 and sim 0.9068474408864401
Collapsing 6 & 10 with crosscontam 0.507962962962963 and sim 0.9067238714572752
Collapsing 3 & 12 with crosscontam 0.5415939793850735 and sim 0.9045336226709121
Collapsing 8 & 10 with crosscontam 0.5705601851851851 and sim 0.9038725940243383
Collapsing 9 & 10 with crosscontam 0.4471452991452991 and sim 0.9003367090006569
Collapsing 9 & 12 with crosscontam 0.6446473636556282 and sim 0.8941608077788028
Collapsing 2 & 11 with crosscontam 0.5779984870534083 and sim 0.8885583566343448
Collapsing 2 & 8 with crosscontam 0.5277122184355887 and sim 0.8800909818337846
Collapsing 6 & 12 with crosscontam 0.6201369256216194 and sim 0.8800050339392653
Collapsing 10 & 11 with crosscontam 0.5118860398860399 and sim 0.8786489239681509
Trimming eliminated 0 seqlets out of 240
Trimming eliminated 0 seqlets out of 485
Trimming eliminated 0 seqlets out of 207
Trimming eliminated 0 seqlets out of 692
Trimming eliminated 0 seqlets out of 770
Skipped 1 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 833
Skipped 1 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 871
Trimming eliminated 0 seqlets out of 901
Trimming eliminated 0 seqlets out of 927
Trimming eliminated 0 seqlets out of 949
Trimming eliminated 0 seqlets out of 1091
Skipped 4 seqlets that went over the sequence edge during flank expansion
Unmerged patterns remapping: OrderedDict([(5, 1), (13, 2), (14, 3), (15, 4)])
Time spent on merging iteration: 9.759464025497437
On merging iteration 2
Numbers for each pattern pre-subsample: [1087, 89, 15, 5, 2]
Numbers after subsampling: [300, 89, 15, 5, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.553689479827881 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.2749452590942383 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.11996793746948242 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.0755770206451416 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.0667886734008789 s
Cluster sizes
[1087   89   15    5    2]
Cross-contamination matrix:
[[1.   0.66 0.77 0.61 0.48]
 [0.3  1.   0.22 0.17 0.19]
 [0.38 0.23 1.   0.17 0.19]
 [0.03 0.01 0.01 1.   0.  ]
 [0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.86 0.91 0.8  0.77]
 [0.86 1.   0.77 0.7  0.67]
 [0.91 0.77 1.   0.71 0.65]
 [0.8  0.7  0.71 1.   0.63]
 [0.77 0.67 0.65 0.63 1.  ]]
Collapsing 0 & 2 with crosscontam 0.37886814814814795 and sim 0.9136688326268074
Trimming eliminated 0 seqlets out of 1102
Unmerged patterns remapping: OrderedDict([(1, 1), (3, 2), (4, 3)])
Time spent on merging iteration: 0.9997687339782715
On merging iteration 3
Numbers for each pattern pre-subsample: [1102, 89, 5, 2]
Numbers after subsampling: [300, 89, 5, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 1.8700766563415527 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.30229616165161133 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.08498620986938477 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.07151293754577637 s
Cluster sizes
[1102   89    5    2]
Cross-contamination matrix:
[[1.   0.66 0.61 0.47]
 [0.3  1.   0.17 0.19]
 [0.03 0.01 1.   0.  ]
 [0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.86 0.8  0.76]
 [0.86 1.   0.7  0.67]
 [0.8  0.7  1.   0.63]
 [0.76 0.67 0.63 1.  ]]
Inspecting pattern 1 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 877 out of 884 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done 884 out of 884 | elapsed:    1.9s finished
[t-SNE] Computed conditional probabilities for sample 884 / 884
[t-SNE] Mean sigma: 0.260749
Beginning preprocessing + Leiden
Affmat shape: 884
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   36.0s
Quality: 0.47667312545956225
Quality: 0.47670287630566616
Quality: 0.4770969695226056
Quality: 0.47713399664839873
Quality: 0.4773697747025852
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   43.1s finished
Got subclusters: Counter({0: 201, 1: 180, 2: 141, 3: 88, 4: 82, 5: 56, 6: 47, 7: 34, 8: 28, 9: 27})
On merging iteration 1
Numbers for each pattern pre-subsample: [201, 180, 141, 88, 82, 56, 47, 34, 28, 27]
Numbers after subsampling: [201, 180, 141, 88, 82, 56, 47, 34, 28, 27]
Computing sims for pattern 0
Computed sims for pattern 0 in 3.474745035171509 s
Computing sims for pattern 1
Computed sims for pattern 1 in 4.355949640274048 s
Computing sims for pattern 2
Computed sims for pattern 2 in 2.9694976806640625 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.7957398891448975 s
Computing sims for pattern 4
Computed sims for pattern 4 in 2.3462276458740234 s
Computing sims for pattern 5
Computed sims for pattern 5 in 1.4226832389831543 s
Computing sims for pattern 6
Computed sims for pattern 6 in 1.3770601749420166 s
Computing sims for pattern 7
Computed sims for pattern 7 in 1.2855327129364014 s
Computing sims for pattern 8
Computed sims for pattern 8 in 1.0542638301849365 s
Computing sims for pattern 9
Computed sims for pattern 9 in 1.0290346145629883 s
Cluster sizes
[201 180 141  88  82  56  47  34  28  27]
Cross-contamination matrix:
[[1.   0.6  0.4  0.27 0.4  0.32 0.38 0.65 0.24 0.16]
 [0.87 1.   0.52 0.35 0.72 0.49 0.56 0.67 0.38 0.17]
 [0.69 0.58 1.   0.32 0.43 0.39 0.4  0.53 0.31 0.17]
 [1.   0.95 0.87 1.   0.85 0.8  0.86 0.92 0.71 0.55]
 [0.82 0.9  0.52 0.36 1.   0.55 0.57 0.65 0.45 0.19]
 [1.   0.99 0.77 0.57 0.88 1.   0.78 0.92 0.65 0.35]
 [1.   0.96 0.76 0.66 0.85 0.76 1.   0.89 0.67 0.44]
 [0.87 0.67 0.49 0.35 0.52 0.47 0.49 1.   0.33 0.22]
 [0.89 0.83 0.7  0.56 0.79 0.72 0.71 0.76 1.   0.39]
 [0.9  0.59 0.6  0.55 0.56 0.55 0.58 0.73 0.5  1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.95 0.91 0.95 0.94 0.95 0.95 0.96 0.92 0.91]
 [0.95 1.   0.9  0.92 0.97 0.95 0.95 0.93 0.91 0.83]
 [0.91 0.9  1.   0.89 0.87 0.89 0.89 0.89 0.85 0.83]
 [0.95 0.92 0.89 1.   0.89 0.9  0.91 0.91 0.86 0.88]
 [0.94 0.97 0.87 0.89 1.   0.93 0.92 0.91 0.9  0.82]
 [0.95 0.95 0.89 0.9  0.93 1.   0.92 0.93 0.9  0.85]
 [0.95 0.95 0.89 0.91 0.92 0.92 1.   0.93 0.89 0.86]
 [0.96 0.93 0.89 0.91 0.91 0.93 0.93 1.   0.89 0.88]
 [0.92 0.91 0.85 0.86 0.9  0.9  0.89 0.89 1.   0.83]
 [0.91 0.83 0.83 0.88 0.82 0.85 0.86 0.88 0.83 1.  ]]
Collapsing 1 & 4 with crosscontam 0.7208213305898492 and sim 0.9673860096366191
Collapsing 0 & 7 with crosscontam 0.6524842043529713 and sim 0.9617573810351844
Collapsing 0 & 6 with crosscontam 0.3789243908466926 and sim 0.9541019144920482
Collapsing 0 & 1 with crosscontam 0.5966892625585498 and sim 0.9537574679076041
Collapsing 1 & 5 with crosscontam 0.48756476582402497 and sim 0.9537142276120001
Collapsing 1 & 6 with crosscontam 0.5602760558036366 and sim 0.9519236784282666
Collapsing 0 & 5 with crosscontam 0.3215494060810804 and sim 0.9507247207946322
Collapsing 0 & 3 with crosscontam 0.27408197541491863 and sim 0.9457238020895002
Collapsing 0 & 4 with crosscontam 0.40120896254341587 and sim 0.9353607337301877
Collapsing 1 & 7 with crosscontam 0.6678025420087297 and sim 0.9338411167845486
Collapsing 5 & 7 with crosscontam 0.4656705242650694 and sim 0.9307872519867986
Collapsing 4 & 5 with crosscontam 0.5479522880233474 and sim 0.9300056112862165
Collapsing 6 & 7 with crosscontam 0.49395438069212805 and sim 0.9267146133128468
Collapsing 4 & 6 with crosscontam 0.5732090117362247 and sim 0.9247339938565848
Collapsing 5 & 6 with crosscontam 0.7633541150391943 and sim 0.9237223208368156
Collapsing 0 & 8 with crosscontam 0.2445446551835968 and sim 0.9170665422823028
Collapsing 1 & 3 with crosscontam 0.3539941856840003 and sim 0.9157836354675772
Collapsing 4 & 7 with crosscontam 0.5156627762084662 and sim 0.9146973198044483
Collapsing 0 & 2 with crosscontam 0.4011551774002655 and sim 0.9142434816568242
Collapsing 3 & 6 with crosscontam 0.6630086344504151 and sim 0.9138087838052653
Collapsing 3 & 7 with crosscontam 0.3475276631571156 and sim 0.9115189873603472
Collapsing 1 & 8 with crosscontam 0.37619547325102887 and sim 0.9098604470362202
Collapsing 5 & 8 with crosscontam 0.6541512650978758 and sim 0.901528633623206
Collapsing 1 & 2 with crosscontam 0.5181303933300256 and sim 0.8984280033239415
Collapsing 3 & 5 with crosscontam 0.5718843696163531 and sim 0.8977166688330109
Collapsing 6 & 8 with crosscontam 0.6707555853987763 and sim 0.8935255394920085
Collapsing 3 & 9 with crosscontam 0.5509161112727641 and sim 0.8760802705206209
Collapsing 3 & 8 with crosscontam 0.5609138947786907 and sim 0.8637285237535959
Trimming eliminated 0 seqlets out of 262
Trimming eliminated 0 seqlets out of 235
Trimming eliminated 0 seqlets out of 282
Trimming eliminated 0 seqlets out of 544
Trimming eliminated 0 seqlets out of 600
Trimming eliminated 0 seqlets out of 688
Trimming eliminated 0 seqlets out of 716
Removed 1 duplicate seqlets
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 855
Trimming eliminated 0 seqlets out of 882
Unmerged patterns remapping: OrderedDict()
Time spent on merging iteration: 7.015376567840576
On merging iteration 2
Numbers for each pattern pre-subsample: [882]
Numbers after subsampling: [300]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.2411346435546875e-05 s
Cluster sizes
[882]
Cross-contamination matrix:
[[1.]]
Pattern-to-pattern sim matrix:
[[1.]]
Inspecting pattern 2 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 592 out of 592 | elapsed:    0.9s finished
[t-SNE] Computed conditional probabilities for sample 592 / 592
[t-SNE] Mean sigma: 0.320591
Beginning preprocessing + Leiden
Affmat shape: 592
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   26.7s
Quality: 0.5086081924835515
Quality: 0.5086305745113916
Quality: 0.5086733049676823
Got subclusters: Counter({0: 151, 1: 142, 2: 114, 3: 98, 4: 43, 5: 28, 6: 14, 7: 2})
On merging iteration 1
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   31.5s finished
Numbers for each pattern pre-subsample: [151, 142, 114, 98, 43, 28, 14, 2]
Numbers after subsampling: [151, 142, 114, 98, 43, 28, 14, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.5122036933898926 s
Computing sims for pattern 1
Computed sims for pattern 1 in 2.1562893390655518 s
Computing sims for pattern 2
Computed sims for pattern 2 in 2.0147738456726074 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.5294322967529297 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.9523284435272217 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.7795848846435547 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.598092794418335 s
Computing sims for pattern 7
Computed sims for pattern 7 in 0.24756669998168945 s
Cluster sizes
[151 142 114  98  43  28  14   2]
Cross-contamination matrix:
[[1.   0.71 0.78 0.44 0.06 0.24 0.32 0.06]
 [0.82 1.   0.85 0.43 0.03 0.12 0.19 0.02]
 [0.6  0.56 1.   0.2  0.03 0.17 0.18 0.01]
 [0.78 0.68 0.58 1.   0.16 0.29 0.43 0.14]
 [0.26 0.14 0.27 0.2  1.   0.24 0.2  1.  ]
 [0.8  0.5  0.79 0.45 0.3  1.   0.58 0.17]
 [0.66 0.32 0.52 0.38 0.06 0.24 1.   0.03]
 [0.   0.   0.   0.   0.27 0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.82 0.8  0.72 0.26 0.57 0.67 0.24]
 [0.82 1.   0.75 0.68 0.25 0.46 0.35 0.19]
 [0.8  0.75 1.   0.52 0.21 0.49 0.53 0.2 ]
 [0.72 0.68 0.52 1.   0.2  0.36 0.44 0.18]
 [0.26 0.25 0.21 0.2  1.   0.39 0.29 0.89]
 [0.57 0.46 0.49 0.36 0.39 1.   0.38 0.29]
 [0.67 0.35 0.53 0.44 0.29 0.38 1.   0.33]
 [0.24 0.19 0.2  0.18 0.89 0.29 0.33 1.  ]]
Inspecting pattern 3 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 517 out of 517 | elapsed:    0.7s finished
[t-SNE] Computed conditional probabilities for sample 517 / 517
[t-SNE] Mean sigma: 0.265747
Beginning preprocessing + Leiden
Affmat shape: 517
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   25.7s
Quality: 0.4215357708498422
Got subclusters: Counter({0: 132, 1: 100, 2: 98, 3: 85, 4: 75, 5: 17, 6: 10})
On merging iteration 1
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   30.5s finished
Numbers for each pattern pre-subsample: [132, 100, 98, 85, 75, 17, 10]
Numbers after subsampling: [132, 100, 98, 85, 75, 17, 10]
Computing sims for pattern 0
Computed sims for pattern 0 in 1.610112190246582 s
Computing sims for pattern 1
Computed sims for pattern 1 in 1.39109206199646 s
Computing sims for pattern 2
Computed sims for pattern 2 in 1.3290481567382812 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.6127455234527588 s
Computing sims for pattern 4
Computed sims for pattern 4 in 1.3271996974945068 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.49945545196533203 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.4165019989013672 s
Cluster sizes
[132 100  98  85  75  17  10]
Cross-contamination matrix:
[[1.   0.87 0.72 0.62 0.87 0.67 0.54]
 [0.44 1.   0.68 0.43 0.61 0.38 0.21]
 [0.42 0.83 1.   0.47 0.62 0.42 0.23]
 [0.72 0.99 0.87 1.   0.93 0.49 0.4 ]
 [0.5  0.7  0.56 0.43 1.   0.26 0.24]
 [0.48 0.6  0.54 0.24 0.4  1.   0.24]
 [0.55 0.59 0.45 0.33 0.61 0.4  1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.88 0.81 0.85 0.88 0.88 0.87]
 [0.88 1.   0.91 0.87 0.91 0.85 0.82]
 [0.81 0.91 1.   0.81 0.84 0.85 0.71]
 [0.85 0.87 0.81 1.   0.86 0.76 0.71]
 [0.88 0.91 0.84 0.86 1.   0.78 0.82]
 [0.88 0.85 0.85 0.76 0.78 1.   0.76]
 [0.87 0.82 0.71 0.71 0.82 0.76 1.  ]]
Collapsing 1 & 4 with crosscontam 0.6133555733333333 and sim 0.9091366710642803
Collapsing 1 & 2 with crosscontam 0.6805592653061225 and sim 0.9064105525512718
Collapsing 0 & 4 with crosscontam 0.5008728350168348 and sim 0.882909953455935
Collapsing 0 & 6 with crosscontam 0.5393756782703063 and sim 0.8747228901385056
Aborting collapse as 2 & 6 have cross-contam 0.3414266164608285 and sim 0.708312679625633
Collapsing 0 & 3 with crosscontam 0.6169264554473597 and sim 0.8544817214029178
Trimming eliminated 0 seqlets out of 175
Trimming eliminated 0 seqlets out of 273
Trimming eliminated 0 seqlets out of 405
Skipped 23 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 464
Skipped 107 seqlets that went over the sequence edge during flank expansion
Unmerged patterns remapping: OrderedDict([(5, 1), (6, 2)])
Time spent on merging iteration: 2.5687999725341797
On merging iteration 2
Numbers for each pattern pre-subsample: [357, 17, 10]
Numbers after subsampling: [300, 17, 10]
Computing sims for pattern 0
Computed sims for pattern 0 in 1.3670148849487305 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.1457960605621338 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.11361408233642578 s
Cluster sizes
[357  17  10]
Cross-contamination matrix:
[[1.   0.75 0.6 ]
 [0.46 1.   0.24]
 [0.51 0.4  1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.89 0.85]
 [0.89 1.   0.76]
 [0.85 0.76 1.  ]]
Inspecting pattern 4 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 458 out of 458 | elapsed:    0.6s finished
[t-SNE] Computed conditional probabilities for sample 458 / 458
[t-SNE] Mean sigma: 0.302673
Beginning preprocessing + Leiden
Affmat shape: 458
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   24.6s
Quality: 0.44792454849730057
Quality: 0.447942791146518
Quality: 0.448176746757894
Quality: 0.44855873050336187
Quality: 0.44940564299037006
Quality: 0.44963120001191065
Quality: 0.4497066442344871
Got subclusters: Counter({0: 112, 1: 111, 2: 97, 3: 55, 4: 36, 5: 12, 6: 10, 8: 8, 7: 8, 9: 7, 10: 2})
On merging iteration 1
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   29.0s finished
Numbers for each pattern pre-subsample: [112, 111, 97, 55, 36, 12, 10, 8, 8, 7, 2]
Numbers after subsampling: [112, 111, 97, 55, 36, 12, 10, 8, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.7441911697387695 s
Computing sims for pattern 1
Computed sims for pattern 1 in 2.6525611877441406 s
Computing sims for pattern 2
Computed sims for pattern 2 in 2.084895133972168 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.6866869926452637 s
Computing sims for pattern 4
Computed sims for pattern 4 in 1.36021089553833 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.7050774097442627 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.6231496334075928 s
Computing sims for pattern 7
Computed sims for pattern 7 in 0.551872968673706 s
Computing sims for pattern 8
Computed sims for pattern 8 in 0.533191442489624 s
Computing sims for pattern 9
Computed sims for pattern 9 in 0.4920511245727539 s
Computing sims for pattern 10
Computed sims for pattern 10 in 0.2679424285888672 s
Cluster sizes
[112 111  97  55  36  12  10   8   8   7   2]
Cross-contamination matrix:
[[1.   0.72 0.7  0.39 0.4  0.25 0.35 0.57 0.34 0.28 0.43]
 [0.45 1.   0.39 0.18 0.41 0.13 0.19 0.41 0.28 0.06 0.22]
 [0.87 0.83 1.   0.33 0.68 0.37 0.42 0.68 0.46 0.26 0.48]
 [0.46 0.52 0.28 1.   0.24 0.14 0.28 0.34 0.32 0.16 0.13]
 [0.38 0.67 0.51 0.18 1.   0.21 0.3  0.48 0.34 0.11 0.31]
 [0.68 0.83 0.76 0.52 0.72 1.   0.57 0.69 0.62 0.58 0.56]
 [0.54 0.76 0.5  0.34 0.48 0.19 1.   0.58 0.43 0.09 0.36]
 [0.39 0.55 0.35 0.13 0.33 0.11 0.21 1.   0.23 0.07 0.14]
 [0.18 0.4  0.19 0.13 0.2  0.05 0.12 0.23 1.   0.02 0.04]
 [0.54 0.52 0.4  0.35 0.32 0.37 0.26 0.39 0.29 1.   0.29]
 [0.09 0.11 0.08 0.   0.   0.   0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.89 0.89 0.84 0.76 0.69 0.79 0.82 0.75 0.69 0.64]
 [0.89 1.   0.9  0.83 0.89 0.79 0.88 0.91 0.84 0.67 0.69]
 [0.89 0.9  1.   0.73 0.87 0.8  0.81 0.84 0.79 0.58 0.65]
 [0.84 0.83 0.73 1.   0.67 0.62 0.72 0.74 0.71 0.58 0.53]
 [0.76 0.89 0.87 0.67 1.   0.74 0.8  0.82 0.76 0.54 0.65]
 [0.69 0.79 0.8  0.62 0.74 1.   0.68 0.72 0.7  0.62 0.57]
 [0.79 0.88 0.81 0.72 0.8  0.68 1.   0.8  0.74 0.55 0.66]
 [0.82 0.91 0.84 0.74 0.82 0.72 0.8  1.   0.76 0.6  0.63]
 [0.75 0.84 0.79 0.71 0.76 0.7  0.74 0.76 1.   0.56 0.51]
 [0.69 0.67 0.58 0.58 0.54 0.62 0.55 0.6  0.56 1.   0.51]
 [0.64 0.69 0.65 0.53 0.65 0.57 0.66 0.63 0.51 0.51 1.  ]]
Collapsing 1 & 7 with crosscontam 0.4148030426335758 and sim 0.9103764003893328
Collapsing 1 & 2 with crosscontam 0.39226355194817364 and sim 0.9013274050458031
Collapsing 0 & 2 with crosscontam 0.6991201574287671 and sim 0.8891089932917386
Collapsing 2 & 4 with crosscontam 0.5076320832095937 and sim 0.8654285726070339
Trimming eliminated 0 seqlets out of 119
Trimming eliminated 0 seqlets out of 216
Trimming eliminated 0 seqlets out of 328
Skipped 33 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 331
Unmerged patterns remapping: OrderedDict([(3, 1), (5, 2), (6, 3), (8, 4), (9, 5), (10, 6)])
Time spent on merging iteration: 1.9417147636413574
On merging iteration 2
Numbers for each pattern pre-subsample: [331, 55, 12, 10, 8, 7, 2]
Numbers after subsampling: [300, 55, 12, 10, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 3.8693764209747314 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.24571537971496582 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.1117398738861084 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.10693573951721191 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.10177755355834961 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.08361387252807617 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.06667828559875488 s
Cluster sizes
[331  55  12  10   8   7   2]
Cross-contamination matrix:
[[1.   0.51 0.43 0.62 0.62 0.33 0.59]
 [0.4  1.   0.14 0.28 0.32 0.16 0.13]
 [0.76 0.52 1.   0.57 0.62 0.58 0.56]
 [0.6  0.34 0.19 1.   0.43 0.09 0.36]
 [0.26 0.13 0.05 0.12 1.   0.02 0.04]
 [0.47 0.35 0.37 0.26 0.29 1.   0.29]
 [0.08 0.   0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.82 0.8  0.87 0.83 0.66 0.69]
 [0.82 1.   0.62 0.72 0.71 0.58 0.53]
 [0.8  0.62 1.   0.68 0.7  0.62 0.57]
 [0.87 0.72 0.68 1.   0.74 0.55 0.66]
 [0.83 0.71 0.7  0.74 1.   0.56 0.51]
 [0.66 0.58 0.62 0.55 0.56 1.   0.51]
 [0.69 0.53 0.57 0.66 0.51 0.51 1.  ]]
Collapsing 0 & 3 with crosscontam 0.6003200000000002 and sim 0.8690583254949964
Trimming eliminated 0 seqlets out of 341
Unmerged patterns remapping: OrderedDict([(1, 1), (2, 2), (4, 3), (5, 4), (6, 5)])
Time spent on merging iteration: 0.3192868232727051
On merging iteration 3
Numbers for each pattern pre-subsample: [341, 55, 12, 8, 7, 2]
Numbers after subsampling: [300, 55, 12, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 3.148636817932129 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.24092864990234375 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.11616969108581543 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.10202264785766602 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.09877777099609375 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.07165169715881348 s
Cluster sizes
[341  55  12   8   7   2]
Cross-contamination matrix:
[[1.   0.52 0.44 0.64 0.34 0.6 ]
 [0.4  1.   0.14 0.32 0.16 0.13]
 [0.75 0.52 1.   0.62 0.58 0.56]
 [0.25 0.13 0.05 1.   0.02 0.04]
 [0.46 0.35 0.37 0.29 1.   0.29]
 [0.08 0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.82 0.8  0.83 0.66 0.69]
 [0.82 1.   0.62 0.71 0.58 0.53]
 [0.8  0.62 1.   0.7  0.62 0.57]
 [0.83 0.71 0.7  1.   0.56 0.51]
 [0.66 0.58 0.62 0.56 1.   0.51]
 [0.69 0.53 0.57 0.51 0.51 1.  ]]
Merging on 22 clusters
MEMORY 14.247133184
On merging iteration 1
Numbers for each pattern pre-subsample: [1102, 89, 5, 2, 882, 151, 142, 114, 98, 43, 28, 14, 2, 357, 17, 10, 341, 55, 12, 8, 7, 2]
Numbers after subsampling: [300, 89, 5, 2, 300, 151, 142, 114, 98, 43, 28, 14, 2, 300, 17, 10, 300, 55, 12, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 14.187085628509521 s
Computing sims for pattern 1
Computed sims for pattern 1 in 4.300368547439575 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.9889605045318604 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.779677152633667 s
Computing sims for pattern 4
Computed sims for pattern 4 in 14.201063394546509 s
Computing sims for pattern 5
Computed sims for pattern 5 in 7.423771858215332 s
Computing sims for pattern 6
Computed sims for pattern 6 in 6.514514207839966 s
Computing sims for pattern 7
Computed sims for pattern 7 in 6.02603006362915 s
Computing sims for pattern 8
Computed sims for pattern 8 in 4.6973371505737305 s
Computing sims for pattern 9
Computed sims for pattern 9 in 3.1583895683288574 s
Computing sims for pattern 10
Computed sims for pattern 10 in 2.4132659435272217 s
Computing sims for pattern 11
Computed sims for pattern 11 in 1.8065330982208252 s
Computing sims for pattern 12
Computed sims for pattern 12 in 0.7158913612365723 s
Computing sims for pattern 13
Computed sims for pattern 13 in 14.120256185531616 s
Computing sims for pattern 14
Computed sims for pattern 14 in 1.7865524291992188 s
Computing sims for pattern 15
Computed sims for pattern 15 in 1.4853260517120361 s
Computing sims for pattern 16
Computed sims for pattern 16 in 13.684163093566895 s
Computing sims for pattern 17
Computed sims for pattern 17 in 4.000203847885132 s
Computing sims for pattern 18
Computed sims for pattern 18 in 1.6517527103424072 s
Computing sims for pattern 19
Computed sims for pattern 19 in 1.2630078792572021 s
Computing sims for pattern 20
Computed sims for pattern 20 in 1.1634864807128906 s
Computing sims for pattern 21
Computed sims for pattern 21 in 0.7910940647125244 s
Cluster sizes
[1102   89    5    2  882  151  142  114   98   43   28   14    2  357
   17   10  341   55   12    8    7    2]
Cross-contamination matrix:
[[1.   0.66 0.61 0.47 0.78 0.04 0.3  0.3  0.03 0.   0.   0.   0.   0.62
  0.63 0.43 0.73 0.53 0.34 0.6  0.13 0.44]
 [0.3  1.   0.17 0.19 0.2  0.02 0.07 0.09 0.01 0.   0.   0.   0.   0.18
  0.15 0.12 0.27 0.14 0.07 0.24 0.04 0.09]
 [0.03 0.01 1.   0.   0.02 0.   0.   0.02 0.   0.   0.   0.   0.   0.01
  0.03 0.   0.04 0.01 0.02 0.04 0.22 0.  ]
 [0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.81 0.5  0.44 0.5  1.   0.06 0.33 0.34 0.05 0.   0.   0.   0.   0.6
  0.7  0.36 0.62 0.41 0.24 0.46 0.13 0.28]
 [0.18 0.19 0.03 0.23 0.2  1.   0.71 0.78 0.44 0.06 0.24 0.32 0.06 0.27
  0.57 0.13 0.05 0.2  0.13 0.05 0.05 0.02]
 [0.63 0.52 0.57 0.61 0.65 0.82 1.   0.85 0.43 0.03 0.12 0.19 0.02 0.64
  0.94 0.48 0.53 0.56 0.3  0.45 0.37 0.36]
 [0.4  0.33 0.36 0.24 0.42 0.6  0.56 1.   0.2  0.03 0.17 0.18 0.01 0.48
  0.93 0.31 0.3  0.32 0.18 0.22 0.1  0.29]
 [0.44 0.37 0.33 0.37 0.47 0.78 0.68 0.58 1.   0.16 0.29 0.43 0.14 0.38
  0.48 0.32 0.37 0.35 0.32 0.3  0.23 0.35]
 [0.16 0.26 0.2  0.32 0.18 0.26 0.14 0.27 0.2  1.   0.24 0.2  1.   0.07
  0.17 0.06 0.2  0.15 0.13 0.55 0.22 0.42]
 [0.27 0.27 0.82 0.5  0.34 0.8  0.5  0.79 0.45 0.3  1.   0.58 0.17 0.41
  0.39 0.23 0.22 0.24 0.73 0.27 0.72 0.17]
 [0.06 0.03 0.08 0.02 0.05 0.66 0.32 0.52 0.38 0.06 0.24 1.   0.03 0.04
  0.05 0.03 0.05 0.09 0.07 0.04 0.05 0.02]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.27 0.   0.   1.   0.
  0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.57 0.4  0.34 0.33 0.54 0.13 0.32 0.4  0.06 0.   0.03 0.01 0.   1.
  0.75 0.6  0.43 0.33 0.19 0.38 0.14 0.26]
 [0.29 0.11 0.16 0.05 0.33 0.09 0.3  0.61 0.   0.   0.   0.   0.   0.46
  1.   0.24 0.16 0.09 0.03 0.09 0.01 0.06]
 [0.27 0.21 0.2  0.23 0.26 0.01 0.08 0.16 0.   0.   0.   0.   0.   0.51
  0.4  1.   0.19 0.15 0.11 0.25 0.05 0.19]
 [0.69 0.54 0.55 0.6  0.56 0.01 0.22 0.2  0.05 0.01 0.01 0.01 0.02 0.44
  0.41 0.35 1.   0.52 0.44 0.64 0.34 0.6 ]
 [0.36 0.26 0.27 0.18 0.27 0.03 0.14 0.13 0.01 0.   0.   0.   0.   0.21
  0.22 0.23 0.4  1.   0.14 0.32 0.16 0.13]
 [0.66 0.53 0.68 0.56 0.58 0.18 0.32 0.34 0.14 0.02 0.3  0.04 0.01 0.48
  0.51 0.4  0.75 0.52 1.   0.62 0.58 0.56]
 [0.23 0.16 0.24 0.13 0.13 0.   0.02 0.01 0.   0.   0.   0.   0.   0.11
  0.08 0.06 0.25 0.13 0.05 1.   0.02 0.04]
 [0.38 0.23 1.   0.39 0.35 0.   0.15 0.04 0.01 0.   0.19 0.   0.   0.26
  0.3  0.28 0.46 0.35 0.37 0.29 1.   0.29]
 [0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.08 0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.86 0.8  0.76 0.92 0.37 0.75 0.68 0.49 0.21 0.33 0.35 0.21 0.87
  0.83 0.85 0.92 0.8  0.76 0.82 0.67 0.7 ]
 [0.86 1.   0.7  0.67 0.78 0.35 0.62 0.58 0.39 0.24 0.28 0.27 0.16 0.74
  0.69 0.72 0.81 0.69 0.64 0.74 0.56 0.61]
 [0.8  0.7  1.   0.63 0.74 0.29 0.61 0.54 0.37 0.29 0.56 0.32 0.24 0.68
  0.67 0.68 0.77 0.66 0.67 0.71 0.88 0.62]
 [0.76 0.67 0.63 1.   0.75 0.42 0.68 0.52 0.45 0.34 0.42 0.3  0.31 0.66
  0.67 0.65 0.73 0.61 0.6  0.68 0.58 0.52]
 [0.92 0.78 0.74 0.75 1.   0.41 0.79 0.71 0.54 0.26 0.41 0.36 0.22 0.85
  0.86 0.77 0.86 0.75 0.71 0.74 0.67 0.66]
 [0.37 0.35 0.29 0.42 0.41 1.   0.82 0.8  0.72 0.26 0.57 0.67 0.24 0.46
  0.6  0.34 0.35 0.5  0.3  0.28 0.26 0.24]
 [0.75 0.62 0.61 0.68 0.79 0.82 1.   0.75 0.68 0.25 0.46 0.35 0.19 0.71
  0.74 0.63 0.67 0.69 0.5  0.59 0.56 0.42]
 [0.68 0.58 0.54 0.52 0.71 0.8  0.75 1.   0.52 0.21 0.49 0.53 0.2  0.75
  0.9  0.59 0.61 0.61 0.49 0.47 0.34 0.52]
 [0.49 0.39 0.37 0.45 0.54 0.72 0.68 0.52 1.   0.2  0.36 0.44 0.18 0.43
  0.5  0.37 0.41 0.42 0.37 0.31 0.25 0.37]
 [0.21 0.24 0.29 0.34 0.26 0.26 0.25 0.21 0.2  1.   0.39 0.29 0.89 0.16
  0.21 0.18 0.19 0.17 0.3  0.33 0.34 0.26]
 [0.33 0.28 0.56 0.42 0.41 0.57 0.46 0.49 0.36 0.39 1.   0.38 0.29 0.34
  0.35 0.28 0.28 0.3  0.56 0.3  0.62 0.28]
 [0.35 0.27 0.32 0.3  0.36 0.67 0.35 0.53 0.44 0.29 0.38 1.   0.33 0.29
  0.31 0.28 0.3  0.3  0.39 0.28 0.26 0.19]
 [0.21 0.16 0.24 0.31 0.22 0.24 0.19 0.2  0.18 0.89 0.29 0.33 1.   0.16
  0.19 0.18 0.19 0.12 0.31 0.26 0.25 0.16]
 [0.87 0.74 0.68 0.66 0.85 0.46 0.71 0.75 0.43 0.16 0.34 0.29 0.16 1.
  0.89 0.85 0.81 0.69 0.65 0.73 0.59 0.61]
 [0.83 0.69 0.67 0.67 0.86 0.6  0.74 0.9  0.5  0.21 0.35 0.31 0.19 0.89
  1.   0.76 0.76 0.67 0.62 0.65 0.58 0.59]
 [0.85 0.72 0.68 0.65 0.77 0.34 0.63 0.59 0.37 0.18 0.28 0.28 0.18 0.85
  0.76 1.   0.8  0.69 0.65 0.7  0.58 0.63]
 [0.92 0.81 0.77 0.73 0.86 0.35 0.67 0.61 0.41 0.19 0.28 0.3  0.19 0.81
  0.76 0.8  1.   0.82 0.8  0.83 0.66 0.69]
 [0.8  0.69 0.66 0.61 0.75 0.5  0.69 0.61 0.42 0.17 0.3  0.3  0.12 0.69
  0.67 0.69 0.82 1.   0.62 0.71 0.58 0.53]
 [0.76 0.64 0.67 0.6  0.71 0.3  0.5  0.49 0.37 0.3  0.56 0.39 0.31 0.65
  0.62 0.65 0.8  0.62 1.   0.7  0.62 0.57]
 [0.82 0.74 0.71 0.68 0.74 0.28 0.59 0.47 0.31 0.33 0.3  0.28 0.26 0.73
  0.65 0.7  0.83 0.71 0.7  1.   0.56 0.51]
 [0.67 0.56 0.88 0.58 0.67 0.26 0.56 0.34 0.25 0.34 0.62 0.26 0.25 0.59
  0.58 0.58 0.66 0.58 0.62 0.56 1.   0.51]
 [0.7  0.61 0.62 0.52 0.66 0.24 0.42 0.52 0.37 0.26 0.28 0.19 0.16 0.61
  0.59 0.63 0.69 0.53 0.57 0.51 0.51 1.  ]]
Collapsing 0 & 16 with crosscontam 0.6854722049382715 and sim 0.9233883441095919
Collapsing 0 & 4 with crosscontam 0.7821088138271606 and sim 0.9150010033144051
Collapsing 7 & 14 with crosscontam 0.6115890173224634 and sim 0.9028928001880183
Collapsing 0 & 13 with crosscontam 0.5656703708641977 and sim 0.8698389947881155
Collapsing 4 & 16 with crosscontam 0.5602700637037039 and sim 0.8555642993111345
Skipped 5 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 1438
Skipped 81 seqlets that went over the sequence edge during flank expansion
Skipped 4 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 2235
Skipped 81 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 131
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 2 seqlets that went over sequence edge during flank expansion
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 2505
Skipped 146 seqlets that went over the sequence edge during flank expansion
Unmerged patterns remapping: OrderedDict([(1, 5), (2, 14), (3, 15), (5, 1), (6, 2), (8, 4), (9, 7), (10, 8), (11, 9), (12, 16), (15, 11), (17, 6), (18, 10), (19, 12), (20, 13), (21, 17)])
Time spent on merging iteration: 11.327602863311768
On merging iteration 2
Numbers for each pattern pre-subsample: [2359, 151, 142, 131, 98, 89, 55, 43, 28, 14, 12, 10, 8, 7, 5, 2, 2, 2]
Numbers after subsampling: [300, 151, 142, 131, 98, 89, 55, 43, 28, 14, 12, 10, 8, 7, 5, 2, 2, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 11.164413452148438 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.8772587776184082 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.7360920906066895 s
Computing sims for pattern 3
Computed sims for pattern 3 in 4.403687953948975 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.5462625026702881 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.7667226791381836 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.4166250228881836 s
Computing sims for pattern 7
Computed sims for pattern 7 in 0.3210892677307129 s
Computing sims for pattern 8
Computed sims for pattern 8 in 0.28917598724365234 s
Computing sims for pattern 9
Computed sims for pattern 9 in 0.2213582992553711 s
Computing sims for pattern 10
Computed sims for pattern 10 in 0.20161747932434082 s
Computing sims for pattern 11
Computed sims for pattern 11 in 0.19224858283996582 s
Computing sims for pattern 12
Computed sims for pattern 12 in 0.17246055603027344 s
Computing sims for pattern 13
Computed sims for pattern 13 in 0.15851283073425293 s
Computing sims for pattern 14
Computed sims for pattern 14 in 0.13884472846984863 s
Computing sims for pattern 15
Computed sims for pattern 15 in 0.10401153564453125 s
Computing sims for pattern 16
Computed sims for pattern 16 in 0.10087108612060547 s
Computing sims for pattern 17
Computed sims for pattern 17 in 0.09780025482177734 s
Cluster sizes
[2359  151  142  131   98   89   55   43   28   14   12   10    8    7
    5    2    2    2]
Cross-contamination matrix:
[[1.   0.07 0.36 0.46 0.04 0.67 0.56 0.   0.01 0.   0.34 0.54 0.62 0.18
  0.61 0.72 0.   0.36]
 [0.2  1.   0.71 0.76 0.44 0.19 0.2  0.06 0.24 0.32 0.13 0.13 0.05 0.05
  0.03 0.23 0.06 0.02]
 [0.64 0.82 1.   0.86 0.43 0.52 0.56 0.03 0.12 0.19 0.3  0.48 0.45 0.37
  0.57 0.61 0.02 0.36]
 [0.46 0.58 0.58 1.   0.19 0.34 0.32 0.02 0.16 0.16 0.19 0.34 0.27 0.14
  0.36 0.29 0.01 0.27]
 [0.43 0.78 0.68 0.57 1.   0.37 0.35 0.16 0.29 0.43 0.32 0.32 0.3  0.23
  0.33 0.37 0.14 0.35]
 [0.25 0.02 0.07 0.09 0.01 1.   0.14 0.   0.   0.   0.07 0.12 0.24 0.04
  0.17 0.19 0.   0.09]
 [0.33 0.03 0.14 0.15 0.01 0.26 1.   0.   0.   0.   0.14 0.23 0.32 0.16
  0.27 0.18 0.   0.13]
 [0.17 0.26 0.14 0.26 0.2  0.26 0.15 1.   0.24 0.2  0.13 0.06 0.55 0.22
  0.2  0.32 1.   0.42]
 [0.33 0.8  0.5  0.78 0.45 0.27 0.24 0.3  1.   0.58 0.73 0.23 0.27 0.72
  0.82 0.5  0.17 0.17]
 [0.05 0.66 0.32 0.49 0.38 0.03 0.09 0.06 0.24 1.   0.07 0.03 0.04 0.05
  0.08 0.02 0.03 0.02]
 [0.62 0.18 0.32 0.36 0.14 0.53 0.52 0.02 0.3  0.04 1.   0.4  0.62 0.58
  0.68 0.56 0.01 0.56]
 [0.34 0.01 0.08 0.19 0.   0.21 0.15 0.   0.   0.   0.11 1.   0.25 0.05
  0.2  0.23 0.   0.19]
 [0.18 0.   0.02 0.03 0.   0.16 0.13 0.   0.   0.   0.05 0.06 1.   0.02
  0.24 0.13 0.   0.04]
 [0.37 0.   0.15 0.16 0.01 0.23 0.35 0.   0.19 0.   0.37 0.28 0.29 1.
  1.   0.39 0.   0.29]
 [0.03 0.   0.   0.02 0.   0.01 0.01 0.   0.   0.   0.02 0.   0.04 0.22
  1.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   1.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.27 0.   0.   0.   0.   0.   0.
  0.   0.   1.   0.  ]
 [0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.4  0.79 0.75 0.52 0.84 0.79 0.24 0.37 0.36 0.75 0.83 0.8  0.68
  0.79 0.78 0.22 0.7 ]
 [0.4  1.   0.82 0.78 0.72 0.35 0.5  0.26 0.57 0.67 0.3  0.34 0.28 0.26
  0.29 0.42 0.24 0.24]
 [0.79 0.82 1.   0.75 0.68 0.62 0.69 0.25 0.46 0.35 0.5  0.63 0.59 0.56
  0.61 0.68 0.19 0.42]
 [0.75 0.78 0.75 1.   0.5  0.6  0.62 0.19 0.47 0.49 0.51 0.63 0.55 0.48
  0.56 0.55 0.18 0.53]
 [0.52 0.72 0.68 0.5  1.   0.39 0.42 0.2  0.36 0.44 0.37 0.37 0.31 0.25
  0.37 0.45 0.18 0.37]
 [0.84 0.35 0.62 0.6  0.39 1.   0.69 0.24 0.28 0.27 0.64 0.72 0.74 0.56
  0.7  0.67 0.16 0.61]
 [0.79 0.5  0.69 0.62 0.42 0.69 1.   0.17 0.3  0.3  0.62 0.69 0.71 0.58
  0.66 0.61 0.12 0.53]
 [0.24 0.26 0.25 0.19 0.2  0.24 0.17 1.   0.39 0.29 0.3  0.18 0.33 0.34
  0.29 0.34 0.89 0.26]
 [0.37 0.57 0.46 0.47 0.36 0.28 0.3  0.39 1.   0.38 0.56 0.28 0.3  0.62
  0.56 0.42 0.29 0.28]
 [0.36 0.67 0.35 0.49 0.44 0.27 0.3  0.29 0.38 1.   0.39 0.28 0.28 0.26
  0.32 0.3  0.33 0.19]
 [0.75 0.3  0.5  0.51 0.37 0.64 0.62 0.3  0.56 0.39 1.   0.65 0.7  0.62
  0.67 0.6  0.31 0.57]
 [0.83 0.34 0.63 0.63 0.37 0.72 0.69 0.18 0.28 0.28 0.65 1.   0.7  0.58
  0.68 0.65 0.18 0.63]
 [0.8  0.28 0.59 0.55 0.31 0.74 0.71 0.33 0.3  0.28 0.7  0.7  1.   0.56
  0.71 0.68 0.26 0.51]
 [0.68 0.26 0.56 0.48 0.25 0.56 0.58 0.34 0.62 0.26 0.62 0.58 0.56 1.
  0.88 0.58 0.25 0.51]
 [0.79 0.29 0.61 0.56 0.37 0.7  0.66 0.29 0.56 0.32 0.67 0.68 0.71 0.88
  1.   0.63 0.24 0.62]
 [0.78 0.42 0.68 0.55 0.45 0.67 0.61 0.34 0.42 0.3  0.6  0.65 0.68 0.58
  0.63 1.   0.31 0.52]
 [0.22 0.24 0.19 0.18 0.18 0.16 0.12 0.89 0.29 0.33 0.31 0.18 0.26 0.25
  0.24 0.31 1.   0.16]
 [0.7  0.24 0.42 0.53 0.37 0.61 0.53 0.26 0.28 0.19 0.57 0.63 0.51 0.51
  0.62 0.52 0.16 1.  ]]
Got 18 patterns after merging
MEMORY 14.247153664
Performing filtering
MEMORY 14.247153664
Got 8 patterns after filtering
MEMORY 14.247153664
Total time taken is 3410.3s
MEMORY 14.247153664
Applying subclustering to the final motifs
On pattern 0
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 200 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1400 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 2359 out of 2359 | elapsed:   15.1s finished
/users/msharmin/anaconda2/envs/aitac/lib/python3.7/site-packages/sklearn/manifold/_t_sne.py:699: FutureWarning: 'square_distances' has been introduced in 0.24 to help phase out legacy squaring behavior. The 'legacy' setting will be removed in 1.1 (renaming of 0.26), and the default setting will be changed to True. In 1.3, 'square_distances' will be removed altogether, and distances will be squared by default. Set 'square_distances'=True to silence this warning.
  FutureWarning
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2359 samples in 0.067s...
[t-SNE] Computed neighbors for 2359 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2359
/users/msharmin/anaconda2/envs/aitac/lib/python3.7/site-packages/sklearn/neighbors/_base.py:176: EfficiencyWarning: Precomputed sparse input was not sorted by data.
  EfficiencyWarning)
[t-SNE] Computed conditional probabilities for sample 2000 / 2359
[t-SNE] Computed conditional probabilities for sample 2359 / 2359
[t-SNE] Mean sigma: 0.255017
[t-SNE] Computed conditional probabilities in 0.211s
[t-SNE] Iteration 50: error = 71.8576050, gradient norm = 0.0506708 (50 iterations in 0.521s)
[t-SNE] Iteration 100: error = 72.0109711, gradient norm = 0.0405584 (50 iterations in 0.506s)
[t-SNE] Iteration 150: error = 72.0096512, gradient norm = 0.0241647 (50 iterations in 0.600s)
[t-SNE] Iteration 200: error = 71.8638992, gradient norm = 0.0572998 (50 iterations in 0.501s)
[t-SNE] Iteration 250: error = 71.8495865, gradient norm = 0.0415361 (50 iterations in 0.398s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.849586
[t-SNE] Iteration 300: error = 1.9407815, gradient norm = 0.0012448 (50 iterations in 0.358s)
[t-SNE] Iteration 350: error = 1.8094492, gradient norm = 0.0003967 (50 iterations in 0.314s)
[t-SNE] Iteration 400: error = 1.7587214, gradient norm = 0.0002612 (50 iterations in 0.319s)
[t-SNE] Iteration 450: error = 1.7360786, gradient norm = 0.0001801 (50 iterations in 0.318s)
[t-SNE] Iteration 500: error = 1.7242937, gradient norm = 0.0001362 (50 iterations in 0.324s)
[t-SNE] Iteration 550: error = 1.7166767, gradient norm = 0.0001017 (50 iterations in 0.314s)
[t-SNE] Iteration 600: error = 1.7128193, gradient norm = 0.0001010 (50 iterations in 0.317s)
[t-SNE] Iteration 650: error = 1.7098056, gradient norm = 0.0000878 (50 iterations in 0.324s)
[t-SNE] Iteration 700: error = 1.7069721, gradient norm = 0.0000848 (50 iterations in 0.321s)
[t-SNE] Iteration 750: error = 1.7046078, gradient norm = 0.0000652 (50 iterations in 0.324s)
[t-SNE] Iteration 800: error = 1.7029958, gradient norm = 0.0000631 (50 iterations in 0.328s)
[t-SNE] Iteration 850: error = 1.7014551, gradient norm = 0.0000614 (50 iterations in 0.331s)
[t-SNE] Iteration 900: error = 1.7002703, gradient norm = 0.0000660 (50 iterations in 0.332s)
[t-SNE] Iteration 950: error = 1.6994607, gradient norm = 0.0000514 (50 iterations in 0.324s)
[t-SNE] Iteration 1000: error = 1.6988314, gradient norm = 0.0000472 (50 iterations in 0.320s)
[t-SNE] KL divergence after 1000 iterations: 1.698831
[t-SNE] Computed conditional probabilities for sample 1000 / 2359
[t-SNE] Computed conditional probabilities for sample 2000 / 2359
[t-SNE] Computed conditional probabilities for sample 2359 / 2359
[t-SNE] Mean sigma: 0.255017
Beginning preprocessing + Leiden
Affmat shape: 2359
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
Quality: 0.593916216812392
Quality: 0.5939603844100576
Quality: 0.5942641368176802
Quality: 0.5942881589214632
Quality: 0.5943249936000555
Quality: 0.5943407921382192
Quality: 0.5944230569119762
Quality: 0.5944436331565464
Quality: 0.5944940078482859
Quality: 0.5945992965933217
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.4min finished
Got subclusters: Counter({0: 516, 1: 346, 2: 243, 3: 202, 4: 185, 5: 156, 6: 153, 7: 135, 8: 108, 9: 98, 10: 79, 11: 44, 12: 42, 13: 29, 14: 23})
On pattern 1
[t-SNE] Computing 150 nearest neighbors...
[t-SNE] Indexed 151 samples in 0.003s...
[t-SNE] Computed neighbors for 151 samples in 0.001s...
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 151 out of 151 | elapsed:    0.2s finished
[t-SNE] Computed conditional probabilities for sample 151 / 151
[t-SNE] Mean sigma: 0.327978
[t-SNE] Computed conditional probabilities in 0.017s
[t-SNE] Iteration 50: error = 57.8178177, gradient norm = 0.5157831 (50 iterations in 0.104s)
[t-SNE] Iteration 100: error = 59.7636948, gradient norm = 0.4653271 (50 iterations in 0.068s)
[t-SNE] Iteration 150: error = 61.2096214, gradient norm = 0.5335827 (50 iterations in 0.050s)
[t-SNE] Iteration 200: error = 60.1617165, gradient norm = 0.4369144 (50 iterations in 0.046s)
[t-SNE] Iteration 250: error = 59.5524483, gradient norm = 0.4978087 (50 iterations in 0.046s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.552448
[t-SNE] Iteration 300: error = 0.6144239, gradient norm = 0.0190552 (50 iterations in 0.044s)
[t-SNE] Iteration 350: error = 0.4592904, gradient norm = 0.0028417 (50 iterations in 0.055s)
[t-SNE] Iteration 400: error = 0.4586868, gradient norm = 0.0002356 (50 iterations in 0.062s)
[t-SNE] Iteration 450: error = 0.4589675, gradient norm = 0.0002710 (50 iterations in 0.078s)
[t-SNE] Iteration 500: error = 0.4582903, gradient norm = 0.0003778 (50 iterations in 0.067s)
[t-SNE] Iteration 550: error = 0.4588707, gradient norm = 0.0004006 (50 iterations in 0.045s)
[t-SNE] Iteration 600: error = 0.4588754, gradient norm = 0.0002700 (50 iterations in 0.067s)
[t-SNE] Iteration 650: error = 0.4589237, gradient norm = 0.0003093 (50 iterations in 0.060s)
[t-SNE] Iteration 700: error = 0.4588911, gradient norm = 0.0002174 (50 iterations in 0.068s)
[t-SNE] Iteration 750: error = 0.4587632, gradient norm = 0.0002209 (50 iterations in 0.045s)
[t-SNE] Iteration 800: error = 0.4589559, gradient norm = 0.0002950 (50 iterations in 0.049s)
[t-SNE] Iteration 850: error = 0.4589023, gradient norm = 0.0003308 (50 iterations in 0.046s)
[t-SNE] Iteration 850: did not make any progress during the last 300 episodes. Finished.
[t-SNE] KL divergence after 850 iterations: 0.458902
[t-SNE] Computed conditional probabilities for sample 151 / 151
[t-SNE] Mean sigma: 0.327978
Beginning preprocessing + Leiden
Affmat shape: 151
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.8s
Quality: 0.21302277706576736
Quality: 0.21333140717878132
Quality: 0.21354697115367247
Quality: 0.21442567605855503
Quality: 0.21445476478614436
Quality: 0.2152511090684427
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   23.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 142 out of 142 | elapsed:    0.1s finished
Got subclusters: Counter({0: 56, 1: 44, 2: 36, 3: 7, 4: 5, 5: 3})
On pattern 2
[t-SNE] Computing 141 nearest neighbors...
[t-SNE] Indexed 142 samples in 0.003s...
[t-SNE] Computed neighbors for 142 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 142 / 142
[t-SNE] Mean sigma: 0.358361
[t-SNE] Computed conditional probabilities in 0.016s
[t-SNE] Iteration 50: error = 57.1499901, gradient norm = 0.5057760 (50 iterations in 0.069s)
[t-SNE] Iteration 100: error = 57.2434502, gradient norm = 0.5153511 (50 iterations in 0.054s)
[t-SNE] Iteration 150: error = 58.9423599, gradient norm = 0.4918348 (50 iterations in 0.044s)
[t-SNE] Iteration 200: error = 55.5323639, gradient norm = 0.5658602 (50 iterations in 0.048s)
[t-SNE] Iteration 250: error = 59.2097473, gradient norm = 0.4536256 (50 iterations in 0.044s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.209747
[t-SNE] Iteration 300: error = 0.7588489, gradient norm = 0.0071932 (50 iterations in 0.042s)
[t-SNE] Iteration 350: error = 0.4420862, gradient norm = 0.0043462 (50 iterations in 0.043s)
[t-SNE] Iteration 400: error = 0.4378024, gradient norm = 0.0005014 (50 iterations in 0.042s)
[t-SNE] Iteration 450: error = 0.4370647, gradient norm = 0.0002695 (50 iterations in 0.045s)
[t-SNE] Iteration 500: error = 0.4373764, gradient norm = 0.0002361 (50 iterations in 0.043s)
[t-SNE] Iteration 550: error = 0.4372952, gradient norm = 0.0002538 (50 iterations in 0.043s)
[t-SNE] Iteration 600: error = 0.4373324, gradient norm = 0.0001269 (50 iterations in 0.043s)
[t-SNE] Iteration 650: error = 0.4371422, gradient norm = 0.0002580 (50 iterations in 0.042s)
[t-SNE] Iteration 700: error = 0.4372155, gradient norm = 0.0002221 (50 iterations in 0.045s)
[t-SNE] Iteration 750: error = 0.4372273, gradient norm = 0.0001312 (50 iterations in 0.043s)
[t-SNE] Iteration 800: error = 0.4372534, gradient norm = 0.0000900 (50 iterations in 0.043s)
[t-SNE] Iteration 800: did not make any progress during the last 300 episodes. Finished.
[t-SNE] KL divergence after 800 iterations: 0.437253
[t-SNE] Computed conditional probabilities for sample 142 / 142
[t-SNE] Mean sigma: 0.358361
Beginning preprocessing + Leiden
Affmat shape: 142
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.9s
Quality: 0.22980334815671763
Quality: 0.2304595819171252
Quality: 0.23063870580149065
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   23.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 131 out of 131 | elapsed:    0.1s finished
Got subclusters: Counter({0: 41, 1: 31, 2: 30, 3: 25, 4: 15})
On pattern 3
[t-SNE] Computing 130 nearest neighbors...
[t-SNE] Indexed 131 samples in 0.003s...
[t-SNE] Computed neighbors for 131 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 131 / 131
[t-SNE] Mean sigma: 0.339956
[t-SNE] Computed conditional probabilities in 0.015s
[t-SNE] Iteration 50: error = 59.6914902, gradient norm = 0.4334767 (50 iterations in 0.098s)
[t-SNE] Iteration 100: error = 57.3789978, gradient norm = 0.5039366 (50 iterations in 0.043s)
[t-SNE] Iteration 150: error = 59.4751244, gradient norm = 0.4715656 (50 iterations in 0.042s)
[t-SNE] Iteration 200: error = 57.3927231, gradient norm = 0.5458401 (50 iterations in 0.045s)
[t-SNE] Iteration 250: error = 57.9529572, gradient norm = 0.5015671 (50 iterations in 0.043s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.952957
[t-SNE] Iteration 300: error = 0.8441527, gradient norm = 0.0079005 (50 iterations in 0.043s)
[t-SNE] Iteration 350: error = 0.4544646, gradient norm = 0.0158588 (50 iterations in 0.041s)
[t-SNE] Iteration 400: error = 0.3813778, gradient norm = 0.0011356 (50 iterations in 0.041s)
[t-SNE] Iteration 450: error = 0.3813468, gradient norm = 0.0001121 (50 iterations in 0.043s)
[t-SNE] Iteration 500: error = 0.3812933, gradient norm = 0.0001957 (50 iterations in 0.041s)
[t-SNE] Iteration 550: error = 0.3813325, gradient norm = 0.0001813 (50 iterations in 0.044s)
[t-SNE] Iteration 600: error = 0.3813050, gradient norm = 0.0002073 (50 iterations in 0.042s)
[t-SNE] Iteration 650: error = 0.3813581, gradient norm = 0.0001228 (50 iterations in 0.041s)
[t-SNE] Iteration 700: error = 0.3813081, gradient norm = 0.0001233 (50 iterations in 0.044s)
[t-SNE] Iteration 750: error = 0.3813029, gradient norm = 0.0000810 (50 iterations in 0.043s)
[t-SNE] Iteration 800: error = 0.3813598, gradient norm = 0.0001689 (50 iterations in 0.045s)
[t-SNE] Iteration 850: error = 0.3813156, gradient norm = 0.0001620 (50 iterations in 0.042s)
[t-SNE] Iteration 850: did not make any progress during the last 300 episodes. Finished.
[t-SNE] KL divergence after 850 iterations: 0.381316
[t-SNE] Computed conditional probabilities for sample 131 / 131
[t-SNE] Mean sigma: 0.339956
Beginning preprocessing + Leiden
Affmat shape: 131
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.9s
Quality: 0.2209828758193568
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   23.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  98 out of  98 | elapsed:    0.1s finished
Got subclusters: Counter({0: 42, 1: 37, 2: 31, 3: 13, 4: 6, 5: 2})
On pattern 4
[t-SNE] Computing 97 nearest neighbors...
[t-SNE] Indexed 98 samples in 0.003s...
[t-SNE] Computed neighbors for 98 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 98 / 98
[t-SNE] Mean sigma: 0.527132
[t-SNE] Computed conditional probabilities in 0.011s
[t-SNE] Iteration 50: error = 54.2331505, gradient norm = 0.5383500 (50 iterations in 0.070s)
[t-SNE] Iteration 100: error = 49.6971245, gradient norm = 0.6598901 (50 iterations in 0.042s)
[t-SNE] Iteration 150: error = 53.5617981, gradient norm = 0.5056719 (50 iterations in 0.033s)
[t-SNE] Iteration 200: error = 52.1685867, gradient norm = 0.4623906 (50 iterations in 0.033s)
[t-SNE] Iteration 250: error = 53.4839706, gradient norm = 0.4512546 (50 iterations in 0.034s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.483971
[t-SNE] Iteration 300: error = 0.7108304, gradient norm = 0.0070011 (50 iterations in 0.033s)
[t-SNE] Iteration 350: error = 0.5511788, gradient norm = 0.0014457 (50 iterations in 0.037s)
[t-SNE] Iteration 400: error = 0.5981809, gradient norm = 0.0018380 (50 iterations in 0.032s)
[t-SNE] Iteration 450: error = 0.5140861, gradient norm = 0.0008846 (50 iterations in 0.032s)
[t-SNE] Iteration 500: error = 0.4778329, gradient norm = 0.0006913 (50 iterations in 0.032s)
[t-SNE] Iteration 550: error = 0.4437873, gradient norm = 0.0008264 (50 iterations in 0.032s)
[t-SNE] Iteration 600: error = 0.4239740, gradient norm = 0.0005000 (50 iterations in 0.033s)
[t-SNE] Iteration 650: error = 0.4043009, gradient norm = 0.0007683 (50 iterations in 0.034s)
[t-SNE] Iteration 700: error = 0.3815438, gradient norm = 0.0014440 (50 iterations in 0.035s)
[t-SNE] Iteration 750: error = 0.1364995, gradient norm = 0.0100479 (50 iterations in 0.033s)
[t-SNE] Iteration 800: error = 0.1322736, gradient norm = 0.0080954 (50 iterations in 0.034s)
[t-SNE] Iteration 850: error = 0.1313980, gradient norm = 0.0071973 (50 iterations in 0.033s)
[t-SNE] Iteration 900: error = 0.1294290, gradient norm = 0.0009048 (50 iterations in 0.032s)
[t-SNE] Iteration 950: error = 0.1287556, gradient norm = 0.0006338 (50 iterations in 0.033s)
[t-SNE] Iteration 1000: error = 0.1294987, gradient norm = 0.0005745 (50 iterations in 0.033s)
[t-SNE] KL divergence after 1000 iterations: 0.129499
[t-SNE] Computed conditional probabilities for sample 98 / 98
[t-SNE] Mean sigma: 0.527132
Beginning preprocessing + Leiden
Affmat shape: 98
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.2s
Quality: 0.30005753028671917
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  89 out of  89 | elapsed:    0.1s finished
Got subclusters: Counter({0: 41, 1: 39, 2: 18})
On pattern 5
[t-SNE] Computing 88 nearest neighbors...
[t-SNE] Indexed 89 samples in 0.003s...
[t-SNE] Computed neighbors for 89 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 89 / 89
[t-SNE] Mean sigma: 0.375664
[t-SNE] Computed conditional probabilities in 0.011s
[t-SNE] Iteration 50: error = 56.1102180, gradient norm = 0.5055261 (50 iterations in 0.101s)
[t-SNE] Iteration 100: error = 52.7426567, gradient norm = 0.5592338 (50 iterations in 0.031s)
[t-SNE] Iteration 150: error = 49.8281326, gradient norm = 0.4970038 (50 iterations in 0.031s)
[t-SNE] Iteration 200: error = 52.5715523, gradient norm = 0.4941798 (50 iterations in 0.031s)
[t-SNE] Iteration 250: error = 54.8102913, gradient norm = 0.5267977 (50 iterations in 0.031s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.810291
[t-SNE] Iteration 300: error = 0.9905930, gradient norm = 0.0030654 (50 iterations in 0.031s)
[t-SNE] Iteration 350: error = 0.8203988, gradient norm = 0.0014124 (50 iterations in 0.030s)
[t-SNE] Iteration 400: error = 0.7396128, gradient norm = 0.0015616 (50 iterations in 0.033s)
[t-SNE] Iteration 450: error = 0.7008389, gradient norm = 0.0003501 (50 iterations in 0.030s)
[t-SNE] Iteration 500: error = 0.6832703, gradient norm = 0.0006963 (50 iterations in 0.030s)
[t-SNE] Iteration 550: error = 0.6620727, gradient norm = 0.0006730 (50 iterations in 0.049s)
[t-SNE] Iteration 600: error = 0.6004928, gradient norm = 0.0012440 (50 iterations in 0.031s)
[t-SNE] Iteration 650: error = 0.5456126, gradient norm = 0.0079389 (50 iterations in 0.031s)
[t-SNE] Iteration 700: error = 0.2018622, gradient norm = 0.0119570 (50 iterations in 0.034s)
[t-SNE] Iteration 750: error = 0.1927510, gradient norm = 0.0014609 (50 iterations in 0.032s)
[t-SNE] Iteration 800: error = 0.1923574, gradient norm = 0.0005565 (50 iterations in 0.031s)
[t-SNE] Iteration 850: error = 0.1923090, gradient norm = 0.0006586 (50 iterations in 0.031s)
[t-SNE] Iteration 900: error = 0.1923369, gradient norm = 0.0005152 (50 iterations in 0.030s)
[t-SNE] Iteration 950: error = 0.1924663, gradient norm = 0.0004486 (50 iterations in 0.030s)
[t-SNE] Iteration 1000: error = 0.1921964, gradient norm = 0.0007483 (50 iterations in 0.031s)
[t-SNE] KL divergence after 1000 iterations: 0.192196
[t-SNE] Computed conditional probabilities for sample 89 / 89
[t-SNE] Mean sigma: 0.375664
Beginning preprocessing + Leiden
Affmat shape: 89
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.3s
Quality: 0.16936796981030405
Quality: 0.1733491273705442
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    0.1s finished
Got subclusters: Counter({0: 38, 1: 25, 2: 20, 3: 6})
On pattern 6
[t-SNE] Computing 54 nearest neighbors...
[t-SNE] Indexed 55 samples in 0.002s...
[t-SNE] Computed neighbors for 55 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 55 / 55
[t-SNE] Mean sigma: 0.759145
[t-SNE] Computed conditional probabilities in 0.006s
[t-SNE] Iteration 50: error = 44.3685226, gradient norm = 0.4560668 (50 iterations in 0.069s)
[t-SNE] Iteration 100: error = 45.1286850, gradient norm = 0.5612732 (50 iterations in 0.027s)
[t-SNE] Iteration 150: error = 45.1815414, gradient norm = 0.5947992 (50 iterations in 0.025s)
[t-SNE] Iteration 200: error = 42.6964035, gradient norm = 0.5323554 (50 iterations in 0.025s)
[t-SNE] Iteration 250: error = 46.7280579, gradient norm = 0.5474741 (50 iterations in 0.024s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 46.728058
[t-SNE] Iteration 300: error = 0.6782527, gradient norm = 0.0011823 (50 iterations in 0.024s)
[t-SNE] Iteration 350: error = 0.6140836, gradient norm = 0.0004746 (50 iterations in 0.024s)
[t-SNE] Iteration 400: error = 0.5974553, gradient norm = 0.0003736 (50 iterations in 0.024s)
[t-SNE] Iteration 450: error = 0.5786180, gradient norm = 0.0006919 (50 iterations in 0.025s)
[t-SNE] Iteration 500: error = 0.5522077, gradient norm = 0.0003388 (50 iterations in 0.027s)
[t-SNE] Iteration 550: error = 0.5466132, gradient norm = 0.0001202 (50 iterations in 0.026s)
[t-SNE] Iteration 600: error = 0.5433961, gradient norm = 0.0001957 (50 iterations in 0.024s)
[t-SNE] Iteration 650: error = 0.5386685, gradient norm = 0.0001956 (50 iterations in 0.025s)
[t-SNE] Iteration 700: error = 0.5341217, gradient norm = 0.0001727 (50 iterations in 0.024s)
[t-SNE] Iteration 750: error = 0.5289829, gradient norm = 0.0004864 (50 iterations in 0.024s)
[t-SNE] Iteration 800: error = 0.8437047, gradient norm = 0.0087163 (50 iterations in 0.024s)
[t-SNE] Iteration 850: error = 0.7177541, gradient norm = 0.0006938 (50 iterations in 0.025s)
[t-SNE] Iteration 900: error = 0.6286021, gradient norm = 0.0003456 (50 iterations in 0.024s)
[t-SNE] Iteration 950: error = 0.6064545, gradient norm = 0.0001794 (50 iterations in 0.026s)
[t-SNE] Iteration 1000: error = 0.5934703, gradient norm = 0.0001807 (50 iterations in 0.026s)
[t-SNE] KL divergence after 1000 iterations: 0.593470
[t-SNE] Computed conditional probabilities for sample 55 / 55
[t-SNE] Mean sigma: 0.759145
Beginning preprocessing + Leiden
Affmat shape: 55
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.1s
Quality: 0.0361537661129854
Quality: 0.036370288776872435
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  36 out of  43 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  43 out of  43 | elapsed:    0.0s finished
Got subclusters: Counter({0: 27, 1: 19, 2: 6, 3: 3})
On pattern 7
[t-SNE] Computing 42 nearest neighbors...
[t-SNE] Indexed 43 samples in 0.003s...
[t-SNE] Computed neighbors for 43 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 43 / 43
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] Computed conditional probabilities in 0.013s
[t-SNE] Iteration 50: error = 45.1561394, gradient norm = 0.4550527 (50 iterations in 0.103s)
[t-SNE] Iteration 100: error = 49.0812607, gradient norm = 0.4520793 (50 iterations in 0.023s)
[t-SNE] Iteration 150: error = 45.4180183, gradient norm = 0.5224083 (50 iterations in 0.023s)
[t-SNE] Iteration 200: error = 48.9434662, gradient norm = 0.5377184 (50 iterations in 0.023s)
[t-SNE] Iteration 250: error = 45.3842468, gradient norm = 0.8476522 (50 iterations in 0.023s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 45.384247
[t-SNE] Iteration 300: error = 0.6132308, gradient norm = 0.0008143 (50 iterations in 0.030s)
[t-SNE] Iteration 350: error = 0.5861698, gradient norm = 0.0001547 (50 iterations in 0.037s)
[t-SNE] Iteration 400: error = 0.5822757, gradient norm = 0.0000381 (50 iterations in 0.033s)
[t-SNE] Iteration 450: error = 0.5818886, gradient norm = 0.0000243 (50 iterations in 0.030s)
[t-SNE] Iteration 500: error = 0.5821879, gradient norm = 0.0000205 (50 iterations in 0.038s)
[t-SNE] Iteration 550: error = 0.5813316, gradient norm = 0.0000383 (50 iterations in 0.038s)
[t-SNE] Iteration 600: error = 0.5812856, gradient norm = 0.0000217 (50 iterations in 0.043s)
[t-SNE] Iteration 650: error = 0.5810173, gradient norm = 0.0000169 (50 iterations in 0.037s)
[t-SNE] Iteration 700: error = 0.5808842, gradient norm = 0.0000173 (50 iterations in 0.037s)
[t-SNE] Iteration 750: error = 0.5808838, gradient norm = 0.0000178 (50 iterations in 0.055s)
[t-SNE] Iteration 800: error = 0.5808979, gradient norm = 0.0000177 (50 iterations in 0.038s)
[t-SNE] Iteration 850: error = 0.5807926, gradient norm = 0.0000184 (50 iterations in 0.058s)
[t-SNE] Iteration 900: error = 0.5805256, gradient norm = 0.0000243 (50 iterations in 0.038s)
[t-SNE] Iteration 950: error = 0.5800563, gradient norm = 0.0001154 (50 iterations in 0.038s)
[t-SNE] Iteration 1000: error = 0.5796041, gradient norm = 0.0000575 (50 iterations in 0.049s)
[t-SNE] KL divergence after 1000 iterations: 0.579604
[t-SNE] Computed conditional probabilities for sample 43 / 43
[t-SNE] Mean sigma: 1125899906842624.000000
Beginning preprocessing + Leiden
Affmat shape: 43
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.7s
Quality: -1.9084733793306774e-13
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.2s finished
Got subclusters: Counter({0: 43})
In [8]:
modisco_hdf = '/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_out/results.hdf5'
grp = h5py.File(modisco_hdf, "w")  # pass an explicit mode; the bare default triggers an h5py deprecation warning
tfmodisco_results.save_hdf5(grp)
grp.close()
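For downstream use, a minimal sketch of reading the saved patterns back, assuming the TF-MoDISco v0.5-style HDF5 layout (group names may differ across modisco versions):

import h5py

with h5py.File(modisco_hdf, "r") as fp:
    metaclusters = fp["metacluster_idx_to_submetacluster_results"]
    for mc_name in metaclusters:
        patterns = metaclusters[mc_name]["seqlets_to_patterns_result"]["patterns"]
        for raw_name in patterns["all_pattern_names"][:]:
            p_name = raw_name.decode("utf-8")
            # each pattern group records the seqlets assigned to it
            n_seqlets = len(patterns[p_name]["seqlets_and_alnmts"]["seqlets"])
            print(mc_name, p_name, "# seqlets:", n_seqlets)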
In [ ]:
print('..')
In [5]:
from matlas.matches import DenovoModisco, DenovoHomer
from vdom.helpers import (b, summary, details)
from IPython.display import display
import numpy as np


def display_denovo_patterns(sample_name, modiscodir, match_threshold=0.05, prep=False):
    """Render a collapsible table of de novo modisco patterns for one sample.

    With prep=True, run tomtom against CIS-BP 2.00 and cache the match report;
    with prep=False, load the cached matches and display the pattern table.
    """
    display(summary(b(sample_name)))

    ob = DenovoModisco(modiscodir)
    if prep:
        ob.fetch_tomtom_matches(save_report=True,
                                tomtom_dir="{0}/{1}_tomtomout".format(modiscodir, "CISBP_2.00"))
    else:
        ob.load_matched_motifs()
        ob.get_motif_per_celltype(match_threshold=match_threshold)
        pattern_tab, pattern_dict = ob.visualize_pattern_table()
        display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('MoDISco'),
                                ' in ', b(sample_name),
                                ": #{}".format(len(pattern_dict)),
                                ), pattern_tab))
    # ob.display_individual_table()

    return None


def show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=0.01, prep=False):
    """Same as display_denovo_patterns, but match patterns against the HOCOMOCO v11 database."""
    ob = DenovoModisco(modiscodir)
    if prep:
        ob.fetch_tomtom_matches(
            meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
            database_name="HOCOMOCO.nonredundant.annotated",
            save_report=True,
            tomtom_dir="{0}/{1}_tomtomout".format(modiscodir, "HOCOMOCO.nonredundant.annotated"))
    else:
        ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
        ob.get_motif_per_celltype(match_threshold=match_threshold, match_criteria='q-value',
                                  database_name="HOCOMOCO.nonredundant.annotated")
        # ob.display_individual_table()

        pattern_tab, pattern_dict = ob.visualize_pattern_table()
        display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('MoDISco'),
                                ' in ', b(sample_name),
                                ": #{}".format(len(pattern_dict)),
                                ), pattern_tab))
    return None
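These helpers are meant to be run in two passes, sketched below (illustrative calls; the directory is a placeholder):

# Pass 1 runs tomtom and caches the match report; pass 2 renders the cached table.
# show_patterns_using_hoccomocco_db('early_fold0', '/path/to/modisco_dir', prep=True)
# show_patterns_using_hoccomocco_db('early_fold0', '/path/to/modisco_dir', prep=False)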

Early timepoint

With all peaks

using central 160bp

In [ ]:
# sample_name = 'early_fold0'

# display_denovo_patterns(
#     sample_name,
#     modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out_1k"
# )
In [4]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #8
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (2359)    HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (151)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_2 (142)     -
metacluster_0/pattern_3 (131)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 (98)      HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_5 (89)      -
metacluster_0/pattern_6 (55)      -
metacluster_0/pattern_7 (43)      HCLUST-149_CTCFL.UNK.0.A
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)

using central 1kb

In [7]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_out_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=1e-15)

With time-point based peaks

using central 160bp

In [28]:
# sample_name = 'early_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [14]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_0"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #7
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (1413)    HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (237)     -
metacluster_0/pattern_2 (202)     HCLUST-124_FOSB.UNK.0.A, HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_3 (130)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 (142)     -
metacluster_0/pattern_5 (70)      -
metacluster_0/pattern_6 (55)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)
In [15]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_7"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #4
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (586)     HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (153)     -
metacluster_0/pattern_2 (54)      -
metacluster_0/pattern_3 (57)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)
In [16]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_8"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #7
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (1596)    HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (238)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_2 (148)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_3 (61)      HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_4 (58)      HCLUST-149_CTCFL.UNK.0.A
metacluster_0/pattern_5 (54)      -
metacluster_0/pattern_6 (49)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)
In [17]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_9"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #5
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (412)     HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (79)      HCLUST-179_BACH1.UNK.0.A, HCLUST-101_NFE2.UNK.0.A
metacluster_0/pattern_2 (48)      -
metacluster_0/pattern_3 (45)      HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 (40)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)

using central 1kb

In [8]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_0_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [19]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_7_1k"

show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=0)
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
In [10]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_8_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [11]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_9_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [8]:
# generating modisco.meme
from matlas.matches import DenovoModisco
task_dir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_out"
ob = DenovoModisco(task_dir)

ob.write_meme_file(task_dir+"/modisco.meme")
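Outside the notebook, the exported modisco.meme can be matched against a reference database with MEME-suite's tomtom; a minimal sketch via subprocess (the target database path is a placeholder):

import subprocess

# Illustrative: compare de novo motifs (query) against a motif database (target).
subprocess.run([
    "tomtom", "-oc", task_dir + "/tomtom_out",      # write results to this directory
    task_dir + "/modisco.meme",                     # query motifs
    "/path/to/HOCOMOCOv11_core_HUMAN_mono.meme",    # target database (placeholder)
], check=True)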

Late timepoint

In [37]:
# sample_name = 'late_fold0'

# display_denovo_patterns(
#     sample_name,
#     modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
# )
In [38]:
# sample_name = 'late_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)

Loading keras model

In [2]:
from matlas.model_test import getSkinModel
from matlas.model_test import setup_keras_session
setup_keras_session('4')
init_weights = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/weights_from_raw_tf.p"
model = getSkinModel(init_weights, 19, classification=False)
model_h5 = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/model.h5"
model.save(model_h5)
model.summary()
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.
channels_last
compiling!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           (None, 1000, 4)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 300)         23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1000, 300)         1200      
_________________________________________________________________
activation_1 (Activation)    (None, 1000, 300)         0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 333, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 333, 200)          660200    
_________________________________________________________________
batch_normalization_2 (Batch (None, 333, 200)          800       
_________________________________________________________________
activation_2 (Activation)    (None, 333, 200)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 83, 200)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 83, 200)           280200    
_________________________________________________________________
batch_normalization_3 (Batch (None, 83, 200)           800       
_________________________________________________________________
activation_3 (Activation)    (None, 83, 200)           0         
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 20, 200)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              4001000   
_________________________________________________________________
batch_normalization_4 (Batch (None, 1000)              4000      
_________________________________________________________________
activation_4 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
batch_normalization_5 (Batch (None, 1000)              4000      
_________________________________________________________________
activation_5 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000)              0         
_________________________________________________________________
final_dense19 (Dense)        (None, 19)                19019     
=================================================================
Total params: 5,995,319
Trainable params: 5,989,919
Non-trainable params: 5,400
_________________________________________________________________
In [3]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    labels = fp['labels'][:]
    logits = fp['logits'][:]
    seqs = fp['sequence'][:]
labels.shape, logits.shape, seqs.shape
Out[3]:
((35024, 19), (35024, 19), (35024, 1, 1000, 4))
In [23]:
from matlas.generators import EmbeddingsGenerator

def get_predictions(cur_seqs, model):
    # Wrap the one-hot sequences in a batch generator and run keras prediction.
    e_generator = EmbeddingsGenerator(cur_seqs, batch_size=1000, num_rows=cur_seqs.shape[0])
    # batch = e_generator.get_batch(i)
    # e = model.predict_on_batch(batch[0])
    e = model.predict_generator(
        e_generator,
        max_queue_size=100,
        workers=1,
        use_multiprocessing=False,
        verbose=1,
    )
    return e
keras_op = get_predictions(np.squeeze(seqs[:1000]), model)
1/1 [==============================] - 1s 664ms/step
In [24]:
from matplotlib import pylab as plt
# plt.scatter(activations_all['activation_2/Relu:0'][:1000,0,0,0], cnv1[:1000,0,0,0])
plt.scatter(logits[:1000,0], keras_op[:1000,0])
plt.xlabel('raw tensorflow predictions')
plt.ylabel('keras predictions')
Out[24]:
Text(0, 0.5, 'keras predictions')
In [25]:
import scipy.stats
print(scipy.stats.pearsonr(logits[:1000,0], keras_op[:1000,0]))
print(scipy.stats.spearmanr(logits[:1000,0], keras_op[:1000,0]))
(0.7495659591048396, 4.981212730424334e-181)
SpearmanrResult(correlation=0.7251773091773093, pvalue=6.437351695707496e-164)

Deeplifting the keras model

In [4]:
from matlas.deeplift_run import *
contrib_funcs, input_layer_shape = retrieve_func_from_model(
    model_h5, 
    algorithm="rescale_conv_revealcancel_fc", 
    regression=True,
    sequential=False, 
    w0=None, w1=None, logger=None)
input_layer_shape
load data from labcluster
TF-MoDISco is using the TensorFlow backend.
nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer activation_1_0 the preceding linear layer is conv1d_1_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_2_0 the preceding linear layer is conv1d_2_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_3_0 the preceding linear layer is conv1d_3_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_4_0 the preceding linear layer is dense_1_0 of type Dense;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
For layer activation_5_0 the preceding linear layer is dense_2_0 of type Dense;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
Out[4]:
[None, 1000, 4]
In [9]:
#provide list of strings to run deeplift
# def read_ggr_active_sequences(ggr_h5):
#     with h5py.File(ggr_h5, "r") as fp:
#         seqs = fp['sequence.active.string'][:]
#     sequences = []
#     for seq in seqs:
#         sequences.append(seq[0].decode('utf-8'))
    
#     return sequences

# ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
# sequences = read_ggr_active_sequences(ggrfile)
# type(sequences), len(sequences) #sequences[0], sequences[1], seqs[0,0], seqs[1,0]
Out[9]:
(list, 35024)
In [3]:
def get_genome_coordinates(ggr_h5, bed_file):
    """Parse chrom/start/end out of example_metadata and write a gzipped BED."""
    with h5py.File(ggr_h5, "r") as fp:
        regions = fp['example_metadata'][:]
    
    chroms = []
    starts = []
    ends = []
    for region in regions[:,0]:
        region = region.decode("utf-8")
        if region != '':
            # metadata strings look like "...features=chrN:start-end"
            region = region.split("features=")[1]
        else:
            continue  # note: rows with empty metadata are silently dropped
        chroms.append(region.split(":")[0])
        starts.append(region.split(":")[1].split("-")[0])
        ends.append(region.split(":")[1].split("-")[1])
    df = pd.DataFrame({'chrom': chroms, 'start': starts, 'end': ends})
    df.to_csv(bed_file, header=False, index=False, sep="\t", compression="gzip")
    return None
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
bed_file = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/regions.bed.gz"
get_genome_coordinates(ggrfile, bed_file)
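A quick read-back to sanity-check the BED that was just written (plain pandas, no new dependencies):

df = pd.read_csv(bed_file, sep="\t", header=None,
                 names=["chrom", "start", "end"], compression="gzip")
df.shape, df.head()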
In [27]:
from matlas.model_layer import retrieve_sequences
sequences, intervals_wo_flanks = retrieve_sequences(
    bed_file, 
    fasta_file="/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa", flank_size=0)
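retrieve_sequences is also a matlas helper; with flank_size=0 it presumably just fetches each BED interval from the FASTA. A hypothetical sketch of the equivalent fetch using pyfaidx (an assumption, not the matlas code):

from pyfaidx import Fasta

genome = Fasta("/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa")
def fetch_interval(chrom, start, end):
    # pyfaidx slicing is 0-based, half-open
    return str(genome[chrom][int(start):int(end)]).upper()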
In [28]:
num_refs_per_seq = 10
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.model_layer import one_hot_encode_along_col_axis
from matlas.dlutils import get_shuffled_seqs
input_data_list, input_references_list = get_shuffled_seqs(
    sequences[:45], num_refs_per_seq,
    shuffle_func=dinuc_shuffle,
    one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
    progress_update=10000)
input_data_list[0].shape, len(sequences[0])
# input_data_list = [np.expand_dims(input_data_list[0], axis=1)]
# input_references_list = [np.expand_dims(input_references_list[0], axis=1)]
One hot encoding sequences...
One hot encoding done...
Out[28]:
((450, 1000, 4), 1000)
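get_shuffled_seqs is not shown here, but the shapes above (45 sequences x 10 refs = 450 rows) suggest it repeats each sequence num_refs_per_seq times and pairs it with that many dinucleotide shuffles, roughly like this hypothetical sketch:

def make_refs(seq_strings, num_refs, one_hot_func):
    data, refs = [], []
    for s in seq_strings:
        data.extend([s] * num_refs)                              # repeat the real sequence
        refs.extend(dinuc_shuffle(s) for _ in range(num_refs))   # one shuffle per repeat
    return [one_hot_func(data)], [one_hot_func(refs)]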
In [11]:
from matlas.dlutils import get_given_seq_ref_function
shuffled_score_funcs = {input_name: get_given_seq_ref_function(score_computation_function=score_func)
                        for input_name, score_func in contrib_funcs.items()}
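get_given_seq_ref_function is a matlas variant of deeplift's shuffled-reference helpers that takes precomputed references. The essential mechanics, scoring each (sequence, reference) pair and averaging over the references, presumably look something like this hedged sketch:

def score_with_given_refs(score_func, data, refs, num_refs_per_seq,
                          task_idx=0, batch_size=256):
    raw = np.array(score_func(task_idx=task_idx,
                              input_data_list=[data],
                              input_references_list=[refs],
                              batch_size=batch_size,
                              progress_update=None))
    # average the per-reference contributions for each sequence
    return raw.reshape((-1, num_refs_per_seq) + raw.shape[1:]).mean(axis=1)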
In [29]:
task_idx = 0
batch_size = 256
num_refs_per_seq = 10
for input_name, score_func in shuffled_score_funcs.items():
    b = 10000  # chunk size over the expanded (seq x ref) rows
    n = len(input_data_list[0])
    chunks = []
    for si in range(int(np.ceil(1.0 * n / b))):
        # slicing past the end clamps, so the last chunk needs no special case
        chunks.append(score_func(task_idx=int(task_idx),
                                 input_data_list=[input_data_list[0][si*b:(si+1)*b]],
                                 input_references_list=[input_references_list[0][si*b:(si+1)*b]],
                                 num_refs_per_seq=num_refs_per_seq,
                                 batch_size=batch_size,
                                 progress_update=10000))
    hyp_scores = np.vstack(chunks)
    input_data_list[0] = np.squeeze(input_data_list[0])
    input_references_list[0] = np.squeeze(input_references_list[0])
    # every num_refs_per_seq-th row of the expanded data is an original (unshuffled) sequence
    one_hot = input_data_list[0][::num_refs_per_seq]
    shuffled_onehot = input_references_list[0].reshape((one_hot.shape[0], num_refs_per_seq,
                                                        input_references_list[0].shape[-2],   # seq_len
                                                        input_references_list[0].shape[-1]))  # alphabet
    # project hypothetical contributions onto the observed bases
    scores = np.multiply(hyp_scores, one_hot)
       
hyp_scores.shape, one_hot.shape, scores.shape
Done 0
In [ ]:
# create_deeplift_h5(bed_file, score_hdf, hyp_scores, one_hot, shuffled_onehot)
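create_deeplift_h5 (commented above) is not shown; a minimal h5py sketch that persists the three arrays under the dataset names the load_deeplift_data helper at the top of this notebook reads back (score_hdf is an assumed output path):

# minimal sketch, not the matlas create_deeplift_h5; score_hdf is assumed
with h5py.File(score_hdf, "w") as fp:
    fp.create_dataset("deeplift_scores", data=hyp_scores)
    fp.create_dataset("inputs", data=one_hot)
    fp.create_dataset("shuffled_inputs", data=shuffled_onehot)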
In [8]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    print(list(fp))
    scores_ggr = fp['sequence-weighted'][:]
    scores_ggr_active = fp['sequence-weighted.active'][:]
scores_ggr.shape, scores_ggr_active.shape
['ATAC_LABELS', 'ATAC_SIGNALS', 'ATAC_SIGNALS.NORM', 'CTCF_LABELS', 'CTCF_SIGNALS', 'CTCF_SIGNALS.NORM', 'DYNAMIC_MARK_LABELS', 'DYNAMIC_STATE_LABELS', 'H3K27ac_LABELS', 'H3K27ac_SIGNALS', 'H3K27ac_SIGNALS.NORM', 'H3K27me3_LABELS', 'H3K27me3_SIGNALS', 'H3K27me3_SIGNALS.NORM', 'H3K4me1_LABELS', 'H3K4me1_SIGNALS', 'H3K4me1_SIGNALS.NORM', 'KLF4_LABELS', 'POL2_LABELS', 'STABLE_MARK_LABELS', 'STABLE_STATE_LABELS', 'TP63_LABELS', 'TRAJ_LABELS', 'ZNF750_LABELS', 'example_metadata', 'gradients', 'labels', 'logits', 'logits.ci', 'logits.ci.thresh', 'logits.multimodel', 'logits.multimodel.norm', 'logits.norm', 'positive_importance_bp_sum', 'probs', 'pwm-scores.null.idx', 'sequence', 'sequence-weighted', 'sequence-weighted.active', 'sequence-weighted.active.ci', 'sequence-weighted.active.ci.thresh', 'sequence-weighted.active.pwm-scores.thresh', 'sequence-weighted.active.pwm-scores.thresh.max.idx', 'sequence-weighted.active.pwm-scores.thresh.max.val', 'sequence-weighted.active.pwm-scores.thresh.sum', 'sequence-weighted.thresholds', 'sequence.active', 'sequence.active.gc_fract', 'sequence.active.pwm-hits', 'sequence.active.pwm-hits.densities', 'sequence.active.pwm-hits.densities.max', 'sequence.active.pwm-scores.thresh', 'sequence.active.pwm-scores.thresh.sum', 'sequence.active.string']
Out[8]:
((35024, 10, 1000, 4), (35024, 10, 160, 4))
In [10]:
import modisco.visualization
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr_active[0,0], subticks_frequency=20)
-0.4860307276248932 1.3380632400512695
-0.0440836176276207 0.1500825583934784
In [5]:
scores_ggr.shape
Out[5]:
(35024, 10, 1000, 4)
In [36]:
import modisco.visualization
from modisco.visualization import viz_sequence

viz_sequence.plot_weights(scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(hyp_scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(one_hot[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
-0.0688343504909426 0.35437235310673715
-0.976342553505674 0.35437235310673715
0.0 1.0
In [ ]: