%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Generating modisco motifs¶

def load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot']):
    fp = h5py.File(deeplift_hdf, "r")
    hyp_scores = fp['deeplift_scores'][:]
    one_hot = fp['inputs'][:]
    shuffled_onehot = fp['shuffled_inputs'][:]

    deeplift_data = OrderedDict()
    if 'one_hot' in keys:
        deeplift_data['one_hot'] = one_hot
    if 'peaks' in keys :
        df = OrderedDict()
        for key in list(fp['metadata/range']):
            df[key] = fp['metadata/range/{}'.format(key)][:]
        df = pd.DataFrame(df)
        df.chr = np.array([df.chr.values[i].decode('utf-8') for i in range(df.shape[0])])
        deeplift_data['peaks'] = df
        
    if 'hyp_scores' in keys :
        deeplift_data['hyp_scores'] = hyp_scores
    if 'scores' in keys :
        deeplift_data['scores'] = np.multiply(hyp_scores, one_hot)#no need to sum before mult, taken care of in deeplift, not in deepshap though
    
    
    return deeplift_data

deeplift_hdf = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/deeplift_out/summit.h5"
deeplift_data = load_deeplift_data(deeplift_hdf, keys=['hyp_scores', 'scores', 'one_hot'])
deeplift_data['scores'].shape, deeplift_data['hyp_scores'].shape, deeplift_data['one_hot'].shape

((35024, 1000, 4), (35024, 1000, 4), (35024, 1000, 4))

import modisco
null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000)

tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
                    #Slight modifications from the default settings
                    sliding_window_size=15,
                    flank_size=5,
                    target_seqlet_fdr=0.15,
                    seqlets_to_patterns_factory=
                     modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
                        #Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery
                        # software like MEME to improve the TF-MoDISco clustering. To use the meme-based
                        # initialization, you would specify the initclusterer_factory as shown in the
                        # commented-out code below:
                        #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory(    
                        #    meme_command="meme", base_outdir="meme_out",            
                        #    max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1),
                        trim_to_window_size=15,
                        initial_flank_to_add=5,
                        kmer_len=5, num_gaps=1,
                        num_mismatches=0,
                        final_min_cluster_size=60)
                )(
                 task_names=["early0"],
                 contrib_scores={'early0': deeplift_data['scores'][:,420:580,:]},
                 hypothetical_contribs={'early0': deeplift_data['hyp_scores'][:,420:580,:] },
                 one_hot=deeplift_data['one_hot'][:,420:580,:],
                 null_per_pos_scores = null_per_pos_scores)

MEMORY 4.630016
On task early0
Computing windowed sums on original
Generating null dist
peak(mu)= 0.0034060630339271536
Computing threshold
Thresholds from null dist were -0.5258131270413289  and  1.1217074577463788
Final raw thresholds are -0.5258131270413289  and  1.1217074577463788
Final transformed thresholds are -0.8864842972646545  and  0.9668417194941081

Got 15833 coords
After resolving overlaps, got 15833 seqlets
Across all tasks, the weakest transformed threshold used was: 0.8863842972646545
MEMORY 4.9975296
15833 identified in total
min_metacluster_size_frac * len(seqlets) = 158 is more than min_metacluster_size=100.
Using it as a new min_metacluster_size
2 activity patterns with support >= 158 out of 2 possible patterns
Metacluster sizes:  [15458, 375]
Idx to activities:  {0: '1', 1: '-1'}
MEMORY 4.997758976
On metacluster 1
Metacluster size 375
Relevant tasks:  ('early0',)
Relevant signs:  (-1,)
TfModiscoSeqletsToPatternsFactory: seed=1234
(Round 1) num seqlets: 375
(Round 1) Computing coarse affmat
MEMORY 4.997758976
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.14 s
Starting affinity matrix computations
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.47 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.01 s
Finished affinity matrix computations in 0.48 s
(Round 1) Compute nearest neighbors from coarse affmat
MEMORY 5.001498624
Computed nearest neighbors in 0.06 s
MEMORY 5.00400128
(Round 1) Computing affinity matrix on nearest neighbors
MEMORY 5.00400128
Launching nearest neighbors affmat calculation job
MEMORY 5.004263424
Parallel runs completed
MEMORY 5.011603456
Job completed in: 2.6 s
MEMORY 5.011603456
Launching nearest neighbors affmat calculation job
MEMORY 5.011603456
Parallel runs completed
MEMORY 5.014093824
Job completed in: 2.52 s
MEMORY 5.014122496
(Round 1) Computed affinity matrix on nearest neighbors in 5.27 s
MEMORY 5.014122496
Filtered down to 373 of 375
(Round 1) Retained 373 rows out of 375 after filtering
MEMORY 5.014122496
(Round 1) Computing density adapted affmat
MEMORY 5.014122496
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 373 samples in 0.000s...
[t-SNE] Computed neighbors for 373 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 373 / 373
[t-SNE] Mean sigma: 0.253450
(Round 1) Computing clustering
MEMORY 5.014122496
Beginning preprocessing + Leiden

  0%|          | 0/50 [00:00<?, ?it/s]

Quality: 0.5750669252729659
Quality: 0.5751015221292706

100%|██████████| 50/50 [00:01<00:00, 33.92it/s]

Got 15 clusters after round 1
Counts:
{6: 20, 4: 38, 8: 14, 12: 4, 9: 7, 1: 59, 2: 57, 7: 17, 13: 3, 10: 6, 5: 25, 0: 64, 3: 52, 14: 2, 11: 5}
MEMORY 5.01508096
(Round 1) Aggregating seqlets in each cluster
MEMORY 5.01508096
Aggregating for cluster 0 with 64 seqlets
MEMORY 5.01508096

Trimming eliminated 0 seqlets out of 64
Skipped 2 seqlets
Aggregating for cluster 1 with 59 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 59
Skipped 1 seqlets
Aggregating for cluster 2 with 57 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 57
Skipped 8 seqlets
Aggregating for cluster 3 with 52 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 52
Skipped 5 seqlets
Aggregating for cluster 4 with 38 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 38
Skipped 1 seqlets
Aggregating for cluster 5 with 25 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 25
Skipped 1 seqlets
Aggregating for cluster 6 with 20 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 20
Skipped 2 seqlets
Dropping cluster 6 with 18 seqlets due to sign disagreement
Aggregating for cluster 7 with 17 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 17
Skipped 3 seqlets
Aggregating for cluster 8 with 14 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 14
Aggregating for cluster 9 with 7 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 7
Aggregating for cluster 10 with 6 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 6
Aggregating for cluster 11 with 5 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 5
Skipped 1 seqlets
Aggregating for cluster 12 with 4 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 4
Skipped 1 seqlets
Aggregating for cluster 13 with 3 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 3
Dropping cluster 13 with 3 seqlets due to sign disagreement
Aggregating for cluster 14 with 2 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 2
Dropping cluster 14 with 2 seqlets due to sign disagreement
(Round 2) num seqlets: 325
(Round 2) Computing coarse affmat
MEMORY 5.01508096
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.08 s
Starting affinity matrix computations
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.01 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.01 s
Finished affinity matrix computations in 0.02 s
(Round 2) Compute nearest neighbors from coarse affmat
MEMORY 5.015146496
Computed nearest neighbors in 0.06 s
MEMORY 5.015998464
(Round 2) Computing affinity matrix on nearest neighbors
MEMORY 5.015998464
Launching nearest neighbors affmat calculation job
MEMORY 5.015998464
Parallel runs completed
MEMORY 5.015998464
Job completed in: 2.07 s
MEMORY 5.015998464
Launching nearest neighbors affmat calculation job
MEMORY 5.015998464
Parallel runs completed
MEMORY 5.015998464
Job completed in: 1.97 s
MEMORY 5.015998464
(Round 2) Computed affinity matrix on nearest neighbors in 4.22 s
MEMORY 5.015998464
Not applying filtering for rounds above first round
MEMORY 5.015998464
(Round 2) Computing density adapted affmat
MEMORY 5.015998464
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 325 samples in 0.000s...
[t-SNE] Computed neighbors for 325 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 325 / 325
[t-SNE] Mean sigma: 0.244780
(Round 2) Computing clustering
MEMORY 5.015998464
Beginning preprocessing + Leiden

  0%|          | 0/50 [00:00<?, ?it/s]

Quality: 0.5636795151701894
Quality: 0.5644660820642647

modisco_hdf = '/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out/results.hdf5'
grp = h5py.File(modisco_hdf)
tfmodisco_results.save_hdf5(grp)

/users/msharmin/anaconda2/envs/aitac/lib/python3.7/site-packages/ipykernel_launcher.py:2: H5pyDeprecationWarning: The default file mode will change to 'r' (read-only) in h5py 3.0. To suppress this warning, pass the mode you need to h5py.File(), or set the global default h5.get_config().default_file_mode, or set the environment variable H5PY_DEFAULT_READONLY=1. Available modes are: 'r', 'r+', 'w', 'w-'/'x', 'a'. See the docs for details.

from matlas.matches import DenovoModisco, DenovoHomer
from vdom.helpers import (b, summary, details)
from IPython.display import display
import numpy as np


def show_patterns_using_hoccomocco_db(sample_name, modiscodir):
    ob = DenovoModisco(modiscodir)
#     ob.fetch_tomtom_matches(
#                 meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
#                 database_name="HOCOMOCO.nonredundant.annotated",
#                 save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(modiscodir, "HOCOMOCO.nonredundant.annotated"))
    ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
    ob.get_motif_per_celltype(match_threshold=0.03, database_name="HOCOMOCO.nonredundant.annotated")
    #ob.display_individual_table()
    
    pattern_tab, pattern_dict = ob.visualize_pattern_table()
    display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
                        ' in ', b(sample_name),
                        ": #{}".format(len(pattern_dict)),
                       ), pattern_tab))
    return None



def display_denovo_patterns(sample_name, modiscodir, match_threshold=0.05):
    display(summary(b(sample_name)))
    
    ob = DenovoModisco(modiscodir)
#     ob.fetch_tomtom_matches(save_report=True, 
#                                   tomtom_dir= "{0}/{1}_tomtomout".format(modiscodir, "CISBP_2.00"))
    
    ob.load_matched_motifs()
    ob.get_motif_per_celltype(match_threshold=match_threshold)
    pattern_tab, pattern_dict = ob.visualize_pattern_table()
    display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
                        ' in ', b(sample_name),
                        ": #{}".format(len(pattern_dict)),
                       ), pattern_tab))
    #ob.display_individual_table()
    
    return None

Early timepoint¶

sample_name = 'early_fold0'

display_denovo_patterns(
    sample_name,
    modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"
)

sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)

Late timepoint¶

sample_name = 'late_fold0'

display_denovo_patterns(
    sample_name,
    modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
)

sample_name = 'late_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)

Loading keras model¶

from matlas.model_test import getSkinModel
from matlas.model_test import setup_keras_session
setup_keras_session('4')
init_weights = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/weights_from_raw_tf.p"
model = getSkinModel(init_weights, 19, classification=False)
model_h5 = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/model.h5"
model.save(model_h5)
model.summary()

channels_last
compiling!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           (None, 1000, 4)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 1000, 300)         23100     
_________________________________________________________________
batch_normalization_11 (Batc (None, 1000, 300)         1200      
_________________________________________________________________
activation_14 (Activation)   (None, 1000, 300)         0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 333, 300)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 333, 200)          660200    
_________________________________________________________________
batch_normalization_12 (Batc (None, 333, 200)          800       
_________________________________________________________________
activation_15 (Activation)   (None, 333, 200)          0         
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 83, 200)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 83, 200)           280200    
_________________________________________________________________
batch_normalization_13 (Batc (None, 83, 200)           800       
_________________________________________________________________
activation_16 (Activation)   (None, 83, 200)           0         
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 20, 200)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1000)              4001000   
_________________________________________________________________
batch_normalization_14 (Batc (None, 1000)              4000      
_________________________________________________________________
activation_17 (Activation)   (None, 1000)              0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
batch_normalization_15 (Batc (None, 1000)              4000      
_________________________________________________________________
activation_18 (Activation)   (None, 1000)              0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 1000)              0         
_________________________________________________________________
final_dense19 (Dense)        (None, 19)                19019     
=================================================================
Total params: 5,995,319
Trainable params: 5,989,919
Non-trainable params: 5,400
_________________________________________________________________

ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    labels = fp['labels'][:]
    logits = fp['logits'][:]
    seqs = fp['sequence'][:]
labels.shape, logits.shape, seqs.shape

((35024, 19), (35024, 19), (35024, 1, 1000, 4))

from matlas.generators import EmbeddingsGenerator
def get_predictions(cur_seqs, model):
    e_generator = EmbeddingsGenerator(cur_seqs, batch_size=1000, num_rows=cur_seqs.shape[0])
    #batch = e_generator.get_batch(i)
    #e = model.predict_on_batch(batch[0])
    e = model.predict_generator(
                e_generator,
                max_queue_size=100,
                workers=1,
                use_multiprocessing=False,
                verbose=1
            )
    return e
keras_op = get_predictions(np.squeeze(seqs[:1000]), model)

1/1 [==============================] - 1s 664ms/step

from matplotlib import pylab as plt
# plt.scatter(activations_all['activation_2/Relu:0'][:1000,0,0,0], cnv1[:1000,0,0,0])
plt.scatter(logits[:1000,0], keras_op[:1000,0])
plt.xlabel('raw tensorflow prediction')
plt.ylabel('keras predictions')

Text(0, 0.5, 'keras predictions')

import scipy.stats
print(scipy.stats.pearsonr(logits[:1000,0], keras_op[:1000,0]))
print(scipy.stats.spearmanr(logits[:1000,0], keras_op[:1000,0]))

(0.7495659591048396, 4.981212730424334e-181)
SpearmanrResult(correlation=0.7251773091773093, pvalue=6.437351695707496e-164)

Deeplifting the keras model¶

from matlas.deeplift_run import *
contrib_funcs, input_layer_shape = retrieve_func_from_model(
    model_h5, 
    algorithm="rescale_conv_revealcancel_fc", 
    regression=True,
    sequential=False, 
    w0=None, w1=None, logger=None)
input_layer_shape

load data from labcluster

TF-MoDISco is using the TensorFlow backend.

nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer activation_4_0 the preceding linear layer is conv1d_8_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_5_0 the preceding linear layer is conv1d_9_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_6_0 the preceding linear layer is conv1d_10_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_7_0 the preceding linear layer is dense_1_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
For layer activation_8_0 the preceding linear layer is dense_2_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel

[None, 1000, 4]

#provide list of strings to run deeplift
# def read_ggr_active_sequences(ggr_h5):
#     with h5py.File(ggr_h5, "r") as fp:
#         seqs = fp['sequence.active.string'][:]
#     sequences = []
#     for seq in seqs:
#         sequences.append(seq[0].decode('utf-8'))
    
#     return sequences

# ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
# sequences = read_ggr_sequences(ggrfile)
# type(sequences), len(sequences) #sequences[0], sequences[1], seqs[0,0], seqs[1,0]

(list, 35024)

def get_genome_coordinates(ggr_h5, bed_file):
    with h5py.File(ggr_h5, "r") as fp:
        regions = fp['example_metadata'][:]
    
    chroms = []
    starts = []
    ends = []
    for region in regions[:,0]:
        region = region.decode("utf-8")
        if region!='':
            region = region.split("features=")[1]
        else:
            continue
        chroms.append(region.split(":")[0])
        starts.append(region.split(":")[1].split("-")[0])
        ends.append(region.split(":")[1].split("-")[1])
    df = pd.DataFrame({'chrom': chroms, 'start':starts, 'end': ends})
    df.to_csv(bed_file, header=False, index=False, sep="\t", compression="gzip")
    return None
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
bed_file = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/regions.bed.gz"
get_genome_coordinates(ggrfile, bed_file)

from matlas.model_layer import retrieve_sequences
sequences, intervals_wo_flanks = retrieve_sequences(
    bed_file, 
    fasta_file="/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa", flank_size=0)

num_refs_per_seq = 10
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.model_layer import one_hot_encode_along_col_axis
from matlas.dlutils import get_shuffled_seqs
input_data_list, input_references_list = get_shuffled_seqs(sequences[:45], num_refs_per_seq, shuffle_func=dinuc_shuffle,
                                                                one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
                                                                progress_update=10000)
input_data_list[0].shape, len(sequences[0])
# input_data_list = [np.expand_dims(input_data_list[0], axis=1)]
# input_references_list = [np.expand_dims(input_references_list[0], axis=1)]

One hot encoding sequences...
One hot encoding done...

((450, 1000, 4), 1000)

from matlas.dlutils import get_given_seq_ref_function
shuffled_score_funcs = {input_name: get_given_seq_ref_function(score_computation_function=score_func)
                        for input_name, score_func in contrib_funcs.items()}

task_idx = 0
batch_size = 256
num_refs_per_seq = 10
for input_name, score_func in shuffled_score_funcs.items():
    hyp_scores = None
    b = 10000
    c = int(np.ceil(1.0*len(input_data_list[0])/b))
    for si in range(c):
        if(si==c-1):
            tmp = score_func(task_idx=int(task_idx), input_data_list=[input_data_list[0][si*b:len(input_data_list[0])]],
                               input_references_list=[input_references_list[0][si*b:len(input_data_list[0])]],
                               num_refs_per_seq=num_refs_per_seq, batch_size=batch_size,
                               progress_update=10000)
        else:
            #print('batch: ', si, si*b, (si+1)*b) 
            tmp = score_func(task_idx=int(task_idx), input_data_list=[input_data_list[0][si*b:(si+1)*b]],
                               input_references_list=[input_references_list[0][si*b:(si+1)*b]],
                               num_refs_per_seq=num_refs_per_seq, 
                               batch_size=batch_size,
                               progress_update=10000)
        if(hyp_scores is None):
            hyp_scores = tmp
        else:
            hyp_scores = np.vstack((hyp_scores, tmp))
    input_data_list[0] = np.squeeze(input_data_list[0])
    input_references_list[0] = np.squeeze(input_references_list[0])
    one_hot = input_data_list[0][[range(0, len(input_data_list[0]), num_refs_per_seq)]]
    shuffled_onehot = input_references_list[0].reshape((one_hot.shape[0], num_refs_per_seq, 
                                                       input_references_list[0].shape[-2], #seq_len
                                                        input_references_list[0].shape[-1]))#alphabet 
    scores = np.multiply(hyp_scores, one_hot)
       
hyp_scores.shape, one_hot.shape, scores.shape

Done 0

/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:27: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.

# create_deeplift_h5(bed_file, score_hdf, hyp_scores, one_hot, shuffled_onehot)

ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    print(list(fp))
    scores_ggr = fp['sequence-weighted'][:]
    scores_ggr_active = fp['sequence-weighted.active'][:]
scores_ggr.shape, scores_ggr_active.shape

['ATAC_LABELS', 'ATAC_SIGNALS', 'ATAC_SIGNALS.NORM', 'CTCF_LABELS', 'CTCF_SIGNALS', 'CTCF_SIGNALS.NORM', 'DYNAMIC_MARK_LABELS', 'DYNAMIC_STATE_LABELS', 'H3K27ac_LABELS', 'H3K27ac_SIGNALS', 'H3K27ac_SIGNALS.NORM', 'H3K27me3_LABELS', 'H3K27me3_SIGNALS', 'H3K27me3_SIGNALS.NORM', 'H3K4me1_LABELS', 'H3K4me1_SIGNALS', 'H3K4me1_SIGNALS.NORM', 'KLF4_LABELS', 'POL2_LABELS', 'STABLE_MARK_LABELS', 'STABLE_STATE_LABELS', 'TP63_LABELS', 'TRAJ_LABELS', 'ZNF750_LABELS', 'example_metadata', 'gradients', 'labels', 'logits', 'logits.ci', 'logits.ci.thresh', 'logits.multimodel', 'logits.multimodel.norm', 'logits.norm', 'positive_importance_bp_sum', 'probs', 'pwm-scores.null.idx', 'sequence', 'sequence-weighted', 'sequence-weighted.active', 'sequence-weighted.active.ci', 'sequence-weighted.active.ci.thresh', 'sequence-weighted.active.pwm-scores.thresh', 'sequence-weighted.active.pwm-scores.thresh.max.idx', 'sequence-weighted.active.pwm-scores.thresh.max.val', 'sequence-weighted.active.pwm-scores.thresh.sum', 'sequence-weighted.thresholds', 'sequence.active', 'sequence.active.gc_fract', 'sequence.active.pwm-hits', 'sequence.active.pwm-hits.densities', 'sequence.active.pwm-hits.densities.max', 'sequence.active.pwm-scores.thresh', 'sequence.active.pwm-scores.thresh.sum', 'sequence.active.string']

((35024, 10, 1000, 4), (35024, 10, 160, 4))

import modisco.visualization
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr_active[0,0], subticks_frequency=20)

-0.4860307276248932 1.3380632400512695

-0.0440836176276207 0.1500825583934784

scores_ggr.shape

(35024, 10, 1000, 4)

import modisco.visualization
from modisco.visualization import viz_sequence

viz_sequence.plot_weights(scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(hyp_scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(one_hot[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr[0][500:600], subticks_frequency=20)

-0.0688343504909426 0.35437235310673715

-0.976342553505674 0.35437235310673715

0.0 1.0

Pattern Name	TF Name(s)	Modisco
metacluster_0/pattern_0 # seqlets: 5085 SequenceContrib ScoresHyp_Contrib Scores	Jund, Fos, Junb, Fosb, Jdp2, Fosl1, Atf3, Fosl2, Jun, Batf, Bach2, Nfe2, Nfe2l2, Bach1
metacluster_0/pattern_1 # seqlets: 537 SequenceContrib ScoresHyp_Contrib Scores	Trp63, Trp73, Trp53, Tcfcp2l1
metacluster_0/pattern_2 # seqlets: 370 SequenceContrib ScoresHyp_Contrib Scores	Jund, Jun, Batf, Fos, Fosl1, Fosl2, Jdp2, Junb, Fosb, Atf3, Tead1, Tead4
metacluster_0/pattern_3 # seqlets: 226 SequenceContrib ScoresHyp_Contrib Scores	Ctcf, Ctcfl
metacluster_0/pattern_4 # seqlets: 109 SequenceContrib ScoresHyp_Contrib Scores	Runx1, Cbfb, Runx2
metacluster_0/pattern_5 # seqlets: 87 SequenceContrib ScoresHyp_Contrib Scores	Atf2, Jdp2, Creb5, Atf7, Creb1, Atf3, Atf1, Crem

Pattern Name	TF Name(s)	Modisco
metacluster_0/pattern_0 # seqlets: 5085 SequenceContrib ScoresHyp_Contrib Scores	HCLUST-124_FOSB.UNK.0.A, HCLUST-101_NFE2.UNK.0.A
metacluster_0/pattern_1 # seqlets: 537 SequenceContrib ScoresHyp_Contrib Scores	HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_2 # seqlets: 370 SequenceContrib ScoresHyp_Contrib Scores	HCLUST-156_TEAD1.UNK.0.A, HCLUST-124_FOSB.UNK.0.A
metacluster_0/pattern_3 # seqlets: 226 SequenceContrib ScoresHyp_Contrib Scores	HCLUST-149_CTCFL.UNK.0.A, HCLUST-134_INSM1.UNK.0.A
metacluster_0/pattern_4 # seqlets: 109 SequenceContrib ScoresHyp_Contrib Scores	HCLUST-176_CBFB.UNK.0.A
metacluster_0/pattern_5 # seqlets: 87 SequenceContrib ScoresHyp_Contrib Scores	HCLUST-175_ATF1.UNK.0.A

Pattern Name	TF Name(s)	Modisco
metacluster_0/pattern_0 # seqlets: 2434 SequenceContrib ScoresHyp_Contrib Scores	Jund, Junb, Fos, Fosb, Jdp2, Fosl1, Atf3, Fosl2, Batf, Jun, Bach2, Nfe2, Nfe2l2, Bach1
metacluster_0/pattern_1 # seqlets: 641 SequenceContrib ScoresHyp_Contrib Scores	Cebpb, Cebpa, Dbp, Cebpg, Hlf, Cebpd, Nfil3, Cebpe, Tef, Atf4, Ddit3
metacluster_0/pattern_2 # seqlets: 470 SequenceContrib ScoresHyp_Contrib Scores	Grhl2
metacluster_0/pattern_3 # seqlets: 372 SequenceContrib ScoresHyp_Contrib Scores	Klf1, Klf4, Klf8, Klf12, Klf7, Sp4, Klf5, Klf3, Klf6
metacluster_0/pattern_4 # seqlets: 367 SequenceContrib ScoresHyp_Contrib Scores	Atf4, Ddit3, Cebpg, Cebpb, Cebpd, Nfil3, Cebpa, Dbp
metacluster_0/pattern_5 # seqlets: 311 SequenceContrib ScoresHyp_Contrib Scores	Trp73, Trp63, Trp53
metacluster_0/pattern_6 # seqlets: 109 SequenceContrib ScoresHyp_Contrib Scores	Atf2, Jdp2, Creb5, Atf7, Creb1, Atf3, Crem, Nfil3, Mafb
metacluster_0/pattern_7 # seqlets: 88 SequenceContrib ScoresHyp_Contrib Scores	Ctcf, Ctcfl
metacluster_0/pattern_8 # seqlets: 69 SequenceContrib ScoresHyp_Contrib Scores	Tfap2a, Tfap2c, Tfap2e, Tfap2b
metacluster_0/pattern_9 # seqlets: 66 SequenceContrib ScoresHyp_Contrib Scores