%load_ext autoreload
%autoreload 2
import pandas as pd
import numpy as np
import glob
import os
from collections import OrderedDict
import pickle
import h5py
def load_deeplift_data(deeplift_hdf, keys=('scores', 'one_hot')):
    """Load DeepLIFT outputs from an HDF5 file into an OrderedDict.

    Parameters
    ----------
    deeplift_hdf : str
        Path to an HDF5 file with datasets 'deeplift_scores', 'inputs',
        and (when 'peaks' is requested) 'metadata/range/*'.
    keys : iterable of str, optional
        Which entries to populate; any of 'one_hot', 'peaks',
        'hyp_scores', 'scores'.

    Returns
    -------
    OrderedDict
        Maps each requested key to its ndarray (or DataFrame for 'peaks').
    """
    deeplift_data = OrderedDict()
    # Context manager guarantees the HDF5 handle is closed even on error
    # (the previous version opened the file and never closed it).
    with h5py.File(deeplift_hdf, "r") as fp:
        need_scores = 'scores' in keys
        # Only read the datasets actually required by `keys`; the old code
        # also loaded 'shuffled_inputs', which was never used.
        if need_scores or 'hyp_scores' in keys:
            hyp_scores = fp['deeplift_scores'][:]
        if need_scores or 'one_hot' in keys:
            one_hot = fp['inputs'][:]
        if 'one_hot' in keys:
            deeplift_data['one_hot'] = one_hot
        if 'peaks' in keys:
            # Rebuild the peak metadata table (chrom/start/end, etc.).
            df = OrderedDict()
            for key in list(fp['metadata/range']):
                df[key] = fp['metadata/range/{}'.format(key)][:]
            df = pd.DataFrame(df)
            # h5py returns bytes; decode chromosome names to str.
            df['chr'] = np.array(
                [df['chr'].values[i].decode('utf-8') for i in range(df.shape[0])])
            deeplift_data['peaks'] = df
        if 'hyp_scores' in keys:
            deeplift_data['hyp_scores'] = hyp_scores
        if need_scores:
            # no need to sum before mult, taken care of in deeplift,
            # not in deepshap though
            deeplift_data['scores'] = np.multiply(hyp_scores, one_hot)
    return deeplift_data
# deeplift_hdf = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/deeplift_out/summit.h5"
# deeplift_data = load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot'])
# print(deeplift_data['scores'].shape, deeplift_data['one_hot'].shape)
# traj_no = -1
# if traj_no==8:
# indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1, traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# else:
# indices = traj_lab[:, traj_no]==1
# if traj_no<0:
# _score = deeplift_data['scores']
# _hyp_score = deeplift_data['hyp_scores']
# _one_hot = deeplift_data['one_hot']
# else:
# _score = deeplift_data['scores'][indices,:,:]
# _hyp_score = deeplift_data['hyp_scores'][indices,:,:]
# _one_hot = deeplift_data['one_hot'][indices,:,:]
import modisco
# Laplace null distribution used by TF-MoDISco to calibrate per-position
# importance-score significance (5000 sampled values).
null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000)
# Load one-hot sequences, per-base importance scores, and hypothetical
# (gradient) scores from the precomputed GGR scanmotifs HDF5.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
ggr_data = OrderedDict()
with h5py.File(ggrfile, "r") as fp:
    ggr_data['one_hot'] = fp['sequence'][:]
    ggr_data['scores'] = fp['sequence-weighted'][:]
    ggr_data['hyp_scores'] = fp['gradients'][:]
# Notebook cell output: sanity-check that the three arrays line up.
ggr_data['hyp_scores'].shape, ggr_data['scores'].shape, ggr_data['one_hot'].shape
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    # print(list(fp))
    # Per-example trajectory membership matrix (column j == trajectory j).
    traj_lab = fp['TRAJ_LABELS'][:]
traj_lab.shape
# #0, 7, 8-10-11, and 9
# indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1, traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# indices.shape, deeplift_data['scores'][indices,:,:].shape, np.sum(indices)
# Choose which trajectory's examples to feed into TF-MoDISco.
traj_no = 8
if traj_no==8:
    # Trajectories 8, 10 and 11 are pooled into a single group
    # (see the "8-10-11" note in the commented cell above).
    indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1, traj_lab[:, 10]==1), traj_lab[:, 11]==1)
else:
    indices = traj_lab[:, traj_no]==1
if traj_no<0:
    # Negative traj_no means: use all examples.
    # Axis 1 index 0 drops what is presumably a singleton task axis —
    # TODO(review): confirm against the scanmotifs HDF5 layout.
    _score = ggr_data['scores'][:,0,:,:]
    _hyp_score = ggr_data['hyp_scores'][:,0,:,:]
    _one_hot = ggr_data['one_hot'][:,0,:,:]
else:
    _score = ggr_data['scores'][indices,0,:,:]
    _hyp_score = ggr_data['hyp_scores'][indices,0,:,:]
    _one_hot = ggr_data['one_hot'][indices,0,:,:]
# Cell output: confirm the three selections have matching shapes.
_score.shape, _hyp_score.shape, _one_hot.shape
# Run the full TF-MoDISco workflow on the selected trajectory's scores.
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
    #Slight modifications from the default settings
    sliding_window_size=21,
    flank_size=10,
    target_seqlet_fdr=0.05,
    min_passing_windows_frac=0.03,
    max_seqlets_per_metacluster=20000,
    seqlets_to_patterns_factory=
        modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            #Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery
            # software like MEME to improve the TF-MoDISco clustering. To use the meme-based
            # initialization, you would specify the initclusterer_factory as shown in the
            # commented-out code below:
            #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory(
            #    meme_command="meme", base_outdir="meme_out",
            #    max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1),
            embedder_factory=(modisco.seqlet_embedding
                .advanced_gapped_kmer.AdvancedGappedKmerEmbedderFactory(max_entries=500)),
            trim_to_window_size=30,
            initial_flank_to_add=10,
            #kmer_len=5, num_gaps=1,
            #num_mismatches=0,
            final_min_cluster_size=30)
)(
    # Single task; the example subset was already restricted upstream
    # via `indices`.
    task_names=["early0"],# "task1", "task2"],
    contrib_scores={'early0': _score}, #[:,420:580,:]
    hypothetical_contribs={'early0': _hyp_score }, #[:,420:580,:]
    one_hot=_one_hot, #[:,420:580,:]
    null_per_pos_scores = null_per_pos_scores)
# Persist the TF-MoDISco results to HDF5.
modisco_hdf = '/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_8_1k/results.hdf5'
# Open explicitly in append mode: "a" matches the legacy h5py default, while
# omitting the mode is deprecated (and defaults to read-only "r" in modern
# h5py, which would make save_hdf5 fail). The context manager also ensures
# the file is closed and flushed — the old code never closed it.
with h5py.File(modisco_hdf, "a") as grp:
    tfmodisco_results.save_hdf5(grp)
print('..')
from matlas.matches import DenovoModisco, DenovoHomer
from vdom.helpers import (b, summary, details)
from IPython.display import display
import numpy as np
def display_denovo_patterns(sample_name, modiscodir, match_threshold=0.05):
    """Render a collapsible summary of de-novo MoDISco patterns for one sample.

    Loads previously matched motifs from `modiscodir`, filters them by
    `match_threshold`, and displays the pattern table inside an expandable
    <details> element in the notebook.
    """
    display(summary(b(sample_name)))
    denovo = DenovoModisco(modiscodir)
    # denovo.fetch_tomtom_matches(save_report=True,
    #                             tomtom_dir= "{0}/{1}_tomtomout".format(modiscodir, "CISBP_2.00"))
    denovo.load_matched_motifs()
    denovo.get_motif_per_celltype(match_threshold=match_threshold)
    pattern_tab, pattern_dict = denovo.visualize_pattern_table()
    header = summary(
        'Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
        ' in ', b(sample_name),
        ": #{}".format(len(pattern_dict)),
    )
    display(details(header, pattern_tab))
    # denovo.display_individual_table()
    return None
def show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=0.01):
    """Display de-novo MoDISco patterns matched against the HOCOMOCO
    (non-redundant, annotated) motif database.

    Same flow as display_denovo_patterns, but motif matching uses the
    HOCOMOCO database and filters on q-value.
    """
    db_name = "HOCOMOCO.nonredundant.annotated"
    denovo = DenovoModisco(modiscodir)
    # denovo.fetch_tomtom_matches(
    #     meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
    #     database_name="HOCOMOCO.nonredundant.annotated",
    #     save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(modiscodir, "HOCOMOCO.nonredundant.annotated"))
    denovo.load_matched_motifs(database_name=db_name)
    denovo.get_motif_per_celltype(match_threshold=match_threshold,
                                  match_criteria='q-value',
                                  database_name=db_name)
    # denovo.display_individual_table()
    pattern_tab, pattern_dict = denovo.visualize_pattern_table()
    header = summary(
        'Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
        ' in ', b(sample_name),
        ": #{}".format(len(pattern_dict)),
    )
    display(details(header, pattern_tab))
    return None
# sample_name = 'early_fold0'
# display_denovo_patterns(
# sample_name,
# modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out_1k"
# )
# Generate HOCOMOCO-matched pattern reports for a series of MoDISco runs
# (different trajectory subsets / region widths). The commented-out calls
# are runs that were skipped or inspected in a previous session.
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_out"
show_patterns_using_hoccomocco_db(sample_name, modiscodir)
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_out_1k"
# show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=1e-15)
# sample_name = 'early_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"
# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
# Per-trajectory runs (v10_0, v10_7, v10_8, v10_9):
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_0"
show_patterns_using_hoccomocco_db(sample_name, modiscodir)
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_7"
show_patterns_using_hoccomocco_db(sample_name, modiscodir)
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_8"
show_patterns_using_hoccomocco_db(sample_name, modiscodir)
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_9"
show_patterns_using_hoccomocco_db(sample_name, modiscodir)
# 1kb-region variants of the same runs (currently disabled):
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_0_1k"
# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_7_1k"
# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_8_1k"
# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_9_1k"
# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
# sample_name = 'late_fold0'
# display_denovo_patterns(
# sample_name,
# modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
# )
# sample_name = 'late_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
from matlas.model_test import getSkinModel
from matlas.model_test import setup_keras_session
# Rebuild the skin model in Keras from raw-TensorFlow weights and save it.
setup_keras_session('4')  # presumably pins to GPU device '4' — TODO confirm
init_weights = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/weights_from_raw_tf.p"
# NOTE(review): the meaning of the literal 19 is not visible here —
# presumably the number of output tasks; verify against getSkinModel.
model = getSkinModel(init_weights, 19, classification=False)
model_h5 = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/model.h5"
model.save(model_h5)
model.summary()
# Load labels, raw-TF logits, and one-hot sequences for validating the
# Keras port against the original TensorFlow predictions.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    labels = fp['labels'][:]
    logits = fp['logits'][:]
    seqs = fp['sequence'][:]
# Cell output: sanity-check the shapes.
labels.shape, logits.shape, seqs.shape
from matlas.generators import EmbeddingsGenerator
def get_predictions(cur_seqs, model):
    """Return `model` predictions over `cur_seqs`, batched through an
    EmbeddingsGenerator (batch size 1000)."""
    n_rows = cur_seqs.shape[0]
    generator = EmbeddingsGenerator(cur_seqs, batch_size=1000, num_rows=n_rows)
    # batch = generator.get_batch(i)
    # preds = model.predict_on_batch(batch[0])
    # NOTE(review): predict_generator is deprecated in newer Keras in favour
    # of model.predict(...); kept as-is for behavioural parity.
    predictions = model.predict_generator(
        generator,
        max_queue_size=100,
        workers=1,
        use_multiprocessing=False,
        verbose=1,
    )
    return predictions
# Compare the Keras-port predictions against the raw-TensorFlow logits for
# the first 1000 sequences — they should agree (correlation near 1).
keras_op = get_predictions(np.squeeze(seqs[:1000]), model)
from matplotlib import pylab as plt
# plt.scatter(activations_all['activation_2/Relu:0'][:1000,0,0,0], cnv1[:1000,0,0,0])
plt.scatter(logits[:1000,0], keras_op[:1000,0])
plt.xlabel('raw tensorflow prediction')
plt.ylabel('keras predictions')
import scipy.stats
print(scipy.stats.pearsonr(logits[:1000,0], keras_op[:1000,0]))
print(scipy.stats.spearmanr(logits[:1000,0], keras_op[:1000,0]))
# NOTE(review): wildcard import — retrieve_func_from_model (and, later,
# get_shuffled_seqs helpers) come from matlas.deeplift_run; consider
# importing names explicitly.
from matlas.deeplift_run import *
# Build DeepLIFT contribution-score functions from the saved Keras model.
contrib_funcs, input_layer_shape = retrieve_func_from_model(
    model_h5,
    algorithm="rescale_conv_revealcancel_fc",
    regression=True,
    sequential=False,
    w0=None, w1=None, logger=None)
# Cell output: inspect the expected input layer shape.
input_layer_shape
#provide list of strings to run deeplift
# def read_ggr_active_sequences(ggr_h5):
# with h5py.File(ggr_h5, "r") as fp:
# seqs = fp['sequence.active.string'][:]
# sequences = []
# for seq in seqs:
# sequences.append(seq[0].decode('utf-8'))
# return sequences
# ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
# sequences = read_ggr_active_sequences(ggrfile)
# type(sequences), len(sequences) #sequences[0], sequences[1], seqs[0,0], seqs[1,0]
def get_genome_coordinates(ggr_h5, bed_file):
    """Extract genome intervals from an HDF5's 'example_metadata' dataset
    and write them as a gzipped, tab-separated BED-like file.

    Parameters
    ----------
    ggr_h5 : str
        Path to an HDF5 file whose 'example_metadata' dataset holds byte
        strings of the form "...features=<chrom>:<start>-<end>".
    bed_file : str
        Output path; written gzip-compressed, no header/index, tab-separated.

    Returns
    -------
    None
    """
    with h5py.File(ggr_h5, "r") as fp:
        regions = fp['example_metadata'][:]
    chroms = []
    starts = []
    ends = []
    for region in regions[:, 0]:
        region = region.decode("utf-8")
        # Skip padded/empty metadata rows.
        if region == '':
            continue
        # Parse once instead of re-splitting the same string three times.
        coords = region.split("features=")[1]
        chrom, _, span = coords.partition(":")
        start, _, end = span.partition("-")
        chroms.append(chrom)
        starts.append(start)
        ends.append(end)
    # Column order chrom/start/end matches BED conventions.
    df = pd.DataFrame({'chrom': chroms, 'start': starts, 'end': ends})
    df.to_csv(bed_file, header=False, index=False, sep="\t", compression="gzip")
    return None
# Dump the example coordinates to a gzipped BED file for sequence retrieval.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
bed_file = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/regions.bed.gz"
get_genome_coordinates(ggrfile, bed_file)
from matlas.model_layer import retrieve_sequences
sequences, intervals_wo_flanks = retrieve_sequences(
bed_file,
fasta_file="/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa", flank_size=0)
num_refs_per_seq = 10
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.model_layer import one_hot_encode_along_col_axis
from matlas.dlutils import get_shuffled_seqs
input_data_list, input_references_list = get_shuffled_seqs(sequences[:45], num_refs_per_seq, shuffle_func=dinuc_shuffle,
one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
progress_update=10000)
input_data_list[0].shape, len(sequences[0])
# input_data_list = [np.expand_dims(input_data_list[0], axis=1)]
# input_references_list = [np.expand_dims(input_references_list[0], axis=1)]
from matlas.dlutils import get_given_seq_ref_function
shuffled_score_funcs = {input_name: get_given_seq_ref_function(score_computation_function=score_func)
for input_name, score_func in contrib_funcs.items()}
task_idx = 0
batch_size = 256
num_refs_per_seq = 10
for input_name, score_func in shuffled_score_funcs.items():
hyp_scores = None
b = 10000
c = int(np.ceil(1.0*len(input_data_list[0])/b))
for si in range(c):
if(si==c-1):
tmp = score_func(task_idx=int(task_idx), input_data_list=[input_data_list[0][si*b:len(input_data_list[0])]],
input_references_list=[input_references_list[0][si*b:len(input_data_list[0])]],
num_refs_per_seq=num_refs_per_seq, batch_size=batch_size,
progress_update=10000)
else:
#print('batch: ', si, si*b, (si+1)*b)
tmp = score_func(task_idx=int(task_idx), input_data_list=[input_data_list[0][si*b:(si+1)*b]],
input_references_list=[input_references_list[0][si*b:(si+1)*b]],
num_refs_per_seq=num_refs_per_seq,
batch_size=batch_size,
progress_update=10000)
if(hyp_scores is None):
hyp_scores = tmp
else:
hyp_scores = np.vstack((hyp_scores, tmp))
input_data_list[0] = np.squeeze(input_data_list[0])
input_references_list[0] = np.squeeze(input_references_list[0])
one_hot = input_data_list[0][[range(0, len(input_data_list[0]), num_refs_per_seq)]]
shuffled_onehot = input_references_list[0].reshape((one_hot.shape[0], num_refs_per_seq,
input_references_list[0].shape[-2], #seq_len
input_references_list[0].shape[-1]))#alphabet
scores = np.multiply(hyp_scores, one_hot)
hyp_scores.shape, one_hot.shape, scores.shape
# create_deeplift_h5(bed_file, score_hdf, hyp_scores, one_hot, shuffled_onehot)
# Load the precomputed GGR importance scores for visual comparison with
# the locally recomputed DeepLIFT scores.
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    print(list(fp))
    scores_ggr = fp['sequence-weighted'][:]
    scores_ggr_active = fp['sequence-weighted.active'][:]
# Cell output: compare full vs active-region score shapes.
scores_ggr.shape, scores_ggr_active.shape
import modisco.visualization
from modisco.visualization import viz_sequence
# Visualize a 100-bp window of the GGR-provided importance scores.
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr_active[0,0], subticks_frequency=20)
scores_ggr.shape
import modisco.visualization
from modisco.visualization import viz_sequence
# Same window of the locally recomputed scores, for side-by-side comparison.
viz_sequence.plot_weights(scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(hyp_scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(one_hot[0][500:600], subticks_frequency=20)
# NOTE(review): elsewhere scores_ggr is indexed as scores_ggr[0,0]; here
# scores_ggr[0] keeps one extra axis — confirm this is intentional.
viz_sequence.plot_weights(scores_ggr[0][500:600], subticks_frequency=20)