import basepair
import modisco
from keras.models import Model, load_model
from basepair.losses import twochannel_multinomial_nll
# Use gpus 3, 5
import os
# Restrict TensorFlow/CUDA to physical GPUs 3 and 5. This must be set before
# the first CUDA context is created (model loading below); the earlier keras
# import alone does not initialize devices — presumably; confirm for this TF
# version. No spaces in the list: some CUDA parsers stop at unexpected tokens,
# which would silently drop the second GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "3,5"
# Restore the trained model, registering the custom loss so deserialization
# can resolve it.
model = load_model(
    "model.h5",
    custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
from basepair.utils import read_pkl
# Pre-split dataset; each split is presumably an (inputs, targets) pair —
# valid[0]/valid[1] are used that way below.
train, valid, test = read_pkl(
    "/users/avsec/workspace/basepair-workflow/models/0/data.pkl")
from basepair.eval import evaluate
# Quick sanity check of model performance on the validation split.
evaluate(model, valid[0], valid[1])
# One model head per (score type, TF) pair; order must match model.outputs.
task_names = ["{}/{}".format(head_kind, tf)
              for head_kind in ("profile", "counts")
              for tf in ("Oct4", "Sox2", "Klf4", "Nanog")]
import keras.backend as K
# Build, for every task head, Keras functions returning the gradient of a
# scalar "contribution" w.r.t. the one-hot input sequence — one function per
# strand channel (index 0 and index 1).
inp = model.inputs[0]  # the model's input tensor (one-hot DNA sequence)
fn_pos = {}  # task_name -> K.function([input]) -> gradient for channel 0
fn_neg = {}  # task_name -> K.function([input]) -> gradient for channel 1
# NOTE(review): assumes model.outputs is ordered exactly like task_names —
# confirm against the model definition.
for task_id, task_name in enumerate(task_names):
    if "counts" in task_name:
        # Counts heads: indexed as (batch, channel) — take the plain gradient
        # of each strand's scalar count prediction.
        fn_pos[task_name] = K.function([inp], K.gradients(model.outputs[task_id][:, 0], inp))
        fn_neg[task_name] = K.function([inp], K.gradients(model.outputs[task_id][:, 1], inp))
    else:
        # Profile heads: indexed as (batch, position, channel) logits. The
        # scalar differentiated is sum_i stop_grad(softmax(logits))_i * logits_i,
        # i.e. the logits weighted by the gradient-blocked predicted profile,
        # so gradients flow only through the raw logits term.
        fn_pos[task_name] = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(
            model.outputs[task_id][:, :, 0])) * model.outputs[task_id][:, :, 0], axis=-1), inp))
        fn_neg[task_name] = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(
            model.outputs[task_id][:, :, 1])) * model.outputs[task_id][:, :, 1], axis=-1), inp))
import numpy as np
from basepair.data import numpy_minibatch


def _batched_grads(grad_fn, seqs, batch_size=512):
    """Run `grad_fn` over `seqs` in minibatches and concatenate the results."""
    return np.concatenate([np.array(grad_fn([batch])).squeeze()
                           for batch in numpy_minibatch(seqs, batch_size)])


# Per-task input gradients over the test sequences, one array per strand.
grads_pos = {name: _batched_grads(fn_pos[name], test[0]) for name in task_names}
grads_neg = {name: _batched_grads(fn_neg[name], test[0]) for name in task_names}
# Setup different scores
hyp_scores = {}
scores = {}
for name in task_names:
    # Hypothetical scores: strand gradients summed, then mean-centered across
    # the last (base-channel) axis. Actual scores: hypothetical scores masked
    # by the observed one-hot sequence.
    total_grad = grads_pos[name] + grads_neg[name]
    hyp_scores[name] = total_grad - total_grad.mean(-1, keepdims=True)
    scores[name] = hyp_scores[name] * test[0]
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt
# Side-by-side sanity plot: actual vs. hypothetical scores for one example.
fig, axes = plt.subplots(2, 1, sharex=True, figsize=(20, 6))
axes[0].set_title("scores")
seqlogo(scores["profile/Oct4"][0], ax=axes[0])
axes[1].set_title("hyp_scores")
seqlogo(hyp_scores["profile/Oct4"][0], ax=axes[1])
onehot_data = test[0]  # one-hot encoded test-set input sequences
task_to_scores = scores  # alias: actual (sequence-masked) contribution scores
task_to_hyp_scores = hyp_scores  # alias: hypothetical contribution scores
from collections import OrderedDict, Counter
#Compute the contributions at each position, for each task
# NOTE(review): this dict is rebuilt further down (with a per-example list
# structure) before its first use — this version appears to be dead code.
per_position_contrib_scores = OrderedDict([
    (x, np.sum(task_to_scores[x],axis=2))
    for x in task_names])
from modisco import coordproducers
from importlib import reload
reload(coordproducers)
from modisco import core


def _rev_comp_tracks(tracks):
    """Reverse-complement each track: flip the position and channel axes."""
    return [t[::-1, ::-1] for t in tracks]


# Wrap contribution scores, hypothetical scores and the one-hot sequences
# into modisco DataTracks (forward + reverse-complement views), then bundle
# everything into a single TrackSet.
contrib_scores_tracks = [
    core.DataTrack(name=key + "_contrib_scores",
                   fwd_tracks=task_to_scores[key],
                   rev_tracks=_rev_comp_tracks(task_to_scores[key]),
                   has_pos_axis=True)
    for key in task_names]
hypothetical_contribs_tracks = [
    core.DataTrack(name=key + "_hypothetical_contribs",
                   fwd_tracks=task_to_hyp_scores[key],
                   rev_tracks=_rev_comp_tracks(task_to_hyp_scores[key]),
                   has_pos_axis=True)
    for key in task_names]
onehot_track = core.DataTrack(name="sequence",
                              fwd_tracks=onehot_data,
                              rev_tracks=_rev_comp_tracks(onehot_data),
                              has_pos_axis=True)
track_set = core.TrackSet(data_tracks=(contrib_scores_tracks
                                       + hypothetical_contribs_tracks
                                       + [onehot_track]))
# load modisco object
grads_pos_valid = {}
grads_neg_valid = {}
for task_name in task_names:
grads_pos_valid[task_name] = np.concatenate([np.array(fn_pos[task_name]([batch])).squeeze()
for batch in numpy_minibatch(valid[0], 512)])
grads_neg_valid[task_name] = np.concatenate([np.array(fn_neg[task_name]([batch])).squeeze()
for batch in numpy_minibatch(valid[0], 512)])
# Setup different scores
hyp_scores_valid = {}
scores_valid = {}
for task_name in task_names:
hyp_scores_valid[task_name] = grads_pos_valid[task_name] + grads_neg_valid[task_name]
hyp_scores_valid[task_name] = hyp_scores_valid[task_name] - hyp_scores_valid[task_name].mean(-1, keepdims=True)
scores_valid[task_name] = hyp_scores_valid[task_name] * valid[0]
onehot_data_valid = valid[0]
task_to_scores_valid = scores_valid
task_to_hyp_scores_valid = hyp_scores_valid
import h5py
import modisco
import modisco.util
import modisco.core
import modisco.tfmodisco_workflow.seqlets_to_patterns
from modisco.tfmodisco_workflow import workflow

# Track set over the validation data, matching the tracks the saved modisco
# run was computed on.
track_set_valid = modisco.tfmodisco_workflow.workflow.prep_track_set(
    task_names=task_names,
    contrib_scores=task_to_scores_valid,
    hypothetical_contribs=task_to_hyp_scores_valid,
    one_hot=onehot_data_valid)

# Use a context manager so the HDF5 handle is closed even if from_hdf5
# raises — the original explicit close() would have been skipped on error,
# leaking the file handle.
with h5py.File("modisco_results_on_valid.hdf5", "r") as grp:
    loaded_tfmodisco_results = \
        workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set_valid)
# get seqlets from test set
# Per-task total contribution at each position (summed over the base axis of
# each example's score matrix) — the score track the seqlet creator scans.
per_position_contrib_scores = OrderedDict([
    (x, [np.sum(s, axis=1) for s in task_to_scores[x]]) for x in task_names])
# Re-apply the saved multitask seqlet creator — with the thresholds fitted on
# the validation run — to the test-set tracks to extract seqlets on new data.
seqlets_from_data = (
    loaded_tfmodisco_results.multitask_seqlet_creation_results.multitask_seqlet_creator(
        task_name_to_score_track=per_position_contrib_scores,
        track_set=track_set,
        task_name_to_thresholding_results=
        loaded_tfmodisco_results
        .multitask_seqlet_creation_results
        .task_name_to_thresholding_results)).final_seqlets
from modisco import affinitymat
# Dev-time reloads so local edits to the modisco modules are picked up
# without restarting the interpreter; harmless in a fresh session.
reload(affinitymat.core)
reload(affinitymat)
from modisco import hit_scoring
reload(hit_scoring.fast_hit_scoring)
reload(hit_scoring)
from collections import OrderedDict

# Width of the trimmed window used when scoring seqlets against patterns.
seqlet_size_to_score_with = 25

# Build one scorer per metacluster (max over that metacluster's patterns)
# plus a single scorer across all patterns from all metaclusters.
metacluster_idx_to_scorer = OrderedDict()
all_pattern_scorers = []
all_pattern_names = []
for metacluster_name in sorted(
        loaded_tfmodisco_results
        .metacluster_idx_to_submetacluster_results.keys()):
    submetacluster_results = (
        loaded_tfmodisco_results
        .metacluster_idx_to_submetacluster_results[metacluster_name])
    # e.g. "metacluster_1" -> 1. Hoisted out of the pattern loop: it was
    # recomputed for every pattern and was only accidentally still in scope
    # for the dict insert at the bottom of this loop body.
    metacluster_idx = int(metacluster_name.split("_")[1])
    activity_pattern = submetacluster_results.activity_pattern
    # Tasks with a non-zero entry in this metacluster's activity pattern.
    relevant_task_names = [task_name for (task_name, x) in
                           zip(task_names, activity_pattern) if np.abs(x) != 0]
    patterns_in_submetacluster = (
        submetacluster_results.seqlets_to_patterns_result.patterns)
    if patterns_in_submetacluster is None or len(patterns_in_submetacluster) == 0:
        # Metacluster produced no patterns: skip it. The guard is moved ahead
        # of the comparison-settings setup, which is wasted work otherwise.
        continue
    pattern_comparison_settings = affinitymat.core.PatternComparisonSettings(
        track_names=([x + "_contrib_scores" for x in relevant_task_names]
                     + [x + "_hypothetical_contribs" for x in relevant_task_names]),
        track_transformer=affinitymat.L1Normalizer(),
        min_overlap=0.7)
    pattern_to_seqlets_sim_computer = hit_scoring.PatternsToSeqletsSimComputer(
        pattern_comparison_settings=pattern_comparison_settings,
        cross_metric_computer=affinitymat.core.ParallelCpuCrossMetricOnNNpairs(
            n_cores=1,
            cross_metric_single_region=
            affinitymat.core.CrossContinJaccardSingleRegionWithArgmax(),
            verbose=False),
        seqlet_trimmer=modisco.hit_scoring.SeqletTrimToBestWindow(
            window_size=seqlet_size_to_score_with,
            track_names=[x + "_contrib_scores" for x in relevant_task_names]))
    # One RankBasedPatternScorer per pattern in the metacluster.
    metacluster_pattern_scorers = []
    for pattern_idx, pattern in enumerate(patterns_in_submetacluster):
        all_pattern_names.append("metacluster_" + str(metacluster_idx)
                                 + ",pattern_" + str(pattern_idx))
        pattern_scorer = hit_scoring.RankBasedPatternScorer(
            aggseqlets=pattern,
            patterns_to_seqlets_sim_computer=pattern_to_seqlets_sim_computer)
        metacluster_pattern_scorers.append(pattern_scorer)
        all_pattern_scorers.append(pattern_scorer)
    # Final scorer for this metacluster: the maximum score produced by all
    # the individual per-pattern scorers.
    metacluster_idx_to_scorer[metacluster_idx] = \
        hit_scoring.MaxRankBasedPatternScorer(
            pattern_scorers=metacluster_pattern_scorers)

# Scorer spanning every pattern from every metacluster.
cross_metacluster_scorer = hit_scoring.MaxRankBasedPatternScorer(
    pattern_scorers=all_pattern_scorers)
from modisco.visualization import viz_sequence
from collections import defaultdict, OrderedDict

# Assign each test-set seqlet to a metacluster with the fitted metaclusterer.
metacluster_indices = (
    loaded_tfmodisco_results.metaclustering_results.metaclusterer
    .transform(seqlets_from_data).metacluster_indices)
metacluster_idx_to_seqlets = defaultdict(list)
for a_seqlet, mc_idx in zip(seqlets_from_data, metacluster_indices):
    metacluster_idx_to_seqlets[mc_idx].append(a_seqlet)

#visualize a few seqlets from each metacluster
num_to_viz = 3
for metacluster_idx, mc_seqlets in metacluster_idx_to_seqlets.items():
    print("Example seqlets for metacluster ", metacluster_idx, "('-1' is 'unmapped')")
    for a_seqlet in mc_seqlets[:num_to_viz]:
        # Plot the profile contribution-score track of each TF for this seqlet.
        for tf in ("Oct4", "Sox2", "Klf4", "Nanog"):
            viz_sequence.plot_weights(a_seqlet["profile/" + tf + "_contrib_scores"].fwd)
#map to patterns within each metacluster
# Score each metacluster's seqlets against that same metacluster's patterns.
metacluster_to_seqlet_pattern_matches = OrderedDict()
for mc_idx, mc_scorer in metacluster_idx_to_scorer.items():
    print(mc_idx)
    metacluster_to_seqlet_pattern_matches[mc_idx] = mc_scorer(
        metacluster_idx_to_seqlets[mc_idx])