# Restrict TensorFlow to GPUs 3 and 5.
# CUDA_VISIBLE_DEVICES must be set before the CUDA context is created, so set
# it before importing keras (which pulls in TensorFlow). The original value
# "3, 5" contained spaces; CUDA expects a plain comma-separated list and may
# stop parsing at a malformed entry, silently dropping the second GPU.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3,5"

import basepair
import modisco
from keras.models import Model, load_model
from basepair.losses import twochannel_multinomial_nll
from basepair.utils import read_pkl
from basepair.eval import evaluate

# Load the trained multi-task model; the custom NLL loss must be registered
# for deserialization to succeed.
model = load_model("model.h5",
                   custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})

# train/valid/test are presumably (inputs, targets) tuples -- valid[0]/valid[1]
# are used that way below. TODO confirm against read_pkl's producer.
train, valid, test = read_pkl("/users/avsec/workspace/basepair-workflow/models/0/data.pkl")

# Quick sanity check of model quality on the validation split.
evaluate(model, valid[0], valid[1])
# Output heads in model output order: 4 profile heads then 4 count heads.
task_names = ["profile/Oct4", "profile/Sox2", "profile/Klf4", "profile/Nanog",
              "counts/Oct4", "counts/Sox2", "counts/Klf4", "counts/Nanog"]
import keras.backend as K

inp = model.inputs[0]

# One gradient function per (task, strand): fn_pos uses channel 0, fn_neg
# channel 1. Each maps [input_batch] -> d(objective)/d(input).
fn_pos = {}
fn_neg = {}
for task_id, task_name in enumerate(task_names):
    head = model.outputs[task_id]
    for channel, target_fns in ((0, fn_pos), (1, fn_neg)):
        if "counts" in task_name:
            # Count heads are indexed as (batch, channel): take the scalar
            # output for this channel directly.
            objective = head[:, channel]
        else:
            # Profile heads are indexed as (batch, position, channel).
            # Weighted-sum trick: sum(stop_grad(softmax(logits)) * logits)
            # so the gradient weights each position by its (frozen)
            # predicted profile probability.
            logits = head[:, :, channel]
            objective = K.sum(K.stop_gradient(K.softmax(logits)) * logits,
                              axis=-1)
        target_fns[task_name] = K.function([inp], K.gradients(objective, inp))
import numpy as np
from basepair.data import numpy_minibatch

# Input gradients over the whole test set, evaluated in minibatches of 512
# and stitched back together along the example axis.
grads_pos = {}
grads_neg = {}
for task_name in task_names:
    for grad_fns, out_dict in ((fn_pos, grads_pos), (fn_neg, grads_neg)):
        chunks = [np.array(grad_fns[task_name]([batch])).squeeze()
                  for batch in numpy_minibatch(test[0], 512)]
        out_dict[task_name] = np.concatenate(chunks)

# Setup different scores
hyp_scores = {}
scores = {}
for task_name in task_names:
    # Hypothetical scores: both strands' gradients summed, then mean-centered
    # across the base axis; actual scores mask them with the observed
    # one-hot sequence.
    centered = grads_pos[task_name] + grads_neg[task_name]
    centered = centered - centered.mean(-1, keepdims=True)
    hyp_scores[task_name] = centered
    scores[task_name] = centered * test[0]
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt

# Sanity-check plot: actual vs. hypothetical contribution scores for the
# first test sequence of the Oct4 profile task.
fig, (ax0, ax1) = plt.subplots(2, 1, sharex=True, figsize=(20, 6))
for axis, title, track in ((ax0, "scores", scores["profile/Oct4"][0]),
                           (ax1, "hyp_scores", hyp_scores["profile/Oct4"][0])):
    axis.set_title(title)
    seqlogo(track, ax=axis)
# Aliases in the naming scheme the modisco track-building code expects.
onehot_data = test[0]
task_to_scores = scores
task_to_hyp_scores = hyp_scores

from collections import OrderedDict, Counter

# Compute the contributions at each position, for each task (total
# contribution per position = sum over the 4 base channels).
# NOTE(review): this value is recomputed in a per-sequence list format
# further down before it is actually consumed.
per_position_contrib_scores = OrderedDict(
    (task, np.sum(task_to_scores[task], axis=2)) for task in task_names)
from modisco import coordproducers
from importlib import reload
reload(coordproducers)
from modisco import core


def _revcomp(tracks):
    # Reverse-complement a collection of (position, base) arrays by flipping
    # both axes of every element.
    return [t[::-1, ::-1] for t in tracks]


# One contribution-score track and one hypothetical-contribution track per
# task, each carrying forward and reverse-complement views.
contrib_scores_tracks = [
    core.DataTrack(name=task + "_contrib_scores",
                   fwd_tracks=task_to_scores[task],
                   rev_tracks=_revcomp(task_to_scores[task]),
                   has_pos_axis=True)
    for task in task_names]
hypothetical_contribs_tracks = [
    core.DataTrack(name=task + "_hypothetical_contribs",
                   fwd_tracks=task_to_hyp_scores[task],
                   rev_tracks=_revcomp(task_to_hyp_scores[task]),
                   has_pos_axis=True)
    for task in task_names]
onehot_track = core.DataTrack(name="sequence",
                              fwd_tracks=onehot_data,
                              rev_tracks=_revcomp(onehot_data),
                              has_pos_axis=True)

track_set = core.TrackSet(
    data_tracks=contrib_scores_tracks + hypothetical_contribs_tracks + [onehot_track])
# load modisco object
# First recompute contribution scores on the validation set, with the same
# recipe as for the test set above: per-channel input gradients, summed,
# mean-centered over the base axis, masked by the one-hot input.
grads_pos_valid = {}
grads_neg_valid = {}
for task_name in task_names:
    grads_pos_valid[task_name] = np.concatenate(
        [np.array(fn_pos[task_name]([batch])).squeeze()
         for batch in numpy_minibatch(valid[0], 512)])
    grads_neg_valid[task_name] = np.concatenate(
        [np.array(fn_neg[task_name]([batch])).squeeze()
         for batch in numpy_minibatch(valid[0], 512)])

# Setup different scores
hyp_scores_valid = {}
scores_valid = {}
for task_name in task_names:
    hyp = grads_pos_valid[task_name] + grads_neg_valid[task_name]
    hyp = hyp - hyp.mean(-1, keepdims=True)
    hyp_scores_valid[task_name] = hyp
    scores_valid[task_name] = hyp * valid[0]

onehot_data_valid = valid[0]
task_to_scores_valid = scores_valid
task_to_hyp_scores_valid = hyp_scores_valid

import h5py
import modisco
import modisco.util
import modisco.core
import modisco.tfmodisco_workflow.seqlets_to_patterns
from modisco.tfmodisco_workflow import workflow

# Track set over the validation data, needed to rehydrate the saved
# TF-MoDISco results from HDF5.
track_set_valid = modisco.tfmodisco_workflow.workflow.prep_track_set(
    task_names=task_names,
    contrib_scores=task_to_scores_valid,
    hypothetical_contribs=task_to_hyp_scores_valid,
    one_hot=onehot_data_valid)

# Use a context manager so the HDF5 handle is closed even if from_hdf5
# raises (the previous explicit open/close leaked the handle on error).
with h5py.File("modisco_results_on_valid.hdf5", "r") as grp:
    loaded_tfmodisco_results = workflow.TfModiscoResults.from_hdf5(
        grp, track_set=track_set_valid)
# get seqlets from test set
# Per-position score tracks in the list-of-1D-arrays format the multitask
# seqlet creator expects (one (length,) array per sequence) -- this replaces
# the earlier 2D-array version of per_position_contrib_scores.
per_position_contrib_scores = OrderedDict([
    (x, [np.sum(s,axis=1) for s in task_to_scores[x]]) for x in task_names])
# Extract seqlets from the test-set tracks, re-using the thresholding
# results stored with the validation-run modisco object (presumably so the
# test-set seqlets are comparable to the originals -- confirm).
seqlets_from_data =(
    loaded_tfmodisco_results.multitask_seqlet_creation_results.multitask_seqlet_creator(
        task_name_to_score_track=per_position_contrib_scores,
        track_set=track_set,
        task_name_to_thresholding_results=
        loaded_tfmodisco_results
        .multitask_seqlet_creation_results
        .task_name_to_thresholding_results)).final_seqlets
from modisco import affinitymat
# NOTE(review): the reload() calls below are notebook-development artifacts;
# harmless when run as a script, but unnecessary.
reload(affinitymat.core)
reload(affinitymat)
from modisco import hit_scoring
reload(hit_scoring.fast_hit_scoring)
reload(hit_scoring)
from collections import OrderedDict
from modisco import aggregator
# Whether to trim each discovered pattern down to its high-information core
# (via trim_pssm_idx below) before building the pattern scorers.
trim_pattern = True
def trim_pssm_idx(pssm, frac=0.05):
    """Return (start, end) indices of the informative core of a PSSM.

    A position is kept if its total absolute score exceeds
    ``frac * max(per-position total)``; flanking positions below that
    threshold are trimmed away.

    Args:
        pssm: 2D array-like of shape (length, n_bases).
        frac: fraction of the maximum per-position total used as the
            trimming threshold.

    Returns:
        Tuple ``(start, end)`` of ints; ``end`` is inclusive (the index of
        the last position above threshold), matching the original loops.

    Raises:
        ValueError: if ``pssm`` has no positions (the original code raised
            an accidental NameError here).
    """
    totals = np.abs(np.asarray(pssm)).sum(axis=-1)
    if totals.size == 0:
        raise ValueError("cannot trim an empty pssm")
    threshold = totals.max() * frac
    above = np.nonzero(totals > threshold)[0]
    if above.size == 0:
        # All-zero pssm: nothing exceeds the threshold. Keep the full span
        # instead of the inverted (len-1, 0) pair the old fall-through loops
        # produced, which yielded a negative window size downstream.
        return 0, len(totals) - 1
    return int(above[0]), int(above[-1])
def ic_scale(x):
    """Scale *x* by per-position information content.

    Delegates to modisco's viz_sequence.ic_scale with a fixed background
    of base frequencies (A, C, G, T) = (0.27, 0.23, 0.23, 0.27) --
    presumably genome-wide averages; confirm provenance.
    """
    from modisco.visualization import viz_sequence
    bg = np.array([0.27, 0.23, 0.23, 0.27])
    return viz_sequence.ic_scale(x, background=bg)
# Window size (positions) each seqlet is trimmed to before being scored
# against a pattern.
seqlet_size_to_score_with = 25
metacluster_idx_to_scorer = OrderedDict()  # metacluster idx -> max-over-patterns scorer
all_pattern_scorers = []  # flat list of per-pattern scorers across all metaclusters
all_pattern_names = []    # parallel names: "metacluster_<i>,pattern_<j>"
# loop through the metaclusters
for metacluster_name in sorted(loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results.keys()):
    submetacluster_results = (loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results[metacluster_name])
    activity_pattern = submetacluster_results.activity_pattern
    # Only tasks with a non-zero entry in this metacluster's activity
    # pattern participate in the similarity computations below.
    relevant_task_names = [task_name for (task_name, x) in
                           zip(task_names, activity_pattern) if np.abs(x) != 0]
    if trim_pattern:
        # Trim each pattern to its high-information core; window size comes
        # from the IC-scaled sequence PSSM.
        trim_sizes = {}  # pattern_idx -> trimmed window size (re-used further down)
        trimmed_patterns = []
        for pattern_idx, pattern in\
            enumerate(submetacluster_results.
                      seqlets_to_patterns_result.patterns):
            pssm = ic_scale(pattern["sequence"].fwd)
            t1, t2 = trim_pssm_idx(pssm)
            # NOTE(review): trim_pssm_idx returns an inclusive end index, so
            # the informative span is t2 - t1 + 1 wide; t2 - t1 drops one
            # position -- confirm this is intended.
            trim_sizes[pattern_idx] = t2 - t1
            trimmer = aggregator.TrimToBestWindow(
                window_size=trim_sizes[pattern_idx],
                track_names=([x+"_contrib_scores" for x in relevant_task_names]
                             +[x+"_hypothetical_contribs" for x in relevant_task_names]))
            trimmed_patterns.extend(trimmer([pattern]))
        # Replace the stored patterns in-place with their trimmed versions.
        submetacluster_results.seqlets_to_patterns_result.patterns = trimmed_patterns
    pattern_comparison_settings = affinitymat.core.PatternComparisonSettings(
        track_names=([x + "_contrib_scores" for x in relevant_task_names] +
                     [x + "_hypothetical_contribs" for x in relevant_task_names]),  # only compare across relevant tasks
        track_transformer=affinitymat.L1Normalizer(),
        min_overlap=0.7)
    # Default pattern->seqlets similarity computer (fixed 25-position seqlet
    # trim window); overridden per-pattern below when trim_pattern is on.
    pattern_to_seqlets_sim_computer = hit_scoring.PatternsToSeqletsSimComputer(
        pattern_comparison_settings=pattern_comparison_settings,
        cross_metric_computer=affinitymat.core.ParallelCpuCrossMetricOnNNpairs(
            n_cores=10,
            cross_metric_single_region=affinitymat.core.CrossContinJaccardSingleRegionWithArgmax(),
            verbose=False),
        seqlet_trimmer=modisco.hit_scoring.SeqletTrimToBestWindow(
            window_size=seqlet_size_to_score_with,
            track_names=[x + "_contrib_scores" for x
                         in relevant_task_names])
    )
    # Get a list of scorers for all the patterns in the metacluster
    metacluster_pattern_scorers = []
    if submetacluster_results.seqlets_to_patterns_result.patterns is None or \
       len(submetacluster_results.seqlets_to_patterns_result.patterns) == 0:
        # metacluster has no patterns
        # don't append anything
        continue
    for pattern_idx, pattern in\
        enumerate(submetacluster_results.
                  seqlets_to_patterns_result.patterns):
        # metacluster_name has the form "metacluster_<idx>".
        metacluster_idx = int(metacluster_name.split("_")[1])
        all_pattern_names.append("metacluster_"+str(metacluster_idx)
                                 +",pattern_"+str(pattern_idx))
        if trim_pattern:
            # Rebuild the sim computer so the seqlet trim window never
            # exceeds this (trimmed) pattern's own window size.
            pattern_to_seqlets_sim_computer = hit_scoring.PatternsToSeqletsSimComputer(
                pattern_comparison_settings=pattern_comparison_settings,
                cross_metric_computer=affinitymat.core.ParallelCpuCrossMetricOnNNpairs(
                    n_cores=10,
                    cross_metric_single_region=affinitymat.core.CrossContinJaccardSingleRegionWithArgmax(),
                    verbose=False),
                seqlet_trimmer=modisco.hit_scoring.SeqletTrimToBestWindow(
                    window_size=min(seqlet_size_to_score_with, trim_sizes[pattern_idx]),
                    track_names=[x + "_contrib_scores" for x
                                 in relevant_task_names])
            )
        pattern_scorer = hit_scoring.RankBasedPatternScorer(
            aggseqlets=pattern,
            patterns_to_seqlets_sim_computer=
            pattern_to_seqlets_sim_computer)
        metacluster_pattern_scorers.append(pattern_scorer)
        all_pattern_scorers.append(pattern_scorer)
    #This is the final scorer for the metacluster;
    # it takes the maximum score produced by all the
    # individual scorers
    max_rank_based_pattern_scorer = hit_scoring.MaxRankBasedPatternScorer(
        pattern_scorers=metacluster_pattern_scorers)
    # NOTE(review): metacluster_idx is assigned inside the pattern loop;
    # safe only because empty-pattern metaclusters `continue` before here.
    metacluster_idx_to_scorer[metacluster_idx] = max_rank_based_pattern_scorer
# A single scorer spanning every pattern from every metacluster.
cross_metacluster_scorer = hit_scoring.MaxRankBasedPatternScorer(
    pattern_scorers=all_pattern_scorers)
from modisco.visualization import viz_sequence
from collections import defaultdict, OrderedDict
# Assign each test-set seqlet to a metacluster with the metaclusterer that
# was fitted during the validation run.
metacluster_indices = loaded_tfmodisco_results.metaclustering_results.metaclusterer.transform(
    seqlets_from_data).metacluster_indices
# Bucket the seqlets by their assigned metacluster index.
metacluster_idx_to_seqlets = defaultdict(list)
for a_seqlet, metacluster_idx in zip(seqlets_from_data, metacluster_indices):
    metacluster_idx_to_seqlets[metacluster_idx].append(a_seqlet)
#map to patterns within each metacluster
metacluster_idx_to_localized_and_trimmed_seqlets = OrderedDict()
metacluster_to_seqlet_pattern_matches = OrderedDict()
for metacluster_idx in metacluster_idx_to_scorer:
    print("working on metacluster ", metacluster_idx)
    # Score every seqlet in this metacluster against the metacluster's
    # patterns (max-over-patterns scorer built above).
    metacluster_to_seqlet_pattern_matches[metacluster_idx] =\
        metacluster_idx_to_scorer[metacluster_idx](metacluster_idx_to_seqlets[metacluster_idx])
    metacluster_idx_to_localized_and_trimmed_seqlets[metacluster_idx] = []
    submetacluster_results = (loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results['metacluster_'+str(metacluster_idx)])
    # Convert each match into absolute coordinates:
    # (example_idx, start, end, is_revcomp) of the best-matching pattern,
    # where start = seqlet start + the match offset and end = start + the
    # matched pattern's length.
    for score_result, seqlet in \
        zip(metacluster_to_seqlet_pattern_matches[metacluster_idx], metacluster_idx_to_seqlets[metacluster_idx]):
        pattern_start = seqlet.coor.start + score_result.offset
        pattern_end = pattern_start + len(submetacluster_results.seqlets_to_patterns_result.patterns[score_result.pattern_idx])
        metacluster_idx_to_localized_and_trimmed_seqlets[metacluster_idx].append((seqlet.coor.example_idx, pattern_start, pattern_end, score_result.revcomp))