# Restrict TensorFlow/Keras to GPUs 3 and 5.
# BUGFIX: CUDA_VISIBLE_DEVICES must be set before keras/TF is imported and the
# CUDA runtime initializes, so the env assignment now precedes those imports.
# The list must be comma-separated WITHOUT spaces ("3, 5" can make CUDA drop
# the second entry).
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3,5"

import basepair
import modisco
from keras.models import Model, load_model
from basepair.losses import twochannel_multinomial_nll

# Load the trained model; the custom NLL loss must be passed explicitly so
# keras can deserialize the compiled model.
model = load_model("model.h5",
                   custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})

from basepair.utils import read_pkl
# Each split is indexed as split[0] (inputs) / split[1] (targets) below —
# presumably (x, y) tuples; verify against the pickling code.
train, valid, test = read_pkl("/users/avsec/workspace/basepair-workflow/models/0/data.pkl")

from basepair.eval import evaluate
# Sanity-check model performance on the validation split.
evaluate(model, valid[0], valid[1])
# Model output heads: one profile head and one total-count head per TF,
# profile heads first (the order must match model.outputs).
task_names = ["{}/{}".format(head, tf)
              for head in ("profile", "counts")
              for tf in ("Oct4", "Sox2", "Klf4", "Nanog")]
import keras.backend as K
# Single model input (the one-hot sequence) shared by all gradient functions.
inp = model.inputs[0]
# Per-task backend functions returning d(scalar readout)/d(input) for the
# positive-strand (channel 0) and negative-strand (channel 1) outputs.
fn_pos = {}
fn_neg = {}
# NOTE(review): assumes model.outputs is ordered exactly like task_names,
# with count heads shaped (batch, 2) and profile heads (batch, seq_len, 2) —
# verify against the model definition.
for task_id, task_name in enumerate(task_names):
    if "counts" in task_name:
        # Count heads: gradient of the per-strand scalar count prediction.
        fn_pos[task_name] = K.function([inp], K.gradients(model.outputs[task_id][:, 0], inp))
        fn_neg[task_name] = K.function([inp], K.gradients(model.outputs[task_id][:, 1], inp))
    else:
        # Profile heads: readout is sum_over_positions(softmax(logits) * logits)
        # with the softmax wrapped in stop_gradient, i.e. the profile weights
        # are treated as constants and only the logits carry gradient.
        fn_pos[task_name] = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(
            model.outputs[task_id][:, :, 0])) * model.outputs[task_id][:, :, 0], axis=-1), inp))
        fn_neg[task_name] = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(
            model.outputs[task_id][:, :, 1])) * model.outputs[task_id][:, :, 1], axis=-1), inp))
import numpy as np
from basepair.data import numpy_minibatch


def _batched_gradients(grad_fn, inputs, batch_size=512):
    """Run a keras gradient function over `inputs` in minibatches and
    concatenate the squeezed per-batch results into one array."""
    chunks = [np.array(grad_fn([minibatch])).squeeze()
              for minibatch in numpy_minibatch(inputs, batch_size)]
    return np.concatenate(chunks)


# Input gradients on the test split, per task and strand.
grads_pos = {name: _batched_gradients(fn_pos[name], test[0]) for name in task_names}
grads_neg = {name: _batched_gradients(fn_neg[name], test[0]) for name in task_names}
# Setup different scores
hyp_scores = {}
scores = {}
for name in task_names:
    # Hypothetical contributions: strand-summed input gradients, mean-centered
    # across the base channels at each position.
    combined = grads_pos[name] + grads_neg[name]
    centered = combined - combined.mean(-1, keepdims=True)
    hyp_scores[name] = centered
    # Actual contributions: hypothetical scores masked by the observed
    # one-hot sequence.
    scores[name] = centered * test[0]
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt

# Visual sanity check: contribution-score logos for the first test sequence
# on the Oct4 profile task — actual scores on top, hypothetical scores below,
# sharing the x axis.
fig, (ax0, ax1) = plt.subplots(2, 1, sharex=True, figsize=(20, 6))
ax0.set_title("scores")
seqlogo(scores["profile/Oct4"][0], ax=ax0)
ax1.set_title("hyp_scores")
seqlogo(hyp_scores["profile/Oct4"][0], ax=ax1)
from collections import OrderedDict, Counter

# Aliases matching the names the TF-MoDISco snippets below expect.
onehot_data = test[0]
task_to_scores = scores
task_to_hyp_scores = hyp_scores

# Total contribution per position (summed over the base channels), per task.
per_position_contrib_scores = OrderedDict(
    (task, np.sum(task_to_scores[task], axis=2))
    for task in task_names)
from modisco import coordproducers
from importlib import reload
reload(coordproducers)

# Seqlet-calling hyperparameters.
sliding_window_size = 15   # window over which per-position scores are aggregated
flank_size = 5             # bases added on each side of a called window
target_seqlet_fdr = 0.01   # FDR target for the Laplace-null threshold

#Instantiate the coordinate producer
coord_producer = coordproducers.FixedWindowAroundChunks(
    sliding=sliding_window_size,
    flank=flank_size,
    thresholding_function=coordproducers.LaplaceThreshold(
        min_seqlets=100,
        target_fdr=target_seqlet_fdr,
        verbose=True))

#Call the coordinate producer on per-position contribution scores for each task
task_name_to_coord_producer_results = OrderedDict()
for task_name in per_position_contrib_scores:
    print("#####\nON TASK", task_name, "\n######")
    score_track = per_position_contrib_scores[task_name]
    coord_producer_results = coord_producer(score_track=score_track)
    #Given the coordinates, seqlets can be created by retrieving the
    # necessary data from the TrackSet
    task_name_to_coord_producer_results[task_name] = coord_producer_results
from modisco import core


def _revcomp_each(arrays):
    """Reverse-complement every (length x 4) array by flipping both the
    position axis and the base axis."""
    return [arr[::-1, ::-1] for arr in arrays]


# One DataTrack per task for actual and hypothetical contributions; the
# reverse-strand representation is the flipped array.
contrib_scores_tracks = [
    core.DataTrack(name=task + "_contrib_scores",
                   fwd_tracks=task_to_scores[task],
                   rev_tracks=_revcomp_each(task_to_scores[task]),
                   has_pos_axis=True)
    for task in task_names]
hypothetical_contribs_tracks = [
    core.DataTrack(name=task + "_hypothetical_contribs",
                   fwd_tracks=task_to_hyp_scores[task],
                   rev_tracks=_revcomp_each(task_to_hyp_scores[task]),
                   has_pos_axis=True)
    for task in task_names]
# The one-hot sequence itself as a track.
onehot_track = core.DataTrack(name="sequence",
                              fwd_tracks=onehot_data,
                              rev_tracks=_revcomp_each(onehot_data),
                              has_pos_axis=True)
track_set = core.TrackSet(
    data_tracks=contrib_scores_tracks + hypothetical_contribs_tracks + [onehot_track])
# Materialize seqlets from each task's called coordinates using the TrackSet.
task_name_to_seqlets = OrderedDict(
    (task, track_set.create_seqlets(coords=cp_results.coords))
    for task, cp_results in task_name_to_coord_producer_results.items())
import itertools

# Fraction of mutual overlap at which two seqlets count as duplicates.
overlap_portion = 0.5
#Create a class that resolves overlaps. overlap_detector
# specifies how to decide if seqlets are overlapping, and
# seqlet_comparator specifies how to pick between
# seqlets that are deemed to be overlapping.
overlap_resolver = core.SeqletsOverlapResolver(
    overlap_detector=core.CoordOverlapDetector(overlap_portion),
    seqlet_comparator=core.SeqletComparator(
        # Of two overlapping seqlets, keep the one with the higher call score.
        value_provider=lambda x: x.coor.score))
#take the union over everything, resolving overlaps as specified
# by overlap_resolver
final_seqlets = overlap_resolver(itertools.chain(*task_name_to_seqlets.values()))
# Recompute input gradients and contribution scores on the VALIDATION split
# (same procedure as for the test split above) — these back the track set the
# saved modisco results are loaded against.
grads_pos_valid = {}
grads_neg_valid = {}
for name in task_names:
    pos_chunks = [np.array(fn_pos[name]([chunk])).squeeze()
                  for chunk in numpy_minibatch(valid[0], 512)]
    neg_chunks = [np.array(fn_neg[name]([chunk])).squeeze()
                  for chunk in numpy_minibatch(valid[0], 512)]
    grads_pos_valid[name] = np.concatenate(pos_chunks)
    grads_neg_valid[name] = np.concatenate(neg_chunks)

# Strand-summed, base-mean-centered hypothetical scores and their
# observed-sequence-masked actual scores.
hyp_scores_valid = {}
scores_valid = {}
for name in task_names:
    centered = grads_pos_valid[name] + grads_neg_valid[name]
    centered = centered - centered.mean(-1, keepdims=True)
    hyp_scores_valid[name] = centered
    scores_valid[name] = centered * valid[0]

# Aliases in the naming scheme prep_track_set expects.
onehot_data_valid = valid[0]
task_to_scores_valid = scores_valid
task_to_hyp_scores_valid = hyp_scores_valid
import h5py
import modisco
import modisco.util
import modisco.core
import modisco.tfmodisco_workflow.seqlets_to_patterns
from modisco.tfmodisco_workflow import workflow

# Build a TrackSet over the validation data so the saved modisco results can
# re-attach their seqlets to actual score tracks on load.
track_set_valid = modisco.tfmodisco_workflow.workflow.prep_track_set(
    task_names=task_names,
    contrib_scores=task_to_scores_valid,
    hypothetical_contribs=task_to_hyp_scores_valid,
    one_hot=onehot_data_valid)

# Load previously computed TF-MoDISco results.
# FIX: use a context manager so the HDF5 handle is closed even if
# deserialization raises (the explicit grp.close() was skipped on error).
with h5py.File("modisco_results_on_valid.hdf5", "r") as grp:
    loaded_tfmodisco_results = \
        workflow.TfModiscoResults.from_hdf5(grp, track_set=track_set_valid)
# next step sampling N seqlets for each of the motif objects identified by modisco
num_communities = 0   # total number of patterns across all metaclusters
num_samples = 50      # seqlets kept per pattern: 1 aggregate + 49 sampled
U = []                # flat reference list; exactly num_samples entries per pattern
for metacluster_idx in \
        sorted(loaded_tfmodisco_results
               .metacluster_idx_to_submetacluster_results.keys()):
    for pattern in loaded_tfmodisco_results.metacluster_idx_to_submetacluster_results[metacluster_idx].seqlets_to_patterns_result.patterns:
        num_communities += 1
        # NOTE(review): raises ValueError when a pattern has fewer than
        # num_samples-1 seqlets (sampling without replacement); also, the
        # DeltaQ loop below relies on U holding exactly num_samples
        # contiguous entries per pattern — keep the two in sync.
        ids = np.random.choice(len(pattern.seqlets), num_samples-1, replace=False)
        # Pattern's aggregate seqlet first, then the random sample of members.
        U.append(pattern.to_seqlet())
        for idx in ids:
            U.append(pattern.seqlets[idx])
# Notebook residue: bare expressions, no effect when run as a script.
len(U)
len(final_seqlets)
# making it smaller because it is taking very long...
#final_seqlets = final_seqlets[:100]
# for testing only
# what proportion of seqlets get mapped to the correct cluster when you set
# the candidate seqlets to be the motif seqlets?
metacluster_assignments = []   # per candidate: metacluster it came from
true_assignments = []          # per candidate: GLOBAL pattern/community index
final_seqlets = []             # candidates: up to 50 member seqlets per pattern
# BUGFIX: the true label must be a community index that is global across
# metaclusters, because `best_assignment` in the DeltaQ loop below indexes
# num_samples-sized slices of the global U list. The previous per-metacluster
# enumerate() index restarted at 0 for every metacluster, so labels collided
# whenever more than one metacluster exists.
global_pattern_idx = 0
for metacluster_idx in \
        sorted(loaded_tfmodisco_results
               .metacluster_idx_to_submetacluster_results.keys()):
    patterns = (loaded_tfmodisco_results
                .metacluster_idx_to_submetacluster_results[metacluster_idx]
                .seqlets_to_patterns_result.patterns)
    for pattern in patterns:
        taken = pattern.seqlets[:50]
        final_seqlets.extend(taken)
        true_assignments.extend([global_pattern_idx] * len(taken))
        metacluster_assignments.extend([metacluster_idx] * len(taken))
        global_pattern_idx += 1
print(num_communities, len(final_seqlets), len(true_assignments), len(metacluster_assignments))
# fine grained affmat calculation
from importlib import reload
from modisco import affinitymat
reload(affinitymat.core)
reload(affinitymat)
from modisco import hit_scoring
reload(hit_scoring.fast_hit_scoring)
reload(hit_scoring)
from collections import OrderedDict

# NOTE(review): pattern_to_scorer / pattern_scorers / pattern_names are never
# populated in this chunk — possibly leftovers, or used further down the file.
pattern_to_scorer = OrderedDict()
pattern_scorers = []
pattern_names = []
seqlet_size_to_score_with = 25   # seqlets are trimmed to their best 25-bp window

# Per metacluster: similarity of the reference seqlets (U) to themselves, and
# of the candidate seqlets to the references.
A_motifsseqlets = {}
A_candidateseqlets = {}
for metacluster_idx in \
        sorted(loaded_tfmodisco_results
               .metacluster_idx_to_submetacluster_results.keys()):
    print("now working on metacluster: ", metacluster_idx)
    submetacluster_results = (
        loaded_tfmodisco_results
        .metacluster_idx_to_submetacluster_results[metacluster_idx])
    activity_pattern = submetacluster_results.activity_pattern
    # Score only on tracks of tasks that are active in this metacluster.
    relevant_task_names = [task_name for (task_name, x) in
                           zip(task_names, activity_pattern) if np.abs(x) != 0]
    patterns_in_submetacluster = \
        submetacluster_results.seqlets_to_patterns_result.patterns
    # Compare on both actual and hypothetical contribution tracks,
    # L1-normalized, requiring at least 70% overlap between regions.
    pattern_comparison_settings = affinitymat.core.PatternComparisonSettings(
        track_names=([x + "_contrib_scores" for x in relevant_task_names]
                     + [x + "_hypothetical_contribs" for x in relevant_task_names]),
        track_transformer=affinitymat.L1Normalizer(),
        min_overlap=0.7)
    pattern_to_seqlets_sim_computer = hit_scoring.PatternsToSeqletsSimComputer(
        pattern_comparison_settings=pattern_comparison_settings,
        cross_metric_computer=affinitymat.core.ParallelCpuCrossMetricOnNNpairs(
            n_cores=10,
            cross_metric_single_region=
                affinitymat.core.CrossContinJaccardSingleRegionWithArgmax(),
            verbose=False),
        seqlet_trimmer=modisco.hit_scoring.SeqletTrimToBestWindow(
            window_size=seqlet_size_to_score_with,
            track_names=[x + "_contrib_scores" for x
                         in relevant_task_names]))
    # NOTE(review): both matrices span ALL reference seqlets in U, not just
    # this metacluster's patterns — only the set of tracks scored on differs
    # between metaclusters.
    print("about to get A_motifsseqlets")
    A_motifsseqlets[metacluster_idx] = pattern_to_seqlets_sim_computer(U, U)
    print("about to get A_candidateseqlets")
    A_candidateseqlets[metacluster_idx] = pattern_to_seqlets_sim_computer(final_seqlets, U)
# got a matrix A_motifsseqlets for each metacluster
# Keep only channel 0 of the last axis (the similarity value; the other
# channel presumably carries the argmax offset — verify against
# CrossContinJaccardSingleRegionWithArgmax).
slim_A_motifsseqlets = {mc: sims[:, :, 0]
                        for mc, sims in A_motifsseqlets.items()}
slim_A_candidateseqlets = {mc: sims[:, :, 0]
                           for mc, sims in A_candidateseqlets.items()}
# compute DeltaQ for all the communities,
# and then assign the seqlet to the community
# that gives the highest DeltaQ, as per the formula
# on this page: https://en.wikipedia.org/wiki/Louvain_Modularity#Algorithm
assignments = {}   # NOTE(review): never populated in this chunk
correct = 0
total = 0
for idx, candidate in enumerate(final_seqlets):
    relevant_metacluster = metacluster_assignments[idx]
    # m: total edge weight of the reference-seqlet similarity graph
    # (scored with this metacluster's tracks).
    m = np.sum(slim_A_motifsseqlets[relevant_metacluster])
    # k_i: total weight of edges from candidate idx to all reference seqlets.
    k_i = np.sum(slim_A_candidateseqlets[relevant_metacluster][idx, :])
    max_delta_q = float("-inf")
    best_assignment = -1
    # Communities are contiguous num_samples-sized slices of U, one slice per
    # pattern, in the order U was built (global across metaclusters).
    for i in range(num_communities):
        # Sigma_in: weight inside community i; Sigma_tot: weight incident to
        # community i; k_i_in: weight from the candidate into community i.
        Sigma_in = np.sum(slim_A_motifsseqlets[relevant_metacluster][(i*num_samples):((i+1)*num_samples), (i*num_samples):((i+1)*num_samples)])
        Sigma_tot = np.sum(slim_A_motifsseqlets[relevant_metacluster][(i*num_samples):((i+1)*num_samples), :])
        k_i_in = np.sum(slim_A_candidateseqlets[relevant_metacluster][idx, (i*num_samples):((i+1)*num_samples)])
        # Louvain modularity gain of adding the candidate to community i.
        delta_q = (((Sigma_in+(2*k_i_in))/(2*m))-(((Sigma_tot+k_i)/(2*m))**2))-(((Sigma_in)/(2*m))-((Sigma_tot)/(2*m))**2-((k_i)/(2*m))**2)
        # >= means ties resolve to the later community index.
        if delta_q >= max_delta_q:
            max_delta_q = delta_q
            best_assignment = i
    print("we assigned candidate ", idx, " to community ", best_assignment)
    # NOTE(review): best_assignment is a GLOBAL community index — this
    # comparison is only meaningful if true_assignments was also numbered
    # globally across metaclusters; verify how it was built.
    if (true_assignments[idx] == best_assignment):
        correct += 1
    total += 1
# Notebook residue: bare expression (the accuracy); no effect as a script.
correct/total