from __future__ import print_function
# NOTE(review): IPython shell magic from the notebook export — downloads the
# pretrained model and simulated data; this line is not valid plain Python.
!./grab_model_and_data.sh
# Use gpus 2, 3
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "2, 3"
import simdna
import simdna.synthetic as synthetic
import gzip
data_filename = "sequences.simdata.gz"
#read in the data in the testing set
# NOTE(review): handle is never closed — a `with gzip.open(...)` would be safer.
test_ids_fh = gzip.open("test.txt.gz","rb")
# One sequence id per line; decode bytes and strip the trailing newline.
ids_to_load = [x.decode('utf-8').rstrip("\n") for x in test_ids_fh]
data = synthetic.read_simdata_file(data_filename, ids_to_load=ids_to_load)
import numpy as np
#this is set up for 1d convolutions where examples
#have dimensions (len, num_channels)
#the channel axis is the axis for one-hot encoding.
def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode a DNA string into a (len(sequence), 4) int8 array.

    Channel order is A, C, G, T (case-insensitive).  Positions holding
    'N'/'n' are left as all zeros; any other character raises RuntimeError.
    """
    base_to_channel = {"A": 0, "a": 0, "C": 1, "c": 1,
                       "G": 2, "g": 2, "T": 3, "t": 3}
    encoded = np.zeros((len(sequence), 4), dtype=np.int8)
    for position, base in enumerate(sequence):
        channel = base_to_channel.get(base)
        if channel is not None:
            encoded[position, channel] = 1
        elif base != "N" and base != "n":
            raise RuntimeError("Unsupported character: " + str(base))
    return encoded
def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Fill a pre-allocated zero array with the one-hot encoding of ``sequence``.

    Mutates ``zeros_array`` in place.  ``one_hot_axis`` selects which axis
    carries the 4 ACGT channels: 0 means the array is (4, len(sequence)),
    1 means it is (len(sequence), 4).  'N'/'n' positions are left all-zero;
    any other unrecognized character raises RuntimeError.
    """
    assert one_hot_axis == 0 or one_hot_axis == 1
    # Validate that the length axis matches the sequence length up front.
    if one_hot_axis == 0:
        assert zeros_array.shape[1] == len(sequence)
    else:
        assert zeros_array.shape[0] == len(sequence)
    # Dict lookup replaces the original 8-way if/elif chain (case-insensitive).
    char_to_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    for i, char in enumerate(sequence):
        char_idx = char_to_index.get(char.upper())
        if char_idx is None:
            if char == "N" or char == "n":
                continue  # ambiguous base: leave this position all zeros
            raise RuntimeError("Unsupported character: " + str(char))
        if one_hot_axis == 0:
            zeros_array[char_idx, i] = 1
        else:
            zeros_array[i, char_idx] = 1
# One-hot encode every test sequence; result shape: (num_examples, seq_len, 4).
onehot_data = np.array([one_hot_encode_along_channel_axis(seq) for seq in data.sequences])
keras_model_weights = "keras2_conv1d_record_5_model_PQzyq_modelWeights.h5"
keras_model_json = "keras2_conv1d_record_5_model_PQzyq_modelJson.json"
from keras import backend as K
from keras.models import Sequential, Model
from keras.models import model_from_json
import deepexplain
from deepexplain.tensorflow.methods import DeepLIFTRescale
from deepexplain.tensorflow import DeepExplain
from deeplift.dinuc_shuffle import dinuc_shuffle
from collections import OrderedDict
funcs = OrderedDict()
hyp_attributions = OrderedDict()
attributions = OrderedDict()
num_refs_per_seq=10 #number of references to generate per sequence
# Rebuild the model from JSON + weights in a fresh TF session.
K.clear_session()
keras_model = model_from_json(open(keras_model_json).read())
keras_model.load_weights(keras_model_weights)
with deepexplain.tensorflow.DeepExplain(session=K.get_session()) as de:
# NOTE(review): the notebook export appears to have stripped indentation —
# the lines down through the end of the task loop were presumably inside
# this `with` block originally.
input_tensor = keras_model.layers[0].input
# Explain w.r.t. the second-to-last layer's output (pre-final-activation).
fModel = Model(inputs=input_tensor, outputs = keras_model.layers[-2].output)
target_tensor = fModel(input_tensor)
for task in [0,1,2]:
#hyp_attributions[task] = de.explain('deeplift', target_tensor[:,task], input_tensor, K.cast_to_floatx(onehot_data))
#attributions[task] = onehot_data*hyp_attributions[task]
# NOTE(review): de.explain is treated here as returning a callable that is
# then applied to onehot_data; in stock DeepExplain, explain() returns the
# attribution array directly — confirm this relies on a modified fork.
funcs[task] = de.explain('deeplift', target_tensor[:,task], input_tensor, K.cast_to_floatx(onehot_data))
hyp_attributions[task] = np.array(funcs[task](onehot_data)).squeeze()
# Project hypothetical attributions onto the bases actually present.
attributions[task] = onehot_data*hyp_attributions[task]
# for task, idx in [(0,731), #illustrates failure of grad*inp, integrated grads, deeplift-rescale
#                   (1,197) #illustrates non-specific firing of guided backprop
#                  ]:
#     attributions[idx] = 0.0
#     for shuff in range(num_refs_per_seq):
#         reference = one_hot_encode_along_channel_axis(dinuc_shuffle(data.sequences[idx]))
#         #attributions[idx] = de.explain('grad*input', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#         #attributions[idx] = de.explain('saliency', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#         #attributions[idx] = de.explain('intgrad', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#         #attributions[idx] = de.explain('elrp', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#         #attributions[idx] = de.explain('occlusion', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#         attributions[idx] += de.explain('deeplift', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)), baseline=reference)
#     attributions[idx] /= num_refs_per_seq
#     attributions[idx] = np.sum(attributions[idx], axis=2)
#     attributions[idx] = onehot_data[idx]*np.transpose(attributions[idx])
import deeplift
from deeplift.layers import NonlinearMxtsMode
import deeplift.conversion.kerasapi_conversion as kc
# Reload the same Keras model for scoring with the native DeepLIFT package,
# so its results can be compared against the DeepExplain attributions above.
keras_model = model_from_json(open(keras_model_json).read())
keras_model.load_weights(keras_model_weights)
method_to_model = OrderedDict()
for method_name, nonlinear_mxts_mode in [('rescale_all_layers', NonlinearMxtsMode.Rescale)]:
# Convert the saved Keras model into a DeepLIFT model using the Rescale rule.
method_to_model[method_name] = kc.convert_model_from_saved_files(
h5_file=keras_model_weights,
json_file=keras_model_json,
nonlinear_mxts_mode=nonlinear_mxts_mode)
print("Compiling scoring functions")
method_to_scoring_func = OrderedDict()
for method,model in method_to_model.items():
print("Compiling scoring function for: "+method)
# Score the input layer (idx 0) against the pre-activation layer (idx -2),
# matching the layers[-2] target used in the DeepExplain section.
method_to_scoring_func[method] = model.get_target_contribs_func(find_scores_layer_idx=0,
target_layer_idx=-2)
method_to_task_to_scores = OrderedDict()
# from deeplift.util import get_shuffle_seq_ref_function
# from deeplift.dinuc_shuffle import dinuc_shuffle #function to do a dinucleotide shuffle
# rescale_all_layers_many_refs_func = get_shuffle_seq_ref_function(
#     #score_computation_function is the original function to compute scores
#     score_computation_function=method_to_scoring_func['rescale_all_layers'],
#     #shuffle_func is the function that shuffles the sequence
#     #technically, given the background of this simulation, randomly_shuffle_seq
#     #makes more sense. However, on real data, a dinuc shuffle is advisable due to
#     #the strong bias against CG dinucleotides
#     shuffle_func=dinuc_shuffle,
#     one_hot_func=lambda x: np.array([one_hot_encode_along_channel_axis(seq) for seq in x]))
# num_refs_per_seq=10 #number of references to generate per sequence
# method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)] = OrderedDict()
# for task_idx in [0,1,2]:
#     #The sum over the ACGT axis in the code below is important! Recall that DeepLIFT
#     # assigns contributions based on difference-from-reference; if
#     # a position is [1,0,0,0] (i.e. 'A') in the actual sequence and [0, 1, 0, 0]
#     # in the reference, importance will be assigned to the difference (1-0)
#     # in the 'A' channel, and (0-1) in the 'C' channel. You want to take the importance
#     # on all channels and sum them up, so that at visualization-time you can project the
#     # total importance over all four channels onto the base that is actually present (i.e. the 'A'). If you
#     # don't do this, your visualization will look very confusing as multiple bases will be highlighted at
#     # every position and you won't know which base is the one that is actually present in the sequence!
#     method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)][task_idx] =\
#         np.sum(rescale_all_layers_many_refs_func(
#             task_idx=task_idx,
#             input_data_sequences=data.sequences,
#             num_refs_per_seq=num_refs_per_seq,
#             batch_size=200,
#             progress_update=1000,
#         ),axis=2)
# NOTE(review): the key says "multiref_<N>" but the live code below uses a
# single all-zeros reference, not N shuffled references — the multi-reference
# version is the commented-out block above.  Consider renaming the key.
method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)] = OrderedDict()
for task_idx in [0,1,2]:
#The sum over the ACGT axis in the code below is important! Recall that DeepLIFT
# assigns contributions based on difference-from-reference; if
# a position is [1,0,0,0] (i.e. 'A') in the actual sequence and [0, 1, 0, 0]
# in the reference, importance will be assigned to the difference (1-0)
# in the 'A' channel, and (0-1) in the 'C' channel. You want to take the importance
# on all channels and sum them up, so that at visualization-time you can project the
# total importance over all four channels onto the base that is actually present (i.e. the 'A'). If you
# don't do this, your visualization will look very confusing as multiple bases will be highlighted at
# every position and you won't know which base is the one that is actually present in the sequence!
method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)][task_idx] =\
np.sum(method_to_scoring_func['rescale_all_layers'](
task_idx=task_idx,
input_data_list=[onehot_data],
input_references_list=[np.zeros_like(onehot_data)],
batch_size=200,
progress_update=None,
),axis=2)
#visualize scores + ground-truth locations of motifs
# NOTE(review): IPython magic from the notebook export; not valid plain Python.
%matplotlib inline
from deeplift.visualization import viz_sequence
# Plot per-base DeepLIFT scores with ground-truth motif locations highlighted
# (blue = GATA_disc1 embeddings, green = TAL1_known1 embeddings).
for task_idx, idx in [(0,731), #illustrates failure of grad*inp, integrated grads, deeplift-rescale
(1,197) #illustrates non-specific firing of guided backprop
]:
print("Scores for task",task_idx,"for example",idx)
for method_name in [
'rescale_all_layers_multiref_10'
]:
# Project the summed per-position score onto the base actually present.
scores_for_idx = onehot_data[idx]*method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx][idx][:,None]
highlight = {'blue':[
(embedding.startPos, embedding.startPos+len(embedding.what))
for embedding in data.embeddings[idx] if 'GATA_disc1' in embedding.what.getDescription()],
'green':[
(embedding.startPos, embedding.startPos+len(embedding.what))
for embedding in data.embeddings[idx] if 'TAL1_known1' in embedding.what.getDescription()]}
viz_sequence.plot_weights(scores_for_idx, subticks_frequency=10, highlight=highlight)
# Plot the DeepExplain hypothetical attributions for the same examples.
for task, idx in [(0,731), #illustrates failure of grad*inp, integrated grads, deeplift-rescale
(1,197) #illustrates non-specific firing of guided backprop
]:
print("Scores for task",task,"for example",idx)
viz_sequence.plot_weights(hyp_attributions[task][idx], subticks_frequency=10)
# Plot the DeepExplain attributions projected onto the actual bases.
for task, idx in [(0,731), #illustrates failure of grad*inp, integrated grads, deeplift-rescale
(1,197) #illustrates non-specific firing of guided backprop
]:
print("Scores for task",task,"for example",idx)
viz_sequence.plot_weights(attributions[task][idx], subticks_frequency=10)
# Quantify agreement between the DeepExplain and native-DeepLIFT scores:
# total absolute difference over all tasks and examples.
sum_of_absolute_diffs = 0.0
for task_idx in [0,1,2]:
for idx in range(len(onehot_data)):
scores_for_idx = onehot_data[idx]*method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx][idx][:,None]
sum_of_absolute_diffs += np.sum(np.absolute(np.subtract(attributions[task_idx][idx], scores_for_idx)))
# NOTE(review): bare expressions below are notebook cell outputs; they have
# no effect when run as a plain script.
sum_of_absolute_diffs
len(onehot_data)
# Largest single per-base absolute difference across all tasks/examples.
max_of_absolute_diffs = 0.0
for task_idx in [0,1,2]:
for idx in range(len(onehot_data)):
scores_for_idx = onehot_data[idx]*method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx][idx][:,None]
max_of_absolute_diffs = max(max_of_absolute_diffs, np.max(np.absolute(np.subtract(attributions[task_idx][idx], scores_for_idx))))
max_of_absolute_diffs
# Drill into the example with the known discrepancy: locate the base position
# with the largest disagreement and visualize both score tracks around it.
task_idx, idx = (0,731)
scores_for_idx = onehot_data[idx]*method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx][idx][:,None]
diffs = np.absolute(np.subtract(attributions[task_idx][idx], scores_for_idx))
max_of_absolute_diffs = np.max(diffs)
# (row, channel) index of the largest difference.
max_diff_bp_idx = np.unravel_index(np.argmax(diffs), diffs.shape)
max_diff_bp_idx
max_of_absolute_diffs
diffs[max_diff_bp_idx]
scores_for_idx[max_diff_bp_idx[0]]
attributions[task_idx][idx][max_diff_bp_idx[0]]
# Zoom in on positions 50-60, then show the full-length tracks side by side.
viz_sequence.plot_weights(attributions[task_idx][idx][50:60], subticks_frequency=10)
viz_sequence.plot_weights(scores_for_idx[50:60], subticks_frequency=10)
viz_sequence.plot_weights(attributions[task_idx][idx], subticks_frequency=10)
viz_sequence.plot_weights(scores_for_idx, subticks_frequency=10)