In [1]:
from __future__ import print_function
In [2]:
!./grab_model_and_data.sh
File sequences.simdata.gz exists already
File keras2_conv1d_record_5_model_PQzyq_modelJson.json exists already
File keras2_record_5_model_PQzyq_modelWeights.h5 exists already
File test.txt.gz exists already
In [3]:
# Use gpus 2, 3
import os
# Fix: CUDA_VISIBLE_DEVICES is a comma-separated list and must NOT contain
# spaces -- with "2, 3" some CUDA runtimes silently ignore the device after
# the space, leaving only GPU 2 visible.
os.environ["CUDA_VISIBLE_DEVICES"] = "2,3"
In [4]:
import simdna
In [5]:
import simdna.synthetic as synthetic
import gzip
data_filename = "sequences.simdata.gz"

# Read in the data in the testing set.
# Fix: use a context manager so the gzip handle is closed deterministically
# (the original left test_ids_fh open for the rest of the session).
with gzip.open("test.txt.gz", "rb") as test_ids_fh:
    # Each line of test.txt.gz is one sequence id; strip the trailing newline.
    ids_to_load = [x.decode('utf-8').rstrip("\n") for x in test_ids_fh]
data = synthetic.read_simdata_file(data_filename, ids_to_load=ids_to_load)
In [6]:
import numpy as np

#this is set up for 1d convolutions where examples
#have dimensions (len, num_channels)
#the channel axis is the axis for one-hot encoding.
def one_hot_encode_along_channel_axis(sequence):
    """One-hot encode a DNA string as an int8 array of shape (len(sequence), 4).

    Channel (column) order is A, C, G, T; 'N'/'n' positions are left all-zero.
    """
    encoded = np.zeros((len(sequence), 4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(zeros_array=encoded,
                                 sequence=sequence, one_hot_axis=1)
    return encoded

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    """Fill a preallocated all-zeros array with the one-hot encoding of sequence.

    Mutates zeros_array in place. one_hot_axis selects which axis indexes the
    four channels: 0 -> zeros_array has shape (4, len(sequence)),
    1 -> shape (len(sequence), 4). 'N'/'n' positions are skipped (left as
    zeros); any other unrecognized character raises RuntimeError.
    """
    assert one_hot_axis in (0, 1)
    # The non-channel axis must match the sequence length.
    assert zeros_array.shape[1 - one_hot_axis] == len(sequence)
    channel_of = {"A": 0, "C": 1, "G": 2, "T": 3}
    for pos, base in enumerate(sequence):
        key = base.upper()
        if key == "N":
            continue  # leave that position as all 0's
        if key not in channel_of:
            raise RuntimeError("Unsupported character: " + str(base))
        chan = channel_of[key]
        if one_hot_axis == 0:
            zeros_array[chan, pos] = 1
        else:
            zeros_array[pos, chan] = 1
            
onehot_data = np.array([one_hot_encode_along_channel_axis(seq) for seq in data.sequences])
In [7]:
# Paths to the trained Keras conv1d model fetched by grab_model_and_data.sh above.
keras_model_weights = "keras2_conv1d_record_5_model_PQzyq_modelWeights.h5"
keras_model_json = "keras2_conv1d_record_5_model_PQzyq_modelJson.json"
In [8]:
# Compute DeepLIFT-Rescale attributions with the deepexplain package for each
# of the three tasks, to compare against the deeplift package results below.
from keras import backend as K
from keras.models import Sequential, Model
from keras.models import model_from_json
import deepexplain
from deepexplain.tensorflow.methods import DeepLIFTRescale
from deepexplain.tensorflow import DeepExplain
from deeplift.dinuc_shuffle import dinuc_shuffle
from collections import OrderedDict

funcs = OrderedDict()             # task index -> object returned by de.explain
hyp_attributions = OrderedDict()  # task index -> hypothetical attributions (presumably (N, len, 4))
attributions = OrderedDict()      # task index -> attributions projected onto the actual bases
num_refs_per_seq=10 #number of references to generate per sequence

# Start from a fresh TF graph/session so re-running this cell does not
# accumulate graph nodes.
K.clear_session()
keras_model = model_from_json(open(keras_model_json).read())
keras_model.load_weights(keras_model_weights)

with deepexplain.tensorflow.DeepExplain(session=K.get_session()) as de:
    input_tensor = keras_model.layers[0].input
    
    # Target the pre-sigmoid layer (layers[-2]) rather than the sigmoid output.
    fModel = Model(inputs=input_tensor, outputs = keras_model.layers[-2].output)
    target_tensor = fModel(input_tensor)
    
    for task in [0,1,2]:
        #hyp_attributions[task] = de.explain('deeplift', target_tensor[:,task], input_tensor, K.cast_to_floatx(onehot_data))
        #attributions[task] = onehot_data*hyp_attributions[task]
        # NOTE(review): de.explain(...) is treated here as returning a callable
        # that is then applied to onehot_data; stock deepexplain returns the
        # attribution array directly, so this appears to rely on a modified
        # fork of the library -- confirm which version is installed.
        funcs[task] = de.explain('deeplift', target_tensor[:,task], input_tensor, K.cast_to_floatx(onehot_data))
        hyp_attributions[task] = np.array(funcs[task](onehot_data)).squeeze()
        attributions[task] = onehot_data*hyp_attributions[task]
    
#     for task, idx in [(0,731), #illustrates failure of grad*inp, integrated grads, deeplift-rescale
#                       (1,197)  #illustrates non-specific firing of guided backprop
#                      ]:
#         attributions[idx] = 0.0
#         for shuff in range(num_refs_per_seq):
#             reference = one_hot_encode_along_channel_axis(dinuc_shuffle(data.sequences[idx]))
#             #attributions[idx] = de.explain('grad*input', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#             #attributions[idx] = de.explain('saliency', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#             #attributions[idx] = de.explain('intgrad', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#             #attributions[idx] = de.explain('elrp', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#             #attributions[idx] = de.explain('occlusion', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)))
#             attributions[idx] += de.explain('deeplift', target_tensor[:,task], input_tensor, K.cast_to_floatx(np.expand_dims(onehot_data[idx], axis=0)), baseline=reference)
#         attributions[idx] /= num_refs_per_seq
#         attributions[idx] = np.sum(attributions[idx], axis=2)
#         attributions[idx] = onehot_data[idx]*np.transpose(attributions[idx])
Using TensorFlow backend.
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  False
Wrapping the inputs in a list...
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  False
Wrapping the inputs in a list...
DeepExplain: running "deeplift" explanation method (5)
Model with multiple inputs:  False
Wrapping the inputs in a list...
In [9]:
# Convert the same saved Keras model with the deeplift package itself,
# applying the Rescale rule to all nonlinear layers.
import deeplift
from deeplift.layers import NonlinearMxtsMode
import deeplift.conversion.kerasapi_conversion as kc

# Re-load the Keras model (the DeepExplain cell above called K.clear_session()).
keras_model = model_from_json(open(keras_model_json).read())
keras_model.load_weights(keras_model_weights)

method_to_model = OrderedDict()  # method name -> converted deeplift model
for method_name, nonlinear_mxts_mode in [('rescale_all_layers', NonlinearMxtsMode.Rescale)]:
    method_to_model[method_name] = kc.convert_model_from_saved_files(
        h5_file=keras_model_weights,
        json_file=keras_model_json,
        nonlinear_mxts_mode=nonlinear_mxts_mode)
nonlinear_mxts_mode is set to: Rescale
Heads-up: I assume sigmoid is the output layer, not an intermediate one; if it's an intermediate layer then please bug me and I will implement the grad func
In [10]:
# Compile a contribution-scoring function for each converted deeplift model.
print("Compiling scoring functions")
method_to_scoring_func = OrderedDict()
for method,model in method_to_model.items():
    print("Compiling scoring function for: "+method)
    # find_scores_layer_idx=0: attribute scores to the input layer;
    # target_layer_idx=-2: score the pre-sigmoid layer, matching the
    # DeepExplain run above (which also targeted keras_model.layers[-2]).
    method_to_scoring_func[method] = model.get_target_contribs_func(find_scores_layer_idx=0,
                                                                    target_layer_idx=-2)
Compiling scoring functions
Compiling scoring function for: rescale_all_layers
In [11]:
# method name -> task index -> per-position scores (summed over the ACGT axis)
method_to_task_to_scores = OrderedDict()

# The commented-out block below is the original multi-reference workflow that
# averaged scores over num_refs_per_seq dinucleotide-shuffled references per
# sequence; it is retained for reference only. The active run uses a single
# all-zeros reference instead.
# from deeplift.util import get_shuffle_seq_ref_function
# from deeplift.dinuc_shuffle import dinuc_shuffle #function to do a dinucleotide shuffle

# rescale_all_layers_many_refs_func = get_shuffle_seq_ref_function(
#     #score_computation_function is the original function to compute scores
#     score_computation_function=method_to_scoring_func['rescale_all_layers'],
#     #shuffle_func is the function that shuffles the sequence
#     #technically, given the background of this simulation, randomly_shuffle_seq
#     #makes more sense. However, on real data, a dinuc shuffle is advisable due to
#     #the strong bias against CG dinucleotides
#     shuffle_func=dinuc_shuffle,
#     one_hot_func=lambda x: np.array([one_hot_encode_along_channel_axis(seq) for seq in x]))

# num_refs_per_seq=10 #number of references to generate per sequence
# method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)] = OrderedDict()
# for task_idx in [0,1,2]:
#     #The sum over the ACGT axis in the code below is important! Recall that DeepLIFT
#     # assigns contributions based on difference-from-reference; if
#     # a position is [1,0,0,0] (i.e. 'A') in the actual sequence and [0, 1, 0, 0]
#     # in the reference, importance will be assigned to the difference (1-0)
#     # in the 'A' channel, and (0-1) in the 'C' channel. You want to take the importance
#     # on all channels and sum them up, so that at visualization-time you can project the
#     # total importance over all four channels onto the base that is actually present (i.e. the 'A'). If you
#     # don't do this, your visualization will look very confusing as multiple bases will be highlighted at
#     # every position and you won't know which base is the one that is actually present in the sequence!
#     method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)][task_idx] =\
#         np.sum(rescale_all_layers_many_refs_func(
#             task_idx=task_idx,
#             input_data_sequences=data.sequences,
#             num_refs_per_seq=num_refs_per_seq,
#             batch_size=200,
#             progress_update=1000,
#         ),axis=2)
In [12]:
# Compute deeplift-Rescale scores for all three tasks with a SINGLE all-zeros
# reference.
# NOTE(review): the key embeds num_refs_per_seq (=10, defined back in the
# DeepExplain cell) and says "multiref", but only one zero reference is used
# here. The key is looked up verbatim ('rescale_all_layers_multiref_10') by the
# comparison cells below, so it cannot be renamed in isolation -- confirm the
# naming is intentional.
method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)] = OrderedDict()
for task_idx in [0,1,2]:
    #The sum over the ACGT axis in the code below is important! Recall that DeepLIFT
    # assigns contributions based on difference-from-reference; if
    # a position is [1,0,0,0] (i.e. 'A') in the actual sequence and [0, 1, 0, 0]
    # in the reference, importance will be assigned to the difference (1-0)
    # in the 'A' channel, and (0-1) in the 'C' channel. You want to take the importance
    # on all channels and sum them up, so that at visualization-time you can project the
    # total importance over all four channels onto the base that is actually present (i.e. the 'A'). If you
    # don't do this, your visualization will look very confusing as multiple bases will be highlighted at
    # every position and you won't know which base is the one that is actually present in the sequence!
    method_to_task_to_scores['rescale_all_layers_multiref_'+str(num_refs_per_seq)][task_idx] =\
        np.sum(method_to_scoring_func['rescale_all_layers'](
            task_idx=task_idx,
            input_data_list=[onehot_data],
            input_references_list=[np.zeros_like(onehot_data)],
            batch_size=200,
            progress_update=None,
        ),axis=2)
In [13]:
#visualize scores + ground-truth locations of motifs
%matplotlib inline
from deeplift.visualization import viz_sequence

for task_idx, idx in [(0,731), #illustrates failure of grad*inp, integrated grads, deeplift-rescale
                      (1,197)  #illustrates non-specific firing of guided backprop
                     ]:
    print("Scores for task",task_idx,"for example",idx)
    for method_name in [
                        'rescale_all_layers_multiref_10'
                        ]:
        scores_for_idx = onehot_data[idx]*method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx][idx][:,None]
        highlight = {'blue':[
                (embedding.startPos, embedding.startPos+len(embedding.what))
                for embedding in data.embeddings[idx] if 'GATA_disc1' in embedding.what.getDescription()],
                'green':[
                (embedding.startPos, embedding.startPos+len(embedding.what))
                for embedding in data.embeddings[idx] if 'TAL1_known1' in embedding.what.getDescription()]}
        viz_sequence.plot_weights(scores_for_idx, subticks_frequency=10, highlight=highlight)
Scores for task 0 for example 731
Scores for task 1 for example 197
In [14]:
# Plot the hypothetical attributions from the DeepExplain run for the two
# illustrative examples:
#   (0, 731): illustrates failure of grad*inp, integrated grads, deeplift-rescale
#   (1, 197): illustrates non-specific firing of guided backprop
for task, idx in ((0, 731), (1, 197)):
    print("Scores for task", task, "for example", idx)
    viz_sequence.plot_weights(hyp_attributions[task][idx], subticks_frequency=10)
Scores for task 0 for example 731
Scores for task 1 for example 197
In [15]:
# Plot the projected (actual-base) attributions from the DeepExplain run for
# the same two illustrative examples:
#   (0, 731): illustrates failure of grad*inp, integrated grads, deeplift-rescale
#   (1, 197): illustrates non-specific firing of guided backprop
for task, idx in ((0, 731), (1, 197)):
    print("Scores for task", task, "for example", idx)
    viz_sequence.plot_weights(attributions[task][idx], subticks_frequency=10)
Scores for task 0 for example 731
Scores for task 1 for example 197
In [16]:
# Total absolute discrepancy between the DeepExplain attributions and the
# deeplift-package scores, accumulated over every task and every example.
sum_of_absolute_diffs = 0.0
for task_idx in [0, 1, 2]:
    task_scores = method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx]
    for idx in range(len(onehot_data)):
        # Project the per-position score onto the base present at each position.
        projected = onehot_data[idx] * task_scores[idx][:, None]
        sum_of_absolute_diffs += np.abs(attributions[task_idx][idx] - projected).sum()
In [17]:
sum_of_absolute_diffs
Out[17]:
0.015788874117674823
In [18]:
len(onehot_data)
Out[18]:
800
In [19]:
# Largest single-entry absolute discrepancy between the DeepExplain
# attributions and the deeplift-package scores, over every task and example.
max_of_absolute_diffs = 0.0
for task_idx in [0, 1, 2]:
    task_scores = method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx]
    for idx in range(len(onehot_data)):
        # Project the per-position score onto the base present at each position.
        projected = onehot_data[idx] * task_scores[idx][:, None]
        largest = np.abs(attributions[task_idx][idx] - projected).max()
        if largest > max_of_absolute_diffs:
            max_of_absolute_diffs = largest
In [20]:
max_of_absolute_diffs
Out[20]:
3.33786e-06
In [21]:
# Zoom in on one example (task 0, sequence 731): find where the two methods
# disagree the most, for inspection in the cells below.
task_idx, idx = (0, 731)
task_scores = method_to_task_to_scores['rescale_all_layers_multiref_10'][task_idx]
scores_for_idx = onehot_data[idx] * task_scores[idx][:, None]
diffs = np.abs(attributions[task_idx][idx] - scores_for_idx)
max_of_absolute_diffs = diffs.max()
# (position, channel) of the largest absolute discrepancy.
max_diff_bp_idx = np.unravel_index(diffs.argmax(), diffs.shape)
In [22]:
max_diff_bp_idx
Out[22]:
(53, 1)
In [23]:
max_of_absolute_diffs
Out[23]:
1.9073486e-06
In [24]:
diffs[max_diff_bp_idx]
Out[24]:
1.9073486e-06
In [25]:
scores_for_idx[max_diff_bp_idx[0]]
Out[25]:
array([0.      , 4.844485, 0.      , 0.      ], dtype=float32)
In [26]:
attributions[task_idx][idx][max_diff_bp_idx[0]]
Out[26]:
array([-0.      ,  4.844483, -0.      , -0.      ], dtype=float32)
In [27]:
viz_sequence.plot_weights(attributions[task_idx][idx][50:60], subticks_frequency=10) 
In [28]:
viz_sequence.plot_weights(scores_for_idx[50:60], subticks_frequency=10) 
In [29]:
viz_sequence.plot_weights(attributions[task_idx][idx], subticks_frequency=10) 
In [30]:
viz_sequence.plot_weights(scores_for_idx, subticks_frequency=10)