In [1]:
import deepdish as dd
import json
import numpy as np
import tensorflow as tf
import pandas as pd
import shap
from tensorflow import keras
import pyfaidx
import shutil
import errno
import os
import sys
sys.path.append("../../../lib/chrombpnet/src/")
from training.utils.losses import multinomial_nll
from training.utils.one_hot import dna_to_one_hot
from evaluation.interpret.shap_utils import *
sys.path.append("../../")
from cross_species_models.utils.viz_sequence import *
import matplotlib.pyplot as plt
import seaborn as sns
2022-05-18 12:56:15.112672: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

Load Data and Define Functions

In [2]:
tf.compat.v1.disable_eager_execution()
In [3]:
SEQ_LEN = 2114
assay = "ENCSR460DKJ"
out_dir = "/oak/stanford/groups/akundaje/patelas/cross_species_models/experiments/20220509_chen_lab_collab/results/human_promoter_subs_atlas/"
species_list = pd.read_csv("/users/patelas/cross_species_models/experiments/20220509_chen_lab_collab/files/task_list.txt", sep="\t", header=None)
promoter_list = pd.read_csv("/users/patelas/cross_species_models/experiments/20220509_chen_lab_collab/files/promoter_list.txt", sep="\t", header=None)
human_fasta = pyfaidx.Fasta("/oak/stanford/groups/akundaje/patelas/reference/hg38/GRCh38_no_alt_analysis_set_GCA_000001405.15.fasta", sequence_always_upper=True)
model_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/ATAC/%s/chrombpnet_model_feb15/chrombpnet_wo_bias.h5"%(assay)
single_motifers = ["chimpanzee", "human", "gorilla", "orangutan", "olive_baboon", "rhesus", "mas_night_monkey", 
                  "porcupine", "paca", "naked_mole-rat", "nutria", "degu", "capybara", "american_beaver"]

double_motifers = ["medium_lemur", "galago", "algerian_mouse", "house_mouse", "ryukyu_mouse", 
                  "shrewmouse", "rat", "deer_mouse", "muskrat", "golden_hamster", "woodchuck", "ground_squirrel",
                  "egyptian_jerboa", "pika", "blind_mole-rat"]

predict_dir = "/oak/stanford/groups/akundaje/patelas/cross_species_models/experiments/20220509_chen_lab_collab/results/human_promoter_subs_atlas/"

preds = pd.read_csv(predict_dir + assay + ".tsv", header=None, sep="\t").applymap(np.log)
In [4]:
def get_seq_for_predict(tert_prom, fasta):
	'''
	Basically, we need to create a sequence of length 2114
	We take the TERT promoter we have, and we extend it on both sides using the real human genome sequence flanking the human promoter
	This is essentially equivalent to replacing the human promoter with another species' promoter and padding using the human genome to reach 2114
	'''
	leftover_len = (SEQ_LEN - len(tert_prom)) 

	if leftover_len > 0:
		left_flanker = fasta["chr5"][1294899 - leftover_len // 2:1294899].seq
		if leftover_len %2:
			right_flanker = fasta["chr5"][1295300 : 1295300 + leftover_len // 2 + 1].seq
		else:
			right_flanker = fasta["chr5"][1295300 : 1295300 + leftover_len // 2].seq

		final_seq = left_flanker + tert_prom + right_flanker
	else:
		if SEQ_LEN % 2:
			final_seq = tert_prom[len(tert_prom) // 2 - SEQ_LEN // 2 : len(tert_prom) // 2 + SEQ_LEN // 2 + 1]
		else:
			final_seq = tert_prom[len(tert_prom) // 2 - SEQ_LEN // 2 : len(tert_prom) // 2 + SEQ_LEN // 2]

	assert len(final_seq) == SEQ_LEN

	return final_seq
In [5]:
def generate_shap_dict(seqs, scores):
    print(seqs.shape, scores.shape)
    assert(seqs.shape==scores.shape)
    assert(seqs.shape[2]==4)

    # construct a dictionary for the raw shap scores and the
    # the projected shap scores
    # MODISCO workflow expects one hot sequences with shape (None,4,inputlen)
    d = {
            'raw': {'seq': np.transpose(seqs, (0, 2, 1))},
            'shap': {'seq': np.transpose(scores, (0, 2, 1))},
            'projected_shap': {'seq': np.transpose(seqs*scores, (0, 2, 1))}
        }

    return d

def softmax(x, temp=1):
    norm_x = x - np.mean(x,axis=1, keepdims=True)
    return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)
In [6]:
with keras.utils.CustomObjectScope({'multinomial_nll':multinomial_nll, 'tf':tf}):
    model = keras.models.load_model(model_file)
2022-05-18 12:56:23.255340: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcuda.so.1
2022-05-18 12:56:23.294238: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:02:00.0 name: TITAN X (Pascal) computeCapability: 6.1
coreClock: 1.531GHz coreCount: 28 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 447.48GiB/s
2022-05-18 12:56:23.294296: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-05-18 12:56:23.298107: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-05-18 12:56:23.298235: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
2022-05-18 12:56:23.299516: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcufft.so.10
2022-05-18 12:56:23.299832: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcurand.so.10
2022-05-18 12:56:23.303507: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusolver.so.11
2022-05-18 12:56:23.304482: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcusparse.so.11
2022-05-18 12:56:23.304667: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2022-05-18 12:56:23.307198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
2022-05-18 12:56:23.308131: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-05-18 12:56:23.314636: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1733] Found device 0 with properties: 
pciBusID: 0000:02:00.0 name: TITAN X (Pascal) computeCapability: 6.1
coreClock: 1.531GHz coreCount: 28 deviceMemorySize: 11.91GiB deviceMemoryBandwidth: 447.48GiB/s
2022-05-18 12:56:23.316872: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1871] Adding visible gpu devices: 0
2022-05-18 12:56:23.316952: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
2022-05-18 12:56:23.944495: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1258] Device interconnect StreamExecutor with strength 1 edge matrix:
2022-05-18 12:56:23.944539: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1264]      0 
2022-05-18 12:56:23.944547: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1277] 0:   N 
2022-05-18 12:56:23.947400: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1418] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 11436 MB memory) -> physical GPU (device: 0, name: TITAN X (Pascal), pci bus id: 0000:02:00.0, compute capability: 6.1)
2022-05-18 12:56:23.974556: I tensorflow/core/platform/profile_utils/cpu_utils.cc:114] CPU Frequency: 2594005000 Hz
WARNING:tensorflow:No training configuration found in the save file, so the model was *not* compiled. Compile it manually.
In [7]:
one_hot_seqs = dna_to_one_hot([get_seq_for_predict(promoter_list.loc[spec, 0], human_fasta) for spec in range(len(species_list))])
In [8]:
outlen = model.output_shape[0][1]

profile_model_input = model.input
profile_input = one_hot_seqs
counts_model_input = model.input
counts_input = one_hot_seqs

Plot Predictions

Here, we plot predictions for all species from the relevant Atlas model. Single-motifers are depicted in orange, and double-motifers are depicted in green. Because predicted count values may vary wildly in magnitude, plots are in log space.

In [9]:
def plot_preds_separate(preds, assay, task_list):
    plt.rc('axes', labelsize=6)
    plt.rc('ytick', labelsize=6)
    plt.rc('xtick', labelsize=6)
#     plt.rc('title', labelsize=8)
    maxval = preds.max().max()
    for ind, spec in enumerate(task_list):
        plt.figure(dpi=300, figsize=(5,1))
        df_index = list(species_list[0]).index(spec)
        if spec in single_motifers:
            sns.lineplot(x=np.arange(1000), y=preds.loc[df_index], alpha=0.7, color="orange")
        else:
            sns.lineplot(x=np.arange(1000), y=preds.loc[df_index], alpha=0.7, color="green")
        plt.ylim(0, maxval)
        plt.title("Predictions for %s - %s"%(assay, spec.upper()), size=8)
        plt.ylabel("Predicted Log Counts")
        plt.show()
    return
In [10]:
plot_preds_separate(preds, assay, single_motifers + double_motifers)

Calculate and Plot SHAP Scores

In this section, we plot SHAP scores, which reflect the importance of each input nucleotide to the model's prediction. Because the model predicts two quantities (probability profile and total counts), we have two sets of scores. These scores are very helpful in identifying sequence motifs that influenced the predicted output. For each species, the corresponding plots are of the 100bp window around the nucleotide of highest importance.

In [11]:
profile_model_counts_explainer = shap.explainers.deep.TFDeepExplainer(
    (counts_model_input, tf.reduce_sum(model.outputs[1], axis=-1)),
    shuffle_several_times,
    combine_mult_and_diffref=combine_mult_and_diffref)
counts_shap_scores = profile_model_counts_explainer.shap_values(
    counts_input, progress_message=100)
print(counts_shap_scores.shape)
# counts_shap_scores = counts_shap_scores[0]
counts_scores_dict = generate_shap_dict(one_hot_seqs, counts_shap_scores)
WARNING:tensorflow:From /users/patelas/anaconda3/envs/crossspecies/lib/python3.7/site-packages/shap/explainers/deep/deep_tf.py:140: The name tf.keras.backend.get_session is deprecated. Please use tf.compat.v1.keras.backend.get_session instead.

Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Done 0 examples of 30
2022-05-18 12:56:33.224083: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudnn.so.8
2022-05-18 12:56:33.613974: I tensorflow/stream_executor/cuda/cuda_dnn.cc:359] Loaded cuDNN version 8100
2022-05-18 12:56:34.016559: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublas.so.11
2022-05-18 12:56:34.315586: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcublasLt.so.11
(30, 2114, 4)
(30, 2114, 4) (30, 2114, 4)
In [12]:
weightedsum_meannormed_logits = get_weightedsum_meannormed_logits(model)
profile_model_profile_explainer = shap.explainers.deep.TFDeepExplainer(
    (profile_model_input, weightedsum_meannormed_logits),
    shuffle_several_times,
    combine_mult_and_diffref=combine_mult_and_diffref)
profile_shap_scores = profile_model_profile_explainer.shap_values(
    profile_input, progress_message=100)
profile_scores_dict = generate_shap_dict(one_hot_seqs, profile_shap_scores)
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  AddV2 used in model but handling of op is not specified by shap; will use original  gradients
Warning:  StopGradient used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  SpaceToBatchND used in model but handling of op is not specified by shap; will use original  gradients
Warning:  BatchToSpaceND used in model but handling of op is not specified by shap; will use original  gradients
Done 0 examples of 30
(30, 2114, 4) (30, 2114, 4)

We first plot the importance scores for the profile prediction.

In [13]:
max_score = profile_scores_dict["projected_shap"]["seq"].max()
for ind, spec in enumerate(single_motifers + double_motifers):
    spec_index = list(species_list[0]).index(spec)
    curr_argmax = profile_scores_dict["projected_shap"]["seq"][spec_index].max(axis=0).argmax()
    plot_weights(profile_scores_dict["projected_shap"]["seq"][spec_index][:,curr_argmax - 50 : curr_argmax + 50], subticks_frequency=100)
    plt.title("Profile SHAP - %s"%(species_list.loc[spec_index, 0].upper()))
    plt.ylim(0, max_score)
    plt.show()

Next, we plot the importance scores for the count prediction.

In [14]:
max_score = counts_scores_dict["projected_shap"]["seq"].max()
for ind, spec in enumerate(single_motifers + double_motifers):
    spec_index = list(species_list[0]).index(spec)
    curr_argmax = counts_scores_dict["projected_shap"]["seq"][spec_index].max(axis=0).argmax()
    plot_weights(counts_scores_dict["projected_shap"]["seq"][spec_index][:,curr_argmax - 50 : curr_argmax + 50], subticks_frequency=100)
    plt.title("Counts SHAP - %s"%(species_list.loc[spec_index, 0].upper()))
    plt.ylim(0, max_score)
    plt.show()
In [ ]: