import os
import util
import viz_sequence
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
import tqdm.notebook
# tqdm.tqdm_notebook is deprecated and slated for removal in tqdm 5.0;
# instantiate the notebook frontend via tqdm.notebook.tqdm instead.
tqdm.notebook.tqdm()
/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:11: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` # This is added back by InteractiveShellApp.init_path()
<tqdm.notebook.tqdm_notebook at 0x7f18325790d0>
# Plotting defaults
# Register the custom fonts one at a time with FontManager.addfont;
# font_manager.createFontList was deprecated in Matplotlib 3.2 and removed
# in later releases.
for _font_path in font_manager.findSystemFonts(
    fontpaths="/users/amtseng/modules/fonts"
):
    font_manager.fontManager.addfont(_font_path)
# Global rcParams for all figures in this notebook
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define parameters/fetch arguments from the environment (these are set by
# the driver that launches this notebook)
env = os.environ
preds_path = env["TFM_PRED_PATH"]
shap_scores_path = env["TFM_SHAP_PATH"]
tfm_results_path = env["TFM_TFM_PATH"]
print("Predictions path: %s" % preds_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
Predictions path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//predictions/ENCSR000EEC/profile_predictions.h5 DeepSHAP scores path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//shap/ENCSR000EEC/counts_scores_alex_format.h5 TF-MoDISco results path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//modisco/ENCSR000EEC/counts/modisco_results.hd5
# Define constants
input_length = 2114  # length of model input sequences (bp)
profile_length = 1000  # length of predicted/observed profiles (bp)
shap_score_center_size = 400  # center window the SHAP scores are cut to for TF-MoDISco
hyp_score_key = "hyp_scores"  # key of the hypothetical-scores dataset in the SHAP HDF5
task_index = None  # no specific task: use scores aggregated over all tasks
Helper functions for plotting and organizing things
def plot_profiles(profs, title=None, return_fig=False):
    """
    Plots the given profiles as a signal track, one subplot per task.
    `profs` must be a T x O x 2 NumPy array, where the last axis holds the
    plus- and minus-strand tracks for each task. No normalization is
    performed prior to plotting. If `return_fig` is True, the Figure is
    returned instead of shown.
    """
    assert len(profs.shape) == 3
    num_tasks = profs.shape[0]
    fig, axes = plt.subplots(num_tasks, figsize=(20, num_tasks * 3 * 2))
    if num_tasks == 1:
        axes = [axes]  # plt.subplots returns a bare Axes for a single task
    for axis, prof in zip(axes, profs):
        axis.plot(prof[:, 0], color="royalblue")  # plus strand, above the axis
        axis.plot(-prof[:, 1], color="goldenrod")  # minus strand, mirrored below
    if title:
        fig.suptitle(title)
    if return_fig:
        return fig
    plt.show()
# Import SHAP scores, one-hot sequences, and their coordinates.
# The sequences/scores are cut to the center window exactly as TF-MoDISco
# saw them, but the coordinates remain uncut (full input length).
hyp_scores, _, one_hot_seqs, shap_coords = util.import_shap_scores(
    shap_scores_path,
    hyp_score_key,
    center_cut_size=shap_score_center_size,
    remove_non_acgt=False,
)
Importing SHAP scores: 0%| | 0/49 [00:00<?, ?it/s] Importing SHAP scores: 2%|▏ | 1/49 [00:00<00:09, 5.04it/s] Importing SHAP scores: 4%|▍ | 2/49 [00:00<00:09, 5.15it/s] Importing SHAP scores: 6%|▌ | 3/49 [00:00<00:08, 5.49it/s] Importing SHAP scores: 8%|▊ | 4/49 [00:00<00:08, 5.04it/s] Importing SHAP scores: 10%|█ | 5/49 [00:00<00:08, 5.43it/s] Importing SHAP scores: 12%|█▏ | 6/49 [00:01<00:07, 5.69it/s] Importing SHAP scores: 14%|█▍ | 7/49 [00:01<00:07, 5.29it/s] Importing SHAP scores: 16%|█▋ | 8/49 [00:01<00:07, 5.56it/s] Importing SHAP scores: 18%|█▊ | 9/49 [00:01<00:06, 5.74it/s] Importing SHAP scores: 20%|██ | 10/49 [00:01<00:07, 5.34it/s] Importing SHAP scores: 22%|██▏ | 11/49 [00:02<00:06, 5.58it/s] Importing SHAP scores: 24%|██▍ | 12/49 [00:02<00:06, 5.75it/s] Importing SHAP scores: 27%|██▋ | 13/49 [00:02<00:06, 5.23it/s] Importing SHAP scores: 29%|██▊ | 14/49 [00:02<00:06, 5.43it/s] Importing SHAP scores: 31%|███ | 15/49 [00:02<00:06, 5.56it/s] Importing SHAP scores: 33%|███▎ | 16/49 [00:02<00:05, 5.66it/s] Importing SHAP scores: 35%|███▍ | 17/49 [00:03<00:06, 5.22it/s] Importing SHAP scores: 37%|███▋ | 18/49 [00:03<00:05, 5.45it/s] Importing SHAP scores: 39%|███▉ | 19/49 [00:03<00:05, 5.59it/s] Importing SHAP scores: 41%|████ | 20/49 [00:03<00:05, 5.19it/s] Importing SHAP scores: 43%|████▎ | 21/49 [00:03<00:05, 5.39it/s] Importing SHAP scores: 45%|████▍ | 22/49 [00:04<00:04, 5.51it/s] Importing SHAP scores: 47%|████▋ | 23/49 [00:04<00:05, 5.14it/s] Importing SHAP scores: 49%|████▉ | 24/49 [00:04<00:04, 5.37it/s] Importing SHAP scores: 51%|█████ | 25/49 [00:04<00:04, 5.51it/s] Importing SHAP scores: 53%|█████▎ | 26/49 [00:04<00:04, 5.14it/s] Importing SHAP scores: 55%|█████▌ | 27/49 [00:04<00:04, 5.36it/s] Importing SHAP scores: 57%|█████▋ | 28/49 [00:05<00:03, 5.41it/s] Importing SHAP scores: 59%|█████▉ | 29/49 [00:05<00:03, 5.13it/s] Importing SHAP scores: 61%|██████ | 30/49 [00:05<00:03, 5.43it/s] Importing SHAP scores: 63%|██████▎ | 31/49 [00:05<00:03, 
5.70it/s] Importing SHAP scores: 65%|██████▌ | 32/49 [00:05<00:02, 5.72it/s] Importing SHAP scores: 67%|██████▋ | 33/49 [00:06<00:02, 5.37it/s] Importing SHAP scores: 69%|██████▉ | 34/49 [00:06<00:02, 5.68it/s] Importing SHAP scores: 71%|███████▏ | 35/49 [00:06<00:02, 5.92it/s] Importing SHAP scores: 73%|███████▎ | 36/49 [00:06<00:02, 5.54it/s] Importing SHAP scores: 76%|███████▌ | 37/49 [00:06<00:02, 5.45it/s] Importing SHAP scores: 78%|███████▊ | 38/49 [00:06<00:01, 5.55it/s] Importing SHAP scores: 80%|███████▉ | 39/49 [00:07<00:01, 5.13it/s] Importing SHAP scores: 82%|████████▏ | 40/49 [00:07<00:01, 5.35it/s] Importing SHAP scores: 84%|████████▎ | 41/49 [00:07<00:01, 5.50it/s] Importing SHAP scores: 86%|████████▌ | 42/49 [00:07<00:01, 5.11it/s] Importing SHAP scores: 88%|████████▊ | 43/49 [00:07<00:01, 5.31it/s] Importing SHAP scores: 90%|████████▉ | 44/49 [00:08<00:00, 5.45it/s] Importing SHAP scores: 92%|█████████▏| 45/49 [00:08<00:00, 5.06it/s] Importing SHAP scores: 94%|█████████▍| 46/49 [00:08<00:00, 5.28it/s] Importing SHAP scores: 96%|█████████▌| 47/49 [00:08<00:00, 5.46it/s] Importing SHAP scores: 98%|█████████▊| 48/49 [00:08<00:00, 5.57it/s] Importing SHAP scores: 100%|██████████| 49/49 [00:09<00:00, 5.43it/s]
# Import the long (uncut) version of the SHAP scores and one-hot sequences;
# center_cut_size=None keeps them at full input length, unlike the cut
# versions above
hyp_scores_long, _, one_hot_seqs_long, shap_coords_long = util.import_shap_scores(
    shap_scores_path,
    hyp_score_key,
    center_cut_size=None,
    remove_non_acgt=False,
)
Importing SHAP scores: 0%| | 0/49 [00:00<?, ?it/s] Importing SHAP scores: 2%|▏ | 1/49 [00:00<00:43, 1.10it/s] Importing SHAP scores: 4%|▍ | 2/49 [00:01<00:38, 1.21it/s] Importing SHAP scores: 6%|▌ | 3/49 [00:02<00:37, 1.22it/s] Importing SHAP scores: 8%|▊ | 4/49 [00:03<00:43, 1.04it/s] Importing SHAP scores: 10%|█ | 5/49 [00:04<00:39, 1.10it/s] Importing SHAP scores: 12%|█▏ | 6/49 [00:05<00:36, 1.16it/s] Importing SHAP scores: 14%|█▍ | 7/49 [00:06<00:39, 1.06it/s] Importing SHAP scores: 16%|█▋ | 8/49 [00:07<00:37, 1.10it/s] Importing SHAP scores: 18%|█▊ | 9/49 [00:07<00:34, 1.15it/s] Importing SHAP scores: 20%|██ | 10/49 [00:09<00:37, 1.05it/s] Importing SHAP scores: 22%|██▏ | 11/49 [00:09<00:34, 1.10it/s] Importing SHAP scores: 24%|██▍ | 12/49 [00:10<00:32, 1.12it/s] Importing SHAP scores: 27%|██▋ | 13/49 [00:11<00:34, 1.04it/s] Importing SHAP scores: 29%|██▊ | 14/49 [00:12<00:32, 1.06it/s] Importing SHAP scores: 31%|███ | 15/49 [00:13<00:30, 1.11it/s] Importing SHAP scores: 33%|███▎ | 16/49 [00:14<00:29, 1.12it/s] Importing SHAP scores: 35%|███▍ | 17/49 [00:15<00:30, 1.03it/s] Importing SHAP scores: 37%|███▋ | 18/49 [00:16<00:28, 1.10it/s] Importing SHAP scores: 39%|███▉ | 19/49 [00:17<00:26, 1.12it/s] Importing SHAP scores: 41%|████ | 20/49 [00:18<00:28, 1.03it/s] Importing SHAP scores: 43%|████▎ | 21/49 [00:19<00:27, 1.03it/s] Importing SHAP scores: 45%|████▍ | 22/49 [00:20<00:25, 1.07it/s] Importing SHAP scores: 47%|████▋ | 23/49 [00:21<00:26, 1.01s/it] Importing SHAP scores: 49%|████▉ | 24/49 [00:22<00:24, 1.02it/s] Importing SHAP scores: 51%|█████ | 25/49 [00:23<00:21, 1.09it/s] Importing SHAP scores: 53%|█████▎ | 26/49 [00:24<00:22, 1.01it/s] Importing SHAP scores: 55%|█████▌ | 27/49 [00:25<00:20, 1.07it/s] Importing SHAP scores: 57%|█████▋ | 28/49 [00:25<00:19, 1.09it/s] Importing SHAP scores: 59%|█████▉ | 29/49 [00:27<00:20, 1.01s/it] Importing SHAP scores: 61%|██████ | 30/49 [00:27<00:18, 1.05it/s] Importing SHAP scores: 63%|██████▎ | 31/49 [00:28<00:16, 
1.10it/s] Importing SHAP scores: 65%|██████▌ | 32/49 [00:29<00:15, 1.13it/s] Importing SHAP scores: 67%|██████▋ | 33/49 [00:30<00:15, 1.03it/s] Importing SHAP scores: 69%|██████▉ | 34/49 [00:31<00:13, 1.07it/s] Importing SHAP scores: 71%|███████▏ | 35/49 [00:32<00:12, 1.11it/s] Importing SHAP scores: 73%|███████▎ | 36/49 [00:33<00:12, 1.04it/s] Importing SHAP scores: 76%|███████▌ | 37/49 [00:34<00:10, 1.10it/s] Importing SHAP scores: 78%|███████▊ | 38/49 [00:35<00:09, 1.13it/s] Importing SHAP scores: 80%|███████▉ | 39/49 [00:36<00:09, 1.02it/s] Importing SHAP scores: 82%|████████▏ | 40/49 [00:37<00:08, 1.09it/s] Importing SHAP scores: 84%|████████▎ | 41/49 [00:37<00:07, 1.14it/s] Importing SHAP scores: 86%|████████▌ | 42/49 [00:39<00:06, 1.04it/s] Importing SHAP scores: 88%|████████▊ | 43/49 [00:39<00:05, 1.09it/s] Importing SHAP scores: 90%|████████▉ | 44/49 [00:40<00:04, 1.11it/s] Importing SHAP scores: 92%|█████████▏| 45/49 [00:41<00:03, 1.03it/s] Importing SHAP scores: 94%|█████████▍| 46/49 [00:42<00:02, 1.09it/s] Importing SHAP scores: 96%|█████████▌| 47/49 [00:43<00:01, 1.12it/s] Importing SHAP scores: 98%|█████████▊| 48/49 [00:44<00:00, 1.13it/s] Importing SHAP scores: 100%|██████████| 49/49 [00:45<00:00, 1.09it/s]
# Subset the long SHAP data to the SHAP data that was used to run TF-MoDISco
shap_coords_table = pd.DataFrame(shap_coords, columns=["chrom", "start", "end"])
shap_coords_long_table = pd.DataFrame(shap_coords_long, columns=["chrom", "start", "end"])
# Match each TF-MoDISco coordinate to its row in the (deduplicated) long
# table; after the merge, "index_x" is the row in the long arrays and
# "index_y" is the row in the TF-MoDISco coordinate table, so sorting by
# "index_y" puts the long rows in TF-MoDISco order
deduped_long = shap_coords_long_table.reset_index().drop_duplicates(
    ["chrom", "start", "end"]
)
matched = deduped_long.merge(
    shap_coords_table.reset_index(), on=["chrom", "start", "end"]
)
subset_inds = matched.sort_values("index_y")["index_x"].values
hyp_scores_long = hyp_scores_long[subset_inds]
one_hot_seqs_long = one_hot_seqs_long[subset_inds]
shap_coords_long = shap_coords_long[subset_inds]
# Make sure the coordinates all match
assert np.all(shap_coords_long == shap_coords)
# Import the set of all profiles and their coordinates
true_profs, pred_profs, all_pred_coords = util.import_profiles(preds_path)
# Subset the predicted profiles/coordinates to the task-specific SHAP
# coordinates/scores; as above, "index_x" is the row in the prediction
# arrays and "index_y" is the row in the SHAP coordinate table, so sorting
# by "index_y" puts the predictions in SHAP order
pred_coords_table = pd.DataFrame(all_pred_coords, columns=["chrom", "start", "end"])
deduped_preds = pred_coords_table.reset_index().drop_duplicates(
    ["chrom", "start", "end"]
)
matched = deduped_preds.merge(
    shap_coords_table.reset_index(), on=["chrom", "start", "end"]
)
subset_inds = matched.sort_values("index_y")["index_x"].values
true_profs = true_profs[subset_inds]
pred_profs = pred_profs[subset_inds]
pred_coords = all_pred_coords[subset_inds]
# Make sure the coordinates all match
assert np.all(pred_coords == shap_coords)
# Import the TF-MoDISco results object, linked to the cut scores/sequences
# it was originally run on (so seqlet coordinates line up)
tfm_obj = util.import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)
For each motif, show the IC-trimmed motif and, for a few random seqlets, the observed/predicted profiles, importance score tracks, and the seqlet itself
num_seqlets_to_show = 3  # random seqlets displayed per pattern
sizes_to_show = [1000, 400]  # importance-score window sizes (bp), centered in the input
metaclusters = tfm_obj.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
    metacluster = metaclusters[metacluster_key]
    display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
    patterns = metacluster.seqlets_to_patterns_result.patterns
    if not patterns:
        # Skip metaclusters without patterns; a `break` here would silently
        # drop all remaining metaclusters, not just this empty one
        continue
    num_patterns = len(patterns)
    for pattern_i, pattern in enumerate(patterns):
        seqlets = pattern.seqlets
        display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
        display(vdomh.p("%d seqlets" % len(seqlets)))
        pfm = pattern["sequence"].fwd
        hcwm = pattern["task0_hypothetical_contribs"].fwd
        # Trim motif based on information content
        ic = util.info_content(pfm)
        pass_inds = np.where(ic >= 0.2)[0]  # Cut off flanks with less than 0.2 IC
        # Keep a 4-base pad on either side of the high-IC core
        trimmed_hcwm = hcwm[max(0, np.min(pass_inds) - 4): min(len(pfm), np.max(pass_inds) + 4 + 1)]
        viz_sequence.plot_weights(trimmed_hcwm, figsize=(20, 4), subticks_frequency=(len(trimmed_hcwm) + 1))
        # Pick some random seqlets to show; clamp the sample size so that
        # np.random.choice(..., replace=False) does not raise when a pattern
        # has fewer than num_seqlets_to_show seqlets
        num_to_show = min(num_seqlets_to_show, len(seqlets))
        for seqlet_i in np.random.choice(len(seqlets), size=num_to_show, replace=False):
            seqlet = seqlets[seqlet_i]
            coord_index = seqlet.coor.example_idx
            seqlet_start = seqlet.coor.start
            seqlet_end = seqlet.coor.end
            # Full-length scores/sequence for this example (uncut arrays)
            hyp = hyp_scores_long[coord_index]
            seq = one_hot_seqs_long[coord_index]
            # Seqlet coordinates are relative to the cut (center-window) arrays
            seqlet_seq = one_hot_seqs[coord_index, seqlet_start:seqlet_end]
            seqlet_hyp = hyp_scores[coord_index, seqlet_start:seqlet_end]
            true_prof = true_profs[coord_index]
            pred_prof = pred_profs[coord_index]
            true_prof_fig = plot_profiles(true_prof, return_fig=True)
            pred_prof_fig = plot_profiles(pred_prof, return_fig=True)
            true_prof_fig.tight_layout()
            pred_prof_fig.tight_layout()
            table_rows = [
                vdomh.tr(
                    vdomh.td("Observed profiles"),
                    vdomh.td(util.figure_to_vdom_image(true_prof_fig))
                ),
                vdomh.tr(
                    vdomh.td("Predicted profiles"),
                    vdomh.td(util.figure_to_vdom_image(pred_prof_fig))
                )
            ]
            for size in sizes_to_show:
                # Center a `size`-bp window within the full input length
                start = (input_length // 2) - (size // 2)
                end = start + size
                fig = viz_sequence.plot_weights(hyp[start:end] * seq[start:end], subticks_frequency=(size + 1), return_fig=True)
                fig.tight_layout()
                table_rows.append(
                    vdomh.tr(
                        vdomh.td("Importance scores (%d bp)" % size),
                        vdomh.td(util.figure_to_vdom_image(fig))
                    )
                )
            fig = viz_sequence.plot_weights(seqlet_hyp * seqlet_seq, subticks_frequency=(len(seqlet_hyp) + 1), return_fig=True)
            fig.tight_layout()
            table_rows.append(
                vdomh.tr(
                    vdomh.td("Seqlet"),
                    vdomh.td(util.figure_to_vdom_image(fig))
                )
            )
            table = vdomh.table(*table_rows)
            display(table)
            # Close all open figures so memory does not grow across seqlets
            plt.close("all")
13036 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
1233 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
868 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
287 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
203 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
198 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
62 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
60 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
51 seqlets
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |
| Observed profiles | |
| Predicted profiles | |
| Importance scores (1000 bp) | |
| Importance scores (400 bp) | |
| Seqlet |