import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import pfm_info_content, trim_motif_by_ic
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, import_peak_table
import h5py
import pandas as pd
import numpy as np
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:19: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
<tqdm.notebook.tqdm_notebook at 0x7fced9e59710>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define parameters/fetch arguments
tfm_results_path = os.environ["TFM_TFM_PATH"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_CLUSTER_CACHE" in os.environ:
cluster_cache_dir = os.environ["TFM_CLUSTER_CACHE"]
else:
cluster_cache_dir = None
print("TF-MoDISco results path: %s" % tfm_results_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved clusters cache: %s" % cluster_cache_dir)
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/BPNet/BPNet_Oct4_ChIPseq/BPNet_Oct4_ChIPseq_profile_tfm.h5 DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/BPNet/BPNet_Oct4_ChIPseq_imp_scores.h5 Importance score key: profile_hyp_scores Saved clusters cache: /users/amtseng/tfmodisco/results/reports/submotif_clustering//cache/BPNet/BPNet_Oct4_ChIPseq_profile
# Define constants
shap_score_center_size = 400
if cluster_cache_dir:
os.makedirs(cluster_cache_dir, exist_ok=True)
For plotting and organizing things
def check_tfmodisco_motif_subcluster(tfm_results):
"""
From an imported TF-MoDISco results object, returns whether or not
the results contain the subclustering of each motif/pattern.
"""
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
# Take an arbitrary metacluster
metacluster = next(iter(metaclusters.values()))
patterns = metacluster.seqlets_to_patterns_result.patterns
# Take an arbitrary pattern
pattern = patterns[0]
return pattern.subclusters is not None
def compute_tfmodisco_motif_subclusters(tfm_results):
"""
From an imported TF-MoDISco results object, computes the subclustering
of heterogeneity within each motif/pattern.
"""
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
patterns = metacluster.seqlets_to_patterns_result.patterns
if not patterns:
break
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
# Compute subclustering for each pattern (motif)
pattern.compute_subclusters_and_embedding(
pattern_comparison_settings=modisco.affinitymat.core.PatternComparisonSettings(
track_names=["task0_hypothetical_contribs", "task0_contrib_scores"],
track_transformer=modisco.affinitymat.L1Normalizer(),
min_overlap=None # This argument is irrelevant here
),
perplexity=30, n_jobs=4, verbose=True
)
def plot_motif_heterogeneity(tfm_results, save_dir=None):
"""
Plots subclusters of motifs. If `save_dir` is provided, saves the
results and figures there.
"""
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "50%"}),
vdomh.col(style={"width": "40%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Subpattern", style={"text-align": "center"}),
vdomh.th("Seqlets", style={"text-align": "center"}),
vdomh.th("Embeddings", style={"text-align": "center"}),
vdomh.th("hCWM", style={"text-align": "center"})
)
)
if save_dir:
motif_hdf5 = h5py.File(os.path.join(save_dir, "all_motif_subclusters.h5"), "w")
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
patterns = metacluster.seqlets_to_patterns_result.patterns
if not patterns:
break
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
embedding = pattern.twod_embedding
subpattern_clusters = pattern.subclusters
# Aggregate motif
pfm = pattern["sequence"].fwd
hcwm = pattern["task0_hypothetical_contribs"].fwd
trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
hcwm_fig = viz_sequence.plot_weights(
trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
)
emb_fig, ax = plt.subplots()
ax.scatter(
embedding[:,0], embedding[:,1], c=subpattern_clusters, cmap="tab20", alpha=0.3
)
table_rows = [vdomh.tr(
vdomh.td("Agg."),
vdomh.td(str(len(pattern.seqlets))),
vdomh.td(figure_to_vdom_image(emb_fig)),
vdomh.td(figure_to_vdom_image(hcwm_fig))
)]
if save_dir:
# Save aggregate embedding plot
motif_id = "%d_%d" % (metacluster_i, pattern_i)
emb_fig.savefig(os.path.join(save_dir, motif_id + "_subcluster_agg.png"))
# Create dictionaries for subclusters
sc_emb_figs, sc_hcwm_figs, sc_motifs = {}, {}, {}
for subpattern_key, subpattern in pattern.subcluster_to_subpattern.items():
pfm = subpattern["sequence"].fwd
cwm = subpattern["task0_contrib_scores"].fwd
hcwm = subpattern["task0_hypothetical_contribs"].fwd
trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
hcwm_fig = viz_sequence.plot_weights(
trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
)
emb_fig, ax = plt.subplots()
ax.scatter(
embedding[:,0], embedding[:,1], c=(subpattern_clusters == subpattern_key), alpha=0.3
)
table_rows.append(vdomh.tr(
vdomh.td(str(subpattern_key)),
vdomh.td(str(len(subpattern.seqlets))),
vdomh.td(figure_to_vdom_image(emb_fig)),
vdomh.td(figure_to_vdom_image(hcwm_fig))
))
if save_dir:
sc_emb_figs[subpattern_key] = emb_fig
sc_hcwm_figs[subpattern_key] = hcwm_fig
sc_motifs[subpattern_key] = (pfm, cwm, hcwm, trimmed_hcwm)
if save_dir:
# Save embedding plots, hCWM figure, and motifs
for sc_key, emb_fig in sc_emb_figs.items():
emb_fig.savefig(os.path.join(save_dir, motif_id + ("_subcluster_%s.png" % sc_key)))
for sc_key, hcwm_fig in sc_hcwm_figs.items():
hcwm_fig.savefig(os.path.join(save_dir, motif_id + ("_subcluster_%s_hcwm_trimmed.png" % sc_key)))
motif_dset = motif_hdf5.create_group(motif_id)
motif_dset.create_dataset("embeddings", data=embedding, compression="gzip")
motif_dset.create_dataset("clusters", data=subpattern_clusters, compression="gzip")
for sc_key, (pfm, cwm, hcwm, trimmed_hcwm) in sc_motifs.items():
sc_dset = motif_dset.create_group("subcluster_%s" % sc_key)
sc_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
sc_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
sc_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
sc_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
table = vdomh.table(header, vdomh.tbody(*table_rows))
display(table)
plt.close("all") # Remove all standing figures
Run motif subclustering
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(
shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size, remove_non_acgt=True
)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 0%| | 0/19 [00:00<?, ?it/s] Importing SHAP scores: 5%|▌ | 1/19 [00:00<00:10, 1.76it/s] Importing SHAP scores: 11%|█ | 2/19 [00:01<00:10, 1.70it/s] Importing SHAP scores: 16%|█▌ | 3/19 [00:01<00:08, 1.87it/s] Importing SHAP scores: 21%|██ | 4/19 [00:02<00:07, 2.09it/s] Importing SHAP scores: 26%|██▋ | 5/19 [00:02<00:06, 2.20it/s] Importing SHAP scores: 32%|███▏ | 6/19 [00:02<00:06, 2.12it/s] Importing SHAP scores: 37%|███▋ | 7/19 [00:03<00:04, 2.46it/s] Importing SHAP scores: 42%|████▏ | 8/19 [00:03<00:04, 2.40it/s] Importing SHAP scores: 47%|████▋ | 9/19 [00:04<00:04, 2.36it/s] Importing SHAP scores: 53%|█████▎ | 10/19 [00:05<00:06, 1.31it/s] Importing SHAP scores: 58%|█████▊ | 11/19 [00:06<00:07, 1.12it/s] Importing SHAP scores: 63%|██████▎ | 12/19 [00:07<00:05, 1.28it/s] Importing SHAP scores: 68%|██████▊ | 13/19 [00:07<00:03, 1.62it/s] Importing SHAP scores: 74%|███████▎ | 14/19 [00:08<00:02, 1.76it/s] Importing SHAP scores: 79%|███████▉ | 15/19 [00:08<00:02, 1.78it/s] Importing SHAP scores: 84%|████████▍ | 16/19 [00:08<00:01, 2.15it/s] Importing SHAP scores: 89%|████████▉ | 17/19 [00:09<00:00, 2.15it/s] Importing SHAP scores: 95%|█████████▍| 18/19 [00:10<00:00, 1.65it/s] Importing SHAP scores: 100%|██████████| 19/19 [00:10<00:00, 1.82it/s]
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)
# Compute subclusters (needed for older versions of TF-MoDISco); this takes awhile!
if not check_tfmodisco_motif_subcluster(tfm_obj):
compute_tfmodisco_motif_subclusters(tfm_obj)
For each motif, show the subclusters that exist within the TF-MoDISco-identified subpatterns
plot_motif_heterogeneity(tfm_obj, cluster_cache_dir)
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 8372 | ||
0 | 1808 | ||
1 | 1538 | ||
2 | 1135 | ||
3 | 795 | ||
4 | 627 | ||
5 | 537 | ||
6 | 499 | ||
7 | 425 | ||
8 | 342 | ||
9 | 238 | ||
10 | 220 | ||
11 | 196 | ||
12 | 8 | ||
13 | 4 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 844 | ||
0 | 194 | ||
1 | 165 | ||
2 | 163 | ||
3 | 150 | ||
4 | 71 | ||
5 | 52 | ||
6 | 37 | ||
7 | 12 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 236 | ||
0 | 80 | ||
1 | 59 | ||
2 | 49 | ||
3 | 36 | ||
4 | 5 | ||
5 | 3 | ||
7 | 2 | ||
6 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 236 | ||
0 | 85 | ||
1 | 43 | ||
2 | 31 | ||
3 | 30 | ||
4 | 28 | ||
5 | 19 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 197 | ||
0 | 72 | ||
1 | 54 | ||
2 | 49 | ||
3 | 22 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 182 | ||
0 | 62 | ||
1 | 30 | ||
2 | 28 | ||
3 | 26 | ||
4 | 17 | ||
5 | 15 | ||
6 | 4 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 162 | ||
0 | 41 | ||
1 | 31 | ||
2 | 29 | ||
3 | 26 | ||
4 | 14 | ||
5 | 10 | ||
6 | 5 | ||
7 | 4 | ||
8 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 136 | ||
0 | 44 | ||
1 | 33 | ||
2 | 26 | ||
3 | 18 | ||
4 | 8 | ||
5 | 5 | ||
6 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 126 | ||
0 | 30 | ||
1 | 23 | ||
3 | 20 | ||
2 | 20 | ||
4 | 17 | ||
5 | 10 | ||
6 | 6 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 65 | ||
0 | 24 | ||
1 | 22 | ||
2 | 12 | ||
3 | 7 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 50 | ||
0 | 50 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 36 | ||
0 | 36 |