import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import trim_motif_by_ic
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image
import h5py
import numpy as np
import modisco
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define parameters/fetch arguments
tfm_results_path = os.environ["TFM_TFM_PATH"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_CLUSTER_CACHE" in os.environ:
cluster_cache_dir = os.environ["TFM_CLUSTER_CACHE"]
else:
cluster_cache_dir = None
print("TF-MoDISco results path: %s" % tfm_results_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved clusters cache: %s" % cluster_cache_dir)
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/BPNet/BPNet_Nanog_ChIPseq/BPNet_Nanog_ChIPseq_count_tfm.h5 DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/BPNet/BPNet_Nanog_ChIPseq_imp_scores.h5 Importance score key: count_hyp_scores Saved clusters cache: /users/amtseng/tfmodisco/results/reports/submotif_clustering//cache/BPNet/BPNet_Nanog_ChIPseq_count
# Define constants
shap_score_center_size = 400
if cluster_cache_dir:
os.makedirs(cluster_cache_dir, exist_ok=True)
For plotting and organizing things
def check_tfmodisco_motif_subcluster(tfm_results):
"""
From an imported TF-MoDISco results object, returns whether or not
the results contain the subclustering of each motif/pattern.
"""
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
# Take an arbitrary metacluster
metacluster = next(iter(metaclusters.values()))
patterns = metacluster.seqlets_to_patterns_result.patterns
# Take an arbitrary pattern
pattern = patterns[0]
return pattern.subclusters is not None
def compute_tfmodisco_motif_subclusters(tfm_results):
"""
From an imported TF-MoDISco results object, computes the subclustering
of heterogeneity within each motif/pattern.
"""
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
patterns = metacluster.seqlets_to_patterns_result.patterns
if not patterns:
break
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
# Compute subclustering for each pattern (motif)
pattern.compute_subclusters_and_embedding(
pattern_comparison_settings=modisco.affinitymat.core.PatternComparisonSettings(
track_names=["task0_hypothetical_contribs", "task0_contrib_scores"],
track_transformer=modisco.affinitymat.L1Normalizer(),
min_overlap=None # This argument is irrelevant here
),
perplexity=30, n_jobs=4, verbose=True
)
def plot_motif_heterogeneity(tfm_results, save_dir=None):
"""
Plots subclusters of motifs. If `save_dir` is provided, saves the
results and figures there.
"""
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "50%"}),
vdomh.col(style={"width": "40%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Subpattern", style={"text-align": "center"}),
vdomh.th("Seqlets", style={"text-align": "center"}),
vdomh.th("Embeddings", style={"text-align": "center"}),
vdomh.th("hCWM", style={"text-align": "center"})
)
)
if save_dir:
motif_hdf5 = h5py.File(os.path.join(save_dir, "all_motif_subclusters.h5"), "w")
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
patterns = metacluster.seqlets_to_patterns_result.patterns
if not patterns:
break
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
embedding = pattern.twod_embedding
subpattern_clusters = pattern.subclusters
# Aggregate motif
pfm = pattern["sequence"].fwd
hcwm = pattern["task0_hypothetical_contribs"].fwd
trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
hcwm_fig = viz_sequence.plot_weights(
trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
)
emb_fig, ax = plt.subplots()
ax.scatter(
embedding[:,0], embedding[:,1], c=subpattern_clusters, cmap="tab20", alpha=0.3
)
table_rows = [vdomh.tr(
vdomh.td("Agg."),
vdomh.td(str(len(pattern.seqlets))),
vdomh.td(figure_to_vdom_image(emb_fig)),
vdomh.td(figure_to_vdom_image(hcwm_fig))
)]
if save_dir:
# Save aggregate embedding plot
motif_id = "%d_%d" % (metacluster_i, pattern_i)
emb_fig.savefig(os.path.join(save_dir, motif_id + "_subcluster_agg.png"))
# Create dictionaries for subclusters
sc_emb_figs, sc_hcwm_figs, sc_motifs = {}, {}, {}
for subpattern_key, subpattern in pattern.subcluster_to_subpattern.items():
pfm = subpattern["sequence"].fwd
cwm = subpattern["task0_contrib_scores"].fwd
hcwm = subpattern["task0_hypothetical_contribs"].fwd
trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
hcwm_fig = viz_sequence.plot_weights(
trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
)
emb_fig, ax = plt.subplots()
ax.scatter(
embedding[:,0], embedding[:,1], c=(subpattern_clusters == subpattern_key), alpha=0.3
)
table_rows.append(vdomh.tr(
vdomh.td(str(subpattern_key)),
vdomh.td(str(len(subpattern.seqlets))),
vdomh.td(figure_to_vdom_image(emb_fig)),
vdomh.td(figure_to_vdom_image(hcwm_fig))
))
if save_dir:
sc_emb_figs[subpattern_key] = emb_fig
sc_hcwm_figs[subpattern_key] = hcwm_fig
sc_motifs[subpattern_key] = (pfm, cwm, hcwm, trimmed_hcwm)
if save_dir:
# Save embedding plots, hCWM figure, and motifs
for sc_key, emb_fig in sc_emb_figs.items():
emb_fig.savefig(os.path.join(save_dir, motif_id + ("_subcluster_%s.png" % sc_key)))
for sc_key, hcwm_fig in sc_hcwm_figs.items():
hcwm_fig.savefig(os.path.join(save_dir, motif_id + ("_subcluster_%s_hcwm_trimmed.png" % sc_key)))
motif_dset = motif_hdf5.create_group(motif_id)
motif_dset.create_dataset("embeddings", data=embedding, compression="gzip")
motif_dset.create_dataset("clusters", data=subpattern_clusters, compression="gzip")
for sc_key, (pfm, cwm, hcwm, trimmed_hcwm) in sc_motifs.items():
sc_dset = motif_dset.create_group("subcluster_%s" % sc_key)
sc_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
sc_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
sc_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
sc_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
table = vdomh.table(header, vdomh.tbody(*table_rows))
display(table)
plt.close("all") # Remove all standing figures
Run motif subclustering
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(
shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size, remove_non_acgt=True
)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 40/40 [00:17<00:00, 2.23it/s]
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)
# Compute subclusters (needed for older versions of TF-MoDISco); this takes awhile!
if not check_tfmodisco_motif_subcluster(tfm_obj):
compute_tfmodisco_motif_subclusters(tfm_obj)
For each motif, show the subclusters that exist within the TF-MoDISco-identified subpatterns
plot_motif_heterogeneity(tfm_obj, cluster_cache_dir)
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 5019 | ||
0 | 988 | ||
1 | 974 | ||
2 | 800 | ||
3 | 335 | ||
4 | 306 | ||
5 | 300 | ||
6 | 272 | ||
7 | 223 | ||
8 | 210 | ||
9 | 199 | ||
10 | 120 | ||
11 | 109 | ||
12 | 98 | ||
13 | 62 | ||
14 | 23 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 2696 | ||
0 | 433 | ||
1 | 382 | ||
2 | 281 | ||
3 | 269 | ||
4 | 268 | ||
5 | 212 | ||
6 | 210 | ||
7 | 206 | ||
8 | 201 | ||
9 | 145 | ||
10 | 37 | ||
11 | 33 | ||
12 | 19 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 2675 | ||
0 | 432 | ||
1 | 389 | ||
2 | 341 | ||
3 | 300 | ||
4 | 236 | ||
5 | 204 | ||
6 | 158 | ||
7 | 149 | ||
8 | 145 | ||
9 | 142 | ||
10 | 73 | ||
11 | 67 | ||
12 | 25 | ||
13 | 10 | ||
14 | 4 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 1561 | ||
0 | 354 | ||
1 | 243 | ||
2 | 177 | ||
3 | 169 | ||
4 | 166 | ||
5 | 157 | ||
6 | 86 | ||
7 | 81 | ||
8 | 76 | ||
9 | 37 | ||
10 | 15 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 895 | ||
0 | 200 | ||
1 | 184 | ||
2 | 153 | ||
3 | 150 | ||
4 | 81 | ||
5 | 70 | ||
6 | 47 | ||
7 | 10 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 694 | ||
0 | 120 | ||
1 | 117 | ||
2 | 102 | ||
3 | 98 | ||
4 | 87 | ||
5 | 75 | ||
6 | 73 | ||
7 | 22 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 638 | ||
0 | 158 | ||
1 | 140 | ||
2 | 89 | ||
3 | 79 | ||
4 | 78 | ||
5 | 61 | ||
6 | 33 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 468 | ||
0 | 99 | ||
1 | 88 | ||
2 | 74 | ||
3 | 67 | ||
4 | 63 | ||
5 | 38 | ||
6 | 29 | ||
7 | 10 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 390 | ||
0 | 143 | ||
1 | 94 | ||
2 | 51 | ||
3 | 49 | ||
4 | 39 | ||
5 | 10 | ||
6 | 4 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 371 | ||
0 | 83 | ||
1 | 64 | ||
2 | 59 | ||
3 | 54 | ||
4 | 41 | ||
5 | 36 | ||
6 | 17 | ||
7 | 12 | ||
8 | 5 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 213 | ||
0 | 96 | ||
1 | 75 | ||
2 | 13 | ||
3 | 12 | ||
4 | 6 | ||
5 | 4 | ||
6 | 3 | ||
8 | 2 | ||
7 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 145 | ||
0 | 54 | ||
1 | 53 | ||
2 | 26 | ||
4 | 4 | ||
3 | 4 | ||
5 | 2 | ||
6 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 137 | ||
0 | 35 | ||
1 | 33 | ||
2 | 33 | ||
3 | 18 | ||
4 | 10 | ||
5 | 6 | ||
6 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 86 | ||
0 | 29 | ||
1 | 26 | ||
2 | 24 | ||
3 | 7 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 72 | ||
0 | 24 | ||
1 | 20 | ||
2 | 14 | ||
3 | 11 | ||
4 | 3 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 70 | ||
0 | 33 | ||
1 | 29 | ||
2 | 6 | ||
3 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 55 | ||
0 | 32 | ||
1 | 21 | ||
2 | 2 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 54 | ||
0 | 25 | ||
1 | 16 | ||
2 | 13 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 49 | ||
0 | 49 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 42 | ||
0 | 42 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 41 | ||
0 | 41 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 41 | ||
0 | 41 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 30 | ||
0 | 30 |