import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores, import_tfmodisco_results
from motif.read_motifs import trim_motif_by_ic
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image
import h5py
import numpy as np
import modisco
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define parameters/fetch arguments
tfm_results_path = os.environ["TFM_TFM_PATH"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_CLUSTER_CACHE" in os.environ:
cluster_cache_dir = os.environ["TFM_CLUSTER_CACHE"]
else:
cluster_cache_dir = None
print("TF-MoDISco results path: %s" % tfm_results_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("Importance score key: %s" % hyp_score_key)
print("Saved clusters cache: %s" % cluster_cache_dir)
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/BPNet/BPNet_Sox2_ChIPseq/BPNet_Sox2_ChIPseq_profile_tfm.h5 DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/BPNet/BPNet_Sox2_ChIPseq_imp_scores.h5 Importance score key: profile_hyp_scores Saved clusters cache: /users/amtseng/tfmodisco/results/reports/submotif_clustering//cache/BPNet/BPNet_Sox2_ChIPseq_profile
# Define constants
shap_score_center_size = 400
if cluster_cache_dir:
os.makedirs(cluster_cache_dir, exist_ok=True)
For plotting and organizing things
def check_tfmodisco_motif_subcluster(tfm_results):
"""
From an imported TF-MoDISco results object, returns whether or not
the results contain the subclustering of each motif/pattern.
"""
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
# Take an arbitrary metacluster
metacluster = next(iter(metaclusters.values()))
patterns = metacluster.seqlets_to_patterns_result.patterns
# Take an arbitrary pattern
pattern = patterns[0]
return pattern.subclusters is not None
def compute_tfmodisco_motif_subclusters(tfm_results):
"""
From an imported TF-MoDISco results object, computes the subclustering
of heterogeneity within each motif/pattern.
"""
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
patterns = metacluster.seqlets_to_patterns_result.patterns
if not patterns:
break
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
# Compute subclustering for each pattern (motif)
pattern.compute_subclusters_and_embedding(
pattern_comparison_settings=modisco.affinitymat.core.PatternComparisonSettings(
track_names=["task0_hypothetical_contribs", "task0_contrib_scores"],
track_transformer=modisco.affinitymat.L1Normalizer(),
min_overlap=None # This argument is irrelevant here
),
perplexity=30, n_jobs=4, verbose=True
)
def plot_motif_heterogeneity(tfm_results, save_dir=None):
"""
Plots subclusters of motifs. If `save_dir` is provided, saves the
results and figures there.
"""
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "5%"}),
vdomh.col(style={"width": "50%"}),
vdomh.col(style={"width": "40%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Subpattern", style={"text-align": "center"}),
vdomh.th("Seqlets", style={"text-align": "center"}),
vdomh.th("Embeddings", style={"text-align": "center"}),
vdomh.th("hCWM", style={"text-align": "center"})
)
)
if save_dir:
motif_hdf5 = h5py.File(os.path.join(save_dir, "all_motif_subclusters.h5"), "w")
metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
patterns = metacluster.seqlets_to_patterns_result.patterns
if not patterns:
break
num_patterns = len(patterns)
for pattern_i, pattern in enumerate(patterns):
display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
embedding = pattern.twod_embedding
subpattern_clusters = pattern.subclusters
# Aggregate motif
pfm = pattern["sequence"].fwd
hcwm = pattern["task0_hypothetical_contribs"].fwd
trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
hcwm_fig = viz_sequence.plot_weights(
trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
)
emb_fig, ax = plt.subplots()
ax.scatter(
embedding[:,0], embedding[:,1], c=subpattern_clusters, cmap="tab20", alpha=0.3
)
table_rows = [vdomh.tr(
vdomh.td("Agg."),
vdomh.td(str(len(pattern.seqlets))),
vdomh.td(figure_to_vdom_image(emb_fig)),
vdomh.td(figure_to_vdom_image(hcwm_fig))
)]
if save_dir:
# Save aggregate embedding plot
motif_id = "%d_%d" % (metacluster_i, pattern_i)
emb_fig.savefig(os.path.join(save_dir, motif_id + "_subcluster_agg.png"))
# Create dictionaries for subclusters
sc_emb_figs, sc_hcwm_figs, sc_motifs = {}, {}, {}
for subpattern_key, subpattern in pattern.subcluster_to_subpattern.items():
pfm = subpattern["sequence"].fwd
cwm = subpattern["task0_contrib_scores"].fwd
hcwm = subpattern["task0_hypothetical_contribs"].fwd
trimmed_hcwm = trim_motif_by_ic(pfm, hcwm, pad=4)
hcwm_fig = viz_sequence.plot_weights(
trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
)
emb_fig, ax = plt.subplots()
ax.scatter(
embedding[:,0], embedding[:,1], c=(subpattern_clusters == subpattern_key), alpha=0.3
)
table_rows.append(vdomh.tr(
vdomh.td(str(subpattern_key)),
vdomh.td(str(len(subpattern.seqlets))),
vdomh.td(figure_to_vdom_image(emb_fig)),
vdomh.td(figure_to_vdom_image(hcwm_fig))
))
if save_dir:
sc_emb_figs[subpattern_key] = emb_fig
sc_hcwm_figs[subpattern_key] = hcwm_fig
sc_motifs[subpattern_key] = (pfm, cwm, hcwm, trimmed_hcwm)
if save_dir:
# Save embedding plots, hCWM figure, and motifs
for sc_key, emb_fig in sc_emb_figs.items():
emb_fig.savefig(os.path.join(save_dir, motif_id + ("_subcluster_%s.png" % sc_key)))
for sc_key, hcwm_fig in sc_hcwm_figs.items():
hcwm_fig.savefig(os.path.join(save_dir, motif_id + ("_subcluster_%s_hcwm_trimmed.png" % sc_key)))
motif_dset = motif_hdf5.create_group(motif_id)
motif_dset.create_dataset("embeddings", data=embedding, compression="gzip")
motif_dset.create_dataset("clusters", data=subpattern_clusters, compression="gzip")
for sc_key, (pfm, cwm, hcwm, trimmed_hcwm) in sc_motifs.items():
sc_dset = motif_dset.create_group("subcluster_%s" % sc_key)
sc_dset.create_dataset("pfm_full", data=pfm, compression="gzip")
sc_dset.create_dataset("hcwm_full", data=hcwm, compression="gzip")
sc_dset.create_dataset("cwm_full", data=cwm, compression="gzip")
sc_dset.create_dataset("hcwm_trimmed", data=trimmed_hcwm, compression="gzip")
table = vdomh.table(header, vdomh.tbody(*table_rows))
display(table)
plt.close("all") # Remove all standing figures
Run motif subclustering
# Import SHAP coordinates and one-hot sequences
hyp_scores, _, one_hot_seqs, shap_coords = import_shap_scores(
shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size, remove_non_acgt=True
)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 100%|██████████| 10/10 [00:06<00:00, 1.56it/s]
# Import the TF-MoDISco results object
tfm_obj = import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)
# Compute subclusters (needed for older versions of TF-MoDISco); this takes awhile!
if not check_tfmodisco_motif_subcluster(tfm_obj):
compute_tfmodisco_motif_subclusters(tfm_obj)
For each motif, show the subclusters that exist within the TF-MoDISco-identified subpatterns
plot_motif_heterogeneity(tfm_obj, cluster_cache_dir)
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 3890 | ||
0 | 839 | ||
1 | 735 | ||
2 | 597 | ||
3 | 429 | ||
4 | 309 | ||
5 | 261 | ||
6 | 222 | ||
7 | 138 | ||
8 | 131 | ||
9 | 79 | ||
10 | 79 | ||
11 | 71 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 1706 | ||
0 | 234 | ||
1 | 223 | ||
2 | 221 | ||
3 | 185 | ||
4 | 146 | ||
5 | 142 | ||
6 | 137 | ||
7 | 129 | ||
8 | 97 | ||
9 | 94 | ||
10 | 84 | ||
11 | 14 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 275 | ||
0 | 51 | ||
1 | 47 | ||
2 | 32 | ||
3 | 27 | ||
4 | 27 | ||
6 | 25 | ||
5 | 25 | ||
7 | 18 | ||
8 | 13 | ||
9 | 10 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 79 | ||
0 | 27 | ||
1 | 24 | ||
2 | 16 | ||
3 | 8 | ||
4 | 4 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 75 | ||
0 | 31 | ||
1 | 22 | ||
2 | 22 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 61 | ||
0 | 23 | ||
1 | 20 | ||
2 | 15 | ||
3 | 3 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 60 | ||
0 | 34 | ||
1 | 26 |
Subpattern | Seqlets | Embeddings | hCWM |
---|---|---|---|
Agg. | 34 | ||
0 | 34 |