import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from util import figure_to_vdom_image, create_motif_similarity_matrix, aggregate_motifs
import plot.viz_sequence as viz_sequence
import numpy as np
import h5py
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import scipy.signal
import scipy.cluster.hierarchy
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:15: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` from ipykernel import kernelapp as app
<tqdm.notebook.tqdm_notebook at 0x7efea9117810>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define parameters/fetch arguments
motif_files = os.environ["TFM_MOTIF_FILES"].split(",")
group_names = os.environ["TFM_GROUP_NAMES"].split(",")
if "TFM_HEATMAP_CACHE" in os.environ:
tfm_heatmap_cache_dir = os.environ["TFM_HEATMAP_CACHE"]
else:
tfm_heatmap_cache_dir = None
assert len(motif_files) == len(group_names)
assert len(group_names) == len(set(group_names))
print("Motif files: %s" % motif_files)
print("Group names: %s" % group_names)
print("Saved heatmap cache: %s" % tfm_heatmap_cache_dir)
Motif files: ['/users/amtseng/tfmodisco/results/reports/tfmodisco_results/cache/BPNet/BPNet_Sox2_ChIPseq/BPNet_Sox2_ChIPseq_profile/all_motifs.h5', '/users/amtseng/tfmodisco/results/reports/tfmodisco_results/cache/BPNet/BPNet_Oct4_ChIPseq/BPNet_Oct4_ChIPseq_profile/all_motifs.h5', '/users/amtseng/tfmodisco/results/reports/tfmodisco_results/cache/BPNet/BPNet_Nanog_ChIPseq/BPNet_Nanog_ChIPseq_profile/all_motifs.h5'] Group names: ['Sox2', 'Oct4', 'Nanog'] Saved heatmap cache: /users/amtseng/tfmodisco/results/reports/motif_heatmaps/cache/BPNet/BPNet_ChIPseq_profile
# Define constants
cluster_color_cycle = plt.rcParams["axes.prop_cycle"].by_key()["color"]
default_cluster_color = "gray"
if tfm_heatmap_cache_dir:
os.makedirs(tfm_heatmap_cache_dir, exist_ok=True)
For plotting and organizing things
def import_motifs(motif_files, group_names):
"""
Imports a set of motifs from the saved HDF5 files.
`group_names` is a list of group names, one for each motif file.
Returns a list of motifs as L x 4 arrays, a parallel list of
motif names, and a dictionary mapping group names to lists of
motif names.
"""
motifs, motif_names = [], []
groups = {}
for motif_file, stem in zip(motif_files, group_names):
groups[stem] = []
with h5py.File(motif_file, "r") as f:
for key in f.keys():
motif_name = "%s:%s" % (stem, key)
motif_names.append(motif_name)
motifs.append(f[key]["cwm_trimmed"][:])
groups[stem].append(motif_name)
return motifs, motif_names, groups
def compute_clusters(linkage, goal_clusters, tolerance=(-2, 2), start=50, max_iter=10):
"""
From a linkage map, computes clusters with a goal of `goal_clusters` clusters.
Will allow the given tolerance. `start` is what distance threshold to check first.
`max_iter` is the maximum number of checks to do.
Returns the clustering.
"""
clusters = scipy.cluster.hierarchy.fcluster(
linkage, start, criterion="distance"
)
num_clusters = len(np.unique(clusters))
if num_clusters > goal_clusters + tolerance[1] and max_iter:
return compute_clusters(linkage, goal_clusters, tolerance, start * 2, max_iter - 1)
elif num_clusters < goal_clusters - tolerance[0] and max_iter:
return compute_clusters(linkage, goal_clusters, tolerance, start / 2, max_iter - 1)
else:
return clusters
def plot_heatmap(sim_matrix, labels, linkage, clusters):
"""
Given a similariy matrix and labels, plots a heatmap with
dendrogram. `linkage` is the linkage map computed on the matrix, and
`clusters` is the cluster ID of each entry.
Returns the figure and the indices in which to order the entries.
"""
fig, ax = plt.subplots(
nrows=2, ncols=2, figsize=(20, 20),
gridspec_kw={
"width_ratios": [20, 1],
"height_ratios": [1, 4],
"hspace": 0,
"wspace": 0.1
}
)
# Compute the color of every link based on cluster assignments
# Adapted from https://stackoverflow.com/questions/38153829/custom-cluster-colors-of-scipy-dendrogram-in-python-link-color-func
leaf_colors = [cluster_color_cycle[i % len(cluster_color_cycle)] for i in clusters]
link_colors = {}
for i, i_link in enumerate(linkage[:, :2].astype(int)):
color_0 = link_colors[i_link[0]] if i_link[0] > len(linkage) else leaf_colors[i_link[0]]
color_1 = link_colors[i_link[1]] if i_link[1] > len(linkage) else leaf_colors[i_link[1]]
link_colors[i + 1 + len(linkage)] = color_0 if color_0 == color_1 else default_cluster_color
dend = scipy.cluster.hierarchy.dendrogram(
linkage, ax=ax[0, 0], link_color_func=(lambda x: link_colors[x])
)
order_inds = dend["leaves"]
sim_matrix_reordered = sim_matrix[:, order_inds][order_inds, :]
heatmap = ax[1, 0].imshow(sim_matrix_reordered, aspect="auto", cmap="Blues")
ax[1, 0].set_yticks([])
ax[1, 0].set_xticks(range(len(sim_matrix)))
ax[1, 0].set_xticklabels(np.array(labels)[order_inds], rotation=90, fontsize=10)
fig.colorbar(heatmap, cax=ax[1, 1])
ax[0, 0].axis("off")
ax[0, 1].axis("off")
plt.show()
return fig, order_inds
motifs, motif_names, motif_groups = import_motifs(motif_files, group_names)
# Flip all motifs to be the purine-rich version
for i, motif in enumerate(motifs):
if np.sum(motif[:, [0, 2]]) < 0.5 * np.sum(motif):
motifs[i] = np.flip(motif)
# Compute similarity matrix
sim_matrix = create_motif_similarity_matrix(motifs)
# Compute linkage
linkage = scipy.cluster.hierarchy.linkage(sim_matrix, method="ward")
# Compute clusters
expected_clusters = np.max([len(m) for m in motif_groups.values()])
clusters = compute_clusters(linkage, expected_clusters, max_iter=100)
display(vdomh.h4("Number of motifs: %d" % len(clusters)))
display(vdomh.h4("Number of clusters: %d" % len(np.unique(clusters))))
# Plot heatmap
fig, order_inds = plot_heatmap(sim_matrix, motif_names, linkage, clusters)
if tfm_heatmap_cache_dir:
fig.savefig(os.path.join(tfm_heatmap_cache_dir, "motif_cluster_heatmap.png"))
# Show aggregated and constituent motifs for each cluster
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "50%"}),
vdomh.col(style={"width": "50%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Aggregate motif", style={"text-align": "center"}),
vdomh.th("Constituent motifs", style={"text-align": "center"})
)
)
all_consensus = {}
cluster_ids, counts = np.unique(clusters, return_counts=True)
cluster_ids = cluster_ids[np.flip(np.argsort(counts))]
for i, cluster_id in enumerate(cluster_ids):
match_inds = np.where(clusters == cluster_id)[0]
matches = [motifs[j] for j in match_inds]
match_names = [motif_names[j] for j in match_inds]
consensus = aggregate_motifs(matches)
all_consensus[cluster_id] = consensus
display(vdomh.h3("Cluster %d (%d/%d)" % (cluster_id, i + 1, len(cluster_ids))))
display(vdomh.h4("%d motifs" % len(matches)))
agg_fig = viz_sequence.plot_weights(consensus, figsize=(20, 4), return_fig=True)
agg_fig.tight_layout()
const_figs = []
for motif, motif_name in zip(matches, match_names):
fig = viz_sequence.plot_weights(motif, figsize=(20, 4), return_fig=True)
plt.title(motif_name)
fig.tight_layout()
const_figs.append(figure_to_vdom_image(fig))
body = vdomh.tbody(vdomh.tr(vdomh.td(figure_to_vdom_image(agg_fig)), vdomh.td(*const_figs)))
display(vdomh.table(colgroup, header, body))
if tfm_heatmap_cache_dir:
agg_fig.savefig(os.path.join(tfm_heatmap_cache_dir, "cluster_%d_aggregate_motif.png" % cluster_id))
plt.close("all")
if tfm_heatmap_cache_dir:
with h5py.File(os.path.join(tfm_heatmap_cache_dir, "motif_clusters.h5"), "w") as f:
f.create_dataset("motif_names", data=np.array(motif_names).astype("S"), compression="gzip")
all_motifs = f.create_group("all_motifs")
for name, motif in zip(motif_names, motifs):
all_motifs.create_dataset(name, data=motif, compression="gzip")
f.create_dataset("similarity_matrix", data=sim_matrix, compression="gzip")
f.create_dataset("dendrogram_order", data=order_inds, compression="gzip")
f.create_dataset("clusters", data=clusters, compression="gzip")
agg_motifs = f.create_group("cluster_motifs")
for cluster_id, consensus in all_consensus.items():
agg_motifs.create_dataset(str(cluster_id), data=consensus, compression="gzip")
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|
| Aggregate motif | Constituent motifs |
|---|---|