import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, create_motif_similarity_matrix, purine_rich_motif
import h5py
import numpy as np
import sklearn.cluster
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook(range(1))
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
<tqdm.notebook.tqdm_notebook at 0x7fe9ae08c2d0>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 16,
"axes.labelsize": 16,
"legend.fontsize": 18,
"xtick.labelsize": 8,
"ytick.labelsize": 8,
"font.family": "Roboto",
"font.weight": "bold",
"svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
if "TFM_TF_NAME" in os.environ:
tf_name = os.environ["TFM_TF_NAME"]
if "TFM_TASK_INDEX" in os.environ:
task_index = int(os.environ["TFM_TASK_INDEX"])
else:
task_index = None
else:
tf_name = "MAX"
task_index = 2
if task_index is None:
out_path = "/users/amtseng/tfmodisco/figures/fold_model_motif_reproducibility/%s_fold_model_motif_reproducibility" % tf_name
else:
out_path = "/users/amtseng/tfmodisco/figures/fold_model_motif_reproducibility/%s_task%d_fold_model_motif_reproducibility" % (tf_name, task_index)
os.makedirs(out_path, exist_ok=True)
tf_num_tasks = {
"E2F6": 2,
"FOXA2": 4,
"SPI1": 4,
"CEBPB": 7,
"MAX": 7,
"GABPA": 9,
"MAFK": 9,
"JUND": 14,
"NR3C1-reddytime": 16,
"REST": 20
}
tf_best_model_types = {
"E2F6": list("MM"),
"FOXA2": list("SSMM"),
"SPI1": list("MSSS"),
"CEBPB": list("MMMMSMM"),
"MAX": list("MMSMMSS"),
"GABPA": list("MMMSMMMMM"),
"MAFK": list("MMMMMMMMM"),
"JUND": list("SMMSMSSSSSSSMS"),
"NR3C1-reddytime": list("MMMSMMSMMMMSMMMM"),
"REST": list("MMMMMMMMMSMMSMMSMMMM")
}
num_tasks = tf_num_tasks[tf_name]
best_model_types = tf_best_model_types[tf_name]
seed = 20210910
motif_file = "/users/amtseng/tfmodisco/results/motifs/tfmodisco/%s_tfmodisco_motifs.h5" % tf_name
multitask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/multitask_profile_finetune_stats.tsv"
singletask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/singletask_profile_finetune_stats.tsv"
sanitize_fold = lambda fold: "F" + fold.split("_")[1]
sanitize_task = lambda task: "T" + task.split("_")[1]
def import_all_tfmodisco_motifs(motif_file, task_index=None):
"""
From a file containing all motifs for that TF, imports the
trimmed CWMs of all conditions as a dictionary. Returns a dictionary
of motifs, mapping a unique key to the trimmed CWM.
If `task_index` is given, only import the single-task motifs and
fine-tuned multi-task motifs for that task.
"""
motifs = {}
with h5py.File(motif_file, "r") as f:
# Multi-task, all 10 folds
if task_index is None:
mt = f["multitask"]
for fold in mt.keys():
mt_fold = mt[fold]
for key in ("count", "profile"):
mt_fold_key = mt_fold[key]
for motif_key in mt_fold_key.keys():
if motif_key.startswith("0_"):
# Positive metacluster only
name = "MT:%s:%s:%s" % (sanitize_fold(fold), key[0].upper(), motif_key)
motifs[name] = purine_rich_motif(mt_fold_key[motif_key]["cwm_trimmed"][:])
# Single-task, all 10 folds for all tasks or just a single task
st = f["singletask"]
for task in (st.keys() if task_index is None else ["task_%d" % task_index]):
st_task = st[task]
for fold in st_task.keys():
st_task_fold = st_task[fold]
for key in ("count", "profile"):
st_task_fold_key = st_task_fold[key]
for motif_key in st_task_fold_key.keys():
if motif_key.startswith("0_"):
# Positive metacluster only
name = "ST:%s:%s:%s:%s" % (sanitize_task(task), sanitize_fold(fold), key[0].upper(), motif_key)
motifs[name] = purine_rich_motif(st_task_fold_key[motif_key]["cwm_trimmed"][:])
# Multi-task fine-tune, all tasks plus aggregate task or just a single task
mtft = f["multitask_finetune"]
for task in (mtft.keys() if task_index is None else ["task_%d" % task_index]):
mtft_task = mtft[task]
for key in ("count", "profile"):
mtft_task_key = mtft_task[key]
for motif_key in mtft_task_key.keys():
if motif_key.startswith("0_"):
# Positive metacluster only
name = "MTFT:%s:%s:%s" % (sanitize_task(task), key[0].upper(), motif_key)
motifs[name] = purine_rich_motif(mtft_task_key[motif_key]["cwm_trimmed"][:])
# Single-task fine-tune, all tasks or just a single task
stft = f["singletask_finetune"]
for task in (stft.keys() if task_index is None else ["task_%d" % task_index]):
stft_task = stft[task]
for key in ("count", "profile"):
stft_task_key = stft_task[key]
for motif_key in stft_task_key.keys():
if motif_key.startswith("0_"):
# Positive metacluster only
name = "STFT:%s:%s:%s" % (sanitize_task(task), key[0].upper(), motif_key)
motifs[name] = purine_rich_motif(stft_task_key[motif_key]["cwm_trimmed"][:])
return motifs
def get_num_seqlets(motif_key):
"""
Fetches the number of seqlets initially mapped to the given motif from the
original TF-MoDISco file.
"""
base_path = "/users/amtseng/tfmodisco/results/tfmodisco"
cond_type = motif_key.split(":")[0]
if cond_type == "MTFT":
# Get best fold
best_mt_fold = None
with open(multitask_finetune_model_def_tsv, "r") as f:
for line in f:
tokens = line.strip().split("\t")
if tokens[0] == tf_name and int(tokens[1]) == num_tasks - 1:
assert best_mt_fold is None
best_mt_fold = int(tokens[2])
task, head, pattern = motif_key.split(":")[1:]
task = task[1:]
head = "profile" if head == "P" else "count"
path = os.path.join(
base_path,
"multitask_profile_finetune",
"%s_multitask_profile_finetune_fold%d" % (tf_name, best_mt_fold)
)
if task == "agg":
path = os.path.join(
path,
"%s_multitask_profile_finetune_fold%d_%s_tfm.h5" % (tf_name, best_mt_fold, head)
)
else:
path = os.path.join(
path,
"%s_multitask_profile_finetune_task%s_fold%d_%s_tfm.h5" % (tf_name, task, best_mt_fold, head)
)
elif cond_type == "STFT":
# Get best fold
best_st_folds = []
with open(singletask_finetune_model_def_tsv, "r") as f:
for line in f:
tokens = line.strip().split("\t")
if tokens[0] == tf_name:
best_st_folds.append(int(tokens[2]))
assert len(best_st_folds) == num_tasks
task, head, pattern = motif_key.split(":")[1:]
task = task[1:]
head = "profile" if head == "P" else "count"
best_st_fold = best_st_folds[int(task)]
path = os.path.join(
base_path,
"singletask_profile_finetune",
"%s_singletask_profile_finetune_fold%d" % (tf_name, best_st_fold),
"task_%s" % task,
"%s_singletask_profile_finetune_task%s_fold%d_%s_tfm.h5" % (tf_name, task, best_st_fold, head)
)
elif cond_type == "MT":
fold, head, pattern = motif_key.split(":")[1:]
fold = fold[1:]
head = "profile" if head == "P" else "count"
path = os.path.join(
base_path,
"multitask_profile",
"%s_multitask_profile_fold%s" % (tf_name, fold),
"%s_multitask_profile_fold%s_%s_tfm.h5" % (tf_name, fold, head)
)
elif cond_type == "ST":
task, fold, head, pattern = motif_key.split(":")[1:]
task = task[1:]
fold = fold[1:]
head = "profile" if head == "P" else "count"
path = os.path.join(
base_path,
"singletask_profile",
"%s_singletask_profile_fold%s" % (tf_name, fold),
"task_%s" % task,
"%s_singletask_profile_task%s_fold%s_%s_tfm.h5" % (tf_name, task, fold, head)
)
else:
raise ValueError("Unknown model condition type")
# Now that we have the path, import the number of seqlets for the given pattern
metacluster_key, pattern_key = pattern.split("_")
with h5py.File(path, "r") as f:
metacluster = f["metacluster_idx_to_submetacluster_results"]["metacluster_%s" % metacluster_key]
patterns = metacluster["seqlets_to_patterns_result"]["patterns"]
seqlets = patterns["pattern_%s" % pattern_key]["seqlets_and_alnmts"]["seqlets"]
return len(seqlets)
def make_cdf(ax, data, steps=1000, density=False, inverse=False, **kwargs):
"""
Plots a CDF to the given axes. `steps` is the number of steps in the
CDF. If `inverse` is True, plots an inverse CDF (AKA survivorship plot).
`density` is whether or not to normalize to fractions.
"""
hist, bin_edges = np.histogram(data, bins=steps)
if inverse:
cumsum = len(data) - np.cumsum(hist)
else:
cumsum = np.cumsum(hist)
if density:
cumsum = cumsum / len(data)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.
ax.step(bin_centers, cumsum, **kwargs)
def cluster_matrix_indices(matrix, num_clusters):
"""
Clusters matrix using k-means. Always clusters on the first
axis. Returns the indices needed to optimally order the matrix
by clusters.
"""
if len(matrix) == 1:
# Don't cluster at all
return np.array([0])
num_clusters = min(num_clusters, len(matrix))
# Perform k-means clustering
kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters)
cluster_assignments = kmeans.fit_predict(matrix)
# Order cluster centers to determine optimal ordering
kmeans_centers = kmeans.cluster_centers_
cluster_order = np.argsort(-np.mean(kmeans_centers, axis=1))
# Order the peaks so that the cluster assignments follow the optimal ordering
cluster_inds = []
for cluster_id in cluster_order:
cluster_inds.append(np.where(cluster_assignments == cluster_id)[0])
cluster_inds = np.concatenate(cluster_inds)
return cluster_inds, cluster_assignments[cluster_inds]
def plot_motif_examples(motif_keys, motifs, cond_names, cond_inds):
"""
Creates a table of motifs. The full set of motifs and keys
are given, and `cond_names` and `cond_inds` are parallel lists
containing conditions and a list of indices to plot for each.
A pair of columns is created for each condition.
"""
assert len(cond_names) == len(cond_inds)
cols = []
heads = []
for i in range(len(cond_names)):
cols.append(vdomh.col(style={"width": str(10 / len(cond_inds)) + "%"}))
cols.append(vdomh.col(style={"width": str(90 / len(cond_inds)) + "%"}))
heads.append(vdomh.th("Motif ID", style={"text-align": "center"}))
heads.append(vdomh.th("Motif (%s)" % cond_names[i], style={"text-align": "center"}))
colgroup = vdomh.colgroup(*cols)
header = vdomh.thead(vdomh.tr(*heads))
max_length = max([len(inds) for inds in cond_inds])
rows = []
for row_i in range(max_length):
row = []
for col_i in range(len(cond_names)):
if row_i < len(cond_inds[col_i]):
motif_i = cond_inds[col_i][row_i]
fig = viz_sequence.plot_weights(
motifs[motif_i], subticks_frequency=100, figsize=(20, 4), return_fig=True
)
fig.tight_layout()
row.extend([
vdomh.td(motif_keys[motif_i]),
vdomh.td(figure_to_vdom_image(fig))
])
else:
row.extend([vdomh.td(), vdomh.td()])
rows.append(vdomh.tr(*row))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
motif_keys, motifs = zip(*import_all_tfmodisco_motifs(motif_file, task_index).items())
motif_keys, motifs = list(motif_keys), list(motifs)
raw_sim_matrix = create_motif_similarity_matrix(motifs, mean_normalize=False)
# Extract the number of seqlest for each motif
motif_num_seqlets = np.array([
get_num_seqlets(key) for key in tqdm.notebook.tqdm(motif_keys)
])
# First, create a dictionary mapping each condition to the column indices
cond_inds = {}
for i, key in enumerate(motif_keys):
cond_key = ":".join(key.split(":")[:-2]) # Strip off last two pieces
try:
cond_inds[cond_key].append(i)
except KeyError:
cond_inds[cond_key] = [i]
for key in list(cond_inds.keys()):
cond_inds[key] = np.array(cond_inds[key])
# Normalize the number of seqlets over each condition separately
motif_prop_seqlets = np.empty(len(motif_num_seqlets))
for inds in cond_inds.values():
motif_prop_seqlets[inds] = motif_num_seqlets[inds] / np.sum(motif_num_seqlets[inds])
# Construct a limited version of the similarity matrix, where the columns
# are the highest similarity over the entire condition
cond_sim_matrix = np.empty((len(motif_keys), len(cond_inds)))
for i, cond_key in enumerate(cond_inds.keys()):
cond_sim_matrix[:, i] = np.max(raw_sim_matrix[:, cond_inds[cond_key]], axis=1)
# Order similarity matrix rows by optimal cluster order
order_inds, cluster_ids = cluster_matrix_indices(
cond_sim_matrix,
max([len(inds) for inds in cond_inds.values()])
# Maximum number of clusters should be biggest condition size
)
sorted_cond_sim_matrix = cond_sim_matrix[order_inds]
# Order motif keys and motifs
sorted_motif_keys = [motif_keys[i] for i in order_inds]
sorted_motifs = [motifs[i] for i in order_inds]
sorted_motif_prop_seqlets = np.array([motif_prop_seqlets[i] for i in order_inds])
# Sort each matrix row independently, most to least similar
sorted_cond_sim_matrix = -np.sort(-sorted_cond_sim_matrix, axis=1) # Negatives make it sort descending
fig, ax = plt.subplots()
make_cdf(ax, np.ravel(cond_sim_matrix), density=True)
ax.set_title("CDF of similarities")
ax.set_xlabel("Similarity score")
plt.show()
# # Normalize the matrix entries to percentile
# cond_sim_matrix_perc = scipy.stats.rankdata(cond_sim_matrix).reshape(cond_sim_matrix.shape) / cond_sim_matrix.size
fig, ax = plt.subplots()
ax.set_title("Motif prevalence by seqlets")
ax.set_xlabel("Motif rank by reproducibility")
ax.set_ylabel("Fraction of seqlets")
ax.plot(sorted_motif_prop_seqlets)
plt.show()
# Extract the CWM score of each motif in order
sorted_motif_scores = np.array([np.max(cwm) for cwm in sorted_motifs])
# Separate out profile and count CWMs
profile_inds = np.array([i for i in range(len(sorted_motif_keys)) if sorted_motif_keys[i].split(":")[-2] == "P"])
count_inds = np.array([i for i in range(len(sorted_motif_keys)) if sorted_motif_keys[i].split(":")[-2] == "C"])
fig, ax = plt.subplots()
make_cdf(ax, sorted_motif_scores[profile_inds], density=True, label="Profile")
make_cdf(ax, sorted_motif_scores[count_inds], density=True, label="Count")
ax.set_title("CDF of CWM score")
ax.set_xlabel("CWM score")
plt.legend()
plt.show()
# Normalize the CWM scores to map to percentile (i.e. map CDF x-axis to y-axis)
# Do counts/profile CWMs separately
sorted_motif_score_percs = np.empty_like(sorted_motif_scores)
sorted_motif_score_percs[profile_inds] = scipy.stats.rankdata(sorted_motif_scores[profile_inds]) / len(profile_inds)
sorted_motif_score_percs[count_inds] = scipy.stats.rankdata(sorted_motif_scores[count_inds]) / len(count_inds)
# Create good non-uniform color maps for the data
sim_decay_scale = 1.5
score_decay_scale = 2.5
seq_decay_scale = 1.5
x = np.linspace(0, 1, 256)
transformed = np.power(np.flip(x), sim_decay_scale)
sim_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
"sim_cmap", plt.get_cmap("Greys")(transformed)
)
transformed = np.exp(score_decay_scale * x) / np.exp(score_decay_scale)
score_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
"sim_cmap", plt.get_cmap("Purples")(transformed)
)
transformed = np.exp(seq_decay_scale * x) / np.exp(seq_decay_scale)
seq_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
"sim_cmap", plt.get_cmap("Oranges")(transformed)
)
# Show the color map with the original
fig, ax = plt.subplots(nrows=6, figsize=(20, 6), gridspec_kw={"hspace": 1})
grad = np.linspace(0, 1, 256)
ax[0].imshow(grad[None, :], aspect="auto", cmap="Greys_r")
ax[1].imshow(grad[None, :], aspect="auto", cmap=sim_cmap)
ax[2].imshow(grad[None, :], aspect="auto", cmap="Purples")
ax[3].imshow(grad[None, :], aspect="auto", cmap=score_cmap)
ax[4].imshow(grad[None, :], aspect="auto", cmap="Oranges")
ax[5].imshow(grad[None, :], aspect="auto", cmap=seq_cmap)
for i in range(6):
ax[i].axis("off")
if i % 2 == 0:
ax[i].set_title("Original")
else:
ax[i].set_title("Transformed")
plt.show()
grid_height = max(5, len(motif_keys) * 0.1)
grid_width = max(5, len(cond_inds) * 0.1)
height_spacing, width_spacing = 0.1, 0.1
height = grid_height + 3 + (height_spacing * 3)
width = grid_width + 6 + (width_spacing * 3)
fig, ax = plt.subplots(
nrows=4, ncols=4, figsize=(width, height),
gridspec_kw={
"height_ratios": [grid_height / 0.1, 1, 1, 1],
"width_ratios": [grid_width / 0.1, 2, 2, 2],
"hspace": height_spacing,
"wspace": width_spacing
}
)
sim_hmap = ax[0, 0].imshow(sorted_cond_sim_matrix, cmap=sim_cmap, aspect="auto") # auto aspect to stretch
score_hmap = ax[0, 2].imshow(sorted_motif_score_percs[:, None], cmap=score_cmap, aspect="auto")
seq_hmap = ax[0, 3].imshow(sorted_motif_prop_seqlets[:, None], cmap=seq_cmap, aspect="auto")
# Cluster IDs may not be in order, so sort the groups
diffs = np.concatenate([[0], np.diff(cluster_ids)])
diffs[diffs != 0] = 1
sorted_cluster_ids = np.cumsum(diffs)
cmap = cm.get_cmap("tab20")
group_colors = cmap(sorted_cluster_ids[:, None] % 20 / 20)[:, :, :3]
ax[0, 1].imshow(group_colors, aspect="auto")
# we need to first translate into colors instead of specifying `cmap=` to `imshow`,
# otherwise the colors will be different
ax[0, 0].set_yticks(np.arange(len(motif_keys)))
ax[0, 0].set_yticklabels(sorted_motif_keys)
ax[0, 0].set_xticks([])
ax[0, 0].set_title("Similarity of closest motif")
ax[0, 1].set_title("Grp")
ax[0, 2].set_title("CWM")
ax[0, 3].set_title("#Seq")
for i in range(1, 4):
ax[0, i].set_yticks([])
ax[0, i].set_xticks([])
for j in range(1, 4):
ax[j, i].axis("off")
ax[j, i].axis("off")
ax[j, i].axis("off")
fig.colorbar(sim_hmap, cax=ax[1, 0], orientation="horizontal")
fig.colorbar(score_hmap, cax=ax[2, 0], orientation="horizontal")
fig.colorbar(seq_hmap, cax=ax[3, 0], orientation="horizontal")
plt.savefig(
os.path.join(out_path, "%s_fold_model_motif_reproducibility.svg" % tf_name),
format="svg"
)
plt.show()
# Show the average similarity of each cluster, and overall
cmap = cm.get_cmap("tab20")
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "10%"}),
vdomh.col(style={"width": "10%"}),
vdomh.col(style={"width": "10%"}),
vdomh.col(style={"width": "10%"}),
vdomh.col(style={"width": "10%"})
)
header = vdomh.thead(vdomh.tr(
vdomh.th("Cluster ID", style={"text-align": "center"}),
vdomh.th("Cluster color", style={"text-align": "center"}),
vdomh.th("Number of motifs", style={"text-align": "center"}),
vdomh.th("Mean similarity", style={"text-align": "center"}),
vdomh.th("Standard error", style={"text-align": "center"})
))
rows = []
means, stderrs, cluster_sizes = [], [], []
for cluster_id in np.unique(sorted_cluster_ids):
match_inds = np.where(sorted_cluster_ids == cluster_id)[0]
sims = sorted_cond_sim_matrix[match_inds][:, 1:] # First column is always 1, so remove it
mean, stderr = np.mean(sims), np.std(sims) / np.sqrt(sims.size)
cluster_size = len(match_inds)
means.append(mean)
stderrs.append(stderr)
cluster_sizes.append(cluster_size)
group_color = matplotlib.colors.rgb2hex(cmap(cluster_id % 20 / 20))
fig, ax = plt.subplots(figsize=(0.3, 0.3))
circle = plt.Circle((0, 0), 10, color=group_color)
ax.add_patch(circle)
ax.axis("off")
plt.axis("scaled")
rows.append(vdomh.tr(
vdomh.td(str(cluster_id)),
vdomh.td(figure_to_vdom_image(fig)),
vdomh.td(str(cluster_size)),
vdomh.td(str(mean)),
vdomh.td(str(stderr))
))
total_sims = sorted_cond_sim_matrix[:, 1:]
total_mean, total_stderr = np.mean(total_sims), np.std(total_sims) / np.sqrt(total_sims.size)
rows.append(vdomh.tr(
vdomh.td("All"),
vdomh.td(""),
vdomh.td(str(len(total_sims))),
vdomh.td(str(total_mean)),
vdomh.td(str(total_stderr))
))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
# Save results of clustering
out_hdf5 = os.path.join(out_path, "%s_clusters.h5" % tf_name)
with h5py.File(out_hdf5, "w") as f:
f.create_dataset("raw_sim_matrix", data=raw_sim_matrix)
f.create_dataset("sorted_cond_sim_matrix", data=sorted_cond_sim_matrix)
f.create_dataset("sorted_motif_prop_seqlets", data=sorted_motif_prop_seqlets)
f.create_dataset("sorted_motif_scores", data=sorted_motif_scores)
f.create_dataset("sorted_motif_keys", data=sorted_motif_keys)
f.create_dataset("sorted_cluster_ids", data=sorted_cluster_ids)
f.create_dataset("cluster_sizes", data=cluster_sizes)
f.create_dataset("cluster_mean_sims", data=means)
f.create_dataset("cluster_stderr_sims", data=stderrs)
f.create_dataset("total_mean_sim", data=total_mean)
f.create_dataset("total_stderr_sim", data=total_stderr)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:33: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
Cluster ID | Cluster color | Number of motifs | Mean similarity | Standard error |
---|---|---|---|---|
0 | 28 | 0.9296519089171698 | 0.0026098546193969044 | |
1 | 24 | 0.9006607026521601 | 0.002446331456165322 | |
2 | 21 | 0.9009254000304207 | 0.003471411413794625 | |
3 | 21 | 0.875879928851423 | 0.004503995468821779 | |
4 | 9 | 0.8671420889103525 | 0.00964666226848454 | |
5 | 18 | 0.8582832246845791 | 0.004773028073576986 | |
6 | 5 | 0.8545331949558836 | 0.009470704214919638 | |
7 | 16 | 0.8536334627581016 | 0.004599246217035412 | |
8 | 3 | 0.8204041791284304 | 0.01723670140862845 | |
9 | 13 | 0.8226208137623816 | 0.00790765023015964 | |
10 | 14 | 0.8226802103617673 | 0.004534972155148469 | |
11 | 10 | 0.7923868767663449 | 0.006672353397896703 | |
12 | 9 | 0.7881807797016992 | 0.005964020494437429 | |
13 | 5 | 0.7771620302549282 | 0.006947093829074759 | |
14 | 7 | 0.754713769113028 | 0.006541738788713363 | |
15 | 5 | 0.7603428013695588 | 0.005730879204027705 | |
16 | 5 | 0.745739605206518 | 0.008059043084228598 | |
17 | 2 | 0.7406717330483308 | 0.013791818203482725 | |
18 | 4 | 0.7312529865111913 | 0.01526900325642838 | |
19 | 5 | 0.720957315067407 | 0.009231425739910863 | |
20 | 6 | 0.6833186176249878 | 0.009287435666359378 | |
21 | 1 | 0.5947102254870338 | 0.01785884630663637 | |
22 | 1 | 0.58631053000691 | 0.022529014774953386 | |
23 | 1 | 0.4757842016046434 | 0.015402560483585714 | |
24 | 1 | 0.42651795722577907 | 0.013526385536503812 | |
All | 234 | 0.8407140917312591 | 0.001957361784337858 |
# Show some examples of motifs in each cluster, from highest to lowest reproducibility
num_examples_to_show = 5
cmap = cm.get_cmap("tab20")
rng = np.random.RandomState(seed)
for cluster_id in np.unique(sorted_cluster_ids):
match_inds = np.where(sorted_cluster_ids == cluster_id)[0]
display(vdomh.h3("Cluster %d (%d motifs)" % (cluster_id, len(match_inds))))
group_color = matplotlib.colors.rgb2hex(cmap(cluster_id % 20 / 20))
fig, ax = plt.subplots(figsize=(1, 1))
circle = plt.Circle((0, 0), 10, color=group_color)
ax.add_patch(circle)
ax.axis("off")
plt.axis("scaled")
plt.show()
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "10%"}),
vdomh.col(style={"width": "90%"})
)
header = vdomh.thead(vdomh.tr(
vdomh.th("Motif ID", style={"text-align": "center"}),
vdomh.th("Motif", style={"text-align": "center"})
))
rows = []
inds_to_show = rng.choice(match_inds, size=min(num_examples_to_show, len(match_inds)), replace=False)
for i in inds_to_show:
fig = viz_sequence.plot_weights(
sorted_motifs[i], subticks_frequency=100, figsize=(20, 4), return_fig=True
)
fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_cluster_%d_motif_%s.svg" % (tf_name, cluster_id, sorted_motif_keys[i])),
format="svg"
)
rows.append(vdomh.tr(
vdomh.td(sorted_motif_keys[i]),
vdomh.td(figure_to_vdom_image(fig))
))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
Motif ID | Motif |
---|---|
ST:T12:F9:P:0_0 | |
STFT:T12:P:0_1 | |
ST:T12:F10:C:0_0 | |
ST:T12:F3:P:0_0 | |
ST:T12:F3:C:0_0 |
Motif ID | Motif |
---|---|
ST:T12:F1:C:0_2 | |
STFT:T12:C:0_3 | |
ST:T12:F4:P:0_3 | |
ST:T12:F1:C:0_6 | |
ST:T12:F3:C:0_4 |
Motif ID | Motif |
---|---|
ST:T12:F9:P:0_2 | |
ST:T12:F9:C:0_5 | |
STFT:T12:P:0_5 | |
ST:T12:F5:P:0_8 | |
ST:T12:F2:P:0_0 |
Motif ID | Motif |
---|---|
ST:T12:F6:C:0_1 | |
ST:T12:F5:P:0_10 | |
ST:T12:F8:P:0_6 | |
ST:T12:F6:P:0_3 | |
ST:T12:F8:C:0_10 |
Motif ID | Motif |
---|---|
ST:T12:F8:P:0_2 | |
ST:T12:F6:P:0_1 | |
STFT:T12:P:0_2 | |
ST:T12:F5:C:0_2 | |
ST:T12:F3:C:0_1 |
Motif ID | Motif |
---|---|
ST:T12:F9:C:0_4 | |
ST:T12:F10:C:0_3 | |
ST:T12:F9:P:0_1 | |
ST:T12:F7:C:0_4 | |
ST:T12:F8:C:0_7 |
Motif ID | Motif |
---|---|
ST:T12:F6:P:0_8 | |
ST:T12:F2:P:0_7 | |
ST:T12:F6:C:0_6 | |
ST:T12:F5:P:0_7 | |
ST:T12:F8:P:0_7 |
Motif ID | Motif |
---|---|
ST:T12:F5:C:0_4 | |
ST:T12:F4:C:0_1 | |
ST:T12:F5:P:0_6 | |
ST:T12:F4:P:0_2 | |
ST:T12:F6:C:0_5 |
Motif ID | Motif |
---|---|
ST:T12:F4:C:0_5 | |
ST:T12:F4:P:0_4 | |
ST:T12:F7:C:0_5 |
Motif ID | Motif |
---|---|
ST:T12:F6:P:0_9 | |
ST:T12:F9:P:0_8 | |
ST:T12:F6:P:0_4 | |
ST:T12:F10:P:0_3 | |
ST:T12:F10:C:0_2 |
Motif ID | Motif |
---|---|
ST:T12:F3:P:0_11 | |
ST:T12:F3:C:0_6 | |
ST:T12:F3:P:0_8 | |
ST:T12:F3:P:0_9 | |
ST:T12:F9:C:0_6 |
Motif ID | Motif |
---|---|
MTFT:T12:C:0_7 | |
MTFT:T12:P:0_5 | |
MTFT:T12:C:0_2 | |
MTFT:T12:C:0_0 | |
MTFT:T12:C:0_5 |
Motif ID | Motif |
---|---|
ST:T12:F2:P:0_3 | |
ST:T12:F2:P:0_10 | |
ST:T12:F2:C:0_1 | |
ST:T12:F9:P:0_10 | |
ST:T12:F2:P:0_12 |
Motif ID | Motif |
---|---|
ST:T12:F5:C:0_7 | |
ST:T12:F5:P:0_12 | |
ST:T12:F5:P:0_13 | |
ST:T12:F5:C:0_5 | |
ST:T12:F5:P:0_11 |
Motif ID | Motif |
---|---|
ST:T12:F7:P:0_12 | |
ST:T12:F7:P:0_9 | |
ST:T12:F7:P:0_10 | |
ST:T12:F7:P:0_8 | |
ST:T12:F7:P:0_11 |
Motif ID | Motif |
---|---|
ST:T12:F4:C:0_7 | |
ST:T12:F4:C:0_9 | |
ST:T12:F4:P:0_5 | |
ST:T12:F4:C:0_8 | |
ST:T12:F4:P:0_7 |
Motif ID | Motif |
---|---|
ST:T12:F8:P:0_8 | |
ST:T12:F8:C:0_8 | |
ST:T12:F8:P:0_12 | |
ST:T12:F8:P:0_10 | |
ST:T12:F8:P:0_9 |
Motif ID | Motif |
---|---|
ST:T12:F1:P:0_7 | |
ST:T12:F1:P:0_8 |
Motif ID | Motif |
---|---|
ST:T12:F7:P:0_7 | |
ST:T12:F6:P:0_7 | |
ST:T12:F8:P:0_5 | |
ST:T12:F7:P:0_5 |
Motif ID | Motif |
---|---|
ST:T12:F9:P:0_9 | |
ST:T12:F3:P:0_12 | |
ST:T12:F10:C:0_6 | |
ST:T12:F10:P:0_7 | |
ST:T12:F9:P:0_12 |
Motif ID | Motif |
---|---|
MTFT:T12:P:0_1 | |
MTFT:T12:C:0_1 | |
MTFT:T12:P:0_7 | |
MTFT:T12:P:0_4 | |
MTFT:T12:P:0_2 |
Motif ID | Motif |
---|---|
ST:T12:F9:P:0_7 |
Motif ID | Motif |
---|---|
ST:T12:F8:P:0_11 |
Motif ID | Motif |
---|---|
ST:T12:F4:P:0_6 |
Motif ID | Motif |
---|---|
ST:T12:F9:P:0_11 |