import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import motif.match_motifs as match_motifs
import motif.read_motifs as read_motifs
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, create_motif_similarity_matrix, purine_rich_motif
import tempfile
import h5py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook(range(1))
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:17: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
<tqdm.notebook.tqdm_notebook at 0x7f9660aec9d0>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold",
"svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
if "TFM_TF_NAME" in os.environ:
tf_name = os.environ["TFM_TF_NAME"]
else:
tf_name = "E2F6"
out_path = "/users/amtseng/tfmodisco/figures/benchmark_reproducibility/%s_benchmark_reproducibility" % tf_name
os.makedirs(out_path, exist_ok=True)
tf_num_tasks = {
"E2F6": 2,
"FOXA2": 4,
"SPI1": 4,
"CEBPB": 7,
"MAX": 7,
"GABPA": 9,
"MAFK": 9,
"JUND": 14,
"NR3C1-reddytime": 16,
"REST": 20
}
tf_best_model_types = {
"E2F6": list("MM"),
"FOXA2": list("SSMM"),
"SPI1": list("MSSS"),
"CEBPB": list("MMMMSMM"),
"MAX": list("MMSMMSS"),
"GABPA": list("MMMSMMMMM"),
"MAFK": list("MMMMMMMMM"),
"JUND": list("SMMSMSSSSSSSMS"),
"NR3C1-reddytime": list("MMMSMMSMMMMSMMMM"),
"REST": list("MMMMMMMMMSMMSMMSMMMM")
}
num_tasks = tf_num_tasks[tf_name]
best_model_types = tf_best_model_types[tf_name]
tfm_motif_file = "/users/amtseng/tfmodisco/results/motifs/tfmodisco/%s_tfmodisco_cpmerged_motifs.h5" % tf_name
memechip_motif_file = "/users/amtseng/tfmodisco/results/motifs/memechip/%s_memechip_motifs.h5" % tf_name
homer_motif_file = "/users/amtseng/tfmodisco/results/motifs/homer/%s_homer_motifs.h5" % tf_name
dichipmunk_motif_file = "/users/amtseng/tfmodisco/results/motifs/dichipmunk/%s_dichipmunk_motifs.h5" % tf_name
motif_database_path = "/users/amtseng/tfmodisco/data/processed/motif_databases/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme.txt"
def renorm_motif(motif, pseudocount=1e-10):
"""
Renormalizes a motif (L x 4 array) so that the bases sum to 1.
"""
s = np.sum(motif, axis=1, keepdims=True)
assert np.all(s > 0)
return motif / s
def import_tfmodisco_motifs(motif_file, model_types, motif_type="cwm_trimmed"):
"""
From a file containing all motifs for that TF, imports the
trimmed CWMs (or another kind of motif type) of the fine-tuned models
corresponding to the model type for each task.
Returns a list of dictionaries (one for each task), where
each dictionary maps motif key to motif.
"""
motifs = []
with h5py.File(motif_file, "r") as f:
mtft = f["multitask_finetune"]
stft = f["singletask_finetune"]
for i, model_type in enumerate(model_types):
task = "task_%d" % i
if model_type == "M":
dset = mtft[task]
else:
dset = stft[task]
task_motifs = {}
for motif_key in dset.keys():
if "0_" in motif_key:
# Motifs that are (or are constructed from) positive metacluster only
motif = dset[motif_key][motif_type][:]
if motif_type.startswith("pfm"):
motif = renorm_motif(motif)
task_motifs["T%d:%s" % (i, motif_key)] = purine_rich_motif(motif)
motifs.append(task_motifs)
return motifs
def import_classic_benchmark_motifs(motif_file, mode):
"""
From a file containing all motifs for that TF from a benchmark
method, imports the PFMs of the motifs for each task.
Returns a list of dictionaries (one for each task), where
each dictionary maps motif key to motif.
"""
if mode == "dichipmunk":
score_key = "supporting_seqs"
elif mode == "homer":
score_key = "log_enrichment"
elif mode == "memechip":
score_key = "evalue"
motifs = []
with h5py.File(motif_file, "r") as f:
tasks = sorted([int(key[5:]) for key in f.keys() if key != "task_agg"])
for i in tasks:
dset = f["task_%d" % i]
task_motifs = {}
for motif_key in dset.keys():
if motif_key == score_key:
continue
task_motifs["T%d:%s" % (i, motif_key)] = purine_rich_motif(renorm_motif(dset[motif_key][:]))
motifs.append(task_motifs)
return motifs
def import_database_pfms(database_path):
"""
Imports the database of PFMs by reading through the entire database and
constructing a dictionary mapping motif IDs to NumPy arrays of PFMs.
"""
motif_dict = {}
with open(database_path, "r") as f:
try:
while True:
line = next(f)
if line.startswith("MOTIF"):
key = line.strip().split()[1]
header = next(f)
motif_width = int(header.split()[5])
motif = np.empty((motif_width, 4))
for i in range(motif_width):
motif[i] = np.array([
float(x) for x in next(f).strip().split()
])
# Add the motif with a shortened key
motif_dict[key.split("_")[1]] = purine_rich_motif(renorm_motif(motif))
except StopIteration:
pass
return motif_dict
def get_closest_tomtom_motif_similarities(query_dict, target_dict):
"""
From a dictionary mapping N motif keys to query motifs, and a
dictionary mapping M motif keys to target motifs, returns a
dictionary mapping the N query motif keys to the similarity and
key of the closest target motif (a pair). Similarity is the
-log(p) from TOMTOM.
"""
query_keys, query_pfms = list(zip(*query_dict.items()))
target_keys, target_pfms = list(zip(*target_dict.items()))
# Create temporary directory to do work in
temp_dir_obj = tempfile.TemporaryDirectory()
temp_dir = temp_dir_obj.name
# Convert motifs to MEME format
query_motif_file = os.path.join(temp_dir, "query_motifs.txt")
target_motif_file = os.path.join(temp_dir, "target_motifs.txt")
match_motifs.export_pfms_to_meme_format(query_pfms, query_motif_file)
match_motifs.export_pfms_to_meme_format(target_pfms, target_motif_file)
# Run TOMTOM
tomtom_dir = os.path.join(temp_dir, "tomtom")
match_motifs.run_tomtom(
query_motif_file, target_motif_file, tomtom_dir,
show_output=False
)
# Find results, mapping each query motif to target index
# The query/target IDs are the indices
tomtom_table = match_motifs.import_tomtom_results(tomtom_dir)
matches = []
for i in range(len(query_pfms)):
rows = tomtom_table[tomtom_table["Query_ID"] == i]
if rows.empty:
matches.append((0, "N/A"))
continue
min_row = rows.loc[rows["p-value"].idxmin()]
score = -np.log10(min_row["p-value"])
target_key = target_keys[min_row["Target_ID"]]
matches.append((score, target_key))
temp_dir_obj.cleanup()
return dict(zip(query_keys, matches))
tfm_motif_cwms = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "cwm_trimmed")
tfm_motif_pfms = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "pfm_trimmed")
memechip_motifs = import_classic_benchmark_motifs(memechip_motif_file, "memechip")
homer_motifs = import_classic_benchmark_motifs(homer_motif_file, "homer")
dichipmunk_motifs = import_classic_benchmark_motifs(dichipmunk_motif_file, "dichipmunk")
database_motifs = import_database_pfms(motif_database_path)
For each benchmark motif, compute the closest TF-MoDISco motif to it.
# Compute TF-MoDISco similarity dictionaries by task
memechip_tfm_sims = [
get_closest_tomtom_motif_similarities(memechip_motifs[task_index], tfm_motif_pfms[task_index])
for task_index in range(len(memechip_motifs))
]
homer_tfm_sims = [
get_closest_tomtom_motif_similarities(homer_motifs[task_index], tfm_motif_pfms[task_index])
for task_index in range(len(homer_motifs))
]
dichipmunk_tfm_sims = [
get_closest_tomtom_motif_similarities(dichipmunk_motifs[task_index], tfm_motif_pfms[task_index])
for task_index in range(len(dichipmunk_motifs))
]
# Compute database similarity dictionaries by task
memechip_database_sims = [
get_closest_tomtom_motif_similarities(memechip_motifs[task_index], database_motifs)
for task_index in range(len(memechip_motifs))
]
homer_database_sims = [
get_closest_tomtom_motif_similarities(homer_motifs[task_index], database_motifs)
for task_index in range(len(homer_motifs))
]
dichipmunk_database_sims = [
get_closest_tomtom_motif_similarities(dichipmunk_motifs[task_index], database_motifs)
for task_index in range(len(dichipmunk_motifs))
]
# For each task, similarity of motifs to database vs TF-MoDISco
for task_index in range(len(tfm_motif_pfms)):
fig, ax = plt.subplots(figsize=(8, 5))
memechip_keys = memechip_motifs[task_index].keys()
ax.scatter(
[memechip_tfm_sims[task_index][k][0] for k in memechip_keys],
[memechip_database_sims[task_index][k][0] for k in memechip_keys],
label="MEMEChIP"
)
homer_keys = homer_motifs[task_index].keys()
ax.scatter(
[homer_tfm_sims[task_index][k][0] for k in homer_keys],
[homer_database_sims[task_index][k][0] for k in homer_keys],
label="HOMER"
)
dichipmunk_keys = dichipmunk_motifs[task_index].keys()
ax.scatter(
[dichipmunk_tfm_sims[task_index][k][0] for k in dichipmunk_keys],
[dichipmunk_database_sims[task_index][k][0] for k in dichipmunk_keys],
label="DiChIPMunk"
)
(min_x, max_x), (min_y, max_y) = ax.get_xlim(), ax.get_ylim()
min_both, max_both = min(min_x, min_y), max(max_x, max_y)
ax.set_xlim(min_both, max_both)
ax.set_ylim(min_both, max_both)
ax.plot(
[min_both, max_both], [min_both, max_both],
color="black", linestyle="--", alpha=0.3, zorder=0
)
ax.legend()
ax.set_xlabel("Maximum similarity to a TF-MoDISco motif")
ax.set_ylabel("Maximum similarity to a JASPAR motif")
ax.set_title("Motif benchmark reproducibility: task %d" % task_index)
plt.savefig(
os.path.join(out_path, "%s_task%d_database_vs_tfm_similarity.svg" % (tf_name, task_index)),
format="svg"
)
# For each task, show all motifs in order of similarity ratio
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "30"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "20"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "20"}),
)
header = vdomh.thead(
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th("Key", style={"text-align": "center"}),
vdomh.th("Similarity", style={"text-align": "center"}),
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th("Key", style={"text-align": "center"}),
vdomh.th("Similarity", style={"text-align": "center"}),
vdomh.th("Motif", style={"text-align": "center"}),
)
safe_div = lambda a, b: a / b if b else float("inf")
for task_index in range(len(tfm_motif_pfms)):
display(vdomh.h3("Task %d" % task_index))
# Rank motif keys by decreasing similarity ratio of TF-MoDISco / database
memechip_keys = sorted(
memechip_database_sims[task_index].keys(),
key=lambda k: -safe_div(memechip_tfm_sims[task_index][k][0], memechip_database_sims[task_index][k][0])
)
homer_keys = sorted(
homer_database_sims[task_index].keys(),
key=lambda k: -safe_div(homer_tfm_sims[task_index][k][0], homer_database_sims[task_index][k][0])
)
dichipmunk_keys = sorted(
dichipmunk_database_sims[task_index].keys(),
key=lambda k: -safe_div(dichipmunk_tfm_sims[task_index][k][0], dichipmunk_database_sims[task_index][k][0])
)
for bench_type, key_list, motif_dict, tfm_sim_dict, database_sim_dict in [
("MEMEChIP", memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index], memechip_database_sims[task_index]),
("HOMER", homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index], homer_database_sims[task_index]),
("DiChIPMunk", dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index], dichipmunk_database_sims[task_index])
]:
subheader = vdomh.tr(
vdomh.td(vdomh.b(bench_type), colspan="1", style={"text-align": "center"}),
vdomh.td(vdomh.b("TF-MoDISco"), colspan="3", style={"text-align": "center"}),
vdomh.td(vdomh.b("Database"), colspan="3", style={"text-align": "center"}),
)
rows = [subheader]
for i in range(len(key_list)):
bench_fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(motif_dict[key_list[i]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
bench_fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_task%d_%s_motif_%d.svg" % (tf_name, task_index, bench_type.lower(), i)),
format="svg"
)
if tfm_sim_dict[key_list[i]][1] == "N/A":
tfm_fig_cell = "N/A"
else:
tfm_fig = viz_sequence.plot_weights(
tfm_motif_cwms[task_index][tfm_sim_dict[key_list[i]][1]],
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
tfm_fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_task%d_%s_tfmmatch_%d.svg" % (tf_name, task_index, bench_type.lower(), i)),
format="svg"
)
tfm_fig_cell = figure_to_vdom_image(tfm_fig)
if database_sim_dict[key_list[i]][1] == "N/A":
database_fig_cell = "N/A"
else:
database_fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(database_motifs[database_sim_dict[key_list[i]][1]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
database_fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_task%d_%s_dbmatch_%d.svg" % (tf_name, task_index, bench_type.lower(), i)),
format="svg"
)
database_fig_cell = figure_to_vdom_image(database_fig)
rows.append(vdomh.tr(
vdomh.td(figure_to_vdom_image(bench_fig)),
vdomh.td(tfm_sim_dict[key_list[i]][1]),
vdomh.td("%.3f" % tfm_sim_dict[key_list[i]][0]),
vdomh.td(tfm_fig_cell),
vdomh.td(database_sim_dict[key_list[i]][1]),
vdomh.td("%.3f" % database_sim_dict[key_list[i]][0]),
vdomh.td(database_fig_cell)
))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_1:P0_1 | 7.880 | E2F6 | 5.572 | |||
T0:C0_0:P0_0 | 6.573 | Npas2 | 7.815 | |||
T0:C0_3:P0_3 | 5.700 | CTCF | 6.812 | |||
T0:C0_1:P0_1 | 3.569 | ZKSCAN5 | 4.376 | |||
T0:C0_1:P0_1 | 3.309 | ZNF384 | 4.702 | |||
N/A | 0.000 | N/A | KLF15 | 6.382 | ||
N/A | 0.000 | N/A | KLF9 | 3.494 | ||
N/A | 0.000 | N/A | Wt1 | 5.627 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_1:P0_1 | 3.413 | N/A | 0.000 | N/A | ||
T0:C0_6 | 2.123 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:P0_9 | 4.628 | FOS | 4.731 | |||
T0:C0_0:P0_0 | 6.046 | MYC | 6.533 | |||
T0:C0_1:P0_1 | 5.446 | E2F6 | 7.038 | |||
T0:C0_3:P0_3 | 5.395 | CTCF | 8.826 | |||
T0:C0_2:P0_2 | 2.895 | NRF1 | 7.249 | |||
T0:C0_1:P0_1 | 2.267 | ELF1 | 5.694 | |||
T0:P0_9 | 2.433 | THAP11 | 8.021 | |||
T0:C0_3:P0_3 | 2.284 | YY1 | 7.921 | |||
N/A | 0.000 | N/A | MYOG | 4.688 | ||
N/A | 0.000 | N/A | YY2 | 3.355 | ||
N/A | 0.000 | N/A | NFYC | 9.868 | ||
N/A | 0.000 | N/A | KLF9 | 3.552 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
T0:C0_4 | 2.399 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_2:P0_2 | 5.711 | MYC | 5.344 | |||
T0:P0_9 | 2.849 | LBX1 | 3.035 | |||
T0:C0_1:P0_1 | 5.718 | E2F6 | 8.525 | |||
T0:P0_7 | 1.587 | Zfx | 5.973 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:P0_8 | 10.619 | ZNF384 | 3.967 | |||
T1:P0_8 | 10.481 | SPI1 | 5.327 | |||
T1:C0_1:P0_1 | 5.865 | E2F6 | 4.883 | |||
T1:C0_0:P0_0 | 3.761 | MAX::MYC | 5.990 | |||
N/A | 0.000 | N/A | ZNF740 | 3.561 | ||
N/A | 0.000 | N/A | ZNF148 | 4.195 | ||
N/A | 0.000 | N/A | THAP11 | 4.200 | ||
N/A | 0.000 | N/A | NFYB | 4.827 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:P0_8 | 14.160 | ZNF384 | 5.156 | |||
T1:P0_8 | 9.067 | SPI1 | 4.341 | |||
T1:C0_1:P0_1 | 10.372 | E2F6 | 6.755 | |||
T1:C0_0:P0_0 | 10.575 | MYCN | 6.980 | |||
T1:C0_7:P0_11 | 8.115 | Bach1::Mafk | 6.501 | |||
T1:C0_4:P0_5 | 6.902 | CTCF | 11.379 | |||
T1:C0_7:P0_11 | 3.775 | ZNF24 | 9.499 | |||
T1:C0_6:P0_2 | 2.044 | NRF1 | 6.779 | |||
T1:C0_5 | 1.301 | YY1 | 9.535 | |||
N/A | 0.000 | N/A | TCF12(var.2) | 3.684 | ||
N/A | 0.000 | N/A | NFYC | 6.816 | ||
N/A | 0.000 | N/A | THAP11 | 7.024 | ||
N/A | 0.000 | N/A | CREB3L4(var.2) | 7.106 | ||
N/A | 0.000 | N/A | ZBTB33 | 5.586 | ||
N/A | 0.000 | N/A | Tcf12 | 4.635 | ||
N/A | 0.000 | N/A | ZNF460 | 4.310 | ||
N/A | 0.000 | N/A | ZNF449 | 4.328 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:P0_8 | 1.910 | N/A | 0.000 | N/A | ||
T1:C0_1:P0_1 | 2.229 | N/A | 0.000 | N/A | ||
T1:P0_8 | 3.151 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_1:P0_1 | 3.737 | E2F6 | 6.713 | |||
T1:C0_1:P0_1 | 1.982 | IRF7 | 3.567 | |||
T1:C0_0:P0_0 | 5.404 | MYC | 9.961 |
# For each task, plot (cumulative) distribution of similarities
for task_index in range(len(tfm_motif_pfms)):
fig, ax = plt.subplots(figsize=(8, 5))
memechip_keys = memechip_motifs[task_index].keys()
ax.plot(
np.arange(len(memechip_tfm_sims[task_index])),
np.sort([memechip_tfm_sims[task_index][k][0] for k in memechip_keys]),
label="MEMEChIP"
)
homer_keys = homer_motifs[task_index].keys()
ax.plot(
np.arange(len(homer_tfm_sims[task_index])),
np.sort([homer_tfm_sims[task_index][k][0] for k in homer_keys]),
label="HOMER"
)
dichipmunk_keys = dichipmunk_motifs[task_index].keys()
ax.plot(
np.arange(len(dichipmunk_tfm_sims[task_index])),
np.sort([dichipmunk_tfm_sims[task_index][k][0] for k in dichipmunk_keys]),
label="DiChIPMunk"
)
ax.legend()
ax.set_xlabel("Motif rank (by max similarity to TF-MoDISco motif)")
ax.set_ylabel("Maximum similarity to a TF-MoDISco motif")
ax.set_title("Motif benchmark reproducibility: task %d" % task_index)
plt.savefig(
os.path.join(out_path, "%s_task%d_tfm_similarity_vs_rank.svg" % (tf_name, task_index)),
format="svg"
)
# For each task, show the most and least reproducible motifs for each benchmark
num_to_show = 5
cols, heads = [], []
for _ in range(3):
cols.append(vdomh.col(style={"width": "%.2f%%" % (10 / 3)}))
cols.append(vdomh.col(style={"width": "%.2f%%" % (10 / 3)}))
cols.append(vdomh.col(style={"width": "%.2f%%" % (80 / 3)}))
heads.append(vdomh.th("TF-MoDISco similarity", style={"text-align": "center"}))
heads.append(vdomh.th("TF-MoDISco key", style={"text-align": "center"}))
heads.append(vdomh.th("Motif", style={"text-align": "center"}))
colgroup = vdomh.colgroup(*cols)
header = vdomh.thead(heads)
subheader = vdomh.tr(
vdomh.td(vdomh.b("MEMEChIP"), colspan="3", style={"text-align": "center"}),
vdomh.td(vdomh.b("HOMER"), colspan="3", style={"text-align": "center"}),
vdomh.td(vdomh.b("DiChIPMunk"), colspan="3", style={"text-align": "center"}),
)
for task_index in range(len(tfm_motif_pfms)):
display(vdomh.h3("Task %d" % task_index))
# Rank motif keys by similarity
memechip_keys = sorted(
memechip_database_sims[task_index].keys(),
key=lambda k: -memechip_tfm_sims[task_index][k][0]
)
homer_keys = sorted(
homer_database_sims[task_index].keys(),
key=lambda k: -homer_tfm_sims[task_index][k][0]
)
dichipmunk_keys = sorted(
dichipmunk_database_sims[task_index].keys(),
key=lambda k: -dichipmunk_tfm_sims[task_index][k][0]
)
display(vdomh.h4("Most reproducible motifs"))
rows = [subheader]
for i in range(num_to_show):
if i >= max([len(memechip_keys), len(homer_keys), len(dichipmunk_keys)]):
break
row = []
for key_list, motif_dict, sim_dict in [
(memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index]),
(homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index]),
(dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index])
]:
if i < len(key_list):
fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(motif_dict[key_list[i]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
fig.tight_layout()
row.extend([
vdomh.td("%.3f" % sim_dict[key_list[i]][0]),
vdomh.td(sim_dict[key_list[i]][1]),
vdomh.td(figure_to_vdom_image(fig))
])
else:
row.extend([vdomh.td(), vdomh.td(), vdomh.td()])
rows.append(vdomh.tr(*row))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
display(vdomh.h4("Least reproducible motifs"))
rows = [subheader]
for i in range(num_to_show):
if i >= max([len(memechip_keys), len(homer_keys), len(dichipmunk_keys)]) - num_to_show:
# Don't show motifs that have already been shown in the previous "most reproducible" table
break
row = []
for key_list, motif_dict, sim_dict in [
(memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index]),
(homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index]),
(dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index])
]:
if i < len(key_list) - num_to_show:
fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(motif_dict[key_list[len(key_list) - i - 1]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
fig.tight_layout()
row.extend([
vdomh.td("%.3f" % sim_dict[key_list[len(key_list) - i - 1]][0]),
vdomh.td(sim_dict[key_list[len(key_list) - i - 1]][1]),
vdomh.td(figure_to_vdom_image(fig))
])
else:
row.extend([vdomh.td(), vdomh.td(), vdomh.td()])
rows.append(vdomh.tr(*row))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
7.880 | T0:C0_1:P0_1 | 6.046 | T0:C0_0:P0_0 | 5.718 | T0:C0_1:P0_1 | |||
6.573 | T0:C0_0:P0_0 | 5.446 | T0:C0_1:P0_1 | 5.711 | T0:C0_2:P0_2 | |||
5.700 | T0:C0_3:P0_3 | 5.395 | T0:C0_3:P0_3 | 2.849 | T0:P0_9 | |||
3.569 | T0:C0_1:P0_1 | 4.628 | T0:P0_9 | 2.399 | T0:C0_4 | |||
3.309 | T0:C0_1:P0_1 | 3.413 | T0:C0_1:P0_1 | 1.587 | T0:P0_7 |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
10.619 | T1:P0_8 | 14.160 | T1:P0_8 | 5.404 | T1:C0_0:P0_0 | |||
10.481 | T1:P0_8 | 10.575 | T1:C0_0:P0_0 | 3.737 | T1:C0_1:P0_1 | |||
5.865 | T1:C0_1:P0_1 | 10.372 | T1:C0_1:P0_1 | 3.151 | T1:P0_8 | |||
3.761 | T1:C0_0:P0_0 | 9.067 | T1:P0_8 | 2.229 | T1:C0_1:P0_1 | |||
0.000 | N/A | 8.115 | T1:C0_7:P0_11 | 1.982 | T1:C0_1:P0_1 |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 1.910 | T1:P0_8 |