import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import motif.match_motifs as match_motifs
import motif.read_motifs as read_motifs
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, create_motif_similarity_matrix, purine_rich_motif
import tempfile
import h5py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook(range(1))
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:17: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
<tqdm.notebook.tqdm_notebook at 0x7fc7e43fdf50>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold",
"svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
if "TFM_TF_NAME" in os.environ:
tf_name = os.environ["TFM_TF_NAME"]
else:
tf_name = "E2F6"
out_path = "/users/amtseng/tfmodisco/figures/benchmark_reproducibility/%s_benchmark_reproducibility" % tf_name
os.makedirs(out_path, exist_ok=True)
tf_num_tasks = {
"E2F6": 2,
"FOXA2": 4,
"SPI1": 4,
"CEBPB": 7,
"MAX": 7,
"GABPA": 9,
"MAFK": 9,
"JUND": 14,
"NR3C1-reddytime": 16,
"REST": 20
}
tf_best_model_types = {
"E2F6": list("MM"),
"FOXA2": list("SSMM"),
"SPI1": list("MSSS"),
"CEBPB": list("MMMMSMM"),
"MAX": list("MMSMMSS"),
"GABPA": list("MMMSMMMMM"),
"MAFK": list("MMMMMMMMM"),
"JUND": list("SMMSMSSSSSSSMS"),
"NR3C1-reddytime": list("MMMSMMSMMMMSMMMM"),
"REST": list("MMMMMMMMMSMMSMMSMMMM")
}
num_tasks = tf_num_tasks[tf_name]
best_model_types = tf_best_model_types[tf_name]
tfm_motif_file = "/users/amtseng/tfmodisco/results/motifs/tfmodisco/%s_tfmodisco_cpmerged_motifs.h5" % tf_name
memechip_motif_file = "/users/amtseng/tfmodisco/results/motifs/memechip/%s_memechip_motifs.h5" % tf_name
homer_motif_file = "/users/amtseng/tfmodisco/results/motifs/homer/%s_homer_motifs.h5" % tf_name
dichipmunk_motif_file = "/users/amtseng/tfmodisco/results/motifs/dichipmunk/%s_dichipmunk_motifs.h5" % tf_name
motif_database_path = "/users/amtseng/tfmodisco/data/processed/motif_databases/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme.txt"
def renorm_motif(motif, pseudocount=1e-10):
"""
Renormalizes a motif (L x 4 array) so that the bases sum to 1.
"""
s = np.sum(motif, axis=1, keepdims=True)
assert np.all(s > 0)
return motif / s
def import_tfmodisco_motifs(motif_file, model_types, motif_type="cwm_trimmed"):
"""
From a file containing all motifs for that TF, imports the
trimmed CWMs (or another kind of motif type) of the fine-tuned models
corresponding to the model type for each task.
Returns a list of dictionaries (one for each task), where
each dictionary maps motif key to motif.
"""
motifs = []
with h5py.File(motif_file, "r") as f:
mtft = f["multitask_finetune"]
stft = f["singletask_finetune"]
for i, model_type in enumerate(model_types):
task = "task_%d" % i
if model_type == "M":
dset = mtft[task]
else:
dset = stft[task]
task_motifs = {}
for motif_key in dset.keys():
if "0_" in motif_key:
# Motifs that are (or are constructed from) positive metacluster only
motif = dset[motif_key][motif_type][:]
if motif_type.startswith("pfm"):
motif = renorm_motif(motif)
task_motifs["T%d:%s" % (i, motif_key)] = purine_rich_motif(motif)
motifs.append(task_motifs)
return motifs
def import_classic_benchmark_motifs(motif_file, mode):
"""
From a file containing all motifs for that TF from a benchmark
method, imports the PFMs of the motifs for each task.
Returns a list of dictionaries (one for each task), where
each dictionary maps motif key to motif.
"""
if mode == "dichipmunk":
score_key = "supporting_seqs"
elif mode == "homer":
score_key = "log_enrichment"
elif mode == "memechip":
score_key = "evalue"
motifs = []
with h5py.File(motif_file, "r") as f:
tasks = sorted([int(key[5:]) for key in f.keys() if key != "task_agg"])
for i in tasks:
dset = f["task_%d" % i]
task_motifs = {}
for motif_key in dset.keys():
if motif_key == score_key:
continue
task_motifs["T%d:%s" % (i, motif_key)] = purine_rich_motif(renorm_motif(dset[motif_key][:]))
motifs.append(task_motifs)
return motifs
def import_database_pfms(database_path):
"""
Imports the database of PFMs by reading through the entire database and
constructing a dictionary mapping motif IDs to NumPy arrays of PFMs.
"""
motif_dict = {}
with open(database_path, "r") as f:
try:
while True:
line = next(f)
if line.startswith("MOTIF"):
key = line.strip().split()[1]
header = next(f)
motif_width = int(header.split()[5])
motif = np.empty((motif_width, 4))
for i in range(motif_width):
motif[i] = np.array([
float(x) for x in next(f).strip().split()
])
# Add the motif with a shortened key
motif_dict[key.split("_")[1]] = purine_rich_motif(renorm_motif(motif))
except StopIteration:
pass
return motif_dict
def get_closest_tomtom_motif_similarities(query_dict, target_dict):
"""
From a dictionary mapping N motif keys to query motifs, and a
dictionary mapping M motif keys to target motifs, returns a
dictionary mapping the N query motif keys to the similarity and
key of the closest target motif (a pair). Similarity is the
-log(p) from TOMTOM.
"""
query_keys, query_pfms = list(zip(*query_dict.items()))
target_keys, target_pfms = list(zip(*target_dict.items()))
# Create temporary directory to do work in
temp_dir_obj = tempfile.TemporaryDirectory()
temp_dir = temp_dir_obj.name
# Convert motifs to MEME format
query_motif_file = os.path.join(temp_dir, "query_motifs.txt")
target_motif_file = os.path.join(temp_dir, "target_motifs.txt")
match_motifs.export_pfms_to_meme_format(query_pfms, query_motif_file)
match_motifs.export_pfms_to_meme_format(target_pfms, target_motif_file)
# Run TOMTOM
tomtom_dir = os.path.join(temp_dir, "tomtom")
match_motifs.run_tomtom(
query_motif_file, target_motif_file, tomtom_dir,
show_output=False
)
# Find results, mapping each query motif to target index
# The query/target IDs are the indices
tomtom_table = match_motifs.import_tomtom_results(tomtom_dir)
matches = []
for i in range(len(query_pfms)):
rows = tomtom_table[tomtom_table["Query_ID"] == i]
if rows.empty:
matches.append((0, "N/A"))
continue
min_row = rows.loc[rows["p-value"].idxmin()]
score = -np.log10(min_row["p-value"])
target_key = target_keys[min_row["Target_ID"]]
matches.append((score, target_key))
temp_dir_obj.cleanup()
return dict(zip(query_keys, matches))
tfm_motif_cwms = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "cwm_trimmed")
tfm_motif_pfms = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "pfm_trimmed")
memechip_motifs = import_classic_benchmark_motifs(memechip_motif_file, "memechip")
homer_motifs = import_classic_benchmark_motifs(homer_motif_file, "homer")
dichipmunk_motifs = import_classic_benchmark_motifs(dichipmunk_motif_file, "dichipmunk")
database_motifs = import_database_pfms(motif_database_path)
For each benchmark motif, compute the closest TF-MoDISco motif to it.
# Compute TF-MoDISco similarity dictionaries by task
memechip_tfm_sims = [
get_closest_tomtom_motif_similarities(memechip_motifs[task_index], tfm_motif_pfms[task_index])
for task_index in range(len(memechip_motifs))
]
homer_tfm_sims = [
get_closest_tomtom_motif_similarities(homer_motifs[task_index], tfm_motif_pfms[task_index])
for task_index in range(len(homer_motifs))
]
dichipmunk_tfm_sims = [
get_closest_tomtom_motif_similarities(dichipmunk_motifs[task_index], tfm_motif_pfms[task_index])
for task_index in range(len(dichipmunk_motifs))
]
# Compute database similarity dictionaries by task
memechip_database_sims = [
get_closest_tomtom_motif_similarities(memechip_motifs[task_index], database_motifs)
for task_index in range(len(memechip_motifs))
]
homer_database_sims = [
get_closest_tomtom_motif_similarities(homer_motifs[task_index], database_motifs)
for task_index in range(len(homer_motifs))
]
dichipmunk_database_sims = [
get_closest_tomtom_motif_similarities(dichipmunk_motifs[task_index], database_motifs)
for task_index in range(len(dichipmunk_motifs))
]
# For each task, similarity of motifs to database vs TF-MoDISco
for task_index in range(len(tfm_motif_pfms)):
fig, ax = plt.subplots(figsize=(8, 5))
memechip_keys = memechip_motifs[task_index].keys()
ax.scatter(
[memechip_tfm_sims[task_index][k][0] for k in memechip_keys],
[memechip_database_sims[task_index][k][0] for k in memechip_keys],
label="MEMEChIP"
)
homer_keys = homer_motifs[task_index].keys()
ax.scatter(
[homer_tfm_sims[task_index][k][0] for k in homer_keys],
[homer_database_sims[task_index][k][0] for k in homer_keys],
label="HOMER"
)
dichipmunk_keys = dichipmunk_motifs[task_index].keys()
ax.scatter(
[dichipmunk_tfm_sims[task_index][k][0] for k in dichipmunk_keys],
[dichipmunk_database_sims[task_index][k][0] for k in dichipmunk_keys],
label="DiChIPMunk"
)
(min_x, max_x), (min_y, max_y) = ax.get_xlim(), ax.get_ylim()
min_both, max_both = min(min_x, min_y), max(max_x, max_y)
ax.set_xlim(min_both, max_both)
ax.set_ylim(min_both, max_both)
ax.plot(
[min_both, max_both], [min_both, max_both],
color="black", linestyle="--", alpha=0.3, zorder=0
)
ax.legend()
ax.set_xlabel("Maximum similarity to a TF-MoDISco motif")
ax.set_ylabel("Maximum similarity to a JASPAR motif")
ax.set_title("Motif benchmark reproducibility: task %d" % task_index)
plt.savefig(
os.path.join(out_path, "%s_task%d_database_vs_tfm_similarity.svg" % (tf_name, task_index)),
format="svg"
)
# For each task, show all motifs in order of similarity ratio
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "30"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "20"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "5"}),
vdomh.col(style={"width": "20"}),
)
header = vdomh.thead(
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th("Key", style={"text-align": "center"}),
vdomh.th("Similarity", style={"text-align": "center"}),
vdomh.th("Motif", style={"text-align": "center"}),
vdomh.th("Key", style={"text-align": "center"}),
vdomh.th("Similarity", style={"text-align": "center"}),
vdomh.th("Motif", style={"text-align": "center"}),
)
safe_div = lambda a, b: a / b if b else float("inf")
for task_index in range(len(tfm_motif_pfms)):
display(vdomh.h3("Task %d" % task_index))
# Rank motif keys by decreasing similarity ratio of TF-MoDISco / database
memechip_keys = sorted(
memechip_database_sims[task_index].keys(),
key=lambda k: -safe_div(memechip_tfm_sims[task_index][k][0], memechip_database_sims[task_index][k][0])
)
homer_keys = sorted(
homer_database_sims[task_index].keys(),
key=lambda k: -safe_div(homer_tfm_sims[task_index][k][0], homer_database_sims[task_index][k][0])
)
dichipmunk_keys = sorted(
dichipmunk_database_sims[task_index].keys(),
key=lambda k: -safe_div(dichipmunk_tfm_sims[task_index][k][0], dichipmunk_database_sims[task_index][k][0])
)
for bench_type, key_list, motif_dict, tfm_sim_dict, database_sim_dict in [
("MEMEChIP", memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index], memechip_database_sims[task_index]),
("HOMER", homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index], homer_database_sims[task_index]),
("DiChIPMunk", dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index], dichipmunk_database_sims[task_index])
]:
subheader = vdomh.tr(
vdomh.td(vdomh.b(bench_type), colspan="1", style={"text-align": "center"}),
vdomh.td(vdomh.b("TF-MoDISco"), colspan="3", style={"text-align": "center"}),
vdomh.td(vdomh.b("Database"), colspan="3", style={"text-align": "center"}),
)
rows = [subheader]
for i in range(len(key_list)):
bench_fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(motif_dict[key_list[i]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
bench_fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_task%d_%s_motif_%d.svg" % (tf_name, task_index, bench_type.lower(), i)),
format="svg"
)
if tfm_sim_dict[key_list[i]][1] == "N/A":
tfm_fig_cell = "N/A"
else:
tfm_fig = viz_sequence.plot_weights(
tfm_motif_cwms[task_index][tfm_sim_dict[key_list[i]][1]],
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
tfm_fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_task%d_%s_tfmmatch_%d.svg" % (tf_name, task_index, bench_type.lower(), i)),
format="svg"
)
tfm_fig_cell = figure_to_vdom_image(tfm_fig)
if database_sim_dict[key_list[i]][1] == "N/A":
database_fig_cell = "N/A"
else:
database_fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(database_motifs[database_sim_dict[key_list[i]][1]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
database_fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_task%d_%s_dbmatch_%d.svg" % (tf_name, task_index, bench_type.lower(), i)),
format="svg"
)
database_fig_cell = figure_to_vdom_image(database_fig)
rows.append(vdomh.tr(
vdomh.td(figure_to_vdom_image(bench_fig)),
vdomh.td(tfm_sim_dict[key_list[i]][1]),
vdomh.td("%.3f" % tfm_sim_dict[key_list[i]][0]),
vdomh.td(tfm_fig_cell),
vdomh.td(database_sim_dict[key_list[i]][1]),
vdomh.td("%.3f" % database_sim_dict[key_list[i]][0]),
vdomh.td(database_fig_cell)
))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_0:P0_0 | 3.136 | N/A | 0.000 | N/A | ||
T0:C0_5:P0_5 | 1.826 | N/A | 0.000 | N/A | ||
T0:C0_1:P0_1 | 7.070 | FOXD2 | 3.948 | |||
T0:C0_0:P0_0 | 6.111 | FOXO4 | 5.410 | |||
T0:C0_2:P0_7 | 5.064 | NR2F1 | 4.649 | |||
T0:C0_1:P0_1 | 3.641 | ZNF384 | 4.494 | |||
T0:C0_5:P0_5 | 1.969 | ZNF148 | 7.307 | |||
N/A | 0.000 | N/A | MAZ | 5.039 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
T0:C0_9:P0_8 | 2.316 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_12 | 1.611 | N/A | 0.000 | N/A | ||
T0:C0_1:P0_1 | 1.745 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_11 | 11.992 | PPARA::RXRA | 7.424 | |||
T0:C0_5:P0_5 | 14.869 | CTCF | 10.985 | |||
T0:C0_0:P0_0 | 6.721 | FOXD1 | 6.941 | |||
T0:C0_5:P0_5 | 3.171 | KLF5 | 3.656 | |||
T0:C0_10 | 2.510 | HAND2 | 3.264 | |||
T0:C0_12 | 2.934 | SMAD2::SMAD3::SMAD4 | 3.962 | |||
T0:C0_1:P0_1 | 3.293 | ZNF384 | 5.069 | |||
T0:C0_5:P0_5 | 2.084 | ZNF460 | 3.588 | |||
T0:C0_1:P0_1 | 1.873 | TBXT | 3.925 | |||
T0:C0_6:P0_9 | 3.088 | HNF1B | 9.416 | |||
T0:C0_5:P0_5 | 1.787 | KLF15 | 5.905 | |||
N/A | 0.000 | N/A | ZNF24 | 5.717 | ||
N/A | 0.000 | N/A | EBF1 | 3.595 | ||
N/A | 0.000 | N/A | KLF9 | 3.558 | ||
N/A | 0.000 | N/A | ZNF148 | 6.275 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_0:P0_0 | 11.044 | FOXD2 | 9.148 | |||
T0:C0_8:P0_2 | 5.792 | RXRB | 5.083 | |||
T0:C0_5:P0_5 | 1.681 | YY1 | 4.145 | |||
N/A | 0.000 | N/A | ELF3 | 3.988 | ||
N/A | 0.000 | N/A | ZNF410 | 2.372 | ||
N/A | 0.000 | N/A | PAX5 | 3.684 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:P0_4 | 3.713 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_0:P0_0 | 1.919 | N/A | 0.000 | N/A | ||
T1:C0_0:P0_0 | 12.520 | FOXC2 | 7.475 | |||
T1:C0_2:P0_3 | 4.529 | ZNF274 | 3.433 | |||
T1:P0_1 | 5.399 | RREB1 | 4.136 | |||
T1:P0_1 | 3.603 | ZNF384 | 4.967 | |||
N/A | 0.000 | N/A | ZNF460 | 7.220 | ||
N/A | 0.000 | N/A | ZNF148 | 6.349 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
T1:C0_6 | 2.292 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_1:P0_2 | 1.666 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_3:P0_5 | 1.707 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_2:P0_3 | 3.812 | N/A | 0.000 | N/A | ||
T1:C0_2:P0_3 | 3.773 | N/A | 0.000 | N/A | ||
T1:P0_1 | 1.757 | N/A | 0.000 | N/A | ||
T1:C0_0:P0_0 | 9.041 | FOXC1 | 7.922 | |||
T1:P0_4 | 3.502 | CEBPE | 3.229 | |||
T1:P0_7 | 2.098 | NFIB | 4.034 | |||
T1:P0_6 | 1.887 | ZNF460 | 4.911 | |||
T1:P0_1 | 1.767 | ZNF384 | 5.226 | |||
N/A | 0.000 | N/A | NFKB1 | 3.509 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
T1:C0_4 | 1.815 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_5 | 6.491 | NR2F2 | 5.317 | |||
T1:C0_0:P0_0 | 9.545 | FOXD2 | 10.678 | |||
N/A | 0.000 | N/A | ELF5 | 4.414 | ||
N/A | 0.000 | N/A | ONECUT3 | 3.162 | ||
N/A | 0.000 | N/A | PLAGL2 | 5.100 | ||
N/A | 0.000 | N/A | ELF3 | 3.919 | ||
N/A | 0.000 | N/A | ZNF148 | 3.459 | ||
N/A | 0.000 | N/A | ZNF740 | 3.743 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
T2:C0_7 | 2.293 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:C0_6 | 2.315 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:C0_0:P0_0 | 17.341 | FOXK2 | 8.752 | |||
T2:C0_5 | 7.841 | HNF4G | 5.957 | |||
T2:C0_3:P0_3 | 5.348 | CEBPD | 5.173 | |||
N/A | 0.000 | N/A | MAZ | 5.851 | ||
N/A | 0.000 | N/A | Zic1::Zic2 | 4.034 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:C0_3:P0_3 | 1.949 | N/A | 0.000 | N/A | ||
T2:C0_7 | 1.723 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:P0_1 | 6.354 | N/A | 0.000 | N/A | ||
T2:C0_6 | 13.161 | HNF4G | 6.161 | |||
T2:C0_0:P0_0 | 14.962 | FOXD2 | 7.965 | |||
T2:C0_4:P0_6 | 11.364 | HNF1B | 10.736 | |||
T2:P0_5 | 2.992 | FOS::JUN | 3.335 | |||
T2:C0_6 | 2.332 | NFIB | 4.369 | |||
T2:C0_1 | 1.829 | ZNF384 | 5.259 | |||
T2:C0_8 | 1.713 | KLF15 | 5.786 | |||
N/A | 0.000 | N/A | KLF9 | 3.541 | ||
N/A | 0.000 | N/A | PKNOX2 | 3.743 | ||
N/A | 0.000 | N/A | Nr2f6(var.2) | 3.431 | ||
N/A | 0.000 | N/A | NFYA | 5.876 | ||
N/A | 0.000 | N/A | KLF4 | 3.458 | ||
N/A | 0.000 | N/A | HOXA13 | 5.095 | ||
N/A | 0.000 | N/A | TEAD1 | 3.549 | ||
N/A | 0.000 | N/A | Tcf12 | 5.353 | ||
N/A | 0.000 | N/A | ZNF263 | 4.424 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:C0_0:P0_0 | 18.045 | FOXD2 | 8.810 | |||
T2:C0_2:P0_2 | 5.100 | Pparg::Rxra | 5.505 | |||
T2:P0_5 | 2.299 | ZFP57 | 3.596 | |||
T2:C0_6 | 1.835 | MEIS2(var.2) | 3.568 | |||
N/A | 0.000 | N/A | E2F6 | 3.903 | ||
N/A | 0.000 | N/A | ZBTB14 | 4.322 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:C0_0:P0_0 | 12.626 | FOXD2 | 7.012 | |||
T3:P0_3 | 10.932 | HNF4G | 6.192 | |||
T3:P0_5 | 2.510 | MYC | 3.066 | |||
T3:C0_9 | 1.923 | Zic2 | 3.686 | |||
N/A | 0.000 | N/A | ZNF148 | 6.178 | ||
N/A | 0.000 | N/A | ZNF341 | 3.533 | ||
N/A | 0.000 | N/A | KLF15 | 5.853 | ||
N/A | 0.000 | N/A | KLF10 | 3.577 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:P0_2 | 2.771 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:C0_3 | 2.632 | N/A | 0.000 | N/A | ||
T3:C0_0:P0_0 | 4.642 | Foxo1 | 3.161 | |||
T3:C0_5 | 11.139 | HNF1B | 9.059 | |||
T3:P0_6 | 4.148 | FOSL1::JUND | 3.684 | |||
T3:P0_1 | 6.502 | FOXD1 | 6.678 | |||
T3:P0_8 | 7.023 | Pparg::Rxra | 7.583 | |||
T3:P0_3 | 3.013 | NR2F2 | 3.632 | |||
T3:C0_4 | 8.122 | CEBPA | 10.237 | |||
T3:C0_7 | 2.344 | HNF4A | 3.296 | |||
T3:C0_0:P0_0 | 2.411 | RBPJ | 3.974 | |||
T3:C0_1 | 2.041 | ZNF384 | 4.234 | |||
T3:C0_1 | 2.519 | Stat2 | 6.518 | |||
N/A | 0.000 | N/A | GATA4 | 3.755 | ||
N/A | 0.000 | N/A | KLF9 | 3.540 | ||
N/A | 0.000 | N/A | MYOG | 4.229 | ||
N/A | 0.000 | N/A | KLF5 | 8.943 | ||
N/A | 0.000 | N/A | Nr2f6(var.2) | 3.422 | ||
N/A | 0.000 | N/A | ZNF24 | 7.158 | ||
N/A | 0.000 | N/A | ZNF317 | 3.443 | ||
N/A | 0.000 | N/A | ZKSCAN5 | 3.382 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
T3:P0_8 | 1.955 | N/A | 0.000 | N/A | ||
T3:C0_5 | 1.974 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:C0_0:P0_0 | 18.484 | FOXD2 | 8.700 | |||
T3:P0_3 | 5.746 | Pparg::Rxra | 5.535 | |||
N/A | 0.000 | N/A | MAZ | 4.653 | ||
N/A | 0.000 | N/A | FOXH1 | 3.519 |
# For each task, plot (cumulative) distribution of similarities
for task_index in range(len(tfm_motif_pfms)):
fig, ax = plt.subplots(figsize=(8, 5))
memechip_keys = memechip_motifs[task_index].keys()
ax.plot(
np.arange(len(memechip_tfm_sims[task_index])),
np.sort([memechip_tfm_sims[task_index][k][0] for k in memechip_keys]),
label="MEMEChIP"
)
homer_keys = homer_motifs[task_index].keys()
ax.plot(
np.arange(len(homer_tfm_sims[task_index])),
np.sort([homer_tfm_sims[task_index][k][0] for k in homer_keys]),
label="HOMER"
)
dichipmunk_keys = dichipmunk_motifs[task_index].keys()
ax.plot(
np.arange(len(dichipmunk_tfm_sims[task_index])),
np.sort([dichipmunk_tfm_sims[task_index][k][0] for k in dichipmunk_keys]),
label="DiChIPMunk"
)
ax.legend()
ax.set_xlabel("Motif rank (by max similarity to TF-MoDISco motif)")
ax.set_ylabel("Maximum similarity to a TF-MoDISco motif")
ax.set_title("Motif benchmark reproducibility: task %d" % task_index)
plt.savefig(
os.path.join(out_path, "%s_task%d_tfm_similarity_vs_rank.svg" % (tf_name, task_index)),
format="svg"
)
# For each task, show the most and least reproducible motifs for each benchmark
num_to_show = 5
cols, heads = [], []
for _ in range(3):
cols.append(vdomh.col(style={"width": "%.2f%%" % (10 / 3)}))
cols.append(vdomh.col(style={"width": "%.2f%%" % (10 / 3)}))
cols.append(vdomh.col(style={"width": "%.2f%%" % (80 / 3)}))
heads.append(vdomh.th("TF-MoDISco similarity", style={"text-align": "center"}))
heads.append(vdomh.th("TF-MoDISco key", style={"text-align": "center"}))
heads.append(vdomh.th("Motif", style={"text-align": "center"}))
colgroup = vdomh.colgroup(*cols)
header = vdomh.thead(heads)
subheader = vdomh.tr(
vdomh.td(vdomh.b("MEMEChIP"), colspan="3", style={"text-align": "center"}),
vdomh.td(vdomh.b("HOMER"), colspan="3", style={"text-align": "center"}),
vdomh.td(vdomh.b("DiChIPMunk"), colspan="3", style={"text-align": "center"}),
)
for task_index in range(len(tfm_motif_pfms)):
display(vdomh.h3("Task %d" % task_index))
# Rank motif keys by similarity
memechip_keys = sorted(
memechip_database_sims[task_index].keys(),
key=lambda k: -memechip_tfm_sims[task_index][k][0]
)
homer_keys = sorted(
homer_database_sims[task_index].keys(),
key=lambda k: -homer_tfm_sims[task_index][k][0]
)
dichipmunk_keys = sorted(
dichipmunk_database_sims[task_index].keys(),
key=lambda k: -dichipmunk_tfm_sims[task_index][k][0]
)
display(vdomh.h4("Most reproducible motifs"))
rows = [subheader]
for i in range(num_to_show):
if i >= max([len(memechip_keys), len(homer_keys), len(dichipmunk_keys)]):
break
row = []
for key_list, motif_dict, sim_dict in [
(memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index]),
(homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index]),
(dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index])
]:
if i < len(key_list):
fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(motif_dict[key_list[i]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
fig.tight_layout()
row.extend([
vdomh.td("%.3f" % sim_dict[key_list[i]][0]),
vdomh.td(sim_dict[key_list[i]][1]),
vdomh.td(figure_to_vdom_image(fig))
])
else:
row.extend([vdomh.td(), vdomh.td(), vdomh.td()])
rows.append(vdomh.tr(*row))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
display(vdomh.h4("Least reproducible motifs"))
rows = [subheader]
for i in range(num_to_show):
if i >= max([len(memechip_keys), len(homer_keys), len(dichipmunk_keys)]) - num_to_show:
# Don't show motifs that have already been shown in the previous "most reproducible" table
break
row = []
for key_list, motif_dict, sim_dict in [
(memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index]),
(homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index]),
(dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index])
]:
if i < len(key_list) - num_to_show:
fig = viz_sequence.plot_weights(
read_motifs.pfm_to_pwm(motif_dict[key_list[len(key_list) - i - 1]]),
subticks_frequency=100, figsize=(20, 4), return_fig=True
)
fig.tight_layout()
row.extend([
vdomh.td("%.3f" % sim_dict[key_list[len(key_list) - i - 1]][0]),
vdomh.td(sim_dict[key_list[len(key_list) - i - 1]][1]),
vdomh.td(figure_to_vdom_image(fig))
])
else:
row.extend([vdomh.td(), vdomh.td(), vdomh.td()])
rows.append(vdomh.tr(*row))
display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
7.070 | T0:C0_1:P0_1 | 14.869 | T0:C0_5:P0_5 | 11.044 | T0:C0_0:P0_0 | |||
6.111 | T0:C0_0:P0_0 | 11.992 | T0:C0_11 | 5.792 | T0:C0_8:P0_2 | |||
5.064 | T0:C0_2:P0_7 | 6.721 | T0:C0_0:P0_0 | 1.681 | T0:C0_5:P0_5 | |||
3.641 | T0:C0_1:P0_1 | 3.293 | T0:C0_1:P0_1 | 0.000 | N/A | |||
3.136 | T0:C0_0:P0_0 | 3.171 | T0:C0_5:P0_5 | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
1.826 | T0:C0_5:P0_5 | 0.000 | N/A | 0.000 | N/A | |||
1.969 | T0:C0_5:P0_5 | 0.000 | N/A | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
12.520 | T1:C0_0:P0_0 | 9.041 | T1:C0_0:P0_0 | 9.545 | T1:C0_0:P0_0 | |||
5.399 | T1:P0_1 | 3.812 | T1:C0_2:P0_3 | 6.491 | T1:C0_5 | |||
4.529 | T1:C0_2:P0_3 | 3.773 | T1:C0_2:P0_3 | 1.815 | T1:C0_4 | |||
3.713 | T1:P0_4 | 3.502 | T1:P0_4 | 0.000 | N/A | |||
3.603 | T1:P0_1 | 2.292 | T1:C0_6 | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
1.919 | T1:C0_0:P0_0 | 0.000 | N/A | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
17.341 | T2:C0_0:P0_0 | 14.962 | T2:C0_0:P0_0 | 18.045 | T2:C0_0:P0_0 | |||
7.841 | T2:C0_5 | 13.161 | T2:C0_6 | 5.100 | T2:C0_2:P0_2 | |||
5.348 | T2:C0_3:P0_3 | 11.364 | T2:C0_4:P0_6 | 2.299 | T2:P0_5 | |||
2.315 | T2:C0_6 | 6.354 | T2:P0_1 | 1.835 | T2:C0_6 | |||
2.293 | T2:C0_7 | 2.992 | T2:P0_5 | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
12.626 | T3:C0_0:P0_0 | 11.139 | T3:C0_5 | 18.484 | T3:C0_0:P0_0 | |||
10.932 | T3:P0_3 | 8.122 | T3:C0_4 | 5.746 | T3:P0_3 | |||
2.510 | T3:P0_5 | 7.023 | T3:P0_8 | 1.974 | T3:C0_5 | |||
1.923 | T3:C0_9 | 6.502 | T3:P0_1 | 1.955 | T3:P0_8 | |||
0.000 | N/A | 4.642 | T3:C0_0:P0_0 | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A |