import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import motif.match_motifs as match_motifs
import motif.read_motifs as read_motifs
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, create_motif_similarity_matrix, purine_rich_motif
import tempfile
import h5py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
# `tqdm.tqdm_notebook` is deprecated (it emits a TqdmDeprecationWarning and is
# slated for removal in tqdm 5.0.0); `tqdm.notebook.tqdm` is the replacement.
# NOTE(review): this throwaway progress bar presumably just initializes the
# notebook progress widget before it is needed - confirm it is still required.
import tqdm.notebook
tqdm.notebook.tqdm(range(1))
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:17: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
<tqdm.notebook.tqdm_notebook at 0x7fe73e740050>
# Plotting defaults
# Register every font found in the custom font directory with Matplotlib's
# global font manager. `FontManager.addfont` replaces `createFontList`, which
# was deprecated in Matplotlib 3.2 and scheduled for removal.
for font_path in font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts"):
    font_manager.fontManager.addfont(font_path)

# Font sizes/family applied to every figure produced below
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold",
    # Keep text as text (not paths) in the saved SVGs
    "svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# The TF to analyze comes from the environment when set; otherwise fall back
# to E2F6
tf_name = os.environ.get("TFM_TF_NAME", "E2F6")

# All figures for this TF are written under this directory
out_path = "/users/amtseng/tfmodisco/figures/benchmark_reproducibility/%s_benchmark_reproducibility" % tf_name
os.makedirs(out_path, exist_ok=True)

# Number of tasks for each TF
tf_num_tasks = {
    "E2F6": 2,
    "FOXA2": 4,
    "SPI1": 4,
    "CEBPB": 7,
    "MAX": 7,
    "GABPA": 9,
    "MAFK": 9,
    "JUND": 14,
    "NR3C1-reddytime": 16,
    "REST": 20
}

# Per-task model type for each TF, one character per task; "M" selects the
# multi-task fine-tuned model and anything else the single-task one (see
# `import_tfmodisco_motifs`)
tf_best_model_types = {
    "E2F6": list("MM"),
    "FOXA2": list("SSMM"),
    "SPI1": list("MSSS"),
    "CEBPB": list("MMMMSMM"),
    "MAX": list("MMSMMSS"),
    "GABPA": list("MMMSMMMMM"),
    "MAFK": list("MMMMMMMMM"),
    "JUND": list("SMMSMSSSSSSSMS"),
    "NR3C1-reddytime": list("MMMSMMSMMMMSMMMM"),
    "REST": list("MMMMMMMMMSMMSMMSMMMM")
}

num_tasks = tf_num_tasks[tf_name]
best_model_types = tf_best_model_types[tf_name]

# Motif files produced by TF-MoDISco and by each benchmark method
tfm_motif_file = "/users/amtseng/tfmodisco/results/motifs/tfmodisco/%s_tfmodisco_cpmerged_motifs.h5" % tf_name
memechip_motif_file = "/users/amtseng/tfmodisco/results/motifs/memechip/%s_memechip_motifs.h5" % tf_name
homer_motif_file = "/users/amtseng/tfmodisco/results/motifs/homer/%s_homer_motifs.h5" % tf_name
dichipmunk_motif_file = "/users/amtseng/tfmodisco/results/motifs/dichipmunk/%s_dichipmunk_motifs.h5" % tf_name

# Reference motif database in MEME text format
motif_database_path = "/users/amtseng/tfmodisco/data/processed/motif_databases/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme.txt"
def renorm_motif(motif, pseudocount=1e-10):
    """
    Rescales each position (row) of a motif so that its entries sum to 1.
    Arguments:
        `motif`: an L x 4 array; every row must have a strictly positive sum
        `pseudocount`: accepted for interface compatibility, but currently
            not used in the computation
    Returns an array of the same shape as `motif`.
    """
    row_totals = np.sum(motif, axis=1, keepdims=True)
    # A row summing to zero (or less) cannot be turned into a distribution
    assert np.all(row_totals > 0)
    return np.divide(motif, row_totals)
def import_tfmodisco_motifs(motif_file, model_types, motif_type="cwm_trimmed"):
    """
    Reads the TF-MoDISco motifs of the fine-tuned models from an HDF5 file
    holding all motifs for a TF.
    Arguments:
        `motif_file`: path to the HDF5 file, which has the groups
            "multitask_finetune" and "singletask_finetune", each with one
            "task_<i>" group per task
        `model_types`: one entry per task; "M" reads that task from the
            multi-task group, and any other value from the single-task group
        `motif_type`: name of the dataset to read for each motif (e.g.
            "cwm_trimmed"); types starting with "pfm" are renormalized so
            each position sums to 1
    Only motifs whose key contains "0_" are kept. Each motif is passed
    through `purine_rich_motif` before being stored.
    Returns a list of dictionaries (one for each task), where each dictionary
    maps "T<task index>:<motif key>" to the motif.
    """
    motifs_by_task = []
    with h5py.File(motif_file, "r") as h5_file:
        multitask_group = h5_file["multitask_finetune"]
        singletask_group = h5_file["singletask_finetune"]
        for task_index, model_type in enumerate(model_types):
            source = multitask_group if model_type == "M" else singletask_group
            task_group = source["task_%d" % task_index]

            task_motifs = {}
            for motif_key in task_group.keys():
                if "0_" not in motif_key:
                    # Only keep motifs that are (or are constructed from)
                    # the positive metacluster only
                    continue
                matrix = task_group[motif_key][motif_type][:]
                if motif_type.startswith("pfm"):
                    matrix = renorm_motif(matrix)
                task_motifs["T%d:%s" % (task_index, motif_key)] = purine_rich_motif(matrix)
            motifs_by_task.append(task_motifs)
    return motifs_by_task
def import_classic_benchmark_motifs(motif_file, mode):
    """
    From a file containing all motifs for that TF from a benchmark
    method, imports the PFMs of the motifs for each task.
    Arguments:
        `motif_file`: path to the HDF5 file with one "task_<i>" group per
            task (a "task_agg" group, if present, is ignored)
        `mode`: which benchmark produced the file; one of "dichipmunk",
            "homer", or "memechip". This determines which key in each task
            group holds scores rather than a motif, and is skipped.
    Each PFM is renormalized so every position sums to 1, then passed
    through `purine_rich_motif`.
    Returns a list of dictionaries (one for each task, in increasing task
    index order), where each dictionary maps "T<task index>:<motif key>" to
    the motif.
    Raises a ValueError if `mode` is not one of the supported benchmarks.
    """
    score_keys = {
        "dichipmunk": "supporting_seqs",
        "homer": "log_enrichment",
        "memechip": "evalue",
    }
    try:
        score_key = score_keys[mode]
    except KeyError:
        # Fail before touching the file, rather than hitting an unbound
        # `score_key` part-way through reading it
        raise ValueError(
            "Unknown benchmark mode %r; expected one of %s" % (
                mode, ", ".join(sorted(score_keys))
            )
        ) from None

    motifs = []
    with h5py.File(motif_file, "r") as f:
        # Group names look like "task_<i>"; strip the "task_" prefix
        tasks = sorted(int(key[5:]) for key in f.keys() if key != "task_agg")
        for i in tasks:
            dset = f["task_%d" % i]
            motifs.append({
                "T%d:%s" % (i, motif_key): purine_rich_motif(renorm_motif(dset[motif_key][:]))
                for motif_key in dset.keys()
                if motif_key != score_key
            })
    return motifs
def import_database_pfms(database_path):
    """
    Parses a motif database text file into a dictionary of PFMs.
    Every line starting with "MOTIF" begins a record: its second
    whitespace-separated token is the motif ID, the line after it is a
    header whose sixth token is the motif width, and the following `width`
    lines are the rows of the matrix (4 values each).
    Each PFM is renormalized so every position sums to 1 and passed through
    `purine_rich_motif`. The dictionary key is the part of the motif ID
    between its first and second underscore.
    Returns a dictionary mapping shortened motif IDs to NumPy arrays.
    """
    pfms = {}
    with open(database_path, "r") as handle:
        try:
            # `next` raises StopIteration at end of file, which is what
            # terminates this loop (including in the middle of a record)
            while True:
                current = next(handle)
                if not current.startswith("MOTIF"):
                    continue
                motif_id = current.strip().split()[1]
                width = int(next(handle).split()[5])
                pfm = np.empty((width, 4))
                for row_index in range(width):
                    tokens = next(handle).strip().split()
                    pfm[row_index] = np.array([float(token) for token in tokens])
                # Add the motif with a shortened key
                short_id = motif_id.split("_")[1]
                pfms[short_id] = purine_rich_motif(renorm_motif(pfm))
        except StopIteration:
            pass
    return pfms
def get_closest_tomtom_motif_similarities(query_dict, target_dict):
    """
    From a dictionary mapping N motif keys to query motifs, and a
    dictionary mapping M motif keys to target motifs, returns a
    dictionary mapping the N query motif keys to the similarity and
    key of the closest target motif (a pair). Similarity is the
    -log10(p) from TOMTOM, taken from the match with the smallest p-value.
    A query with no reported match maps to `(0, "N/A")`.
    If `query_dict` is empty, an empty dictionary is returned; if
    `target_dict` is empty, every query maps to `(0, "N/A")`. TOMTOM is not
    run in either case.
    """
    if not query_dict:
        return {}
    if not target_dict:
        # Nothing to match against, so no query has a closest target
        return {query_key: (0, "N/A") for query_key in query_dict}

    query_keys, query_pfms = zip(*query_dict.items())
    target_keys, target_pfms = zip(*target_dict.items())

    # Do all work in a temporary directory that is removed on exit, even if
    # exporting motifs or running TOMTOM raises
    with tempfile.TemporaryDirectory() as temp_dir:
        # Convert motifs to MEME format
        query_motif_file = os.path.join(temp_dir, "query_motifs.txt")
        target_motif_file = os.path.join(temp_dir, "target_motifs.txt")
        match_motifs.export_pfms_to_meme_format(query_pfms, query_motif_file)
        match_motifs.export_pfms_to_meme_format(target_pfms, target_motif_file)

        # Run TOMTOM
        tomtom_dir = os.path.join(temp_dir, "tomtom")
        match_motifs.run_tomtom(
            query_motif_file, target_motif_file, tomtom_dir,
            show_output=False
        )

        # Find results, mapping each query motif to target index
        # The query/target IDs are the indices
        tomtom_table = match_motifs.import_tomtom_results(tomtom_dir)
        matches = []
        for i in range(len(query_pfms)):
            rows = tomtom_table[tomtom_table["Query_ID"] == i]
            if rows.empty:
                matches.append((0, "N/A"))
                continue
            min_row = rows.loc[rows["p-value"].idxmin()]
            score = -np.log10(min_row["p-value"])
            # Coerce to a plain int so it is always a valid tuple index
            target_key = target_keys[int(min_row["Target_ID"])]
            matches.append((score, target_key))

    return dict(zip(query_keys, matches))
# TF-MoDISco motifs, both as CWMs and as PFMs (one dictionary per task)
tfm_motif_cwms = import_tfmodisco_motifs(
    tfm_motif_file, best_model_types, motif_type="cwm_trimmed"
)
tfm_motif_pfms = import_tfmodisco_motifs(
    tfm_motif_file, best_model_types, motif_type="pfm_trimmed"
)

# Motifs from each benchmark method (one dictionary per task)
memechip_motifs = import_classic_benchmark_motifs(memechip_motif_file, mode="memechip")
homer_motifs = import_classic_benchmark_motifs(homer_motif_file, mode="homer")
dichipmunk_motifs = import_classic_benchmark_motifs(dichipmunk_motif_file, mode="dichipmunk")

# Reference database motifs (a single dictionary shared by all tasks)
database_motifs = import_database_pfms(motif_database_path)
For each benchmark motif, compute the closest TF-MoDISco motif to it.
# Compute TF-MoDISco similarity dictionaries by task: each benchmark motif is
# matched against the TF-MoDISco PFMs of the same task index
memechip_tfm_sims = [
    get_closest_tomtom_motif_similarities(task_motifs, tfm_motif_pfms[task_index])
    for task_index, task_motifs in enumerate(memechip_motifs)
]
homer_tfm_sims = [
    get_closest_tomtom_motif_similarities(task_motifs, tfm_motif_pfms[task_index])
    for task_index, task_motifs in enumerate(homer_motifs)
]
dichipmunk_tfm_sims = [
    get_closest_tomtom_motif_similarities(task_motifs, tfm_motif_pfms[task_index])
    for task_index, task_motifs in enumerate(dichipmunk_motifs)
]

# Compute database similarity dictionaries by task: each benchmark motif is
# matched against the full motif database
memechip_database_sims = [
    get_closest_tomtom_motif_similarities(task_motifs, database_motifs)
    for task_motifs in memechip_motifs
]
homer_database_sims = [
    get_closest_tomtom_motif_similarities(task_motifs, database_motifs)
    for task_motifs in homer_motifs
]
dichipmunk_database_sims = [
    get_closest_tomtom_motif_similarities(task_motifs, database_motifs)
    for task_motifs in dichipmunk_motifs
]
# For each task, scatter every benchmark motif by its best TF-MoDISco
# similarity (x) against its best database similarity (y)
for task_index in range(len(tfm_motif_pfms)):
    fig, ax = plt.subplots(figsize=(8, 5))

    # One scatter series per benchmark, always in the same order so that
    # each benchmark keeps the same color across figures
    for bench_label, bench_motifs, bench_tfm_sims, bench_database_sims in (
        ("MEMEChIP", memechip_motifs, memechip_tfm_sims, memechip_database_sims),
        ("HOMER", homer_motifs, homer_tfm_sims, homer_database_sims),
        ("DiChIPMunk", dichipmunk_motifs, dichipmunk_tfm_sims, dichipmunk_database_sims),
    ):
        bench_keys = bench_motifs[task_index].keys()
        x_vals = [bench_tfm_sims[task_index][k][0] for k in bench_keys]
        y_vals = [bench_database_sims[task_index][k][0] for k in bench_keys]
        ax.scatter(x_vals, y_vals, label=bench_label)

    # Use the same range on both axes and draw the y = x diagonal
    x_lo, x_hi = ax.get_xlim()
    y_lo, y_hi = ax.get_ylim()
    lo, hi = min(x_lo, y_lo), max(x_hi, y_hi)
    ax.set_xlim(lo, hi)
    ax.set_ylim(lo, hi)
    ax.plot(
        [lo, hi], [lo, hi],
        color="black", linestyle="--", alpha=0.3, zorder=0
    )

    ax.legend()
    ax.set_xlabel("Maximum similarity to a TF-MoDISco motif")
    ax.set_ylabel("Maximum similarity to a JASPAR motif")
    ax.set_title("Motif benchmark reproducibility: task %d" % task_index)

    svg_name = "%s_task%d_database_vs_tfm_similarity.svg" % (tf_name, task_index)
    plt.savefig(os.path.join(out_path, svg_name), format="svg")
# For each task, show all motifs in order of similarity ratio
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "30"}),
    vdomh.col(style={"width": "5"}),
    vdomh.col(style={"width": "5"}),
    vdomh.col(style={"width": "20"}),
    vdomh.col(style={"width": "5"}),
    vdomh.col(style={"width": "5"}),
    vdomh.col(style={"width": "20"}),
)
header = vdomh.thead(
    vdomh.th("Motif", style={"text-align": "center"}),
    vdomh.th("Key", style={"text-align": "center"}),
    vdomh.th("Similarity", style={"text-align": "center"}),
    vdomh.th("Motif", style={"text-align": "center"}),
    vdomh.th("Key", style={"text-align": "center"}),
    vdomh.th("Similarity", style={"text-align": "center"}),
    vdomh.th("Motif", style={"text-align": "center"}),
)


def safe_div(a, b):
    """
    Returns `a / b`, or infinity when `b` is zero (or otherwise falsy).
    """
    return a / b if b else float("inf")


def _save_and_embed_motif_figure(fig, svg_path):
    """
    Saves a motif figure to `svg_path` as SVG, converts it to a vdom image,
    and closes it. Returns the vdom image.
    """
    fig.tight_layout()
    # Save this figure explicitly rather than whatever pyplot currently
    # considers the active figure
    fig.savefig(svg_path, format="svg")
    image = figure_to_vdom_image(fig)
    # Close immediately: a single table can need dozens of figures, and
    # leaving them all open until the table is done triggers Matplotlib's
    # "More than 20 figures have been opened" warning and wastes memory.
    # NOTE(review): assumes the vdom image does not need the figure to stay
    # open once `figure_to_vdom_image` has returned - confirm.
    plt.close(fig)
    return image


for task_index in range(len(tfm_motif_pfms)):
    display(vdomh.h3("Task %d" % task_index))

    for bench_type, motif_dict, tfm_sim_dict, database_sim_dict in [
        ("MEMEChIP", memechip_motifs[task_index], memechip_tfm_sims[task_index], memechip_database_sims[task_index]),
        ("HOMER", homer_motifs[task_index], homer_tfm_sims[task_index], homer_database_sims[task_index]),
        ("DiChIPMunk", dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index], dichipmunk_database_sims[task_index])
    ]:
        # Rank motif keys by decreasing similarity ratio of TF-MoDISco / database
        key_list = sorted(
            database_sim_dict.keys(),
            key=lambda k: -safe_div(tfm_sim_dict[k][0], database_sim_dict[k][0])
        )

        subheader = vdomh.tr(
            vdomh.td(vdomh.b(bench_type), colspan="1", style={"text-align": "center"}),
            vdomh.td(vdomh.b("TF-MoDISco"), colspan="3", style={"text-align": "center"}),
            vdomh.td(vdomh.b("Database"), colspan="3", style={"text-align": "center"}),
        )
        rows = [subheader]

        for i, motif_key in enumerate(key_list):
            tfm_score, tfm_key = tfm_sim_dict[motif_key]
            database_score, database_key = database_sim_dict[motif_key]

            # The benchmark motif itself
            bench_fig = viz_sequence.plot_weights(
                read_motifs.pfm_to_pwm(motif_dict[motif_key]),
                subticks_frequency=100, figsize=(20, 4), return_fig=True
            )
            bench_fig_cell = _save_and_embed_motif_figure(
                bench_fig,
                os.path.join(out_path, "%s_task%d_%s_motif_%d.svg" % (tf_name, task_index, bench_type.lower(), i))
            )

            # Its closest TF-MoDISco motif (shown as a CWM), if any
            if tfm_key == "N/A":
                tfm_fig_cell = "N/A"
            else:
                tfm_fig = viz_sequence.plot_weights(
                    tfm_motif_cwms[task_index][tfm_key],
                    subticks_frequency=100, figsize=(20, 4), return_fig=True
                )
                tfm_fig_cell = _save_and_embed_motif_figure(
                    tfm_fig,
                    os.path.join(out_path, "%s_task%d_%s_tfmmatch_%d.svg" % (tf_name, task_index, bench_type.lower(), i))
                )

            # Its closest database motif, if any
            if database_key == "N/A":
                database_fig_cell = "N/A"
            else:
                database_fig = viz_sequence.plot_weights(
                    read_motifs.pfm_to_pwm(database_motifs[database_key]),
                    subticks_frequency=100, figsize=(20, 4), return_fig=True
                )
                database_fig_cell = _save_and_embed_motif_figure(
                    database_fig,
                    os.path.join(out_path, "%s_task%d_%s_dbmatch_%d.svg" % (tf_name, task_index, bench_type.lower(), i))
                )

            rows.append(vdomh.tr(
                vdomh.td(bench_fig_cell),
                vdomh.td(tfm_key),
                vdomh.td("%.3f" % tfm_score),
                vdomh.td(tfm_fig_cell),
                vdomh.td(database_key),
                vdomh.td("%.3f" % database_score),
                vdomh.td(database_fig_cell)
            ))

        display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
        # Safety net in case any figure was left open
        plt.close("all")
/users/amtseng/tfmodisco/src/plot/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
T0:C0_5 | 1.635 | N/A | 0.000 | N/A | ||
T0:P0_5 | 10.707 | ZNF384 | 5.024 | |||
T0:C0_0:P0_0 | 21.336 | SPIB | 14.838 | |||
T0:P0_4 | 5.061 | ETV1 | 5.234 | |||
T0:P0_4 | 3.204 | Stat2 | 3.584 | |||
T0:C0_2 | 2.098 | KLF9 | 3.650 | |||
T0:C0_4 | 2.109 | KLF4 | 5.125 | |||
N/A | 0.000 | N/A | KLF15 | 8.850 | ||
N/A | 0.000 | N/A | ZBTB33 | 3.591 | ||
N/A | 0.000 | N/A | CTCF | 7.818 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
T0:C0_3 | 1.864 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:P0_2 | 2.196 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_2 | 2.873 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_1 | 1.961 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_2 | 3.121 | N/A | 0.000 | N/A | ||
T0:P0_5 | 11.000 | ZNF384 | 5.262 | |||
T0:C0_0:P0_0 | 15.234 | SPIB | 10.475 | |||
T0:C0_4 | 2.428 | Zfx | 3.841 | |||
T0:C0_5 | 2.206 | REL | 3.821 | |||
T0:C0_2 | 1.731 | KLF9 | 3.573 | |||
T0:C0_5 | 1.746 | MYOG | 5.131 | |||
N/A | 0.000 | N/A | ZNF24 | 5.234 | ||
N/A | 0.000 | N/A | FOXB1 | 3.266 | ||
N/A | 0.000 | N/A | TFAP2C | 3.725 | ||
N/A | 0.000 | N/A | ZNF384 | 4.684 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T0:C0_0:P0_0 | 15.858 | SPIB | 10.680 | |||
T0:P0_1 | 2.052 | SP1 | 3.128 | |||
T0:P0_4 | 2.658 | Stat2 | 4.339 | |||
T0:P0_5 | 2.218 | SOX15 | 4.173 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
T1:C0_0:P0_0 | 25.106 | SPI1 | 9.557 | |||
T1:C0_0:P0_0 | 7.437 | ELF1 | 5.492 | |||
T1:C0_4 | 5.676 | PRDM1 | 5.243 | |||
T1:P0_5 | 5.493 | ZNF148 | 5.514 | |||
T1:P0_6 | 1.697 | GATA4 | 4.124 | |||
T1:C0_4 | 2.328 | KLF5 | 6.264 | |||
N/A | 0.000 | N/A | JUND(var.2) | 3.218 | ||
N/A | 0.000 | N/A | ZBTB33 | 5.489 | ||
N/A | 0.000 | N/A | TWIST1 | 4.547 | ||
N/A | 0.000 | N/A | CTCF | 9.325 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_3 | 1.990 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:P0_4 | 3.158 | N/A | 0.000 | N/A | ||
T1:P0_4 | 1.428 | N/A | 0.000 | N/A | ||
T1:C0_3 | 2.156 | N/A | 0.000 | N/A | ||
T1:C0_3 | 1.907 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:P0_3 | 12.861 | ZNF384 | 5.263 | |||
T1:C0_0:P0_0 | 16.601 | SPIB | 11.578 | |||
T1:P0_2 | 3.297 | PRDM1 | 5.685 | |||
T1:P0_7 | 1.977 | TFAP2B(var.2) | 3.507 | |||
T1:P0_5 | 2.504 | Zfx | 4.898 | |||
T1:P0_3 | 2.322 | ZNF384 | 4.671 | |||
N/A | 0.000 | N/A | EBF1 | 3.469 | ||
N/A | 0.000 | N/A | KLF9 | 3.549 | ||
N/A | 0.000 | N/A | ZBTB33 | 4.166 | ||
N/A | 0.000 | N/A | NRF1 | 6.905 | ||
N/A | 0.000 | N/A | KLF5 | 8.144 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_0:P0_0 | 1.974 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T1:C0_2 | 7.469 | ZKSCAN5 | 5.709 | |||
T1:C0_0:P0_0 | 14.967 | SPIB | 11.886 | |||
T1:P0_3 | 2.253 | RUNX3 | 3.988 | |||
N/A | 0.000 | N/A | Sox17 | 3.248 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:P0_5 | 2.372 | N/A | 0.000 | N/A | ||
T2:P0_5 | 1.638 | N/A | 0.000 | N/A | ||
T2:P0_4 | 2.244 | N/A | 0.000 | N/A | ||
T2:C0_0:P0_0 | 23.237 | SPIB | 18.665 | |||
T2:C0_0:P0_0 | 6.128 | Stat2 | 6.066 | |||
T2:C0_9 | 3.200 | ZNF384 | 5.120 | |||
T2:P0_8 | 2.352 | KLF15 | 5.549 | |||
N/A | 0.000 | N/A | FOSB::JUN | 3.165 | ||
N/A | 0.000 | N/A | KLF15 | 6.193 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:C0_6 | 2.356 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:C0_0:P0_0 | 16.949 | SPIB | 12.142 | |||
T2:P0_4 | 4.059 | SMAD2::SMAD3::SMAD4 | 4.279 | |||
T2:C0_9 | 3.017 | ZNF384 | 5.235 | |||
T2:P0_2 | 1.613 | KLF9 | 3.546 | |||
T2:C0_6 | 1.812 | ZNF24 | 4.327 | |||
T2:C0_6 | 2.472 | ZNF24 | 8.324 | |||
N/A | 0.000 | N/A | Zfx | 4.138 | ||
N/A | 0.000 | N/A | ZNF460 | 4.422 | ||
N/A | 0.000 | N/A | STAT3 | 3.319 | ||
N/A | 0.000 | N/A | CTCF | 5.599 | ||
N/A | 0.000 | N/A | ESRRA | 5.222 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:P0_1 | 2.229 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T2:C0_0:P0_0 | 15.130 | SPIB | 10.905 | |||
T2:C0_1 | 6.481 | Stat2 | 7.592 | |||
T2:C0_8 | 2.827 | GFI1 | 3.540 | |||
N/A | 0.000 | N/A | HNF1A | 3.832 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
MEMEChIP | TF-MoDISco | Database | ||||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:P0_6 | 7.029 | ZNF384 | 3.481 | |||
T3:C0_0:P0_0 | 20.568 | SPI1 | 10.909 | |||
T3:C0_2 | 7.213 | SPIB | 5.686 | |||
T3:C0_2 | 5.973 | ZKSCAN5 | 5.137 | |||
T3:P0_5 | 2.983 | NFATC2 | 3.870 | |||
T3:P0_3 | 1.907 | ZNF274 | 3.043 | |||
T3:C0_2 | 3.620 | ZNF148 | 5.805 | |||
N/A | 0.000 | N/A | ZBTB33 | 5.168 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
HOMER | TF-MoDISco | Database | ||||
T3:P0_2 | 2.339 | N/A | 0.000 | N/A | ||
T3:P0_5 | 1.603 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:C0_0:P0_0 | 1.576 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:P0_6 | 12.061 | ZNF384 | 5.232 | |||
T3:C0_0:P0_0 | 12.156 | SPIB | 9.033 | |||
T3:C0_2 | 5.406 | ZNF263 | 4.651 | |||
T3:P0_2 | 2.379 | Zfx | 4.798 | |||
T3:P0_6 | 1.756 | ZNF384 | 3.742 | |||
N/A | 0.000 | N/A | SMAD2::SMAD3::SMAD4 | 3.521 | ||
N/A | 0.000 | N/A | KLF9 | 3.582 | ||
N/A | 0.000 | N/A | ZBTB33 | 3.518 |
Motif | Key | Similarity | Motif | Key | Similarity | Motif |
---|---|---|---|---|---|---|
DiChIPMunk | TF-MoDISco | Database | ||||
T3:P0_5 | 2.202 | N/A | 0.000 | N/A | ||
T3:C0_0:P0_0 | 1.707 | N/A | 0.000 | N/A | ||
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
N/A | 0.000 | N/A | N/A | 0.000 | N/A | |
T3:P0_2 | 1.279 | N/A | 0.000 | N/A | ||
T3:C0_0:P0_0 | 13.429 | SPIB | 8.097 | |||
T3:C0_0:P0_0 | 9.587 | SPIB | 10.458 | |||
T3:C0_3 | 1.861 | Nkx3-2 | 3.346 | |||
T3:C0_2 | 1.609 | TFDP1 | 3.311 |
# For each task, plot the sorted (ascending) TF-MoDISco similarities of each
# benchmark's motifs against their rank
for task_index in range(len(tfm_motif_pfms)):
    fig, ax = plt.subplots(figsize=(8, 5))

    # One line per benchmark, always in the same order so that each
    # benchmark keeps the same color across figures
    for bench_label, bench_motifs, bench_tfm_sims in (
        ("MEMEChIP", memechip_motifs, memechip_tfm_sims),
        ("HOMER", homer_motifs, homer_tfm_sims),
        ("DiChIPMunk", dichipmunk_motifs, dichipmunk_tfm_sims),
    ):
        task_sims = bench_tfm_sims[task_index]
        ranks = np.arange(len(task_sims))
        sorted_sims = np.sort([task_sims[k][0] for k in bench_motifs[task_index].keys()])
        ax.plot(ranks, sorted_sims, label=bench_label)

    ax.legend()
    ax.set_xlabel("Motif rank (by max similarity to TF-MoDISco motif)")
    ax.set_ylabel("Maximum similarity to a TF-MoDISco motif")
    ax.set_title("Motif benchmark reproducibility: task %d" % task_index)

    svg_name = "%s_task%d_tfm_similarity_vs_rank.svg" % (tf_name, task_index)
    plt.savefig(os.path.join(out_path, svg_name), format="svg")
# For each task, show the most and least reproducible motifs for each benchmark
num_to_show = 5

# Three column triplets (similarity, key, motif), one triplet per benchmark
cols, heads = [], []
for _ in range(3):
    cols.extend(
        vdomh.col(style={"width": "%.2f%%" % width})
        for width in (10 / 3, 10 / 3, 80 / 3)
    )
    heads.extend(
        vdomh.th(title, style={"text-align": "center"})
        for title in ("TF-MoDISco similarity", "TF-MoDISco key", "Motif")
    )
colgroup = vdomh.colgroup(*cols)
header = vdomh.thead(heads)
subheader = vdomh.tr(
    vdomh.td(vdomh.b("MEMEChIP"), colspan="3", style={"text-align": "center"}),
    vdomh.td(vdomh.b("HOMER"), colspan="3", style={"text-align": "center"}),
    vdomh.td(vdomh.b("DiChIPMunk"), colspan="3", style={"text-align": "center"}),
)


def _benchmark_motif_cells(motif_dict, sim_dict, motif_key):
    """
    Builds the three table cells (similarity, closest TF-MoDISco key, motif
    image) for one benchmark motif.
    """
    fig = viz_sequence.plot_weights(
        read_motifs.pfm_to_pwm(motif_dict[motif_key]),
        subticks_frequency=100, figsize=(20, 4), return_fig=True
    )
    fig.tight_layout()
    return [
        vdomh.td("%.3f" % sim_dict[motif_key][0]),
        vdomh.td(sim_dict[motif_key][1]),
        vdomh.td(figure_to_vdom_image(fig))
    ]


for task_index in range(len(tfm_motif_pfms)):
    display(vdomh.h3("Task %d" % task_index))

    # Rank motif keys by similarity (most similar first)
    memechip_keys = sorted(
        memechip_database_sims[task_index].keys(),
        key=lambda k: -memechip_tfm_sims[task_index][k][0]
    )
    homer_keys = sorted(
        homer_database_sims[task_index].keys(),
        key=lambda k: -homer_tfm_sims[task_index][k][0]
    )
    dichipmunk_keys = sorted(
        dichipmunk_database_sims[task_index].keys(),
        key=lambda k: -dichipmunk_tfm_sims[task_index][k][0]
    )
    most_keys = max([len(memechip_keys), len(homer_keys), len(dichipmunk_keys)])

    display(vdomh.h4("Most reproducible motifs"))
    rows = [subheader]
    for rank in range(min(num_to_show, most_keys)):
        cells = []
        for key_list, motif_dict, sim_dict in [
            (memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index]),
            (homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index]),
            (dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index])
        ]:
            if rank < len(key_list):
                cells.extend(_benchmark_motif_cells(motif_dict, sim_dict, key_list[rank]))
            else:
                # This benchmark has run out of motifs; pad with empty cells
                cells.extend([vdomh.td(), vdomh.td(), vdomh.td()])
        rows.append(vdomh.tr(*cells))
    display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
    plt.close("all")

    display(vdomh.h4("Least reproducible motifs"))
    rows = [subheader]
    # Don't show motifs that have already been shown in the previous
    # "most reproducible" table: only ranks beyond the first `num_to_show`
    # are eligible (an empty range if there are none)
    for rank in range(min(num_to_show, most_keys - num_to_show)):
        cells = []
        for key_list, motif_dict, sim_dict in [
            (memechip_keys, memechip_motifs[task_index], memechip_tfm_sims[task_index]),
            (homer_keys, homer_motifs[task_index], homer_tfm_sims[task_index]),
            (dichipmunk_keys, dichipmunk_motifs[task_index], dichipmunk_tfm_sims[task_index])
        ]:
            if rank < len(key_list) - num_to_show:
                # Walk backwards from the least similar motif
                cells.extend(_benchmark_motif_cells(
                    motif_dict, sim_dict, key_list[len(key_list) - rank - 1]
                ))
            else:
                cells.extend([vdomh.td(), vdomh.td(), vdomh.td()])
        rows.append(vdomh.tr(*cells))
    display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
    plt.close("all")
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
21.336 | T0:C0_0:P0_0 | 15.234 | T0:C0_0:P0_0 | 15.858 | T0:C0_0:P0_0 | |||
10.707 | T0:P0_5 | 11.000 | T0:P0_5 | 2.658 | T0:P0_4 | |||
5.061 | T0:P0_4 | 3.121 | T0:C0_2 | 2.218 | T0:P0_5 | |||
3.204 | T0:P0_4 | 2.873 | T0:C0_2 | 2.052 | T0:P0_1 | |||
2.109 | T0:C0_4 | 2.428 | T0:C0_4 | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
1.635 | T0:C0_5 | 0.000 | N/A | 0.000 | N/A | |||
2.098 | T0:C0_2 | 0.000 | N/A | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
25.106 | T1:C0_0:P0_0 | 16.601 | T1:C0_0:P0_0 | 14.967 | T1:C0_0:P0_0 | |||
7.437 | T1:C0_0:P0_0 | 12.861 | T1:P0_3 | 7.469 | T1:C0_2 | |||
5.676 | T1:C0_4 | 3.297 | T1:P0_2 | 2.253 | T1:P0_3 | |||
5.493 | T1:P0_5 | 3.158 | T1:P0_4 | 1.974 | T1:C0_0:P0_0 | |||
2.328 | T1:C0_4 | 2.504 | T1:P0_5 | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
1.697 | T1:P0_6 | 0.000 | N/A | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
23.237 | T2:C0_0:P0_0 | 16.949 | T2:C0_0:P0_0 | 15.130 | T2:C0_0:P0_0 | |||
6.128 | T2:C0_0:P0_0 | 4.059 | T2:P0_4 | 6.481 | T2:C0_1 | |||
3.200 | T2:C0_9 | 3.017 | T2:C0_9 | 2.827 | T2:C0_8 | |||
2.372 | T2:P0_5 | 2.472 | T2:C0_6 | 2.229 | T2:P0_1 | |||
2.352 | T2:P0_8 | 2.356 | T2:C0_6 | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
1.638 | T2:P0_5 | 0.000 | N/A | 0.000 | N/A | |||
2.244 | T2:P0_4 | 0.000 | N/A | 0.000 | N/A |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
20.568 | T3:C0_0:P0_0 | 12.156 | T3:C0_0:P0_0 | 13.429 | T3:C0_0:P0_0 | |||
7.213 | T3:C0_2 | 12.061 | T3:P0_6 | 9.587 | T3:C0_0:P0_0 | |||
7.029 | T3:P0_6 | 5.406 | T3:C0_2 | 2.202 | T3:P0_5 | |||
5.973 | T3:C0_2 | 2.379 | T3:P0_2 | 1.861 | T3:C0_3 | |||
3.620 | T3:C0_2 | 2.339 | T3:P0_2 | 1.707 | T3:C0_0:P0_0 |
TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif | TF-MoDISco similarity | TF-MoDISco key | Motif |
---|---|---|---|---|---|---|---|---|
MEMEChIP | HOMER | DiChIPMunk | ||||||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
0.000 | N/A | 0.000 | N/A | 0.000 | N/A | |||
1.907 | T3:P0_3 | 0.000 | N/A | 1.279 | T3:P0_2 | |||
2.983 | T3:P0_5 | 0.000 | N/A | 1.609 | T3:C0_2 |