import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import motif.tfmodisco_hit_scoring as tfmodisco_hit_scoring
import motif.match_motifs as match_motifs
import motif.read_motifs as read_motifs
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, motif_similarity_score, create_motif_similarity_matrix, aggregate_motifs, aggregate_motifs_from_inds, purine_rich_motif
import tempfile
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.font_manager as font_manager
import scipy.signal
import scipy.cluster.hierarchy
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook(range(1))
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:22: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
<tqdm.notebook.tqdm_notebook at 0x7ff57821ff50>
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold",
"svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
if "TFM_TF_NAME" in os.environ:
tf_name = os.environ["TFM_TF_NAME"]
else:
tf_name = "SPI1"
# Manually define the clusters of core motifs
if tf_name == "E2F6":
core_motif_defs = [
("MAX", ["T0:C0_0:P0_0", "T1:C0_0:P0_0"]),
("E2F6", ["T0:C0_1:P0_1", "T1:C0_1:P0_1"]),
("AP-1", ["T0:C0_5", "T0:P0_9", "T1:C0_7:P0_11"]),
("E2F v1", ["T0:C0_4", "T0:P0_4", "T0:P0_6", "T1:C0_3:P0_3"]),
("MAX v1", ["T1:C0_2:P0_7", "T1:C0_6:P0_2", "T0:C0_2:P0_2", "T1:C0_5", "T1:P0_4"])
]
elif tf_name == "SPI1":
core_motif_defs = [
("SPI1", ["T1:C0_0:P0_0", "T3:C0_0:P0_0", "T1:P0_7", "T0:C0_4", "T0:C0_0:P0_0", "T2:C0_0:P0_0", "T0:P0_5"]),
("SPI1 v1", ["T1:C0_4", "T1:P0_5", "T3:C0_2", "T1:P0_6"]),
("AP-1", ["T2:C0_5", "T1:C0_3", "T2:C0_6", "T2:C0_7", "T2:P0_7", "T2:C0_2"]),
("IRF", ["T0:C0_1", "T0:C0_3", "T0:P0_2", "T2:C0_1", "T2:C0_4", "T2:P0_4"]),
("RUNX", ["T2:C0_8", "T3:C0_3", "T3:P0_5", "T3:P0_7", "T2:P0_3"]),
("CEBP", ["T3:C0_1:P0_1"]),
("GATA", ["T1:C0_1"])
]
elif tf_name == "FOXA2":
core_motif_defs = [
("FOXA", ["T1:C0_0:P0_0", "T0:C0_0:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0"]),
("FOX v1", ["T1:P0_1", "T0:C0_1:P0_1", "T2:C0_1", "T2:C0_7", "T2:P0_7", "T3:C0_1", "T3:P0_2"]),
("FOX v2", ["T0:C0_3:P0_4", "T1:C0_3:P0_5", "T3:C0_3", "T2:P0_1", "T3:P0_1"]),
("HNF4", ["T0:C0_2:P0_7", "T0:C0_7", "T0:C0_8:P0_2", "T1:C0_1:P0_2", "T1:C0_5", "T1:C0_6", "T1:P0_6", "T1:P0_7", "T2:C0_2:P0_2", "T2:C0_5", "T2:C0_6", "T2:C0_8", "T3:C0_2", "T3:C0_6", "T3:C0_7", "T3:P0_3", "T3:P0_8", "T0:C0_12", "T0:C0_11"]),
("HNF1", ["T0:C0_6:P0_9", "T2:C0_4:P0_6", "T3:C0_5", "T3:P0_7"]),
("CEBP", ["T0:C0_4:P0_3", "T1:C0_4", "T1:P0_4", "T2:C0_3:P0_3", "T2:C0_9", "T3:C0_4", "T3:C0_8", "T3:P0_4", "T3:C0_10"]),
("AP-1", ["T0:C0_9:P0_8", "T1:C0_2:P0_3", "T2:P0_5", "T3:P0_6"]),
("FOX dimer", ["T3:C0_9"]),
("CTCF", ["T0:C0_5:P0_5"])
]
elif tf_name == "CEBPB":
core_motif_defs = [
("CEBP", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T6:C0_0:P0_0"]),
("FOXA", ["T0:C0_9", "T0:C0_2:P0_2", "T0:C0_7:P0_8", "T0:C0_8", "T3:C0_1:P0_1"]),
("FOXB", ["T0:C0_5:P0_7", "T3:C0_6"]),
("HNF4", ["T0:P0_14", "T0:P0_11", "T0:C0_1:P0_1", "T0:P0_13", "T2:C0_6", "T2:P0_7", "T3:C0_2:P0_4", "T3:C0_5"]),
("ATF/CEBP", ["T6:C0_1"]),
("AP-1", ["T1:C0_5", "T0:C0_10", "T5:P0_13", "T0:C0_4", "T0:P0_4", "T1:C0_2:P0_3", "T2:C0_1:P0_1", "T2:C0_4", "T2:C0_5", "T2:P0_5", "T3:C0_3:P0_2", "T4:C0_1:P0_9", "T4:P0_1", "T4:P0_4", "T4:P0_5", "T4:P0_6", "T4:P0_7", "T4:P0_8", "T5:C0_3:P0_2", "T6:P0_2", "T2:C0_7"]),
("NFE", ["T5:P0_12"]),
("TEAD", ["T4:P0_10", "T4:C0_2", "T4:C0_3", "T4:P0_11", "T4:P0_2"]),
("TEAD v1", ["T4:C0_4", "T4:P0_3"]),
("GATA", ["T1:C0_1", "T1:P0_1", "T1:P0_9", "T5:C0_1:P0_1"]),
("CTCF", ["T0:C0_3:P0_3", "T1:C0_3:P0_2", "T1:C0_4", "T1:P0_11", "T2:C0_2:P0_2", "T3:C0_4:P0_5", "T5:C0_2:P0_3", "T6:C0_2:P0_1", "T3:C0_7", "T5:C0_4", "T0:C0_6"])
]
elif tf_name == "MAX":
core_motif_defs = [
("MAX", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T2:C0_1:P0_5", "T2:C0_5", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_2:P0_2", "T6:C0_0:P0_0"]),
("weak MAX", ["T6:P0_8", "T1:P0_5", "T1:P0_14", "T2:P0_3", "T3:P0_4", "T3:P0_5", "T6:C0_2:P0_2", "T1:P0_1", "T3:P0_1", "T3:P0_3"]),
("CTCF", ["T4:C0_2", "T4:P0_2", "T0:C0_2:P0_1", "T0:P0_2", "T1:C0_3:P0_2", "T3:C0_2:P0_2", "T4:C0_1:P0_1", "T6:C0_3:P0_3", "T6:P0_6"]),
("HNF4", ["T2:C0_2:P0_1", "T2:C0_3", "T2:C0_4", "T5:C0_1:P0_0", "T5:P0_7", "T6:C0_1:P0_1"]),
("CEBP", ["T5:C0_0:P0_1", "T6:C0_4:P0_11", "T6:P0_10", "T6:P0_13"]),
("FOX", ["T2:P0_2", "T5:C0_3", "T5:C0_6", "T6:C0_7:P0_12", "T6:C0_8"]),
("ELK-like", ["T6:P0_7", "T5:C0_5", "T6:C0_5", "T6:P0_4", "T6:P0_9"]),
("SP1-like", ["T6:C0_6:P0_5"]),
("GC repeat", ["T0:C0_1", "T1:C0_1:P0_6", "T1:C0_2", "T1:C0_4", "T1:C0_5", "T1:C0_6", "T1:C0_7", "T1:P0_10", "T1:P0_4", "T1:P0_8", "T1:P0_9", "T3:C0_1", "T5:C0_4", "T5:P0_3", "T5:P0_5", "T5:P0_6", "T1:P0_11", "T1:P0_3", "T1:P0_7"])
]
elif tf_name == "GABPA":
core_motif_defs = [
("GABPA monomer", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T6:C0_0:P0_2", "T7:C0_0:P0_0", "T8:C0_0:P0_0", "T4:P0_9", "T1:C0_2", "T2:C0_3", "T2:P0_4", "T3:P0_2", "T4:C0_4:P0_4", "T7:P0_5", "T4:P0_11"]),
("weak GABPA monomer", ["T4:P0_2", "T7:P0_2", "T8:P0_3"]),
("GABPA dimer", ["T0:P0_2", "T1:C0_1:P0_1", "T2:C0_1", "T3:C0_2", "T4:C0_1", "T5:C0_1:P0_1", "T6:C0_1:P0_0", "T2:P0_8"]),
("GABPA dimer v1", ["T1:P0_8"]),
("GABPA dimer v2", ["T8:P0_10", "T7:P0_10"]),
("GABPA dimer v3", ["T5:P0_7"]),
("GABPA dimer v4", ["T8:P0_8"]),
("GABPA dimer v5", ["T8:P0_9"]),
("GABPA dimer v6", ["T5:P0_8"]),
("GABPA dimer v7", ["T7:P0_9", "T5:P0_6", "T7:C0_2"]),
("GABPA dimer v8", ["T0:C0_2"]),
("GABPA dimer v9", ["T4:C0_7"]),
("GABPA dimer v10", ["T8:P0_12"]),
("GABPA dimer v11", ["T8:P0_12"]),
("GABPA dimer v12", ["T1:C0_5"]),
("GABPA dimer v13", ["T8:P0_6"]),
("GABPA dimer v14", ["T4:C0_8"]),
("GABPA dimer v15", ["T5:P0_12"]),
("GABPA dimer v16", ["T6:P0_9", "T7:P0_13", "T8:P0_11"]),
("GABPA/ATF dimer", ["T7:P0_7", "T8:P0_5"]),
("GABPA/HNF4 dimer", ["T0:P0_7", "T1:C0_4", "T2:P0_9", "T4:C0_6", "T4:P0_12", "T7:P0_4"]),
("THAP11", ["T0:C0_1:P0_1", "T1:C0_3:P0_5", "T2:C0_2:P0_1", "T3:C0_3:P0_4", "T4:C0_2:P0_1", "T5:C0_2:P0_2", "T6:C0_2", "T6:P0_1", "T7:C0_1", "T7:P0_1", "T8:C0_1", "T8:P0_1"]),
("GATA", ["T3:C0_1:P0_1", "T3:P0_5"]),
("ATF", ["T4:C0_5", "T4:P0_3"]),
("GC repeat", ["T2:C0_4", "T5:P0_10", "T6:P0_3", "T7:P0_6", "T8:P0_2", "T0:P0_4", "T0:P0_5", "T1:P0_2", "T1:P0_3", "T1:P0_4", "T2:C0_5", "T2:P0_2", "T2:P0_3", "T2:P0_5", "T2:P0_6", "T2:P0_7", "T4:C0_3"])
]
elif tf_name == "MAFK":
core_motif_defs = [
("TMARE", ["T3:P0_12", "T0:C0_0:P0_0", "T1:C0_1:P0_1", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T3:C0_5", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T5:C0_10", "T5:P0_5", "T6:C0_1:P0_1", "T7:C0_0:P0_0", "T8:C0_0:P0_0"]),
("CMARE", ["T3:C0_1", "T4:C0_1:P0_1", "T5:C0_4"]),
("CsMBE", ["T4:P0_2", "T5:C0_2", "T5:C0_8", "T8:P0_1", "T2:C0_1:P0_1", "T5:P0_2", "T6:P0_10", "T8:C0_1", "T1:C0_0:P0_0", "T5:C0_1:P0_1", "T6:C0_0:P0_0", "T7:C0_1:P0_1"]),
("TRE", ["T2:P0_5", "T5:P0_10", "T0:P0_1", "T4:C0_2", "T3:C0_2:P0_1"]),
("CRE", ["T2:C0_4"]),
("CTCF", ["T5:C0_7:P0_4", "T6:C0_3:P0_3", "T2:C0_3:P0_3", "T1:C0_3:P0_3", "T8:C0_3:P0_2", "T2:P0_10", "T1:P0_4", "T6:P0_4", "T2:P0_4", "T0:C0_1:P0_2", "T1:C0_2:P0_2", "T1:C0_5:P0_6", "T2:C0_2:P0_2", "T2:P0_6", "T3:C0_4:P0_3", "T5:C0_3:P0_3", "T5:C0_9", "T6:C0_2:P0_2", "T7:C0_2:P0_3", "T7:C0_5", "T7:P0_8", "T8:C0_2:P0_3", "T8:C0_5", "T4:C0_5:P0_5", "T3:P0_10"]),
("MAF v1", ["T2:C0_5", "T5:C0_6", "T5:P0_9", "T6:C0_5:P0_9", "T8:C0_4:P0_5"]),
("MAF v2", ["T3:C0_3:P0_2", "T4:C0_4:P0_4", "T5:C0_5:P0_6", "T7:C0_3", "T7:P0_2"]),
("MAF v3", ["T3:P0_9", "T4:C0_3:P0_3", "T7:P0_6"]),
("MAF v4", ["T6:C0_4:P0_7", "T7:C0_8:P0_9"]),
("MAF v5", ["T0:P0_6", "T1:C0_4", "T1:P0_5", "T3:P0_6", "T4:C0_6", "T7:C0_6"]),
("MAF v6", ["T4:P0_14", "T5:P0_8", "T7:P0_4"]),
("MAF v7", ["T7:C0_4", "T7:P0_7", "T7:C0_7"])
]
elif tf_name == "JUND":
core_motif_defs = [
("TRE", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T3:P0_0", "T4:P0_0", "T5:C0_0", "T5:P0_0", "T6:P0_1", "T8:C0_0:P0_0", "T10:C0_0:P0_0", "T11:C0_0:P0_0", "T12:C0_0:P0_0", "T12:C0_8", "T13:C0_2:P0_2", "T3:C0_0", "T4:C0_0", "T9:C0_0:P0_0", "T3:P0_8", "T4:P0_4", "T6:C0_1", "T12:P0_3", "T12:P0_4", "T13:P0_10", "T4:P0_3"]),
("CRE", ["T4:C0_1", "T7:P0_3", "T9:C0_1:P0_1", "T0:C0_4:P0_3", "T1:C0_2:P0_1", "T2:C0_1:P0_1", "T3:C0_4", "T3:P0_4", "T4:P0_1", "T7:C0_1", "T8:C0_2:P0_2", "T10:C0_3:P0_2", "T10:P0_5", "T11:C0_1:P0_1", "T12:C0_1:P0_1", "T13:C0_5", "T13:P0_4"]),
("TRE/TRE/TRE", ["T3:C0_9"]),
("TRE/TRE/TEAD", ["T3:C0_10"]),
("TRE/TRE", ["T3:P0_10", "T3:C0_7", "T4:C0_5", "T2:C0_5"]),
("TRE/TRE v2", ["T2:C0_3:P0_6", "T4:C0_3", "T5:P0_3"]),
("TRE/TRE v1", ["T3:C0_6", "T4:C0_4", "T2:C0_4"]),
("TRE/TRE v3", ["T3:C0_8"]),
("TRE-CRE", ["T3:C0_11"]),
("TRE-TEAD", ["T8:C0_7:P0_6", "T11:C0_5", "T3:C0_3", "T3:C0_1:P1_1", "T3:C0_2", "T3:C0_5", "T0:C0_9", "T3:P0_1"]),
("TEAD-TRE", ["T13:C0_7:P0_7", "T13:P0_16"]),
("CRE-TEAD", ["T3:P0_7", "T6:P0_9", "T8:C0_3:P0_3", "T8:P0_7", "T11:C0_2:P0_2"]),
("TRE-IRF", ["T7:C0_0:P0_0", "T7:P0_1"]),
("TEAD", ["T3:P0_3", "T6:P0_5", "T6:P0_10"]),
("TEAD v1", ["T6:P0_8"]),
("TRE v1", ["T2:P0_7"]),
("TRE v2", ["T0:P0_11"]),
("CRE v1", ["T4:P0_5"]),
("IRF", ["T7:P0_4"]),
("HNF1", ["T0:C0_7", "T13:C0_4:P0_5"]),
("HNF4", ["T0:C0_12", "T0:C0_2:P0_1", "T0:C0_6", "T13:C0_0:P0_0", "T13:C0_9", "T13:P0_15"]),
("CEBPB", ["T12:P0_6", "T9:C0_3", "T9:P0_3"]),
("CEBPD", ["T0:C0_3:P0_2", "T13:C0_1", "T13:P0_1", "T0:C0_13", "T0:P0_14"]),
("FOXA", ["T5:C0_1:P0_1", "T5:C0_2", "T6:P0_2"]),
("FOXD", ["T0:C0_10", "T0:C0_1:P0_10", "T6:P0_0", "T13:C0_3:P0_3"]),
("FOXB", ["T0:C0_5", "T13:C0_6", "T13:P0_6"]),
("CTCF", ["T10:C0_2:P0_1", "T11:C0_6", "T8:C0_1:P0_1"]),
("NFIC", ["T13:C0_8", "T13:P0_11", "T13:P0_14", "T13:P0_8", "T13:P0_9"]),
("SP1", ["T13:P0_12", "T10:C0_5"]),
("ATF", ["T1:P0_5", "T2:C0_2:P0_4", "T8:C0_4", "T9:C0_2:P0_2", "T10:C0_4:P0_3", "T11:C0_4", "T12:C0_2:P0_2"]),
("ATF v1", ["T4:C0_2", "T9:P0_5"]),
("AP2", ["T6:P0_11"]),
("AP2 v1", ["T6:P0_4"]),
("ELK-CRE", ["T0:C0_11", "T0:P0_12", "T0:P0_7", "T10:C0_8"])
]
elif tf_name == "NR3C1-reddytime":
core_motif_defs = [
("NR3C1", ["T0:C0_1", "T1:C0_0", "T2:C0_0", "T4:C0_0", "T4:C0_6", "T5:C0_0:P0_0", "T5:P0_6", "T7:C0_0:P0_0", "T7:C0_6", "T8:C0_0:P0_0", "T9:C0_0:P0_0", "T9:P0_7", "T10:C0_0:P0_0", "T12:C0_0:P0_0", "T13:C0_0:P0_0", "T14:C0_0:P0_0", "T15:C0_0:P0_0", "T0:P0_1", "T1:P0_0", "T2:P0_0", "T3:C0_0:P0_0", "T4:P0_0", "T6:C0_0:P0_0", "T11:C0_0:P0_0"]),
("NR3C1 half site", ["T0:P0_3", "T1:P0_7", "T2:P0_7"]),
("AP-1 (TRE)", ["T1:C0_1", "T2:C0_1", "T4:C0_1", "T5:C0_1", "T5:P0_2", "T7:C0_1:P0_2", "T8:C0_1:P0_2", "T9:C0_1:P0_2", "T10:C0_1", "T10:P0_2", "T12:C0_1:P0_2", "T12:P0_8", "T13:C0_1:P0_2", "T14:C0_1:P0_2", "T15:C0_1:P0_2", "T0:P0_0", "T1:P0_1", "T2:P0_1", "T3:C0_3:P0_4", "T4:P0_1", "T6:C0_2:P0_2", "T6:P0_3", "T11:C0_2:P0_2"]),
("AP-1 (TRE) v1", ["T3:C0_4", "T3:P0_5", "T3:P0_7", "T3:C0_7", "T3:C0_8"]),
("AP-1 (TRE) v2", ["T0:P0_8", "T2:P0_10", "T10:C0_6"]),
("AP-1 (CRE)", ["T0:C0_4", "T0:P0_6", "T1:C0_5", "T2:C0_4", "T4:C0_4", "T5:C0_4:P0_4", "T7:C0_4:P0_4", "T8:C0_4:P0_4", "T9:C0_4:P0_4", "T9:P0_8", "T10:C0_4:P0_4", "T10:P0_7", "T12:C0_4:P0_4", "T13:C0_4", "T13:C0_7", "T13:P0_4", "T14:C0_4:P0_4", "T15:C0_4:P0_4", "T0:P0_5", "T1:P0_4", "T2:P0_4", "T4:P0_3", "T11:C0_3"]),
("TEAD", ["T0:P0_9", "T2:P0_8", "T3:C0_1:P0_1", "T3:P0_8", "T11:P0_6", "T0:P0_7", "T2:C0_7", "T7:C0_7", "T1:C0_7", "T2:P0_6", "T5:P0_5", "T7:P0_5", "T8:C0_6", "T8:P0_5", "T9:C0_7", "T9:P0_5", "T10:P0_5", "T12:P0_5", "T13:P0_5", "T14:P0_5", "T15:P0_5"]),
("FOXB", ["T0:C0_3", "T1:C0_6", "T1:P0_9", "T2:C0_5", "T2:P0_9", "T4:C0_5", "T4:P0_9", "T5:C0_3:P0_3", "T7:C0_5:P0_6", "T8:C0_5:P0_6", "T9:C0_5:P0_6", "T10:C0_5:P0_6", "T12:C0_5:P0_6", "T13:C0_5:P0_6", "T14:C0_5:P0_6", "T15:C0_5:P0_6", "T13:C0_8"]),
("FOXD", ["T0:P0_4", "T1:P0_2", "T2:P0_3", "T2:P0_5", "T4:P0_4", "T4:P0_8", "T11:C0_4", "T11:P0_4", "T1:C0_3", "T2:C0_3", "T4:C0_3", "T7:C0_3:P0_3", "T7:P0_7", "T7:P0_8", "T8:C0_3:P0_3", "T9:C0_3:P0_3", "T10:C0_3:P0_3", "T12:C0_3:P0_3", "T12:P0_7", "T13:C0_3:P0_3", "T13:P0_7", "T14:C0_3:P0_3", "T15:C0_3:P0_3"]),
("FOX-like", ["T2:C0_6", "T9:C0_6", "T13:C0_6", "T15:C0_6"]),
("CEBPB", ["T0:C0_2", "T1:C0_2", "T1:P0_6", "T1:P0_8", "T2:C0_2", "T4:C0_2", "T5:C0_2:P0_1", "T7:C0_2:P0_1", "T8:C0_2:P0_1", "T9:C0_2:P0_1", "T10:C0_2:P0_1", "T12:C0_2:P0_1", "T12:C0_6", "T12:C0_7", "T13:C0_2:P0_1", "T14:C0_2:P0_1", "T14:C0_6", "T15:C0_2:P0_1", "T15:P0_7"]),
("CEBPG", ["T0:P0_2", "T1:P0_5", "T2:P0_2", "T4:P0_2", "T4:P0_6", "T4:P0_7", "T6:C0_1:P0_1", "T11:C0_1:P0_1", "T11:P0_3", "T11:P0_7"]),
("CTCF", ["T3:C0_5", "T3:P0_3"]),
("GRHL-like", ["T3:C0_6:P0_6"]),
("NFIA", ["T3:C0_2", "T3:P0_2"])
]
elif tf_name == "REST":
core_motif_defs = [
("REST full", ["T17:C0_12", "T0:C0_0:P0_0", "T1:C0_1:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T6:C0_0:P0_0", "T7:C0_0:P0_0", "T8:C0_0:P0_0", "T9:C0_0:P0_1", "T10:C0_0:P0_1", "T11:C0_0:P0_0", "T12:C0_0:P0_0", "T13:C0_0:P0_0", "T14:C0_0", "T14:C0_1:P0_0", "T15:C0_4:P0_3", "T16:C0_0:P0_0", "T17:C0_1:P0_0", "T17:P0_11", "T18:C0_0:P0_0", "T19:C0_0:P0_0"]),
("REST full (4 bp space)", ["T6:P0_16"]),
("REST full (5 bp space)", ["T0:C0_6", "T1:C0_10", "T2:C0_5:P0_5", "T3:C0_9:P0_8", "T4:P0_9", "T5:C0_7", "T6:C0_11", "T7:C0_10", "T8:C0_7:P0_11", "T10:C0_10", "T11:P0_7", "T13:P0_8", "T14:C0_9", "T16:C0_8:P0_10", "T17:C0_15:P0_15", "T19:C0_10", "T0:C0_9", "T0:P0_13", "T1:C0_9:P0_13", "T6:C0_13:P0_10", "T7:C0_11:P0_11", "T8:P0_12", "T9:C0_10", "T10:C0_12:P0_18", "T13:C0_8", "T14:C0_10:P0_12", "T16:C0_9", "T17:C0_16", "T18:C0_6:P0_12", "T19:P0_10", "T5:P0_8"]),
("REST full (6 bp space)", ["T0:C0_2", "T0:P0_4", "T1:C0_5:P0_5", "T2:C0_2", "T2:P0_2", "T3:C0_2", "T3:P0_3", "T4:C0_3:P0_6", "T5:C0_3:P0_5", "T6:C0_4:P0_4", "T7:C0_2:P0_3", "T8:C0_2:P0_6", "T9:C0_5:P0_5", "T10:C0_3:P0_4", "T11:C0_2:P0_3", "T12:C0_6:P0_8", "T13:C0_2:P0_2", "T14:C0_4:P0_6", "T15:C0_11:P0_12", "T16:C0_3:P0_4", "T17:C0_7:P0_6", "T18:C0_3:P0_3", "T19:C0_5:P0_4"]),
("REST full (7 bp space)", ["T0:C1_0:P0_5", "T1:C0_12:P0_12", "T1:C0_8:P0_9", "T2:C0_3:P0_3", "T3:C0_6", "T3:P0_5", "T4:C0_5:P0_7", "T5:C0_5:P0_6", "T6:C0_7:P0_7", "T7:C0_4:P0_5", "T8:C0_4:P0_7", "T9:C0_7:P0_6", "T10:C0_6:P0_7", "T11:P0_6", "T12:C0_13:P0_17", "T13:C0_5:P0_3", "T14:C0_7:P0_10", "T16:C0_4:P0_7", "T17:P0_7", "T18:C0_5:P0_8", "T19:C0_9:P0_6"]),
("REST full (8 bp space)", ["T1:P0_11", "T5:P0_11", "T6:C0_15:P0_12", "T7:C0_13:P0_10", "T8:P0_22", "T10:C0_13:P0_17", "T13:C0_9:P0_10", "T14:C0_13:P0_15", "T16:C0_10", "T16:P0_13", "T18:C0_8:P0_13", "T19:C0_14", "T19:P0_13", "T0:C1_0:P0_5"]),
("REST full (9 bp space)", ["T6:P0_19"]),
("REST full (-1 bp space)", ["T0:C0_4:P0_6", "T1:C0_13:P0_15", "T1:C0_6", "T3:C0_7:P0_7", "T4:P0_8", "T6:C0_16", "T6:C0_6:P0_17", "T7:C0_6:P0_6", "T8:C0_13", "T10:C0_5:P0_6", "T11:C0_5", "T13:C0_4:P0_4", "T14:C0_6:P0_7", "T15:C0_13", "T16:C0_6:P0_8", "T17:C0_14", "T18:P0_9", "T19:P0_11"]),
("REST full (-2 bp space)", ["T0:P0_8", "T1:C0_7", "T3:C0_8:P0_9", "T5:C0_6:P0_7", "T6:C0_5:P0_6", "T7:C0_5:P0_7", "T8:C0_5", "T10:C0_9:P0_11", "T12:C0_10:P0_11", "T13:C0_7:P0_5", "T14:C0_5", "T16:C0_5:P0_6", "T17:C0_11:P0_10", "T18:C0_4:P0_4", "T19:C0_8", "T19:P0_9"]),
("REST full (-3 bp space)", ["T1:P0_7", "T8:P0_8", "T9:C0_6:P0_7", "T14:P0_8", "T15:P0_11", "T15:C0_12"]),
("REST full (left reverse, -1 bp space)", ["T0:C0_7", "T0:P0_12", "T1:C0_11:P0_14", "T5:C0_8", "T6:C0_12:P0_14", "T7:C0_12:P0_15", "T8:C0_11:P0_19", "T9:C0_9:P0_13", "T10:C0_11", "T10:P0_12", "T13:C0_6", "T14:C0_11", "T14:P0_16", "T16:C0_7", "T17:C0_17", "T18:C0_7", "T18:P0_18", "T19:C0_13", "T19:P0_15"]),
("REST full (left reverse, 0 bp space)", ["T6:C0_17", "T8:P0_21", "T11:C0_6"]),
("REST full v1", ["T1:C0_2", "T6:C0_2", "T10:C0_2:P0_0", "T15:C0_6", "T19:C0_12"]),
("REST full v2", ["T12:P0_6"]),
("REST full v3", ["T17:C0_18", "T19:P0_16"]),
("REST full v4", ["T6:C0_3", "T14:C0_3", "T15:C0_8:P0_8", "T16:P0_5", "T5:P0_4"]),
("REST left", ["T14:P0_13", "T13:P0_7", "T7:C0_7", "T12:C0_4:P0_4", "T0:C0_8", "T1:C0_3:P0_2", "T7:P0_8", "T8:P0_18", "T9:C0_4:P0_3", "T10:C0_7", "T10:P0_9", "T14:P0_9", "T17:P0_3", "T19:P0_3", "T17:C0_3", "T19:C0_3", "T18:P0_17"]),
("REST right", ["T10:P0_8", "T14:P0_11", "T6:P0_9", "T0:C0_3:P0_1", "T17:C0_10:P0_1", "T0:C0_1", "T1:C0_0:P0_1", "T2:C0_1:P0_1", "T3:C0_1:P0_1", "T4:C0_1:P0_2", "T5:C0_1:P0_1", "T6:C0_1:P0_1", "T7:C0_1:P0_1", "T8:C0_1:P0_1", "T9:C0_1:P0_0", "T10:C0_1", "T11:C0_1:P0_1", "T12:C0_2:P0_3", "T13:C0_1:P0_1", "T14:C0_2:P0_1", "T15:C0_3:P0_4", "T16:C0_2:P0_2", "T18:C0_2", "T18:P0_2", "T19:P0_1", "T8:P0_14", "T17:C0_2", "T19:C0_1"]),
("REST left v1", ["T0:P0_7", "T7:P0_12", "T3:C0_3", "T3:P0_4", "T4:C0_4", "T11:C0_3"]),
("CTCF", ["T1:C0_4", "T1:P0_3", "T3:C0_4", "T4:C0_2:P0_1", "T5:C0_2", "T5:P0_3", "T6:C0_9", "T6:P0_5", "T7:C0_9", "T7:P0_2", "T8:P0_2", "T10:C0_4:P0_2", "T14:C0_8", "T14:P0_5", "T15:C0_0:P0_0", "T17:C0_6:P0_2", "T19:C0_7", "T19:P0_2", "T0:C0_5", "T1:P0_10", "T3:P0_6", "T5:C0_4", "T6:C0_10", "T8:C0_12", "T8:P0_5", "T11:C0_4:P0_4", "T13:C0_3"]),
("AP-1", ["T17:C0_4", "T5:C0_9", "T6:C0_8", "T7:C0_3", "T7:P0_9", "T8:C0_3", "T10:C0_8", "T12:C0_1:P0_1", "T15:C0_5", "T17:P0_4", "T17:P0_9", "T15:P0_6", "T19:C0_11"]),
("FOX", ["T12:P0_10", "T12:C0_3:P0_2"]),
("ARNT-like", ["T5:P0_10", "T8:P0_13", "T15:C0_2:P0_2"]),
("ELF-like", ["T9:P0_4", "T9:C0_2", "T9:C0_8"]),
("GATA", ["T15:C0_7:P0_5"]),
("KAISO", ["T6:C0_14", "T16:C0_1:P0_1", "T18:C0_1:P0_1", "T18:P0_16"]),
("GRHL", ["T7:P0_16", "T15:C0_1:P0_1", "T16:P0_12"]),
("GC repeat", ["T17:C0_13", "T18:P0_5", "T18:P0_6", "T12:C0_12", "T12:C0_5", "T12:C0_8", "T12:P0_15", "T0:P0_2", "T0:P0_3", "T1:P0_4", "T1:P0_6", "T2:C0_4", "T2:P0_4", "T2:P0_6", "T2:P0_7", "T3:P0_11", "T3:P0_2", "T4:P0_10", "T4:P0_3", "T4:P0_4", "T4:P0_5", "T5:P0_2", "T6:P0_2", "T6:P0_3", "T6:P0_8", "T7:C0_8", "T7:P0_4", "T8:C0_6:P0_3", "T8:C0_9", "T8:P0_17", "T8:P0_4", "T9:C0_3:P0_11", "T9:P0_12", "T9:P0_14", "T9:P0_2", "T9:P0_8", "T10:P0_10", "T10:P0_19", "T10:P0_3", "T11:P0_2", "T11:P0_5", "T12:C0_11", "T12:C0_7", "T12:C0_9", "T14:P0_14", "T14:P0_2", "T14:P0_3", "T14:P0_4", "T17:C0_5", "T17:C0_9", "T8:C0_10", "T10:P0_16"]),
("Trans. Elem.", ["T6:P0_18", "T7:P0_14", "T8:P0_16", "T12:P0_13", "T12:P0_16", "T12:P0_18", "T16:P0_11"])
]
out_path = "/users/amtseng/tfmodisco/figures/motifs_across_tasks/%s_motifs_across_tasks" % tf_name
os.makedirs(out_path, exist_ok=True)
tf_num_tasks = {
"E2F6": 2,
"FOXA2": 4,
"SPI1": 4,
"CEBPB": 7,
"MAX": 7,
"GABPA": 9,
"MAFK": 9,
"JUND": 14,
"NR3C1-reddytime": 16,
"REST": 20
}
tf_best_model_types = {
"E2F6": list("MM"),
"FOXA2": list("SSMM"),
"SPI1": list("MSSS"),
"CEBPB": list("MMMMSMM"),
"MAX": list("MMSMMSS"),
"GABPA": list("MMMSMMMMM"),
"MAFK": list("MMMMMMMMM"),
"JUND": list("SMMSMSSSSSSSMS"),
"NR3C1-reddytime": list("MMMSMMSMMMMSMMMM"),
"REST": list("MMMMMMMMMSMMSMMSMMMM")
}
num_tasks = tf_num_tasks[tf_name]
best_model_types = tf_best_model_types[tf_name]
tfm_motif_file = "/users/amtseng/tfmodisco/results/motifs/tfmodisco/%s_tfmodisco_cpmerged_motifs.h5" % tf_name
memechip_motif_file = "/users/amtseng/tfmodisco/results/motifs/memechip/%s_memechip_motifs.h5" % tf_name
homer_motif_file = "/users/amtseng/tfmodisco/results/motifs/homer/%s_homer_motifs.h5" % tf_name
dichipmunk_motif_file = "/users/amtseng/tfmodisco/results/motifs/dichipmunk/%s_dichipmunk_motifs.h5" % tf_name
multitask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/multitask_profile_finetune_stats.tsv"
singletask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/singletask_profile_finetune_stats.tsv"
motif_database_path = "/users/amtseng/tfmodisco/data/processed/motif_databases/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme.txt"
def get_motif_hit_paths():
"""
Returns a list of pairs, where each pair is the count and profile
motif hit paths for the task.
"""
# First, import the best fold definitions
# Finetuned multi-task model
best_mt_fold = None
with open(multitask_finetune_model_def_tsv, "r") as f:
for line in f:
tokens = line.strip().split("\t")
if tokens[0] == tf_name and int(tokens[1]) == num_tasks - 1:
assert best_mt_fold is None
best_mt_fold = int(tokens[2])
# Finetuned single-task models
best_st_folds = []
with open(singletask_finetune_model_def_tsv, "r") as f:
for line in f:
tokens = line.strip().split("\t")
if tokens[0] == tf_name:
best_st_folds.append(int(tokens[2]))
assert len(best_st_folds) == num_tasks
# Get paths to motif hits
task_motif_hit_paths = []
base_path = "/users/amtseng/tfmodisco/results/reports/motif_hits/cache/tfm"
for task_index, model_type in enumerate(best_model_types):
if model_type == "M":
path = os.path.join(
base_path,
"multitask_profile_finetune",
"%s_multitask_profile_finetune_fold%d" % (tf_name, best_mt_fold),
"%s_multitask_profile_finetune_task%d_fold%d_{0}" % (tf_name, task_index, best_mt_fold),
"filtered_hits.tsv"
)
else:
path = os.path.join(
base_path,
"singletask_profile_finetune",
"%s_singletask_profile_finetune_fold%d" % (tf_name, best_st_folds[task_index]),
"task_%d" % task_index,
"%s_singletask_profile_finetune_task%d_fold%d_{0}" % (tf_name, task_index, best_st_folds[task_index]),
"filtered_hits.tsv"
)
task_motif_hit_paths.append(
(path.format("count"), path.format("profile"))
)
return task_motif_hit_paths
def import_tfmodisco_motifs(motif_file, model_types, motif_type="cwm_trimmed"):
"""
From a file containing all motifs for that TF, imports the
trimmed CWMs (or another kind of motif type) of the fine-tuned models
corresponding to the model type for each task.
Returns a list of dictionaries (one for each task), where
each dictionary maps motif key to motif.
"""
motifs = []
with h5py.File(motif_file, "r") as f:
mtft = f["multitask_finetune"]
stft = f["singletask_finetune"]
for i, model_type in enumerate(model_types):
task = "task_%d" % i
if model_type == "M":
dset = mtft[task]
else:
dset = stft[task]
task_motifs = {}
for motif_key in dset.keys():
if "0_" in motif_key:
# Motifs that are (or are constructed from) positive metacluster only
task_motifs["T%d:%s" % (i, motif_key)] = dset[motif_key][motif_type][:]
motifs.append(task_motifs)
return motifs
def import_classic_benchmark_motifs(motif_file, mode):
"""
From a file containing all motifs for that TF from a benchmark
method, imports the PFMs of the motifs for each task.
Returns a list of dictionaries (one for each task), where
each dictionary maps motif key to motif.
"""
if mode == "dichipmunk":
score_key = "supporting_seqs"
elif mode == "homer":
score_key = "log_enrichment"
elif mode.startswith("meme"):
score_key = "evalue"
motifs = []
with h5py.File(motif_file, "r") as f:
tasks = sorted([int(key[5:]) for key in f.keys() if key != "task_agg"])
for i in tasks:
dset = f["task_%d" % i]
task_motifs = {}
for motif_key in dset.keys():
if motif_key == score_key:
continue
task_motifs["T%d:%s" % (i, motif_key)] = dset[motif_key][:]
motifs.append(task_motifs)
return motifs
def import_database_pfms(database_path):
"""
Imports the database of PFMs by reading through the entire database and
constructing a dictionary mapping motif IDs to NumPy arrays of PFMs.
"""
motif_dict = {}
with open(database_path, "r") as f:
try:
while True:
line = next(f)
if line.startswith("MOTIF"):
key = line.strip().split()[1]
header = next(f)
motif_width = int(header.split()[5])
motif = np.empty((motif_width, 4))
for i in range(motif_width):
motif[i] = np.array([
float(x) for x in next(f).strip().split()
])
# Add the motif with a shortened key
motif_dict[key.split("_")[1]] = motif
except StopIteration:
pass
return motif_dict
def get_closest_tomtom_motif_similarities(query_dict, target_dict):
"""
From a dictionary mapping N motif keys to query motifs, and a
dictionary mapping M motif keys to target motifs, returns a
dictionary mapping the N query motif keys to the similarity and
key of the closest target motif (a pair). Similarity is the
-log(p) from TOMTOM.
"""
query_keys, query_pfms = list(zip(*query_dict.items()))
target_keys, target_pfms = list(zip(*target_dict.items()))
# Create temporary directory to do work in
temp_dir_obj = tempfile.TemporaryDirectory()
temp_dir = temp_dir_obj.name
# Convert motifs to MEME format
query_motif_file = os.path.join(temp_dir, "query_motifs.txt")
target_motif_file = os.path.join(temp_dir, "target_motifs.txt")
match_motifs.export_pfms_to_meme_format(query_pfms, query_motif_file)
match_motifs.export_pfms_to_meme_format(target_pfms, target_motif_file)
# Run TOMTOM
tomtom_dir = os.path.join(temp_dir, "tomtom")
match_motifs.run_tomtom(
query_motif_file, target_motif_file, tomtom_dir,
show_output=False
)
# Find results, mapping each query motif to target index
# The query/target IDs are the indices
tomtom_table = match_motifs.import_tomtom_results(tomtom_dir)
matches = []
for i in range(len(query_pfms)):
rows = tomtom_table[tomtom_table["Query_ID"] == i]
if rows.empty:
matches.append((0, "N/A"))
continue
min_row = rows.loc[rows["p-value"].idxmin()]
score = -np.log10(min_row["p-value"])
target_key = target_keys[min_row["Target_ID"]]
matches.append((score, target_key))
temp_dir_obj.cleanup()
return dict(zip(query_keys, matches))
tfm_cwm_motifs = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "cwm_trimmed")
tfm_pfm_motifs = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "pfm_trimmed")
memechip_motifs = import_classic_benchmark_motifs(memechip_motif_file, "memechip")
homer_motifs = import_classic_benchmark_motifs(homer_motif_file, "homer")
dichipmunk_motifs = import_classic_benchmark_motifs(dichipmunk_motif_file, "dichipmunk")
# For easier viewing/clustering, flip all motifs to purine-rich orientation
# Note that this is not a perfect process, so automated clustering may be imperfect with
# respect to orientation. Final aggregate motifs are done in a reverse-complement-sensitive
# manner to fix this
for motif_list in (memechip_motifs, homer_motifs, dichipmunk_motifs):
for motif_dict in motif_list:
for key in list(motif_dict.keys()):
motif_dict[key] = purine_rich_motif(motif_dict[key])
# For TF-MoDISco motifs, make sure we flip the CWM and PFM to match
for cwm_motif_dict, pfm_motif_dict in zip(tfm_cwm_motifs, tfm_pfm_motifs):
for key in list(cwm_motif_dict.keys()):
cwm = purine_rich_motif(cwm_motif_dict[key])
cwm_motif_dict[key] = cwm # Flip CWM to purine-rich orientation
pfm = pfm_motif_dict[key]
pwm = read_motifs.pfm_to_pwm(pfm)
# Flip PFM if its PWM should be flipped to better match the CWM
score = motif_similarity_score(cwm, pwm, mean_normalize=False)
rev_score = motif_similarity_score(cwm, np.flip(pwm, axis=(0, 1)), mean_normalize=False)
if rev_score > score:
pfm_motif_dict[key] = np.flip(pfm_motif_dict[key], axis=(0, 1))
We need to decide which ones to merge together. We start with a best guess for clustering, and then manually decide on the right motifs from each task to cluster.
# Flatten all TF-MoDISco motifs across all tasks into a single list
tfm_motif_keys = [list(d.keys()) for d in tfm_cwm_motifs]
tfm_motif_cwms = [[tfm_cwm_motifs[i][key] for key in tfm_motif_keys[i]] for i in range(len(tfm_motif_keys))]
tfm_motif_pfms = [[tfm_pfm_motifs[i][key] for key in tfm_motif_keys[i]] for i in range(len(tfm_motif_keys))]
tfm_motif_keys = sum(tfm_motif_keys, [])
tfm_motif_cwms = sum(tfm_motif_cwms, [])
tfm_motif_pfms = sum(tfm_motif_pfms, [])
# Compute similarity matrix
sim_matrix = create_motif_similarity_matrix(tfm_motif_cwms)
# Compute linkage and clusters
dist_matrix = 1 - sim_matrix
np.fill_diagonal(dist_matrix, 0)
dist_vec = scipy.spatial.distance.squareform(dist_matrix)
cluster_distance = 0.6 # On the greedy side
linkage = scipy.cluster.hierarchy.linkage(dist_vec, method="ward")
clusters = scipy.cluster.hierarchy.fcluster(
linkage, cluster_distance, criterion="distance"
)
# # Show aggregated and constituent motifs for each cluster
# colgroup = vdomh.colgroup(
# vdomh.col(style={"width": "45%"}),
# vdomh.col(style={"width": "45%"}),
# vdomh.col(style={"width": "10%"})
# )
# header = vdomh.thead(
# vdomh.tr(
# vdomh.th("Aggregate motif", style={"text-align": "center"}),
# vdomh.th("Constituent motifs", style={"text-align": "center"}),
# vdomh.th("Constituent motif IDs", style={"text-align": "center"})
# )
# )
# cluster_ids, counts = np.unique(clusters, return_counts=True)
# for i, cluster_id in enumerate(cluster_ids):
# match_inds = np.where(clusters == cluster_id)[0]
# match_cwms = [tfm_motif_cwms[j] for j in match_inds]
# match_keys = [tfm_motif_keys[j] for j in match_inds]
# consensus_cwm = aggregate_motifs(match_cwms)
# display(vdomh.h3("Cluster %d (%d/%d)" % (cluster_id, i + 1, len(cluster_ids))))
# display(vdomh.h4("%d motifs" % len(match_cwms)))
# agg_fig = viz_sequence.plot_weights(consensus_cwm, figsize=(20, 4), return_fig=True)
# agg_fig.tight_layout()
# const_figs = []
# for cwm in match_cwms:
# fig = viz_sequence.plot_weights(cwm, figsize=(20, 4), return_fig=True)
# fig.tight_layout()
# const_figs.append(figure_to_vdom_image(fig))
# body = vdomh.tbody(*([
# vdomh.tr(
# vdomh.td(figure_to_vdom_image(agg_fig), rowspan=str(len(match_cwms))),
# vdomh.td(const_figs[0]),
# vdomh.td(match_keys[0])
# )] + [
# vdomh.tr(
# vdomh.td(const_figs[j + 1]),
# vdomh.td(match_keys[j + 1])
# ) for j in range(len(match_cwms) - 1)
# ]
# ))
# display(vdomh.table(colgroup, header, body))
# plt.close("all")
# Show aggregated and constituent motifs for the final clusterings
colgroup = vdomh.colgroup(
vdomh.col(style={"width": "45%"}),
vdomh.col(style={"width": "45%"}),
vdomh.col(style={"width": "10%"})
)
header = vdomh.thead(
vdomh.tr(
vdomh.th("Aggregate motif", style={"text-align": "center"}),
vdomh.th("Constituent motifs", style={"text-align": "center"}),
vdomh.th("Constituent motif IDs", style={"text-align": "center"})
)
)
core_motif_cwms = {}
core_motif_pfms = {}
for agg_motif_name, const_motif_keys in core_motif_defs:
const_motif_cwms = [tfm_motif_cwms[tfm_motif_keys.index(key)] for key in const_motif_keys]
const_motif_pfms = [tfm_motif_pfms[tfm_motif_keys.index(key)] for key in const_motif_keys]
agg_cwm, (const_inds, agg_inds) = aggregate_motifs(const_motif_cwms, return_inds=True)
core_motif_cwms[agg_motif_name] = agg_cwm
# Construct aggregate PFM
agg_pfm = aggregate_motifs_from_inds(const_motif_pfms, const_inds, agg_inds)
agg_pfm = agg_pfm / np.sum(agg_pfm, axis=1, keepdims=True)
core_motif_pfms[agg_motif_name] = agg_pfm
display(vdomh.h3(agg_motif_name))
agg_fig = viz_sequence.plot_weights(agg_cwm, figsize=(20, 4), return_fig=True)
agg_fig.tight_layout()
const_figs = []
for motif in const_motif_cwms:
fig = viz_sequence.plot_weights(motif, figsize=(20, 4), return_fig=True)
fig.tight_layout()
const_figs.append(figure_to_vdom_image(fig))
body = vdomh.tbody(*([
vdomh.tr(
vdomh.td(figure_to_vdom_image(agg_fig), rowspan=str(len(const_motif_keys))),
vdomh.td(const_figs[0]),
vdomh.td(const_motif_keys[0])
)] + [
vdomh.tr(
vdomh.td(const_figs[j + 1]),
vdomh.td(const_motif_keys[j + 1])
) for j in range(len(const_motif_keys) - 1)
]
))
display(vdomh.table(colgroup, header, body))
plt.close("all")
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T1:C0_0:P0_0 | ||
| T0:C0_0:P0_0 | ||
| T2:C0_0:P0_0 | ||
| T3:C0_0:P0_0 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T1:P0_1 | ||
| T0:C0_1:P0_1 | ||
| T2:C0_1 | ||
| T2:C0_7 | ||
| T2:P0_7 | ||
| T3:C0_1 | ||
| T3:P0_2 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T0:C0_3:P0_4 | ||
| T1:C0_3:P0_5 | ||
| T3:C0_3 | ||
| T2:P0_1 | ||
| T3:P0_1 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T0:C0_2:P0_7 | ||
| T0:C0_7 | ||
| T0:C0_8:P0_2 | ||
| T1:C0_1:P0_2 | ||
| T1:C0_5 | ||
| T1:C0_6 | ||
| T1:P0_6 | ||
| T1:P0_7 | ||
| T2:C0_2:P0_2 | ||
| T2:C0_5 | ||
| T2:C0_6 | ||
| T2:C0_8 | ||
| T3:C0_2 | ||
| T3:C0_6 | ||
| T3:C0_7 | ||
| T3:P0_3 | ||
| T3:P0_8 | ||
| T0:C0_12 | ||
| T0:C0_11 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T0:C0_6:P0_9 | ||
| T2:C0_4:P0_6 | ||
| T3:C0_5 | ||
| T3:P0_7 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T0:C0_4:P0_3 | ||
| T1:C0_4 | ||
| T1:P0_4 | ||
| T2:C0_3:P0_3 | ||
| T2:C0_9 | ||
| T3:C0_4 | ||
| T3:C0_8 | ||
| T3:P0_4 | ||
| T3:C0_10 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T0:C0_9:P0_8 | ||
| T1:C0_2:P0_3 | ||
| T2:P0_5 | ||
| T3:P0_6 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T3:C0_9 |
| Aggregate motif | Constituent motifs | Constituent motif IDs |
|---|---|---|
| T0:C0_5:P0_5 |
For each aggregated motif, extract the prevalence of the constituent motifs (by task) in the peaks.
# Import the motif hits for each task
import_motif_hits = lambda hits_path: pd.read_csv(hits_path, sep="\t", header=0, index_col=False)
task_motif_hit_paths = get_motif_hit_paths()
task_motif_hits = []
for count_path, profile_path in task_motif_hit_paths:
count_table = import_motif_hits(count_path)[["key", "peak_index"]]
profile_table = import_motif_hits(profile_path)[["key", "peak_index"]]
# We only need the key and peak index
task_motif_hits.append({"C": count_table, "P": profile_table})
def get_hit_prevalence(hit_table, motif_keys):
"""
Computes the motif prevalence from the hit table, as the proportion of
peaks which have hits in the given motif keys.
"""
total_peaks = len(np.unique(hit_table["peak_index"]))
hit_peaks = len(np.unique(hit_table[np.isin(hit_table["key"], motif_keys)]["peak_index"]))
return hit_peaks / total_peaks
# Create matrix of motif prevalences
motif_prevalences = np.zeros((len(core_motif_defs), len(task_motif_hits)))
for i, (_, motif_keys) in enumerate(core_motif_defs):
# Map each task index to the motif keys belonging to this aggregate motif (if any)
task_const_keys = {j : [] for j in range(len(task_motif_hits))}
for const_key in motif_keys:
tokens = const_key.split(":")
task_index = int(tokens[0][1:])
task_const_keys[task_index].append(":".join(tokens[1:]))
# Get the sum of prevalences for each task
for j in task_const_keys:
# Extract the set of motif keys, separately for counts/profiles
motif_keys = {}
for key in task_const_keys[j]:
tokens = key.split(":")
# May be compound key
for token in tokens:
head, motif_key = token[0], token[1:]
try:
motif_keys[head].append(motif_key)
except KeyError:
motif_keys[head] = [motif_key]
# Compute prevalence over the motif keys, taking the average over the count/profile heads
motif_prevalences[i, j] = np.mean([
get_hit_prevalence(task_motif_hits[j][head], motif_keys[head])
for head in motif_keys.keys()
]) if motif_keys else 0
print(motif_prevalences)
[[6.66807852e-01 7.97510188e-01 7.47573627e-01 6.13414209e-01] [3.44493115e-01 3.69811603e-01 1.99223535e-01 3.38370027e-01] [1.75442817e-01 2.39599180e-01 3.55120086e-01 1.83465616e-01] [3.26952608e-01 2.99152522e-01 3.88646421e-01 3.55733996e-01] [3.43819655e-02 0.00000000e+00 5.01205013e-02 6.77126403e-02] [2.23066545e-01 1.50093366e-01 2.65666888e-01 2.44901415e-01] [3.07574617e-02 1.53805403e-01 3.20229418e-02 8.32919193e-02] [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.24638332e-04] [9.37120843e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00]]
For each aggregated motif, compute the similarity of the closest motif in each benchmark for each task.
def get_closest_motifs(query_motifs, target_motifs):
"""
From a list of N target motifs in `target_motifs`, and a list of
M query motifs in `query_motifs`, computes the most similar target
motif to each query motif. Returns an N-array.
Any ill-formed motifs (e.g. rows of all 0) are ignored.
"""
# Build similarity matrix
sim_matrix = create_motif_similarity_matrix(query_motifs, target_motifs, show_progress=False)
return np.nanmax(sim_matrix, axis=1)
def get_benchmark_similarities(query_motifs, benchmark_motifs):
"""
From a list of N target motifs and a list of T dictionaries mapping
motif keys to motifs, computes an N x T matrix of the best motif
similarity in each task to each query motif.
"""
matrix = np.empty((len(query_motifs), len(benchmark_motifs)))
for i in range(len(query_motifs)):
for j in range(len(benchmark_motifs)):
matrix[:, j] = get_closest_motifs(query_motifs, list(benchmark_motifs[j].values()))
return matrix
query_motifs = [core_motif_cwms[pair[0]] for pair in core_motif_defs]
memechip_best_sims = get_benchmark_similarities(query_motifs, memechip_motifs)
homer_best_sims = get_benchmark_similarities(query_motifs, homer_motifs)
dichipmunk_best_sims = get_benchmark_similarities(query_motifs, dichipmunk_motifs)
/users/amtseng/tfmodisco/notebooks/reports/util.py:103: RuntimeWarning: invalid value encountered in true_divide motif_2 = motif_2 / np.sqrt(np.sum(motif_2 * motif_2, axis=1, keepdims=True))
For each aggregated motif, compute the similarity of the closest motif in the database of motifs.
database_motifs = import_database_pfms(motif_database_path)
query_motifs = {
pair[0] : core_motif_pfms[pair[0]] for pair in core_motif_defs
}
database_best_sims = get_closest_tomtom_motif_similarities(
query_motifs, database_motifs
)
# Create a good color map
benchmark_decay_scale = 8.0
database_decay_scale = 0.6
x = np.linspace(0, 1, 256)
transformed = np.exp(benchmark_decay_scale * x) / np.exp(benchmark_decay_scale)
benchmark_cmap = colors.LinearSegmentedColormap.from_list(
"newOranges",
plt.get_cmap("Oranges")(transformed)
)
transformed = np.exp(database_decay_scale * x) / np.exp(database_decay_scale)
database_cmap = colors.LinearSegmentedColormap.from_list(
"newBlues",
plt.get_cmap("Blues")(transformed)
)
# Show the color map with the original
fig, ax = plt.subplots(nrows=4, figsize=(20, 4), gridspec_kw={"hspace": 1})
grad = np.linspace(0, 1, 256)
ax[0].imshow(grad[None, :], aspect="auto", cmap="Oranges")
ax[1].imshow(grad[None, :], aspect="auto", cmap=benchmark_cmap)
ax[0].set_title("Original")
ax[1].set_title("Transformed")
ax[2].imshow(grad[None, :], aspect="auto", cmap="Blues")
ax[3].imshow(grad[None, :], aspect="auto", cmap=database_cmap)
ax[2].set_title("Original")
ax[3].set_title("Transformed")
for i in range(4):
ax[i].axis("off")
plt.show()
grid_height = motif_prevalences.shape[0] * 2
grid_width = motif_prevalences.shape[1] * 2
width_spacing = 0.1
width = (grid_width * 2) + 2 + (width_spacing * 2)
fig, ax = plt.subplots(
ncols=3, figsize=(width, grid_height),
gridspec_kw={
"height_ratios": [grid_height],
"width_ratios": [grid_width, 2, grid_width],
"wspace": width_spacing
}
)
# Plot motif prevalences in each task
y, x = np.unravel_index(np.arange(motif_prevalences.size), motif_prevalences.shape)
x, y = x + 0.5, y + 0.5
# Set the radius such that the area is proportional to the prevalence
max_area = np.pi * (0.5 ** 2)
assert np.min(motif_prevalences) >= 0 and np.max(motif_prevalences) <= 1
area = motif_prevalences * max_area
radius = np.sqrt(area / np.pi)
# Plot the data
ax[0].set_xlim(0, motif_prevalences.shape[1])
ax[0].set_ylim(0, motif_prevalences.shape[0])
ax[0].set_xticks(np.arange(0.5, motif_prevalences.shape[1] + 0.5))
ax[0].set_yticks(np.arange(0.5, motif_prevalences.shape[0] + 0.5))
ax[0].set_xticklabels(["task_%d" % i for i in np.arange(0, motif_prevalences.shape[1])])
ax[0].set_yticklabels([pair[0] for pair in core_motif_defs][::-1]) # Flip y-axis
for i in range(motif_prevalences.shape[1]):
ax[0].axvline(i, color="gray", alpha=0.2)
for i in range(motif_prevalences.shape[0]):
ax[0].axhline(i, color="gray", alpha=0.2)
for i in range(motif_prevalences.shape[0]):
for j in range(motif_prevalences.shape[1]):
circle = plt.Circle((j + 0.5, motif_prevalences.shape[0] - i - 1 + 0.5), radius[i, j], alpha=0.3)
ax[0].add_patch(circle)
# Plot the database motif distances
database_sims = np.array([database_best_sims[pair[0]][0] for pair in core_motif_defs])
database_labels = [database_best_sims[pair[0]][1] for pair in core_motif_defs]
database_hm = ax[1].imshow(database_sims[:, None], cmap=database_cmap)
fig.colorbar(database_hm)
# Create annotations
for i in range(len(database_labels)):
ax[1].text(0, i, database_labels[i], ha="center", va="center")
ax[1].set_aspect("auto")
ax[1].set_xticks([0])
ax[1].set_xticklabels(["Database"])
ax[1].set_yticks([])
# Plot benchmark motif distances in each task
# Create the benchmark array to show
full_sim_matrix = np.empty((memechip_best_sims.shape[0], memechip_best_sims.shape[1] * 3))
sim_matrices = [memechip_best_sims, homer_best_sims, dichipmunk_best_sims]
for i in range(3):
full_sim_matrix[:, np.arange(0, memechip_best_sims.shape[1] * 3, 3) + i] = sim_matrices[i]
benchmark_hm = ax[2].imshow(full_sim_matrix, cmap=benchmark_cmap)
fig.colorbar(benchmark_hm)
ax[2].set_aspect("auto")
ax[2].set_xticks(np.arange(full_sim_matrix.shape[1]))
ax[2].set_xticklabels(
sum([["task_%d_%s" % (i, s) for s in ("M", "H", "D")] for i in range(memechip_best_sims.shape[1])], []),
rotation=90
)
ax[2].set_yticks([])
fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_motifs_across_tasks.svg" % tf_name),
format="svg"
)
plt.show()
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:76: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.
# Save the results
def sanitize_name(name):
return name.replace("/", ":")
out_hdf5 = os.path.join(out_path, "%s_motifs_and_similarities.h5" % tf_name)
with h5py.File(out_hdf5, "w") as f:
# Aggregate motif names
agg_motif_names = [pair[0] for pair in core_motif_defs]
f.create_dataset("agg_motif_names", data=np.array(agg_motif_names).astype("S"))
# Aggregate motif CWMs and PWMs
agg_cwms = f.create_group("agg_motif_cwms")
agg_pfms = f.create_group("agg_motif_pfms")
for motif_name in agg_motif_names:
sanitized_motif_name = sanitize_name(motif_name)
agg_cwms.create_dataset(sanitized_motif_name, data=core_motif_cwms[motif_name])
agg_pfms.create_dataset(sanitized_motif_name, data=core_motif_pfms[motif_name])
# Motif prevalences
f.create_dataset("agg_motif_prevalences", data=motif_prevalences)
# Database similarities
f.create_dataset("database_log_pvals", data=np.array([
database_best_sims[motif_name][0] for motif_name in agg_motif_names
]))
f.create_dataset("database_matches", data=np.array([
database_best_sims[motif_name][1] for motif_name in agg_motif_names
]).astype("S"))
# Benchmark similarity matrices
f.create_dataset("memechip_best_sims", data=memechip_best_sims)
f.create_dataset("homer_best_sims", data=homer_best_sims)
f.create_dataset("dichipmunk_best_sims", data=dichipmunk_best_sims)
# Show aggregate motifs
for i, (key, _) in enumerate(core_motif_defs):
display(vdomh.h3(key))
fig = viz_sequence.plot_weights(core_motif_cwms[key], subticks_frequency=100, return_fig=True)
fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_agg_motif_%d.svg" % (tf_name, i)),
format="svg"
)
plt.show()
# Show database matches
for i, (key, _) in enumerate(core_motif_defs):
display(vdomh.h3(key))
if database_best_sims[key][1] == "N/A":
continue
fig = viz_sequence.plot_weights(
purine_rich_motif(read_motifs.pfm_to_pwm(database_motifs[database_best_sims[key][1]])),
subticks_frequency=100, return_fig=True
)
fig.tight_layout()
plt.savefig(
os.path.join(out_path, "%s_agg_motif_%d_database_match.svg" % (tf_name, i)),
format="svg"
)
plt.show()