import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import motif.tfmodisco_hit_scoring as tfmodisco_hit_scoring
import motif.match_motifs as match_motifs
import motif.read_motifs as read_motifs
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, motif_similarity_score, create_motif_similarity_matrix, aggregate_motifs, aggregate_motifs_from_inds, purine_rich_motif
import tempfile
import h5py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.font_manager as font_manager
import scipy.signal
import scipy.cluster.hierarchy
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook(range(1))

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:22: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`

<tqdm.notebook.tqdm_notebook at 0x7ff57821ff50>


# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold",
    "svg.fonttype": "none"
}
plt.rcParams.update(plot_params)

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.


if "TFM_TF_NAME" in os.environ:
    tf_name = os.environ["TFM_TF_NAME"]
else:
    tf_name = "SPI1"


# Manually define the clusters of core motifs

if tf_name == "E2F6":
    core_motif_defs = [
        ("MAX", ["T0:C0_0:P0_0", "T1:C0_0:P0_0"]),
        ("E2F6", ["T0:C0_1:P0_1", "T1:C0_1:P0_1"]),
        ("AP-1", ["T0:C0_5", "T0:P0_9", "T1:C0_7:P0_11"]),
        ("E2F v1", ["T0:C0_4", "T0:P0_4", "T0:P0_6", "T1:C0_3:P0_3"]),
        ("MAX v1", ["T1:C0_2:P0_7", "T1:C0_6:P0_2", "T0:C0_2:P0_2", "T1:C0_5", "T1:P0_4"])
    ]
elif tf_name == "SPI1":
    core_motif_defs = [
        ("SPI1", ["T1:C0_0:P0_0", "T3:C0_0:P0_0", "T1:P0_7", "T0:C0_4", "T0:C0_0:P0_0", "T2:C0_0:P0_0", "T0:P0_5"]),
        ("SPI1 v1", ["T1:C0_4", "T1:P0_5", "T3:C0_2", "T1:P0_6"]),
        ("AP-1", ["T2:C0_5", "T1:C0_3", "T2:C0_6", "T2:C0_7", "T2:P0_7", "T2:C0_2"]),
        ("IRF", ["T0:C0_1", "T0:C0_3", "T0:P0_2", "T2:C0_1", "T2:C0_4", "T2:P0_4"]),
        ("RUNX", ["T2:C0_8", "T3:C0_3", "T3:P0_5", "T3:P0_7", "T2:P0_3"]),
        ("CEBP", ["T3:C0_1:P0_1"]),
        ("GATA", ["T1:C0_1"])
    ]
elif tf_name == "FOXA2":
    core_motif_defs = [
        ("FOXA", ["T1:C0_0:P0_0", "T0:C0_0:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0"]),
        ("FOX v1", ["T1:P0_1", "T0:C0_1:P0_1", "T2:C0_1", "T2:C0_7", "T2:P0_7", "T3:C0_1", "T3:P0_2"]),
        ("FOX v2", ["T0:C0_3:P0_4", "T1:C0_3:P0_5", "T3:C0_3", "T2:P0_1", "T3:P0_1"]),
        ("HNF4", ["T0:C0_2:P0_7", "T0:C0_7", "T0:C0_8:P0_2", "T1:C0_1:P0_2", "T1:C0_5", "T1:C0_6", "T1:P0_6", "T1:P0_7", "T2:C0_2:P0_2", "T2:C0_5", "T2:C0_6", "T2:C0_8", "T3:C0_2", "T3:C0_6", "T3:C0_7", "T3:P0_3", "T3:P0_8", "T0:C0_12", "T0:C0_11"]),
        ("HNF1", ["T0:C0_6:P0_9", "T2:C0_4:P0_6", "T3:C0_5", "T3:P0_7"]),
        ("CEBP", ["T0:C0_4:P0_3", "T1:C0_4", "T1:P0_4", "T2:C0_3:P0_3", "T2:C0_9", "T3:C0_4", "T3:C0_8", "T3:P0_4", "T3:C0_10"]),
        ("AP-1", ["T0:C0_9:P0_8", "T1:C0_2:P0_3", "T2:P0_5", "T3:P0_6"]),
        ("FOX dimer", ["T3:C0_9"]),
        ("CTCF", ["T0:C0_5:P0_5"])
    ]
elif tf_name == "CEBPB":
    core_motif_defs = [
        ("CEBP", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T6:C0_0:P0_0"]),
        ("FOXA", ["T0:C0_9", "T0:C0_2:P0_2", "T0:C0_7:P0_8", "T0:C0_8", "T3:C0_1:P0_1"]),
        ("FOXB", ["T0:C0_5:P0_7", "T3:C0_6"]),
        ("HNF4", ["T0:P0_14", "T0:P0_11", "T0:C0_1:P0_1", "T0:P0_13", "T2:C0_6", "T2:P0_7", "T3:C0_2:P0_4", "T3:C0_5"]),
        ("ATF/CEBP", ["T6:C0_1"]),
        ("AP-1", ["T1:C0_5", "T0:C0_10", "T5:P0_13", "T0:C0_4", "T0:P0_4", "T1:C0_2:P0_3", "T2:C0_1:P0_1", "T2:C0_4", "T2:C0_5", "T2:P0_5", "T3:C0_3:P0_2", "T4:C0_1:P0_9", "T4:P0_1", "T4:P0_4", "T4:P0_5", "T4:P0_6", "T4:P0_7", "T4:P0_8", "T5:C0_3:P0_2", "T6:P0_2", "T2:C0_7"]),
        ("NFE", ["T5:P0_12"]),
        ("TEAD", ["T4:P0_10", "T4:C0_2", "T4:C0_3", "T4:P0_11", "T4:P0_2"]),
        ("TEAD v1", ["T4:C0_4", "T4:P0_3"]),
        ("GATA", ["T1:C0_1", "T1:P0_1", "T1:P0_9", "T5:C0_1:P0_1"]),
        ("CTCF", ["T0:C0_3:P0_3", "T1:C0_3:P0_2", "T1:C0_4", "T1:P0_11", "T2:C0_2:P0_2", "T3:C0_4:P0_5", "T5:C0_2:P0_3", "T6:C0_2:P0_1", "T3:C0_7", "T5:C0_4", "T0:C0_6"])
    ]
elif tf_name == "MAX":
    core_motif_defs = [
        ("MAX", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T2:C0_1:P0_5", "T2:C0_5", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_2:P0_2", "T6:C0_0:P0_0"]),
        ("weak MAX", ["T6:P0_8", "T1:P0_5", "T1:P0_14", "T2:P0_3", "T3:P0_4", "T3:P0_5", "T6:C0_2:P0_2", "T1:P0_1", "T3:P0_1", "T3:P0_3"]),
        ("CTCF", ["T4:C0_2", "T4:P0_2", "T0:C0_2:P0_1", "T0:P0_2", "T1:C0_3:P0_2", "T3:C0_2:P0_2", "T4:C0_1:P0_1", "T6:C0_3:P0_3", "T6:P0_6"]),
        ("HNF4", ["T2:C0_2:P0_1", "T2:C0_3", "T2:C0_4", "T5:C0_1:P0_0", "T5:P0_7", "T6:C0_1:P0_1"]),
        ("CEBP", ["T5:C0_0:P0_1", "T6:C0_4:P0_11", "T6:P0_10", "T6:P0_13"]),
        ("FOX", ["T2:P0_2", "T5:C0_3", "T5:C0_6", "T6:C0_7:P0_12", "T6:C0_8"]),
        ("ELK-like", ["T6:P0_7", "T5:C0_5", "T6:C0_5", "T6:P0_4", "T6:P0_9"]),
        ("SP1-like", ["T6:C0_6:P0_5"]),
        ("GC repeat", ["T0:C0_1", "T1:C0_1:P0_6", "T1:C0_2", "T1:C0_4", "T1:C0_5", "T1:C0_6", "T1:C0_7", "T1:P0_10", "T1:P0_4", "T1:P0_8", "T1:P0_9", "T3:C0_1", "T5:C0_4", "T5:P0_3", "T5:P0_5", "T5:P0_6", "T1:P0_11", "T1:P0_3", "T1:P0_7"])
    ]
elif tf_name == "GABPA":
    core_motif_defs = [
        ("GABPA monomer", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T6:C0_0:P0_2", "T7:C0_0:P0_0", "T8:C0_0:P0_0", "T4:P0_9", "T1:C0_2", "T2:C0_3", "T2:P0_4", "T3:P0_2", "T4:C0_4:P0_4", "T7:P0_5", "T4:P0_11"]),
        ("weak GABPA monomer", ["T4:P0_2", "T7:P0_2", "T8:P0_3"]),
        ("GABPA dimer", ["T0:P0_2", "T1:C0_1:P0_1", "T2:C0_1", "T3:C0_2", "T4:C0_1", "T5:C0_1:P0_1", "T6:C0_1:P0_0", "T2:P0_8"]),
        ("GABPA dimer v1", ["T1:P0_8"]),
        ("GABPA dimer v2", ["T8:P0_10", "T7:P0_10"]),
        ("GABPA dimer v3", ["T5:P0_7"]),
        ("GABPA dimer v4", ["T8:P0_8"]),
        ("GABPA dimer v5", ["T8:P0_9"]),
        ("GABPA dimer v6", ["T5:P0_8"]),
        ("GABPA dimer v7", ["T7:P0_9", "T5:P0_6", "T7:C0_2"]),
        ("GABPA dimer v8", ["T0:C0_2"]),
        ("GABPA dimer v9", ["T4:C0_7"]),
        ("GABPA dimer v10", ["T8:P0_12"]),
        ("GABPA dimer v11", ["T8:P0_12"]),
        ("GABPA dimer v12", ["T1:C0_5"]),
        ("GABPA dimer v13", ["T8:P0_6"]),
        ("GABPA dimer v14", ["T4:C0_8"]),
        ("GABPA dimer v15", ["T5:P0_12"]),
        ("GABPA dimer v16", ["T6:P0_9", "T7:P0_13", "T8:P0_11"]),
        ("GABPA/ATF dimer", ["T7:P0_7", "T8:P0_5"]),
        ("GABPA/HNF4 dimer", ["T0:P0_7", "T1:C0_4", "T2:P0_9", "T4:C0_6", "T4:P0_12", "T7:P0_4"]),
        ("THAP11", ["T0:C0_1:P0_1", "T1:C0_3:P0_5", "T2:C0_2:P0_1", "T3:C0_3:P0_4", "T4:C0_2:P0_1", "T5:C0_2:P0_2", "T6:C0_2", "T6:P0_1", "T7:C0_1", "T7:P0_1", "T8:C0_1", "T8:P0_1"]),
        ("GATA", ["T3:C0_1:P0_1", "T3:P0_5"]),
        ("ATF", ["T4:C0_5", "T4:P0_3"]),
        ("GC repeat", ["T2:C0_4", "T5:P0_10", "T6:P0_3", "T7:P0_6", "T8:P0_2", "T0:P0_4", "T0:P0_5", "T1:P0_2", "T1:P0_3", "T1:P0_4", "T2:C0_5", "T2:P0_2", "T2:P0_3", "T2:P0_5", "T2:P0_6", "T2:P0_7", "T4:C0_3"])
    ]
elif tf_name == "MAFK":
    core_motif_defs = [
        ("TMARE", ["T3:P0_12", "T0:C0_0:P0_0", "T1:C0_1:P0_1", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T3:C0_5", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T5:C0_10", "T5:P0_5", "T6:C0_1:P0_1", "T7:C0_0:P0_0", "T8:C0_0:P0_0"]),
        ("CMARE", ["T3:C0_1", "T4:C0_1:P0_1", "T5:C0_4"]),
        ("CsMBE", ["T4:P0_2", "T5:C0_2", "T5:C0_8", "T8:P0_1", "T2:C0_1:P0_1", "T5:P0_2", "T6:P0_10", "T8:C0_1", "T1:C0_0:P0_0", "T5:C0_1:P0_1", "T6:C0_0:P0_0", "T7:C0_1:P0_1"]),
        ("TRE", ["T2:P0_5", "T5:P0_10", "T0:P0_1", "T4:C0_2", "T3:C0_2:P0_1"]),
        ("CRE", ["T2:C0_4"]),
        ("CTCF", ["T5:C0_7:P0_4", "T6:C0_3:P0_3", "T2:C0_3:P0_3", "T1:C0_3:P0_3", "T8:C0_3:P0_2", "T2:P0_10", "T1:P0_4", "T6:P0_4", "T2:P0_4", "T0:C0_1:P0_2", "T1:C0_2:P0_2", "T1:C0_5:P0_6", "T2:C0_2:P0_2", "T2:P0_6", "T3:C0_4:P0_3", "T5:C0_3:P0_3", "T5:C0_9", "T6:C0_2:P0_2", "T7:C0_2:P0_3", "T7:C0_5", "T7:P0_8", "T8:C0_2:P0_3", "T8:C0_5", "T4:C0_5:P0_5", "T3:P0_10"]),
        ("MAF v1", ["T2:C0_5", "T5:C0_6", "T5:P0_9", "T6:C0_5:P0_9", "T8:C0_4:P0_5"]),
        ("MAF v2", ["T3:C0_3:P0_2", "T4:C0_4:P0_4", "T5:C0_5:P0_6", "T7:C0_3", "T7:P0_2"]),
        ("MAF v3", ["T3:P0_9", "T4:C0_3:P0_3", "T7:P0_6"]),
        ("MAF v4", ["T6:C0_4:P0_7", "T7:C0_8:P0_9"]),
        ("MAF v5", ["T0:P0_6", "T1:C0_4", "T1:P0_5", "T3:P0_6", "T4:C0_6", "T7:C0_6"]),
        ("MAF v6", ["T4:P0_14", "T5:P0_8", "T7:P0_4"]),
        ("MAF v7", ["T7:C0_4", "T7:P0_7", "T7:C0_7"])
    ]
elif tf_name == "JUND":
    core_motif_defs = [
        ("TRE", ["T0:C0_0:P0_0", "T1:C0_0:P0_0", "T2:C0_0:P0_0", "T3:P0_0", "T4:P0_0", "T5:C0_0", "T5:P0_0", "T6:P0_1", "T8:C0_0:P0_0", "T10:C0_0:P0_0", "T11:C0_0:P0_0", "T12:C0_0:P0_0", "T12:C0_8", "T13:C0_2:P0_2", "T3:C0_0", "T4:C0_0", "T9:C0_0:P0_0", "T3:P0_8", "T4:P0_4", "T6:C0_1", "T12:P0_3", "T12:P0_4", "T13:P0_10", "T4:P0_3"]),
        ("CRE", ["T4:C0_1", "T7:P0_3", "T9:C0_1:P0_1", "T0:C0_4:P0_3", "T1:C0_2:P0_1", "T2:C0_1:P0_1", "T3:C0_4", "T3:P0_4", "T4:P0_1", "T7:C0_1", "T8:C0_2:P0_2", "T10:C0_3:P0_2", "T10:P0_5", "T11:C0_1:P0_1", "T12:C0_1:P0_1", "T13:C0_5", "T13:P0_4"]),
        ("TRE/TRE/TRE", ["T3:C0_9"]),
        ("TRE/TRE/TEAD", ["T3:C0_10"]),
        ("TRE/TRE", ["T3:P0_10", "T3:C0_7", "T4:C0_5", "T2:C0_5"]),
        ("TRE/TRE v2", ["T2:C0_3:P0_6", "T4:C0_3", "T5:P0_3"]),
        ("TRE/TRE v1", ["T3:C0_6", "T4:C0_4", "T2:C0_4"]),
        ("TRE/TRE v3", ["T3:C0_8"]),
        ("TRE-CRE", ["T3:C0_11"]),
        ("TRE-TEAD", ["T8:C0_7:P0_6", "T11:C0_5", "T3:C0_3", "T3:C0_1:P1_1", "T3:C0_2", "T3:C0_5", "T0:C0_9", "T3:P0_1"]),
        ("TEAD-TRE", ["T13:C0_7:P0_7", "T13:P0_16"]),
        ("CRE-TEAD", ["T3:P0_7", "T6:P0_9", "T8:C0_3:P0_3", "T8:P0_7", "T11:C0_2:P0_2"]),
        ("TRE-IRF", ["T7:C0_0:P0_0", "T7:P0_1"]),
        ("TEAD", ["T3:P0_3", "T6:P0_5", "T6:P0_10"]),
        ("TEAD v1", ["T6:P0_8"]),
        ("TRE v1", ["T2:P0_7"]),
        ("TRE v2", ["T0:P0_11"]),
        ("CRE v1", ["T4:P0_5"]),
        ("IRF", ["T7:P0_4"]),
        ("HNF1", ["T0:C0_7", "T13:C0_4:P0_5"]),
        ("HNF4", ["T0:C0_12", "T0:C0_2:P0_1", "T0:C0_6", "T13:C0_0:P0_0", "T13:C0_9", "T13:P0_15"]),
        ("CEBPB", ["T12:P0_6", "T9:C0_3", "T9:P0_3"]),
        ("CEBPD", ["T0:C0_3:P0_2", "T13:C0_1", "T13:P0_1", "T0:C0_13", "T0:P0_14"]),
        ("FOXA", ["T5:C0_1:P0_1", "T5:C0_2", "T6:P0_2"]),
        ("FOXD", ["T0:C0_10", "T0:C0_1:P0_10", "T6:P0_0", "T13:C0_3:P0_3"]),
        ("FOXB", ["T0:C0_5", "T13:C0_6", "T13:P0_6"]),
        ("CTCF", ["T10:C0_2:P0_1", "T11:C0_6", "T8:C0_1:P0_1"]),
        ("NFIC", ["T13:C0_8", "T13:P0_11", "T13:P0_14", "T13:P0_8", "T13:P0_9"]),
        ("SP1", ["T13:P0_12", "T10:C0_5"]),
        ("ATF", ["T1:P0_5", "T2:C0_2:P0_4", "T8:C0_4", "T9:C0_2:P0_2", "T10:C0_4:P0_3", "T11:C0_4", "T12:C0_2:P0_2"]),
        ("ATF v1", ["T4:C0_2", "T9:P0_5"]),
        ("AP2", ["T6:P0_11"]),
        ("AP2 v1", ["T6:P0_4"]),
        ("ELK-CRE", ["T0:C0_11", "T0:P0_12", "T0:P0_7", "T10:C0_8"])
    ]
elif tf_name == "NR3C1-reddytime":
    core_motif_defs = [
        ("NR3C1", ["T0:C0_1", "T1:C0_0", "T2:C0_0", "T4:C0_0", "T4:C0_6", "T5:C0_0:P0_0", "T5:P0_6", "T7:C0_0:P0_0", "T7:C0_6", "T8:C0_0:P0_0", "T9:C0_0:P0_0", "T9:P0_7", "T10:C0_0:P0_0", "T12:C0_0:P0_0", "T13:C0_0:P0_0", "T14:C0_0:P0_0", "T15:C0_0:P0_0", "T0:P0_1", "T1:P0_0", "T2:P0_0", "T3:C0_0:P0_0", "T4:P0_0", "T6:C0_0:P0_0", "T11:C0_0:P0_0"]),
        ("NR3C1 half site", ["T0:P0_3", "T1:P0_7", "T2:P0_7"]),
        ("AP-1 (TRE)", ["T1:C0_1", "T2:C0_1", "T4:C0_1", "T5:C0_1", "T5:P0_2", "T7:C0_1:P0_2", "T8:C0_1:P0_2", "T9:C0_1:P0_2", "T10:C0_1", "T10:P0_2", "T12:C0_1:P0_2", "T12:P0_8", "T13:C0_1:P0_2", "T14:C0_1:P0_2", "T15:C0_1:P0_2", "T0:P0_0", "T1:P0_1", "T2:P0_1", "T3:C0_3:P0_4", "T4:P0_1", "T6:C0_2:P0_2", "T6:P0_3", "T11:C0_2:P0_2"]),
        ("AP-1 (TRE) v1", ["T3:C0_4", "T3:P0_5", "T3:P0_7", "T3:C0_7", "T3:C0_8"]),
        ("AP-1 (TRE) v2", ["T0:P0_8", "T2:P0_10", "T10:C0_6"]),
        ("AP-1 (CRE)", ["T0:C0_4", "T0:P0_6", "T1:C0_5", "T2:C0_4", "T4:C0_4", "T5:C0_4:P0_4", "T7:C0_4:P0_4", "T8:C0_4:P0_4", "T9:C0_4:P0_4", "T9:P0_8", "T10:C0_4:P0_4", "T10:P0_7", "T12:C0_4:P0_4", "T13:C0_4", "T13:C0_7", "T13:P0_4", "T14:C0_4:P0_4", "T15:C0_4:P0_4", "T0:P0_5", "T1:P0_4", "T2:P0_4", "T4:P0_3", "T11:C0_3"]),
        ("TEAD", ["T0:P0_9", "T2:P0_8", "T3:C0_1:P0_1", "T3:P0_8", "T11:P0_6", "T0:P0_7", "T2:C0_7", "T7:C0_7", "T1:C0_7", "T2:P0_6", "T5:P0_5", "T7:P0_5", "T8:C0_6", "T8:P0_5", "T9:C0_7", "T9:P0_5", "T10:P0_5", "T12:P0_5", "T13:P0_5", "T14:P0_5", "T15:P0_5"]),
        ("FOXB", ["T0:C0_3", "T1:C0_6", "T1:P0_9", "T2:C0_5", "T2:P0_9", "T4:C0_5", "T4:P0_9", "T5:C0_3:P0_3", "T7:C0_5:P0_6", "T8:C0_5:P0_6", "T9:C0_5:P0_6", "T10:C0_5:P0_6", "T12:C0_5:P0_6", "T13:C0_5:P0_6", "T14:C0_5:P0_6", "T15:C0_5:P0_6", "T13:C0_8"]),
        ("FOXD", ["T0:P0_4", "T1:P0_2", "T2:P0_3", "T2:P0_5", "T4:P0_4", "T4:P0_8", "T11:C0_4", "T11:P0_4", "T1:C0_3", "T2:C0_3", "T4:C0_3", "T7:C0_3:P0_3", "T7:P0_7", "T7:P0_8", "T8:C0_3:P0_3", "T9:C0_3:P0_3", "T10:C0_3:P0_3", "T12:C0_3:P0_3", "T12:P0_7", "T13:C0_3:P0_3", "T13:P0_7", "T14:C0_3:P0_3", "T15:C0_3:P0_3"]),
        ("FOX-like", ["T2:C0_6", "T9:C0_6", "T13:C0_6", "T15:C0_6"]),
        ("CEBPB", ["T0:C0_2", "T1:C0_2", "T1:P0_6", "T1:P0_8", "T2:C0_2", "T4:C0_2", "T5:C0_2:P0_1", "T7:C0_2:P0_1", "T8:C0_2:P0_1", "T9:C0_2:P0_1", "T10:C0_2:P0_1", "T12:C0_2:P0_1", "T12:C0_6", "T12:C0_7", "T13:C0_2:P0_1", "T14:C0_2:P0_1", "T14:C0_6", "T15:C0_2:P0_1", "T15:P0_7"]),
        ("CEBPG", ["T0:P0_2", "T1:P0_5", "T2:P0_2", "T4:P0_2", "T4:P0_6", "T4:P0_7", "T6:C0_1:P0_1", "T11:C0_1:P0_1", "T11:P0_3", "T11:P0_7"]),
        ("CTCF", ["T3:C0_5", "T3:P0_3"]),
        ("GRHL-like", ["T3:C0_6:P0_6"]),
        ("NFIA", ["T3:C0_2", "T3:P0_2"])
    ]
elif tf_name == "REST":
    core_motif_defs = [
        ("REST full", ["T17:C0_12", "T0:C0_0:P0_0", "T1:C0_1:P0_0", "T2:C0_0:P0_0", "T3:C0_0:P0_0", "T4:C0_0:P0_0", "T5:C0_0:P0_0", "T6:C0_0:P0_0", "T7:C0_0:P0_0", "T8:C0_0:P0_0", "T9:C0_0:P0_1", "T10:C0_0:P0_1", "T11:C0_0:P0_0", "T12:C0_0:P0_0", "T13:C0_0:P0_0", "T14:C0_0", "T14:C0_1:P0_0", "T15:C0_4:P0_3", "T16:C0_0:P0_0", "T17:C0_1:P0_0", "T17:P0_11", "T18:C0_0:P0_0", "T19:C0_0:P0_0"]),
        ("REST full (4 bp space)", ["T6:P0_16"]),
        ("REST full (5 bp space)", ["T0:C0_6", "T1:C0_10", "T2:C0_5:P0_5", "T3:C0_9:P0_8", "T4:P0_9", "T5:C0_7", "T6:C0_11", "T7:C0_10", "T8:C0_7:P0_11", "T10:C0_10", "T11:P0_7", "T13:P0_8", "T14:C0_9", "T16:C0_8:P0_10", "T17:C0_15:P0_15", "T19:C0_10", "T0:C0_9", "T0:P0_13", "T1:C0_9:P0_13", "T6:C0_13:P0_10", "T7:C0_11:P0_11", "T8:P0_12", "T9:C0_10", "T10:C0_12:P0_18", "T13:C0_8", "T14:C0_10:P0_12", "T16:C0_9", "T17:C0_16", "T18:C0_6:P0_12", "T19:P0_10", "T5:P0_8"]),
        ("REST full (6 bp space)", ["T0:C0_2", "T0:P0_4", "T1:C0_5:P0_5", "T2:C0_2", "T2:P0_2", "T3:C0_2", "T3:P0_3", "T4:C0_3:P0_6", "T5:C0_3:P0_5", "T6:C0_4:P0_4", "T7:C0_2:P0_3", "T8:C0_2:P0_6", "T9:C0_5:P0_5", "T10:C0_3:P0_4", "T11:C0_2:P0_3", "T12:C0_6:P0_8", "T13:C0_2:P0_2", "T14:C0_4:P0_6", "T15:C0_11:P0_12", "T16:C0_3:P0_4", "T17:C0_7:P0_6", "T18:C0_3:P0_3", "T19:C0_5:P0_4"]),
        ("REST full (7 bp space)", ["T0:C1_0:P0_5", "T1:C0_12:P0_12", "T1:C0_8:P0_9", "T2:C0_3:P0_3", "T3:C0_6", "T3:P0_5", "T4:C0_5:P0_7", "T5:C0_5:P0_6", "T6:C0_7:P0_7", "T7:C0_4:P0_5", "T8:C0_4:P0_7", "T9:C0_7:P0_6", "T10:C0_6:P0_7", "T11:P0_6", "T12:C0_13:P0_17", "T13:C0_5:P0_3", "T14:C0_7:P0_10", "T16:C0_4:P0_7", "T17:P0_7", "T18:C0_5:P0_8", "T19:C0_9:P0_6"]),
        ("REST full (8 bp space)", ["T1:P0_11", "T5:P0_11", "T6:C0_15:P0_12", "T7:C0_13:P0_10", "T8:P0_22", "T10:C0_13:P0_17", "T13:C0_9:P0_10", "T14:C0_13:P0_15", "T16:C0_10", "T16:P0_13", "T18:C0_8:P0_13", "T19:C0_14", "T19:P0_13", "T0:C1_0:P0_5"]),
        ("REST full (9 bp space)", ["T6:P0_19"]),
        ("REST full (-1 bp space)", ["T0:C0_4:P0_6", "T1:C0_13:P0_15", "T1:C0_6", "T3:C0_7:P0_7", "T4:P0_8", "T6:C0_16", "T6:C0_6:P0_17", "T7:C0_6:P0_6", "T8:C0_13", "T10:C0_5:P0_6", "T11:C0_5", "T13:C0_4:P0_4", "T14:C0_6:P0_7", "T15:C0_13", "T16:C0_6:P0_8", "T17:C0_14", "T18:P0_9", "T19:P0_11"]),
        ("REST full (-2 bp space)", ["T0:P0_8", "T1:C0_7", "T3:C0_8:P0_9", "T5:C0_6:P0_7", "T6:C0_5:P0_6", "T7:C0_5:P0_7", "T8:C0_5", "T10:C0_9:P0_11", "T12:C0_10:P0_11", "T13:C0_7:P0_5", "T14:C0_5", "T16:C0_5:P0_6", "T17:C0_11:P0_10", "T18:C0_4:P0_4", "T19:C0_8", "T19:P0_9"]),
        ("REST full (-3 bp space)", ["T1:P0_7", "T8:P0_8", "T9:C0_6:P0_7", "T14:P0_8", "T15:P0_11", "T15:C0_12"]),
        ("REST full (left reverse, -1 bp space)", ["T0:C0_7", "T0:P0_12", "T1:C0_11:P0_14", "T5:C0_8", "T6:C0_12:P0_14", "T7:C0_12:P0_15", "T8:C0_11:P0_19", "T9:C0_9:P0_13", "T10:C0_11", "T10:P0_12", "T13:C0_6", "T14:C0_11", "T14:P0_16", "T16:C0_7", "T17:C0_17", "T18:C0_7", "T18:P0_18", "T19:C0_13", "T19:P0_15"]),
        ("REST full (left reverse, 0 bp space)", ["T6:C0_17", "T8:P0_21", "T11:C0_6"]),
        ("REST full v1", ["T1:C0_2", "T6:C0_2", "T10:C0_2:P0_0", "T15:C0_6", "T19:C0_12"]),
        ("REST full v2", ["T12:P0_6"]),
        ("REST full v3", ["T17:C0_18", "T19:P0_16"]),
        ("REST full v4", ["T6:C0_3", "T14:C0_3", "T15:C0_8:P0_8", "T16:P0_5", "T5:P0_4"]),
        ("REST left", ["T14:P0_13", "T13:P0_7", "T7:C0_7", "T12:C0_4:P0_4", "T0:C0_8", "T1:C0_3:P0_2", "T7:P0_8", "T8:P0_18", "T9:C0_4:P0_3", "T10:C0_7", "T10:P0_9", "T14:P0_9", "T17:P0_3", "T19:P0_3", "T17:C0_3", "T19:C0_3", "T18:P0_17"]),
        ("REST right", ["T10:P0_8", "T14:P0_11", "T6:P0_9", "T0:C0_3:P0_1", "T17:C0_10:P0_1", "T0:C0_1", "T1:C0_0:P0_1", "T2:C0_1:P0_1", "T3:C0_1:P0_1", "T4:C0_1:P0_2", "T5:C0_1:P0_1", "T6:C0_1:P0_1", "T7:C0_1:P0_1", "T8:C0_1:P0_1", "T9:C0_1:P0_0", "T10:C0_1", "T11:C0_1:P0_1", "T12:C0_2:P0_3", "T13:C0_1:P0_1", "T14:C0_2:P0_1", "T15:C0_3:P0_4", "T16:C0_2:P0_2", "T18:C0_2", "T18:P0_2", "T19:P0_1", "T8:P0_14", "T17:C0_2", "T19:C0_1"]),
        ("REST left v1", ["T0:P0_7", "T7:P0_12", "T3:C0_3", "T3:P0_4", "T4:C0_4", "T11:C0_3"]),
        ("CTCF", ["T1:C0_4", "T1:P0_3", "T3:C0_4", "T4:C0_2:P0_1", "T5:C0_2", "T5:P0_3", "T6:C0_9", "T6:P0_5", "T7:C0_9", "T7:P0_2", "T8:P0_2", "T10:C0_4:P0_2", "T14:C0_8", "T14:P0_5", "T15:C0_0:P0_0", "T17:C0_6:P0_2", "T19:C0_7", "T19:P0_2", "T0:C0_5", "T1:P0_10", "T3:P0_6", "T5:C0_4", "T6:C0_10", "T8:C0_12", "T8:P0_5", "T11:C0_4:P0_4", "T13:C0_3"]),
        ("AP-1", ["T17:C0_4", "T5:C0_9", "T6:C0_8", "T7:C0_3", "T7:P0_9", "T8:C0_3", "T10:C0_8", "T12:C0_1:P0_1", "T15:C0_5", "T17:P0_4", "T17:P0_9", "T15:P0_6", "T19:C0_11"]),
        ("FOX", ["T12:P0_10", "T12:C0_3:P0_2"]),
        ("ARNT-like", ["T5:P0_10", "T8:P0_13", "T15:C0_2:P0_2"]),
        ("ELF-like", ["T9:P0_4", "T9:C0_2", "T9:C0_8"]),
        ("GATA", ["T15:C0_7:P0_5"]),
        ("KAISO", ["T6:C0_14", "T16:C0_1:P0_1", "T18:C0_1:P0_1", "T18:P0_16"]),
        ("GRHL", ["T7:P0_16", "T15:C0_1:P0_1", "T16:P0_12"]),
        ("GC repeat", ["T17:C0_13", "T18:P0_5", "T18:P0_6", "T12:C0_12", "T12:C0_5", "T12:C0_8", "T12:P0_15", "T0:P0_2", "T0:P0_3", "T1:P0_4", "T1:P0_6", "T2:C0_4", "T2:P0_4", "T2:P0_6", "T2:P0_7", "T3:P0_11", "T3:P0_2", "T4:P0_10", "T4:P0_3", "T4:P0_4", "T4:P0_5", "T5:P0_2", "T6:P0_2", "T6:P0_3", "T6:P0_8", "T7:C0_8", "T7:P0_4", "T8:C0_6:P0_3", "T8:C0_9", "T8:P0_17", "T8:P0_4", "T9:C0_3:P0_11", "T9:P0_12", "T9:P0_14", "T9:P0_2", "T9:P0_8", "T10:P0_10", "T10:P0_19", "T10:P0_3", "T11:P0_2", "T11:P0_5", "T12:C0_11", "T12:C0_7", "T12:C0_9", "T14:P0_14", "T14:P0_2", "T14:P0_3", "T14:P0_4", "T17:C0_5", "T17:C0_9", "T8:C0_10", "T10:P0_16"]),
        ("Trans. Elem.", ["T6:P0_18", "T7:P0_14", "T8:P0_16", "T12:P0_13", "T12:P0_16", "T12:P0_18", "T16:P0_11"])
    ]


out_path = "/users/amtseng/tfmodisco/figures/motifs_across_tasks/%s_motifs_across_tasks" % tf_name
os.makedirs(out_path, exist_ok=True)

tf_num_tasks = {
    "E2F6": 2,
    "FOXA2": 4,
    "SPI1": 4,
    "CEBPB": 7,
    "MAX": 7,
    "GABPA": 9,
    "MAFK": 9,
    "JUND": 14,
    "NR3C1-reddytime": 16,
    "REST": 20
}

tf_best_model_types = {
    "E2F6": list("MM"),
    "FOXA2": list("SSMM"),
    "SPI1": list("MSSS"),
    "CEBPB": list("MMMMSMM"),
    "MAX": list("MMSMMSS"),
    "GABPA": list("MMMSMMMMM"),
    "MAFK": list("MMMMMMMMM"),
    "JUND": list("SMMSMSSSSSSSMS"),
    "NR3C1-reddytime": list("MMMSMMSMMMMSMMMM"),
    "REST": list("MMMMMMMMMSMMSMMSMMMM")
}

num_tasks = tf_num_tasks[tf_name]
best_model_types = tf_best_model_types[tf_name]

tfm_motif_file = "/users/amtseng/tfmodisco/results/motifs/tfmodisco/%s_tfmodisco_cpmerged_motifs.h5" % tf_name
memechip_motif_file = "/users/amtseng/tfmodisco/results/motifs/memechip/%s_memechip_motifs.h5" % tf_name
homer_motif_file = "/users/amtseng/tfmodisco/results/motifs/homer/%s_homer_motifs.h5" % tf_name
dichipmunk_motif_file = "/users/amtseng/tfmodisco/results/motifs/dichipmunk/%s_dichipmunk_motifs.h5" % tf_name

multitask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/multitask_profile_finetune_stats.tsv"
singletask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/singletask_profile_finetune_stats.tsv"


motif_database_path = "/users/amtseng/tfmodisco/data/processed/motif_databases/JASPAR2020_CORE_vertebrates_non-redundant_pfms_meme.txt"


def get_motif_hit_paths():
    """
    Returns a list of pairs, where each pair is the count and profile
    motif hit paths for the task.
    """
    # First, import the best fold definitions
    
    # Finetuned multi-task model
    best_mt_fold = None
    with open(multitask_finetune_model_def_tsv, "r") as f:
        for line in f:
            tokens = line.strip().split("\t")
            if tokens[0] == tf_name and int(tokens[1]) == num_tasks - 1:
                assert best_mt_fold is None
                best_mt_fold = int(tokens[2])

    # Finetuned single-task models
    best_st_folds = []
    with open(singletask_finetune_model_def_tsv, "r") as f:
        for line in f:
            tokens = line.strip().split("\t")
            if tokens[0] == tf_name:
                best_st_folds.append(int(tokens[2]))
                
    assert len(best_st_folds) == num_tasks

    # Get paths to motif hits
    task_motif_hit_paths = []
    base_path = "/users/amtseng/tfmodisco/results/reports/motif_hits/cache/tfm"
    for task_index, model_type in enumerate(best_model_types):
        if model_type == "M":
            path = os.path.join(
                base_path,
                "multitask_profile_finetune",
                "%s_multitask_profile_finetune_fold%d" % (tf_name, best_mt_fold),
                "%s_multitask_profile_finetune_task%d_fold%d_{0}" % (tf_name, task_index, best_mt_fold),
                "filtered_hits.tsv"
            )
        else:
            path = os.path.join(
                base_path,
                "singletask_profile_finetune",
                "%s_singletask_profile_finetune_fold%d" % (tf_name, best_st_folds[task_index]),
                "task_%d" % task_index,
                "%s_singletask_profile_finetune_task%d_fold%d_{0}" % (tf_name, task_index, best_st_folds[task_index]),
                "filtered_hits.tsv"
            )
        task_motif_hit_paths.append(
            (path.format("count"), path.format("profile"))
        )
    return task_motif_hit_paths


def import_tfmodisco_motifs(motif_file, model_types, motif_type="cwm_trimmed"):
    """
    From a file containing all motifs for that TF, imports the
    trimmed CWMs (or another kind of motif type) of the fine-tuned models
    corresponding to the model type for each task.
    Returns a list of dictionaries (one for each task), where
    each dictionary maps motif key to motif.
    """
    motifs = []
    with h5py.File(motif_file, "r") as f:
        mtft = f["multitask_finetune"]
        stft = f["singletask_finetune"]
        for i, model_type in enumerate(model_types):
            task = "task_%d" % i
            if model_type == "M":
                dset = mtft[task]
            else:
                dset = stft[task]
            task_motifs = {}
            for motif_key in dset.keys():
                if "0_" in motif_key:
                    # Motifs that are (or are constructed from) positive metacluster only
                    task_motifs["T%d:%s" % (i, motif_key)] = dset[motif_key][motif_type][:]
            motifs.append(task_motifs)
    return motifs


def import_classic_benchmark_motifs(motif_file, mode):
    """
    From a file containing all motifs for that TF from a benchmark
    method, imports the PFMs of the motifs for each task.
    Returns a list of dictionaries (one for each task), where
    each dictionary maps motif key to motif.
    """
    if mode == "dichipmunk":
        score_key = "supporting_seqs"
    elif mode == "homer":
        score_key = "log_enrichment"
    elif mode.startswith("meme"):
        score_key = "evalue"
    motifs = []
    with h5py.File(motif_file, "r") as f:
        tasks = sorted([int(key[5:]) for key in f.keys() if key != "task_agg"])
        for i in tasks:
            dset = f["task_%d" % i]
            task_motifs = {}
            for motif_key in dset.keys():
                if motif_key == score_key:
                    continue
                task_motifs["T%d:%s" % (i, motif_key)] = dset[motif_key][:]
            motifs.append(task_motifs)
    return motifs


def import_database_pfms(database_path):
    """
    Imports the database of PFMs by reading through the entire database and
    constructing a dictionary mapping motif IDs to NumPy arrays of PFMs.
    """
    motif_dict = {}
    with open(database_path, "r") as f:
        try:
            while True:
                line = next(f)
                if line.startswith("MOTIF"):
                    key = line.strip().split()[1]
                    header = next(f)
                    motif_width = int(header.split()[5])
                    motif = np.empty((motif_width, 4))
                    for i in range(motif_width):
                        motif[i] = np.array([
                            float(x) for x in next(f).strip().split()
                        ])
                    # Add the motif with a shortened key
                    motif_dict[key.split("_")[1]] = motif
        except StopIteration:
            pass
    return motif_dict


def get_closest_tomtom_motif_similarities(query_dict, target_dict):
    """
    From a dictionary mapping N motif keys to query motifs, and a
    dictionary mapping M motif keys to target motifs, returns a
    dictionary mapping the N query motif keys to the similarity and
    key of the closest target motif (a pair). Similarity is the
    -log(p) from TOMTOM.
    """
    query_keys, query_pfms = list(zip(*query_dict.items()))
    target_keys, target_pfms = list(zip(*target_dict.items()))
    
    # Create temporary directory to do work in
    temp_dir_obj = tempfile.TemporaryDirectory()
    temp_dir = temp_dir_obj.name

    # Convert motifs to MEME format
    query_motif_file = os.path.join(temp_dir, "query_motifs.txt")
    target_motif_file = os.path.join(temp_dir, "target_motifs.txt")
    match_motifs.export_pfms_to_meme_format(query_pfms, query_motif_file)
    match_motifs.export_pfms_to_meme_format(target_pfms, target_motif_file)

    # Run TOMTOM
    tomtom_dir = os.path.join(temp_dir, "tomtom")
    match_motifs.run_tomtom(
        query_motif_file, target_motif_file, tomtom_dir,
        show_output=False
    )

    # Find results, mapping each query motif to target index
    # The query/target IDs are the indices
    tomtom_table = match_motifs.import_tomtom_results(tomtom_dir)
    matches = []
    for i in range(len(query_pfms)):
        rows = tomtom_table[tomtom_table["Query_ID"] == i]
        if rows.empty:
            matches.append((0, "N/A"))
            continue
        min_row = rows.loc[rows["p-value"].idxmin()]
        score = -np.log10(min_row["p-value"])
        target_key = target_keys[min_row["Target_ID"]]
        matches.append((score, target_key))

    temp_dir_obj.cleanup()
    
    return dict(zip(query_keys, matches))


tfm_cwm_motifs = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "cwm_trimmed")
tfm_pfm_motifs = import_tfmodisco_motifs(tfm_motif_file, best_model_types, "pfm_trimmed")
memechip_motifs = import_classic_benchmark_motifs(memechip_motif_file, "memechip")
homer_motifs = import_classic_benchmark_motifs(homer_motif_file, "homer")
dichipmunk_motifs = import_classic_benchmark_motifs(dichipmunk_motif_file, "dichipmunk")


# For easier viewing/clustering, flip all motifs to purine-rich orientation
# Note that this is not a perfect process, so automated clustering may be imperfect with
# respect to orientation. Final aggregate motifs are done in a reverse-complement-sensitive
# manner to fix this

for motif_list in (memechip_motifs, homer_motifs, dichipmunk_motifs):
    for motif_dict in motif_list:
        for key in list(motif_dict.keys()):
            motif_dict[key] = purine_rich_motif(motif_dict[key])

# For TF-MoDISco motifs, make sure we flip the CWM and PFM to match
for cwm_motif_dict, pfm_motif_dict in zip(tfm_cwm_motifs, tfm_pfm_motifs):
    for key in list(cwm_motif_dict.keys()):
        cwm = purine_rich_motif(cwm_motif_dict[key])
        cwm_motif_dict[key] = cwm  # Flip CWM to purine-rich orientation
        pfm = pfm_motif_dict[key]
        pwm = read_motifs.pfm_to_pwm(pfm)
        
        # Flip PFM if its PWM should be flipped to better match the CWM
        score = motif_similarity_score(cwm, pwm, mean_normalize=False)
        rev_score = motif_similarity_score(cwm, np.flip(pwm, axis=(0, 1)), mean_normalize=False)
        if rev_score > score:
            pfm_motif_dict[key] = np.flip(pfm_motif_dict[key], axis=(0, 1))


# Flatten all TF-MoDISco motifs across all tasks into a single list
tfm_motif_keys = [list(d.keys()) for d in tfm_cwm_motifs]
tfm_motif_cwms = [[tfm_cwm_motifs[i][key] for key in tfm_motif_keys[i]] for i in range(len(tfm_motif_keys))]
tfm_motif_pfms = [[tfm_pfm_motifs[i][key] for key in tfm_motif_keys[i]] for i in range(len(tfm_motif_keys))]
tfm_motif_keys = sum(tfm_motif_keys, [])
tfm_motif_cwms = sum(tfm_motif_cwms, [])
tfm_motif_pfms = sum(tfm_motif_pfms, [])


# Compute similarity matrix
sim_matrix = create_motif_similarity_matrix(tfm_motif_cwms)


# Compute linkage and clusters
dist_matrix = 1 - sim_matrix
np.fill_diagonal(dist_matrix, 0)
dist_vec = scipy.spatial.distance.squareform(dist_matrix)

cluster_distance = 0.6  # On the greedy side
linkage = scipy.cluster.hierarchy.linkage(dist_vec, method="ward")
clusters = scipy.cluster.hierarchy.fcluster(
    linkage, cluster_distance, criterion="distance"
)


# # Show aggregated and constituent motifs for each cluster
# colgroup = vdomh.colgroup(
#     vdomh.col(style={"width": "45%"}),
#     vdomh.col(style={"width": "45%"}),
#     vdomh.col(style={"width": "10%"})
# )

# header = vdomh.thead(
#     vdomh.tr(
#         vdomh.th("Aggregate motif", style={"text-align": "center"}),
#         vdomh.th("Constituent motifs", style={"text-align": "center"}),
#         vdomh.th("Constituent motif IDs", style={"text-align": "center"})
#     )
# )

# cluster_ids, counts = np.unique(clusters, return_counts=True)
# for i, cluster_id in enumerate(cluster_ids):
#     match_inds = np.where(clusters == cluster_id)[0]
#     match_cwms = [tfm_motif_cwms[j] for j in match_inds]
#     match_keys = [tfm_motif_keys[j] for j in match_inds]
    
#     consensus_cwm = aggregate_motifs(match_cwms)
    
#     display(vdomh.h3("Cluster %d (%d/%d)" % (cluster_id, i + 1, len(cluster_ids))))
#     display(vdomh.h4("%d motifs" % len(match_cwms)))
    
#     agg_fig = viz_sequence.plot_weights(consensus_cwm, figsize=(20, 4), return_fig=True)
#     agg_fig.tight_layout()
    
#     const_figs = []
#     for cwm in match_cwms:
#         fig = viz_sequence.plot_weights(cwm, figsize=(20, 4), return_fig=True)
#         fig.tight_layout()
#         const_figs.append(figure_to_vdom_image(fig))

#     body = vdomh.tbody(*([
#         vdomh.tr(
#             vdomh.td(figure_to_vdom_image(agg_fig), rowspan=str(len(match_cwms))),
#             vdomh.td(const_figs[0]),
#             vdomh.td(match_keys[0])
#         )] + [
#             vdomh.tr(
#                 vdomh.td(const_figs[j + 1]),
#                 vdomh.td(match_keys[j + 1])
#             ) for j in range(len(match_cwms) - 1)
#         ]
#     ))
#     display(vdomh.table(colgroup, header, body))
#     plt.close("all")


# Show aggregated and constituent motifs for the final clusterings
colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "45%"}),
    vdomh.col(style={"width": "10%"})
)

header = vdomh.thead(
    vdomh.tr(
        vdomh.th("Aggregate motif", style={"text-align": "center"}),
        vdomh.th("Constituent motifs", style={"text-align": "center"}),
        vdomh.th("Constituent motif IDs", style={"text-align": "center"})
    )
)

core_motif_cwms = {}
core_motif_pfms = {}
for agg_motif_name, const_motif_keys in core_motif_defs:    
    const_motif_cwms = [tfm_motif_cwms[tfm_motif_keys.index(key)] for key in const_motif_keys]
    const_motif_pfms = [tfm_motif_pfms[tfm_motif_keys.index(key)] for key in const_motif_keys]
    
    agg_cwm, (const_inds, agg_inds) = aggregate_motifs(const_motif_cwms, return_inds=True)
    core_motif_cwms[agg_motif_name] = agg_cwm
    
    # Construct aggregate PFM
    agg_pfm = aggregate_motifs_from_inds(const_motif_pfms, const_inds, agg_inds)
    agg_pfm = agg_pfm / np.sum(agg_pfm, axis=1, keepdims=True)
    core_motif_pfms[agg_motif_name] = agg_pfm
        
    display(vdomh.h3(agg_motif_name))
    
    agg_fig = viz_sequence.plot_weights(agg_cwm, figsize=(20, 4), return_fig=True)
    agg_fig.tight_layout()
    
    const_figs = []
    for motif in const_motif_cwms:
        fig = viz_sequence.plot_weights(motif, figsize=(20, 4), return_fig=True)
        fig.tight_layout()
        const_figs.append(figure_to_vdom_image(fig))

    body = vdomh.tbody(*([
        vdomh.tr(
            vdomh.td(figure_to_vdom_image(agg_fig), rowspan=str(len(const_motif_keys))),
            vdomh.td(const_figs[0]),
            vdomh.td(const_motif_keys[0])
        )] + [
            vdomh.tr(
                vdomh.td(const_figs[j + 1]),
                vdomh.td(const_motif_keys[j + 1])
            ) for j in range(len(const_motif_keys) - 1)
        ]
    ))
    display(vdomh.table(colgroup, header, body))
    plt.close("all")


# Import the motif hits for each task
import_motif_hits = lambda hits_path: pd.read_csv(hits_path, sep="\t", header=0, index_col=False)
task_motif_hit_paths = get_motif_hit_paths()
task_motif_hits = []
for count_path, profile_path in task_motif_hit_paths:
    count_table = import_motif_hits(count_path)[["key", "peak_index"]]
    profile_table = import_motif_hits(profile_path)[["key", "peak_index"]]
    # We only need the key and peak index
    task_motif_hits.append({"C": count_table, "P": profile_table})


def get_hit_prevalence(hit_table, motif_keys):
    """
    Computes the motif prevalence from the hit table, as the proportion of
    peaks which have hits in the given motif keys.
    """
    total_peaks = len(np.unique(hit_table["peak_index"]))
    hit_peaks = len(np.unique(hit_table[np.isin(hit_table["key"], motif_keys)]["peak_index"]))
    return hit_peaks / total_peaks


# Create matrix of motif prevalences
motif_prevalences = np.zeros((len(core_motif_defs), len(task_motif_hits)))
for i, (_, motif_keys) in enumerate(core_motif_defs):
    # Map each task index to the motif keys belonging to this aggregate motif (if any)
    task_const_keys = {j : [] for j in range(len(task_motif_hits))}
    for const_key in motif_keys:
        tokens = const_key.split(":")
        task_index = int(tokens[0][1:])
        task_const_keys[task_index].append(":".join(tokens[1:]))
        
    # Get the sum of prevalences for each task
    for j in task_const_keys:
        # Extract the set of motif keys, separately for counts/profiles
        motif_keys = {}
        for key in task_const_keys[j]:
            tokens = key.split(":")
            # May be compound key
            for token in tokens:
                head, motif_key = token[0], token[1:]
                try:
                    motif_keys[head].append(motif_key)
                except KeyError:
                    motif_keys[head] = [motif_key]
        
        # Compute prevalence over the motif keys, taking the average over the count/profile heads
        motif_prevalences[i, j] = np.mean([
            get_hit_prevalence(task_motif_hits[j][head], motif_keys[head])
            for head in motif_keys.keys()
        ]) if motif_keys else 0
print(motif_prevalences)

[[6.66807852e-01 7.97510188e-01 7.47573627e-01 6.13414209e-01]
 [3.44493115e-01 3.69811603e-01 1.99223535e-01 3.38370027e-01]
 [1.75442817e-01 2.39599180e-01 3.55120086e-01 1.83465616e-01]
 [3.26952608e-01 2.99152522e-01 3.88646421e-01 3.55733996e-01]
 [3.43819655e-02 0.00000000e+00 5.01205013e-02 6.77126403e-02]
 [2.23066545e-01 1.50093366e-01 2.65666888e-01 2.44901415e-01]
 [3.07574617e-02 1.53805403e-01 3.20229418e-02 8.32919193e-02]
 [0.00000000e+00 0.00000000e+00 0.00000000e+00 2.24638332e-04]
 [9.37120843e-02 0.00000000e+00 0.00000000e+00 0.00000000e+00]]


def get_closest_motifs(query_motifs, target_motifs):
    """
    From a list of N target motifs in `target_motifs`, and a list of
    M query motifs in `query_motifs`, computes the most similar target
    motif to each query motif. Returns an N-array.
    Any ill-formed motifs (e.g. rows of all 0) are ignored.
    """
    # Build similarity matrix
    sim_matrix = create_motif_similarity_matrix(query_motifs, target_motifs, show_progress=False)
    return np.nanmax(sim_matrix, axis=1)


def get_benchmark_similarities(query_motifs, benchmark_motifs):
    """
    From a list of N target motifs and a list of T dictionaries mapping
    motif keys to motifs, computes an N x T matrix of the best motif
    similarity in each task to each query motif.
    """
    matrix = np.empty((len(query_motifs), len(benchmark_motifs)))
    for i in range(len(query_motifs)):
        for j in range(len(benchmark_motifs)):
            matrix[:, j] = get_closest_motifs(query_motifs, list(benchmark_motifs[j].values()))
    return matrix


query_motifs = [core_motif_cwms[pair[0]] for pair in core_motif_defs]

memechip_best_sims = get_benchmark_similarities(query_motifs, memechip_motifs)
homer_best_sims = get_benchmark_similarities(query_motifs, homer_motifs)
dichipmunk_best_sims = get_benchmark_similarities(query_motifs, dichipmunk_motifs)

/users/amtseng/tfmodisco/notebooks/reports/util.py:103: RuntimeWarning: invalid value encountered in true_divide
  motif_2 = motif_2 / np.sqrt(np.sum(motif_2 * motif_2, axis=1, keepdims=True))


database_motifs = import_database_pfms(motif_database_path)


query_motifs = {
    pair[0] : core_motif_pfms[pair[0]] for pair in core_motif_defs
}

database_best_sims = get_closest_tomtom_motif_similarities(
    query_motifs, database_motifs
)


# Create a good color map
benchmark_decay_scale = 8.0
database_decay_scale = 0.6

x = np.linspace(0, 1, 256)
transformed = np.exp(benchmark_decay_scale * x) / np.exp(benchmark_decay_scale)
benchmark_cmap = colors.LinearSegmentedColormap.from_list(
    "newOranges",
    plt.get_cmap("Oranges")(transformed)
)

transformed = np.exp(database_decay_scale * x) / np.exp(database_decay_scale)
database_cmap = colors.LinearSegmentedColormap.from_list(
    "newBlues",
    plt.get_cmap("Blues")(transformed)
)

# Show the color map with the original
fig, ax = plt.subplots(nrows=4, figsize=(20, 4), gridspec_kw={"hspace": 1})
grad = np.linspace(0, 1, 256)
ax[0].imshow(grad[None, :], aspect="auto", cmap="Oranges")
ax[1].imshow(grad[None, :], aspect="auto", cmap=benchmark_cmap)
ax[0].set_title("Original")
ax[1].set_title("Transformed")
ax[2].imshow(grad[None, :], aspect="auto", cmap="Blues")
ax[3].imshow(grad[None, :], aspect="auto", cmap=database_cmap)
ax[2].set_title("Original")
ax[3].set_title("Transformed")
for i in range(4):
    ax[i].axis("off")
plt.show()


grid_height = motif_prevalences.shape[0] * 2
grid_width = motif_prevalences.shape[1] * 2
width_spacing = 0.1
width = (grid_width * 2) + 2 + (width_spacing * 2)

fig, ax = plt.subplots(
    ncols=3, figsize=(width, grid_height),
    gridspec_kw={
        "height_ratios": [grid_height],
        "width_ratios": [grid_width, 2, grid_width],
        "wspace": width_spacing
    }
)

# Plot motif prevalences in each task

y, x = np.unravel_index(np.arange(motif_prevalences.size), motif_prevalences.shape)
x, y = x + 0.5, y + 0.5

# Set the radius such that the area is proportional to the prevalence
max_area = np.pi * (0.5 ** 2)
assert np.min(motif_prevalences) >= 0 and np.max(motif_prevalences) <= 1
area = motif_prevalences * max_area
radius = np.sqrt(area / np.pi)

# Plot the data
ax[0].set_xlim(0, motif_prevalences.shape[1])
ax[0].set_ylim(0, motif_prevalences.shape[0])
ax[0].set_xticks(np.arange(0.5, motif_prevalences.shape[1] + 0.5))
ax[0].set_yticks(np.arange(0.5, motif_prevalences.shape[0] + 0.5))
ax[0].set_xticklabels(["task_%d" % i for i in np.arange(0, motif_prevalences.shape[1])])
ax[0].set_yticklabels([pair[0] for pair in core_motif_defs][::-1])  # Flip y-axis
for i in range(motif_prevalences.shape[1]):
    ax[0].axvline(i, color="gray", alpha=0.2)
for i in range(motif_prevalences.shape[0]):
    ax[0].axhline(i, color="gray", alpha=0.2)

for i in range(motif_prevalences.shape[0]):
    for j in range(motif_prevalences.shape[1]):
        circle = plt.Circle((j + 0.5, motif_prevalences.shape[0] - i - 1 + 0.5), radius[i, j], alpha=0.3)
        ax[0].add_patch(circle)
        
# Plot the database motif distances

database_sims = np.array([database_best_sims[pair[0]][0] for pair in core_motif_defs])
database_labels = [database_best_sims[pair[0]][1] for pair in core_motif_defs]
database_hm = ax[1].imshow(database_sims[:, None], cmap=database_cmap)
fig.colorbar(database_hm)

# Create annotations
for i in range(len(database_labels)):
    ax[1].text(0, i, database_labels[i], ha="center", va="center")

ax[1].set_aspect("auto")
ax[1].set_xticks([0])
ax[1].set_xticklabels(["Database"])
ax[1].set_yticks([])
        
# Plot benchmark motif distances in each task

# Create the benchmark array to show
full_sim_matrix = np.empty((memechip_best_sims.shape[0], memechip_best_sims.shape[1] * 3))
sim_matrices = [memechip_best_sims, homer_best_sims, dichipmunk_best_sims]
for i in range(3):
    full_sim_matrix[:, np.arange(0, memechip_best_sims.shape[1] * 3, 3) + i] = sim_matrices[i]
benchmark_hm = ax[2].imshow(full_sim_matrix, cmap=benchmark_cmap)
fig.colorbar(benchmark_hm)
ax[2].set_aspect("auto")
ax[2].set_xticks(np.arange(full_sim_matrix.shape[1]))
ax[2].set_xticklabels(
    sum([["task_%d_%s" % (i, s) for s in ("M", "H", "D")] for i in range(memechip_best_sims.shape[1])], []),
    rotation=90
)
ax[2].set_yticks([])

fig.tight_layout()

plt.savefig(
    os.path.join(out_path, "%s_motifs_across_tasks.svg" % tf_name),
    format="svg"
)
plt.show()

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:76: UserWarning: This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.


# Save the results

def sanitize_name(name):
    return name.replace("/", ":")

out_hdf5 = os.path.join(out_path, "%s_motifs_and_similarities.h5" % tf_name)
with h5py.File(out_hdf5, "w") as f:
    # Aggregate motif names
    agg_motif_names = [pair[0] for pair in core_motif_defs]
    f.create_dataset("agg_motif_names", data=np.array(agg_motif_names).astype("S"))
    
    # Aggregate motif CWMs and PWMs
    agg_cwms = f.create_group("agg_motif_cwms")
    agg_pfms = f.create_group("agg_motif_pfms")
    for motif_name in agg_motif_names:
        sanitized_motif_name = sanitize_name(motif_name)
        agg_cwms.create_dataset(sanitized_motif_name, data=core_motif_cwms[motif_name])
        agg_pfms.create_dataset(sanitized_motif_name, data=core_motif_pfms[motif_name])
    
    # Motif prevalences
    f.create_dataset("agg_motif_prevalences", data=motif_prevalences)
    
    # Database similarities
    f.create_dataset("database_log_pvals", data=np.array([
        database_best_sims[motif_name][0] for motif_name in agg_motif_names
    ]))
    f.create_dataset("database_matches", data=np.array([
        database_best_sims[motif_name][1] for motif_name in agg_motif_names
    ]).astype("S"))
    
    # Benchmark similarity matrices
    f.create_dataset("memechip_best_sims", data=memechip_best_sims)
    f.create_dataset("homer_best_sims", data=homer_best_sims)
    f.create_dataset("dichipmunk_best_sims", data=dichipmunk_best_sims)


# Show aggregate motifs
for i, (key, _) in enumerate(core_motif_defs):
    display(vdomh.h3(key))
    fig = viz_sequence.plot_weights(core_motif_cwms[key], subticks_frequency=100, return_fig=True)
    fig.tight_layout()
    plt.savefig(
        os.path.join(out_path, "%s_agg_motif_%d.svg" % (tf_name, i)),
        format="svg"
    )
    plt.show()


# Show database matches
for i, (key, _) in enumerate(core_motif_defs):
    display(vdomh.h3(key))
    if database_best_sims[key][1] == "N/A":
        continue
    fig = viz_sequence.plot_weights(
        purine_rich_motif(read_motifs.pfm_to_pwm(database_motifs[database_best_sims[key][1]])),
        subticks_frequency=100, return_fig=True
    )
    fig.tight_layout()
    plt.savefig(
        os.path.join(out_path, "%s_agg_motif_%d_database_match.svg" % (tf_name, i)),
        format="svg"
    )
    plt.show()

Helper functions¶

Import motifs¶

Cluster the TF-MoDISco motifs across each task¶

FOXA

FOX v1

FOX v2

HNF4

HNF1

CEBP

AP-1

FOX dimer

CTCF

Extract constituent motif prevalences¶

Compute similarity of benchmark motifs to aggregated motifs¶

Compute similarity of benchmark motifs to aggregated motifs¶

Construct the plot¶

FOXA

FOX v1

FOX v2

HNF4

HNF1

CEBP

AP-1

FOX dimer

CTCF

FOXA

FOX v1

FOX v2

HNF4

HNF1

CEBP

AP-1

FOX dimer

CTCF

Aggregate motif	Constituent motifs	Constituent motif IDs
		T0:C0_2:P0_7
		T0:C0_7
		T0:C0_8:P0_2
		T1:C0_1:P0_2
		T1:C0_5
		T1:C0_6
		T1:P0_6
		T1:P0_7
		T2:C0_2:P0_2
		T2:C0_5
		T2:C0_6
		T2:C0_8
		T3:C0_2
		T3:C0_6
		T3:C0_7
		T3:P0_3
		T3:P0_8
		T0:C0_12
		T0:C0_11

Aggregate motif	Constituent motifs	Constituent motif IDs
		T0:C0_4:P0_3
		T1:C0_4
		T1:P0_4
		T2:C0_3:P0_3
		T2:C0_9
		T3:C0_4
		T3:C0_8
		T3:P0_4
		T3:C0_10

Aggregate motif	Constituent motifs	Constituent motif IDs
		T1:P0_1
		T0:C0_1:P0_1
		T2:C0_1
		T2:C0_7
		T2:P0_7
		T3:C0_1
		T3:P0_2