import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/notebooks/reports/"))
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
import plot.viz_sequence as viz_sequence
from util import figure_to_vdom_image, create_motif_similarity_matrix, purine_rich_motif
import h5py
import numpy as np
import sklearn.cluster
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook(range(1))

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:18: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`

<tqdm.notebook.tqdm_notebook at 0x7fe9ae08c2d0>


# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 16,
    "axes.labelsize": 16,
    "legend.fontsize": 18,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "font.family": "Roboto",
    "font.weight": "bold",
    "svg.fonttype": "none"
}
plt.rcParams.update(plot_params)

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.


if "TFM_TF_NAME" in os.environ:
    tf_name = os.environ["TFM_TF_NAME"]
    if "TFM_TASK_INDEX" in os.environ:
        task_index = int(os.environ["TFM_TASK_INDEX"])
    else:
        task_index = None
else:
    tf_name = "MAX"
    task_index = 2


if task_index is None:
    out_path = "/users/amtseng/tfmodisco/figures/fold_model_motif_reproducibility/%s_fold_model_motif_reproducibility" % tf_name
else:
    out_path = "/users/amtseng/tfmodisco/figures/fold_model_motif_reproducibility/%s_task%d_fold_model_motif_reproducibility" % (tf_name, task_index)
os.makedirs(out_path, exist_ok=True)

tf_num_tasks = {
    "E2F6": 2,
    "FOXA2": 4,
    "SPI1": 4,
    "CEBPB": 7,
    "MAX": 7,
    "GABPA": 9,
    "MAFK": 9,
    "JUND": 14,
    "NR3C1-reddytime": 16,
    "REST": 20
}

tf_best_model_types = {
    "E2F6": list("MM"),
    "FOXA2": list("SSMM"),
    "SPI1": list("MSSS"),
    "CEBPB": list("MMMMSMM"),
    "MAX": list("MMSMMSS"),
    "GABPA": list("MMMSMMMMM"),
    "MAFK": list("MMMMMMMMM"),
    "JUND": list("SMMSMSSSSSSSMS"),
    "NR3C1-reddytime": list("MMMSMMSMMMMSMMMM"),
    "REST": list("MMMMMMMMMSMMSMMSMMMM")
}

num_tasks = tf_num_tasks[tf_name]
best_model_types = tf_best_model_types[tf_name]

seed = 20210910

motif_file = "/users/amtseng/tfmodisco/results/motifs/tfmodisco/%s_tfmodisco_motifs.h5" % tf_name

multitask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/multitask_profile_finetune_stats.tsv"
singletask_finetune_model_def_tsv = "/users/amtseng/tfmodisco/results/model_stats/singletask_profile_finetune_stats.tsv"


sanitize_fold = lambda fold: "F" + fold.split("_")[1]
sanitize_task = lambda task: "T" + task.split("_")[1]

def import_all_tfmodisco_motifs(motif_file, task_index=None):
    """
    From a file containing all motifs for that TF, imports the
    trimmed CWMs of all conditions as a dictionary. Returns a dictionary
    of motifs, mapping a unique key to the trimmed CWM.
    If `task_index` is given, only import the single-task motifs and
    fine-tuned multi-task motifs for that task.
    """
    motifs = {}
    with h5py.File(motif_file, "r") as f:
        # Multi-task, all 10 folds
        if task_index is None:
            mt = f["multitask"]
            for fold in mt.keys():
                mt_fold = mt[fold]
                for key in ("count", "profile"):
                    mt_fold_key = mt_fold[key]
                    for motif_key in mt_fold_key.keys():
                        if motif_key.startswith("0_"):
                            # Positive metacluster only
                            name = "MT:%s:%s:%s" % (sanitize_fold(fold), key[0].upper(), motif_key)
                            motifs[name] = purine_rich_motif(mt_fold_key[motif_key]["cwm_trimmed"][:])
        
        # Single-task, all 10 folds for all tasks or just a single task
        st = f["singletask"]
        for task in (st.keys() if task_index is None else ["task_%d" % task_index]):
            st_task = st[task]
            for fold in st_task.keys():
                st_task_fold = st_task[fold]
                for key in ("count", "profile"):
                    st_task_fold_key = st_task_fold[key]
                    for motif_key in st_task_fold_key.keys():
                        if motif_key.startswith("0_"):
                            # Positive metacluster only
                            name = "ST:%s:%s:%s:%s" % (sanitize_task(task), sanitize_fold(fold), key[0].upper(), motif_key)
                            motifs[name] = purine_rich_motif(st_task_fold_key[motif_key]["cwm_trimmed"][:])
        
        # Multi-task fine-tune, all tasks plus aggregate task or just a single task
        mtft = f["multitask_finetune"]
        for task in (mtft.keys() if task_index is None else ["task_%d" % task_index]):
            mtft_task = mtft[task]
            for key in ("count", "profile"):
                mtft_task_key = mtft_task[key]
                for motif_key in mtft_task_key.keys():
                    if motif_key.startswith("0_"):
                        # Positive metacluster only
                        name = "MTFT:%s:%s:%s" % (sanitize_task(task), key[0].upper(), motif_key)
                        motifs[name] = purine_rich_motif(mtft_task_key[motif_key]["cwm_trimmed"][:])
        
        # Single-task fine-tune, all tasks or just a single task
        stft = f["singletask_finetune"]
        for task in (stft.keys() if task_index is None else ["task_%d" % task_index]):
            stft_task = stft[task]
            for key in ("count", "profile"):
                stft_task_key = stft_task[key]
                for motif_key in stft_task_key.keys():
                    if motif_key.startswith("0_"):
                        # Positive metacluster only
                        name = "STFT:%s:%s:%s" % (sanitize_task(task), key[0].upper(), motif_key)
                        motifs[name] = purine_rich_motif(stft_task_key[motif_key]["cwm_trimmed"][:])
        
    return motifs


def get_num_seqlets(motif_key):
    """
    Fetches the number of seqlets initially mapped to the given motif from the
    original TF-MoDISco file.
    """
    base_path = "/users/amtseng/tfmodisco/results/tfmodisco"
    cond_type = motif_key.split(":")[0]
    if cond_type == "MTFT":
        # Get best fold
        best_mt_fold = None
        with open(multitask_finetune_model_def_tsv, "r") as f:
            for line in f:
                tokens = line.strip().split("\t")
                if tokens[0] == tf_name and int(tokens[1]) == num_tasks - 1:
                    assert best_mt_fold is None
                    best_mt_fold = int(tokens[2])
        task, head, pattern = motif_key.split(":")[1:]
        task = task[1:]
        head = "profile" if head == "P" else "count"
        path = os.path.join(
            base_path,
            "multitask_profile_finetune",
            "%s_multitask_profile_finetune_fold%d" % (tf_name, best_mt_fold)
        )
        if task == "agg":
            path = os.path.join(
                path,
                "%s_multitask_profile_finetune_fold%d_%s_tfm.h5" % (tf_name, best_mt_fold, head)
            )
        else:
            path = os.path.join(
                path,
                "%s_multitask_profile_finetune_task%s_fold%d_%s_tfm.h5" % (tf_name, task, best_mt_fold, head)
            )
    elif cond_type == "STFT":
        # Get best fold
        best_st_folds = []
        with open(singletask_finetune_model_def_tsv, "r") as f:
            for line in f:
                tokens = line.strip().split("\t")
                if tokens[0] == tf_name:
                    best_st_folds.append(int(tokens[2]))
        assert len(best_st_folds) == num_tasks
        task, head, pattern = motif_key.split(":")[1:]
        task = task[1:]
        head = "profile" if head == "P" else "count"
        best_st_fold = best_st_folds[int(task)]
        path = os.path.join(
            base_path,
            "singletask_profile_finetune",
            "%s_singletask_profile_finetune_fold%d" % (tf_name, best_st_fold),
            "task_%s" % task,
            "%s_singletask_profile_finetune_task%s_fold%d_%s_tfm.h5" % (tf_name, task, best_st_fold, head)
        )        
    elif cond_type == "MT":
        fold, head, pattern = motif_key.split(":")[1:]
        fold = fold[1:]
        head = "profile" if head == "P" else "count"
        path = os.path.join(
            base_path,
            "multitask_profile",
            "%s_multitask_profile_fold%s" % (tf_name, fold),
            "%s_multitask_profile_fold%s_%s_tfm.h5" % (tf_name, fold, head)
        )
    elif cond_type == "ST":
        task, fold, head, pattern = motif_key.split(":")[1:]
        task = task[1:]
        fold = fold[1:]
        head = "profile" if head == "P" else "count"
        path = os.path.join(
            base_path,
            "singletask_profile",
            "%s_singletask_profile_fold%s" % (tf_name, fold),
            "task_%s" % task,
            "%s_singletask_profile_task%s_fold%s_%s_tfm.h5" % (tf_name, task, fold, head)
        )
    else:
        raise ValueError("Unknown model condition type")
        
    # Now that we have the path, import the number of seqlets for the given pattern
    metacluster_key, pattern_key = pattern.split("_")
    with h5py.File(path, "r") as f:
        metacluster = f["metacluster_idx_to_submetacluster_results"]["metacluster_%s" % metacluster_key]
        patterns = metacluster["seqlets_to_patterns_result"]["patterns"]
        seqlets = patterns["pattern_%s" % pattern_key]["seqlets_and_alnmts"]["seqlets"]
        return len(seqlets)


def make_cdf(ax, data, steps=1000, density=False, inverse=False, **kwargs):
    """
    Plots a CDF to the given axes. `steps` is the number of steps in the
    CDF. If `inverse` is True, plots an inverse CDF (AKA survivorship plot).
    `density` is whether or not to normalize to fractions.
    """
    hist, bin_edges = np.histogram(data, bins=steps)
    if inverse:
        cumsum = len(data) - np.cumsum(hist)
    else:
        cumsum = np.cumsum(hist)
    if density:
        cumsum = cumsum / len(data)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.
    ax.step(bin_centers, cumsum, **kwargs)


def cluster_matrix_indices(matrix, num_clusters):
    """
    Clusters matrix using k-means. Always clusters on the first
    axis. Returns the indices needed to optimally order the matrix
    by clusters.
    """
    if len(matrix) == 1:
        # Don't cluster at all
        return np.array([0])

    num_clusters = min(num_clusters, len(matrix))
    
    # Perform k-means clustering
    kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters)
    cluster_assignments = kmeans.fit_predict(matrix)

    # Order cluster centers to determine optimal ordering
    kmeans_centers = kmeans.cluster_centers_
    cluster_order = np.argsort(-np.mean(kmeans_centers, axis=1))

    # Order the peaks so that the cluster assignments follow the optimal ordering
    cluster_inds = []
    for cluster_id in cluster_order:
        cluster_inds.append(np.where(cluster_assignments == cluster_id)[0])
    cluster_inds = np.concatenate(cluster_inds)
    return cluster_inds, cluster_assignments[cluster_inds]


def plot_motif_examples(motif_keys, motifs, cond_names, cond_inds):
    """
    Creates a table of motifs. The full set of motifs and keys
    are given, and `cond_names` and `cond_inds` are parallel lists
    containing conditions and a list of indices to plot for each.
    A pair of columns is created for each condition.
    """
    assert len(cond_names) == len(cond_inds)
    cols = []
    heads = []
    for i in range(len(cond_names)):
        cols.append(vdomh.col(style={"width": str(10 / len(cond_inds)) + "%"}))
        cols.append(vdomh.col(style={"width": str(90 / len(cond_inds)) + "%"}))
        heads.append(vdomh.th("Motif ID", style={"text-align": "center"}))
        heads.append(vdomh.th("Motif (%s)" % cond_names[i], style={"text-align": "center"}))
    colgroup = vdomh.colgroup(*cols)
    header = vdomh.thead(vdomh.tr(*heads))

    max_length = max([len(inds) for inds in cond_inds])
    rows = []
    for row_i in range(max_length):
        row = []
        for col_i in range(len(cond_names)):
            if row_i < len(cond_inds[col_i]):
                motif_i = cond_inds[col_i][row_i]
                fig = viz_sequence.plot_weights(
                    motifs[motif_i], subticks_frequency=100, figsize=(20, 4), return_fig=True
                )
                fig.tight_layout()
                row.extend([
                    vdomh.td(motif_keys[motif_i]),
                    vdomh.td(figure_to_vdom_image(fig))
                ])
            else:
                row.extend([vdomh.td(), vdomh.td()])
        rows.append(vdomh.tr(*row))
    display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
    plt.close("all")


motif_keys, motifs = zip(*import_all_tfmodisco_motifs(motif_file, task_index).items())
motif_keys, motifs = list(motif_keys), list(motifs)


raw_sim_matrix = create_motif_similarity_matrix(motifs, mean_normalize=False)


# Extract the number of seqlest for each motif
motif_num_seqlets = np.array([
    get_num_seqlets(key) for key in tqdm.notebook.tqdm(motif_keys)
])


# First, create a dictionary mapping each condition to the column indices
cond_inds = {}
for i, key in enumerate(motif_keys):
    cond_key = ":".join(key.split(":")[:-2])  # Strip off last two pieces
    try:
        cond_inds[cond_key].append(i)
    except KeyError:
        cond_inds[cond_key] = [i]
for key in list(cond_inds.keys()):
    cond_inds[key] = np.array(cond_inds[key])


# Normalize the number of seqlets over each condition separately
motif_prop_seqlets = np.empty(len(motif_num_seqlets))
for inds in cond_inds.values():
    motif_prop_seqlets[inds] = motif_num_seqlets[inds] / np.sum(motif_num_seqlets[inds])


# Construct a limited version of the similarity matrix, where the columns
# are the highest similarity over the entire condition
cond_sim_matrix = np.empty((len(motif_keys), len(cond_inds)))
for i, cond_key in enumerate(cond_inds.keys()):
    cond_sim_matrix[:, i] = np.max(raw_sim_matrix[:, cond_inds[cond_key]], axis=1)


# Order similarity matrix rows by optimal cluster order
order_inds, cluster_ids = cluster_matrix_indices(
    cond_sim_matrix,
    max([len(inds) for inds in cond_inds.values()])
    # Maximum number of clusters should be biggest condition size
)
sorted_cond_sim_matrix = cond_sim_matrix[order_inds]

# Order motif keys and motifs
sorted_motif_keys = [motif_keys[i] for i in order_inds]
sorted_motifs = [motifs[i] for i in order_inds]
sorted_motif_prop_seqlets = np.array([motif_prop_seqlets[i] for i in order_inds])

# Sort each matrix row independently, most to least similar
sorted_cond_sim_matrix = -np.sort(-sorted_cond_sim_matrix, axis=1)  # Negatives make it sort descending


fig, ax = plt.subplots()
make_cdf(ax, np.ravel(cond_sim_matrix), density=True)
ax.set_title("CDF of similarities")
ax.set_xlabel("Similarity score")
plt.show()

# # Normalize the matrix entries to percentile
# cond_sim_matrix_perc = scipy.stats.rankdata(cond_sim_matrix).reshape(cond_sim_matrix.shape) / cond_sim_matrix.size


fig, ax = plt.subplots()
ax.set_title("Motif prevalence by seqlets")
ax.set_xlabel("Motif rank by reproducibility")
ax.set_ylabel("Fraction of seqlets")
ax.plot(sorted_motif_prop_seqlets)
plt.show()


# Extract the CWM score of each motif in order
sorted_motif_scores = np.array([np.max(cwm) for cwm in sorted_motifs])

# Separate out profile and count CWMs
profile_inds = np.array([i for i in range(len(sorted_motif_keys)) if sorted_motif_keys[i].split(":")[-2] == "P"])
count_inds = np.array([i for i in range(len(sorted_motif_keys)) if sorted_motif_keys[i].split(":")[-2] == "C"])

fig, ax = plt.subplots()
make_cdf(ax, sorted_motif_scores[profile_inds], density=True, label="Profile")
make_cdf(ax, sorted_motif_scores[count_inds], density=True, label="Count")
ax.set_title("CDF of CWM score")
ax.set_xlabel("CWM score")
plt.legend()
plt.show()

# Normalize the CWM scores to map to percentile (i.e. map CDF x-axis to y-axis)
# Do counts/profile CWMs separately
sorted_motif_score_percs = np.empty_like(sorted_motif_scores)
sorted_motif_score_percs[profile_inds] = scipy.stats.rankdata(sorted_motif_scores[profile_inds]) / len(profile_inds)
sorted_motif_score_percs[count_inds] = scipy.stats.rankdata(sorted_motif_scores[count_inds]) / len(count_inds)


# Create good non-uniform color maps for the data
sim_decay_scale = 1.5
score_decay_scale = 2.5
seq_decay_scale = 1.5

x = np.linspace(0, 1, 256)

transformed = np.power(np.flip(x), sim_decay_scale)
sim_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "sim_cmap", plt.get_cmap("Greys")(transformed)
)

transformed = np.exp(score_decay_scale * x) / np.exp(score_decay_scale)
score_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "sim_cmap", plt.get_cmap("Purples")(transformed)
)

transformed = np.exp(seq_decay_scale * x) / np.exp(seq_decay_scale)
seq_cmap = matplotlib.colors.LinearSegmentedColormap.from_list(
    "sim_cmap", plt.get_cmap("Oranges")(transformed)
)

# Show the color map with the original
fig, ax = plt.subplots(nrows=6, figsize=(20, 6), gridspec_kw={"hspace": 1})
grad = np.linspace(0, 1, 256)
ax[0].imshow(grad[None, :], aspect="auto", cmap="Greys_r")
ax[1].imshow(grad[None, :], aspect="auto", cmap=sim_cmap)
ax[2].imshow(grad[None, :], aspect="auto", cmap="Purples")
ax[3].imshow(grad[None, :], aspect="auto", cmap=score_cmap)
ax[4].imshow(grad[None, :], aspect="auto", cmap="Oranges")
ax[5].imshow(grad[None, :], aspect="auto", cmap=seq_cmap)
for i in range(6):
    ax[i].axis("off")
    if i % 2 == 0:
        ax[i].set_title("Original")
    else:
        ax[i].set_title("Transformed")
plt.show()


grid_height = max(5, len(motif_keys) * 0.1)
grid_width = max(5, len(cond_inds) * 0.1)
height_spacing, width_spacing = 0.1, 0.1
height = grid_height + 3 + (height_spacing * 3)
width = grid_width + 6 + (width_spacing * 3)

fig, ax = plt.subplots(
    nrows=4, ncols=4, figsize=(width, height),
    gridspec_kw={
        "height_ratios": [grid_height / 0.1, 1, 1, 1],
        "width_ratios": [grid_width / 0.1, 2, 2, 2],
        "hspace": height_spacing,
        "wspace": width_spacing
    }
)

sim_hmap = ax[0, 0].imshow(sorted_cond_sim_matrix, cmap=sim_cmap, aspect="auto")  # auto aspect to stretch
score_hmap = ax[0, 2].imshow(sorted_motif_score_percs[:, None], cmap=score_cmap, aspect="auto")
seq_hmap = ax[0, 3].imshow(sorted_motif_prop_seqlets[:, None], cmap=seq_cmap, aspect="auto")

# Cluster IDs may not be in order, so sort the groups
diffs = np.concatenate([[0], np.diff(cluster_ids)])
diffs[diffs != 0] = 1
sorted_cluster_ids = np.cumsum(diffs)

cmap = cm.get_cmap("tab20")
group_colors = cmap(sorted_cluster_ids[:, None] % 20 / 20)[:, :, :3]
ax[0, 1].imshow(group_colors, aspect="auto")
# we need to first translate into colors instead of specifying `cmap=` to `imshow`,
# otherwise the colors will be different

ax[0, 0].set_yticks(np.arange(len(motif_keys)))
ax[0, 0].set_yticklabels(sorted_motif_keys)
ax[0, 0].set_xticks([])
ax[0, 0].set_title("Similarity of closest motif")
ax[0, 1].set_title("Grp")
ax[0, 2].set_title("CWM")
ax[0, 3].set_title("#Seq")

for i in range(1, 4):
    ax[0, i].set_yticks([])
    ax[0, i].set_xticks([])
    for j in range(1, 4):
        ax[j, i].axis("off")
        ax[j, i].axis("off")
        ax[j, i].axis("off")

fig.colorbar(sim_hmap, cax=ax[1, 0], orientation="horizontal")
fig.colorbar(score_hmap, cax=ax[2, 0], orientation="horizontal")
fig.colorbar(seq_hmap, cax=ax[3, 0], orientation="horizontal")

plt.savefig(
    os.path.join(out_path, "%s_fold_model_motif_reproducibility.svg" % tf_name),
    format="svg"
)
plt.show()


# Show the average similarity of each cluster, and overall
cmap = cm.get_cmap("tab20")

colgroup = vdomh.colgroup(
    vdomh.col(style={"width": "10%"}),
    vdomh.col(style={"width": "10%"}),
    vdomh.col(style={"width": "10%"}),
    vdomh.col(style={"width": "10%"}),
    vdomh.col(style={"width": "10%"})
)
header = vdomh.thead(vdomh.tr(
    vdomh.th("Cluster ID", style={"text-align": "center"}),
    vdomh.th("Cluster color", style={"text-align": "center"}),
    vdomh.th("Number of motifs", style={"text-align": "center"}),
    vdomh.th("Mean similarity", style={"text-align": "center"}),
    vdomh.th("Standard error", style={"text-align": "center"})
))

rows = []
means, stderrs, cluster_sizes = [], [], []
for cluster_id in np.unique(sorted_cluster_ids):
    match_inds = np.where(sorted_cluster_ids == cluster_id)[0]
    
    sims = sorted_cond_sim_matrix[match_inds][:, 1:]  # First column is always 1, so remove it
    
    mean, stderr = np.mean(sims), np.std(sims) / np.sqrt(sims.size)
    cluster_size = len(match_inds)
    means.append(mean)
    stderrs.append(stderr)
    cluster_sizes.append(cluster_size)
        
    group_color = matplotlib.colors.rgb2hex(cmap(cluster_id % 20 / 20))
    fig, ax = plt.subplots(figsize=(0.3, 0.3))
    circle = plt.Circle((0, 0), 10, color=group_color)
    ax.add_patch(circle)
    ax.axis("off")
    plt.axis("scaled")
    
    rows.append(vdomh.tr(
        vdomh.td(str(cluster_id)),
        vdomh.td(figure_to_vdom_image(fig)),
        vdomh.td(str(cluster_size)),
        vdomh.td(str(mean)),
        vdomh.td(str(stderr))
    ))

total_sims = sorted_cond_sim_matrix[:, 1:]
total_mean, total_stderr = np.mean(total_sims), np.std(total_sims) / np.sqrt(total_sims.size)
rows.append(vdomh.tr(
    vdomh.td("All"),
    vdomh.td(""),
    vdomh.td(str(len(total_sims))),
    vdomh.td(str(total_mean)),
    vdomh.td(str(total_stderr))
))

display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
plt.close("all")

# Save results of clustering
out_hdf5 = os.path.join(out_path, "%s_clusters.h5" % tf_name)
with h5py.File(out_hdf5, "w") as f:
    f.create_dataset("raw_sim_matrix", data=raw_sim_matrix)
    f.create_dataset("sorted_cond_sim_matrix", data=sorted_cond_sim_matrix)
    f.create_dataset("sorted_motif_prop_seqlets", data=sorted_motif_prop_seqlets)
    f.create_dataset("sorted_motif_scores", data=sorted_motif_scores)
    f.create_dataset("sorted_motif_keys", data=sorted_motif_keys)
    f.create_dataset("sorted_cluster_ids", data=sorted_cluster_ids)
    f.create_dataset("cluster_sizes", data=cluster_sizes)
    f.create_dataset("cluster_mean_sims", data=means)
    f.create_dataset("cluster_stderr_sims", data=stderrs)
    f.create_dataset("total_mean_sim", data=total_mean)
    f.create_dataset("total_stderr_sim", data=total_stderr)

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:33: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).


# Show some examples of motifs in each cluster, from highest to lowest reproducibility
num_examples_to_show = 5
cmap = cm.get_cmap("tab20")
rng = np.random.RandomState(seed)

for cluster_id in np.unique(sorted_cluster_ids):
    match_inds = np.where(sorted_cluster_ids == cluster_id)[0]
    display(vdomh.h3("Cluster %d (%d motifs)" % (cluster_id, len(match_inds))))
    
    group_color = matplotlib.colors.rgb2hex(cmap(cluster_id % 20 / 20))
    fig, ax = plt.subplots(figsize=(1, 1))
    circle = plt.Circle((0, 0), 10, color=group_color)
    ax.add_patch(circle)
    ax.axis("off")
    plt.axis("scaled")
    plt.show()
    
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "10%"}),
        vdomh.col(style={"width": "90%"})
    )
    header = vdomh.thead(vdomh.tr(
        vdomh.th("Motif ID", style={"text-align": "center"}),
        vdomh.th("Motif", style={"text-align": "center"})
    ))

    rows = []
    inds_to_show = rng.choice(match_inds, size=min(num_examples_to_show, len(match_inds)), replace=False)
    for i in inds_to_show:
        fig = viz_sequence.plot_weights(
            sorted_motifs[i], subticks_frequency=100, figsize=(20, 4), return_fig=True
        )
        fig.tight_layout()
        plt.savefig(
            os.path.join(out_path, "%s_cluster_%d_motif_%s.svg" % (tf_name, cluster_id, sorted_motif_keys[i])),
            format="svg"
        )
        rows.append(vdomh.tr(
            vdomh.td(sorted_motif_keys[i]),
            vdomh.td(figure_to_vdom_image(fig))
        ))
    display(vdomh.table(colgroup, header, vdomh.tbody(*rows)))
    plt.close("all")

Cluster ID	Number of motifs	Mean similarity	Standard error
0	28	0.9296519089171698	0.0026098546193969044
1	24	0.9006607026521601	0.002446331456165322
2	21	0.9009254000304207	0.003471411413794625
3	21	0.875879928851423	0.004503995468821779
4	9	0.8671420889103525	0.00964666226848454
5	18	0.8582832246845791	0.004773028073576986
6	5	0.8545331949558836	0.009470704214919638
7	16	0.8536334627581016	0.004599246217035412
8	3	0.8204041791284304	0.01723670140862845
9	13	0.8226208137623816	0.00790765023015964
10	14	0.8226802103617673	0.004534972155148469
11	10	0.7923868767663449	0.006672353397896703
12	9	0.7881807797016992	0.005964020494437429
13	5	0.7771620302549282	0.006947093829074759
14	7	0.754713769113028	0.006541738788713363
15	5	0.7603428013695588	0.005730879204027705
16	5	0.745739605206518	0.008059043084228598
17	2	0.7406717330483308	0.013791818203482725
18	4	0.7312529865111913	0.01526900325642838
19	5	0.720957315067407	0.009231425739910863
20	6	0.6833186176249878	0.009287435666359378
21	1	0.5947102254870338	0.01785884630663637
22	1	0.58631053000691	0.022529014774953386
23	1	0.4757842016046434	0.015402560483585714
24	1	0.42651795722577907	0.013526385536503812
All	234	0.8407140917312591	0.001957361784337858

Motif ID	Motif
ST:T12:F9:P:0_0
STFT:T12:P:0_1
ST:T12:F10:C:0_0
ST:T12:F3:P:0_0
ST:T12:F3:C:0_0

Motif ID	Motif
ST:T12:F1:C:0_2
STFT:T12:C:0_3
ST:T12:F4:P:0_3
ST:T12:F1:C:0_6
ST:T12:F3:C:0_4

Motif ID	Motif
ST:T12:F9:P:0_2
ST:T12:F9:C:0_5
STFT:T12:P:0_5
ST:T12:F5:P:0_8
ST:T12:F2:P:0_0

Motif ID	Motif
ST:T12:F6:C:0_1
ST:T12:F5:P:0_10
ST:T12:F8:P:0_6
ST:T12:F6:P:0_3
ST:T12:F8:C:0_10

Helper functions¶

Import motifs and construct similarity matrix¶

Plot similarity of each motif to closest motif in each condition¶

Cluster 0 (28 motifs)

Cluster 1 (24 motifs)

Cluster 2 (21 motifs)

Cluster 3 (21 motifs)

Cluster 4 (9 motifs)

Cluster 5 (18 motifs)

Cluster 6 (5 motifs)

Cluster 7 (16 motifs)

Cluster 8 (3 motifs)

Cluster 9 (13 motifs)

Cluster 10 (14 motifs)

Cluster 11 (10 motifs)

Cluster 12 (9 motifs)

Cluster 13 (5 motifs)

Cluster 14 (7 motifs)

Cluster 15 (5 motifs)

Cluster 16 (5 motifs)

Cluster 17 (2 motifs)

Cluster 18 (4 motifs)

Cluster 19 (5 motifs)

Cluster 20 (6 motifs)

Cluster 21 (1 motifs)

Cluster 22 (1 motifs)

Cluster 23 (1 motifs)

Cluster 24 (1 motifs)

Motif ID	Motif
ST:T12:F8:P:0_2
ST:T12:F6:P:0_1
STFT:T12:P:0_2
ST:T12:F5:C:0_2
ST:T12:F3:C:0_1

Motif ID	Motif
ST:T12:F9:C:0_4
ST:T12:F10:C:0_3
ST:T12:F9:P:0_1
ST:T12:F7:C:0_4
ST:T12:F8:C:0_7

Motif ID	Motif
ST:T12:F6:P:0_8
ST:T12:F2:P:0_7
ST:T12:F6:C:0_6
ST:T12:F5:P:0_7
ST:T12:F8:P:0_7

Motif ID	Motif
ST:T12:F5:C:0_4
ST:T12:F4:C:0_1
ST:T12:F5:P:0_6
ST:T12:F4:P:0_2
ST:T12:F6:C:0_5

Motif ID	Motif
ST:T12:F6:P:0_9
ST:T12:F9:P:0_8
ST:T12:F6:P:0_4
ST:T12:F10:P:0_3
ST:T12:F10:C:0_2

Motif ID	Motif
ST:T12:F3:P:0_11
ST:T12:F3:C:0_6
ST:T12:F3:P:0_8
ST:T12:F3:P:0_9
ST:T12:F9:C:0_6

Motif ID	Motif
MTFT:T12:C:0_7
MTFT:T12:P:0_5
MTFT:T12:C:0_2
MTFT:T12:C:0_0
MTFT:T12:C:0_5

Motif ID	Motif
ST:T12:F2:P:0_3
ST:T12:F2:P:0_10
ST:T12:F2:C:0_1
ST:T12:F9:P:0_10
ST:T12:F2:P:0_12

Motif ID	Motif
ST:T12:F5:C:0_7
ST:T12:F5:P:0_12
ST:T12:F5:P:0_13
ST:T12:F5:C:0_5
ST:T12:F5:P:0_11

Motif ID	Motif
ST:T12:F7:P:0_12
ST:T12:F7:P:0_9
ST:T12:F7:P:0_10
ST:T12:F7:P:0_8
ST:T12:F7:P:0_11

Motif ID	Motif
ST:T12:F4:C:0_5
ST:T12:F4:P:0_4
ST:T12:F7:C:0_5

Motif ID	Motif
ST:T12:F4:C:0_7
ST:T12:F4:C:0_9
ST:T12:F4:P:0_5
ST:T12:F4:C:0_8
ST:T12:F4:P:0_7

Motif ID	Motif
ST:T12:F8:P:0_8
ST:T12:F8:C:0_8
ST:T12:F8:P:0_12
ST:T12:F8:P:0_10
ST:T12:F8:P:0_9

Motif ID	Motif
ST:T12:F1:P:0_7
ST:T12:F1:P:0_8

Motif ID	Motif
ST:T12:F7:P:0_7
ST:T12:F6:P:0_7
ST:T12:F8:P:0_5
ST:T12:F7:P:0_5

Motif ID	Motif
ST:T12:F9:P:0_9
ST:T12:F3:P:0_12
ST:T12:F10:C:0_6
ST:T12:F10:P:0_7
ST:T12:F9:P:0_12

Motif ID	Motif
MTFT:T12:P:0_1
MTFT:T12:C:0_1
MTFT:T12:P:0_7
MTFT:T12:P:0_4
MTFT:T12:P:0_2