import os
import h5py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager


# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 16,
    "axes.labelsize": 16,
    "legend.fontsize": 18,
    "xtick.labelsize": 8,
    "ytick.labelsize": 8,
    "font.family": "Roboto",
    "font.weight": "bold",
    "svg.fonttype": "none"
}
plt.rcParams.update(plot_params)

/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.


out_path = "/users/amtseng/tfmodisco/figures/motifs_across_tasks_summary"
os.makedirs(out_path, exist_ok=True)

sim_hdf5s_base = "/users/amtseng/tfmodisco/figures/motifs_across_tasks"

sim_hdf5_paths = []  # (TF name, path)
for subdir in os.listdir(sim_hdf5s_base):
    if not os.path.isdir(os.path.join(sim_hdf5s_base, subdir)):
        continue
    h5_name = [item for item in os.listdir(os.path.join(sim_hdf5s_base, subdir)) if item.endswith(".h5")][0]
    tf_name = subdir.split("_")[0]
    sim_hdf5_paths.append(
        (tf_name, os.path.join(sim_hdf5s_base, subdir, h5_name))
    )


def get_bench_sims(sim_hdf5_path, bench_type, only_with_database_match=False):
    """
    Imports the full set of closest similarities of a benchmark to the
    TF-MoDISco motifs, for the given benchmark type, across all cell types.
    If a TF-MoDISco motif was not found in a cell type, that benchmark
    similarity is not included.
    If `only_with_database_match` is True, also limit to TF-MoDISco motifs
    that had a JASPAR match.
    """
    with h5py.File(sim_hdf5_path, "r") as f:
        sims = f["%s_best_sims" % bench_type][:]
        
        prevalences = f["agg_motif_prevalences"][:]
        prev_mask = prevalences > 0
        
        db_mask = np.ones(sims.shape).astype(bool)
        if only_with_database_match:
            db_matches = f["database_matches"][:].astype(str)
            db_mask[db_matches == "N/A"] = False
        
        
        return np.ravel(sims)[np.ravel(prev_mask & db_mask)]


def make_cdf(ax, data, steps=1000, density=False, inverse=False, **kwargs):
    """
    Plots a CDF to the given axes. `steps` is the number of steps in the
    CDF. If `inverse` is True, plots an inverse CDF (AKA survivorship plot).
    `density` is whether or not to normalize to fractions.
    """
    hist, bin_edges = np.histogram(data, bins=steps)
    if inverse:
        cumsum = len(data) - np.cumsum(hist)
    else:
        cumsum = np.cumsum(hist)
    if density:
        cumsum = cumsum / len(data)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.
    ax.step(bin_centers, cumsum, **kwargs)


memechip_sims = np.concatenate([
    get_bench_sims(pair[1], "memechip") for pair in sim_hdf5_paths
])
homer_sims = np.concatenate([
    get_bench_sims(pair[1], "homer") for pair in sim_hdf5_paths
])
dichipmunk_sims = np.concatenate([
    get_bench_sims(pair[1], "dichipmunk") for pair in sim_hdf5_paths
])


fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
bins = np.linspace(0, 1, 30)

for i, (bench_name, bench_sims) in enumerate([
    ("MEME", memechip_sims), ("HOMER", homer_sims), ("DiChIPMunk", dichipmunk_sims)
]):
    ax[0].hist(bench_sims, bins=bins, density=True, alpha=0.3, histtype="step", label=bench_name)

    cdf_hist, cdf_bins = np.histogram(bench_sims, bins=bins)
    cdf_cumsum = np.cumsum(cdf_hist) / len(bench_sims)
    bin_centers = (cdf_bins[:-1] + cdf_bins[1:]) / 2.
    ax[1].step(bin_centers, cdf_cumsum, label=bench_name)

ax[0].set_title("Histogram of best motif similarity to TF-MoDISco motif")
ax[1].set_title("Cumulative distribution of best similarity to TF-MoDISco motif")
ax[0].set_ylabel("Proportion of motifs")
ax[1].set_ylabel("Proportion of motifs with similarity at least s")

ax[0].set_xlabel("Cosine similarity")
ax[1].set_xlabel("Cosine similarity s")

ax[0].legend()
ax[1].legend()

plt.savefig(
    os.path.join(out_path, "motifs_across_tasks_benchmark_similarities.svg"),
    format="svg"
)
plt.show()


memechip_sims_with_db = np.concatenate([
    get_bench_sims(pair[1], "memechip", True) for pair in sim_hdf5_paths
])
homer_sims_with_db = np.concatenate([
    get_bench_sims(pair[1], "homer", True) for pair in sim_hdf5_paths
])
dichipmunk_sims_with_db = np.concatenate([
    get_bench_sims(pair[1], "dichipmunk", True) for pair in sim_hdf5_paths
])


fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
bins = np.linspace(0, 1, 30)

for i, (bench_name, bench_sims) in enumerate([
    ("MEME", memechip_sims_with_db), ("HOMER", homer_sims_with_db), ("DiChIPMunk", dichipmunk_sims_with_db)
]):
    ax[0].hist(bench_sims, bins=bins, alpha=0.3, histtype="step", density=True, label=bench_name)

    cdf_hist, cdf_bins = np.histogram(bench_sims, bins=bins)
    cdf_cumsum = np.cumsum(cdf_hist) / len(bench_sims)
    bin_centers = (cdf_bins[:-1] + cdf_bins[1:]) / 2.
    ax[1].step(bin_centers, cdf_cumsum, label=bench_name)

ax[0].set_title("Histogram of best motif similarity to TF-MoDISco motif")
ax[1].set_title("Cumulative distribution of best similarity to TF-MoDISco motif")
ax[0].set_ylabel("Proportion of motifs")
ax[1].set_ylabel("Proportion of motifs with similarity at least s")

ax[0].set_xlabel("Cosine similarity")
ax[1].set_xlabel("Cosine similarity s")

ax[0].legend()
ax[1].legend()

plt.savefig(
    os.path.join(out_path, "motifs_across_tasks_benchmark_similarities_with_db.svg"),
    format="svg"
)
plt.show()

Helper functions¶

Import and plot reproducibility values¶