import os
import h5py
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 16,
"axes.labelsize": 16,
"legend.fontsize": 18,
"xtick.labelsize": 8,
"ytick.labelsize": 8,
"font.family": "Roboto",
"font.weight": "bold",
"svg.fonttype": "none"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
out_path = "/users/amtseng/tfmodisco/figures/motifs_across_tasks_summary"
os.makedirs(out_path, exist_ok=True)
sim_hdf5s_base = "/users/amtseng/tfmodisco/figures/motifs_across_tasks"
sim_hdf5_paths = [] # (TF name, path)
for subdir in os.listdir(sim_hdf5s_base):
if not os.path.isdir(os.path.join(sim_hdf5s_base, subdir)):
continue
h5_name = [item for item in os.listdir(os.path.join(sim_hdf5s_base, subdir)) if item.endswith(".h5")][0]
tf_name = subdir.split("_")[0]
sim_hdf5_paths.append(
(tf_name, os.path.join(sim_hdf5s_base, subdir, h5_name))
)
def get_bench_sims(sim_hdf5_path, bench_type, only_with_database_match=False):
"""
Imports the full set of closest similarities of a benchmark to the
TF-MoDISco motifs, for the given benchmark type, across all cell types.
If a TF-MoDISco motif was not found in a cell type, that benchmark
similarity is not included.
If `only_with_database_match` is True, also limit to TF-MoDISco motifs
that had a JASPAR match.
"""
with h5py.File(sim_hdf5_path, "r") as f:
sims = f["%s_best_sims" % bench_type][:]
prevalences = f["agg_motif_prevalences"][:]
prev_mask = prevalences > 0
db_mask = np.ones(sims.shape).astype(bool)
if only_with_database_match:
db_matches = f["database_matches"][:].astype(str)
db_mask[db_matches == "N/A"] = False
return np.ravel(sims)[np.ravel(prev_mask & db_mask)]
def make_cdf(ax, data, steps=1000, density=False, inverse=False, **kwargs):
"""
Plots a CDF to the given axes. `steps` is the number of steps in the
CDF. If `inverse` is True, plots an inverse CDF (AKA survivorship plot).
`density` is whether or not to normalize to fractions.
"""
hist, bin_edges = np.histogram(data, bins=steps)
if inverse:
cumsum = len(data) - np.cumsum(hist)
else:
cumsum = np.cumsum(hist)
if density:
cumsum = cumsum / len(data)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.
ax.step(bin_centers, cumsum, **kwargs)
memechip_sims = np.concatenate([
get_bench_sims(pair[1], "memechip") for pair in sim_hdf5_paths
])
homer_sims = np.concatenate([
get_bench_sims(pair[1], "homer") for pair in sim_hdf5_paths
])
dichipmunk_sims = np.concatenate([
get_bench_sims(pair[1], "dichipmunk") for pair in sim_hdf5_paths
])
fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
bins = np.linspace(0, 1, 30)
for i, (bench_name, bench_sims) in enumerate([
("MEME", memechip_sims), ("HOMER", homer_sims), ("DiChIPMunk", dichipmunk_sims)
]):
ax[0].hist(bench_sims, bins=bins, density=True, alpha=0.3, histtype="step", label=bench_name)
cdf_hist, cdf_bins = np.histogram(bench_sims, bins=bins)
cdf_cumsum = np.cumsum(cdf_hist) / len(bench_sims)
bin_centers = (cdf_bins[:-1] + cdf_bins[1:]) / 2.
ax[1].step(bin_centers, cdf_cumsum, label=bench_name)
ax[0].set_title("Histogram of best motif similarity to TF-MoDISco motif")
ax[1].set_title("Cumulative distribution of best similarity to TF-MoDISco motif")
ax[0].set_ylabel("Proportion of motifs")
ax[1].set_ylabel("Proportion of motifs with similarity at least s")
ax[0].set_xlabel("Cosine similarity")
ax[1].set_xlabel("Cosine similarity s")
ax[0].legend()
ax[1].legend()
plt.savefig(
os.path.join(out_path, "motifs_across_tasks_benchmark_similarities.svg"),
format="svg"
)
plt.show()
memechip_sims_with_db = np.concatenate([
get_bench_sims(pair[1], "memechip", True) for pair in sim_hdf5_paths
])
homer_sims_with_db = np.concatenate([
get_bench_sims(pair[1], "homer", True) for pair in sim_hdf5_paths
])
dichipmunk_sims_with_db = np.concatenate([
get_bench_sims(pair[1], "dichipmunk", True) for pair in sim_hdf5_paths
])
fig, ax = plt.subplots(ncols=2, figsize=(20, 8))
bins = np.linspace(0, 1, 30)
for i, (bench_name, bench_sims) in enumerate([
("MEME", memechip_sims_with_db), ("HOMER", homer_sims_with_db), ("DiChIPMunk", dichipmunk_sims_with_db)
]):
ax[0].hist(bench_sims, bins=bins, alpha=0.3, histtype="step", density=True, label=bench_name)
cdf_hist, cdf_bins = np.histogram(bench_sims, bins=bins)
cdf_cumsum = np.cumsum(cdf_hist) / len(bench_sims)
bin_centers = (cdf_bins[:-1] + cdf_bins[1:]) / 2.
ax[1].step(bin_centers, cdf_cumsum, label=bench_name)
ax[0].set_title("Histogram of best motif similarity to TF-MoDISco motif")
ax[1].set_title("Cumulative distribution of best similarity to TF-MoDISco motif")
ax[0].set_ylabel("Proportion of motifs")
ax[1].set_ylabel("Proportion of motifs with similarity at least s")
ax[0].set_xlabel("Cosine similarity")
ax[1].set_xlabel("Cosine similarity s")
ax[0].legend()
ax[1].legend()
plt.savefig(
os.path.join(out_path, "motifs_across_tasks_benchmark_similarities_with_db.svg"),
format="svg"
)
plt.show()