import h5py
import numpy as np
import scipy.stats
import scipy.special
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import pandas as pd
import json
import os
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
# Plotting defaults
font_manager.fontManager.ttflist.extend(
font_manager.createFontList(
font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
)
)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 15,
"font.size": 13,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
# Define parameters/fetch arguments
tf_name = os.environ["TFM_RESULTS_TF_NAME"]
num_tasks = int(os.environ["TFM_RESULTS_NUM_TASKS"])
best_multitask_fold = int(os.environ["TFM_RESULTS_BEST_MULTITASK_FOLD"])
best_singletask_folds = [int(x) for x in os.environ["TFM_RESULTS_BEST_SINGLETASK_FOLDS"].split(",")]
if "TFM_PERF_CACHE" in os.environ:
perf_cache_dir = os.environ["TFM_PERF_CACHE"]
else:
perf_cache_dir = None
print("TF name: %s" % tf_name)
print("Number of tasks: %d" % num_tasks)
print("Best multi-task fold: %d" % best_multitask_fold)
print("Best single-task folds: %s" % " ".join([str(x) for x in best_singletask_folds]))
print("Saved performance cache: %s" % perf_cache_dir)
preds_base = "/users/amtseng/tfmodisco/results/peak_predictions/"
# Paths for all 10 folds for multi-task profile models
multitask_preds_paths = [
os.path.join(
preds_base,
"multitask_profile",
"{0}_multitask_profile_fold{1}/{0}_multitask_profile_fold{1}_pred_perf.h5".format(tf_name, fold)
) for fold in range(1, 11)
]
# Paths for all 10 folds for single-task profile models, for each task
singletask_preds_paths = [
[
os.path.join(
preds_base,
"singletask_profile",
"{0}_singletask_profile_fold{1}/task_{2}/{0}_singletask_profile_task{2}_fold{1}_pred_perf.h5".format(
tf_name, fold, task_index
)
) for fold in range(1, 11)
] for task_index in range(num_tasks)
]
# Path for fine-tuned multi-task profile model
multitask_finetuned_preds_path = os.path.join(
preds_base,
"multitask_profile_finetune",
"{0}_multitask_profile_finetune_fold{1}/{0}_multitask_profile_finetune_fold{1}_pred_perf.h5".format(
tf_name, best_multitask_fold
)
)
# Paths for fine-tuned single-task profile models, for each task
singletask_finetuned_preds_paths = [
os.path.join(
preds_base,
"singletask_profile_finetune",
"{0}_singletask_profile_finetune_fold{1}/task_{2}/{0}_singletask_profile_finetune_task{2}_fold{1}_pred_perf.h5".format(
tf_name, fold, task_index
)
) for task_index, fold in enumerate(best_singletask_folds)
]
# Path for upper-bound and lower-bound performance metrics
perf_bounds_path = "/users/amtseng/tfmodisco/results/performance_bounds/{0}_performance_bounds.h5".format(tf_name)
# File specifications (including peak files)
files_spec_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/config/{0}/{0}_training_paths.json".format(tf_name)
with open(files_spec_json, "r") as f:
files_spec = json.load(f)
# Chromosome split definition (i.e. test set chromosomes)
chrom_splits_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/chrom_splits.json"
with open(chrom_splits_json, "r") as f:
chrom_splits = json.load(f)
all_fold_test_chroms = [
chrom_splits[str(fold)]["test"] for fold in range(1, 11)
]
best_multitask_fold_test_chroms = chrom_splits[str(best_multitask_fold)]["test"]
best_singletask_fold_test_chroms = [chrom_splits[str(fold)]["test"] for fold in best_singletask_folds]
input_length = 2114
profile_length = 1000
if perf_cache_dir:
os.makedirs(perf_cache_dir, exist_ok=True)
Helper functions for subsetting predictions/performance metrics to peak subsets, extracting performance metrics, and plotting.
def subset_coord_inds(superset_coords, subset_coords):
"""
Both `superset_coords` and `subset_coords` are Pandas DataFrames of
coordinates. This will return the indices within `superset_coords`
(indices being the 0-indexed row numbers) that correspond to the
coordinates in `subset_coords`. Returns a sorted NumPy array of indices.
Note that if there are duplicates in either set of coordinates, they
will be dropped (i.e. the returned indices will be unique).
"""
inds = superset_coords.reset_index().drop_duplicates(["chrom", "start", "end"]).merge(
subset_coords.reset_index(), on=["chrom", "start", "end"]
).sort_values("index_y")["index_x"].values
return np.unique(inds)
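# Illustrative sketch (not part of the original analysis): a toy example of how
# `subset_coord_inds` maps a subset of coordinates back to 0-indexed row
# numbers of the superset; the coordinate values below are made up
_toy_superset = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr2", "chr2"],
    "start": [100, 200, 100, 300],
    "end": [300, 400, 300, 500]
})
_toy_subset = pd.DataFrame({
    "chrom": ["chr2", "chr1"],
    "start": [100, 200],
    "end": [300, 400]
})
# Rows 1 and 2 of the superset match the subset, so the (sorted, unique) result is [1, 2]
assert np.array_equal(subset_coord_inds(_toy_superset, _toy_subset), np.array([1, 2]))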
def filter_coords_by_chroms(coords, chroms):
"""
Given a Pandas DataFrame of coordinates (column names "chrom", "start",
and "end"), filters for the chromosomes in `chroms`
"""
return coords[coords["chrom"].isin(chroms)]
def extract_performance_metrics(pred_perf_path, coord_sets=None, task_inds=None, pool=False):
"""
Extracts the set of performance metrics from a saved predictions/performance
HDF5 file. If specified, filters for coordinates that are in `coord_sets`.
`coord_sets` is a list of coordinate DataFrames, and a set of metrics will
be fetched for each table of coordinates provided. Otherwise, will simply
return all coordinates available (i.e. only one coordinate set with all
coordinates). If `task_inds` is specified, it must be a list of indices
parallel to `coord_sets`. For each coordinate set, the metrics extracted
will be for that task index only. If unspecified, the average over all tasks
is retained for each coordinate set.
If `pool` is specified, then all resulting vectors and scalars will be pooled
together to form a single vector (or scalar).
Returns a dictionary of the following form:
`nll`: [
<NLL vector for coord set 1>
<NLL vector for coord set 2>
...
],
`count_mse`: [
<MSE scalar for coord set 1>
<MSE scalar for coord set 2>
...
]
...
"""
result = {}
reader = h5py.File(pred_perf_path, "r")
coord_reader = reader["coords"]
pred_reader = reader["predictions"]
perf_reader = reader["performance"]
# First, get the set of indices within the HDF5 predictions/performance that
# correspond to the given coordinate sets
if coord_sets is None:
subset_inds = [np.arange(perf_reader["nll"].shape[0])] # The entire vector
else:
# Import the DataFrame of coordinates in this HDF5
pred_perf_coords = pd.DataFrame(
data={
"chrom": coord_reader["coords_chrom"][:].astype(str),
"start": coord_reader["coords_start"][:],
"end": coord_reader["coords_end"][:]
}
)
subset_inds = [
subset_coord_inds(pred_perf_coords, coord_set)
for coord_set in coord_sets
]
# If we didn't specify a task index for each coordinate set, just use
# all tasks; either way, let's get each set of task indices into a
# NumPy array form
if task_inds is None:
task_inds = [np.arange(perf_reader["nll"].shape[1]) for _ in range(len(subset_inds))]
else:
task_inds = [np.array([i]) for i in task_inds]
# For each performance metric, for each coordinate set/task index, extract
# the metrics values
for key in perf_reader.keys():
metrics_list = []
for i in range(len(subset_inds)):
subset = subset_inds[i]
tasks = task_inds[i]
if len(perf_reader[key].shape) >= 2: # Profile metric
metrics_list.append(
np.mean(perf_reader[key][subset][:, tasks], axis=1)
)
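            # Count metrics are stored as 1-D datasets and are skipped here;
            # they are recomputed from the raw counts further below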
result[key] = metrics_list
# Now extract the count values for recomputing the count metrics
log_true_counts, log_pred_counts = [], []
if coord_sets is None:
# Technically, there should be no need to recompute the count metrics,
# since these were saved for the entire set, but for simplicity of code
# we can recompute them anyways, since they are cheap operations
log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][:] + 1)))
log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][:]))
else:
for i in range(len(subset_inds)):
subset = subset_inds[i]
tasks = task_inds[i]
log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][subset][:, tasks] + 1)))
log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][subset][:, tasks]))
reader.close()
# Perform any pooling
if pool:
for key in result:
result[key] = [np.concatenate(result[key])]
log_true_counts = [np.concatenate(log_true_counts)]
log_pred_counts = [np.concatenate(log_pred_counts)]
    # Now recompute the count metrics
result["count_mse"] = [
np.mean(np.square(t - p)) for t, p in zip(log_true_counts, log_pred_counts)
]
result["count_pearson"] = [
scipy.stats.pearsonr(t, p)[0] for t, p in zip(log_true_counts, log_pred_counts)
]
result["count_spearman"] = [
scipy.stats.spearmanr(t, p)[0] for t, p in zip(log_true_counts, log_pred_counts)
]
# Convert NaNs to 0
for key in result:
result[key] = [np.nan_to_num(a) for a in result[key]]
return result
def extract_performance_bounds(perf_bounds_path, input_length, coord_sets=None, task_inds=None, pool=False):
"""
Extracts the set of lower and upper bound performance metrics from a saved
    HDF5 file. `input_length` is the length of the input sequence to use.
If specified, filters for coordinates that are in `coord_sets`.
`coord_sets` is a list of coordinate DataFrames, and a set of metrics will
be fetched for each table of coordinates provided. Otherwise, will simply
return all coordinates available (i.e. only one coordinate set with all
coordinates). If `task_inds` is specified, it must be a list of indices
parallel to `coord_sets`. For each coordinate set, the metrics extracted
will be for that task index only. If unspecified, the average over all tasks
is retained for each coordinate set.
If `pool` is specified, then all resulting vectors and scalars will be pooled
together to form a single vector (or scalar).
Note that only profile metrics will be extracted.
Returns a dictionary of the following form:
`nll`: [
(
<lower-bound NLL vector for coord set 1>,
<upper-bound NLL vector for coord set 1>
),
(
<lower-bound NLL vector for coord set 2>,
<upper-bound NLL vector for coord set 2>
),
...
],
`count_mse`: [
(
<lower-bound MSE scalar for coord set 1>,
<upper-bound MSE scalar for coord set 1>
),
(
<lower-bound MSE scalar for coord set 2>,
<upper-bound MSE scalar for coord set 2>
),
...
]
...
"""
result = {}
reader = h5py.File(perf_bounds_path, "r")
coord_reader = reader["coords"]
lower_perf_reader = reader["performance_lower"]
upper_perf_reader = reader["performance_upper"]
# First, get the set of indices within the HDF5 predictions/performance that
# correspond to the given coordinate sets
if coord_sets is None:
subset_inds = [np.arange(lower_perf_reader["nll"].shape[0])] # The entire vector
else:
# Import the DataFrame of coordinates in this HDF5
perf_coords = pd.DataFrame(
data={
"chrom": coord_reader["coords_chrom"][:].astype(str),
"start": coord_reader["coords_start"][:],
"end": coord_reader["coords_end"][:]
}
)
# Unlike the predictions, the performance bounds are computed solely
# based on profiles, so their saved coordinates have a different
# length, although they are centered at the same summit. So we need
# to re-pad them.
perf_coords["midpoint"] = (perf_coords["start"] + perf_coords["end"]) // 2
perf_coords["start"] = perf_coords["midpoint"] - (input_length // 2)
perf_coords["end"] = perf_coords["start"] + input_length
del perf_coords["midpoint"]
subset_inds = [
subset_coord_inds(perf_coords, coord_set)
for coord_set in coord_sets
]
# If we didn't specify a task index for each coordinate set, just use
# all tasks; either way, let's get each set of task indices into a
# NumPy array form
if task_inds is None:
task_inds = [np.arange(lower_perf_reader["nll"].shape[1]) for _ in range(len(subset_inds))]
else:
task_inds = [np.array([i]) for i in task_inds]
# For each performance metric, for each coordinate set/task index, extract
# the metrics values for lower and upper bound
for key in lower_perf_reader.keys():
metrics_list = []
for i in range(len(subset_inds)):
subset = subset_inds[i]
tasks = task_inds[i]
if len(lower_perf_reader[key].shape) >= 2: # Profile metric
metrics_list.append(
(
np.mean(lower_perf_reader[key][subset][:, tasks], axis=1),
np.mean(upper_perf_reader[key][subset][:, tasks], axis=1)
)
)
else: # Count metric
# For the performance bounds, we'll use counts metrics as-is without
# recomputing them; this is because the counts metrics bounds are
# distributed very uniformly
metrics_list.append(
(
np.mean(lower_perf_reader[key][tasks]),
np.mean(upper_perf_reader[key][tasks]),
)
)
result[key] = metrics_list
reader.close()
# Perform any pooling
if pool:
for key in result:
if type(result[key][0][0]) is np.ndarray:
lower = np.concatenate([pair[0] for pair in result[key]])
upper = np.concatenate([pair[1] for pair in result[key]])
else:
lower = np.mean([pair[0] for pair in result[key]])
upper = np.mean([pair[1] for pair in result[key]])
result[key] = [(lower, upper)]
# Convert NaNs to 0
for key in result:
result[key] = [(np.nan_to_num(p1), np.nan_to_num(p2)) for p1, p2 in result[key]]
return result
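# Illustrative sketch (not part of the original analysis): the re-padding step
# used inside `extract_performance_bounds`. The bounds file stores
# profile-length (1000 bp) coordinates, so they are re-centered to
# `input_length` around the same midpoint before matching against the padded
# peak coordinates; the coordinate values below are made up
_toy_bounds_coords = pd.DataFrame({"chrom": ["chr1"], "start": [10000], "end": [11000]})
_toy_mid = (_toy_bounds_coords["start"] + _toy_bounds_coords["end"]) // 2
_toy_start = _toy_mid - (input_length // 2)
_toy_end = _toy_start + input_length
# For input_length = 2114: the midpoint is 10500, so start = 9443 and end = 11557
assert int(_toy_start.iloc[0]) == 9443 and int(_toy_end.iloc[0]) == 11557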
def extract_counts(pred_perf_path, coord_sets=None, task_inds=None):
"""
    Extracts the true and predicted log counts from a saved
    predictions/performance HDF5 file. Returns a pair of lists
    (`log_true_counts`, `log_pred_counts`), each containing a single flattened
    array of log counts pooled over all coordinate sets. If specified, filters
    for coordinates that are in `coord_sets`. `coord_sets` is a list of
    coordinate DataFrames, and a set of counts will be fetched for each table
    of coordinates provided. Otherwise, will simply return all coordinates
    available. If `task_inds` is specified, it must be a list of indices
    parallel to `coord_sets`. For each coordinate set, the counts extracted
    will be for that task index only. If unspecified, all tasks are retained
    for each coordinate set.
    """
reader = h5py.File(pred_perf_path, "r")
coord_reader = reader["coords"]
pred_reader = reader["predictions"]
perf_reader = reader["performance"]
# First, get the set of indices within the HDF5 predictions/performance that
# correspond to the given coordinate sets
if coord_sets is None:
subset_inds = [np.arange(perf_reader["nll"].shape[0])] # The entire vector
else:
# Import the DataFrame of coordinates in this HDF5
pred_perf_coords = pd.DataFrame(
data={
"chrom": coord_reader["coords_chrom"][:].astype(str),
"start": coord_reader["coords_start"][:],
"end": coord_reader["coords_end"][:]
}
)
subset_inds = [
subset_coord_inds(pred_perf_coords, coord_set)
for coord_set in coord_sets
]
# If we didn't specify a task index for each coordinate set, just use
# all tasks; either way, let's get each set of task indices into a
# NumPy array form
if task_inds is None:
task_inds = [np.arange(perf_reader["nll"].shape[1]) for _ in range(len(subset_inds))]
else:
task_inds = [np.array([i]) for i in task_inds]
# Extract the count values
log_true_counts, log_pred_counts = [], []
if coord_sets is None:
log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][:] + 1)))
log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][:]))
else:
for i in range(len(subset_inds)):
subset = subset_inds[i]
tasks = task_inds[i]
log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][subset][:, tasks] + 1)))
log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][subset][:, tasks]))
reader.close()
log_true_counts = [np.concatenate(log_true_counts)]
log_pred_counts = [np.concatenate(log_pred_counts)]
return log_true_counts, log_pred_counts
def min_max_normalize(lower_arr, query_arr, upper_arr):
"""
    Given three same-size arrays, returns a version of the query array that is
    min-max normalized to the lower and upper bounds entrywise (i.e. a value
    equal to the lower bound maps to 0, and a value equal to the upper bound
    maps to 1). The normalized values are clipped to the range [0, 1].
"""
normed = (query_arr - lower_arr) / (upper_arr - lower_arr)
# Clamp between 0 and 1
return np.clip(normed, 0, 1)
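# Illustrative sketch (not part of the original analysis): min-max normalizing
# a toy metric vector against per-entry lower/upper bounds; values falling
# outside the bounds are clipped to [0, 1]
_toy_lower = np.array([1.0, 2.0, 3.0])
_toy_upper = np.array([3.0, 4.0, 5.0])
_toy_query = np.array([2.0, 5.0, 2.0])
# (2-1)/(3-1) = 0.5; (5-2)/(4-2) = 1.5 -> clipped to 1; (2-3)/(5-3) = -0.5 -> clipped to 0
assert np.allclose(min_max_normalize(_toy_lower, _toy_query, _toy_upper), [0.5, 1.0, 0.0])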
def make_cdf(ax, data, steps=1000, density=False, inverse=False, **kwargs):
"""
Plots a CDF to the given axes. `steps` is the number of steps in the
CDF. If `inverse` is True, plots an inverse CDF (AKA survivorship plot).
`density` is whether or not to normalize to fractions.
"""
hist, bin_edges = np.histogram(data, bins=steps)
if inverse:
cumsum = len(data) - np.cumsum(hist)
else:
cumsum = np.cumsum(hist)
if density:
cumsum = cumsum / len(data)
bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.
ax.step(bin_centers, cumsum, **kwargs)
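# Illustrative sketch (not part of the original analysis): plotting a CDF and
# an inverse CDF of some random toy data with `make_cdf`, roughly as is done
# for the profile metrics below
_toy_fig, _toy_ax = plt.subplots()
_toy_data = np.random.RandomState(0).rand(1000)
make_cdf(_toy_ax, _toy_data, steps=100, density=True, inverse=False, label="CDF")
make_cdf(_toy_ax, _toy_data, steps=100, density=True, inverse=True, label="Inverse CDF")
_toy_ax.legend()
plt.close(_toy_fig)  # Close the toy figure so it does not clutter the notebook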
def plot_performances(perf_dict, title_header, cond_labels=None):
"""
Creates plots for a performance dictionary of the following form:
`nll`: [
<NLL vector for cond 1>
<NLL vector for cond 2>
...
],
`count_mse`: [
<MSE scalar for cond 1>
<MSE scalar for cond 2>
...
]
...
For profile metrics (i.e. where the metrics are a vector), creates CDF
plots. For count metrics (i.e. where the metrics are a scalar), creates bar
plots.
    `cond_labels`, if given, must be a list of labels parallel to the list of
    vectors or scalars for each metric.
Returns a dictionary mapping metric keys to figures.
"""
figs = {}
    # Determine the number of conditions from any metric's list of values
    num_conds = len(next(iter(perf_dict.values())))
    labels = cond_labels if cond_labels else [None] * num_conds
# Profile metrics
for metric_key, metric_name in [
("nll", "NLL"), ("cross_ent", "Cross Entropy"), ("jsd", "JSD"), ("profile_mse", "Profile MSE"),
("profile_pearson", "Profile Pearson"), ("profile_spearman", "Profile Spearman")
]:
fig, ax = plt.subplots(figsize=(20, 8))
if metric_key.endswith("pearson") or metric_key.endswith("spearman"):
inverse = True
title = "%s: inverse CDF of %s" % (title_header, metric_name)
else:
inverse = False
title = "%s: CDF of %s" % (title_header, metric_name)
for i, arr in enumerate(perf_dict[metric_key]):
# Remove any NaNs
arr = arr[np.isfinite(arr)]
# If the data is out of the range [0, 1] (e.g. with correlations), clip them
arr = np.clip(arr, 0, 1)
make_cdf(ax, arr, steps=1000, density=True, inverse=inverse, label=labels[i])
if cond_labels:
plt.legend()
ax.set_title(title)
plt.show()
figs[metric_key] = fig
# Count metrics
for metric_key, metric_name in [
("count_mse", "Count MSE"), ("count_pearson", "Count Pearson"), ("count_spearman", "Count Spearman")
]:
fig, ax = plt.subplots(figsize=(20, 5))
label_locs = np.arange(len(perf_dict[metric_key])) # Location of labels
bars = ax.bar(
label_locs, perf_dict[metric_key], color="mediumorchid", alpha=0.7
)
ax.set_title("%s: %s" % (title_header, metric_name))
if cond_labels:
ax.set_xticks(label_locs)
ax.set_xticklabels(labels)
for bar in bars:
height = bar.get_height()
ax.annotate(
"%.3f" % height, xy=(bar.get_x() + (bar.get_width() / 2), height),
xytext=(0, 1), textcoords="offset points", ha="center", va="bottom"
)
plt.show()
figs[metric_key] = fig
return figs
def plot_count_scatterplots(counts, cond_labels=None):
"""
Creates a row of scatterplots for the counts of the following form:
[
(
<array of true log counts>
<array of predicted log counts>
),
...
]
    `cond_labels`, if given, must be a list of labels parallel to the list of
    count pairs.
    Returns the figure.
"""
fig, ax = plt.subplots(ncols=len(counts), figsize=(8 * len(counts), 8))
if len(counts) == 1:
ax = [ax]
for i, (true_log_counts, pred_log_counts) in enumerate(counts):
ax[i].scatter(np.ravel(true_log_counts), np.ravel(pred_log_counts), alpha=0.02)
ax[i].set_xlabel("True log counts")
ax[i].set_ylabel("Predicted log counts")
(min_x, max_x), (min_y, max_y) = ax[i].get_xlim(), ax[i].get_ylim()
min_both, max_both = min(min_x, min_y), max(max_x, max_y)
ax[i].set_xlim(min_both, max_both)
ax[i].set_ylim(min_both, max_both)
ax[i].plot(
[min_both, max_both], [min_both, max_both],
color="black", linestyle="--", alpha=0.3, zorder=0
)
if cond_labels:
ax[i].set_title(cond_labels[i])
plt.show()
return fig
def save_perf_dict(perf_dict, path):
"""
Saves the performance dictionary mapping keys to lists of arrays
or lists of scalars.
"""
with h5py.File(path, "w") as f:
for key in perf_dict:
group = f.create_group(key)
for i, data in enumerate(perf_dict[key]):
if type(data) is np.ndarray:
group.create_dataset(str(i), data=data, compression="gzip")
else:
group.create_dataset(str(i), data=data)
def load_perf_dict(path):
"""
    Loads a performance dictionary (mapping keys to lists of arrays or lists
    of scalars) that was saved by `save_perf_dict`.
"""
perf_dict = {}
with h5py.File(path, "r") as f:
for key in f:
perf_dict[key] = []
for i in range(len(f[key].keys())):
if not f[key][str(i)].shape:
perf_dict[key].append(f[key][str(i)][()])
else:
perf_dict[key].append(f[key][str(i)][:])
return perf_dict
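# Illustrative sketch (not part of the original analysis): round-tripping a toy
# performance dictionary through `save_perf_dict`/`load_perf_dict`; the file
# name is hypothetical and the file is removed afterward
if perf_cache_dir:
    _toy_perf_dict = {"nll": [np.array([0.1, 0.2, 0.3])], "count_mse": [0.05]}
    _toy_path = os.path.join(perf_cache_dir, "toy_perf_dict.h5")
    save_perf_dict(_toy_perf_dict, _toy_path)
    _toy_loaded = load_perf_dict(_toy_path)
    assert np.allclose(_toy_loaded["nll"][0], _toy_perf_dict["nll"][0])
    os.remove(_toy_path)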
For each task, import the set of peaks belonging to that task. This allows us to get a set of indices for coordinates in the saved predictions/performance files that correspond to each task.
# Read in tables containing the peak coordinates, padded to `input_length`
task_coords = []
assert len(files_spec["peak_beds"]) == num_tasks
for peak_bed_path in files_spec["peak_beds"]:
table = pd.read_csv(
peak_bed_path, sep="\t", header=None, # Infer compression
names=[
"chrom", "peak_start", "peak_end", "name", "score",
"strand", "signal", "pval", "qval", "summit_offset"
]
)
# Add summit location column:
table["summit"] = table["peak_start"] + table["summit_offset"]
# Add start and end columns, at proper length
table["start"] = table["summit"] - (input_length // 2)
table["end"] = table["start"] + input_length
task_coords.append(table[["chrom", "start", "end"]])
all_coords = pd.concat(task_coords)
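# Quick sanity check (illustrative, not part of the original analysis): every
# padded peak coordinate should span exactly `input_length` bases
assert all(((coords["end"] - coords["start"]) == input_length).all() for coords in task_coords)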
A comparison of the test-set performance (pooled over all tasks) between the multi-task profile models trained on each of the 10 folds and the multi-task profile model fine-tuned on the best-performing fold. Profile metrics (other than the correlations) are min-max normalized to the lower/upper performance bounds.
compute_dict = True
if perf_cache_dir:
# Import if it exists
perf_dict_path = os.path.join(perf_cache_dir, "multitask_allfolds_finetuned.h5")
if os.path.exists(perf_dict_path) and os.stat(perf_dict_path).st_size:
multitask_perf_dict = load_perf_dict(perf_dict_path)
compute_dict = False
if compute_dict:
# Make sure to extract metrics for each task individually, but pool them together
# in the end to form one set of metrics
# Get bounds for each fold
multitask_bounds_perf_dict = {}
for chroms in all_fold_test_chroms:
d = extract_performance_bounds(
perf_bounds_path, input_length,
coord_sets=[
filter_coords_by_chroms(task_coords[task_index], chroms)
for task_index in range(num_tasks)
],
task_inds=list(range(num_tasks)),
pool=True
)
for key in d:
try:
multitask_bounds_perf_dict[key].extend(d[key])
except KeyError:
multitask_bounds_perf_dict[key] = d[key]
multitask_perf_dict = {key : [] for key in multitask_bounds_perf_dict}
# 10 folds
for fold_index, pred_path in enumerate(multitask_preds_paths):
perf_dict = extract_performance_metrics(
pred_path,
coord_sets=[
filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])
for task_index in range(num_tasks)
],
task_inds=list(range(num_tasks)),
pool=True
)
for key in multitask_perf_dict.keys():
vals = perf_dict[key][0]
if not key.endswith("pearson") and not key.endswith("spearman"):
lower = multitask_bounds_perf_dict[key][fold_index][0]
upper = multitask_bounds_perf_dict[key][fold_index][1]
vals = min_max_normalize(lower, vals, upper)
multitask_perf_dict[key].append(vals)
# Fine-tuned
perf_dict = extract_performance_metrics(
multitask_finetuned_preds_path,
coord_sets=[
filter_coords_by_chroms(task_coords[task_index], best_multitask_fold_test_chroms)
for task_index in range(num_tasks)
],
task_inds=list(range(num_tasks)),
pool=True
)
for key in multitask_perf_dict.keys():
vals = perf_dict[key][0]
if not key.endswith("pearson") and not key.endswith("spearman"):
# Fold index is best_multitask_fold - 1
lower = multitask_bounds_perf_dict[key][best_multitask_fold - 1][0]
upper = multitask_bounds_perf_dict[key][best_multitask_fold - 1][1]
vals = min_max_normalize(lower, vals, upper)
multitask_perf_dict[key].append(vals)
if perf_cache_dir:
save_perf_dict(multitask_perf_dict, perf_dict_path)
# Plot the CDFs and bar plots of the profile/counts metrics
cond_labels = [("Fold %d" % i) for i in range(1, 11)]
cond_labels += ["Fine-tuned\n(fold %d)" % best_multitask_fold]
metric_figs = plot_performances(
multitask_perf_dict, "%s multi-task models" % tf_name, cond_labels=cond_labels
)
# Import the total counts values and predictions
multitask_counts = []
for fold_index, pred_path in enumerate(multitask_preds_paths):
multitask_counts.append(extract_counts(
pred_path,
coord_sets=[
filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])
for task_index in range(num_tasks)
],
task_inds=list(range(num_tasks))
))
multitask_counts.append(extract_counts(
multitask_finetuned_preds_path,
coord_sets=[
filter_coords_by_chroms(task_coords[task_index], best_multitask_fold_test_chroms)
for task_index in range(num_tasks)
],
task_inds=list(range(num_tasks)),
))
# Show counts scatterplots for each condition
scatter_fig = plot_count_scatterplots(multitask_counts, cond_labels=cond_labels)
if perf_cache_dir:
for key in metric_figs:
metric_figs[key].savefig(os.path.join(perf_cache_dir, "multitask_allfolds_finetuned_%s.png" % key))
scatter_fig.savefig(os.path.join(perf_cache_dir, "multitask_allfolds_finetuned_scattercounts.png"))
For each task, a comparison of the test-set performance between the single-task profile models trained on each of the 10 folds and the single-task model fine-tuned on that task's best-performing fold. Profile metrics (other than the correlations) are min-max normalized to the lower/upper performance bounds.
for task_index in range(num_tasks):
display(vdomh.h3("Task %d" % (task_index)))
compute_dict = True
if perf_cache_dir:
# Import if it exists
perf_dict_path = os.path.join(perf_cache_dir, "singletask_task%d_allfolds_finetuned.h5" % task_index)
if os.path.exists(perf_dict_path) and os.stat(perf_dict_path).st_size:
singletask_perf_dict = load_perf_dict(perf_dict_path)
compute_dict = False
if compute_dict:
# Get bounds for each fold
singletask_bounds_perf_dict = extract_performance_bounds(
perf_bounds_path, input_length,
coord_sets=[
filter_coords_by_chroms(task_coords[task_index], chroms)
for chroms in all_fold_test_chroms
],
task_inds=([task_index] * len(all_fold_test_chroms))
)
singletask_perf_dict = {key : [] for key in singletask_bounds_perf_dict}
# 10 folds
for fold_index, pred_path in enumerate(singletask_preds_paths[task_index]):
perf_dict = extract_performance_metrics(
pred_path,
coord_sets=[filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])]
)
# No need to specify specific task indices, because single-task model
# predictions are saved only for that one task
for key in singletask_perf_dict.keys():
vals = perf_dict[key][0]
if not key.endswith("pearson") and not key.endswith("spearman"):
lower = singletask_bounds_perf_dict[key][fold_index][0]
upper = singletask_bounds_perf_dict[key][fold_index][1]
vals = min_max_normalize(lower, vals, upper)
singletask_perf_dict[key].append(vals)
# Fine-tuned
perf_dict = extract_performance_metrics(
singletask_finetuned_preds_paths[task_index],
coord_sets=[filter_coords_by_chroms(task_coords[task_index], best_singletask_fold_test_chroms[task_index])]
)
# No need to specify specific task indices, because single-task model
# predictions are saved only for that one task
for key in singletask_perf_dict.keys():
vals = perf_dict[key][0]
if not key.endswith("pearson") and not key.endswith("spearman"):
# Fold index is best_singletask_folds[task_index] - 1
lower = singletask_bounds_perf_dict[key][best_singletask_folds[task_index] - 1][0]
upper = singletask_bounds_perf_dict[key][best_singletask_folds[task_index] - 1][1]
vals = min_max_normalize(lower, vals, upper)
singletask_perf_dict[key].append(vals)
if perf_cache_dir:
save_perf_dict(singletask_perf_dict, perf_dict_path)
# Plot the CDFs and bar plots of the profile/counts metrics
cond_labels = [("Fold %d" % i) for i in range(1, 11)]
cond_labels += ["Fine-tuned\n(fold %d)" % best_singletask_folds[task_index]]
metric_figs = plot_performances(
singletask_perf_dict, "%s single-task models (task %d)" % (tf_name, task_index),
cond_labels=cond_labels
)
# Import the total counts values and predictions
singletask_counts = []
for fold_index, pred_path in enumerate(singletask_preds_paths[task_index]):
singletask_counts.append(extract_counts(
pred_path,
coord_sets=[filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])]
))
singletask_counts.append(extract_counts(
singletask_finetuned_preds_paths[task_index],
coord_sets=[filter_coords_by_chroms(task_coords[task_index], best_singletask_fold_test_chroms[task_index])]
))
    # No need to specify task indices, because single-task model predictions
    # are saved only for that one task
# Show counts scatterplots for each condition
scatter_fig = plot_count_scatterplots(singletask_counts, cond_labels=cond_labels)
if perf_cache_dir:
for key in metric_figs:
metric_figs[key].savefig(os.path.join(perf_cache_dir, "singletask_task%d_allfolds_finetuned_%s.png" % (task_index, key)))
scatter_fig.savefig(os.path.join(perf_cache_dir, "singletask_task%d_allfolds_finetuned_scattercounts.png" % task_index))
A comparison of the test-set performance between the individual tasks of the multi-task profile model fine-tuned on the best-performing fold.
compute_dict = True
if perf_cache_dir:
# Import if it exists
perf_dict_path = os.path.join(perf_cache_dir, "multitask_finetuned_tasks.h5")
if os.path.exists(perf_dict_path) and os.stat(perf_dict_path).st_size:
finetune_multitask_perf_dict = load_perf_dict(perf_dict_path)
compute_dict = False
if compute_dict:
# Get bounds for each task (best multi-task fold only)
finetune_multitask_bounds_perf_dict = extract_performance_bounds(
perf_bounds_path, input_length,
coord_sets=[
filter_coords_by_chroms(coords, best_multitask_fold_test_chroms)
for coords in task_coords
],
task_inds=(list(range(num_tasks)))
)
finetune_multitask_perf_dict = {key : [] for key in finetune_multitask_bounds_perf_dict}
perf_dict = extract_performance_metrics(
multitask_finetuned_preds_path,
coord_sets=[
filter_coords_by_chroms(coords, best_multitask_fold_test_chroms)
for coords in task_coords
],
task_inds=(list(range(num_tasks)))
)
for task_index in range(num_tasks):
for key in finetune_multitask_perf_dict.keys():
vals = perf_dict[key][task_index]
if not key.endswith("pearson") and not key.endswith("spearman"):
                    # The bounds dictionary has one entry per task, computed on
                    # the best multi-task fold's test chromosomes
lower = finetune_multitask_bounds_perf_dict[key][task_index][0]
upper = finetune_multitask_bounds_perf_dict[key][task_index][1]
vals = min_max_normalize(lower, vals, upper)
finetune_multitask_perf_dict[key].append(vals)
if perf_cache_dir:
save_perf_dict(finetune_multitask_perf_dict, perf_dict_path)
# Plot the CDFs and bar plots of the profile/counts metrics
cond_labels = [("Task %d" % i) for i in range(num_tasks)]
metric_figs = plot_performances(
finetune_multitask_perf_dict, "%s fine-tuned multi-task model" % tf_name, cond_labels=cond_labels
)
# Import the total counts values and predictions
finetune_multitask_counts = []
for task_index in range(num_tasks):
finetune_multitask_counts.append(extract_counts(
multitask_finetuned_preds_path,
coord_sets=[filter_coords_by_chroms(task_coords[task_index], best_multitask_fold_test_chroms)],
task_inds=[task_index]
))
# Show counts scatterplots for each condition
scatter_fig = plot_count_scatterplots(finetune_multitask_counts, cond_labels=cond_labels)
if perf_cache_dir:
for key in metric_figs:
metric_figs[key].savefig(os.path.join(perf_cache_dir, "multitask_finetuned_tasks_%s.png" % key))