In [1]:
import h5py
import numpy as np
import scipy.stats
import scipy.special
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import pandas as pd
import json
import os
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
tqdm.tqdm_notebook()
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:13: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  del sys.path[0]
Out[1]:
0it [00:00, ?it/s]
In [2]:
# Plotting defaults
font_manager.fontManager.ttflist.extend(
    font_manager.createFontList(
        font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts")
    )
)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 15,
    "font.size": 13,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/amtseng/miniconda3/envs/tfmodisco-mini/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: 
The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead.
  after removing the cwd from sys.path.

Define constants and paths

In [3]:
# Define parameters/fetch arguments
tf_name = os.environ["TFM_RESULTS_TF_NAME"]
num_tasks = int(os.environ["TFM_RESULTS_NUM_TASKS"])
best_multitask_fold = int(os.environ["TFM_RESULTS_BEST_MULTITASK_FOLD"])
best_singletask_folds = [int(x) for x in os.environ["TFM_RESULTS_BEST_SINGLETASK_FOLDS"].split(",")]

if "TFM_PERF_CACHE" in os.environ:
    perf_cache_dir = os.environ["TFM_PERF_CACHE"]
else:
    perf_cache_dir = None

print("TF name: %s" % tf_name)
print("Number of tasks: %d" % num_tasks)
print("Best multi-task fold: %d" % best_multitask_fold)
print("Best single-task folds: %s" % " ".join([str(x) for x in best_singletask_folds]))
print("Saved performance cache: %s" % perf_cache_dir)
TF name: FOXA2
Number of tasks: 4
Best multi-task fold: 7
Best single-task folds: 7 7 5 9
Saved performance cache: /users/amtseng/tfmodisco/results/reports/model_performance/cache/FOXA2
In [4]:
preds_base = "/users/amtseng/tfmodisco/results/peak_predictions/"

# Paths for all 10 folds for multi-task profile models
multitask_preds_paths = [
    os.path.join(
        preds_base,
        "multitask_profile",
        "{0}_multitask_profile_fold{1}/{0}_multitask_profile_fold{1}_pred_perf.h5".format(tf_name, fold)
    ) for fold in range(1, 11)
]

# Paths for all 10 folds for single-task profile models, for each task
singletask_preds_paths = [
    [
        os.path.join(
            preds_base,
            "singletask_profile",
            "{0}_singletask_profile_fold{1}/task_{2}/{0}_singletask_profile_task{2}_fold{1}_pred_perf.h5".format(
                tf_name, fold, task_index
            )
        ) for fold in range(1, 11)
    ] for task_index in range(num_tasks)
]

# Path for fine-tuned multi-task profile model
multitask_finetuned_preds_path = os.path.join(
    preds_base,
    "multitask_profile_finetune",
    "{0}_multitask_profile_finetune_fold{1}/{0}_multitask_profile_finetune_fold{1}_pred_perf.h5".format(
        tf_name, best_multitask_fold
    )
)

# Paths for fine-tuned single-task profile models, for each task
singletask_finetuned_preds_paths = [
    os.path.join(
        preds_base,
        "singletask_profile_finetune",
        "{0}_singletask_profile_finetune_fold{1}/task_{2}/{0}_singletask_profile_finetune_task{2}_fold{1}_pred_perf.h5".format(
            tf_name, fold, task_index
        )
    ) for task_index, fold in enumerate(best_singletask_folds)
]

# Path for upper-bound and lower-bound performance metrics
perf_bounds_path = "/users/amtseng/tfmodisco/results/performance_bounds/{0}_performance_bounds.h5".format(tf_name)

# File specifications (including peak files)
files_spec_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/config/{0}/{0}_training_paths.json".format(tf_name)
with open(files_spec_json, "r") as f:
    files_spec = json.load(f)

# Chromosome split definition (i.e. test set chromosomes)
chrom_splits_json = "/users/amtseng/tfmodisco/data/processed/ENCODE/chrom_splits.json"
with open(chrom_splits_json, "r") as f:
    chrom_splits = json.load(f)
all_fold_test_chroms = [
    chrom_splits[str(fold)]["test"] for fold in range(1, 11)
]
best_multitask_fold_test_chroms = chrom_splits[str(best_multitask_fold)]["test"]
best_singletask_fold_test_chroms = [chrom_splits[str(fold)]["test"] for fold in best_singletask_folds]
In [5]:
input_length = 2114
profile_length = 1000
In [6]:
if perf_cache_dir:
    os.makedirs(perf_cache_dir, exist_ok=True)

Helper functions

For subsetting predictions/performance metrics to peak subsets, extracting performance metrics, and plotting

In [7]:
def subset_coord_inds(superset_coords, subset_coords):
    """
    Both `superset_coords` and `subset_coords` are Pandas DataFrames of
    coordinates. This will return the indices within `superset_coords`
    (indices being the 0-indexed row numbers) that correspond to the
    coordinates in `subset_coords`. Returns a sorted NumPy array of indices.
    Note that if there are duplicates in either set of coordinates, they
    will be dropped (i.e. the returned indices will be unique).
    """
    inds = superset_coords.reset_index().drop_duplicates(["chrom", "start", "end"]).merge(
        subset_coords.reset_index(), on=["chrom", "start", "end"]
    ).sort_values("index_y")["index_x"].values
    return np.unique(inds)
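
As a small sanity check on made-up coordinates, the merge-based lookup returns the sorted, unique row numbers of the superset that appear in the subset:

# Toy illustration of subset_coord_inds (made-up coordinates)
demo_superset = pd.DataFrame({
    "chrom": ["chr1", "chr1", "chr2", "chr2"],
    "start": [0, 100, 0, 100],
    "end": [10, 110, 10, 110]
})
demo_subset = pd.DataFrame({
    "chrom": ["chr2", "chr1"],
    "start": [100, 100],
    "end": [110, 110]
})
print(subset_coord_inds(demo_superset, demo_subset))  # [1 3]
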
In [8]:
def filter_coords_by_chroms(coords, chroms):
    """
    Given a Pandas DataFrame of coordinates (column names "chrom", "start",
    and "end"), filters for the chromosomes in `chroms`
    """
    return coords[coords["chrom"].isin(chroms)]
In [9]:
def extract_performance_metrics(pred_perf_path, coord_sets=None, task_inds=None, pool=False):
    """
    Extracts the set of performance metrics from a saved predictions/performance
    HDF5 file. If specified, filters for coordinates that are in `coord_sets`.
    `coord_sets` is a list of coordinate DataFrames, and a set of metrics will
    be fetched for each table of coordinates provided. Otherwise, will simply
    return all coordinates available (i.e. only one coordinate set with all
    coordinates). If `task_inds` is specified, it must be a list of indices
    parallel to `coord_sets`. For each coordinate set, the metrics extracted
    will be for that task index only. If unspecified, the average over all tasks
    is retained for each coordinate set.
    If `pool` is specified, then all resulting vectors and scalars will be pooled
    together to form a single vector (or scalar).
    Returns a dictionary of the following form:
        `nll`: [
            <NLL vector for coord set 1>
            <NLL vector for coord set 2>
            ...
        ],
        `count_mse`: [
            <MSE scalar for coord set 1>
            <MSE scalar for coord set 2>
            ...
        ]
        ...
    """
    result = {}
    
    reader = h5py.File(pred_perf_path, "r")
    coord_reader = reader["coords"]
    pred_reader = reader["predictions"]
    perf_reader = reader["performance"]
    
    # First, get the set of indices within the HDF5 predictions/performance that
    # correspond to the given coordinate sets
    if coord_sets is None:
        subset_inds = [np.arange(perf_reader["nll"].shape[0])]  # The entire vector
    else:
        # Import the DataFrame of coordinates in this HDF5
        pred_perf_coords = pd.DataFrame(
            data={
                "chrom": coord_reader["coords_chrom"][:].astype(str),
                "start": coord_reader["coords_start"][:],
                "end": coord_reader["coords_end"][:]
            }
        )
        subset_inds = [
            subset_coord_inds(pred_perf_coords, coord_set)
            for coord_set in coord_sets
        ]
    
    # If we didn't specify a task index for each coordinate set, just use
    # all tasks; either way, let's get each set of task indices into a
    # NumPy array form
    if task_inds is None:
        task_inds = [np.arange(perf_reader["nll"].shape[1]) for _ in range(len(subset_inds))]
    else:
        task_inds = [np.array([i]) for i in task_inds]
        
    # For each performance metric, for each coordinate set/task index, extract
    # the metrics values
    for key in perf_reader.keys():
        if len(perf_reader[key].shape) < 2:
            # Skip count metrics here; they are recomputed from the raw counts below
            continue
        metrics_list = []
        for i in range(len(subset_inds)):
            subset = subset_inds[i]
            tasks = task_inds[i]
            metrics_list.append(
                np.mean(perf_reader[key][subset][:, tasks], axis=1)
            )
        result[key] = metrics_list

    # Now extract the count values for recomputing the count metrics
    log_true_counts, log_pred_counts = [], []
    if coord_sets is None:
        # Technically, there should be no need to recompute the count metrics,
        # since these were saved for the entire set, but for simplicity of code
        # we recompute them anyway, since they are cheap operations
        log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][:] + 1)))
        log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][:]))
    else:
        for i in range(len(subset_inds)):
            subset = subset_inds[i]
            tasks = task_inds[i]
            log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][subset][:, tasks] + 1)))
            log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][subset][:, tasks]))
        
    reader.close()
        
    # Perform any pooling
    if pool:
        for key in result:
            result[key] = [np.concatenate(result[key])]
        log_true_counts = [np.concatenate(log_true_counts)]
        log_pred_counts = [np.concatenate(log_pred_counts)]
        
    # Now recompute the count metrics
    num_sets = len(log_true_counts)
    result["count_mse"] = [
         np.mean(np.square(t - p)) for t, p in zip(log_true_counts, log_pred_counts)
    ]
    result["count_pearson"] = [
         scipy.stats.pearsonr(t, p)[0] for t, p in zip(log_true_counts, log_pred_counts)
    ]
    result["count_spearman"] = [
         scipy.stats.spearmanr(t, p)[0] for t, p in zip(log_true_counts, log_pred_counts)
    ]
    
    # Convert NaNs to 0
    for key in result:
        result[key] = [np.nan_to_num(a) for a in result[key]]
    
    return result
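
To make the expected file layout concrete, here is a toy predictions/performance HDF5 built in a temporary directory. The group and dataset names simply mirror the ones read above; the values are random and only a couple of the profile metrics are included, so this is purely illustrative:

import tempfile

# Toy predictions/performance HDF5 whose layout mirrors the datasets read
# above; the values are random and purely illustrative
num_toy_peaks, num_toy_tasks = 5, 2
toy_pred_perf_path = os.path.join(tempfile.mkdtemp(), "toy_pred_perf.h5")
rng = np.random.RandomState(0)
with h5py.File(toy_pred_perf_path, "w") as f:
    coord_grp = f.create_group("coords")
    coord_grp.create_dataset("coords_chrom", data=np.array([b"chr1"] * num_toy_peaks))
    coord_grp.create_dataset("coords_start", data=np.arange(num_toy_peaks) * 10000)
    coord_grp.create_dataset("coords_end", data=np.arange(num_toy_peaks) * 10000 + input_length)
    pred_grp = f.create_group("predictions")
    pred_grp.create_dataset("true_counts", data=rng.rand(num_toy_peaks, num_toy_tasks) * 100)
    pred_grp.create_dataset("log_pred_counts", data=rng.rand(num_toy_peaks, num_toy_tasks) * 5)
    perf_grp = f.create_group("performance")
    perf_grp.create_dataset("nll", data=rng.rand(num_toy_peaks, num_toy_tasks))
    perf_grp.create_dataset("jsd", data=rng.rand(num_toy_peaks, num_toy_tasks))

toy_metrics = extract_performance_metrics(toy_pred_perf_path, pool=True)
print({key: [np.shape(v) for v in vals] for key, vals in toy_metrics.items()})
# e.g. {'jsd': [(5,)], 'nll': [(5,)], 'count_mse': [()], 'count_pearson': [()], 'count_spearman': [()]}
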
In [10]:
def extract_performance_bounds(perf_bounds_path, input_length, coord_sets=None, task_inds=None, pool=False):
    """
    Extracts the set of lower and upper bound performance metrics from a saved
    HDF5 file. `input_length` is the length of the input sequences, used to
    re-pad the saved coordinates.
    If specified, filters for coordinates that are in `coord_sets`.
    `coord_sets` is a list of coordinate DataFrames, and a set of metrics will
    be fetched for each table of coordinates provided. Otherwise, will simply
    return all coordinates available (i.e. only one coordinate set with all
    coordinates). If `task_inds` is specified, it must be a list of indices
    parallel to `coord_sets`. For each coordinate set, the metrics extracted
    will be for that task index only. If unspecified, the average over all tasks
    is retained for each coordinate set.
    If `pool` is specified, then all resulting vectors and scalars will be pooled
    together to form a single vector (or scalar).
    Note that count-metric bounds are extracted as scalars (averaged over the
    given tasks), rather than per-peak vectors.
    Returns a dictionary of the following form:
        `nll`: [
            (
                <lower-bound NLL vector for coord set 1>,
                <upper-bound NLL vector for coord set 1>
            ),
            (
                <lower-bound NLL vector for coord set 2>,
                <upper-bound NLL vector for coord set 2>
            ),
            
            ...
        ],
        `count_mse`: [
            (
                <lower-bound MSE scalar for coord set 1>,
                <upper-bound MSE scalar for coord set 1>
            ),
            (
                <lower-bound MSE scalar for coord set 2>,
                <upper-bound MSE scalar for coord set 2>
            ),
            ...
        ]
        ...
    """
    result = {}
    
    reader = h5py.File(perf_bounds_path, "r")
    coord_reader = reader["coords"]
    lower_perf_reader = reader["performance_lower"]
    upper_perf_reader = reader["performance_upper"]
    
    # First, get the set of indices within the HDF5 predictions/performance that
    # correspond to the given coordinate sets
    if coord_sets is None:
        subset_inds = [np.arange(lower_perf_reader["nll"].shape[0])]  # The entire vector
    else:
        # Import the DataFrame of coordinates in this HDF5
        perf_coords = pd.DataFrame(
            data={
                "chrom": coord_reader["coords_chrom"][:].astype(str),
                "start": coord_reader["coords_start"][:],
                "end": coord_reader["coords_end"][:]
            }
        )
        
        # Unlike the predictions, the performance bounds are computed solely
        # based on profiles, so their saved coordinates have a different
        # length, although they are centered at the same summit. So we need
        # to re-pad them.
        perf_coords["midpoint"] = (perf_coords["start"] + perf_coords["end"]) // 2
        perf_coords["start"] = perf_coords["midpoint"] - (input_length // 2)
        perf_coords["end"] = perf_coords["start"] + input_length
        del perf_coords["midpoint"]
        
        subset_inds = [
            subset_coord_inds(perf_coords, coord_set)
            for coord_set in coord_sets
        ]
    
    # If we didn't specify a task index for each coordinate set, just use
    # all tasks; either way, let's get each set of task indices into a
    # NumPy array form
    if task_inds is None:
        task_inds = [np.arange(lower_perf_reader["nll"].shape[1]) for _ in range(len(subset_inds))]
    else:
        task_inds = [np.array([i]) for i in task_inds]
        
    # For each performance metric, for each coordinate set/task index, extract
    # the metrics values for lower and upper bound
    for key in lower_perf_reader.keys():
        metrics_list = []
        for i in range(len(subset_inds)):
            subset = subset_inds[i]
            tasks = task_inds[i]
            
            if len(lower_perf_reader[key].shape) >= 2:  # Profile metric
                metrics_list.append(
                    (
                        np.mean(lower_perf_reader[key][subset][:, tasks], axis=1),
                        np.mean(upper_perf_reader[key][subset][:, tasks], axis=1)
                    )
                )
            else:  # Count metric
                # For the performance bounds, we'll use counts metrics as-is without
                # recomputing them; this is because the counts metrics bounds are
                # distributed very uniformly
                metrics_list.append(
                    (
                        np.mean(lower_perf_reader[key][tasks]),
                        np.mean(upper_perf_reader[key][tasks]),
                    )
                )

        result[key] = metrics_list

    reader.close()
    
    # Perform any pooling
    if pool:
        for key in result:
            if type(result[key][0][0]) is np.ndarray:
                lower = np.concatenate([pair[0] for pair in result[key]])
                upper = np.concatenate([pair[1] for pair in result[key]])
            else:
                lower = np.mean([pair[0] for pair in result[key]])
                upper = np.mean([pair[1] for pair in result[key]])
            result[key] = [(lower, upper)]
    
    # Convert NaNs to 0
    for key in result:
        result[key] = [(np.nan_to_num(p1), np.nan_to_num(p2)) for p1, p2 in result[key]]
    
    return result
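
To make the re-padding step concrete, here is a small worked example with a made-up bounds coordinate (a profile-length window of 1000 bp, re-centered to the 2114 bp `input_length`):

# Worked example of the coordinate re-padding above: a saved profile-length
# window (made-up numbers) is re-centered to `input_length` around its midpoint
toy_bounds_coords = pd.DataFrame({"chrom": ["chr1"], "start": [4500], "end": [5500]})
toy_midpoint = (toy_bounds_coords["start"] + toy_bounds_coords["end"]) // 2  # 5000
toy_start = toy_midpoint - (input_length // 2)  # 5000 - 1057 = 3943
toy_end = toy_start + input_length  # 3943 + 2114 = 6057
print(toy_start.values, toy_end.values)
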
In [11]:
def extract_counts(pred_perf_path, coord_sets=None, task_inds=None):
    """
    Extracts the true and predicted log counts from a saved
    predictions/performance HDF5 file. Returns a pair of single-element lists,
    each containing one flattened array of the true or predicted log counts,
    pooled over all specified coordinate sets and tasks. If specified, filters
    for coordinates that are in `coord_sets`. `coord_sets` is a list of
    coordinate DataFrames, and a set of counts will be fetched for each table
    of coordinates provided. Otherwise, will simply return all coordinates
    available. If `task_inds` is specified, it must be a list of indices
    parallel to `coord_sets`. For each coordinate set, the counts extracted
    will be for that task index only. If unspecified, all tasks are retained
    for each coordinate set.
    """
    
    reader = h5py.File(pred_perf_path, "r")
    coord_reader = reader["coords"]
    pred_reader = reader["predictions"]
    perf_reader = reader["performance"]
    
    # First, get the set of indices within the HDF5 predictions/performance that
    # correspond to the given coordinate sets
    if coord_sets is None:
        subset_inds = [np.arange(perf_reader["nll"].shape[0])]  # The entire vector
    else:
        # Import the DataFrame of coordinates in this HDF5
        pred_perf_coords = pd.DataFrame(
            data={
                "chrom": coord_reader["coords_chrom"][:].astype(str),
                "start": coord_reader["coords_start"][:],
                "end": coord_reader["coords_end"][:]
            }
        )
        subset_inds = [
            subset_coord_inds(pred_perf_coords, coord_set)
            for coord_set in coord_sets
        ]
    
    # If we didn't specify a task index for each coordinate set, just use
    # all tasks; either way, let's get each set of task indices into a
    # NumPy array form
    if task_inds is None:
        task_inds = [np.arange(perf_reader["nll"].shape[1]) for _ in range(len(subset_inds))]
    else:
        task_inds = [np.array([i]) for i in task_inds]
        
    # Extract the count values
    log_true_counts, log_pred_counts = [], []
    if coord_sets is None:
        log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][:] + 1)))
        log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][:]))
    else:
        for i in range(len(subset_inds)):
            subset = subset_inds[i]
            tasks = task_inds[i]
            log_true_counts.append(np.ravel(np.log(pred_reader["true_counts"][subset][:, tasks] + 1)))
            log_pred_counts.append(np.ravel(pred_reader["log_pred_counts"][subset][:, tasks]))
        
    reader.close()
        
    log_true_counts = [np.concatenate(log_true_counts)]
    log_pred_counts = [np.concatenate(log_pred_counts)]
    return log_true_counts, log_pred_counts
In [12]:
def min_max_normalize(lower_arr, query_arr, upper_arr):
    """
    Given three same-size arrays, returns a version of the query array that is
    min-max normalized to the lower and upper bounds, so that the lower bound
    maps to 0 and the upper bound maps to 1. This is done independently for
    each entry, and the result is clipped to the range [0, 1].
    """
    normed = (query_arr - lower_arr) / (upper_arr - lower_arr)
    # Clamp between 0 and 1
    return np.clip(normed, 0, 1)
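
A small worked example with made-up bounds: a value of 2 between bounds of 1 and 5 maps to 0.25, and out-of-range values are clipped:

# Made-up bounds and query values to illustrate the normalization and clipping
demo_lower = np.array([1.0, 1.0, 1.0])
demo_query = np.array([2.0, 0.5, 6.0])
demo_upper = np.array([5.0, 5.0, 5.0])
print(min_max_normalize(demo_lower, demo_query, demo_upper))  # [0.25 0.   1.  ]
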
In [13]:
def make_cdf(ax, data, steps=1000, density=False, inverse=False, **kwargs):
    """
    Plots a CDF to the given axes. `steps` is the number of steps in the
    CDF. If `inverse` is True, plots an inverse CDF (AKA survivorship plot).
    `density` is whether or not to normalize to fractions.
    """
    hist, bin_edges = np.histogram(data, bins=steps)
    if inverse:
        cumsum = len(data) - np.cumsum(hist)
    else:
        cumsum = np.cumsum(hist)
    if density:
        cumsum = cumsum / len(data)
    bin_centers = (bin_edges[:-1] + bin_edges[1:]) / 2.
    ax.step(bin_centers, cumsum, **kwargs)
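
A quick demonstration on made-up data, overlaying the CDF and inverse CDF of a standard normal sample:

# Demonstration of make_cdf on made-up data: a normalized CDF and its inverse
demo_data = np.random.RandomState(0).randn(10000)
fig, ax = plt.subplots(figsize=(8, 4))
make_cdf(ax, demo_data, steps=200, density=True, label="CDF")
make_cdf(ax, demo_data, steps=200, density=True, inverse=True, label="Inverse CDF")
ax.legend()
plt.show()
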
In [14]:
def plot_performances(perf_dict, title_header, cond_labels=None):
    """
    Creates plots for a performance dictionary of the following form:
        `nll`: [
            <NLL vector for cond 1>
            <NLL vector for cond 2>
            ...
        ],
        `count_mse`: [
            <MSE scalar for cond 1>
            <MSE scalar for cond 2>
            ...
        ]
        ...
    For profile metrics (i.e. where the metrics are a vector), creates CDF
    plots. For count metrics (i.e. where the metrics are a scalar), creates bar
    plots.
    `cond_labels` must be a list of labels parallel to the list of vectors or
    scalars for each metric.
    Returns a dictionary mapping metric keys to figures.
    """
    figs = {}
    # One label per condition; default to no labels
    labels = cond_labels if cond_labels else [None] * len(next(iter(perf_dict.values())))
    # Profile metrics
    for metric_key, metric_name in [
        ("nll", "NLL"), ("cross_ent", "Cross Entropy"), ("jsd", "JSD"), ("profile_mse", "Profile MSE"),
        ("profile_pearson", "Profile Pearson"), ("profile_spearman", "Profile Spearman")
    ]:
        fig, ax = plt.subplots(figsize=(20, 8))
        if metric_key.endswith("pearson") or metric_key.endswith("spearman"):
            inverse = True
            title = "%s: inverse CDF of %s" % (title_header, metric_name)
        else:
            inverse = False
            title = "%s: CDF of %s" % (title_header, metric_name)
        for i, arr in enumerate(perf_dict[metric_key]):
            # Remove any NaNs
            arr = arr[np.isfinite(arr)]
            # If the data is out of the range [0, 1] (e.g. with correlations), clip them
            arr = np.clip(arr, 0, 1)
            make_cdf(ax, arr, steps=1000, density=True, inverse=inverse, label=labels[i])
        if cond_labels:
            plt.legend()
        ax.set_title(title)
        plt.show()
        figs[metric_key] = fig

    # Count metrics
    for metric_key, metric_name in [
        ("count_mse", "Count MSE"), ("count_pearson", "Count Pearson"), ("count_spearman", "Count Spearman")
    ]:
        fig, ax = plt.subplots(figsize=(20, 5))
        label_locs = np.arange(len(perf_dict[metric_key]))  # Location of labels
        bars = ax.bar(
            label_locs, perf_dict[metric_key], color="mediumorchid", alpha=0.7
        )
        ax.set_title("%s: %s" % (title_header, metric_name))
        if cond_labels:
            ax.set_xticks(label_locs)
            ax.set_xticklabels(labels)
            
        for bar in bars:
            height = bar.get_height()
            ax.annotate(
                "%.3f" % height, xy=(bar.get_x() + (bar.get_width() / 2), height),
                xytext=(0, 1), textcoords="offset points", ha="center", va="bottom"
            )
        plt.show()
        figs[metric_key] = fig
    return figs
In [15]:
def plot_count_scatterplots(counts, cond_labels=None):
    """
    Creates a row of scatterplots for the counts of the following form:
        [
            (
                <array of true log counts>
                <array of predicted log counts>
            ),
            ...
        ]
    `cond_labels` must be a list of labels parallel to the list of count pairs.
    Returns the figure.
    """
    fig, ax = plt.subplots(ncols=len(counts), figsize=(8 * len(counts), 8))
    if len(counts) == 1:
        ax = [ax]
    for i, (true_log_counts, pred_log_counts) in enumerate(counts):
        ax[i].scatter(np.ravel(true_log_counts), np.ravel(pred_log_counts), alpha=0.02)
        ax[i].set_xlabel("True log counts")
        ax[i].set_ylabel("Predicted log counts")
        (min_x, max_x), (min_y, max_y) = ax[i].get_xlim(), ax[i].get_ylim()
        min_both, max_both = min(min_x, min_y), max(max_x, max_y)
        ax[i].set_xlim(min_both, max_both)
        ax[i].set_ylim(min_both, max_both)
        ax[i].plot(
            [min_both, max_both], [min_both, max_both],
            color="black", linestyle="--", alpha=0.3, zorder=0
        )
        if cond_labels:
            ax[i].set_title(cond_labels[i])
    plt.show()
    return fig
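
The expected input form can be demonstrated with made-up log counts for two hypothetical conditions:

# Demonstration of plot_count_scatterplots on made-up log counts for two
# hypothetical conditions (one noisier than the other)
rng = np.random.RandomState(0)
demo_true = rng.randn(500) + 8
demo_counts = [
    (demo_true, demo_true + (0.3 * rng.randn(500))),
    (demo_true, demo_true + (1.5 * rng.randn(500)))
]
_ = plot_count_scatterplots(demo_counts, cond_labels=["Low noise", "High noise"])
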
In [16]:
def save_perf_dict(perf_dict, path):
    """
    Saves the performance dictionary mapping keys to lists of arrays
    or lists of scalars.
    """
    with h5py.File(path, "w") as f:
        for key in perf_dict:
            group = f.create_group(key)
            for i, data in enumerate(perf_dict[key]):
                if type(data) is np.ndarray:
                    group.create_dataset(str(i), data=data, compression="gzip")
                else:
                    group.create_dataset(str(i), data=data)
In [17]:
def load_perf_dict(path):
    """
    Loads a performance dictionary saved by `save_perf_dict`, mapping keys to
    lists of arrays or lists of scalars.
    """
    perf_dict = {}
    with h5py.File(path, "r") as f:
        for key in f:
            perf_dict[key] = []
            for i in range(len(f[key].keys())):
                if not f[key][str(i)].shape:
                    perf_dict[key].append(f[key][str(i)][()])
                else:
                    perf_dict[key].append(f[key][str(i)][:])
    return perf_dict
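
A quick round-trip check on a throwaway temporary file with made-up values:

import tempfile

# Round-trip a toy dictionary through save_perf_dict/load_perf_dict
toy_perf_dict = {"nll": [np.arange(5.0), np.arange(3.0)], "count_mse": [0.5, 0.7]}
toy_cache_path = os.path.join(tempfile.mkdtemp(), "toy_perf_dict.h5")
save_perf_dict(toy_perf_dict, toy_cache_path)
print(load_perf_dict(toy_cache_path)["nll"][1])  # [0. 1. 2.]
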

Import peak coordinates for each task

For each task, import the set of peaks belonging to that task. This allows us to get a set of indices for coordinates in the saved predictions/performance files that correspond to each task.

In [18]:
# Read in tables containing the peak coordinates, padded to `input_length`
task_coords = []
assert len(files_spec["peak_beds"]) == num_tasks
for peak_bed_path in files_spec["peak_beds"]:
    table = pd.read_csv(
        peak_bed_path, sep="\t", header=None,  # Infer compression
        names=[
            "chrom", "peak_start", "peak_end", "name", "score",
            "strand", "signal", "pval", "qval", "summit_offset"
        ]
    )
    # Add summit location column:
    table["summit"] = table["peak_start"] + table["summit_offset"]
    
    # Add start and end columns, at proper length
    table["start"] = table["summit"] - (input_length // 2)
    table["end"] = table["start"] + input_length
    
    task_coords.append(table[["chrom", "start", "end"]])
    
all_coords = pd.concat(task_coords)

Multi-task profile model performance across all 10 folds

A comparison of the test-set performance (averaged across tasks) between:

  1. Multi-task profile models trained across all 10 folds
  2. A fine-tuned multi-task profile model on the best-performing fold
  3. Upper and lower bounds on the best-performing fold
In [19]:
compute_dict = True
if perf_cache_dir:
    # Import if it exists
    perf_dict_path = os.path.join(perf_cache_dir, "multitask_allfolds_finetuned.h5")
    if os.path.exists(perf_dict_path) and os.stat(perf_dict_path).st_size:
        multitask_perf_dict = load_perf_dict(perf_dict_path)
        compute_dict = False

if compute_dict:
    # Make sure to extract metrics for each task individually, but pool them together
    # in the end to form one set of metrics
    
    # Get bounds for each fold
    multitask_bounds_perf_dict = {}
    for chroms in all_fold_test_chroms:
        d = extract_performance_bounds(
            perf_bounds_path, input_length,
            coord_sets=[
                filter_coords_by_chroms(task_coords[task_index], chroms)
                for task_index in range(num_tasks)
            ],
            task_inds=list(range(num_tasks)),
            pool=True
        )
        for key in d:
            try:
                multitask_bounds_perf_dict[key].extend(d[key])
            except KeyError:
                multitask_bounds_perf_dict[key] = d[key]

    multitask_perf_dict = {key : [] for key in multitask_bounds_perf_dict}
    # 10 folds
    for fold_index, pred_path in enumerate(multitask_preds_paths):
        perf_dict = extract_performance_metrics(
            pred_path,
            coord_sets=[
                filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])
                for task_index in range(num_tasks)
            ],
            task_inds=list(range(num_tasks)),
            pool=True
        )
        for key in multitask_perf_dict.keys():
            vals = perf_dict[key][0]
            if not key.endswith("pearson") and not key.endswith("spearman"):
                lower = multitask_bounds_perf_dict[key][fold_index][0]
                upper = multitask_bounds_perf_dict[key][fold_index][1]
                vals = min_max_normalize(lower, vals, upper)
            multitask_perf_dict[key].append(vals)

    # Fine-tuned
    perf_dict = extract_performance_metrics(
        multitask_finetuned_preds_path,
        coord_sets=[
            filter_coords_by_chroms(task_coords[task_index], best_multitask_fold_test_chroms)
            for task_index in range(num_tasks)
        ],
        task_inds=list(range(num_tasks)),
        pool=True
    )
    for key in multitask_perf_dict.keys():
        vals = perf_dict[key][0]
        if not key.endswith("pearson") and not key.endswith("spearman"):
            # Fold index is best_multitask_fold - 1
            lower = multitask_bounds_perf_dict[key][best_multitask_fold - 1][0]
            upper = multitask_bounds_perf_dict[key][best_multitask_fold - 1][1]
            vals = min_max_normalize(lower, vals, upper)
        multitask_perf_dict[key].append(vals)
    
    if perf_cache_dir:
        save_perf_dict(multitask_perf_dict, perf_dict_path)
    
# Plot the CDFs and bar plots of the profile/counts metrics
cond_labels = [("Fold %d" % i) for i in range(1, 11)]
cond_labels += ["Fine-tuned\n(fold %d)" % best_multitask_fold]
metric_figs = plot_performances(
    multitask_perf_dict, "%s multi-task models" % tf_name, cond_labels=cond_labels
)

# Import the total counts values and predictions
multitask_counts = []
for fold_index, pred_path in enumerate(multitask_preds_paths):
    multitask_counts.append(extract_counts(
        pred_path,
        coord_sets=[
            filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])
            for task_index in range(num_tasks)
        ],
        task_inds=list(range(num_tasks))
    ))
multitask_counts.append(extract_counts(
    multitask_finetuned_preds_path,
    coord_sets=[
        filter_coords_by_chroms(task_coords[task_index], best_multitask_fold_test_chroms)
        for task_index in range(num_tasks)
    ],
    task_inds=list(range(num_tasks)),
))

# Show counts scatterplots for each condition
scatter_fig = plot_count_scatterplots(multitask_counts, cond_labels=cond_labels)

if perf_cache_dir:
    for key in metric_figs:
        metric_figs[key].savefig(os.path.join(perf_cache_dir, "multitask_allfolds_finetuned_%s.png" % key))
    scatter_fig.savefig(os.path.join(perf_cache_dir, "multitask_allfolds_finetuned_scattercounts.png"))

Single-task profile model performance across all 10 folds

For each task, a comparison of the test-set performance between:

  1. Single-task profile models trained across all 10 folds
  2. A fine-tuned single-task profile model on the best-performing fold
  3. Upper and lower bounds on the best-performing fold
In [20]:
for task_index in range(num_tasks):
    display(vdomh.h3("Task %d" % (task_index)))
    compute_dict = True
    if perf_cache_dir:
        # Import if it exists
        perf_dict_path = os.path.join(perf_cache_dir, "singletask_task%d_allfolds_finetuned.h5" % task_index)
        if os.path.exists(perf_dict_path) and os.stat(perf_dict_path).st_size:
            singletask_perf_dict = load_perf_dict(perf_dict_path)
            compute_dict = False
            
    if compute_dict:
        # Get bounds for each fold
        singletask_bounds_perf_dict = extract_performance_bounds(
            perf_bounds_path, input_length,
            coord_sets=[
                filter_coords_by_chroms(task_coords[task_index], chroms)
                for chroms in all_fold_test_chroms
            ],
            task_inds=([task_index] * len(all_fold_test_chroms))
        )

        singletask_perf_dict = {key : [] for key in singletask_bounds_perf_dict}

        # 10 folds
        for fold_index, pred_path in enumerate(singletask_preds_paths[task_index]):
            perf_dict = extract_performance_metrics(
                pred_path,
                coord_sets=[filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])]
            )
            # No need to specify specific task indices, because single-task model
            # predictions are saved only for that one task
            for key in singletask_perf_dict.keys():
                vals = perf_dict[key][0]
                if not key.endswith("pearson") and not key.endswith("spearman"):
                    lower = singletask_bounds_perf_dict[key][fold_index][0]
                    upper = singletask_bounds_perf_dict[key][fold_index][1]
                    vals = min_max_normalize(lower, vals, upper)
                singletask_perf_dict[key].append(vals)

        # Fine-tuned
        perf_dict = extract_performance_metrics(
            singletask_finetuned_preds_paths[task_index],
            coord_sets=[filter_coords_by_chroms(task_coords[task_index], best_singletask_fold_test_chroms[task_index])]
        )
        # No need to specify specific task indices, because single-task model
        # predictions are saved only for that one task
        for key in singletask_perf_dict.keys():
            vals = perf_dict[key][0]
            if not key.endswith("pearson") and not key.endswith("spearman"):
                # Fold index is best_singletask_folds[task_index] - 1
                lower = singletask_bounds_perf_dict[key][best_singletask_folds[task_index] - 1][0]
                upper = singletask_bounds_perf_dict[key][best_singletask_folds[task_index] - 1][1]
                vals = min_max_normalize(lower, vals, upper)
            singletask_perf_dict[key].append(vals)
            
        if perf_cache_dir:
            save_perf_dict(singletask_perf_dict, perf_dict_path)
            
    # Plot the CDFs and bar plots of the profile/counts metrics
    cond_labels = [("Fold %d" % i) for i in range(1, 11)]
    cond_labels += ["Fine-tuned\n(fold %d)" % best_singletask_folds[task_index]]
    metric_figs = plot_performances(
        singletask_perf_dict, "%s single-task models (task %d)" % (tf_name, task_index),
        cond_labels=cond_labels
    )
    
    # Import the total counts values and predictions
    singletask_counts = []
    for fold_index, pred_path in enumerate(singletask_preds_paths[task_index]):
        singletask_counts.append(extract_counts(
            pred_path,
            coord_sets=[filter_coords_by_chroms(task_coords[task_index], all_fold_test_chroms[fold_index])]
        ))
    singletask_counts.append(extract_counts(
        singletask_finetuned_preds_paths[task_index],
        coord_sets=[filter_coords_by_chroms(task_coords[task_index], best_singletask_fold_test_chroms[task_index])]
    ))
    # No need to specify specific coordinates or task indices, because
    # single-task model predictions are saved only for that one task

    # Show counts scatterplots for each condition
    scatter_fig = plot_count_scatterplots(singletask_counts, cond_labels=cond_labels)
    
    if perf_cache_dir:
        for key in metric_figs:
            metric_figs[key].savefig(os.path.join(perf_cache_dir, "singletask_task%d_allfolds_finetuned_%s.png" % (task_index, key)))
        scatter_fig.savefig(os.path.join(perf_cache_dir, "singletask_task%d_allfolds_finetuned_scattercounts.png" % task_index))

Task 0

Task 1

Task 2

Task 3

Fine-tuned multi-task profile model task-specific performance

A comparison of the test-set performance across tasks of the multi-task profile model fine-tuned on the best-performing fold

In [21]:
compute_dict = True
if perf_cache_dir:
    # Import if it exists
    perf_dict_path = os.path.join(perf_cache_dir, "multitask_finetuned_tasks.h5")
    if os.path.exists(perf_dict_path) and os.stat(perf_dict_path).st_size:
        finetune_multitask_perf_dict = load_perf_dict(perf_dict_path)
        compute_dict = False

if compute_dict:
    # Get bounds for each task (best multi-task fold only)
    finetune_multitask_bounds_perf_dict = extract_performance_bounds(
        perf_bounds_path, input_length,
        coord_sets=[
            filter_coords_by_chroms(coords, best_multitask_fold_test_chroms)
            for coords in task_coords
        ],
        task_inds=(list(range(num_tasks)))
    )
    
    finetune_multitask_perf_dict = {key : [] for key in finetune_multitask_bounds_perf_dict}
    
    perf_dict = extract_performance_metrics(
        multitask_finetuned_preds_path,
        coord_sets=[
            filter_coords_by_chroms(coords, best_multitask_fold_test_chroms)
            for coords in task_coords
        ],
        task_inds=(list(range(num_tasks)))
    )
    for task_index in range(num_tasks):
        for key in finetune_multitask_perf_dict.keys():
            vals = perf_dict[key][task_index]
            if not key.endswith("pearson") and not key.endswith("spearman"):
                # Bounds here are indexed by task (all computed on the best multi-task fold)
                lower = finetune_multitask_bounds_perf_dict[key][task_index][0]
                upper = finetune_multitask_bounds_perf_dict[key][task_index][1]
                vals = min_max_normalize(lower, vals, upper)
            finetune_multitask_perf_dict[key].append(vals)
            
    if perf_cache_dir:
        save_perf_dict(finetune_multitask_perf_dict, perf_dict_path)
        
# Plot the CDFs and bar plots of the profile/counts metrics
cond_labels = [("Task %d" % i) for i in range(num_tasks)]
metric_figs = plot_performances(
    finetune_multitask_perf_dict, "%s fine-tuned multi-task model" % tf_name, cond_labels=cond_labels
)

# Import the total counts values and predictions
finetune_multitask_counts = []
for task_index in range(num_tasks):
    finetune_multitask_counts.append(extract_counts(
        multitask_finetuned_preds_path,
        coord_sets=[filter_coords_by_chroms(task_coords[task_index], best_multitask_fold_test_chroms)],
        task_inds=[task_index]
    ))

# Show counts scatterplots for each condition
scatter_fig = plot_count_scatterplots(finetune_multitask_counts, cond_labels=cond_labels)

if perf_cache_dir:
    for key in metric_figs:
        metric_figs[key].savefig(os.path.join(perf_cache_dir, "multitask_finetuned_tasks_%s.png" % key))