%load_ext autoreload
%autoreload 2
%reset -f
import os
import sys
sys.path.append(os.path.abspath("/users/amtseng/tfmodisco/src/"))
from tfmodisco.run_tfmodisco import import_shap_scores
from motif.read_motifs import trim_motif_by_ic
from motif.moods import import_moods_hits
from motif.tfmodisco_hit_scoring import import_tfmodisco_hits
from util import figure_to_vdom_image, import_peak_table
import plot.viz_sequence as viz_sequence
import h5py
import numpy as np
import pandas as pd
import pomegranate
import sklearn.cluster
import scipy.cluster.hierarchy
import scipy.stats
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm.notebook
# Plotting defaults
for font_file in font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts"):
    font_manager.fontManager.addfont(font_file)
plot_params = {
"figure.titlesize": 22,
"axes.titlesize": 22,
"axes.labelsize": 20,
"legend.fontsize": 18,
"font.size": 13,
"xtick.labelsize": 16,
"ytick.labelsize": 16,
"font.family": "Roboto",
"font.weight": "bold"
}
plt.rcParams.update(plot_params)
# Define parameters/fetch arguments
tf_name = os.environ["TFM_TF_NAME"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
shap_scores_path = os.environ["TFM_SHAP_PATH"]
hyp_score_key = os.environ["TFM_HYP_SCORE_KEY"]
if "TFM_TASK_INDEX" in os.environ:
task_index = int(os.environ["TFM_TASK_INDEX"])
else:
task_index = None
if "TFM_PEAKS" in os.environ:
# If provided, this overrides the peaks defined by the TF name and task index
peak_bed_paths = os.environ["TFM_PEAKS"].split(",")
else:
peak_bed_paths = []
motif_hits_path = os.environ["TFM_HITS_PATH"]
if "TFM_HITS_CACHE" in os.environ:
hits_cache_dir = os.environ["TFM_HITS_CACHE"]
else:
hits_cache_dir = None
print("TF name: %s" % tf_name)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("Importance score key: %s" % hyp_score_key)
print("Task index: %s" % task_index)
print("Peak BED paths: %s" % ",".join(peak_bed_paths))
print("Motif hits path: %s" % motif_hits_path)
print("Saved motif hits cache: %s" % hits_cache_dir)
TF name: REST
TF-MoDISco results path: /users/amtseng/tfmodisco/results/tfmodisco/multitask_profile_finetune/REST_multitask_profile_finetune_fold7/REST_multitask_profile_finetune_task0_fold7_profile_tfm.h5
DeepSHAP scores path: /users/amtseng/tfmodisco/results/importance_scores/multitask_profile_finetune/REST_multitask_profile_finetune_fold7/REST_multitask_profile_finetune_task0_fold7_imp_scores.h5
Importance score key: profile_hyp_scores
Task index: 0
Peak BED paths: 
Motif hits path: /users/amtseng/tfmodisco/results/tfmodisco_hit_scoring/multitask_profile_finetune/REST_multitask_profile_finetune_task0_fold7_profile_halfsites/halfsites_only/tfm_matches.bed
Saved motif hits cache: /users/amtseng/tfmodisco/results/reports/motif_hits//cache/tfm/multitask_profile_finetune//REST_multitask_profile_finetune_fold7_profile_halfsites/halfsites_only
# Constants
input_length = int(os.environ.get("TFM_INPUT_LEN", 2114))
motif_moods_imp_perc_cutoff = 0.10 # For MOODS hits
motif_tfm_sim_perc_cutoff = 0.05 # For TF-MoDISco hits
motif_tfm_imp_perc_cutoff = 0.05 # For TF-MoDISco hits
seed = 20210412
# Paths to original called peaks
if not peak_bed_paths:
# Use TF name and task index
base_path = "/users/amtseng/tfmodisco/"
data_path = os.path.join(base_path, "data/processed/ENCODE/")
labels_path = os.path.join(data_path, "labels/%s" % tf_name)
all_peak_beds = sorted([item for item in os.listdir(labels_path) if item.endswith(".bed.gz")])
if task_index is None:
peak_bed_paths = [os.path.join(labels_path, item) for item in all_peak_beds]
else:
peak_bed_paths = [os.path.join(labels_path, all_peak_beds[task_index])]
if hits_cache_dir:
os.makedirs(hits_cache_dir, exist_ok=True)
For plotting and organizing things
def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
"""
Imports the PFMs into a dictionary, mapping the key "x_y" to the PFM,
where `x` is the metacluster index and `y` is the pattern index.
Arguments:
`tfm_results_path`: path to HDF5 containing TF-MoDISco results
`trim`: if True, trim the motif flanks based on information content
`only_pos`: if True, only return motifs with positive contributions
Returns the dictionary of PFMs.
"""
pfms = {}
with h5py.File(tfm_results_path, "r") as f:
metaclusters = f["metacluster_idx_to_submetacluster_results"]
num_metaclusters = len(metaclusters.keys())
for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
metacluster = metaclusters[metacluster_key]
if "patterns" not in metacluster["seqlets_to_patterns_result"]:
continue
patterns = metacluster["seqlets_to_patterns_result"]["patterns"]
num_patterns = len(patterns["all_pattern_names"][:])
for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
pattern_name = pattern_name.decode()
pattern = patterns[pattern_name]
pfm = pattern["sequence"]["fwd"][:]
cwm = pattern["task0_contrib_scores"]["fwd"][:]
# Check that the contribution scores are overall positive
if only_pos and np.sum(cwm) < 0:
continue
if trim:
pfm = trim_motif_by_ic(pfm, pfm)
pfms["%d_%d" % (metacluster_i,pattern_i)] = pfm
return pfms
def import_motif_hits(motif_hits_path):
"""
Imports the motif hits, which may be the output of MOODS scanning or
TF-MoDISco hit scoring. The number of columns determines the source:
10 columns for MOODS hits, 16 for TF-MoDISco hits.
"""
with open(motif_hits_path, "r") as f:
cols = next(f).split("\t")
if len(cols) == 10:
print("MOODS hits")
hit_table = import_moods_hits(motif_hits_path)
elif len(cols) == 16:
print("TF-MoDISco hits")
hit_table = import_tfmodisco_hits(motif_hits_path)
# Sort by aggregate similarity and drop duplicates (by strand)
hit_table = hit_table.sort_values("agg_sim")
hit_table = hit_table.drop_duplicates(["chrom", "start", "end", "peak_index"], keep="last")
else:
raise ValueError("Motif hits file of unknown format/source: %s" % motif_hits_path)
return hit_table
def estimate_mode(x_values, bins=200, levels=1):
"""
Estimates the mode of the distribution using `levels` iterations of
histograms, each iteration zooming into the modal bin of the previous one.
"""
hist, edges = np.histogram(x_values, bins=bins)
bin_mode = np.argmax(hist)
left_edge, right_edge = edges[bin_mode], edges[bin_mode + 1]
if levels <= 1:
return (left_edge + right_edge) / 2
else:
return estimate_mode(
x_values[(x_values >= left_edge) & (x_values < right_edge)],
bins=bins,
levels=(levels - 1)
)
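As a quick sanity check of `estimate_mode` (synthetic data, illustrative only; the `_rng`/`_samples` names are not part of the pipeline):
_rng = np.random.RandomState(0)
# Hypothetical check: with two zoom levels, the estimate should land near
# the true mode of a synthetic Gaussian sample (mode = 2.0)
_samples = _rng.normal(loc=2.0, scale=0.5, size=100000)
print(estimate_mode(_samples, bins=200, levels=2))  # Expect roughly 2.0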
def fit_tight_exponential_dist(x_values, mode=0, percentiles=np.arange(0.05, 1, 0.05)):
"""
Given an array of x-values and a set of percentiles of the distribution,
computes the set of lambda values for an exponential distribution if the
distribution were fit to each percentile of the x-values. Returns an array
of lambda values parallel to `percentiles`. The exponential distribution
is assumed to start at the given mode (its location parameter), and all
data less than this mode is tossed out when doing this computation.
"""
assert np.min(percentiles) >= 0 and np.max(percentiles) <= 1
x_values = x_values[x_values >= mode]
per_x_vals = np.percentile(x_values, percentiles * 100)
return -np.log(1 - percentiles) / (per_x_vals - mode)
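The identity behind this fit: for an exponential null starting at the mode, the p-th percentile satisfies x_p = mode - log(1 - p) / lambda, so each percentile yields lambda = -log(1 - p) / (x_p - mode). A synthetic check (illustrative only, not pipeline data):
_rng = np.random.RandomState(0)
# Hypothetical check: for true Exp(lambda = 2) data with mode 0, every
# percentile should recover roughly the same lambda
_x = _rng.exponential(scale=0.5, size=100000)  # scale = 1 / lambda
print(fit_tight_exponential_dist(_x))  # All entries should be near 2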
def exponential_pdf(x_values, lamb):
return lamb * np.exp(-lamb * x_values)
def exponential_cdf(x_values, lamb):
return 1 - np.exp(-lamb * x_values)
def filter_moods_peak_hits(hit_table, score_column="imp_frac_score", imp_perc_cutoff=0.05):
"""
Filters the table of peak hits by the score defined by `score_column`:
a mixture model is fit to the score distribution, the exponential
component is taken as the null, and a percentile-tightened exponential
distribution is fit to that component. Hits below the `imp_perc_cutoff`
percentile of this null are cut out. Returns a reduced hit table of the
same format, and a figure of the score distribution.
"""
scores = hit_table[score_column].values
scores_finite = scores[np.isfinite(scores)]
mode = estimate_mode(scores_finite)
# Fit mixture of models to scores (mode-shifted)
over_mode_scores = scores_finite[scores_finite >= mode] - mode
mixed_model = pomegranate.GeneralMixtureModel.from_samples(
[
pomegranate.ExponentialDistribution,
pomegranate.NormalDistribution,
pomegranate.NormalDistribution
],
3, over_mode_scores[:, None]
)
mixed_model = mixed_model.fit(over_mode_scores[:, None])
mixed_model_exp_dist = mixed_model.distributions[0]
# Obtain a distribution of scores that belong to the exponential distribution
exp_scores = over_mode_scores[mixed_model.predict(over_mode_scores[:, None]) == 0]
# Fit a tight exponential distribution based on percentiles
lamb = np.max(fit_tight_exponential_dist(exp_scores))
# Plot score distribution and fit
score_fig, ax = plt.subplots(nrows=3, figsize=(20, 20))
x = np.linspace(np.min(scores_finite), np.max(scores_finite), 200)[1:]  # Skip first bucket (it's usually very large)
mix_dist_pdf = mixed_model.probability(x)
mixed_model_exp_dist_pdf = mixed_model_exp_dist.probability(x)
perc_dist_pdf = exponential_pdf(x, lamb)
perc_dist_cdf = exponential_cdf(x, lamb)
thresh = scipy.stats.expon.ppf(imp_perc_cutoff, loc=mode, scale=(1 / lamb))
# Plot mixed model
ax[0].hist(over_mode_scores + mode, bins=500, density=True, alpha=0.3)
ax[0].axvline(mode)
ax[0].plot(x + mode, mix_dist_pdf, label="Mixed model")
ax[0].plot(x + mode, mixed_model_exp_dist_pdf, label="Exponential component")
ax[0].legend()
# Plot fitted PDF
ax[1].hist(exp_scores, bins=500, density=True, alpha=0.3)
ax[1].plot(x + mode, perc_dist_pdf, label="Percentile-fitted")
ax[1].axvline(thresh)
# Plot fitted CDF
ax[2].hist(exp_scores, bins=500, density=True, alpha=1, cumulative=True, histtype="step")
ax[2].plot(x + mode, perc_dist_cdf, label="Percentile-fitted")
ax[0].set_title("Motif hit scores")
plt.show()
return hit_table.loc[hit_table[score_column] >= thresh].reset_index(drop=True), score_fig
def filter_tfm_peak_hits(
hit_table, sim_score_column="agg_sim", imp_score_column="imp_frac_score",
sim_perc_cutoff=0.05, imp_perc_cutoff=0.05
):
"""
Filters the table of peak hits by both the similarity score
(`sim_score_column`) and the importance score (`imp_score_column`),
filtering out low-percentile hits on each axis. For each score, a
Gaussian null is fit to the part of the distribution to the left of
the mode (mirroring it around the mode), and hits below the
corresponding percentile cutoff (`sim_perc_cutoff` or
`imp_perc_cutoff`) of that null are removed. Returns a reduced hit
table of the same format, and figures of the similarity distribution
and the importance distribution.
"""
# Similarity filtering
sim_scores = hit_table[sim_score_column].values
sim_scores_finite = sim_scores[np.isfinite(sim_scores)]
sim_mode = estimate_mode(sim_scores_finite)
sim_scores_below_mode = sim_scores_finite[sim_scores_finite <= sim_mode]
sim_scores_symmetric = np.concatenate([sim_scores_below_mode, (2 * sim_mode) - sim_scores_below_mode])
sim_null_model = pomegranate.NormalDistribution.from_samples(sim_scores_symmetric)
sim_null_model.fit(sim_scores_symmetric)
sim_x = np.linspace(np.min(sim_scores_symmetric), np.max(sim_scores_symmetric), 200)
sim_pdf = sim_null_model.probability(sim_x)
sim_score_fig, ax = plt.subplots(figsize=(20, 6))
ax.hist(sim_scores_finite, bins=500, density=True, alpha=0.3, label="Similarity scores")
ax.hist(sim_scores_symmetric, bins=500, density=True, alpha=0.3, label="Null component")
ax.plot(sim_x, sim_pdf, label="Fitted null model")
mean, std = sim_null_model.parameters
sim_thresh = scipy.stats.norm.ppf(sim_perc_cutoff, mean, std)
ax.axvline(sim_thresh)
plt.legend()
plt.show()
# Importance score filtering
imp_scores = hit_table[imp_score_column].values
imp_scores_finite = imp_scores[np.isfinite(imp_scores)]
imp_mode = estimate_mode(imp_scores_finite)
imp_scores_below_mode = imp_scores_finite[imp_scores_finite <= imp_mode]
imp_scores_symmetric = np.concatenate([imp_scores_below_mode, (2 * imp_mode) - imp_scores_below_mode])
imp_null_model = pomegranate.NormalDistribution.from_samples(imp_scores_symmetric)
imp_null_model.fit(imp_scores_symmetric)
imp_x = np.linspace(np.min(imp_scores_symmetric), np.max(imp_scores_symmetric), 200)
imp_pdf = imp_null_model.probability(imp_x)
imp_score_fig, ax = plt.subplots(figsize=(20, 6))
ax.hist(imp_scores_finite, bins=500, density=True, alpha=0.3, label="Importance scores")
ax.hist(imp_scores_symmetric, bins=500, density=True, alpha=0.3, label="Null component")
ax.plot(imp_x, imp_pdf, label="Fitted null model")
mean, std = imp_null_model.parameters
imp_thresh = scipy.stats.norm.ppf(imp_perc_cutoff, mean, std)
ax.axvline(imp_thresh)
plt.legend()
plt.show()
return hit_table.loc[
(hit_table[sim_score_column] >= sim_thresh) & (hit_table[imp_score_column] >= imp_thresh)
].reset_index(drop=True), (sim_score_fig, imp_score_fig)
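The mirrored-Gaussian null used above can be illustrated on synthetic scores (a sketch with made-up components, not pipeline data): reflecting the sub-mode half of a right-skewed score distribution around its mode recovers approximately the noise component.
_rng = np.random.RandomState(0)
# Hypothetical illustration: noise centered at 0 plus a positive signal
# component; the mirrored sample should look like the noise alone
_scores = np.concatenate([_rng.normal(0, 1, 50000), _rng.normal(5, 1, 5000)])
_mode = estimate_mode(_scores)
_below = _scores[_scores <= _mode]
_symmetric = np.concatenate([_below, (2 * _mode) - _below])
print(_mode, np.std(_symmetric))  # Expect roughly 0 and 1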
def get_peak_hits(peak_table, hit_table):
"""
For each peak, extracts the set of motif hits that fall in that peak.
Returns a list mapping peak index to a subtable of `hit_table`. The index
of the list is the index of the peak table.
"""
# All peaks start as the same empty table; entries are replaced (never
# mutated in place) in the loop below
peak_hits = [pd.DataFrame(columns=list(hit_table))] * len(peak_table)
for peak_index, matches in tqdm.notebook.tqdm(hit_table.groupby("peak_index")):
# Check that all of the matches are indeed overlapping the peak
peak_row = peak_table.iloc[peak_index]
chrom, start, end = peak_row["chrom"], peak_row["peak_start"], peak_row["peak_end"]
assert np.all(matches["chrom"] == chrom)
assert np.all((matches["start"] < end) & (start < matches["end"]))
peak_hits[peak_index] = matches
return peak_hits
def get_peak_motif_counts(peak_hits, motif_keys):
"""
From the peak hits (as returned by `get_peak_hits`), computes a count
array of size N x M, where N is the number of peaks and M is the number of
motifs. Each entry represents the number of times a motif appears in a peak.
`motif_keys` is a list of motif keys as they appear in `peak_hits`; the
order of the motifs M matches this list.
"""
motif_inds = {motif_keys[i] : i for i in range(len(motif_keys))}
counts = np.zeros((len(peak_hits), len(motif_keys)), dtype=int)
for i in tqdm.notebook.trange(len(peak_hits)):
hits = peak_hits[i]
for key, num in zip(*np.unique(hits["key"], return_counts=True)):
counts[i][motif_inds[key]] = num
return counts
def cluster_matrix_indices(matrix, num_clusters):
"""
Clusters matrix using k-means. Always clusters on the first
axis. Returns the indices needed to optimally order the matrix
by clusters.
"""
if len(matrix) == 1:
# Don't cluster at all
return np.array([0])
num_clusters = min(num_clusters, len(matrix))
# Perform k-means clustering
kmeans = sklearn.cluster.MiniBatchKMeans(n_clusters=num_clusters)
cluster_assignments = kmeans.fit_predict(matrix)
# Perform hierarchical clustering on the cluster centers to determine optimal ordering
kmeans_centers = kmeans.cluster_centers_
cluster_order = scipy.cluster.hierarchy.leaves_list(
scipy.cluster.hierarchy.optimal_leaf_ordering(
scipy.cluster.hierarchy.linkage(kmeans_centers, method="centroid"), kmeans_centers
)
)
# Order the peaks so that the cluster assignments follow the optimal ordering
cluster_inds = []
for cluster_id in cluster_order:
cluster_inds.append(np.where(cluster_assignments == cluster_id)[0])
cluster_inds = np.concatenate(cluster_inds)
return cluster_inds
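A minimal usage sketch for `cluster_matrix_indices` (synthetic matrix, illustrative only): rows drawn from two well-separated clusters should come out contiguous after reordering.
_rng = np.random.RandomState(0)
# Hypothetical demo: 100 rows from two clusters; after reordering, rows
# from the same cluster should appear contiguously
_matrix = np.concatenate([_rng.normal(0, 1, (50, 10)), _rng.normal(5, 1, (50, 10))])
print(cluster_matrix_indices(_matrix, num_clusters=2)[:10])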
def plot_peak_motif_indicator_heatmap(peak_hit_counts, motif_keys, subsample=None):
"""
Plots a simple indicator heatmap of the motifs in each peak.
Returns the figure.
"""
# Subsample peaks
if subsample:
peak_hit_counts = peak_hit_counts[np.random.choice(
len(peak_hit_counts), size=min(len(peak_hit_counts), subsample), replace=False
)]
peak_hit_indicators = (peak_hit_counts > 0).astype(int)
# Order columns by prevalence (by number of peaks with that motif)
counts = np.sum(peak_hit_indicators, axis=0)
inds = np.flip(np.argsort(counts))
matrix = peak_hit_indicators[:, inds]
motif_keys = np.array(motif_keys)[inds]
# Order rows in "binary" order
places = np.power(2, np.flip(np.arange(matrix.shape[1])))
values = np.sum(matrix * places, axis=1)
inds = np.flip(np.argsort(values))
matrix = matrix[inds]
# Create a figure with the right dimensions
fig_height = min(len(peak_hit_indicators) * 0.004, 8)
fig, ax = plt.subplots(figsize=(16, fig_height))
# Plot the heatmap
ax.imshow(matrix, interpolation="nearest", aspect="auto", cmap="Greens")
# Set axes on heatmap
ax.set_yticks([])
ax.set_yticklabels([])
ax.set_xticks(np.arange(len(motif_keys)))
ax.set_xticklabels(motif_keys)
ax.set_xlabel("Motif")
fig.tight_layout()
plt.show()
return fig
def plot_homotypic_densities(peak_hit_counts, motif_keys):
"""
Plots a CDF of number of motif hits per peak, for each motif.
Returns a dictionary mapping motif key to figure.
"""
figs = {}
for i in range(len(motif_keys)):
counts = peak_hit_counts[:, i]
fig, ax = plt.subplots(figsize=(8, 8))
bins = np.concatenate([np.arange(np.max(counts)), [np.inf]])
ax.hist(counts, bins=bins, density=True, histtype="step", cumulative=True)
ax.set_title("Cumulative distribution of number of %s hits per peak" % motif_keys[i])
ax.set_xlabel("Number of hits k in peak")
ax.set_ylabel("Proportion of peaks with at most k hits")
plt.show()
figs[motif_keys[i]] = fig
return figs
def get_motif_cooccurrence_count_matrix(peak_hit_counts):
"""
From an N x M (peaks by motifs) array of hit counts, returns
an M x M array of counts (i.e. how many times two motifs occur
together in the same peak). For the diagonal entries, we require that
the motif occur at least twice in a peak to be counted.
"""
peak_hit_indicators = (peak_hit_counts > 0).astype(int)
num_motifs = peak_hit_indicators.shape[1]
count_matrix = np.zeros((num_motifs, num_motifs), dtype=int)
for i in range(num_motifs):
for j in range(i):
pair_col = np.sum(peak_hit_indicators[:, [i, j]], axis=1)
count = np.sum(pair_col == 2)
count_matrix[i, j] = count
count_matrix[j, i] = count
count_matrix[i, i] = np.sum(peak_hit_counts[:, i] >= 2)
return count_matrix
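A small worked example of the count matrix (toy counts, illustrative only):
# Toy example: 3 peaks x 2 motifs; motif 0 hits twice in peak 1, the pair
# co-occurs in peak 0, and motif 1 never hits the same peak twice
_toy_counts = np.array([[1, 1], [2, 0], [0, 1]])
print(get_motif_cooccurrence_count_matrix(_toy_counts))  # [[1, 1], [1, 0]]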
def compute_cooccurrence_pvals(peak_hit_counts):
"""
Given the number of motif hits in each peak, computes p-value of
co-occurrence for each pair of motifs, including self pairs.
Returns an M x M array of p-values for the M motifs.
"""
peak_hit_indicators = (peak_hit_counts > 0).astype(int)
num_peaks, num_motifs = peak_hit_counts.shape
pvals = np.ones((num_motifs, num_motifs))
# Significance is based on Fisher's exact test. If the motifs were
# present in peaks randomly, we'd expect their occurrences to be
# independent. For self-co-occurrence, the null model is not
# independence, but a Poisson model of collisions (balls in bins)
for i in range(num_motifs):
for j in range(i):
pair_counts = peak_hit_indicators[:, [i, j]]
peaks_with_1 = pair_counts[:, 0] == 1
peaks_with_2 = pair_counts[:, 1] == 1
# Contingency table (universe is set of all peaks):
# no motif 1 | has motif 1
# no motif 2 A | B
# -------------------------+--------------
# has motif 2 C | D
# The Fisher's exact test evaluates the significance of the
# association between the two classifications
cont_table = np.array([
[
np.sum(~(peaks_with_1) & (~peaks_with_2)),
np.sum(peaks_with_1 & (~peaks_with_2))
],
[
np.sum(~(peaks_with_1) & peaks_with_2),
np.sum(peaks_with_1 & peaks_with_2)
]
])
pval = scipy.stats.fisher_exact(
cont_table, alternative="greater"
)[1]
pvals[i, j] = pval
pvals[j, i] = pval
# Self-co-occurrence: Poissonize balls in bins
# Expected number of collisions (via linearity of expectations):
num_hits = np.sum(peak_hit_indicators[:, i]) # number of "balls"
expected_collisions = num_hits * (num_hits - 1) / (2 * num_peaks)
num_collisions = np.sum(peak_hit_counts[:, i] >= 2)
if num_collisions == 0:
pval = 1
else:
pval = scipy.stats.poisson.sf(num_collisions - 1, mu=expected_collisions)  # P(X >= num_collisions)
pvals[i, i] = pval
return pvals
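The balls-in-bins null above can be checked by simulation (synthetic numbers, illustrative only): when collisions are rare, the number of peaks receiving two or more hits is close to the expected number of pairwise collisions, n * (n - 1) / (2 * N).
_rng = np.random.RandomState(0)
# Hypothetical simulation: drop n hits uniformly into N peaks and compare
# the observed number of multi-hit peaks to n*(n-1)/(2*N)
_num_peaks, _num_hits = 10000, 500
_counts = np.bincount(_rng.randint(_num_peaks, size=_num_hits), minlength=_num_peaks)
print(np.sum(_counts >= 2), _num_hits * (_num_hits - 1) / (2 * _num_peaks))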
def plot_motif_cooccurrence_heatmaps(count_matrix, pval_matrix, motif_keys):
"""
Plots a heatmap showing the number of peaks that have both types of
each motif, as well as a heatmap showing the p-value of co-occurrence.
Returns the p-value figure and the count figure, as well as the indices
of motifs used for clustering.
"""
assert count_matrix.shape == pval_matrix.shape
num_motifs = pval_matrix.shape[0]
assert len(motif_keys) == num_motifs
# Cluster by p-value
inds = cluster_matrix_indices(pval_matrix, max(5, num_motifs // 4))
pval_matrix = pval_matrix[inds][:, inds]
count_matrix = count_matrix[inds][:, inds]
motif_keys = np.array(motif_keys)[inds]
# Plot the p-value matrix
fig_width = max(5, num_motifs)
p_fig, ax = plt.subplots(figsize=(fig_width, fig_width))
# Replace 0s with minimum value (we'll label them properly later)
zero_mask = pval_matrix == 0
non_zeros = pval_matrix[~zero_mask]
if not len(non_zeros):
logpval_matrix = np.tile(np.inf, pval_matrix.shape)
else:
min_val = np.min(pval_matrix[~zero_mask])
pval_matrix[zero_mask] = min_val
logpval_matrix = -np.log10(pval_matrix)
hmap = ax.imshow(logpval_matrix)
ax.set_xticks(np.arange(num_motifs))
ax.set_yticks(np.arange(num_motifs))
ax.set_xticklabels(motif_keys, rotation=45)
ax.set_yticklabels(motif_keys)
# Loop over data dimensions and create text annotations.
for i in range(num_motifs):
for j in range(num_motifs):
if zero_mask[i, j]:
text = "Inf"
else:
text = "%.2f" % np.abs(logpval_matrix[i, j])
ax.text(j, i, text, ha="center", va="center")
p_fig.colorbar(hmap, orientation="horizontal")
ax.set_title("-log(p) significance of peaks with both motifs")
p_fig.tight_layout()
plt.show()
# Plot the counts matrix
fig_width = max(5, num_motifs)
c_fig, ax = plt.subplots(figsize=(fig_width, fig_width))
hmap = ax.imshow(count_matrix)
ax.set_xticks(np.arange(num_motifs))
ax.set_yticks(np.arange(num_motifs))
ax.set_xticklabels(motif_keys, rotation=45)
ax.set_yticklabels(motif_keys)
# Loop over data dimensions and create text annotations.
for i in range(num_motifs):
for j in range(num_motifs):
ax.text(j, i, count_matrix[i, j], ha="center", va="center")
c_fig.colorbar(hmap, orientation="horizontal")
ax.set_title("Number of peaks with both motifs")
c_fig.tight_layout()
plt.show()
return p_fig, c_fig, inds
def create_violin_plot(ax, dist_list, colors):
"""
Creates a violin plot on the given instantiated axes.
`dist_list` is a list of vectors. `colors` is a parallel
list of colors for each violin.
"""
num_perfs = len(dist_list)
q1, med, q3 = np.stack([
np.nanpercentile(data, [25, 50, 75], axis=0) for data in dist_list
], axis=1)
iqr = q3 - q1
lower_outlier = q1 - (1.5 * iqr)
upper_outlier = q3 + (1.5 * iqr)
sorted_clipped_data = [ # Remove outliers based on outlier rule
np.sort(vec[(vec >= lower_outlier[i]) & (vec <= upper_outlier[i])])
for i, vec in enumerate(dist_list)
]
plot_parts = ax.violinplot(
sorted_clipped_data, showmeans=False, showmedians=False, showextrema=False
)
violin_parts = plot_parts["bodies"]
for i in range(num_perfs):
violin_parts[i].set_facecolor(colors[i])
violin_parts[i].set_edgecolor(colors[i])
violin_parts[i].set_alpha(0.7)
inds = np.arange(1, num_perfs + 1)
ax.vlines(inds, q1, q3, color="black", linewidth=5, zorder=1)
ax.scatter(inds, med, marker="o", color="white", s=30, zorder=2)
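A minimal usage sketch for `create_violin_plot` (synthetic distributions, illustrative only):
_rng = np.random.RandomState(0)
# Hypothetical demo: two synthetic distributions on one axes
_fig, _ax = plt.subplots()
create_violin_plot(_ax, [_rng.normal(0, 1, 1000), _rng.normal(2, 0.5, 1000)], ["salmon", "skyblue"])
plt.show()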
def plot_intermotif_distance_violins(peak_hits, motif_keys, pair_inds, cluster_inds):
"""
For each pair of motifs, plots a violin of distances between
motifs. Returns a dictionary mapping pairs of motif keys to arrays
of distances, and the figure.
"""
# First, compute the distribution of distances for each pair
distance_dict = {}
key_pairs = []
for i, j in tqdm.notebook.tqdm(pair_inds):
dists = []
for k in range(len(peak_hits)):
hits = peak_hits[k]
hits_1 = hits[hits["key"] == motif_keys[i]]
hits_2 = hits[hits["key"] == motif_keys[j]]
if hits_1.empty or hits_2.empty:
continue
pos_1 = np.array(hits_1["start"])
pos_2 = np.array(hits_2["start"])
# All hits of the same motif key have the same length
len_1 = (hits_1["end"] - hits_1["start"]).values[0]
len_2 = (hits_2["end"] - hits_2["start"]).values[0]
# Differences between all pairs of positions
diffs = pos_2[None] - pos_1[:, None]
# For each instance of motif 1, take the closest instance of motif 2,
# but keep the distance only if the two motifs do not overlap
for row in diffs:
row = row[row != 0]
if not row.size:
continue
dist = row[np.argmin(np.abs(row))]
if (dist < 0 and dist < -len_2) or (dist > 0 and dist > len_1):
dists.append(dist)
dists = np.array(dists)
if not dists.size:
continue
key_pair = (motif_keys[i], motif_keys[j])
key_pairs.append(key_pair)
distance_dict[key_pair] = np.abs(dists) # Take absolute value of distance
if not distance_dict:
print("No significantly co-occurring motifs")
return distance_dict, None
# Create the plot
fig, ax = plt.subplots(
nrows=len(motif_keys), ncols=len(motif_keys),
figsize=(len(motif_keys) * 4, len(motif_keys) * 4)
)
if type(ax) is not np.ndarray:
ax = np.array([[ax]])
# Map motif key to axis index
key_to_index = dict(zip(np.array(motif_keys)[cluster_inds], np.arange(len(motif_keys))))
def clean_subplot(ax):
# Do this instead of ax.axis("off"), which would also remove any
# axis labels
ax.set_yticks([])
ax.set_xticks([])
for orient in ("top", "bottom", "left", "right"):
ax.spines[orient].set_visible(False)
# Create violins
for i in range(len(motif_keys)):
for j in range(i, len(motif_keys)):
key_1, key_2 = motif_keys[i], motif_keys[j]
key_pair, rev_key_pair = (key_1, key_2), (key_2, key_1)
axis_1, axis_2 = key_to_index[key_1], key_to_index[key_2]
# Always plot lower triangle
if axis_1 < axis_2:
axis_1, axis_2 = axis_2, axis_1
if key_pair in distance_dict or rev_key_pair in distance_dict:
if rev_key_pair in distance_dict:
key_pair = rev_key_pair
dist = distance_dict[key_pair]
create_violin_plot(ax[axis_1, axis_2], [dist], ["mediumorchid"])
ax[axis_1, axis_2].set_xticks([]) # Remove x-axis labels, as they don't mean much
if axis_1 != axis_2:
# If off diagonal, clean the axes of the symmetric cell
clean_subplot(ax[axis_2, axis_1])
else:
clean_subplot(ax[axis_1, axis_2])
clean_subplot(ax[axis_2, axis_1])
# Make motif labels
for i in range(len(motif_keys)):
ax[i, 0].set_ylabel(motif_keys[cluster_inds[i]])
ax[-1, i].set_xlabel(motif_keys[cluster_inds[i]])
# Remove x-axis labels/ticks
ax[-1, -1].set_xticks([])
fig.suptitle("Distance distributions between co-occurring motifs")
fig.tight_layout(rect=[0, 0.03, 1, 0.98])
return distance_dict, fig
# Import the PFMs
pfms = import_tfmodisco_motifs(tfm_results_path)
motif_keys = list(pfms.keys())
# Import peaks
peak_table = import_peak_table(peak_bed_paths)
# Expand to input length
peak_table["peak_start"] = \
(peak_table["peak_start"] + peak_table["summit_offset"]) - (input_length // 2)
peak_table["peak_end"] = peak_table["peak_start"] + input_length
# Import DeepSHAP scores
hyp_scores, act_scores, one_hot_seqs, shap_coords = import_shap_scores(
shap_scores_path, hyp_score_key, center_cut_size=None, remove_non_acgt=False
)
# Limit SHAP coordinates/scores to only those with matching peak coordinates
shap_coords_table = pd.DataFrame(shap_coords, columns=["chrom", "start", "end"])
peak_coords_table = peak_table[["chrom", "peak_start", "peak_end"]]
order_inds = peak_coords_table.merge(
shap_coords_table.reset_index(), how="left", left_on=["chrom", "peak_start", "peak_end"],
right_on=["chrom", "start", "end"]
)["index"].values
order_inds = np.nan_to_num(order_inds, nan=-1).astype(int)
shap_coords = shap_coords[order_inds]
hyp_scores = hyp_scores[order_inds]
act_scores = act_scores[order_inds]
one_hot_seqs = one_hot_seqs[order_inds]
# Whenever a peak coordinate had no matching SHAP coordinate, zero out the
# corresponding entries. This ensures that when we later search for
# DeepSHAP scores of such peaks, we will find nothing
shap_coords[order_inds < 0] = 0
hyp_scores[order_inds < 0] = 0
act_scores[order_inds < 0] = 0
one_hot_seqs[order_inds < 0] = 0
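The left-merge alignment above can be seen on toy tables (hypothetical names and values, illustrative only): peaks without a matching SHAP coordinate get NaN from the merge, which `nan_to_num` turns into the -1 sentinel that is zeroed out afterwards.
# Hypothetical toy example of the merge-based index alignment
_peaks = pd.DataFrame({"chrom": ["chr1", "chr2"], "peak_start": [0, 10], "peak_end": [5, 15]})
_shap = pd.DataFrame({"chrom": ["chr1"], "start": [0], "end": [5]})
_inds = _peaks.merge(
    _shap.reset_index(), how="left",
    left_on=["chrom", "peak_start", "peak_end"], right_on=["chrom", "start", "end"]
)["index"].values
print(np.nan_to_num(_inds, nan=-1).astype(int))  # [0, -1]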
# Import motif hits results
hit_table = import_motif_hits(motif_hits_path)
TF-MoDISco hits
# Filter motif hit table
if "agg_sim" in list(hit_table):
hit_table_filtered, (sim_score_fig, imp_score_fig) = filter_tfm_peak_hits(
hit_table, sim_perc_cutoff=motif_tfm_sim_perc_cutoff, imp_perc_cutoff=motif_tfm_imp_perc_cutoff
)
else:
hit_table_filtered, score_fig = filter_moods_peak_hits(
hit_table, imp_perc_cutoff=motif_moods_imp_perc_cutoff
)
assert not hit_table_filtered.empty, "Filtered out all %d original hits" % len(hit_table)
# Match peaks to motif hits
peak_hits = get_peak_hits(peak_table, hit_table_filtered)
# Construct count array of peaks and hits
peak_hit_counts = get_peak_motif_counts(peak_hits, motif_keys)
# Construct count matrix of motif co-occurrence
motif_cooccurrence_count_matrix = get_motif_cooccurrence_count_matrix(peak_hit_counts)
# Construct the matrix of p-values for motif co-occurrence
motif_cooccurrence_pval_matrix = compute_cooccurrence_pvals(peak_hit_counts)
if hits_cache_dir:
# Save the filtered hits in the cache
hit_table_filtered.reset_index().to_csv(
os.path.join(hits_cache_dir, "filtered_hits.tsv"), sep="\t", header=True, index=False
)
# Save the peaks
peak_table.reset_index().to_csv(
os.path.join(hits_cache_dir, "peaks.tsv"), sep="\t", header=True, index=False
)
# Save a mapping between peak index and filtered motif indices
with open(os.path.join(hits_cache_dir, "peak_matched_hits.tsv"), "w") as f:
f.write("peak_index\tfiltered_hit_indices\n")
for i, table in enumerate(peak_hits):
f.write("%d\t%s\n" % (i, ",".join([str(x) for x in peak_hits[i].index])))
# Save score figures
if "agg_sim" in list(hit_table):
sim_score_fig.savefig(os.path.join(hits_cache_dir, "sim_score_dist.png"))
imp_score_fig.savefig(os.path.join(hits_cache_dir, "imp_score_dist.png"))
else:
score_fig.savefig(os.path.join(hits_cache_dir, "imp_score_dist.png"))
# Save co-occurrence matrices
with h5py.File(os.path.join(hits_cache_dir, "cooccurrences.h5"), "w") as f:
f.create_dataset("counts", data=motif_cooccurrence_count_matrix, compression="gzip")
f.create_dataset("pvals", data=motif_cooccurrence_pval_matrix, compression="gzip")
motifs_per_peak = np.array([len(hits) for hits in peak_hits])
display(vdomh.p("Number of peaks: %d" % len(peak_table)))
display(vdomh.p("Number of motif hits before FDR filtering: %d" % len(hit_table)))
display(vdomh.p("Number of motif hits after FDR filtering: %d" % len(hit_table_filtered)))
Number of peaks: 5977
Number of motif hits before FDR filtering: 30408
Number of motif hits after FDR filtering: 28611
num_zero = np.sum(motifs_per_peak == 0)
display(vdomh.p("Number of peaks with 0 motif hits: %d" % num_zero))
display(vdomh.p("Percentage of peaks with 0 motif hits: %.1f%%" % (num_zero / len(peak_table) * 100)))
Number of peaks with 0 motif hits: 1468
Percentage of peaks with 0 motif hits: 24.6%
quants = [0, 0.25, 0.50, 0.75, 0.99, 1]
header = vdomh.thead(
vdomh.tr(
vdomh.th("Quantile", style={"text-align": "center"}),
vdomh.th("Number of hits/peak", style={"text-align": "center"})
)
)
body = vdomh.tbody(*([
vdomh.tr(
vdomh.td("%.1f%%" % (q * 100)), vdomh.td("%d" % v)
) for q, v in zip(quants, np.quantile(motifs_per_peak, quants))
]))
vdomh.table(header, body)
| Quantile | Number of hits/peak |
| --- | --- |
| 0.0% | 0 |
| 25.0% | 1 |
| 50.0% | 5 |
| 75.0% | 8 |
| 99.0% | 13 |
| 100.0% | 18 |
fig, ax = plt.subplots(figsize=(10, 10))
bins = np.concatenate([np.arange(np.max(motifs_per_peak) + 1), [np.inf]])
ax.hist(motifs_per_peak, bins=bins, density=True, histtype="step", cumulative=True)
ax.set_title("Cumulative distribution of number of motif hits per peak")
ax.set_xlabel("Number of motif hits k in peak")
ax.set_ylabel("Proportion of peaks with at most k hits")
plt.show()
if hits_cache_dir:
fig.savefig(os.path.join(hits_cache_dir, "peak_hit_count_cdf.png"))
frac_peaks_with_motif = np.sum(peak_hit_counts > 0, axis=0) / len(peak_hit_counts)
labels = np.array(motif_keys)
sorted_inds = np.flip(np.argsort(frac_peaks_with_motif))
frac_peaks_with_motif = frac_peaks_with_motif[sorted_inds]
labels = labels[sorted_inds]
fig, ax = plt.subplots(figsize=(20, 8))
ax.bar(np.arange(len(labels)), frac_peaks_with_motif)
ax.set_title("Proportion of peaks with each motif")
ax.set_xticks(np.arange(len(labels)))
ax.set_xticklabels(labels)
plt.show()
if hits_cache_dir:
fig.savefig(os.path.join(hits_cache_dir, "peaks_with_each_motif.png"))
# Show some examples of sequences with motif hits
num_to_draw = 3
center_plot_size = 400
unique_counts = np.sort(np.unique(motifs_per_peak))
motif_nums = []
if 0 in motifs_per_peak:
motif_nums.append(0)
if 1 in motifs_per_peak:
motif_nums.append(1)
motif_nums.extend([
unique_counts[0], # Minimum
unique_counts[len(unique_counts) // 2], # Median
unique_counts[-1], # Maximum
])
for motif_num in np.sort(np.unique(motif_nums)):
display(vdomh.h4("Sequences with %d motif hits" % motif_num))
peak_inds = np.where(motifs_per_peak == motif_num)[0]
table_rows = []
rng = np.random.RandomState(seed)
for i in rng.choice(
peak_inds, size=min(num_to_draw, len(peak_inds)), replace=False
):
peak_coord = peak_table.iloc[i][["chrom", "peak_start", "peak_end"]].values
motif_hits = peak_hits[i]
chrom, peak_start, peak_end = peak_coord
# Limit peak start/end here
mid = (peak_start + peak_end) // 2
peak_start = mid - (center_plot_size // 2)
peak_end = peak_start + center_plot_size
peak_len = peak_end - peak_start
mask = (shap_coords[:, 0] == chrom) & (shap_coords[:, 1] <= peak_start) & (shap_coords[:, 2] >= peak_end)
if not np.sum(mask):
fig = "No matching input sequence found"
table_rows.append(
vdomh.tr(
vdomh.td("%s:%d-%d" % (chrom, peak_start, peak_end)),
vdomh.td(fig)
)
)
continue
seq_index = np.where(mask)[0][0] # Pick one
imp_scores = act_scores[seq_index]
_, seq_start, seq_end = shap_coords[seq_index]
highlights = []
for _, row in motif_hits.iterrows():
start = row["start"] - peak_start
end = start + (row["end"] - row["start"])
highlights.append((start, end))
# Remove highlights that overrun the sequence
highlights = [(a, b) for a, b in highlights if a >= 0 and b < peak_len]
start = peak_start - seq_start
end = start + peak_len
imp_scores_peak = imp_scores[start:end]
fig = viz_sequence.plot_weights(
imp_scores_peak, subticks_frequency=(len(imp_scores_peak) + 1),
highlight={"red" : [pair for pair in highlights]},
return_fig=True
)
fig = figure_to_vdom_image(fig)
table_rows.append(
vdomh.tr(
vdomh.td("%s:%d-%d" % (chrom, peak_start, peak_end)),
vdomh.td(fig)
)
)
table = vdomh.table(*table_rows)
display(table)
plt.close("all")
[Output: per-peak importance-score plots with motif hits highlighted in red, for randomly drawn peaks at each hit count (e.g. chr3:186783410-186783810, chr2:231464170-231464570, chr22:50508095-50508495, chr9:99221541-99221941, chr4:79141526-79141926, chr6:29733060-29733460, chr20:61331668-61332068, chr19:15080010-15080410, chr1:24197610-24198010); the chrEBV peaks chrEBV:86509-86909 and chrEBV:161490-161890 had no matching input sequence]
density_figs = plot_homotypic_densities(peak_hit_counts, motif_keys)
if hits_cache_dir:
for key, fig in density_figs.items():
fig.savefig(os.path.join(hits_cache_dir, "homotypic_density_%s.png" % key))
fig = plot_peak_motif_indicator_heatmap(peak_hit_counts, motif_keys, subsample=10000)
if hits_cache_dir:
fig.savefig(os.path.join(hits_cache_dir, "peak_motif_indicator_heatmap.png"))
p_fig, c_fig, cluster_inds = plot_motif_cooccurrence_heatmaps(
motif_cooccurrence_count_matrix, motif_cooccurrence_pval_matrix, motif_keys
)
if hits_cache_dir:
p_fig.savefig(os.path.join(hits_cache_dir, "cooccurrence_pvals.png"))
c_fig.savefig(os.path.join(hits_cache_dir, "cooccurrence_counts.png"))
When motifs co-occur, show the distance between the instances
# Get which pairs of motifs are significant
sig_thresh = 1e-6
count_thresh = 100
pvals, sig_pairs = [], []
for i in range(len(motif_keys)):
for j in range(i + 1):
if motif_cooccurrence_pval_matrix[i, j] < sig_thresh and motif_cooccurrence_count_matrix[i, j] >= count_thresh:
sig_pairs.append((i, j))
pvals.append(motif_cooccurrence_pval_matrix[i, j])
inds = np.argsort(pvals)
sig_pairs = [sig_pairs[i] for i in inds]
distance_dict, fig = plot_intermotif_distance_violins(peak_hits, motif_keys, sig_pairs, cluster_inds)
if hits_cache_dir:
if fig is not None:
with h5py.File(os.path.join(hits_cache_dir, "intermotif_dists.h5"), "w") as f:
for key_pair, dists in distance_dict.items():
f.create_dataset("%s:%s" % key_pair, data=dists, compression="gzip")
fig.savefig(os.path.join(hits_cache_dir, "intermotif_dists.png"))