import os
import h5py
import util
import moods
import viz_sequence
import numpy as np
import pandas as pd
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
# tqdm.tqdm_notebook() is deprecated and slated for removal in tqdm 5.0
# (see the TqdmDeprecationWarning this cell used to emit); the supported
# replacement is tqdm.notebook.tqdm. `import tqdm` does not pull in the
# notebook submodule, so import it explicitly here.
import tqdm.notebook
tqdm.notebook.tqdm()
/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:16: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` app.launch_new_instance()
<tqdm.notebook.tqdm_notebook at 0x7f73f7060e50>
# Plotting defaults
# Register the custom fonts one at a time: font_manager.createFontList was
# deprecated in Matplotlib 3.2 (warning previously emitted by this cell) and
# removed in 3.3; FontManager.addfont is the supported replacement.
for _font_path in font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts"):
    font_manager.fontManager.addfont(_font_path)

# Global rcParams for all figures in this notebook
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define parameters/fetch arguments
# All input paths are supplied via environment variables (presumably set by
# the pipeline driver that launches this notebook — confirm); a missing
# variable raises KeyError immediately, which is the desired fail-fast.
shap_scores_path = os.environ["TFM_SHAP_PATH"]  # DeepSHAP scores (HDF5)
tfm_results_path = os.environ["TFM_TFM_PATH"]  # TF-MoDISco results (HDF5)
moods_dir = os.environ["TFM_MOODS_DIR"]  # directory of MOODS hit tables
embeddings_path = os.environ["TFM_EMB_PATH"]  # per-peak embeddings (NPZ)
print("DeepSHAP scores path: %s" % shap_scores_path)
print("TF-MoDISco results path: %s" % tfm_results_path)
print("Embeddings path: %s" % embeddings_path)
print("MOODS directory: %s" % moods_dir)
DeepSHAP scores path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//shap/ENCSR000EEC/counts_scores_alex_format.h5 TF-MoDISco results path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//modisco/ENCSR000EEC/counts/modisco_results.hd5 Embeddings path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//embeddings/ENCSR000EEC/embeddings.npz MOODS directory: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/reports/tfmodisco/notebooks/ENCSR000EEC/moods/counts
# Define constants
# Central window (bp) the scores/sequences are cut to -- must match what
# TF-MoDISco was run on (see the import cell below)
shap_score_center_size = 400
# HDF5 dataset key for the hypothetical importance scores
hyp_score_key = "hyp_scores"
# NOTE(review): unused in this chunk -- presumably selects a single model
# task elsewhere; None appears to mean "aggregate over all tasks" -- confirm
task_index = None
Helper functions for plotting and organizing the results
def compute_tfmodisco_motif_subclusters(tfm_results):
    """
    From an imported TF-MoDISco results object, computes the subclustering
    of heterogeneity within each motif/pattern, in place.

    Arguments:
        `tfm_results`: TF-MoDISco results object (as returned by
            `util.import_tfmodisco_results`)
    """
    metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
    for metacluster_key in metaclusters.keys():
        metacluster = metaclusters[metacluster_key]
        patterns = metacluster.seqlets_to_patterns_result.patterns
        if not patterns:
            # Skip metaclusters with no discovered patterns instead of
            # aborting the whole loop; this matches the handling of the
            # same case in `import_tfmodisco_motifs` (which `continue`s),
            # so later metaclusters still get subclustered
            continue
        for pattern in patterns:
            # Compute subclustering for each pattern (motif)
            pattern.compute_subclusters_and_embedding(
                pattern_comparison_settings=modisco.affinitymat.core.PatternComparisonSettings(
                    track_names=["task0_hypothetical_contribs", "task0_contrib_scores"],
                    track_transformer=modisco.affinitymat.L1Normalizer(),
                    min_overlap=None  # This argument is irrelevant here
                ),
                perplexity=30, n_jobs=4, verbose=True
            )
def trim_hcwm(pfm, hcwm):
    """
    Trims the flanks of a motif by information content: positions of the PFM
    with less than 0.2 bits of IC are cut off, and the surviving span is then
    re-expanded by 4 bp on each side (clipped to the motif boundaries).

    Arguments:
        `pfm`: an L x 4 position frequency matrix
        `hcwm`: the corresponding L x 4 hypothetical CWM

    Returns the trimmed hCWM (or the full hCWM if no position passes the
    IC threshold).
    """
    ic = util.info_content(pfm)
    pass_inds = np.where(ic >= 0.2)[0]  # Cut off flanks with less than 0.2 IC
    if not len(pass_inds):
        # No position clears the threshold; return the motif untrimmed
        # rather than crashing on min/max of an empty array
        return hcwm
    # Expand trimming to +/- 4bp on either side
    start = max(0, np.min(pass_inds) - 4)
    end = min(len(pfm), np.max(pass_inds) + 4 + 1)
    return hcwm[start:end]
def plot_motif_heterogeneity(tfm_results):
    """
    Given an imported TF-MoDISco results object (with pattern subclusters
    already computed), displays one HTML table per pattern: the aggregate
    motif plus each subpattern, alongside the seqlet embeddings colored by
    subcluster membership and the trimmed hCWMs.
    """
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "50%"}),
        vdomh.col(style={"width": "40%"})
    )
    header = vdomh.thead(
        vdomh.tr(
            vdomh.th("Subpattern", style={"text-align": "center"}),
            vdomh.th("Seqlets", style={"text-align": "center"}),
            vdomh.th("Embeddings", style={"text-align": "center"}),
            vdomh.th("hCWM", style={"text-align": "center"})
        )
    )
    metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
    num_metaclusters = len(metaclusters.keys())
    for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
        metacluster = metaclusters[metacluster_key]
        display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
        patterns = metacluster.seqlets_to_patterns_result.patterns
        if not patterns:
            # Skip (rather than abort on) metaclusters with no patterns so
            # later metaclusters still get plotted; matches the handling in
            # `import_tfmodisco_motifs`
            continue
        num_patterns = len(patterns)
        for pattern_i, pattern in enumerate(patterns):
            display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
            embedding = pattern.twod_embedding
            subpattern_clusters = pattern.subclusters
            # Aggregate motif
            pfm = pattern["sequence"].fwd
            hcwm = pattern["task0_hypothetical_contribs"].fwd
            trimmed_hcwm = trim_hcwm(pfm, hcwm)
            hcwm_fig = viz_sequence.plot_weights(
                trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
            )
            # All subclusters at once, one color per cluster
            emb_fig, ax = plt.subplots()
            ax.scatter(
                embedding[:,0], embedding[:,1], c=subpattern_clusters, cmap="tab20", alpha=0.3
            )
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(len(pattern.seqlets))),
                vdomh.td(util.figure_to_vdom_image(emb_fig)),
                vdomh.td(util.figure_to_vdom_image(hcwm_fig))
            )]
            for subpattern_key, subpattern in pattern.subcluster_to_subpattern.items():
                pfm = subpattern["sequence"].fwd
                hcwm = subpattern["task0_hypothetical_contribs"].fwd
                trimmed_hcwm = trim_hcwm(pfm, hcwm)
                hcwm_fig = viz_sequence.plot_weights(
                    trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
                )
                # Highlight only this subcluster's seqlets in the embedding
                emb_fig, ax = plt.subplots()
                ax.scatter(
                    embedding[:,0], embedding[:,1], c=(subpattern_clusters == subpattern_key), alpha=0.3
                )
                table_rows.append(vdomh.tr(
                    vdomh.td(str(subpattern_key)),
                    vdomh.td(str(len(subpattern.seqlets))),
                    vdomh.td(util.figure_to_vdom_image(emb_fig)),
                    vdomh.td(util.figure_to_vdom_image(hcwm_fig))
                ))
            # Attach the column-width spec, which was previously built but
            # never passed to the table
            table = vdomh.table(colgroup, header, vdomh.tbody(*table_rows))
            display(table)
            plt.close("all")  # Remove all standing figures
def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Imports the hCWMs into a dictionary, mapping the key `"x_y"` to the hCWM,
    where `x` is the metacluster index and `y` is the pattern index.
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, trim the motif flanks based on information content
            of the PFM (see `trim_hcwm`)
        `only_pos`: if True, only return motifs with overall-positive
            contribution scores
    Returns the dictionary of hCWMs.
    """
    hcwms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        num_metaclusters = len(metaclusters.keys())
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            metacluster = metaclusters[metacluster_key]
            # Metaclusters with no discovered patterns lack a "patterns"
            # group entirely; skip them
            if "patterns" not in metacluster["seqlets_to_patterns_result"]:
                continue
            patterns = metacluster["seqlets_to_patterns_result"]["patterns"]
            num_patterns = len(patterns["all_pattern_names"][:])
            for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
                # Pattern names are stored as bytes in the HDF5
                pattern_name = pattern_name.decode()
                pattern = patterns[pattern_name]
                pfm = pattern["sequence"]["fwd"][:]
                hcwm = pattern["task0_hypothetical_contribs"]["fwd"][:]
                cwm = pattern["task0_contrib_scores"]["fwd"][:]
                # Check that the contribution scores are overall positive
                if only_pos and np.sum(cwm) < 0:
                    continue
                if trim:
                    hcwm = trim_hcwm(pfm, hcwm)
                hcwms["%d_%d" % (metacluster_i,pattern_i)] = hcwm
    return hcwms
def get_hit_peak_indices(hit_table, motif_keys):
    """
    Returns a dictionary of NumPy arrays, mapping each motif key to the set
    of peak indices whose peaks contain a hit for that motif.

    Arguments:
        `hit_table`: DataFrame of motif hits with (at least) a "key" column
            of motif keys and a "peak_index" column
        `motif_keys`: iterable of motif keys to look up
    """
    return {
        motif_key: hit_table.loc[hit_table["key"] == motif_key, "peak_index"].values
        for motif_key in motif_keys
    }
def plot_peak_clustering(embeddings, motif_keys, hcwms, hit_peak_indices):
    """
    Plots the UMAP of the peak embeddings once per motif, highlighting the
    peaks that contain a hit for that motif, in a table next to each hCWM.

    Arguments:
        `embeddings`: an N x D array of peak embeddings
        `motif_keys`: list of motif keys (matching `hcwms`/`hit_peak_indices`)
        `hcwms`: dictionary mapping motif key to (trimmed) hCWM
        `hit_peak_indices`: dictionary mapping motif key to an array of the
            peak indices containing that motif (see `get_hit_peak_indices`)
    """
    # First reduce using PCA
    centered = embeddings - np.mean(embeddings, axis=0, keepdims=True)
    pca = sklearn.decomposition.PCA(n_components=20)
    reduced = pca.fit_transform(centered)
    # Run UMAP on the PCA-reduced embeddings; previously UMAP was fit on
    # `centered`, leaving the PCA computation unused
    um = umap.UMAP(verbose=False)
    trans = um.fit_transform(reduced)
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "55%"}),  # was "55" (missing the % unit)
        vdomh.col(style={"width": "40%"})
    )
    header = vdomh.thead(
        vdomh.tr(
            vdomh.th("Motif key", style={"text-align": "center"}),
            vdomh.th("Embeddings", style={"text-align": "center"}),
            vdomh.th("hCWM", style={"text-align": "center"})
        )
    )
    table_rows = []
    for motif_key in motif_keys:
        hcwm = hcwms[motif_key]
        hcwm_fig = viz_sequence.plot_weights(
            hcwm, subticks_frequency=(len(hcwm) + 1), return_fig=True
        )
        emb_fig, ax = plt.subplots()
        # Binary mask over all peaks: 1 where the peak contains this motif
        subset = np.zeros(len(embeddings), dtype=int)
        subset[hit_peak_indices[motif_key]] = 1
        ax.scatter(
            trans[:,0], trans[:,1], c=subset, alpha=0.3
        )
        table_rows.append(vdomh.tr(
            vdomh.td(motif_key),
            vdomh.td(util.figure_to_vdom_image(emb_fig)),
            vdomh.td(util.figure_to_vdom_image(hcwm_fig))
        ))
    # Attach the column-width spec, which was previously built but never
    # passed to the table
    table = vdomh.table(colgroup, header, vdomh.tbody(*table_rows))
    display(table)
    plt.close("all")  # Remove all standing figures
Run motif subclustering
# Import SHAP coordinates and one-hot sequences; scores/sequences are cut
# down to the central `shap_score_center_size` bases of each example
hyp_scores, _, one_hot_seqs, shap_coords = util.import_shap_scores(shap_scores_path, hyp_score_key, center_cut_size=shap_score_center_size, remove_non_acgt=False)
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the coordinates are uncut
Importing SHAP scores: 0%| | 0/49 [00:00<?, ?it/s] Importing SHAP scores: 2%|▏ | 1/49 [00:00<00:15, 3.09it/s] Importing SHAP scores: 4%|▍ | 2/49 [00:00<00:11, 3.94it/s] Importing SHAP scores: 6%|▌ | 3/49 [00:00<00:10, 4.58it/s] Importing SHAP scores: 8%|▊ | 4/49 [00:00<00:10, 4.19it/s] Importing SHAP scores: 10%|█ | 5/49 [00:01<00:09, 4.59it/s] Importing SHAP scores: 12%|█▏ | 6/49 [00:01<00:08, 4.98it/s] Importing SHAP scores: 14%|█▍ | 7/49 [00:01<00:09, 4.40it/s] Importing SHAP scores: 16%|█▋ | 8/49 [00:01<00:08, 4.73it/s] Importing SHAP scores: 18%|█▊ | 9/49 [00:01<00:08, 4.98it/s] Importing SHAP scores: 20%|██ | 10/49 [00:02<00:08, 4.51it/s] Importing SHAP scores: 22%|██▏ | 11/49 [00:02<00:07, 4.75it/s] Importing SHAP scores: 24%|██▍ | 12/49 [00:02<00:07, 5.04it/s] Importing SHAP scores: 27%|██▋ | 13/49 [00:02<00:07, 4.73it/s] Importing SHAP scores: 29%|██▊ | 14/49 [00:03<00:07, 4.91it/s] Importing SHAP scores: 31%|███ | 15/49 [00:03<00:06, 4.95it/s] Importing SHAP scores: 33%|███▎ | 16/49 [00:03<00:06, 5.20it/s] Importing SHAP scores: 35%|███▍ | 17/49 [00:03<00:06, 4.89it/s] Importing SHAP scores: 37%|███▋ | 18/49 [00:03<00:06, 4.76it/s] Importing SHAP scores: 39%|███▉ | 19/49 [00:04<00:06, 4.86it/s] Importing SHAP scores: 41%|████ | 20/49 [00:04<00:06, 4.38it/s] Importing SHAP scores: 43%|████▎ | 21/49 [00:04<00:05, 4.71it/s] Importing SHAP scores: 45%|████▍ | 22/49 [00:04<00:05, 4.52it/s] Importing SHAP scores: 47%|████▋ | 23/49 [00:04<00:06, 4.27it/s] Importing SHAP scores: 49%|████▉ | 24/49 [00:05<00:05, 4.65it/s] Importing SHAP scores: 51%|█████ | 25/49 [00:05<00:04, 4.95it/s] Importing SHAP scores: 53%|█████▎ | 26/49 [00:05<00:04, 4.60it/s] Importing SHAP scores: 55%|█████▌ | 27/49 [00:05<00:04, 4.76it/s] Importing SHAP scores: 57%|█████▋ | 28/49 [00:05<00:04, 5.05it/s] Importing SHAP scores: 59%|█████▉ | 29/49 [00:06<00:04, 4.74it/s] Importing SHAP scores: 61%|██████ | 30/49 [00:06<00:03, 4.87it/s] Importing SHAP scores: 63%|██████▎ | 31/49 [00:06<00:03, 
4.84it/s] Importing SHAP scores: 65%|██████▌ | 32/49 [00:06<00:03, 5.02it/s] Importing SHAP scores: 67%|██████▋ | 33/49 [00:07<00:03, 4.80it/s] Importing SHAP scores: 69%|██████▉ | 34/49 [00:07<00:03, 4.95it/s] Importing SHAP scores: 71%|███████▏ | 35/49 [00:07<00:02, 4.98it/s] Importing SHAP scores: 73%|███████▎ | 36/49 [00:07<00:02, 4.60it/s] Importing SHAP scores: 76%|███████▌ | 37/49 [00:07<00:02, 4.92it/s] Importing SHAP scores: 78%|███████▊ | 38/49 [00:08<00:02, 4.91it/s] Importing SHAP scores: 80%|███████▉ | 39/49 [00:08<00:02, 4.56it/s] Importing SHAP scores: 82%|████████▏ | 40/49 [00:08<00:01, 4.77it/s] Importing SHAP scores: 84%|████████▎ | 41/49 [00:08<00:01, 5.06it/s] Importing SHAP scores: 86%|████████▌ | 42/49 [00:08<00:01, 4.31it/s] Importing SHAP scores: 88%|████████▊ | 43/49 [00:09<00:01, 4.43it/s] Importing SHAP scores: 90%|████████▉ | 44/49 [00:09<00:01, 4.67it/s] Importing SHAP scores: 92%|█████████▏| 45/49 [00:09<00:00, 4.48it/s] Importing SHAP scores: 94%|█████████▍| 46/49 [00:09<00:00, 4.72it/s] Importing SHAP scores: 96%|█████████▌| 47/49 [00:09<00:00, 4.96it/s] Importing SHAP scores: 98%|█████████▊| 48/49 [00:10<00:00, 5.17it/s] Importing SHAP scores: 100%|██████████| 49/49 [00:10<00:00, 4.76it/s]
# Import the TF-MoDISco results object, reconstructed from the HDF5 along
# with the (cut) hypothetical scores and one-hot sequences
tfm_obj = util.import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)
# Compute subclusters (needed for older versions of TF-MoDISco); this takes awhile!
compute_tfmodisco_motif_subclusters(tfm_obj)
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 6.1s [Parallel(n_jobs=4)]: Done 192 tasks | elapsed: 13.5s [Parallel(n_jobs=4)]: Done 442 tasks | elapsed: 26.4s [Parallel(n_jobs=4)]: Done 792 tasks | elapsed: 45.7s [Parallel(n_jobs=4)]: Done 1242 tasks | elapsed: 1.1min [Parallel(n_jobs=4)]: Done 1792 tasks | elapsed: 1.5min [Parallel(n_jobs=4)]: Done 2442 tasks | elapsed: 2.0min [Parallel(n_jobs=4)]: Done 3192 tasks | elapsed: 2.6min [Parallel(n_jobs=4)]: Done 4042 tasks | elapsed: 3.2min [Parallel(n_jobs=4)]: Done 4992 tasks | elapsed: 4.0min [Parallel(n_jobs=4)]: Done 6042 tasks | elapsed: 4.9min [Parallel(n_jobs=4)]: Done 7192 tasks | elapsed: 5.9min [Parallel(n_jobs=4)]: Done 8442 tasks | elapsed: 6.8min [Parallel(n_jobs=4)]: Done 9792 tasks | elapsed: 7.8min [Parallel(n_jobs=4)]: Done 11242 tasks | elapsed: 9.1min [Parallel(n_jobs=4)]: Done 12792 tasks | elapsed: 10.4min [Parallel(n_jobs=4)]: Done 13036 out of 13036 | elapsed: 10.6min finished /users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/sklearn/manifold/_t_sne.py:699: FutureWarning: 'square_distances' has been introduced in 0.24 to help phase out legacy squaring behavior. The 'legacy' setting will be removed in 1.1 (renaming of 0.26), and the default setting will be changed to True. In 1.3, 'square_distances' will be removed altogether, and distances will be squared by default. Set 'square_distances'=True to silence this warning. FutureWarning /users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/sklearn/neighbors/_base.py:176: EfficiencyWarning: Precomputed sparse input was not sorted by data. EfficiencyWarning)
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 13036 samples in 0.107s... [t-SNE] Computed neighbors for 13036 samples in 0.007s... [t-SNE] Computed conditional probabilities for sample 1000 / 13036 [t-SNE] Computed conditional probabilities for sample 2000 / 13036 [t-SNE] Computed conditional probabilities for sample 3000 / 13036 [t-SNE] Computed conditional probabilities for sample 4000 / 13036 [t-SNE] Computed conditional probabilities for sample 5000 / 13036 [t-SNE] Computed conditional probabilities for sample 6000 / 13036 [t-SNE] Computed conditional probabilities for sample 7000 / 13036 [t-SNE] Computed conditional probabilities for sample 8000 / 13036 [t-SNE] Computed conditional probabilities for sample 9000 / 13036 [t-SNE] Computed conditional probabilities for sample 10000 / 13036 [t-SNE] Computed conditional probabilities for sample 11000 / 13036 [t-SNE] Computed conditional probabilities for sample 12000 / 13036 [t-SNE] Computed conditional probabilities for sample 13000 / 13036 [t-SNE] Computed conditional probabilities for sample 13036 / 13036 [t-SNE] Mean sigma: 0.188281 [t-SNE] Computed conditional probabilities in 0.818s [t-SNE] Iteration 50: error = 98.0701141, gradient norm = 0.0000268 (50 iterations in 11.948s) [t-SNE] Iteration 100: error = 98.0660934, gradient norm = 0.0004327 (50 iterations in 13.724s) [t-SNE] Iteration 150: error = 96.6215210, gradient norm = 0.0002857 (50 iterations in 13.104s) [t-SNE] Iteration 200: error = 96.4191666, gradient norm = 0.0005257 (50 iterations in 13.984s) [t-SNE] Iteration 250: error = 96.3145905, gradient norm = 0.0000267 (50 iterations in 14.644s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 96.314590 [t-SNE] Iteration 300: error = 3.9039860, gradient norm = 0.0011251 (50 iterations in 13.321s) [t-SNE] Iteration 350: error = 3.5472841, gradient norm = 0.0005036 (50 iterations in 13.148s) [t-SNE] Iteration 400: error = 3.3866339, gradient norm = 0.0002926 (50 iterations 
in 13.155s) [t-SNE] Iteration 450: error = 3.2909482, gradient norm = 0.0001994 (50 iterations in 14.589s) [t-SNE] Iteration 500: error = 3.2247148, gradient norm = 0.0001503 (50 iterations in 13.896s) [t-SNE] Iteration 550: error = 3.1755595, gradient norm = 0.0001200 (50 iterations in 15.032s) [t-SNE] Iteration 600: error = 3.1376548, gradient norm = 0.0000995 (50 iterations in 14.044s) [t-SNE] Iteration 650: error = 3.1073236, gradient norm = 0.0000846 (50 iterations in 14.676s) [t-SNE] Iteration 700: error = 3.0825818, gradient norm = 0.0000742 (50 iterations in 14.950s) [t-SNE] Iteration 750: error = 3.0617912, gradient norm = 0.0000696 (50 iterations in 14.558s) [t-SNE] Iteration 800: error = 3.0438013, gradient norm = 0.0000684 (50 iterations in 14.632s) [t-SNE] Iteration 850: error = 3.0296595, gradient norm = 0.0000595 (50 iterations in 14.806s) [t-SNE] Iteration 900: error = 3.0180936, gradient norm = 0.0000535 (50 iterations in 14.706s) [t-SNE] Iteration 950: error = 3.0082047, gradient norm = 0.0000488 (50 iterations in 14.688s) [t-SNE] Iteration 1000: error = 2.9999802, gradient norm = 0.0000469 (50 iterations in 14.624s) [t-SNE] KL divergence after 1000 iterations: 2.999980 [t-SNE] Computed conditional probabilities for sample 1000 / 13036 [t-SNE] Computed conditional probabilities for sample 2000 / 13036 [t-SNE] Computed conditional probabilities for sample 3000 / 13036 [t-SNE] Computed conditional probabilities for sample 4000 / 13036 [t-SNE] Computed conditional probabilities for sample 5000 / 13036 [t-SNE] Computed conditional probabilities for sample 6000 / 13036 [t-SNE] Computed conditional probabilities for sample 7000 / 13036 [t-SNE] Computed conditional probabilities for sample 8000 / 13036 [t-SNE] Computed conditional probabilities for sample 9000 / 13036 [t-SNE] Computed conditional probabilities for sample 10000 / 13036 [t-SNE] Computed conditional probabilities for sample 11000 / 13036 [t-SNE] Computed conditional probabilities for sample 
12000 / 13036 [t-SNE] Computed conditional probabilities for sample 13000 / 13036 [t-SNE] Computed conditional probabilities for sample 13036 / 13036 [t-SNE] Mean sigma: 0.188281 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 2.6min
Quality: 0.6705248042564095 Quality: 0.6720375953776699 Quality: 0.6740980607987367 Quality: 0.6751918832396383 Quality: 0.6761101224422381 Quality: 0.6763262829969702
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 3.2min finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 280 tasks | elapsed: 1.4s [Parallel(n_jobs=4)]: Done 1233 out of 1233 | elapsed: 5.4s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 1233 samples in 0.007s... [t-SNE] Computed neighbors for 1233 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 1000 / 1233 [t-SNE] Computed conditional probabilities for sample 1233 / 1233 [t-SNE] Mean sigma: 0.213123 [t-SNE] Computed conditional probabilities in 0.083s [t-SNE] Iteration 50: error = 76.5437241, gradient norm = 0.2815674 (50 iterations in 9.054s) [t-SNE] Iteration 100: error = 78.0789642, gradient norm = 0.2509831 (50 iterations in 8.916s) [t-SNE] Iteration 150: error = 78.6444168, gradient norm = 0.2488905 (50 iterations in 8.908s) [t-SNE] Iteration 200: error = 78.5007935, gradient norm = 0.2490332 (50 iterations in 11.128s) [t-SNE] Iteration 250: error = 78.2442627, gradient norm = 0.2588885 (50 iterations in 13.848s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 78.244263 [t-SNE] Iteration 300: error = 2.3611672, gradient norm = 0.0024051 (50 iterations in 14.178s) [t-SNE] Iteration 350: error = 2.1742642, gradient norm = 0.0010942 (50 iterations in 13.912s) [t-SNE] Iteration 400: error = 2.1318431, gradient norm = 0.0003610 (50 iterations in 13.824s) [t-SNE] Iteration 450: error = 2.1047237, gradient norm = 0.0003574 (50 iterations in 13.696s) [t-SNE] Iteration 500: error = 2.0912130, gradient norm = 0.0002509 (50 iterations in 14.032s) [t-SNE] Iteration 550: error = 2.0808151, gradient norm = 0.0002135 (50 iterations in 13.544s) [t-SNE] Iteration 600: error = 2.0735421, gradient norm = 0.0002009 (50 iterations in 13.516s) [t-SNE] Iteration 650: error = 2.0662558, gradient norm = 0.0001676 (50 iterations in 13.532s) [t-SNE] Iteration 700: error = 2.0628028, gradient norm = 0.0001671 (50 iterations in 13.636s) [t-SNE] Iteration 750: error = 2.0570281, gradient norm = 0.0001614 (50 iterations in 14.292s) [t-SNE] Iteration 800: error = 2.0551090, gradient norm = 0.0001219 (50 iterations in 14.268s) [t-SNE] Iteration 850: error = 2.0529013, 
gradient norm = 0.0001184 (50 iterations in 14.036s) [t-SNE] Iteration 900: error = 2.0519850, gradient norm = 0.0000608 (50 iterations in 13.976s) [t-SNE] Iteration 950: error = 2.0511327, gradient norm = 0.0000561 (50 iterations in 13.888s) [t-SNE] Iteration 1000: error = 2.0505106, gradient norm = 0.0000486 (50 iterations in 13.912s) [t-SNE] KL divergence after 1000 iterations: 2.050511 [t-SNE] Computed conditional probabilities for sample 1000 / 1233 [t-SNE] Computed conditional probabilities for sample 1233 / 1233 [t-SNE] Mean sigma: 0.213123 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 37.9s
Quality: 0.5025858151383051 Quality: 0.5026321638503229 Quality: 0.5030348498486149
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 42.6s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 280 tasks | elapsed: 1.0s [Parallel(n_jobs=4)]: Done 868 out of 868 | elapsed: 2.4s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 868 samples in 0.006s... [t-SNE] Computed neighbors for 868 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 868 / 868 [t-SNE] Mean sigma: 0.230231 [t-SNE] Computed conditional probabilities in 0.058s [t-SNE] Iteration 50: error = 76.6028900, gradient norm = 0.3726501 (50 iterations in 10.408s) [t-SNE] Iteration 100: error = 78.7410278, gradient norm = 0.3149196 (50 iterations in 9.159s) [t-SNE] Iteration 150: error = 79.8931503, gradient norm = 0.3349240 (50 iterations in 9.523s) [t-SNE] Iteration 200: error = 81.3601227, gradient norm = 0.3066660 (50 iterations in 11.497s) [t-SNE] Iteration 250: error = 80.3943329, gradient norm = 0.3222249 (50 iterations in 14.072s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 80.394333 [t-SNE] Iteration 300: error = 1.7223886, gradient norm = 0.0038616 (50 iterations in 14.038s) [t-SNE] Iteration 350: error = 1.6462984, gradient norm = 0.0009397 (50 iterations in 13.828s) [t-SNE] Iteration 400: error = 1.6119348, gradient norm = 0.0004715 (50 iterations in 13.719s) [t-SNE] Iteration 450: error = 1.5880811, gradient norm = 0.0003733 (50 iterations in 14.085s) [t-SNE] Iteration 500: error = 1.5780219, gradient norm = 0.0001960 (50 iterations in 14.288s) [t-SNE] Iteration 550: error = 1.5726110, gradient norm = 0.0002284 (50 iterations in 14.272s) [t-SNE] Iteration 600: error = 1.5623331, gradient norm = 0.0005394 (50 iterations in 14.000s) [t-SNE] Iteration 650: error = 1.5540992, gradient norm = 0.0002221 (50 iterations in 13.952s) [t-SNE] Iteration 700: error = 1.5507047, gradient norm = 0.0001713 (50 iterations in 14.132s) [t-SNE] Iteration 750: error = 1.5466113, gradient norm = 0.0001892 (50 iterations in 13.820s) [t-SNE] Iteration 800: error = 1.5447025, gradient norm = 0.0001697 (50 iterations in 15.124s) [t-SNE] Iteration 850: error = 1.5436926, gradient norm = 0.0001539 (50 iterations in 15.100s) [t-SNE] Iteration 
900: error = 1.5417789, gradient norm = 0.0000864 (50 iterations in 14.792s) [t-SNE] Iteration 950: error = 1.5407066, gradient norm = 0.0001260 (50 iterations in 14.544s) [t-SNE] Iteration 1000: error = 1.5365589, gradient norm = 0.0001801 (50 iterations in 15.024s) [t-SNE] KL divergence after 1000 iterations: 1.536559 [t-SNE] Computed conditional probabilities for sample 868 / 868 [t-SNE] Mean sigma: 0.230231 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 46.6s
Quality: 0.5193405415902501 Quality: 0.520373827530385 Quality: 0.5204206500606168
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 54.1s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 205 tasks | elapsed: 0.4s [Parallel(n_jobs=4)]: Done 287 out of 287 | elapsed: 0.5s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 287 samples in 0.002s... [t-SNE] Computed neighbors for 287 samples in 0.000s... [t-SNE] Computed conditional probabilities for sample 287 / 287 [t-SNE] Mean sigma: 0.256438 [t-SNE] Computed conditional probabilities in 0.035s [t-SNE] Iteration 50: error = 71.0547867, gradient norm = 0.4467659 (50 iterations in 13.748s) [t-SNE] Iteration 100: error = 69.8215179, gradient norm = 0.4916288 (50 iterations in 13.658s) [t-SNE] Iteration 150: error = 69.6476669, gradient norm = 0.4599191 (50 iterations in 13.974s) [t-SNE] Iteration 200: error = 71.1695862, gradient norm = 0.4567703 (50 iterations in 14.704s) [t-SNE] Iteration 250: error = 70.0907135, gradient norm = 0.4919164 (50 iterations in 14.572s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 70.090714 [t-SNE] Iteration 300: error = 1.2501577, gradient norm = 0.0060484 (50 iterations in 15.015s) [t-SNE] Iteration 350: error = 1.1066504, gradient norm = 0.0037630 (50 iterations in 15.012s) [t-SNE] Iteration 400: error = 1.0018024, gradient norm = 0.0036402 (50 iterations in 14.560s) [t-SNE] Iteration 450: error = 0.9594935, gradient norm = 0.0014510 (50 iterations in 14.996s) [t-SNE] Iteration 500: error = 0.9271392, gradient norm = 0.0010503 (50 iterations in 15.056s) [t-SNE] Iteration 550: error = 0.9245630, gradient norm = 0.0003819 (50 iterations in 15.124s) [t-SNE] Iteration 600: error = 0.8938321, gradient norm = 0.0016314 (50 iterations in 15.196s) [t-SNE] Iteration 650: error = 0.8838325, gradient norm = 0.0002473 (50 iterations in 15.252s) [t-SNE] Iteration 700: error = 0.8774467, gradient norm = 0.0005529 (50 iterations in 14.924s) [t-SNE] Iteration 750: error = 0.8753886, gradient norm = 0.0004748 (50 iterations in 14.372s) [t-SNE] Iteration 800: error = 0.8750103, gradient norm = 0.0001479 (50 iterations in 14.136s) [t-SNE] Iteration 850: error = 0.8746436, gradient norm = 0.0000938 (50 iterations in 13.868s) [t-SNE] Iteration 
900: error = 0.8744276, gradient norm = 0.0001134 (50 iterations in 14.092s) [t-SNE] Iteration 950: error = 0.8745212, gradient norm = 0.0000776 (50 iterations in 14.068s) [t-SNE] Iteration 1000: error = 0.8744664, gradient norm = 0.0000593 (50 iterations in 14.024s) [t-SNE] KL divergence after 1000 iterations: 0.874466 [t-SNE] Computed conditional probabilities for sample 287 / 287 [t-SNE] Mean sigma: 0.256438 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 31.6s
Quality: 0.458683684660014 Quality: 0.4589132100197425
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 36.2s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 190 tasks | elapsed: 0.2s [Parallel(n_jobs=4)]: Done 203 out of 203 | elapsed: 0.3s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 203 samples in 0.002s... [t-SNE] Computed neighbors for 203 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 203 / 203 [t-SNE] Mean sigma: 0.204818 [t-SNE] Computed conditional probabilities in 0.014s [t-SNE] Iteration 50: error = 73.4792557, gradient norm = 0.4361841 (50 iterations in 10.214s) [t-SNE] Iteration 100: error = 72.7823639, gradient norm = 0.4535973 (50 iterations in 9.272s) [t-SNE] Iteration 150: error = 72.1932526, gradient norm = 0.4561660 (50 iterations in 9.448s) [t-SNE] Iteration 200: error = 69.8556900, gradient norm = 0.4691205 (50 iterations in 9.412s) [t-SNE] Iteration 250: error = 68.1697006, gradient norm = 0.4962846 (50 iterations in 14.289s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 68.169701 [t-SNE] Iteration 300: error = 1.1729045, gradient norm = 0.0068073 (50 iterations in 15.132s) [t-SNE] Iteration 350: error = 0.9954368, gradient norm = 0.0050285 (50 iterations in 14.888s) [t-SNE] Iteration 400: error = 0.9434646, gradient norm = 0.0025235 (50 iterations in 15.392s) [t-SNE] Iteration 450: error = 0.9001442, gradient norm = 0.0024117 (50 iterations in 14.956s) [t-SNE] Iteration 500: error = 0.8951796, gradient norm = 0.0006679 (50 iterations in 15.080s) [t-SNE] Iteration 550: error = 0.8942130, gradient norm = 0.0002858 (50 iterations in 14.968s) [t-SNE] Iteration 600: error = 0.8916829, gradient norm = 0.0002612 (50 iterations in 15.008s) [t-SNE] Iteration 650: error = 0.8916100, gradient norm = 0.0001682 (50 iterations in 15.264s) [t-SNE] Iteration 700: error = 0.8914098, gradient norm = 0.0003945 (50 iterations in 14.647s) [t-SNE] Iteration 750: error = 0.8890102, gradient norm = 0.0003507 (50 iterations in 15.117s) [t-SNE] Iteration 800: error = 0.8792962, gradient norm = 0.0006331 (50 iterations in 15.332s) [t-SNE] Iteration 850: error = 0.8785802, gradient norm = 0.0002532 (50 iterations in 14.992s) [t-SNE] Iteration 
900: error = 0.8790783, gradient norm = 0.0002856 (50 iterations in 14.748s) [t-SNE] Iteration 950: error = 0.8786013, gradient norm = 0.0002604 (50 iterations in 15.028s) [t-SNE] Iteration 1000: error = 0.8784119, gradient norm = 0.0001435 (50 iterations in 14.976s) [t-SNE] KL divergence after 1000 iterations: 0.878412 [t-SNE] Computed conditional probabilities for sample 203 / 203 [t-SNE] Mean sigma: 0.204818 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 41.3s
Quality: 0.38612190401770596 Quality: 0.3861494581701903
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 47.5s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 190 tasks | elapsed: 0.5s [Parallel(n_jobs=4)]: Done 198 out of 198 | elapsed: 0.6s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 198 samples in 0.002s... [t-SNE] Computed neighbors for 198 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 198 / 198 [t-SNE] Mean sigma: 0.288859 [t-SNE] Computed conditional probabilities in 0.013s [t-SNE] Iteration 50: error = 68.4154663, gradient norm = 0.5164141 (50 iterations in 14.955s) [t-SNE] Iteration 100: error = 69.4444046, gradient norm = 0.4882679 (50 iterations in 13.536s) [t-SNE] Iteration 150: error = 72.8962784, gradient norm = 0.4375171 (50 iterations in 13.884s) [t-SNE] Iteration 200: error = 67.8458939, gradient norm = 0.4849998 (50 iterations in 13.692s) [t-SNE] Iteration 250: error = 71.1676025, gradient norm = 0.4645479 (50 iterations in 14.264s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 71.167603 [t-SNE] Iteration 300: error = 1.2632377, gradient norm = 0.0074230 (50 iterations in 14.931s) [t-SNE] Iteration 350: error = 1.0067607, gradient norm = 0.0040895 (50 iterations in 15.244s) [t-SNE] Iteration 400: error = 0.9601998, gradient norm = 0.0023977 (50 iterations in 15.156s) [t-SNE] Iteration 450: error = 0.8909473, gradient norm = 0.0023687 (50 iterations in 14.340s) [t-SNE] Iteration 500: error = 0.8528365, gradient norm = 0.0036599 (50 iterations in 14.848s) [t-SNE] Iteration 550: error = 0.8279230, gradient norm = 0.0024824 (50 iterations in 15.440s) [t-SNE] Iteration 600: error = 0.8264963, gradient norm = 0.0003000 (50 iterations in 15.632s) [t-SNE] Iteration 650: error = 0.8265697, gradient norm = 0.0001945 (50 iterations in 15.088s) [t-SNE] Iteration 700: error = 0.8266262, gradient norm = 0.0002426 (50 iterations in 14.984s) [t-SNE] Iteration 750: error = 0.8266770, gradient norm = 0.0002899 (50 iterations in 15.224s) [t-SNE] Iteration 800: error = 0.8266639, gradient norm = 0.0001727 (50 iterations in 15.180s) [t-SNE] Iteration 850: error = 0.8267263, gradient norm = 0.0001284 (50 iterations in 15.020s) [t-SNE] Iteration 
900: error = 0.8266816, gradient norm = 0.0001288 (50 iterations in 15.056s) [t-SNE] Iteration 950: error = 0.8268535, gradient norm = 0.0001455 (50 iterations in 14.572s) [t-SNE] Iteration 950: did not make any progress during the last 300 episodes. Finished. [t-SNE] KL divergence after 950 iterations: 0.826853 [t-SNE] Computed conditional probabilities for sample 198 / 198 [t-SNE] Mean sigma: 0.288859 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 42.0s
Quality: 0.3681845083554406 Quality: 0.3685749245353921
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 48.6s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 62 out of 62 | elapsed: 0.1s finished
[t-SNE] Computing 61 nearest neighbors... [t-SNE] Indexed 62 samples in 0.001s... [t-SNE] Computed neighbors for 62 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 62 / 62 [t-SNE] Mean sigma: 0.682498 [t-SNE] Computed conditional probabilities in 0.015s [t-SNE] Iteration 50: error = 51.4254379, gradient norm = 0.4755619 (50 iterations in 15.223s) [t-SNE] Iteration 100: error = 52.6826096, gradient norm = 0.5071231 (50 iterations in 14.960s) [t-SNE] Iteration 150: error = 51.2429619, gradient norm = 0.5220103 (50 iterations in 13.852s) [t-SNE] Iteration 200: error = 49.3350639, gradient norm = 0.5252943 (50 iterations in 9.560s) [t-SNE] Iteration 250: error = 51.3624840, gradient norm = 0.5390658 (50 iterations in 9.192s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 51.362484 [t-SNE] Iteration 300: error = 1.0522441, gradient norm = 0.0031192 (50 iterations in 9.890s) [t-SNE] Iteration 350: error = 0.6445629, gradient norm = 0.0022338 (50 iterations in 14.216s) [t-SNE] Iteration 400: error = 0.4238287, gradient norm = 0.0009567 (50 iterations in 14.236s) [t-SNE] Iteration 450: error = 0.3620680, gradient norm = 0.0007859 (50 iterations in 14.956s) [t-SNE] Iteration 500: error = 0.3196495, gradient norm = 0.0006645 (50 iterations in 14.900s) [t-SNE] Iteration 550: error = 0.3091334, gradient norm = 0.0002200 (50 iterations in 15.512s) [t-SNE] Iteration 600: error = 0.2961369, gradient norm = 0.0007518 (50 iterations in 15.960s) [t-SNE] Iteration 650: error = 0.2753569, gradient norm = 0.0001779 (50 iterations in 15.708s) [t-SNE] Iteration 700: error = 0.2689135, gradient norm = 0.0003097 (50 iterations in 15.828s) [t-SNE] Iteration 750: error = 0.2655150, gradient norm = 0.0003057 (50 iterations in 20.140s) [t-SNE] Iteration 800: error = 0.2644797, gradient norm = 0.0000230 (50 iterations in 19.128s) [t-SNE] Iteration 850: error = 0.2643201, gradient norm = 0.0000212 (50 iterations in 19.796s) [t-SNE] Iteration 900: 
error = 0.2643606, gradient norm = 0.0000238 (50 iterations in 19.060s) [t-SNE] Iteration 950: error = 0.2638767, gradient norm = 0.0000332 (50 iterations in 19.008s) [t-SNE] Iteration 1000: error = 0.2635875, gradient norm = 0.0000481 (50 iterations in 18.712s) [t-SNE] KL divergence after 1000 iterations: 0.263588 [t-SNE] Computed conditional probabilities for sample 62 / 62 [t-SNE] Mean sigma: 0.682498 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 1.0min
Quality: 0.4103450824446917
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 1.2min finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 53 out of 60 | elapsed: 0.1s remaining: 0.0s [Parallel(n_jobs=4)]: Done 60 out of 60 | elapsed: 0.1s finished
[t-SNE] Computing 59 nearest neighbors... [t-SNE] Indexed 60 samples in 0.001s... [t-SNE] Computed neighbors for 60 samples in 0.000s... [t-SNE] Computed conditional probabilities for sample 60 / 60 [t-SNE] Mean sigma: 0.568164 [t-SNE] Computed conditional probabilities in 0.004s [t-SNE] Iteration 50: error = 53.0205956, gradient norm = 0.5574142 (50 iterations in 18.698s) [t-SNE] Iteration 100: error = 53.0163498, gradient norm = 0.4756356 (50 iterations in 19.084s) [t-SNE] Iteration 150: error = 54.7902565, gradient norm = 0.5005938 (50 iterations in 19.300s) [t-SNE] Iteration 200: error = 50.8187675, gradient norm = 0.5177072 (50 iterations in 19.260s) [t-SNE] Iteration 250: error = 52.9908829, gradient norm = 0.5210254 (50 iterations in 19.384s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 52.990883 [t-SNE] Iteration 300: error = 1.1394675, gradient norm = 0.0017114 (50 iterations in 21.936s) [t-SNE] Iteration 350: error = 0.8204098, gradient norm = 0.0024922 (50 iterations in 22.054s) [t-SNE] Iteration 400: error = 0.5983975, gradient norm = 0.0011923 (50 iterations in 18.644s) [t-SNE] Iteration 450: error = 0.4886873, gradient norm = 0.0010280 (50 iterations in 15.469s) [t-SNE] Iteration 500: error = 0.4238561, gradient norm = 0.0011610 (50 iterations in 15.207s) [t-SNE] Iteration 550: error = 0.4122815, gradient norm = 0.0001714 (50 iterations in 15.340s) [t-SNE] Iteration 600: error = 0.3999688, gradient norm = 0.0003460 (50 iterations in 15.416s) [t-SNE] Iteration 650: error = 0.3921665, gradient norm = 0.0003686 (50 iterations in 16.712s) [t-SNE] Iteration 700: error = 0.3716898, gradient norm = 0.0001536 (50 iterations in 18.912s) [t-SNE] Iteration 750: error = 0.3674146, gradient norm = 0.0001224 (50 iterations in 18.356s) [t-SNE] Iteration 800: error = 0.3653939, gradient norm = 0.0000924 (50 iterations in 18.076s) [t-SNE] Iteration 850: error = 0.3615977, gradient norm = 0.0003988 (50 iterations in 17.964s) [t-SNE] Iteration 
900: error = 0.3530733, gradient norm = 0.0005726 (50 iterations in 17.923s) [t-SNE] Iteration 950: error = 0.3490829, gradient norm = 0.0001084 (50 iterations in 18.597s) [t-SNE] Iteration 1000: error = 0.3476773, gradient norm = 0.0001975 (50 iterations in 17.532s) [t-SNE] KL divergence after 1000 iterations: 0.347677 [t-SNE] Computed conditional probabilities for sample 60 / 60 [t-SNE] Mean sigma: 0.568164 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 56.8s
Quality: 0.3033750520883274
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 1.1min finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 51 out of 51 | elapsed: 0.1s finished
[t-SNE] Computing 50 nearest neighbors... [t-SNE] Indexed 51 samples in 0.002s... [t-SNE] Computed neighbors for 51 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 51 / 51 [t-SNE] Mean sigma: 0.786748 [t-SNE] Computed conditional probabilities in 0.003s [t-SNE] Iteration 50: error = 51.8582687, gradient norm = 0.4985391 (50 iterations in 18.511s) [t-SNE] Iteration 100: error = 51.3348465, gradient norm = 0.4969922 (50 iterations in 15.356s) [t-SNE] Iteration 150: error = 49.8853951, gradient norm = 0.6072096 (50 iterations in 14.806s) [t-SNE] Iteration 200: error = 47.9743462, gradient norm = 0.5561130 (50 iterations in 15.006s) [t-SNE] Iteration 250: error = 48.9484825, gradient norm = 0.5173038 (50 iterations in 14.964s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 48.948483 [t-SNE] Iteration 300: error = 1.0792286, gradient norm = 0.0016627 (50 iterations in 15.299s) [t-SNE] Iteration 350: error = 0.8936318, gradient norm = 0.0007530 (50 iterations in 14.752s) [t-SNE] Iteration 400: error = 0.7380278, gradient norm = 0.0008055 (50 iterations in 14.644s) [t-SNE] Iteration 450: error = 0.6105389, gradient norm = 0.0007593 (50 iterations in 15.160s) [t-SNE] Iteration 500: error = 0.5443434, gradient norm = 0.0004585 (50 iterations in 13.804s) [t-SNE] Iteration 550: error = 0.4564231, gradient norm = 0.0003797 (50 iterations in 13.984s) [t-SNE] Iteration 600: error = 0.4177727, gradient norm = 0.0004121 (50 iterations in 13.603s) [t-SNE] Iteration 650: error = 0.3666120, gradient norm = 0.0003589 (50 iterations in 13.696s) [t-SNE] Iteration 700: error = 0.3525771, gradient norm = 0.0001826 (50 iterations in 14.824s) [t-SNE] Iteration 750: error = 0.3343476, gradient norm = 0.0001118 (50 iterations in 14.464s) [t-SNE] Iteration 800: error = 0.3306853, gradient norm = 0.0000756 (50 iterations in 14.868s) [t-SNE] Iteration 850: error = 0.3260135, gradient norm = 0.0000837 (50 iterations in 14.684s) [t-SNE] Iteration 
900: error = 0.3233107, gradient norm = 0.0001207 (50 iterations in 15.152s) [t-SNE] Iteration 950: error = 0.3202965, gradient norm = 0.0000590 (50 iterations in 14.592s) [t-SNE] Iteration 1000: error = 0.3173849, gradient norm = 0.0000758 (50 iterations in 14.468s) [t-SNE] KL divergence after 1000 iterations: 0.317385 [t-SNE] Computed conditional probabilities for sample 51 / 51 [t-SNE] Mean sigma: 0.786748 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 43.2s
Quality: 0.3137183348965489
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 51.4s finished
For each motif, determine the peaks that contain it
# Load the hypothetical CWMs (hCWMs) discovered by TF-MoDISco
hcwms = import_tfmodisco_motifs(tfm_results_path)
motif_keys = list(hcwms)

# Load the filtered/collapsed MOODS motif hits and map each hit to its peak
hits_bed_path = os.path.join(moods_dir, "moods_filtered_collapsed.bed")
hit_table = moods.import_moods_hits(hits_bed_path)
hit_peak_indices = get_hit_peak_indices(hit_table, motif_keys)

# Load the per-peak embeddings (large array; this can take awhile)
embeddings = np.load(embeddings_path)["embeddings"]

# Collapse the sequence axis so each peak is a single position-independent vector
summed_embeddings = embeddings.sum(axis=1)
For each motif, show the subclusters that exist within the TF-MoDISco-identified subpatterns
plot_motif_heterogeneity(tfm_obj)
/mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/TF-Atlas/3M/reports/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 13036 | ||
| 0 | 1974 | ||
| 1 | 1542 | ||
| 2 | 1511 | ||
| 3 | 1484 | ||
| 4 | 1265 | ||
| 5 | 1253 | ||
| 6 | 1177 | ||
| 7 | 793 | ||
| 8 | 767 | ||
| 9 | 723 | ||
| 10 | 410 | ||
| 11 | 127 | ||
| 12 | 10 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 1233 | ||
| 0 | 274 | ||
| 1 | 190 | ||
| 2 | 175 | ||
| 3 | 163 | ||
| 4 | 130 | ||
| 5 | 115 | ||
| 6 | 81 | ||
| 7 | 70 | ||
| 8 | 35 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 868 | ||
| 0 | 234 | ||
| 1 | 185 | ||
| 2 | 162 | ||
| 3 | 131 | ||
| 4 | 68 | ||
| 5 | 49 | ||
| 6 | 21 | ||
| 7 | 7 | ||
| 8 | 7 | ||
| 9 | 4 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 287 | ||
| 0 | 64 | ||
| 1 | 63 | ||
| 2 | 51 | ||
| 4 | 49 | ||
| 3 | 49 | ||
| 5 | 11 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 203 | ||
| 0 | 62 | ||
| 1 | 60 | ||
| 2 | 43 | ||
| 3 | 33 | ||
| 4 | 3 | ||
| 5 | 2 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 198 | ||
| 0 | 43 | ||
| 1 | 31 | ||
| 2 | 27 | ||
| 3 | 26 | ||
| 4 | 24 | ||
| 5 | 15 | ||
| 6 | 11 | ||
| 7 | 10 | ||
| 8 | 6 | ||
| 9 | 5 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 62 | ||
| 0 | 27 | ||
| 1 | 22 | ||
| 2 | 9 | ||
| 3 | 4 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 60 | ||
| 0 | 28 | ||
| 1 | 15 | ||
| 2 | 10 | ||
| 3 | 7 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 51 | ||
| 0 | 18 | ||
| 1 | 12 | ||
| 2 | 11 | ||
| 3 | 10 |
Cluster the peaks by their embeddings to highlight the structure of different peaks and different motifs
plot_peak_clustering(summed_embeddings, motif_keys, hcwms, hit_peak_indices)
| Motif key | Embeddings | hCWM |
|---|---|---|
| 0_0 | ||
| 0_1 | ||
| 0_2 | ||
| 0_3 | ||
| 0_4 | ||
| 0_5 | ||
| 0_6 | ||
| 0_7 | ||
| 0_8 |