import os
import h5py
import util
import moods
import viz_sequence
import numpy as np
import pandas as pd
import modisco
import sklearn.decomposition
import umap
import matplotlib.pyplot as plt
import matplotlib.font_manager as font_manager
import vdom.helpers as vdomh
from IPython.display import display
import tqdm
# `tqdm.tqdm_notebook` is deprecated and removed in tqdm 5.0 (see the
# TqdmDeprecationWarning this cell emits); use `tqdm.notebook.tqdm` instead.
from tqdm.notebook import tqdm as notebook_tqdm
notebook_tqdm()
/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:16: TqdmDeprecationWarning: This function will be removed in tqdm==5.0.0 Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook` app.launch_new_instance()
<tqdm.notebook.tqdm_notebook at 0x7fd85bf3b8d0>
# Plotting defaults
# `font_manager.createFontList` was deprecated in Matplotlib 3.2 (see the
# MatplotlibDeprecationWarning this cell emits); register each font file
# individually via `FontManager.addfont` instead.
for font_path in font_manager.findSystemFonts(fontpaths="/users/amtseng/modules/fonts"):
    font_manager.fontManager.addfont(font_path)
plot_params = {
    "figure.titlesize": 22,
    "axes.titlesize": 22,
    "axes.labelsize": 20,
    "legend.fontsize": 18,
    "xtick.labelsize": 16,
    "ytick.labelsize": 16,
    "font.family": "Roboto",
    "font.weight": "bold"
}
plt.rcParams.update(plot_params)
/users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/ipykernel_launcher.py:4: MatplotlibDeprecationWarning: The createFontList function was deprecated in Matplotlib 3.2 and will be removed two minor releases later. Use FontManager.addfont instead. after removing the cwd from sys.path.
# Define parameters/fetch arguments (all paths come from environment variables)
shap_scores_path = os.environ["TFM_SHAP_PATH"]
tfm_results_path = os.environ["TFM_TFM_PATH"]
moods_dir = os.environ["TFM_MOODS_DIR"]
embeddings_path = os.environ["TFM_EMB_PATH"]
print(f"DeepSHAP scores path: {shap_scores_path}")
print(f"TF-MoDISco results path: {tfm_results_path}")
print(f"Embeddings path: {embeddings_path}")
print(f"MOODS directory: {moods_dir}")
DeepSHAP scores path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//shap/ENCSR000EEB/profile_scores_alex_format.h5 TF-MoDISco results path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//modisco/ENCSR000EEB/profile/modisco_results.hd5 Embeddings path: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021//embeddings/ENCSR000EEB/embeddings.npz MOODS directory: /mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/reports/tfmodisco/notebooks/ENCSR000EEB/moods/profile
# Define constants
# Length (bp) of the centered window cut out of the SHAP scores/sequences,
# matching how TF-MoDISco saw them (used below in `import_shap_scores` and
# `import_tfmodisco_results`)
shap_score_center_size = 400
# HDF5 dataset key under which the hypothetical importance scores are stored
hyp_score_key = "hyp_scores"
# NOTE(review): unused in this chunk — presumably None means "aggregate over
# all tasks"; confirm against downstream cells
task_index = None
Helper functions for plotting and organizing the results
def compute_tfmodisco_motif_subclusters(tfm_results):
    """
    From an imported TF-MoDISco results object, computes the subclustering
    of heterogeneity within each motif/pattern.
    Arguments:
        `tfm_results`: imported TF-MoDISco results object
    Subclusters/embeddings are computed in place on each pattern.
    """
    metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
    for metacluster_key in metaclusters.keys():
        metacluster = metaclusters[metacluster_key]
        patterns = metacluster.seqlets_to_patterns_result.patterns
        if not patterns:
            # Skip metaclusters without patterns; `continue` rather than
            # `break` so the remaining metaclusters are still processed,
            # consistent with `import_tfmodisco_motifs`
            continue
        for pattern in patterns:
            # Compute subclustering for each pattern (motif)
            pattern.compute_subclusters_and_embedding(
                pattern_comparison_settings=modisco.affinitymat.core.PatternComparisonSettings(
                    track_names=["task0_hypothetical_contribs", "task0_contrib_scores"],
                    track_transformer=modisco.affinitymat.L1Normalizer(),
                    min_overlap=None  # This argument is irrelevant here
                ),
                perplexity=30, n_jobs=4, verbose=True
            )
def trim_hcwm(pfm, hcwm):
    """
    Trims the hCWM based on the information content of the matching PFM:
    flank positions whose IC is below 0.2 are cut off, and the kept
    interval is then expanded by 4 bp on each side (clipped to the motif
    bounds).
    """
    ic = util.info_content(pfm)
    keep = np.where(ic >= 0.2)[0]  # Cut off flanks with less than 0.2 IC
    # Expand trimming to +/- 4bp on either side
    left = max(0, int(np.min(keep)) - 4)
    right = min(len(pfm), int(np.max(keep)) + 5)
    return hcwm[left:right]
def plot_motif_heterogeneity(tfm_results):
    """
    For each TF-MoDISco pattern (motif), displays an HTML table showing the
    aggregate pattern and each of its subpatterns: number of supporting
    seqlets, the pattern's 2D seqlet embedding (subcluster highlighted), and
    the trimmed hCWM.
    Arguments:
        `tfm_results`: TF-MoDISco results object whose patterns already have
            subclusters/embeddings computed (see
            `compute_tfmodisco_motif_subclusters`)
    """
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "50%"}),
        vdomh.col(style={"width": "40%"})
    )
    header = vdomh.thead(
        vdomh.tr(
            vdomh.th("Subpattern", style={"text-align": "center"}),
            vdomh.th("Seqlets", style={"text-align": "center"}),
            vdomh.th("Embeddings", style={"text-align": "center"}),
            vdomh.th("hCWM", style={"text-align": "center"})
        )
    )
    metaclusters = tfm_results.metacluster_idx_to_submetacluster_results
    num_metaclusters = len(metaclusters.keys())
    for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
        metacluster = metaclusters[metacluster_key]
        display(vdomh.h3("Metacluster %d/%d" % (metacluster_i + 1, num_metaclusters)))
        patterns = metacluster.seqlets_to_patterns_result.patterns
        if not patterns:
            # `continue` rather than `break`: an empty metacluster should not
            # cut off the remaining metaclusters (consistent with
            # `import_tfmodisco_motifs`)
            continue
        num_patterns = len(patterns)
        for pattern_i, pattern in enumerate(patterns):
            display(vdomh.h4("Pattern %d/%d" % (pattern_i + 1, num_patterns)))
            embedding = pattern.twod_embedding
            subpattern_clusters = pattern.subclusters
            # Aggregate motif
            pfm = pattern["sequence"].fwd
            hcwm = pattern["task0_hypothetical_contribs"].fwd
            trimmed_hcwm = trim_hcwm(pfm, hcwm)
            hcwm_fig = viz_sequence.plot_weights(
                trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
            )
            emb_fig, ax = plt.subplots()
            ax.scatter(
                embedding[:,0], embedding[:,1], c=subpattern_clusters, cmap="tab20", alpha=0.3
            )
            table_rows = [vdomh.tr(
                vdomh.td("Agg."),
                vdomh.td(str(len(pattern.seqlets))),
                vdomh.td(util.figure_to_vdom_image(emb_fig)),
                vdomh.td(util.figure_to_vdom_image(hcwm_fig))
            )]
            for subpattern_key, subpattern in pattern.subcluster_to_subpattern.items():
                pfm = subpattern["sequence"].fwd
                hcwm = subpattern["task0_hypothetical_contribs"].fwd
                trimmed_hcwm = trim_hcwm(pfm, hcwm)
                hcwm_fig = viz_sequence.plot_weights(
                    trimmed_hcwm, subticks_frequency=(len(trimmed_hcwm) + 1), return_fig=True
                )
                emb_fig, ax = plt.subplots()
                # Highlight only the seqlets that belong to this subcluster
                ax.scatter(
                    embedding[:,0], embedding[:,1], c=(subpattern_clusters == subpattern_key), alpha=0.3
                )
                table_rows.append(vdomh.tr(
                    vdomh.td(str(subpattern_key)),
                    vdomh.td(str(len(subpattern.seqlets))),
                    vdomh.td(util.figure_to_vdom_image(emb_fig)),
                    vdomh.td(util.figure_to_vdom_image(hcwm_fig))
                ))
            # Pass `colgroup` into the table so the column widths actually
            # apply (it was previously built but never used)
            table = vdomh.table(colgroup, header, vdomh.tbody(*table_rows))
            display(table)
    plt.close("all")  # Remove all standing figures
def import_tfmodisco_motifs(tfm_results_path, trim=True, only_pos=True):
    """
    Imports hCWMs into a dictionary, mapping the string `"x_y"` to the hCWM,
    where `x` is the metacluster index and `y` is the pattern index.
    Arguments:
        `tfm_results_path`: path to HDF5 containing TF-MoDISco results
        `trim`: if True, trim the motif flanks based on information content
            (via `trim_hcwm`)
        `only_pos`: if True, only return motifs with positive contributions
    Returns the dictionary of hCWMs.
    """
    hcwms = {}
    with h5py.File(tfm_results_path, "r") as f:
        metaclusters = f["metacluster_idx_to_submetacluster_results"]
        num_metaclusters = len(metaclusters.keys())
        for metacluster_i, metacluster_key in enumerate(metaclusters.keys()):
            metacluster = metaclusters[metacluster_key]
            # Skip metaclusters that yielded no patterns
            if "patterns" not in metacluster["seqlets_to_patterns_result"]:
                continue
            patterns = metacluster["seqlets_to_patterns_result"]["patterns"]
            num_patterns = len(patterns["all_pattern_names"][:])
            for pattern_i, pattern_name in enumerate(patterns["all_pattern_names"][:]):
                pattern_name = pattern_name.decode()
                pattern = patterns[pattern_name]
                pfm = pattern["sequence"]["fwd"][:]
                hcwm = pattern["task0_hypothetical_contribs"]["fwd"][:]
                cwm = pattern["task0_contrib_scores"]["fwd"][:]
                # Check that the contribution scores are overall positive
                if only_pos and np.sum(cwm) < 0:
                    continue
                if trim:
                    hcwm = trim_hcwm(pfm, hcwm)
                hcwms["%d_%d" % (metacluster_i,pattern_i)] = hcwm
    return hcwms
def get_hit_peak_indices(hit_table, motif_keys):
    """
    Returns a dictionary of NumPy arrays, mapping each motif key to
    the set of peak indices that contain that motif.
    Arguments:
        `hit_table`: table of motif hits with (at least) "key" and
            "peak_index" columns
        `motif_keys`: iterable of motif keys to look up
    """
    return {
        key: hit_table.loc[hit_table["key"] == key, "peak_index"].values
        for key in motif_keys
    }
def plot_peak_clustering(embeddings, motif_keys, hcwms, hit_peak_indices):
    """
    Displays an HTML table showing, for each motif, a UMAP scatter plot of
    the peak embeddings (peaks containing the motif highlighted) next to the
    motif's hCWM.
    Arguments:
        `embeddings`: N x D array of peak embeddings
        `motif_keys`: list of motif keys to show
        `hcwms`: dictionary mapping motif key to (trimmed) hCWM
        `hit_peak_indices`: dictionary mapping motif key to the array of peak
            indices containing that motif (see `get_hit_peak_indices`)
    """
    # First reduce using PCA
    centered = embeddings - np.mean(embeddings, axis=0, keepdims=True)
    pca = sklearn.decomposition.PCA(n_components=20)
    reduced = pca.fit_transform(centered)
    # Run UMAP on the PCA-reduced embeddings; previously `centered` was
    # passed here, leaving `reduced` unused and the PCA step a no-op
    um = umap.UMAP(verbose=False)
    trans = um.fit_transform(reduced)
    colgroup = vdomh.colgroup(
        vdomh.col(style={"width": "5%"}),
        vdomh.col(style={"width": "55%"}),  # was "55" — missing "%"
        vdomh.col(style={"width": "40%"})
    )
    header = vdomh.thead(
        vdomh.tr(
            vdomh.th("Motif key", style={"text-align": "center"}),
            vdomh.th("Embeddings", style={"text-align": "center"}),
            vdomh.th("hCWM", style={"text-align": "center"})
        )
    )
    table_rows = []
    for motif_key in motif_keys:
        hcwm = hcwms[motif_key]
        hcwm_fig = viz_sequence.plot_weights(
            hcwm, subticks_frequency=(len(hcwm) + 1), return_fig=True
        )
        emb_fig, ax = plt.subplots()
        # Binary mask: 1 for peaks that contain this motif, 0 otherwise
        subset = np.zeros(len(embeddings), dtype=int)
        subset[hit_peak_indices[motif_key]] = 1
        ax.scatter(
            trans[:,0], trans[:,1], c=subset, alpha=0.3
        )
        table_rows.append(vdomh.tr(
            vdomh.td(motif_key),
            vdomh.td(util.figure_to_vdom_image(emb_fig)),
            vdomh.td(util.figure_to_vdom_image(hcwm_fig))
        ))
    # Pass `colgroup` into the table so the column widths actually apply
    # (it was previously built but never used)
    table = vdomh.table(colgroup, header, vdomh.tbody(*table_rows))
    display(table)
    plt.close("all")  # Remove all standing figures
Run motif subclustering
# Import SHAP coordinates and one-hot sequences.
# This cuts the sequences/scores off just as how TF-MoDISco saw them, but the
# coordinates are uncut
hyp_scores, _, one_hot_seqs, shap_coords = util.import_shap_scores(
    shap_scores_path, hyp_score_key,
    center_cut_size=shap_score_center_size, remove_non_acgt=False
)
Importing SHAP scores: 0%| | 0/73 [00:00<?, ?it/s] Importing SHAP scores: 1%|▏ | 1/73 [00:00<00:20, 3.53it/s] Importing SHAP scores: 3%|▎ | 2/73 [00:00<00:27, 2.57it/s] Importing SHAP scores: 4%|▍ | 3/73 [00:01<00:28, 2.43it/s] Importing SHAP scores: 5%|▌ | 4/73 [00:01<00:29, 2.35it/s] Importing SHAP scores: 7%|▋ | 5/73 [00:02<00:29, 2.32it/s] Importing SHAP scores: 8%|▊ | 6/73 [00:02<00:29, 2.28it/s] Importing SHAP scores: 10%|▉ | 7/73 [00:03<00:29, 2.23it/s] Importing SHAP scores: 11%|█ | 8/73 [00:03<00:29, 2.19it/s] Importing SHAP scores: 12%|█▏ | 9/73 [00:03<00:25, 2.55it/s] Importing SHAP scores: 14%|█▎ | 10/73 [00:04<00:25, 2.43it/s] Importing SHAP scores: 15%|█▌ | 11/73 [00:04<00:25, 2.39it/s] Importing SHAP scores: 16%|█▋ | 12/73 [00:05<00:25, 2.37it/s] Importing SHAP scores: 18%|█▊ | 13/73 [00:05<00:25, 2.32it/s] Importing SHAP scores: 19%|█▉ | 14/73 [00:05<00:25, 2.27it/s] Importing SHAP scores: 21%|██ | 15/73 [00:06<00:25, 2.30it/s] Importing SHAP scores: 22%|██▏ | 16/73 [00:06<00:24, 2.31it/s] Importing SHAP scores: 23%|██▎ | 17/73 [00:07<00:24, 2.31it/s] Importing SHAP scores: 25%|██▍ | 18/73 [00:07<00:21, 2.60it/s] Importing SHAP scores: 26%|██▌ | 19/73 [00:07<00:21, 2.51it/s] Importing SHAP scores: 27%|██▋ | 20/73 [00:08<00:21, 2.42it/s] Importing SHAP scores: 29%|██▉ | 21/73 [00:08<00:23, 2.19it/s] Importing SHAP scores: 30%|███ | 22/73 [00:09<00:26, 1.94it/s] Importing SHAP scores: 32%|███▏ | 23/73 [00:10<00:27, 1.79it/s] Importing SHAP scores: 33%|███▎ | 24/73 [00:10<00:29, 1.64it/s] Importing SHAP scores: 34%|███▍ | 25/73 [00:11<00:23, 2.02it/s] Importing SHAP scores: 36%|███▌ | 26/73 [00:11<00:18, 2.49it/s] Importing SHAP scores: 37%|███▋ | 27/73 [00:11<00:18, 2.55it/s] Importing SHAP scores: 38%|███▊ | 28/73 [00:12<00:19, 2.28it/s] Importing SHAP scores: 40%|███▉ | 29/73 [00:12<00:19, 2.28it/s] Importing SHAP scores: 41%|████ | 30/73 [00:13<00:18, 2.27it/s] Importing SHAP scores: 42%|████▏ | 31/73 [00:13<00:18, 2.28it/s] Importing SHAP scores: 
44%|████▍ | 32/73 [00:14<00:18, 2.19it/s] Importing SHAP scores: 45%|████▌ | 33/73 [00:14<00:20, 1.95it/s] Importing SHAP scores: 47%|████▋ | 34/73 [00:15<00:21, 1.80it/s] Importing SHAP scores: 48%|████▊ | 35/73 [00:15<00:19, 1.95it/s] Importing SHAP scores: 49%|████▉ | 36/73 [00:16<00:20, 1.79it/s] Importing SHAP scores: 51%|█████ | 37/73 [00:17<00:19, 1.83it/s] Importing SHAP scores: 52%|█████▏ | 38/73 [00:17<00:18, 1.93it/s] Importing SHAP scores: 53%|█████▎ | 39/73 [00:17<00:16, 2.00it/s] Importing SHAP scores: 55%|█████▍ | 40/73 [00:18<00:16, 2.06it/s] Importing SHAP scores: 56%|█████▌ | 41/73 [00:18<00:15, 2.10it/s] Importing SHAP scores: 58%|█████▊ | 42/73 [00:19<00:14, 2.08it/s] Importing SHAP scores: 59%|█████▉ | 43/73 [00:19<00:14, 2.02it/s] Importing SHAP scores: 60%|██████ | 44/73 [00:19<00:11, 2.58it/s] Importing SHAP scores: 62%|██████▏ | 45/73 [00:20<00:09, 2.95it/s] Importing SHAP scores: 63%|██████▎ | 46/73 [00:20<00:08, 3.29it/s] Importing SHAP scores: 64%|██████▍ | 47/73 [00:20<00:07, 3.58it/s] Importing SHAP scores: 66%|██████▌ | 48/73 [00:21<00:07, 3.19it/s] Importing SHAP scores: 67%|██████▋ | 49/73 [00:21<00:08, 2.79it/s] Importing SHAP scores: 68%|██████▊ | 50/73 [00:22<00:09, 2.31it/s] Importing SHAP scores: 70%|██████▉ | 51/73 [00:22<00:09, 2.28it/s] Importing SHAP scores: 71%|███████ | 52/73 [00:22<00:08, 2.56it/s] Importing SHAP scores: 73%|███████▎ | 53/73 [00:23<00:09, 2.13it/s] Importing SHAP scores: 74%|███████▍ | 54/73 [00:24<00:09, 2.00it/s] Importing SHAP scores: 75%|███████▌ | 55/73 [00:24<00:08, 2.08it/s] Importing SHAP scores: 77%|███████▋ | 56/73 [00:24<00:07, 2.39it/s] Importing SHAP scores: 78%|███████▊ | 57/73 [00:25<00:05, 2.76it/s] Importing SHAP scores: 79%|███████▉ | 58/73 [00:25<00:04, 3.05it/s] Importing SHAP scores: 81%|████████ | 59/73 [00:25<00:05, 2.69it/s] Importing SHAP scores: 82%|████████▏ | 60/73 [00:26<00:04, 2.61it/s] Importing SHAP scores: 84%|████████▎ | 61/73 [00:26<00:03, 3.22it/s] Importing SHAP 
scores: 85%|████████▍ | 62/73 [00:26<00:03, 3.51it/s] Importing SHAP scores: 86%|████████▋ | 63/73 [00:26<00:03, 3.10it/s] Importing SHAP scores: 88%|████████▊ | 64/73 [00:27<00:03, 2.77it/s] Importing SHAP scores: 89%|████████▉ | 65/73 [00:27<00:03, 2.57it/s] Importing SHAP scores: 90%|█████████ | 66/73 [00:28<00:03, 2.17it/s] Importing SHAP scores: 92%|█████████▏| 67/73 [00:28<00:02, 2.23it/s] Importing SHAP scores: 93%|█████████▎| 68/73 [00:29<00:02, 2.35it/s] Importing SHAP scores: 95%|█████████▍| 69/73 [00:29<00:01, 2.41it/s] Importing SHAP scores: 96%|█████████▌| 70/73 [00:29<00:01, 2.67it/s] Importing SHAP scores: 97%|█████████▋| 71/73 [00:30<00:00, 2.50it/s] Importing SHAP scores: 99%|█████████▊| 72/73 [00:30<00:00, 2.41it/s] Importing SHAP scores: 100%|██████████| 73/73 [00:31<00:00, 2.35it/s]
# Import the TF-MoDISco results object, rebuilding it from the saved HDF5
# together with the hypothetical scores and one-hot sequences loaded above
tfm_obj = util.import_tfmodisco_results(tfm_results_path, hyp_scores, one_hot_seqs, shap_score_center_size)
# Compute subclusters (needed for older versions of TF-MoDISco); this takes awhile!
compute_tfmodisco_motif_subclusters(tfm_obj)
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 13.0s [Parallel(n_jobs=4)]: Done 192 tasks | elapsed: 32.4s [Parallel(n_jobs=4)]: Done 442 tasks | elapsed: 1.1min [Parallel(n_jobs=4)]: Done 792 tasks | elapsed: 1.6min [Parallel(n_jobs=4)]: Done 1242 tasks | elapsed: 2.1min [Parallel(n_jobs=4)]: Done 1792 tasks | elapsed: 2.9min [Parallel(n_jobs=4)]: Done 2442 tasks | elapsed: 4.2min [Parallel(n_jobs=4)]: Done 3192 tasks | elapsed: 5.8min [Parallel(n_jobs=4)]: Done 4042 tasks | elapsed: 7.6min [Parallel(n_jobs=4)]: Done 4992 tasks | elapsed: 9.8min [Parallel(n_jobs=4)]: Done 6042 tasks | elapsed: 12.0min [Parallel(n_jobs=4)]: Done 7192 tasks | elapsed: 13.4min [Parallel(n_jobs=4)]: Done 8442 tasks | elapsed: 16.3min [Parallel(n_jobs=4)]: Done 9792 tasks | elapsed: 19.3min [Parallel(n_jobs=4)]: Done 11242 tasks | elapsed: 22.9min [Parallel(n_jobs=4)]: Done 12454 out of 12454 | elapsed: 25.0min finished /users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/sklearn/manifold/_t_sne.py:699: FutureWarning: 'square_distances' has been introduced in 0.24 to help phase out legacy squaring behavior. The 'legacy' setting will be removed in 1.1 (renaming of 0.26), and the default setting will be changed to True. In 1.3, 'square_distances' will be removed altogether, and distances will be squared by default. Set 'square_distances'=True to silence this warning. FutureWarning /users/vir/miniconda2/envs/basepairmodels_latest/lib/python3.7/site-packages/sklearn/neighbors/_base.py:176: EfficiencyWarning: Precomputed sparse input was not sorted by data. EfficiencyWarning)
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 12454 samples in 0.132s... [t-SNE] Computed neighbors for 12454 samples in 0.007s... [t-SNE] Computed conditional probabilities for sample 1000 / 12454 [t-SNE] Computed conditional probabilities for sample 2000 / 12454 [t-SNE] Computed conditional probabilities for sample 3000 / 12454 [t-SNE] Computed conditional probabilities for sample 4000 / 12454 [t-SNE] Computed conditional probabilities for sample 5000 / 12454 [t-SNE] Computed conditional probabilities for sample 6000 / 12454 [t-SNE] Computed conditional probabilities for sample 7000 / 12454 [t-SNE] Computed conditional probabilities for sample 8000 / 12454 [t-SNE] Computed conditional probabilities for sample 9000 / 12454 [t-SNE] Computed conditional probabilities for sample 10000 / 12454 [t-SNE] Computed conditional probabilities for sample 11000 / 12454 [t-SNE] Computed conditional probabilities for sample 12000 / 12454 [t-SNE] Computed conditional probabilities for sample 12454 / 12454 [t-SNE] Mean sigma: 0.192659 [t-SNE] Computed conditional probabilities in 0.975s [t-SNE] Iteration 50: error = 98.4057007, gradient norm = 0.0000425 (50 iterations in 16.222s) [t-SNE] Iteration 100: error = 95.3506622, gradient norm = 0.0050386 (50 iterations in 17.225s) [t-SNE] Iteration 150: error = 93.9600220, gradient norm = 0.0000395 (50 iterations in 17.307s) [t-SNE] Iteration 200: error = 93.9615707, gradient norm = 0.0000191 (50 iterations in 21.352s) [t-SNE] Iteration 250: error = 93.9629669, gradient norm = 0.0000323 (50 iterations in 22.048s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 93.962967 [t-SNE] Iteration 300: error = 3.7650309, gradient norm = 0.0011678 (50 iterations in 21.366s) [t-SNE] Iteration 350: error = 3.3906898, gradient norm = 0.0005042 (50 iterations in 24.264s) [t-SNE] Iteration 400: error = 3.2218590, gradient norm = 0.0002994 (50 iterations in 25.544s) [t-SNE] Iteration 450: error = 3.1199949, gradient norm = 
0.0002063 (50 iterations in 25.724s) [t-SNE] Iteration 500: error = 3.0483043, gradient norm = 0.0001624 (50 iterations in 25.685s) [t-SNE] Iteration 550: error = 2.9950516, gradient norm = 0.0001293 (50 iterations in 25.483s) [t-SNE] Iteration 600: error = 2.9547381, gradient norm = 0.0001065 (50 iterations in 21.504s) [t-SNE] Iteration 650: error = 2.9226475, gradient norm = 0.0000910 (50 iterations in 22.224s) [t-SNE] Iteration 700: error = 2.8961208, gradient norm = 0.0000812 (50 iterations in 22.504s) [t-SNE] Iteration 750: error = 2.8742714, gradient norm = 0.0000710 (50 iterations in 24.648s) [t-SNE] Iteration 800: error = 2.8558888, gradient norm = 0.0000630 (50 iterations in 24.392s) [t-SNE] Iteration 850: error = 2.8403964, gradient norm = 0.0000570 (50 iterations in 24.772s) [t-SNE] Iteration 900: error = 2.8272529, gradient norm = 0.0000524 (50 iterations in 25.052s) [t-SNE] Iteration 950: error = 2.8160129, gradient norm = 0.0000488 (50 iterations in 25.864s) [t-SNE] Iteration 1000: error = 2.8064637, gradient norm = 0.0000461 (50 iterations in 25.124s) [t-SNE] KL divergence after 1000 iterations: 2.806464 [t-SNE] Computed conditional probabilities for sample 1000 / 12454 [t-SNE] Computed conditional probabilities for sample 2000 / 12454 [t-SNE] Computed conditional probabilities for sample 3000 / 12454 [t-SNE] Computed conditional probabilities for sample 4000 / 12454 [t-SNE] Computed conditional probabilities for sample 5000 / 12454 [t-SNE] Computed conditional probabilities for sample 6000 / 12454 [t-SNE] Computed conditional probabilities for sample 7000 / 12454 [t-SNE] Computed conditional probabilities for sample 8000 / 12454 [t-SNE] Computed conditional probabilities for sample 9000 / 12454 [t-SNE] Computed conditional probabilities for sample 10000 / 12454 [t-SNE] Computed conditional probabilities for sample 11000 / 12454 [t-SNE] Computed conditional probabilities for sample 12000 / 12454 [t-SNE] Computed conditional probabilities for sample 
12454 / 12454 [t-SNE] Mean sigma: 0.192659 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 8.0min
Quality: 0.6984924017729724 Quality: 0.7026973059942808 Quality: 0.70274419599841 Quality: 0.702860914742565
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 9.1min finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 133 tasks | elapsed: 0.4s [Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed: 0.5s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 160 samples in 0.002s... [t-SNE] Computed neighbors for 160 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 160 / 160 [t-SNE] Mean sigma: 0.211304 [t-SNE] Computed conditional probabilities in 0.024s [t-SNE] Iteration 50: error = 72.2579041, gradient norm = 0.4505137 (50 iterations in 15.330s) [t-SNE] Iteration 100: error = 67.6835632, gradient norm = 0.5271367 (50 iterations in 15.388s) [t-SNE] Iteration 150: error = 67.4411545, gradient norm = 0.4917274 (50 iterations in 15.276s) [t-SNE] Iteration 200: error = 68.3901062, gradient norm = 0.4735321 (50 iterations in 15.160s) [t-SNE] Iteration 250: error = 71.1792908, gradient norm = 0.4295583 (50 iterations in 15.324s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 71.179291 [t-SNE] Iteration 300: error = 1.4539893, gradient norm = 0.0054988 (50 iterations in 14.887s) [t-SNE] Iteration 350: error = 1.0845932, gradient norm = 0.0113418 (50 iterations in 14.764s) [t-SNE] Iteration 400: error = 0.9816566, gradient norm = 0.0014120 (50 iterations in 13.976s) [t-SNE] Iteration 450: error = 0.9278284, gradient norm = 0.0022429 (50 iterations in 13.192s) [t-SNE] Iteration 500: error = 0.8398767, gradient norm = 0.0045046 (50 iterations in 13.531s) [t-SNE] Iteration 550: error = 0.7970717, gradient norm = 0.0018866 (50 iterations in 16.449s) [t-SNE] Iteration 600: error = 0.7914544, gradient norm = 0.0006061 (50 iterations in 17.456s) [t-SNE] Iteration 650: error = 0.7918545, gradient norm = 0.0003448 (50 iterations in 16.952s) [t-SNE] Iteration 700: error = 0.7921503, gradient norm = 0.0003697 (50 iterations in 15.128s) [t-SNE] Iteration 750: error = 0.7920761, gradient norm = 0.0002425 (50 iterations in 15.100s) [t-SNE] Iteration 800: error = 0.7922307, gradient norm = 0.0003629 (50 iterations in 14.628s) [t-SNE] Iteration 850: error = 0.7922254, gradient norm = 0.0003154 (50 iterations in 16.956s) [t-SNE] Iteration 
900: error = 0.7920581, gradient norm = 0.0003651 (50 iterations in 18.128s) [t-SNE] Iteration 950: error = 0.7919893, gradient norm = 0.0003090 (50 iterations in 17.844s) [t-SNE] Iteration 950: did not make any progress during the last 300 episodes. Finished. [t-SNE] KL divergence after 950 iterations: 0.791989 [t-SNE] Computed conditional probabilities for sample 160 / 160 [t-SNE] Mean sigma: 0.211304 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 44.4s
Quality: 0.35012834024737083 Quality: 0.35055138318432294 Quality: 0.35075983174534
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 51.8s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 130 tasks | elapsed: 0.3s [Parallel(n_jobs=4)]: Done 140 out of 140 | elapsed: 0.4s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 140 samples in 0.001s... [t-SNE] Computed neighbors for 140 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 140 / 140 [t-SNE] Mean sigma: 0.314436 [t-SNE] Computed conditional probabilities in 0.009s [t-SNE] Iteration 50: error = 60.8567619, gradient norm = 0.4926721 (50 iterations in 18.149s) [t-SNE] Iteration 100: error = 60.6676903, gradient norm = 0.5301607 (50 iterations in 17.852s) [t-SNE] Iteration 150: error = 65.4259109, gradient norm = 0.4512067 (50 iterations in 17.712s) [t-SNE] Iteration 200: error = 59.3359184, gradient norm = 0.5368801 (50 iterations in 18.200s) [t-SNE] Iteration 250: error = 65.1187057, gradient norm = 0.4593717 (50 iterations in 17.752s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 65.118706 [t-SNE] Iteration 300: error = 1.1024154, gradient norm = 0.0087586 (50 iterations in 17.740s) [t-SNE] Iteration 350: error = 0.7071413, gradient norm = 0.0080944 (50 iterations in 14.888s) [t-SNE] Iteration 400: error = 0.6101210, gradient norm = 0.0023245 (50 iterations in 13.328s) [t-SNE] Iteration 450: error = 0.5318359, gradient norm = 0.0034178 (50 iterations in 12.608s) [t-SNE] Iteration 500: error = 0.4983945, gradient norm = 0.0044925 (50 iterations in 13.440s) [t-SNE] Iteration 550: error = 0.4919178, gradient norm = 0.0034114 (50 iterations in 14.756s) [t-SNE] Iteration 600: error = 0.4724301, gradient norm = 0.0004450 (50 iterations in 15.564s) [t-SNE] Iteration 650: error = 0.4726352, gradient norm = 0.0002596 (50 iterations in 17.114s) [t-SNE] Iteration 700: error = 0.4727548, gradient norm = 0.0003540 (50 iterations in 16.815s) [t-SNE] Iteration 750: error = 0.4713666, gradient norm = 0.0027187 (50 iterations in 15.491s) [t-SNE] Iteration 800: error = 0.4705746, gradient norm = 0.0004672 (50 iterations in 14.804s) [t-SNE] Iteration 850: error = 0.4709438, gradient norm = 0.0002856 (50 iterations in 15.396s) [t-SNE] Iteration 
900: error = 0.4709048, gradient norm = 0.0002705 (50 iterations in 15.184s) [t-SNE] Iteration 950: error = 0.4708591, gradient norm = 0.0003570 (50 iterations in 18.048s) [t-SNE] Iteration 1000: error = 0.4709727, gradient norm = 0.0003158 (50 iterations in 17.864s) [t-SNE] KL divergence after 1000 iterations: 0.470973 [t-SNE] Computed conditional probabilities for sample 140 / 140 [t-SNE] Mean sigma: 0.314436 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 56.2s
Quality: 0.4146750979282893
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 1.1min finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 80 out of 80 | elapsed: 0.2s finished
[t-SNE] Computing 79 nearest neighbors... [t-SNE] Indexed 80 samples in 0.002s... [t-SNE] Computed neighbors for 80 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 80 / 80 [t-SNE] Mean sigma: 0.495573 [t-SNE] Computed conditional probabilities in 0.005s [t-SNE] Iteration 50: error = 56.3142166, gradient norm = 0.5011952 (50 iterations in 17.414s) [t-SNE] Iteration 100: error = 60.2771454, gradient norm = 0.4753757 (50 iterations in 16.984s) [t-SNE] Iteration 150: error = 58.2286110, gradient norm = 0.4923075 (50 iterations in 17.280s) [t-SNE] Iteration 200: error = 58.0282326, gradient norm = 0.5235721 (50 iterations in 17.996s) [t-SNE] Iteration 250: error = 56.1997643, gradient norm = 0.4784734 (50 iterations in 15.952s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 56.199764 [t-SNE] Iteration 300: error = 1.0296416, gradient norm = 0.0053011 (50 iterations in 14.844s) [t-SNE] Iteration 350: error = 0.7854857, gradient norm = 0.0025852 (50 iterations in 16.624s) [t-SNE] Iteration 400: error = 0.8439091, gradient norm = 0.0014218 (50 iterations in 14.760s) [t-SNE] Iteration 450: error = 0.8066438, gradient norm = 0.0005166 (50 iterations in 15.048s) [t-SNE] Iteration 500: error = 0.7500018, gradient norm = 0.0007506 (50 iterations in 14.756s) [t-SNE] Iteration 550: error = 0.8244845, gradient norm = 0.0004915 (50 iterations in 15.000s) [t-SNE] Iteration 600: error = 0.7935385, gradient norm = 0.0003612 (50 iterations in 15.213s) [t-SNE] Iteration 650: error = 0.7720208, gradient norm = 0.0002121 (50 iterations in 16.731s) [t-SNE] Iteration 700: error = 0.7560806, gradient norm = 0.0001355 (50 iterations in 21.608s) [t-SNE] Iteration 750: error = 0.7304705, gradient norm = 0.0001414 (50 iterations in 20.716s) [t-SNE] Iteration 800: error = 0.7021521, gradient norm = 0.0001298 (50 iterations in 15.076s) [t-SNE] Iteration 850: error = 0.6829772, gradient norm = 0.0000957 (50 iterations in 14.396s) [t-SNE] Iteration 
900: error = 0.6688132, gradient norm = 0.0001045 (50 iterations in 14.388s) [t-SNE] Iteration 950: error = 0.6538274, gradient norm = 0.0001172 (50 iterations in 16.256s) [t-SNE] Iteration 1000: error = 0.6379468, gradient norm = 0.0000970 (50 iterations in 17.424s) [t-SNE] KL divergence after 1000 iterations: 0.637947 [t-SNE] Computed conditional probabilities for sample 80 / 80 [t-SNE] Mean sigma: 0.495573 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 46.8s
Quality: 0.25715545586934896
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 54.4s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 74 tasks | elapsed: 0.2s [Parallel(n_jobs=4)]: Done 85 out of 85 | elapsed: 0.2s finished
[t-SNE] Computing 84 nearest neighbors... [t-SNE] Indexed 85 samples in 0.002s... [t-SNE] Computed neighbors for 85 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 85 / 85 [t-SNE] Mean sigma: 0.337236 [t-SNE] Computed conditional probabilities in 0.026s [t-SNE] Iteration 50: error = 62.4255905, gradient norm = 0.4587097 (50 iterations in 16.838s) [t-SNE] Iteration 100: error = 61.7523499, gradient norm = 0.4958959 (50 iterations in 15.244s) [t-SNE] Iteration 150: error = 59.3819389, gradient norm = 0.5257326 (50 iterations in 14.016s) [t-SNE] Iteration 200: error = 62.0942917, gradient norm = 0.4829517 (50 iterations in 14.452s) [t-SNE] Iteration 250: error = 60.3459396, gradient norm = 0.4862712 (50 iterations in 14.890s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 60.345940 [t-SNE] Iteration 300: error = 1.1607332, gradient norm = 0.0033152 (50 iterations in 14.870s) [t-SNE] Iteration 350: error = 0.8522174, gradient norm = 0.0026717 (50 iterations in 14.912s) [t-SNE] Iteration 400: error = 0.7567443, gradient norm = 0.0006592 (50 iterations in 15.068s) [t-SNE] Iteration 450: error = 0.6954028, gradient norm = 0.0008921 (50 iterations in 14.992s) [t-SNE] Iteration 500: error = 0.6541343, gradient norm = 0.0007076 (50 iterations in 14.892s) [t-SNE] Iteration 550: error = 0.5940102, gradient norm = 0.0009267 (50 iterations in 14.508s) [t-SNE] Iteration 600: error = 0.5786951, gradient norm = 0.0001601 (50 iterations in 14.824s) [t-SNE] Iteration 650: error = 0.5513048, gradient norm = 0.0011036 (50 iterations in 15.476s) [t-SNE] Iteration 700: error = 0.5186256, gradient norm = 0.0009106 (50 iterations in 15.756s) [t-SNE] Iteration 750: error = 0.5018979, gradient norm = 0.0004264 (50 iterations in 15.018s) [t-SNE] Iteration 800: error = 0.4976586, gradient norm = 0.0001820 (50 iterations in 14.342s) [t-SNE] Iteration 850: error = 0.4938800, gradient norm = 0.0002124 (50 iterations in 14.308s) [t-SNE] Iteration 
900: error = 0.4915761, gradient norm = 0.0001463 (50 iterations in 14.336s) [t-SNE] Iteration 950: error = 0.4890191, gradient norm = 0.0002426 (50 iterations in 15.012s) [t-SNE] Iteration 1000: error = 0.5725439, gradient norm = 0.0326171 (50 iterations in 14.200s) [t-SNE] KL divergence after 1000 iterations: 0.572544 [t-SNE] Computed conditional probabilities for sample 85 / 85 [t-SNE] Mean sigma: 0.337236 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 41.3s
Quality: 0.26273509697072656 Quality: 0.2643891483852391 Quality: 0.2652975593998126 Quality: 0.26879122838997427 Quality: 0.2689233573150276
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 47.7s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 72 out of 72 | elapsed: 0.1s finished
[t-SNE] Computing 71 nearest neighbors... [t-SNE] Indexed 72 samples in 0.002s... [t-SNE] Computed neighbors for 72 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 72 / 72 [t-SNE] Mean sigma: 0.463236 [t-SNE] Computed conditional probabilities in 0.018s [t-SNE] Iteration 50: error = 52.7568016, gradient norm = 0.5311697 (50 iterations in 17.875s) [t-SNE] Iteration 100: error = 54.5764351, gradient norm = 0.4569130 (50 iterations in 15.740s) [t-SNE] Iteration 150: error = 53.6639328, gradient norm = 0.4177146 (50 iterations in 15.448s) [t-SNE] Iteration 200: error = 54.8619499, gradient norm = 0.5468359 (50 iterations in 16.852s) [t-SNE] Iteration 250: error = 50.1337547, gradient norm = 0.6802748 (50 iterations in 14.664s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 50.133755 [t-SNE] Iteration 300: error = 0.9978130, gradient norm = 0.0036142 (50 iterations in 14.351s) [t-SNE] Iteration 350: error = 0.6727231, gradient norm = 0.0017426 (50 iterations in 14.424s) [t-SNE] Iteration 400: error = 0.5399058, gradient norm = 0.0005978 (50 iterations in 14.836s) [t-SNE] Iteration 450: error = 0.4972408, gradient norm = 0.0005837 (50 iterations in 15.056s) [t-SNE] Iteration 500: error = 0.4632723, gradient norm = 0.0005978 (50 iterations in 15.152s) [t-SNE] Iteration 550: error = 0.4369497, gradient norm = 0.0004824 (50 iterations in 13.136s) [t-SNE] Iteration 600: error = 0.4138511, gradient norm = 0.0018388 (50 iterations in 12.232s) [t-SNE] Iteration 650: error = 0.3677678, gradient norm = 0.0004039 (50 iterations in 12.484s) [t-SNE] Iteration 700: error = 0.3605752, gradient norm = 0.0001795 (50 iterations in 12.988s) [t-SNE] Iteration 750: error = 0.3538247, gradient norm = 0.0002150 (50 iterations in 14.960s) [t-SNE] Iteration 800: error = 0.3487584, gradient norm = 0.0003561 (50 iterations in 13.076s) [t-SNE] Iteration 850: error = 0.3317065, gradient norm = 0.0006829 (50 iterations in 12.152s) [t-SNE] Iteration 
900: error = 0.3049172, gradient norm = 0.0009948 (50 iterations in 13.597s) [t-SNE] Iteration 950: error = 0.2946607, gradient norm = 0.0004249 (50 iterations in 12.271s) [t-SNE] Iteration 1000: error = 0.2893874, gradient norm = 0.0006549 (50 iterations in 12.512s) [t-SNE] KL divergence after 1000 iterations: 0.289387 [t-SNE] Computed conditional probabilities for sample 72 / 72 [t-SNE] Mean sigma: 0.463236 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 26.5s
Quality: 0.35839627648406186
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 31.2s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 220 tasks | elapsed: 0.6s [Parallel(n_jobs=4)]: Done 274 out of 274 | elapsed: 0.6s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 274 samples in 0.002s... [t-SNE] Computed neighbors for 274 samples in 0.000s... [t-SNE] Computed conditional probabilities for sample 274 / 274 [t-SNE] Mean sigma: 0.275531 [t-SNE] Computed conditional probabilities in 0.034s [t-SNE] Iteration 50: error = 77.4123306, gradient norm = 0.4804427 (50 iterations in 12.330s) [t-SNE] Iteration 100: error = 81.4974670, gradient norm = 0.4443977 (50 iterations in 12.220s) [t-SNE] Iteration 150: error = 77.5206146, gradient norm = 0.4590671 (50 iterations in 12.960s) [t-SNE] Iteration 200: error = 83.8964996, gradient norm = 0.4522942 (50 iterations in 12.972s) [t-SNE] Iteration 250: error = 81.2890320, gradient norm = 0.4408662 (50 iterations in 13.596s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 81.289032 [t-SNE] Iteration 300: error = 1.8234618, gradient norm = 0.0073143 (50 iterations in 10.167s) [t-SNE] Iteration 350: error = 1.5756892, gradient norm = 0.0028870 (50 iterations in 12.032s) [t-SNE] Iteration 400: error = 1.4654963, gradient norm = 0.0026734 (50 iterations in 10.708s) [t-SNE] Iteration 450: error = 1.3859824, gradient norm = 0.0024025 (50 iterations in 11.021s) [t-SNE] Iteration 500: error = 1.3678026, gradient norm = 0.0018189 (50 iterations in 1.183s) [t-SNE] Iteration 550: error = 1.3325949, gradient norm = 0.0019729 (50 iterations in 8.410s) [t-SNE] Iteration 600: error = 1.3186648, gradient norm = 0.0013382 (50 iterations in 8.515s) [t-SNE] Iteration 650: error = 1.3021207, gradient norm = 0.0007254 (50 iterations in 8.348s) [t-SNE] Iteration 700: error = 1.2941964, gradient norm = 0.0004201 (50 iterations in 8.276s) [t-SNE] Iteration 750: error = 1.2938774, gradient norm = 0.0001737 (50 iterations in 12.609s) [t-SNE] Iteration 800: error = 1.2937863, gradient norm = 0.0002212 (50 iterations in 9.794s) [t-SNE] Iteration 850: error = 1.2933774, gradient norm = 0.0001023 (50 iterations in 13.337s) [t-SNE] Iteration 900: 
error = 1.2934351, gradient norm = 0.0000770 (50 iterations in 11.604s) [t-SNE] Iteration 950: error = 1.2933544, gradient norm = 0.0002029 (50 iterations in 11.680s) [t-SNE] Iteration 1000: error = 1.2934302, gradient norm = 0.0001333 (50 iterations in 11.360s) [t-SNE] KL divergence after 1000 iterations: 1.293430 [t-SNE] Computed conditional probabilities for sample 274 / 274 [t-SNE] Mean sigma: 0.275531 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 31.3s
Quality: 0.32262520033094466 Quality: 0.326052645753287 Quality: 0.32637294426841124
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 35.4s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 107 out of 107 | elapsed: 0.2s finished
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 107 samples in 0.002s... [t-SNE] Computed neighbors for 107 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 107 / 107 [t-SNE] Mean sigma: 0.328589 [t-SNE] Computed conditional probabilities in 0.011s [t-SNE] Iteration 50: error = 66.0211182, gradient norm = 0.4759144 (50 iterations in 10.798s) [t-SNE] Iteration 100: error = 65.9862595, gradient norm = 0.4757720 (50 iterations in 12.110s) [t-SNE] Iteration 150: error = 65.6642914, gradient norm = 0.4831016 (50 iterations in 13.280s) [t-SNE] Iteration 200: error = 68.6230927, gradient norm = 0.4377040 (50 iterations in 15.476s) [t-SNE] Iteration 250: error = 63.4003258, gradient norm = 0.5121785 (50 iterations in 16.020s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 63.400326 [t-SNE] Iteration 300: error = 1.1341442, gradient norm = 0.0067295 (50 iterations in 16.730s) [t-SNE] Iteration 350: error = 1.0340002, gradient norm = 0.0026578 (50 iterations in 12.552s) [t-SNE] Iteration 400: error = 0.9322256, gradient norm = 0.0041597 (50 iterations in 13.004s) [t-SNE] Iteration 450: error = 0.6219122, gradient norm = 0.0113083 (50 iterations in 3.490s) [t-SNE] Iteration 500: error = 0.6001081, gradient norm = 0.0011421 (50 iterations in 0.476s) [t-SNE] Iteration 550: error = 0.5998938, gradient norm = 0.0005559 (50 iterations in 0.088s) [t-SNE] Iteration 600: error = 0.6004264, gradient norm = 0.0005771 (50 iterations in 0.067s) [t-SNE] Iteration 650: error = 0.6001657, gradient norm = 0.0005707 (50 iterations in 0.055s) [t-SNE] Iteration 700: error = 0.6003257, gradient norm = 0.0005734 (50 iterations in 0.062s) [t-SNE] Iteration 750: error = 0.6001701, gradient norm = 0.0006019 (50 iterations in 0.051s) [t-SNE] Iteration 800: error = 0.6002852, gradient norm = 0.0009054 (50 iterations in 0.052s) [t-SNE] Iteration 850: error = 0.6002944, gradient norm = 0.0007970 (50 iterations in 0.063s) [t-SNE] Iteration 900: 
error = 0.6001486, gradient norm = 0.0006453 (50 iterations in 0.916s) [t-SNE] Iteration 900: did not make any progress during the last 300 episodes. Finished. [t-SNE] KL divergence after 900 iterations: 0.600149 [t-SNE] Computed conditional probabilities for sample 107 / 107 [t-SNE] Mean sigma: 0.328589 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 25.7s
Quality: 0.28048434595810334 Quality: 0.2825407164261998 Quality: 0.284059538554229
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 30.9s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 51 out of 58 | elapsed: 0.2s remaining: 0.0s [Parallel(n_jobs=4)]: Done 58 out of 58 | elapsed: 0.2s finished
[t-SNE] Computing 57 nearest neighbors... [t-SNE] Indexed 58 samples in 0.001s... [t-SNE] Computed neighbors for 58 samples in 0.000s... [t-SNE] Computed conditional probabilities for sample 58 / 58 [t-SNE] Mean sigma: 0.418356 [t-SNE] Computed conditional probabilities in 0.003s [t-SNE] Iteration 50: error = 54.3887482, gradient norm = 0.5347674 (50 iterations in 10.597s) [t-SNE] Iteration 100: error = 52.5390739, gradient norm = 0.5386772 (50 iterations in 9.900s) [t-SNE] Iteration 150: error = 52.0619354, gradient norm = 0.4976507 (50 iterations in 11.120s) [t-SNE] Iteration 200: error = 57.3235703, gradient norm = 0.5339420 (50 iterations in 9.180s) [t-SNE] Iteration 250: error = 57.7468376, gradient norm = 0.5014480 (50 iterations in 9.588s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 57.746838 [t-SNE] Iteration 300: error = 1.0349977, gradient norm = 0.0020246 (50 iterations in 10.875s) [t-SNE] Iteration 350: error = 0.8672211, gradient norm = 0.0013071 (50 iterations in 11.392s) [t-SNE] Iteration 400: error = 0.7565125, gradient norm = 0.0008842 (50 iterations in 11.840s) [t-SNE] Iteration 450: error = 0.6724889, gradient norm = 0.0007025 (50 iterations in 10.288s) [t-SNE] Iteration 500: error = 0.6046890, gradient norm = 0.0012817 (50 iterations in 9.716s) [t-SNE] Iteration 550: error = 0.5683649, gradient norm = 0.0003588 (50 iterations in 10.247s) [t-SNE] Iteration 600: error = 0.5306038, gradient norm = 0.0006826 (50 iterations in 10.777s) [t-SNE] Iteration 650: error = 0.5042171, gradient norm = 0.0002344 (50 iterations in 10.575s) [t-SNE] Iteration 700: error = 0.4787498, gradient norm = 0.0004392 (50 iterations in 9.473s) [t-SNE] Iteration 750: error = 0.4523133, gradient norm = 0.0003145 (50 iterations in 10.856s) [t-SNE] Iteration 800: error = 0.4370746, gradient norm = 0.0002760 (50 iterations in 9.804s) [t-SNE] Iteration 850: error = 0.4347964, gradient norm = 0.0001411 (50 iterations in 11.552s) [t-SNE] Iteration 900: 
error = 0.4321736, gradient norm = 0.0000914 (50 iterations in 10.320s) [t-SNE] Iteration 950: error = 0.4290792, gradient norm = 0.0003072 (50 iterations in 10.828s) [t-SNE] Iteration 1000: error = 0.4148187, gradient norm = 0.0002364 (50 iterations in 10.990s) [t-SNE] KL divergence after 1000 iterations: 0.414819 [t-SNE] Computed conditional probabilities for sample 58 / 58 [t-SNE] Mean sigma: 0.418356 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 27.1s
Quality: 0.21602178841573827
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 32.1s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 0.1s finished
[t-SNE] Computing 49 nearest neighbors... [t-SNE] Indexed 50 samples in 0.003s... [t-SNE] Computed neighbors for 50 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 50 / 50 [t-SNE] Mean sigma: 0.487572 [t-SNE] Computed conditional probabilities in 0.005s [t-SNE] Iteration 50: error = 50.3193398, gradient norm = 0.4692524 (50 iterations in 0.418s) [t-SNE] Iteration 100: error = 52.3291092, gradient norm = 0.4902437 (50 iterations in 0.342s) [t-SNE] Iteration 150: error = 54.8621826, gradient norm = 0.4388794 (50 iterations in 1.290s) [t-SNE] Iteration 200: error = 55.1292152, gradient norm = 0.3486258 (50 iterations in 9.780s) [t-SNE] Iteration 250: error = 48.0092583, gradient norm = 0.5703542 (50 iterations in 11.192s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 48.009258 [t-SNE] Iteration 300: error = 0.9432287, gradient norm = 0.0017080 (50 iterations in 10.527s) [t-SNE] Iteration 350: error = 0.7839852, gradient norm = 0.0007139 (50 iterations in 12.000s) [t-SNE] Iteration 400: error = 0.7306295, gradient norm = 0.0005465 (50 iterations in 11.016s) [t-SNE] Iteration 450: error = 0.6848123, gradient norm = 0.0004261 (50 iterations in 10.756s) [t-SNE] Iteration 500: error = 0.6456725, gradient norm = 0.0003529 (50 iterations in 11.388s) [t-SNE] Iteration 550: error = 0.6188582, gradient norm = 0.0003356 (50 iterations in 10.617s) [t-SNE] Iteration 600: error = 0.5959155, gradient norm = 0.0002084 (50 iterations in 11.574s) [t-SNE] Iteration 650: error = 0.5778752, gradient norm = 0.0002481 (50 iterations in 10.980s) [t-SNE] Iteration 700: error = 0.5458720, gradient norm = 0.0003279 (50 iterations in 11.896s) [t-SNE] Iteration 750: error = 0.5281581, gradient norm = 0.0002822 (50 iterations in 10.380s) [t-SNE] Iteration 800: error = 0.5120935, gradient norm = 0.0004005 (50 iterations in 11.416s) [t-SNE] Iteration 850: error = 0.4945207, gradient norm = 0.0003476 (50 iterations in 10.868s) [t-SNE] Iteration 900: 
error = 0.4919742, gradient norm = 0.0000834 (50 iterations in 10.763s) [t-SNE] Iteration 950: error = 0.4915423, gradient norm = 0.0000474 (50 iterations in 11.485s) [t-SNE] Iteration 1000: error = 0.4910885, gradient norm = 0.0000447 (50 iterations in 11.408s) [t-SNE] KL divergence after 1000 iterations: 0.491089 [t-SNE] Computed conditional probabilities for sample 50 / 50 [t-SNE] Mean sigma: 0.487572 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 28.0s
Quality: 0.1789553543085227
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 32.9s finished [Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 32 out of 32 | elapsed: 0.0s finished
[t-SNE] Computing 31 nearest neighbors... [t-SNE] Indexed 32 samples in 0.002s... [t-SNE] Computed neighbors for 32 samples in 0.001s... [t-SNE] Computed conditional probabilities for sample 32 / 32 [t-SNE] Mean sigma: 1.129275 [t-SNE] Computed conditional probabilities in 0.004s [t-SNE] Iteration 50: error = 46.9087906, gradient norm = 0.4835995 (50 iterations in 3.454s) [t-SNE] Iteration 100: error = 45.0289764, gradient norm = 0.6263679 (50 iterations in 0.091s) [t-SNE] Iteration 150: error = 42.0630112, gradient norm = 0.7370158 (50 iterations in 0.036s) [t-SNE] Iteration 200: error = 43.8912239, gradient norm = 0.5550996 (50 iterations in 2.732s) [t-SNE] Iteration 250: error = 46.2393684, gradient norm = 0.4115428 (50 iterations in 0.039s) [t-SNE] KL divergence after 250 iterations with early exaggeration: 46.239368 [t-SNE] Iteration 300: error = 0.6313334, gradient norm = 0.0009925 (50 iterations in 6.486s) [t-SNE] Iteration 350: error = 0.5292564, gradient norm = 0.0002422 (50 iterations in 10.035s) [t-SNE] Iteration 400: error = 0.5148782, gradient norm = 0.0001513 (50 iterations in 11.134s) [t-SNE] Iteration 450: error = 0.5069570, gradient norm = 0.0000978 (50 iterations in 13.544s) [t-SNE] Iteration 500: error = 0.4970625, gradient norm = 0.0001143 (50 iterations in 11.696s) [t-SNE] Iteration 550: error = 0.4927297, gradient norm = 0.0000654 (50 iterations in 12.284s) [t-SNE] Iteration 600: error = 0.4882357, gradient norm = 0.0001049 (50 iterations in 10.284s) [t-SNE] Iteration 650: error = 0.4791764, gradient norm = 0.0000663 (50 iterations in 10.184s) [t-SNE] Iteration 700: error = 0.4780936, gradient norm = 0.0000305 (50 iterations in 9.748s) [t-SNE] Iteration 750: error = 0.4758615, gradient norm = 0.0000906 (50 iterations in 11.239s) [t-SNE] Iteration 800: error = 0.4705621, gradient norm = 0.0000521 (50 iterations in 10.896s) [t-SNE] Iteration 850: error = 0.4696307, gradient norm = 0.0000285 (50 iterations in 9.544s) [t-SNE] Iteration 900: error 
= 0.4689408, gradient norm = 0.0000462 (50 iterations in 9.960s) [t-SNE] Iteration 950: error = 0.4638360, gradient norm = 0.0000759 (50 iterations in 9.764s) [t-SNE] Iteration 1000: error = 0.4624943, gradient norm = 0.0000282 (50 iterations in 11.044s) [t-SNE] KL divergence after 1000 iterations: 0.462494 [t-SNE] Computed conditional probabilities for sample 32 / 32 [t-SNE] Mean sigma: 1.129275 Beginning preprocessing + Leiden
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers. [Parallel(n_jobs=4)]: Done 42 tasks | elapsed: 28.7s
Quality: 0.020570645714653755
[Parallel(n_jobs=4)]: Done 50 out of 50 | elapsed: 33.3s finished
For each motif, determine the peaks that contain it
# Load the hypothetical CWMs (hCWMs) discovered by TF-MoDISco, and keep
# an ordered list of their keys for downstream lookups.
hcwms = import_tfmodisco_motifs(tfm_results_path)
motif_keys = list(hcwms.keys())

# Load the filtered/collapsed MOODS motif hits, then map each motif key
# to the indices of the peaks that contain a hit for it.
moods_bed_path = os.path.join(moods_dir, "moods_filtered_collapsed.bed")
hit_table = moods.import_moods_hits(moods_bed_path)
hit_peak_indices = get_hit_peak_indices(hit_table, motif_keys)

# Load the embeddings array from the .npz (this can take a while for
# large datasets); collapsing the sequence axis (axis 1) removes any
# positional dependence from the per-peak representation.
embeddings = np.load(embeddings_path)["embeddings"]
summed_embeddings = embeddings.sum(axis=1)
For each motif, show the subclusters that exist within the TF-MoDISco-identified subpatterns
plot_motif_heterogeneity(tfm_obj)
/mnt/lab_data2/vir/tf_chr_atlas/02-24-2021/TF-Atlas/3M/reports/viz_sequence.py:152: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`). fig = plt.figure(figsize=figsize)
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 12454 | ||
| 0 | 1704 | ||
| 1 | 1695 | ||
| 2 | 1666 | ||
| 3 | 1510 | ||
| 4 | 1416 | ||
| 5 | 1410 | ||
| 6 | 1082 | ||
| 7 | 756 | ||
| 8 | 606 | ||
| 9 | 305 | ||
| 10 | 296 | ||
| 11 | 8 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 160 | ||
| 0 | 54 | ||
| 1 | 35 | ||
| 2 | 31 | ||
| 3 | 30 | ||
| 4 | 5 | ||
| 5 | 5 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 140 | ||
| 0 | 35 | ||
| 1 | 31 | ||
| 2 | 29 | ||
| 3 | 22 | ||
| 4 | 9 | ||
| 5 | 5 | ||
| 6 | 5 | ||
| 8 | 2 | ||
| 7 | 2 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 80 | ||
| 0 | 23 | ||
| 1 | 20 | ||
| 2 | 13 | ||
| 3 | 13 | ||
| 4 | 11 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 85 | ||
| 1 | 22 | ||
| 0 | 22 | ||
| 2 | 14 | ||
| 3 | 14 | ||
| 4 | 13 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 72 | ||
| 0 | 38 | ||
| 1 | 14 | ||
| 2 | 13 | ||
| 3 | 7 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 274 | ||
| 0 | 59 | ||
| 1 | 56 | ||
| 2 | 42 | ||
| 3 | 41 | ||
| 4 | 34 | ||
| 5 | 24 | ||
| 6 | 18 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 107 | ||
| 0 | 32 | ||
| 1 | 28 | ||
| 2 | 25 | ||
| 3 | 22 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 58 | ||
| 0 | 21 | ||
| 1 | 16 | ||
| 2 | 12 | ||
| 3 | 9 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 50 | ||
| 0 | 20 | ||
| 1 | 17 | ||
| 2 | 13 |
| Subpattern | Seqlets | Embeddings | hCWM |
|---|---|---|---|
| Agg. | 32 | ||
| 0 | 20 | ||
| 1 | 12 |
Cluster the peaks by their embeddings to highlight the structure of different peaks and different motifs
plot_peak_clustering(summed_embeddings, motif_keys, hcwms, hit_peak_indices)
| Motif key | Embeddings | hCWM |
|---|---|---|
| 0_0 | ||
| 0_1 | ||
| 0_2 | ||
| 0_3 | ||
| 0_4 | ||
| 0_5 |