In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py

Generating TF-MoDISco motifs

In [2]:
def load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot']):
    """Load DeepLIFT outputs from an HDF5 file into an OrderedDict.

    'scores' are the actual contribution scores: the hypothetical
    contributions multiplied element-wise with the one-hot sequence.
    No summing is needed before the multiplication; DeepLIFT already
    takes care of that (DeepSHAP does not).
    """
    deeplift_data = OrderedDict()
    with h5py.File(deeplift_hdf, "r") as fp:
        hyp_scores = fp['deeplift_scores'][:]
        one_hot = fp['inputs'][:]

        if 'one_hot' in keys:
            deeplift_data['one_hot'] = one_hot
        if 'peaks' in keys:
            # peak coordinates are stored per-column under metadata/range
            df = OrderedDict()
            for key in list(fp['metadata/range']):
                df[key] = fp['metadata/range/{}'.format(key)][:]
            df = pd.DataFrame(df)
            df['chr'] = np.array([v.decode('utf-8') for v in df.chr.values])
            deeplift_data['peaks'] = df
        if 'hyp_scores' in keys:
            deeplift_data['hyp_scores'] = hyp_scores
        if 'scores' in keys:
            deeplift_data['scores'] = np.multiply(hyp_scores, one_hot)

    return deeplift_data

# deeplift_hdf = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/deeplift_out/summit.h5"
# deeplift_data = load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot'])
# print(deeplift_data['scores'].shape, deeplift_data['one_hot'].shape)

# traj_no = -1
# if traj_no==8:
#     indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1,  traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# else:
#     indices = traj_lab[:, traj_no]==1

# if traj_no<0:
#     _score = deeplift_data['scores']
#     _hyp_score = deeplift_data['hyp_scores']
#     _one_hot = deeplift_data['one_hot']
# else:
#     _score = deeplift_data['scores'][indices,:,:]
#     _hyp_score = deeplift_data['hyp_scores'][indices,:,:]
#     _one_hot = deeplift_data['one_hot'][indices,:,:]
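To make the multiply-instead-of-sum point above concrete, here is a minimal toy sketch (illustrative shapes only, not the real HDF5 contents): multiplying hypothetical contributions by the one-hot sequence zeroes out the unobserved bases, and summing over the base axis then yields one contribution per position.

import numpy as np

hyp = np.random.randn(1, 4, 4)      # hypothetical contributions for every base
one_hot = np.eye(4)[None, :, :]     # toy one-hot sequence "ACGT", shape (1, 4, 4)

scores = hyp * one_hot              # actual contributions: nonzero only at observed bases
per_position = scores.sum(axis=-1)  # one contribution value per position, shape (1, 4)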
In [3]:
import modisco
null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000)
In [4]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
ggr_data = OrderedDict()
with h5py.File(ggrfile, "r") as fp:
    ggr_data['one_hot'] = fp['sequence'][:]          # one-hot sequences
    ggr_data['scores'] = fp['sequence-weighted'][:]  # actual contributions (input x gradient)
    ggr_data['hyp_scores'] = fp['gradients'][:]      # hypothetical contributions (gradients)
ggr_data['hyp_scores'].shape, ggr_data['scores'].shape, ggr_data['one_hot'].shape
Out[4]:
((35024, 10, 1000, 4), (35024, 10, 1000, 4), (35024, 1, 1000, 4))
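Note that scores and hyp_scores carry ten tasks on axis 1 while one_hot holds a single copy of each sequence, which broadcasts across the task axis. A hedged sanity check, assuming (as the directory name input_x_grad suggests) that sequence-weighted is the gradients multiplied by the sequence:

# one_hot (35024, 1, 1000, 4) broadcasts against hyp_scores (35024, 10, 1000, 4)
approx_scores = ggr_data['hyp_scores'] * ggr_data['one_hot']
print(np.allclose(approx_scores, ggr_data['scores']))  # expected True under the assumption above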
In [5]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    traj_lab = fp['TRAJ_LABELS'][:]  # binary membership matrix: one column per trajectory
traj_lab.shape
# trajectories of interest: 0, 7, the combined 8/10/11, and 9
# indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1, traj_lab[:, 10]==1), traj_lab[:, 11]==1)
# indices.shape, deeplift_data['scores'][indices,:,:].shape, np.sum(indices)
Out[5]:
(35024, 15)
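Since traj_lab is a binary (35024, 15) membership matrix, a quick summary of how many peaks fall into each trajectory can be computed directly (a sketch; it assumes column j corresponds to trajectory j, matching the indexing used below):

per_traj_counts = traj_lab.sum(axis=0)  # peaks per trajectory, shape (15,)
print(dict(enumerate(per_traj_counts)))

# membership in the combined 8/10/11 trajectory, as used for traj_no == 8 below
combined = (traj_lab[:, [8, 10, 11]] == 1).any(axis=1)
print(combined.sum())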
In [6]:
traj_no = -1  # -1: run on all sequences; 8: the combined 8/10/11 trajectory
if traj_no == 8:
    indices = np.logical_or(np.logical_or(traj_lab[:, 8]==1, traj_lab[:, 10]==1), traj_lab[:, 11]==1)
else:
    indices = traj_lab[:, traj_no]==1

if traj_no < 0:
    # use every sequence; index 0 on axis 1 selects the first task (the "early0" task below)
    _score = ggr_data['scores'][:,0,:,:]
    _hyp_score = ggr_data['hyp_scores'][:,0,:,:]
    _one_hot = ggr_data['one_hot'][:,0,:,:]
else:
    # restrict to sequences in the selected trajectory
    _score = ggr_data['scores'][indices,0,:,:]
    _hyp_score = ggr_data['hyp_scores'][indices,0,:,:]
    _one_hot = ggr_data['one_hot'][indices,0,:,:]
_score.shape, _hyp_score.shape, _one_hot.shape
Out[6]:
((35024, 1000, 4), (35024, 1000, 4), (35024, 1000, 4))
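The workflow call below passes only the central 160 bp of each 1000 bp region (positions 420:580), concentrating motif discovery around the peak summits. A small hypothetical helper that makes the cropping explicit (center_crop is illustrative, not part of modisco):

def center_crop(arr, width=160):
    # crop the position axis (axis 1) to the central `width` bp
    start = (arr.shape[1] - width) // 2
    return arr[:, start:start + width, :]

# equivalent to the explicit [:, 420:580, :] slices in the workflow call
assert center_crop(_score).shape == (_score.shape[0], 160, 4)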
In [7]:
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
                    # Slight modifications from the default settings
                    sliding_window_size=21,
                    flank_size=10,
                    #target_seqlet_fdr=0.05,

                    min_passing_windows_frac=0.03,
                    max_seqlets_per_metacluster=60000,

                    seqlets_to_patterns_factory=
                     modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
                        # Note: as of version 0.5.6.0, it's possible to use the results of motif
                        # discovery software like MEME to improve the TF-MoDISco clustering. To use
                        # the MEME-based initialization, specify the initclusterer_factory as shown
                        # in the commented-out code below:
                        #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory(
                        #    meme_command="meme", base_outdir="meme_out",
                        #    max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1),

                        embedder_factory=(modisco.seqlet_embedding
                          .advanced_gapped_kmer.AdvancedGappedKmerEmbedderFactory(max_entries=500)),

                        trim_to_window_size=30,
                        initial_flank_to_add=10,
                        #kmer_len=5, num_gaps=1,
                        #num_mismatches=0,
                        final_min_cluster_size=30)
                )(
                 task_names=["early0"],
                 # motif discovery is restricted to the central 160 bp (positions 420:580)
                 contrib_scores={'early0': _score[:, 420:580, :]},
                 hypothetical_contribs={'early0': _hyp_score[:, 420:580, :]},
                 one_hot=_one_hot[:, 420:580, :],
                 null_per_pos_scores=null_per_pos_scores)
MEMORY 12.083171328
On task early0
Computing windowed sums on original
Generating null dist
peak(mu)= 0.032145559787750244
Computing threshold
Subsampling!
For increasing = True , the minimum IR precision was 0.33385234625600296 occurring at 0.0 implying a frac_neg of 0.501168688922987
To be conservative, adjusted frac neg is 0.95
For increasing = False , the minimum IR precision was 0.40646257508671163 occurring at -5.0067901611328125e-06 implying a frac_neg of 0.6848137253452771
To be conservative, adjusted frac neg is 0.95
Thresholds from null dist were -3.5901737213134766  and  16.52375066280365 with frac passing 0.000257
Passing windows frac was 0.000257 , which is below  0.03 ; adjusting
Final raw thresholds are -6.971453695297242  and  6.971453695297242
Final transformed thresholds are -0.970051964367291  and  0.970051964367291
Got 13337 coords
After resolving overlaps, got 13337 seqlets
Across all tasks, the weakest transformed threshold used was: 0.969951964367291
MEMORY 12.29840384
13337 identified in total
min_metacluster_size_frac * len(seqlets) = 133 is more than min_metacluster_size=100.
Using it as a new min_metacluster_size
1 activity patterns with support >= 133 out of 2 possible patterns
Metacluster sizes:  [13337]
Idx to activities:  {0: '1'}
MEMORY 12.298973184
On metacluster 0
Metacluster size 13337
Relevant tasks:  ('early0',)
Relevant signs:  (1,)
TfModiscoSeqletsToPatternsFactory: seed=1234
(Round 1) num seqlets: 13337
(Round 1) Computing coarse affmat
MEMORY 12.298981376
Beginning embedding computation
MEMORY 12.298981376
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  44 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done 632 tasks      | elapsed:   15.9s
[Parallel(n_jobs=4)]: Done 1632 tasks      | elapsed:   36.1s
[Parallel(n_jobs=4)]: Done 3032 tasks      | elapsed:  1.1min
[Parallel(n_jobs=4)]: Done 4832 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 7032 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done 9632 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done 12632 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 13337 out of 13337 | elapsed:  4.8min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 376 tasks      | elapsed:    7.5s
[Parallel(n_jobs=4)]: Done 876 tasks      | elapsed:   18.3s
[Parallel(n_jobs=4)]: Done 1576 tasks      | elapsed:   32.6s
[Parallel(n_jobs=4)]: Done 2476 tasks      | elapsed:   52.2s
[Parallel(n_jobs=4)]: Done 3576 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done 4876 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 6376 tasks      | elapsed:  2.2min
[Parallel(n_jobs=4)]: Done 8076 tasks      | elapsed:  2.8min
[Parallel(n_jobs=4)]: Done 9976 tasks      | elapsed:  3.5min
[Parallel(n_jobs=4)]: Done 12076 tasks      | elapsed:  4.3min
[Parallel(n_jobs=4)]: Done 13330 out of 13337 | elapsed:  4.9min remaining:    0.2s
[Parallel(n_jobs=4)]: Done 13337 out of 13337 | elapsed:  4.9min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 13337 out of 13337 | elapsed:   40.6s finished
Constructing csr matrix...
csr matrix made in 3.4400315284729004 s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 13337 out of 13337 | elapsed:   41.2s finished
Constructing csr matrix...
csr matrix made in 3.366770029067993 s
Finished embedding computation in 683.91 s
MEMORY 13.642887168
Starting affinity matrix computations
MEMORY 13.642625024
Batching in slices of size 5031
100%|██████████| 3/3 [00:56<00:00, 18.82s/it]
Finished affinity matrix computations in 56.57 s
MEMORY 13.691904
(Round 1) Computed coarse affmat
MEMORY 13.424238592
(Round 1) Computing affinity matrix on nearest neighbors
MEMORY 13.424238592

Launching nearest neighbors affmat calculation job
MEMORY 13.469171712
Parallel runs completed
MEMORY 13.592117248
Job completed in: 211.87 s
MEMORY 13.591330816
Launching nearest neighbors affmat calculation job
MEMORY 13.591441408
Parallel runs completed
MEMORY 13.709467648
Job completed in: 213.24 s
MEMORY 13.709467648
(Round 1) Computed affinity matrix on nearest neighbors in 428.1 s
MEMORY 13.709467648
Filtered down to 9576 of 13337
(Round 1) Retained 9576 rows out of 13337 after filtering
MEMORY 13.709930496
(Round 1) Computing density adapted affmat
MEMORY 13.71564032
Symmetrizing nearest neighbors
Computing betas for density adaptation
Computing normalizing denominators
(Round 1) Computing clustering
MEMORY 14.079770624
Beginning preprocessing + Leiden
Affmat shape: 9576
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  7.9min
Quality: 0.5083578336158565
Quality: 0.5083583304441011
Quality: 0.5103465699485165
Quality: 0.5103466234092124
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  9.3min finished
Got 8 clusters after round 1
Counts:
{7: 420, 5: 904, 0: 2500, 4: 1156, 2: 1279, 3: 1221, 6: 603, 1: 1493}
MEMORY 14.11868672
(Round 1) Aggregating seqlets in each cluster
MEMORY 14.11868672
Aggregating for cluster 0 with 2500 seqlets
MEMORY 14.11868672
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 8 seqlets that went over the sequence edge during flank expansion
Skipped 12 seqlets that went over the sequence edge during flank expansion
Skipped 9 seqlets that went over the sequence edge during flank expansion
Skipped 696 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 1772
Skipped 363 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 1 with 1493 seqlets
MEMORY 14.118449152
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 5 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 17 seqlets that went over the sequence edge during flank expansion
Skipped 20 seqlets that went over the sequence edge during flank expansion
Skipped 453 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 994
Skipped 248 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 2 with 1279 seqlets
MEMORY 14.118449152
Skipped 4 seqlets that went over the sequence edge during flank expansion
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 6 seqlets that went over the sequence edge during flank expansion
Skipped 293 seqlets that went over sequence edge during flank expansion
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 968
Skipped 185 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 3 with 1221 seqlets
MEMORY 14.118449152
Skipped 7 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 312 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 899
Skipped 198 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 4 with 1156 seqlets
MEMORY 14.118449152
Skipped 2 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 6 seqlets that went over the sequence edge during flank expansion
Skipped 19 seqlets that went over the sequence edge during flank expansion
Skipped 312 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 815
Skipped 167 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 5 with 904 seqlets
MEMORY 14.118449152
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 208 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 694
Skipped 133 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 6 with 603 seqlets
MEMORY 14.118449152
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 6 seqlets that went over the sequence edge during flank expansion
Skipped 153 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 443
Skipped 101 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 7 with 420 seqlets
MEMORY 14.118449152
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 9 seqlets that went over the sequence edge during flank expansion
Skipped 92 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 318
Skipped 56 seqlets that went over the sequence edge during flank expansion
(Round 2) num seqlets: 5452
(Round 2) Computing coarse affmat
MEMORY 14.118449152
Beginning embedding computation
MEMORY 14.118449152
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 728 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done 1728 tasks      | elapsed:   27.6s
[Parallel(n_jobs=4)]: Done 3128 tasks      | elapsed:   53.4s
[Parallel(n_jobs=4)]: Done 4928 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 5452 out of 5452 | elapsed:  1.6min finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 128 tasks      | elapsed:    2.1s
[Parallel(n_jobs=4)]: Done 728 tasks      | elapsed:   11.8s
[Parallel(n_jobs=4)]: Done 1728 tasks      | elapsed:   28.0s
[Parallel(n_jobs=4)]: Done 3128 tasks      | elapsed:   53.4s
[Parallel(n_jobs=4)]: Done 4928 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 5452 out of 5452 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5452 out of 5452 | elapsed:   17.2s finished
Constructing csr matrix...
csr matrix made in 1.3914871215820312 s
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 5452 out of 5452 | elapsed:   16.2s finished
Constructing csr matrix...
csr matrix made in 1.3504602909088135 s
Finished embedding computation in 236.22 s
MEMORY 14.219665408
Starting affinity matrix computations
MEMORY 14.219665408
Batching in slices of size 5452
100%|██████████| 1/1 [00:09<00:00,  9.55s/it]
Finished affinity matrix computations in 9.59 s
MEMORY 14.219403264
(Round 2) Computed coarse affmat
MEMORY 14.21651968
(Round 2) Computing affinity matrix on nearest neighbors
MEMORY 14.21651968

Launching nearest neighbors affmat calculation job
MEMORY 14.21651968
Parallel runs completed
MEMORY 14.222241792
Job completed in: 131.17 s
MEMORY 14.222241792
Launching nearest neighbors affmat calculation job
MEMORY 14.222241792
Parallel runs completed
MEMORY 14.23163392
Job completed in: 130.77 s
MEMORY 14.2303232
(Round 2) Computed affinity matrix on nearest neighbors in 263.31 s
MEMORY 14.2303232
Not applying filtering for rounds above first round
MEMORY 14.2303232
(Round 2) Computing density adapted affmat
MEMORY 14.2303232
Symmetrizing nearest neighbors
Computing betas for density adaptation
Computing normalizing denominators
(Round 2) Computing clustering
MEMORY 14.278803456
Beginning preprocessing + Leiden
Affmat shape: 5452
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  6.1min
Quality: 0.39213340121284673
Quality: 0.3921335186693394
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  7.3min finished
Got 5 clusters after round 2
Counts:
{0: 1642, 4: 609, 3: 836, 1: 1326, 2: 1039}
MEMORY 14.24715776
(Round 2) Aggregating seqlets in each cluster
MEMORY 14.24715776
Aggregating for cluster 0 with 1642 seqlets
MEMORY 14.24715776
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 62 seqlets that went over sequence edge during flank expansion
Skipped 3 due to duplicates
Trimming eliminated 0 seqlets out of 1574
Skipped 370 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 1 with 1326 seqlets
MEMORY 14.24715776
Skipped 18 seqlets that went over the sequence edge during flank expansion
Skipped 68 seqlets that went over sequence edge during flank expansion
Skipped 2 due to duplicates
Trimming eliminated 0 seqlets out of 1238
Skipped 354 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 2 with 1039 seqlets
MEMORY 14.24715776
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 1 seqlets that went over the sequence edge during flank expansion
Skipped 34 seqlets that went over the sequence edge during flank expansion
Skipped 92 seqlets that went over sequence edge during flank expansion
Skipped 3 due to duplicates
Trimming eliminated 0 seqlets out of 905
Skipped 313 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 3 with 836 seqlets
MEMORY 14.24715776
Skipped 84 seqlets that went over the sequence edge during flank expansion
Skipped 23 seqlets that went over sequence edge during flank expansion
Skipped 4 due to duplicates
Trimming eliminated 0 seqlets out of 725
Skipped 208 seqlets that went over the sequence edge during flank expansion
Aggregating for cluster 4 with 609 seqlets
MEMORY 14.24715776
Skipped 12 seqlets that went over the sequence edge during flank expansion
Skipped 4 seqlets that went over sequence edge during flank expansion
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 592
Skipped 134 seqlets that went over the sequence edge during flank expansion
Got 5 clusters
Splitting into subclusters...
MEMORY 14.24715776
Inspecting pattern 0 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 1204 out of 1204 | elapsed:    3.9s finished
[t-SNE] Computed conditional probabilities for sample 1000 / 1204
[t-SNE] Computed conditional probabilities for sample 1204 / 1204
[t-SNE] Mean sigma: 0.250850
Beginning preprocessing + Leiden
Affmat shape: 1204
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   41.7s
Quality: 0.4972925561428032
Quality: 0.4975173712950307
Quality: 0.49762667951665335
Quality: 0.49772344157505893
Quality: 0.4978018132385952
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   49.0s finished
Got subclusters: Counter({0: 245, 1: 146, 2: 142, 3: 123, 4: 94, 5: 89, 6: 84, 7: 78, 8: 64, 9: 39, 10: 30, 11: 26, 12: 22, 13: 15, 14: 5, 15: 2})
On merging iteration 1
Numbers for each pattern pre-subsample: [245, 146, 142, 123, 94, 89, 84, 78, 64, 39, 30, 26, 22, 15, 5, 2]
Numbers after subsampling: [245, 146, 142, 123, 94, 89, 84, 78, 64, 39, 30, 26, 22, 15, 5, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 8.600388765335083 s
Computing sims for pattern 1
Computed sims for pattern 1 in 5.07261323928833 s
Computing sims for pattern 2
Computed sims for pattern 2 in 4.779524564743042 s
Computing sims for pattern 3
Computed sims for pattern 3 in 3.440063238143921 s
Computing sims for pattern 4
Computed sims for pattern 4 in 3.0409185886383057 s
Computing sims for pattern 5
Computed sims for pattern 5 in 2.8541126251220703 s
Computing sims for pattern 6
Computed sims for pattern 6 in 3.9525725841522217 s
Computing sims for pattern 7
Computed sims for pattern 7 in 3.663008213043213 s
Computing sims for pattern 8
Computed sims for pattern 8 in 2.6679537296295166 s
Computing sims for pattern 9
Computed sims for pattern 9 in 2.4023735523223877 s
Computing sims for pattern 10
Computed sims for pattern 10 in 1.838848352432251 s
Computing sims for pattern 11
Computed sims for pattern 11 in 1.6040759086608887 s
Computing sims for pattern 12
Computed sims for pattern 12 in 1.5169079303741455 s
Computing sims for pattern 13
Computed sims for pattern 13 in 1.3280768394470215 s
Computing sims for pattern 14
Computed sims for pattern 14 in 0.7087159156799316 s
Computing sims for pattern 15
Computed sims for pattern 15 in 0.6229310035705566 s
Cluster sizes
[245 146 142 123  94  89  84  78  64  39  30  26  22  15   5   2]
Cross-contamination matrix:
[[1.   0.96 0.31 0.63 1.   0.31 0.46 0.63 0.44 0.39 0.48 0.48 0.3  0.41
  0.22 0.33]
 [0.94 1.   0.33 0.61 1.   0.33 0.48 0.62 0.45 0.38 0.49 0.45 0.26 0.38
  0.23 0.24]
 [0.55 0.62 1.   0.47 0.6  0.3  0.44 0.47 0.53 0.43 0.43 0.59 0.38 0.4
  0.39 0.34]
 [0.98 0.99 0.5  1.   1.   0.55 0.8  0.76 0.75 0.58 0.69 0.64 0.54 0.67
  0.44 0.53]
 [0.71 0.77 0.19 0.43 1.   0.19 0.3  0.51 0.3  0.25 0.31 0.34 0.18 0.29
  0.13 0.17]
 [0.36 0.4  0.19 0.32 0.39 1.   0.27 0.26 0.32 0.22 0.34 0.27 0.26 0.22
  0.17 0.19]
 [1.   1.   0.63 1.   1.   0.65 1.   0.81 0.81 0.65 0.77 0.69 0.62 0.75
  0.52 0.51]
 [0.75 0.76 0.37 0.58 0.86 0.34 0.46 1.   0.47 0.43 0.48 0.47 0.39 0.47
  0.34 0.38]
 [0.85 0.89 0.6  0.81 0.92 0.58 0.66 0.69 1.   0.55 0.68 0.73 0.49 0.58
  0.51 0.62]
 [0.96 1.   0.71 0.84 1.   0.62 0.74 0.83 0.79 1.   0.78 0.73 0.68 0.71
  0.52 0.67]
 [0.78 0.82 0.38 0.65 0.84 0.52 0.51 0.59 0.57 0.45 1.   0.51 0.47 0.58
  0.66 0.27]
 [0.76 0.77 0.58 0.6  0.82 0.44 0.47 0.59 0.63 0.46 0.53 1.   0.42 0.49
  0.35 0.33]
 [0.79 0.82 0.61 0.74 0.84 0.64 0.64 0.71 0.66 0.64 0.73 0.64 1.   0.65
  0.5  0.57]
 [0.51 0.52 0.25 0.44 0.57 0.23 0.35 0.41 0.33 0.29 0.4  0.33 0.27 1.
  0.17 0.19]
 [0.05 0.05 0.03 0.04 0.05 0.01 0.02 0.04 0.04 0.01 0.13 0.02 0.02 0.01
  1.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.99 0.88 0.97 0.98 0.86 0.96 0.95 0.94 0.94 0.94 0.93 0.92 0.9
  0.79 0.76]
 [0.99 1.   0.9  0.97 0.99 0.87 0.96 0.96 0.95 0.95 0.95 0.93 0.92 0.9
  0.81 0.74]
 [0.88 0.9  1.   0.87 0.88 0.77 0.86 0.86 0.88 0.88 0.84 0.89 0.84 0.8
  0.74 0.67]
 [0.97 0.97 0.87 1.   0.97 0.85 0.98 0.94 0.95 0.93 0.92 0.91 0.9  0.89
  0.78 0.74]
 [0.98 0.99 0.88 0.97 1.   0.86 0.95 0.96 0.95 0.95 0.93 0.93 0.92 0.9
  0.8  0.74]
 [0.86 0.87 0.77 0.85 0.86 1.   0.84 0.82 0.85 0.82 0.86 0.81 0.81 0.77
  0.7  0.67]
 [0.96 0.96 0.86 0.98 0.95 0.84 1.   0.92 0.93 0.91 0.91 0.89 0.88 0.87
  0.78 0.71]
 [0.95 0.96 0.86 0.94 0.96 0.82 0.92 1.   0.92 0.92 0.91 0.9  0.9  0.87
  0.78 0.73]
 [0.94 0.95 0.88 0.95 0.95 0.85 0.93 0.92 1.   0.91 0.9  0.93 0.88 0.86
  0.79 0.74]
 [0.94 0.95 0.88 0.93 0.95 0.82 0.91 0.92 0.91 1.   0.9  0.88 0.89 0.85
  0.77 0.7 ]
 [0.94 0.95 0.84 0.92 0.93 0.86 0.91 0.91 0.9  0.9  1.   0.88 0.88 0.87
  0.85 0.69]
 [0.93 0.93 0.89 0.91 0.93 0.81 0.89 0.9  0.93 0.88 0.88 1.   0.87 0.84
  0.76 0.71]
 [0.92 0.92 0.84 0.9  0.92 0.81 0.88 0.9  0.88 0.89 0.88 0.87 1.   0.82
  0.74 0.73]
 [0.9  0.9  0.8  0.89 0.9  0.77 0.87 0.87 0.86 0.85 0.87 0.84 0.82 1.
  0.71 0.65]
 [0.79 0.81 0.74 0.78 0.8  0.7  0.78 0.78 0.79 0.77 0.85 0.76 0.74 0.71
  1.   0.63]
 [0.76 0.74 0.67 0.74 0.74 0.67 0.71 0.73 0.74 0.7  0.69 0.71 0.73 0.65
  0.63 1.  ]]
Collapsing 1 & 4 with crosscontam 0.7665061942222795 and sim 0.9900897626871592
Collapsing 0 & 1 with crosscontam 0.9361093710137949 and sim 0.9894096619633175
Collapsing 0 & 4 with crosscontam 0.7133592356064393 and sim 0.9766816955409298
Collapsing 3 & 6 with crosscontam 0.8044190717347908 and sim 0.9755247623793726
Collapsing 1 & 3 with crosscontam 0.6093026824689312 and sim 0.9746891916469241
Collapsing 0 & 3 with crosscontam 0.6329675516126341 and sim 0.9746452167704078
Collapsing 3 & 4 with crosscontam 0.4263454085279128 and sim 0.9673012546585659
Collapsing 4 & 7 with crosscontam 0.5067821911897885 and sim 0.959819628157424
Collapsing 1 & 6 with crosscontam 0.4842867682242311 and sim 0.9584543407261987
Collapsing 1 & 7 with crosscontam 0.6183413116940444 and sim 0.956550193728559
Collapsing 0 & 6 with crosscontam 0.4642755087874384 and sim 0.9554151740551935
Collapsing 3 & 8 with crosscontam 0.7474374847853178 and sim 0.9537181086019021
Collapsing 1 & 9 with crosscontam 0.38425092226211444 and sim 0.9529551723329337
Collapsing 1 & 8 with crosscontam 0.45271300402681636 and sim 0.9522150869510482
Collapsing 0 & 7 with crosscontam 0.6327198204306552 and sim 0.9513971044563515
Collapsing 4 & 6 with crosscontam 0.3036860352532218 and sim 0.9505662130871853
Collapsing 4 & 9 with crosscontam 0.2540041643852937 and sim 0.9459149640038144
Collapsing 4 & 8 with crosscontam 0.2956590633819096 and sim 0.9450704457675476
Collapsing 1 & 10 with crosscontam 0.4874420655138465 and sim 0.9450321415380756
Collapsing 0 & 9 with crosscontam 0.38632210240325904 and sim 0.9437877057118603
Collapsing 0 & 8 with crosscontam 0.4425479817083018 and sim 0.9434343063440589
Collapsing 0 & 10 with crosscontam 0.47862035716410656 and sim 0.9383834514673268
Collapsing 3 & 7 with crosscontam 0.5755438728210172 and sim 0.9382284085464258
Collapsing 4 & 10 with crosscontam 0.31446725035236267 and sim 0.9343311918149648
Collapsing 3 & 9 with crosscontam 0.5795219826743843 and sim 0.9307495663766271
Collapsing 0 & 11 with crosscontam 0.4836029990120547 and sim 0.9279448887720827
Collapsing 1 & 11 with crosscontam 0.4544541359770511 and sim 0.9269371458772888
Collapsing 4 & 11 with crosscontam 0.34372793489511366 and sim 0.9260732924035675
Collapsing 6 & 8 with crosscontam 0.6648143586658297 and sim 0.9258338864330047
Collapsing 8 & 11 with crosscontam 0.6250711197086936 and sim 0.9255085722969743
Collapsing 7 & 9 with crosscontam 0.42779954189451996 and sim 0.9242397868553989
Collapsing 1 & 12 with crosscontam 0.25890718260267676 and sim 0.9229117149781816
Collapsing 3 & 10 with crosscontam 0.6478181270701595 and sim 0.9222385949624371
Collapsing 7 & 8 with crosscontam 0.4739537237225844 and sim 0.9187039153473129
Collapsing 6 & 7 with crosscontam 0.45802065747825127 and sim 0.9183018518182773
Collapsing 0 & 12 with crosscontam 0.3009284535896066 and sim 0.9163153573835731
Collapsing 6 & 9 with crosscontam 0.6509395272150376 and sim 0.9127546279300751
Collapsing 3 & 11 with crosscontam 0.6028490439660883 and sim 0.911003797946603
Collapsing 8 & 9 with crosscontam 0.5548889942658253 and sim 0.9104594612909978
Collapsing 7 & 10 with crosscontam 0.47763785633608125 and sim 0.9068474408864401
Collapsing 6 & 10 with crosscontam 0.507962962962963 and sim 0.9067238714572752
Collapsing 3 & 12 with crosscontam 0.5415939793850735 and sim 0.9045336226709121
Collapsing 8 & 10 with crosscontam 0.5705601851851851 and sim 0.9038725940243383
Collapsing 9 & 10 with crosscontam 0.4471452991452991 and sim 0.9003367090006569
Collapsing 9 & 12 with crosscontam 0.6446473636556282 and sim 0.8941608077788028
Collapsing 2 & 11 with crosscontam 0.5779984870534083 and sim 0.8885583566343448
Collapsing 2 & 8 with crosscontam 0.5277122184355887 and sim 0.8800909818337846
Collapsing 6 & 12 with crosscontam 0.6201369256216194 and sim 0.8800050339392653
Collapsing 10 & 11 with crosscontam 0.5118860398860399 and sim 0.8786489239681509
Trimming eliminated 0 seqlets out of 240
Trimming eliminated 0 seqlets out of 485
Trimming eliminated 0 seqlets out of 207
Trimming eliminated 0 seqlets out of 692
Trimming eliminated 0 seqlets out of 770
Skipped 1 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 833
Skipped 1 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 871
Trimming eliminated 0 seqlets out of 901
Trimming eliminated 0 seqlets out of 927
Trimming eliminated 0 seqlets out of 949
Trimming eliminated 0 seqlets out of 1091
Skipped 4 seqlets that went over the sequence edge during flank expansion
Unmerged patterns remapping: OrderedDict([(5, 1), (13, 2), (14, 3), (15, 4)])
Time spent on merging iteration: 9.759464025497437
On merging iteration 2
Numbers for each pattern pre-subsample: [1087, 89, 15, 5, 2]
Numbers after subsampling: [300, 89, 15, 5, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.553689479827881 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.2749452590942383 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.11996793746948242 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.0755770206451416 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.0667886734008789 s
Cluster sizes
[1087   89   15    5    2]
Cross-contamination matrix:
[[1.   0.66 0.77 0.61 0.48]
 [0.3  1.   0.22 0.17 0.19]
 [0.38 0.23 1.   0.17 0.19]
 [0.03 0.01 0.01 1.   0.  ]
 [0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.86 0.91 0.8  0.77]
 [0.86 1.   0.77 0.7  0.67]
 [0.91 0.77 1.   0.71 0.65]
 [0.8  0.7  0.71 1.   0.63]
 [0.77 0.67 0.65 0.63 1.  ]]
Collapsing 0 & 2 with crosscontam 0.37886814814814795 and sim 0.9136688326268074
Trimming eliminated 0 seqlets out of 1102
Unmerged patterns remapping: OrderedDict([(1, 1), (3, 2), (4, 3)])
Time spent on merging iteration: 0.9997687339782715
On merging iteration 3
Numbers for each pattern pre-subsample: [1102, 89, 5, 2]
Numbers after subsampling: [300, 89, 5, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 1.8700766563415527 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.30229616165161133 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.08498620986938477 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.07151293754577637 s
Cluster sizes
[1102   89    5    2]
Cross-contamination matrix:
[[1.   0.66 0.61 0.47]
 [0.3  1.   0.17 0.19]
 [0.03 0.01 1.   0.  ]
 [0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.86 0.8  0.76]
 [0.86 1.   0.7  0.67]
 [0.8  0.7  1.   0.63]
 [0.76 0.67 0.63 1.  ]]
Inspecting pattern 1 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 877 out of 884 | elapsed:    1.9s remaining:    0.0s
[Parallel(n_jobs=4)]: Done 884 out of 884 | elapsed:    1.9s finished
[t-SNE] Computed conditional probabilities for sample 884 / 884
[t-SNE] Mean sigma: 0.260749
Beginning preprocessing + Leiden
Affmat shape: 884
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   36.0s
Quality: 0.47667312545956225
Quality: 0.47670287630566616
Quality: 0.4770969695226056
Quality: 0.47713399664839873
Quality: 0.4773697747025852
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   43.1s finished
Got subclusters: Counter({0: 201, 1: 180, 2: 141, 3: 88, 4: 82, 5: 56, 6: 47, 7: 34, 8: 28, 9: 27})
On merging iteration 1
Numbers for each pattern pre-subsample: [201, 180, 141, 88, 82, 56, 47, 34, 28, 27]
Numbers after subsampling: [201, 180, 141, 88, 82, 56, 47, 34, 28, 27]
Computing sims for pattern 0
Computed sims for pattern 0 in 3.474745035171509 s
Computing sims for pattern 1
Computed sims for pattern 1 in 4.355949640274048 s
Computing sims for pattern 2
Computed sims for pattern 2 in 2.9694976806640625 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.7957398891448975 s
Computing sims for pattern 4
Computed sims for pattern 4 in 2.3462276458740234 s
Computing sims for pattern 5
Computed sims for pattern 5 in 1.4226832389831543 s
Computing sims for pattern 6
Computed sims for pattern 6 in 1.3770601749420166 s
Computing sims for pattern 7
Computed sims for pattern 7 in 1.2855327129364014 s
Computing sims for pattern 8
Computed sims for pattern 8 in 1.0542638301849365 s
Computing sims for pattern 9
Computed sims for pattern 9 in 1.0290346145629883 s
Cluster sizes
[201 180 141  88  82  56  47  34  28  27]
Cross-contamination matrix:
[[1.   0.6  0.4  0.27 0.4  0.32 0.38 0.65 0.24 0.16]
 [0.87 1.   0.52 0.35 0.72 0.49 0.56 0.67 0.38 0.17]
 [0.69 0.58 1.   0.32 0.43 0.39 0.4  0.53 0.31 0.17]
 [1.   0.95 0.87 1.   0.85 0.8  0.86 0.92 0.71 0.55]
 [0.82 0.9  0.52 0.36 1.   0.55 0.57 0.65 0.45 0.19]
 [1.   0.99 0.77 0.57 0.88 1.   0.78 0.92 0.65 0.35]
 [1.   0.96 0.76 0.66 0.85 0.76 1.   0.89 0.67 0.44]
 [0.87 0.67 0.49 0.35 0.52 0.47 0.49 1.   0.33 0.22]
 [0.89 0.83 0.7  0.56 0.79 0.72 0.71 0.76 1.   0.39]
 [0.9  0.59 0.6  0.55 0.56 0.55 0.58 0.73 0.5  1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.95 0.91 0.95 0.94 0.95 0.95 0.96 0.92 0.91]
 [0.95 1.   0.9  0.92 0.97 0.95 0.95 0.93 0.91 0.83]
 [0.91 0.9  1.   0.89 0.87 0.89 0.89 0.89 0.85 0.83]
 [0.95 0.92 0.89 1.   0.89 0.9  0.91 0.91 0.86 0.88]
 [0.94 0.97 0.87 0.89 1.   0.93 0.92 0.91 0.9  0.82]
 [0.95 0.95 0.89 0.9  0.93 1.   0.92 0.93 0.9  0.85]
 [0.95 0.95 0.89 0.91 0.92 0.92 1.   0.93 0.89 0.86]
 [0.96 0.93 0.89 0.91 0.91 0.93 0.93 1.   0.89 0.88]
 [0.92 0.91 0.85 0.86 0.9  0.9  0.89 0.89 1.   0.83]
 [0.91 0.83 0.83 0.88 0.82 0.85 0.86 0.88 0.83 1.  ]]
Collapsing 1 & 4 with crosscontam 0.7208213305898492 and sim 0.9673860096366191
Collapsing 0 & 7 with crosscontam 0.6524842043529713 and sim 0.9617573810351844
Collapsing 0 & 6 with crosscontam 0.3789243908466926 and sim 0.9541019144920482
Collapsing 0 & 1 with crosscontam 0.5966892625585498 and sim 0.9537574679076041
Collapsing 1 & 5 with crosscontam 0.48756476582402497 and sim 0.9537142276120001
Collapsing 1 & 6 with crosscontam 0.5602760558036366 and sim 0.9519236784282666
Collapsing 0 & 5 with crosscontam 0.3215494060810804 and sim 0.9507247207946322
Collapsing 0 & 3 with crosscontam 0.27408197541491863 and sim 0.9457238020895002
Collapsing 0 & 4 with crosscontam 0.40120896254341587 and sim 0.9353607337301877
Collapsing 1 & 7 with crosscontam 0.6678025420087297 and sim 0.9338411167845486
Collapsing 5 & 7 with crosscontam 0.4656705242650694 and sim 0.9307872519867986
Collapsing 4 & 5 with crosscontam 0.5479522880233474 and sim 0.9300056112862165
Collapsing 6 & 7 with crosscontam 0.49395438069212805 and sim 0.9267146133128468
Collapsing 4 & 6 with crosscontam 0.5732090117362247 and sim 0.9247339938565848
Collapsing 5 & 6 with crosscontam 0.7633541150391943 and sim 0.9237223208368156
Collapsing 0 & 8 with crosscontam 0.2445446551835968 and sim 0.9170665422823028
Collapsing 1 & 3 with crosscontam 0.3539941856840003 and sim 0.9157836354675772
Collapsing 4 & 7 with crosscontam 0.5156627762084662 and sim 0.9146973198044483
Collapsing 0 & 2 with crosscontam 0.4011551774002655 and sim 0.9142434816568242
Collapsing 3 & 6 with crosscontam 0.6630086344504151 and sim 0.9138087838052653
Collapsing 3 & 7 with crosscontam 0.3475276631571156 and sim 0.9115189873603472
Collapsing 1 & 8 with crosscontam 0.37619547325102887 and sim 0.9098604470362202
Collapsing 5 & 8 with crosscontam 0.6541512650978758 and sim 0.901528633623206
Collapsing 1 & 2 with crosscontam 0.5181303933300256 and sim 0.8984280033239415
Collapsing 3 & 5 with crosscontam 0.5718843696163531 and sim 0.8977166688330109
Collapsing 6 & 8 with crosscontam 0.6707555853987763 and sim 0.8935255394920085
Collapsing 3 & 9 with crosscontam 0.5509161112727641 and sim 0.8760802705206209
Collapsing 3 & 8 with crosscontam 0.5609138947786907 and sim 0.8637285237535959
Trimming eliminated 0 seqlets out of 262
Trimming eliminated 0 seqlets out of 235
Trimming eliminated 0 seqlets out of 282
Trimming eliminated 0 seqlets out of 544
Trimming eliminated 0 seqlets out of 600
Trimming eliminated 0 seqlets out of 688
Trimming eliminated 0 seqlets out of 716
Removed 1 duplicate seqlets
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 855
Trimming eliminated 0 seqlets out of 882
Unmerged patterns remapping: OrderedDict()
Time spent on merging iteration: 7.015376567840576
On merging iteration 2
Numbers for each pattern pre-subsample: [882]
Numbers after subsampling: [300]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.2411346435546875e-05 s
Cluster sizes
[882]
Cross-contamination matrix:
[[1.]]
Pattern-to-pattern sim matrix:
[[1.]]
Inspecting pattern 2 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 592 out of 592 | elapsed:    0.9s finished
[t-SNE] Computed conditional probabilities for sample 592 / 592
[t-SNE] Mean sigma: 0.320591
Beginning preprocessing + Leiden
Affmat shape: 592
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   26.7s
Quality: 0.5086081924835515
Quality: 0.5086305745113916
Quality: 0.5086733049676823
Got subclusters: Counter({0: 151, 1: 142, 2: 114, 3: 98, 4: 43, 5: 28, 6: 14, 7: 2})
On merging iteration 1
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   31.5s finished
Numbers for each pattern pre-subsample: [151, 142, 114, 98, 43, 28, 14, 2]
Numbers after subsampling: [151, 142, 114, 98, 43, 28, 14, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.5122036933898926 s
Computing sims for pattern 1
Computed sims for pattern 1 in 2.1562893390655518 s
Computing sims for pattern 2
Computed sims for pattern 2 in 2.0147738456726074 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.5294322967529297 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.9523284435272217 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.7795848846435547 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.598092794418335 s
Computing sims for pattern 7
Computed sims for pattern 7 in 0.24756669998168945 s
Cluster sizes
[151 142 114  98  43  28  14   2]
Cross-contamination matrix:
[[1.   0.71 0.78 0.44 0.06 0.24 0.32 0.06]
 [0.82 1.   0.85 0.43 0.03 0.12 0.19 0.02]
 [0.6  0.56 1.   0.2  0.03 0.17 0.18 0.01]
 [0.78 0.68 0.58 1.   0.16 0.29 0.43 0.14]
 [0.26 0.14 0.27 0.2  1.   0.24 0.2  1.  ]
 [0.8  0.5  0.79 0.45 0.3  1.   0.58 0.17]
 [0.66 0.32 0.52 0.38 0.06 0.24 1.   0.03]
 [0.   0.   0.   0.   0.27 0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.82 0.8  0.72 0.26 0.57 0.67 0.24]
 [0.82 1.   0.75 0.68 0.25 0.46 0.35 0.19]
 [0.8  0.75 1.   0.52 0.21 0.49 0.53 0.2 ]
 [0.72 0.68 0.52 1.   0.2  0.36 0.44 0.18]
 [0.26 0.25 0.21 0.2  1.   0.39 0.29 0.89]
 [0.57 0.46 0.49 0.36 0.39 1.   0.38 0.29]
 [0.67 0.35 0.53 0.44 0.29 0.38 1.   0.33]
 [0.24 0.19 0.2  0.18 0.89 0.29 0.33 1.  ]]
Inspecting pattern 3 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 517 out of 517 | elapsed:    0.7s finished
[t-SNE] Computed conditional probabilities for sample 517 / 517
[t-SNE] Mean sigma: 0.265747
Beginning preprocessing + Leiden
Affmat shape: 517
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   25.7s
Quality: 0.4215357708498422
Got subclusters: Counter({0: 132, 1: 100, 2: 98, 3: 85, 4: 75, 5: 17, 6: 10})
On merging iteration 1
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   30.5s finished
Numbers for each pattern pre-subsample: [132, 100, 98, 85, 75, 17, 10]
Numbers after subsampling: [132, 100, 98, 85, 75, 17, 10]
Computing sims for pattern 0
Computed sims for pattern 0 in 1.610112190246582 s
Computing sims for pattern 1
Computed sims for pattern 1 in 1.39109206199646 s
Computing sims for pattern 2
Computed sims for pattern 2 in 1.3290481567382812 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.6127455234527588 s
Computing sims for pattern 4
Computed sims for pattern 4 in 1.3271996974945068 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.49945545196533203 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.4165019989013672 s
Cluster sizes
[132 100  98  85  75  17  10]
Cross-contamination matrix:
[[1.   0.87 0.72 0.62 0.87 0.67 0.54]
 [0.44 1.   0.68 0.43 0.61 0.38 0.21]
 [0.42 0.83 1.   0.47 0.62 0.42 0.23]
 [0.72 0.99 0.87 1.   0.93 0.49 0.4 ]
 [0.5  0.7  0.56 0.43 1.   0.26 0.24]
 [0.48 0.6  0.54 0.24 0.4  1.   0.24]
 [0.55 0.59 0.45 0.33 0.61 0.4  1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.88 0.81 0.85 0.88 0.88 0.87]
 [0.88 1.   0.91 0.87 0.91 0.85 0.82]
 [0.81 0.91 1.   0.81 0.84 0.85 0.71]
 [0.85 0.87 0.81 1.   0.86 0.76 0.71]
 [0.88 0.91 0.84 0.86 1.   0.78 0.82]
 [0.88 0.85 0.85 0.76 0.78 1.   0.76]
 [0.87 0.82 0.71 0.71 0.82 0.76 1.  ]]
Collapsing 1 & 4 with crosscontam 0.6133555733333333 and sim 0.9091366710642803
Collapsing 1 & 2 with crosscontam 0.6805592653061225 and sim 0.9064105525512718
Collapsing 0 & 4 with crosscontam 0.5008728350168348 and sim 0.882909953455935
Collapsing 0 & 6 with crosscontam 0.5393756782703063 and sim 0.8747228901385056
Aborting collapse as 2 & 6 have cross-contam 0.3414266164608285 and sim 0.708312679625633
Collapsing 0 & 3 with crosscontam 0.6169264554473597 and sim 0.8544817214029178
Trimming eliminated 0 seqlets out of 175
Trimming eliminated 0 seqlets out of 273
Trimming eliminated 0 seqlets out of 405
Skipped 23 seqlets that went over the sequence edge during flank expansion
Skipped 3 seqlets that went over sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 464
Skipped 107 seqlets that went over the sequence edge during flank expansion
Unmerged patterns remapping: OrderedDict([(5, 1), (6, 2)])
Time spent on merging iteration: 2.5687999725341797
On merging iteration 2
Numbers for each pattern pre-subsample: [357, 17, 10]
Numbers after subsampling: [300, 17, 10]
Computing sims for pattern 0
Computed sims for pattern 0 in 1.3670148849487305 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.1457960605621338 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.11361408233642578 s
Cluster sizes
[357  17  10]
Cross-contamination matrix:
[[1.   0.75 0.6 ]
 [0.46 1.   0.24]
 [0.51 0.4  1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.89 0.85]
 [0.89 1.   0.76]
 [0.85 0.76 1.  ]]
Inspecting pattern 4 for spurious merging
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 312 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 458 out of 458 | elapsed:    0.6s finished
[t-SNE] Computed conditional probabilities for sample 458 / 458
[t-SNE] Mean sigma: 0.302673
Beginning preprocessing + Leiden
Affmat shape: 458
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   24.6s
Quality: 0.44792454849730057
Quality: 0.447942791146518
Quality: 0.448176746757894
Quality: 0.44855873050336187
Quality: 0.44940564299037006
Quality: 0.44963120001191065
Quality: 0.4497066442344871
Got subclusters: Counter({0: 112, 1: 111, 2: 97, 3: 55, 4: 36, 5: 12, 6: 10, 8: 8, 7: 8, 9: 7, 10: 2})
On merging iteration 1
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   29.0s finished
Numbers for each pattern pre-subsample: [112, 111, 97, 55, 36, 12, 10, 8, 8, 7, 2]
Numbers after subsampling: [112, 111, 97, 55, 36, 12, 10, 8, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 2.7441911697387695 s
Computing sims for pattern 1
Computed sims for pattern 1 in 2.6525611877441406 s
Computing sims for pattern 2
Computed sims for pattern 2 in 2.084895133972168 s
Computing sims for pattern 3
Computed sims for pattern 3 in 1.6866869926452637 s
Computing sims for pattern 4
Computed sims for pattern 4 in 1.36021089553833 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.7050774097442627 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.6231496334075928 s
Computing sims for pattern 7
Computed sims for pattern 7 in 0.551872968673706 s
Computing sims for pattern 8
Computed sims for pattern 8 in 0.533191442489624 s
Computing sims for pattern 9
Computed sims for pattern 9 in 0.4920511245727539 s
Computing sims for pattern 10
Computed sims for pattern 10 in 0.2679424285888672 s
Cluster sizes
[112 111  97  55  36  12  10   8   8   7   2]
Cross-contamination matrix:
[[1.   0.72 0.7  0.39 0.4  0.25 0.35 0.57 0.34 0.28 0.43]
 [0.45 1.   0.39 0.18 0.41 0.13 0.19 0.41 0.28 0.06 0.22]
 [0.87 0.83 1.   0.33 0.68 0.37 0.42 0.68 0.46 0.26 0.48]
 [0.46 0.52 0.28 1.   0.24 0.14 0.28 0.34 0.32 0.16 0.13]
 [0.38 0.67 0.51 0.18 1.   0.21 0.3  0.48 0.34 0.11 0.31]
 [0.68 0.83 0.76 0.52 0.72 1.   0.57 0.69 0.62 0.58 0.56]
 [0.54 0.76 0.5  0.34 0.48 0.19 1.   0.58 0.43 0.09 0.36]
 [0.39 0.55 0.35 0.13 0.33 0.11 0.21 1.   0.23 0.07 0.14]
 [0.18 0.4  0.19 0.13 0.2  0.05 0.12 0.23 1.   0.02 0.04]
 [0.54 0.52 0.4  0.35 0.32 0.37 0.26 0.39 0.29 1.   0.29]
 [0.09 0.11 0.08 0.   0.   0.   0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.89 0.89 0.84 0.76 0.69 0.79 0.82 0.75 0.69 0.64]
 [0.89 1.   0.9  0.83 0.89 0.79 0.88 0.91 0.84 0.67 0.69]
 [0.89 0.9  1.   0.73 0.87 0.8  0.81 0.84 0.79 0.58 0.65]
 [0.84 0.83 0.73 1.   0.67 0.62 0.72 0.74 0.71 0.58 0.53]
 [0.76 0.89 0.87 0.67 1.   0.74 0.8  0.82 0.76 0.54 0.65]
 [0.69 0.79 0.8  0.62 0.74 1.   0.68 0.72 0.7  0.62 0.57]
 [0.79 0.88 0.81 0.72 0.8  0.68 1.   0.8  0.74 0.55 0.66]
 [0.82 0.91 0.84 0.74 0.82 0.72 0.8  1.   0.76 0.6  0.63]
 [0.75 0.84 0.79 0.71 0.76 0.7  0.74 0.76 1.   0.56 0.51]
 [0.69 0.67 0.58 0.58 0.54 0.62 0.55 0.6  0.56 1.   0.51]
 [0.64 0.69 0.65 0.53 0.65 0.57 0.66 0.63 0.51 0.51 1.  ]]
Collapsing 1 & 7 with crosscontam 0.4148030426335758 and sim 0.9103764003893328
Collapsing 1 & 2 with crosscontam 0.39226355194817364 and sim 0.9013274050458031
Collapsing 0 & 2 with crosscontam 0.6991201574287671 and sim 0.8891089932917386
Collapsing 2 & 4 with crosscontam 0.5076320832095937 and sim 0.8654285726070339
Trimming eliminated 0 seqlets out of 119
Trimming eliminated 0 seqlets out of 216
Trimming eliminated 0 seqlets out of 328
Skipped 33 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 331
Unmerged patterns remapping: OrderedDict([(3, 1), (5, 2), (6, 3), (8, 4), (9, 5), (10, 6)])
Time spent on merging iteration: 1.9417147636413574
On merging iteration 2
Numbers for each pattern pre-subsample: [331, 55, 12, 10, 8, 7, 2]
Numbers after subsampling: [300, 55, 12, 10, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 3.8693764209747314 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.24571537971496582 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.1117398738861084 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.10693573951721191 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.10177755355834961 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.08361387252807617 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.06667828559875488 s
Cluster sizes
[331  55  12  10   8   7   2]
Cross-contamination matrix:
[[1.   0.51 0.43 0.62 0.62 0.33 0.59]
 [0.4  1.   0.14 0.28 0.32 0.16 0.13]
 [0.76 0.52 1.   0.57 0.62 0.58 0.56]
 [0.6  0.34 0.19 1.   0.43 0.09 0.36]
 [0.26 0.13 0.05 0.12 1.   0.02 0.04]
 [0.47 0.35 0.37 0.26 0.29 1.   0.29]
 [0.08 0.   0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.82 0.8  0.87 0.83 0.66 0.69]
 [0.82 1.   0.62 0.72 0.71 0.58 0.53]
 [0.8  0.62 1.   0.68 0.7  0.62 0.57]
 [0.87 0.72 0.68 1.   0.74 0.55 0.66]
 [0.83 0.71 0.7  0.74 1.   0.56 0.51]
 [0.66 0.58 0.62 0.55 0.56 1.   0.51]
 [0.69 0.53 0.57 0.66 0.51 0.51 1.  ]]
Collapsing 0 & 3 with crosscontam 0.6003200000000002 and sim 0.8690583254949964
Trimming eliminated 0 seqlets out of 341
Unmerged patterns remapping: OrderedDict([(1, 1), (2, 2), (4, 3), (5, 4), (6, 5)])
Time spent on merging iteration: 0.3192868232727051
On merging iteration 3
Numbers for each pattern pre-subsample: [341, 55, 12, 8, 7, 2]
Numbers after subsampling: [300, 55, 12, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 3.148636817932129 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.24092864990234375 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.11616969108581543 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.10202264785766602 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.09877777099609375 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.07165169715881348 s
Cluster sizes
[341  55  12   8   7   2]
Cross-contamination matrix:
[[1.   0.52 0.44 0.64 0.34 0.6 ]
 [0.4  1.   0.14 0.32 0.16 0.13]
 [0.75 0.52 1.   0.62 0.58 0.56]
 [0.25 0.13 0.05 1.   0.02 0.04]
 [0.46 0.35 0.37 0.29 1.   0.29]
 [0.08 0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.82 0.8  0.83 0.66 0.69]
 [0.82 1.   0.62 0.71 0.58 0.53]
 [0.8  0.62 1.   0.7  0.62 0.57]
 [0.83 0.71 0.7  1.   0.56 0.51]
 [0.66 0.58 0.62 0.56 1.   0.51]
 [0.69 0.53 0.57 0.51 0.51 1.  ]]
Merging on 22 clusters
MEMORY 14.247133184
On merging iteration 1
Numbers for each pattern pre-subsample: [1102, 89, 5, 2, 882, 151, 142, 114, 98, 43, 28, 14, 2, 357, 17, 10, 341, 55, 12, 8, 7, 2]
Numbers after subsampling: [300, 89, 5, 2, 300, 151, 142, 114, 98, 43, 28, 14, 2, 300, 17, 10, 300, 55, 12, 8, 7, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 14.187085628509521 s
Computing sims for pattern 1
Computed sims for pattern 1 in 4.300368547439575 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.9889605045318604 s
Computing sims for pattern 3
Computed sims for pattern 3 in 0.779677152633667 s
Computing sims for pattern 4
Computed sims for pattern 4 in 14.201063394546509 s
Computing sims for pattern 5
Computed sims for pattern 5 in 7.423771858215332 s
Computing sims for pattern 6
Computed sims for pattern 6 in 6.514514207839966 s
Computing sims for pattern 7
Computed sims for pattern 7 in 6.02603006362915 s
Computing sims for pattern 8
Computed sims for pattern 8 in 4.6973371505737305 s
Computing sims for pattern 9
Computed sims for pattern 9 in 3.1583895683288574 s
Computing sims for pattern 10
Computed sims for pattern 10 in 2.4132659435272217 s
Computing sims for pattern 11
Computed sims for pattern 11 in 1.8065330982208252 s
Computing sims for pattern 12
Computed sims for pattern 12 in 0.7158913612365723 s
Computing sims for pattern 13
Computed sims for pattern 13 in 14.120256185531616 s
Computing sims for pattern 14
Computed sims for pattern 14 in 1.7865524291992188 s
Computing sims for pattern 15
Computed sims for pattern 15 in 1.4853260517120361 s
Computing sims for pattern 16
Computed sims for pattern 16 in 13.684163093566895 s
Computing sims for pattern 17
Computed sims for pattern 17 in 4.000203847885132 s
Computing sims for pattern 18
Computed sims for pattern 18 in 1.6517527103424072 s
Computing sims for pattern 19
Computed sims for pattern 19 in 1.2630078792572021 s
Computing sims for pattern 20
Computed sims for pattern 20 in 1.1634864807128906 s
Computing sims for pattern 21
Computed sims for pattern 21 in 0.7910940647125244 s
Cluster sizes
[1102   89    5    2  882  151  142  114   98   43   28   14    2  357
   17   10  341   55   12    8    7    2]
Cross-contamination matrix:
[[1.   0.66 0.61 0.47 0.78 0.04 0.3  0.3  0.03 0.   0.   0.   0.   0.62
  0.63 0.43 0.73 0.53 0.34 0.6  0.13 0.44]
 [0.3  1.   0.17 0.19 0.2  0.02 0.07 0.09 0.01 0.   0.   0.   0.   0.18
  0.15 0.12 0.27 0.14 0.07 0.24 0.04 0.09]
 [0.03 0.01 1.   0.   0.02 0.   0.   0.02 0.   0.   0.   0.   0.   0.01
  0.03 0.   0.04 0.01 0.02 0.04 0.22 0.  ]
 [0.   0.   0.   1.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.81 0.5  0.44 0.5  1.   0.06 0.33 0.34 0.05 0.   0.   0.   0.   0.6
  0.7  0.36 0.62 0.41 0.24 0.46 0.13 0.28]
 [0.18 0.19 0.03 0.23 0.2  1.   0.71 0.78 0.44 0.06 0.24 0.32 0.06 0.27
  0.57 0.13 0.05 0.2  0.13 0.05 0.05 0.02]
 [0.63 0.52 0.57 0.61 0.65 0.82 1.   0.85 0.43 0.03 0.12 0.19 0.02 0.64
  0.94 0.48 0.53 0.56 0.3  0.45 0.37 0.36]
 [0.4  0.33 0.36 0.24 0.42 0.6  0.56 1.   0.2  0.03 0.17 0.18 0.01 0.48
  0.93 0.31 0.3  0.32 0.18 0.22 0.1  0.29]
 [0.44 0.37 0.33 0.37 0.47 0.78 0.68 0.58 1.   0.16 0.29 0.43 0.14 0.38
  0.48 0.32 0.37 0.35 0.32 0.3  0.23 0.35]
 [0.16 0.26 0.2  0.32 0.18 0.26 0.14 0.27 0.2  1.   0.24 0.2  1.   0.07
  0.17 0.06 0.2  0.15 0.13 0.55 0.22 0.42]
 [0.27 0.27 0.82 0.5  0.34 0.8  0.5  0.79 0.45 0.3  1.   0.58 0.17 0.41
  0.39 0.23 0.22 0.24 0.73 0.27 0.72 0.17]
 [0.06 0.03 0.08 0.02 0.05 0.66 0.32 0.52 0.38 0.06 0.24 1.   0.03 0.04
  0.05 0.03 0.05 0.09 0.07 0.04 0.05 0.02]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.27 0.   0.   1.   0.
  0.   0.   0.   0.   0.   0.   0.   0.  ]
 [0.57 0.4  0.34 0.33 0.54 0.13 0.32 0.4  0.06 0.   0.03 0.01 0.   1.
  0.75 0.6  0.43 0.33 0.19 0.38 0.14 0.26]
 [0.29 0.11 0.16 0.05 0.33 0.09 0.3  0.61 0.   0.   0.   0.   0.   0.46
  1.   0.24 0.16 0.09 0.03 0.09 0.01 0.06]
 [0.27 0.21 0.2  0.23 0.26 0.01 0.08 0.16 0.   0.   0.   0.   0.   0.51
  0.4  1.   0.19 0.15 0.11 0.25 0.05 0.19]
 [0.69 0.54 0.55 0.6  0.56 0.01 0.22 0.2  0.05 0.01 0.01 0.01 0.02 0.44
  0.41 0.35 1.   0.52 0.44 0.64 0.34 0.6 ]
 [0.36 0.26 0.27 0.18 0.27 0.03 0.14 0.13 0.01 0.   0.   0.   0.   0.21
  0.22 0.23 0.4  1.   0.14 0.32 0.16 0.13]
 [0.66 0.53 0.68 0.56 0.58 0.18 0.32 0.34 0.14 0.02 0.3  0.04 0.01 0.48
  0.51 0.4  0.75 0.52 1.   0.62 0.58 0.56]
 [0.23 0.16 0.24 0.13 0.13 0.   0.02 0.01 0.   0.   0.   0.   0.   0.11
  0.08 0.06 0.25 0.13 0.05 1.   0.02 0.04]
 [0.38 0.23 1.   0.39 0.35 0.   0.15 0.04 0.01 0.   0.19 0.   0.   0.26
  0.3  0.28 0.46 0.35 0.37 0.29 1.   0.29]
 [0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.08 0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.86 0.8  0.76 0.92 0.37 0.75 0.68 0.49 0.21 0.33 0.35 0.21 0.87
  0.83 0.85 0.92 0.8  0.76 0.82 0.67 0.7 ]
 [0.86 1.   0.7  0.67 0.78 0.35 0.62 0.58 0.39 0.24 0.28 0.27 0.16 0.74
  0.69 0.72 0.81 0.69 0.64 0.74 0.56 0.61]
 [0.8  0.7  1.   0.63 0.74 0.29 0.61 0.54 0.37 0.29 0.56 0.32 0.24 0.68
  0.67 0.68 0.77 0.66 0.67 0.71 0.88 0.62]
 [0.76 0.67 0.63 1.   0.75 0.42 0.68 0.52 0.45 0.34 0.42 0.3  0.31 0.66
  0.67 0.65 0.73 0.61 0.6  0.68 0.58 0.52]
 [0.92 0.78 0.74 0.75 1.   0.41 0.79 0.71 0.54 0.26 0.41 0.36 0.22 0.85
  0.86 0.77 0.86 0.75 0.71 0.74 0.67 0.66]
 [0.37 0.35 0.29 0.42 0.41 1.   0.82 0.8  0.72 0.26 0.57 0.67 0.24 0.46
  0.6  0.34 0.35 0.5  0.3  0.28 0.26 0.24]
 [0.75 0.62 0.61 0.68 0.79 0.82 1.   0.75 0.68 0.25 0.46 0.35 0.19 0.71
  0.74 0.63 0.67 0.69 0.5  0.59 0.56 0.42]
 [0.68 0.58 0.54 0.52 0.71 0.8  0.75 1.   0.52 0.21 0.49 0.53 0.2  0.75
  0.9  0.59 0.61 0.61 0.49 0.47 0.34 0.52]
 [0.49 0.39 0.37 0.45 0.54 0.72 0.68 0.52 1.   0.2  0.36 0.44 0.18 0.43
  0.5  0.37 0.41 0.42 0.37 0.31 0.25 0.37]
 [0.21 0.24 0.29 0.34 0.26 0.26 0.25 0.21 0.2  1.   0.39 0.29 0.89 0.16
  0.21 0.18 0.19 0.17 0.3  0.33 0.34 0.26]
 [0.33 0.28 0.56 0.42 0.41 0.57 0.46 0.49 0.36 0.39 1.   0.38 0.29 0.34
  0.35 0.28 0.28 0.3  0.56 0.3  0.62 0.28]
 [0.35 0.27 0.32 0.3  0.36 0.67 0.35 0.53 0.44 0.29 0.38 1.   0.33 0.29
  0.31 0.28 0.3  0.3  0.39 0.28 0.26 0.19]
 [0.21 0.16 0.24 0.31 0.22 0.24 0.19 0.2  0.18 0.89 0.29 0.33 1.   0.16
  0.19 0.18 0.19 0.12 0.31 0.26 0.25 0.16]
 [0.87 0.74 0.68 0.66 0.85 0.46 0.71 0.75 0.43 0.16 0.34 0.29 0.16 1.
  0.89 0.85 0.81 0.69 0.65 0.73 0.59 0.61]
 [0.83 0.69 0.67 0.67 0.86 0.6  0.74 0.9  0.5  0.21 0.35 0.31 0.19 0.89
  1.   0.76 0.76 0.67 0.62 0.65 0.58 0.59]
 [0.85 0.72 0.68 0.65 0.77 0.34 0.63 0.59 0.37 0.18 0.28 0.28 0.18 0.85
  0.76 1.   0.8  0.69 0.65 0.7  0.58 0.63]
 [0.92 0.81 0.77 0.73 0.86 0.35 0.67 0.61 0.41 0.19 0.28 0.3  0.19 0.81
  0.76 0.8  1.   0.82 0.8  0.83 0.66 0.69]
 [0.8  0.69 0.66 0.61 0.75 0.5  0.69 0.61 0.42 0.17 0.3  0.3  0.12 0.69
  0.67 0.69 0.82 1.   0.62 0.71 0.58 0.53]
 [0.76 0.64 0.67 0.6  0.71 0.3  0.5  0.49 0.37 0.3  0.56 0.39 0.31 0.65
  0.62 0.65 0.8  0.62 1.   0.7  0.62 0.57]
 [0.82 0.74 0.71 0.68 0.74 0.28 0.59 0.47 0.31 0.33 0.3  0.28 0.26 0.73
  0.65 0.7  0.83 0.71 0.7  1.   0.56 0.51]
 [0.67 0.56 0.88 0.58 0.67 0.26 0.56 0.34 0.25 0.34 0.62 0.26 0.25 0.59
  0.58 0.58 0.66 0.58 0.62 0.56 1.   0.51]
 [0.7  0.61 0.62 0.52 0.66 0.24 0.42 0.52 0.37 0.26 0.28 0.19 0.16 0.61
  0.59 0.63 0.69 0.53 0.57 0.51 0.51 1.  ]]
Collapsing 0 & 16 with crosscontam 0.6854722049382715 and sim 0.9233883441095919
Collapsing 0 & 4 with crosscontam 0.7821088138271606 and sim 0.9150010033144051
Collapsing 7 & 14 with crosscontam 0.6115890173224634 and sim 0.9028928001880183
Collapsing 0 & 13 with crosscontam 0.5656703708641977 and sim 0.8698389947881155
Collapsing 4 & 16 with crosscontam 0.5602700637037039 and sim 0.8555642993111345
Skipped 5 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 1438
Skipped 81 seqlets that went over the sequence edge during flank expansion
Skipped 4 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 2235
Skipped 81 seqlets that went over the sequence edge during flank expansion
Trimming eliminated 0 seqlets out of 131
Skipped 3 seqlets that went over the sequence edge during flank expansion
Skipped 2 seqlets that went over sequence edge during flank expansion
Skipped 1 due to duplicates
Trimming eliminated 0 seqlets out of 2505
Skipped 146 seqlets that went over the sequence edge during flank expansion
Unmerged patterns remapping: OrderedDict([(1, 5), (2, 14), (3, 15), (5, 1), (6, 2), (8, 4), (9, 7), (10, 8), (11, 9), (12, 16), (15, 11), (17, 6), (18, 10), (19, 12), (20, 13), (21, 17)])
Time spent on merging iteration: 11.327602863311768
On merging iteration 2
Numbers for each pattern pre-subsample: [2359, 151, 142, 131, 98, 89, 55, 43, 28, 14, 12, 10, 8, 7, 5, 2, 2, 2]
Numbers after subsampling: [300, 151, 142, 131, 98, 89, 55, 43, 28, 14, 12, 10, 8, 7, 5, 2, 2, 2]
Computing sims for pattern 0
Computed sims for pattern 0 in 11.164413452148438 s
Computing sims for pattern 1
Computed sims for pattern 1 in 0.8772587776184082 s
Computing sims for pattern 2
Computed sims for pattern 2 in 0.7360920906066895 s
Computing sims for pattern 3
Computed sims for pattern 3 in 4.403687953948975 s
Computing sims for pattern 4
Computed sims for pattern 4 in 0.5462625026702881 s
Computing sims for pattern 5
Computed sims for pattern 5 in 0.7667226791381836 s
Computing sims for pattern 6
Computed sims for pattern 6 in 0.4166250228881836 s
Computing sims for pattern 7
Computed sims for pattern 7 in 0.3210892677307129 s
Computing sims for pattern 8
Computed sims for pattern 8 in 0.28917598724365234 s
Computing sims for pattern 9
Computed sims for pattern 9 in 0.2213582992553711 s
Computing sims for pattern 10
Computed sims for pattern 10 in 0.20161747932434082 s
Computing sims for pattern 11
Computed sims for pattern 11 in 0.19224858283996582 s
Computing sims for pattern 12
Computed sims for pattern 12 in 0.17246055603027344 s
Computing sims for pattern 13
Computed sims for pattern 13 in 0.15851283073425293 s
Computing sims for pattern 14
Computed sims for pattern 14 in 0.13884472846984863 s
Computing sims for pattern 15
Computed sims for pattern 15 in 0.10401153564453125 s
Computing sims for pattern 16
Computed sims for pattern 16 in 0.10087108612060547 s
Computing sims for pattern 17
Computed sims for pattern 17 in 0.09780025482177734 s
Cluster sizes
[2359  151  142  131   98   89   55   43   28   14   12   10    8    7
    5    2    2    2]
Cross-contamination matrix:
[[1.   0.07 0.36 0.46 0.04 0.67 0.56 0.   0.01 0.   0.34 0.54 0.62 0.18
  0.61 0.72 0.   0.36]
 [0.2  1.   0.71 0.76 0.44 0.19 0.2  0.06 0.24 0.32 0.13 0.13 0.05 0.05
  0.03 0.23 0.06 0.02]
 [0.64 0.82 1.   0.86 0.43 0.52 0.56 0.03 0.12 0.19 0.3  0.48 0.45 0.37
  0.57 0.61 0.02 0.36]
 [0.46 0.58 0.58 1.   0.19 0.34 0.32 0.02 0.16 0.16 0.19 0.34 0.27 0.14
  0.36 0.29 0.01 0.27]
 [0.43 0.78 0.68 0.57 1.   0.37 0.35 0.16 0.29 0.43 0.32 0.32 0.3  0.23
  0.33 0.37 0.14 0.35]
 [0.25 0.02 0.07 0.09 0.01 1.   0.14 0.   0.   0.   0.07 0.12 0.24 0.04
  0.17 0.19 0.   0.09]
 [0.33 0.03 0.14 0.15 0.01 0.26 1.   0.   0.   0.   0.14 0.23 0.32 0.16
  0.27 0.18 0.   0.13]
 [0.17 0.26 0.14 0.26 0.2  0.26 0.15 1.   0.24 0.2  0.13 0.06 0.55 0.22
  0.2  0.32 1.   0.42]
 [0.33 0.8  0.5  0.78 0.45 0.27 0.24 0.3  1.   0.58 0.73 0.23 0.27 0.72
  0.82 0.5  0.17 0.17]
 [0.05 0.66 0.32 0.49 0.38 0.03 0.09 0.06 0.24 1.   0.07 0.03 0.04 0.05
  0.08 0.02 0.03 0.02]
 [0.62 0.18 0.32 0.36 0.14 0.53 0.52 0.02 0.3  0.04 1.   0.4  0.62 0.58
  0.68 0.56 0.01 0.56]
 [0.34 0.01 0.08 0.19 0.   0.21 0.15 0.   0.   0.   0.11 1.   0.25 0.05
  0.2  0.23 0.   0.19]
 [0.18 0.   0.02 0.03 0.   0.16 0.13 0.   0.   0.   0.05 0.06 1.   0.02
  0.24 0.13 0.   0.04]
 [0.37 0.   0.15 0.16 0.01 0.23 0.35 0.   0.19 0.   0.37 0.28 0.29 1.
  1.   0.39 0.   0.29]
 [0.03 0.   0.   0.02 0.   0.01 0.01 0.   0.   0.   0.02 0.   0.04 0.22
  1.   0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   1.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.27 0.   0.   0.   0.   0.   0.
  0.   0.   1.   0.  ]
 [0.01 0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.4  0.79 0.75 0.52 0.84 0.79 0.24 0.37 0.36 0.75 0.83 0.8  0.68
  0.79 0.78 0.22 0.7 ]
 [0.4  1.   0.82 0.78 0.72 0.35 0.5  0.26 0.57 0.67 0.3  0.34 0.28 0.26
  0.29 0.42 0.24 0.24]
 [0.79 0.82 1.   0.75 0.68 0.62 0.69 0.25 0.46 0.35 0.5  0.63 0.59 0.56
  0.61 0.68 0.19 0.42]
 [0.75 0.78 0.75 1.   0.5  0.6  0.62 0.19 0.47 0.49 0.51 0.63 0.55 0.48
  0.56 0.55 0.18 0.53]
 [0.52 0.72 0.68 0.5  1.   0.39 0.42 0.2  0.36 0.44 0.37 0.37 0.31 0.25
  0.37 0.45 0.18 0.37]
 [0.84 0.35 0.62 0.6  0.39 1.   0.69 0.24 0.28 0.27 0.64 0.72 0.74 0.56
  0.7  0.67 0.16 0.61]
 [0.79 0.5  0.69 0.62 0.42 0.69 1.   0.17 0.3  0.3  0.62 0.69 0.71 0.58
  0.66 0.61 0.12 0.53]
 [0.24 0.26 0.25 0.19 0.2  0.24 0.17 1.   0.39 0.29 0.3  0.18 0.33 0.34
  0.29 0.34 0.89 0.26]
 [0.37 0.57 0.46 0.47 0.36 0.28 0.3  0.39 1.   0.38 0.56 0.28 0.3  0.62
  0.56 0.42 0.29 0.28]
 [0.36 0.67 0.35 0.49 0.44 0.27 0.3  0.29 0.38 1.   0.39 0.28 0.28 0.26
  0.32 0.3  0.33 0.19]
 [0.75 0.3  0.5  0.51 0.37 0.64 0.62 0.3  0.56 0.39 1.   0.65 0.7  0.62
  0.67 0.6  0.31 0.57]
 [0.83 0.34 0.63 0.63 0.37 0.72 0.69 0.18 0.28 0.28 0.65 1.   0.7  0.58
  0.68 0.65 0.18 0.63]
 [0.8  0.28 0.59 0.55 0.31 0.74 0.71 0.33 0.3  0.28 0.7  0.7  1.   0.56
  0.71 0.68 0.26 0.51]
 [0.68 0.26 0.56 0.48 0.25 0.56 0.58 0.34 0.62 0.26 0.62 0.58 0.56 1.
  0.88 0.58 0.25 0.51]
 [0.79 0.29 0.61 0.56 0.37 0.7  0.66 0.29 0.56 0.32 0.67 0.68 0.71 0.88
  1.   0.63 0.24 0.62]
 [0.78 0.42 0.68 0.55 0.45 0.67 0.61 0.34 0.42 0.3  0.6  0.65 0.68 0.58
  0.63 1.   0.31 0.52]
 [0.22 0.24 0.19 0.18 0.18 0.16 0.12 0.89 0.29 0.33 0.31 0.18 0.26 0.25
  0.24 0.31 1.   0.16]
 [0.7  0.24 0.42 0.53 0.37 0.61 0.53 0.26 0.28 0.19 0.57 0.63 0.51 0.51
  0.62 0.52 0.16 1.  ]]
Got 18 patterns after merging
MEMORY 14.247153664
Performing filtering
MEMORY 14.247153664
Got 8 patterns after filtering
MEMORY 14.247153664
Total time taken is 3410.3s
MEMORY 14.247153664
Applying subclustering to the final motifs
On pattern 0
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 200 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done 1400 tasks      | elapsed:    9.1s
[Parallel(n_jobs=4)]: Done 2359 out of 2359 | elapsed:   15.1s finished
/users/msharmin/anaconda2/envs/aitac/lib/python3.7/site-packages/sklearn/manifold/_t_sne.py:699: FutureWarning: 'square_distances' has been introduced in 0.24 to help phase out legacy squaring behavior. The 'legacy' setting will be removed in 1.1 (renaming of 0.26), and the default setting will be changed to True. In 1.3, 'square_distances' will be removed altogether, and distances will be squared by default. Set 'square_distances'=True to silence this warning.
  FutureWarning
[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 2359 samples in 0.067s...
[t-SNE] Computed neighbors for 2359 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2359
/users/msharmin/anaconda2/envs/aitac/lib/python3.7/site-packages/sklearn/neighbors/_base.py:176: EfficiencyWarning: Precomputed sparse input was not sorted by data.
  EfficiencyWarning)
[t-SNE] Computed conditional probabilities for sample 2000 / 2359
[t-SNE] Computed conditional probabilities for sample 2359 / 2359
[t-SNE] Mean sigma: 0.255017
[t-SNE] Computed conditional probabilities in 0.211s
[t-SNE] Iteration 50: error = 71.8576050, gradient norm = 0.0506708 (50 iterations in 0.521s)
[t-SNE] Iteration 100: error = 72.0109711, gradient norm = 0.0405584 (50 iterations in 0.506s)
[t-SNE] Iteration 150: error = 72.0096512, gradient norm = 0.0241647 (50 iterations in 0.600s)
[t-SNE] Iteration 200: error = 71.8638992, gradient norm = 0.0572998 (50 iterations in 0.501s)
[t-SNE] Iteration 250: error = 71.8495865, gradient norm = 0.0415361 (50 iterations in 0.398s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 71.849586
[t-SNE] Iteration 300: error = 1.9407815, gradient norm = 0.0012448 (50 iterations in 0.358s)
[t-SNE] Iteration 350: error = 1.8094492, gradient norm = 0.0003967 (50 iterations in 0.314s)
[t-SNE] Iteration 400: error = 1.7587214, gradient norm = 0.0002612 (50 iterations in 0.319s)
[t-SNE] Iteration 450: error = 1.7360786, gradient norm = 0.0001801 (50 iterations in 0.318s)
[t-SNE] Iteration 500: error = 1.7242937, gradient norm = 0.0001362 (50 iterations in 0.324s)
[t-SNE] Iteration 550: error = 1.7166767, gradient norm = 0.0001017 (50 iterations in 0.314s)
[t-SNE] Iteration 600: error = 1.7128193, gradient norm = 0.0001010 (50 iterations in 0.317s)
[t-SNE] Iteration 650: error = 1.7098056, gradient norm = 0.0000878 (50 iterations in 0.324s)
[t-SNE] Iteration 700: error = 1.7069721, gradient norm = 0.0000848 (50 iterations in 0.321s)
[t-SNE] Iteration 750: error = 1.7046078, gradient norm = 0.0000652 (50 iterations in 0.324s)
[t-SNE] Iteration 800: error = 1.7029958, gradient norm = 0.0000631 (50 iterations in 0.328s)
[t-SNE] Iteration 850: error = 1.7014551, gradient norm = 0.0000614 (50 iterations in 0.331s)
[t-SNE] Iteration 900: error = 1.7002703, gradient norm = 0.0000660 (50 iterations in 0.332s)
[t-SNE] Iteration 950: error = 1.6994607, gradient norm = 0.0000514 (50 iterations in 0.324s)
[t-SNE] Iteration 1000: error = 1.6988314, gradient norm = 0.0000472 (50 iterations in 0.320s)
[t-SNE] KL divergence after 1000 iterations: 1.698831
[t-SNE] Computed conditional probabilities for sample 1000 / 2359
[t-SNE] Computed conditional probabilities for sample 2000 / 2359
[t-SNE] Computed conditional probabilities for sample 2359 / 2359
[t-SNE] Mean sigma: 0.255017
Beginning preprocessing + Leiden
Affmat shape: 2359
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  1.2min
Quality: 0.593916216812392
Quality: 0.5939603844100576
Quality: 0.5942641368176802
Quality: 0.5942881589214632
Quality: 0.5943249936000555
Quality: 0.5943407921382192
Quality: 0.5944230569119762
Quality: 0.5944436331565464
Quality: 0.5944940078482859
Quality: 0.5945992965933217
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:  1.4min finished
Got subclusters: Counter({0: 516, 1: 346, 2: 243, 3: 202, 4: 185, 5: 156, 6: 153, 7: 135, 8: 108, 9: 98, 10: 79, 11: 44, 12: 42, 13: 29, 14: 23})
On pattern 1
[t-SNE] Computing 150 nearest neighbors...
[t-SNE] Indexed 151 samples in 0.003s...
[t-SNE] Computed neighbors for 151 samples in 0.001s...
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 151 out of 151 | elapsed:    0.2s finished
[t-SNE] Computed conditional probabilities for sample 151 / 151
[t-SNE] Mean sigma: 0.327978
[t-SNE] Computed conditional probabilities in 0.017s
[t-SNE] Iteration 50: error = 57.8178177, gradient norm = 0.5157831 (50 iterations in 0.104s)
[t-SNE] Iteration 100: error = 59.7636948, gradient norm = 0.4653271 (50 iterations in 0.068s)
[t-SNE] Iteration 150: error = 61.2096214, gradient norm = 0.5335827 (50 iterations in 0.050s)
[t-SNE] Iteration 200: error = 60.1617165, gradient norm = 0.4369144 (50 iterations in 0.046s)
[t-SNE] Iteration 250: error = 59.5524483, gradient norm = 0.4978087 (50 iterations in 0.046s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.552448
[t-SNE] Iteration 300: error = 0.6144239, gradient norm = 0.0190552 (50 iterations in 0.044s)
[t-SNE] Iteration 350: error = 0.4592904, gradient norm = 0.0028417 (50 iterations in 0.055s)
[t-SNE] Iteration 400: error = 0.4586868, gradient norm = 0.0002356 (50 iterations in 0.062s)
[t-SNE] Iteration 450: error = 0.4589675, gradient norm = 0.0002710 (50 iterations in 0.078s)
[t-SNE] Iteration 500: error = 0.4582903, gradient norm = 0.0003778 (50 iterations in 0.067s)
[t-SNE] Iteration 550: error = 0.4588707, gradient norm = 0.0004006 (50 iterations in 0.045s)
[t-SNE] Iteration 600: error = 0.4588754, gradient norm = 0.0002700 (50 iterations in 0.067s)
[t-SNE] Iteration 650: error = 0.4589237, gradient norm = 0.0003093 (50 iterations in 0.060s)
[t-SNE] Iteration 700: error = 0.4588911, gradient norm = 0.0002174 (50 iterations in 0.068s)
[t-SNE] Iteration 750: error = 0.4587632, gradient norm = 0.0002209 (50 iterations in 0.045s)
[t-SNE] Iteration 800: error = 0.4589559, gradient norm = 0.0002950 (50 iterations in 0.049s)
[t-SNE] Iteration 850: error = 0.4589023, gradient norm = 0.0003308 (50 iterations in 0.046s)
[t-SNE] Iteration 850: did not make any progress during the last 300 episodes. Finished.
[t-SNE] KL divergence after 850 iterations: 0.458902
[t-SNE] Computed conditional probabilities for sample 151 / 151
[t-SNE] Mean sigma: 0.327978
Beginning preprocessing + Leiden
Affmat shape: 151
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.8s
Quality: 0.21302277706576736
Quality: 0.21333140717878132
Quality: 0.21354697115367247
Quality: 0.21442567605855503
Quality: 0.21445476478614436
Quality: 0.2152511090684427
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   23.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 130 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 142 out of 142 | elapsed:    0.1s finished
Got subclusters: Counter({0: 56, 1: 44, 2: 36, 3: 7, 4: 5, 5: 3})
On pattern 2
[t-SNE] Computing 141 nearest neighbors...
[t-SNE] Indexed 142 samples in 0.003s...
[t-SNE] Computed neighbors for 142 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 142 / 142
[t-SNE] Mean sigma: 0.358361
[t-SNE] Computed conditional probabilities in 0.016s
[t-SNE] Iteration 50: error = 57.1499901, gradient norm = 0.5057760 (50 iterations in 0.069s)
[t-SNE] Iteration 100: error = 57.2434502, gradient norm = 0.5153511 (50 iterations in 0.054s)
[t-SNE] Iteration 150: error = 58.9423599, gradient norm = 0.4918348 (50 iterations in 0.044s)
[t-SNE] Iteration 200: error = 55.5323639, gradient norm = 0.5658602 (50 iterations in 0.048s)
[t-SNE] Iteration 250: error = 59.2097473, gradient norm = 0.4536256 (50 iterations in 0.044s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 59.209747
[t-SNE] Iteration 300: error = 0.7588489, gradient norm = 0.0071932 (50 iterations in 0.042s)
[t-SNE] Iteration 350: error = 0.4420862, gradient norm = 0.0043462 (50 iterations in 0.043s)
[t-SNE] Iteration 400: error = 0.4378024, gradient norm = 0.0005014 (50 iterations in 0.042s)
[t-SNE] Iteration 450: error = 0.4370647, gradient norm = 0.0002695 (50 iterations in 0.045s)
[t-SNE] Iteration 500: error = 0.4373764, gradient norm = 0.0002361 (50 iterations in 0.043s)
[t-SNE] Iteration 550: error = 0.4372952, gradient norm = 0.0002538 (50 iterations in 0.043s)
[t-SNE] Iteration 600: error = 0.4373324, gradient norm = 0.0001269 (50 iterations in 0.043s)
[t-SNE] Iteration 650: error = 0.4371422, gradient norm = 0.0002580 (50 iterations in 0.042s)
[t-SNE] Iteration 700: error = 0.4372155, gradient norm = 0.0002221 (50 iterations in 0.045s)
[t-SNE] Iteration 750: error = 0.4372273, gradient norm = 0.0001312 (50 iterations in 0.043s)
[t-SNE] Iteration 800: error = 0.4372534, gradient norm = 0.0000900 (50 iterations in 0.043s)
[t-SNE] Iteration 800: did not make any progress during the last 300 episodes. Finished.
[t-SNE] KL divergence after 800 iterations: 0.437253
[t-SNE] Computed conditional probabilities for sample 142 / 142
[t-SNE] Mean sigma: 0.358361
Beginning preprocessing + Leiden
Affmat shape: 142
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.9s
Quality: 0.22980334815671763
Quality: 0.2304595819171252
Quality: 0.23063870580149065
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   23.4s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 131 out of 131 | elapsed:    0.1s finished
Got subclusters: Counter({0: 41, 1: 31, 2: 30, 3: 25, 4: 15})
On pattern 3
[t-SNE] Computing 130 nearest neighbors...
[t-SNE] Indexed 131 samples in 0.003s...
[t-SNE] Computed neighbors for 131 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 131 / 131
[t-SNE] Mean sigma: 0.339956
[t-SNE] Computed conditional probabilities in 0.015s
[t-SNE] Iteration 50: error = 59.6914902, gradient norm = 0.4334767 (50 iterations in 0.098s)
[t-SNE] Iteration 100: error = 57.3789978, gradient norm = 0.5039366 (50 iterations in 0.043s)
[t-SNE] Iteration 150: error = 59.4751244, gradient norm = 0.4715656 (50 iterations in 0.042s)
[t-SNE] Iteration 200: error = 57.3927231, gradient norm = 0.5458401 (50 iterations in 0.045s)
[t-SNE] Iteration 250: error = 57.9529572, gradient norm = 0.5015671 (50 iterations in 0.043s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 57.952957
[t-SNE] Iteration 300: error = 0.8441527, gradient norm = 0.0079005 (50 iterations in 0.043s)
[t-SNE] Iteration 350: error = 0.4544646, gradient norm = 0.0158588 (50 iterations in 0.041s)
[t-SNE] Iteration 400: error = 0.3813778, gradient norm = 0.0011356 (50 iterations in 0.041s)
[t-SNE] Iteration 450: error = 0.3813468, gradient norm = 0.0001121 (50 iterations in 0.043s)
[t-SNE] Iteration 500: error = 0.3812933, gradient norm = 0.0001957 (50 iterations in 0.041s)
[t-SNE] Iteration 550: error = 0.3813325, gradient norm = 0.0001813 (50 iterations in 0.044s)
[t-SNE] Iteration 600: error = 0.3813050, gradient norm = 0.0002073 (50 iterations in 0.042s)
[t-SNE] Iteration 650: error = 0.3813581, gradient norm = 0.0001228 (50 iterations in 0.041s)
[t-SNE] Iteration 700: error = 0.3813081, gradient norm = 0.0001233 (50 iterations in 0.044s)
[t-SNE] Iteration 750: error = 0.3813029, gradient norm = 0.0000810 (50 iterations in 0.043s)
[t-SNE] Iteration 800: error = 0.3813598, gradient norm = 0.0001689 (50 iterations in 0.045s)
[t-SNE] Iteration 850: error = 0.3813156, gradient norm = 0.0001620 (50 iterations in 0.042s)
[t-SNE] Iteration 850: did not make any progress during the last 300 episodes. Finished.
[t-SNE] KL divergence after 850 iterations: 0.381316
[t-SNE] Computed conditional probabilities for sample 131 / 131
[t-SNE] Mean sigma: 0.339956
Beginning preprocessing + Leiden
Affmat shape: 131
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.9s
Quality: 0.2209828758193568
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   23.5s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  98 out of  98 | elapsed:    0.1s finished
Got subclusters: Counter({0: 42, 1: 37, 2: 31, 3: 13, 4: 6, 5: 2})
On pattern 4
[t-SNE] Computing 97 nearest neighbors...
[t-SNE] Indexed 98 samples in 0.003s...
[t-SNE] Computed neighbors for 98 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 98 / 98
[t-SNE] Mean sigma: 0.527132
[t-SNE] Computed conditional probabilities in 0.011s
[t-SNE] Iteration 50: error = 54.2331505, gradient norm = 0.5383500 (50 iterations in 0.070s)
[t-SNE] Iteration 100: error = 49.6971245, gradient norm = 0.6598901 (50 iterations in 0.042s)
[t-SNE] Iteration 150: error = 53.5617981, gradient norm = 0.5056719 (50 iterations in 0.033s)
[t-SNE] Iteration 200: error = 52.1685867, gradient norm = 0.4623906 (50 iterations in 0.033s)
[t-SNE] Iteration 250: error = 53.4839706, gradient norm = 0.4512546 (50 iterations in 0.034s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 53.483971
[t-SNE] Iteration 300: error = 0.7108304, gradient norm = 0.0070011 (50 iterations in 0.033s)
[t-SNE] Iteration 350: error = 0.5511788, gradient norm = 0.0014457 (50 iterations in 0.037s)
[t-SNE] Iteration 400: error = 0.5981809, gradient norm = 0.0018380 (50 iterations in 0.032s)
[t-SNE] Iteration 450: error = 0.5140861, gradient norm = 0.0008846 (50 iterations in 0.032s)
[t-SNE] Iteration 500: error = 0.4778329, gradient norm = 0.0006913 (50 iterations in 0.032s)
[t-SNE] Iteration 550: error = 0.4437873, gradient norm = 0.0008264 (50 iterations in 0.032s)
[t-SNE] Iteration 600: error = 0.4239740, gradient norm = 0.0005000 (50 iterations in 0.033s)
[t-SNE] Iteration 650: error = 0.4043009, gradient norm = 0.0007683 (50 iterations in 0.034s)
[t-SNE] Iteration 700: error = 0.3815438, gradient norm = 0.0014440 (50 iterations in 0.035s)
[t-SNE] Iteration 750: error = 0.1364995, gradient norm = 0.0100479 (50 iterations in 0.033s)
[t-SNE] Iteration 800: error = 0.1322736, gradient norm = 0.0080954 (50 iterations in 0.034s)
[t-SNE] Iteration 850: error = 0.1313980, gradient norm = 0.0071973 (50 iterations in 0.033s)
[t-SNE] Iteration 900: error = 0.1294290, gradient norm = 0.0009048 (50 iterations in 0.032s)
[t-SNE] Iteration 950: error = 0.1287556, gradient norm = 0.0006338 (50 iterations in 0.033s)
[t-SNE] Iteration 1000: error = 0.1294987, gradient norm = 0.0005745 (50 iterations in 0.033s)
[t-SNE] KL divergence after 1000 iterations: 0.129499
[t-SNE] Computed conditional probabilities for sample 98 / 98
[t-SNE] Mean sigma: 0.527132
Beginning preprocessing + Leiden
Affmat shape: 98
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.2s
Quality: 0.30005753028671917
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.7s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  89 out of  89 | elapsed:    0.1s finished
Got subclusters: Counter({0: 41, 1: 39, 2: 18})
On pattern 5
[t-SNE] Computing 88 nearest neighbors...
[t-SNE] Indexed 89 samples in 0.003s...
[t-SNE] Computed neighbors for 89 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 89 / 89
[t-SNE] Mean sigma: 0.375664
[t-SNE] Computed conditional probabilities in 0.011s
[t-SNE] Iteration 50: error = 56.1102180, gradient norm = 0.5055261 (50 iterations in 0.101s)
[t-SNE] Iteration 100: error = 52.7426567, gradient norm = 0.5592338 (50 iterations in 0.031s)
[t-SNE] Iteration 150: error = 49.8281326, gradient norm = 0.4970038 (50 iterations in 0.031s)
[t-SNE] Iteration 200: error = 52.5715523, gradient norm = 0.4941798 (50 iterations in 0.031s)
[t-SNE] Iteration 250: error = 54.8102913, gradient norm = 0.5267977 (50 iterations in 0.031s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 54.810291
[t-SNE] Iteration 300: error = 0.9905930, gradient norm = 0.0030654 (50 iterations in 0.031s)
[t-SNE] Iteration 350: error = 0.8203988, gradient norm = 0.0014124 (50 iterations in 0.030s)
[t-SNE] Iteration 400: error = 0.7396128, gradient norm = 0.0015616 (50 iterations in 0.033s)
[t-SNE] Iteration 450: error = 0.7008389, gradient norm = 0.0003501 (50 iterations in 0.030s)
[t-SNE] Iteration 500: error = 0.6832703, gradient norm = 0.0006963 (50 iterations in 0.030s)
[t-SNE] Iteration 550: error = 0.6620727, gradient norm = 0.0006730 (50 iterations in 0.049s)
[t-SNE] Iteration 600: error = 0.6004928, gradient norm = 0.0012440 (50 iterations in 0.031s)
[t-SNE] Iteration 650: error = 0.5456126, gradient norm = 0.0079389 (50 iterations in 0.031s)
[t-SNE] Iteration 700: error = 0.2018622, gradient norm = 0.0119570 (50 iterations in 0.034s)
[t-SNE] Iteration 750: error = 0.1927510, gradient norm = 0.0014609 (50 iterations in 0.032s)
[t-SNE] Iteration 800: error = 0.1923574, gradient norm = 0.0005565 (50 iterations in 0.031s)
[t-SNE] Iteration 850: error = 0.1923090, gradient norm = 0.0006586 (50 iterations in 0.031s)
[t-SNE] Iteration 900: error = 0.1923369, gradient norm = 0.0005152 (50 iterations in 0.030s)
[t-SNE] Iteration 950: error = 0.1924663, gradient norm = 0.0004486 (50 iterations in 0.030s)
[t-SNE] Iteration 1000: error = 0.1921964, gradient norm = 0.0007483 (50 iterations in 0.031s)
[t-SNE] KL divergence after 1000 iterations: 0.192196
[t-SNE] Computed conditional probabilities for sample 89 / 89
[t-SNE] Mean sigma: 0.375664
Beginning preprocessing + Leiden
Affmat shape: 89
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.3s
Quality: 0.16936796981030405
Quality: 0.1733491273705442
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.9s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  55 out of  55 | elapsed:    0.1s finished
Got subclusters: Counter({0: 38, 1: 25, 2: 20, 3: 6})
On pattern 6
[t-SNE] Computing 54 nearest neighbors...
[t-SNE] Indexed 55 samples in 0.002s...
[t-SNE] Computed neighbors for 55 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 55 / 55
[t-SNE] Mean sigma: 0.759145
[t-SNE] Computed conditional probabilities in 0.006s
[t-SNE] Iteration 50: error = 44.3685226, gradient norm = 0.4560668 (50 iterations in 0.069s)
[t-SNE] Iteration 100: error = 45.1286850, gradient norm = 0.5612732 (50 iterations in 0.027s)
[t-SNE] Iteration 150: error = 45.1815414, gradient norm = 0.5947992 (50 iterations in 0.025s)
[t-SNE] Iteration 200: error = 42.6964035, gradient norm = 0.5323554 (50 iterations in 0.025s)
[t-SNE] Iteration 250: error = 46.7280579, gradient norm = 0.5474741 (50 iterations in 0.024s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 46.728058
[t-SNE] Iteration 300: error = 0.6782527, gradient norm = 0.0011823 (50 iterations in 0.024s)
[t-SNE] Iteration 350: error = 0.6140836, gradient norm = 0.0004746 (50 iterations in 0.024s)
[t-SNE] Iteration 400: error = 0.5974553, gradient norm = 0.0003736 (50 iterations in 0.024s)
[t-SNE] Iteration 450: error = 0.5786180, gradient norm = 0.0006919 (50 iterations in 0.025s)
[t-SNE] Iteration 500: error = 0.5522077, gradient norm = 0.0003388 (50 iterations in 0.027s)
[t-SNE] Iteration 550: error = 0.5466132, gradient norm = 0.0001202 (50 iterations in 0.026s)
[t-SNE] Iteration 600: error = 0.5433961, gradient norm = 0.0001957 (50 iterations in 0.024s)
[t-SNE] Iteration 650: error = 0.5386685, gradient norm = 0.0001956 (50 iterations in 0.025s)
[t-SNE] Iteration 700: error = 0.5341217, gradient norm = 0.0001727 (50 iterations in 0.024s)
[t-SNE] Iteration 750: error = 0.5289829, gradient norm = 0.0004864 (50 iterations in 0.024s)
[t-SNE] Iteration 800: error = 0.8437047, gradient norm = 0.0087163 (50 iterations in 0.024s)
[t-SNE] Iteration 850: error = 0.7177541, gradient norm = 0.0006938 (50 iterations in 0.025s)
[t-SNE] Iteration 900: error = 0.6286021, gradient norm = 0.0003456 (50 iterations in 0.024s)
[t-SNE] Iteration 950: error = 0.6064545, gradient norm = 0.0001794 (50 iterations in 0.026s)
[t-SNE] Iteration 1000: error = 0.5934703, gradient norm = 0.0001807 (50 iterations in 0.026s)
[t-SNE] KL divergence after 1000 iterations: 0.593470
[t-SNE] Computed conditional probabilities for sample 55 / 55
[t-SNE] Mean sigma: 0.759145
Beginning preprocessing + Leiden
Affmat shape: 55
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   19.1s
Quality: 0.0361537661129854
Quality: 0.036370288776872435
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.6s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  36 out of  43 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  43 out of  43 | elapsed:    0.0s finished
Got subclusters: Counter({0: 27, 1: 19, 2: 6, 3: 3})
On pattern 7
[t-SNE] Computing 42 nearest neighbors...
[t-SNE] Indexed 43 samples in 0.003s...
[t-SNE] Computed neighbors for 43 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 43 / 43
[t-SNE] Mean sigma: 1125899906842624.000000
[t-SNE] Computed conditional probabilities in 0.013s
[t-SNE] Iteration 50: error = 45.1561394, gradient norm = 0.4550527 (50 iterations in 0.103s)
[t-SNE] Iteration 100: error = 49.0812607, gradient norm = 0.4520793 (50 iterations in 0.023s)
[t-SNE] Iteration 150: error = 45.4180183, gradient norm = 0.5224083 (50 iterations in 0.023s)
[t-SNE] Iteration 200: error = 48.9434662, gradient norm = 0.5377184 (50 iterations in 0.023s)
[t-SNE] Iteration 250: error = 45.3842468, gradient norm = 0.8476522 (50 iterations in 0.023s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 45.384247
[t-SNE] Iteration 300: error = 0.6132308, gradient norm = 0.0008143 (50 iterations in 0.030s)
[t-SNE] Iteration 350: error = 0.5861698, gradient norm = 0.0001547 (50 iterations in 0.037s)
[t-SNE] Iteration 400: error = 0.5822757, gradient norm = 0.0000381 (50 iterations in 0.033s)
[t-SNE] Iteration 450: error = 0.5818886, gradient norm = 0.0000243 (50 iterations in 0.030s)
[t-SNE] Iteration 500: error = 0.5821879, gradient norm = 0.0000205 (50 iterations in 0.038s)
[t-SNE] Iteration 550: error = 0.5813316, gradient norm = 0.0000383 (50 iterations in 0.038s)
[t-SNE] Iteration 600: error = 0.5812856, gradient norm = 0.0000217 (50 iterations in 0.043s)
[t-SNE] Iteration 650: error = 0.5810173, gradient norm = 0.0000169 (50 iterations in 0.037s)
[t-SNE] Iteration 700: error = 0.5808842, gradient norm = 0.0000173 (50 iterations in 0.037s)
[t-SNE] Iteration 750: error = 0.5808838, gradient norm = 0.0000178 (50 iterations in 0.055s)
[t-SNE] Iteration 800: error = 0.5808979, gradient norm = 0.0000177 (50 iterations in 0.038s)
[t-SNE] Iteration 850: error = 0.5807926, gradient norm = 0.0000184 (50 iterations in 0.058s)
[t-SNE] Iteration 900: error = 0.5805256, gradient norm = 0.0000243 (50 iterations in 0.038s)
[t-SNE] Iteration 950: error = 0.5800563, gradient norm = 0.0001154 (50 iterations in 0.038s)
[t-SNE] Iteration 1000: error = 0.5796041, gradient norm = 0.0000575 (50 iterations in 0.049s)
[t-SNE] KL divergence after 1000 iterations: 0.579604
[t-SNE] Computed conditional probabilities for sample 43 / 43
[t-SNE] Mean sigma: 1125899906842624.000000
Beginning preprocessing + Leiden
Affmat shape: 43
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   18.7s
Quality: -1.9084733793306774e-13
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:   22.2s finished
Got subclusters: Counter({0: 43})
In [8]:
modisco_hdf = '/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_out/results.hdf5'
grp = h5py.File(modisco_hdf, "w")  # pass an explicit mode; the bare default triggers an h5py deprecation warning
tfmodisco_results.save_hdf5(grp)
grp.close()
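For downstream use, a minimal sketch of reading the saved patterns back, assuming the TF-MoDISco v0.5-style HDF5 layout (group names may differ across modisco versions):

import h5py

with h5py.File(modisco_hdf, "r") as fp:
    metaclusters = fp["metacluster_idx_to_submetacluster_results"]
    for mc_name in metaclusters:
        patterns = metaclusters[mc_name]["seqlets_to_patterns_result"]["patterns"]
        for raw_name in patterns["all_pattern_names"][:]:
            p_name = raw_name.decode("utf-8")
            # each pattern group records the seqlets assigned to it
            n_seqlets = len(patterns[p_name]["seqlets_and_alnmts"]["seqlets"])
            print(mc_name, p_name, "# seqlets:", n_seqlets)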
In [ ]:
print('..')
In [5]:
from matlas.matches import DenovoModisco, DenovoHomer
from vdom.helpers import (b, summary, details)
from IPython.display import display
import numpy as np


def display_denovo_patterns(sample_name, modiscodir, match_threshold=0.05, prep=False):
    """Render a collapsible table of de novo modisco patterns for one sample.

    With prep=True, run tomtom against CIS-BP 2.00 and cache the match report;
    with prep=False, load the cached matches and display the pattern table.
    """
    display(summary(b(sample_name)))

    ob = DenovoModisco(modiscodir)
    if prep:
        ob.fetch_tomtom_matches(save_report=True,
                                tomtom_dir="{0}/{1}_tomtomout".format(modiscodir, "CISBP_2.00"))
    else:
        ob.load_matched_motifs()
        ob.get_motif_per_celltype(match_threshold=match_threshold)
        pattern_tab, pattern_dict = ob.visualize_pattern_table()
        display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('MoDISco'),
                                ' in ', b(sample_name),
                                ": #{}".format(len(pattern_dict)),
                                ), pattern_tab))
    # ob.display_individual_table()

    return None


def show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=0.01, prep=False):
    """Same as display_denovo_patterns, but match patterns against the HOCOMOCO v11 database."""
    ob = DenovoModisco(modiscodir)
    if prep:
        ob.fetch_tomtom_matches(
            meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
            database_name="HOCOMOCO.nonredundant.annotated",
            save_report=True,
            tomtom_dir="{0}/{1}_tomtomout".format(modiscodir, "HOCOMOCO.nonredundant.annotated"))
    else:
        ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
        ob.get_motif_per_celltype(match_threshold=match_threshold, match_criteria='q-value',
                                  database_name="HOCOMOCO.nonredundant.annotated")
        # ob.display_individual_table()

        pattern_tab, pattern_dict = ob.visualize_pattern_table()
        display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('MoDISco'),
                                ' in ', b(sample_name),
                                ": #{}".format(len(pattern_dict)),
                                ), pattern_tab))
    return None
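These helpers are meant to be run in two passes, sketched below (illustrative calls; the directory is a placeholder):

# Pass 1 runs tomtom and caches the match report; pass 2 renders the cached table.
# show_patterns_using_hoccomocco_db('early_fold0', '/path/to/modisco_dir', prep=True)
# show_patterns_using_hoccomocco_db('early_fold0', '/path/to/modisco_dir', prep=False)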

Early timepoint

With all peaks

using central 160bp

In [ ]:
# sample_name = 'early_fold0'

# display_denovo_patterns(
#     sample_name,
#     modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out_1k"
# )
In [4]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #8
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (2359)    HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (151)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_2 (142)     -
metacluster_0/pattern_3 (131)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 (98)      HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_5 (89)      -
metacluster_0/pattern_6 (55)      -
metacluster_0/pattern_7 (43)      HCLUST-149_CTCFL.UNK.0.A
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)

using central 1kb

In [7]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v10_out_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=1e-15)

With time-point based peaks

using central 160bp

In [28]:
# sample_name = 'early_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [14]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_0"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #7
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (1413)    HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (237)     -
metacluster_0/pattern_2 (202)     HCLUST-124_FOSB.UNK.0.A, HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_3 (130)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 (142)     -
metacluster_0/pattern_5 (70)      -
metacluster_0/pattern_6 (55)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)
In [15]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_7"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #4
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (586)     HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (153)     -
metacluster_0/pattern_2 (54)      -
metacluster_0/pattern_3 (57)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)
In [16]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_8"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #7
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (1596)    HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (238)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_2 (148)     HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_3 (61)      HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_4 (58)      HCLUST-149_CTCFL.UNK.0.A
metacluster_0/pattern_5 (54)      -
metacluster_0/pattern_6 (49)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)
In [17]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_9"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #5
Pattern Name (# seqlets)          TF Name(s)
metacluster_0/pattern_0 (412)     HCLUST-101_NFE2.UNK.0.A, HCLUST-124_FOSB.UNK.0.A, HCLUST-179_BACH1.UNK.0.A
metacluster_0/pattern_1 (79)      HCLUST-179_BACH1.UNK.0.A, HCLUST-101_NFE2.UNK.0.A
metacluster_0/pattern_2 (48)      -
metacluster_0/pattern_3 (45)      HCLUST-156_TEAD1.UNK.0.A
metacluster_0/pattern_4 (40)      -
(Modisco column with Sequence / Contrib Scores / Hyp_Contrib Scores logos not shown)

using central 1kb

In [8]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_0_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [19]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_7_1k"

show_patterns_using_hoccomocco_db(sample_name, modiscodir, match_threshold=0)
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
In [10]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_8_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [11]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_9_1k"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)
In [8]:
# generating modisco.meme
from matlas.matches import DenovoModisco
task_dir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_v14_out"
ob = DenovoModisco(task_dir)

ob.write_meme_file(task_dir+"/modisco.meme")
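Outside the notebook, the exported modisco.meme can be matched against a reference database with MEME-suite's tomtom; a minimal sketch via subprocess (the target database path is a placeholder):

import subprocess

# Illustrative: compare de novo motifs (query) against a motif database (target).
subprocess.run([
    "tomtom", "-oc", task_dir + "/tomtom_out",      # write results to this directory
    task_dir + "/modisco.meme",                     # query motifs
    "/path/to/HOCOMOCOv11_core_HUMAN_mono.meme",    # target database (placeholder)
], check=True)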

Late timepoint

In [37]:
# sample_name = 'late_fold0'

# display_denovo_patterns(
#     sample_name,
#     modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
# )
In [38]:
# sample_name = 'late_fold0'
# modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"

# show_patterns_using_hoccomocco_db(sample_name, modiscodir)

Loading keras model

In [2]:
from matlas.model_test import getSkinModel
from matlas.model_test import setup_keras_session
setup_keras_session('4')
init_weights = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/weights_from_raw_tf.p"
model = getSkinModel(init_weights, 19, classification=False)
model_h5 = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/model.h5"
model.save(model_h5)
model.summary()
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:523: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:524: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/tensorflow/python/framework/dtypes.py:532: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.
channels_last
compiling!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           (None, 1000, 4)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 1000, 300)         23100     
_________________________________________________________________
batch_normalization_1 (Batch (None, 1000, 300)         1200      
_________________________________________________________________
activation_1 (Activation)    (None, 1000, 300)         0         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 333, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 333, 200)          660200    
_________________________________________________________________
batch_normalization_2 (Batch (None, 333, 200)          800       
_________________________________________________________________
activation_2 (Activation)    (None, 333, 200)          0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 83, 200)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 83, 200)           280200    
_________________________________________________________________
batch_normalization_3 (Batch (None, 83, 200)           800       
_________________________________________________________________
activation_3 (Activation)    (None, 83, 200)           0         
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 20, 200)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              4001000   
_________________________________________________________________
batch_normalization_4 (Batch (None, 1000)              4000      
_________________________________________________________________
activation_4 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
batch_normalization_5 (Batch (None, 1000)              4000      
_________________________________________________________________
activation_5 (Activation)    (None, 1000)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000)              0         
_________________________________________________________________
final_dense19 (Dense)        (None, 19)                19019     
=================================================================
Total params: 5,995,319
Trainable params: 5,989,919
Non-trainable params: 5,400
_________________________________________________________________
In [3]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    labels = fp['labels'][:]
    logits = fp['logits'][:]
    seqs = fp['sequence'][:]
labels.shape, logits.shape, seqs.shape
Out[3]:
((35024, 19), (35024, 19), (35024, 1, 1000, 4))
In [23]:
from matlas.generators import EmbeddingsGenerator

def get_predictions(cur_seqs, model):
    # Wrap the one-hot sequences in a batch generator and run keras prediction.
    e_generator = EmbeddingsGenerator(cur_seqs, batch_size=1000, num_rows=cur_seqs.shape[0])
    # batch = e_generator.get_batch(i)
    # e = model.predict_on_batch(batch[0])
    e = model.predict_generator(
        e_generator,
        max_queue_size=100,
        workers=1,
        use_multiprocessing=False,
        verbose=1,
    )
    return e
keras_op = get_predictions(np.squeeze(seqs[:1000]), model)
1/1 [==============================] - 1s 664ms/step
In [24]:
from matplotlib import pylab as plt
# plt.scatter(activations_all['activation_2/Relu:0'][:1000,0,0,0], cnv1[:1000,0,0,0])
plt.scatter(logits[:1000,0], keras_op[:1000,0])
plt.xlabel('raw tensorflow predictions')
plt.ylabel('keras predictions')
Out[24]:
Text(0, 0.5, 'keras predictions')
In [25]:
import scipy.stats
print(scipy.stats.pearsonr(logits[:1000,0], keras_op[:1000,0]))
print(scipy.stats.spearmanr(logits[:1000,0], keras_op[:1000,0]))
(0.7495659591048396, 4.981212730424334e-181)
SpearmanrResult(correlation=0.7251773091773093, pvalue=6.437351695707496e-164)

Deeplifting the keras model

In [4]:
from matlas.deeplift_run import *
contrib_funcs, input_layer_shape = retrieve_func_from_model(
    model_h5, 
    algorithm="rescale_conv_revealcancel_fc", 
    regression=True,
    sequential=False, 
    w0=None, w1=None, logger=None)
input_layer_shape
load data from labcluster
TF-MoDISco is using the TensorFlow backend.
nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer activation_1_0 the preceding linear layer is conv1d_1_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_2_0 the preceding linear layer is conv1d_2_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_3_0 the preceding linear layer is conv1d_3_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_4_0 the preceding linear layer is dense_1_0 of type Dense;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
For layer activation_5_0 the preceding linear layer is dense_2_0 of type Dense;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
Out[4]:
[None, 1000, 4]
In [9]:
#provide list of strings to run deeplift
# def read_ggr_active_sequences(ggr_h5):
#     with h5py.File(ggr_h5, "r") as fp:
#         seqs = fp['sequence.active.string'][:]
#     sequences = []
#     for seq in seqs:
#         sequences.append(seq[0].decode('utf-8'))
    
#     return sequences

# ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
# sequences = read_ggr_active_sequences(ggrfile)
# type(sequences), len(sequences) #sequences[0], sequences[1], seqs[0,0], seqs[1,0]
Out[9]:
(list, 35024)
In [3]:
def get_genome_coordinates(ggr_h5, bed_file):
    """Parse chrom/start/end out of example_metadata and write a gzipped BED."""
    with h5py.File(ggr_h5, "r") as fp:
        regions = fp['example_metadata'][:]
    
    chroms = []
    starts = []
    ends = []
    for region in regions[:,0]:
        region = region.decode("utf-8")
        if region != '':
            # metadata strings look like "...features=chrN:start-end"
            region = region.split("features=")[1]
        else:
            continue  # note: rows with empty metadata are silently dropped
        chroms.append(region.split(":")[0])
        starts.append(region.split(":")[1].split("-")[0])
        ends.append(region.split(":")[1].split("-")[1])
    df = pd.DataFrame({'chrom': chroms, 'start': starts, 'end': ends})
    df.to_csv(bed_file, header=False, index=False, sep="\t", compression="gzip")
    return None
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
bed_file = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/regions.bed.gz"
get_genome_coordinates(ggrfile, bed_file)
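A quick read-back to sanity-check the BED that was just written (plain pandas, no new dependencies):

df = pd.read_csv(bed_file, sep="\t", header=None,
                 names=["chrom", "start", "end"], compression="gzip")
df.shape, df.head()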
In [27]:
from matlas.model_layer import retrieve_sequences
sequences, intervals_wo_flanks = retrieve_sequences(
    bed_file, 
    fasta_file="/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa", flank_size=0)
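retrieve_sequences is also a matlas helper; with flank_size=0 it presumably just fetches each BED interval from the FASTA. A hypothetical sketch of the equivalent fetch using pyfaidx (an assumption, not the matlas code):

from pyfaidx import Fasta

genome = Fasta("/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa")
def fetch_interval(chrom, start, end):
    # pyfaidx slicing is 0-based, half-open
    return str(genome[chrom][int(start):int(end)]).upper()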
In [28]:
num_refs_per_seq = 10
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.model_layer import one_hot_encode_along_col_axis
from matlas.dlutils import get_shuffled_seqs
input_data_list, input_references_list = get_shuffled_seqs(
    sequences[:45], num_refs_per_seq,
    shuffle_func=dinuc_shuffle,
    one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
    progress_update=10000)
input_data_list[0].shape, len(sequences[0])
# input_data_list = [np.expand_dims(input_data_list[0], axis=1)]
# input_references_list = [np.expand_dims(input_references_list[0], axis=1)]
One hot encoding sequences...
One hot encoding done...
Out[28]:
((450, 1000, 4), 1000)
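get_shuffled_seqs is not shown here, but the shapes above (45 sequences x 10 refs = 450 rows) suggest it repeats each sequence num_refs_per_seq times and pairs it with that many dinucleotide shuffles, roughly like this hypothetical sketch:

def make_refs(seq_strings, num_refs, one_hot_func):
    data, refs = [], []
    for s in seq_strings:
        data.extend([s] * num_refs)                              # repeat the real sequence
        refs.extend(dinuc_shuffle(s) for _ in range(num_refs))   # one shuffle per repeat
    return [one_hot_func(data)], [one_hot_func(refs)]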
In [11]:
from matlas.dlutils import get_given_seq_ref_function
shuffled_score_funcs = {input_name: get_given_seq_ref_function(score_computation_function=score_func)
                        for input_name, score_func in contrib_funcs.items()}
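get_given_seq_ref_function is a matlas variant of deeplift's shuffled-reference helpers that takes precomputed references. The essential mechanics, scoring each (sequence, reference) pair and averaging over the references, presumably look something like this hedged sketch:

def score_with_given_refs(score_func, data, refs, num_refs_per_seq,
                          task_idx=0, batch_size=256):
    raw = np.array(score_func(task_idx=task_idx,
                              input_data_list=[data],
                              input_references_list=[refs],
                              batch_size=batch_size,
                              progress_update=None))
    # average the per-reference contributions for each sequence
    return raw.reshape((-1, num_refs_per_seq) + raw.shape[1:]).mean(axis=1)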
In [29]:
task_idx = 0
batch_size = 256
num_refs_per_seq = 10
for input_name, score_func in shuffled_score_funcs.items():
    b = 10000  # chunk size over the expanded (seq x ref) rows
    n = len(input_data_list[0])
    chunks = []
    for si in range(int(np.ceil(1.0 * n / b))):
        # slicing past the end clamps, so the last chunk needs no special case
        chunks.append(score_func(task_idx=int(task_idx),
                                 input_data_list=[input_data_list[0][si*b:(si+1)*b]],
                                 input_references_list=[input_references_list[0][si*b:(si+1)*b]],
                                 num_refs_per_seq=num_refs_per_seq,
                                 batch_size=batch_size,
                                 progress_update=10000))
    hyp_scores = np.vstack(chunks)
    input_data_list[0] = np.squeeze(input_data_list[0])
    input_references_list[0] = np.squeeze(input_references_list[0])
    # every num_refs_per_seq-th row of the expanded data is an original (unshuffled) sequence
    one_hot = input_data_list[0][::num_refs_per_seq]
    shuffled_onehot = input_references_list[0].reshape((one_hot.shape[0], num_refs_per_seq,
                                                        input_references_list[0].shape[-2],   # seq_len
                                                        input_references_list[0].shape[-1]))  # alphabet
    # project hypothetical contributions onto the observed bases
    scores = np.multiply(hyp_scores, one_hot)
       
hyp_scores.shape, one_hot.shape, scores.shape
Done 0
In [ ]:
# create_deeplift_h5(bed_file, score_hdf, hyp_scores, one_hot, shuffled_onehot)
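create_deeplift_h5 (commented above) is not shown; a minimal h5py sketch that persists the three arrays under the dataset names the load_deeplift_data helper at the top of this notebook reads back (score_hdf is an assumed output path):

# minimal sketch, not the matlas create_deeplift_h5; score_hdf is assumed
with h5py.File(score_hdf, "w") as fp:
    fp.create_dataset("deeplift_scores", data=hyp_scores)
    fp.create_dataset("inputs", data=one_hot)
    fp.create_dataset("shuffled_inputs", data=shuffled_onehot)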
In [8]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    print(list(fp))
    scores_ggr = fp['sequence-weighted'][:]
    scores_ggr_active = fp['sequence-weighted.active'][:]
scores_ggr.shape, scores_ggr_active.shape
['ATAC_LABELS', 'ATAC_SIGNALS', 'ATAC_SIGNALS.NORM', 'CTCF_LABELS', 'CTCF_SIGNALS', 'CTCF_SIGNALS.NORM', 'DYNAMIC_MARK_LABELS', 'DYNAMIC_STATE_LABELS', 'H3K27ac_LABELS', 'H3K27ac_SIGNALS', 'H3K27ac_SIGNALS.NORM', 'H3K27me3_LABELS', 'H3K27me3_SIGNALS', 'H3K27me3_SIGNALS.NORM', 'H3K4me1_LABELS', 'H3K4me1_SIGNALS', 'H3K4me1_SIGNALS.NORM', 'KLF4_LABELS', 'POL2_LABELS', 'STABLE_MARK_LABELS', 'STABLE_STATE_LABELS', 'TP63_LABELS', 'TRAJ_LABELS', 'ZNF750_LABELS', 'example_metadata', 'gradients', 'labels', 'logits', 'logits.ci', 'logits.ci.thresh', 'logits.multimodel', 'logits.multimodel.norm', 'logits.norm', 'positive_importance_bp_sum', 'probs', 'pwm-scores.null.idx', 'sequence', 'sequence-weighted', 'sequence-weighted.active', 'sequence-weighted.active.ci', 'sequence-weighted.active.ci.thresh', 'sequence-weighted.active.pwm-scores.thresh', 'sequence-weighted.active.pwm-scores.thresh.max.idx', 'sequence-weighted.active.pwm-scores.thresh.max.val', 'sequence-weighted.active.pwm-scores.thresh.sum', 'sequence-weighted.thresholds', 'sequence.active', 'sequence.active.gc_fract', 'sequence.active.pwm-hits', 'sequence.active.pwm-hits.densities', 'sequence.active.pwm-hits.densities.max', 'sequence.active.pwm-scores.thresh', 'sequence.active.pwm-scores.thresh.sum', 'sequence.active.string']
Out[8]:
((35024, 10, 1000, 4), (35024, 10, 160, 4))
In [10]:
import modisco.visualization
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr_active[0,0], subticks_frequency=20)
-0.4860307276248932 1.3380632400512695
-0.0440836176276207 0.1500825583934784
In [5]:
scores_ggr.shape
Out[5]:
(35024, 10, 1000, 4)
In [36]:
import modisco.visualization
from modisco.visualization import viz_sequence

viz_sequence.plot_weights(scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(hyp_scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(one_hot[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
-0.0688343504909426 0.35437235310673715
-0.976342553505674 0.35437235310673715
0.0 1.0
In [ ]: