In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np 
import glob
import os
from collections import OrderedDict
import pickle
import h5py
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Generating modisco motifs

In [7]:
def load_deeplift_data(deeplift_hdf, keys=['scores', 'one_hot']):
    fp = h5py.File(deeplift_hdf, "r")
    hyp_scores = fp['deeplift_scores'][:]
    one_hot = fp['inputs'][:]
    shuffled_onehot = fp['shuffled_inputs'][:]

    deeplift_data = OrderedDict()
    if 'one_hot' in keys:
        deeplift_data['one_hot'] = one_hot
    if 'peaks' in keys :
        df = OrderedDict()
        for key in list(fp['metadata/range']):
            df[key] = fp['metadata/range/{}'.format(key)][:]
        df = pd.DataFrame(df)
        df.chr = np.array([df.chr.values[i].decode('utf-8') for i in range(df.shape[0])])
        deeplift_data['peaks'] = df
        
    if 'hyp_scores' in keys :
        deeplift_data['hyp_scores'] = hyp_scores
    if 'scores' in keys :
        deeplift_data['scores'] = np.multiply(hyp_scores, one_hot)#no need to sum before mult, taken care of in deeplift, not in deepshap though
    
    
    return deeplift_data

deeplift_hdf = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/deeplift_out/summit.h5"
deeplift_data = load_deeplift_data(deeplift_hdf, keys=['hyp_scores', 'scores', 'one_hot'])
deeplift_data['scores'].shape, deeplift_data['hyp_scores'].shape, deeplift_data['one_hot'].shape
Out[7]:
((35024, 1000, 4), (35024, 1000, 4), (35024, 1000, 4))
In [8]:
import modisco
null_per_pos_scores = modisco.coordproducers.LaplaceNullDist(num_to_samp=5000)
In [9]:
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
                    #Slight modifications from the default settings
                    sliding_window_size=15,
                    flank_size=5,
                    target_seqlet_fdr=0.15,
                    seqlets_to_patterns_factory=
                     modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
                        #Note: as of version 0.5.6.0, it's possible to use the results of a motif discovery
                        # software like MEME to improve the TF-MoDISco clustering. To use the meme-based
                        # initialization, you would specify the initclusterer_factory as shown in the
                        # commented-out code below:
                        #initclusterer_factory=modisco.clusterinit.memeinit.MemeInitClustererFactory(    
                        #    meme_command="meme", base_outdir="meme_out",            
                        #    max_num_seqlets_to_use=10000, nmotifs=10, n_jobs=1),
                        trim_to_window_size=15,
                        initial_flank_to_add=5,
                        kmer_len=5, num_gaps=1,
                        num_mismatches=0,
                        final_min_cluster_size=60)
                )(
                 task_names=["early0"],
                 contrib_scores={'early0': deeplift_data['scores'][:,420:580,:]},
                 hypothetical_contribs={'early0': deeplift_data['hyp_scores'][:,420:580,:] },
                 one_hot=deeplift_data['one_hot'][:,420:580,:],
                 null_per_pos_scores = null_per_pos_scores)
MEMORY 4.630016
On task early0
Computing windowed sums on original
Generating null dist
peak(mu)= 0.0034060630339271536
Computing threshold
Thresholds from null dist were -0.5258131270413289  and  1.1217074577463788
Final raw thresholds are -0.5258131270413289  and  1.1217074577463788
Final transformed thresholds are -0.8864842972646545  and  0.9668417194941081
Got 15833 coords
After resolving overlaps, got 15833 seqlets
Across all tasks, the weakest transformed threshold used was: 0.8863842972646545
MEMORY 4.9975296
15833 identified in total
min_metacluster_size_frac * len(seqlets) = 158 is more than min_metacluster_size=100.
Using it as a new min_metacluster_size
2 activity patterns with support >= 158 out of 2 possible patterns
Metacluster sizes:  [15458, 375]
Idx to activities:  {0: '1', 1: '-1'}
MEMORY 4.997758976
On metacluster 1
Metacluster size 375
Relevant tasks:  ('early0',)
Relevant signs:  (-1,)
TfModiscoSeqletsToPatternsFactory: seed=1234
(Round 1) num seqlets: 375
(Round 1) Computing coarse affmat
MEMORY 4.997758976
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.14 s
Starting affinity matrix computations
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.47 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.01 s
Finished affinity matrix computations in 0.48 s
(Round 1) Compute nearest neighbors from coarse affmat
MEMORY 5.001498624
Computed nearest neighbors in 0.06 s
MEMORY 5.00400128
(Round 1) Computing affinity matrix on nearest neighbors
MEMORY 5.00400128
Launching nearest neighbors affmat calculation job
MEMORY 5.004263424
Parallel runs completed
MEMORY 5.011603456
Job completed in: 2.6 s
MEMORY 5.011603456
Launching nearest neighbors affmat calculation job
MEMORY 5.011603456
Parallel runs completed
MEMORY 5.014093824
Job completed in: 2.52 s
MEMORY 5.014122496
(Round 1) Computed affinity matrix on nearest neighbors in 5.27 s
MEMORY 5.014122496
Filtered down to 373 of 375
(Round 1) Retained 373 rows out of 375 after filtering
MEMORY 5.014122496
(Round 1) Computing density adapted affmat
MEMORY 5.014122496
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 373 samples in 0.000s...
[t-SNE] Computed neighbors for 373 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 373 / 373
[t-SNE] Mean sigma: 0.253450
(Round 1) Computing clustering
MEMORY 5.014122496
Beginning preprocessing + Leiden
  0%|          | 0/50 [00:00<?, ?it/s]
Quality: 0.5750669252729659
Quality: 0.5751015221292706
100%|██████████| 50/50 [00:01<00:00, 33.92it/s]
Got 15 clusters after round 1
Counts:
{6: 20, 4: 38, 8: 14, 12: 4, 9: 7, 1: 59, 2: 57, 7: 17, 13: 3, 10: 6, 5: 25, 0: 64, 3: 52, 14: 2, 11: 5}
MEMORY 5.01508096
(Round 1) Aggregating seqlets in each cluster
MEMORY 5.01508096
Aggregating for cluster 0 with 64 seqlets
MEMORY 5.01508096

Trimming eliminated 0 seqlets out of 64
Skipped 2 seqlets
Aggregating for cluster 1 with 59 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 59
Skipped 1 seqlets
Aggregating for cluster 2 with 57 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 57
Skipped 8 seqlets
Aggregating for cluster 3 with 52 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 52
Skipped 5 seqlets
Aggregating for cluster 4 with 38 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 38
Skipped 1 seqlets
Aggregating for cluster 5 with 25 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 25
Skipped 1 seqlets
Aggregating for cluster 6 with 20 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 20
Skipped 2 seqlets
Dropping cluster 6 with 18 seqlets due to sign disagreement
Aggregating for cluster 7 with 17 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 17
Skipped 3 seqlets
Aggregating for cluster 8 with 14 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 14
Aggregating for cluster 9 with 7 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 7
Aggregating for cluster 10 with 6 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 6
Aggregating for cluster 11 with 5 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 5
Skipped 1 seqlets
Aggregating for cluster 12 with 4 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 4
Skipped 1 seqlets
Aggregating for cluster 13 with 3 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 3
Dropping cluster 13 with 3 seqlets due to sign disagreement
Aggregating for cluster 14 with 2 seqlets
MEMORY 5.01508096
Trimming eliminated 0 seqlets out of 2
Dropping cluster 14 with 2 seqlets due to sign disagreement
(Round 2) num seqlets: 325
(Round 2) Computing coarse affmat
MEMORY 5.01508096
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.08 s
Starting affinity matrix computations
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.01 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.01 s
Finished affinity matrix computations in 0.02 s
(Round 2) Compute nearest neighbors from coarse affmat
MEMORY 5.015146496
Computed nearest neighbors in 0.06 s
MEMORY 5.015998464
(Round 2) Computing affinity matrix on nearest neighbors
MEMORY 5.015998464
Launching nearest neighbors affmat calculation job
MEMORY 5.015998464
Parallel runs completed
MEMORY 5.015998464
Job completed in: 2.07 s
MEMORY 5.015998464
Launching nearest neighbors affmat calculation job
MEMORY 5.015998464
Parallel runs completed
MEMORY 5.015998464
Job completed in: 1.97 s
MEMORY 5.015998464
(Round 2) Computed affinity matrix on nearest neighbors in 4.22 s
MEMORY 5.015998464
Not applying filtering for rounds above first round
MEMORY 5.015998464
(Round 2) Computing density adapted affmat
MEMORY 5.015998464
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 325 samples in 0.000s...
[t-SNE] Computed neighbors for 325 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 325 / 325
[t-SNE] Mean sigma: 0.244780
(Round 2) Computing clustering
MEMORY 5.015998464
Beginning preprocessing + Leiden
  0%|          | 0/50 [00:00<?, ?it/s]
Quality: 0.5636795151701894
Quality: 0.5644660820642647
 10%|█         | 5/50 [00:00<00:01, 42.74it/s]
Quality: 0.5644715077520571
Quality: 0.5649256015655255
 60%|██████    | 30/50 [00:00<00:00, 41.16it/s]
Quality: 0.5650268374945023
100%|██████████| 50/50 [00:01<00:00, 42.25it/s]
Got 12 clusters after round 2
Counts:
{0: 70, 1: 59, 8: 8, 2: 49, 5: 30, 3: 34, 4: 32, 10: 4, 7: 10, 6: 19, 9: 8, 11: 2}
MEMORY 5.016195072
(Round 2) Aggregating seqlets in each cluster
MEMORY 5.016195072
Aggregating for cluster 0 with 70 seqlets
MEMORY 5.016195072

Trimming eliminated 0 seqlets out of 70
Aggregating for cluster 1 with 59 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 59
Aggregating for cluster 2 with 49 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 49
Aggregating for cluster 3 with 34 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 34
Aggregating for cluster 4 with 32 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 32
Aggregating for cluster 5 with 30 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 30
Dropping cluster 5 with 30 seqlets due to sign disagreement
Aggregating for cluster 6 with 19 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 19
Aggregating for cluster 7 with 10 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 10
Aggregating for cluster 8 with 8 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 8
Aggregating for cluster 9 with 8 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 8
Aggregating for cluster 10 with 4 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 4
Aggregating for cluster 11 with 2 seqlets
MEMORY 5.016195072
Trimming eliminated 0 seqlets out of 2
Got 11 clusters
Splitting into subclusters...
MEMORY 5.016195072
Inspecting for spurious merging
Wrote graph to binary file in 0.010092973709106445 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00517704
After 2 runs, maximum modularity is Q = 0.00533899
Louvain completed 22 runs in 1.954697847366333 seconds
Similarity is 0.9182600307818493; is_dissimilar is False
Merging on 11 clusters
MEMORY 5.016195072
On merging iteration 1
Numbers for each pattern pre-subsample: [70, 59, 49, 34, 32, 19, 10, 8, 8, 4, 2]
Numbers after subsampling: [70, 59, 49, 34, 32, 19, 10, 8, 8, 4, 2]
Cluster sizes
[70 59 49 34 32 19 10  8  8  4  2]
Cross-contamination matrix:
[[1.   0.89 0.78 0.61 0.88 0.58 0.65 0.51 0.61 0.28 0.1 ]
 [0.39 1.   0.48 0.6  0.8  0.44 0.63 0.18 0.35 0.2  0.  ]
 [0.55 0.78 1.   0.6  0.74 0.5  0.61 0.38 0.46 0.26 0.02]
 [0.52 1.   0.72 1.   1.   0.69 0.85 0.34 0.68 1.   0.4 ]
 [0.4  0.83 0.49 0.65 1.   0.49 0.64 0.26 0.4  0.36 0.08]
 [0.76 0.99 0.85 0.89 1.   1.   0.85 0.68 0.84 1.   0.73]
 [0.47 0.81 0.58 0.73 0.81 0.55 1.   0.22 0.65 0.82 0.45]
 [0.81 0.9  0.9  0.85 0.95 0.91 0.81 1.   0.91 1.   0.73]
 [0.78 0.99 0.86 0.88 1.   0.75 0.92 0.56 1.   0.53 0.  ]
 [0.07 0.42 0.2  0.44 0.45 0.3  0.42 0.1  0.06 1.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.89 0.84 0.74 0.88 0.82 0.75 0.66 0.8  0.72 0.28]
 [0.89 1.   0.91 0.93 0.97 0.94 0.88 0.73 0.88 0.9  0.21]
 [0.84 0.91 1.   0.83 0.87 0.86 0.83 0.7  0.82 0.81 0.26]
 [0.75 0.94 0.84 1.   0.94 0.92 0.9  0.7  0.83 0.91 0.51]
 [0.88 0.97 0.87 0.93 1.   0.93 0.9  0.73 0.88 0.89 0.33]
 [0.82 0.95 0.86 0.92 0.93 1.   0.86 0.74 0.83 0.89 0.51]
 [0.76 0.9  0.84 0.91 0.91 0.86 1.   0.65 0.84 0.84 0.44]
 [0.67 0.73 0.71 0.7  0.73 0.75 0.65 1.   0.71 0.72 0.43]
 [0.83 0.9  0.84 0.87 0.91 0.88 0.86 0.72 1.   0.84 0.21]
 [0.72 0.9  0.8  0.95 0.89 0.93 0.87 0.75 0.81 1.   0.5 ]
 [0.37 0.27 0.34 0.65 0.44 0.65 0.57 0.48 0.22 0.62 1.  ]]
Collapsing 1 & 4 with crosscontam 0.8176465113354457 and sim 0.9695266153385633
Collapsing 1 & 5 with crosscontam 0.7176170363590976 and sim 0.9435892377933439
Collapsing 3 & 4 with crosscontam 0.8239405014935662 and sim 0.9358638618866091
Collapsing 1 & 3 with crosscontam 0.8008300579445232 and sim 0.9327991242154157
Collapsing 4 & 5 with crosscontam 0.74609375 and sim 0.9285477150649859
Collapsing 3 & 5 with crosscontam 0.7896901196729197 and sim 0.9160058213174238
Collapsing 3 & 9 with crosscontam 0.7178308823529411 and sim 0.9143864086351157
Collapsing 1 & 2 with crosscontam 0.6318155884131943 and sim 0.9059107730847076
Collapsing 3 & 6 with crosscontam 0.7895190311418686 and sim 0.9024267213992921
Collapsing 1 & 9 with crosscontam 0.31221896469454036 and sim 0.9011003816583107
Collapsing 4 & 6 with crosscontam 0.72749072265625 and sim 0.8997322024149481
Collapsing 0 & 1 with crosscontam 0.6377001321754616 and sim 0.8901811618958332
Aborting collapse as 0 & 9 have cross-contam 0.17266946064139954 and sim 0.7238362546609658
Collapsing 5 & 9 with crosscontam 0.6513157894736843 and sim 0.888777809206303
Collapsing 0 & 4 with crosscontam 0.6357822429334458 and sim 0.8825777196540496
Aborting collapse as 0 & 9 have cross-contam 0.17266946064139954 and sim 0.7238362546609658
Collapsing 1 & 6 with crosscontam 0.7176333607622981 and sim 0.8817925979489356
Collapsing 1 & 8 with crosscontam 0.6695372488180388 and sim 0.8768388507845826
Collapsing 4 & 8 with crosscontam 0.699615478515625 and sim 0.875288779858161
Collapsing 2 & 4 with crosscontam 0.6153400676225268 and sim 0.8734231287843459
Collapsing 2 & 5 with crosscontam 0.6718460706850443 and sim 0.8621775320520364
Collapsing 5 & 6 with crosscontam 0.7015168391893862 and sim 0.8615505659238425
Trimming eliminated 0 seqlets out of 91
Trimming eliminated 0 seqlets out of 110
Trimming eliminated 0 seqlets out of 144
Trimming eliminated 0 seqlets out of 148
Trimming eliminated 0 seqlets out of 197
Trimming eliminated 0 seqlets out of 207
Trimming eliminated 0 seqlets out of 215
Unmerged patterns remapping: OrderedDict([(0, 1), (7, 2), (10, 3)])
Time spent on merging iteration: 15.3963782787323
On merging iteration 2
Numbers for each pattern pre-subsample: [215, 70, 8, 2]
Numbers after subsampling: [215, 70, 8, 2]
Cluster sizes
[215  70   8   2]
Cross-contamination matrix:
[[1.   0.68 0.43 0.62]
 [0.76 1.   0.51 0.1 ]
 [0.9  0.81 1.   0.73]
 [0.   0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.87 0.75 0.53]
 [0.87 1.   0.66 0.28]
 [0.75 0.67 1.   0.43]
 [0.69 0.37 0.48 1.  ]]
Collapsing 0 & 1 with crosscontam 0.7174101967514751 and sim 0.8670842595436395
Trimming eliminated 0 seqlets out of 285
Unmerged patterns remapping: OrderedDict([(2, 1), (3, 2)])
Time spent on merging iteration: 1.736173152923584
On merging iteration 3
Numbers for each pattern pre-subsample: [285, 8, 2]
Numbers after subsampling: [285, 8, 2]
Cluster sizes
[285   8   2]
Cross-contamination matrix:
[[1.   0.51 0.77]
 [0.88 1.   0.73]
 [0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.74 0.54]
 [0.75 1.   0.43]
 [0.71 0.48 1.  ]]
Got 3 patterns after merging
MEMORY 5.018832896
Performing seqlet reassignment
MEMORY 5.018832896
Cross contin jaccard time taken: 0.01 s
Cross contin jaccard time taken: 0.01 s
Discarded 5 seqlets
Skipped 3 seqlets
Skipped 4 seqlets
Skipped 5 seqlets
Got 1 patterns after reassignment
MEMORY 5.018832896
Total time taken is 36.55s
MEMORY 5.018832896
On metacluster 0
Metacluster size 15458
Relevant tasks:  ('early0',)
Relevant signs:  (1,)
TfModiscoSeqletsToPatternsFactory: seed=1234
(Round 1) num seqlets: 15458
(Round 1) Computing coarse affmat
MEMORY 5.015949312
Beginning embedding computation
Computing embeddings
Finished embedding computation in 3.45 s
Starting affinity matrix computations
Normalization computed in 0.77 s
Cosine similarity mat computed in 2.38 s
Normalization computed in 0.85 s
Cosine similarity mat computed in 3.16 s
Finished affinity matrix computations in 10.9 s
(Round 1) Compute nearest neighbors from coarse affmat
MEMORY 6.017482752
Computed nearest neighbors in 21.25 s
MEMORY 6.269038592
(Round 1) Computing affinity matrix on nearest neighbors
MEMORY 6.269038592
Launching nearest neighbors affmat calculation job
MEMORY 6.269534208
Parallel runs completed
MEMORY 6.356226048
Job completed in: 111.92 s
MEMORY 8.267812864
Launching nearest neighbors affmat calculation job
MEMORY 8.266051584
Parallel runs completed
MEMORY 8.309354496
Job completed in: 111.49 s
MEMORY 10.220945408
(Round 1) Computed affinity matrix on nearest neighbors in 232.88 s
MEMORY 8.308051968
Filtered down to 7832 of 15458
(Round 1) Retained 7832 rows out of 15458 after filtering
MEMORY 8.308219904
(Round 1) Computing density adapted affmat
MEMORY 5.931544576
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 7832 samples in 0.052s...
[t-SNE] Computed neighbors for 7832 samples in 0.548s...
[t-SNE] Computed conditional probabilities for sample 1000 / 7832
[t-SNE] Computed conditional probabilities for sample 2000 / 7832
[t-SNE] Computed conditional probabilities for sample 3000 / 7832
[t-SNE] Computed conditional probabilities for sample 4000 / 7832
[t-SNE] Computed conditional probabilities for sample 5000 / 7832
[t-SNE] Computed conditional probabilities for sample 6000 / 7832
[t-SNE] Computed conditional probabilities for sample 7000 / 7832
[t-SNE] Computed conditional probabilities for sample 7832 / 7832
[t-SNE] Mean sigma: 0.205819
(Round 1) Computing clustering
MEMORY 5.931544576
Beginning preprocessing + Leiden
  0%|          | 0/50 [00:00<?, ?it/s]
Quality: 0.8636595161973041
  6%|▌         | 3/50 [00:02<00:42,  1.12it/s]
Quality: 0.863690266224263
 22%|██▏       | 11/50 [00:09<00:34,  1.12it/s]
Quality: 0.8636963147141808
100%|██████████| 50/50 [00:48<00:00,  1.04it/s]
Got 22 clusters after round 1
Counts:
{8: 398, 12: 234, 2: 690, 0: 1136, 6: 477, 3: 688, 1: 1016, 9: 371, 7: 402, 4: 664, 10: 358, 15: 87, 11: 257, 5: 611, 13: 123, 14: 106, 17: 44, 21: 19, 16: 48, 18: 39, 20: 25, 19: 39}
MEMORY 5.441589248
(Round 1) Aggregating seqlets in each cluster
MEMORY 5.441589248
Aggregating for cluster 0 with 1136 seqlets
MEMORY 5.441589248

Trimming eliminated 0 seqlets out of 1136
Skipped 170 seqlets
Aggregating for cluster 1 with 1016 seqlets
MEMORY 5.449715712
Trimming eliminated 0 seqlets out of 1016
Skipped 160 seqlets
Aggregating for cluster 2 with 690 seqlets
MEMORY 5.451026432
Trimming eliminated 0 seqlets out of 690
Skipped 112 seqlets
Aggregating for cluster 3 with 688 seqlets
MEMORY 5.451124736
Trimming eliminated 0 seqlets out of 688
Skipped 83 seqlets
Aggregating for cluster 4 with 664 seqlets
MEMORY 5.451825152
Trimming eliminated 0 seqlets out of 664
Skipped 110 seqlets
Aggregating for cluster 5 with 611 seqlets
MEMORY 5.45288192
Trimming eliminated 0 seqlets out of 611
Skipped 59 seqlets
Removed 1 duplicate seqlets
Aggregating for cluster 6 with 477 seqlets
MEMORY 5.454200832
Trimming eliminated 0 seqlets out of 477
Skipped 73 seqlets
Removed 1 duplicate seqlets
Aggregating for cluster 7 with 402 seqlets
MEMORY 5.454381056
Trimming eliminated 0 seqlets out of 402
Skipped 49 seqlets
Removed 6 duplicate seqlets
Aggregating for cluster 8 with 398 seqlets
MEMORY 5.454696448
Trimming eliminated 0 seqlets out of 398
Skipped 56 seqlets
Aggregating for cluster 9 with 371 seqlets
MEMORY 5.45548288
Trimming eliminated 0 seqlets out of 371
Skipped 55 seqlets
Aggregating for cluster 10 with 358 seqlets
MEMORY 5.456211968
Trimming eliminated 0 seqlets out of 358
Skipped 57 seqlets
Aggregating for cluster 11 with 257 seqlets
MEMORY 5.4567936
Trimming eliminated 0 seqlets out of 257
Skipped 39 seqlets
Aggregating for cluster 12 with 234 seqlets
MEMORY 5.4567936
Trimming eliminated 0 seqlets out of 234
Skipped 23 seqlets
Removed 1 duplicate seqlets
Aggregating for cluster 13 with 123 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 123
Skipped 13 seqlets
Aggregating for cluster 14 with 106 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 106
Skipped 16 seqlets
Aggregating for cluster 15 with 87 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 87
Skipped 4 seqlets
Aggregating for cluster 16 with 48 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 48
Skipped 2 seqlets
Aggregating for cluster 17 with 44 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 44
Skipped 5 seqlets
Aggregating for cluster 18 with 39 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 39
Skipped 8 seqlets
Aggregating for cluster 19 with 39 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 39
Skipped 5 seqlets
Aggregating for cluster 20 with 25 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 25
Skipped 2 seqlets
Aggregating for cluster 21 with 19 seqlets
MEMORY 5.457256448
Trimming eliminated 0 seqlets out of 19
Skipped 1 seqlets
(Round 2) num seqlets: 6721
(Round 2) Computing coarse affmat
MEMORY 5.457317888
Beginning embedding computation
Computing embeddings
Finished embedding computation in 1.51 s
Starting affinity matrix computations
Normalization computed in 0.11 s
Cosine similarity mat computed in 0.34 s
Normalization computed in 0.11 s
Cosine similarity mat computed in 0.36 s
Finished affinity matrix computations in 1.06 s
(Round 2) Compute nearest neighbors from coarse affmat
MEMORY 5.64047872
Computed nearest neighbors in 5.14 s
MEMORY 5.49904384
(Round 2) Computing affinity matrix on nearest neighbors
MEMORY 5.49904384
Launching nearest neighbors affmat calculation job
MEMORY 5.49904384
Parallel runs completed
MEMORY 5.502263296
Job completed in: 48.48 s
MEMORY 5.74076928
Launching nearest neighbors affmat calculation job
MEMORY 5.740507136
Parallel runs completed
MEMORY 5.740535808
Job completed in: 48.15 s
MEMORY 5.979303936
(Round 2) Computed affinity matrix on nearest neighbors in 99.14 s
MEMORY 5.86313728
Not applying filtering for rounds above first round
MEMORY 5.86313728
(Round 2) Computing density adapted affmat
MEMORY 5.682446336
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 6721 samples in 0.039s...
[t-SNE] Computed neighbors for 6721 samples in 0.427s...
[t-SNE] Computed conditional probabilities for sample 1000 / 6721
[t-SNE] Computed conditional probabilities for sample 2000 / 6721
[t-SNE] Computed conditional probabilities for sample 3000 / 6721
[t-SNE] Computed conditional probabilities for sample 4000 / 6721
[t-SNE] Computed conditional probabilities for sample 5000 / 6721
[t-SNE] Computed conditional probabilities for sample 6000 / 6721
[t-SNE] Computed conditional probabilities for sample 6721 / 6721
[t-SNE] Mean sigma: 0.198409
(Round 2) Computing clustering
MEMORY 5.682446336
Beginning preprocessing + Leiden
  0%|          | 0/50 [00:00<?, ?it/s]
Quality: 0.8126913631612114
 10%|█         | 5/50 [00:05<00:44,  1.00it/s]
Quality: 0.8127346623249788
 44%|████▍     | 22/50 [00:23<00:26,  1.04it/s]
Quality: 0.8127370574914725
100%|██████████| 50/50 [00:53<00:00,  1.07s/it]
Got 21 clusters after round 2
Counts:
{9: 197, 1: 959, 13: 65, 5: 591, 2: 824, 0: 1086, 12: 92, 20: 9, 10: 108, 4: 607, 17: 33, 3: 765, 11: 105, 6: 556, 15: 36, 7: 350, 18: 33, 8: 216, 14: 37, 16: 34, 19: 18}
MEMORY 5.321854976
(Round 2) Aggregating seqlets in each cluster
MEMORY 5.321854976
Aggregating for cluster 0 with 1086 seqlets
MEMORY 5.321854976

Trimming eliminated 0 seqlets out of 1086
Removed 1 duplicate seqlets
Aggregating for cluster 1 with 959 seqlets
MEMORY 5.329977344
Trimming eliminated 0 seqlets out of 959
Aggregating for cluster 2 with 824 seqlets
MEMORY 5.331468288
Trimming eliminated 0 seqlets out of 824
Aggregating for cluster 3 with 765 seqlets
MEMORY 5.332598784
Trimming eliminated 0 seqlets out of 765
Removed 1 duplicate seqlets
Aggregating for cluster 4 with 607 seqlets
MEMORY 5.333725184
Trimming eliminated 0 seqlets out of 607
Removed 1 duplicate seqlets
Aggregating for cluster 5 with 591 seqlets
MEMORY 5.3339136
Trimming eliminated 0 seqlets out of 591
Aggregating for cluster 6 with 556 seqlets
MEMORY 5.335748608
Trimming eliminated 0 seqlets out of 556
Aggregating for cluster 7 with 350 seqlets
MEMORY 5.337059328
Trimming eliminated 0 seqlets out of 350
Aggregating for cluster 8 with 216 seqlets
MEMORY 5.33706752
Trimming eliminated 0 seqlets out of 216
Aggregating for cluster 9 with 197 seqlets
MEMORY 5.33706752
Trimming eliminated 0 seqlets out of 197
Removed 2 duplicate seqlets
Aggregating for cluster 10 with 108 seqlets
MEMORY 5.33706752
Trimming eliminated 0 seqlets out of 108
Aggregating for cluster 11 with 105 seqlets
MEMORY 5.33706752
Trimming eliminated 0 seqlets out of 105
Aggregating for cluster 12 with 92 seqlets
MEMORY 5.33706752
Trimming eliminated 0 seqlets out of 92
Aggregating for cluster 13 with 65 seqlets
MEMORY 5.33706752
Trimming eliminated 0 seqlets out of 65
Aggregating for cluster 14 with 37 seqlets
MEMORY 5.33706752
Trimming eliminated 0 seqlets out of 37
Aggregating for cluster 15 with 36 seqlets
MEMORY 5.337141248
Trimming eliminated 0 seqlets out of 36
Aggregating for cluster 16 with 34 seqlets
MEMORY 5.337141248
Trimming eliminated 0 seqlets out of 34
Aggregating for cluster 17 with 33 seqlets
MEMORY 5.337141248
Trimming eliminated 0 seqlets out of 33
Aggregating for cluster 18 with 33 seqlets
MEMORY 5.337174016
Trimming eliminated 0 seqlets out of 33
Aggregating for cluster 19 with 18 seqlets
MEMORY 5.337206784
Trimming eliminated 0 seqlets out of 18
Aggregating for cluster 20 with 9 seqlets
MEMORY 5.337284608
Trimming eliminated 0 seqlets out of 9
Got 21 clusters
Splitting into subclusters...
MEMORY 5.337284608
Inspecting for spurious merging
Wrote graph to binary file in 2.1527435779571533 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00243054
After 2 runs, maximum modularity is Q = 0.00319172
After 3 runs, maximum modularity is Q = 0.00319693
After 8 runs, maximum modularity is Q = 0.00319768
Louvain completed 28 runs in 5.7417497634887695 seconds
Similarity is 0.9811859486730274; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 1.6634547710418701 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00482853
Louvain completed 21 runs in 3.5369105339050293 seconds
Similarity is 0.9777218805037741; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 1.2452082633972168 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00599627
Louvain completed 21 runs in 3.123029947280884 seconds
Similarity is 0.9636695405446599; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 1.1269004344940186 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00321584
After 2 runs, maximum modularity is Q = 0.00327473
After 3 runs, maximum modularity is Q = 0.00327542
Louvain completed 23 runs in 3.6529152393341064 seconds
Similarity is 0.9819158520300739; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.6667885780334473 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00387127
After 2 runs, maximum modularity is Q = 0.00508321
After 9 runs, maximum modularity is Q = 0.00508658
Louvain completed 29 runs in 3.6062188148498535 seconds
Similarity is 0.9665087616283833; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.6751658916473389 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00483503
After 3 runs, maximum modularity is Q = 0.00483527
Louvain completed 23 runs in 2.913325548171997 seconds
Similarity is 0.9717481253842172; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.5968408584594727 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00588013
Louvain completed 21 runs in 2.3297648429870605 seconds
Similarity is 0.9611556892909403; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.22212481498718262 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00811433
After 2 runs, maximum modularity is Q = 0.00863091
Louvain completed 22 runs in 2.2323577404022217 seconds
Similarity is 0.869617845221448; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.09486722946166992 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0080102
Louvain completed 21 runs in 2.0635647773742676 seconds
Similarity is 0.8918617510473941; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.09072184562683105 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00879764
After 2 runs, maximum modularity is Q = 0.00880089
Louvain completed 22 runs in 2.122711181640625 seconds
Similarity is 0.936068762520081; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.027962684631347656 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0175494
Louvain completed 21 runs in 1.8410894870758057 seconds
Similarity is 0.7404218143141033; is_dissimilar is True
Inspecting for spurious merging
Wrote graph to binary file in 0.008305549621582031 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0069309
After 3 runs, maximum modularity is Q = 0.00693091
Louvain completed 23 runs in 2.0871529579162598 seconds
Similarity is 0.9123212801962478; is_dissimilar is False
Got 2 subclusters
Inspecting for spurious merging
Wrote graph to binary file in 0.021019697189331055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.014338
After 3 runs, maximum modularity is Q = 0.0143812
Louvain completed 23 runs in 2.0372679233551025 seconds
Similarity is 0.7105289608109463; is_dissimilar is True
Inspecting for spurious merging
Wrote graph to binary file in 0.009943008422851562 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00505182
After 3 runs, maximum modularity is Q = 0.00573164
After 4 runs, maximum modularity is Q = 0.00585798
After 19 runs, maximum modularity is Q = 0.00585799
Louvain completed 39 runs in 3.459679365158081 seconds
Similarity is 0.9325228016392013; is_dissimilar is False
Got 2 subclusters
Inspecting for spurious merging
Wrote graph to binary file in 0.01705193519592285 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0194596
After 2 runs, maximum modularity is Q = 0.0196088
Louvain completed 22 runs in 2.119094133377075 seconds
Similarity is 0.7663955360348126; is_dissimilar is True
Got 2 subclusters
Inspecting for spurious merging
Wrote graph to binary file in 0.008786201477050781 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0337022
Louvain completed 21 runs in 1.8234524726867676 seconds
Similarity is 0.3225928830239633; is_dissimilar is True
Got 2 subclusters
Merging on 25 clusters
MEMORY 5.337284608
On merging iteration 1
Numbers for each pattern pre-subsample: [1085, 959, 824, 764, 606, 591, 556, 350, 216, 195, 61, 47, 70, 35, 47, 45, 43, 22, 37, 36, 34, 33, 33, 18, 9]
Numbers after subsampling: [300, 300, 300, 300, 300, 300, 300, 300, 216, 195, 61, 47, 70, 35, 47, 45, 43, 22, 37, 36, 34, 33, 33, 18, 9]
Applying left/right pad of 0 and 1 for (8785, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (8785, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (30712, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (30712, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (3637, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (3637, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (15424, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (15424, 136, 161) with total sequence length 160
Applying left/right pad of 1 and 0 for (19811, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (19811, -1, 24) with total sequence length 160
Applying left/right pad of 0 and 1 for (8785, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (8785, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (30712, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (30712, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (3637, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (3637, 136, 161) with total sequence length 160
Applying left/right pad of 1 and 0 for (2013, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (2013, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (18339, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (18339, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (9053, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (9053, -1, 24) with total sequence length 160
Applying left/right pad of 0 and 1 for (20574, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (20574, 136, 161) with total sequence length 160
Applying left/right pad of 1 and 0 for (26451, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (26451, -1, 24) with total sequence length 160
Cluster sizes
[1085  959  824  764  606  591  556  350  216  195   61   47   70   35
   47   45   43   22   37   36   34   33   33   18    9]
Cross-contamination matrix:
[[1.   0.6  0.4  0.83 0.37 0.78 0.   0.03 0.   0.27 0.01 0.03 0.   0.
  0.02 0.5  0.29 0.   0.   0.07 0.   0.27 0.   0.   0.01]
 [0.95 1.   0.37 0.85 0.34 0.86 0.   0.   0.   0.26 0.   0.01 0.   0.
  0.02 0.54 0.23 0.   0.   0.06 0.   0.41 0.   0.   0.  ]
 [0.74 0.5  1.   0.65 0.61 0.76 0.   0.   0.   0.33 0.01 0.07 0.   0.
  0.04 0.5  0.28 0.   0.   0.34 0.   0.38 0.   0.   0.01]
 [0.85 0.55 0.31 1.   0.28 0.82 0.   0.   0.   0.2  0.   0.02 0.   0.
  0.01 0.48 0.22 0.   0.   0.07 0.   0.39 0.   0.   0.  ]
 [0.82 0.57 0.74 0.72 1.   0.7  0.   0.04 0.   0.31 0.01 0.03 0.   0.
  0.04 0.39 0.19 0.   0.   0.38 0.   0.36 0.   0.   0.01]
 [0.85 0.64 0.51 0.87 0.37 1.   0.   0.   0.   0.4  0.   0.04 0.   0.
  0.02 0.58 0.28 0.   0.   0.16 0.   0.56 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.   0.01 0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   0.02 0.  ]
 [0.12 0.01 0.02 0.   0.1  0.01 0.01 1.   0.   0.15 0.01 0.03 0.01 0.25
  0.02 0.01 0.01 0.01 0.02 0.   0.01 0.07 0.07 0.01 0.01]
 [0.05 0.02 0.03 0.02 0.02 0.03 0.03 0.02 1.   0.04 0.06 0.05 0.1  0.1
  0.06 0.04 0.04 0.02 0.1  0.05 0.08 0.04 0.03 0.02 0.04]
 [1.   0.76 0.69 0.9  0.58 1.   0.   0.11 0.   1.   0.04 0.17 0.   0.01
  0.08 0.73 0.53 0.   0.01 0.24 0.   0.67 0.   0.   0.  ]
 [0.   0.   0.01 0.   0.   0.01 0.   0.   0.   0.02 1.   0.07 0.   0.
  0.   0.   0.01 0.   0.   0.   0.02 0.01 0.   0.   0.  ]
 [0.93 0.81 1.   0.9  0.76 1.   0.22 0.16 0.09 0.95 0.83 1.   0.1  0.2
  0.47 0.93 0.85 0.09 0.09 0.77 0.04 0.86 0.03 0.06 0.23]
 [0.01 0.01 0.02 0.   0.01 0.01 0.   0.02 0.05 0.01 0.05 0.02 1.   0.39
  0.01 0.03 0.02 0.05 0.01 0.03 0.03 0.02 0.01 0.03 0.03]
 [0.06 0.05 0.03 0.01 0.06 0.02 0.02 0.42 0.1  0.07 0.03 0.1  0.45 1.
  0.05 0.02 0.03 0.11 0.02 0.01 0.05 0.02 0.06 0.03 0.02]
 [0.38 0.36 0.35 0.34 0.3  0.38 0.01 0.02 0.01 0.29 0.04 0.16 0.01 0.02
  1.   0.95 0.2  0.01 0.05 0.3  0.02 0.36 0.02 0.01 0.02]
 [0.37 0.22 0.13 0.36 0.04 0.37 0.   0.   0.   0.1  0.   0.   0.   0.
  0.06 1.   0.06 0.   0.   0.05 0.   0.3  0.   0.   0.  ]
 [0.86 0.74 0.7  0.82 0.56 0.87 0.02 0.01 0.   0.64 0.06 0.19 0.01 0.01
  0.12 0.7  1.   0.01 0.   0.51 0.   0.72 0.   0.   0.03]
 [0.02 0.03 0.17 0.03 0.12 0.13 0.1  0.12 0.1  0.13 0.21 0.17 0.27 0.27
  0.14 0.09 0.2  1.   0.1  0.16 0.22 0.03 0.13 0.16 0.16]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.01 0.01 0.   0.   0.   0.
  0.01 0.   0.   0.   1.   0.   0.   0.   0.   0.   0.  ]
 [0.02 0.01 0.04 0.02 0.01 0.01 0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.02 0.   0.   0.   1.   0.   0.03 0.   0.   0.  ]
 [0.   0.   0.   0.   0.   0.   0.   0.01 0.02 0.   0.05 0.   0.01 0.01
  0.01 0.   0.   0.01 0.01 0.   1.   0.   0.02 0.   0.  ]
 [0.84 0.63 0.56 0.9  0.35 0.8  0.   0.01 0.   0.47 0.01 0.05 0.   0.01
  0.03 0.69 0.28 0.   0.   0.56 0.   1.   0.   0.   0.01]
 [0.17 0.18 0.18 0.18 0.2  0.17 0.09 0.32 0.18 0.19 0.19 0.18 0.17 0.22
  0.24 0.22 0.17 0.18 0.26 0.19 0.22 0.17 1.   0.14 0.13]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  0.   0.   0.   0.   0.   0.   0.   0.   0.   1.   0.  ]
 [0.05 0.07 0.07 0.07 0.05 0.09 0.   0.   0.   0.   0.   0.03 0.   0.
  0.   0.06 0.02 0.   0.   0.02 0.   0.07 0.   0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.96 0.86 0.98 0.87 0.97 0.22 0.41 0.24 0.92 0.49 0.74 0.32 0.25
  0.53 0.93 0.9  0.24 0.26 0.74 0.26 0.95 0.24 0.19 0.38]
 [0.96 1.   0.84 0.96 0.85 0.96 0.26 0.32 0.22 0.86 0.5  0.72 0.31 0.26
  0.55 0.92 0.89 0.27 0.28 0.73 0.2  0.94 0.27 0.25 0.37]
 [0.86 0.84 1.   0.87 0.9  0.89 0.29 0.36 0.24 0.84 0.61 0.84 0.34 0.31
  0.56 0.85 0.85 0.31 0.26 0.86 0.28 0.88 0.28 0.25 0.41]
 [0.98 0.96 0.87 1.   0.84 0.98 0.22 0.27 0.18 0.89 0.5  0.74 0.29 0.22
  0.54 0.94 0.9  0.23 0.3  0.77 0.26 0.97 0.29 0.23 0.4 ]
 [0.87 0.85 0.9  0.84 1.   0.83 0.26 0.4  0.22 0.76 0.52 0.7  0.41 0.29
  0.53 0.8  0.79 0.28 0.33 0.79 0.2  0.81 0.26 0.24 0.4 ]
 [0.97 0.96 0.89 0.98 0.83 1.   0.25 0.34 0.22 0.92 0.55 0.79 0.29 0.28
  0.56 0.94 0.93 0.25 0.27 0.76 0.26 0.96 0.24 0.23 0.4 ]
 [0.22 0.28 0.29 0.22 0.27 0.26 1.   0.17 0.34 0.27 0.25 0.42 0.31 0.25
  0.35 0.23 0.31 0.27 0.36 0.35 0.26 0.26 0.37 0.61 0.23]
 [0.44 0.35 0.36 0.29 0.44 0.38 0.17 1.   0.16 0.38 0.3  0.39 0.28 0.61
  0.45 0.37 0.37 0.28 0.34 0.27 0.28 0.36 0.55 0.19 0.53]
 [0.24 0.22 0.24 0.19 0.22 0.22 0.34 0.16 1.   0.29 0.29 0.29 0.45 0.44
  0.34 0.23 0.23 0.27 0.46 0.24 0.5  0.24 0.31 0.3  0.34]
 [0.92 0.87 0.84 0.89 0.76 0.92 0.27 0.42 0.28 1.   0.59 0.83 0.34 0.25
  0.55 0.87 0.87 0.25 0.29 0.7  0.26 0.9  0.25 0.23 0.32]
 [0.49 0.5  0.61 0.5  0.52 0.55 0.25 0.3  0.29 0.59 1.   0.74 0.43 0.25
  0.48 0.51 0.5  0.38 0.21 0.49 0.28 0.54 0.28 0.12 0.37]
 [0.74 0.72 0.85 0.74 0.7  0.79 0.42 0.39 0.28 0.83 0.74 1.   0.37 0.35
  0.57 0.75 0.76 0.29 0.38 0.69 0.28 0.78 0.29 0.32 0.45]
 [0.32 0.31 0.34 0.29 0.41 0.29 0.27 0.28 0.45 0.34 0.43 0.36 1.   0.7
  0.38 0.31 0.26 0.45 0.49 0.34 0.47 0.36 0.34 0.41 0.4 ]
 [0.26 0.27 0.31 0.22 0.3  0.28 0.25 0.61 0.44 0.26 0.25 0.35 0.7  1.
  0.28 0.27 0.28 0.44 0.32 0.21 0.44 0.25 0.29 0.33 0.39]
 [0.53 0.55 0.57 0.54 0.53 0.56 0.36 0.42 0.34 0.56 0.48 0.57 0.39 0.28
  1.   0.77 0.51 0.27 0.49 0.49 0.37 0.59 0.53 0.34 0.36]
 [0.93 0.92 0.85 0.94 0.8  0.94 0.23 0.36 0.23 0.87 0.51 0.75 0.31 0.27
  0.77 1.   0.88 0.23 0.27 0.75 0.21 0.95 0.32 0.2  0.37]
 [0.91 0.9  0.86 0.91 0.79 0.93 0.31 0.35 0.23 0.88 0.51 0.76 0.26 0.28
  0.51 0.88 1.   0.31 0.19 0.72 0.32 0.9  0.17 0.25 0.35]
 [0.24 0.27 0.32 0.24 0.28 0.25 0.27 0.26 0.27 0.26 0.38 0.29 0.45 0.44
  0.28 0.23 0.31 1.   0.32 0.27 0.43 0.25 0.31 0.33 0.37]
 [0.26 0.28 0.26 0.3  0.33 0.27 0.36 0.32 0.46 0.29 0.21 0.38 0.49 0.3
  0.49 0.27 0.19 0.31 1.   0.21 0.47 0.27 0.55 0.38 0.32]
 [0.74 0.74 0.86 0.77 0.83 0.78 0.35 0.27 0.24 0.7  0.49 0.69 0.37 0.22
  0.51 0.79 0.75 0.28 0.21 1.   0.27 0.85 0.26 0.2  0.37]
 [0.26 0.21 0.28 0.26 0.2  0.26 0.25 0.28 0.5  0.26 0.28 0.28 0.47 0.44
  0.37 0.21 0.32 0.43 0.47 0.26 1.   0.26 0.38 0.3  0.27]
 [0.95 0.94 0.88 0.98 0.81 0.96 0.26 0.37 0.24 0.9  0.54 0.78 0.36 0.25
  0.59 0.95 0.9  0.25 0.27 0.81 0.26 1.   0.27 0.23 0.4 ]
 [0.24 0.27 0.28 0.29 0.26 0.24 0.37 0.5  0.26 0.25 0.28 0.29 0.35 0.29
  0.53 0.32 0.17 0.29 0.55 0.26 0.38 0.27 1.   0.32 0.32]
 [0.19 0.25 0.25 0.23 0.24 0.23 0.59 0.19 0.3  0.23 0.12 0.32 0.41 0.33
  0.37 0.2  0.25 0.33 0.38 0.19 0.3  0.23 0.32 1.   0.33]
 [0.38 0.38 0.41 0.4  0.4  0.4  0.23 0.51 0.33 0.32 0.37 0.45 0.4  0.39
  0.36 0.37 0.35 0.37 0.32 0.36 0.27 0.4  0.32 0.33 1.  ]]
Collapsing 0 & 3 with crosscontam 0.8424691883950617 and sim 0.977506280724056
Collapsing 3 & 5 with crosscontam 0.8424606691358025 and sim 0.9758888005873498
Collapsing 3 & 21 with crosscontam 0.6465415286380796 and sim 0.9737931448523224
Collapsing 0 & 5 with crosscontam 0.8140867933333332 and sim 0.9657826617310513
Collapsing 5 & 21 with crosscontam 0.6782947107438017 and sim 0.9621936988394604
Collapsing 1 & 5 with crosscontam 0.7540697795061729 and sim 0.9594281775465897
Collapsing 0 & 1 with crosscontam 0.7762820355555556 and sim 0.9582759841382814
Collapsing 1 & 3 with crosscontam 0.7006987153086419 and sim 0.9559080704349763
Collapsing 0 & 21 with crosscontam 0.5570965074621327 and sim 0.9528100835710842
Collapsing 15 & 21 with crosscontam 0.49331178945871335 and sim 0.9497017261186713
Collapsing 5 & 15 with crosscontam 0.47664109538180166 and sim 0.9405489369821229
Collapsing 1 & 21 with crosscontam 0.5218025505375146 and sim 0.9375886732777322
Collapsing 3 & 15 with crosscontam 0.41981046310013725 and sim 0.9360939983678489
Collapsing 0 & 15 with crosscontam 0.4358911694558756 and sim 0.925563731435409
Collapsing 5 & 16 with crosscontam 0.5770796394568566 and sim 0.9255033056236921
Collapsing 5 & 9 with crosscontam 0.699029469705603 and sim 0.9232080770387449
Collapsing 1 & 15 with crosscontam 0.3821192914494743 and sim 0.9203976820225563
Collapsing 0 & 9 with crosscontam 0.6363639356563664 and sim 0.9158657063189878
Collapsing 2 & 4 with crosscontam 0.6717751639506172 and sim 0.9047561652914753
Collapsing 0 & 16 with crosscontam 0.5718839821101239 and sim 0.9042565231518179
Collapsing 3 & 16 with crosscontam 0.5235582130126906 and sim 0.8999100288083058
Collapsing 9 & 21 with crosscontam 0.570376916494575 and sim 0.897991316604705
Collapsing 3 & 9 with crosscontam 0.5477413830689437 and sim 0.8914598472702375
Collapsing 2 & 5 with crosscontam 0.6336154461728395 and sim 0.8903671274543389
Collapsing 0 & 4 with crosscontam 0.5954305454320987 and sim 0.8712395475613437
Collapsing 9 & 16 with crosscontam 0.5851559177411649 and sim 0.8696885777334511
Collapsing 0 & 2 with crosscontam 0.5673200128395061 and sim 0.8637459322952143
Collapsing 1 & 9 with crosscontam 0.506302804268447 and sim 0.8633534095318718
Trimming eliminated 0 seqlets out of 1849
Trimming eliminated 0 seqlets out of 2440
Trimming eliminated 0 seqlets out of 2473
Skipped 4 seqlets
Removed 1 duplicate seqlets
Trimming eliminated 0 seqlets out of 3427
Trimming eliminated 0 seqlets out of 3472
Trimming eliminated 0 seqlets out of 3515
Trimming eliminated 0 seqlets out of 3710
Removed 2 duplicate seqlets
Trimming eliminated 0 seqlets out of 1430
Trimming eliminated 0 seqlets out of 5138
Unmerged patterns remapping: OrderedDict([(6, 1), (7, 2), (8, 3), (10, 5), (11, 6), (12, 4), (13, 10), (14, 7), (17, 13), (18, 8), (19, 9), (20, 11), (22, 12), (23, 14), (24, 15)])
Time spent on merging iteration: 226.5939757823944
On merging iteration 2
Numbers for each pattern pre-subsample: [5138, 556, 350, 216, 70, 61, 47, 47, 37, 36, 35, 34, 33, 22, 18, 9]
Numbers after subsampling: [300, 300, 300, 216, 70, 61, 47, 47, 37, 36, 35, 34, 33, 22, 18, 9]
Applying left/right pad of 0 and 1 for (8785, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (8785, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (30712, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (30712, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (3637, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (3637, 136, 161) with total sequence length 160
Applying left/right pad of 1 and 0 for (33821, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (33821, -1, 24) with total sequence length 160
Applying left/right pad of 0 and 1 for (24192, 136, 161) with total sequence length 160
Applying left/right pad of 0 and 1 for (24192, 136, 161) with total sequence length 160
Applying left/right pad of 1 and 0 for (4704, -1, 24) with total sequence length 160
Applying left/right pad of 1 and 0 for (4704, -1, 24) with total sequence length 160
Applying left/right pad of 0 and 2 for (22539, 137, 162) with total sequence length 160
Applying left/right pad of 0 and 2 for (22539, 137, 162) with total sequence length 160
Cluster sizes
[5138  556  350  216   70   61   47   47   37   36   35   34   33   22
   18    9]
Cross-contamination matrix:
[[1.   0.   0.08 0.   0.   0.02 0.12 0.09 0.   0.37 0.   0.   0.   0.
  0.   0.01]
 [0.   1.   0.   0.   0.   0.   0.01 0.   0.   0.   0.   0.   0.   0.
  0.02 0.  ]
 [0.12 0.01 1.   0.   0.01 0.01 0.03 0.02 0.02 0.   0.25 0.01 0.07 0.01
  0.01 0.01]
 [0.02 0.03 0.02 1.   0.1  0.06 0.05 0.06 0.1  0.05 0.1  0.08 0.03 0.02
  0.02 0.04]
 [0.01 0.   0.02 0.05 1.   0.05 0.02 0.01 0.01 0.03 0.39 0.03 0.01 0.05
  0.03 0.03]
 [0.01 0.   0.   0.   0.   1.   0.07 0.   0.   0.   0.   0.02 0.   0.
  0.   0.  ]
 [0.91 0.22 0.16 0.09 0.1  0.83 1.   0.47 0.09 0.77 0.2  0.04 0.03 0.09
  0.06 0.23]
 [0.36 0.01 0.02 0.01 0.01 0.04 0.16 1.   0.05 0.3  0.02 0.02 0.02 0.01
  0.01 0.02]
 [0.   0.   0.   0.01 0.   0.   0.   0.01 1.   0.   0.   0.   0.   0.
  0.   0.  ]
 [0.02 0.   0.   0.   0.   0.   0.   0.   0.   1.   0.   0.   0.   0.
  0.   0.  ]
 [0.01 0.02 0.42 0.1  0.45 0.03 0.1  0.05 0.02 0.01 1.   0.05 0.06 0.11
  0.03 0.02]
 [0.   0.   0.01 0.02 0.01 0.05 0.   0.01 0.01 0.   0.01 1.   0.02 0.01
  0.   0.  ]
 [0.18 0.09 0.32 0.18 0.17 0.19 0.18 0.24 0.26 0.19 0.22 0.22 1.   0.18
  0.14 0.13]
 [0.11 0.1  0.12 0.1  0.27 0.21 0.17 0.14 0.1  0.16 0.27 0.22 0.13 1.
  0.16 0.16]
 [0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
  1.   0.  ]
 [0.07 0.   0.   0.   0.   0.   0.03 0.   0.   0.02 0.   0.   0.   0.
  0.   1.  ]]
Pattern-to-pattern sim matrix:
[[1.   0.25 0.4  0.2  0.29 0.55 0.79 0.56 0.24 0.8  0.25 0.26 0.27 0.23
  0.23 0.4 ]
 [0.25 1.   0.17 0.34 0.31 0.25 0.42 0.35 0.36 0.35 0.25 0.26 0.37 0.27
  0.61 0.23]
 [0.43 0.17 1.   0.16 0.28 0.3  0.39 0.45 0.34 0.27 0.61 0.28 0.55 0.28
  0.19 0.53]
 [0.21 0.34 0.16 1.   0.45 0.29 0.29 0.34 0.46 0.24 0.44 0.5  0.31 0.27
  0.3  0.34]
 [0.29 0.27 0.28 0.45 1.   0.43 0.36 0.38 0.49 0.34 0.7  0.47 0.34 0.45
  0.41 0.4 ]
 [0.55 0.25 0.3  0.29 0.43 1.   0.74 0.48 0.21 0.49 0.25 0.28 0.28 0.38
  0.12 0.37]
 [0.79 0.42 0.39 0.28 0.37 0.74 1.   0.57 0.38 0.69 0.35 0.28 0.29 0.29
  0.32 0.45]
 [0.57 0.36 0.42 0.34 0.39 0.48 0.57 1.   0.49 0.49 0.28 0.37 0.53 0.27
  0.34 0.36]
 [0.24 0.36 0.32 0.46 0.49 0.21 0.38 0.49 1.   0.21 0.3  0.47 0.55 0.31
  0.38 0.32]
 [0.8  0.35 0.27 0.24 0.37 0.49 0.69 0.51 0.21 1.   0.22 0.27 0.26 0.28
  0.2  0.37]
 [0.25 0.25 0.61 0.44 0.7  0.25 0.35 0.28 0.32 0.21 1.   0.44 0.29 0.44
  0.33 0.39]
 [0.26 0.25 0.28 0.5  0.47 0.28 0.28 0.37 0.47 0.26 0.44 1.   0.38 0.43
  0.3  0.27]
 [0.27 0.37 0.5  0.26 0.35 0.28 0.29 0.53 0.55 0.26 0.29 0.38 1.   0.29
  0.32 0.32]
 [0.23 0.27 0.26 0.27 0.45 0.38 0.29 0.28 0.32 0.27 0.44 0.43 0.31 1.
  0.33 0.37]
 [0.23 0.59 0.19 0.3  0.41 0.12 0.32 0.37 0.38 0.19 0.33 0.3  0.32 0.33
  1.   0.33]
 [0.41 0.23 0.51 0.33 0.4  0.37 0.45 0.36 0.32 0.36 0.39 0.27 0.32 0.37
  0.33 1.  ]]
Got 16 patterns after merging
MEMORY 5.398626304
Performing seqlet reassignment
MEMORY 5.398626304
Cross contin jaccard time taken: 0.05 s
Cross contin jaccard time taken: 0.05 s
Discarded 93 seqlets
Skipped 40 seqlets
Skipped 30 seqlets
Skipped 1 seqlets
Skipped 7 seqlets
Skipped 2 seqlets
Skipped 4 seqlets
Skipped 97 seqlets
Skipped 8 seqlets
Skipped 6 seqlets
Skipped 2 seqlets
Skipped 1 seqlets
Skipped 4 seqlets
Skipped 95 seqlets
Skipped 18 seqlets
Skipped 6 seqlets
Skipped 9 seqlets
Skipped 5 seqlets
Skipped 3 seqlets
Skipped 1 seqlets
Skipped 2 seqlets
Skipped 2 seqlets
Skipped 1 seqlets
Skipped 1 seqlets
Skipped 2 seqlets
Skipped 1 seqlets
Got 6 patterns after reassignment
MEMORY 5.402775552
Total time taken is 860.97s
MEMORY 5.402775552
In [10]:
modisco_hdf = '/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out/results.hdf5'
grp = h5py.File(modisco_hdf)
tfmodisco_results.save_hdf5(grp)
/users/msharmin/anaconda2/envs/aitac/lib/python3.7/site-packages/ipykernel_launcher.py:2: H5pyDeprecationWarning: The default file mode will change to 'r' (read-only) in h5py 3.0. To suppress this warning, pass the mode you need to h5py.File(), or set the global default h5.get_config().default_file_mode, or set the environment variable H5PY_DEFAULT_READONLY=1. Available modes are: 'r', 'r+', 'w', 'w-'/'x', 'a'. See the docs for details.
  
In [8]:
from matlas.matches import DenovoModisco, DenovoHomer
from vdom.helpers import (b, summary, details)
from IPython.display import display
import numpy as np


def show_patterns_using_hoccomocco_db(sample_name, modiscodir):
    ob = DenovoModisco(modiscodir)
#     ob.fetch_tomtom_matches(
#                 meme_db="/mnt/lab_data/kundaje/users/msharmin/annotations/HOCOMOCOv11_core_pwms_HUMAN_mono.renamed.nonredundant.annotated.meme",
#                 database_name="HOCOMOCO.nonredundant.annotated",
#                 save_report=True, tomtom_dir= "{0}/{1}_tomtomout".format(modiscodir, "HOCOMOCO.nonredundant.annotated"))
    ob.load_matched_motifs(database_name="HOCOMOCO.nonredundant.annotated")
    ob.get_motif_per_celltype(match_threshold=0.03, database_name="HOCOMOCO.nonredundant.annotated")
    #ob.display_individual_table()
    
    pattern_tab, pattern_dict = ob.visualize_pattern_table()
    display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
                        ' in ', b(sample_name),
                        ": #{}".format(len(pattern_dict)),
                       ), pattern_tab))
    return None



def display_denovo_patterns(sample_name, modiscodir, match_threshold=0.05):
    display(summary(b(sample_name)))
    
    ob = DenovoModisco(modiscodir)
#     ob.fetch_tomtom_matches(save_report=True, 
#                                   tomtom_dir= "{0}/{1}_tomtomout".format(modiscodir, "CISBP_2.00"))
    
    ob.load_matched_motifs()
    ob.get_motif_per_celltype(match_threshold=match_threshold)
    pattern_tab, pattern_dict = ob.visualize_pattern_table()
    display(details(summary('Click here for ', b('Denovo Patterns'), ' by ', b('{}'.format('MoDISco')),
                        ' in ', b(sample_name),
                        ": #{}".format(len(pattern_dict)),
                       ), pattern_tab))
    #ob.display_individual_table()
    
    return None

Early timepoint

In [9]:
sample_name = 'early_fold0'

display_denovo_patterns(
    sample_name,
    modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"
)
early_fold0
Click here for Denovo Patterns by MoDISco in early_fold0: #6
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 5085 SequenceContrib ScoresHyp_Contrib Scores
Jund, Fos, Junb, Fosb, Jdp2, Fosl1, Atf3, Fosl2, Jun, Batf,

Bach2, Nfe2, Nfe2l2, Bach1
metacluster_0/pattern_1 # seqlets: 537 SequenceContrib ScoresHyp_Contrib Scores
Trp63, Trp73, Trp53, Tcfcp2l1
metacluster_0/pattern_2 # seqlets: 370 SequenceContrib ScoresHyp_Contrib Scores
Jund, Jun, Batf, Fos, Fosl1, Fosl2, Jdp2, Junb, Fosb, Atf3,

Tead1, Tead4
metacluster_0/pattern_3 # seqlets: 226 SequenceContrib ScoresHyp_Contrib Scores
Ctcf, Ctcfl
metacluster_0/pattern_4 # seqlets: 109 SequenceContrib ScoresHyp_Contrib Scores
Runx1, Cbfb, Runx2
metacluster_0/pattern_5 # seqlets: 87 SequenceContrib ScoresHyp_Contrib Scores
Atf2, Jdp2, Creb5, Atf7, Creb1, Atf3, Atf1, Crem
In [10]:
sample_name = 'early_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/modisco_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)
Click here for Denovo Patterns by MoDISco in early_fold0: #6
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 5085 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-124_FOSB.UNK.0.A, HCLUST-101_NFE2.UNK.0.A
metacluster_0/pattern_1 # seqlets: 537 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-170_TP53.UNK.0.A
metacluster_0/pattern_2 # seqlets: 370 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-156_TEAD1.UNK.0.A, HCLUST-124_FOSB.UNK.0.A
metacluster_0/pattern_3 # seqlets: 226 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-149_CTCFL.UNK.0.A, HCLUST-134_INSM1.UNK.0.A
metacluster_0/pattern_4 # seqlets: 109 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-176_CBFB.UNK.0.A
metacluster_0/pattern_5 # seqlets: 87 SequenceContrib ScoresHyp_Contrib Scores
HCLUST-175_ATF1.UNK.0.A

Late timepoint

In [11]:
sample_name = 'late_fold0'

display_denovo_patterns(
    sample_name,
    modiscodir="/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"
)
late_fold0
Click here for Denovo Patterns by MoDISco in late_fold0: #10
Pattern NameTF Name(s)Modisco
metacluster_0/pattern_0 # seqlets: 2434 SequenceContrib ScoresHyp_Contrib Scores
Jund, Junb, Fos, Fosb, Jdp2, Fosl1, Atf3, Fosl2, Batf, Jun,

Bach2, Nfe2, Nfe2l2, Bach1
metacluster_0/pattern_1 # seqlets: 641 SequenceContrib ScoresHyp_Contrib Scores
Cebpb, Cebpa, Dbp, Cebpg, Hlf, Cebpd, Nfil3, Cebpe, Tef, Atf4,

Ddit3
metacluster_0/pattern_2 # seqlets: 470 SequenceContrib ScoresHyp_Contrib Scores
Grhl2
metacluster_0/pattern_3 # seqlets: 372 SequenceContrib ScoresHyp_Contrib Scores
Klf1, Klf4, Klf8, Klf12, Klf7, Sp4, Klf5, Klf3, Klf6
metacluster_0/pattern_4 # seqlets: 367 SequenceContrib ScoresHyp_Contrib Scores
Atf4, Ddit3, Cebpg, Cebpb, Cebpd, Nfil3, Cebpa, Dbp
metacluster_0/pattern_5 # seqlets: 311 SequenceContrib ScoresHyp_Contrib Scores
Trp73, Trp63, Trp53
metacluster_0/pattern_6 # seqlets: 109 SequenceContrib ScoresHyp_Contrib Scores
Atf2, Jdp2, Creb5, Atf7, Creb1, Atf3, Crem, Nfil3, Mafb
metacluster_0/pattern_7 # seqlets: 88 SequenceContrib ScoresHyp_Contrib Scores
Ctcf, Ctcfl
metacluster_0/pattern_8 # seqlets: 69 SequenceContrib ScoresHyp_Contrib Scores
Tfap2a, Tfap2c, Tfap2e, Tfap2b
metacluster_0/pattern_9 # seqlets: 66 SequenceContrib ScoresHyp_Contrib Scores
In [ ]:
sample_name = 'late_fold0'
modiscodir = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_late/modisco_out"

show_patterns_using_hoccomocco_db(sample_name, modiscodir)

Loading keras model

In [22]:
from matlas.model_test import getSkinModel
from matlas.model_test import setup_keras_session
setup_keras_session('4')
init_weights = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/weights_from_raw_tf.p"
model = getSkinModel(init_weights, 19, classification=False)
model_h5 = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/model.h5"
model.save(model_h5)
model.summary()
channels_last
compiling!
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
input (InputLayer)           (None, 1000, 4)           0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 1000, 300)         23100     
_________________________________________________________________
batch_normalization_11 (Batc (None, 1000, 300)         1200      
_________________________________________________________________
activation_14 (Activation)   (None, 1000, 300)         0         
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 333, 300)          0         
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 333, 200)          660200    
_________________________________________________________________
batch_normalization_12 (Batc (None, 333, 200)          800       
_________________________________________________________________
activation_15 (Activation)   (None, 333, 200)          0         
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 83, 200)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 83, 200)           280200    
_________________________________________________________________
batch_normalization_13 (Batc (None, 83, 200)           800       
_________________________________________________________________
activation_16 (Activation)   (None, 83, 200)           0         
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 20, 200)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 4000)              0         
_________________________________________________________________
dense_5 (Dense)              (None, 1000)              4001000   
_________________________________________________________________
batch_normalization_14 (Batc (None, 1000)              4000      
_________________________________________________________________
activation_17 (Activation)   (None, 1000)              0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
batch_normalization_15 (Batc (None, 1000)              4000      
_________________________________________________________________
activation_18 (Activation)   (None, 1000)              0         
_________________________________________________________________
dropout_6 (Dropout)          (None, 1000)              0         
_________________________________________________________________
final_dense19 (Dense)        (None, 19)                19019     
=================================================================
Total params: 5,995,319
Trainable params: 5,989,919
Non-trainable params: 5,400
_________________________________________________________________
In [11]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    labels = fp['labels'][:]
    logits = fp['logits'][:]
    seqs = fp['sequence'][:]
labels.shape, logits.shape, seqs.shape
Out[11]:
((35024, 19), (35024, 19), (35024, 1, 1000, 4))
In [23]:
from matlas.generators import EmbeddingsGenerator
def get_predictions(cur_seqs, model):
    e_generator = EmbeddingsGenerator(cur_seqs, batch_size=1000, num_rows=cur_seqs.shape[0])
    #batch = e_generator.get_batch(i)
    #e = model.predict_on_batch(batch[0])
    e = model.predict_generator(
                e_generator,
                max_queue_size=100,
                workers=1,
                use_multiprocessing=False,
                verbose=1
            )
    return e
keras_op = get_predictions(np.squeeze(seqs[:1000]), model)
1/1 [==============================] - 1s 664ms/step
In [24]:
from matplotlib import pylab as plt
# plt.scatter(activations_all['activation_2/Relu:0'][:1000,0,0,0], cnv1[:1000,0,0,0])
plt.scatter(logits[:1000,0], keras_op[:1000,0])
plt.xlabel('raw tensorflow prediction')
plt.ylabel('keras predictions')
Out[24]:
Text(0, 0.5, 'keras predictions')
In [25]:
import scipy.stats
print(scipy.stats.pearsonr(logits[:1000,0], keras_op[:1000,0]))
print(scipy.stats.spearmanr(logits[:1000,0], keras_op[:1000,0]))
(0.7495659591048396, 4.981212730424334e-181)
SpearmanrResult(correlation=0.7251773091773093, pvalue=6.437351695707496e-164)

Deeplifting the keras model

In [8]:
from matlas.deeplift_run import *
contrib_funcs, input_layer_shape = retrieve_func_from_model(
    model_h5, 
    algorithm="rescale_conv_revealcancel_fc", 
    regression=True,
    sequential=False, 
    w0=None, w1=None, logger=None)
input_layer_shape
load data from labcluster
TF-MoDISco is using the TensorFlow backend.
nonlinear_mxts_mode is set to: DeepLIFT_GenomicsDefault
For layer activation_4_0 the preceding linear layer is conv1d_8_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_5_0 the preceding linear layer is conv1d_9_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_6_0 the preceding linear layer is conv1d_10_0 of type Conv1D;
In accordance with nonlinear_mxts_mode=DeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to Rescale
Heads-up: current implementation assumes maxpool layer is followed by a linear transformation (conv/dense layer)
For layer activation_7_0 the preceding linear layer is dense_1_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
For layer activation_8_0 the preceding linear layer is dense_2_0 of type Dense;
In accordance with nonlinear_mxts_modeDeepLIFT_GenomicsDefault we are setting the NonlinearMxtsMode to RevealCancel
Out[8]:
[None, 1000, 4]
In [9]:
#provide list of strings to run deeplift
# def read_ggr_active_sequences(ggr_h5):
#     with h5py.File(ggr_h5, "r") as fp:
#         seqs = fp['sequence.active.string'][:]
#     sequences = []
#     for seq in seqs:
#         sequences.append(seq[0].decode('utf-8'))
    
#     return sequences

# ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
# sequences = read_ggr_sequences(ggrfile)
# type(sequences), len(sequences) #sequences[0], sequences[1], seqs[0,0], seqs[1,0]
Out[9]:
(list, 35024)
In [3]:
def get_genome_coordinates(ggr_h5, bed_file):
    with h5py.File(ggr_h5, "r") as fp:
        regions = fp['example_metadata'][:]
    
    chroms = []
    starts = []
    ends = []
    for region in regions[:,0]:
        region = region.decode("utf-8")
        if region!='':
            region = region.split("features=")[1]
        else:
            continue
        chroms.append(region.split(":")[0])
        starts.append(region.split(":")[1].split("-")[0])
        ends.append(region.split(":")[1].split("-")[1])
    df = pd.DataFrame({'chrom': chroms, 'start':starts, 'end': ends})
    df.to_csv(bed_file, header=False, index=False, sep="\t", compression="gzip")
    return None
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
bed_file = "/mnt/lab_data2/msharmin/oc-atlas/DanSkinData/fold_0_ggr/result_early/regions.bed.gz"
get_genome_coordinates(ggrfile, bed_file)
In [27]:
from matlas.model_layer import retrieve_sequences
sequences, intervals_wo_flanks = retrieve_sequences(
    bed_file, 
    fasta_file="/mnt/lab_data3/dskim89/ggr/annotations/hg19.genome.fa", flank_size=0)
In [28]:
num_refs_per_seq = 10
from deeplift.dinuc_shuffle import dinuc_shuffle
from matlas.model_layer import one_hot_encode_along_col_axis
from matlas.dlutils import get_shuffled_seqs
input_data_list, input_references_list = get_shuffled_seqs(sequences[:45], num_refs_per_seq, shuffle_func=dinuc_shuffle,
                                                                one_hot_func=lambda x: np.array([one_hot_encode_along_col_axis(seq) for seq in x]),
                                                                progress_update=10000)
input_data_list[0].shape, len(sequences[0])
# input_data_list = [np.expand_dims(input_data_list[0], axis=1)]
# input_references_list = [np.expand_dims(input_references_list[0], axis=1)]
One hot encoding sequences...
One hot encoding done...
Out[28]:
((450, 1000, 4), 1000)
In [11]:
from matlas.dlutils import get_given_seq_ref_function
shuffled_score_funcs = {input_name: get_given_seq_ref_function(score_computation_function=score_func)
                        for input_name, score_func in contrib_funcs.items()}
In [29]:
task_idx = 0
batch_size = 256
num_refs_per_seq = 10
for input_name, score_func in shuffled_score_funcs.items():
    hyp_scores = None
    b = 10000
    c = int(np.ceil(1.0*len(input_data_list[0])/b))
    for si in range(c):
        if(si==c-1):
            tmp = score_func(task_idx=int(task_idx), input_data_list=[input_data_list[0][si*b:len(input_data_list[0])]],
                               input_references_list=[input_references_list[0][si*b:len(input_data_list[0])]],
                               num_refs_per_seq=num_refs_per_seq, batch_size=batch_size,
                               progress_update=10000)
        else:
            #print('batch: ', si, si*b, (si+1)*b) 
            tmp = score_func(task_idx=int(task_idx), input_data_list=[input_data_list[0][si*b:(si+1)*b]],
                               input_references_list=[input_references_list[0][si*b:(si+1)*b]],
                               num_refs_per_seq=num_refs_per_seq, 
                               batch_size=batch_size,
                               progress_update=10000)
        if(hyp_scores is None):
            hyp_scores = tmp
        else:
            hyp_scores = np.vstack((hyp_scores, tmp))
    input_data_list[0] = np.squeeze(input_data_list[0])
    input_references_list[0] = np.squeeze(input_references_list[0])
    one_hot = input_data_list[0][[range(0, len(input_data_list[0]), num_refs_per_seq)]]
    shuffled_onehot = input_references_list[0].reshape((one_hot.shape[0], num_refs_per_seq, 
                                                       input_references_list[0].shape[-2], #seq_len
                                                        input_references_list[0].shape[-1]))#alphabet 
    scores = np.multiply(hyp_scores, one_hot)
       
hyp_scores.shape, one_hot.shape, scores.shape
Done 0
/users/msharmin/anaconda2/envs/basepair13/lib/python3.6/site-packages/ipykernel_launcher.py:27: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
In [ ]:
# create_deeplift_h5(bed_file, score_hdf, hyp_scores, one_hot, shuffled_onehot)
In [8]:
ggrfile = "/mnt/lab_data3/dskim89/ggr/nn/2019-03-12.freeze/motifs.input_x_grad.early/ggr.scanmotifs.h5"
with h5py.File(ggrfile, "r") as fp:
    print(list(fp))
    scores_ggr = fp['sequence-weighted'][:]
    scores_ggr_active = fp['sequence-weighted.active'][:]
scores_ggr.shape, scores_ggr_active.shape
['ATAC_LABELS', 'ATAC_SIGNALS', 'ATAC_SIGNALS.NORM', 'CTCF_LABELS', 'CTCF_SIGNALS', 'CTCF_SIGNALS.NORM', 'DYNAMIC_MARK_LABELS', 'DYNAMIC_STATE_LABELS', 'H3K27ac_LABELS', 'H3K27ac_SIGNALS', 'H3K27ac_SIGNALS.NORM', 'H3K27me3_LABELS', 'H3K27me3_SIGNALS', 'H3K27me3_SIGNALS.NORM', 'H3K4me1_LABELS', 'H3K4me1_SIGNALS', 'H3K4me1_SIGNALS.NORM', 'KLF4_LABELS', 'POL2_LABELS', 'STABLE_MARK_LABELS', 'STABLE_STATE_LABELS', 'TP63_LABELS', 'TRAJ_LABELS', 'ZNF750_LABELS', 'example_metadata', 'gradients', 'labels', 'logits', 'logits.ci', 'logits.ci.thresh', 'logits.multimodel', 'logits.multimodel.norm', 'logits.norm', 'positive_importance_bp_sum', 'probs', 'pwm-scores.null.idx', 'sequence', 'sequence-weighted', 'sequence-weighted.active', 'sequence-weighted.active.ci', 'sequence-weighted.active.ci.thresh', 'sequence-weighted.active.pwm-scores.thresh', 'sequence-weighted.active.pwm-scores.thresh.max.idx', 'sequence-weighted.active.pwm-scores.thresh.max.val', 'sequence-weighted.active.pwm-scores.thresh.sum', 'sequence-weighted.thresholds', 'sequence.active', 'sequence.active.gc_fract', 'sequence.active.pwm-hits', 'sequence.active.pwm-hits.densities', 'sequence.active.pwm-hits.densities.max', 'sequence.active.pwm-scores.thresh', 'sequence.active.pwm-scores.thresh.sum', 'sequence.active.string']
Out[8]:
((35024, 10, 1000, 4), (35024, 10, 160, 4))
In [10]:
import modisco.visualization
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(scores_ggr[0,0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr_active[0,0], subticks_frequency=20)
-0.4860307276248932 1.3380632400512695
-0.0440836176276207 0.1500825583934784
In [5]:
scores_ggr.shape
Out[5]:
(35024, 10, 1000, 4)
In [36]:
import modisco.visualization
from modisco.visualization import viz_sequence

viz_sequence.plot_weights(scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(hyp_scores[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(one_hot[0][500:600], subticks_frequency=20)
viz_sequence.plot_weights(scores_ggr[0][500:600], subticks_frequency=20)
-0.0688343504909426 0.35437235310673715
-0.976342553505674 0.35437235310673715
0.0 1.0
In [ ]: