In [1]:
%matplotlib inline

import numpy as np
import modisco
In [2]:
def one_hot_encode_along_channel_axis(sequence):
    #theano dim ordering, uses row axis for one-hot
    to_return = np.zeros((len(sequence),4), dtype=np.int8)
    seq_to_one_hot_fill_in_array(zeros_array=to_return,
                                 sequence=sequence, one_hot_axis=1)
    return to_return

def seq_to_one_hot_fill_in_array(zeros_array, sequence, one_hot_axis):
    assert one_hot_axis==0 or one_hot_axis==1
    if (one_hot_axis==0):
        assert zeros_array.shape[1] == len(sequence)
    elif (one_hot_axis==1): 
        assert zeros_array.shape[0] == len(sequence)
    #will mutate zeros_array
    for (i,char) in enumerate(sequence):
        if (char=="A" or char=="a"):
            char_idx = 0
        elif (char=="C" or char=="c"):
            char_idx = 1
        elif (char=="G" or char=="g"):
            char_idx = 2
        elif (char=="T" or char=="t"):
            char_idx = 3
        elif (char=="N" or char=="n"):
            continue #leave that pos as all 0's
        else:
            raise RuntimeError("Unsupported character: "+str(char))
        if (one_hot_axis==0):
            zeros_array[char_idx,i] = 1
        elif (one_hot_axis==1):
            zeros_array[i,char_idx] = 1
In [4]:
import os
In [5]:
if not os.path.exists("task0importancescores.npy"):
    !./grab_data.sh
In [6]:
scores = np.load("task0importancescores.npy")
hyp_scores = np.load("task0hypimpscores.npy")
onehot_data = (np.abs(scores) > 0)*1.0
print(scores.shape)
print(hyp_scores.shape)
(2000, 200, 4)
(2000, 200, 4)
In [7]:
import modisco.visualization
from modisco.visualization import viz_sequence

viz_sequence.plot_weights(scores[0])
viz_sequence.plot_weights(hyp_scores[0])
viz_sequence.plot_weights(onehot_data[0])
In [20]:
scores.shape
Out[20]:
(2000, 200, 4)
In [19]:
onehot_data.shape
Out[19]:
(2000, 200, 4)
In [8]:
from imp import reload
In [9]:
%env MKL_THREADING_LAYER=GNU
env: MKL_THREADING_LAYER=GNU
In [10]:
import theano
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'/tmp/try_flags_y1gejpf1.c:4:19: fatal error: cudnn.h: No such file or directory\ncompilation terminated.\n'
Mapped name None to device cuda: GeForce GTX TITAN X (0000:04:00.0)
In [15]:
import h5py
import numpy as np
%matplotlib inline
import modisco
reload(modisco)
import modisco.backend
reload(modisco.backend.theano_backend)
reload(modisco.backend)
import modisco.nearest_neighbors
reload(modisco.nearest_neighbors)
import modisco.affinitymat
reload(modisco.affinitymat.core)
reload(modisco.affinitymat.transformers)
import modisco.tfmodisco_workflow.seqlets_to_patterns
reload(modisco.tfmodisco_workflow.seqlets_to_patterns)
import modisco.tfmodisco_workflow.workflow
reload(modisco.tfmodisco_workflow.workflow)
import modisco.aggregator
reload(modisco.aggregator)
import modisco.cluster
reload(modisco.cluster.core)
reload(modisco.cluster.phenograph.core)
reload(modisco.cluster.phenograph.cluster)
import modisco.core
reload(modisco.core)
import modisco.coordproducers
reload(modisco.coordproducers)
import modisco.metaclusterers
reload(modisco.metaclusterers)

tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow()(
                task_names=["task0"],
                contrib_scores={'task0': scores},
                hypothetical_contribs={'task0': hyp_scores},
                one_hot=onehot_data)
On task task0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Got 11859 coords
Computing thresholds
Bandwidth calculated: 0.038807619363069534
Computed threshold 0.16936665236949922
5126 coords remaining after thresholding
After resolving overlaps, got 5126 seqlets
2 activity patterns with support >= 200 out of 3 possible patterns
Metacluster sizes:  [3886, 1240]
Idx to activities:  {0: '1', 1: '-1'}
On metacluster 1
Metacluster size 1240
Relevant tasks:  ('task0',)
Relevant signs:  (-1,)
(Round 1) num seqlets: 1240
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 1.88 s
Starting affinity matrix computations
Normalization computed in 0.35 s
Cosine similarity mat computed in 0.51 s
Normalization computed in 0.28 s
Cosine similarity mat computed in 0.45 s
Finished affinity matrix computations in 1.0 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.09 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 6.48 s
Launching nearest neighbors affmat calculation job
Job completed in: 6.25 s
(Round 1) Computed affinity matrix on nearest neighbors in 13.66 s
Filtered down to 600 of 1240
(Round 1) Retained 600 rows out of 1240 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 600 samples in 0.000s...
[t-SNE] Computed neighbors for 600 samples in 0.008s...
[t-SNE] Computed conditional probabilities for sample 600 / 600
[t-SNE] Mean sigma: 0.251327
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.10981249809265137 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    1.7s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    2.1s finished
Louvain completed 200 runs in 3.4234118461608887 seconds
Wrote graph to binary file in 0.27323365211486816 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.770903
Louvain completed 51 runs in 5.580981731414795 seconds
Preproc + Louvain took 9.82592225074768 s
Got 6 clusters after round 1
Counts:
{4: 98, 2: 118, 0: 148, 3: 100, 1: 132, 5: 4}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 148 seqlets
Trimmed 43 out of 148
Skipped 10 seqlets
Aggregating for cluster 1 with 132 seqlets
Trimmed 20 out of 132
Skipped 7 seqlets
Dropping cluster 1 with 105 seqlets due to sign disagreement
Aggregating for cluster 2 with 118 seqlets
Trimmed 26 out of 118
Skipped 4 seqlets
Dropping cluster 2 with 88 seqlets due to sign disagreement
Aggregating for cluster 3 with 100 seqlets
Trimmed 30 out of 100
Skipped 6 seqlets
Dropping cluster 3 with 64 seqlets due to sign disagreement
Aggregating for cluster 4 with 98 seqlets
Trimmed 1 out of 98
Skipped 45 seqlets
Aggregating for cluster 5 with 4 seqlets
Trimmed 0 out of 4
Skipped 1 seqlets
Dropping cluster 5 with 3 seqlets due to sign disagreement
(Round 2) num seqlets: 147
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.27 s
Starting affinity matrix computations
Normalization computed in 0.04 s
Cosine similarity mat computed in 0.05 s
Normalization computed in 0.02 s
Cosine similarity mat computed in 0.02 s
Finished affinity matrix computations in 0.07 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.0 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 1.43 s
Launching nearest neighbors affmat calculation job
Job completed in: 1.51 s
(Round 2) Computed affinity matrix on nearest neighbors in 3.03 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 147 samples in 0.000s...
[t-SNE] Computed neighbors for 147 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 147 / 147
[t-SNE] Mean sigma: 0.280331
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.037200927734375 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    2.7s
Louvain completed 200 runs in 4.3953773975372314 seconds
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    3.2s finished
Wrote graph to binary file in 0.03351950645446777 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.630097
After 3 runs, maximum modularity is Q = 0.639725
After 30 runs, maximum modularity is Q = 0.645429
Louvain completed 80 runs in 8.930379152297974 seconds
Preproc + Louvain took 13.742157459259033 s
Got 6 clusters after round 2
Counts:
{0: 41, 2: 29, 3: 16, 4: 15, 5: 12, 1: 34}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 41 seqlets
Trimmed 2 out of 41
Skipped 2 seqlets
Dropping cluster 0 with 37 seqlets due to sign disagreement
Aggregating for cluster 1 with 34 seqlets
Trimmed 0 out of 34
Skipped 18 seqlets
Skipped 1 seqlets
Dropping cluster 1 with 15 seqlets due to sign disagreement
Aggregating for cluster 2 with 29 seqlets
Trimmed 2 out of 29
Skipped 3 seqlets
Dropping cluster 2 with 24 seqlets due to sign disagreement
Aggregating for cluster 3 with 16 seqlets
Trimmed 1 out of 16
Aggregating for cluster 4 with 15 seqlets
Trimmed 6 out of 15
Skipped 1 seqlets
Aggregating for cluster 5 with 12 seqlets
Trimmed 0 out of 12
Dropping cluster 5 with 12 seqlets due to sign disagreement
Got 2 clusters
Splitting into subclusters...
Merging on 2 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 2 patterns after merging
Performing seqlet reassignment
Got 0 patterns after reassignment
Total time taken is 53.25s
On metacluster 0
Metacluster size 3886
Relevant tasks:  ('task0',)
Relevant signs:  (1,)
(Round 1) num seqlets: 3886
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 5.61 s
Starting affinity matrix computations
Normalization computed in 1.05 s
Cosine similarity mat computed in 2.23 s
Normalization computed in 1.31 s
Cosine similarity mat computed in 2.63 s
Finished affinity matrix computations in 5.0 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.41 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 18.48 s
Launching nearest neighbors affmat calculation job
Job completed in: 18.7 s
(Round 1) Computed affinity matrix on nearest neighbors in 42.14 s
Filtered down to 573 of 3886
(Round 1) Retained 573 rows out of 3886 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 573 samples in 0.000s...
[t-SNE] Computed neighbors for 573 samples in 0.006s...
[t-SNE] Computed conditional probabilities for sample 573 / 573
[t-SNE] Mean sigma: 0.224885
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.14028000831604004 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.3s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    2.0s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    2.4s finished
Louvain completed 200 runs in 3.7841861248016357 seconds
Wrote graph to binary file in 0.46575212478637695 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.67618
After 2 runs, maximum modularity is Q = 0.676719
After 24 runs, maximum modularity is Q = 0.679644
After 25 runs, maximum modularity is Q = 0.680392
Louvain completed 75 runs in 9.333353757858276 seconds
Preproc + Louvain took 13.939557790756226 s
Got 7 clusters after round 1
Counts:
{3: 114, 0: 135, 1: 129, 4: 45, 2: 121, 5: 17, 6: 12}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 135 seqlets
Trimmed 1 out of 135
Skipped 26 seqlets
Aggregating for cluster 1 with 129 seqlets
Trimmed 3 out of 129
Skipped 34 seqlets
Aggregating for cluster 2 with 121 seqlets
Trimmed 12 out of 121
Skipped 21 seqlets
Aggregating for cluster 3 with 114 seqlets
Trimmed 61 out of 114
Skipped 3 seqlets
Skipped 4 seqlets
Aggregating for cluster 4 with 45 seqlets
Trimmed 4 out of 45
Skipped 13 seqlets
Aggregating for cluster 5 with 17 seqlets
Trimmed 0 out of 17
Skipped 1 seqlets
Aggregating for cluster 6 with 12 seqlets
Trimmed 1 out of 12
Skipped 1 seqlets
(Round 2) num seqlets: 388
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.65 s
Starting affinity matrix computations
Normalization computed in 0.08 s
Cosine similarity mat computed in 0.11 s
Normalization computed in 0.04 s
Cosine similarity mat computed in 0.07 s
Finished affinity matrix computations in 0.18 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.02 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 3.03 s
Launching nearest neighbors affmat calculation job
Job completed in: 3.06 s
(Round 2) Computed affinity matrix on nearest neighbors in 6.46 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 388 samples in 0.000s...
[t-SNE] Computed neighbors for 388 samples in 0.006s...
[t-SNE] Computed conditional probabilities for sample 388 / 388
[t-SNE] Mean sigma: 0.208439
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.10068321228027344 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.2s
Louvain completed 200 runs in 3.3328542709350586 seconds
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    2.1s finished
Wrote graph to binary file in 0.17615103721618652 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.617469
After 3 runs, maximum modularity is Q = 0.63808
After 35 runs, maximum modularity is Q = 0.638826
Louvain completed 85 runs in 9.830075025558472 seconds
Preproc + Louvain took 13.805950403213501 s
Got 11 clusters after round 2
Counts:
{1: 98, 0: 99, 4: 17, 10: 10, 6: 16, 9: 13, 2: 69, 5: 16, 8: 15, 7: 15, 3: 20}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 99 seqlets
Trimmed 1 out of 99
Skipped 5 seqlets
Aggregating for cluster 1 with 98 seqlets
Trimmed 3 out of 98
Skipped 10 seqlets
Aggregating for cluster 2 with 69 seqlets
Trimmed 5 out of 69
Skipped 2 seqlets
Aggregating for cluster 3 with 20 seqlets
Trimmed 4 out of 20
Skipped 1 seqlets
Aggregating for cluster 4 with 17 seqlets
Trimmed 4 out of 17
Skipped 1 seqlets
Aggregating for cluster 5 with 16 seqlets
Trimmed 1 out of 16
Skipped 1 seqlets
Aggregating for cluster 6 with 16 seqlets
Trimmed 3 out of 16
Skipped 4 seqlets
Aggregating for cluster 7 with 15 seqlets
Trimmed 3 out of 15
Skipped 1 seqlets
Aggregating for cluster 8 with 15 seqlets
Trimmed 0 out of 15
Skipped 6 seqlets
Aggregating for cluster 9 with 13 seqlets
Trimmed 2 out of 13
Aggregating for cluster 10 with 10 seqlets
Trimmed 0 out of 10
Got 11 clusters
Splitting into subclusters...
Inspecting for spurious merging
Wrote graph to binary file in 0.04774641990661621 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0059212
Louvain completed 21 runs in 2.5512454509735107 seconds
Similarity is 0.99248314; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.036466121673583984 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00680034
After 9 runs, maximum modularity is Q = 0.00680035
Louvain completed 29 runs in 3.3384954929351807 seconds
Similarity is 0.9934135; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.020229101181030273 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0105812
Louvain completed 21 runs in 2.316864490509033 seconds
Similarity is 0.9605332; is_dissimilar is False
Merging on 11 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 0 & 1 with prob 0.004245115603006835 and sim 0.9877162010013057
Trimmed 0 out of 178
On merging iteration 2
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 10 patterns after merging
Performing seqlet reassignment
Cross contin jaccard time taken: 1.19 s
Cross contin jaccard time taken: 1.15 s
Discarded 8 seqlets
Skipped 58 seqlets
Skipped 9 seqlets
Got 2 patterns after reassignment
Total time taken is 120.63s
In [16]:
import h5py
import modisco.util
reload(modisco.util)
!rm task0_results.hdf5
grp = h5py.File("task0_results.hdf5")
tfmodisco_results.save_hdf5(grp)
rm: cannot remove 'task0_results.hdf5': No such file or directory
In [18]:
from collections import Counter
from modisco.visualization import viz_sequence
reload(viz_sequence)
from matplotlib import pyplot as plt

import modisco.affinitymat.core
reload(modisco.affinitymat.core)
import modisco.cluster.phenograph.core
reload(modisco.cluster.phenograph.core)
import modisco.cluster.phenograph.cluster
reload(modisco.cluster.phenograph.cluster)
import modisco.cluster.core
reload(modisco.cluster.core)
import modisco.aggregator
reload(modisco.aggregator)

import sklearn.decomposition
import sklearn.manifold

hdf5_results = h5py.File("task0_results.hdf5")

#patterns = (tfmodisco_results
#            .metacluster_idx_to_submetacluster_results[0]
#            .seqlets_to_patterns_result.patterns);
patterns = (list(hdf5_results
                 ["metacluster_idx_to_submetacluster_results"]
                 ["metacluster0"]
                 ["seqlets_to_patterns_result"]
                 ["patterns"]["all_pattern_names"]))
print(len(patterns))
pattern_grp = (hdf5_results
                 ["metacluster_idx_to_submetacluster_results"]
                 ["metacluster0"]
                 ["seqlets_to_patterns_result"]
                 ["patterns"])

for pattern_name in patterns:
    pattern = pattern_grp[pattern_name]
    print(pattern_name)
    print("total seqlets:",len(pattern["seqlets_and_alnmts"]["seqlets"]))
    #pattern.plot_counts(counts=aggregated_seqlet.get_per_position_seqlet_center_counts())
    background = np.array([0.27, 0.23, 0.23, 0.27])
    print("fwd:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["fwd"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["fwd"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
                                                    background=background))

    print("reverse:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["rev"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["rev"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
                                                    background=background))
2
b'pattern_0'
total seqlets: 189
fwd:
reverse:
b'pattern_1'
total seqlets: 63
fwd:
reverse:
In [15]:
pattern.keys()
Out[15]:
[u'seqlets_and_alnmts',
 u'sequence',
 u'task0_contrib_scores',
 u'task0_hypothetical_contribs']