Goal

  • apply co-clustering
In [12]:
from basepair.imports import *
from basepair.exp.chipnexus.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale
In [7]:
tasks = ['Oct4', 'Sox2', 'Klf4', 'Nanog']

Load the data

In [90]:
def load_df(modisco_run, min_n_seqlets=100):
    df = pd.read_csv(f"http://mitra.stanford.edu/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/{modisco_run}/pattern_table.csv")

    df['metacluster'] = df.pattern.str.split("_", expand=True)[0].str.replace("m", "").astype(int)
    df['metacluster'] = pd.Categorical(df.metacluster, ordered=True)
    df['log n seqlets'] = np.log10(df['n seqlets'])

    # filter 
    df = df[df['n seqlets'] >= min_n_seqlets]
    return df
In [91]:
df = load_df(modisco_run)
In [92]:
df = load_df('valid')
dfx, row_df, col_df = preproc_motif_table(df, tasks)
In [93]:
x = scale(dfx).T
In [94]:
g = sns.clustermap(x,  row_colors=to_colors(col_df), col_colors=to_colors(row_df), method="weighted", figsize=(20, 10), cmap='RdBu_r', center=0);

Apply bi-clustering

In [63]:
from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score
In [64]:
data = x.values
In [67]:
data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5,
    shuffle=False, random_state=0)
In [70]:
data.shape
Out[70]:
(300, 300)
In [69]:
rows.shape
Out[69]:
(5, 300)
In [66]:
from sklearn.datasets import make_biclusters
In [80]:
model = SpectralCoclustering(n_clusters=5, random_state=0)
model.fit(x.values)

fit_data = x.iloc[np.argsort(model.row_labels_)]
fit_data = fit_data.iloc[:, np.argsort(model.column_labels_)]
In [96]:
sns.clustermap(fit_data, col_cluster=False, figsize=(20, 10), row_cluster=False, cmap='RdBu_r', center=0)
Out[96]:
<seaborn.matrix.ClusterGrid at 0x7ff6a1546f28>
In [95]:
sns.clustermap(fit_data, row_cluster=True, col_cluster=True,  method="weighted", figsize=(20, 10), cmap='RdBu_r', center=0)
Out[95]:
<seaborn.matrix.ClusterGrid at 0x7ff6a0c26630>
In [14]:
from sklearn.metrics import (adjusted_rand_score as ari,
                             normalized_mutual_info_score as nmi)

from coclust.coclustering import (CoclustMod, CoclustSpecMod, CoclustInfo)
from coclust.io.data_loading import load_doc_term_data
from coclust.evaluation.internal import best_modularity_partition
from coclust.evaluation.external import accuracy
from coclust.io.notebook import(input_with_default_int, input_with_default_str)
from coclust.visualization import (plot_max_modularities, 
                                   plot_intermediate_modularities,
                                   plot_cluster_top_terms, 
                                   get_term_graph, 
                                   plot_cluster_sizes)
In [23]:
range_n_clusters = list(range(2, 9))
n_rand_init = 1
In [24]:
best_coclustMod_model, all_max_modularities = best_modularity_partition(x.values, 
                                                                        nbr_clusters_range=range_n_clusters, n_rand_init=1)
Computing coclust modularity for a range of cluster numbers =
 2 ...
 3 ...
 4 ...
 5 ...
 6 ...
 7 ...
 8 ...
 All done !
In [28]:
cocluster = CoclustMod(5)
In [29]:
cocluster.fit(x.values)
Out[29]:
CoclustMod(init=None, max_iter=20, n_clusters=5, n_init=1, random_state=None,
      tol=1e-09)
In [34]:
x.values.shape
Out[34]:
(29, 58)
In [37]:
from coclust.visualization import plot_reorganized_matrix
In [41]:
X = x
In [42]:
model = cocluster
In [45]:
row_indices
Out[45]:
array([14, 25, 24, 23, 21, 20, 18, 15, 10, 28,  3,  7,  4,  6,  8, 26,  1,
        2, 22,  5, 19, 17, 16, 27, 13, 12, 11,  9,  0])
In [47]:
c
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-47-21a833572bc9> in <module>()
----> 1 X_reorg = X[row_indices, col_indices]

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2485         """Return the cached item, item represents a label indexer."""
   2486         cache = self._item_cache
-> 2487         res = cache.get(item)
   2488         if res is None:
   2489             values = self._data.get(item)

TypeError: unhashable type: 'numpy.ndarray'
In [49]:
col_indices
Out[49]:
array([ 0, 51, 45, 40, 37, 36, 34, 31, 24, 23, 20, 15, 28,  5, 11,  3,  6,
        1, 10,  7,  9,  8,  4,  2, 13, 41, 42, 43, 44, 48, 47, 49, 50, 52,
       53, 54, 55, 46, 39, 35, 12, 14, 16, 17, 18, 19, 21, 22, 25, 26, 27,
       56, 29, 30, 32, 33, 38, 57])
In [50]:
len(col_indices)
Out[50]:
58
In [51]:
X.shape
Out[51]:
(29, 58)
In [54]:
X = x.values
In [55]:
row_indices = np.argsort(model.row_labels_)
col_indices = np.argsort(model.column_labels_)
X_reorg = X[row_indices, :]
X_reorg = X_reorg[:, col_indices]
In [40]:
plot_reorganized_matrix(x, cocluster)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-40-b77c6dad4e48> in <module>()
----> 1 plot_reorganized_matrix(x, cocluster)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/coclust/visualization/__init__.py in plot_reorganized_matrix(X, model, precision, markersize)
    429     row_indices = np.argsort(model.row_labels_)
    430     col_indices = np.argsort(model.column_labels_)
--> 431     X_reorg = X[row_indices, :]
    432     X_reorg = X_reorg[:, col_indices]
    433     plt.spy(X_reorg, precision=precision, markersize=markersize)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2485         """Return the cached item, item represents a label indexer."""
   2486         cache = self._item_cache
-> 2487         res = cache.get(item)
   2488         if res is None:
   2489             values = self._data.get(item)

TypeError: unhashable type: 'numpy.ndarray'
In [35]:
cocluster.get_assignment_matrix(a)
---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-35-ea791867d2af> in <module>()
----> 1 cocluster.get_assignment_matrix()

TypeError: get_assignment_matrix() missing 2 required positional arguments: 'kind' and 'i'