Goal

Apply co-clustering to the scaled motif table.

from basepair.imports import *
from basepair.exp.chipnexus.motif_clustering import to_colors, preproc_motif_table, motif_table_long, scale

# Task names handed to `preproc_motif_table` when the motif table is built.
tasks = ['Oct4', 'Sox2', 'Klf4', 'Nanog']

Load the data

def load_df(modisco_run, min_n_seqlets=100, base_url=None):
    """Load the pattern table of a modisco run and keep well-supported patterns.

    Reads ``<base_url>/<modisco_run>/pattern_table.csv``, adds a
    ``metacluster`` column parsed from the pattern name and a
    ``log n seqlets`` column, then drops patterns with too few seqlets.

    Args:
        modisco_run: name of the modisco run sub-directory.
        min_n_seqlets: smallest ``n seqlets`` value (inclusive) a pattern
            needs in order to be kept.
        base_url: URL or local directory that contains the modisco runs.
            ``None`` (default) selects the n_dil_layers=9 model directory
            on mitra.stanford.edu.

    Returns:
        pandas.DataFrame: an independent copy of the filtered table with
        the extra columns ``metacluster`` (ordered categorical) and
        ``log n seqlets`` (log10 of ``n seqlets``).
    """
    if base_url is None:
        base_url = ("http://mitra.stanford.edu/kundaje/avsec/chipnexus/"
                    "oct-sox-nanog-klf/models/n_dil_layers=9/modisco")
    root = str(base_url).rstrip("/")
    df = pd.read_csv(f"{root}/{modisco_run}/pattern_table.csv")

    # The parsing below requires pattern names of the form "m<int>_<rest>":
    # the text before the first "_" is taken and its "m" removed literally.
    prefix = df["pattern"].str.split("_", expand=True)[0]
    metacluster = prefix.str.replace("m", "", regex=False).astype(int)
    # Built before filtering, so the categories cover every metacluster
    # present in the file, not only the ones that survive the filter.
    df["metacluster"] = pd.Categorical(metacluster, ordered=True)
    df["log n seqlets"] = np.log10(df["n seqlets"])

    # Return a copy so later column assignments do not act on a slice
    # of the unfiltered table.
    return df[df["n seqlets"] >= min_n_seqlets].copy()

# Modisco run whose pattern table is analysed. The name is defined here so
# that the load below does not depend on state from an earlier session.
modisco_run = 'valid'
df = load_df(modisco_run)
dfx, row_df, col_df = preproc_motif_table(df, tasks)

# Work on the transpose of the scaled table.
x = scale(dfx).T

# x is transposed, so the row colours come from col_df and the column
# colours from row_df.
# NOTE(review): presumably row_df / col_df annotate the rows / columns of
# dfx - confirm against preproc_motif_table.
g = sns.clustermap(
    x,
    row_colors=to_colors(col_df),
    col_colors=to_colors(row_df),
    method="weighted",
    figsize=(20, 10),
    cmap='RdBu_r',
    center=0,
);

Apply bi-clustering

from sklearn.cluster.bicluster import SpectralCoclustering
from sklearn.metrics import consensus_score

# Imported here because this cell runs before the notebook's own
# `make_biclusters` import when the cells are executed top to bottom.
from sklearn.datasets import make_biclusters

# Synthetic 300 x 300 matrix with 5 planted biclusters; `rows` and `columns`
# are the indicator arrays returned alongside it.
data, rows, columns = make_biclusters(
    shape=(300, 300), n_clusters=5, noise=5,
    shuffle=False, random_state=0)

data.shape

(300, 300)

rows.shape

(5, 300)

from sklearn.datasets import make_biclusters

# Spectral co-clustering of the scaled matrix into 5 co-clusters (fixed seed).
model = SpectralCoclustering(n_clusters=5, random_state=0).fit(x.values)

# Permute rows and columns so members of the same co-cluster are adjacent.
fit_data = x.iloc[
    np.argsort(model.row_labels_),
    np.argsort(model.column_labels_),
]

# Heatmap in the co-clustering order; hierarchical clustering is switched off.
sns.clustermap(
    fit_data,
    row_cluster=False,
    col_cluster=False,
    figsize=(20, 10),
    cmap='RdBu_r',
    center=0,
)

<seaborn.matrix.ClusterGrid at 0x7ff6a1546f28>

# Same reordered matrix, this time with hierarchical clustering applied to
# both rows and columns.
sns.clustermap(
    fit_data,
    method="weighted",
    row_cluster=True,
    col_cluster=True,
    cmap='RdBu_r',
    center=0,
    figsize=(20, 10),
)

<seaborn.matrix.ClusterGrid at 0x7ff6a0c26630>

from sklearn.metrics import (adjusted_rand_score as ari,
                             normalized_mutual_info_score as nmi)

from coclust.coclustering import (CoclustMod, CoclustSpecMod, CoclustInfo)
from coclust.io.data_loading import load_doc_term_data
from coclust.evaluation.internal import best_modularity_partition
from coclust.evaluation.external import accuracy
from coclust.io.notebook import(input_with_default_int, input_with_default_str)
from coclust.visualization import (plot_max_modularities, 
                                   plot_intermediate_modularities,
                                   plot_cluster_top_terms, 
                                   get_term_graph, 
                                   plot_cluster_sizes)

# Candidate numbers of co-clusters and random initialisations per candidate.
range_n_clusters = list(range(2, 9))
n_rand_init = 1

# Pass the variable rather than a repeated literal so that changing
# `n_rand_init` above actually takes effect.
best_coclustMod_model, all_max_modularities = best_modularity_partition(
    x.values,
    nbr_clusters_range=range_n_clusters,
    n_rand_init=n_rand_init,
)

Computing coclust modularity for a range of cluster numbers =
 2 ...
 3 ...
 4 ...
 5 ...
 6 ...
 7 ...
 8 ...
 All done !

# Fixed seed so the co-cluster assignment is reproducible between runs,
# matching the seeded SpectralCoclustering fit elsewhere in this notebook.
cocluster = CoclustMod(n_clusters=5, random_state=0)

cocluster.fit(x.values)

CoclustMod(init=None, max_iter=20, n_clusters=5, n_init=1, random_state=None,
      tol=1e-09)

x.values.shape

(29, 58)

from coclust.visualization import plot_reorganized_matrix

# Use the underlying NumPy array: indexing the DataFrame with index arrays
# through `[]` raises "TypeError: unhashable type: 'numpy.ndarray'".
X = x.values

model = cocluster

# Row order that places rows with the same co-cluster label next to each
# other; computed here so the display below does not rely on a later cell.
row_indices = np.argsort(model.row_labels_)
row_indices

array([14, 25, 24, 23, 21, 20, 18, 15, 10, 28,  3,  7,  4,  6,  8, 26,  1,
        2, 22,  5, 19, 17, 16, 27, 13, 12, 11,  9,  0])

# NOTE(review): `c` is not defined anywhere in the visible notebook. The
# traceback recorded below is for `X_reorg = X[row_indices, col_indices]`,
# so this cell appears to have been edited after it was last run.
c

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-47-21a833572bc9> in <module>()
----> 1 X_reorg = X[row_indices, col_indices]

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2485         """Return the cached item, item represents a label indexer."""
   2486         cache = self._item_cache
-> 2487         res = cache.get(item)
   2488         if res is None:
   2489             values = self._data.get(item)

TypeError: unhashable type: 'numpy.ndarray'

col_indices

array([ 0, 51, 45, 40, 37, 36, 34, 31, 24, 23, 20, 15, 28,  5, 11,  3,  6,
        1, 10,  7,  9,  8,  4,  2, 13, 41, 42, 43, 44, 48, 47, 49, 50, 52,
       53, 54, 55, 46, 39, 35, 12, 14, 16, 17, 18, 19, 21, 22, 25, 26, 27,
       56, 29, 30, 32, 33, 38, 57])

len(col_indices)

58

X.shape

(29, 58)

# Plain NumPy view of the data, which supports positional index arrays.
X = x.values

# Orderings that group rows / columns by their co-cluster label.
row_indices = np.argsort(model.row_labels_)
col_indices = np.argsort(model.column_labels_)

# np.ix_ builds an open mesh, so rows and columns are permuted in one
# selection instead of two chained ones.
X_reorg = X[np.ix_(row_indices, col_indices)]

# plot_reorganized_matrix indexes its first argument as X[row_indices, :],
# which a DataFrame does not support (TypeError), so pass the NumPy array.
plot_reorganized_matrix(x.values, cocluster)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-40-b77c6dad4e48> in <module>()
----> 1 plot_reorganized_matrix(x, cocluster)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/coclust/visualization/__init__.py in plot_reorganized_matrix(X, model, precision, markersize)
    429     row_indices = np.argsort(model.row_labels_)
    430     col_indices = np.argsort(model.column_labels_)
--> 431     X_reorg = X[row_indices, :]
    432     X_reorg = X_reorg[:, col_indices]
    433     plt.spy(X_reorg, precision=precision, markersize=markersize)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in __getitem__(self, key)
   2686             return self._getitem_multilevel(key)
   2687         else:
-> 2688             return self._getitem_column(key)
   2689 
   2690     def _getitem_column(self, key):

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/frame.py in _getitem_column(self, key)
   2693         # get column
   2694         if self.columns.is_unique:
-> 2695             return self._get_item_cache(key)
   2696 
   2697         # duplicate columns & possible reduce dimensionality

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/pandas/core/generic.py in _get_item_cache(self, item)
   2485         """Return the cached item, item represents a label indexer."""
   2486         cache = self._item_cache
-> 2487         res = cache.get(item)
   2488         if res is None:
   2489             values = self._data.get(item)

TypeError: unhashable type: 'numpy.ndarray'

# NOTE(review): `a` is not defined in the visible notebook, and the traceback
# recorded below reports that get_assignment_matrix() requires two positional
# arguments, `kind` and `i` - supply both before re-running this cell.
cocluster.get_assignment_matrix(a)

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-35-ea791867d2af> in <module>()
----> 1 cocluster.get_assignment_matrix()

TypeError: get_assignment_matrix() missing 2 required positional arguments: 'kind' and 'i'