Goal

  • subcluster the importance scores
  • write the per-instance vectors of the importance scores to file
  • use them when running modisco (e.g. to run it on a subset of regions)
In [1]:
import seaborn as sns
from basepair.config import get_data_dir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from basepair.modisco import ModiscoResult, append_pattern_loc
from basepair.cli.schemas import DataSpec
from basepair.utils import read_pkl
In [2]:
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"
In [3]:
ds = DataSpec.load(model_dir / "dataspec.yaml")
In [4]:
from kipoi.readers import HDF5Reader
In [5]:
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
In [6]:
tasks = list(ds.task_specs)
In [77]:
# Total observed read counts per example and task: each /targets/profile/{task}
# dataset is (example, position, strand), so summing twice over the last axis
# collapses strand then position, leaving one count per example
df = pd.DataFrame({task: d.f[f"/targets/profile/{task}"][:].sum(axis=-1).sum(axis=-1) for task in tasks})
In [78]:
df.head()
Out[78]:
Klf4 Nanog Oct4 Sox2
0 7450.0 12551.0 23203.0 20492.0
1 396.0 13759.0 16198.0 4756.0
2 3047.0 3119.0 6931.0 1390.0
3 1797.0 1738.0 6956.0 1991.0
4 1692.0 1106.0 6298.0 3134.0
In [79]:
dfl = np.log(1+df)
In [7]:
from sklearn.preprocessing import StandardScaler
In [11]:
x = StandardScaler().fit_transform(dfl)
In [12]:
from sklearn.decomposition import PCA
# Project the standardized log-counts onto the first two principal components
pca = PCA(n_components=2)
pc_coords = pca.fit_transform(x)
principalDf = pd.DataFrame(pc_coords, columns=["pc1", "pc2"])
In [13]:
dfl.head()
Out[13]:
Klf4 Nanog Oct4 Sox2
0 8.916103 9.437635 10.052080 9.927838
1 5.983936 9.529521 9.692705 8.467373
2 8.022241 8.045588 8.843904 7.237778
3 7.494430 7.461065 8.847504 7.596894
4 7.434258 7.009409 8.748146 8.050385
In [14]:
pca.components_
Out[14]:
array([[ 0.39153526,  0.49672002,  0.54812079,  0.54729604],
       [ 0.90301651, -0.07103785, -0.21777698, -0.36343912]])
In [15]:
principalDf.plot("pc1", "pc2", kind='scatter', alpha=0.1);
In [ ]:
sns.jointplot(x, y, kind="hex", color="#4CB391")
In [27]:
principalDf.plot("pc1", "pc2", gridsize=100, kind='hexbin');
In [29]:
sns.jointplot(principalDf.pc1, principalDf.pc2, kind="hex", color="#4CB391")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[29]:
<seaborn.axisgrid.JointGrid at 0x7f6b5419bcf8>
In [16]:
import seaborn as sns
In [36]:
dfls = dfl.sample(frac=0.1)
In [37]:
g = sns.PairGrid(dfls)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels=6);
2018-08-22 14:47:20,749 [WARNING] No handles with labels found to put in legend.
2018-08-22 14:47:20,759 [WARNING] No handles with labels found to put in legend.
2018-08-22 14:47:20,768 [WARNING] No handles with labels found to put in legend.
2018-08-22 14:47:20,778 [WARNING] No handles with labels found to put in legend.
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/contour.py:960: UserWarning: The following kwargs were not used by contour: 'label', 'color'
  s)
In [18]:
sns.pairplot(dfl,
             diag_kind = 'kde', 
             markers='o',
             plot_kws = {'alpha': 0.01, 'edgecolor': None})
Out[18]:
<seaborn.axisgrid.PairGrid at 0x7f6b800c3d30>
In [ ]:
def plot_emb(emb, legend=False, alpha=0.3, **kwargs):
    """Scatter-plot the first two columns of a 2D embedding.

    Args:
        emb: array of shape (n, >=2); columns 0 and 1 are plotted.
        legend: unused; kept for interface compatibility.
        alpha: point transparency (was hard-coded to 0.3; now a parameter,
            matching the later redefinition of this function in the notebook).
        **kwargs: forwarded to `plt.scatter`.
    """
    # NOTE(review): the original docstring claimed held-out points are
    # highlighted, but only one scatter call is made — docstring corrected.
    plt.scatter(emb[:, 0], emb[:, 1], alpha=alpha, s=60, label="Training", **kwargs)
In [ ]:
plot_emb(tsnec.embedding_)
In [38]:
import umap
In [39]:
um = umap.UMAP(n_neighbors=5,
               min_dist=0.3)
In [ ]:
plot_emb(mds.embedding_)
In [ ]:
# NOTE(review): original line was a syntax error (`from sklearn. import Pip`);
# presumably an abandoned autocomplete for Pipeline — confirm or delete this cell.
from sklearn.pipeline import Pipeline
In [8]:
d.ls()
Out[8]:
[('/grads/Klf4/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Klf4/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Klf4/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Klf4/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/inputs', <HDF5 dataset "inputs": shape (98428, 1000, 4), type "<f4">),
 ('/metadata/interval_from_task',
  <HDF5 dataset "interval_from_task": shape (98428,), type "|O">),
 ('/metadata/range/chr', <HDF5 dataset "chr": shape (98428,), type "|O">),
 ('/metadata/range/end', <HDF5 dataset "end": shape (98428,), type "<i8">),
 ('/metadata/range/id', <HDF5 dataset "id": shape (98428,), type "<i8">),
 ('/metadata/range/start', <HDF5 dataset "start": shape (98428,), type "<i8">),
 ('/metadata/range/strand',
  <HDF5 dataset "strand": shape (98428,), type "|O">),
 ('/targets/profile/Klf4',
  <HDF5 dataset "Klf4": shape (98428, 1000, 2), type "<f4">),
 ('/targets/profile/Nanog',
  <HDF5 dataset "Nanog": shape (98428, 1000, 2), type "<f4">),
 ('/targets/profile/Oct4',
  <HDF5 dataset "Oct4": shape (98428, 1000, 2), type "<f4">),
 ('/targets/profile/Sox2',
  <HDF5 dataset "Sox2": shape (98428, 1000, 2), type "<f4">)]

Cluster activations

In [8]:
ls {model_dir}
cometml.json   grad.all.h5     grad.valid.h5  Intervene_results/  results.html
dataspec.yaml  grad.test.2.h5  history.csv    model.h5            results.ipynb
figures/       grad.test.h5    hparams.yaml   modisco/
In [9]:
from keras.models import load_model, Model
In [10]:
from basepair.config import create_tf_session
In [11]:
create_tf_session(0)
Out[11]:
<tensorflow.python.client.session.Session at 0x7fef912164e0>
In [12]:
m = load_model(model_dir / 'model.h5')
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-23 15:13:10,854 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-23 15:13:17,264 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [13]:
m.summary()
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
seq (InputLayer)                (None, 1000, 4)      0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1000, 64)     6464        seq[0][0]                        
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 1000, 64)     12352       conv1d_1[0][0]                   
__________________________________________________________________________________________________
add_1 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 1000, 64)     12352       add_1[0][0]                      
__________________________________________________________________________________________________
add_2 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 1000, 64)     12352       add_2[0][0]                      
__________________________________________________________________________________________________
add_3 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 1000, 64)     12352       add_3[0][0]                      
__________________________________________________________________________________________________
add_4 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 1000, 64)     12352       add_4[0][0]                      
__________________________________________________________________________________________________
add_5 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 1000, 64)     12352       add_5[0][0]                      
__________________________________________________________________________________________________
add_6 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 1000, 64)     12352       add_6[0][0]                      
__________________________________________________________________________________________________
add_7 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
                                                                 conv1d_8[0][0]                   
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 1000, 64)     12352       add_7[0][0]                      
__________________________________________________________________________________________________
add_8 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
                                                                 conv1d_8[0][0]                   
                                                                 conv1d_9[0][0]                   
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 1000, 64)     12352       add_8[0][0]                      
__________________________________________________________________________________________________
add_9 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
                                                                 conv1d_8[0][0]                   
                                                                 conv1d_9[0][0]                   
                                                                 conv1d_10[0][0]                  
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1000, 1, 64)  0           add_9[0][0]                      
__________________________________________________________________________________________________
conv2d_transpose_1 (Conv2DTrans (None, 1000, 1, 8)   12808       reshape_1[0][0]                  
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 1000, 8)      0           conv2d_transpose_1[0][0]         
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 64)           0           add_9[0][0]                      
__________________________________________________________________________________________________
profile/Oct4 (Lambda)           (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
profile/Sox2 (Lambda)           (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
profile/Nanog (Lambda)          (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
profile/Klf4 (Lambda)           (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
counts/Oct4 (Dense)             (None, 2)            130         global_average_pooling1d_1[0][0] 
__________________________________________________________________________________________________
counts/Sox2 (Dense)             (None, 2)            130         global_average_pooling1d_1[0][0] 
__________________________________________________________________________________________________
counts/Nanog (Dense)            (None, 2)            130         global_average_pooling1d_1[0][0] 
__________________________________________________________________________________________________
counts/Klf4 (Dense)             (None, 2)            130         global_average_pooling1d_1[0][0] 
==================================================================================================
Total params: 130,960
Trainable params: 130,960
Non-trainable params: 0
__________________________________________________________________________________________________
In [14]:
seq = d.f['/inputs'][:]
In [16]:
import keras.layers as kl
In [18]:
# Sub-model: sequence input -> global-average-pooled hidden activations (64-d,
# per the model summary above)
m_avg = Model(m.input, m.get_layer("global_average_pooling1d_1").output)

# Per-position hidden activations from the last residual addition (add_9),
# shape (None, 1000, 64) per the model summary
hid_act = m.get_layer("add_9").output
In [19]:
gmax_pool = kl.GlobalMaxPooling1D()(hid_act)
In [20]:
max_avg_vec = kl.concatenate([gmax_pool, m.get_layer("global_average_pooling1d_1").output])
In [21]:
m_avg_max = Model(m.input, max_avg_vec)
In [22]:
act = m_avg_max.predict(seq, verbose=1)
98428/98428 [==============================] - 26s 261us/step
In [80]:
# append also the total counts
act_w_counts = np.concatenate([act, dfl.values], axis=1)
In [81]:
act_norm = StandardScaler().fit_transform(act_w_counts)
In [82]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(act_norm)
principalDf = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2', 'pc3', 'pc4'])
In [26]:
# 10% subsample of region indices for the expensive embedding steps.
# The original used replace=True, which yields duplicate rows in a subsample
# and distorts density in the downstream embeddings — sample without
# replacement, and fix the seed so the subsample is reproducible.
idx = pd.Series(np.arange(len(act))).sample(frac=0.1, replace=False, random_state=42)
In [27]:
act_norm_small = act_norm[idx]
In [28]:
len(act_norm_small)
Out[28]:
9843
In [ ]:
act_norm
In [48]:
def plot_emb(emb, legend=False, alpha=0.3, **kwargs):
    """Scatter-plot the first two columns of a 2D embedding.

    Args:
        emb: array of shape (n, >=2); columns 0 and 1 are plotted.
        legend: unused; kept for interface compatibility.
        alpha: point transparency.
        **kwargs: forwarded to `plt.scatter`.
    """
    # NOTE(review): earlier docstring claimed held-out points are highlighted,
    # but only a single scatter call is made — corrected.
    plt.scatter(emb[:,0], emb[:,1], alpha=alpha, s=60, label="Training", **kwargs)
In [30]:
import umap
In [31]:
um = umap.UMAP(n_neighbors=5,
               min_dist=0.3)
In [32]:
um.fit(act_norm_small)
Out[32]:
UMAP(a=0.9921756197688717, alpha=1.0, angular_rp_forest=False,
   b=1.1122533842193434, bandwidth=1.0, gamma=1.0, init='spectral',
   local_connectivity=1.0, metric='euclidean', metric_kwds={},
   min_dist=0.3, n_components=2, n_epochs=None, n_neighbors=5,
   negative_sample_rate=5, random_state=None, set_op_mix_ratio=1.0,
   spread=1.0, verbose=False)
In [38]:
sns.jointplot(um.embedding_[:,0], um.embedding_[:,1], kind="hex", color="#4CB391")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[38]:
<seaborn.axisgrid.JointGrid at 0x7fec1c0b29e8>
In [51]:
from sklearn.cluster import AgglomerativeClustering
In [86]:
cl = AgglomerativeClustering(10)
In [87]:
cl.fit(act_norm)
Out[87]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=10,
            pooling_func=<function mean at 0x7fefc4310158>)
In [89]:
pd.Series(cl.labels_).value_counts().plot(kind='bar')
Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fec1c2a7b00>
In [ ]:
# Save the clusters to different masked files
In [91]:
outdir = model_dir / "clustering/agglomerative.model,counts/10/"
In [93]:
outdir.mkdir(parents=True, exist_ok=True)
In [94]:
from basepair.utils import write_pkl
In [98]:
# Write one boolean mask per cluster (shape = n_examples) so downstream runs
# (e.g. modisco) can subset the regions by cluster id
for i in range(10):
    mask = cl.labels_ == i
    np.save(outdir / f"{i}.npy", mask, )

# persist the fitted clustering object itself for provenance / reuse
write_pkl(cl, outdir / "clustering.pkl")
In [101]:
ls {outdir}
0.npy  2.npy  4.npy  6.npy  8.npy  clustering.pkl
1.npy  3.npy  5.npy  7.npy  9.npy
In [100]:
mask.sum()
Out[100]:
1703
In [55]:
from sklearn import manifold
In [56]:
x = manifold.SpectralEmbedding(n_components=2).fit_transform(act_norm_small)
In [57]:
def plot_clustering(X_red, X, labels, title=None):
    """Scatter-plot a 2D reduction colored by cluster label.

    Args:
        X_red: (n, 2) reduced coordinates; min-max normalized per axis before plotting.
        X: original feature matrix (unused; kept for interface compatibility).
        labels: integer cluster labels; divided by 10 for the colormap, so it
            assumes at most 10 clusters (matches n_clusters=10 above).
        title: optional figure title (was accepted but ignored in the original).
    """
    # Normalize each axis to [0, 1] so the plot is scale-free
    x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
    X_red = (X_red - x_min) / (x_max - x_min)

    plt.figure(figsize=(6, 4))
    # BUG FIX: plt.scatter does not accept a `fontdict` kwarg (that belongs to
    # plt.text, from which this code was likely adapted) — it raised TypeError.
    plt.scatter(X_red[:, 0], X_red[:, 1],
                color=plt.cm.nipy_spectral(labels / 10.))
    if title is not None:
        plt.title(title)
In [60]:
cl.labels_
Out[60]:
array([5, 6, 1, ..., 4, 1, 0])
In [62]:
plt.cm.nipy_spectral(cl.labels_ / 10)
Out[62]:
array([[0.        , 0.73853137, 0.        , 1.        ],
       [0.        , 1.        , 0.        , 1.        ],
       [0.53068824, 0.        , 0.59738431, 1.        ],
       ...,
       [0.        , 0.6667    , 0.5333    , 1.        ],
       [0.53068824, 0.        , 0.59738431, 1.        ],
       [0.        , 0.        , 0.        , 1.        ]])
In [ ]:
plt.scatter(x[:,0], x[:,1], color=plt.cm.nipy_spectral(cl.labels_ / 10), alpha=0.01)
In [67]:
cm = plt.get_cmap("tab10")
In [68]:
cm(np.array([10]))
Out[68]:
array([[0.09019608, 0.74509804, 0.81176471, 1.        ]])
In [72]:
plt.scatter(um.embedding_[:,0], um.embedding_[:,1], color=cm(cl.labels_), alpha=0.05)
Out[72]:
<matplotlib.collections.PathCollection at 0x7fec101d8e80>
In [74]:
pd.Series(cl.labels_).value_counts().plot(kind='bar')
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fec14320b00>
In [71]:
plt.scatter(x[:,0], x[:,1], color=cm(cl.labels_), alpha=0.05)
Out[71]:
<matplotlib.collections.PathCollection at 0x7fec0ff33198>
In [50]:
plot_emb(um.embedding_, alpha=0.01)
In [42]:
principalDf.plot("pc1", "pc2", kind='scatter', alpha=0.1);
In [84]:
sns.jointplot(principalDf.pc1, principalDf.pc2, kind="hex", color="#4CB391")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[84]:
<seaborn.axisgrid.JointGrid at 0x7fec13ff12b0>