Goal

  • subcluster the importance scores
  • write the per-instance vectors of the importance scores to file
  • use them when running modisco (e.g. to run it on a subset of regions)
In [1]:
import seaborn as sns
from basepair.config import get_data_dir
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from basepair.modisco import ModiscoResult, append_pattern_loc
from basepair.cli.schemas import DataSpec
from basepair.utils import read_pkl
In [2]:
ddir = Path(get_data_dir())
model_dir = ddir / "processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/"
modisco_dir = model_dir / "modisco/valid"
In [3]:
ds = DataSpec.load(model_dir / "dataspec.yaml")
In [4]:
from kipoi.readers import HDF5Reader
In [5]:
d = HDF5Reader(model_dir / "grad.all.h5")
d.open()
In [6]:
tasks = list(ds.task_specs)
In [77]:
# Total observed read counts per example and task: each /targets/profile/{task}
# dataset is (example, position, strand), so summing twice over the last axis
# collapses strand then position, leaving one count per example
df = pd.DataFrame({task: d.f[f"/targets/profile/{task}"][:].sum(axis=-1).sum(axis=-1) for task in tasks})
In [78]:
df.head()
Out[78]:
Klf4 Nanog Oct4 Sox2
0 7450.0 12551.0 23203.0 20492.0
1 396.0 13759.0 16198.0 4756.0
2 3047.0 3119.0 6931.0 1390.0
3 1797.0 1738.0 6956.0 1991.0
4 1692.0 1106.0 6298.0 3134.0
In [79]:
dfl = np.log(1+df)
In [7]:
from sklearn.preprocessing import StandardScaler
In [11]:
x = StandardScaler().fit_transform(dfl)
In [12]:
from sklearn.decomposition import PCA
# Project the standardized log-counts onto the first two principal components
pca = PCA(n_components=2)
pc_coords = pca.fit_transform(x)
principalDf = pd.DataFrame(pc_coords, columns=["pc1", "pc2"])
In [13]:
dfl.head()
Out[13]:
Klf4 Nanog Oct4 Sox2
0 8.916103 9.437635 10.052080 9.927838
1 5.983936 9.529521 9.692705 8.467373
2 8.022241 8.045588 8.843904 7.237778
3 7.494430 7.461065 8.847504 7.596894
4 7.434258 7.009409 8.748146 8.050385
In [14]:
pca.components_
Out[14]:
array([[ 0.39153526,  0.49672002,  0.54812079,  0.54729604],
       [ 0.90301651, -0.07103785, -0.21777698, -0.36343912]])
In [15]:
principalDf.plot("pc1", "pc2", kind='scatter', alpha=0.1);
In [ ]:
sns.jointplot(x, y, kind="hex", color="#4CB391")
In [27]:
principalDf.plot("pc1", "pc2", gridsize=100, kind='hexbin');
In [29]:
sns.jointplot(principalDf.pc1, principalDf.pc2, kind="hex", color="#4CB391")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[29]:
<seaborn.axisgrid.JointGrid at 0x7f6b5419bcf8>
In [16]:
import seaborn as sns
In [36]:
dfls = dfl.sample(frac=0.1)
In [37]:
g = sns.PairGrid(dfls)
g.map_diag(sns.kdeplot)
g.map_offdiag(sns.kdeplot, n_levels=6);
2018-08-22 14:47:20,749 [WARNING] No handles with labels found to put in legend.
2018-08-22 14:47:20,759 [WARNING] No handles with labels found to put in legend.
2018-08-22 14:47:20,768 [WARNING] No handles with labels found to put in legend.
2018-08-22 14:47:20,778 [WARNING] No handles with labels found to put in legend.
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/contour.py:960: UserWarning: The following kwargs were not used by contour: 'label', 'color'
  s)
In [18]:
sns.pairplot(dfl,
             diag_kind = 'kde', 
             markers='o',
             plot_kws = {'alpha': 0.01, 'edgecolor': None})
Out[18]:
<seaborn.axisgrid.PairGrid at 0x7f6b800c3d30>
In [ ]:
def plot_emb(emb, legend=False, alpha=0.3, **kwargs):
    """Scatter-plot the first two columns of a 2D embedding.

    Args:
        emb: array of shape (n, >=2); columns 0 and 1 are plotted.
        legend: unused; kept for interface compatibility.
        alpha: point transparency (was hard-coded to 0.3; now a parameter,
            matching the later redefinition of this function in the notebook).
        **kwargs: forwarded to `plt.scatter`.
    """
    # NOTE(review): the original docstring claimed held-out points are
    # highlighted, but only one scatter call is made — docstring corrected.
    plt.scatter(emb[:, 0], emb[:, 1], alpha=alpha, s=60, label="Training", **kwargs)
In [ ]:
plot_emb(tsnec.embedding_)
In [38]:
import umap
In [39]:
um = umap.UMAP(n_neighbors=5,
               min_dist=0.3)
In [ ]:
plot_emb(mds.embedding_)
In [ ]:
# NOTE(review): original line was a syntax error (`from sklearn. import Pip`);
# presumably an abandoned autocomplete for Pipeline — confirm or delete this cell.
from sklearn.pipeline import Pipeline
In [8]:
d.ls()
Out[8]:
[('/grads/Klf4/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Klf4/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Klf4/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Klf4/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Nanog/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Oct4/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/count/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/count/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/weighted/0',
  <HDF5 dataset "0": shape (98428, 1000, 4), type "<f4">),
 ('/grads/Sox2/weighted/1',
  <HDF5 dataset "1": shape (98428, 1000, 4), type "<f4">),
 ('/inputs', <HDF5 dataset "inputs": shape (98428, 1000, 4), type "<f4">),
 ('/metadata/interval_from_task',
  <HDF5 dataset "interval_from_task": shape (98428,), type "|O">),
 ('/metadata/range/chr', <HDF5 dataset "chr": shape (98428,), type "|O">),
 ('/metadata/range/end', <HDF5 dataset "end": shape (98428,), type "<i8">),
 ('/metadata/range/id', <HDF5 dataset "id": shape (98428,), type "<i8">),
 ('/metadata/range/start', <HDF5 dataset "start": shape (98428,), type "<i8">),
 ('/metadata/range/strand',
  <HDF5 dataset "strand": shape (98428,), type "|O">),
 ('/targets/profile/Klf4',
  <HDF5 dataset "Klf4": shape (98428, 1000, 2), type "<f4">),
 ('/targets/profile/Nanog',
  <HDF5 dataset "Nanog": shape (98428, 1000, 2), type "<f4">),
 ('/targets/profile/Oct4',
  <HDF5 dataset "Oct4": shape (98428, 1000, 2), type "<f4">),
 ('/targets/profile/Sox2',
  <HDF5 dataset "Sox2": shape (98428, 1000, 2), type "<f4">)]

Cluster activations

In [8]:
ls {model_dir}
cometml.json   grad.all.h5     grad.valid.h5  Intervene_results/  results.html
dataspec.yaml  grad.test.2.h5  history.csv    model.h5            results.ipynb
figures/       grad.test.h5    hparams.yaml   modisco/
In [9]:
from keras.models import load_model, Model
In [10]:
from basepair.config import create_tf_session
In [11]:
create_tf_session(0)
Out[11]:
<tensorflow.python.client.session.Session at 0x7fef912164e0>
In [12]:
m = load_model(model_dir / 'model.h5')
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-23 15:13:10,854 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-23 15:13:17,264 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [13]:
m.summary()
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
==================================================================================================
seq (InputLayer)                (None, 1000, 4)      0                                            
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 1000, 64)     6464        seq[0][0]                        
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 1000, 64)     12352       conv1d_1[0][0]                   
__________________________________________________________________________________________________
add_1 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
__________________________________________________________________________________________________
conv1d_3 (Conv1D)               (None, 1000, 64)     12352       add_1[0][0]                      
__________________________________________________________________________________________________
add_2 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 1000, 64)     12352       add_2[0][0]                      
__________________________________________________________________________________________________
add_3 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 1000, 64)     12352       add_3[0][0]                      
__________________________________________________________________________________________________
add_4 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 1000, 64)     12352       add_4[0][0]                      
__________________________________________________________________________________________________
add_5 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 1000, 64)     12352       add_5[0][0]                      
__________________________________________________________________________________________________
add_6 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 1000, 64)     12352       add_6[0][0]                      
__________________________________________________________________________________________________
add_7 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
                                                                 conv1d_8[0][0]                   
__________________________________________________________________________________________________
conv1d_9 (Conv1D)               (None, 1000, 64)     12352       add_7[0][0]                      
__________________________________________________________________________________________________
add_8 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
                                                                 conv1d_8[0][0]                   
                                                                 conv1d_9[0][0]                   
__________________________________________________________________________________________________
conv1d_10 (Conv1D)              (None, 1000, 64)     12352       add_8[0][0]                      
__________________________________________________________________________________________________
add_9 (Add)                     (None, 1000, 64)     0           conv1d_1[0][0]                   
                                                                 conv1d_2[0][0]                   
                                                                 conv1d_3[0][0]                   
                                                                 conv1d_4[0][0]                   
                                                                 conv1d_5[0][0]                   
                                                                 conv1d_6[0][0]                   
                                                                 conv1d_7[0][0]                   
                                                                 conv1d_8[0][0]                   
                                                                 conv1d_9[0][0]                   
                                                                 conv1d_10[0][0]                  
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 1000, 1, 64)  0           add_9[0][0]                      
__________________________________________________________________________________________________
conv2d_transpose_1 (Conv2DTrans (None, 1000, 1, 8)   12808       reshape_1[0][0]                  
__________________________________________________________________________________________________
reshape_2 (Reshape)             (None, 1000, 8)      0           conv2d_transpose_1[0][0]         
__________________________________________________________________________________________________
global_average_pooling1d_1 (Glo (None, 64)           0           add_9[0][0]                      
__________________________________________________________________________________________________
profile/Oct4 (Lambda)           (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
profile/Sox2 (Lambda)           (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
profile/Nanog (Lambda)          (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
profile/Klf4 (Lambda)           (None, 1000, 2)      0           reshape_2[0][0]                  
__________________________________________________________________________________________________
counts/Oct4 (Dense)             (None, 2)            130         global_average_pooling1d_1[0][0] 
__________________________________________________________________________________________________
counts/Sox2 (Dense)             (None, 2)            130         global_average_pooling1d_1[0][0] 
__________________________________________________________________________________________________
counts/Nanog (Dense)            (None, 2)            130         global_average_pooling1d_1[0][0] 
__________________________________________________________________________________________________
counts/Klf4 (Dense)             (None, 2)            130         global_average_pooling1d_1[0][0] 
==================================================================================================
Total params: 130,960
Trainable params: 130,960
Non-trainable params: 0
__________________________________________________________________________________________________
In [14]:
seq = d.f['/inputs'][:]
In [16]:
import keras.layers as kl
In [18]:
# Sub-model: sequence input -> global-average-pooled hidden activations (64-d,
# per the model summary above)
m_avg = Model(m.input, m.get_layer("global_average_pooling1d_1").output)

# Per-position hidden activations from the last residual addition (add_9),
# shape (None, 1000, 64) per the model summary
hid_act = m.get_layer("add_9").output
In [19]:
gmax_pool = kl.GlobalMaxPooling1D()(hid_act)
In [20]:
max_avg_vec = kl.concatenate([gmax_pool, m.get_layer("global_average_pooling1d_1").output])
In [21]:
m_avg_max = Model(m.input, max_avg_vec)
In [22]:
act = m_avg_max.predict(seq, verbose=1)
98428/98428 [==============================] - 26s 261us/step
In [80]:
# append also the total counts
act_w_counts = np.concatenate([act, dfl.values], axis=1)
In [81]:
act_norm = StandardScaler().fit_transform(act_w_counts)
In [82]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
principalComponents = pca.fit_transform(act_norm)
principalDf = pd.DataFrame(data = principalComponents, columns = ['pc1', 'pc2', 'pc3', 'pc4'])
In [26]:
# 10% subsample of region indices for the expensive embedding steps.
# The original used replace=True, which yields duplicate rows in a subsample
# and distorts density in the downstream embeddings — sample without
# replacement, and fix the seed so the subsample is reproducible.
idx = pd.Series(np.arange(len(act))).sample(frac=0.1, replace=False, random_state=42)
In [27]:
act_norm_small = act_norm[idx]
In [28]:
len(act_norm_small)
Out[28]:
9843
In [ ]:
act_norm
In [48]:
def plot_emb(emb, legend=False, alpha=0.3, **kwargs):
    """Scatter-plot the first two columns of a 2D embedding.

    Args:
        emb: array of shape (n, >=2); columns 0 and 1 are plotted.
        legend: unused; kept for interface compatibility.
        alpha: point transparency.
        **kwargs: forwarded to `plt.scatter`.
    """
    # NOTE(review): earlier docstring claimed held-out points are highlighted,
    # but only a single scatter call is made — corrected.
    plt.scatter(emb[:,0], emb[:,1], alpha=alpha, s=60, label="Training", **kwargs)
In [30]:
import umap
In [31]:
um = umap.UMAP(n_neighbors=5,
               min_dist=0.3)
In [32]:
um.fit(act_norm_small)
Out[32]:
UMAP(a=0.9921756197688717, alpha=1.0, angular_rp_forest=False,
   b=1.1122533842193434, bandwidth=1.0, gamma=1.0, init='spectral',
   local_connectivity=1.0, metric='euclidean', metric_kwds={},
   min_dist=0.3, n_components=2, n_epochs=None, n_neighbors=5,
   negative_sample_rate=5, random_state=None, set_op_mix_ratio=1.0,
   spread=1.0, verbose=False)
In [38]:
sns.jointplot(um.embedding_[:,0], um.embedding_[:,1], kind="hex", color="#4CB391")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[38]:
<seaborn.axisgrid.JointGrid at 0x7fec1c0b29e8>
In [51]:
from sklearn.cluster import AgglomerativeClustering
In [86]:
cl = AgglomerativeClustering(10)
In [87]:
cl.fit(act_norm)
Out[87]:
AgglomerativeClustering(affinity='euclidean', compute_full_tree='auto',
            connectivity=None, linkage='ward', memory=None, n_clusters=10,
            pooling_func=<function mean at 0x7fefc4310158>)
In [89]:
pd.Series(cl.labels_).value_counts().plot(kind='bar')
Out[89]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fec1c2a7b00>
In [ ]:
# Save the clusters to different masked files
In [91]:
outdir = model_dir / "clustering/agglomerative.model,counts/10/"
In [93]:
outdir.mkdir(parents=True, exist_ok=True)
In [94]:
from basepair.utils import write_pkl
In [98]:
# Write one boolean mask per cluster (shape = n_examples) so downstream runs
# (e.g. modisco) can subset the regions by cluster id
for i in range(10):
    mask = cl.labels_ == i
    np.save(outdir / f"{i}.npy", mask, )

# persist the fitted clustering object itself for provenance / reuse
write_pkl(cl, outdir / "clustering.pkl")
In [101]:
ls {outdir}
0.npy  2.npy  4.npy  6.npy  8.npy  clustering.pkl
1.npy  3.npy  5.npy  7.npy  9.npy
In [100]:
mask.sum()
Out[100]:
1703
In [55]:
from sklearn import manifold
In [56]:
x = manifold.SpectralEmbedding(n_components=2).fit_transform(act_norm_small)
In [57]:
def plot_clustering(X_red, X, labels, title=None):
    """Scatter-plot a 2D reduction colored by cluster label.

    Args:
        X_red: (n, 2) reduced coordinates; min-max normalized per axis before plotting.
        X: original feature matrix (unused; kept for interface compatibility).
        labels: integer cluster labels; divided by 10 for the colormap, so it
            assumes at most 10 clusters (matches n_clusters=10 above).
        title: optional figure title (was accepted but ignored in the original).
    """
    # Normalize each axis to [0, 1] so the plot is scale-free
    x_min, x_max = np.min(X_red, axis=0), np.max(X_red, axis=0)
    X_red = (X_red - x_min) / (x_max - x_min)

    plt.figure(figsize=(6, 4))
    # BUG FIX: plt.scatter does not accept a `fontdict` kwarg (that belongs to
    # plt.text, from which this code was likely adapted) — it raised TypeError.
    plt.scatter(X_red[:, 0], X_red[:, 1],
                color=plt.cm.nipy_spectral(labels / 10.))
    if title is not None:
        plt.title(title)
In [60]:
cl.labels_
Out[60]:
array([5, 6, 1, ..., 4, 1, 0])
In [62]:
plt.cm.nipy_spectral(cl.labels_ / 10)
Out[62]:
array([[0.        , 0.73853137, 0.        , 1.        ],
       [0.        , 1.        , 0.        , 1.        ],
       [0.53068824, 0.        , 0.59738431, 1.        ],
       ...,
       [0.        , 0.6667    , 0.5333    , 1.        ],
       [0.53068824, 0.        , 0.59738431, 1.        ],
       [0.        , 0.        , 0.        , 1.        ]])
In [ ]:
plt.scatter(x[:,0], x[:,1], color=plt.cm.nipy_spectral(cl.labels_ / 10), alpha=0.01)
In [67]:
cm = plt.get_cmap("tab10")
In [68]:
cm(np.array([10]))
Out[68]:
array([[0.09019608, 0.74509804, 0.81176471, 1.        ]])
In [72]:
plt.scatter(um.embedding_[:,0], um.embedding_[:,1], color=cm(cl.labels_), alpha=0.05)
Out[72]:
<matplotlib.collections.PathCollection at 0x7fec101d8e80>
In [74]:
pd.Series(cl.labels_).value_counts().plot(kind='bar')
Out[74]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fec14320b00>
In [71]:
plt.scatter(x[:,0], x[:,1], color=cm(cl.labels_), alpha=0.05)
Out[71]:
<matplotlib.collections.PathCollection at 0x7fec0ff33198>
In [50]:
plot_emb(um.embedding_, alpha=0.01)
In [42]:
principalDf.plot("pc1", "pc2", kind='scatter', alpha=0.1);
In [84]:
sns.jointplot(principalDf.pc1, principalDf.pc2, kind="hex", color="#4CB391")
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/matplotlib/axes/_axes.py:6462: UserWarning: The 'normed' kwarg is deprecated, and has been replaced by the 'density' kwarg.
  warnings.warn("The 'normed' kwarg is deprecated, and has been "
Out[84]:
<seaborn.axisgrid.JointGrid at 0x7fec13ff12b0>