Questions

  • can we discover events when Sox2 and Oct4 are both bound?
  • could we discover/distinguish those events when using only Sox2 or only Oct4?
  • does the multi-task model do better than the single task one?

Goal

  • make a multi-task model predicting Sox2 and Oct4 ChipNexus signal git issue

TODO

  • [x] setup a new dataset function based on Sox2 peaks
  • [x] setup a new model having two output branches
  • [x] train the model
  • add the training curve plot bellow training
  • [ ] modify the plotting code
    • show also the output tracks for Oct4
    • show also the sequence importance for Oct4
  • [ ] try out a few models to figure out the optimal hyper-parameters
  • [ ] figure out how to best visualize the sequence importances
    • max w.r.t. each individual track
      • the problem here is that Oct4 might have multiple peaks present in the sequence
    • average?
    • top2 peaks?
  • [ ] run modisco - how do the motifs look like?
  • [ ] plot the average sequence distribution for each motif

setup a new dataset function based on Sox2 peaks

In [3]:
import pandas as pd
import numpy as np
from pybedtools import BedTool
from basepair.config import get_data_dir, create_tf_session
from tqdm import tqdm
from concise.preprocessing import encodeDNA
from basepair.datasets import get_sox2_data
from basepair.plots import plot_loss
import pyBigWig
from basepair.math import softmax
import matplotlib.pyplot as plt

from basepair import samplers
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [4]:
create_tf_session(1)
Out[4]:
<tensorflow.python.client.session.Session at 0x7f4995e1be10>
In [5]:
ddir = get_data_dir()
In [6]:
#rm -rf {ddir}/processed/chipnexus/chipseq_pipeline/
#!ln -s /srv/scratch/shared/surya/avsec/chip-nexus/data-stowers/semi_processed {ddir}/processed/chipnexus/chipseq_pipeline
In [7]:
from basepair.datasets import *
In [8]:
from basepair.datasets import sox2_oct4_peaks_sox2
In [9]:
# Get the dataset
df_sox2_oct4 = get_chipnexus_data(
    bigwigs={"sox2_pos": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.pos_strand.bw",
             "sox2_neg": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.neg_strand.bw",
             "oct4_pos": f"{Oct4_BW_DIR}/Oct4_1234.pos.bw",
             "oct4_neg": f"{Oct4_BW_DIR}/Oct4_1234.neg.bw",
})
In [10]:
train, valid, test = sox2_oct4_peaks_sox2()
In [11]:
train[0][:2]
Out[11]:
array([[[0., 1., 0., 0.],
        [0., 1., 0., 0.],
        [1., 0., 0., 0.],
        ...,
        [0., 0., 0., 1.],
        [1., 0., 0., 0.],
        [0., 0., 0., 1.]],

       [[1., 0., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 1., 0.],
        ...,
        [0., 0., 0., 1.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.]]])
In [12]:
train[1]['oct4'][:2,:2]
Out[12]:
array([[[0., 2.],
        [0., 0.]],

       [[0., 0.],
        [0., 0.]]], dtype=float32)
In [13]:
train[1]['sox2'][:2,:2]
Out[13]:
array([[[0., 0.],
        [0., 0.]],

       [[0., 0.],
        [0., 1.]]], dtype=float32)

Plot some random examples

In [14]:
idx_list = samplers.random(train[1]['sox2'], 10)
for i,idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10,1))
        plt.plot(train[1][protein][idx,:,0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx,:,1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")

Plot top count examples based on Sox2

In [19]:
idx_list = samplers.top_max_count(train[1]['sox2'], 20, 10)
for i,idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10,1))
        plt.plot(train[1][protein][idx,:,0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx,:,1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")

Plot top count examples based on Oct4

In [20]:
idx_list = samplers.top_max_count(train[1]['oct4'], 10, 0)
for i,idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10,1))
        plt.plot(train[1][protein][idx,:,0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx,:,1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")

setup a new model having two output branches

In [14]:
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_mutlitask
In [15]:
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
In [16]:
def get_model(mfn, mkwargs):
    """Get the model"""
    import datetime
    mdir = f"{ddir}/processed/chipnexus/exp/models/multi-task"
    name = mfn + "_" + \
            ",".join([f'{k}={v}' for k,v in mkwargs.items()]) + \
            "." + str(datetime.datetime.now()).replace(" ", "::")
    !mkdir -p {mdir}
    ckp_file = f"{mdir}/{name}.h5"
    return eval(mfn)(**mkwargs), name, ckp_file

train the model

  • add the training curve plot bellow training
In [17]:
#model = seq_mutlitask()
In [18]:
# hyper-parameters
mfn = "seq_mutlitask"
mkwargs = dict(filters=32, 
               conv1_kernel_size=21,
               tconv_kernel_size=25,
               n_dil_layers=6,
               lr=0.004)
In [19]:
# best valid so far: 108238.6558
model, name, ckp_file = get_model(mfn, mkwargs)
history = model.fit(train[0], 
                    [train[1]['sox2'], train[1]['oct4']],
          batch_size=256, 
          epochs=100,
          validation_data=valid[:2],
          callbacks=[EarlyStopping(patience=5),
                     History(),
                     ModelCheckpoint(ckp_file, save_best_only=True)]
         )
# get the best model
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll, 
                                             "SpatialLifetimeSparsity": SpatialLifetimeSparsity})
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-05-19 01:37:20,100 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-05-19 01:37:25,860 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
Train on 5561 samples, validate on 1884 samples
Epoch 1/100
5561/5561 [==============================] - 12s 2ms/step - loss: 725.8702 - sox2_loss: 299.0563 - oct4_loss: 426.8138 - val_loss: 672.2314 - val_sox2_loss: 279.9911 - val_oct4_loss: 392.2403
Epoch 2/100
5561/5561 [==============================] - 1s 130us/step - loss: 680.3478 - sox2_loss: 283.3089 - oct4_loss: 397.0389 - val_loss: 662.0509 - val_sox2_loss: 275.3632 - val_oct4_loss: 386.6877
Epoch 3/100
5561/5561 [==============================] - 1s 123us/step - loss: 631.0815 - sox2_loss: 261.4637 - oct4_loss: 369.6178 - val_loss: 624.1503 - val_sox2_loss: 259.5057 - val_oct4_loss: 364.6446
Epoch 22/100
5561/5561 [==============================] - 1s 129us/step - loss: 630.3434 - sox2_loss: 261.1060 - oct4_loss: 369.2375 - val_loss: 623.7369 - val_sox2_loss: 259.1658 - val_oct4_loss: 364.5711
Epoch 23/100
5561/5561 [==============================] - 1s 138us/step - loss: 629.9745 - sox2_loss: 260.9869 - oct4_loss: 368.9876 - val_loss: 623.8516 - val_sox2_loss: 259.2231 - val_oct4_loss: 364.6285
Epoch 24/100
5561/5561 [==============================] - 1s 126us/step - loss: 629.5077 - sox2_loss: 260.7842 - oct4_loss: 368.7235 - val_loss: 623.7873 - val_sox2_loss: 259.2293 - val_oct4_loss: 364.5581
Epoch 25/100
5561/5561 [==============================] - 1s 135us/step - loss: 629.2410 - sox2_loss: 260.7460 - oct4_loss: 368.4949 - val_loss: 623.4647 - val_sox2_loss: 259.0624 - val_oct4_loss: 364.4022
Epoch 26/100
5561/5561 [==============================] - 1s 133us/step - loss: 628.6763 - sox2_loss: 260.4506 - oct4_loss: 368.2258 - val_loss: 623.2275 - val_sox2_loss: 259.0944 - val_oct4_loss: 364.1331
Epoch 27/100
5561/5561 [==============================] - 1s 145us/step - loss: 628.2974 - sox2_loss: 260.2821 - oct4_loss: 368.0153 - val_loss: 622.8090 - val_sox2_loss: 258.8739 - val_oct4_loss: 363.9352
Epoch 28/100
5561/5561 [==============================] - 1s 125us/step - loss: 627.7686 - sox2_loss: 260.1192 - oct4_loss: 367.6494 - val_loss: 622.6503 - val_sox2_loss: 258.7847 - val_oct4_loss: 363.8656
Epoch 29/100
5561/5561 [==============================] - 1s 125us/step - loss: 627.4070 - sox2_loss: 259.9290 - oct4_loss: 367.4780 - val_loss: 622.5302 - val_sox2_loss: 258.5780 - val_oct4_loss: 363.9522
Epoch 30/100
5561/5561 [==============================] - 1s 133us/step - loss: 627.0976 - sox2_loss: 259.8306 - oct4_loss: 367.2670 - val_loss: 622.3950 - val_sox2_loss: 258.8158 - val_oct4_loss: 363.5792
Epoch 31/100
5561/5561 [==============================] - 1s 127us/step - loss: 627.2904 - sox2_loss: 259.8620 - oct4_loss: 367.4284 - val_loss: 622.2841 - val_sox2_loss: 258.5842 - val_oct4_loss: 363.6999
Epoch 32/100
5561/5561 [==============================] - 1s 136us/step - loss: 626.7113 - sox2_loss: 259.6856 - oct4_loss: 367.0257 - val_loss: 621.7083 - val_sox2_loss: 258.4962 - val_oct4_loss: 363.2120
Epoch 33/100
5561/5561 [==============================] - 1s 168us/step - loss: 626.2117 - sox2_loss: 259.4664 - oct4_loss: 366.7453 - val_loss: 621.4707 - val_sox2_loss: 258.3064 - val_oct4_loss: 363.1642
Epoch 34/100
5561/5561 [==============================] - 1s 124us/step - loss: 626.0531 - sox2_loss: 259.3884 - oct4_loss: 366.6647 - val_loss: 622.1674 - val_sox2_loss: 258.5421 - val_oct4_loss: 363.6252
Epoch 35/100
5561/5561 [==============================] - 1s 130us/step - loss: 625.7034 - sox2_loss: 259.2915 - oct4_loss: 366.4118 - val_loss: 621.4809 - val_sox2_loss: 258.3735 - val_oct4_loss: 363.1074
Epoch 36/100
5561/5561 [==============================] - 1s 128us/step - loss: 625.5618 - sox2_loss: 259.2086 - oct4_loss: 366.3532 - val_loss: 621.3216 - val_sox2_loss: 258.3018 - val_oct4_loss: 363.0198
Epoch 37/100
5561/5561 [==============================] - 1s 134us/step - loss: 625.2049 - sox2_loss: 259.1065 - oct4_loss: 366.0984 - val_loss: 622.1884 - val_sox2_loss: 258.7861 - val_oct4_loss: 363.4023
Epoch 38/100
5561/5561 [==============================] - 1s 135us/step - loss: 625.3023 - sox2_loss: 259.0844 - oct4_loss: 366.2179 - val_loss: 621.2375 - val_sox2_loss: 258.2183 - val_oct4_loss: 363.0191
Epoch 39/100
5561/5561 [==============================] - 1s 138us/step - loss: 625.0223 - sox2_loss: 259.0209 - oct4_loss: 366.0014 - val_loss: 621.4369 - val_sox2_loss: 258.2670 - val_oct4_loss: 363.1699
Epoch 40/100
5561/5561 [==============================] - 1s 143us/step - loss: 624.5120 - sox2_loss: 258.8113 - oct4_loss: 365.7007 - val_loss: 620.8262 - val_sox2_loss: 258.0551 - val_oct4_loss: 362.7711
Epoch 41/100
5561/5561 [==============================] - 1s 126us/step - loss: 624.5702 - sox2_loss: 258.8232 - oct4_loss: 365.7471 - val_loss: 621.1330 - val_sox2_loss: 258.2693 - val_oct4_loss: 362.8637
Epoch 42/100
5561/5561 [==============================] - 1s 136us/step - loss: 624.6354 - sox2_loss: 258.8698 - oct4_loss: 365.7655 - val_loss: 621.0011 - val_sox2_loss: 258.1096 - val_oct4_loss: 362.8915
Epoch 43/100
5561/5561 [==============================] - 1s 126us/step - loss: 624.0940 - sox2_loss: 258.6219 - oct4_loss: 365.4721 - val_loss: 621.0460 - val_sox2_loss: 258.2217 - val_oct4_loss: 362.8243
Epoch 44/100
5561/5561 [==============================] - 1s 131us/step - loss: 624.0211 - sox2_loss: 258.6595 - oct4_loss: 365.3616 - val_loss: 621.0060 - val_sox2_loss: 258.0912 - val_oct4_loss: 362.9149
Epoch 45/100
5561/5561 [==============================] - 1s 130us/step - loss: 623.6265 - sox2_loss: 258.4887 - oct4_loss: 365.1378 - val_loss: 620.4986 - val_sox2_loss: 258.0624 - val_oct4_loss: 362.4363
Epoch 46/100
5561/5561 [==============================] - 1s 163us/step - loss: 623.6178 - sox2_loss: 258.4712 - oct4_loss: 365.1466 - val_loss: 620.5570 - val_sox2_loss: 257.9387 - val_oct4_loss: 362.6183
Epoch 47/100
5561/5561 [==============================] - 1s 146us/step - loss: 623.5925 - sox2_loss: 258.4865 - oct4_loss: 365.1061 - val_loss: 620.6887 - val_sox2_loss: 258.1253 - val_oct4_loss: 362.5634
Epoch 48/100
5561/5561 [==============================] - 1s 123us/step - loss: 623.5082 - sox2_loss: 258.4164 - oct4_loss: 365.0918 - val_loss: 620.7249 - val_sox2_loss: 258.0639 - val_oct4_loss: 362.6610
Epoch 49/100
5561/5561 [==============================] - 1s 128us/step - loss: 623.4521 - sox2_loss: 258.3986 - oct4_loss: 365.0534 - val_loss: 620.7637 - val_sox2_loss: 258.0337 - val_oct4_loss: 362.7300
Epoch 50/100
5561/5561 [==============================] - 1s 124us/step - loss: 622.9029 - sox2_loss: 258.1906 - oct4_loss: 364.7123 - val_loss: 620.4869 - val_sox2_loss: 258.0069 - val_oct4_loss: 362.4799
Epoch 51/100
5561/5561 [==============================] - 1s 125us/step - loss: 622.9663 - sox2_loss: 258.2301 - oct4_loss: 364.7361 - val_loss: 621.0509 - val_sox2_loss: 258.2052 - val_oct4_loss: 362.8457
Epoch 52/100
5561/5561 [==============================] - 1s 131us/step - loss: 622.8394 - sox2_loss: 258.1815 - oct4_loss: 364.6579 - val_loss: 620.5304 - val_sox2_loss: 257.9587 - val_oct4_loss: 362.5717
Epoch 53/100
5561/5561 [==============================] - 1s 166us/step - loss: 622.5652 - sox2_loss: 258.0479 - oct4_loss: 364.5173 - val_loss: 620.4069 - val_sox2_loss: 257.8786 - val_oct4_loss: 362.5283
Epoch 54/100
5561/5561 [==============================] - 1s 126us/step - loss: 622.3383 - sox2_loss: 257.9997 - oct4_loss: 364.3386 - val_loss: 620.0581 - val_sox2_loss: 257.8211 - val_oct4_loss: 362.2370
Epoch 55/100
5561/5561 [==============================] - 1s 130us/step - loss: 622.1358 - sox2_loss: 257.8988 - oct4_loss: 364.2370 - val_loss: 620.2261 - val_sox2_loss: 258.0444 - val_oct4_loss: 362.1817
Epoch 56/100
5561/5561 [==============================] - 1s 124us/step - loss: 622.0657 - sox2_loss: 257.8693 - oct4_loss: 364.1964 - val_loss: 620.7373 - val_sox2_loss: 258.0696 - val_oct4_loss: 362.6677
Epoch 57/100
5561/5561 [==============================] - 1s 134us/step - loss: 622.2536 - sox2_loss: 257.9297 - oct4_loss: 364.3239 - val_loss: 621.6410 - val_sox2_loss: 258.3100 - val_oct4_loss: 363.3309
Epoch 58/100
5561/5561 [==============================] - 1s 127us/step - loss: 622.1744 - sox2_loss: 257.9724 - oct4_loss: 364.2020 - val_loss: 620.0066 - val_sox2_loss: 257.8401 - val_oct4_loss: 362.1665
Epoch 59/100
5561/5561 [==============================] - 1s 126us/step - loss: 621.8661 - sox2_loss: 257.8039 - oct4_loss: 364.0622 - val_loss: 620.3550 - val_sox2_loss: 257.9574 - val_oct4_loss: 362.3976
Epoch 60/100
5561/5561 [==============================] - 1s 160us/step - loss: 621.6862 - sox2_loss: 257.7334 - oct4_loss: 363.9528 - val_loss: 620.0129 - val_sox2_loss: 257.8191 - val_oct4_loss: 362.1939
Epoch 61/100
5561/5561 [==============================] - 1s 124us/step - loss: 621.6400 - sox2_loss: 257.7349 - oct4_loss: 363.9051 - val_loss: 620.2102 - val_sox2_loss: 257.9882 - val_oct4_loss: 362.2220
Epoch 62/100
5561/5561 [==============================] - 1s 125us/step - loss: 621.6439 - sox2_loss: 257.7326 - oct4_loss: 363.9113 - val_loss: 620.4273 - val_sox2_loss: 258.0618 - val_oct4_loss: 362.3655
Epoch 63/100
5561/5561 [==============================] - 1s 120us/step - loss: 621.2951 - sox2_loss: 257.6221 - oct4_loss: 363.6730 - val_loss: 621.0728 - val_sox2_loss: 258.2039 - val_oct4_loss: 362.8689
In [ ]:
plot_loss(history)

modify the plotting code

  • show also the output tracks for Oct4
  • show also the sequence importance for Oct4
In [176]:
y_pred = model.predict(test[0])
In [177]:
plt.scatter(np.log(1+np.ravel(test[1]['sox2'])), np.log(1+np.ravel(test[1]['oct4'])));
In [178]:
def normalize(x):
    return x / x.sum(1, keepdims=True)
In [149]:
test[1]['sox2'].shape
Out[149]:
(1951, 201, 2)
In [148]:
plt.scatter(np.log(1+np.ravel(normalize(test[1]['sox2']))), 
            np.log(1+np.ravel(normalize(test[1]['oct4']))));
In [151]:
plt.scatter(np.ravel(normalize(test[1]['sox2'])), 
            np.ravel(normalize(test[1]['oct4'])));
In [142]:
plt.scatter(y_pred[0].reshape((len(y_pred[0]), -1)), y_pred[1].reshape((len(y_pred[0]), -1)));
In [179]:
class Seq2Sox2Oct4:
    
    def __init__(self, x, y, model):
        self.x = x
        self.y = y
        self.model = model
        # Make the prediction
        self.y_pred = [softmax(y) for y in model.predict(x)]
        
    def plot(self, n=10, kind='test', sort='random', sort_prot='sox2', figsize=(20,2)):
        import matplotlib.pyplot as plt
        if sort=='random':
            idx_list = pd.Series(np.arange(len(self.x))).sample(n)
        elif sort == 'top':
            max_counts_pos = pd.Series(np.max(self.y[sort_prot][:,:,0], axis=-1))
            max_counts_neg = pd.Series(np.max(self.y[sort_prot][:,:,1], axis=-1))
            idx_list = (max_counts_pos + max_counts_neg).sort_values(ascending=False).index[:n]
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        for i,idx in enumerate(idx_list):
            plt.figure(figsize=figsize)
            plt.subplot(141)
            if i==0:
                plt.title("Predicted Sox2")
            plt.plot(self.y_pred[0][idx,:,0], label='pos,m={}'.format(np.argmax(self.y_pred[0][idx,:,0])))
            plt.plot(self.y_pred[0][idx,:,1], label='neg,m={}'.format(np.argmax(self.y_pred[0][idx,:,1])))
            plt.legend();
            plt.subplot(142)
            if i==0:
                plt.title("Observed Sox2")
            plt.plot(self.y['sox2'][idx,:,0], label='pos,m={}'.format(np.argmax(self.y['sox2'][idx,:,0])))
            plt.plot(self.y['sox2'][idx,:,1], label='neg,m={}'.format(np.argmax(self.y['sox2'][idx,:,1])))
            plt.legend();
            plt.subplot(143)
            if i==0:
                plt.title("Predicted Oct4")
            plt.plot(self.y_pred[1][idx,:,0], label='pos,m={}'.format(np.argmax(self.y_pred[1][idx,:,0])))
            plt.plot(self.y_pred[1][idx,:,1], label='neg,m={}'.format(np.argmax(self.y_pred[1][idx,:,1])))
            plt.legend();
            plt.subplot(144)
            if i==0:
                plt.title("Observed Oct4")
            plt.plot(self.y['oct4'][idx,:,0], label='pos,m={}'.format(np.argmax(self.y['oct4'][idx,:,0])))
            plt.plot(self.y['oct4'][idx,:,1], label='neg,m={}'.format(np.argmax(self.y['oct4'][idx,:,1])))
            plt.legend();            
In [180]:
p = Seq2Sox2Oct4(test[0], test[1], model)
In [181]:
p.plot(sort='top', sort_prot='oct4',figsize=(20,2))
In [182]:
p.plot(sort='top', sort_prot='sox2',figsize=(20,2))

try out a few models to figure out the optimal hyper-parameters

figure out how to best visualize the sequence importances

  • max w.r.t. each individual track
    • the problem here is that Oct4 might have multiple peaks present in the sequence
  • average?
  • top2 peaks?
In [ ]:
from basepair import samplers
import pandas as pd
from basepair.math import softmax
import numpy as np
import keras.backend as K
from keras.models import Model
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt
In [ ]:
seqlogo
Out[ ]:
<function concise.utils.plot.seqlogo(letter_heights, vocab='DNA', ax=None)>
In [31]:
class Seq2Nexus:
    
    def __init__(self, x, y, df, model):
        self.x = x
        self.y = y
        self.df = df
        self.labels = df.chr + ":" + df.start.astype(str) + "-" + df.end.astype(str)
        self.model = model
        # Make the prediction
        self.y_pred = [softmax(p) for p in model.predict(x)]
        self.seq_len = self.y_pred[0].shape[1]
        
        
    def input_grad(self, x, strand='pos', task_id=0, seq_grad='max'):
        strand_id = {"pos": 0, "neg": 1}[strand]
        
        if seq_grad =='max':
            sfn = K.max
        elif seq_grad == 'mean':
            sfn = K.mean
        else:
            raise ValueError(f"seq_grad={seq_grad} couldn't be interpreted")
        inp = self.model.inputs[0]
        fn = K.function([inp], K.gradients(sfn(self.model.outputs[task_id][:,:,strand_id], axis=-1), inp))
        return fn([x])[0]
        
    
        
    def plot(self, n=10, kind='test', sort='random', 
             seq_grad='max', figsize=(20,6)):
        import matplotlib.pyplot as plt
         
        if sort=='random':
            idx_list = samplers.random(self.x, n)
        elif "_" in sort:
            kind, task = sort.split("_")
            if kind == "max":
                idx_list = samplers.top_max_count(self.y[task], n)
            elif kind == "sum":
                idx_list = samplers.top_sum_count(self.y[task], n)
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
               

        # compute grads
        grads = [[self.input_grad(self.x[idx_list], 'pos', 0, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', 0, seq_grad) * self.x[idx_list]],
                 [self.input_grad(self.x[idx_list], 'pos', 1, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', 1, seq_grad) * self.x[idx_list]]]

        for i,idx in enumerate(idx_list):
            fig, axes = plt.subplots(8, 1, sharex=True, figsize=figsize)

            for tid, task in enumerate(['sox2', 'oct4']):
                axes[0 + 4*tid].plot(np.arange(1,self.seq_len+1), self.y[task][idx,:,0], label="pos")#
                axes[0 + 4*tid].plot(np.arange(1,self.seq_len+1), self.y[task][idx,:,1], label="neg")
                axes[0 + 4*tid].set_ylabel("Observed")
                axes[0 + 4*tid].legend()
                axes[0 + 4*tid].set_title('{} {}'.format(task, self.labels.iloc[idx]))
                axes[1 + 4*tid].plot(np.arange(1,self.seq_len+1), self.y_pred[tid][idx,:,0], label="pos")#
                axes[1 + 4*tid].plot(np.arange(1,self.seq_len+1), self.y_pred[tid][idx,:,1], label="neg")
                axes[1 + 4*tid].set_ylabel("Predicted")
                axes[1 + 4*tid].legend()
                # ------------------
                seqlogo(grads[tid][0][i], ax=axes[2 + 4*tid]);
                axes[2 + 4*tid].set_ylabel("Pos. strand")
                seqlogo(grads[tid][1][i], ax=axes[3 + 4*tid]);
                axes[3 + 4*tid].set_ylabel("Neg. strand")
                x_range = [1, self.seq_len]
                axes[3 + 4*tid].set_xticks(list(range(0, self.seq_len, 5))); 
In [32]:
p = Seq2Nexus(test[0], test[1],test[2], model)
In [33]:
p.plot(sort='max_sox2', seq_grad='max', figsize=(20,10))
In [34]:
p.plot(sort='max_sox2', seq_grad='max', figsize=(20,10))
In [35]:
p.plot(sort='max_oct4', seq_grad='max', figsize=(20,10))
In [36]:
p.plot(sort='max_oct4', seq_grad='mean', figsize=(20,10))

run modisco - how do the motifs look like?

In [142]:
# Functions to get the gradients
out = model.outputs[0]
inp = model.inputs[0]
pos_strand_ginp_avg = K.function([inp], K.gradients(K.mean(out[:,:,0], axis=-1), inp))
neg_strand_ginp_avg = K.function([inp], K.gradients(K.mean(out[:,:,1], axis=-1), inp))
pos_strand_ginp_max = K.function([inp], K.gradients(K.max(out[:,:,0], axis=-1), inp))
neg_strand_ginp_max = K.function([inp], K.gradients(K.max(out[:,:,1], axis=-1), inp))

Get predictions and gradients

In [143]:
from basepair.data import numpy_minibatch
In [144]:
# Pre-compute the predictions and bottlenecks
x = valid[0]
y_true = valid[1]['sox2']
y_pred = softmax(model.predict(x)[0])
grads_pos = np.concatenate([pos_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
grads_neg = np.concatenate([neg_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
igrads_pos = grads_pos * x
igrads_neg = grads_neg * x

Correlate pos and neg-strand gradients

In [145]:
from scipy.spatial.distance import cosine, correlation
In [146]:
grads_pos_ext = grads_pos.reshape((grads_pos.shape[0], -1))
grads_neg_ext = grads_neg.reshape((grads_neg.shape[0], -1))
In [147]:
distances = np.array([correlation(grads_neg_ext[i], grads_pos_ext[i]) for i in range(len(grads_neg_ext))])
In [148]:
import numpy as np
import seaborn as sns
In [149]:
plt.figure(figsize=(6,6))
plt.subplot(211)
plt.hist(distances, bins=50);
plt.subplot(212)
values, base = np.histogram(distances, bins=40)
plt.plot(base[:-1], np.cumsum(values));
plt.grid()
plt.xlabel("Correlation Distance");
plt.ylabel("Fraction of data points");
In [150]:
# We get roughly 1/3 of the points with high-correlation
In [151]:
top10_idx = pd.Series(np.where(distances<0.5)[0]).sample(10)
In [152]:
top10_idx = pd.Series(distances).sort_values(ascending=True).index[:10]
In [153]:
distances
Out[153]:
array([0.60056475, 0.5733467 , 0.17005539, ..., 0.17135131, 0.08901739,
       0.27528453])

Plot some

In [154]:
# Top maxcount indicies
top10_idx = pd.Series(y_true.max(1).sum(1)).sort_values(ascending=False).index[10:30]
In [155]:
# Top count indicies
top10_idx = pd.Series(y_true.sum(1).sum(1)).sort_values(ascending=False).index[:10]
In [156]:
# Random indicies
top10_idx = pd.Series(np.arange(len(y_true))).sample(10)
In [157]:
for i in top10_idx:
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True, figsize=(20,6))
    
    ax1.plot(np.arange(1,202), y_true[i,:,0], label="pos")#
    ax1.plot(np.arange(1,202), y_true[i,:,1], label="neg")
    ax1.set_ylabel("Observed\ncounts")
    ax1.legend()
    ax2.plot(np.arange(1,202), y_pred[i,:,0], label="pos")#
    ax2.plot(np.arange(1,202), y_pred[i,:,1], label="neg")
    ax2.set_ylabel("Predicted\n")
    ax2.legend()
    ax3.set_ylabel("Pos. strand")
    seqlogo(igrads_pos[i], ax=ax3);
    ax4.set_ylabel("Neg. strand")
    seqlogo(igrads_neg[i], ax=ax4);
    x_range = [1, 201]
    ax4.set_xticks(list(range(0, 201, 5)));

Run modisco

In [158]:
top_distances = distances<0.2
In [159]:
hyp_scores = grads_pos + grads_neg
In [160]:
hyp_scores = hyp_scores[top_distances]
In [161]:
hyp_scores = hyp_scores - hyp_scores.mean(-1, keepdims=True)
In [162]:
onehot_data = valid[0][top_distances]
In [163]:
scores = hyp_scores * onehot_data
In [264]:
len(hyp_scores)
Out[264]:
815
In [165]:
# Visualize
import modisco.visualization
from modisco.visualization import viz_sequence

viz_sequence.plot_weights(scores[0])
viz_sequence.plot_weights(hyp_scores[0])
viz_sequence.plot_weights(onehot_data[0])
In [ ]:
from imp import reload
In [ ]:
%env MKL_THREADING_LAYER=GNU
env: MKL_THREADING_LAYER=GNU
In [ ]:
import theano
In [ ]:
onehot_data.shape
Out[ ]:
(815, 201, 4)
In [ ]:
import h5py
import numpy as np
%matplotlib inline
import modisco
reload(modisco)
import modisco.backend
reload(modisco.backend.theano_backend)
reload(modisco.backend)
import modisco.nearest_neighbors
reload(modisco.nearest_neighbors)
import modisco.affinitymat
reload(modisco.affinitymat.core)
reload(modisco.affinitymat.transformers)
import modisco.tfmodisco_workflow.seqlets_to_patterns
reload(modisco.tfmodisco_workflow.seqlets_to_patterns)
import modisco.tfmodisco_workflow.workflow
reload(modisco.tfmodisco_workflow.workflow)
import modisco.aggregator
reload(modisco.aggregator)
import modisco.cluster
reload(modisco.cluster.core)
reload(modisco.cluster.phenograph.core)
reload(modisco.cluster.phenograph.cluster)
import modisco.core
reload(modisco.core)
import modisco.coordproducers
reload(modisco.coordproducers)
import modisco.metaclusterers
reload(modisco.metaclusterers)

tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
    sliding_window_size=21, 
    flank_size=10,
    histogram_bins=100, 
    percentiles_in_bandwidth=10,
    overlap_portion=0.5,     
    min_cluster_size=50,    
    threshold_for_counting_sign=1.0,
    weak_threshold_for_counting_sign=0.7)(
        task_names=["task0"],
        contrib_scores={'task0': scores},
        hypothetical_contribs={'task0': hyp_scores},
        one_hot=onehot_data)
On task task0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Done 0
Got 4961 coords
Computing thresholds
Bandwidth calculated: 0.19077584147453308
Computed threshold 0.6647349196552568
2095 coords remaining after thresholding
After resolving overlaps, got 2095 seqlets
2 activity patterns with support >= 50 out of 3 possible patterns
Metacluster sizes:  [1845, 250]
Idx to activities:  {0: '1', 1: '-1'}
On metacluster 1
Metacluster size 250
Relevant tasks:  ('task0',)
Relevant signs:  (-1,)
(Round 1) num seqlets: 250
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.41 s
Starting affinity matrix computations
Normalization computed in 0.04 s
Cosine similarity mat computed in 0.05 s
Normalization computed in 0.03 s
Cosine similarity mat computed in 0.04 s
Finished affinity matrix computations in 0.1 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.01 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 4.38 s
Launching nearest neighbors affmat calculation job
Job completed in: 4.03 s
(Round 1) Computed affinity matrix on nearest neighbors in 8.65 s
Filtered down to 246 of 250
(Round 1) Retained 246 rows out of 250 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 246 samples in 0.000s...
[t-SNE] Computed neighbors for 246 samples in 0.002s...
[t-SNE] Computed conditional probabilities for sample 246 / 246
[t-SNE] Mean sigma: 0.302820
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.06781530380249023 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.6s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    4.8s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    5.8s finished
Louvain completed 200 runs in 9.413687467575073 seconds
Wrote graph to binary file in 0.13769841194152832 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.757372
Louvain completed 51 runs in 15.893865823745728 seconds
Preproc + Louvain took 25.584400415420532 s
Got 7 clusters after round 1
Counts:
{0: 64, 4: 32, 6: 9, 2: 42, 5: 24, 1: 42, 3: 33}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 64 seqlets
Trimmed 2 out of 64
Skipped 10 seqlets
Aggregating for cluster 1 with 42 seqlets
Trimmed 2 out of 42
Skipped 1 seqlets
Dropping cluster 1 with 39 seqlets due to sign disagreement
Aggregating for cluster 2 with 42 seqlets
Trimmed 6 out of 42
Skipped 5 seqlets
Dropping cluster 2 with 31 seqlets due to sign disagreement
Aggregating for cluster 3 with 33 seqlets
Trimmed 0 out of 33
Dropping cluster 3 with 33 seqlets due to sign disagreement
Aggregating for cluster 4 with 32 seqlets
Trimmed 1 out of 32
Skipped 15 seqlets
Aggregating for cluster 5 with 24 seqlets
Trimmed 3 out of 24
Skipped 4 seqlets
Dropping cluster 5 with 17 seqlets due to sign disagreement
Aggregating for cluster 6 with 9 seqlets
Trimmed 0 out of 9
Dropping cluster 6 with 9 seqlets due to sign disagreement
(Round 2) num seqlets: 68
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.29 s
Starting affinity matrix computations
Normalization computed in 0.02 s
Cosine similarity mat computed in 0.02 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.02 s
Finished affinity matrix computations in 0.04 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.0 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 3.32 s
Launching nearest neighbors affmat calculation job
Job completed in: 3.05 s
(Round 2) Computed affinity matrix on nearest neighbors in 6.43 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 68 samples in 0.000s...
[t-SNE] Computed neighbors for 68 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 68 / 68
[t-SNE] Mean sigma: 0.309015
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.04799318313598633 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.6s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    4.4s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    5.6s finished
Louvain completed 200 runs in 9.199044227600098 seconds
Wrote graph to binary file in 0.028064966201782227 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.693675
After 2 runs, maximum modularity is Q = 0.7026
Louvain completed 52 runs in 16.041913270950317 seconds
Preproc + Louvain took 25.38900923728943 s
Got 6 clusters after round 2
Counts:
{2: 14, 3: 12, 5: 3, 1: 16, 0: 17, 4: 6}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 17 seqlets
Trimmed 2 out of 17
Aggregating for cluster 1 with 16 seqlets
Trimmed 1 out of 16
Aggregating for cluster 2 with 14 seqlets
Trimmed 1 out of 14
Aggregating for cluster 3 with 12 seqlets
Trimmed 0 out of 12
Aggregating for cluster 4 with 6 seqlets
Trimmed 0 out of 6
Skipped 1 seqlets
Aggregating for cluster 5 with 3 seqlets
Trimmed 0 out of 3
Dropping cluster 5 with 3 seqlets due to sign disagreement
Got 5 clusters
Splitting into subclusters...
Merging on 5 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 5 patterns after merging
Performing seqlet reassignment
Got 0 patterns after reassignment
Total time taken is 84.49s
On metacluster 0
Metacluster size 1845
Relevant tasks:  ('task0',)
Relevant signs:  (1,)
(Round 1) num seqlets: 1845
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 2.7 s
Starting affinity matrix computations
Normalization computed in 0.44 s
Cosine similarity mat computed in 0.86 s
Normalization computed in 0.26 s
Cosine similarity mat computed in 0.55 s
Finished affinity matrix computations in 1.42 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.13 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 32.74 s
Launching nearest neighbors affmat calculation job
Job completed in: 30.83 s
(Round 1) Computed affinity matrix on nearest neighbors in 68.0 s
Filtered down to 1372 of 1845
(Round 1) Retained 1372 rows out of 1845 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1372 samples in 0.002s...
[t-SNE] Computed neighbors for 1372 samples in 0.025s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1372
[t-SNE] Computed conditional probabilities for sample 1372 / 1372
[t-SNE] Mean sigma: 0.208387
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.42362260818481445 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    1.0s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    6.2s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    7.3s finished
Louvain completed 200 runs in 12.995800495147705 seconds
Wrote graph to binary file in 2.924574851989746 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.798026
Louvain completed 51 runs in 17.56888437271118 seconds
Preproc + Louvain took 34.18192148208618 s
Got 13 clusters after round 1
Counts:
{1: 183, 7: 102, 8: 70, 2: 179, 11: 20, 0: 183, 4: 132, 5: 120, 3: 157, 6: 113, 12: 17, 10: 31, 9: 65}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 183 seqlets
Trimmed 27 out of 183
Skipped 7 seqlets
Aggregating for cluster 1 with 183 seqlets
Trimmed 27 out of 183
Skipped 9 seqlets
Aggregating for cluster 2 with 179 seqlets
Trimmed 18 out of 179
Skipped 11 seqlets
Aggregating for cluster 3 with 157 seqlets
Trimmed 17 out of 157
Skipped 6 seqlets
Aggregating for cluster 4 with 132 seqlets
Trimmed 13 out of 132
Skipped 6 seqlets
Aggregating for cluster 5 with 120 seqlets
Trimmed 6 out of 120
Skipped 9 seqlets
Aggregating for cluster 6 with 113 seqlets
Trimmed 3 out of 113
Skipped 9 seqlets
Aggregating for cluster 7 with 102 seqlets
Trimmed 9 out of 102
Skipped 16 seqlets
Aggregating for cluster 8 with 70 seqlets
Trimmed 8 out of 70
Skipped 8 seqlets
Aggregating for cluster 9 with 65 seqlets
Trimmed 2 out of 65
Skipped 5 seqlets
Skipped 1 seqlets
Aggregating for cluster 10 with 31 seqlets
Trimmed 0 out of 31
Skipped 6 seqlets
Skipped 3 seqlets
Aggregating for cluster 11 with 20 seqlets
Trimmed 3 out of 20
Aggregating for cluster 12 with 17 seqlets
Trimmed 0 out of 17
Skipped 1 seqlets
(Round 2) num seqlets: 1140
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 2.04 s
Starting affinity matrix computations
Normalization computed in 0.18 s
Cosine similarity mat computed in 0.34 s
Normalization computed in 0.14 s
Cosine similarity mat computed in 0.31 s
Finished affinity matrix computations in 0.66 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.09 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 33.6 s
Launching nearest neighbors affmat calculation job
Job completed in: 36.94 s
(Round 2) Computed affinity matrix on nearest neighbors in 73.6 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1140 samples in 0.001s...
[t-SNE] Computed neighbors for 1140 samples in 0.018s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1140
[t-SNE] Computed conditional probabilities for sample 1140 / 1140
[t-SNE] Mean sigma: 0.220554
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.5245933532714844 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.7s
[Parallel(n_jobs=20)]: Done 160 tasks      | elapsed:    5.3s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    6.3s finished
Louvain completed 200 runs in 11.072757482528687 seconds
Wrote graph to binary file in 1.684753179550171 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.769149
After 6 runs, maximum modularity is Q = 0.77022
After 14 runs, maximum modularity is Q = 0.770221
After 35 runs, maximum modularity is Q = 0.779157
After 64 runs, maximum modularity is Q = 0.780064
Louvain completed 114 runs in 38.262585163116455 seconds
Preproc + Louvain took 51.77077341079712 s
Got 11 clusters after round 2
Counts:
{10: 10, 0: 179, 3: 142, 7: 81, 8: 81, 2: 158, 1: 164, 4: 102, 6: 101, 5: 101, 9: 21}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 179 seqlets
Trimmed 58 out of 179
Aggregating for cluster 1 with 164 seqlets
Trimmed 31 out of 164
Skipped 10 seqlets
Aggregating for cluster 2 with 158 seqlets
Trimmed 53 out of 158
Skipped 2 seqlets
Aggregating for cluster 3 with 142 seqlets
Trimmed 39 out of 142
Aggregating for cluster 4 with 102 seqlets
Trimmed 35 out of 102
Skipped 2 seqlets
Aggregating for cluster 5 with 101 seqlets
Trimmed 14 out of 101
Aggregating for cluster 6 with 101 seqlets
Trimmed 40 out of 101
Aggregating for cluster 7 with 81 seqlets
Trimmed 16 out of 81
Skipped 1 seqlets
Aggregating for cluster 8 with 81 seqlets
Trimmed 17 out of 81
Aggregating for cluster 9 with 21 seqlets
Trimmed 0 out of 21
Aggregating for cluster 10 with 10 seqlets
Trimmed 1 out of 10
Got 11 clusters
Splitting into subclusters...
Inspecting for spurious merging
Wrote graph to binary file in 0.18747472763061523 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00757454
After 5 runs, maximum modularity is Q = 0.00757455
Louvain completed 25 runs in 8.784554243087769 seconds
Similarity is 0.9602218944951644; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.18912816047668457 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0101647
Louvain completed 21 runs in 7.034699440002441 seconds
Similarity is 0.9408701335593003; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.09767293930053711 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00627039
Louvain completed 21 runs in 7.228049993515015 seconds
Similarity is 0.9498236144811059; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.20365619659423828 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00448221
After 6 runs, maximum modularity is Q = 0.00448222
Louvain completed 26 runs in 8.496411323547363 seconds
Similarity is 0.9780427617198454; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.061792850494384766 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00671558
Louvain completed 21 runs in 6.85033106803894 seconds
Similarity is 0.9757660006395674; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.0802605152130127 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00417568
Louvain completed 21 runs in 7.045934438705444 seconds
Similarity is 0.9666629199455431; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.0472254753112793 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00549235
After 2 runs, maximum modularity is Q = 0.00665252
After 10 runs, maximum modularity is Q = 0.00669635
After 20 runs, maximum modularity is Q = 0.00669636
Louvain completed 40 runs in 13.441201210021973 seconds
Similarity is 0.9093554289505494; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.0958869457244873 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00689957
Louvain completed 21 runs in 6.992563962936401 seconds
Similarity is 0.9315040214004802; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.04470705986022949 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00782816
After 2 runs, maximum modularity is Q = 0.00817473
Louvain completed 22 runs in 7.253430128097534 seconds
Similarity is 0.9296525266182365; is_dissimilar is False
Merging on 11 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 0 & 3 with prob 0.0015476031924065176 and sim 0.992335172499657
Collapsing 4 & 5 with prob 7.993504902795112e-05 and sim 0.9870230305497074
Collapsing 1 & 2 with prob 0.0004491173097572592 and sim 0.9425762499263686
Collapsing 6 & 7 with prob 8.357784565493138e-05 and sim 0.938686725454416
Collapsing 5 & 7 with prob 7.792272782438631e-06 and sim 0.9289524634947983
Aborting collapse as 4 & 6 have prob 1.4700832641248293e-08 and sim 0.783949450184587
Aborting collapse as 5 & 6 have prob 5.728444664425489e-09 and sim 0.8042519758423075
Collapsing 0 & 5 with prob 0.00427965614941718 and sim 0.9273874328986416
Collapsing 0 & 4 with prob 2.976902516154714e-05 and sim 0.9201765875661344
Collapsing 3 & 4 with prob 0.0003796042329714934 and sim 0.9152091374567471
Collapsing 4 & 7 with prob 1.5830938901309619e-06 and sim 0.912290210186368
Aborting collapse as 0 & 6 have prob 3.1433066232299556e-08 and sim 0.8632005194401059
Aborting collapse as 3 & 6 have prob 4.990762773241374e-08 and sim 0.8547623129778655
Aborting collapse as 4 & 6 have prob 1.4700832641248293e-08 and sim 0.783949450184587
Aborting collapse as 5 & 6 have prob 5.728444664425489e-09 and sim 0.8042519758423075
Collapsing 3 & 5 with prob 0.00012368551843723187 and sim 0.9117536229455603
Collapsing 2 & 7 with prob 1.1225259054878125e-05 and sim 0.8975989380137881
Aborting collapse as 1 & 6 have prob 5.425293967860285e-08 and sim 0.8000805889732219
Collapsing 6 & 8 with prob 1.482969842368142e-05 and sim 0.8714845525740772
Aborting collapse as 7 & 8 have prob 3.2139912113254365e-06 and sim 0.797631262545294
Trimmed 0 out of 214
Trimmed 0 out of 152
Trimmed 0 out of 220
Trimmed 0 out of 121
Trimmed 1 out of 366
On merging iteration 2
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 0 & 2 with prob 5.2261817111251335e-05 and sim 0.9059711886692435
Collapsing 1 & 3 with prob 1.5245023222337902e-05 and sim 0.888420849155576
Trimmed 0 out of 486
Trimmed 1 out of 277
On merging iteration 3
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 0 & 2 with prob 7.842405646951197e-06 and sim 0.9868279489371942
Trimmed 0 out of 505
On merging iteration 4
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 3 patterns after merging
Performing seqlet reassignment
Cross contin jaccard time taken: 3.24 s
Cross contin jaccard time taken: 3.26 s
Discarded 3 seqlets
Got 2 patterns after reassignment
Total time taken is 392.39s
In [175]:
mkdir -p {ddir}/processed/chipnexus/motifs/sox2/modisco
In [176]:
!du -sh {ddir}/processed/chipnexus/motifs/sox2/modisco
20M	/users/avsec/workspace/basepair/basepair/../data/processed/chipnexus/motifs/sox2/modisco
In [287]:
modisco_file = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.hdf5"
In [180]:
import h5py
import modisco.util
reload(modisco.util)
grp = h5py.File(modisco_file)
tfmodisco_results.save_hdf5(grp)
# TODO - write pickle
In [289]:
from collections import Counter
from modisco.visualization import viz_sequence
reload(viz_sequence)
from matplotlib import pyplot as plt

import modisco.affinitymat.core
reload(modisco.affinitymat.core)
import modisco.cluster.phenograph.core
reload(modisco.cluster.phenograph.core)
import modisco.cluster.phenograph.cluster
reload(modisco.cluster.phenograph.cluster)
import modisco.cluster.core
reload(modisco.cluster.core)
import modisco.aggregator
reload(modisco.aggregator)

import sklearn.decomposition
import sklearn.manifold

hdf5_results = h5py.File(modisco_file)

#patterns = (tfmodisco_results
#            .metacluster_idx_to_submetacluster_results[0]
#            .seqlets_to_patterns_result.patterns);
patterns = (list(hdf5_results
                 ["metacluster_idx_to_submetacluster_results"]
                 ["metacluster0"]
                 ["seqlets_to_patterns_result"]
                 ["patterns"]["all_pattern_names"]))
print(len(patterns))
pattern_grp = (hdf5_results
                 ["metacluster_idx_to_submetacluster_results"]
                 ["metacluster0"]
                 ["seqlets_to_patterns_result"]
                 ["patterns"])

for pattern_name in patterns:
    pattern = pattern_grp[pattern_name]
    print(pattern_name)
    print("total seqlets:",len(pattern["seqlets_and_alnmts"]["seqlets"]))
    #pattern.plot_counts(counts=aggregated_seqlet.get_per_position_seqlet_center_counts())
    background = np.array([0.27, 0.23, 0.23, 0.27])
    print("fwd:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["fwd"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["fwd"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
                                                    background=background))

    print("reverse:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["rev"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["rev"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
                                                    background=background))
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-289-62bab1b52324> in <module>()
     24 #            .seqlets_to_patterns_result.patterns);
     25 patterns = (list(hdf5_results
---> 26                  ["metacluster_idx_to_submetacluster_results"]
     27                  ["metacluster0"]
     28                  ["seqlets_to_patterns_result"]

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/_hl/group.py in __getitem__(self, name)
    165                 raise ValueError("Invalid HDF5 object reference")
    166         else:
--> 167             oid = h5o.open(self.id, self._e(name), lapl=self._lapl)
    168 
    169         otype = h5i.get_type(oid)

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/_objects.pyx in h5py._objects.with_phil.wrapper()

h5py/h5o.pyx in h5py.h5o.open()

KeyError: "Unable to open object (object 'metacluster_idx_to_submetacluster_results' doesn't exist)"
In [285]:
hdf5_results
Out[285]:
[]
In [270]:
sl = tfmodisco_results.multitask_seqlet_creation_results.final_seqlets[0]
In [278]:
cl0 = hdf5_results.metacluster_idx_to_submetacluster_results[0]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-278-f54e30fb67a0> in <module>()
----> 1 cl0 = hdf5_results.metacluster_idx_to_submetacluster_results[0]

AttributeError: 'File' object has no attribute 'metacluster_idx_to_submetacluster_results'
In [276]:
sl.coor.example_idx
Out[276]:
2
In [277]:
len(pd.Series([sl.coor.example_idx for sl in cl0.seqlets]).unique())
Out[277]:
811
In [255]:
cl0.seqlets[0]
Out[255]:
<modisco.core.Seqlet at 0x7f47291decc0>
In [254]:
len(cl0.seqlets)
Out[254]:
1845
In [252]:
len(cl0.seqlets)
Out[252]:
250
In [ ]:
tfmodisco_results["metacluster_idx_to_submetacluster_results"]
                 ["metacluster0"]
                 ["seqlets_to_patterns_result"]
                 ["patterns"]["all_pattern_names"]))
In [193]:
sl.coor.start
Out[193]:
91
In [197]:
sl.exidx_start_end_string
Out[197]:
'2_91_132'
In [222]:
tfmodisco_results.metaclustering_results.metacluster_indices
Out[222]:
array([0, 0, 1, ..., 0, 1, 1])
In [231]:
sl = tfmodisco_results.multitask_seqlet_creation_results.final_seqlets[0]
In [236]:
sl.coor.
Out[236]:
False
In [217]:
tfmodisco_results.metaclustering_results.metacluster_indices.max()
Out[217]:
1
In [219]:
len(tfmodisco_results.metaclustering_results.metacluster_indices)
Out[219]:
2095
In [198]:
sl.
Out[198]:
OrderedDict([('task0_contrib_scores',
              <modisco.core.Snippet at 0x7f47291de1d0>),
             ('task0_hypothetical_contribs',
              <modisco.core.Snippet at 0x7f47291de2e8>),
             ('sequence', <modisco.core.Snippet at 0x7f47291dea90>)])

plot the average sequence distribution for each motif