import pandas as pd
import numpy as np
from pybedtools import BedTool
from basepair.config import get_data_dir, create_tf_session
from tqdm import tqdm
from concise.preprocessing import encodeDNA
from basepair.datasets import get_sox2_data
from basepair.plots import plot_loss
import pyBigWig
from basepair.math import softmax
import matplotlib.pyplot as plt
from basepair import samplers
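# `softmax` from basepair.math is used further down to turn predicted profile logits of shape
# (n_examples, seq_len, 2 strands) into per-strand probability distributions over positions.
# A minimal NumPy sketch of the assumed behaviour (the name `softmax_sketch` and the axis choice
# are illustrative assumptions, not the basepair implementation):
def softmax_sketch(x, axis=1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))  # subtract the max for numerical stability
    return e / e.sum(axis=axis, keepdims=True)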
create_tf_session(1)
ddir = get_data_dir()
#rm -rf {ddir}/processed/chipnexus/chipseq_pipeline/
#!ln -s /srv/scratch/shared/surya/avsec/chip-nexus/data-stowers/semi_processed {ddir}/processed/chipnexus/chipseq_pipeline
from basepair.datasets import *
from basepair.datasets import sox2_oct4_peaks_sox2
# Get the dataset
df_sox2_oct4 = get_chipnexus_data(
    bigwigs={"sox2_pos": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.pos_strand.bw",
             "sox2_neg": f"{Sox2_BW_DIR}/sox2_pooled_reps_1b_2b_4b.neg_strand.bw",
             "oct4_pos": f"{Oct4_BW_DIR}/Oct4_1234.pos.bw",
             "oct4_neg": f"{Oct4_BW_DIR}/Oct4_1234.neg.bw",
             })
train, valid, test = sox2_oct4_peaks_sox2()
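# Each split is a tuple: [0] one-hot encoded sequences, [1] a dict of ChIP-nexus count profiles
# per TF, each of shape (n_examples, seq_len, 2 strands), and [2] a DataFrame of the underlying
# genomic intervals (structure inferred from how the splits are indexed below).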
train[0][:2]
train[1]['oct4'][:2,:2]
train[1]['sox2'][:2,:2]
idx_list = samplers.random(train[1]['sox2'], 10)
for i, idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10, 1))
        plt.plot(train[1][protein][idx, :, 0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx, :, 1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")
        plt.legend()
idx_list = samplers.top_max_count(train[1]['sox2'], 20, 10)
for i, idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10, 1))
        plt.plot(train[1][protein][idx, :, 0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx, :, 1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")
        plt.legend()
idx_list = samplers.top_max_count(train[1]['oct4'], 10, 0)
for i, idx in enumerate(idx_list):
    for protein in ['sox2', 'oct4']:
        plt.figure(figsize=(10, 1))
        plt.plot(train[1][protein][idx, :, 0], label='pos,p={}'.format(protein))
        plt.plot(train[1][protein][idx, :, 1], label='neg,p={}'.format(protein))
        interval_id = train[2].seq_id.iloc[idx]
        plt.title(f"{protein}, {interval_id}")
        plt.legend()
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_mutlitask
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
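# The profile loss `twochannel_multinomial_nll` imported above lives in basepair.losses and is not
# reproduced here. As a hedged sketch only (an assumption about its general shape, not the actual
# implementation), a multinomial negative log-likelihood over both strand channels could be written
# with the Keras backend as follows; the constant multinomial-coefficient term is dropped since it
# does not depend on the predictions:
def multinomial_nll_sketch(y_true, y_pred):
    # y_true: observed counts, y_pred: logits; both of shape (batch, seq_len, 2 strands)
    logp = y_pred - K.logsumexp(y_pred, axis=1, keepdims=True)  # log-softmax over positions
    per_strand = K.sum(y_true * logp, axis=1)                   # sum over positions
    return -K.sum(per_strand, axis=-1)                          # sum over the two strands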
def get_model(mfn, mkwargs):
    """Build the model and return (model, run_name, checkpoint_path)."""
    import datetime
    mdir = f"{ddir}/processed/chipnexus/exp/models/multi-task"
    name = mfn + "_" + \
        ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
        "." + str(datetime.datetime.now()).replace(" ", "::")
    !mkdir -p {mdir}
    ckp_file = f"{mdir}/{name}.h5"
    return eval(mfn)(**mkwargs), name, ckp_file
#model = seq_mutlitask()
# hyper-parameters
mfn = "seq_mutlitask"
mkwargs = dict(filters=32,
               conv1_kernel_size=21,
               tconv_kernel_size=25,
               n_dil_layers=6,
               lr=0.004)
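# The actual `seq_mutlitask` architecture is defined in basepair.models and is not shown in this
# notebook. Purely as a hedged sketch of the kind of model these hyper-parameters describe (the
# function name, the 201 bp sequence length and all layer choices below are assumptions, not the
# basepair implementation): a first conv layer, a stack of dilated residual convs, and one
# transposed-conv two-strand profile head per TF, trained with the multinomial profile loss.
def seq_multitask_sketch(seq_len=201, filters=32, conv1_kernel_size=21,
                         tconv_kernel_size=25, n_dil_layers=6, lr=0.004,
                         tasks=('sox2', 'oct4')):
    inp = kl.Input((seq_len, 4))
    x = kl.Conv1D(filters, conv1_kernel_size, padding='same', activation='relu')(inp)
    for i in range(1, n_dil_layers + 1):
        # residual dilated convolutions grow the receptive field exponentially
        x = kl.add([x, kl.Conv1D(filters, 3, padding='same', activation='relu',
                                 dilation_rate=2 ** i)(x)])
    outputs = []
    for task in tasks:
        # Conv2DTranspose trick for a 1D transposed convolution producing a 2-strand profile
        h = kl.Reshape((seq_len, 1, filters))(x)
        h = kl.Conv2DTranspose(2, (tconv_kernel_size, 1), padding='same')(h)
        outputs.append(kl.Reshape((seq_len, 2), name=task + "_profile")(h))
    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=twochannel_multinomial_nll)
    return model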
# best valid so far: 108238.6558
model, name, ckp_file = get_model(mfn, mkwargs)
history = model.fit(train[0],
                    [train[1]['sox2'], train[1]['oct4']],
                    batch_size=256,
                    epochs=100,
                    validation_data=valid[:2],
                    callbacks=[EarlyStopping(patience=5),
                               History(),
                               ModelCheckpoint(ckp_file, save_best_only=True)]
                    )
# get the best model
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll,
                                             "SpatialLifetimeSparsity": SpatialLifetimeSparsity})
plot_loss(history)
y_pred = model.predict(test[0])
plt.scatter(np.log(1+np.ravel(test[1]['sox2'])), np.log(1+np.ravel(test[1]['oct4'])));
def normalize(x):
    """Normalize count profiles so each example/strand sums to 1 across positions."""
    return x / x.sum(1, keepdims=True)
test[1]['sox2'].shape
plt.scatter(np.log(1+np.ravel(normalize(test[1]['sox2']))),
np.log(1+np.ravel(normalize(test[1]['oct4']))));
plt.scatter(np.ravel(normalize(test[1]['sox2'])),
np.ravel(normalize(test[1]['oct4'])));
plt.scatter(y_pred[0].reshape((len(y_pred[0]), -1)), y_pred[1].reshape((len(y_pred[0]), -1)));
class Seq2Sox2Oct4:

    def __init__(self, x, y, model):
        self.x = x
        self.y = y
        self.model = model
        # Make the prediction
        self.y_pred = [softmax(y) for y in model.predict(x)]

    def plot(self, n=10, kind='test', sort='random', sort_prot='sox2', figsize=(20, 2)):
        import matplotlib.pyplot as plt
        if sort == 'random':
            idx_list = pd.Series(np.arange(len(self.x))).sample(n)
        elif sort == 'top':
            max_counts_pos = pd.Series(np.max(self.y[sort_prot][:, :, 0], axis=-1))
            max_counts_neg = pd.Series(np.max(self.y[sort_prot][:, :, 1], axis=-1))
            idx_list = (max_counts_pos + max_counts_neg).sort_values(ascending=False).index[:n]
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        for i, idx in enumerate(idx_list):
            plt.figure(figsize=figsize)
            plt.subplot(141)
            if i == 0:
                plt.title("Predicted Sox2")
            plt.plot(self.y_pred[0][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y_pred[0][idx, :, 0])))
            plt.plot(self.y_pred[0][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y_pred[0][idx, :, 1])))
            plt.legend()
            plt.subplot(142)
            if i == 0:
                plt.title("Observed Sox2")
            plt.plot(self.y['sox2'][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y['sox2'][idx, :, 0])))
            plt.plot(self.y['sox2'][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y['sox2'][idx, :, 1])))
            plt.legend()
            plt.subplot(143)
            if i == 0:
                plt.title("Predicted Oct4")
            plt.plot(self.y_pred[1][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y_pred[1][idx, :, 0])))
            plt.plot(self.y_pred[1][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y_pred[1][idx, :, 1])))
            plt.legend()
            plt.subplot(144)
            if i == 0:
                plt.title("Observed Oct4")
            plt.plot(self.y['oct4'][idx, :, 0], label='pos,m={}'.format(np.argmax(self.y['oct4'][idx, :, 0])))
            plt.plot(self.y['oct4'][idx, :, 1], label='neg,m={}'.format(np.argmax(self.y['oct4'][idx, :, 1])))
            plt.legend()
p = Seq2Sox2Oct4(test[0], test[1], model)
p.plot(sort='top', sort_prot='oct4',figsize=(20,2))
p.plot(sort='top', sort_prot='sox2',figsize=(20,2))
from basepair import samplers
import pandas as pd
from basepair.math import softmax
import numpy as np
import keras.backend as K
from keras.models import Model
from concise.utils.plot import seqlogo_fig, seqlogo
import matplotlib.pyplot as plt
seqlogo
class Seq2Nexus:

    def __init__(self, x, y, df, model):
        self.x = x
        self.y = y
        self.df = df
        self.labels = df.chr + ":" + df.start.astype(str) + "-" + df.end.astype(str)
        self.model = model
        # Make the prediction
        self.y_pred = [softmax(p) for p in model.predict(x)]
        self.seq_len = self.y_pred[0].shape[1]

    def input_grad(self, x, strand='pos', task_id=0, seq_grad='max'):
        strand_id = {"pos": 0, "neg": 1}[strand]
        if seq_grad == 'max':
            sfn = K.max
        elif seq_grad == 'mean':
            sfn = K.mean
        else:
            raise ValueError(f"seq_grad={seq_grad} couldn't be interpreted")
        inp = self.model.inputs[0]
        fn = K.function([inp], K.gradients(sfn(self.model.outputs[task_id][:, :, strand_id], axis=-1), inp))
        return fn([x])[0]

    def plot(self, n=10, kind='test', sort='random',
             seq_grad='max', figsize=(20, 6)):
        import matplotlib.pyplot as plt
        if sort == 'random':
            idx_list = samplers.random(self.x, n)
        elif "_" in sort:
            kind, task = sort.split("_")
            if kind == "max":
                idx_list = samplers.top_max_count(self.y[task], n)
            elif kind == "sum":
                idx_list = samplers.top_sum_count(self.y[task], n)
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        # compute grads
        grads = [[self.input_grad(self.x[idx_list], 'pos', 0, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', 0, seq_grad) * self.x[idx_list]],
                 [self.input_grad(self.x[idx_list], 'pos', 1, seq_grad) * self.x[idx_list],
                  self.input_grad(self.x[idx_list], 'neg', 1, seq_grad) * self.x[idx_list]]]
        for i, idx in enumerate(idx_list):
            fig, axes = plt.subplots(8, 1, sharex=True, figsize=figsize)
            for tid, task in enumerate(['sox2', 'oct4']):
                axes[0 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y[task][idx, :, 0], label="pos")
                axes[0 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y[task][idx, :, 1], label="neg")
                axes[0 + 4 * tid].set_ylabel("Observed")
                axes[0 + 4 * tid].legend()
                axes[0 + 4 * tid].set_title('{} {}'.format(task, self.labels.iloc[idx]))
                axes[1 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y_pred[tid][idx, :, 0], label="pos")
                axes[1 + 4 * tid].plot(np.arange(1, self.seq_len + 1), self.y_pred[tid][idx, :, 1], label="neg")
                axes[1 + 4 * tid].set_ylabel("Predicted")
                axes[1 + 4 * tid].legend()
                # ------------------
                seqlogo(grads[tid][0][i], ax=axes[2 + 4 * tid])
                axes[2 + 4 * tid].set_ylabel("Pos. strand")
                seqlogo(grads[tid][1][i], ax=axes[3 + 4 * tid])
                axes[3 + 4 * tid].set_ylabel("Neg. strand")
                axes[3 + 4 * tid].set_xticks(list(range(0, self.seq_len, 5)))
p = Seq2Nexus(test[0], test[1], test[2], model)
p.plot(sort='max_sox2', seq_grad='max', figsize=(20,10))
p.plot(sort='max_sox2', seq_grad='max', figsize=(20,10))
p.plot(sort='max_oct4', seq_grad='max', figsize=(20,10))
p.plot(sort='max_oct4', seq_grad='mean', figsize=(20,10))
# Functions to get the gradients
out = model.outputs[0]
inp = model.inputs[0]
pos_strand_ginp_avg = K.function([inp], K.gradients(K.mean(out[:,:,0], axis=-1), inp))
neg_strand_ginp_avg = K.function([inp], K.gradients(K.mean(out[:,:,1], axis=-1), inp))
pos_strand_ginp_max = K.function([inp], K.gradients(K.max(out[:,:,0], axis=-1), inp))
neg_strand_ginp_max = K.function([inp], K.gradients(K.max(out[:,:,1], axis=-1), inp))
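# Each function above returns d(profile summary)/d(input) for a batch of one-hot sequences,
# i.e. an array of shape (batch, seq_len, 4). Below, the gradients are computed in minibatches
# and multiplied element-wise with the one-hot input ("grad x input"), which zeroes out the three
# absent bases at each position and leaves a per-base contribution score that can be drawn as a
# sequence logo.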
from basepair.data import numpy_minibatch
# Pre-compute the predictions and bottlenecks
x = valid[0]
y_true = valid[1]['sox2']
y_pred = softmax(model.predict(x)[0])
grads_pos = np.concatenate([pos_strand_ginp_max([batch])[0]
for batch in numpy_minibatch(x, 512)])
grads_neg = np.concatenate([neg_strand_ginp_max([batch])[0]
for batch in numpy_minibatch(x, 512)])
igrads_pos = grads_pos * x
igrads_neg = grads_neg * x
from scipy.spatial.distance import cosine, correlation
grads_pos_ext = grads_pos.reshape((grads_pos.shape[0], -1))
grads_neg_ext = grads_neg.reshape((grads_neg.shape[0], -1))
distances = np.array([correlation(grads_neg_ext[i], grads_pos_ext[i]) for i in range(len(grads_neg_ext))])
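# scipy's `correlation` distance is 1 - Pearson correlation, so values near 0 mean the flattened
# pos- and neg-strand gradient maps agree while values near 1 mean they are uncorrelated.
# Illustrative sanity check of that relationship on a single example:
_a, _b = grads_pos_ext[0], grads_neg_ext[0]
assert np.isclose(correlation(_a, _b), 1 - np.corrcoef(_a, _b)[0, 1])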
import numpy as np
import seaborn as sns
plt.figure(figsize=(6,6))
plt.subplot(211)
plt.hist(distances, bins=50);
plt.subplot(212)
values, base = np.histogram(distances, bins=40)
plt.plot(base[:-1], np.cumsum(values));
plt.grid()
plt.xlabel("Correlation Distance");
plt.ylabel("Fraction of data points");
# Roughly 1/3 of the points have highly correlated pos/neg-strand gradients (correlation distance < 0.5)
top10_idx = pd.Series(np.where(distances<0.5)[0]).sample(10)
top10_idx = pd.Series(distances).sort_values(ascending=True).index[:10]
distances
# Top max-count indices
top10_idx = pd.Series(y_true.max(1).sum(1)).sort_values(ascending=False).index[10:30]
# Top total-count indices
top10_idx = pd.Series(y_true.sum(1).sum(1)).sort_values(ascending=False).index[:10]
# Random indices
top10_idx = pd.Series(np.arange(len(y_true))).sample(10)
for i in top10_idx:
    fig, (ax1, ax2, ax3, ax4) = plt.subplots(4, 1, sharex=True, figsize=(20, 6))
    ax1.plot(np.arange(1, 202), y_true[i, :, 0], label="pos")
    ax1.plot(np.arange(1, 202), y_true[i, :, 1], label="neg")
    ax1.set_ylabel("Observed\ncounts")
    ax1.legend()
    ax2.plot(np.arange(1, 202), y_pred[i, :, 0], label="pos")
    ax2.plot(np.arange(1, 202), y_pred[i, :, 1], label="neg")
    ax2.set_ylabel("Predicted\n")
    ax2.legend()
    ax3.set_ylabel("Pos. strand")
    seqlogo(igrads_pos[i], ax=ax3)
    ax4.set_ylabel("Neg. strand")
    seqlogo(igrads_neg[i], ax=ax4)
    x_range = [1, 201]
    ax4.set_xticks(list(range(0, 201, 5)))
# Keep only examples where the pos- and neg-strand gradients agree well
top_distances = distances < 0.2
# Hypothetical contribution scores: combine both strands and mean-center across the 4 bases,
# as expected by TF-MoDISco
hyp_scores = grads_pos + grads_neg
hyp_scores = hyp_scores[top_distances]
hyp_scores = hyp_scores - hyp_scores.mean(-1, keepdims=True)
onehot_data = valid[0][top_distances]
# Actual contribution scores = hypothetical scores masked by the observed sequence
scores = hyp_scores * onehot_data
len(hyp_scores)
# Visualize
import modisco.visualization
from modisco.visualization import viz_sequence
viz_sequence.plot_weights(scores[0])
viz_sequence.plot_weights(hyp_scores[0])
viz_sequence.plot_weights(onehot_data[0])
from imp import reload
%env MKL_THREADING_LAYER=GNU
import theano
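# Note: the MKL_THREADING_LAYER=GNU environment variable and the theano import are here because
# this version of TF-MoDISco uses a Theano backend (see modisco.backend.theano_backend below);
# without the env var, Theano built against MKL can fail with a threading-layer conflict.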
onehot_data.shape
import h5py
import numpy as np
%matplotlib inline
import modisco
reload(modisco)
import modisco.backend
reload(modisco.backend.theano_backend)
reload(modisco.backend)
import modisco.nearest_neighbors
reload(modisco.nearest_neighbors)
import modisco.affinitymat
reload(modisco.affinitymat.core)
reload(modisco.affinitymat.transformers)
import modisco.tfmodisco_workflow.seqlets_to_patterns
reload(modisco.tfmodisco_workflow.seqlets_to_patterns)
import modisco.tfmodisco_workflow.workflow
reload(modisco.tfmodisco_workflow.workflow)
import modisco.aggregator
reload(modisco.aggregator)
import modisco.cluster
reload(modisco.cluster.core)
reload(modisco.cluster.phenograph.core)
reload(modisco.cluster.phenograph.cluster)
import modisco.core
reload(modisco.core)
import modisco.coordproducers
reload(modisco.coordproducers)
import modisco.metaclusterers
reload(modisco.metaclusterers)
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
    sliding_window_size=21,
    flank_size=10,
    histogram_bins=100,
    percentiles_in_bandwidth=10,
    overlap_portion=0.5,
    min_cluster_size=50,
    threshold_for_counting_sign=1.0,
    weak_threshold_for_counting_sign=0.7)(
        task_names=["task0"],
        contrib_scores={'task0': scores},
        hypothetical_contribs={'task0': hyp_scores},
        one_hot=onehot_data)
!mkdir -p {ddir}/processed/chipnexus/motifs/sox2/modisco
!du -sh {ddir}/processed/chipnexus/motifs/sox2/modisco
modisco_file = f"{ddir}/processed/chipnexus/motifs/oct4/modisco/multi-task.hdf5"
import h5py
import modisco.util
reload(modisco.util)
grp = h5py.File(modisco_file, "w")
tfmodisco_results.save_hdf5(grp)
grp.close()
# TODO - write pickle
from collections import Counter
from modisco.visualization import viz_sequence
reload(viz_sequence)
from matplotlib import pyplot as plt
import modisco.affinitymat.core
reload(modisco.affinitymat.core)
import modisco.cluster.phenograph.core
reload(modisco.cluster.phenograph.core)
import modisco.cluster.phenograph.cluster
reload(modisco.cluster.phenograph.cluster)
import modisco.cluster.core
reload(modisco.cluster.core)
import modisco.aggregator
reload(modisco.aggregator)
import sklearn.decomposition
import sklearn.manifold
hdf5_results = h5py.File(modisco_file, "r")
#patterns = (tfmodisco_results
# .metacluster_idx_to_submetacluster_results[0]
# .seqlets_to_patterns_result.patterns);
patterns = (list(hdf5_results
["metacluster_idx_to_submetacluster_results"]
["metacluster0"]
["seqlets_to_patterns_result"]
["patterns"]["all_pattern_names"]))
print(len(patterns))
pattern_grp = (hdf5_results
["metacluster_idx_to_submetacluster_results"]
["metacluster0"]
["seqlets_to_patterns_result"]
["patterns"])
for pattern_name in patterns:
    pattern = pattern_grp[pattern_name]
    print(pattern_name)
    print("total seqlets:", len(pattern["seqlets_and_alnmts"]["seqlets"]))
    # pattern.plot_counts(counts=aggregated_seqlet.get_per_position_seqlet_center_counts())
    background = np.array([0.27, 0.23, 0.23, 0.27])
    print("fwd:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["fwd"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["fwd"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["fwd"]),
                                                    background=background))
    print("reverse:")
    viz_sequence.plot_weights(pattern["task0_contrib_scores"]["rev"])
    viz_sequence.plot_weights(pattern["task0_hypothetical_contribs"]["rev"])
    viz_sequence.plot_weights(viz_sequence.ic_scale(np.array(pattern["sequence"]["rev"]),
                                                    background=background))
hdf5_results
sl = tfmodisco_results.multitask_seqlet_creation_results.final_seqlets[0]
cl0 = tfmodisco_results.metacluster_idx_to_submetacluster_results[0]
sl.coor.example_idx
len(pd.Series([sl.coor.example_idx for sl in cl0.seqlets]).unique())
cl0.seqlets[0]
len(cl0.seqlets)
len(cl0.seqlets)
tfmodisco_results["metacluster_idx_to_submetacluster_results"]
["metacluster0"]
["seqlets_to_patterns_result"]
["patterns"]["all_pattern_names"]))
sl.coor.start
sl.exidx_start_end_string
tfmodisco_results.metaclustering_results.metacluster_indices
sl = tfmodisco_results.multitask_seqlet_creation_results.final_seqlets[0]
tfmodisco_results.metaclustering_results.metacluster_indices.max()
len(tfmodisco_results.metaclustering_results.metacluster_indices)