import basepair
import modisco
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from basepair.preproc import AppendTotalCounts
from basepair.config import get_data_dir, create_tf_session
# Use gpus 1, 3, 5, 7
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 3, 5, 7"
# Project data root; the bare `ddir` on the next line is a notebook display
# expression (this file is an exported notebook).
ddir = get_data_dir()
ddir
# Base directory holding the per-TF bigwig tracks and IDR peak files
bdir = "/srv/scratch/amr1/chipseq/sox2-oct4-chipseq/"
# Data specification: one TaskSpec per TF (Sox2, Oct4), each pointing at
# strand-specific count bigwigs and an IDR-filtered 12-column peak BED file,
# plus the mm10 reference fasta.
ds = DataSpec(task_specs={"Sox2": TaskSpec(task="Sox2",
                                           pos_counts=f"{bdir}/Sox2/pos.bw",
                                           neg_counts=f"{bdir}/Sox2/neg.bw",
                                           peaks=f"{bdir}/Sox2/Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz",
                                           ),
                          "Oct4": TaskSpec(task="Oct4",
                                           pos_counts=f"{bdir}/Oct4/pos2.bw",
                                           neg_counts=f"{bdir}/Oct4/neg2.bw",
                                           peaks=f"{bdir}/Oct4/Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz",
                                           )
                          },
              fasta_file="/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta"
              )
def ds2bws(ds):
    """Map each task in a DataSpec to its pos/neg strand bigwig paths."""
    bigwigs = {}
    for task_name, spec in ds.task_specs.items():
        bigwigs[task_name] = {"pos": spec.pos_counts, "neg": spec.neg_counts}
    return bigwigs
# Get the training data
# chip_exo_nexus returns train/valid/test splits; index [0] holds the inputs
# and [1] the target dict (exact structure not visible here -- see
# basepair.datasets).
train, valid, test = chip_exo_nexus(ds, peak_width=1000)
# Pre-process the data
# Fit the total-count augmentation on the training targets only, then apply
# the same fitted transform to all three splits (avoids leakage).
preproc = AppendTotalCounts()
preproc.fit(train[1])
train[1] = preproc.transform(train[1])
valid[1] = preproc.transform(valid[1])
test[1] = preproc.transform(test[1])
# Notebook display: inspect the available target keys
train[1].keys()
# Task names used to build the per-task model outputs below
tasks=['Sox2', 'Oct4']
# TODO - play around with this
def seq_multitask_chipseq(filters=21,
                          conv1_kernel_size=21,
                          tconv_kernel_size=25,
                          #tconv_kernel_size2=25,
                          n_dil_layers=6,
                          lr=0.004,
                          c_task_weight=100,
                          use_profile=True,
                          use_counts=True,
                          tasks=tasks,
                          seq_len=201):
    """Build a multi-task ChIP-seq model: dilated conv tower + de-conv head.

    Args:
        filters: number of filters in the first and each dilated conv layer.
        conv1_kernel_size: kernel width of the first convolution.
        tconv_kernel_size: kernel width of the transposed convolution.
        n_dil_layers: number of dilated conv layers (dilation rate 2**i).
        lr: Adam learning rate.
        c_task_weight: how to upweight the count-prediction task.
        use_profile: if True, add a 2-strand profile output per task.
        use_counts: if True, add a 2-value count output per task (Dense on
            globally average-pooled conv features).
        tasks: list of task names. NOTE(review): the default binds the
            module-level `tasks` list at definition time.
        seq_len: input sequence length (one-hot over 4 bases).

    Returns:
        A compiled keras Model mapping 'seq' (seq_len, 4) to the selected
        profile and/or count outputs.
    """
    # TODO - build the reverse complement symmetry into the model
    inp = kl.Input(shape=(seq_len, 4), name='seq')
    first_conv = kl.Conv1D(filters,
                           kernel_size=conv1_kernel_size,
                           padding='same',
                           activation='relu')(inp)
    # Dilated tower with dense skip connections: each layer consumes the sum
    # of all previous layer outputs, with exponentially growing dilation.
    prev_layers = [first_conv]
    for i in range(1, n_dil_layers + 1):
        if i == 1:
            prev_sum = first_conv
        else:
            prev_sum = kl.add(prev_layers)
        conv_output = kl.Conv1D(filters, kernel_size=3, padding='same', activation='relu', dilation_rate=2**i)(prev_sum)
        prev_layers.append(conv_output)
    combined_conv = kl.add(prev_layers)
    # De-conv: produce 2 channels (strands) per task. Reshape to 4D because
    # only Conv2DTranspose is available here.
    x = kl.Reshape((-1, 1, filters))(combined_conv)
    x = kl.Conv2DTranspose(2*len(tasks), kernel_size=(tconv_kernel_size, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(len(tasks), kernel_size=(tconv_kernel_size2, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(int(len(tasks)/2), kernel_size=(tconv_kernel_size3, 1), padding='same')(x)
    out = kl.Reshape((-1, 2 * len(tasks)))(x)
    # setup the output branches
    outputs = []
    losses = []
    loss_weights = []
    if use_profile:
        # Slice channels (2*i, 2*i+1) for task i. `arguments={"i": i}` binds
        # i per layer, avoiding the late-binding closure pitfall.
        output = [kl.Lambda(lambda x, i: x[:, :, (2 * i):(2 * i + 2)],
                            output_shape=(seq_len, 2),
                            name="profile/" + task,
                            arguments={"i": i})(out)
                  for i, task in enumerate(tasks)]
        outputs += output
        losses += [twochannel_multinomial_nll] * len(tasks)
        loss_weights += [1] * len(tasks)
    if use_counts:
        pooled = kl.GlobalAvgPool1D()(combined_conv)
        counts = [kl.Dense(2, name="counts/" + task)(pooled)
                  for task in tasks]
        outputs += counts
        losses += ["mse"] * len(tasks)
        loss_weights += [c_task_weight] * len(tasks)
    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=losses, loss_weights=loss_weights)
    return model
# NOTE(review): these imports appear after seq_multitask_chipseq above; in
# notebook execution order they still run before the function is called.
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
# Global run counter for checkpoint naming (incremented before get_model)
i=1
def get_model(mfn, mkwargs, fkwargs, i):
    """Instantiate a model-builder by name and derive a run name/checkpoint path.

    Args:
        mfn: name of a model-building function defined at module level
            (e.g. "seq_multitask_chipseq").
        mkwargs: model hyper-parameters; encoded into the run name.
        fkwargs: fixed kwargs merged into the builder call but kept out of
            the name.
        i: run index appended to the name to disambiguate repeated runs.

    Returns:
        Tuple (model, name, ckp_file) where ckp_file is the .h5 checkpoint
        path inside the models directory (created if missing).
    """
    mdir = f"{ddir}/processed/chipseq/exp/models/count+profile"
    name = mfn + "_" + \
        ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
        "." + str(i)
    # Plain Python replaces the notebook-only `!mkdir -p {mdir}` shell escape.
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    all_kwargs = {**mkwargs, **fkwargs}
    # Look the builder up in module globals instead of eval(): identical for
    # plain function names, without arbitrary-code execution.
    model_fn = globals()[mfn]
    # (Removed: dead `import datetime` and a no-op `i += 1` that only rebound
    # the local parameter and was invisible to callers.)
    return model_fn(**all_kwargs), name, ckp_file
# hyper-parameters
mfn = "seq_multitask_chipseq"  # model-builder name, resolved inside get_model
use_profile = True
use_counts = True
mkwargs = dict(filters=32,
               conv1_kernel_size=21,
               tconv_kernel_size=100,
               n_dil_layers=6,
               use_profile=use_profile,
               use_counts=use_counts,
               c_task_weight=10,
               seq_len=train[0].shape[1],  # sequence length taken from the data
               lr=0.004)
# kwargs passed to the builder but excluded from the run name
fixed_kwargs = dict(
    tasks=list(ds.task_specs)
)
import numpy as np
np.random.seed(20)  # reproducible initialization
i += 1  # bump the global run counter so checkpoint names don't collide
model, name, ckp_file = get_model(mfn, mkwargs, fixed_kwargs, i)
# Train with early stopping; ModelCheckpoint keeps only the best weights
# (lowest validation loss) at ckp_file.
history = model.fit(train[0],
                    train[1],
                    batch_size=256,
                    epochs=100,
                    validation_data=valid[:2],
                    callbacks=[EarlyStopping(patience=5),
                               History(),
                               ModelCheckpoint(ckp_file, save_best_only=True)]
                    )
# Reload the best checkpoint; the custom loss must be passed explicitly so
# keras can deserialize the compiled model.
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
from basepair.eval import evaluate
# Notebook display: evaluation metrics on the validation split
evaluate(model, valid[0], valid[1])
model_dir = "/srv/scratch/amr1/chipseq/basepair/basepair/../data/processed/chipseq/exp/models/count+profile/"
import os
import theano
import numpy as np
import modisco
import modisco.tfmodisco_workflow.workflow
import os
from basepair.cli.schemas import DataSpec, HParams
from basepair.modisco import ModiscoResult
from scipy.spatial.distance import correlation
from concise.utils.helper import write_json, read_json
from basepair.data import numpy_minibatch
import h5py
import numpy as np
import keras.backend as K
from basepair.config import create_tf_session
from keras.models import load_model
from basepair.cli.evaluate import load_data
import matplotlib.pyplot as plt
plt.switch_backend('agg')  # headless plotting backend
# Interpretation settings: input gradients of the Sox2 profile head,
# summarized with the softmax-weighted statistic; outputs go to output_dir.
data = dict(train=train, valid=valid, test=test)
x, y, m = data['valid']
grad_wrt='profile/Sox2'
summary='weighted'
output_dir="outputs_profile_sox2"
hp = HParams.load("/srv/scratch/amr1/chipseq/basepair-workflow/hparams.yaml")
os.makedirs(output_dir, exist_ok=True)
# Record the interpretation arguments alongside the outputs for provenance.
write_json(dict(model_dir=os.path.abspath(model_dir),
                grad_wrt=grad_wrt,
                output_dir=output_dir,
                summary=summary,
                split='valid'),
           os.path.join(output_dir, "kwargs.json"))
# grad_wrt is "<head>/<task>", e.g. "profile/Sox2"
dtype, task = grad_wrt.split("/")
out = model.outputs[ds.task2idx(task, dtype)]
inp = model.inputs[0]
# Build one keras function per strand that computes
# d(summary-of-output)/d(input).
if "counts" in grad_wrt or summary == "count":
    # Count head output is (batch, 2): gradient of each strand's scalar.
    pos_strand_ginp_max = K.function([inp], K.gradients(out[:, 0], inp))
    neg_strand_ginp_max = K.function([inp], K.gradients(out[:, 1], inp))
else:
    if summary == 'weighted':
        print("Using weighted stat")
        # Softmax-weighted sum over positions; stop_gradient freezes the
        # weights so only the profile values are differentiated.
        pos_strand_ginp_max = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(out[:, :, 0])) *
                                                                  out[:, :, 0], axis=-1), inp))
        neg_strand_ginp_max = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(out[:, :, 1])) *
                                                                  out[:, :, 1], axis=-1), inp))
    elif summary == "max":
        print("Using max stat")
        # Gradient of the per-example positional maximum
        pos_strand_ginp_max = K.function([inp],
                                         K.gradients(K.max(out[:, :, 0], axis=1), inp))
        neg_strand_ginp_max = K.function([inp],
                                         K.gradients(K.max(out[:, :, 1], axis=1), inp))
    else:
        print("Using avg stat")
        # Gradient of the per-example positional mean
        pos_strand_ginp_max = K.function([inp],
                                         K.gradients(K.mean(out[:, :, 0], axis=1), inp))
        neg_strand_ginp_max = K.function([inp],
                                         K.gradients(K.mean(out[:, :, 1], axis=1), inp))
# Pre-compute the predictions and bottlenecks
y_true = y[grad_wrt]
# Input gradients per strand, computed in minibatches to bound memory
grads_pos = np.concatenate([pos_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
grads_neg = np.concatenate([neg_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
# grad*input attributions (NOTE(review): igrads_* are not used below)
igrads_pos = grads_pos * x
igrads_neg = grads_neg * x
grads_pos_ext = grads_pos.reshape((grads_pos.shape[0], -1))
grads_neg_ext = grads_neg.reshape((grads_neg.shape[0], -1))
# compute the distances
# Per-example correlation distance between the two strands' gradient maps;
# only examples where the strands agree (small distance) are kept.
distances = np.array([correlation(grads_neg_ext[i], grads_pos_ext[i])
                      for i in range(len(grads_neg_ext))])
# Setup different scores
top_distances = distances < hp.modisco.max_strand_distance  # filter sites
# Hypothetical contributions: strand-summed gradients, mean-centered over
# the base axis, restricted to the filtered sites.
hyp_scores = grads_pos + grads_neg
hyp_scores = hyp_scores[top_distances]
hyp_scores = hyp_scores - hyp_scores.mean(-1, keepdims=True)
onehot_data = x[top_distances]
# Actual contributions: hypothetical scores masked by the observed sequence
scores = hyp_scores * onehot_data
# -------------------------------------------------------------
# run modisco
kw = hp.modisco.get_modisco_kwargs()
# These two keys configure this script's filtering, not TF-MoDISco itself.
del kw['max_strand_distance']
del kw['threshold_for_counting_sign']
# Run the TF-MoDISco workflow on the filtered contribution scores.
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(
    **kw,
    seqlets_to_patterns_factory=modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
        trim_to_window_size=15,
        initial_flank_to_add=5,
        kmer_len=5, num_gaps=1,
        num_mismatches=0,
        final_min_cluster_size=60))(
            task_names=[grad_wrt],
            contrib_scores={grad_wrt: scores},
            hypothetical_contribs={grad_wrt: hyp_scores},
            one_hot=onehot_data)
# -------------------------------------------------------------
# save the results
output_path = os.path.join(output_dir, "modisco.h5")
if os.path.exists(output_path):
    print("Output path exists")
    output_path += "backup.h5"
# Open with an explicit mode: h5py >= 3 defaults to read-only, which would
# break save_hdf5 ("a" matches the old default). Close the handle so the
# file is flushed to disk even if save_hdf5 raises.
grp = h5py.File(output_path, "a")
try:
    tfmodisco_results.save_hdf5(grp)
finally:
    grp.close()
# dump the top distances to file
top_distances.dump("{0}/included_samples.npy".format(output_dir))
distances.dump("{0}/distances.npy".format(output_dir))
def ic_scale(x):
    """Scale a base-frequency array by information content.

    Uses the mm10 genome background frequencies (A, C, G, T).
    """
    from modisco.visualization import viz_sequence
    bg_freqs = np.array([0.27, 0.23, 0.23, 0.27])
    return viz_sequence.ic_scale(x, background=bg_freqs)
from basepair.modisco import ModiscoResult
# Load the modisco results and the boolean mask of examples that passed the
# strand-concordance filter.
mr = ModiscoResult(output_dir + "/modisco.h5")
incl = np.load(output_dir + "/included_samples.npy")
tasks = list(ds.task_specs)
# Plotting configuration
legend = False
rc_vec = None                    # optional per-pattern reverse-complement flags
rc_fn = lambda x: x[::-1, ::-1]  # reverse-complement for (position, channel) arrays
start_vec = None                 # optional per-pattern trim starts
n_bootstrap = None
n_limit = 35                     # skip patterns supported by fewer seqlets
seq_height = 1.5
n_bootstrap = 100                # overrides the None default above
# BUG FIX: the template must contain a "{}" placeholder -- without it,
# fpath_template.format(j) is a no-op and every pattern's figure overwrites
# the same file.
fig_dir = output_dir + "_figures"
os.makedirs(fig_dir, exist_ok=True)
fpath_template = fig_dir + "/{}"
figsize = (12, 2.5)
# Observed profile tracks per task, restricted to the included examples
tracks = {task: y[f"profile/{task}"][incl] for task in tasks}
x = x[incl]
import matplotlib.pyplot as plt
from concise.utils.plot import seqlogo_fig, seqlogo
print(mr.patterns())
def bootstrap_mean(x, n=100):
    """Bootstrap estimate of the mean of `x` along axis 0.

    Args:
        x: array of shape (n_samples, ...), indexed along the first axis.
        n: number of bootstrap resamples.

    Returns:
        (mean, std): mean and standard deviation of the per-resample means,
        each of shape x.shape[1:].
    """
    means = []
    for _ in range(n):
        # Sample len(x) rows with replacement -- plain numpy instead of the
        # original pandas Series.sample round-trip (same semantics, no
        # pandas dependency, faster).
        idx = np.random.randint(0, len(x), size=len(x))
        means.append(x[idx].mean(0))
    stacked = np.stack(means)
    return stacked.mean(0), stacked.std(0)
import pandas as pd
from matplotlib.ticker import FormatStrFormatter
from basepair.plots import show_figure
# y-limits; the plotting loop only applies them per-axis when this is a list
# of [lo, hi] pairs, so a flat [0, 3] leaves the axes auto-scaled.
ylim=[0, 3]
%matplotlib inline
# Plot each discovered motif: per-task strand profiles on top, the
# information-content-scaled sequence logo at the bottom.
# NOTE(review): the inner loop reuses the names `i` and `y`, shadowing the
# outer pattern index (preserved in `j`) and the target dict `y`.
for i, pattern in enumerate(mr.patterns()):
    j = i  # keep the pattern index; `i` is shadowed by the inner loop below
    seqs = mr.extract_signal(x)[pattern]
    sequence = ic_scale(seqs.mean(axis=0))
    if rc_vec is not None and rc_vec[i]:
        sequence = rc_fn(sequence)
    if start_vec is not None:
        # NOTE(review): `width` is not defined in this file; this branch only
        # runs when start_vec is provided -- confirm `width` is set upstream.
        start = start_vec[i]
        sequence = sequence[start:(start + width)]
    n = len(seqs)
    if n < n_limit:
        continue  # skip weakly-supported patterns
    # One row per task track plus one for the sequence logo
    fig, ax = plt.subplots(1 + len(tracks), 1, sharex=True,
                           figsize=figsize,
                           gridspec_kw={'height_ratios': [1] * len(tracks) + [seq_height]})
    ax[0].set_title(f"motif {i} ({n})")
    for i, (k, y) in enumerate(tracks.items()):
        # Aggregate profile signal across this pattern's seqlets, optionally
        # bootstrapped to get a std estimate for the shaded band.
        signal = mr.extract_signal(y, rc_fn)[pattern]
        if start_vec is not None:
            start = start_vec[i]
            signal = signal[:, start:(start + width)]
        if n_bootstrap is None:
            signal_mean = signal.mean(axis=0)
            signal_std = signal.std(axis=0)
        else:
            signal_mean, signal_std = bootstrap_mean(signal, n=n_bootstrap)
        if rc_vec is not None and rc_vec[i]:
            signal_mean = rc_fn(signal_mean)
            signal_std = rc_fn(signal_std)
        # Positive strand (column 0) with a +/- 2 std band
        ax[i].plot(np.arange(1, len(signal_mean) + 1), signal_mean[:, 0], label='pos')
        if n_bootstrap is not None:
            ax[i].fill_between(np.arange(1, len(signal_mean) + 1),
                               signal_mean[:, 0] - 2 * signal_std[:, 0],
                               signal_mean[:, 0] + 2 * signal_std[:, 0],
                               alpha=0.1)
            # label='pos')
        # Negative strand (column 1)
        ax[i].plot(np.arange(1, len(signal_mean) + 1), signal_mean[:, 1], label='neg')
        if n_bootstrap is not None:
            ax[i].fill_between(np.arange(1, len(signal_mean) + 1),
                               signal_mean[:, 1] - 2 * signal_std[:, 1],
                               signal_mean[:, 1] + 2 * signal_std[:, 1],
                               alpha=0.1)
            # label='pos')
        ax[i].set_ylabel(f"{k}")
        ax[i].yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['right'].set_visible(False)
        ax[i].spines['bottom'].set_visible(False)
        ax[i].xaxis.set_ticks_position('none')
        if isinstance(ylim[0], list):
            # per-track limits only when ylim is a list of [lo, hi] pairs
            ax[i].set_ylim(ylim[i])
        if legend:
            ax[i].legend()
    # Bottom axis: sequence logo of the pattern
    seqlogo(sequence, ax=ax[-1])
    ax[-1].yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    ax[-1].set_ylabel("Inf. content")
    ax[-1].spines['top'].set_visible(False)
    ax[-1].spines['right'].set_visible(False)
    ax[-1].spines['bottom'].set_visible(False)
    ax[-1].set_xticks(list(range(0, len(sequence) + 1, 5)))
    if fpath_template is not None:
        # .format(j) injects the pattern index wherever the template contains
        # a "{}" placeholder
        plt.savefig(fpath_template.format(j) + '.png', dpi=600)
        plt.savefig(fpath_template.format(j) + '.pdf', dpi=600)
        plt.close(fig)  # close the figure
    show_figure(fig)
    plt.show()