Goal

  • train a decent seq -> profile + counts model for ChIP-seq

Resources

  • washu
    • session: 7-w-sox2-oct4-chipseq
In [1]:
import basepair
import modisco
Using TensorFlow backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'/usr/bin/ld: cannot find -lcudnn\ncollect2: error: ld returned 1 exit status\n'
Mapped name None to device cuda: GeForce GTX TITAN X (0000:09:00.0)
In [2]:
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from basepair.preproc import AppendTotalCounts
from basepair.config import get_data_dir, create_tf_session
2018-07-29 12:48:29,677 [WARNING] git-lfs not installed
2018-07-29 12:48:29,716 [WARNING] git-lfs not installed
In [3]:
# Use gpus 3, 5
# NOTE(review): TensorFlow/theano were already imported (and a CUDA device
# mapped) in the cells above, so setting CUDA_VISIBLE_DEVICES here may have no
# effect — it must be set before the first framework import. Also the value
# contains a space ("3, 5"); CUDA conventionally expects a comma-separated
# list without spaces ("3,5") — verify this is parsed as intended.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "3, 5"
In [4]:
ddir = get_data_dir()
In [5]:
ddir
Out[5]:
'/srv/scratch/amr1/chipseq/basepair/basepair/../data'
In [6]:
bdir = "/srv/scratch/amr1/chipseq/sox2-oct4-chipseq/"

# Data specification: per-task strand-specific count bigwigs + IDR-filtered
# peak calls, plus the mm10 reference fasta.
# NOTE(review): Oct4 uses pos2.bw/neg2.bw while Sox2 uses pos.bw/neg.bw —
# confirm this asymmetry is intentional and not a stale filename.
ds = DataSpec(task_specs={"Sox2": TaskSpec(task="Sox2",
                                           pos_counts=f"{bdir}/Sox2/pos.bw",
                                           neg_counts=f"{bdir}/Sox2/neg.bw",
                                           peaks=f"{bdir}/Sox2/Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz",
                                          ),
                          "Oct4": TaskSpec(task="Oct4",
                                           pos_counts=f"{bdir}/Oct4/pos2.bw",
                                           neg_counts=f"{bdir}/Oct4/neg2.bw",
                                           peaks=f"{bdir}/Oct4/Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz",
                                          )
                         },
              fasta_file="/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta"
             )
In [7]:
def ds2bws(ds):
    """Map each task in a DataSpec to its {'pos', 'neg'} count-bigwig paths."""
    bigwigs = {}
    for name, spec in ds.task_specs.items():
        bigwigs[name] = {"pos": spec.pos_counts, "neg": spec.neg_counts}
    return bigwigs
In [8]:
# Get the training data
# Returns (x, y, metadata) tuples per split; peak_width=1000 presumably resizes
# each peak region to 1000 bp — TODO confirm against chip_exo_nexus.
train, valid, test = chip_exo_nexus(ds, peak_width=1000)
100%|██████████| 23474/23474 [00:00<00:00, 719447.37it/s]
2018-07-29 12:48:32,612 [INFO] extract sequence
2018-07-29 12:48:36,071 [INFO] extract counts
100%|██████████| 2/2 [00:17<00:00,  8.90s/it]
In [9]:
# Pre-process the labels: fit the total-count transform on the training split,
# then apply it to every split.
preproc = AppendTotalCounts()
preproc.fit(train[1])
for split in (train, valid, test):
    split[1] = preproc.transform(split[1])
In [10]:
train[1].keys()
Out[10]:
dict_keys(['profile/Sox2', 'profile/Oct4', 'counts/Sox2', 'counts/Oct4'])
In [11]:
tasks=['Sox2', 'Oct4']
In [12]:
# TODO - play around with this
# NOTE(review): kl/Model/Adam/twochannel_multinomial_nll are imported in LATER
# cells (In [13]/[14]) — run those first, or this cell fails on a fresh kernel.
def seq_multitask_chipseq(filters=21,
                          conv1_kernel_size=21,
                          tconv_kernel_size=25,
                          #tconv_kernel_size2=25,
                          n_dil_layers=6,
                          lr=0.004,
                          c_task_weight=100,
                          use_profile=True,
                          use_counts=True,
                          tasks=tasks,
                          seq_len=201):
    """Build a multi-task Keras model mapping one-hot DNA sequence to
    strand-specific per-base ChIP-seq profiles and/or total counts.

    Architecture: one wide conv, a stack of dilated convs with additive skip
    connections, then a transposed-conv head (profiles) and a global-average-
    pooled Dense head (counts).

    Args:
      filters: number of filters in every conv layer
      conv1_kernel_size: kernel size of the first conv layer
      tconv_kernel_size: kernel size of the transposed conv (profile head)
      n_dil_layers: number of dilated conv layers (dilation rate 2**i)
      lr: Adam learning rate
      c_task_weight: loss weight up-weighting the count-prediction task
      use_profile: include the per-base profile outputs
      use_counts: include the total-count outputs
      tasks: list of task (TF) names; default binds the module-level `tasks`
      seq_len: input sequence length (bp)

    Returns:
      A compiled keras Model with one output per enabled (head, task) pair,
      named "profile/<task>" and "counts/<task>".
    """
    # TODO - build the reverse complement symmetry into the model
    inp = kl.Input(shape=(seq_len, 4), name='seq')
    first_conv = kl.Conv1D(filters,
                           kernel_size=conv1_kernel_size,
                           padding='same',
                           activation='relu')(inp)

    # Additive skip connections: each dilated conv sees the elementwise sum of
    # all previous layer outputs (i == 1 is special-cased since there is only
    # one previous tensor and kl.add needs >= 2 inputs).
    prev_layers = [first_conv]
    for i in range(1, n_dil_layers + 1):
        if i == 1:
            prev_sum = first_conv
        else:
            prev_sum = kl.add(prev_layers)
        conv_output = kl.Conv1D(filters, kernel_size=3, padding='same', activation='relu', dilation_rate=2**i)(prev_sum)
        prev_layers.append(conv_output)

    combined_conv = kl.add(prev_layers)

    # De-conv
    # Transposed conv over the length axis; 2 channels (strands) per task.
    x = kl.Reshape((-1, 1, filters))(combined_conv)
    x = kl.Conv2DTranspose(2*len(tasks), kernel_size=(tconv_kernel_size, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(len(tasks), kernel_size=(tconv_kernel_size2, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(int(len(tasks)/2), kernel_size=(tconv_kernel_size3, 1), padding='same')(x)
    out = kl.Reshape((-1, 2 * len(tasks)))(x)

    # setup the output branches
    outputs = []
    losses = []
    loss_weights = []

    if use_profile:
        # Slice channel pair (2i, 2i+1) of the shared profile tensor for task i.
        # `arguments={"i": i}` binds i at layer-construction time (avoids the
        # late-binding closure pitfall).
        output = [kl.Lambda(lambda x, i: x[:, :, (2 * i):(2 * i + 2)],
                            output_shape=(seq_len, 2),
                            name="profile/" + task,
                            arguments={"i": i})(out)
                  for i, task in enumerate(tasks)]
        outputs += output
        losses += [twochannel_multinomial_nll] * len(tasks)
        loss_weights += [1] * len(tasks)

    if use_counts:
        # Count heads read the pooled bottleneck, not the profile head.
        pooled = kl.GlobalAvgPool1D()(combined_conv)
        counts = [kl.Dense(2, name="counts/" + task)(pooled)
                  for task in tasks]
        outputs += counts
        losses += ["mse"] * len(tasks)
        loss_weights += [c_task_weight] * len(tasks)

    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=losses, loss_weights=loss_weights)
    return model

Set up a new model with two output branches

In [13]:
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
In [14]:
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
In [15]:
i=1
In [16]:
def get_model(mfn, mkwargs, fkwargs, i):
    """Instantiate the model builder named `mfn` and derive its checkpoint path.

    Args:
      mfn: name of a model-building function defined in this notebook
           (e.g. "seq_multitask_chipseq")
      mkwargs: hyper-parameters; encoded into the model/checkpoint name
      fkwargs: fixed kwargs passed to the builder but excluded from the name
      i: integer suffix distinguishing repeated runs with identical kwargs

    Returns:
      (model, name, ckp_file) tuple.
    """
    mdir = f"{ddir}/processed/chipseq/exp/models/count+profile"
    name = mfn + "_" + \
            ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
            "." + str(i)
    # os.makedirs is portable and works outside IPython (was `!mkdir -p {mdir}`).
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    all_kwargs = {**mkwargs, **fkwargs}
    # Look the builder up by name instead of eval() — same effect for a plain
    # function name, without evaluating arbitrary expressions.
    # (Also dropped: an unused `import datetime` and a dead local `i += 1`
    # that rebound the parameter after `name` was already built.)
    return globals()[mfn](**all_kwargs), name, ckp_file

train the model

  • add the training curve plot below training
In [17]:
# hyper-parameters
mfn = "seq_multitask_chipseq"
use_profile = True
use_counts = True
mkwargs = dict(filters=32, 
               conv1_kernel_size=21,
               tconv_kernel_size=100,
               n_dil_layers=6,
               use_profile=use_profile,
               use_counts=use_counts,
               c_task_weight=10,
               seq_len=train[0].shape[1],  # actual peak width from the loaded data
               lr=0.004)
# kwargs passed to the model builder but excluded from the checkpoint name
fixed_kwargs = dict(
    tasks=list(ds.task_specs)
)
In [18]:
import numpy as np
# Seed numpy for reproducibility.
# NOTE(review): keras/TF weight initialization is not seeded here, so training
# runs are still not fully reproducible.
np.random.seed(20)
i += 1  # bump the run counter so the checkpoint name is fresh
model, name, ckp_file = get_model(mfn, mkwargs, fixed_kwargs, i)
history = model.fit(train[0], 
                    train[1],
          batch_size=256, 
          epochs=100,
          validation_data=valid[:2],
          callbacks=[EarlyStopping(patience=5),
                     History(),
                     ModelCheckpoint(ckp_file, save_best_only=True)]
         )

# Reload the best (lowest val_loss) checkpoint written by ModelCheckpoint;
# the custom loss must be passed explicitly for deserialization.
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-07-29 12:48:57,371 [WARNING] From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-07-29 12:49:05,548 [WARNING] From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
Train on 14727 samples, validate on 4493 samples
Epoch 1/100
14727/14727 [==============================] - 11s 740us/step - loss: 616.2985 - profile/Sox2_loss: 168.1233 - profile/Oct4_loss: 425.4137 - counts/Sox2_loss: 1.0797 - counts/Oct4_loss: 1.1964 - val_loss: 611.2807 - val_profile/Sox2_loss: 167.3001 - val_profile/Oct4_loss: 423.6395 - val_counts/Sox2_loss: 1.0032 - val_counts/Oct4_loss: 1.0309
Epoch 2/100
14727/14727 [==============================] - 5s 344us/step - loss: 590.0202 - profile/Sox2_loss: 164.2312 - profile/Oct4_loss: 405.7877 - counts/Sox2_loss: 0.9990 - counts/Oct4_loss: 1.0011 - val_loss: 590.4670 - val_profile/Sox2_loss: 164.3758 - val_profile/Oct4_loss: 405.8264 - val_counts/Sox2_loss: 1.0015 - val_counts/Oct4_loss: 1.0250
Epoch 3/100
14727/14727 [==============================] - 5s 337us/step - loss: 577.4205 - profile/Sox2_loss: 162.3679 - profile/Oct4_loss: 395.0719 - counts/Sox2_loss: 0.9974 - counts/Oct4_loss: 1.0007 - val_loss: 587.1208 - val_profile/Sox2_loss: 163.8527 - val_profile/Oct4_loss: 403.0233 - val_counts/Sox2_loss: 1.0000 - val_counts/Oct4_loss: 1.0245
Epoch 4/100
14727/14727 [==============================] - 5s 323us/step - loss: 575.2716 - profile/Sox2_loss: 161.9818 - profile/Oct4_loss: 393.3545 - counts/Sox2_loss: 0.9937 - counts/Oct4_loss: 0.9998 - val_loss: 585.3767 - val_profile/Sox2_loss: 163.3967 - val_profile/Oct4_loss: 401.7948 - val_counts/Sox2_loss: 0.9948 - val_counts/Oct4_loss: 1.0237
Epoch 5/100
14727/14727 [==============================] - 5s 342us/step - loss: 573.6975 - profile/Sox2_loss: 161.5952 - profile/Oct4_loss: 392.2085 - counts/Sox2_loss: 0.9914 - counts/Oct4_loss: 0.9980 - val_loss: 584.1235 - val_profile/Sox2_loss: 162.9408 - val_profile/Oct4_loss: 401.0025 - val_counts/Sox2_loss: 0.9933 - val_counts/Oct4_loss: 1.0248
Epoch 6/100
14727/14727 [==============================] - 5s 352us/step - loss: 572.3057 - profile/Sox2_loss: 161.0567 - profile/Oct4_loss: 391.3982 - counts/Sox2_loss: 0.9892 - counts/Oct4_loss: 0.9959 - val_loss: 582.2802 - val_profile/Sox2_loss: 162.5105 - val_profile/Oct4_loss: 399.6626 - val_counts/Sox2_loss: 0.9899 - val_counts/Oct4_loss: 1.0208
Epoch 7/100
14727/14727 [==============================] - 5s 347us/step - loss: 571.2442 - profile/Sox2_loss: 160.7921 - profile/Oct4_loss: 390.6589 - counts/Sox2_loss: 0.9851 - counts/Oct4_loss: 0.9942 - val_loss: 581.4387 - val_profile/Sox2_loss: 162.2809 - val_profile/Oct4_loss: 399.1247 - val_counts/Sox2_loss: 0.9826 - val_counts/Oct4_loss: 1.0207
Epoch 8/100
14727/14727 [==============================] - 5s 334us/step - loss: 570.4149 - profile/Sox2_loss: 160.6196 - profile/Oct4_loss: 390.0842 - counts/Sox2_loss: 0.9806 - counts/Oct4_loss: 0.9905 - val_loss: 581.2064 - val_profile/Sox2_loss: 162.3459 - val_profile/Oct4_loss: 398.8822 - val_counts/Sox2_loss: 0.9816 - val_counts/Oct4_loss: 1.0163
Epoch 9/100
14727/14727 [==============================] - 5s 340us/step - loss: 569.5685 - profile/Sox2_loss: 160.4657 - profile/Oct4_loss: 389.4840 - counts/Sox2_loss: 0.9759 - counts/Oct4_loss: 0.9860 - val_loss: 579.8365 - val_profile/Sox2_loss: 162.1990 - val_profile/Oct4_loss: 397.8087 - val_counts/Sox2_loss: 0.9754 - val_counts/Oct4_loss: 1.0075
Epoch 10/100
14727/14727 [==============================] - 5s 339us/step - loss: 569.0530 - profile/Sox2_loss: 160.3778 - profile/Oct4_loss: 389.1496 - counts/Sox2_loss: 0.9710 - counts/Oct4_loss: 0.9816 - val_loss: 580.0216 - val_profile/Sox2_loss: 162.2576 - val_profile/Oct4_loss: 397.9768 - val_counts/Sox2_loss: 0.9723 - val_counts/Oct4_loss: 1.0064
Epoch 11/100
14727/14727 [==============================] - 5s 351us/step - loss: 568.6404 - profile/Sox2_loss: 160.3320 - profile/Oct4_loss: 388.8379 - counts/Sox2_loss: 0.9687 - counts/Oct4_loss: 0.9784 - val_loss: 579.7000 - val_profile/Sox2_loss: 162.1837 - val_profile/Oct4_loss: 397.9059 - val_counts/Sox2_loss: 0.9586 - val_counts/Oct4_loss: 1.0025
Epoch 12/100
14727/14727 [==============================] - 5s 344us/step - loss: 567.8617 - profile/Sox2_loss: 160.1421 - profile/Oct4_loss: 388.4118 - counts/Sox2_loss: 0.9594 - counts/Oct4_loss: 0.9713 - val_loss: 579.0232 - val_profile/Sox2_loss: 162.0705 - val_profile/Oct4_loss: 397.5531 - val_counts/Sox2_loss: 0.9523 - val_counts/Oct4_loss: 0.9877
Epoch 13/100
14727/14727 [==============================] - 5s 343us/step - loss: 567.1788 - profile/Sox2_loss: 160.0214 - profile/Oct4_loss: 388.0327 - counts/Sox2_loss: 0.9494 - counts/Oct4_loss: 0.9631 - val_loss: 578.0557 - val_profile/Sox2_loss: 161.8836 - val_profile/Oct4_loss: 396.9183 - val_counts/Sox2_loss: 0.9450 - val_counts/Oct4_loss: 0.9804
Epoch 14/100
14727/14727 [==============================] - 5s 340us/step - loss: 567.1793 - profile/Sox2_loss: 159.9926 - profile/Oct4_loss: 388.0134 - counts/Sox2_loss: 0.9534 - counts/Oct4_loss: 0.9639 - val_loss: 579.1486 - val_profile/Sox2_loss: 161.9504 - val_profile/Oct4_loss: 397.3281 - val_counts/Sox2_loss: 0.9772 - val_counts/Oct4_loss: 1.0098
Epoch 15/100
14727/14727 [==============================] - 5s 340us/step - loss: 566.9933 - profile/Sox2_loss: 159.9754 - profile/Oct4_loss: 387.7619 - counts/Sox2_loss: 0.9576 - counts/Oct4_loss: 0.9680 - val_loss: 577.8493 - val_profile/Sox2_loss: 161.8063 - val_profile/Oct4_loss: 396.7809 - val_counts/Sox2_loss: 0.9468 - val_counts/Oct4_loss: 0.9794
Epoch 16/100
14727/14727 [==============================] - 5s 348us/step - loss: 566.0816 - profile/Sox2_loss: 159.7599 - profile/Oct4_loss: 387.3416 - counts/Sox2_loss: 0.9411 - counts/Oct4_loss: 0.9569 - val_loss: 577.7800 - val_profile/Sox2_loss: 161.9021 - val_profile/Oct4_loss: 396.7032 - val_counts/Sox2_loss: 0.9439 - val_counts/Oct4_loss: 0.9736
Epoch 17/100
14727/14727 [==============================] - 5s 341us/step - loss: 566.0001 - profile/Sox2_loss: 159.7760 - profile/Oct4_loss: 387.2802 - counts/Sox2_loss: 0.9399 - counts/Oct4_loss: 0.9545 - val_loss: 580.4630 - val_profile/Sox2_loss: 162.2881 - val_profile/Oct4_loss: 397.4807 - val_counts/Sox2_loss: 1.0036 - val_counts/Oct4_loss: 1.0658
Epoch 18/100
14727/14727 [==============================] - 5s 347us/step - loss: 566.0112 - profile/Sox2_loss: 159.7592 - profile/Oct4_loss: 387.2035 - counts/Sox2_loss: 0.9477 - counts/Oct4_loss: 0.9572 - val_loss: 577.2630 - val_profile/Sox2_loss: 161.7469 - val_profile/Oct4_loss: 396.5049 - val_counts/Sox2_loss: 0.9305 - val_counts/Oct4_loss: 0.9707
Epoch 19/100
14727/14727 [==============================] - 5s 329us/step - loss: 565.2529 - profile/Sox2_loss: 159.6692 - profile/Oct4_loss: 386.7884 - counts/Sox2_loss: 0.9314 - counts/Oct4_loss: 0.9481 - val_loss: 576.8294 - val_profile/Sox2_loss: 161.7665 - val_profile/Oct4_loss: 396.0169 - val_counts/Sox2_loss: 0.9346 - val_counts/Oct4_loss: 0.9700
Epoch 20/100
14727/14727 [==============================] - 5s 334us/step - loss: 564.8111 - profile/Sox2_loss: 159.6015 - profile/Oct4_loss: 386.5758 - counts/Sox2_loss: 0.9227 - counts/Oct4_loss: 0.9406 - val_loss: 577.6219 - val_profile/Sox2_loss: 161.8357 - val_profile/Oct4_loss: 396.6248 - val_counts/Sox2_loss: 0.9519 - val_counts/Oct4_loss: 0.9642
Epoch 21/100
14727/14727 [==============================] - 5s 340us/step - loss: 564.9956 - profile/Sox2_loss: 159.5956 - profile/Oct4_loss: 386.6369 - counts/Sox2_loss: 0.9296 - counts/Oct4_loss: 0.9467 - val_loss: 577.8979 - val_profile/Sox2_loss: 161.9885 - val_profile/Oct4_loss: 396.7566 - val_counts/Sox2_loss: 0.9325 - val_counts/Oct4_loss: 0.9828
Epoch 22/100
14727/14727 [==============================] - 5s 337us/step - loss: 564.3485 - profile/Sox2_loss: 159.5208 - profile/Oct4_loss: 386.3691 - counts/Sox2_loss: 0.9123 - counts/Oct4_loss: 0.9336 - val_loss: 576.9343 - val_profile/Sox2_loss: 161.7573 - val_profile/Oct4_loss: 396.1260 - val_counts/Sox2_loss: 0.9392 - val_counts/Oct4_loss: 0.9659
Epoch 23/100
14727/14727 [==============================] - 5s 340us/step - loss: 564.1412 - profile/Sox2_loss: 159.5028 - profile/Oct4_loss: 386.1616 - counts/Sox2_loss: 0.9153 - counts/Oct4_loss: 0.9324 - val_loss: 576.7562 - val_profile/Sox2_loss: 161.7960 - val_profile/Oct4_loss: 396.0713 - val_counts/Sox2_loss: 0.9178 - val_counts/Oct4_loss: 0.9711
Epoch 24/100
14727/14727 [==============================] - 5s 324us/step - loss: 563.8476 - profile/Sox2_loss: 159.4296 - profile/Oct4_loss: 385.9918 - counts/Sox2_loss: 0.9126 - counts/Oct4_loss: 0.9300 - val_loss: 576.5019 - val_profile/Sox2_loss: 161.7891 - val_profile/Oct4_loss: 396.2264 - val_counts/Sox2_loss: 0.9004 - val_counts/Oct4_loss: 0.9482
Epoch 25/100
14727/14727 [==============================] - 5s 339us/step - loss: 563.5542 - profile/Sox2_loss: 159.3935 - profile/Oct4_loss: 385.9122 - counts/Sox2_loss: 0.9022 - counts/Oct4_loss: 0.9227 - val_loss: 576.8716 - val_profile/Sox2_loss: 162.1400 - val_profile/Oct4_loss: 395.9231 - val_counts/Sox2_loss: 0.9263 - val_counts/Oct4_loss: 0.9546
Epoch 26/100
14727/14727 [==============================] - 5s 331us/step - loss: 563.8137 - profile/Sox2_loss: 159.4254 - profile/Oct4_loss: 385.8483 - counts/Sox2_loss: 0.9163 - counts/Oct4_loss: 0.9378 - val_loss: 576.2105 - val_profile/Sox2_loss: 161.7627 - val_profile/Oct4_loss: 395.7880 - val_counts/Sox2_loss: 0.9127 - val_counts/Oct4_loss: 0.9533
Epoch 27/100
14727/14727 [==============================] - 5s 343us/step - loss: 562.8496 - profile/Sox2_loss: 159.3221 - profile/Oct4_loss: 385.5447 - counts/Sox2_loss: 0.8871 - counts/Oct4_loss: 0.9112 - val_loss: 576.2715 - val_profile/Sox2_loss: 161.7833 - val_profile/Oct4_loss: 396.0878 - val_counts/Sox2_loss: 0.9015 - val_counts/Oct4_loss: 0.9386
Epoch 28/100
14727/14727 [==============================] - 5s 338us/step - loss: 563.8816 - profile/Sox2_loss: 159.4576 - profile/Oct4_loss: 385.9857 - counts/Sox2_loss: 0.9125 - counts/Oct4_loss: 0.9313 - val_loss: 576.6359 - val_profile/Sox2_loss: 161.9977 - val_profile/Oct4_loss: 396.1833 - val_counts/Sox2_loss: 0.9007 - val_counts/Oct4_loss: 0.9448
Epoch 29/100
14727/14727 [==============================] - 5s 338us/step - loss: 562.7018 - profile/Sox2_loss: 159.2940 - profile/Oct4_loss: 385.3365 - counts/Sox2_loss: 0.8898 - counts/Oct4_loss: 0.9173 - val_loss: 575.6684 - val_profile/Sox2_loss: 161.9328 - val_profile/Oct4_loss: 395.6256 - val_counts/Sox2_loss: 0.8853 - val_counts/Oct4_loss: 0.9257
Epoch 30/100
14727/14727 [==============================] - 5s 333us/step - loss: 562.8837 - profile/Sox2_loss: 159.3045 - profile/Oct4_loss: 385.5524 - counts/Sox2_loss: 0.8887 - counts/Oct4_loss: 0.9140 - val_loss: 575.9025 - val_profile/Sox2_loss: 161.7085 - val_profile/Oct4_loss: 395.7014 - val_counts/Sox2_loss: 0.9000 - val_counts/Oct4_loss: 0.9492
Epoch 31/100
14727/14727 [==============================] - 5s 334us/step - loss: 562.3067 - profile/Sox2_loss: 159.2100 - profile/Oct4_loss: 385.2259 - counts/Sox2_loss: 0.8793 - counts/Oct4_loss: 0.9078 - val_loss: 576.0222 - val_profile/Sox2_loss: 161.8252 - val_profile/Oct4_loss: 396.2157 - val_counts/Sox2_loss: 0.8683 - val_counts/Oct4_loss: 0.9298
Epoch 32/100
14727/14727 [==============================] - 5s 328us/step - loss: 561.8527 - profile/Sox2_loss: 159.2099 - profile/Oct4_loss: 385.0354 - counts/Sox2_loss: 0.8666 - counts/Oct4_loss: 0.8941 - val_loss: 575.3339 - val_profile/Sox2_loss: 161.8051 - val_profile/Oct4_loss: 395.7285 - val_counts/Sox2_loss: 0.8621 - val_counts/Oct4_loss: 0.9179
Epoch 33/100
14727/14727 [==============================] - 5s 336us/step - loss: 561.5871 - profile/Sox2_loss: 159.2508 - profile/Oct4_loss: 384.9918 - counts/Sox2_loss: 0.8514 - counts/Oct4_loss: 0.8830 - val_loss: 576.1467 - val_profile/Sox2_loss: 162.0204 - val_profile/Oct4_loss: 396.1432 - val_counts/Sox2_loss: 0.8822 - val_counts/Oct4_loss: 0.9161
Epoch 34/100
14727/14727 [==============================] - 5s 331us/step - loss: 561.4060 - profile/Sox2_loss: 159.2137 - profile/Oct4_loss: 384.9560 - counts/Sox2_loss: 0.8456 - counts/Oct4_loss: 0.8780 - val_loss: 576.2467 - val_profile/Sox2_loss: 161.9441 - val_profile/Oct4_loss: 396.6798 - val_counts/Sox2_loss: 0.8586 - val_counts/Oct4_loss: 0.9037
Epoch 35/100
14727/14727 [==============================] - 5s 345us/step - loss: 561.3828 - profile/Sox2_loss: 159.1886 - profile/Oct4_loss: 384.9884 - counts/Sox2_loss: 0.8434 - counts/Oct4_loss: 0.8772 - val_loss: 579.4732 - val_profile/Sox2_loss: 162.1436 - val_profile/Oct4_loss: 397.0400 - val_counts/Sox2_loss: 1.0113 - val_counts/Oct4_loss: 1.0176
Epoch 36/100
14727/14727 [==============================] - 5s 339us/step - loss: 561.4723 - profile/Sox2_loss: 159.1599 - profile/Oct4_loss: 384.9105 - counts/Sox2_loss: 0.8549 - counts/Oct4_loss: 0.8853 - val_loss: 575.9091 - val_profile/Sox2_loss: 161.8777 - val_profile/Oct4_loss: 395.8466 - val_counts/Sox2_loss: 0.8754 - val_counts/Oct4_loss: 0.9430
Epoch 37/100
14727/14727 [==============================] - 5s 321us/step - loss: 561.2055 - profile/Sox2_loss: 159.1677 - profile/Oct4_loss: 384.7108 - counts/Sox2_loss: 0.8541 - counts/Oct4_loss: 0.8786 - val_loss: 575.2481 - val_profile/Sox2_loss: 161.8570 - val_profile/Oct4_loss: 395.7086 - val_counts/Sox2_loss: 0.8558 - val_counts/Oct4_loss: 0.9124
Epoch 38/100
14727/14727 [==============================] - 5s 336us/step - loss: 560.9513 - profile/Sox2_loss: 159.1115 - profile/Oct4_loss: 384.7184 - counts/Sox2_loss: 0.8380 - counts/Oct4_loss: 0.8741 - val_loss: 576.4704 - val_profile/Sox2_loss: 162.0339 - val_profile/Oct4_loss: 395.8753 - val_counts/Sox2_loss: 0.9064 - val_counts/Oct4_loss: 0.9497
Epoch 39/100
14727/14727 [==============================] - 5s 334us/step - loss: 560.6209 - profile/Sox2_loss: 159.0832 - profile/Oct4_loss: 384.6439 - counts/Sox2_loss: 0.8272 - counts/Oct4_loss: 0.8622 - val_loss: 576.4714 - val_profile/Sox2_loss: 162.1864 - val_profile/Oct4_loss: 395.8724 - val_counts/Sox2_loss: 0.8894 - val_counts/Oct4_loss: 0.9519
Epoch 40/100
14727/14727 [==============================] - 5s 347us/step - loss: 560.5624 - profile/Sox2_loss: 159.0577 - profile/Oct4_loss: 384.4870 - counts/Sox2_loss: 0.8332 - counts/Oct4_loss: 0.8686 - val_loss: 575.0045 - val_profile/Sox2_loss: 161.8508 - val_profile/Oct4_loss: 395.6001 - val_counts/Sox2_loss: 0.8563 - val_counts/Oct4_loss: 0.8991
Epoch 41/100
14727/14727 [==============================] - 5s 343us/step - loss: 560.1092 - profile/Sox2_loss: 159.0251 - profile/Oct4_loss: 384.3769 - counts/Sox2_loss: 0.8180 - counts/Oct4_loss: 0.8528 - val_loss: 575.3858 - val_profile/Sox2_loss: 161.9594 - val_profile/Oct4_loss: 395.7844 - val_counts/Sox2_loss: 0.8477 - val_counts/Oct4_loss: 0.9165
Epoch 42/100
14727/14727 [==============================] - 5s 345us/step - loss: 559.7493 - profile/Sox2_loss: 159.0084 - profile/Oct4_loss: 384.2586 - counts/Sox2_loss: 0.8054 - counts/Oct4_loss: 0.8429 - val_loss: 578.1461 - val_profile/Sox2_loss: 162.4135 - val_profile/Oct4_loss: 396.4048 - val_counts/Sox2_loss: 0.9854 - val_counts/Oct4_loss: 0.9474
Epoch 43/100
14727/14727 [==============================] - 5s 325us/step - loss: 560.7187 - profile/Sox2_loss: 159.0769 - profile/Oct4_loss: 384.3721 - counts/Sox2_loss: 0.8481 - counts/Oct4_loss: 0.8789 - val_loss: 576.8943 - val_profile/Sox2_loss: 162.0081 - val_profile/Oct4_loss: 396.2313 - val_counts/Sox2_loss: 0.9088 - val_counts/Oct4_loss: 0.9567
Epoch 44/100
14727/14727 [==============================] - 5s 332us/step - loss: 560.2272 - profile/Sox2_loss: 159.0347 - profile/Oct4_loss: 384.2056 - counts/Sox2_loss: 0.8305 - counts/Oct4_loss: 0.8682 - val_loss: 575.5708 - val_profile/Sox2_loss: 161.9421 - val_profile/Oct4_loss: 395.8907 - val_counts/Sox2_loss: 0.8601 - val_counts/Oct4_loss: 0.9137
Epoch 45/100
14727/14727 [==============================] - 5s 343us/step - loss: 559.7672 - profile/Sox2_loss: 158.9676 - profile/Oct4_loss: 384.0755 - counts/Sox2_loss: 0.8186 - counts/Oct4_loss: 0.8538 - val_loss: 575.4086 - val_profile/Sox2_loss: 162.0207 - val_profile/Oct4_loss: 396.0208 - val_counts/Sox2_loss: 0.8410 - val_counts/Oct4_loss: 0.8957
In [19]:
from basepair.eval import evaluate
# Evaluate the reloaded best checkpoint on the validation split.
evaluate(model, valid[0], valid[1])
Out[19]:
{'loss': 575.0044803581178,
 'profile/Sox2_loss': 161.8507914404123,
 'profile/Oct4_loss': 395.6000949393501,
 'counts/Sox2_loss': 0.8562890054546537,
 'counts/Oct4_loss': 0.8990698542595971}
In [20]:
model_dir = "/srv/scratch/amr1/chipseq/basepair/basepair/../data/processed/chipseq/exp/models/count+profile/"
In [21]:
import os
import theano
import numpy as np
import modisco
import modisco.tfmodisco_workflow.workflow
import os
from basepair.cli.schemas import DataSpec, HParams
from basepair.modisco import ModiscoResult
from scipy.spatial.distance import correlation
from concise.utils.helper import write_json, read_json
from basepair.data import numpy_minibatch
import h5py
import numpy as np
import keras.backend as K
from basepair.config import create_tf_session
from keras.models import load_model
from basepair.cli.evaluate import load_data
import matplotlib.pyplot as plt
plt.switch_backend('agg')
In [22]:
# Bundle the splits so one can be selected by name below
data = dict(train=train, valid=valid, test=test)
x, y, m = data['valid']  # presumably (inputs, labels, metadata) — confirm `m`
In [23]:
# Which output head to take gradients w.r.t., and how to summarize the profile
grad_wrt='profile/Oct4'
summary='weighted'
output_dir="outputs_profile_oct4"
hp = HParams.load("/srv/scratch/amr1/chipseq/basepair-workflow/hparams.yaml")
os.makedirs(output_dir, exist_ok=True)
# Record the run configuration next to the outputs for provenance
write_json(dict(model_dir=os.path.abspath(model_dir),
                grad_wrt=grad_wrt,
                output_dir=output_dir,
                summary=summary,
                split='valid'),
           os.path.join(output_dir, "kwargs.json"))
In [24]:
# Locate the Keras output tensor for the chosen head, e.g. "profile/Oct4"
# -> dtype="profile", task="Oct4".
dtype, task = grad_wrt.split("/")
out = model.outputs[ds.task2idx(task, dtype)]
inp = model.inputs[0]
In [25]:
# Build per-strand gradient-w.r.t.-input functions for the chosen head.
# For profile heads, `out` is (batch, seq_len, 2) with channels 0/1 = strands;
# for count heads it is (batch, 2).
if "counts" in grad_wrt or summary == "count":
    pos_strand_ginp_max = K.function([inp], K.gradients(out[:, 0], inp))
    neg_strand_ginp_max = K.function([inp], K.gradients(out[:, 1], inp))
else:
    if summary == 'weighted':
        print("Using weighted stat")
        # Softmax-weighted sum across positions, with the weights detached
        # (stop_gradient) so only the profile values are differentiated.
        pos_strand_ginp_max = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(out[:, :, 0])) *
                                                     out[:, :, 0], axis=-1), inp))
        neg_strand_ginp_max = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(out[:, :, 1])) *
                                                     out[:, :, 1], axis=-1), inp))
    elif summary == "max":
        print("Using max stat")
        pos_strand_ginp_max = K.function([inp],
                                         K.gradients(K.max(out[:, :, 0], axis=1), inp))
        neg_strand_ginp_max = K.function([inp],
                                         K.gradients(K.max(out[:, :, 1], axis=1), inp))
    else:
        print("Using avg stat")
        pos_strand_ginp_max = K.function([inp],
                                         K.gradients(K.mean(out[:, :, 0], axis=1), inp))
        neg_strand_ginp_max = K.function([inp],
                                         K.gradients(K.mean(out[:, :, 1], axis=1), inp))

# Pre-compute the predictions and bottlenecks
# NOTE(review): y_true, igrads_pos and igrads_neg are computed but never used
# below — dead code or meant for a later cell; confirm.
y_true = y[grad_wrt]
grads_pos = np.concatenate([pos_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
grads_neg = np.concatenate([neg_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
# input * gradient attributions
igrads_pos = grads_pos * x
igrads_neg = grads_neg * x

# Flatten per-example gradient maps for the strand-agreement check
grads_pos_ext = grads_pos.reshape((grads_pos.shape[0], -1))
grads_neg_ext = grads_neg.reshape((grads_neg.shape[0], -1))

# compute the distances
# Per-example correlation distance between the two strands' gradient maps;
# a small distance means the strands agree on where the signal is.
distances = np.array([correlation(grads_neg_ext[i], grads_pos_ext[i])
                      for i in range(len(grads_neg_ext))])

# Setup different scores
top_distances = distances < hp.modisco.max_strand_distance  # filter sites
# Strand-summed gradients, mean-centered across the 4 bases ->
# "hypothetical" contribution scores; actual scores = hyp * one-hot.
hyp_scores = grads_pos + grads_neg
hyp_scores = hyp_scores[top_distances]
hyp_scores = hyp_scores - hyp_scores.mean(-1, keepdims=True)
onehot_data = x[top_distances]
scores = hyp_scores * onehot_data

# -------------------------------------------------------------
# run modisco
# Drop the kwargs that belong to this script, not to TfModiscoWorkflow.
kw = hp.modisco.get_modisco_kwargs()
del kw['max_strand_distance']
del kw['threshold_for_counting_sign']
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(**kw,
    seqlets_to_patterns_factory=
        modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            trim_to_window_size=15,
            initial_flank_to_add=5,
            kmer_len=5, num_gaps=1,
            num_mismatches=0,
            final_min_cluster_size=60))(
    task_names=[grad_wrt],
    contrib_scores={grad_wrt: scores},
    hypothetical_contribs={grad_wrt: hyp_scores},
    one_hot=onehot_data)
# -------------------------------------------------------------
# save the results
output_path = os.path.join(output_dir, "modisco.h5")
if os.path.exists(output_path):
    print("Output path exists")
    # NOTE(review): this yields ".../modisco.h5backup.h5" — probably intended
    # a distinct name like "modisco.backup.h5"; confirm.
    output_path += "backup.h5"

# NOTE(review): no mode argument (h5py default is append) and the file is
# never closed — consider h5py.File(output_path, "w") in a context manager.
grp = h5py.File(output_path)
tfmodisco_results.save_hdf5(grp)

# dump the top distances to file
top_distances.dump("{0}/included_samples.npy".format(output_dir))
distances.dump("{0}/distances.npy".format(output_dir))
Using weighted stat
On task profile/Oct4
Computing windowed sums
Computing threshold
Thresholds: -0.33819581137428284 and 3.643135
CDFs: 0.9995408362970394 and 0.9998399517486236
Est. FDRs: 0.009999067733378211 and 0.009999444739060072
Got 5325 coords
After resolving overlaps, got 5325 seqlets
Across all tasks, the weakest laplace threshold used was: 0.9995408362970394
5325 identified in total
2 activity patterns with support >= 50 out of 3 possible patterns
Metacluster sizes:  [3610, 1715]
Idx to activities:  {0: '1', 1: '-1'}
On metacluster 1
Metacluster size 1715
Relevant tasks:  ('profile/Oct4',)
Relevant signs:  (-1,)
(Round 1) num seqlets: 1715
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.46 s
Starting affinity matrix computations
Normalization computed in 0.02 s
Cosine similarity mat computed in 0.07 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.05 s
Finished affinity matrix computations in 0.13 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.16 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 37.29 s
Launching nearest neighbors affmat calculation job
Job completed in: 8.78 s
(Round 1) Computed affinity matrix on nearest neighbors in 48.06 s
Filtered down to 1415 of 1715
(Round 1) Retained 1415 rows out of 1715 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 1415 samples in 0.001s...
[t-SNE] Computed neighbors for 1415 samples in 0.033s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1415
[t-SNE] Computed conditional probabilities for sample 1415 / 1415
[t-SNE] Mean sigma: 0.261114
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.37522411346435547 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    0.9s finished
Louvain completed 200 runs in 2.774407386779785 seconds
Wrote graph to binary file in 3.060058116912842 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.676736
Louvain completed 51 runs in 15.135682344436646 seconds
Preproc + Louvain took 21.561079740524292 s
Got 8 clusters after round 1
Counts:
{0: 446, 5: 63, 1: 340, 6: 46, 3: 194, 4: 68, 2: 242, 7: 16}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 446 seqlets
Trimmed 62 out of 446
Dropping cluster 0 with 384 seqlets due to sign disagreement
Aggregating for cluster 1 with 340 seqlets
Trimmed 33 out of 340
Aggregating for cluster 2 with 242 seqlets
Trimmed 14 out of 242
Dropping cluster 2 with 228 seqlets due to sign disagreement
Aggregating for cluster 3 with 194 seqlets
Trimmed 39 out of 194
Dropping cluster 3 with 155 seqlets due to sign disagreement
Aggregating for cluster 4 with 68 seqlets
Trimmed 0 out of 68
Dropping cluster 4 with 68 seqlets due to sign disagreement
Aggregating for cluster 5 with 63 seqlets
Trimmed 1 out of 63
Aggregating for cluster 6 with 46 seqlets
Trimmed 3 out of 46
Dropping cluster 6 with 43 seqlets due to sign disagreement
Aggregating for cluster 7 with 16 seqlets
Trimmed 0 out of 16
Dropping cluster 7 with 16 seqlets due to sign disagreement
(Round 2) num seqlets: 369
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.11 s
Starting affinity matrix computations
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.01 s
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.01 s
Finished affinity matrix computations in 0.02 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.01 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 0.86 s
Launching nearest neighbors affmat calculation job
Job completed in: 0.68 s
(Round 2) Computed affinity matrix on nearest neighbors in 1.77 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 369 samples in 0.000s...
[t-SNE] Computed neighbors for 369 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 369 / 369
[t-SNE] Mean sigma: 0.229318
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.17434453964233398 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
Louvain completed 200 runs in 0.9661059379577637 seconds
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    0.6s finished
Wrote graph to binary file in 0.1791989803314209 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.641428
After 3 runs, maximum modularity is Q = 0.645067
After 8 runs, maximum modularity is Q = 0.647745
After 37 runs, maximum modularity is Q = 0.647909
After 71 runs, maximum modularity is Q = 0.648222
Louvain completed 121 runs in 32.915130853652954 seconds
Preproc + Louvain took 34.270671367645264 s
Got 7 clusters after round 2
Counts:
{3: 35, 1: 97, 5: 23, 0: 108, 4: 29, 6: 15, 2: 62}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 108 seqlets
Trimmed 8 out of 108
Aggregating for cluster 1 with 97 seqlets
Trimmed 7 out of 97
Aggregating for cluster 2 with 62 seqlets
Trimmed 0 out of 62
Aggregating for cluster 3 with 35 seqlets
Trimmed 2 out of 35
Aggregating for cluster 4 with 29 seqlets
Trimmed 5 out of 29
Aggregating for cluster 5 with 23 seqlets
Trimmed 1 out of 23
Aggregating for cluster 6 with 15 seqlets
Trimmed 2 out of 15
Got 7 clusters
Splitting into subclusters...
Inspecting for spurious merging
Wrote graph to binary file in 0.06656479835510254 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00454706
After 2 runs, maximum modularity is Q = 0.00530925
After 6 runs, maximum modularity is Q = 0.00548712
After 7 runs, maximum modularity is Q = 0.00551219
After 11 runs, maximum modularity is Q = 0.00560749
Louvain completed 31 runs in 9.42081594467163 seconds
Similarity is 0.85844964; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.0505828857421875 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00542908
After 3 runs, maximum modularity is Q = 0.00567055
After 17 runs, maximum modularity is Q = 0.00582805
After 35 runs, maximum modularity is Q = 0.00582981
Louvain completed 55 runs in 15.85268521308899 seconds
Similarity is 0.76708347; is_dissimilar is True
Got 2 subclusters
Inspecting for spurious merging
Wrote graph to binary file in 0.030530214309692383 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00492936
After 2 runs, maximum modularity is Q = 0.00831243
Louvain completed 22 runs in 6.374368667602539 seconds
Similarity is 0.9044827; is_dissimilar is False
Merging on 8 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 8 patterns after merging
Performing seqlet reassignment
Cross contin jaccard time taken: 0.03 s
Cross contin jaccard time taken: 0.03 s
Discarded 2 seqlets
Got 2 patterns after reassignment
Total time taken is 154.91s
On metacluster 0
Metacluster size 3610
Relevant tasks:  ('profile/Oct4',)
Relevant signs:  (1,)
(Round 1) num seqlets: 3610
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.95 s
Starting affinity matrix computations
Normalization computed in 0.04 s
Cosine similarity mat computed in 0.17 s
Normalization computed in 0.02 s
Cosine similarity mat computed in 0.13 s
Finished affinity matrix computations in 0.3 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.51 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 16.95 s
Launching nearest neighbors affmat calculation job
Job completed in: 17.6 s
(Round 1) Computed affinity matrix on nearest neighbors in 38.12 s
Filtered down to 2680 of 3610
(Round 1) Retained 2680 rows out of 3610 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 2680 samples in 0.007s...
[t-SNE] Computed neighbors for 2680 samples in 0.084s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2680
[t-SNE] Computed conditional probabilities for sample 2000 / 2680
[t-SNE] Computed conditional probabilities for sample 2680 / 2680
[t-SNE] Mean sigma: 0.254649
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.4554471969604492 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    1.3s finished
Louvain completed 200 runs in 6.570242404937744 seconds
Wrote graph to binary file in 8.831515073776245 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.723989
After 2 runs, maximum modularity is Q = 0.72726
After 4 runs, maximum modularity is Q = 0.728488
After 16 runs, maximum modularity is Q = 0.732083
Louvain completed 66 runs in 29.063341856002808 seconds
Preproc + Louvain took 45.51748490333557 s
Got 12 clusters after round 1
Counts:
{2: 350, 0: 484, 6: 154, 8: 112, 5: 271, 3: 338, 10: 53, 7: 130, 4: 320, 1: 358, 9: 84, 11: 26}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 484 seqlets
Trimmed 18 out of 484
Aggregating for cluster 1 with 358 seqlets
Trimmed 14 out of 358
Aggregating for cluster 2 with 350 seqlets
Trimmed 13 out of 350
Aggregating for cluster 3 with 338 seqlets
Trimmed 21 out of 338
Aggregating for cluster 4 with 320 seqlets
Trimmed 33 out of 320
Aggregating for cluster 5 with 271 seqlets
Trimmed 15 out of 271
Aggregating for cluster 6 with 154 seqlets
Trimmed 6 out of 154
Aggregating for cluster 7 with 130 seqlets
Trimmed 5 out of 130
Aggregating for cluster 8 with 112 seqlets
Trimmed 5 out of 112
Aggregating for cluster 9 with 84 seqlets
Trimmed 7 out of 84
Aggregating for cluster 10 with 53 seqlets
Trimmed 3 out of 53
Aggregating for cluster 11 with 26 seqlets
Trimmed 5 out of 26
(Round 2) num seqlets: 2535
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.57 s
Starting affinity matrix computations
Normalization computed in 0.03 s
Cosine similarity mat computed in 0.09 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.07 s
Finished affinity matrix computations in 0.17 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.26 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 5.12 s
Launching nearest neighbors affmat calculation job
Job completed in: 5.02 s
(Round 2) Computed affinity matrix on nearest neighbors in 11.95 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 2535 samples in 0.006s...
[t-SNE] Computed neighbors for 2535 samples in 0.101s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2535
[t-SNE] Computed conditional probabilities for sample 2000 / 2535
[t-SNE] Computed conditional probabilities for sample 2535 / 2535
[t-SNE] Mean sigma: 0.250146
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.4032557010650635 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    1.2s finished
Louvain completed 200 runs in 5.9142937660217285 seconds
Wrote graph to binary file in 6.132201194763184 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.742676
After 5 runs, maximum modularity is Q = 0.749165
After 6 runs, maximum modularity is Q = 0.752129
Louvain completed 56 runs in 22.1968731880188 seconds
Preproc + Louvain took 35.24865126609802 s
Got 15 clusters after round 2
Counts:
{2: 258, 1: 263, 4: 204, 0: 308, 8: 143, 3: 251, 6: 181, 7: 170, 13: 80, 14: 55, 9: 139, 12: 83, 5: 186, 11: 100, 10: 114}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 308 seqlets
Trimmed 30 out of 308
Aggregating for cluster 1 with 263 seqlets
Trimmed 5 out of 263
Aggregating for cluster 2 with 258 seqlets
Trimmed 0 out of 258
Aggregating for cluster 3 with 251 seqlets
Trimmed 33 out of 251
Aggregating for cluster 4 with 204 seqlets
Trimmed 0 out of 204
Aggregating for cluster 5 with 186 seqlets
Trimmed 23 out of 186
Aggregating for cluster 6 with 181 seqlets
Trimmed 4 out of 181
Aggregating for cluster 7 with 170 seqlets
Trimmed 13 out of 170
Aggregating for cluster 8 with 143 seqlets
Trimmed 34 out of 143
Aggregating for cluster 9 with 139 seqlets
Trimmed 23 out of 139
Aggregating for cluster 10 with 114 seqlets
Trimmed 14 out of 114
Aggregating for cluster 11 with 100 seqlets
Trimmed 0 out of 100
Aggregating for cluster 12 with 83 seqlets
Trimmed 7 out of 83
Aggregating for cluster 13 with 80 seqlets
Trimmed 2 out of 80
Aggregating for cluster 14 with 55 seqlets
Trimmed 5 out of 55
Got 15 clusters
Splitting into subclusters...
Inspecting for spurious merging
Wrote graph to binary file in 0.296783447265625 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00472315
After 2 runs, maximum modularity is Q = 0.00631509
After 3 runs, maximum modularity is Q = 0.00632822
Louvain completed 23 runs in 8.056605100631714 seconds
Similarity is 0.9419943; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.25379395484924316 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00794286
After 2 runs, maximum modularity is Q = 0.00797306
After 16 runs, maximum modularity is Q = 0.00797785
Louvain completed 36 runs in 22.39340329170227 seconds
Similarity is 0.9288262; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.3412294387817383 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00636977
After 2 runs, maximum modularity is Q = 0.00641839
After 3 runs, maximum modularity is Q = 0.00645823
After 8 runs, maximum modularity is Q = 0.00646772
Louvain completed 28 runs in 9.60759425163269 seconds
Similarity is 0.93967634; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.23006534576416016 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0113641
After 2 runs, maximum modularity is Q = 0.0113964
After 4 runs, maximum modularity is Q = 0.0115468
After 5 runs, maximum modularity is Q = 0.0115936
After 6 runs, maximum modularity is Q = 0.011597
After 11 runs, maximum modularity is Q = 0.0116062
Louvain completed 31 runs in 10.458629846572876 seconds
Similarity is 0.86322045; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.21857786178588867 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00757235
After 2 runs, maximum modularity is Q = 0.00760268
After 3 runs, maximum modularity is Q = 0.00760287
Louvain completed 23 runs in 7.7971413135528564 seconds
Similarity is 0.9284465; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.12232851982116699 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00634273
After 2 runs, maximum modularity is Q = 0.00646172
After 6 runs, maximum modularity is Q = 0.00860881
Louvain completed 26 runs in 8.108899593353271 seconds
Similarity is 0.9217995; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.4949381351470947 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0109874
Louvain completed 21 runs in 13.933294296264648 seconds
Similarity is 0.89337415; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.14215493202209473 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00635245
Louvain completed 21 runs in 6.44206690788269 seconds
Similarity is 0.9372116; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.08538937568664551 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00681553
After 3 runs, maximum modularity is Q = 0.00712644
After 15 runs, maximum modularity is Q = 0.00731753
Louvain completed 35 runs in 12.090519666671753 seconds
Similarity is 0.9628205; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.07422494888305664 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00976029
Louvain completed 21 runs in 7.221029758453369 seconds
Similarity is 0.8586619; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.06100320816040039 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0151498
After 2 runs, maximum modularity is Q = 0.0158198
Louvain completed 22 runs in 7.900119304656982 seconds
Similarity is 0.82307523; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.07787370681762695 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0105244
Louvain completed 21 runs in 6.973620176315308 seconds
Similarity is 0.9011462; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.040665626525878906 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00580514
After 4 runs, maximum modularity is Q = 0.0058267
After 5 runs, maximum modularity is Q = 0.00620941
After 12 runs, maximum modularity is Q = 0.00641407
Louvain completed 32 runs in 10.344244718551636 seconds
Similarity is 0.9366569; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.038019657135009766 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0106798
After 3 runs, maximum modularity is Q = 0.0107301
Louvain completed 23 runs in 7.365370035171509 seconds
Similarity is 0.893922; is_dissimilar is False
Merging on 15 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 2 & 7 with prob 0.0002494932342301131 and sim 0.9590264184252105
Collapsing 7 & 8 with prob 0.00016236097956396606 and sim 0.9538156269158113
Collapsing 1 & 4 with prob 0.00019667581460403413 and sim 0.9438356106760856
Collapsing 3 & 8 with prob 0.0004916761299843828 and sim 0.9429691971988887
Collapsing 2 & 8 with prob 0.00035969893650843985 and sim 0.9406957085961495
Collapsing 1 & 2 with prob 0.00012264996261004296 and sim 0.921204065353995
Aborting collapse as 1 & 3 have prob 4.4936410332079764e-07 and sim 0.7935097575592722
Aborting collapse as 3 & 4 have prob 2.0221687708512117e-08 and sim 0.7468837138837048
Aborting collapse as 4 & 7 have prob 2.0391389982293315e-07 and sim 0.8261960991427079
Collapsing 7 & 11 with prob 1.2444975842920631e-05 and sim 0.9146443355222127
Aborting collapse as 3 & 11 have prob 4.5776835084975727e-07 and sim 0.7809587883145455
Aborting collapse as 8 & 11 have prob 2.963591090904253e-06 and sim 0.8262190781346793
Collapsing 2 & 4 with prob 1.4732457442762477e-05 and sim 0.9130088388215073
Aborting collapse as 1 & 3 have prob 4.4936410332079764e-07 and sim 0.7935097575592722
Aborting collapse as 3 & 4 have prob 2.0221687708512117e-08 and sim 0.7468837138837048
Aborting collapse as 4 & 7 have prob 2.0391389982293315e-07 and sim 0.8261960991427079
Collapsing 0 & 4 with prob 9.134099299913802e-05 and sim 0.9120626828054953
Collapsing 3 & 7 with prob 2.5526467319431598e-05 and sim 0.9093910766141818
Collapsing 0 & 1 with prob 0.00019104791506962702 and sim 0.9091228782906665
Collapsing 2 & 3 with prob 0.00010919205172411359 and sim 0.9047372983644826
Collapsing 3 & 6 with prob 0.00027504948584693335 and sim 0.8937742100537399
Collapsing 2 & 12 with prob 8.350780444373553e-05 and sim 0.873035896950215
Aborting collapse as 3 & 12 have prob 1.647460854077765e-07 and sim 0.8165987396375693
Aborting collapse as 6 & 12 have prob 3.933846247853654e-06 and sim 0.7741985831778433
Collapsing 6 & 8 with prob 0.00017111834593558814 and sim 0.8521558438819871
Collapsing 5 & 11 with prob 0.00018169580757238302 and sim 0.8497637126172312
Collapsing 2 & 6 with prob 0.00019079583178076862 and sim 0.8440441913262626
Trimmed 0 out of 415
Trimmed 0 out of 524
Trimmed 0 out of 462
Trimmed 3 out of 742
Trimmed 3 out of 740
Trimmed 3 out of 916
Trimmed 1 out of 263
On merging iteration 2
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 1 & 6 with prob 0.00012800721729785137 and sim 0.8809386946528662
Collapsing 2 & 4 with prob 1.756582456553501e-05 and sim 0.8718924204698266
Collapsing 0 & 1 with prob 0.0010567121315225927 and sim 0.8624840998266177
Trimmed 0 out of 813
Trimmed 10 out of 362
Trimmed 8 out of 1726
On merging iteration 3
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 5 patterns after merging
Performing seqlet reassignment
Cross contin jaccard time taken: 0.02 s
Cross contin jaccard time taken: 0.02 s
Got 4 patterns after reassignment
Total time taken is 319.53s
In [26]:
def ic_scale(x, background=(0.27, 0.23, 0.23, 0.27)):
    """Scale a base-frequency matrix by information content for logo plotting.

    Args:
        x: per-position base frequencies — assumed shape (seq_len, 4) in
            A, C, G, T order (TODO confirm against viz_sequence.ic_scale).
        background: background base frequencies used as the reference
            distribution. Defaults to the GC-content-adjusted values this
            analysis used for mm10 (previously hard-coded).

    Returns:
        The matrix rescaled by `modisco.visualization.viz_sequence.ic_scale`.
    """
    from modisco.visualization import viz_sequence
    return viz_sequence.ic_scale(x, background=np.array(background))
In [27]:
# Load the modisco result and the subset of samples that went into it.
from basepair.modisco import ModiscoResult
mr = ModiscoResult(output_dir + "/modisco.h5")
incl = np.load(output_dir + "/included_samples.npy")
tasks = list(ds.task_specs)

# Plotting configuration.
legend = False
rc_vec = None          # optional per-pattern flags: reverse-complement pattern i
rc_fn = lambda x: x[::-1, ::-1]  # reverse-complement a (position, channel) array
start_vec = None       # optional per-pattern start offsets for trimming to `width`
n_limit = 35           # skip patterns supported by fewer seqlets than this
seq_height = 1.5       # relative height of the sequence-logo panel
n_bootstrap = 100      # bootstrap replicates for the profile error bands
                       # (was assigned None and then immediately overwritten
                       #  with 100; the dead duplicate assignment is removed)
figsize = (12, 2.5)

# Per-pattern figure path template. The original template was just the
# directory with no "{}" placeholder, so `.format(i)` was a no-op and every
# pattern overwrote the same ".png"/".pdf" file; the placeholder fixes that.
fig_dir = output_dir + "_figures/"
os.makedirs(fig_dir, exist_ok=True)
fpath_template = fig_dir + "pattern_{}"

# Per-task profile tracks restricted to the included samples.
tracks = {task: y[f"profile/{task}"][incl] for task in tasks}
# NOTE(review): overwrites `x` in place — the cell is not idempotent, since
# re-running it subsets `x` a second time.
x = x[incl]

import matplotlib.pyplot as plt
from concise.utils.plot import seqlogo_fig, seqlogo

print(mr.patterns())
['pattern_0', 'pattern_1', 'pattern_2', 'pattern_3']
In [28]:
def bootstrap_mean(x, n=100):
    """Estimate the axis-0 mean of `x` by bootstrap resampling.

    Args:
        x: array-like of shape (n_samples, ...); the first axis is resampled
            with replacement.
        n: number of bootstrap replicates.

    Returns:
        Tuple `(mean, std)` — element-wise mean and standard deviation of the
        `n` replicate means (the std is the bootstrap standard error).
    """
    replicate_means = []
    for _ in range(n):
        # Draw len(x) indices with replacement. The original routed this
        # through pd.Series(...).sample, which needlessly depended on pandas
        # (only imported in a later cell); numpy does it directly.
        idx = np.random.randint(0, len(x), size=len(x))
        replicate_means.append(x[idx].mean(0))
    outm = np.stack(replicate_means)
    return outm.mean(0), outm.std(0)
In [29]:
import pandas as pd
from matplotlib.ticker import FormatStrFormatter
from basepair.plots import show_figure

# y-axis limits for the profile panels. May alternatively be a list of
# per-track [lo, hi] pairs (the plot loop checks `isinstance(ylim[0], list)`).
ylim=[0, 3]
In [30]:
%matplotlib inline
for i, pattern in enumerate(mr.patterns()):

    j = i
    seqs = mr.extract_signal(x)[pattern]
    sequence = ic_scale(seqs.mean(axis=0))
    if rc_vec is not None and rc_vec[i]:
        sequence = rc_fn(sequence)
    if start_vec is not None:
        start = start_vec[i]
        sequence = sequence[start:(start + width)]
    n = len(seqs)
    if n < n_limit:
        continue
    fig, ax = plt.subplots(1 + len(tracks), 1, sharex=True,
                           figsize=figsize,
                           gridspec_kw={'height_ratios': [1] * len(tracks) + [seq_height]})
    ax[0].set_title(f"motif {i} ({n})")
    for i, (k, y) in enumerate(tracks.items()):
        signal = mr.extract_signal(y, rc_fn)[pattern]

        if start_vec is not None:
            start = start_vec[i]
            signal = signal[:, start:(start + width)]

        if n_bootstrap is None:
            signal_mean = signal.mean(axis=0)
            signal_std = signal.std(axis=0)
        else:
            signal_mean, signal_std = bootstrap_mean(signal, n=n_bootstrap)
        if rc_vec is not None and rc_vec[i]:
            signal_mean = rc_fn(signal_mean)
            signal_std = rc_fn(signal_std)

        ax[i].plot(np.arange(1, len(signal_mean) + 1), signal_mean[:, 0], label='pos')
        if n_bootstrap is not None:
            ax[i].fill_between(np.arange(1, len(signal_mean) + 1),
                               signal_mean[:, 0] - 2 * signal_std[:, 0],
                               signal_mean[:, 0] + 2 * signal_std[:, 0],
                               alpha=0.1)
        #                   label='pos')
        ax[i].plot(np.arange(1, len(signal_mean) + 1), signal_mean[:, 1], label='neg')
        if n_bootstrap is not None:
            ax[i].fill_between(np.arange(1, len(signal_mean) + 1),
                               signal_mean[:, 1] - 2 * signal_std[:, 1],
                               signal_mean[:, 1] + 2 * signal_std[:, 1],
                               alpha=0.1)
        #                   label='pos')
        ax[i].set_ylabel(f"{k}")
        ax[i].yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['right'].set_visible(False)
        ax[i].spines['bottom'].set_visible(False)
        ax[i].xaxis.set_ticks_position('none')
        if isinstance(ylim[0], list):
            ax[i].set_ylim(ylim[i])
        if legend:
            ax[i].legend()

    seqlogo(sequence, ax=ax[-1])
    ax[-1].yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    ax[-1].set_ylabel("Inf. content")
    ax[-1].spines['top'].set_visible(False)
    ax[-1].spines['right'].set_visible(False)
    ax[-1].spines['bottom'].set_visible(False)
    ax[-1].set_xticks(list(range(0, len(sequence) + 1, 5)))
    
    if fpath_template is not None:
        plt.savefig(fpath_template.format(j) + '.png', dpi=600)
        plt.savefig(fpath_template.format(j) + '.pdf', dpi=600)
        plt.close(fig)    # close the figure
        show_figure(fig)
        plt.show()