Goal

  • train a decent sequence → profile + counts model for ChIP-seq data

Resources

  • washu
    • session: 7-w-sox2-oct4-chipseq
In [1]:
import basepair
import modisco
Using TensorFlow backend.
Can not use cuDNN on context None: cannot compile with cuDNN. We got this error:
b'/usr/bin/ld: cannot find -lcudnn\ncollect2: error: ld returned 1 exit status\n'
Mapped name None to device cuda: GeForce GTX TITAN X (0000:09:00.0)
In [2]:
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from basepair.preproc import AppendTotalCounts
from basepair.config import get_data_dir, create_tf_session
2018-07-29 13:39:26,854 [WARNING] git-lfs not installed
2018-07-29 13:39:26,896 [WARNING] git-lfs not installed
In [3]:
# Use gpus 1, 3, 5, 7
# NOTE(review): the GPU context appears to be initialized already during the
# imports above (see the "Mapped name None to device cuda" message), so
# setting CUDA_VISIBLE_DEVICES here may have no effect — confirm it is honored.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1, 3, 5, 7"
In [4]:
ddir = get_data_dir()
In [5]:
ddir
Out[5]:
'/srv/scratch/amr1/chipseq/basepair/basepair/../data'
In [6]:
bdir = "/srv/scratch/amr1/chipseq/sox2-oct4-chipseq/"

# Build one TaskSpec per TF: strand-specific count bigwigs plus the
# IDR-filtered peak file for that task.
# NOTE(review): Oct4 uses pos2.bw/neg2.bw while Sox2 uses pos.bw/neg.bw —
# confirm this asymmetry is intentional.
task_specs = {}
task_specs["Sox2"] = TaskSpec(task="Sox2",
                              pos_counts=f"{bdir}/Sox2/pos.bw",
                              neg_counts=f"{bdir}/Sox2/neg.bw",
                              peaks=f"{bdir}/Sox2/Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz")
task_specs["Oct4"] = TaskSpec(task="Oct4",
                              pos_counts=f"{bdir}/Oct4/pos2.bw",
                              neg_counts=f"{bdir}/Oct4/neg2.bw",
                              peaks=f"{bdir}/Oct4/Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz")

ds = DataSpec(task_specs=task_specs,
              fasta_file="/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta")
In [7]:
def ds2bws(ds):
    """Map each task in a DataSpec to its pos/neg count bigwig file paths."""
    bigwigs = {}
    for task, task_spec in ds.task_specs.items():
        bigwigs[task] = {"pos": task_spec.pos_counts, "neg": task_spec.neg_counts}
    return bigwigs
In [8]:
# Get the training data
# Peaks are resized to 1000 bp; each split unpacks as (x, y, ...) with
# x the sequence inputs and y the target dict (see usage below)
train, valid, test = chip_exo_nexus(ds, peak_width=1000)
100%|██████████| 23474/23474 [00:00<00:00, 496864.06it/s]
2018-07-29 13:39:30,716 [INFO] extract sequence
2018-07-29 13:39:33,728 [INFO] extract counts
100%|██████████| 2/2 [00:18<00:00,  9.02s/it]
In [9]:
# Pre-process the data: fit the total-count appender on the training
# targets only (avoids leakage), then transform every split with it.
preproc = AppendTotalCounts()
preproc.fit(train[1])
for split in (train, valid, test):
    split[1] = preproc.transform(split[1])
In [10]:
train[1].keys()
Out[10]:
dict_keys(['profile/Sox2', 'profile/Oct4', 'counts/Sox2', 'counts/Oct4'])
In [11]:
tasks=['Sox2', 'Oct4']
In [12]:
# TODO - play around with this
def seq_multitask_chipseq(filters=21,
                          conv1_kernel_size=21,
                          tconv_kernel_size=25,
                          #tconv_kernel_size2=25,
                          n_dil_layers=6,
                          lr=0.004,
                          c_task_weight=100,
                          use_profile=True,
                          use_counts=True,
                          tasks=tasks,
                          seq_len=201):
    """Build a multi-task Keras model mapping one-hot DNA sequence to
    per-base two-strand ChIP-seq profiles and/or total counts per task.

    Args:
      filters: number of filters in every convolutional layer
      conv1_kernel_size: kernel width of the first conv layer
      tconv_kernel_size: kernel width of the transposed-conv (de-conv) layer
      n_dil_layers: number of dilated conv layers (dilation rate 2**i)
      lr: Adam learning rate
      c_task_weight: how much to up-weight the count-prediction task loss
      use_profile: if True, add one 2-channel per-base profile head per task
      use_counts: if True, add one 2-unit total-count head per task
      tasks: task names; defaults to the module-level `tasks` list, bound
        once at definition time
      seq_len: input sequence length

    Returns:
      A compiled keras Model with one profile and/or count output per task.

    NOTE(review): this cell relies on `kl`, `Model`, `Adam` and
    `twochannel_multinomial_nll`, which are imported in LATER cells — it
    fails on a fresh Restart & Run All; move the imports above this cell.
    """
    # TODO - build the reverse complement symmetry into the model
    inp = kl.Input(shape=(seq_len, 4), name='seq')
    first_conv = kl.Conv1D(filters,
                           kernel_size=conv1_kernel_size,
                           padding='same',
                           activation='relu')(inp)

    # Dilated conv stack with dense skip connections: each layer takes the
    # element-wise sum of all previous layer outputs as input.
    prev_layers = [first_conv]
    for i in range(1, n_dil_layers + 1):
        if i == 1:
            prev_sum = first_conv
        else:
            prev_sum = kl.add(prev_layers)
        conv_output = kl.Conv1D(filters, kernel_size=3, padding='same', activation='relu', dilation_rate=2**i)(prev_sum)
        prev_layers.append(conv_output)

    combined_conv = kl.add(prev_layers)

    # De-conv
    # Conv2DTranspose needs a 4D tensor, so add a singleton spatial axis and
    # remove it afterwards; output has 2 channels (strands) per task.
    x = kl.Reshape((-1, 1, filters))(combined_conv)
    x = kl.Conv2DTranspose(2*len(tasks), kernel_size=(tconv_kernel_size, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(len(tasks), kernel_size=(tconv_kernel_size2, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(int(len(tasks)/2), kernel_size=(tconv_kernel_size3, 1), padding='same')(x)
    out = kl.Reshape((-1, 2 * len(tasks)))(x)

    # setup the output branches
    outputs = []
    losses = []
    loss_weights = []

    if use_profile:
        # Slice channel pair (2*i, 2*i+1) of the shared de-conv output into a
        # named 2-strand profile head for task i.
        output = [kl.Lambda(lambda x, i: x[:, :, (2 * i):(2 * i + 2)],
                            output_shape=(seq_len, 2),
                            name="profile/" + task,
                            arguments={"i": i})(out)
                  for i, task in enumerate(tasks)]
        outputs += output
        losses += [twochannel_multinomial_nll] * len(tasks)
        loss_weights += [1] * len(tasks)

    if use_counts:
        # Count heads: global average pooling over positions -> Dense(2)
        pooled = kl.GlobalAvgPool1D()(combined_conv)
        counts = [kl.Dense(2, name="counts/" + task)(pooled)
                  for task in tasks]
        outputs += counts
        losses += ["mse"] * len(tasks)
        loss_weights += [c_task_weight] * len(tasks)

    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=losses, loss_weights=loss_weights)
    return model

Set up a new model with two output branches: per-base profile and total counts.

In [13]:
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
In [14]:
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
In [15]:
i=1
In [16]:
def get_model(mfn, mkwargs, fkwargs, i):
    """Instantiate the model-building function named `mfn`.

    Args:
      mfn: name of a model-building function defined in this notebook's
        global namespace (e.g. "seq_multitask_chipseq")
      mkwargs: hyper-parameter kwargs, encoded into the model name
      fkwargs: fixed kwargs passed to the builder but not part of the name
      i: integer version suffix appended to the model name

    Returns:
      (model, name, ckp_file): the built model, its descriptive name and
      the checkpoint file path.
    """
    mdir = f"{ddir}/processed/chipseq/exp/models/count+profile"
    # Encode hyper-parameters and version into the model name
    name = mfn + "_" + \
            ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
            "." + str(i)
    # Pure-Python replacement for the `!mkdir -p {mdir}` shell magic
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    all_kwargs = {**mkwargs, **fkwargs}
    # Look the builder up by name instead of eval() — equivalent but safer
    return globals()[mfn](**all_kwargs), name, ckp_file

Train the model

  • add the training-curve plot below the training output
In [17]:
# hyper-parameters
mfn = "seq_multitask_chipseq"
use_profile = True
use_counts = True
mkwargs = dict(filters=32, 
               conv1_kernel_size=21,
               tconv_kernel_size=100,
               n_dil_layers=6,
               use_profile=use_profile,
               use_counts=use_counts,
               c_task_weight=10,
               seq_len=train[0].shape[1],  # infer input length from the data
               lr=0.004)
# kwargs passed to the builder but not encoded in the model name
fixed_kwargs = dict(
    tasks=list(ds.task_specs)
)
In [18]:
import numpy as np
# Fix the seed for reproducible initialization / shuffling
np.random.seed(20)
# Bump the version counter so checkpoint names do not collide across runs
i += 1
model, name, ckp_file = get_model(mfn, mkwargs, fixed_kwargs, i)
# Early-stop on validation loss; only the best checkpoint is kept on disk
history = model.fit(train[0], 
                    train[1],
          batch_size=256, 
          epochs=100,
          validation_data=valid[:2],
          callbacks=[EarlyStopping(patience=5),
                     History(),
                     ModelCheckpoint(ckp_file, save_best_only=True)]
         )

# Reload the best (lowest val_loss) checkpoint saved by ModelCheckpoint;
# the custom loss must be passed explicitly for deserialization
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-07-29 13:39:54,744 [WARNING] From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-07-29 13:40:02,917 [WARNING] From /users/amr1/miniconda3/envs/basepair/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
Train on 14727 samples, validate on 4493 samples
Epoch 1/100
14727/14727 [==============================] - 11s 758us/step - loss: 616.3036 - profile/Sox2_loss: 168.1244 - profile/Oct4_loss: 425.4177 - counts/Sox2_loss: 1.0797 - counts/Oct4_loss: 1.1964 - val_loss: 611.2884 - val_profile/Sox2_loss: 167.3049 - val_profile/Oct4_loss: 423.6422 - val_counts/Sox2_loss: 1.0032 - val_counts/Oct4_loss: 1.0309
Epoch 2/100
14727/14727 [==============================] - 5s 330us/step - loss: 589.9961 - profile/Sox2_loss: 164.2339 - profile/Oct4_loss: 405.7602 - counts/Sox2_loss: 0.9990 - counts/Oct4_loss: 1.0012 - val_loss: 590.3192 - val_profile/Sox2_loss: 164.3301 - val_profile/Oct4_loss: 405.7246 - val_counts/Sox2_loss: 1.0015 - val_counts/Oct4_loss: 1.0250
Epoch 3/100
14727/14727 [==============================] - 5s 332us/step - loss: 577.3825 - profile/Sox2_loss: 162.3700 - profile/Oct4_loss: 395.0311 - counts/Sox2_loss: 0.9975 - counts/Oct4_loss: 1.0006 - val_loss: 586.8753 - val_profile/Sox2_loss: 163.8145 - val_profile/Oct4_loss: 402.8185 - val_counts/Sox2_loss: 0.9999 - val_counts/Oct4_loss: 1.0243
Epoch 4/100
14727/14727 [==============================] - 5s 335us/step - loss: 575.2363 - profile/Sox2_loss: 161.9755 - profile/Oct4_loss: 393.3261 - counts/Sox2_loss: 0.9937 - counts/Oct4_loss: 0.9997 - val_loss: 585.2835 - val_profile/Sox2_loss: 163.3736 - val_profile/Oct4_loss: 401.7293 - val_counts/Sox2_loss: 0.9947 - val_counts/Oct4_loss: 1.0234
Epoch 5/100
14727/14727 [==============================] - 5s 327us/step - loss: 573.6786 - profile/Sox2_loss: 161.5812 - profile/Oct4_loss: 392.2044 - counts/Sox2_loss: 0.9914 - counts/Oct4_loss: 0.9979 - val_loss: 583.9188 - val_profile/Sox2_loss: 162.8769 - val_profile/Oct4_loss: 400.8664 - val_counts/Sox2_loss: 0.9926 - val_counts/Oct4_loss: 1.0250
Epoch 6/100
14727/14727 [==============================] - 5s 328us/step - loss: 572.2429 - profile/Sox2_loss: 161.0383 - profile/Oct4_loss: 391.3599 - counts/Sox2_loss: 0.9887 - counts/Oct4_loss: 0.9957 - val_loss: 582.1691 - val_profile/Sox2_loss: 162.4854 - val_profile/Oct4_loss: 399.5875 - val_counts/Sox2_loss: 0.9893 - val_counts/Oct4_loss: 1.0203
Epoch 7/100
14727/14727 [==============================] - 5s 345us/step - loss: 571.3248 - profile/Sox2_loss: 160.8116 - profile/Oct4_loss: 390.7239 - counts/Sox2_loss: 0.9850 - counts/Oct4_loss: 0.9940 - val_loss: 581.2761 - val_profile/Sox2_loss: 162.2338 - val_profile/Oct4_loss: 399.0227 - val_counts/Sox2_loss: 0.9825 - val_counts/Oct4_loss: 1.0195
Epoch 8/100
14727/14727 [==============================] - 5s 334us/step - loss: 570.4518 - profile/Sox2_loss: 160.6292 - profile/Oct4_loss: 390.1135 - counts/Sox2_loss: 0.9807 - counts/Oct4_loss: 0.9902 - val_loss: 581.1346 - val_profile/Sox2_loss: 162.3386 - val_profile/Oct4_loss: 398.8606 - val_counts/Sox2_loss: 0.9791 - val_counts/Oct4_loss: 1.0144
Epoch 9/100
14727/14727 [==============================] - 5s 349us/step - loss: 569.6096 - profile/Sox2_loss: 160.4825 - profile/Oct4_loss: 389.5202 - counts/Sox2_loss: 0.9752 - counts/Oct4_loss: 0.9855 - val_loss: 579.7642 - val_profile/Sox2_loss: 162.1487 - val_profile/Oct4_loss: 397.7824 - val_counts/Sox2_loss: 0.9759 - val_counts/Oct4_loss: 1.0074
Epoch 10/100
14727/14727 [==============================] - 5s 346us/step - loss: 569.1268 - profile/Sox2_loss: 160.3815 - profile/Oct4_loss: 389.2308 - counts/Sox2_loss: 0.9702 - counts/Oct4_loss: 0.9812 - val_loss: 579.9020 - val_profile/Sox2_loss: 162.2220 - val_profile/Oct4_loss: 397.9331 - val_counts/Sox2_loss: 0.9696 - val_counts/Oct4_loss: 1.0051
Epoch 11/100
14727/14727 [==============================] - 5s 341us/step - loss: 568.5690 - profile/Sox2_loss: 160.3207 - profile/Oct4_loss: 388.7906 - counts/Sox2_loss: 0.9680 - counts/Oct4_loss: 0.9778 - val_loss: 579.8739 - val_profile/Sox2_loss: 162.1525 - val_profile/Oct4_loss: 398.0242 - val_counts/Sox2_loss: 0.9622 - val_counts/Oct4_loss: 1.0075
Epoch 12/100
14727/14727 [==============================] - 5s 334us/step - loss: 567.9088 - profile/Sox2_loss: 160.1374 - profile/Oct4_loss: 388.4648 - counts/Sox2_loss: 0.9595 - counts/Oct4_loss: 0.9712 - val_loss: 579.1197 - val_profile/Sox2_loss: 162.0076 - val_profile/Oct4_loss: 397.7406 - val_counts/Sox2_loss: 0.9504 - val_counts/Oct4_loss: 0.9867
Epoch 13/100
14727/14727 [==============================] - 5s 346us/step - loss: 567.2088 - profile/Sox2_loss: 160.0064 - profile/Oct4_loss: 388.0841 - counts/Sox2_loss: 0.9492 - counts/Oct4_loss: 0.9626 - val_loss: 578.2527 - val_profile/Sox2_loss: 161.8983 - val_profile/Oct4_loss: 397.1201 - val_counts/Sox2_loss: 0.9447 - val_counts/Oct4_loss: 0.9788
Epoch 14/100
14727/14727 [==============================] - 5s 333us/step - loss: 567.2393 - profile/Sox2_loss: 159.9910 - profile/Oct4_loss: 388.0667 - counts/Sox2_loss: 0.9542 - counts/Oct4_loss: 0.9640 - val_loss: 579.9305 - val_profile/Sox2_loss: 162.0320 - val_profile/Oct4_loss: 397.8613 - val_counts/Sox2_loss: 0.9852 - val_counts/Oct4_loss: 1.0186
Epoch 15/100
14727/14727 [==============================] - 5s 335us/step - loss: 566.9989 - profile/Sox2_loss: 159.9596 - profile/Oct4_loss: 387.7861 - counts/Sox2_loss: 0.9572 - counts/Oct4_loss: 0.9681 - val_loss: 577.9307 - val_profile/Sox2_loss: 161.8062 - val_profile/Oct4_loss: 396.8345 - val_counts/Sox2_loss: 0.9494 - val_counts/Oct4_loss: 0.9796
Epoch 16/100
14727/14727 [==============================] - 5s 339us/step - loss: 566.1731 - profile/Sox2_loss: 159.7719 - profile/Oct4_loss: 387.4039 - counts/Sox2_loss: 0.9427 - counts/Oct4_loss: 0.9570 - val_loss: 578.1423 - val_profile/Sox2_loss: 161.9082 - val_profile/Oct4_loss: 396.9697 - val_counts/Sox2_loss: 0.9498 - val_counts/Oct4_loss: 0.9766
Epoch 17/100
14727/14727 [==============================] - 5s 352us/step - loss: 565.9112 - profile/Sox2_loss: 159.7642 - profile/Oct4_loss: 387.2870 - counts/Sox2_loss: 0.9355 - counts/Oct4_loss: 0.9505 - val_loss: 579.7296 - val_profile/Sox2_loss: 162.3273 - val_profile/Oct4_loss: 397.6268 - val_counts/Sox2_loss: 0.9572 - val_counts/Oct4_loss: 1.0203
Epoch 18/100
14727/14727 [==============================] - 5s 338us/step - loss: 565.9630 - profile/Sox2_loss: 159.7796 - profile/Oct4_loss: 387.1716 - counts/Sox2_loss: 0.9452 - counts/Oct4_loss: 0.9560 - val_loss: 577.2300 - val_profile/Sox2_loss: 161.7710 - val_profile/Oct4_loss: 396.5393 - val_counts/Sox2_loss: 0.9283 - val_counts/Oct4_loss: 0.9636
Epoch 19/100
14727/14727 [==============================] - 5s 334us/step - loss: 565.2392 - profile/Sox2_loss: 159.6582 - profile/Oct4_loss: 386.7640 - counts/Sox2_loss: 0.9333 - counts/Oct4_loss: 0.9484 - val_loss: 576.8224 - val_profile/Sox2_loss: 161.7198 - val_profile/Oct4_loss: 396.0785 - val_counts/Sox2_loss: 0.9331 - val_counts/Oct4_loss: 0.9693
Epoch 20/100
14727/14727 [==============================] - 5s 331us/step - loss: 564.8125 - profile/Sox2_loss: 159.5927 - profile/Oct4_loss: 386.5804 - counts/Sox2_loss: 0.9237 - counts/Oct4_loss: 0.9403 - val_loss: 577.6030 - val_profile/Sox2_loss: 161.7519 - val_profile/Oct4_loss: 396.5317 - val_counts/Sox2_loss: 0.9632 - val_counts/Oct4_loss: 0.9688
Epoch 21/100
14727/14727 [==============================] - 5s 336us/step - loss: 564.8797 - profile/Sox2_loss: 159.5838 - profile/Oct4_loss: 386.5609 - counts/Sox2_loss: 0.9290 - counts/Oct4_loss: 0.9445 - val_loss: 577.3542 - val_profile/Sox2_loss: 161.8367 - val_profile/Oct4_loss: 396.6261 - val_counts/Sox2_loss: 0.9207 - val_counts/Oct4_loss: 0.9684
Epoch 22/100
14727/14727 [==============================] - 5s 336us/step - loss: 564.3021 - profile/Sox2_loss: 159.5077 - profile/Oct4_loss: 386.3246 - counts/Sox2_loss: 0.9143 - counts/Oct4_loss: 0.9327 - val_loss: 577.0991 - val_profile/Sox2_loss: 161.7091 - val_profile/Oct4_loss: 396.2513 - val_counts/Sox2_loss: 0.9459 - val_counts/Oct4_loss: 0.9679
Epoch 23/100
14727/14727 [==============================] - 5s 352us/step - loss: 564.0358 - profile/Sox2_loss: 159.4736 - profile/Oct4_loss: 386.1087 - counts/Sox2_loss: 0.9146 - counts/Oct4_loss: 0.9307 - val_loss: 576.6525 - val_profile/Sox2_loss: 161.7255 - val_profile/Oct4_loss: 395.9580 - val_counts/Sox2_loss: 0.9248 - val_counts/Oct4_loss: 0.9721
Epoch 24/100
14727/14727 [==============================] - 5s 335us/step - loss: 563.7895 - profile/Sox2_loss: 159.4346 - profile/Oct4_loss: 385.9374 - counts/Sox2_loss: 0.9121 - counts/Oct4_loss: 0.9296 - val_loss: 576.2640 - val_profile/Sox2_loss: 161.7743 - val_profile/Oct4_loss: 395.9850 - val_counts/Sox2_loss: 0.9040 - val_counts/Oct4_loss: 0.9465
Epoch 25/100
14727/14727 [==============================] - 5s 333us/step - loss: 563.4532 - profile/Sox2_loss: 159.3887 - profile/Oct4_loss: 385.8219 - counts/Sox2_loss: 0.9027 - counts/Oct4_loss: 0.9215 - val_loss: 576.6666 - val_profile/Sox2_loss: 162.0667 - val_profile/Oct4_loss: 395.7936 - val_counts/Sox2_loss: 0.9273 - val_counts/Oct4_loss: 0.9533
Epoch 26/100
14727/14727 [==============================] - 5s 329us/step - loss: 563.7353 - profile/Sox2_loss: 159.4124 - profile/Oct4_loss: 385.7577 - counts/Sox2_loss: 0.9179 - counts/Oct4_loss: 0.9386 - val_loss: 576.5776 - val_profile/Sox2_loss: 161.7704 - val_profile/Oct4_loss: 395.9812 - val_counts/Sox2_loss: 0.9245 - val_counts/Oct4_loss: 0.9581
Epoch 27/100
14727/14727 [==============================] - 5s 329us/step - loss: 562.8740 - profile/Sox2_loss: 159.3135 - profile/Oct4_loss: 385.5091 - counts/Sox2_loss: 0.8903 - counts/Oct4_loss: 0.9148 - val_loss: 576.7166 - val_profile/Sox2_loss: 161.7759 - val_profile/Oct4_loss: 396.0170 - val_counts/Sox2_loss: 0.9328 - val_counts/Oct4_loss: 0.9595
Epoch 28/100
14727/14727 [==============================] - 5s 338us/step - loss: 563.4631 - profile/Sox2_loss: 159.4130 - profile/Oct4_loss: 385.7254 - counts/Sox2_loss: 0.9063 - counts/Oct4_loss: 0.9262 - val_loss: 576.2538 - val_profile/Sox2_loss: 161.6636 - val_profile/Oct4_loss: 396.1535 - val_counts/Sox2_loss: 0.9016 - val_counts/Oct4_loss: 0.9421
Epoch 29/100
14727/14727 [==============================] - 5s 346us/step - loss: 562.5747 - profile/Sox2_loss: 159.2428 - profile/Oct4_loss: 385.2273 - counts/Sox2_loss: 0.8927 - counts/Oct4_loss: 0.9177 - val_loss: 575.2956 - val_profile/Sox2_loss: 161.7195 - val_profile/Oct4_loss: 395.4537 - val_counts/Sox2_loss: 0.8885 - val_counts/Oct4_loss: 0.9237
Epoch 30/100
14727/14727 [==============================] - 5s 340us/step - loss: 562.6137 - profile/Sox2_loss: 159.2410 - profile/Oct4_loss: 385.3948 - counts/Sox2_loss: 0.8866 - counts/Oct4_loss: 0.9112 - val_loss: 575.8723 - val_profile/Sox2_loss: 161.6252 - val_profile/Oct4_loss: 395.8987 - val_counts/Sox2_loss: 0.8934 - val_counts/Oct4_loss: 0.9415
Epoch 31/100
14727/14727 [==============================] - 5s 336us/step - loss: 562.0211 - profile/Sox2_loss: 159.1688 - profile/Oct4_loss: 385.0904 - counts/Sox2_loss: 0.8737 - counts/Oct4_loss: 0.9025 - val_loss: 575.4586 - val_profile/Sox2_loss: 161.6599 - val_profile/Oct4_loss: 395.8553 - val_counts/Sox2_loss: 0.8714 - val_counts/Oct4_loss: 0.9230
Epoch 32/100
14727/14727 [==============================] - 5s 331us/step - loss: 561.6580 - profile/Sox2_loss: 159.1968 - profile/Oct4_loss: 384.8804 - counts/Sox2_loss: 0.8649 - counts/Oct4_loss: 0.8931 - val_loss: 575.1678 - val_profile/Sox2_loss: 161.7054 - val_profile/Oct4_loss: 395.7611 - val_counts/Sox2_loss: 0.8597 - val_counts/Oct4_loss: 0.9104
Epoch 33/100
14727/14727 [==============================] - 5s 340us/step - loss: 561.2827 - profile/Sox2_loss: 159.1752 - profile/Oct4_loss: 384.7264 - counts/Sox2_loss: 0.8528 - counts/Oct4_loss: 0.8854 - val_loss: 575.5684 - val_profile/Sox2_loss: 161.9022 - val_profile/Oct4_loss: 395.9268 - val_counts/Sox2_loss: 0.8688 - val_counts/Oct4_loss: 0.9051
Epoch 34/100
14727/14727 [==============================] - 5s 339us/step - loss: 561.2232 - profile/Sox2_loss: 159.1920 - profile/Oct4_loss: 384.7490 - counts/Sox2_loss: 0.8485 - counts/Oct4_loss: 0.8798 - val_loss: 575.6992 - val_profile/Sox2_loss: 161.7604 - val_profile/Oct4_loss: 396.2238 - val_counts/Sox2_loss: 0.8596 - val_counts/Oct4_loss: 0.9119
Epoch 35/100
14727/14727 [==============================] - 5s 334us/step - loss: 561.2887 - profile/Sox2_loss: 159.1534 - profile/Oct4_loss: 384.7210 - counts/Sox2_loss: 0.8546 - counts/Oct4_loss: 0.8868 - val_loss: 576.4402 - val_profile/Sox2_loss: 161.8464 - val_profile/Oct4_loss: 396.7988 - val_counts/Sox2_loss: 0.8679 - val_counts/Oct4_loss: 0.9115
Epoch 36/100
14727/14727 [==============================] - 5s 332us/step - loss: 560.9870 - profile/Sox2_loss: 159.0852 - profile/Oct4_loss: 384.6593 - counts/Sox2_loss: 0.8454 - counts/Oct4_loss: 0.8788 - val_loss: 575.5874 - val_profile/Sox2_loss: 161.7543 - val_profile/Oct4_loss: 395.8845 - val_counts/Sox2_loss: 0.8656 - val_counts/Oct4_loss: 0.9293
Epoch 37/100
14727/14727 [==============================] - 5s 333us/step - loss: 560.8602 - profile/Sox2_loss: 159.0957 - profile/Oct4_loss: 384.5283 - counts/Sox2_loss: 0.8481 - counts/Oct4_loss: 0.8755 - val_loss: 575.4702 - val_profile/Sox2_loss: 161.8405 - val_profile/Oct4_loss: 395.9841 - val_counts/Sox2_loss: 0.8563 - val_counts/Oct4_loss: 0.9082
In [19]:
# Evaluate the reloaded best-checkpoint model on the validation split
from basepair.eval import evaluate
evaluate(model, valid[0], valid[1])
Out[19]:
{'loss': 575.1677851283202,
 'profile/Sox2_loss': 161.70544497780512,
 'profile/Oct4_loss': 395.7610894888142,
 'counts/Sox2_loss': 0.8597082944481987,
 'counts/Oct4_loss': 0.9104166121889323}
In [20]:
model_dir = "/srv/scratch/amr1/chipseq/basepair/basepair/../data/processed/chipseq/exp/models/count+profile/"
In [21]:
import os
import theano
import numpy as np
import modisco
import modisco.tfmodisco_workflow.workflow
import os
from basepair.cli.schemas import DataSpec, HParams
from basepair.modisco import ModiscoResult
from scipy.spatial.distance import correlation
from concise.utils.helper import write_json, read_json
from basepair.data import numpy_minibatch
import h5py
import numpy as np
import keras.backend as K
from basepair.config import create_tf_session
from keras.models import load_model
from basepair.cli.evaluate import load_data
import matplotlib.pyplot as plt
plt.switch_backend('agg')
In [22]:
# Bundle the splits and pick validation for interpretation
data = dict(train=train, valid=valid, test=test)
x, y, m = data['valid']  # inputs, targets, metadata
In [23]:
# Which output head to take gradients w.r.t., and how to summarize the
# per-position profile into a scalar before differentiating
grad_wrt='profile/Sox2'
summary='weighted'
output_dir="outputs_profile_sox2"
hp = HParams.load("/srv/scratch/amr1/chipseq/basepair-workflow/hparams.yaml")
os.makedirs(output_dir, exist_ok=True)
# Record the run configuration next to the outputs for provenance
write_json(dict(model_dir=os.path.abspath(model_dir),
                grad_wrt=grad_wrt,
                output_dir=output_dir,
                summary=summary,
                split='valid'),
           os.path.join(output_dir, "kwargs.json"))
In [24]:
# Locate the keras output tensor for the chosen head ("<dtype>/<task>")
dtype, task = grad_wrt.split("/")
out = model.outputs[ds.task2idx(task, dtype)]
inp = model.inputs[0]
In [25]:
# Build one gradient function per strand: d(summary of head)/d(input)
if "counts" in grad_wrt or summary == "count":
    pos_strand_ginp_max = K.function([inp], K.gradients(out[:, 0], inp))
    neg_strand_ginp_max = K.function([inp], K.gradients(out[:, 1], inp))
else:
    if summary == 'weighted':
        print("Using weighted stat")
        # Softmax-weighted sum of the profile; stop_gradient keeps the
        # weights constant so only the profile values are differentiated
        pos_strand_ginp_max = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(out[:, :, 0])) *
                                                     out[:, :, 0], axis=-1), inp))
        neg_strand_ginp_max = K.function([inp], K.gradients(K.sum(K.stop_gradient(K.softmax(out[:, :, 1])) *
                                                     out[:, :, 1], axis=-1), inp))
    elif summary == "max":
        print("Using max stat")
        pos_strand_ginp_max = K.function([inp],
                                         K.gradients(K.max(out[:, :, 0], axis=1), inp))
        neg_strand_ginp_max = K.function([inp],
                                         K.gradients(K.max(out[:, :, 1], axis=1), inp))
    else:
        print("Using avg stat")
        pos_strand_ginp_max = K.function([inp],
                                         K.gradients(K.mean(out[:, :, 0], axis=1), inp))
        neg_strand_ginp_max = K.function([inp],
                                         K.gradients(K.mean(out[:, :, 1], axis=1), inp))

# Pre-compute the predictions and bottlenecks
# NOTE(review): y_true is never used below in this cell — dead code?
y_true = y[grad_wrt]
# Compute gradients in mini-batches of 512 to bound memory use
grads_pos = np.concatenate([pos_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
grads_neg = np.concatenate([neg_strand_ginp_max([batch])[0]
                            for batch in numpy_minibatch(x, 512)])
# Gradient * input (input-masked gradients)
# NOTE(review): igrads_pos/igrads_neg are not used below — dead code?
igrads_pos = grads_pos * x
igrads_neg = grads_neg * x

# Flatten per-example gradient maps for the distance computation
grads_pos_ext = grads_pos.reshape((grads_pos.shape[0], -1))
grads_neg_ext = grads_neg.reshape((grads_neg.shape[0], -1))

# compute the distances
# Correlation distance between the two strands' gradient maps, per example
distances = np.array([correlation(grads_neg_ext[i], grads_pos_ext[i])
                      for i in range(len(grads_neg_ext))])

# Setup different scores
# Keep only examples where the two strands agree (small correlation distance)
top_distances = distances < hp.modisco.max_strand_distance  # filter sites
hyp_scores = grads_pos + grads_neg
hyp_scores = hyp_scores[top_distances]
# Mean-center across the last (base) axis at every position
hyp_scores = hyp_scores - hyp_scores.mean(-1, keepdims=True)
onehot_data = x[top_distances]
scores = hyp_scores * onehot_data

# -------------------------------------------------------------
# run modisco
kw = hp.modisco.get_modisco_kwargs()
# These two hparams are consumed by this notebook, not by modisco itself
del kw['max_strand_distance']
del kw['threshold_for_counting_sign']
tfmodisco_results = modisco.tfmodisco_workflow.workflow.TfModiscoWorkflow(**kw,
    seqlets_to_patterns_factory=
        modisco.tfmodisco_workflow.seqlets_to_patterns.TfModiscoSeqletsToPatternsFactory(
            trim_to_window_size=15,
            initial_flank_to_add=5,
            kmer_len=5, num_gaps=1,
            num_mismatches=0,
            final_min_cluster_size=60))(
    task_names=[grad_wrt],
    contrib_scores={grad_wrt: scores},
    hypothetical_contribs={grad_wrt: hyp_scores},
    one_hot=onehot_data)
# -------------------------------------------------------------
# save the results
output_path = os.path.join(output_dir, "modisco.h5")
if os.path.exists(output_path):
    print("Output path exists")
    output_path += "backup.h5"

# NOTE(review): no explicit file mode — h5py's default may open/append an
# existing file; consider h5py.File(output_path, "w")
grp = h5py.File(output_path)
tfmodisco_results.save_hdf5(grp)

# dump the top distances to file
# np.ndarray.dump pickles the array to disk
top_distances.dump("{0}/included_samples.npy".format(output_dir))
distances.dump("{0}/distances.npy".format(output_dir))
Using weighted stat
On task profile/Sox2
Computing windowed sums
Computing threshold
Thresholds: -0.4703658925128937 and 3.7377698
CDFs: 0.9999019899648633 and 0.9998397884067264
Est. FDRs: 0.009989270597790818 and 0.009996669192653426
Got 4241 coords
After resolving overlaps, got 4241 seqlets
Across all tasks, the weakest laplace threshold used was: 0.9998397884067264
4241 identified in total
2 activity patterns with support >= 50 out of 3 possible patterns
Metacluster sizes:  [3823, 418]
Idx to activities:  {0: '1', 1: '-1'}
On metacluster 1
Metacluster size 418
Relevant tasks:  ('profile/Sox2',)
Relevant signs:  (-1,)
(Round 1) num seqlets: 418
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.09 s
Starting affinity matrix computations
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.01 s
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.01 s
Finished affinity matrix computations in 0.03 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.02 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 21.82 s
Launching nearest neighbors affmat calculation job
Job completed in: 1.67 s
(Round 1) Computed affinity matrix on nearest neighbors in 23.92 s
Filtered down to 416 of 418
(Round 1) Retained 416 rows out of 418 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 416 samples in 0.000s...
[t-SNE] Computed neighbors for 416 samples in 0.003s...
[t-SNE] Computed conditional probabilities for sample 416 / 416
[t-SNE] Mean sigma: 0.297276
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.07297682762145996 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
Louvain completed 200 runs in 1.1172304153442383 seconds
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    0.8s finished
Wrote graph to binary file in 0.14318394660949707 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.754239
After 2 runs, maximum modularity is Q = 0.783261
Louvain completed 52 runs in 14.982104301452637 seconds
Preproc + Louvain took 16.348705291748047 s
Got 8 clusters after round 1
Counts:
{2: 78, 0: 91, 3: 72, 6: 14, 1: 84, 7: 8, 4: 37, 5: 32}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 91 seqlets
Trimmed 10 out of 91
Aggregating for cluster 1 with 84 seqlets
Trimmed 9 out of 84
Dropping cluster 1 with 75 seqlets due to sign disagreement
Aggregating for cluster 2 with 78 seqlets
Trimmed 6 out of 78
Dropping cluster 2 with 72 seqlets due to sign disagreement
Aggregating for cluster 3 with 72 seqlets
Trimmed 2 out of 72
Dropping cluster 3 with 70 seqlets due to sign disagreement
Aggregating for cluster 4 with 37 seqlets
Trimmed 2 out of 37
Dropping cluster 4 with 35 seqlets due to sign disagreement
Aggregating for cluster 5 with 32 seqlets
Trimmed 0 out of 32
Dropping cluster 5 with 32 seqlets due to sign disagreement
Aggregating for cluster 6 with 14 seqlets
Trimmed 0 out of 14
Dropping cluster 6 with 14 seqlets due to sign disagreement
Aggregating for cluster 7 with 8 seqlets
Trimmed 0 out of 8
Dropping cluster 7 with 8 seqlets due to sign disagreement
(Round 2) num seqlets: 81
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.04 s
Starting affinity matrix computations
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.0 s
Normalization computed in 0.0 s
Cosine similarity mat computed in 0.0 s
Finished affinity matrix computations in 0.01 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.0 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 0.33 s
Launching nearest neighbors affmat calculation job
Job completed in: 0.28 s
(Round 2) Computed affinity matrix on nearest neighbors in 0.65 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 81 samples in 0.000s...
[t-SNE] Computed neighbors for 81 samples in 0.001s...
[t-SNE] Computed conditional probabilities for sample 81 / 81
[t-SNE] Mean sigma: 0.253568
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.024331331253051758 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
Louvain completed 200 runs in 0.9437379837036133 seconds
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    0.7s finished
Wrote graph to binary file in 0.007923364639282227 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.613423
Louvain completed 51 runs in 13.786309719085693 seconds
Preproc + Louvain took 14.780112743377686 s
Got 14 clusters after round 2
Counts:
{7: 3, 1: 18, 2: 8, 6: 5, 0: 18, 5: 5, 4: 5, 13: 2, 3: 7, 12: 2, 11: 2, 10: 2, 9: 2, 8: 2}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 18 seqlets
Trimmed 0 out of 18
Aggregating for cluster 1 with 18 seqlets
Trimmed 4 out of 18
Aggregating for cluster 2 with 8 seqlets
Trimmed 2 out of 8
Aggregating for cluster 3 with 7 seqlets
Trimmed 0 out of 7
Aggregating for cluster 4 with 5 seqlets
Trimmed 0 out of 5
Aggregating for cluster 5 with 5 seqlets
Trimmed 0 out of 5
Aggregating for cluster 6 with 5 seqlets
Trimmed 0 out of 5
Aggregating for cluster 7 with 3 seqlets
Trimmed 0 out of 3
Aggregating for cluster 8 with 2 seqlets
Trimmed 0 out of 2
Aggregating for cluster 9 with 2 seqlets
Trimmed 0 out of 2
Aggregating for cluster 10 with 2 seqlets
Trimmed 0 out of 2
Aggregating for cluster 11 with 2 seqlets
Trimmed 0 out of 2
Aggregating for cluster 12 with 2 seqlets
Trimmed 0 out of 2
Aggregating for cluster 13 with 2 seqlets
Trimmed 0 out of 2
Got 14 clusters
Splitting into subclusters...
Merging on 14 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 0 & 1 with prob 9.009378290163036e-05 and sim 0.8957096083217023
Collapsing 0 & 3 with prob 0.0004937050349028236 and sim 0.8946271382798632
Aborting collapse as 1 & 3 have prob 4.67594100764648e-05 and sim 0.8218403584181682
Collapsing 1 & 2 with prob 1.6877610431891668e-05 and sim 0.8708092526659896
Trimmed 3 out of 32
Trimmed 0 out of 35
On merging iteration 2
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 0 & 1 with prob 0.000657286249147207 and sim 0.881536205615385
Trimmed 7 out of 42
On merging iteration 3
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 11 patterns after merging
Performing seqlet reassignment
Got 0 patterns after reassignment
Total time taken is 60.96s
On metacluster 0
Metacluster size 3823
Relevant tasks:  ('profile/Sox2',)
Relevant signs:  (1,)
(Round 1) num seqlets: 3823
(Round 1) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.93 s
Starting affinity matrix computations
Normalization computed in 0.02 s
Cosine similarity mat computed in 0.15 s
Normalization computed in 0.02 s
Cosine similarity mat computed in 0.14 s
Finished affinity matrix computations in 0.3 s
(Round 1) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.63 s
(Round 1) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 17.26 s
Launching nearest neighbors affmat calculation job
Job completed in: 17.34 s
(Round 1) Computed affinity matrix on nearest neighbors in 39.6 s
Filtered down to 2445 of 3823
(Round 1) Retained 2445 rows out of 3823 after filtering
(Round 1) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 2445 samples in 0.009s...
[t-SNE] Computed neighbors for 2445 samples in 0.079s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2445
[t-SNE] Computed conditional probabilities for sample 2000 / 2445
[t-SNE] Computed conditional probabilities for sample 2445 / 2445
[t-SNE] Mean sigma: 0.259122
(Round 1) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.5295615196228027 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    1.3s finished
Louvain completed 200 runs in 6.075063467025757 seconds
Wrote graph to binary file in 7.859645843505859 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.757505
After 3 runs, maximum modularity is Q = 0.784662
After 8 runs, maximum modularity is Q = 0.784774
After 16 runs, maximum modularity is Q = 0.794698
Louvain completed 66 runs in 26.593217611312866 seconds
Preproc + Louvain took 41.61826014518738 s
Got 14 clusters after round 1
Counts:
{4: 263, 1: 343, 0: 358, 2: 335, 3: 301, 6: 127, 9: 98, 8: 104, 11: 61, 5: 225, 13: 16, 12: 31, 10: 63, 7: 120}
(Round 1) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 358 seqlets
Trimmed 37 out of 358
Aggregating for cluster 1 with 343 seqlets
Trimmed 15 out of 343
Aggregating for cluster 2 with 335 seqlets
Trimmed 22 out of 335
Aggregating for cluster 3 with 301 seqlets
Trimmed 26 out of 301
Aggregating for cluster 4 with 263 seqlets
Trimmed 13 out of 263
Aggregating for cluster 5 with 225 seqlets
Trimmed 2 out of 225
Aggregating for cluster 6 with 127 seqlets
Trimmed 23 out of 127
Aggregating for cluster 7 with 120 seqlets
Trimmed 10 out of 120
Aggregating for cluster 8 with 104 seqlets
Trimmed 16 out of 104
Aggregating for cluster 9 with 98 seqlets
Trimmed 45 out of 98
Aggregating for cluster 10 with 63 seqlets
Trimmed 3 out of 63
Aggregating for cluster 11 with 61 seqlets
Trimmed 4 out of 61
Aggregating for cluster 12 with 31 seqlets
Trimmed 6 out of 31
Aggregating for cluster 13 with 16 seqlets
Trimmed 0 out of 16
(Round 2) num seqlets: 2223
(Round 2) Computing coarse affmat
Beginning embedding computation
Computing embeddings
Finished embedding computation in 0.61 s
Starting affinity matrix computations
Normalization computed in 0.03 s
Cosine similarity mat computed in 0.1 s
Normalization computed in 0.01 s
Cosine similarity mat computed in 0.06 s
Finished affinity matrix computations in 0.17 s
(Round 2) Compute nearest neighbors from coarse affmat
Computed nearest neighbors in 0.22 s
(Round 2) Computing affinity matrix on nearest neighbors
Launching nearest neighbors affmat calculation job
Job completed in: 4.15 s
Launching nearest neighbors affmat calculation job
Job completed in: 3.7 s
(Round 2) Computed affinity matrix on nearest neighbors in 9.69 s
Not applying filtering for rounds above first round
(Round 2) Computing density adapted affmat
[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 2223 samples in 0.004s...
[t-SNE] Computed neighbors for 2223 samples in 0.093s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2223
[t-SNE] Computed conditional probabilities for sample 2000 / 2223
[t-SNE] Computed conditional probabilities for sample 2223 / 2223
[t-SNE] Mean sigma: 0.241127
(Round 2) Computing clustering
Beginning preprocessing + Louvain
Wrote graph to binary file in 0.4348030090332031 seconds
Running Louvain modularity optimization
[Parallel(n_jobs=20)]: Using backend LokyBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done 200 out of 200 | elapsed:    1.2s finished
Louvain completed 200 runs in 5.23734712600708 seconds
Wrote graph to binary file in 4.850239515304565 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.755718
After 21 runs, maximum modularity is Q = 0.756906
After 25 runs, maximum modularity is Q = 0.757113
Louvain completed 75 runs in 30.298176765441895 seconds
Preproc + Louvain took 41.296796798706055 s
Got 14 clusters after round 2
Counts:
{2: 303, 3: 286, 8: 62, 10: 59, 0: 337, 9: 61, 5: 216, 1: 336, 13: 32, 4: 218, 12: 55, 6: 103, 7: 97, 11: 58}
(Round 2) Aggregating seqlets in each cluster
Aggregating for cluster 0 with 337 seqlets
Trimmed 31 out of 337
Aggregating for cluster 1 with 336 seqlets
Trimmed 31 out of 336
Aggregating for cluster 2 with 303 seqlets
Trimmed 8 out of 303
Aggregating for cluster 3 with 286 seqlets
Trimmed 2 out of 286
Aggregating for cluster 4 with 218 seqlets
Trimmed 8 out of 218
Aggregating for cluster 5 with 216 seqlets
Trimmed 2 out of 216
Aggregating for cluster 6 with 103 seqlets
Trimmed 4 out of 103
Aggregating for cluster 7 with 97 seqlets
Trimmed 10 out of 97
Aggregating for cluster 8 with 62 seqlets
Trimmed 4 out of 62
Aggregating for cluster 9 with 61 seqlets
Trimmed 2 out of 61
Aggregating for cluster 10 with 59 seqlets
Trimmed 7 out of 59
Aggregating for cluster 11 with 58 seqlets
Trimmed 5 out of 58
Aggregating for cluster 12 with 55 seqlets
Trimmed 0 out of 55
Aggregating for cluster 13 with 32 seqlets
Trimmed 0 out of 32
Got 14 clusters
Splitting into subclusters...
Inspecting for spurious merging
Wrote graph to binary file in 0.36664748191833496 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00714342
After 3 runs, maximum modularity is Q = 0.00714396
After 4 runs, maximum modularity is Q = 0.00714413
After 5 runs, maximum modularity is Q = 0.00716238
Louvain completed 25 runs in 9.255085229873657 seconds
Similarity is 0.9903531; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.3529376983642578 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00660954
Louvain completed 21 runs in 7.53514289855957 seconds
Similarity is 0.96683306; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.6395473480224609 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00830553
Louvain completed 21 runs in 7.567034006118774 seconds
Similarity is 0.915972; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.28691625595092773 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00744359
After 4 runs, maximum modularity is Q = 0.0074436
Louvain completed 24 runs in 8.280121803283691 seconds
Similarity is 0.9294492; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.49452805519104004 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00433235
Louvain completed 21 runs in 6.833980083465576 seconds
Similarity is 0.9784541; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.2502763271331787 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00698492
After 5 runs, maximum modularity is Q = 0.00699649
Louvain completed 25 runs in 9.844514608383179 seconds
Similarity is 0.9917188; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.12505292892456055 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.0066376
After 5 runs, maximum modularity is Q = 0.00675891
After 8 runs, maximum modularity is Q = 0.00691597
Louvain completed 28 runs in 11.038772821426392 seconds
Similarity is 0.9490298; is_dissimilar is False
Inspecting for spurious merging
Wrote graph to binary file in 0.05417466163635254 seconds
Running Louvain modularity optimization
After 1 runs, maximum modularity is Q = 0.00826917
After 2 runs, maximum modularity is Q = 0.00828327
After 5 runs, maximum modularity is Q = 0.00836739
Louvain completed 25 runs in 9.997448444366455 seconds
Similarity is 0.95790803; is_dissimilar is False
Merging on 14 clusters
On merging iteration 1
Computing pattern to seqlet distances
Computing pattern to pattern distances
Collapsing 1 & 12 with prob 0.0003031290548221803 and sim 0.966830919908387
Collapsing 4 & 7 with prob 5.041854422682651e-05 and sim 0.9613330372294213
Collapsing 1 & 4 with prob 0.009876370214526697 and sim 0.9488452185555877
Collapsing 3 & 12 with prob 9.919036770629235e-05 and sim 0.9454274708681666
Collapsing 7 & 12 with prob 4.3832985092703356e-05 and sim 0.9415490974093651
Collapsing 0 & 5 with prob 0.007157706917953332 and sim 0.9342161309604295
Collapsing 4 & 12 with prob 0.00026166970547527387 and sim 0.9338032845420665
Collapsing 1 & 9 with prob 1.1469571788920172e-05 and sim 0.9249410236230544
Aborting collapse as 3 & 9 have prob 5.052537627681495e-09 and sim 0.763370984001204
Aborting collapse as 7 & 9 have prob 1.979549992799982e-09 and sim 0.8408721228855294
Collapsing 3 & 7 with prob 2.292955734930397e-05 and sim 0.9217198193658244
Collapsing 1 & 7 with prob 1.5992753146756524e-05 and sim 0.9207164918801796
Collapsing 5 & 9 with prob 9.692246636750263e-06 and sim 0.9200752489166327
Collapsing 1 & 3 with prob 6.847624311719341e-05 and sim 0.9090312127909612
Collapsing 3 & 4 with prob 7.136424203660523e-05 and sim 0.9041899454566594
Collapsing 4 & 6 with prob 0.0004219446028705813 and sim 0.8961197072247246
Collapsing 3 & 6 with prob 0.0001812265145987223 and sim 0.8937915475363788
Collapsing 2 & 3 with prob 6.076515183498185e-05 and sim 0.8924655964471089
Aborting collapse as 1 & 2 have prob 4.1293374265929175e-07 and sim 0.6872172914163854
Aborting collapse as 2 & 4 have prob 5.5681767180989504e-08 and sim 0.696643674295823
Collapsing 1 & 6 with prob 0.00032179338929625253 and sim 0.8704431768628763
Trimmed 0 out of 360
Trimmed 0 out of 297
Trimmed 0 out of 657
Trimmed 1 out of 941
Trimmed 0 out of 520
Trimmed 2 out of 579
Trimmed 0 out of 1039
On merging iteration 2
Computing pattern to seqlet distances
Computing pattern to pattern distances
Got 7 patterns after merging
Performing seqlet reassignment
Cross contin jaccard time taken: 0.06 s
Cross contin jaccard time taken: 0.04 s
Discarded 1 seqlets
Got 3 patterns after reassignment
Total time taken is 248.84s
In [26]:
def ic_scale(x):
    """Scale an array of per-position base frequencies by information content.

    Delegates to modisco's ``viz_sequence.ic_scale`` with a fixed genomic
    background of (A, C, G, T) = (0.27, 0.23, 0.23, 0.27), i.e. ~46% GC.
    """
    from modisco.visualization import viz_sequence
    bg_freqs = np.array([0.27, 0.23, 0.23, 0.27])
    return viz_sequence.ic_scale(x, background=bg_freqs)
In [27]:
from basepair.modisco import ModiscoResult
mr = ModiscoResult(output_dir + "/modisco.h5")
# indices/mask of the examples that were actually used in the modisco run
incl = np.load(output_dir + "/included_samples.npy")
tasks = list(ds.task_specs)

# ---- plotting configuration ----
legend = False
rc_vec = None                     # per-pattern reverse-complement flags (None = never flip)
rc_fn = lambda x: x[::-1, ::-1]   # flip both axes (positions + channels) = reverse complement for one-hot ACGT
start_vec = None                  # per-pattern trim start positions (None = no trimming)
n_limit = 35                      # skip patterns supported by fewer seqlets than this
seq_height = 1.5                  # relative height of the seqlogo panel in the figure
# NOTE: the original cell first set n_bootstrap=None and immediately
# overwrote it with 100; the dead assignment has been removed.
n_bootstrap = 100                 # bootstrap resamples for the profile error band
fpath_template = output_dir + "_figures/"   # output directory for the saved figures
os.makedirs(fpath_template, exist_ok=True)
figsize = (12, 2.5)

# restrict the predicted profile tracks and the one-hot sequences to the
# examples included in the modisco run
tracks = {task: y[f"profile/{task}"][incl] for task in tasks}
# NOTE(review): re-assigns x in place — re-running this cell double-subsets x
x = x[incl]
import matplotlib.pyplot as plt
from concise.utils.plot import seqlogo_fig, seqlogo

print(mr.patterns())
['pattern_0', 'pattern_1', 'pattern_2']
In [28]:
def bootstrap_mean(x, n=100):
    """Bootstrap estimate of the mean of `x` along axis 0.

    Args:
        x: array of shape (n_samples, ...); the mean is taken over axis 0.
        n: number of bootstrap resamples to draw.

    Returns:
        Tuple ``(mean, std)`` of arrays with shape ``x.shape[1:]`` — the mean
        and standard deviation of the ``n`` bootstrap replicates of
        ``x.mean(axis=0)``.

    Note:
        Uses numpy's global RNG; seed ``np.random`` for reproducibility.
    """
    n_samples = len(x)
    boot_means = []
    for _ in range(n):
        # Resample with replacement at the original sample size.
        # np.random.choice replaces the original's pandas round-trip
        # (pd.Series(...).sample), which also made this function depend on
        # pandas being imported by a *later* cell.
        idx = np.random.choice(n_samples, size=n_samples, replace=True)
        boot_means.append(x[idx].mean(axis=0))
    boot_means = np.stack(boot_means)
    return boot_means.mean(axis=0), boot_means.std(axis=0)
In [29]:
import pandas as pd
from matplotlib.ticker import FormatStrFormatter
from basepair.plots import show_figure

# y-axis limits for the profile panels. The plotting loop below only applies
# per-track limits when ylim[0] is itself a list of [lo, hi] pairs; a flat
# pair like this one is not applied.
ylim=[0, 3]
In [30]:
%matplotlib inline
for i, pattern in enumerate(mr.patterns()):

    j = i
    seqs = mr.extract_signal(x)[pattern]
    sequence = ic_scale(seqs.mean(axis=0))
    if rc_vec is not None and rc_vec[i]:
        sequence = rc_fn(sequence)
    if start_vec is not None:
        start = start_vec[i]
        sequence = sequence[start:(start + width)]
    n = len(seqs)
    if n < n_limit:
        continue
    fig, ax = plt.subplots(1 + len(tracks), 1, sharex=True,
                           figsize=figsize,
                           gridspec_kw={'height_ratios': [1] * len(tracks) + [seq_height]})
    ax[0].set_title(f"motif {i} ({n})")
    for i, (k, y) in enumerate(tracks.items()):
        signal = mr.extract_signal(y, rc_fn)[pattern]

        if start_vec is not None:
            start = start_vec[i]
            signal = signal[:, start:(start + width)]

        if n_bootstrap is None:
            signal_mean = signal.mean(axis=0)
            signal_std = signal.std(axis=0)
        else:
            signal_mean, signal_std = bootstrap_mean(signal, n=n_bootstrap)
        if rc_vec is not None and rc_vec[i]:
            signal_mean = rc_fn(signal_mean)
            signal_std = rc_fn(signal_std)

        ax[i].plot(np.arange(1, len(signal_mean) + 1), signal_mean[:, 0], label='pos')
        if n_bootstrap is not None:
            ax[i].fill_between(np.arange(1, len(signal_mean) + 1),
                               signal_mean[:, 0] - 2 * signal_std[:, 0],
                               signal_mean[:, 0] + 2 * signal_std[:, 0],
                               alpha=0.1)
        #                   label='pos')
        ax[i].plot(np.arange(1, len(signal_mean) + 1), signal_mean[:, 1], label='neg')
        if n_bootstrap is not None:
            ax[i].fill_between(np.arange(1, len(signal_mean) + 1),
                               signal_mean[:, 1] - 2 * signal_std[:, 1],
                               signal_mean[:, 1] + 2 * signal_std[:, 1],
                               alpha=0.1)
        #                   label='pos')
        ax[i].set_ylabel(f"{k}")
        ax[i].yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
        ax[i].spines['top'].set_visible(False)
        ax[i].spines['right'].set_visible(False)
        ax[i].spines['bottom'].set_visible(False)
        ax[i].xaxis.set_ticks_position('none')
        if isinstance(ylim[0], list):
            ax[i].set_ylim(ylim[i])
        if legend:
            ax[i].legend()

    seqlogo(sequence, ax=ax[-1])
    ax[-1].yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
    ax[-1].set_ylabel("Inf. content")
    ax[-1].spines['top'].set_visible(False)
    ax[-1].spines['right'].set_visible(False)
    ax[-1].spines['bottom'].set_visible(False)
    ax[-1].set_xticks(list(range(0, len(sequence) + 1, 5)))
    
    if fpath_template is not None:
        plt.savefig(fpath_template.format(j) + '.png', dpi=600)
        plt.savefig(fpath_template.format(j) + '.pdf', dpi=600)
        plt.close(fig)    # close the figure
        show_figure(fig)
        plt.show()