# [x] train a few DNase models (random search)
# [ ] train single-task model
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
from keras.models import load_model
from basepair.config import create_tf_session, get_data_dir
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.config import valid_chr, test_chr
# --- Environment & data setup -------------------------------------------------
ddir = get_data_dir()   # project data root (from basepair config)
create_tf_session(0)    # create the TF session on GPU 0
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias/")
SEQ_WIDTH = 1000        # input sequence / peak width in bp

# Single-task data specification: stranded DNase count bigwigs plus
# genome-wide tiled windows (stride/width 1000, blacklist-filtered) as "peaks".
ds = DataSpec(
    task_specs={"DNase": TaskSpec(task="DNase",
                                  pos_counts=dpath / "IMR90_Naked_DNase.pos.bw",
                                  neg_counts=dpath / "IMR90_Naked_DNase.neg.bw",
                                  peaks=dpath / "genome.stride1000.w1000.nonblacklist.bed")},
    fasta_file=dpath / "GRCh38.genome.fa",
)

# Training loader: all chromosomes except validation + test ones.
dl = StrandedProfile(ds,
                     excl_chromosomes=valid_chr+test_chr,
                     peak_width=SEQ_WIDTH, shuffle=True,
                     target_transformer=AppendCounts())  # also append count targets
# Validation loader: validation chromosomes only.
dl_valid = StrandedProfile(ds,
                           incl_chromosomes=valid_chr,
                           peak_width=SEQ_WIDTH,
                           shuffle=True,
                           target_transformer=AppendCounts())
# Test loader (shuffled here; re-created with shuffle=False later for evaluation).
dl_test = StrandedProfile(ds,
                          incl_chromosomes=test_chr,
                          peak_width=SEQ_WIDTH,
                          shuffle=True,
                          target_transformer=AppendCounts())
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_multitask
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
def get_model(mfn, mkwargs):
    """Instantiate a model-builder by name and derive its checkpoint path.

    Args:
        mfn: name of a model-builder function available in this module's
            globals (e.g. "seq_multitask").
        mkwargs: keyword arguments forwarded to the builder.

    Returns:
        Tuple `(model, name, ckp_file)` where `name` encodes the builder,
        its hyper-parameters and a timestamp (so parallel random-search
        runs get unique names) and `ckp_file` is the .h5 checkpoint path.
    """
    import datetime
    import os

    mdir = f"{ddir}/processed/dnase/exp/models/background"
    name = mfn + "_" + \
        ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
        "." + str(datetime.datetime.now()).replace(" ", "::")
    # Portable replacement for the IPython-only `!mkdir -p {mdir}` magic.
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    # Look the builder up by name instead of eval(): same behavior for valid
    # function names, without evaluating arbitrary expressions.
    model_fn = globals()[mfn]
    return model_fn(**mkwargs), name, ckp_file
# --- Hyper-parameters (one random-search configuration) -----------------------
mfn = "seq_multitask"
mkwargs = dict(filters=32,
               conv1_kernel_size=21,
               tconv_kernel_size=25,
               n_dil_layers=6,
               tasks=['DNase'],
               seq_len=SEQ_WIDTH,
               lr=0.004)
# TODO - how to deal with counts which are not provided?
# - directly train the count prediction model by multiplying the loss-functions
# best valid so far:
model, name, ckp_file = get_model(mfn, mkwargs)

# --- Training -----------------------------------------------------------------
batch_size=256
history = model.fit_generator(dl.batch_train_iter(batch_size=batch_size, num_workers=28),
                              steps_per_epoch=(len(dl)//batch_size) // 20,  # Evaluate 20x more frequently
                              epochs=100,
                              validation_data=dl_valid.batch_train_iter(batch_size=batch_size, num_workers=28),
                              validation_steps=(len(dl_valid)//batch_size) // 100,
                              callbacks=[EarlyStopping(patience=5),
                                         History(),
                                         # only keep the best-val-loss weights on disk
                                         ModelCheckpoint(ckp_file, save_best_only=True)]
                              )
# get the best model: reload the checkpoint written by ModelCheckpoint
# (custom loss/layer classes must be passed explicitly for deserialization)
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll,
                                             "SpatialLifetimeSparsity": SpatialLifetimeSparsity})
ckp_file  # notebook display: show the checkpoint path
# Inspect previously trained background models on disk (IPython shell magics):
!cat {ddir}/processed/dnase/exp/models/background/models/w=1000/dataspec.yaml
!ls -latr {ddir}/processed/dnase/exp/models/background/models/
# NOTE(review): the two hard-coded checkpoint paths below load older runs;
# only the second assignment (2018-08-13, tconv_kernel_size=25) remains in `model`.
ckp_file = f"{ddir}/processed/dnase/exp/models/background/seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=51,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-09::07:23:11.054472.h5"
model = load_model(ckp_file)
ckp_file = f"{ddir}/processed/dnase/exp/models/background/seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-13::07:39:01.200919.h5"
model = load_model(ckp_file)
from basepair.preproc import bin_counts
# TODO - write a generic plotter
class Seq2Profile:
    """Plot predicted vs. observed strand-specific profiles for a model.

    Holds one-hot sequences `x`, observed targets `y` (dict containing
    "profile/<task>" arrays indexed as [example, position, strand]) and a
    trained model; predictions are computed and softmax-normalized once at
    construction time.
    """

    def __init__(self, x, y, model):
        self.x = x
        self.y = y
        self.model = model
        # Make the prediction once; softmax turns each profile head's
        # logits into per-position probabilities.
        self.y_pred = [softmax(y) for y in model.predict(x)]

    def plot(self, n=10, kind='test', sort='random', figsize=(20, 2), binsize=1, fpath_template=None):
        """Plot `n` examples: predicted profile (left) vs. observed (right).

        Args:
            n: number of examples to plot.
            kind: unused - kept for interface compatibility (it is
                immediately shadowed when `sort` contains "_").
            sort: 'random', or '<max|sum>_<task>' to pick the examples with
                the highest max/summed counts for that task.
            figsize: matplotlib figure size per example.
            binsize: bin width used to smooth the plotted profiles.
            fpath_template: unused - kept for interface compatibility.

        Raises:
            ValueError: if `sort` cannot be interpreted.
        """
        import matplotlib.pyplot as plt
        if sort == 'random':
            idx_list = samplers.random(self.x, n)
        elif "_" in sort:
            kind, task = sort.split("_")
            if kind == "max":
                idx_list = samplers.top_max_count(self.y["profile/" + task], n)
            elif kind == "sum":
                idx_list = samplers.top_sum_count(self.y["profile/" + task], n)
            else:
                raise ValueError("")
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        for i, idx in enumerate(idx_list):
            task = "DNase"  # NOTE(review): hard-coded; overrides the task parsed from `sort`
            fig = plt.figure(figsize=figsize)
            plt.subplot(121)
            if i == 0:
                plt.title("Predicted DNase")
            # (a dead `bin_counts(...)[idx, :, 0]` statement that discarded its
            # result was removed here)
            # NOTE: the argmax annotations use the un-binned profile, so the
            # reported max position is in original (un-binned) coordinates.
            plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 0],
                     label='pos,m={}'.format(np.argmax(self.y_pred[0][idx, :, 0])))
            plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 1],
                     label='neg,m={}'.format(np.argmax(self.y_pred[0][idx, :, 1])))
            plt.legend()
            plt.subplot(122)
            if i == 0:
                plt.title("Observed DNase")
            plt.plot(bin_counts(self.y["profile/" + task], binsize=binsize)[idx, :, 0],
                     label='pos,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 0])))
            plt.plot(bin_counts(self.y["profile/" + task], binsize=binsize)[idx, :, 1],
                     label='neg,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 1])))
            plt.legend()
# --- Test-set evaluation ------------------------------------------------------
# Re-create the test loader WITHOUT shuffling so that predictions line up
# row-by-row with targets and metadata.
dl_test = StrandedProfile(ds,
                          incl_chromosomes=test_chr,
                          peak_width=SEQ_WIDTH,
                          shuffle=False,
                          target_transformer=AppendCounts())
test = dl_test.load_all(num_workers=28)
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair import samplers
from basepair.math import softmax
from kipoi.writers import HDF5BatchWriter
y_pred = model.predict(test['inputs'], verbose=1)
y_true = test["targets"]
# Persist raw predictions for later analysis (IPython shell magic below).
!mkdir -p {ddir}/processed/dnase/exp/predictions/single-task/
obj = HDF5BatchWriter(f"{ddir}/processed/dnase/exp/predictions/single-task/model-2018-08-09::04:17:01.267710.h5")
obj.batch_write(y_pred)
obj.close()
len(y_pred)  # notebook display
test_chr     # notebook display
#test['metadata']['range']['chr'] == 'chr9'
# Get the most interesting regions
top_counts = samplers.top_sum_count(y_true["profile/DNase"], 10)
# Total-count regression scatter.
# NOTE(review): the identical call was re-executed several times in the
# notebook; the repetitions below produce the same plot.
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
task = "DNase"
yp = softmax(y_pred[ds.task2idx(task, "profile")])  # profile logits -> probabilities
yt = y_true["profile/" + task]
df = eval_profile(yt, yp)  # profile evaluation metrics table
df  # notebook display
# Visual spot-check: random examples and the highest-summed-count examples.
pl = Seq2Profile(test["inputs"], y_true, model)
pl.plot(n=20, sort='random', binsize=1)
pl.plot(n=10, sort='sum_DNase', binsize=1)
# --- Baseline: pyDNase 6-mer cut-bias model -----------------------------------
from kipoi.readers import HDF5Reader
import random
# Run src/dnase/benchmark-kmer.py to obtain this file
# K-mers obtained from https://github.com/jpiper/pyDNase/blob/master/pyDNase/data/IMR90_6mer.pickle
r = HDF5Reader(f"{ddir}/processed/dnase/6mer/IMR90_6mer.preds.h5")
r.open()
r.ls()  # notebook display: inspect the HDF5 layout
preds = r.f['/preds'][:]
targets = r.f['/targets/profile/DNase'][:].sum(axis=-1)  # sum over the strand axis
# Trim 3 bp on the left and 2 bp on the right - presumably the flanks the
# 6-mer model cannot score; TODO confirm against benchmark-kmer.py.
y_true = targets[:,3:(1000-2)]
y_true_l = np.ravel(y_true)
y_pred_l = np.ravel(preds)
y_true_l[:5]    # notebook display
y_true_l.max()  # notebook display
idx  # NOTE(review): used before its assignment below - notebook cells were run out of order
idx = np.unique(random.sample(range(0, len(y_pred_l)), 1000000))  # subsample 1M points for the scatter
plt.scatter(y_true_l[idx], y_pred_l[idx], alpha=0.1)  # NOTE(review): `plt` is never imported at module level in this file
plt.xlabel("true values")
plt.ylabel("predictions")
from basepair.plots import regression_eval
yt_counts = np.log(1+y_true.sum(axis=-1))  # log1p of total counts per region
yp_counts = np.log(1+preds.sum(axis=-1))
# 3:end - 2
regression_eval(yt_counts, yp_counts, alpha=0.1)
# 2:end - 3
regression_eval(yt_counts, yp_counts, alpha=0.1)
def get_performance(mdir):
    """Return the best (lowest `val_loss`) epoch's metrics for one model run.

    Args:
        mdir: model directory expected to contain a Keras `history.csv`
            with a `val_loss` column.

    Returns:
        dict mapping metric name -> value for the epoch with the minimal
        `val_loss`, or {} when the history is missing or unreadable
        (best-effort: incomplete runs simply contribute no metrics).
    """
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
        # Label-based idxmin/loc instead of positional argmin/iloc; identical
        # for read_csv's default RangeIndex, and correct for any index.
        return dict(dfh.loc[dfh["val_loss"].idxmin()])
    except Exception:
        # Was a bare `except:` - at least let KeyboardInterrupt/SystemExit through.
        return {}
# --- Compare random-search runs and evaluate the chosen one -------------------
mdir = Path(f"{ddir}/processed/dnase/exp/models/background/models")
mdir.name  # notebook display
# One row per run directory: best-epoch metrics plus the run name.
df_exp = pd.DataFrame([{**get_performance(md), "name":md.name}
                       for md in mdir.iterdir()])
df_exp.sort_values("val_counts/DNase_loss")  # notebook display: rank runs by count loss
# NOTE(review): three candidate "best" models were tried in turn; only the
# last assignment (w200) is actually loaded below.
best_model = mdir / "filters=128/model.h5"
best_model = mdir / "n_dil_layers=1/model.h5"
best_model = mdir / "w200/model.h5"
import basepair
from basepair.losses import MultichannelMultinomialNLL
from keras.models import load_model
mdir  # notebook display
# TODO - unable to load the model as multiple might have the same name
model = load_model(best_model, custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
                                               "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})  # TODO - fix the loss function
# Test loader matching this model's 200 bp training width (not SEQ_WIDTH).
dl_test = StrandedProfile(ds,
                          incl_chromosomes=test_chr,
                          peak_width=200, #SEQ_WIDTH,
                          shuffle=False,
                          target_transformer=AppendCounts())
test = dl_test.load_all(num_workers=28)
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair import samplers
from basepair.math import softmax
from kipoi.writers import HDF5BatchWriter
y_pred = model.predict(test['inputs'], verbose=1)
y_true = test["targets"]
y_true['counts/DNase'].mean(-1).mean()  # notebook display
y_true['counts/DNase'].mean(-1).std()   # notebook display
# NOTE(review): identical call repeated from notebook re-execution.
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
df_exp.head()  # notebook display
# IPython automagic (plain `ls`): list the model directories on disk.
ls {ddir}/processed/dnase/exp/models/background/models