Goal

  • train the DNase bias model

Tasks

  • [x] get the training data
  • [x] update the specs to allow training with strand-less TaskSpecs
  • [x] allow training with the dataloader/Dataset in the hparams
  • [x] allow overriding the hyper-params using the CLI (exploit nesting with the dot: first.second_level.third_level)
  • [x] save the log files of the script
  • [x] train a few DNase models (random search)

    • [x] vary the network size (number of layers, filters, etc)
  • [ ] train single-task model

  • [ ] evaluate the single-task model
  • [ ] train multi-task model
  • [ ] evaluate the multi-task model
  • [ ] improve the model
    • [ ] larger context
    • [ ] valid padding
    • [ ] U-net structure
    • [ ] automatically balance between counts and the profile
    • [ ] RC parameter sharing or augmentation
    • [ ] layer-norm?

get the training data

In [1]:
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
from keras.models import load_model
from basepair.config import create_tf_session, get_data_dir
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.config import valid_chr, test_chr

ddir = get_data_dir()

create_tf_session(0)
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
Out[1]:
<tensorflow.python.client.session.Session at 0x7fc5f8102fd0>
In [2]:
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias/")

SEQ_WIDTH = 1000

ds = DataSpec(
    task_specs={"DNase": TaskSpec(task="DNase",
                                 pos_counts=dpath / "IMR90_Naked_DNase.pos.bw",
                                 neg_counts=dpath / "IMR90_Naked_DNase.neg.bw",
                                 peaks=dpath / "genome.stride1000.w1000.nonblacklist.bed")},
    fasta_file=dpath / "GRCh38.genome.fa",
              )

Get the dataloader

In [4]:
dl = StrandedProfile(ds, 
                     excl_chromosomes=valid_chr+test_chr, 
                     peak_width=SEQ_WIDTH, shuffle=True,
                     target_transformer=AppendCounts())
Skipped 90 intervals outside of the genome size
In [5]:
dl_valid = StrandedProfile(ds, 
                           incl_chromosomes=valid_chr, 
                           peak_width=SEQ_WIDTH, 
                           shuffle=True,
                           target_transformer=AppendCounts())
Skipped 15 intervals outside of the genome size
In [6]:
dl_test = StrandedProfile(ds, 
                          incl_chromosomes=test_chr, 
                          peak_width=SEQ_WIDTH, 
                          shuffle=True,
                          target_transformer=AppendCounts())
Skipped 15 intervals outside of the genome size

train single-task model

In [3]:
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_multitask
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
In [9]:
def get_model(mfn, mkwargs):
    """Instantiate a model by name and derive a unique checkpoint path.

    Args:
        mfn: name of a model-constructor function available in the notebook
            namespace (e.g. "seq_multitask").
        mkwargs: dict of keyword arguments passed to the constructor; they are
            also encoded into the run name for traceability.

    Returns:
        (model, name, ckp_file): the instantiated model, the unique run name,
        and the path of the .h5 checkpoint file to write.
    """
    import os
    import datetime
    mdir = f"{ddir}/processed/dnase/exp/models/background"
    # Encode hyper-parameters + timestamp into the run name so concurrent
    # runs with different settings never overwrite each other's checkpoints.
    param_str = ",".join(f'{k}={v}' for k, v in mkwargs.items())
    timestamp = str(datetime.datetime.now()).replace(" ", "::")
    name = f"{mfn}_{param_str}.{timestamp}"
    # Portable stdlib replacement for the `!mkdir -p {mdir}` shell magic
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    # Look the constructor up by name instead of eval() — identical behavior
    # for a plain identifier, but cannot execute arbitrary expressions.
    model_fn = globals()[mfn]
    return model_fn(**mkwargs), name, ckp_file

train the model

  • add the training curve plot below training
In [10]:
# hyper-parameters
mfn = "seq_multitask"
mkwargs = dict(filters=32, 
               conv1_kernel_size=21,
               tconv_kernel_size=25,
               n_dil_layers=6,
               tasks=['DNase'],
               seq_len=SEQ_WIDTH,
               lr=0.004)
In [ ]:
# TODO - how to deal with counts which are not provided?
#   - directly train the count prediction model by multiplying the loss-functions
In [ ]:
# best valid so far: 
model, name, ckp_file = get_model(mfn, mkwargs)
batch_size=256
history = model.fit_generator(dl.batch_train_iter(batch_size=batch_size, num_workers=28), 
                              steps_per_epoch=(len(dl)//batch_size) // 20, # Evaluate 20x more frequently
                              epochs=100,
                              validation_data=dl_valid.batch_train_iter(batch_size=batch_size, num_workers=28),
                              validation_steps=(len(dl_valid)//batch_size) // 100,
                              callbacks=[EarlyStopping(patience=5),
                                         History(),
                                         ModelCheckpoint(ckp_file, save_best_only=True)]
                     )
# get the best model
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll, 
                                             "SpatialLifetimeSparsity": SpatialLifetimeSparsity})
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-13 07:39:01,446 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-13 07:39:05,676 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
Epoch 1/100
1879/1879 [==============================] - 88s 47ms/step - loss: 63.7975 - profile/DNase_loss: 37.8222 - counts/DNase_loss: 0.2598 - val_loss: 62.6912 - val_profile/DNase_loss: 40.8849 - val_counts/DNase_loss: 0.2181
Epoch 2/100
1879/1879 [==============================] - 85s 45ms/step - loss: 61.4768 - profile/DNase_loss: 37.1426 - counts/DNase_loss: 0.2433 - val_loss: 62.2256 - val_profile/DNase_loss: 40.3929 - val_counts/DNase_loss: 0.2183
Epoch 3/100
1879/1879 [==============================] - 86s 46ms/step - loss: 61.1653 - profile/DNase_loss: 37.0569 - counts/DNase_loss: 0.2411 - val_loss: 61.6666 - val_profile/DNase_loss: 40.3413 - val_counts/DNase_loss: 0.2133
Epoch 4/100
1879/1879 [==============================] - 86s 46ms/step - loss: 60.5447 - profile/DNase_loss: 36.8528 - counts/DNase_loss: 0.2369 - val_loss: 61.7291 - val_profile/DNase_loss: 40.3179 - val_counts/DNase_loss: 0.2141
Epoch 5/100
1185/1879 [=================>............] - ETA: 30s - loss: 60.5127 - profile/DNase_loss: 36.9517 - counts/DNase_loss: 0.2356
In [14]:
ckp_file
Out[14]:
"/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/background/seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-13::07:39:01.200919.h5"

evaluate the single-task model

In [6]:
!cat {ddir}/processed/dnase/exp/models/background/models/w=1000/dataspec.yaml
task_specs:
  DNase:
    pos_counts: /mnt/lab_data/users/avsec/dnase_bias/IMR90_Naked_DNase.pos.bw
    neg_counts: /mnt/lab_data/users/avsec/dnase_bias/IMR90_Naked_DNase.neg.bw
    peaks: /mnt/lab_data/users/avsec/dnase_bias/genome.stride1000.w1000.nonblacklist.bed
    ignore_strand: true
    bias_model: null
    bias_bigwig: null
fasta_file: /mnt/lab_data/users/avsec/dnase_bias/GRCh38.genome.fa
path: dataspec.yml
In [7]:
!ls -latr {ddir}/processed/dnase/exp/models/background/models/
total 132
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 2-more-filters
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 1-ndil=1
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 0
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 filters=128
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 filters=128,n_dil_layers=1
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 filters=128,n_dil_layers=3
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 filters=256
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 filters=32
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 filters=64
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=1,filters=128
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=1
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=1,filters=32
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=1,filters=256
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=3
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=1,filters=64
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=6
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 tconv_kernel_size=11
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 n_dil_layers=9
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 tconv_kernel_size=25
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 tconv_kernel_size=31
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 tconv_kernel_size=51
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 tconv_kernel_size=81
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 test-bias-model
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 test
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 02:52 w200
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 03:43 2test
drwxrwxr-x  3 avsec kundaje 4096 Aug 15 04:18 no_profile,w=200,n_dil_layers=2,filters=128
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 04:39 w=200,filters=64
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 05:15 w=200,filters=128
drwxrwxr-x 33 avsec kundaje 4096 Aug 15 07:04 .
drwxrwxr-x  2 avsec kundaje 4096 Aug 15 07:29 w=1000
drwxrwxr-x  5 avsec kundaje 4096 Aug 15 07:52 ..
drwxrwxr-x  3 avsec kundaje 4096 Aug 15 14:43 w=200,n_dil_layers=2,filters=128
In [29]:
ckp_file = f"{ddir}/processed/dnase/exp/models/background/seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=51,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-09::07:23:11.054472.h5"

model = load_model(ckp_file)
In [5]:
ckp_file = f"{ddir}/processed/dnase/exp/models/background/seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-13::07:39:01.200919.h5"

model = load_model(ckp_file)
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-13 08:40:19,098 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-13 08:40:23,323 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [6]:
from basepair.preproc import bin_counts

# TODO - write a generic plotter
class Seq2Profile:
    """Predict strand-specific profiles from sequence and plot them
    side-by-side with the observed profiles.

    Attributes:
        x: model inputs (one-hot encoded sequences).
        y: dict of observed targets, keyed like "profile/<task>".
        model: trained keras model.
        y_pred: list of softmax-normalized profile predictions, one per
            model output head.
    """

    def __init__(self, x, y, model):
        self.x = x
        self.y = y
        self.model = model
        # Normalize each predicted head to per-position probabilities
        self.y_pred = [softmax(y) for y in model.predict(x)]

    def plot(self, n=10, kind='test', sort='random', figsize=(20, 2), binsize=1, fpath_template=None):
        """Plot `n` predicted vs. observed profile pairs.

        Args:
            n: number of examples to plot.
            kind: unused — overwritten when `sort` contains "_"; kept only
                for backward compatibility of the signature.
            sort: 'random', or '<max|sum>_<task>' to pick the examples with
                the highest max/total observed count for that task.
            figsize: figure size per example.
            binsize: bin width used to smooth the plotted profiles.
            fpath_template: currently unused; kept for backward compatibility.

        Raises:
            ValueError: if `sort` cannot be interpreted.
        """
        import matplotlib.pyplot as plt
        # --- choose which examples to display ---
        if sort == 'random':
            idx_list = samplers.random(self.x, n)
        elif "_" in sort:
            kind, task = sort.split("_")

            if kind == "max":
                idx_list = samplers.top_max_count(self.y["profile/" + task], n)
            elif kind == "sum":
                idx_list = samplers.top_sum_count(self.y["profile/" + task], n)
            else:
                raise ValueError(f"Unknown sort kind: {kind!r}. Use 'max_<task>' or 'sum_<task>'")
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        for i, idx in enumerate(idx_list):
            # NOTE(review): task is hard-coded here — confirm this is intended
            # once multi-task models are plotted with this class.
            task = "DNase"
            # Bin once per figure instead of once per plotted line
            yp_binned = bin_counts(self.y_pred[0], binsize=binsize)
            yt_binned = bin_counts(self.y["profile/" + task], binsize=binsize)
            fig = plt.figure(figsize=figsize)
            plt.subplot(121)
            if i == 0:
                plt.title("Predicted DNase")
            # Label each strand with the position of its (unbinned) peak
            plt.plot(yp_binned[idx, :, 0],
                     label='pos,m={}'.format(np.argmax(self.y_pred[0][idx, :, 0])))
            plt.plot(yp_binned[idx, :, 1],
                     label='neg,m={}'.format(np.argmax(self.y_pred[0][idx, :, 1])))
            plt.legend()
            plt.subplot(122)
            if i == 0:
                plt.title("Observed DNase")
            plt.plot(yt_binned[idx, :, 0],
                     label='pos,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 0])))
            plt.plot(yt_binned[idx, :, 1],
                     label='neg,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 1])))
            plt.legend()
In [31]:
dl_test = StrandedProfile(ds, 
                          incl_chromosomes=test_chr, 
                          peak_width=SEQ_WIDTH, 
                          shuffle=False,
                          target_transformer=AppendCounts())
Skipped 15 intervals outside of the genome size
In [8]:
test = dl_test.load_all(num_workers=28)
100%|██████████| 83202/83202 [03:20<00:00, 414.75it/s]
In [7]:
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair import samplers 
from basepair.math import softmax

from kipoi.writers import HDF5BatchWriter
In [32]:
y_pred = model.predict(test['inputs'], verbose=1)
2662434/2662434 [==============================] - 352s 132us/step
In [33]:
y_true = test["targets"]
In [34]:
!mkdir -p {ddir}/processed/dnase/exp/predictions/single-task/
In [35]:
obj = HDF5BatchWriter(f"{ddir}/processed/dnase/exp/predictions/single-task/model-2018-08-09::04:17:01.267710.h5")
obj.batch_write(y_pred)
obj.close()
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-35-254b27e9a55b> in <module>()
----> 1 HDF5BatchWriter.dump(f"{ddir}/processed/dnase/exp/predictions/single-task/model-2018-08-09::04:17:01.267710.h5", y_pred)

~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/kipoi/writers.py in dump(cls, file_path, batch)
    203     @classmethod
    204     def dump(cls, file_path, batch):
--> 205         cls(file_path=file_path).batch_write(batch).close()
    206 
    207 # Nice-to-have writers:

AttributeError: 'NoneType' object has no attribute 'close'
In [21]:
len(y_pred)
Out[21]:
2
In [28]:
test_chr
Out[28]:
['chr1', 'chr8', 'chr9']
In [29]:
#test['metadata']['range']['chr'] == 'chr9'
In [25]:
# Get the most interesting regions
top_counts = samplers.top_sum_count(y_true["profile/DNase"], 10)
In [34]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [12]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [42]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [26]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [25]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [26]:
task = "DNase"
In [28]:
yp = softmax(y_pred[ds.task2idx(task, "profile")])
yt = y_true["profile/" + task]
In [29]:
df = eval_profile(yt, yp)
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:141: RuntimeWarning: invalid value encountered in true_divide
  fracs = yt / yt.sum(axis=1, keepdims=True)
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:142: RuntimeWarning: invalid value encountered in greater_equal
  is_peak = (fracs >= pos_min_threshold).astype(float)
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:143: RuntimeWarning: invalid value encountered in less
  ambigous = (fracs < pos_min_threshold) & (fracs >= neg_max_threshold)
/users/avsec/workspace/basepair/basepair/cli/evaluate.py:143: RuntimeWarning: invalid value encountered in greater_equal
  ambigous = (fracs < pos_min_threshold) & (fracs >= neg_max_threshold)
In [30]:
df
Out[30]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.065803 1 0.035182 0.003204 68 0.005749
1 0.068983 2 0.065273 0.006613 68 0.009716
2 0.074774 4 0.114364 0.013344 65 0.019089
3 0.102669 10 0.211818 0.034025 59 0.043666
In [47]:
pl = Seq2Profile(test["inputs"], y_true, model)
In [53]:
pl.plot(n=20, sort='random', binsize=1)
In [52]:
pl.plot(n=10, sort='sum_DNase', binsize=1)

Compare the predictions for the k-mer model

In [14]:
from kipoi.readers import HDF5Reader
import random
In [15]:
# Run src/dnase/benchmark-kmer.py to obtain this file
# K-mers obtained from https://github.com/jpiper/pyDNase/blob/master/pyDNase/data/IMR90_6mer.pickle
r = HDF5Reader(f"{ddir}/processed/dnase/6mer/IMR90_6mer.preds.h5")
In [16]:
r.open()
In [17]:
r.ls()
Out[17]:
[('/metadata/interval_from_task',
  <HDF5 dataset "interval_from_task": shape (2662434,), type "|O">),
 ('/metadata/range/chr', <HDF5 dataset "chr": shape (2662434,), type "|O">),
 ('/metadata/range/end', <HDF5 dataset "end": shape (2662434,), type "<i8">),
 ('/metadata/range/id', <HDF5 dataset "id": shape (2662434,), type "<i8">),
 ('/metadata/range/start',
  <HDF5 dataset "start": shape (2662434,), type "<i8">),
 ('/metadata/range/strand',
  <HDF5 dataset "strand": shape (2662434,), type "|O">),
 ('/preds', <HDF5 dataset "preds": shape (2662434, 995), type "<f8">),
 ('/targets/counts/DNase',
  <HDF5 dataset "DNase": shape (2662434, 2), type "<f4">),
 ('/targets/profile/DNase',
  <HDF5 dataset "DNase": shape (2662434, 1000, 2), type "<f4">)]
In [18]:
preds = r.f['/preds'][:]
In [19]:
targets = r.f['/targets/profile/DNase'][:].sum(axis=-1)
In [26]:
y_true = targets[:,3:(1000-2)]
In [9]:
y_true_l = np.ravel(y_true)
In [10]:
y_pred_l = np.ravel(preds)
In [17]:
y_true_l[:5]
Out[17]:
array([0., 0., 0., 0., 0.], dtype=float32)
In [18]:
y_true_l.max()
Out[18]:
61.0
In [23]:
idx
Out[23]:
array([     32538,     356576,     880736, ..., 2647863212, 2648224655,
       2648681311])
In [32]:
idx = np.unique(random.sample(range(0, len(y_pred_l)), 1000000))
In [33]:
plt.scatter(y_true_l[idx], y_pred_l[idx], alpha=0.1)
plt.xlabel("true values")
plt.ylabel("predictions")
Out[33]:
Text(0,0.5,'predictions')

Aggregated plot

In [24]:
from basepair.plots import regression_eval
In [27]:
yt_counts = np.log(1+y_true.sum(axis=-1))
yp_counts = np.log(1+preds.sum(axis=-1))
In [28]:
# 3:end - 2
regression_eval(yt_counts, yp_counts, alpha=0.1)
In [48]:
# 2:end - 3
regression_eval(yt_counts, yp_counts, alpha=0.1)

Plot a few profile examples

Get the results

In [9]:
def get_performance(mdir):
    """Return the metrics of the best (lowest val_loss) epoch of a run.

    Args:
        mdir: directory containing a `history.csv` training log with a
            `val_loss` column (one row per epoch).

    Returns:
        dict of the best epoch's row, or {} when the run has no readable
        history (e.g. training crashed before the first epoch finished).
    """
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
        # idxmin replaces the deprecated Series.argmin (source of the
        # FutureWarning this cell used to emit); read_csv yields a
        # RangeIndex, so .loc on the label is the same row .iloc selected.
        return dict(dfh.loc[dfh["val_loss"].idxmin()])
    except Exception:
        # Intentionally best-effort: some run dirs have no history yet.
        # `except Exception` (not bare `except:`) so Ctrl-C still works.
        return {}
In [10]:
mdir = Path(f"{ddir}/processed/dnase/exp/models/background/models")
In [11]:
mdir.name
Out[11]:
'models'
In [12]:
df_exp = pd.DataFrame([{**get_performance(md), "name":md.name}
                        for md in mdir.iterdir()])
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/ipykernel/__main__.py:4: FutureWarning: 'argmin' is deprecated. Use 'idxmin' instead. The behavior of 'argmin' will be corrected to return the positional minimum in the future. Use 'series.values.argmin' to get the position of the minimum now.
In [13]:
df_exp.sort_values("val_counts/DNase_loss")
Out[13]:
counts/DNase_loss epoch loss name profile/DNase_loss val_counts/DNase_loss val_loss val_profile/DNase_loss
15 0.182623 16.0 35.560557 filters=128 35.377934 0.128930 38.769257 38.640327
24 0.185571 14.0 35.640936 n_dil_layers=6 35.455365 0.129076 38.745200 38.616124
21 0.200432 4.0 35.688733 tconv_kernel_size=25 35.488301 0.129666 38.827258 38.697592
14 0.221377 0.0 35.921744 test 35.700367 0.129715 38.869296 38.739581
18 0.193858 5.0 35.622445 2-more-filters 35.428587 0.129909 38.804538 38.674629
5 0.197724 6.0 35.583637 tconv_kernel_size=11 35.385913 0.129995 38.764216 38.634220
10 0.198610 8.0 35.641856 tconv_kernel_size=51 35.443246 0.130210 38.861494 38.731285
1 0.193425 2.0 37.519163 0 35.584910 0.131726 40.192790 38.875525
17 0.200270 6.0 35.718000 n_dil_layers=3 35.517730 0.131814 38.852261 38.720447
11 0.201226 3.0 35.644755 filters=256 35.443529 0.132037 38.846617 38.714580
8 0.197916 4.0 35.772572 filters=128,n_dil_layers=1 35.574656 0.132459 39.014482 38.882023
16 0.208118 1.0 35.747706 tconv_kernel_size=81 35.539588 0.132564 38.862644 38.730080
4 0.200440 14.0 35.697409 filters=32 35.496969 0.132972 38.786908 38.653936
9 0.199130 8.0 35.802095 n_dil_layers=1 35.602965 0.132995 38.958927 38.825932
22 0.202464 3.0 35.658114 n_dil_layers=9 35.455651 0.133286 38.818563 38.685277
23 0.201380 3.0 35.631906 filters=64 35.430526 0.133413 38.829109 38.695696
2 0.199738 3.0 35.819388 n_dil_layers=1,filters=64 35.619650 0.133641 38.997383 38.863742
13 0.198228 4.0 35.693804 filters=128,n_dil_layers=3 35.495577 0.133837 38.902903 38.769066
3 0.202916 7.0 35.818035 n_dil_layers=1,filters=32 35.615119 0.135351 39.024694 38.889343
20 0.202335 2.0 35.722158 tconv_kernel_size=31 35.519823 0.135585 38.866309 38.730723
7 0.199539 3.0 35.875132 n_dil_layers=1,filters=256 35.675593 0.135975 39.042875 38.906900
19 0.197249 7.0 35.807942 n_dil_layers=1,filters=128 35.610693 0.141668 38.894963 38.753295
12 0.261085 7.0 6.430682 w200 6.169596 0.267019 6.993833 6.726814
0 NaN NaN NaN test-bias-model NaN NaN NaN NaN
6 NaN NaN NaN 1-ndil=1 NaN NaN NaN NaN
In [14]:
best_model = mdir / "filters=128/model.h5"
In [15]:
best_model = mdir / "n_dil_layers=1/model.h5"
In [16]:
best_model = mdir / "w200/model.h5"
In [20]:
import basepair
In [1]:
from basepair.losses import MultichannelMultinomialNLL
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [2]:
from keras.models import load_model
In [27]:
mdir
Out[27]:
PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/background/models')
In [18]:
# TODO - unable to load the model as multiple might have the same name
model = load_model(best_model, custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
                                               "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})  # TODO - fix the loss function
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-15 02:05:56,124 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [19]:
dl_test = StrandedProfile(ds, 
                          incl_chromosomes=test_chr, 
                          peak_width=200, #SEQ_WIDTH, 
                          shuffle=False,
                          target_transformer=AppendCounts())
Skipped 3 intervals outside of the genome size
In [20]:
test = dl_test.load_all(num_workers=28)
100%|██████████| 83202/83202 [00:51<00:00, 1618.21it/s]
In [21]:
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair import samplers 
from basepair.math import softmax

from kipoi.writers import HDF5BatchWriter
In [22]:
y_pred = model.predict(test['inputs'], verbose=1)
2662446/2662446 [==============================] - 265s 99us/step
In [23]:
y_true = test["targets"]
In [24]:
y_true['counts/DNase'].mean(-1).mean()
Out[24]:
0.4259427
In [25]:
y_true['counts/DNase'].mean(-1).std()
Out[25]:
0.35637203
In [26]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [63]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [58]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [34]:
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
In [34]:
df_exp.head()
Out[34]:
counts/DNase_loss epoch loss name profile/DNase_loss val_counts/DNase_loss val_loss val_profile/DNase_loss
0 0.193425 2.0 37.519163 /users/avsec/workspace/basepair/basepair/../da... 35.584910 0.131726 40.192790 38.875525
1 0.200440 14.0 35.697409 /users/avsec/workspace/basepair/basepair/../da... 35.496969 0.132972 38.786908 38.653936
2 0.197724 6.0 35.583637 /users/avsec/workspace/basepair/basepair/../da... 35.385913 0.129995 38.764216 38.634220
3 NaN NaN NaN /users/avsec/workspace/basepair/basepair/../da... NaN NaN NaN NaN
4 0.197916 4.0 35.772572 /users/avsec/workspace/basepair/basepair/../da... 35.574656 0.132459 39.014482 38.882023
In [14]:
ls {ddir}/processed/dnase/exp/models/background/models
0/                           filters=32/            tconv_kernel_size=25/
1-ndil=1/                    filters=64/            tconv_kernel_size=31/
2-more-filters/              n_dil_layers=1/        tconv_kernel_size=51/
filters=128/                 n_dil_layers=3/        tconv_kernel_size=81/
filters=128,n_dil_layers=1/  n_dil_layers=6/        test/
filters=128,n_dil_layers=3/  n_dil_layers=9/
filters=256/                 tconv_kernel_size=11/