Goal

  • compare DNase models with bias prediction to models without bias prediction

Conclusions

  • bias-corrected model predicts better

Future todo

  • check what the importance scores look like for some examples where the bias prediction is particularly high
In [2]:
# Imports: plotting, project model/loss/dataset code (basepair), HDF5 IO (kipoi),
# and keras for model (de)serialization.
import matplotlib.pyplot as plt
import basepair
from basepair.plots import regression_eval
from basepair.losses import MultichannelMultinomialNLL
from basepair.exp.dnase.models import KmerBiasModel
from kipoi.readers import HDF5Reader
from basepair.utils import read_pkl
from keras.models import load_model
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import StrandedProfile
from basepair.datasets import chip_exo_nexus
from pathlib import Path
import pandas as pd
import numpy as np
from basepair.config import create_tf_session, get_data_dir

# Configuration
ddir = get_data_dir()  # project data root (resolved by basepair.config)
SEQ_WIDTH = 1000  # input sequence length in bp; matches seq_len=1000 used at training time
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
In [3]:
ls {ddir}/processed/dnase/exp/models/single-task/
dataspec.yml
dataspec.yml~
hparams.yml
models/
seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=51,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-09::04:17:01.267710.h5
In [4]:
ls {ddir}/processed/dnase/exp/models/single-task-background-corrected/models
1-test/          filters=64/      n_dil_layers=6/        tconv_kernel_size=51/
2-default-arch/  lr=0.001/        n_dil_layers=9/        tconv_kernel_size=81/
filters=128/     lr=0.01/         tconv_kernel_size=11/
filters=256/     n_dil_layers=1/  tconv_kernel_size=25/
filters=32/      n_dil_layers=3/  tconv_kernel_size=31/
In [5]:
ls {ddir}/processed/dnase/exp/models/single-task-background-corrected/
dataspec.yml   hparams.yml   logs/    tryout.model.txt
dataspec.yml~  hparams.yml~  models/  tryout.model.txt~
In [6]:
!cat {ddir}/processed/dnase/exp/models/single-task-background-corrected/tryout.model.txt
filters=32
filters=64
filters=128
filters=256
n_dil_layers=1
n_dil_layers=3
n_dil_layers=6
n_dil_layers=9
tconv_kernel_size=11
tconv_kernel_size=25
tconv_kernel_size=31
tconv_kernel_size=51
tconv_kernel_size=81
lr=0.001
lr=0.01
In [7]:
# Root directory holding one sub-directory per hyper-parameter setting.
# (The double slash in the f-string is harmless -- pathlib normalizes it.)
mdir_root = Path(f"{ddir}//processed/dnase/exp/models/single-task-background-corrected/models")
# All hyper-parameter run directories, named like "filters=32", "lr=0.01", ...
# NOTE: glob order is filesystem-dependent; df_exp row indices below reflect it.
mdirs = list(mdir_root.glob("*=*"))
mdirs
Out[7]:
[PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/filters=32'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/tconv_kernel_size=11'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/lr=0.01'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/n_dil_layers=1'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/tconv_kernel_size=51'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/filters=256'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/filters=128'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/tconv_kernel_size=81'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/n_dil_layers=3'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/lr=0.001'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/tconv_kernel_size=31'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/tconv_kernel_size=25'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/n_dil_layers=9'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/filters=64'),
 PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models/n_dil_layers=6')]
In [8]:
# performance comparison
def get_performance(mdir):
    """Return the metrics of the best epoch for one model run.

    Reads ``<mdir>/history.csv`` (Keras training history, one row per epoch)
    and picks the epoch with the lowest validation loss.

    Args:
        mdir: path (str or Path) to a model run directory.

    Returns:
        dict of column -> value for the best epoch, or {} when the run has
        no readable history.csv yet (e.g. training still in progress).
    """
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Run hasn't produced a (non-empty) history yet -- report no metrics
        # instead of silently swallowing *every* exception with a bare except.
        return {}
    # idxmin() returns an index *label*, so index with .loc (not .iloc); the
    # two only coincide on the default RangeIndex.
    return dict(dfh.loc[dfh.val_loss.idxmin()])

# Collect the best-epoch metrics of every run and label each row with the
# run directory name (i.e. the hyper-parameter setting, e.g. "filters=64").
records = []
for md in mdirs:
    records.append(dict(get_performance(md), name=md.name))
df_exp = pd.DataFrame(records)

# Runs without a history.csv yet show up as all-NaN rows (sorted last).
df_exp.sort_values("val_counts/DNase_loss")
Out[8]:
counts/DNase_loss epoch loss name profile/DNase_loss val_counts/DNase_loss val_loss val_profile/DNase_loss
4 0.651767 18.0 1236.794538 tconv_kernel_size=51 1230.276869 0.591518 1279.827481 1273.912304
1 0.667344 21.0 1241.674487 tconv_kernel_size=11 1235.001050 0.593641 1278.895052 1272.958640
6 0.627956 13.0 1196.484928 filters=128 1190.205366 0.600471 1263.252838 1257.248123
13 0.666916 21.0 1247.217667 filters=64 1240.548506 0.609397 1293.909189 1287.815221
7 0.684370 10.0 1249.239680 tconv_kernel_size=81 1242.395977 0.611300 1289.102738 1282.989732
8 0.673469 23.0 1427.849958 n_dil_layers=3 1421.115271 0.619060 1446.129403 1439.938802
2 0.670497 21.0 1309.483766 lr=0.01 1302.778800 0.623379 1330.296115 1324.062326
0 0.711739 22.0 1292.012985 filters=32 1284.895597 0.628092 1307.759874 1301.478960
3 0.679153 37.0 1531.211371 n_dil_layers=1 1524.419840 0.648334 1531.832609 1525.349263
5 0.619766 32.0 1234.416685 filters=256 1228.219023 0.672271 1305.538144 1298.815436
14 0.692374 8.0 1262.635222 n_dil_layers=6 1255.711482 0.729643 1288.447792 1281.151362
10 0.675839 11.0 1249.545481 tconv_kernel_size=31 1242.787091 0.760515 1286.280900 1278.675749
9 NaN NaN NaN lr=0.001 NaN NaN NaN NaN
11 NaN NaN NaN tconv_kernel_size=25 NaN NaN NaN NaN
12 NaN NaN NaN n_dil_layers=9 NaN NaN NaN NaN
In [9]:
!cat {str(mdir_root / "tconv_kernel_size=51/hparams.yaml")} | head
model:
  name: seq_multitask
  kwargs:
    filters: 64
    conv1_kernel_size: 25
    tconv_kernel_size: 51
    n_dil_layers: 6
    lr: 0.004
    c_task_weight: 10
data:
In [10]:
mdir_root
Out[10]:
PosixPath('/users/avsec/workspace/basepair/basepair/../data/processed/dnase/exp/models/single-task-background-corrected/models')
In [11]:
!cat {str(mdir_root / "tconv_kernel_size=51/hparams.yaml")} | head
model:
  name: seq_multitask
  kwargs:
    filters: 64
    conv1_kernel_size: 25
    tconv_kernel_size: 51
    n_dil_layers: 6
    lr: 0.004
    c_task_weight: 10
data:
In [12]:
from keras.models import load_model
In [13]:
create_tf_session(0)
Out[13]:
<tensorflow.python.client.session.Session at 0x7fc74c8ff198>
In [14]:
# Load the bias-corrected model (tconv_kernel_size=51 -- best val loss above).
mdir = mdir_root / "tconv_kernel_size=51"
# Custom loss objects must be supplied for keras deserialization; both names
# are registered since the model was presumably saved under one of the two
# aliases -- TODO confirm which.
model = load_model(mdir / "model.h5",
                   custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
                                   "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
2018-08-17 16:58:54,349 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:497: calling conv1d (from tensorflow.python.ops.nn_ops) with data_format=NHWC is deprecated and will be removed in a future version.
Instructions for updating:
`NHWC` for data_format is deprecated, use `NWC` instead
WARNING:tensorflow:From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
2018-08-17 16:58:58,867 [WARNING] From /users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/tensorflow/contrib/learn/python/learn/datasets/base.py:198: retry (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Use the retry module or similar alternatives.
In [194]:
# NOTE(review): this re-binds `mdir` and `model` to the *non*-bias-corrected
# run, so which model the downstream evaluation cells use depends on whether
# this cell or the one above ran last (hidden-state hazard on re-run).
mdir = mdir_root / "../../single-task/models/tconv_kernel_size=51"
model = load_model(mdir / "model.h5",
                   custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
                                   "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})
In [28]:
from basepair.cli.modisco import compute_importance_scores
In [35]:
compute_importance_scores(mdir, "/tmp/imp.h5")
100%|██████████| 121/121 [11:02<00:00,  5.47s/it]
In [36]:
from kipoi.readers import HDF5Reader

r = HDF5Reader("/tmp/imp.h5")
r.open()
In [47]:
d = HDF5Reader.load("/tmp/imp.h5")
Signature: HDF5Reader.load(file_path, unflatten=True)
Docstring: <no docstring>
File:      ~/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/kipoi/readers.py
Type:      method
In [37]:
r.ls()
Out[37]:
[('/grads/DNase/count/0',
  <HDF5 dataset "0": shape (61670, 1000, 4), type "<f4">),
 ('/grads/DNase/weighted/0',
  <HDF5 dataset "0": shape (61670, 1000, 4), type "<f4">),
 ('/inputs/bias/counts/DNase',
  <HDF5 dataset "DNase": shape (61670, 1), type "<f4">),
 ('/inputs/bias/profile/DNase',
  <HDF5 dataset "DNase": shape (61670, 1000, 1), type "<f4">),
 ('/inputs/seq', <HDF5 dataset "seq": shape (61670, 1000, 4), type "<f4">),
 ('/metadata/interval_from_task',
  <HDF5 dataset "interval_from_task": shape (61670,), type "|O">),
 ('/metadata/range/chr', <HDF5 dataset "chr": shape (61670,), type "|O">),
 ('/metadata/range/end', <HDF5 dataset "end": shape (61670,), type "<i8">),
 ('/metadata/range/id', <HDF5 dataset "id": shape (61670,), type "<i8">),
 ('/metadata/range/start', <HDF5 dataset "start": shape (61670,), type "<i8">),
 ('/metadata/range/strand',
  <HDF5 dataset "strand": shape (61670,), type "|O">),
 ('/targets/profile/DNase',
  <HDF5 dataset "DNase": shape (61670, 1000, 1), type "<f4">)]
In [42]:
%time d = r.load_all()
CPU times: user 136 ms, sys: 1.98 s, total: 2.11 s
Wall time: 2.03 s
In [46]:
r.close()
In [42]:
from basepair.config import test_chr
from basepair.preproc import AppendCounts
# Build the test-set dataloader: held-out chromosomes only, same 1000 bp
# peak width as training, unshuffled so indices align with the HDF5 outputs.
ds = DataSpec.load(mdir / "dataspec.yaml")
ds.task_specs['DNase'].ignore_strand = True  # fix: override dataspec on disk -- DNase is treated as unstranded here
dl_test = StrandedProfile(ds, 
                          incl_chromosomes=test_chr, 
                          peak_width=SEQ_WIDTH, 
                          shuffle=False,
                          target_transformer=AppendCounts())
In [43]:
test = dl_test.load_all(batch_size=256, num_workers=10)
100%|██████████| 219/219 [00:04<00:00, 48.00it/s]
In [44]:
from basepair.BPNet import BiasModel
In [45]:
ds.task_specs['DNase'].bias_model = "/users/avsec/workspace/basepair/data/processed/dnase/exp/models/background/models/w=1000/model.h5"
In [46]:
x,y_true = BiasModel(ds).predict((test["inputs"], test["targets"]), batch_size=512)
In [47]:
y_true.keys()
Out[47]:
dict_keys(['profile/DNase', 'counts/DNase'])
In [48]:
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair.math import softmax
In [92]:
y_pred = model.predict(x, verbose=1)
55826/55826 [==============================] - 12s 209us/step
In [77]:
# Not bias-corrected
# NOTE(review): this cell is byte-identical to the "bias-corrected" cell below;
# the labels are only valid if the matching load_model + predict cells ran
# immediately before -- the result depends on execution order, not on the code.
regression_eval(y_true['counts/DNase'].mean(-1), 
                y_pred[ds.task2idx("DNase", 'counts')].mean(-1), 
                alpha=0.1)
In [53]:
# bias-corrected
# NOTE(review): identical code to the cell above; the label reflects which
# model was loaded/predicted last, i.e. it relies on manual execution order.
regression_eval(y_true['counts/DNase'].mean(-1), 
                y_pred[ds.task2idx("DNase", 'counts')].mean(-1), 
                alpha=0.1)
In [78]:
task = "DNase"
In [79]:
yp = softmax(y_pred[ds.task2idx(task, "profile")])
yt = y_true["profile/" + task]
In [80]:
df = eval_profile(yt, yp)
In [81]:
# not bias-corrected
df
Out[81]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.500964 1 0.008679 0.000088 4863 0.000193
1 0.518632 2 0.016696 0.000176 4833 0.000398
2 0.538008 4 0.031330 0.000354 4782 0.000892
3 0.577080 10 0.067133 0.000904 4706 0.002571
In [52]:
# bias-corrected
df
Out[52]:
auprc binsize frac_ambigous imbalance n_positives random_auprc
0 0.537989 1 0.008679 0.000088 4863 0.000181
1 0.552396 2 0.016696 0.000176 4833 0.000389
2 0.569569 4 0.031330 0.000354 4782 0.000864
3 0.607829 10 0.067133 0.000904 4706 0.002497
In [101]:
from basepair.plots import plot_profiles

from basepair import samplers
In [56]:
tasks = list(ds.task_specs)
In [95]:
idx_list = samplers.top_sum_count(y_true['profile/DNase'], 10)
In [93]:
idx_list = samplers.top_max_count(y_true['profile/DNase'], 10)
In [97]:
idx_list = samplers.random(y_true['profile/DNase'], 10)
In [98]:
plot_profiles(y_true, y_pred, tasks, idx_list, preproc=None, figsize=(20, 2), tic_freq=50)
In [102]:
# get high bias sites
In [117]:
# Indices of the 10 test examples with the highest predicted count bias.
# nlargest(10) replaces the hand-rolled sort_values(ascending=False)[:10]
# (clearer intent, avoids a full sort just to take the top-k).
high_bias_idx = list(pd.Series(x['bias/counts/DNase'][:,0]).nlargest(10).index)
In [118]:
high_bias_idx
Out[118]:
[50698, 44257, 22650, 5092, 4777, 4469, 4466, 23567, 23566, 15499]
In [ ]:
# TODO - plot the importance scores
In [119]:
plot_profiles(y_true, y_pred, tasks, high_bias_idx, preproc=None, figsize=(20, 2), tic_freq=50)
In [120]:
from basepair.BPNet import BPNetPredictor
In [195]:
# Predictor wrapper bundling the currently loaded model, the reference genome
# fasta and the bias model; used below to plot predictions + input gradients.
bp = BPNetPredictor(model, ds.fasta_file, tasks, bias_model=BiasModel(ds))
In [122]:
from kipoi.data_utils import get_dataset_item
In [149]:
bp.model.inputs[0]
Out[149]:
<tf.Tensor 'seq_2:0' shape=(?, 1000, 4) dtype=float32>
In [130]:
assert bp.model.inputs[0].name[:3] == "seq"
In [136]:
bp.model.input_names
Out[136]:
['seq', 'bias/profile/DNase', 'bias/counts/DNase']
In [132]:
bp.model.inputs
Out[132]:
[<tf.Tensor 'seq_2:0' shape=(?, 1000, 4) dtype=float32>,
 <tf.Tensor 'bias/profile/DNase_1:0' shape=(?, 1000, 1) dtype=float32>,
 <tf.Tensor 'bias/counts/DNase_1:0' shape=(?, 1) dtype=float32>]
In [150]:
bp.input_grad(get_dataset_item(x, high_bias_idx), strand='pos').shape
Out[150]:
(10, 1000, 4)
In [1]:
from kipoi.writers import HDF5BatchWriter
In [165]:
from pybedtools import BedTool
In [176]:
bt = list(BedTool.from_dataframe(dl_test.dfm.iloc[high_bias_idx]))
In [159]:
list((1,2,3))
Out[159]:
[1, 2, 3]
In [163]:
list(({"seq": 1}, )[1:])
Out[163]:
[]
In [196]:
# non bias-corrected
bp.plot_predict_grad(bt)
In [193]:
# bias-corrected
bp.plot_predict_grad(bt)
In [156]:
(1,) + (1,)[1:]
Out[156]:
(1,)
In [146]:
bt.head()
chr9	93770266	93771266	DNase
 chr9	48314979	48315979	DNase
 chr8	10184950	10185950	DNase
 chr1	52770150	52771150	DNase
 chr1	49234690	49235690	DNase
 chr1	44511468	44512468	DNase
 chr1	44461036	44462036	DNase
 chr8	15483104	15484104	DNase
 chr8	15482954	15483954	DNase
 chr1	150652590	150653590	DNase
 
In [ ]:
bp.plot_predict_grad()