# [x] train a few DNase models (random search)
# [ ] train single-task model
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
from keras.models import load_model
from basepair.config import create_tf_session, get_data_dir
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.config import valid_chr, test_chr
# --- Environment & data setup -------------------------------------------------
ddir = get_data_dir()   # project data root (from basepair config)
create_tf_session(0)    # create the TF session on GPU 0
dpath = Path("/mnt/lab_data/users/avsec/dnase_bias/")
SEQ_WIDTH = 1000        # input sequence / peak width in bp

# Single-task data specification: stranded DNase count bigwigs plus
# genome-wide tiled windows (stride/width 1000, blacklist-filtered) as "peaks".
ds = DataSpec(
    task_specs={"DNase": TaskSpec(task="DNase",
                                  pos_counts=dpath / "IMR90_Naked_DNase.pos.bw",
                                  neg_counts=dpath / "IMR90_Naked_DNase.neg.bw",
                                  peaks=dpath / "genome.stride1000.w1000.nonblacklist.bed")},
    fasta_file=dpath / "GRCh38.genome.fa",
)

# Training loader: all chromosomes except validation + test ones.
dl = StrandedProfile(ds,
                     excl_chromosomes=valid_chr+test_chr,
                     peak_width=SEQ_WIDTH, shuffle=True,
                     target_transformer=AppendCounts())  # also append count targets
# Validation loader: validation chromosomes only.
dl_valid = StrandedProfile(ds,
                           incl_chromosomes=valid_chr,
                           peak_width=SEQ_WIDTH,
                           shuffle=True,
                           target_transformer=AppendCounts())
# Test loader (shuffled here; re-created with shuffle=False later for evaluation).
dl_test = StrandedProfile(ds,
                          incl_chromosomes=test_chr,
                          peak_width=SEQ_WIDTH,
                          shuffle=True,
                          target_transformer=AppendCounts())
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_multitask
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
def get_model(mfn, mkwargs):
    """Instantiate a model-builder by name and derive its checkpoint path.

    Args:
        mfn: name of a model-builder function available in this module's
            globals (e.g. "seq_multitask").
        mkwargs: keyword arguments forwarded to the builder.

    Returns:
        Tuple `(model, name, ckp_file)` where `name` encodes the builder,
        its hyper-parameters and a timestamp (so parallel random-search
        runs get unique names) and `ckp_file` is the .h5 checkpoint path.
    """
    import datetime
    import os

    mdir = f"{ddir}/processed/dnase/exp/models/background"
    name = mfn + "_" + \
        ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
        "." + str(datetime.datetime.now()).replace(" ", "::")
    # Portable replacement for the IPython-only `!mkdir -p {mdir}` magic.
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    # Look the builder up by name instead of eval(): same behavior for valid
    # function names, without evaluating arbitrary expressions.
    model_fn = globals()[mfn]
    return model_fn(**mkwargs), name, ckp_file
# --- Hyper-parameters (one random-search configuration) -----------------------
mfn = "seq_multitask"
mkwargs = dict(filters=32,
               conv1_kernel_size=21,
               tconv_kernel_size=25,
               n_dil_layers=6,
               tasks=['DNase'],
               seq_len=SEQ_WIDTH,
               lr=0.004)
# TODO - how to deal with counts which are not provided?
# - directly train the count prediction model by multiplying the loss-functions
# best valid so far:
model, name, ckp_file = get_model(mfn, mkwargs)

# --- Training -----------------------------------------------------------------
batch_size=256
history = model.fit_generator(dl.batch_train_iter(batch_size=batch_size, num_workers=28),
                              steps_per_epoch=(len(dl)//batch_size) // 20,  # Evaluate 20x more frequently
                              epochs=100,
                              validation_data=dl_valid.batch_train_iter(batch_size=batch_size, num_workers=28),
                              validation_steps=(len(dl_valid)//batch_size) // 100,
                              callbacks=[EarlyStopping(patience=5),
                                         History(),
                                         # only keep the best-val-loss weights on disk
                                         ModelCheckpoint(ckp_file, save_best_only=True)]
                              )
# get the best model: reload the checkpoint written by ModelCheckpoint
# (custom loss/layer classes must be passed explicitly for deserialization)
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll,
                                             "SpatialLifetimeSparsity": SpatialLifetimeSparsity})
ckp_file  # notebook display: show the checkpoint path
# Inspect previously trained background models on disk (IPython shell magics):
!cat {ddir}/processed/dnase/exp/models/background/models/w=1000/dataspec.yaml
!ls -latr {ddir}/processed/dnase/exp/models/background/models/
# NOTE(review): the two hard-coded checkpoint paths below load older runs;
# only the second assignment (2018-08-13, tconv_kernel_size=25) remains in `model`.
ckp_file = f"{ddir}/processed/dnase/exp/models/background/seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=51,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-09::07:23:11.054472.h5"
model = load_model(ckp_file)
ckp_file = f"{ddir}/processed/dnase/exp/models/background/seq_multitask_filters=32,conv1_kernel_size=21,tconv_kernel_size=25,n_dil_layers=6,tasks=['DNase'],seq_len=1000,lr=0.004.2018-08-13::07:39:01.200919.h5"
model = load_model(ckp_file)
from basepair.preproc import bin_counts
# TODO - write a generic plotter
class Seq2Profile:
    """Plot predicted vs. observed strand-specific profiles for a model.

    Holds one-hot sequences `x`, observed targets `y` (dict containing
    "profile/<task>" arrays indexed as [example, position, strand]) and a
    trained model; predictions are computed and softmax-normalized once at
    construction time.
    """

    def __init__(self, x, y, model):
        self.x = x
        self.y = y
        self.model = model
        # Make the prediction once; softmax turns each profile head's
        # logits into per-position probabilities.
        self.y_pred = [softmax(y) for y in model.predict(x)]

    def plot(self, n=10, kind='test', sort='random', figsize=(20, 2), binsize=1, fpath_template=None):
        """Plot `n` examples: predicted profile (left) vs. observed (right).

        Args:
            n: number of examples to plot.
            kind: unused - kept for interface compatibility (it is
                immediately shadowed when `sort` contains "_").
            sort: 'random', or '<max|sum>_<task>' to pick the examples with
                the highest max/summed counts for that task.
            figsize: matplotlib figure size per example.
            binsize: bin width used to smooth the plotted profiles.
            fpath_template: unused - kept for interface compatibility.

        Raises:
            ValueError: if `sort` cannot be interpreted.
        """
        import matplotlib.pyplot as plt
        if sort == 'random':
            idx_list = samplers.random(self.x, n)
        elif "_" in sort:
            kind, task = sort.split("_")
            if kind == "max":
                idx_list = samplers.top_max_count(self.y["profile/" + task], n)
            elif kind == "sum":
                idx_list = samplers.top_sum_count(self.y["profile/" + task], n)
            else:
                raise ValueError("")
        else:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        for i, idx in enumerate(idx_list):
            task = "DNase"  # NOTE(review): hard-coded; overrides the task parsed from `sort`
            fig = plt.figure(figsize=figsize)
            plt.subplot(121)
            if i == 0:
                plt.title("Predicted DNase")
            # (a dead `bin_counts(...)[idx, :, 0]` statement that discarded its
            # result was removed here)
            # NOTE: the argmax annotations use the un-binned profile, so the
            # reported max position is in original (un-binned) coordinates.
            plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 0],
                     label='pos,m={}'.format(np.argmax(self.y_pred[0][idx, :, 0])))
            plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 1],
                     label='neg,m={}'.format(np.argmax(self.y_pred[0][idx, :, 1])))
            plt.legend()
            plt.subplot(122)
            if i == 0:
                plt.title("Observed DNase")
            plt.plot(bin_counts(self.y["profile/" + task], binsize=binsize)[idx, :, 0],
                     label='pos,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 0])))
            plt.plot(bin_counts(self.y["profile/" + task], binsize=binsize)[idx, :, 1],
                     label='neg,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 1])))
            plt.legend()
# --- Test-set evaluation ------------------------------------------------------
# Re-create the test loader WITHOUT shuffling so that predictions line up
# row-by-row with targets and metadata.
dl_test = StrandedProfile(ds,
                          incl_chromosomes=test_chr,
                          peak_width=SEQ_WIDTH,
                          shuffle=False,
                          target_transformer=AppendCounts())
test = dl_test.load_all(num_workers=28)
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair import samplers
from basepair.math import softmax
from kipoi.writers import HDF5BatchWriter
y_pred = model.predict(test['inputs'], verbose=1)
y_true = test["targets"]
# Persist raw predictions for later analysis (IPython shell magic below).
!mkdir -p {ddir}/processed/dnase/exp/predictions/single-task/
obj = HDF5BatchWriter(f"{ddir}/processed/dnase/exp/predictions/single-task/model-2018-08-09::04:17:01.267710.h5")
obj.batch_write(y_pred)
obj.close()
len(y_pred)  # notebook display
test_chr     # notebook display
#test['metadata']['range']['chr'] == 'chr9'
# Get the most interesting regions
top_counts = samplers.top_sum_count(y_true["profile/DNase"], 10)
# Total-count regression scatter.
# NOTE(review): the identical call was re-executed several times in the
# notebook; the repetitions below produce the same plot.
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
task = "DNase"
yp = softmax(y_pred[ds.task2idx(task, "profile")])  # profile logits -> probabilities
yt = y_true["profile/" + task]
df = eval_profile(yt, yp)  # profile evaluation metrics table
df  # notebook display
# Visual spot-check: random examples and the highest-summed-count examples.
pl = Seq2Profile(test["inputs"], y_true, model)
pl.plot(n=20, sort='random', binsize=1)
pl.plot(n=10, sort='sum_DNase', binsize=1)
# --- Baseline: pyDNase 6-mer cut-bias model -----------------------------------
from kipoi.readers import HDF5Reader
import random
# Run src/dnase/benchmark-kmer.py to obtain this file
# K-mers obtained from https://github.com/jpiper/pyDNase/blob/master/pyDNase/data/IMR90_6mer.pickle
r = HDF5Reader(f"{ddir}/processed/dnase/6mer/IMR90_6mer.preds.h5")
r.open()
r.ls()  # notebook display: inspect the HDF5 layout
preds = r.f['/preds'][:]
targets = r.f['/targets/profile/DNase'][:].sum(axis=-1)  # sum over the strand axis
# Trim 3 bp on the left and 2 bp on the right - presumably the flanks the
# 6-mer model cannot score; TODO confirm against benchmark-kmer.py.
y_true = targets[:,3:(1000-2)]
y_true_l = np.ravel(y_true)
y_pred_l = np.ravel(preds)
y_true_l[:5]    # notebook display
y_true_l.max()  # notebook display
idx  # NOTE(review): used before its assignment below - notebook cells were run out of order
idx = np.unique(random.sample(range(0, len(y_pred_l)), 1000000))  # subsample 1M points for the scatter
plt.scatter(y_true_l[idx], y_pred_l[idx], alpha=0.1)  # NOTE(review): `plt` is never imported at module level in this file
plt.xlabel("true values")
plt.ylabel("predictions")
from basepair.plots import regression_eval
yt_counts = np.log(1+y_true.sum(axis=-1))  # log1p of total counts per region
yp_counts = np.log(1+preds.sum(axis=-1))
# 3:end - 2
regression_eval(yt_counts, yp_counts, alpha=0.1)
# 2:end - 3
regression_eval(yt_counts, yp_counts, alpha=0.1)
def get_performance(mdir):
    """Return the best (lowest `val_loss`) epoch's metrics for one model run.

    Args:
        mdir: model directory expected to contain a Keras `history.csv`
            with a `val_loss` column.

    Returns:
        dict mapping metric name -> value for the epoch with the minimal
        `val_loss`, or {} when the history is missing or unreadable
        (best-effort: incomplete runs simply contribute no metrics).
    """
    try:
        dfh = pd.read_csv(f"{mdir}/history.csv")
        # Label-based idxmin/loc instead of positional argmin/iloc; identical
        # for read_csv's default RangeIndex, and correct for any index.
        return dict(dfh.loc[dfh["val_loss"].idxmin()])
    except Exception:
        # Was a bare `except:` - at least let KeyboardInterrupt/SystemExit through.
        return {}
# --- Compare random-search runs and evaluate the chosen one -------------------
mdir = Path(f"{ddir}/processed/dnase/exp/models/background/models")
mdir.name  # notebook display
# One row per run directory: best-epoch metrics plus the run name.
df_exp = pd.DataFrame([{**get_performance(md), "name":md.name}
                       for md in mdir.iterdir()])
df_exp.sort_values("val_counts/DNase_loss")  # notebook display: rank runs by count loss
# NOTE(review): three candidate "best" models were tried in turn; only the
# last assignment (w200) is actually loaded below.
best_model = mdir / "filters=128/model.h5"
best_model = mdir / "n_dil_layers=1/model.h5"
best_model = mdir / "w200/model.h5"
import basepair
from basepair.losses import MultichannelMultinomialNLL
from keras.models import load_model
mdir  # notebook display
# TODO - unable to load the model as multiple might have the same name
model = load_model(best_model, custom_objects={"mc_multinomial_nll": MultichannelMultinomialNLL(1),
                                               "MultichannelMultinomialNLL": MultichannelMultinomialNLL(1)})  # TODO - fix the loss function
# Test loader matching this model's 200 bp training width (not SEQ_WIDTH).
dl_test = StrandedProfile(ds,
                          incl_chromosomes=test_chr,
                          peak_width=200, #SEQ_WIDTH,
                          shuffle=False,
                          target_transformer=AppendCounts())
test = dl_test.load_all(num_workers=28)
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair import samplers
from basepair.math import softmax
from kipoi.writers import HDF5BatchWriter
y_pred = model.predict(test['inputs'], verbose=1)
y_true = test["targets"]
y_true['counts/DNase'].mean(-1).mean()  # notebook display
y_true['counts/DNase'].mean(-1).std()   # notebook display
# NOTE(review): identical call repeated from notebook re-execution.
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
regression_eval(y_true['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1), alpha=0.1)
df_exp.head()  # notebook display
# IPython automagic (plain `ls`): list the model directories on disk.
ls {ddir}/processed/dnase/exp/models/background/models