Train a basic model predicting DNase cuts.
import basepair
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from pathlib import Path
from basepair.config import create_tf_session, get_data_dir
ddir = get_data_dir()
create_tf_session(0)
dpath = Path("/srv/scratch/avsec/workspace/basepair-workflow/data/DNase")
SEQ_WIDTH = 1000
ds = DataSpec(
task_specs={"DNase": TaskSpec(task="DNase",
pos_counts=dpath / "signal/raw/merged.pos.bw",
neg_counts=dpath / "signal/raw/merged.neg.bw",
peaks=dpath / "raw/peaks/ENCFF426TTH.bed.gz")},
fasta_file="/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta",
)
train, valid, test = chip_exo_nexus(ds, peak_width=SEQ_WIDTH)
# TODO - preproc the data
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_multitask
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
model.summary()
def get_model(mfn, mkwargs):
"""Get the model"""
import datetime
mdir = f"{ddir}/processed/dnase/exp/models/single-task"
name = mfn + "_" + \
",".join([f'{k}={v}' for k,v in mkwargs.items()]) + \
"." + str(datetime.datetime.now()).replace(" ", "::")
!mkdir -p {mdir}
ckp_file = f"{mdir}/{name}.h5"
return eval(mfn)(**mkwargs), name, ckp_file
# hyper-parameters
mfn = "seq_multitask"
mkwargs = dict(filters=32,
conv1_kernel_size=21,
tconv_kernel_size=51,
n_dil_layers=6,
tasks=['DNase'],
seq_len=SEQ_WIDTH,
lr=0.004)
# best valid so far:
model, name, ckp_file = get_model(mfn, mkwargs)
history = model.fit(train[0],
train[1],
batch_size=256,
epochs=100,
validation_data=valid[:2],
callbacks=[EarlyStopping(patience=5),
History(),
ModelCheckpoint(ckp_file, save_best_only=True)]
)
# get the best model
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll,
"SpatialLifetimeSparsity": SpatialLifetimeSparsity})
a=1
from basepair.preproc import bin_counts
# TODO - write a generic plotter
class Seq2Profile:
def __init__(self, x, y, model):
self.x = x
self.y = y
self.model = model
# Make the prediction
self.y_pred = [softmax(y) for y in model.predict(x)]
def plot(self, n=10, kind='test', sort='random', figsize=(20, 2), binsize=1, fpath_template=None):
import matplotlib.pyplot as plt
if sort == 'random':
idx_list = samplers.random(self.x, n)
elif "_" in sort:
kind, task = sort.split("_")
if kind == "max":
idx_list = samplers.top_max_count(self.y["profile/" + task], n)
elif kind == "sum":
idx_list = samplers.top_sum_count(self.y["profile/" + task], n)
else:
raise ValueError("")
else:
raise ValueError(f"sort={sort} couldn't be interpreted")
for i, idx in enumerate(idx_list):
task = "DNase"
fig = plt.figure(figsize=figsize)
plt.subplot(121)
if i == 0:
plt.title("Predicted DNase")
bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 0]
plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 0],
label='pos,m={}'.format(np.argmax(self.y_pred[0][idx, :, 0])))
plt.plot(bin_counts(self.y_pred[0], binsize=binsize)[idx, :, 1],
label='neg,m={}'.format(np.argmax(self.y_pred[0][idx, :, 1])))
plt.legend()
plt.subplot(122)
if i == 0:
plt.title("Observed DNase")
plt.plot(bin_counts(self.y["profile/" + task], binsize=binsize)[idx, :, 0],
label='pos,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 0])))
plt.plot(bin_counts(self.y["profile/" + task], binsize=binsize)[idx, :, 1],
label='neg,m={}'.format(np.argmax(self.y["profile/" + task][idx, :, 1])))
plt.legend()
from basepair.plots import regression_eval
from basepair.cli.evaluate import eval_profile
y_pred = model.predict(test[0])
y_true = test[1]
regression_eval(test[1]['counts/DNase'].mean(-1), y_pred[ds.task2idx("DNase", 'counts')].mean(-1))
task = "DNase"
from basepair.math import softmax
yp = softmax(y_pred[ds.task2idx(task, "profile")])
yt = y_true["profile/" + task]
df = eval_profile(yt, yp)
df
a=1
pl = Seq2Profile(test[0], test[1], model)
from basepair import samplers
samplers.top_sum_count(test[1]['profile/DNase'], 10)
pl.plot(n=10, sort='sum_DNase', binsize=1)
pl.plot(n=10, sort='sum_DNase', binsize=10)
pl.plot(n=10, sort='max_DNase', binsize=1)
pl.plot(n=20, sort='random', binsize=1)
pl.plot(n=20, sort='random', binsize=10)