# Use gpus 0,1
import os
# Restrict the visible CUDA devices. This is set before `basepair` (and any
# GPU framework it may pull in) is imported below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 1"
import time
from pathlib import Path
import sys
# Make the directory two levels above the current working directory importable
# (presumably the repository root that holds the `basepair` package -- confirm).
sys.path.append(str(Path(os.getcwd()).absolute().parent.parent))
# Also expose the site-packages of one specific conda environment.
# NOTE(review): machine-specific absolute path.
sys.path.append('/opt/miniconda3/envs/basepair/lib/python3.6/site-packages')
import basepair
from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from basepair.preproc import AppendTotalCounts
from basepair.config import get_data_dir, create_tf_session
# Root directory under which get_model() places model checkpoints.
ddir = '/home/prime/data'
# Directory holding the per-factor count tracks and peak files.
bdir = "/data/sox2-oct4-chipseq/"

# One TaskSpec per transcription factor: strand-specific count tracks
# plus the peak file.
sox2_spec = TaskSpec(
    task="Sox2",
    pos_counts=f"{bdir}/Sox2/pos.bw",
    neg_counts=f"{bdir}/Sox2/neg.bw",
    peaks=f"{bdir}/Sox2/Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz",
)
oct4_spec = TaskSpec(
    task="Oct4",
    pos_counts=f"{bdir}/Oct4/pos2.bw",
    neg_counts=f"{bdir}/Oct4/neg2.bw",
    peaks=f"{bdir}/Oct4/Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz",
)

# Data specification shared by the rest of the script; the task order
# (Sox2, then Oct4) is preserved.
ds = DataSpec(
    task_specs={"Sox2": sox2_spec, "Oct4": oct4_spec},
    fasta_file="/data/mm10_no_alt_analysis_set_ENCODE.fasta",
)
def ds2bws(ds):
    """Collect the strand-specific count tracks of every task in ``ds``.

    Args:
      ds: object exposing a ``task_specs`` mapping of task name to a spec
        with ``pos_counts`` and ``neg_counts`` attributes.

    Returns:
      dict of the form ``{task: {"pos": pos_counts, "neg": neg_counts}}``,
      in the iteration order of ``ds.task_specs``.
    """
    bigwigs = {}
    for name, spec in ds.task_specs.items():
        bigwigs[name] = {"pos": spec.pos_counts, "neg": spec.neg_counts}
    return bigwigs
# Get the training data and report how long the loading took
start = time.time()
train, valid, test = chip_exo_nexus(ds, peak_width=1000)
end = time.time() - start  # elapsed seconds (not a timestamp, despite the name)
print(f'Time taken: {end}')
# Notebook-style inspection of the Sox2 profile target's shape
train[1]['profile/Sox2'].shape
# TODO - play around with this
def seq_multitask_chipseq(filters=21,
                          conv1_kernel_size=21,
                          tconv_kernel_size=25,
                          #tconv_kernel_size2=25,
                          n_dil_layers=6,
                          lr=0.004,
                          c_task_weight=100,
                          use_profile=True,
                          use_counts=True,
                          tasks=('sox2', 'oct4'),
                          seq_len=201):
    """Build and compile a dilated-convolution multi-task model.

    The body is one ``Conv1D`` layer followed by ``n_dil_layers`` dilated
    ``Conv1D`` layers with dense skip connections: each dilated layer
    receives the sum of all previous layer outputs. Two kinds of output
    heads can be attached, in this order:

    * profile heads (``profile/<task>``): a transposed convolution produces
      ``2 * len(tasks)`` channels, and each task gets its own pair of
      channels, trained with ``twochannel_multinomial_nll`` (weight 1);
    * count heads (``counts/<task>``): global average pooling over the
      summed convolution outputs followed by ``Dense(2)``, trained with
      MSE (weight ``c_task_weight``).

    Args:
      filters: number of filters in every ``Conv1D`` layer.
      conv1_kernel_size: kernel size of the first convolution.
      tconv_kernel_size: kernel size (along the sequence axis) of the
        transposed convolution feeding the profile heads.
      n_dil_layers: number of dilated convolutions; layer ``i`` (1-based)
        uses ``dilation_rate=2**i``. ``0`` is allowed and leaves only the
        first convolution.
      lr: learning rate passed to ``Adam``.
      c_task_weight: how to upweight the count-prediction task.
      use_profile: attach the per-task profile heads.
      use_counts: attach the per-task count heads.
      tasks: sequence of task names; used for naming the outputs.
      seq_len: length of the one-hot encoded input sequence.

    Returns:
      A compiled Keras ``Model`` taking the ``seq`` input.

    Raises:
      ValueError: if both ``use_profile`` and ``use_counts`` are false,
        since the model would have no outputs.
    """
    # Fail early (before any layer is built) on a configuration without heads.
    if not (use_profile or use_counts):
        raise ValueError("At least one of use_profile and use_counts must be True")

    def _sum_layers(layers):
        # Keras merge layers require at least two inputs, so a single
        # tensor is passed through unchanged.
        return layers[0] if len(layers) == 1 else kl.add(layers)

    # TODO - build the reverse complement symmetry into the model
    inp = kl.Input(shape=(seq_len, 4), name='seq')
    first_conv = kl.Conv1D(filters,
                           kernel_size=conv1_kernel_size,
                           padding='same',
                           activation='relu')(inp)

    prev_layers = [first_conv]
    for i in range(1, n_dil_layers + 1):
        prev_sum = _sum_layers(prev_layers)
        conv_output = kl.Conv1D(filters, kernel_size=3, padding='same',
                                activation='relu', dilation_rate=2**i)(prev_sum)
        prev_layers.append(conv_output)
    combined_conv = _sum_layers(prev_layers)

    # De-conv
    x = kl.Reshape((-1, 1, filters))(combined_conv)
    x = kl.Conv2DTranspose(2 * len(tasks), kernel_size=(tconv_kernel_size, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(len(tasks), kernel_size=(tconv_kernel_size2, 1), padding='same')(x)
    #x = kl.UpSampling2D((2, 1))(x)
    #x = kl.Conv2DTranspose(int(len(tasks)/2), kernel_size=(tconv_kernel_size3, 1), padding='same')(x)
    out = kl.Reshape((-1, 2 * len(tasks)))(x)

    # setup the output branches
    outputs = []
    losses = []
    loss_weights = []
    if use_profile:
        # Task i owns channels [2*i, 2*i + 2) of the shared output tensor.
        # `i` is passed through `arguments` so each Lambda keeps its own index.
        output = [kl.Lambda(lambda x, i: x[:, :, (2 * i):(2 * i + 2)],
                            output_shape=(seq_len, 2),
                            name="profile/" + task,
                            arguments={"i": i})(out)
                  for i, task in enumerate(tasks)]
        # true counts size is (tasks, 1000, 2)
        outputs += output
        losses += [twochannel_multinomial_nll] * len(tasks)
        loss_weights += [1] * len(tasks)
    if use_counts:
        pooled = kl.GlobalAvgPool1D()(combined_conv)
        counts = [kl.Dense(2, name="counts/" + task)(pooled)
                  for task in tasks]
        outputs += counts
        losses += ["mse"] * len(tasks)
        loss_weights += [c_task_weight] * len(tasks)

    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=losses, loss_weights=loss_weights)
    return model
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
# Run counter; it is appended to the model name so that repeated runs with
# the same hyper-parameters get distinct checkpoint files.
i = 1


def get_model(mfn, mkwargs, fkwargs, i):
    """Instantiate a model and derive its name and checkpoint path.

    Args:
      mfn: name of the model-building function; it is resolved with
        ``eval`` in this module's namespace.
      mkwargs: hyper-parameters. They are passed to the model function and
        also encoded (as ``key=value`` pairs, in dict order) into the name.
      fkwargs: additional keyword arguments passed to the model function
        but left out of the name. They win over ``mkwargs`` on key clashes.
      i: run index appended to the name. The caller's variable is not
        modified.

    Returns:
      Tuple ``(model, name, ckp_file)`` where ``ckp_file`` is the ``.h5``
      path under ``{ddir}/processed/chipseq/exp/models/count+profile``.
      The directory is created if it does not exist yet.
    """
    mdir = f"{ddir}/processed/chipseq/exp/models/count+profile"
    hyper_params = ",".join(f'{k}={v}' for k, v in mkwargs.items())
    name = mfn + "_" + hyper_params + "." + str(i)
    # Portable replacement for the notebook-only `!mkdir -p {mdir}`.
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    all_kwargs = {**mkwargs, **fkwargs}
    # NOTE(review): eval() executes arbitrary expressions -- only ever pass
    # trusted, hard-coded function names as `mfn`.
    return eval(mfn)(**all_kwargs), name, ckp_file
# hyper-parameters
# Name of the model-building function; get_model() resolves it by name.
mfn = "seq_multitask_chipseq"
use_profile = True
use_counts = False
# Keyword arguments that get_model() also encodes into the model name.
# Key order matters because it determines the generated name.
mkwargs = {
    "filters": 32,
    "conv1_kernel_size": 21,
    "tconv_kernel_size": 100,
    "n_dil_layers": 6,
    "use_profile": use_profile,
    "use_counts": use_counts,
    "c_task_weight": 10,
    "seq_len": train[0].shape[1],
    "lr": 0.004,
}
# Keyword arguments passed to the model function but kept out of the name.
fixed_kwargs = {"tasks": list(ds.task_specs)}
import numpy as np
np.random.seed(20)
# Bump the run counter so that this run gets its own model name / checkpoint.
i += 1
model, name, ckp_file = get_model(mfn, mkwargs, fixed_kwargs, i)
# Early stopping with a patience of 5 epochs; only the best model is
# written to the checkpoint file.
training_callbacks = [
    EarlyStopping(patience=5),
    History(),
    ModelCheckpoint(ckp_file, save_best_only=True),
]
history = model.fit(
    train[0],
    train[1],
    batch_size=256,
    epochs=100,
    validation_data=valid[:2],
    callbacks=training_callbacks,
)
# get the best model
# The custom loss has to be supplied explicitly when loading the checkpoint.
custom_objects = {"twochannel_multinomial_nll": twochannel_multinomial_nll}
model = load_model(ckp_file, custom_objects=custom_objects)
from basepair.eval import evaluate
evaluate(model, valid[0], valid[1])
from basepair.math import softmax
from basepair import samplers
from basepair.preproc import bin_counts
import numpy as np
class Seq2Sox2Oct4:
    """Plot predicted next to observed Sox2 / Oct4 profiles.

    The predictions are computed once at construction time. ``y`` is indexed
    with the keys ``"profile/Sox2"`` and ``"profile/Oct4"``; the first two
    model outputs are plotted as the Sox2 and Oct4 predictions respectively.
    """

    def __init__(self, x, y, model):
        """Store the data and predict.

        Args:
          x: model input, passed to ``model.predict``.
          y: observed targets, indexable by ``"profile/<task>"``.
          model: object with a ``predict`` method returning one array per
            output.
        """
        self.x = x
        self.y = y
        self.model = model
        # Make the prediction
        self.y_pred = [softmax(y_task) for y_task in model.predict(x)]

    def _select_indices(self, n, sort):
        """Return the example indices to plot for the given ``sort`` mode."""
        if sort == 'random':
            return samplers.random(self.x, n)
        if "_" not in sort:
            raise ValueError(f"sort={sort} couldn't be interpreted")
        # Split only once so that task names may themselves contain "_".
        sort_kind, task = sort.split("_", 1)
        if sort_kind == "max":
            return samplers.top_max_count(self.y[f"profile/{task}"], n)
        if sort_kind == "sum":
            return samplers.top_sum_count(self.y[f"profile/{task}"], n)
        raise ValueError(
            f"sort={sort} couldn't be interpreted: unknown kind '{sort_kind}', "
            "expected 'max' or 'sum'")

    def plot(self, n=10, kind='test', sort='random', figsize=(20, 2), fpath_template=None, binsize=1):
        """Draw one four-panel figure per selected example.

        Panels, left to right: predicted Sox2, observed Sox2, predicted Oct4,
        observed Oct4. Each panel shows channel 0 ("pos") and channel 1
        ("neg"); the legend reports the argmax position of the un-binned
        profile.

        Args:
          n: number of examples to plot.
          kind: unused; kept for backward compatibility.
          sort: ``'random'``, or ``'max_<task>'`` / ``'sum_<task>'`` to pick
            the top examples of ``y["profile/<task>"]``.
          figsize: matplotlib figure size.
          fpath_template: if given, ``fpath_template.format(i)`` plus
            ``.png`` / ``.pdf`` is where figure ``i`` is saved.
          binsize: bin size passed to ``bin_counts`` for visualization.

        Raises:
          ValueError: if ``sort`` cannot be interpreted.
        """
        # Validate `sort` and pick the examples before touching matplotlib.
        idx_list = list(self._select_indices(n, sort))
        if not idx_list:
            return

        import matplotlib.pyplot as plt

        panels = [
            ("Predicted Sox2", self.y_pred[0]),
            ("Observed Sox2", self.y["profile/Sox2"]),
            ("Predicted Oct4", self.y_pred[1]),
            ("Observed Oct4", self.y["profile/Oct4"]),
        ]
        # for visualization, we use bucketize
        # Binning does not depend on the plotted example, so do it once.
        # NOTE(review): assumes bin_counts has no side effects.
        binned = [bin_counts(profile, binsize=binsize) for _, profile in panels]

        for i, idx in enumerate(idx_list):
            fig = plt.figure(figsize=figsize)
            for position, ((title, raw), coarse) in zip((141, 142, 143, 144),
                                                        zip(panels, binned)):
                plt.subplot(position)
                if i == 0:
                    # Only the first row of figures carries the panel titles.
                    plt.title(title)
                for channel, strand in enumerate(("pos", "neg")):
                    plt.plot(coarse[idx, :, channel],
                             label='{},m={}'.format(strand, np.argmax(raw[idx, :, channel])))
                plt.legend()
            if fpath_template is not None:
                plt.savefig(fpath_template.format(i) + '.png', dpi=600)
                plt.savefig(fpath_template.format(i) + '.pdf', dpi=600)
                plt.close(fig)  # close the figure
                # NOTE(review): show_figure is not defined or imported in the
                # visible part of this file -- confirm where it comes from.
                show_figure(fig)
                plt.show()
y_pred = model.predict(test[0])
from basepair.plots import regression_eval
# Observed vs. predicted counts per task, each averaged over the last axis.
sox2_observed = test[1]['counts/Sox2'].mean(-1)
sox2_predicted = y_pred[ds.task2idx("Sox2", 'counts')].mean(-1)
regression_eval(sox2_observed, sox2_predicted)
oct4_observed = test[1]['counts/Oct4'].mean(-1)
oct4_predicted = y_pred[ds.task2idx("Oct4", 'counts')].mean(-1)
regression_eval(oct4_observed, oct4_predicted)
# Profile plots for the 10 test examples selected by 'sum_Sox2'.
pl = Seq2Sox2Oct4(test[0], test[1], model)
pl.plot(n=10, sort='sum_Sox2', binsize=10)
from basepair.BPNet import BPNetPredictor
task_names = list(ds.task_specs)
# NOTE(review): `preproc` is not defined in the visible part of this file --
# confirm that it is created elsewhere before this line runs.
bpnet = BPNetPredictor(model, ds.fasta_file, task_names, preproc=preproc)
test[2].head()
from pybedtools import Interval, BedTool
first_regions = test[2][["chr", "start", "end"]][:5]
bt = BedTool.from_dataframe(first_regions)
# For some intervals from the genome, plot the observed and predicted profiles
bpnet.predict_plot(intervals=list(bt), bws=ds2bws(ds), profile_grad="weighted")