from basepair.cli.schemas import DataSpec, TaskSpec
from basepair.datasets import chip_exo_nexus
from basepair.preproc import AppendTotalCounts
from basepair.config import get_data_dir, create_tf_session
# Restrict TensorFlow to GPUs 1 and 2.
import os

# FIX: the original value was "1, 2" — CUDA parses CUDA_VISIBLE_DEVICES as a
# comma-separated list and is not guaranteed to trim whitespace around
# entries, so the stray space could silently drop device 2.
os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"

# ddir: output directory for model checkpoints (used by get_model below).
# bdir: input directory holding the per-task bigwig tracks and peak files.
ddir = '/users/amr1/bpnet/basepair/src/chipseq/'
bdir = "/users/amr1/bpnet/sox2-oct4-chipseq"
# Wire the Sox2/Oct4 ChIP-seq coverage tracks and their IDR-filtered peak
# calls into a single DataSpec against the mm10 reference genome.
sox2_spec = TaskSpec(
    task="Sox2",
    pos_counts=f"{bdir}/Sox2/pos.bw",
    neg_counts=f"{bdir}/Sox2/neg.bw",
    peaks=f"{bdir}/Sox2/Sox2_1_rep1-pr.IDR0.05.filt.12-col.bed.gz",
)
oct4_spec = TaskSpec(
    task="Oct4",
    pos_counts=f"{bdir}/Oct4/pos2.bw",
    neg_counts=f"{bdir}/Oct4/neg2.bw",
    peaks=f"{bdir}/Oct4/Oct4_12_ppr.IDR0.05.filt.12-col.bed.gz",
)
ds = DataSpec(
    task_specs={"Sox2": sox2_spec, "Oct4": oct4_spec},
    fasta_file="/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta",
)
def ds2bws(ds):
    """Collect each task's pos/neg bigwig paths from a DataSpec.

    Args:
        ds: object with a ``task_specs`` mapping of task name -> spec,
            where each spec exposes ``pos_counts`` and ``neg_counts``.

    Returns:
        dict mapping task name -> {"pos": pos_counts, "neg": neg_counts}.
    """
    bigwigs = {}
    for task_name, spec in ds.task_specs.items():
        bigwigs[task_name] = {"pos": spec.pos_counts, "neg": spec.neg_counts}
    return bigwigs
# Build train/valid/test splits from the DataSpec, with each peak resized to
# a 1000 bp window. Each split appears to be an (inputs, targets) structure —
# targets indexed by keys like 'profile/Sox2' (see below); confirm against
# chip_exo_nexus.
train, valid, test = chip_exo_nexus(ds, peak_width=1000)
# Notebook-residue inspection: these bare expressions have no effect when run
# as a script; presumably used interactively to sanity-check target shapes.
train[1]['profile/Sox2'].shape
train[1].keys()
def seq_multitask_chipseq(filters=21,
                          conv1_kernel_size=21,
                          tconv_kernel_size=25,
                          n_dil_layers=6,
                          lr=0.004,
                          c_task_weight=100,
                          use_profile=True,
                          use_counts=True,
                          tasks=('sox2', 'oct4'),
                          seq_len=1000):
    """Build and compile a multi-task ChIP-seq model over one-hot DNA input.

    Architecture: a wide first Conv1D, then ``n_dil_layers`` dilated Conv1D
    layers (dilation 2**i) whose outputs are residually summed, followed by
    a Conv2DTranspose "de-conv" head producing a 2-channel per-base profile
    per task, and a global-average-pool + Dense head producing 2 count
    values per task.

    Args:
        filters: number of filters in every convolutional layer.
        conv1_kernel_size: kernel size of the first Conv1D.
        tconv_kernel_size: kernel size of the transposed convolution.
        n_dil_layers: number of dilated convolution layers.
        lr: Adam learning rate.
        c_task_weight: how much to up-weight the count-prediction loss
            (profile losses have weight 1).
        use_profile: if True, add the per-base profile output heads.
        use_counts: if True, add the total-count output heads.
        tasks: sequence of task names; one head (per enabled output type)
            is created per task. (Default changed from a mutable list to an
            equivalent tuple to avoid the shared-mutable-default pitfall.)
        seq_len: input sequence length; input shape is (seq_len, 4).

    Returns:
        A compiled keras ``Model`` named outputs "profile/<task>" and/or
        "counts/<task>".
    """
    inp = kl.Input(shape=(seq_len, 4), name='seq')
    first_conv = kl.Conv1D(filters,
                           kernel_size=conv1_kernel_size,
                           padding='same',
                           activation='relu')(inp)
    # Dilated residual stack: layer i sees the sum of all previous outputs.
    prev_layers = [first_conv]
    for i in range(1, n_dil_layers + 1):
        prev_sum = first_conv if i == 1 else kl.add(prev_layers)
        conv_output = kl.Conv1D(filters,
                                kernel_size=3,
                                padding='same',
                                activation='relu',
                                dilation_rate=2**i)(prev_sum)
        prev_layers.append(conv_output)
    combined_conv = kl.add(prev_layers)
    # De-conv head: run Conv2DTranspose over a (length, 1) spatial grid so
    # each task gets a contiguous pair of output channels.
    x = kl.Reshape((-1, 1, filters))(combined_conv)
    x = kl.Conv2DTranspose(2 * len(tasks),
                           kernel_size=(tconv_kernel_size, 1),
                           padding='same')(x)
    out = kl.Reshape((-1, 2 * len(tasks)))(x)
    # Assemble the per-task output branches, losses, and loss weights.
    outputs = []
    losses = []
    loss_weights = []
    if use_profile:
        # Slice channel pair (2*i, 2*i+1) per task; `arguments={"i": i}`
        # binds i eagerly, avoiding the late-binding-closure bug.
        output = [kl.Lambda(lambda x, i: x[:, :, (2 * i):(2 * i + 2)],
                            output_shape=(seq_len, 2),
                            name="profile/" + task,
                            arguments={"i": i})(out)
                  for i, task in enumerate(tasks)]
        outputs += output
        losses += [twochannel_multinomial_nll] * len(tasks)
        loss_weights += [1] * len(tasks)
    if use_counts:
        pooled = kl.GlobalAvgPool1D()(combined_conv)
        counts = [kl.Dense(2, name="counts/" + task)(pooled)
                  for task in tasks]
        outputs += counts
        losses += ["mse"] * len(tasks)
        loss_weights += [c_task_weight] * len(tasks)
    model = Model(inp, outputs)
    model.compile(Adam(lr=lr), loss=losses, loss_weights=loss_weights)
    return model
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
import keras.layers as kl
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
i = 1  # global run counter; bumped before each get_model call so names differ
def get_model(mfn, mkwargs, fkwargs, i):
    """Instantiate the model named by ``mfn`` and derive its checkpoint path.

    Args:
        mfn: name of a model-building function defined in this module
            (resolved with eval).
        mkwargs: model hyper-parameters; also encoded into the run name.
        fkwargs: fixed kwargs merged over mkwargs but not put in the name.
        i: run index appended to the name to keep checkpoints distinct.

    Returns:
        Tuple of (model, name, ckp_file).
    """
    mdir = f"{ddir}/"
    name = mfn + "_" + \
        ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
        "." + str(i)
    # FIX: the notebook original used the IPython magic `!mkdir -p {mdir}`,
    # which is not valid Python in a .py file; os.makedirs is equivalent.
    # Also dropped `i += 1` on the local parameter — it had no effect
    # outside this function (the caller increments its own counter).
    os.makedirs(mdir, exist_ok=True)
    ckp_file = f"{mdir}/{name}.h5"
    all_kwargs = {**mkwargs, **fkwargs}
    # NOTE(review): eval() of a caller-supplied name is only acceptable
    # because mfn comes from this script; never pass untrusted strings here.
    return eval(mfn)(**all_kwargs), name, ckp_file
# hyper-parameters
mfn = "seq_multitask_chipseq"  # name of the model-building function (eval'd in get_model)
use_profile = True  # train the per-base profile heads
use_counts = True   # train the total-count heads
# Hyper-parameters that get encoded into the checkpoint/run name.
mkwargs = dict(filters=32,
               conv1_kernel_size=21,
               tconv_kernel_size=100,
               n_dil_layers=6,
               use_profile=use_profile,
               use_counts=use_counts,
               c_task_weight=10,
               # infer the input sequence length from the training data
               seq_len=train[0].shape[1],
               lr=0.004)
# Kwargs passed to the model function but kept out of the run name.
fixed_kwargs = dict(
    tasks=list(ds.task_specs)
)
import numpy as np
np.random.seed(20)  # fix the seed for reproducibility
i += 1  # bump the global run counter so this run's checkpoint name is unique
model, name, ckp_file = get_model(mfn, mkwargs, fixed_kwargs, i)
# Train with early stopping; the best model (by validation loss, since
# save_best_only=True) is checkpointed to ckp_file.
history = model.fit(train[0],
                    train[1],
                    batch_size=256,
                    epochs=100,
                    validation_data=valid[:2],
                    callbacks=[EarlyStopping(patience=5),
                               History(),
                               ModelCheckpoint(ckp_file, save_best_only=True)]
                    )
# get the best model
# The custom loss must be passed to load_model so keras can deserialize it.
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
from basepair.eval import evaluate
# Final evaluation on the validation split (inputs, targets).
evaluate(model, valid[0], valid[1])