from basepair.imports import *
from basepair.datasets import *
from kipoi.data_utils import numpy_collate_concat
from basepair.config import valid_chr, test_chr
from basepair.utils import read_json
from concise.losses import binary_crossentropy_masked
from concise.metrics import accuracy
from concise.eval_metrics import auprc
from basepair.samplers import StratifiedRandomBatchSampler
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, History, CSVLogger, ModelCheckpoint
# Quick test of StratifiedRandomBatchSampler: 5 negatives, 2 positives,
# sampled with equal per-class probability into batches of 2
sampler = StratifiedRandomBatchSampler(np.array([0, 0, 0, 0, 0, 1, 1]), p_vec=[0.5, 0.5], batch_size=2)
list(sampler)
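# Sketch of what the sampler yields: each element is a list of dataset indices
# forming one batch; mapping the indices back to the labels shows the per-batch
# class mix (the exact indices depend on the RNG state).
labels = np.array([0, 0, 0, 0, 0, 1, 1])
for batch in StratifiedRandomBatchSampler(labels, p_vec=[0.5, 0.5], batch_size=2):
    print(batch, labels[list(batch)])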
create_tf_session(0)  # use GPU 0
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
# Load the model
model = load_model(model_dir / "model.h5")
batch_size = 256
num_workers = 8
intervalspec = read_json("/srv/scratch/avsec/workspace/chipnexus/data/processed/chipseq/labels/chipnexus/accessible/intervalspec.json")
tasks = intervalspec['task_names']
intervals_file = intervalspec['chipnexus']['intervals_file']
fasta_file = "/mnt/data/pipeline_genome_data/mm10/mm10_no_alt_analysis_set_ENCODE.fasta"
intervals_file
# Chromosome-based split: hold out the validation and test chromosomes from training
train = SeqClassification(intervals_file, fasta_file, excl_chromosomes=valid_chr + test_chr)
valid = SeqClassification(intervals_file, fasta_file, incl_chromosomes=valid_chr)
len(train)
len(valid)
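# Sanity check (sketch): the chromosome split should be leak-free -- the
# validation and test chromosome lists must not overlap, and training
# excludes both.
assert set(valid_chr).isdisjoint(set(test_chr))
print(f"train: {len(train)} intervals, valid: {len(valid)} intervals")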
transfer_to = "add_9"
inp = kl.Input((model.inputs[0].shape[1].value, 4), name='seq')
# Transferred part: everything from the input up to the `transfer_to` layer
tmodel = Model(model.inputs,
               model.get_layer(transfer_to).output)
# Freeze the whole transferred trunk (all layers up to and including `transfer_to`)
for l in tmodel.layers:
    l.trainable = False
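# Sanity check: nothing in the trunk should remain trainable.
assert not any(l.trainable for l in tmodel.layers)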
# define the top model
top_model = Sequential([
    kl.MaxPool1D(50, input_shape=tmodel.output_shape[1:]),
    kl.Flatten(),
    kl.Dense(64),
    kl.BatchNormalization(),
    kl.Activation('relu'),
    kl.Dropout(0.2),
    # kl.Dense(128),
    # kl.BatchNormalization(),
    # kl.Activation('relu'),
    # kl.Dropout(0.5),
    kl.Dense(2, activation='sigmoid')  # one sigmoid unit per task (Oct4, Sox2)
])
final_model = Model([inp], top_model(tmodel(inp)))
final_model.compile(Adam(lr=0.003), binary_crossentropy_masked, metrics=[accuracy])
final_model.summary()
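# With the trunk frozen, only the top model should contribute trainable
# parameters; count them via the Keras backend as a sanity check.
from keras import backend as K
n_train = sum(K.count_params(w) for w in final_model.trainable_weights)
n_fixed = sum(K.count_params(w) for w in final_model.non_trainable_weights)
print(f"trainable: {n_train:,}, non-trainable: {n_fixed:,}")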
mdir = f"{ddir}/processed/chipseq/exp/models/finetune-bpnet/fine-tune-conv"
ckp_file = f"{mdir}/model.h5"
history_path = f"{mdir}/history.csv"
!mkdir -p {mdir}
# Stratified training iterator: strata = "any task positive" vs "all negative"
# (batch_size/shuffle/drop_last are placeholders here; the explicit
# batch_sampler determines the actual batching)
train_it = train.batch_train_iter(shuffle=False,
                                  batch_size=1,
                                  drop_last=None,
                                  batch_sampler=StratifiedRandomBatchSampler(train.get_targets().max(axis=1),
                                                                             batch_size=batch_size,
                                                                             p_vec=[0.95, 0.05],
                                                                             verbose=True),
                                  num_workers=num_workers)
next(train_it)  # smoke-test the iterator
valid_it = valid.batch_train_iter(batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers)
next(valid_it);
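# Peek at one training batch (sketch; assumes the iterator yields (x, y)
# tuples): x should be one-hot DNA of shape (batch_size, seq_len, 4) and
# y a binary (batch_size, n_tasks) matrix.
x, y = next(train_it)
print(x.shape, y.shape, "positive fraction per task:", y.mean(axis=0))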
# -------------------------------------------
# Stage 1: train the classification head with the trunk frozen
final_model.fit_generator(train_it,
                          epochs=100,
                          steps_per_epoch=int(len(train) / batch_size * 0.1),
                          validation_data=valid_it,
                          validation_steps=int(len(valid) / batch_size * 0.2),
                          # sample_weight=sample_weight,
                          callbacks=[EarlyStopping(patience=4),
                                     CSVLogger(history_path),
                                     ModelCheckpoint(ckp_file, save_best_only=True)])
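# Inspect the learning curve written by CSVLogger (pd comes from the
# wildcard imports above)
pd.read_csv(history_path)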
# Reload the best checkpoint; a model compiled with a custom loss/metric
# needs those passed back via custom_objects
final_model = load_model(ckp_file, custom_objects={'binary_crossentropy_masked': binary_crossentropy_masked,
                                                   'accuracy': accuracy})
valid_it = valid.batch_predict_iter(batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=num_workers*2)
preds = final_model.predict_generator(valid_it, verbose=1, steps=len(valid) // batch_size)
# Evaluate auPRC per task on the covered validation examples
n_points = len(valid) // batch_size
print("Oct4:", auprc(valid.get_targets()[:n_points * batch_size, 0], preds[:, 0]))
print("Sox2:", auprc(valid.get_targets()[:n_points * batch_size, 1], preds[:, 1]))
# Stage 2: unfreeze the transferred trunk and fine-tune end-to-end
for l in final_model.layers[1].layers:
    l.trainable = True
final_model.compile(Adam(lr=0.003), binary_crossentropy_masked, metrics=[accuracy])
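# Sanity check: the trunk should now be fully trainable again.
assert all(l.trainable for l in final_model.layers[1].layers)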
# -------------------------------------------
# valid_it currently points at a predict iterator; recreate an (x, y)
# iterator before using it as validation_data
valid_it = valid.batch_train_iter(batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers)
final_model.fit_generator(train_it,
                          epochs=100,
                          steps_per_epoch=int(len(train) / batch_size * 0.1),
                          validation_data=valid_it,
                          validation_steps=int(len(valid) / batch_size * 0.2),
                          # sample_weight=sample_weight,
                          callbacks=[EarlyStopping(patience=4),
                                     CSVLogger(history_path),
                                     ModelCheckpoint(ckp_file, save_best_only=True)])
valid_it = valid.batch_predict_iter(batch_size=batch_size,
                                    shuffle=False,
                                    num_workers=num_workers*2)
preds = final_model.predict_generator(valid_it, verbose=1, steps=len(valid) // batch_size)
n_points = len(valid) // batch_size
print("Oct4:", auprc(valid.get_targets()[:n_points * batch_size, 0], preds[:, 0]))
print("Sox2:", auprc(valid.get_targets()[:n_points * batch_size, 1], preds[:, 1]))
print("Oct4:")
auprc(valid.get_targets()[:(n_points*batch_size),0], preds[:,0])
print("Sox2:")
auprc(valid.get_targets()[:(n_points*batch_size),1], preds[:,1])
class BPNetSequenceClassifier:
    def __init__(self, shapes, num_tasks,
                 filters=64,
                 conv1_kernel_size=25,
                 n_dil_layers=9,
                 pool_size=50,
                 fc_units=[64],
                 dropout=0.2):
        import keras.layers as kl
        # configure inputs (get_keras_inputs / reshape_keras_inputs are
        # expected to be provided by a mixin / base class)
        keras_inputs = self.get_keras_inputs(shapes)
        inputs = self.reshape_keras_inputs(keras_inputs)
        # convolve sequence
        inp = inputs["data/genome_data_dir"]
        first_conv = kl.Conv1D(filters,
                               kernel_size=conv1_kernel_size,
                               padding='same',
                               activation='relu')(inp)
        # dilated convolutions with summed skip connections
        prev_layers = [first_conv]
        for i in range(1, n_dil_layers + 1):
            if i == 1:
                prev_sum = first_conv
            else:
                prev_sum = kl.add(prev_layers)
            conv_output = kl.Conv1D(filters, kernel_size=3, padding='same',
                                    activation='relu', dilation_rate=2**i)(prev_sum)
            prev_layers.append(conv_output)
        combined_conv = kl.add(prev_layers, name='final_conv')
        # classification head: pool -> flatten -> FC block(s) -> per-task sigmoid
        seq_preds = kl.MaxPooling1D(pool_size)(combined_conv)
        seq_preds = kl.Flatten()(seq_preds)
        for units in fc_units:
            seq_preds = kl.Dense(units)(seq_preds)
            seq_preds = kl.BatchNormalization()(seq_preds)
            seq_preds = kl.Activation('relu')(seq_preds)
            seq_preds = kl.Dropout(dropout)(seq_preds)
        seq_preds = kl.Dense(num_tasks, activation='sigmoid')(seq_preds)
        self.model = Model(inputs=list(keras_inputs.values()), outputs=seq_preds)
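# Receptive-field sketch for the dilated stack above: the first conv covers
# conv1_kernel_size bp, and each dilated conv (kernel_size=3, dilation 2**i)
# adds (3 - 1) * 2**i bp, so with the defaults:
conv1_kernel_size, n_dil_layers = 25, 9
rf = conv1_kernel_size + sum(2 * 2**i for i in range(1, n_dil_layers + 1))
print(rf)  # 25 + 2 * (2 + 4 + ... + 512) = 2069 bp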
from basepair.models import binary_seq_multitask
mdir = f"{ddir}/processed/chipseq/exp/models/finetune-bpnet/from-scratch3"
!mkdir -p {mdir}
ckp_file = f"{mdir}/model.h5"
history_path = f"{mdir}/history.csv"
train_it = train.batch_train_iter(shuffle=False,
                                  batch_size=1,
                                  drop_last=None,
                                  batch_sampler=StratifiedRandomBatchSampler(train.get_targets().max(axis=1),
                                                                             batch_size=batch_size,
                                                                             p_vec=[0.95, 0.05],
                                                                             verbose=True),
                                  num_workers=num_workers)
next(train_it)
valid_it = valid.batch_train_iter(batch_size=batch_size,
                                  shuffle=True,
                                  num_workers=num_workers)
next(valid_it);
# Train the same architecture from scratch for comparison
final_model = binary_seq_multitask()
# -------------------------------------------
final_model.fit_generator(train_it,
                          epochs=100,
                          steps_per_epoch=int(len(train) / batch_size * 0.1),
                          validation_data=valid_it,
                          validation_steps=int(len(valid) / batch_size * 0.2),
                          callbacks=[EarlyStopping(patience=4),
                                     CSVLogger(history_path),
                                     ModelCheckpoint(ckp_file, save_best_only=True)])
# Reload the best checkpoint (same custom_objects caveat as above, assuming
# binary_seq_multitask was also compiled with the masked loss)
final_model = load_model(ckp_file, custom_objects={'binary_crossentropy_masked': binary_crossentropy_masked,
                                                   'accuracy': accuracy})
# get predictions
valid_it = valid.batch_predict_iter(batch_size=batch_size, shuffle=False, num_workers=num_workers*2)
preds = final_model.predict_generator(valid_it, verbose=1, steps=len(valid) // batch_size)
n_points = len(valid) // batch_size
print("Oct4:", auprc(valid.get_targets()[:n_points * batch_size, 0], preds[:, 0]))
print("Sox2:", auprc(valid.get_targets()[:n_points * batch_size, 1], preds[:, 1]))
from basepair.trainers import KerasTrainer
# Same from-scratch training, but driven by KerasTrainer
final_model = binary_seq_multitask()
trainer = KerasTrainer(final_model, train, valid,
                       f"{ddir}/processed/chipseq/exp/models/finetune-bpnet/from-scratch2")
trainer.train(train_epoch_frac=0.1, valid_epoch_frac=0.2,
              train_batch_sampler=StratifiedRandomBatchSampler(trainer.train_dataset.get_targets().max(axis=1),
                                                               batch_size=batch_size,
                                                               p_vec=[0.95, 0.05],
                                                               verbose=True))
from basepair.metrics import MetricsMultiTask, MetricsConcise
trainer.evaluate(MetricsMultiTask(MetricsConcise(['auprc']), ["Oct4", "Sox2"]))
!ls {trainer.output_dir}
pd.read_csv(f"{trainer.output_dir}/history.csv")
!cat {trainer.output_dir}/evaluation.valid.json
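# The same metrics can be read back programmatically with read_json
# (imported from basepair.utils above)
read_json(f"{trainer.output_dir}/evaluation.valid.json")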