import pandas as pd
import numpy as np
from pybedtools import BedTool
from basepair.config import get_data_dir, create_tf_session
from tqdm import tqdm
from concise.preprocessing import encodeDNA
from basepair.datasets import get_sox2_data
from basepair.plots import plot_loss
import pyBigWig
from basepair.math import softmax
import matplotlib.pyplot as plt
from basepair import samplers
ddir = get_data_dir()
create_tf_session(1)  # presumably pins the TensorFlow session to GPU 1
from basepair.datasets import *
from basepair.models import *
from basepair.plots import *
train, valid, test = seq_inp_exo_out()  # default sox2 dataset
import keras.layers as kl
from keras.optimizers import Adam
from keras.models import Model
import keras.backend as K
from concise.utils.helper import get_from_module
from basepair.losses import twochannel_multinomial_nll
from basepair.layers import SpatialLifetimeSparsity
from basepair.models import seq_mutlitask
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
from keras.models import Model, load_model
def get_model(mfn, mkwargs):
    """Build the model named `mfn` with kwargs `mkwargs`; return (model, run name, checkpoint path)."""
    import datetime
    mdir = f"{ddir}/processed/chipnexus/exp/models/multi-task"
    # encode the hyper-parameters and a timestamp into the run name
    name = mfn + "_" + \
        ",".join([f'{k}={v}' for k, v in mkwargs.items()]) + \
        "." + str(datetime.datetime.now()).replace(" ", "::")
    !mkdir -p {mdir}
    ckp_file = f"{mdir}/{name}.h5"
    # eval() resolves the builder function by name from the notebook namespace
    return eval(mfn)(**mkwargs), name, ckp_file
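# Aside: eval(mfn) works in a notebook, but an explicit name -> builder registry
# is a safer equivalent. A minimal sketch (not wired into the cells below; the
# builders would opt in via the decorator):
MODEL_REGISTRY = {}

def register_model(fn):
    """Register a model-builder function under its own name."""
    MODEL_REGISTRY[fn.__name__] = fn
    return fn

def build_model(mfn, mkwargs):
    """Look the builder up by name instead of eval()."""
    return MODEL_REGISTRY[mfn](**mkwargs)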
train[1].shape
plt.hist(train[1].sum(1).sum(1), bins=100)
plt.xlabel("Total counts");
counts = train[1].sum(1).sum(1)
plt.scatter(train[1].sum(1)[:,0], train[1].sum(1)[:,1])
plt.xlabel("Forward strand counts");
plt.ylabel("Reverse strand counts");
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import poisson, nbinom, norm
import scipy.stats as stats
counts.mean()
stats.probplot(counts, dist="norm", plot=plt);
# Log-normal seems to model the data pretty well
stats.probplot(np.log(counts+1), dist="norm", plot=plt);
stats.probplot(counts, sparams=(counts.mean(),), dist="poisson", plot=plt);
stats.probplot(counts, sparams=(0.2, 0.001), dist="nbinom", plot=plt);
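# The nbinom shape parameters above are hand-picked; a method-of-moments fit is
# one way to derive them. For scipy's nbinom (mean = n*(1-p)/p, var = n*(1-p)/p**2)
# this gives p = mean/var and n = mean**2/(var - mean), valid when var > mean
# (i.e. the counts are overdispersed, as they appear to be here):
m, v = counts.mean(), counts.var()
p_hat = m / v
n_hat = m**2 / (v - m)
stats.probplot(counts, sparams=(n_hat, p_hat), dist="nbinom", plot=plt);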
from keras.models import Sequential
import concise.metrics as ce
import concise.eval_metrics as cem
from concise.metrics import var_explained
filters = 32
def single_output(filters=32,
conv1_kernel_size=21,
dropout=0.1,
seq_len=201,
pool_type='avg',
lr=0.004):
    def get_pool(pool_type):
        if pool_type == 'max':
            return kl.MaxPool1D()
        elif pool_type == 'avg':
            return kl.AveragePooling1D()
        else:
            raise ValueError(f"Unknown pool_type: {pool_type}")
model = Sequential([
kl.Conv1D(filters, conv1_kernel_size, activation='relu',
input_shape=(seq_len,4), padding='same'),
kl.Conv1D(filters, 1, activation='relu', padding='same'),
get_pool(pool_type),
kl.Conv1D(2*filters, 7, activation='relu', padding='same'),
get_pool(pool_type),
kl.Conv1D(2*filters, 7, activation='relu', padding='same'),
#get_pool(pool_type),
kl.GlobalAveragePooling1D(),
#kl.Dense(8*filters, activation='relu'),
#kl.Dropout(dropout),
kl.Dense(1)
])
model.compile(Adam(lr=lr),
loss="mse",
metrics=[ce.var_explained]
)
return model
def get_model(mfn, mkwargs):
"""Get the model"""
import datetime
mdir = f"{ddir}/processed/chipnexus/exp/models/count-output"
name = mfn + "_" + \
",".join([f'{k}={v}' for k,v in mkwargs.items()]) + \
"." + str(datetime.datetime.now()).replace(" ", "::")
!mkdir -p {mdir}
ckp_file = f"{mdir}/{name}.h5"
return eval(mfn)(**mkwargs), name, ckp_file
# hyper-parameters
mfn = "single_output"
mkwargs = dict(filters=64,
conv1_kernel_size=21,
dropout=0.5,
pool_type='max',
lr=0.004)
valid_counts = (valid[0], np.log(valid[1].sum(1).sum(1)+1))
test_counts = (test[0], np.log(test[1].sum(1).sum(1)+1))
# best valid so far: 108238.6558
model, name, ckp_file = get_model(mfn, mkwargs)
history = model.fit(train[0],
np.log(train[1].sum(1).sum(1)+1),
batch_size=256,
epochs=100,
validation_data=(valid[0], np.log(valid[1].sum(1).sum(1)+1)),
callbacks=[EarlyStopping(patience=5),
History(),
ModelCheckpoint(ckp_file, save_best_only=True)]
)
# get the best model
model = load_model(ckp_file, custom_objects={'var_explained': ce.var_explained})  # custom metric must be passed to load_model
dfh = pd.DataFrame(history.history)
plt.figure(figsize=(8,2))
plt.subplot(121)
plt.plot(dfh.loss, label="loss")
plt.plot(dfh.val_loss, label="val_loss")
plt.legend()
plt.xlabel("epoch")
plt.subplot(122)
plt.plot(dfh.var_explained, label="var_explained")
plt.plot(dfh.val_var_explained, label="val_var_explained")
plt.legend()
plt.xlabel("epoch");
y_pred = model.predict(valid_counts[0])
regression_eval(valid_counts[1], y_pred[:,0])
cem.var_explained(valid_counts[1], y_pred[:,0])
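# regression_eval comes in via the star imports above (presumably basepair.plots).
# As a rough stand-in, not the library implementation, it can be approximated by
# a scatter plot plus correlation metrics:
def regression_eval_sketch(y_true, y_pred):
    """Assumed behavior: scatter observed vs. predicted and print correlations."""
    from scipy.stats import pearsonr, spearmanr
    plt.scatter(y_true, y_pred, s=3, alpha=0.3)
    plt.xlabel("observed")
    plt.ylabel("predicted")
    print(f"pearson:  {pearsonr(y_true, y_pred)[0]:.3f}")
    print(f"spearman: {spearmanr(y_true, y_pred)[0]:.3f}")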
def seq_dense_count(filters=21,
conv1_kernel_size=21,
tconv_kernel_size=25,
n_dil_layers=6,
use_profile=True,
seq_len=201,
profile_pool=None,
count_weight=100,
lr=0.004):
"""
Dense
"""
# TODO - build the reverse complement symmetry into the model
inp = kl.Input(shape=(seq_len, 4), name='seq')
first_conv = kl.Conv1D(filters, kernel_size=conv1_kernel_size, padding='same',
#kernel_initializer = ci.PSSMKernelInitializer(pwm_list, stddev=0, add_noise_before_Pwm2Pssm=False),
#bias_initializer = 'zeros',
activation='relu')(inp)
prev_layers = [first_conv]
for i in range(1, n_dil_layers+1):
if i == 1:
prev_sum = first_conv
else:
prev_sum = kl.add(prev_layers)
conv_output = kl.Conv1D(filters, kernel_size=3, padding='same', activation='relu', dilation_rate=2**i)(prev_sum)
prev_layers.append(conv_output)
# De-conv
combined_conv = kl.add(prev_layers)
if profile_pool is not None:
combined_conv = kl.AveragePooling1D(pool_size=profile_pool)(combined_conv)
# Bottleneck layer
#combined_conv = kl.Conv1D(3, kernel_size=1, padding='same', activation='relu')(combined_conv)
x = kl.Reshape((-1, 1, filters))(combined_conv)
x = kl.Conv2DTranspose(2, kernel_size=(tconv_kernel_size, 1), padding='same')(x)
#kl.Conv2DTranspose(32, kernel_size=(7, 1), padding='same', activation='relu'),
#kl.Conv2DTranspose(2, kernel_size=(3, 1), padding='same'),
out = kl.Reshape((-1, 2), name='profile')(x)
pooled = kl.GlobalAvgPool1D()(combined_conv)
#hidden = kl.Dense(1, pooled)
count_out = kl.Dense(2, name='count')(pooled)
if use_profile:
model = Model(inp, [out, count_out])
model.compile(Adam(lr=lr),
loss=[twochannel_multinomial_nll, 'mse'],
loss_weights=[1, count_weight])
else:
model = Model(inp, count_out)
model.compile(Adam(lr=lr),
loss='mse',)
return model
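# For reference, the profile loss: up to the count-combinatorics constant, the
# multinomial NLL of observed per-position counts under softmax(logits) is
# -sum_i n_i * log p_i, normalized per strand over positions. A minimal sketch
# (the actual basepair.losses.twochannel_multinomial_nll may differ in detail):
def multinomial_nll_sketch(y_true, y_pred):
    """y_true: counts (batch, seq_len, 2); y_pred: logits of the same shape.
    Log-softmax over the position axis per strand, then -E[sum(n * log p)]."""
    logp = y_pred - K.logsumexp(y_pred, axis=1, keepdims=True)
    return -K.mean(K.sum(y_true * logp, axis=1))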
# hyper-parameters
mfn = "seq_dense_count"
mkwargs = dict(filters=21,
conv1_kernel_size=21,
tconv_kernel_size=25,
n_dil_layers=6,
seq_len=201,
profile_pool=None,
use_profile=False,
lr=0.004)
valid_counts = (valid[0], np.log(valid[1].sum(1).sum(1)+1))
test_counts = (test[0], np.log(test[1].sum(1).sum(1)+1))
model, name, ckp_file = get_model(mfn, mkwargs)
history = model.fit(train[0],
                    np.log(train[1].sum(1)+1),  # use_profile=False: single per-strand count target matching Dense(2)
                    batch_size=256,
                    epochs=100,
                    validation_data=(valid[0], np.log(valid[1].sum(1)+1)),
                    callbacks=[EarlyStopping(patience=5),
                               History(),
                               ModelCheckpoint(ckp_file, save_best_only=True)]
                    )
# get the best model
model = load_model(ckp_file)
y_pred = model.predict(valid[0])
yc_pred = y_pred[:,0]
yc_true = np.log(valid[1].sum(1)[:,0]+1)  # forward strand, matching yc_pred
cem.var_explained(yc_true, yc_pred)
regression_eval(yc_true, yc_pred)
x = train[1]
x.shape
from basepair.preproc import bin_counts
coarsen = bin_counts(x, 3)  # sum the signal into bins of 3 positions
coarsen.shape
plt.plot(x[-1,:,0])
plt.plot(coarsen[-1,:,0])
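# bin_counts sums the signal in non-overlapping windows along the position axis;
# a minimal sketch of that behavior (assuming seq_len is divisible by the bin
# size, which the reshape requires):
def bin_counts_sketch(x, binsize):
    """x: (n_examples, seq_len, n_strands) -> (n_examples, seq_len//binsize, n_strands)."""
    n, seqlen, c = x.shape
    return x.reshape(n, seqlen // binsize, binsize, c).sum(axis=2)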
200/4   # bins per 200 bp sequence for candidate pool sizes
200/8
# hyper-parameters
mfn = "seq_dense_count"
bin_size = None
mkwargs = dict(filters=21,
conv1_kernel_size=21,
tconv_kernel_size=25,
n_dil_layers=6,
seq_len=200,
profile_pool=bin_size,
use_profile=True,
count_weight=100,
lr=0.004)
train, valid, test = seq_inp_exo_out(truncate_len=200, bin_size=bin_size)
valid_counts = (valid[0], np.log(valid[1].sum(1).sum(1)+1))
test_counts = (test[0], np.log(test[1].sum(1).sum(1)+1))
model, name, ckp_file = get_model(mfn, mkwargs)
history = model.fit(train[0],
[train[1], np.log(train[1].sum(1)+1)],
batch_size=256,
epochs=100,
validation_data=(valid[0], [valid[1], np.log(valid[1].sum(1)+1)]),
callbacks=[EarlyStopping(patience=5),
History(),
ModelCheckpoint(ckp_file, save_best_only=True)]
)
# get the best model
model = load_model(ckp_file, custom_objects={'twochannel_multinomial_nll': twochannel_multinomial_nll})  # custom loss must be passed to load_model
y_pred = model.predict(valid[0])
yc_pred = y_pred[1][:,0]
yc_true = np.log(valid[1].sum(1)[:,0]+1)
cem.var_explained(yc_true, yc_pred)
regression_eval(yc_true, yc_pred)
y_pred = model.predict(test[0])
yc_pred = y_pred[1][:,0]
yc_true = np.log(test[1].sum(1)[:,0]+1)  # forward strand, matching yc_pred
cem.var_explained(yc_true, yc_pred)
ckp_file
regression_eval(yc_true, yc_pred)
test[1].shape
y_pred[0][:,:,0].shape
y_pred[1].shape
binsize = 1
# Per-bin predicted signal: profile probabilities (softmax over positions)
# scaled by the count head. Note the count head predicts log(1+total), so this
# product is a monotone proxy for expected counts rather than an exact expectation.
regression_eval(np.log(1+np.ravel(bin_counts(test[1], binsize=binsize)[:,:,0])),
                np.log(1+np.ravel((bin_counts(softmax(y_pred[0]), binsize=binsize)[:,:,0])*
                                  y_pred[1][:,:1])))
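# Since the count head was trained on log(1+total), a cleaner reconstruction
# would invert that transform before scaling the profile. A hypothetical helper
# (assuming basepair.math.softmax normalizes over the position axis, as the
# usage above implies):
def expected_profile_counts(profile_logits, log_counts):
    """Combine both heads into per-position expected counts (sketch)."""
    p = softmax(profile_logits)          # (N, L, 2); sums to 1 over positions
    totals = np.expm1(log_counts)        # invert the log(1 + total) target transform
    return p * totals[:, np.newaxis, :]  # broadcast (N, 2) totals over positions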
from scipy.stats import pearsonr, spearmanr
binsizes = [1, 2, 4, 10, 25, 50, 100, 200]
perf = []
for binsize in binsizes:
yc_true = np.log(1+np.ravel(bin_counts(test[1], binsize=binsize)[:,:,0]))
yc_pred = np.log(1+np.ravel((bin_counts(softmax(y_pred[0]), binsize=binsize)[:,:,0])*
y_pred[1][:,:1]))
perf.append([pearsonr(yc_true, yc_pred)[0], spearmanr(yc_true, yc_pred)[0]])
df = pd.DataFrame(perf, columns=['pearson','spearman']).assign(binsize=binsizes)
plt.figure(figsize=(3,2))
plt.semilogx(df.binsize, df.spearman,'-o' )
plt.ylabel("R_spearman")
plt.xlabel("Bin size")
plt.xticks(binsizes, binsizes);
# Show just the plot of different binning
binsize=200//25
regression_eval(np.log(1+np.ravel(bin_counts(test[1], binsize=binsize)[:,:,0])),
np.log(1+np.ravel((bin_counts(softmax(y_pred[0]), binsize=binsize)[:,:,0])*
y_pred[1][:,:1])))
# The correlation performance is roughly the same when the model is trained on pooled signal (pool size 25).
# Spearman R for models trained at different pool sizes (collected manually)
R = [0.45, 0.43, 0.39, 0.42, 0.38, 0.42, 0.42, 0.42, 0.38, 0.27]
pool_size = [0, 2, 4, 8, 10, 20, 25, 50, 100, 200]
plt.semilogx(np.array(pool_size)+1, R, "-o")  # +1 so pool_size=0 fits on the log axis
plt.xlabel("Pool size")
plt.ylabel("R_spearman")
plt.plot(np.array(pool_size), R, "-o")  # same data on a linear axis
plt.xlabel("Pool size")
plt.ylabel("R_spearman")
200/16  # bins per 200 bp sequence at pool size 16 (not integral)