# Imports
from basepair.imports import *
hv.extension('bokeh')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
%load_ext autoreload
%autoreload 2
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
create_tf_session(1)
from basepair.cli.schemas import HParams
from basepair.models import seq_multitask
hparams = HParams.load(model_dir / 'hparams.yaml')
ds = DataSpec.load(model_dir / 'dataspec.yaml')
from basepair.datasets import get_StrandedProfile_datasets2
ds.path
train, valid = get_StrandedProfile_datasets2(model_dir / 'dataspec.yaml', peak_width=1000)
valid
train = train.load_all(num_workers=10)
valid = valid[0][1].load_all(num_workers=10)
train['targets'].keys()
# base output dir; run_dir() below creates a timestamped run subdirectory
output_dir = Path(f"{ddir}/processed/chipnexus/exp/models/loss-functions")
import datetime
def time_now():
    from time import gmtime, strftime
    return strftime("%Y-%m-%d_%H:%M:%S", gmtime())


def run_dir(output_dir):
    import os
    outdir = os.path.join(output_dir, "run_" + time_now())
    os.makedirs(outdir)
    return outdir
import keras
import keras.backend as K
import keras.activations as ka
import basepair.losses  # used inside seq_multitask below
from basepair.functions import softmax
from basepair.models import *
def seq_multitask(filters=21,
                  conv1_kernel_size=21,
                  tconv_kernel_size=25,
                  n_dil_layers=6,
                  lr=0.004,
                  c_task_weight=100,
                  use_profile=True,
                  use_counts=True,
                  tasks=['Sox2', 'Oct4'],
                  outputs_per_task=2,
                  task_use_bias=False,
                  seq_len=1000,
                  pool_size=0,
                  connect_prev='add',
                  profile_loss='mc_multinomial_nll',
                  count_loss='mse'
                  ):  # TODO - automatically infer sequence length
    """Multi-task sequence model: dilated convolution trunk with per-task
    profile and count heads.

    Args:
      c_task_weight: how much to upweight the count-prediction task
      task_use_bias (bool or a list of bools): if True, a
        bias term is assumed to be provided at the input
    """
    # TODO - split the body of this model into multiple subparts:
    # - encoder
    # - profile_decoder
    # - profile_decoder_w_bias
    # - counts_decoder
    # - counts_decoder_w_bias
    if isinstance(outputs_per_task, int):
        outputs_per_task = [outputs_per_task for i in range(len(tasks))]
    else:
        assert len(tasks) == len(outputs_per_task)
    if isinstance(task_use_bias, bool):
        task_use_bias = [task_use_bias for i in range(len(tasks))]
    else:
        assert len(tasks) == len(task_use_bias)

    # TODO - build the reverse complement symmetry into the model
    inp = kl.Input(shape=(seq_len, 4), name='seq')
    first_conv = kl.Conv1D(filters,
                           kernel_size=conv1_kernel_size,
                           padding='same',
                           activation='relu')(inp)
    bias_profile_inputs = {task: kl.Input(shape=(seq_len, outputs_per_task[i]), name=f"bias/profile/{task}")
                           for i, task in enumerate(tasks) if task_use_bias[i]}
    bias_counts_inputs = [kl.Input(shape=(outputs_per_task[i],), name=f"bias/counts/{task}")
                          for i, task in enumerate(tasks) if task_use_bias[i]]

    prev_layers = [first_conv]
    if connect_prev == 'add':
        merge_previous = kl.add
    elif connect_prev == 'concat':
        merge_previous = kl.concatenate
    else:
        raise ValueError("connect_prev needs to be 'add' or 'concat'")
    for i in range(1, n_dil_layers + 1):
        if i == 1:
            prev_sum = first_conv
        else:
            prev_sum = merge_previous(prev_layers)
        conv_output = kl.Conv1D(filters, kernel_size=3, padding='same',
                                activation='relu', dilation_rate=2**i)(prev_sum)
        prev_layers.append(conv_output)
    combined_conv = merge_previous(prev_layers)
    # add one more 1x1 conv layer in between when concatenating
    if connect_prev == 'concat':
        combined_conv = kl.Conv1D(filters,
                                  kernel_size=1,
                                  padding='same',
                                  activation='relu')(combined_conv)

    # De-conv: Conv2DTranspose on a (seqlen, 1, filters) tensor
    x = kl.Reshape((-1, 1, filters))(combined_conv)
    x = kl.Conv2DTranspose(sum(outputs_per_task), kernel_size=(tconv_kernel_size, 1), padding='same')(x)
    out = kl.Reshape((-1, sum(outputs_per_task)))(x)
    # batch x seqlen x sum(outputs_per_task) array
    # AvgPool if necessary
    if pool_size:
        out = kl.AvgPool1D(pool_size=pool_size, padding='valid')(out)
    # setup the output branches
    outputs = []
    losses = []
    loss_weights = []
    if use_profile:
        # TODO - use a different loss function for the same profiles
        start_idx = np.cumsum([0] + outputs_per_task[:-1])
        end_idx = np.cumsum(outputs_per_task)

        def get_output_name(task):
            if task in bias_profile_inputs:
                return "lambda/profile/" + task
            else:
                return "profile/" + task

        output = [kl.Lambda(lambda x, i, sidx, eidx: x[:, :, sidx:eidx],
                            output_shape=(seq_len, outputs_per_task[i]),
                            name=get_output_name(task),
                            arguments={"i": i, "sidx": start_idx[i], "eidx": end_idx[i]})(out)
                  for i, task in enumerate(tasks)]
        for i, task in enumerate(tasks):
            if task in bias_profile_inputs:
                output_with_bias = kl.concatenate([output[i],
                                                   bias_profile_inputs[task]], axis=-1)  # batch x seqlen x (2+2)
                output[i] = kl.Conv1D(outputs_per_task[i],
                                      1,
                                      name="profile/" + task)(output_with_bias)
        outputs += output
        if profile_loss != 'poisson':
            losses += [basepair.losses.get(f"{profile_loss}_{nt}") for nt in outputs_per_task]
            loss_weights += [1] * len(tasks)
    if use_counts:
        pooled = kl.GlobalAvgPool1D()(combined_conv)
        if bias_counts_inputs:
            pooled = kl.concatenate([pooled] + bias_counts_inputs, axis=-1)  # add bias as additional features
        activation = K.exp if count_loss == 'poisson' else 'linear'
        counts = [kl.Dense(outputs_per_task[i], name="counts/" + task, activation=activation)(pooled)
                  for i, task in enumerate(tasks)]
        outputs += counts
        losses += [count_loss] * len(tasks)
        loss_weights += [c_task_weight] * len(tasks)
    if profile_loss == 'poisson':
        # override the outputs and loss function: per-base rate = softmax(profile) * exp(counts)
        losses = 'poisson'
        outputs = [kl.multiply([kl.Lambda(lambda x: ka.softmax(x, axis=-2))(outputs[i]),
                                kl.Lambda(lambda x: K.exp(x))(outputs[i + len(tasks)])], name=task)
                   for i, task in enumerate(tasks)]
        # outputs = [kl.multiply([ka.softmax(outputs[i], axis=-2), kl.Lambda(lambda x: K.exp(x))(outputs[i+len(tasks)])])
        #            for i, task in enumerate(tasks)]
        loss_weights = None
    model = Model([inp] + list(bias_profile_inputs.values()) + bias_counts_inputs, outputs)
    model.compile(Adam(lr=lr), loss=losses, loss_weights=loss_weights)
    return model
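As a usage note (not from the original notebook): when `task_use_bias` is set, the model gains extra inputs named `bias/profile/{task}` and `bias/counts/{task}` (see the `kl.Input` definitions above), so `fit`/`predict` then need inputs keyed by name. A hypothetical sketch with placeholder arrays:
# Hypothetical bias usage; the arrays below are placeholders, not real data.
model_b = seq_multitask(tasks=['Sox2', 'Oct4'], task_use_bias=[True, False])
inputs_b = {'seq': np.random.rand(8, 1000, 4),
            'bias/profile/Sox2': np.zeros((8, 1000, 2)),
            'bias/counts/Sox2': np.zeros((8, 2))}
preds_b = model_b.predict(inputs_b)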
keras.__version__
tasks = list(ds.task_specs)
kwargs = OrderedDict([('filters', 64),
                      ('seq_len', 1000),
                      ('tasks', tasks),
                      ('conv1_kernel_size', 25),
                      ('tconv_kernel_size', 25),
                      ('n_dil_layers', 9),
                      ('lr', 0.004),
                      ('c_task_weight', 1),  # don't use it for the poisson loss
                      ('profile_loss', 'mc_multinomial_nll'),  # alternative: 'poisson'
                      ('count_loss', 'poisson'),  # alternative: 'mse'
                      ])
model = seq_multitask(**kwargs)
from keras.callbacks import EarlyStopping, ModelCheckpoint, History
# ckp_file = os.path.join(run_dir(output_dir), 'model.h5')
# hist = model.fit(train['inputs'], [train['targets'][f'profile/{t}'] for t in tasks],
# batch_size=256,
# epochs=200,
# validation_data=(valid['inputs'], [valid['targets'][f'profile/{t}'] for t in tasks]),
# validation_split=0.2,
# callbacks=[EarlyStopping(patience=5),
# History(),
# ModelCheckpoint(ckp_file, save_best_only=True)]
# )
# # get the best model
# model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
train_targets = ([train['targets'][f'profile/{t}'] for t in tasks] +
                 [train['targets'][f'profile/{t}'].sum(axis=1) for t in tasks])
valid_targets = ([valid['targets'][f'profile/{t}'] for t in tasks] +
                 [valid['targets'][f'profile/{t}'].sum(axis=1) for t in tasks])
ckp_file = os.path.join(run_dir(output_dir), 'model.h5')
# validation_data is given explicitly, so no validation_split is needed
hist = model.fit(train['inputs'], train_targets,
                 batch_size=256,
                 epochs=200,
                 validation_data=(valid['inputs'], valid_targets),
                 callbacks=[EarlyStopping(patience=5),
                            History(),
                            ModelCheckpoint(ckp_file, save_best_only=True)])
# get the best model
model = load_model(ckp_file, custom_objects={"twochannel_multinomial_nll": twochannel_multinomial_nll})
preds = model.predict(valid['inputs'])
from basepair.functions import softmax
len(preds)
y_pred_profile = {t: softmax(preds[i]) for i, t in enumerate(tasks)}
# the count-head outputs are the second half of the prediction list
y_pred_counts = {t: preds[i + len(tasks)] for i, t in enumerate(tasks)}
y_pred_profile['Nanog'].shape
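A convenient derived quantity (an assumption about shapes, not from the original notebook): expected per-base counts can be formed by scaling the softmaxed profile by the count-head totals, assuming `y_pred_profile[t]` sums to 1 over the position axis with shape `(batch, seqlen, 2)` and `y_pred_counts[t]` has shape `(batch, 2)`:
# Expected per-base counts = per-base probability * predicted per-strand totals.
y_pred_expected = {t: y_pred_profile[t] * y_pred_counts[t][:, np.newaxis, :]
                   for t in tasks}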
from basepair.metrics import BPNetMetric, PeakPredictionProfileMetric, pearson_spearman
bpm = BPNetMetric(tasks, pearson_spearman, PeakPredictionProfileMetric)
from basepair.plot.evaluate import regression_eval
t = 'Nanog'
fig, axes = plt.subplots(1, 4, figsize=get_figsize(1, 1/4), sharex=True, sharey=True)
for i, t in enumerate(tasks):
    # evaluate total-count prediction using the count head
    regression_eval(np.log10(1 + valid['targets'][f'profile/{t}'].sum(axis=(1, 2))),
                    np.log10(1 + y_pred_counts[t].sum(axis=-1)),
                    ax=axes[i])
    axes[i].set_title(t)
ppm = PeakPredictionProfileMetric(pos_min_threshold=0.015,
                                  neg_max_threshold=0.005,
                                  required_min_pos_counts=2.5,
                                  binsizes=[1])
# per-base accuracy (auPRC at binsize=1)
{t: ppm(valid['targets'][f'profile/{t}'],
        y_pred_profile[t] / y_pred_profile[t].sum(axis=-2, keepdims=True))['binsize=1']['auprc']
 for t in tasks}
from statsmodels.graphics.gofplots import qqplot
from scipy.stats import poisson, nbinom, norm
import scipy.stats as stats
# TODO - get the count distributions
train['targets']['counts/Nanog'].shape
per_base_counts = np.ravel(train['targets']['profile/Nanog'].sum(axis=-1))
total_counts = train['targets']['profile/Nanog'].sum(axis=(-1,-2))
plt.hist(per_base_counts, bins=100, log=True);
plt.xlabel("Per-base counts for Nanog");
plt.hist(np.log10(total_counts + 1), bins=100, log=True);
plt.xlabel("log10(1 + total counts per 1kb peak) for Nanog");
# compare counts recovered from the log-scale 'counts/' targets
# with the totals computed directly from the profiles
log_counts = train['targets']['counts/Nanog'].mean(axis=-1)
counts = np.exp(log_counts) - 1
counts.mean()
total_counts.mean()
# use the profile-derived totals for the Q-Q plots below
counts = total_counts
fig, axes = plt.subplots(1, 4, figsize=get_figsize(1.5, aspect=1/5))
ax = axes[0]
stats.probplot(counts, dist="norm", plot=ax);
ax.set_title("Normal")
ax = axes[1]
stats.probplot(np.log(counts + 1), dist="norm", plot=ax);
ax.set_title("Log-normal")
ax = axes[2]
stats.probplot(counts, sparams=(counts.mean(),), dist="poisson", plot=ax);
ax.set_title("Poisson")
ax = axes[3]
stats.probplot(counts, sparams=(0.1, 0.001), dist="nbinom", plot=ax);
ax.set_title("NB")
plt.tight_layout()
# repeat the Q-Q panels on a 10k subsample of the per-base counts
counts = np.random.choice(per_base_counts, 10000)
fig, axes = plt.subplots(1, 4, figsize=get_figsize(1.5, aspect=1/5))
ax = axes[0]
stats.probplot(counts, dist="norm", plot=ax);
ax.set_title("Normal")
ax = axes[1]
stats.probplot(np.log(counts + 1), dist="norm", plot=ax);
ax.set_title("Log-normal")
ax = axes[2]
stats.probplot(counts, sparams=(counts.mean(),), dist="poisson", plot=ax);
ax.set_title("Poisson")
ax = axes[3]
stats.probplot(counts, sparams=(0.1, 0.001), dist="nbinom", plot=ax);
ax.set_title("NB")
plt.tight_layout()
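The `nbinom` shape parameters `(0.1, 0.001)` above are hard-coded guesses. A method-of-moments fit would tie them to the data; a sketch (not from the original notebook), assuming scipy's `nbinom(n, p)` parameterization with mean `n*(1-p)/p` and variance `n*(1-p)/p**2`:
def nbinom_mom(x):
    # method of moments: p = mean/var, n = mean*p/(1-p) = mean**2/(var - mean)
    m, v = x.mean(), x.var()
    assert v > m, "negative binomial requires var > mean (overdispersion)"
    p = m / v
    return m * p / (1 - p), p

# e.g.: stats.probplot(counts, sparams=nbinom_mom(counts), dist="nbinom", plot=ax)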
N = 40
C = np.arange(-5, 5) + np.log(N)
plt.plot(C, np.exp(C) - N * C, label='Poisson loss');
plt.plot(C, N / 2 * (np.log(N) - C)**2, label='MSE loss');
plt.vlines(np.log(N), 0, max(np.exp(C) - N * C));
plt.legend();
The combined negative log-likelihood, up to additive constants:

$$\mathcal{L} = \underbrace{e^{C} - N\,C}_{\text{Poisson NLL of the total counts}} \;-\; \underbrace{\sum_j n_j \log p_j}_{\text{multinomial NLL of the profile}}$$

where $C$ are the predicted log-counts, $N = \sum_j n_j$ the observed total counts, and $p_j$ is the per-base probability.
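A quick numerical sanity check (not from the original notebook) that the plotted Poisson NLL $e^C - NC$ is minimized exactly at $C = \log N$, i.e. when the predicted rate $e^C$ matches the observed total counts $N$:
# the minimizer of exp(C) - N*C is C = log(N) (set the derivative exp(C) - N to 0)
N = 40
C_grid = np.linspace(np.log(N) - 5, np.log(N) + 5, 10001)
poisson_nll = np.exp(C_grid) - N * C_grid
assert np.isclose(C_grid[poisson_nll.argmin()], np.log(N), atol=1e-3)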
from keras.losses import poisson
poisson??