# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
hv.extension('bokeh')
paper_config()
from basepair.config import valid_chr, test_chr
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / f"modisco/all/profile/"
output_dir = Path("/srv/www/kundaje/avsec/chipnexus/oct-sox-nanog-klf/models/n_dil_layers=9/modisco/all/profile")
dataspec_file = model_dir / "dataspec.yaml"
ds = DataSpec.load(dataspec_file)
tasks = list(ds.task_specs)
from basepair.cli.imp_score import ImpScoreFile
imp_file = ImpScoreFile(model_dir / "grad.all.h5")
seq = imp_file.f.f['/inputs'][:]
create_tf_session(0)
bpnet = BPNetPredictor.from_mdir(model_dir)
bpnet.model.summary()
bottleneck_model = Model(bpnet.model.inputs, bpnet.model.get_layer("add_9").output)
bottleneck_predictions = bottleneck_model.predict(seq, batch_size=32, verbose=1)
chromosomes = pd.Series(imp_file.f.f['/metadata/range/chr'][:])
# generate the data split
in_train = ~ chromosomes.isin(valid_chr + test_chr)
in_valid = chromosomes.isin(valid_chr)
in_test = chromosomes.isin(test_chr)
assert not np.any(in_train & in_valid)
assert not np.any(in_train & in_test)
assert not np.any(in_valid & in_test)
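# Optional sanity check (not part of the original notebook): how many regions land in each split
print(f"train: {in_train.sum()}, valid: {in_valid.sum()}, test: {in_test.sum()}")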
profiles = imp_file.get_profiles()
imp_file.close()
# Get the count matrix
total_counts = np.concatenate([profiles[t].sum(axis=1) for t in bpnet.tasks], axis=1)
column_names = [f"{t}/{s}" for t in bpnet.tasks for s in ['pos', 'neg']]
total_counts = pd.DataFrame(total_counts.astype(int), columns=column_names)
del profiles
total_counts.head()
from sklearn.preprocessing import StandardScaler
log_total_counts = np.log(1 + total_counts).values
scaler = StandardScaler()
train = (bottleneck_predictions[in_train], scaler.fit_transform(log_total_counts[in_train]))
valid = (bottleneck_predictions[in_valid], scaler.transform(log_total_counts[in_valid]))
test = (bottleneck_predictions[in_test], scaler.transform(log_total_counts[in_test]))
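# Optional shape check (a sketch added here, not from the original notebook): the bottleneck
# features should be (regions, seqlen, channels) and the targets (regions, 2 * n_tasks),
# with matching numbers of regions in each split.
for split_name, (X, y) in [("train", train), ("valid", valid), ("test", test)]:
    assert len(X) == len(y), split_name
    print(split_name, X.shape, y.shape)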
# Average the bottleneck activations over the spatial axis -> (regions, channels)
train_averaged = train[0].mean(axis=1)
valid_averaged = valid[0].mean(axis=1)
from basepair.plot.evaluate import regression_eval
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression
m = MultiOutputRegressor(LinearRegression())
m.fit(train_averaged, log_total_counts[in_train]) # Don't use the scaled version when training the linear model
y_pred = m.predict(valid_averaged)
fig, axes = plt.subplots(1, len(bpnet.tasks), figsize=get_figsize(frac=1.5, aspect=0.3), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(bpnet.tasks, axes)):
    s = slice(2*i, 2*(i+1))
    regression_eval(log_total_counts[in_valid][:,s].mean(axis=1), y_pred[:,s].mean(axis=1), alpha=0.05, task=a, ax=ax);
plt.tight_layout()
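# As a numeric complement to the scatter plots above (a sketch, not in the original notebook):
# per-task Spearman correlation between observed and predicted log counts on the validation set.
from scipy.stats import spearmanr
for i, task in enumerate(bpnet.tasks):
    s = slice(2*i, 2*(i+1))
    rho, _ = spearmanr(log_total_counts[in_valid][:,s].mean(axis=1), y_pred[:,s].mean(axis=1))
    print(f"{task}: spearman rho = {rho:.3f}")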
replace_weights = False  # set to True to overwrite the count-head weights with the linear-model fit below
plt.figure(figsize=(4,4))
for task_i, task in enumerate(tasks):
    W_lm = np.stack([m.estimators_[2*task_i + strand_i].coef_ for strand_i in range(2)], axis=-1)
    b_lm = np.stack([m.estimators_[2*task_i + strand_i].intercept_ for strand_i in range(2)])
    W, b = bpnet.model.get_layer(f'counts/{task}').get_weights()
    if replace_weights:
        bpnet.model.get_layer(f'counts/{task}').set_weights((W_lm, b_lm))
    plt.scatter(W.ravel(), W_lm.ravel(), alpha=0.5, s=4);
plt.xlabel("Neural net")
plt.ylabel("Linear model");
# Save back the model
# bpnet.model.save(model_dir / 'model.h5')
from sklearn.ensemble import RandomForestRegressor
m = MultiOutputRegressor(RandomForestRegressor(n_estimators=20), n_jobs=8)
m.fit(train_averaged, train[1])
y_pred = m.predict(valid_averaged)
fig, axes = plt.subplots(1, len(bpnet.tasks), figsize=get_figsize(frac=1.5, aspect=0.3), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(bpnet.tasks, axes)):
    s = slice(2*i, 2*(i+1))
    regression_eval(valid[1][:,s].mean(axis=1), y_pred[:,s].mean(axis=1), alpha=0.05, task=a, ax=ax);
plt.tight_layout()
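# Which bottleneck channels does the random forest use? A quick sketch (not in the original
# notebook): average the per-output feature importances across the fitted estimators.
rf_importance = np.mean([est.feature_importances_ for est in m.estimators_], axis=0)
plt.figure(figsize=(6, 2))
plt.plot(rf_importance)
plt.xlabel("Bottleneck channel")
plt.ylabel("Mean feature importance");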
from basepair.preproc import bin_counts
# bin and flatten
train_avg_pool = bin_counts(train[0], 50).reshape((len(train[0]), -1))
valid_avg_pool = bin_counts(valid[0], 50).reshape((len(valid[0]), -1))
m = MultiOutputRegressor(LinearRegression())
m.fit(train_avg_pool, train[1])
y_pred = m.predict(valid_avg_pool)
fig, axes = plt.subplots(1, len(bpnet.tasks), figsize=get_figsize(frac=1.5, aspect=0.3), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(bpnet.tasks, axes)):
    s = slice(2*i, 2*(i+1))
    regression_eval(valid[1][:,s].mean(axis=1), y_pred[:,s].mean(axis=1), alpha=0.05, task=a, ax=ax);
plt.tight_layout()
from keras.models import Sequential, load_model
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint
model_i = 0  # running index for saved count-model checkpoints
!mkdir -p {model_dir}/count-models
m = Sequential([kl.BatchNormalization(input_shape=train_averaged.shape[1:]),
                kl.Dense(64, activation='relu'),
                kl.BatchNormalization(),
                kl.Dense(train[1].shape[1])])  # one output per task/strand column
m.compile(Adam(0.01), 'mse')
model_i += 1
ckp_file = str(model_dir / f"count-models/{model_i}.h5")
m.fit(train_averaged, train[1],
      batch_size=512,
      epochs=50,
      callbacks=[EarlyStopping(patience=5),
                 ModelCheckpoint(ckp_file, save_best_only=True)],
      validation_data=(valid_averaged, valid[1]))
m = load_model(ckp_file)
y_pred = m.predict(valid_averaged)
fig, axes = plt.subplots(1, len(bpnet.tasks), figsize=get_figsize(frac=1.5, aspect=0.3), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(bpnet.tasks, axes)):
    s = slice(2*i, 2*(i+1))
    regression_eval(valid[1][:,s].mean(axis=1), y_pred[:,s].mean(axis=1), alpha=0.05, task=a, ax=ax);
plt.tight_layout()
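# A small numeric summary for the MLP above (a sketch, not in the original notebook): mean squared
# error per output column on the standardized validation targets.
from sklearn.metrics import mean_squared_error
mse_per_output = mean_squared_error(valid[1], y_pred, multioutput='raw_values')
pd.Series(mse_per_output, index=total_counts.columns)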
# Average-pool the bottleneck activations in 100-bp windows, then flatten
mp = Sequential([kl.AveragePooling1D(100, input_shape=train[0].shape[1:]),
                 kl.Flatten()])
train_pooled = mp.predict(train[0])
valid_pooled = mp.predict(valid[0])
m = Sequential([kl.BatchNormalization(input_shape=train_pooled.shape[1:]),
                kl.Dense(64, activation='relu'),
                kl.Dropout(0.3),
                kl.BatchNormalization(),
                kl.Dense(train[1].shape[1])])  # one output per task/strand column
m.compile(Adam(0.01), 'mse')
model_i += 1
ckp_file = str(model_dir / f"count-models/{model_i}.h5")
m.fit(train_pooled, train[1],
      batch_size=512,
      epochs=50,
      callbacks=[EarlyStopping(patience=5),
                 ModelCheckpoint(ckp_file, save_best_only=True)],
      validation_data=(valid_pooled, valid[1]))
m = load_model(ckp_file)
y_pred = m.predict(valid_pooled)
fig, axes = plt.subplots(1, len(bpnet.tasks), figsize=get_figsize(frac=1.5, aspect=0.3), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(bpnet.tasks, axes)):
    s = slice(2*i, 2*(i+1))
    regression_eval(valid[1][:,s].mean(axis=1), y_pred[:,s].mean(axis=1), alpha=0.05, task=a, ax=ax);
plt.tight_layout()
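# If a single number per model is useful for comparison, a pattern like this could be run after
# each fit above (a sketch, not in the original notebook): collect validation MSEs in one table.
from sklearn.metrics import mean_squared_error
results = {}
results['pooled-100bp MLP'] = mean_squared_error(valid[1], y_pred)  # y_pred from the last model
pd.Series(results).sort_values()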