from basepair.imports import *
import pybedtools
from genomelake.extractors import BigwigExtractor
from basepair.extractors import StrandedBigWigExtractor, bw_extract
from kipoiseq.transforms import ResizeInterval
from basepair.modisco.table import ModiscoData
from pybedtools import BedTool
from basepair.cli.modisco import load_ranges
from basepair.plot.heatmaps import (heatmap_importance_profile, normalize, multiple_heatmap_stranded_profile,
heatmap_stranded_profile)
from basepair.plot.profiles import plot_stranded_profile
import hvplot.pandas
from basepair.plots import regression_eval
# Grab one GPU for this TF session (basepair helper).
create_tf_session(1)
# Table of external assay tracks; `ddir` is the project data root
# (defined outside this chunk — presumably by the star import; verify).
df = pd.read_csv(f"{ddir}/processed/chipnexus/external-data.tsv", sep='\t')
# Index by assay name so rows can be looked up as df.loc[<assay>].
df = df.set_index('assay')
df
# Write the bed file
from basepair.cli.schemas import DataSpec
# Data specification for the trained model: per-task peak files and bigwigs.
ds = DataSpec.load(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf-sall/dataspec-incl-Sall4.yml")
# IPython shell escape — make sure the output directory exists.
!mkdir -p {ddir}/processed/activity/data/
def read_factor(factor, filename):
    """Load the first three BED columns of `filename`, labelling rows with `factor`.

    Returns a DataFrame with columns ['chrom', 'start', 'end', 'name'],
    where 'name' is the constant `factor` string.
    """
    peaks = pd.read_csv(filename, sep='\t', header=None, usecols=[0, 1, 2])
    peaks[3] = factor
    peaks.columns = ['chrom', 'start', 'end', 'name']
    return peaks
# Stack the per-task peak tables into one BED-like frame and write it out
# without header/index so downstream tools see a plain 4-column BED.
per_task_frames = [read_factor(task, spec.peaks)
                   for task, spec in ds.task_specs.items()]
dfc = pd.concat(per_task_frames, axis=0)
dfc.to_csv(f"{ddir}/processed/activity/data/peaks.bed", sep='\t', index=False, header=False)
# Assays whose activity will be predicted from the BPNet bottleneck.
assays = ['H3K27ac', 'PolII', 'Groseq']
def tolist(s):
    """Wrap a bare string in a single-element list; coerce anything else to a list."""
    return [s] if isinstance(s, str) else list(s)
# Map each assay to its list of bigwig paths from the external-data table
# (a single path in the table becomes a one-element list).
bigwigs = {}
for assay in assays:
    bigwigs[assay] = tolist(df.loc[assay].path)
bigwigs
from basepair.config import valid_chr, test_chr
from basepair.datasets import *
# Train split: every chromosome except the validation/test hold-outs.
dl = ActivityDataset(f"{ddir}/processed/activity/data/peaks.bed", ds.fasta_file, bigwigs,
                     excl_chromosomes=valid_chr + test_chr)
# load all examples into memory (parallel workers)
train = dl.load_all(num_workers=10)
# Targets: one column per assay (keys of `bigwigs`).
dfy = pd.DataFrame(train['targets'])
# Validation and test splits restrict to the held-out chromosomes.
valid = ActivityDataset(f"{ddir}/processed/activity/data/peaks.bed", ds.fasta_file, bigwigs,
                        incl_chromosomes=valid_chr).load_all(num_workers=10)
dfy_valid = pd.DataFrame(valid['targets'])
test = ActivityDataset(f"{ddir}/processed/activity/data/peaks.bed", ds.fasta_file, bigwigs,
                       incl_chromosomes=test_chr).load_all(num_workers=10)
dfy_test = pd.DataFrame(test['targets'])
dfy.head()
len(dl)
import scipy.stats as stats
# IPython magic — render figures inline in the notebook.
%matplotlib inline
paper_config()
# Q-Q plots: check whether log10(1 + counts) is roughly normal per assay,
# motivating the log transform used for the regression targets below.
fig, axes = plt.subplots(1, len(assays), figsize=(9, 3), sharex=True, sharey=True)
for a, ax in zip(assays, axes):
    stats.probplot(np.log10(1 + dfy[a]), dist="norm", plot=ax);
    ax.set_title(a)
plt.tight_layout()
from basepair.BPNet import BPNetPredictor
from keras.models import Model, Sequential
import keras.layers as kl
# Directory of the pre-trained BPNet model to use as a feature extractor.
mdir = f"{ddir}/processed/chipnexus/exp/models/osnk-pstat-sall-smad-zfp/models/c_task_weight=5,filters=128/"
bpnet = BPNetPredictor.from_mdir(mdir)
bpnet.tasks
bpnet.model.get_layer("add_9").output
# Truncate BPNet at the "add_9" layer to expose bottleneck activations
# (presumably the final residual sum of the dilated-conv stack — confirm).
bottleneck_model = Model(bpnet.model.inputs, bpnet.model.get_layer("add_9").output)
# Pre-compute bottleneck features for all three splits.
bottleneck_predictions = bottleneck_model.predict(train['inputs'], batch_size=32, verbose=1)
bottleneck_predictions_valid = bottleneck_model.predict(valid['inputs'], batch_size=32, verbose=1)
bottleneck_predictions_test = bottleneck_model.predict(test['inputs'], batch_size=32, verbose=1)
bottleneck_predictions.shape
# Number of channels in the BPNet bottleneck (matches filters=128 in `mdir`).
n_filters = 128
# Regression head on top of the (1000, n_filters) bottleneck feature map.
# NOTE(review): an earlier GlobalAvgPool1D + Dense(3) head was assigned to
# `top_model` here and immediately overwritten — removed as dead code; only
# the head below was ever used. A commented-out Dropout(.5) after the hidden
# layer was likewise dropped.
top_model = Sequential([
    kl.MaxPool1D(pool_size=50, input_shape=(1000, n_filters)),  # 1000 -> 20 positions
    kl.Flatten(),
    kl.Dense(64, activation='relu'),
    kl.Dense(3),  # one linear output per assay (len(assays) == 3)
])
from concise.metrics import var_explained
from keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
top_model.compile("adam", 'mse', metrics=[var_explained])
# Standardize log10(1 + counts) targets; the scaler is fit on the train
# split only and reused for validation/test.
preproc = StandardScaler()
y = preproc.fit_transform(np.log10(1 + dfy).values)
y_valid = preproc.transform(np.log10(1 + dfy_valid).values)
y_test = preproc.transform(np.log10(1 + dfy_test).values)
# Train the head on frozen bottleneck features; stop after 5 epochs
# without validation improvement.
top_model.fit(bottleneck_predictions, y, batch_size=512,
              epochs=100,
              validation_data=(bottleneck_predictions_valid, y_valid),
              callbacks=[EarlyStopping(patience=5)]
              )
# Evaluate the frozen-bottleneck head on the validation split.
# NOTE(review): a stray dead assignment `a = 1` stood here — its value was
# never read (the loop below rebinds `a`); removed.
ypred_valid = top_model.predict(bottleneck_predictions_valid)
ypred_valid.shape
# Per-assay predicted vs. observed (standardized log10 counts).
fig, axes = plt.subplots(len(assays), 1, figsize=(5, 11), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(assays, axes)):
    regression_eval(ypred_valid[:, i], y_valid[:, i], alpha=0.05, task=a, ax=ax)
plt.tight_layout()
# Fine-tune end-to-end: BPNet bottleneck + regression head stacked,
# starting from the already-fitted weights of both parts.
whole_model = Sequential([bottleneck_model, top_model])
whole_model.compile("adam", 'mse', metrics=[var_explained])
whole_model.fit(train['inputs']['seq'], y, batch_size=512,
                epochs=100,
                validation_data=(valid['inputs']['seq'], y_valid),
                callbacks=[EarlyStopping(patience=5)])
ypred_valid = whole_model.predict(valid['inputs']['seq'])
ypred_valid.shape
# Per-assay predicted vs. observed on the validation split.
fig, axes = plt.subplots(len(assays), 1, figsize=(5, 11), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(assays, axes)):
    regression_eval(ypred_valid[:,i], y_valid[:,i], alpha=0.05, task=a, ax=ax);
plt.tight_layout()
import keras.backend as K
from keras.models import load_model
def reset_weights(model):
    """Re-initialize a model's weights in place (TF1 graph mode, via the
    backend session).

    Fix: the original only re-ran the *kernel* initializers, so biases kept
    their trained values and the "reinitialized" baseline was not a true
    re-initialization. Biases are now reset as well (skipped for layers
    built with use_bias=False, whose `bias` is None).
    """
    session = K.get_session()
    for layer in model.layers:
        if hasattr(layer, 'kernel_initializer'):
            layer.kernel.initializer.run(session=session)
        if hasattr(layer, 'bias_initializer') and getattr(layer, 'bias', None) is not None:
            layer.bias.initializer.run(session=session)
# Randomized-initialization control: save the fine-tuned model, reload a
# copy, re-initialize its weights, and train from scratch for comparison.
whole_model.save(f"{ddir}/processed/activity/models/dense/fine-tuned.h5")
reinitialized_model = load_model(f"{ddir}/processed/activity/models/dense/fine-tuned.h5")
reset_weights(reinitialized_model.layers[0])  # bottleneck sub-model
reset_weights(reinitialized_model.layers[1])  # regression head
reinitialized_model.compile("adam", 'mse', metrics=[var_explained])
reinitialized_model.fit(train['inputs']['seq'], y, batch_size=512,
                        epochs=100,
                        validation_data=(valid['inputs']['seq'], y_valid),
                        callbacks=[EarlyStopping(patience=5)])
ypred_valid = reinitialized_model.predict(valid['inputs']['seq'])
ypred_valid.shape
# Per-assay predicted vs. observed for the from-scratch baseline.
fig, axes = plt.subplots(len(assays), 1, figsize=(5, 11), sharex=True, sharey=True)
for i, (a, ax) in enumerate(zip(assays, axes)):
    regression_eval(ypred_valid[:,i], y_valid[:,i], alpha=0.05, task=a, ax=ax);
plt.tight_layout()