# Notebook parameters (papermill-style cell): `exp` is the experiment id that
# encodes the model hyper-parameters, `gpu` the GPU index to run on.
exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE'
gpu = 1
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.config import get_figsize, paper_config
from basepair.extractors import bw_extract
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from pathlib import Path
from keras.models import load_model
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.losses import MultichannelMultinomialNLL
from basepair.config import valid_chr, test_chr
from basepair.plot.evaluate import regression_eval
from basepair.cli.evaluate import eval_profile
from basepair import samplers
from basepair.math import softmax
from basepair.exp.paper.config import *
import matplotlib.ticker as ticker
import warnings
warnings.filterwarnings("ignore")
# Use matplotlib paper config
paper_config()
from basepair.config import get_data_dir, get_repo_root
#ddir = get_data_dir()
rdir = get_repo_root()
# Common paths
model_dir = models_dir / exp  # `models_dir` presumably comes from the star import above — verify
# figures = f"{ddir}/figures/model-evaluation/chipnexus-bpnet"
# NOTE(review): `ddir` is used below but its assignment above is commented out —
# presumably it is provided by `from basepair.exp.paper.config import *`; confirm.
figures = f"{ddir}/figures/model-evaluation/chipnexus-bpnet/{exp}"
# Parameters
dataspec_file = rdir / "src/chipnexus/train/seqmodel/ChIP-nexus.dataspec.yml"
history_file = model_dir / "history.csv"
seq_width = 1000   # width (bp) of the peak windows
num_workers = 10   # parallel workers for data loading
ds = DataSpec.load(dataspec_file)
tasks = list(ds.task_specs)  # TF task names declared in the dataspec
create_tf_session(gpu)
# IPython automagic: list the model directory contents
ls {model_dir}
from basepair.seqmodel import SeqModel
# Load the trained, bias-calibrated model
bpnet = SeqModel.load(model_dir / 'calibrated_seqmodel.pkl')
profile_bias_pool_size=[1,50] # Note: this is specific to the model
# Delete the cached predictions to force re-computation in the next cell
!rm {model_dir}/preds.test.pkl
# Get the predictions
# Compute the test-set predictions once and cache them as a pickle next to the model.
if not os.path.exists(model_dir / "preds.test.pkl"):
    dl_test = StrandedProfile(ds,
                              incl_chromosomes=test_chr,
                              peak_width=seq_width,
                              shuffle=False,
                              target_transformer=AppendCounts(),
                              profile_bias_pool_size=profile_bias_pool_size)
    test = dl_test.load_all(num_workers=num_workers)
    y_true = test["targets"]
    y_pred = bpnet.predict(test['inputs'])
    write_pkl((test, y_pred), model_dir / "preds.test.pkl")
# Load predictions
test, y_pred = read_pkl(model_dir / "preds.test.pkl")
y_true = test['targets']
# Load the training data and compute the average profile
dl_train = StrandedProfile(ds,
                           excl_chromosomes=test_chr + valid_chr,
                           peak_width=seq_width,
                           shuffle=False)
train_data = dl_train.load_all(num_workers=num_workers)
# Mean training profile per task — used later as a naive "Average" baseline
avg_profile = {k.split("/")[1]: v.mean(axis=0) for k,v in train_data['targets'].items()}
# Inspect which prediction keys are available (notebook cell output)
y_pred.keys()
# One scatter plot per task: predicted vs. observed total counts on log-log axes
for task in tasks:
    fig, ax= plt.subplots(figsize=get_figsize(frac=0.25, aspect=1))
    # counts are stored in log space; np.exp recovers raw counts
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)  # identity line
    regression_eval(yt,
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    plt.minorticks_off()
    # save the figure
    os.makedirs(f"{figures}/scatter", exist_ok=True)
    fig.savefig(f"{figures}/scatter/{task}.pdf")
    fig.savefig(f"{figures}/scatter/{task}.png")
# Combined scatter figure: one panel per task with shared axes
fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)  # identity line
    regression_eval(yt,
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        # only the leftmost panel keeps its y-axis label
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all.pdf")
fig.savefig(f"{figures}/scatter/all.png")
from basepair.plots import plot_profiles
from basepair import samplers
from kipoi.data_utils import get_dataset_item
from kipoi.metadata import GenomicRanges
import pybedtools
from basepair.utils import flatten_list
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import dfint_no_intersection
from pybedtools import BedTool
# Figure out valid indices (non-overlapping)
df_ranges = pd.DataFrame(test['metadata']['range'])[['chr', 'start','end']]
df_ranges_tasks = {t: df_ranges[test['metadata']['interval_from_task'] == t] for t in bpnet.tasks}
all_intervals = list(BedTool.from_dataframe(df_ranges))
# Greedily keep, task by task, the peaks that do not intersect any peak kept so far
o = dict()
for i,t in enumerate(bpnet.tasks):
    # NOTE(review): .iloc with a boolean Series relies on older-pandas behavior;
    # newer pandas would prefer .loc here — verify against the pinned version.
    dft = df_ranges.iloc[test['metadata']['interval_from_task'] == t]
    if i == 0:
        o[t] = dft
    else:
        df_existing = pd.concat(list(o.values()), axis=0)
        o[t] = dft[dfint_no_intersection(dft, df_existing)]
valid_idx = pd.concat(list(o.values()), axis=0).index
# Boolean mask over all peaks: True if the peak survived the overlap filter
valid_idx_bool = pd.Series(np.arange(len(df_ranges))).isin(valid_idx)
print("Fraction of non-overlapping peaks:", valid_idx_bool.mean())
from genomelake.extractors import FastaExtractor
from basepair.preproc import resize_interval
# Sequence length needed at the input so that the model's output is 1000 bp wide
input_seqlen = 1000 - bpnet.body.get_len_change() - bpnet.heads[0].net.get_len_change()
# blacklisted_idx = list({idx
#                         for task,_ in n_blacklist.items()
#                         for idx in samplers.top_max_count(y_true[f'profile/{task}'],15)
#                         })
# blacklisted = pd.Series(np.arange(len(test['metadata']['interval_from_task']))).isin(blacklisted_idx)
# Check how much of the total counts is allocated at a single position
yt = sum([y_true[f'profile/{task}']
          for task in tasks])
(yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2)).max()
# Per-example summary statistics (pooled across tasks), used below for filtering
max_frac = (yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2))
max_pos = (yt ).max(axis=(-1, -2))
total_counts = (yt ).sum(axis=(-1, -2))
n_zeros = np.sum(yt == 0, axis=(-1, -2))
from basepair.exp.paper.config import tf_colors
# Generate the right colors
# Two identical color tuples are appended per task — presumably one for the
# observed and one for the predicted track plotted below; verify against
# the viz_dict construction.
colors = []
for task in bpnet.tasks:
    colors.append((tf_colors[task], tf_colors[task] + "80")) # 80 add alpha=0.5
    colors.append((tf_colors[task], tf_colors[task] + "80")) # 80 add alpha=0.5
def to_neg(track):
    """Negate the reverse-strand channel (column 1) of a profile track.

    Returns a copy so the input array is never modified in place.
    """
    flipped = track.copy()
    flipped[:, 1] *= -1
    return flipped
from basepair.samplers import top_sum_count, top_max_count
def top_max_count(arr, end=10, start=0, keep=None):
    """Rank examples by max(pos strand) + max(neg strand) of their profile.

    NOTE: intentionally shadows `basepair.samplers.top_max_count` imported above.

    Args:
        arr: array of shape (examples, positions, strands)
        end: where to stop returning the values
        start: where to start returning the values
        keep: optional index array / boolean mask restricting the candidates

    Returns:
        pandas Index of example ids ranked start:end by the score.
    """
    assert end > start
    if keep is None:
        keep = np.arange(len(arr))
    # Per-example score: strand-wise positional maximum, summed over strands
    scores = pd.Series(arr.max(axis=1).sum(axis=1))
    ranked = scores[keep].sort_values(ascending=False)
    return ranked.index[start:end]
# Get the idx to test
idx_set = set()
for task in bpnet.tasks:
    # Restrict to this task's own non-overlapping peaks and drop windows where a
    # single position holds >400 reads while >1000 positions are zero
    # (artifact-like peaks).
    # NOTE(review): `valid_idx_bool` appears twice in the conjunction — the
    # second occurrence is redundant.
    keep = ((test['metadata']['interval_from_task'] == task) & valid_idx_bool &
            valid_idx_bool & ~((max_pos > 400) & (n_zeros > 1000))
            )
    idx_set.update(top_max_count(y_true[f'profile/{task}'],10,keep=keep))
    idx_set.update(top_sum_count(y_true[f'profile/{task}'],10,keep=keep))
# Plotting configuration
trim_edge = 300  # bp trimmed from each side of the 1000 bp window for display
xlim = [trim_edge, 1000 - trim_edge]
fig_width=get_figsize(0.5)[0]
rotate_y=90
fig_height_per_track=0.5
tasks = bpnet.tasks
# Plot observed vs. predicted profiles for each selected example
for idx in idx_set:
    # get the interval for that idx
    r = get_dataset_item(test['metadata']['range'], idx)
    interval = pybedtools.create_interval_from_list([r['chr'], int(r['start']), int(r['end'])])
    interval_str = f"{interval.chrom}:{interval.start + trim_edge}-{interval.end - trim_edge}"
    # make prediction
    # NOTE(review): FastaExtractor is re-created on every iteration; it could be
    # hoisted out of the loop.
    fe = FastaExtractor(ds.fasta_file)
    seq = fe([resize_interval(interval, input_seqlen, ignore_strand=True)])
    # presumably neutral bias tracks so only the sequence drives the prediction — verify
    x = bpnet.neutral_bias_inputs(1000, 1000)
    x['seq'] = seq
    pred = bpnet.predict(x)
    # compile the list of tracks to plot (Obs + Pred per task; reverse strand negated)
    viz_dict =flatten_list([[
        # Observed
        (f"{task}\nObs", to_neg(y_true[f'profile/{task}'][idx])),
        # Predicted
        (f"\nPred", to_neg(pred[f'{task}/profile'][0] * np.exp(pred[f'{task}/counts'][0]))),
    ] for task_idx, task in enumerate(tasks)])
    sl = slice(*xlim)
    # Get ylim
    # Symmetric y-limits per track (reverse strand is drawn as negative values)
    ylim = []
    for task in tasks:
        m = y_true[f'profile/{task}'][idx][sl].max()
        ylim.append((-m,m))
        m = (pred[f'{task}/profile'][0] * np.exp(pred[f'{task}/counts'][0])).max()
        ylim.append((-m,m))
    fig = plot_tracks(filter_tracks(viz_dict, xlim),
                      title=interval_str,
                      fig_height_per_track=fig_height_per_track,
                      rotate_y=rotate_y,
                      use_spine_subset=True,
                      color=colors,
                      fig_width=fig_width,
                      ylim=ylim,
                      legend=False)
    fig.align_ylabels()
    sns.despine(top=True, right=True, bottom=True)
    os.makedirs(f"{figures}/profiles", exist_ok=True)
    fig.savefig(f"{figures}/profiles/top-sum-max-count-{idx}-{interval_str}.pdf")
    # fig.savefig(f"{figures}/profiles/top-sum-max-count-{idx}-{interval_str}.png")
# Load the replicate metadata sheet and keep mnemonics N01..N20
rep_indices = [f'N{x:02d}' for x in np.arange(1, 21)]
reps = pd.read_csv("https://docs.google.com/spreadsheets/d/1PvHGy0P9_Yq0tZFw807bjadxaZHAYECE4RytlI9rdeQ/export?gid=0&format=csv")
reps = reps[reps.Mnemonic.isin(rep_indices)]
reps.head()
from basepair.cli.schemas import DataSpec, TaskSpec
def get_bigwig(row, strand):
    """Build the path to one replicate's count bigwig from a spreadsheet row.

    Args:
        row: mapping with 'Rep Number', 'TF Name' and 'FTP Name' fields
        strand: 'positive' or 'negative'

    Returns:
        Absolute path (str) to the bigwig produced by the chip-nexus pipeline.
    """
    shard = int(row['Rep Number']) - 1  # pipeline shards are 0-based
    tf = row['TF Name']
    stem = row['FTP Name'].split(".")[0]
    base = "/srv/scratch/amr1/chip-nexus-pipeline/cromwell-executions/chip_nexus"
    return (f"{base}/{tf}/call-count_signal_track/shard-{shard}"
            f"/execution/{stem}.trim.merged.nodup.{strand}.bigwig")
def get_taskspec(rows):
    """Assemble a TaskSpec from all replicate rows belonging to one TF.

    Args:
        rows: DataFrame slice with the replicates of a single TF

    Returns:
        TaskSpec with capitalized task name and per-replicate bigwig paths.
    """
    task = rows.iloc[0]['TF Name'].capitalize()
    pos = [get_bigwig(r, 'positive') for _, r in rows.iterrows()]
    neg = [get_bigwig(r, 'negative') for _, r in rows.iterrows()]
    return TaskSpec(task=task, pos_counts=pos, neg_counts=neg)
# Build DataSpecs with only the training replicates vs. only the held-out replicates
dataspec_train = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == False)])
                                      for task in tasks},
                          fasta_file="")
dataspec_train.touch_all_files()  # fail fast if any bigwig path does not exist
dataspec_test = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == True)])
                                     for task in tasks},
                         fasta_file="")
dataspec_test.touch_all_files()
# Extract replicate counts over the same peak intervals used for the model.
# NOTE(review): this rebinds `test` (previously the model's test-set dict) to a
# dict of replicate count arrays — `y_true`/`y_pred` above are unaffected.
train = dataspec_train.load_counts(all_intervals, progbar=True)
test = dataspec_test.load_counts(all_intervals, progbar=True)
# Setup the evaluation metric
from basepair.exp.paper.config import peak_pred_metric
from basepair.metrics import *
# Override the binsizes for peak_pred_metric
binsizes = [1, 2, 5, 10, 20]
peak_pred_metric.binsizes = binsizes
# Combined metric: Pearson/Spearman on counts + binned auPRC on profiles
metric = BPNetMetricSingleProfile(
    count_metric=pearson_spearman,
    profile_metric=peak_pred_metric
)
def profile_metrics2df(m):
    """Flatten nested profile metrics into a tidy DataFrame.

    Args:
        m: mapping {task: {'profile': {'binsize=<k>': {metric: value, ...}}}}

    Returns:
        DataFrame with columns task, binsize and one column per metric.
    """
    records = [
        {"task": task,
         "binsize": int(binsize_str.split("=")[1]),
         **metrics}
        for task, v in m.items()
        for binsize_str, metrics in v['profile'].items()
    ]
    return pd.DataFrame(records)
def count_metrics2df(m):
    """Flatten count metrics into a tidy DataFrame.

    Args:
        m: mapping {task: {'counts': {metric: value, ...}}}

    Returns:
        DataFrame with a task column plus one column per count metric.
    """
    return pd.DataFrame([{"task": task, **v['counts']}
                         for task, v in m.items()])
# Evaluate against the held-out replicates: Replicates / BPNet / Average baselines
metrics_rep = {task: metric(test[task], train[task]) for task in tasks}
metrics_model = {task: metric(test[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis])) for task in tasks}
# average the training signal
metrics_avg = {task: metric(test[task],
                            np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0))
               for task in tasks}
dfm_profile = profile_metrics2df(metrics_rep).assign(method='Replicates')
dfm_counts = count_metrics2df(metrics_rep).assign(method='Replicates')
# "Random" baseline: reuse the replicate rows but report their random_auprc column
df_auprc_random = dfm_profile.copy()
df_auprc_random['auprc'] = df_auprc_random['random_auprc']
df_auprc_random['method'] = 'Random'# + df_auprc_random['method']
dfm_profile = pd.concat([dfm_profile, df_auprc_random])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_model).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_avg).assign(method='Average')])
dfm_counts = pd.concat([dfm_counts, count_metrics2df(metrics_model).assign(method='BPNet')])
dfm_profile['split'] = 'train->test'
dfm_profile1 = dfm_profile.copy()
dfm_counts['split'] = 'train->test'
dfm_counts1 = dfm_counts.copy()
# Swap train and test.
# Same evaluation with the roles of the replicate sets swapped
metrics_rep = {task: metric(train[task], test[task]) for task in tasks}
metrics_model = {task: metric(train[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis])) for task in tasks}
# average the training signal
metrics_avg = {task: metric(train[task],
                            np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0))
               for task in tasks}
dfm_profile = profile_metrics2df(metrics_rep).assign(method='Replicates')
dfm_counts = count_metrics2df(metrics_rep).assign(method='Replicates')
df_auprc_random = dfm_profile.copy()
df_auprc_random['auprc'] = df_auprc_random['random_auprc']
df_auprc_random['method'] = 'Random'# + df_auprc_random['method']
dfm_profile = pd.concat([dfm_profile, df_auprc_random])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_model).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_avg).assign(method='Average')])
dfm_counts = pd.concat([dfm_counts, count_metrics2df(metrics_model).assign(method='BPNet')])
# NOTE(review): the split label is still 'train->test' although train/test were
# swapped above — likely intended to be 'test->train'; verify before relying on it.
dfm_profile['split'] = 'train->test'
dfm_profile2 = dfm_profile.copy()
dfm_counts['split'] = 'train->test'
dfm_counts2 = dfm_counts.copy()
["BPNet", "Replicates", "Average", "Random"]
df_auprc_tidy = pd.concat([dfm_profile1, dfm_profile2])
df_auprc_tidy['method'] = pd.Categorical(df_auprc_tidy['method'], categories=["BPNet", "Replicates", "Average", "Random"])
df_auprc_tidy['task'] = pd.Categorical(df_auprc_tidy['task'], categories=df_auprc_tidy['task'].unique())
df_counts_tidy = pd.concat([dfm_counts1, dfm_counts2])
df_counts_tidy['method'] = pd.Categorical(df_counts_tidy['method'], categories=["BPNet", "Replicates"])
df_counts_tidy['task'] = pd.Categorical(df_counts_tidy['task'], categories=df_counts_tidy['task'].unique())
from plotnine import *
import plotnine
plotnine.__version__
paper_config()
# Keep only the smaller binsizes (1, 2, 5, 10)
df_auprc_tidy = df_auprc_tidy[df_auprc_tidy.binsize < 11]
plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
# auPRC vs. binsize (mean over splits), one facet per task, colored by method
fig = ggplot(aes(x='binsize', y='auprc', color='method'), data=df_auprc_tidy.groupby(['binsize', 'task', 'method']).auprc.mean().reset_index()) + \
    scale_x_log10(breaks=binsizes) + \
    geom_point() + \
    geom_line() + \
    facet_grid(".~task") + \
    theme_classic(base_size=10, base_family='Arial') + \
    theme(legend_position='top') + \
    scale_color_brewer('qual', 7)
!mkdir -p {figures}/profile-metrics
fig.save(f"{figures}/profile-metrics/auprc.pdf")
fig.save(f"{figures}/profile-metrics/auprc.png", dpi=300)
fig
!mkdir -p {figures}/profile-metrics
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    """Scatter-plot predicted vs. observed values and annotate the Spearman R.

    Overrides `basepair.plot.evaluate.regression_eval` imported at the top of
    the file for the remaining cells.

    Args:
        y_true: observed values (1-d array, strictly positive when loglog=True)
        y_pred: predicted values (1-d array, strictly positive when loglog=True)
        alpha: point transparency
        markersize: scatter marker size
        task: plot title
        ax: matplotlib axes to draw on; a new figure is created if None
        same_lim: use identical x/y limits spanning both series
        loglog: draw on log-log axes and correlate the log10-transformed values
    """
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    if loglog:
        # BUGFIX: the original passed np.log(y_pred) to spearmanr while using
        # np.log10 everywhere else; use log10 consistently. (Spearman is
        # invariant under monotone transforms, so the displayed R is unchanged.)
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log10(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    # loglog switches both axes to log scale
    plt_fn = ax.loglog if loglog else ax.plot
    plt_fn(y_pred, y_true, ".",
           markersize=markersize,
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    ax.set_title(task)
    # Only the Spearman correlation is displayed on the panel; the Pearson value
    # is computed but not shown (kept for parity with the library version).
    ax.text(0.95, .01, f"R={spearman:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            fontsize=8,
            transform=ax.transAxes)
# Quick sanity checks of the count scales (interactive notebook cells).
# NOTE(review): `task` here is the leftover loop variable from the last loop
# above, i.e. the final task only — verify this is intended.
yt.mean()
(train[task].sum(axis=1).mean(-1) + 1).mean()
yt.mean()
offset = train[task].sum(axis=1).mean(-1) + 1
np.exp(y_pred[f'{task}/counts'].mean(-1)).mean()
# Final 2-row figure: BPNet vs. observed counts (top row) and
# replicate vs. replicate counts (bottom row), one column per task.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(frac=.5, aspect=2/len(tasks)),
                         sharex=True, sharey=True)
for i, task in enumerate(tasks):
    for j in range(2):
        ax = axes[j,i]
        if j == 0:
            # yp = np.exp(y_pred[f'{task}/counts'].mean(-1)*0.9)
            yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
            # yt = y_true[f'{task}/counts'].mean(-1) + 1
            yt = np.exp(y_true[f'counts/{task}'].mean(-1))
        else:
            # replicate-vs-replicate counts; +1 keeps values valid on log axes
            yt = test[task].sum(axis=1).mean(-1) + 1
            yp = train[task].sum(axis=1).mean(-1) + 1
        xrange = [2, 5e3]
        ax.set_ylim(xrange)
        ax.set_xlim(xrange)
        ax.plot(xrange, xrange, c='grey', alpha=0.2)  # identity line
        regression_eval(yt,
                        yp, alpha=.01, task=task if j == 0 else '', ax=ax, loglog=True)
        ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        if i > 0:
            ax.set_ylabel("")
        else:
            ax.set_ylabel("BPNet\nObserved" if j==0 else "Replicates\n")
fig.subplots_adjust(wspace=0, hspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all.pdf", dpi=300)
# fig.savefig(f"{figures}/scatter/all.png")
figures
# Hand-computed averages — presumably of the per-task R values read off the
# panels above; verify against the figure.
(0.96+.88+.98+.93)/4
(0.63+.61+.58+.6)/4