# exp = 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE'
# With augmentation
# NOTE(review): `exp` encodes the model/experiment configuration as a single
# comma-separated string; the field meanings are defined elsewhere — confirm.
exp = 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE'
# Apply interval augmentation (random shift / strand flip) to both splits
augment_train = True
augment_test = True
# GPU device id passed to create_tf_session() below
gpu = 0
# Model artifacts used by this notebook:
#   {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/model.h5
#   {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/dataspec.yaml
#   -> plus all files listed in dataspec.yaml
# Imports
# Notebook magics: inline plotting at retina resolution
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.config import get_figsize, paper_config
from basepair.extractors import bw_extract
from matplotlib import ticker
# Use matplotlib paper config
paper_config()
from basepair.exp.paper.config import *
# Common paths
# NOTE(review): `models_dir` and `ddir` come from the star imports above — confirm.
model_dir = models_dir / exp
figures = f"{ddir}/figures/model-evaluation/chipseq-bpnet/{exp}"
# Parameters
model_file = model_dir / "model.h5"
dataspec_file = "../../chipnexus/train/seqmodel/ChIP-seq.dataspec.yml"
history_file = model_dir / "history.csv"
seq_width = 1000   # width (bp) of the profile window loaded per peak
num_workers = 10   # parallel workers for data loading
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from pathlib import Path
from keras.models import load_model
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.losses import MultichannelMultinomialNLL
from basepair.config import valid_chr, test_chr
from basepair.plots import regression_eval, plot_loss
from basepair.cli.evaluate import eval_profile
from basepair import samplers
from basepair.math import softmax
# !mv {model_dir}/preds.test.pkl {model_dir}/preds.test.bak.pkl
# Load the dataspec (per-task bigwig + fasta file locations)
ds = DataSpec.load(dataspec_file)
tasks = list(ds.task_specs)
# Cache predictions
create_tf_session(gpu)
#bpnet = BPNetPredictor.from_mdir(model_dir)
from basepair.seqmodel import SeqModel
# Load the trained BPNet model from the model directory
bpnet = SeqModel.from_mdir(model_dir)
profile_bias_pool_size=[1,50]
from basepair.preproc import IntervalAugmentor
# Interval transformers: random shift + strand flip when augmentation is
# enabled, identity function otherwise
if augment_train:
    train_interval_transformer = IntervalAugmentor(max_shift=200, flip_strand=True)
else:
    train_interval_transformer = lambda x: x
if augment_test:
    test_interval_transformer = IntervalAugmentor(max_shift=400, flip_strand=True)
else:
    test_interval_transformer = lambda x: x
# if not os.path.exists(model_dir / "preds.test.pkl"):
model = bpnet.model
# Test-set dataloader: held-out chromosomes, total counts appended to targets
dl_test = StrandedProfile(ds,
                          incl_chromosomes=test_chr,
                          peak_width=seq_width,
                          shuffle=False,
                          target_transformer=AppendCounts(),
                          interval_transformer=test_interval_transformer,
                          profile_bias_pool_size=profile_bias_pool_size)
test = dl_test.load_all(num_workers=num_workers)
y_true = test["targets"]
# Predictions with the full inputs (incl. bias tracks) vs. sequence only
y_pred = bpnet.predict(test['inputs'])
y_pred_seqonly = bpnet.predict(test['inputs']['seq'])
# write_pkl((test, y_pred), model_dir / "preds.test.pkl")
# Load predictions
# test, y_pred = read_pkl(model_dir / "preds.test.pkl")
# y_true = test['targets']
# Load the training data and compute the average profile
dl_train = StrandedProfile(ds,
                           excl_chromosomes=test_chr + valid_chr,
                           peak_width=seq_width,
                           interval_transformer=train_interval_transformer,
                           shuffle=False)
train_data = dl_train.load_all(num_workers=num_workers)
# Average observed profile per task (keys 'profile/<task>' -> '<task>')
avg_profile = {k.split("/")[1]: v.mean(axis=0) for k,v in train_data['targets'].items()}
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    """Scatter-plot observed vs. predicted values and annotate the axis
    with the Pearson and Spearman correlation coefficients.

    Args:
      y_true: 1-d array of observed values (must be positive when loglog=True)
      y_pred: 1-d array of predicted values (must be positive when loglog=True)
      alpha: point transparency
      markersize: scatter point size
      task: plot title
      ax: matplotlib axis to draw on (a new figure is created when None)
      same_lim: if True, use identical x/y limits spanning both arrays
      loglog: if True, plot on log-log axes and compute correlations on
        log10-transformed values
    """
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        # BUG FIX: previously np.log() was applied to y_pred while np.log10()
        # was applied to y_true. Spearman is rank-based and hence invariant to
        # monotone transforms, so the value is unchanged, but the transform is
        # now consistent.
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log10(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    plt_fn = ax.loglog if loglog else ax.plot
    plt_fn(y_pred, y_true, ".",
           markersize=markersize,
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    rp = r"$R_{p}$"
    rs = r"$R_{s}$"
    ax.set_title(task)
    ax.text(.95, .2, f"{rp}={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
    ax.text(.95, .05, f"{rs}={spearman:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
y_pred.keys()
import matplotlib.ticker as ticker
import warnings
warnings.filterwarnings("ignore")
# One scatter plot per task: observed vs. predicted total counts.
# Counts are modeled in log space, hence the np.exp() below.
for task in tasks:
    fig, ax = plt.subplots(figsize=get_figsize(frac=0.25, aspect=1))
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    # Diagonal reference line (perfect prediction)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt,
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    plt.minorticks_off()
    # save the figure
    os.makedirs(f"{figures}/scatter", exist_ok=True)
    fig.savefig(f"{figures}/scatter/{task}.pdf")
    fig.savefig(f"{figures}/scatter/{task}.png")
# Combined figure: one scatter panel per task, shared axes
fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt,
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    # Only the leftmost panel keeps its y-label
    if i > 0:
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all.pdf")
fig.savefig(f"{figures}/scatter/all.png")
# Bias-corrected version: compare sequence-only predictions against
# observed counts after subtracting the predicted bias contribution
fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    yp_all = y_pred[f'{task}/counts'].mean(-1)
    # Bias contribution = full-input prediction minus sequence-only prediction
    yp_bias = yp_all - y_pred_seqonly[f'{task}/counts'].mean(-1)
    yt = np.exp(y_true[f'counts/{task}'].mean(-1) - yp_bias)
    yp = np.exp(y_pred_seqonly[f'{task}/counts'].mean(-1))
    xrange = [1, 2e2]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt,
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
# NOTE(review): matplotlib's savefig() has no `raster` kwarg — this call
# likely raises TypeError; `rasterized` belongs on the artists. Verify.
fig.savefig(f"{figures}/scatter/all-bias-corrected.pdf", raster=True)
from basepair.plots import plot_profiles
from basepair import samplers
from kipoi.data_utils import get_dataset_item
from kipoi.metadata import GenomicRanges
import pybedtools
from basepair.utils import flatten_list
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import dfint_no_intersection
from pybedtools import BedTool
# Figure out valid indices (non-overlapping)
df_ranges = pd.DataFrame(test['metadata']['range'])[['chr', 'start','end']]
df_ranges_tasks = {t: df_ranges[test['metadata']['interval_from_task'] == t] for t in bpnet.tasks}
all_intervals = list(BedTool.from_dataframe(df_ranges))
o = dict()
# Greedily keep, per task, only the peaks that do not intersect the
# peaks already kept for previously processed tasks
for i,t in enumerate(bpnet.tasks):
    dft = df_ranges.iloc[test['metadata']['interval_from_task'] == t]
    if i == 0:
        o[t] = dft
    else:
        df_existing = pd.concat(list(o.values()), axis=0)
        o[t] = dft[dfint_no_intersection(dft, df_existing)]
valid_idx = pd.concat(list(o.values()), axis=0).index
valid_idx_bool = pd.Series(np.arange(len(df_ranges))).isin(valid_idx)
print("Fraction of non-overlapping peaks:", valid_idx_bool.mean())
from genomelake.extractors import FastaExtractor
from basepair.preproc import resize_interval
# Model input length required to produce a 1000 bp output window
input_seqlen = 1000 - bpnet.body.get_len_change() - bpnet.heads[0].net.get_len_change()
# Pooled observed profile across all tasks
yt = sum([y_true[f'profile/{task}']
          for task in tasks])
(yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2)).max()
# Per-example summary statistics of the pooled profile
max_frac = (yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2))
max_pos = (yt ).max(axis=(-1, -2))
total_counts = (yt ).sum(axis=(-1, -2))
# trim_edge = 300
# xlim = [trim_edge, 1000 - trim_edge]
# fig_width=get_figsize(0.5)[0]
# rotate_y=90
# fig_height_per_track=0.5
# tasks = bpnet.tasks
# for task in bpnet.tasks:
# print(task)
# for i, idx in enumerate(samplers.top_sum_count(y_true[f'profile/{task}'],10,
# keep=(test['metadata']['interval_from_task'] == task) & valid_idx_bool )):
# #& ~blacklisted
# # for i, idx in enumerate(samplers.top_sum_count(y_true[f'profile/{task}'],10,
# # keep=(test['metadata']['interval_from_task'] == task) & valid_idx_bool & ~((max_pos > 400) & (n_zeros > 1000)))):
# # for idx in samplers.random(y_true[f'profile/{task}'],2):
# # get the interval for that idx
# r = get_dataset_item(test['metadata']['range'], idx)
# interval = pybedtools.create_interval_from_list([r['chr'], int(r['start']), int(r['end'])])
# interval_str = f"{interval.chrom}:{interval.start + trim_edge}-{interval.end - trim_edge}"
# # make prediction
# fe = FastaExtractor(ds.fasta_file)
# seq = fe([resize_interval(interval, input_seqlen, ignore_strand=True)])
# x = bpnet.neutral_bias_inputs(1000, 1000)
# x['seq'] = seq
# pred = bpnet.predict(x)
# # compile the list of tracks to plot
# viz_dict =flatten_list([[
# # Observed
# (f"{task}\nObs", y_true[f'profile/{task}'][idx]),
# # Predicted
# (f"\nPred", pred[f'{task}/profile'][0]),
# ] for task_idx, task in enumerate(tasks)])
# fig = plot_tracks(filter_tracks(viz_dict, xlim),
# title=interval_str,
# fig_height_per_track=fig_height_per_track,
# rotate_y=rotate_y,
# fig_width=fig_width,
# ylim=None,
# legend=False)
# fig.align_ylabels()
# os.makedirs(f"{figures}/profiles", exist_ok=True)
# fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.pdf")
# fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.png")
import matplotlib.pyplot as plt
from basepair.preproc import moving_average
# Sanity-check the moving-average smoothing on two Oct4 profiles
inputs = y_true['profile/Oct4'][:2]
outputs = np.array([moving_average(inp, n=50) for inp in inputs])
inputs[0][:,0].shape
outputs.shape
# Raw profiles (pos strand blue, neg strand orange)
for inp in inputs:
    plt.plot(inp[:,0], color='blue')
    plt.plot(inp[:,1], color='orange')
    plt.show()
# Smoothed profiles
for output in outputs:
    plt.plot(output[:,0], color='blue')
    plt.plot(output[:,1], color='orange')
    plt.show()
# Replicate sheet: mnemonics C01..C07 select the replicates to use
rep_indices = [f'C{x:02d}' for x in np.arange(1, 8)]
reps = pd.read_csv("https://docs.google.com/spreadsheets/d/1PvHGy0P9_Yq0tZFw807bjadxaZHAYECE4RytlI9rdeQ/export?gid=0&format=csv")
reps = reps[reps.Mnemonic.isin(rep_indices)]
reps.head()
from basepair.cli.schemas import TaskSpec
def get_bigwig(row, strand):
    """Return the path of the strand-specific coverage bigwig of one replicate.

    Args:
      row: mapping (e.g. a DataFrame row) with keys 'Rep Number' (1-based,
        int-convertible) and 'TF Name'
      strand: 'positive' or 'negative'
    """
    # Shard directories are 0-based while 'Rep Number' is 1-based
    rep_n = int(row['Rep Number'])
    task = row['TF Name']
    # NOTE: removed the unused local `rep_name = row['FTP Name'].split(".")[0]`
    return f"/srv/scratch/amr1/chip-seq-pipeline2/cromwell-executions/chip/{task}/call-bam2ta/shard-{rep_n - 1}/execution/{task}.{strand}.bw"
def get_taskspec(rows):
    """Build a TaskSpec for one TF from its replicate rows.

    The task name is the capitalized TF name of the first row; positive- and
    negative-strand bigwig paths are collected across all replicate rows.
    """
    task_name = rows.iloc[0]['TF Name'].capitalize()
    pos, neg = [], []
    for idx in range(len(rows)):
        replicate = rows.iloc[idx]
        pos.append(get_bigwig(replicate, 'positive'))
        neg.append(get_bigwig(replicate, 'negative'))
    return TaskSpec(task=task_name, pos_counts=pos, neg_counts=neg)
dataspec_train = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == False)])
for task in tasks},
fasta_file="")
dataspec_test = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == True)])
for task in tasks},
fasta_file="")
dataspec_train.touch_all_files()
dataspec_test.touch_all_files()
# Augment the intervals
# test_interval_transformer = IntervalAugmentor(max_shift=600, flip_strand=True)
shifted_intervals = [test_interval_transformer(interval) for interval in all_intervals]
train = dataspec_train.load_counts(shifted_intervals, progbar=True)
test = dataspec_test.load_counts(shifted_intervals, progbar=True)
ds.touch_all_files()
bias = ds.load_bias_counts(shifted_intervals)
seqs = FastaExtractor(fasta_file, use_strand=True)(shifted_intervals)
y_pred = bpnet.predict(seqs)
from basepair.cli.evaluate import bin_counts_summary
class BinnedMetric:
    """Evaluate a metric on profiles summed ("binned") at several bin sizes.

    For each binsize, both y_true and y_pred are aggregated into
    non-overlapping bins via `bin_counts_summary` (using np.sum) before
    the wrapped metric is applied.
    """

    def __init__(self, metric, binsizes):
        self.metric = metric
        self.binsizes = binsizes

    def __call__(self, y_true, y_pred):
        results = []
        for binsize in self.binsizes:
            yt_binned = bin_counts_summary(y_true, binsize=binsize, fn=np.sum)
            yp_binned = bin_counts_summary(y_pred, binsize=binsize, fn=np.sum)
            record = {"binsize": binsize}
            record.update(self.metric(yt_binned, yp_binned))
            results.append(record)
        return results
from basepair.exp.paper.config import peak_pred_metric
from basepair.metrics import *
def normalize(x):
    """Convert count profiles into smoothed probability distributions.

    Normalizes along the last axis (guarding against division by zero),
    adds a small uniform pseudo-count of 1/(10*L) where L is the profile
    length, and renormalizes so each profile sums to one.
    """
    totals = np.maximum(np.sum(x, axis=-1, keepdims=True), 1e-8)
    probs = x / totals + 1 / (x.shape[-1] * 10)
    return probs / np.sum(probs, axis=-1, keepdims=True)
# Override the binsizes for peak_pred_metric
def multinomial_nll(y_true, y_pred):
    """Multinomial negative log-likelihood of observed counts under the
    (normalized) predicted profile.

    The last axis indexes profile positions; one NLL is returned per profile.
    """
    from scipy.stats import multinomial
    # subset = y_true.sum(axis=-1) > 0
    subset = slice(None)
    observed = y_true[subset]
    total_counts = observed.sum(axis=-1)
    probs = normalize(y_pred[subset])
    return -multinomial.logpmf(x=observed, n=total_counts, p=probs)
def stranded_multinomial_nll(y_true, y_pred):
    """Sum of the per-strand mean multinomial NLLs.

    Expects arrays of shape (batch, positions, 2) where the last axis
    indexes the pos/neg strand; NaNs are ignored by the mean.
    """
    per_strand = [
        np.nanmean(multinomial_nll(y_true[:, :, strand], y_pred[:, :, strand]))
        for strand in (0, 1)
    ]
    return per_strand[0] + per_strand[1]
from basepair.losses import multinomial_nll as k_multinomial_nll
import keras.backend as K
import tensorflow as tf
# y_pred = K.placeholder(ndim=2)
# y_true = K.placeholder(ndim=2)
# nll = K.function([y_true, y_pred], [k_multinomial_nll(y_true, K.log(y_pred / (1 - y_pred)))])
# nll([np.ones((100, 10)), np.ones((100, 10))/10])[0]
# Spot-check the numpy NLL implementation on toy count vectors
multinomial_nll(np.array([1,2,10]), np.array([1,2,10]))
multinomial_nll(np.array([1,2,10]), np.array([1,1,1]))
multinomial_nll(np.array([1,2,10]), np.array([3,2,1]))
# stranded_multinomial_nll(np.ones((100, 10, 2)), np.ones((100, 10, 2))/10)/2
from basepair.stats import symmetric_kl
def padded_symmetric_kl(y_true, y_pred):
    """Symmetric KL divergence between smoothed observed and predicted
    profiles, summed over the two strands.

    Observed counts are smoothed with a 50 bp moving average before both
    arrays are normalized to probability distributions.
    """
    total = 0
    for strand in (0, 1):
        obs = normalize(moving_average(y_true[:, :, strand], n=50))
        pred = normalize(y_pred[:, :, strand])
        total = total + symmetric_kl(obs, pred)
    return total
# NOTE(review): `train_reps` is only defined further below — these notebook
# cells were executed out of order in the original.
for output in outputs:
    plt.plot(train_reps['Oct4'][0][:,0], color='blue')
    plt.plot(train_reps['Oct4'][0][:,1], color='orange')
    plt.show()
plt.plot(test['Nanog'][0][:, 0])
plt.plot(test['Nanog'][0][:, 1])
import random
random.sample(range(100), 3)
# Smooth the training replicate profiles with a 50 bp moving average
train_reps = {}
for task in tasks:
    train_reps[task] = np.array([moving_average(train[task][i], n=50) for i in range(len(train[task]))])
# NOTE(review): `dftm` is only assigned below — out-of-order notebook cell.
dftm
# task = 'Nanog'
# Per-example NLL of the held-out test counts under four reference
# profiles: smoothed replicate, model prediction, average profile and a
# constant (uniform) profile (positive strand only)
dft = pd.concat([pd.DataFrame({
    "idx": np.arange(len(test[task])),
    "rep": multinomial_nll(test[task][:,:,0],
                           moving_average(train_reps[task][:,:,0], n=50)),
    "pred": multinomial_nll(test[task][:,:,0],
                            y_pred[f"{task}/profile"][:,:,0]),
    "avg": multinomial_nll(test[task][:,:,0],
                           np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0)[:,:,0]),
    "const": multinomial_nll(test[task][:,:,0],
                             np.repeat(np.ones_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0)[:,:,0]),
    'task': task
}) for task in tasks]).reset_index()
# Mean NLL per task and method, melted into long format for plotting
dftm = pd.melt(dft.groupby('task').mean().reset_index(), id_vars='task',
               value_vars=['rep', 'pred', 'avg', 'const'])
# 100
dft['rep'].mean()
# 50
dft['rep'].mean()
# 50
dft['pred'].mean()
# TODO - figure out the worst difference
dft.loc[dft.avg.argmax()]
# TODO - figure out the worst difference
dft.loc[dft.rep.argmax()]
# Inspect a single example in detail
i = 11
TF = 'Sox2'
plot_tracks({"obs1": train[TF][i],
             "obs2": test[TF][i],
             "bias": bias['input'][i],
             "smooth1": moving_average(train[TF][i], n=50),
             "smooth2": moving_average(test[TF][i], n=50),
             "bias smooth": moving_average(bias['input'][i], n=50),
             "avg": avg_profile[task],
             "pred": y_pred[f'{TF}/profile'][i],
             }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
dft[(dft.idx == i) & (dft.task == TF)]
# Duplicate of the cell above (kept as in the original notebook)
i = 11
TF = 'Sox2'
plot_tracks({"obs1": train[TF][i],
             "obs2": test[TF][i],
             "bias": bias['input'][i],
             "smooth1": moving_average(train[TF][i], n=50),
             "smooth2": moving_average(test[TF][i], n=50),
             "bias smooth": moving_average(bias['input'][i], n=50),
             "avg": avg_profile[task],
             "pred": y_pred[f'{TF}/profile'][i],
             }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
dft[(dft.idx == i) & (dft.task == TF)]
def plot_track_w_rug(arr, ax, legend=False, ylim=None, color=None, track=None):
    """Plot a genomic track on `ax`.

    Args:
      arr: array of shape (seqlen, 4) -> drawn as a sequence logo, or
        (seqlen, 2) -> drawn as pos/neg strand line plots.
      ax: matplotlib axis to draw on
      legend: add a strand legend (two-strand tracks only)
      ylim: optional y-axis limits
      color: optional (pos_color, neg_color) sequence for the two strands
      track: track name; names containing 'Obs' are smoothed with a 50 bp
        moving average and get a rug marking the raw read positions
    Raises:
      ValueError: if arr's second dimension is neither 2 nor 4
    """
    import collections.abc
    seqlen = len(arr)
    if arr.shape[1] == 4:
        # One-hot encoded sequence -> sequence logo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # Two strands -> one line per strand
        if color is not None:
            # BUG FIX: collections.Sequence was removed in Python 3.10;
            # the ABC lives in collections.abc
            assert isinstance(color, collections.abc.Sequence)
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
        # BUG FIX: guard the default track=None ('Obs' in None raises TypeError)
        if track is not None and 'Obs' in track:
            y_smooth = moving_average(arr, n=50)
            ax.plot(np.arange(1, seqlen + 1), y_smooth[:, 0], label='pos', color=c1)
            ax.plot(np.arange(1, seqlen + 1), y_smooth[:, 1], label='neg', color=c2)
            # Rugs: mark every position with at least one read
            rug_pos = np.where(arr[:, 0] >= 1)[0] + 1
            rug_neg = np.where(arr[:, 1] >= 1)[0] + 1
            sns.rugplot(rug_pos, color=c1, alpha=0.3, ax=ax, height=.1)
            sns.rugplot(rug_neg, color=c2, alpha=0.3, ax=ax, height=.1)
        else:
            ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1)
            ax.plot(np.arange(1, seqlen + 1), arr[:, 1], label='neg', color=c2)
        if legend:
            ax.legend()
    else:
        # BUG FIX: the old message read "shape[1] != {arr.shape[1]}" (negating
        # the actual value) and listed 1 as valid although only 2 and 4 are handled
        raise ValueError(f"Don't know how to plot array with shape[1] == {arr.shape[1]}. Valid values are: 2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.set_xlim([0, seqlen])
# chr1:136680205-136680605 (known Zfp281 enhancer)
# chr1:180924752-180925152 (known Letfy1 enhancer)
intervals = [
    ('known Zfp281 enhancer', 'chr1:136680205-136680605'),
    ('known Letfy1 enhancer', 'chr1:180924752-180925152'),
]
# Assemble tracks (bias + obs + pred per task) for one example
viz_dict = flatten_list([[
    ("Obs bias", bias['input'][i]),
    ("Obs 1", train[TF][i]),
    ("Obs 3", test[TF][i]),
    # "avg": avg_profile[task],
    ("pred", y_pred[f'{TF}/profile'][i]),
] for TF in tasks])
tasks
OrderedDict(viz_dict).keys()
# NOTE(review): viz_dict is a list of tuples here, so .keys() raises
# AttributeError — the OrderedDict(...) line above is the working variant.
viz_dict.keys()
i = 11
TF = 'Sox2'
# Plot 20 random examples: bias track plus, per task, the summed observed
# replicates and the model prediction (using the custom rug plot function)
for i in random.sample(range(len(train[TF])), 20):
    viz_dict = OrderedDict([(f"Obs bias", bias['input'][i])] + flatten_list([[
        (f"{TF} Obs", train[TF][i] + test[TF][i]),
        # (f"{TF} Obs 2", test[TF][i]),
        # "avg": avg_profile[task],
        (f"{TF} pred", y_pred[f'{TF}/profile'][i]),
    ] for TF in tasks]))
    plot_tracks(viz_dict, fig_width=get_figsize(.5)[0], fig_height_per_track=.5,
                color=[['blue', 'orange']]*4*3,
                rotate_y=0, plot_track_fn=plot_track_w_rug);
dft[(dft.idx == i) & (dft.task == TF)]
sns.despine(top=True, bottom=True, right=True)
# Inspect the dynamic range of smoothed/normalized profiles
moving_average(train[TF][i], n=25)
moving_average(train[TF][i], n=50).min()
x = moving_average(train[TF][i], n=50)
x = x / x.sum(axis=0, keepdims=True)
x.min()
x.max()
y_pred[f'{TF}/profile'][i].max()
y_pred[f'{TF}/profile'][i].min()
# Bar plots of the mean NLL per task and method
# 10-4
(ggplot(aes(x='task', fill='variable', y='value'), dftm) +
 geom_bar(stat='identity', position='dodge')
 )
# 10-3
(ggplot(aes(x='task', fill='variable', y='value'), dftm) +
 geom_bar(stat='identity', position='dodge')
 )
# 10-4
(ggplot(aes(x='task', fill='variable', y='value'), dftm) +
 geom_bar(stat='identity', position='dodge')
 )
TF = 'Sox2'
i = 0
for i in random.sample(range(len(train[TF])), 20):
    plot_tracks({"obs1": train[TF][i],
                 "obs2": test[TF][i],
                 "bias": bias['input'][i],
                 "smooth1": moving_average(train[TF][i], n=50),
                 "smooth2": moving_average(test[TF][i], n=50),
                 "bias smooth": moving_average(bias['input'][i], n=50),
                 "pred": y_pred[f'{TF}/profile'][i],
                 }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
binsizes = [1, 2, 5, 10, 20, 50] # 2, 5,
peak_pred_metric.binsizes = binsizes
# NOTE(review): BPNetMetricSingleProfile / pearson_spearman are assumed to
# come from the star imports above — confirm.
metric = BPNetMetricSingleProfile(
    count_metric=pearson_spearman,
    profile_metric=stranded_multinomial_nll
)
binned_metrics = BinnedMetric(metric, binsizes)
# Smoothed training replicates serve as the "replicate" baseline
train_reps = {}
for task in tasks:
    train_reps[task] = np.array([moving_average(train[task][i], n=50) for i in range(len(train[task]))])
# Binned metrics of the test counts against: replicates, the model
# (profile scaled by predicted total counts), the average profile and a
# constant baseline
metrics = {task: binned_metrics(test[task], train_reps[task]) for task in tqdm(tasks)}
metrics_model = {task: binned_metrics(test[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis])) for task in tqdm(tasks)}
metrics_avg = {task: binned_metrics(test[task],
                                    np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0))
               for task in tqdm(tasks)}
metrics_const = {task: binned_metrics(test[task],
                                      np.repeat(np.ones_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0))
                 for task in tqdm(tasks)}
def profile_metrics2df(m, norm=None):
    """Flatten binned profile metrics into a tidy DataFrame.

    Args:
      m: dict task -> list of {'binsize': ..., 'profile': ...} records
      norm: optional dict with the same structure; when given, the matching
        'profile' value is subtracted as a baseline
    Returns:
      pd.DataFrame with columns: task, binsize, profile
    """
    rows = []
    for task, records in m.items():
        for bin_id, record in enumerate(records):
            value = record['profile']
            if norm is not None:
                value = value - norm[task][bin_id]['profile']
            rows.append({"task": task,
                         "binsize": record['binsize'],
                         "profile": value})
    return pd.DataFrame(rows)
def count_metrics2df(m):
    """Extract each task's count Pearson correlation (taken from the first
    binsize record) into a tidy DataFrame with columns: task, pearsonr."""
    rows = [{"task": task, "pearsonr": records[0]['counts']['pearsonr']}
            for task, records in m.items()]
    return pd.DataFrame(rows)
# Tidy data frames: profile NLL (normalized against the constant baseline)
# and count correlations per method, for the train->test direction
dfm_profile = profile_metrics2df(metrics, metrics_const).assign(method='Replicates')
dfm_counts = count_metrics2df(metrics).assign(method='Replicates')
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_model, metrics_const).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_avg, metrics_const).assign(method='Average')])
dfm_counts = pd.concat([dfm_counts, count_metrics2df(metrics_model).assign(method='BPNet')])
dfm_profile['split'] = 'train->test'
dfm_profile1 = dfm_profile.copy()
dfm_counts['split'] = 'train->test'
dfm_counts1 = dfm_counts.copy()
# Swap train and test
test_reps = {}
for task in tasks:
    test_reps[task] = np.array([moving_average(test[task][i], n=50) for i in range(len(test[task]))])
shuff_metrics = {task: binned_metrics(train[task], test_reps[task]) for task in tqdm(tasks)}
shuff_metrics_model = {task: binned_metrics(train[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis]))
                       for task in tqdm(tasks)}
shuff_metrics_avg = {task: binned_metrics(train[task],
                                          np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0))
                     for task in tqdm(tasks)}
# NOTE(review): this constant baseline uses zeros while the train->test
# direction used ones — confirm the asymmetry is intended.
shuff_metrics_const = {task: binned_metrics(train[task],
                                            np.repeat(np.zeros_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0))
                       for task in tqdm(tasks)}
dfm_profile = profile_metrics2df(shuff_metrics, shuff_metrics_const).assign(method='Replicates')
dfm_counts = count_metrics2df(shuff_metrics).assign(method='Replicates')
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(shuff_metrics_model, shuff_metrics_const).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(shuff_metrics_avg, shuff_metrics_const).assign(method='Average')])
dfm_counts = pd.concat([dfm_counts, count_metrics2df(shuff_metrics_model).assign(method='BPNet')])
dfm_profile['split'] = 'test->train'
dfm_profile2 = dfm_profile.copy()
dfm_counts['split'] = 'test->train'
dfm_counts2 = dfm_counts.copy()
# Combine both directions and fix the categorical ordering for plotting
df_nll_tidy = pd.concat([dfm_profile1, dfm_profile2])
df_nll_tidy['method'] = pd.Categorical(df_nll_tidy['method'], categories=["BPNet", "Replicates", "Average"])
df_nll_tidy['task'] = pd.Categorical(df_nll_tidy['task'], categories=df_nll_tidy['task'].unique())
df_counts_tidy = pd.concat([dfm_counts1, dfm_counts2])
df_counts_tidy['method'] = pd.Categorical(df_counts_tidy['method'], categories=["BPNet", "Replicates"])
df_counts_tidy['task'] = pd.Categorical(df_counts_tidy['task'], categories=df_counts_tidy['task'].unique())
# Plot
from plotnine import *
import plotnine
plotnine.__version__
paper_config()
# Restrict to bin sizes below 100 for the log-likelihood plot
df_nll_tidy = df_nll_tidy[df_nll_tidy.binsize < 100]
def booms(inputs):
    """For each (key, item) pair, unpack the two rows of `item.values`,
    drop NaNs, and collect the per-row means.

    Args:
      inputs: iterable of (key, item) pairs where `item.values` is a
        2-row array (e.g. a two-row DataFrame from a groupby); the key
        is ignored on purpose.
    Returns:
      list of [mean_of_first_row, mean_of_second_row] per pair
    """
    # NOTE: removed the unused `import math` of the original
    out = []
    for _key, item in inputs:
        a, b = item.values
        a = a[~np.isnan(a)]
        b = b[~np.isnan(b)]
        out.append([a.mean(), b.mean()])
    return out
# Log-likelihood = negative NLL (higher is better)
df_nll_tidy['profile_ll'] = -df_nll_tidy['profile']
plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
# Mean log-likelihood per binsize, faceted by task, colored by method
fig = ggplot(aes(x='binsize', y='profile_ll', color='method'), data=df_nll_tidy.groupby(['binsize', 'task', 'method']).profile_ll.mean().reset_index()) + \
    scale_x_log10(breaks=binsizes) + \
    geom_point() + \
    geom_line() + \
    geom_abline(intercept=0, slope=0) + \
    facet_grid(".~task") + \
    theme_classic(base_size=10, base_family='Arial') + \
    theme(legend_position='top') + \
    scale_color_brewer('qual', 7) + \
    ylab("Log-likelihood")
fig.save(f"{figures}/ll.pdf")
fig.save(f"{figures}/ll.png", dpi=300)
fig
# Same figure again, without re-saving (kept as in the original notebook)
plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
fig = ggplot(aes(x='binsize', y='profile_ll', color='method'), data=df_nll_tidy.groupby(['binsize', 'task', 'method']).profile_ll.mean().reset_index()) + \
    scale_x_log10(breaks=binsizes) + \
    geom_point() + \
    geom_line() + \
    geom_abline(intercept=0, slope=0) + \
    facet_grid(".~task") + \
    theme_classic(base_size=10, base_family='Arial') + \
    theme(legend_position='top') + \
    scale_color_brewer('qual', 7) + \
    ylab("Log-likelihood")
#fig.save(f"{figures}/nll.pdf")
#fig.save(f"{figures}/nll.png", dpi=300)
fig
# Count Pearson correlation per task and method
plotnine.options.figure_size = get_figsize(0.7, aspect=0.3)
fig = ggplot(aes(x='method', y='pearsonr', fill='method'), data=df_counts_tidy.groupby(['task', 'method']).pearsonr.mean().reset_index()) + \
    geom_bar(stat='identity', position='dodge') + \
    geom_line() + \
    facet_grid(".~task") + \
    theme_classic(base_size=10, base_family='Arial') + \
    theme(legend_position='top') + \
    scale_fill_brewer('qual', 7)
fig.save(f"{figures}/counts.pdf")
fig.save(f"{figures}/counts.png", dpi=300)
fig
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    """Scatter-plot observed vs. predicted values with the Pearson
    correlation annotated (compact variant used for the final figure;
    shadows the earlier definition on purpose).

    Args:
      y_true, y_pred: 1-d arrays (positive values required when loglog=True)
      alpha, markersize: point transparency / size
      task: plot title
      ax: axis to draw on (a new figure is created when None)
      same_lim: use identical x/y limits spanning both arrays
      loglog: plot on log-log axes and compute correlations on log10 values
    """
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        # BUG FIX: np.log() was applied to y_pred while y_true used np.log10().
        # Spearman is rank-based so the value is identical either way, but the
        # transform is now consistent.
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log10(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    plt_fn = ax.loglog if loglog else ax.plot
    plt_fn(y_pred, y_true, ".",
           markersize=markersize,
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    ax.set_title(task)
    # Only the Pearson R is displayed in this compact variant
    ax.text(0.95, .01, f"R={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            fontsize=8,
            transform=ax.transAxes)
# Final 2 x n_tasks scatter grid: row 0 = BPNet predictions vs. observed
# counts, row 1 = replicate counts vs. observed (log-log, +1 pseudocount)
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(frac=.5, aspect=2/len(tasks)),
                         sharex=True, sharey=True)
for i, task in enumerate(tasks):
    for j in range(2):
        ax = axes[j,i]
        yt = test[task].sum(axis=1).mean(-1) + 1
        if j == 0:
            # Predicted counts are in log space
            yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
        else:
            yp = train[task].sum(axis=1).mean(-1) + 1
        xrange = [2, 5e3]
        ax.set_ylim(xrange)
        ax.set_xlim(xrange)
        ax.plot(xrange, xrange, c='grey', alpha=0.2)
        regression_eval(yt,
                        yp, alpha=.01, task=task if j == 0 else '', ax=ax, loglog=True)
        ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        if i > 0:
            ax.set_ylabel("")
        else:
            ax.set_ylabel("BPNet\nObserved" if j==0 else "Replicates\n")
fig.subplots_adjust(wspace=0, hspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/all.pdf")
fig.savefig(f"{figures}/all.png")