Visualize the following things regarding bias correction.
This will give a clue about how strongly the bias is present
and whether it has a linear relationship with the output.
Our model is: y ~ model_output + bias * w2.
If the relationship were non-linear, we would instead need:
y ~ model_output + f(bias)
# Imports
from basepair.imports import *
hv.extension('matplotlib')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
import warnings
warnings.filterwarnings('ignore')
import matplotlib
%matplotlib inline
from scipy.stats import spearmanr
from basepair.plot.evaluate import regression_eval
from basepair.preproc import bin_counts
Gather all the plots into the following plot matrix.
Use a lower alpha value (say 0.1) and add the Spearman R to the top-left corner of each plot.
Arranging all the plots in a grid makes it easier to inspect the results.
# --- ChIP-seq: load the train/validation datasets described by the dataspec ---
dataspec_path = '/users/amr1/basepair/src/chipnexus/train/seqmodel/ChIP-seq.dataspec.yml'
from basepair.datasets import get_StrandedProfile_datasets2
train, valid = get_StrandedProfile_datasets2(
    dataspec=dataspec_path,
    peak_width = 1000,
    seq_width=1000,
    include_metadata = False,
    taskname_first = True,  # so that the output labels will be "{task}/profile"
    exclude_chr = ['chrX', 'chrY'])  # hold out the sex chromosomes
# Keep only the dataset from the first validation entry.
# NOTE(review): assumes `valid` is a list of (name, dataset) pairs — confirm
# against get_StrandedProfile_datasets2.
valid = valid[0][1]
ds = DataSpec.load(dataspec_path)
tasks = list(ds.task_specs)
# Materialize the entire training set into memory (dict of input/target arrays).
train_set = train.load_all(batch_size=32, num_workers=5)
# Bin width (bp) used for the "local" count comparison below.
binsize=50
# Output directory for figures.
# NOTE(review): `ddir` is not defined in this file — presumably provided by
# `from basepair.imports import *`; verify.
fdir = Path(f"{ddir}/figures/bias-correction")
!mkdir -p {fdir}
# Observed-vs-bias scatter grid for ChIP-seq, one column per task:
# row 0 = total counts per region, row 1 = local (binned) counts.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2/4*len(tasks), 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per example: sum over the last two axes.
    # NOTE(review): assumes profile arrays are (example, position, strand) — confirm.
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2), dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero counts survive the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    if i == 0:
        ax.set_ylabel("Total bias")
    else:
        ax.set_ylabel("")
    ax.set_title(task)
    ax.set_ylim([10, 1500])
    ax.set_xlim([10, 200])
    # Local counts: sum strands within `binsize`-bp bins, then flatten all bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    # Low alpha because there are many overlapping points at the local scale.
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    if i == 0:
        ax.set_ylabel("Local bias")
    else:
        ax.set_ylabel("")
# NOTE(review): `raster=True` is not a documented Figure.savefig kwarg
# (the usual spelling is `rasterized=` on artists) — confirm the installed
# matplotlib version accepts/ignores it.
fig.savefig(fdir / f'ChIP-seq.bias.scatterplots.binsize={binsize}.pdf', raster=True)
fig.savefig(fdir / f'ChIP-seq.bias.scatterplots.binsize={binsize}.png', raster=True)
# --- ChIP-nexus: reload datasets from the ChIP-nexus dataspec (same recipe as above) ---
dataspec_path = '/users/amr1/basepair/src/chipnexus/train/seqmodel/ChIP-nexus.dataspec.yml'
from basepair.datasets import get_StrandedProfile_datasets2
train, valid = get_StrandedProfile_datasets2(
    dataspec=dataspec_path,
    peak_width = 1000,
    seq_width=1000,
    include_metadata = False,
    taskname_first = True,  # so that the output labels will be "{task}/profile"
    exclude_chr = ['chrX', 'chrY'])  # hold out the sex chromosomes
# Keep only the dataset from the first validation entry.
# NOTE(review): assumes `valid` is a list of (name, dataset) pairs — confirm.
valid = valid[0][1]
ds = DataSpec.load(dataspec_path)
tasks = list(ds.task_specs)
# Materialize the entire training set into memory (dict of input/target arrays).
train_set = train.load_all(batch_size=32, num_workers=5)
# Bin width (bp) for the "local" count comparison.
binsize=50
# Observed-vs-bias scatter grid for ChIP-nexus (same layout as the ChIP-seq cell).
# BUG FIX: the figure handle was unpacked as `fix`, so the savefig calls at the
# end silently wrote out the *previous* (ChIP-seq) figure. Unpack as `fig` so
# the ChIP-nexus figure is the one saved.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2/4*len(tasks), 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per example: sum over the last two axes.
    # NOTE(review): assumes profile arrays are (example, position, strand) — confirm.
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2), dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero counts survive the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    if i == 0:
        ax.set_ylabel("Total bias")
    else:
        ax.set_ylabel("")
    ax.set_title(task)
    ax.set_ylim([20, 5000])
    ax.set_xlim([2, 200])
    # Local counts: sum strands within `binsize`-bp bins, then flatten all bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    # Low alpha because there are many overlapping points at the local scale.
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    if i == 0:
        ax.set_ylabel("Local bias")
    else:
        ax.set_ylabel("")
# NOTE(review): `raster=True` is not a documented Figure.savefig kwarg — confirm
# the installed matplotlib version accepts it.
fig.savefig(fdir / f'ChIP-nexus.bias.scatterplots.binsize={binsize}.pdf', raster=True)
fig.savefig(fdir / f'ChIP-nexus.bias.scatterplots.binsize={binsize}.png', raster=True)
# Same ChIP-nexus grid at a finer bin width (20 bp) for the local comparison.
binsize=20
# BUG FIX: figure handle was unpacked as `fix` (typo); use `fig` so the handle
# is kept consistently and a later `fig.savefig` would save *this* figure.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2, 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per example: sum over the last two axes.
    # NOTE(review): assumes profile arrays are (example, position, strand) — confirm.
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2), dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero counts survive the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    if i == 0:
        ax.set_ylabel("Total bias")
    else:
        ax.set_ylabel("")
    ax.set_title(task)
    ax.set_ylim([20, 5000])
    ax.set_xlim([2, 200])
    # Local counts: sum strands within `binsize`-bp bins, then flatten all bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    # Low alpha because there are many overlapping points at the local scale.
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    if i == 0:
        ax.set_ylabel("Local bias")
    else:
        ax.set_ylabel("")
# Same ChIP-nexus grid at the finest bin width (5 bp) for the local comparison.
binsize=5
# BUG FIX: figure handle was unpacked as `fix` (typo); use `fig` so the handle
# is kept consistently and a later `fig.savefig` would save *this* figure.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2, 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per example: sum over the last two axes.
    # NOTE(review): assumes profile arrays are (example, position, strand) — confirm.
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2), dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero counts survive the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    if i == 0:
        ax.set_ylabel("Total bias")
    else:
        ax.set_ylabel("")
    ax.set_title(task)
    ax.set_ylim([20, 5000])
    ax.set_xlim([2, 200])
    # Local counts: sum strands within `binsize`-bp bins, then flatten all bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    # Low alpha because there are many overlapping points at the local scale.
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    if i == 0:
        ax.set_ylabel("Local bias")
    else:
        ax.set_ylabel("")