Goals

visualize the following things regarding bias correction:

  • scatter-plot: total counts vs bias total counts
  • scatter-plot: local counts (in say 50bp bins) vs bias local counts (both log scale)

This will give us a clue about how strongly the bias is present and whether it has a linear relationship with the output. Our model is: y ~ model_output + bias * w2. If the relationship turned out to be non-linear, we would instead need: y ~ model_output + f(bias).

In [1]:
# Imports
# NOTE(review): star import pulls in hv, np, plt, Path, DataSpec, get_figsize, etc. —
# explicit imports would make it easier to trace where names come from.
from basepair.imports import *
hv.extension('matplotlib')  # use the matplotlib backend for holoviews
%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # sharper inline figures on hi-dpi displays
paper_config()  # project-wide figure styling (presumably rcParams) — defined via basepair.imports
Using TensorFlow backend.
In [2]:
import warnings
# NOTE(review): silences ALL warnings globally — deliberate for a clean report,
# but it can hide real deprecation/runtime issues.
warnings.filterwarnings('ignore')

import matplotlib
%matplotlib inline  # NOTE(review): duplicate of the magic already run in the first cell

from scipy.stats import spearmanr
In [3]:
from basepair.plot.evaluate import regression_eval

from basepair.preproc import bin_counts

gather all the plots into the following plot matrix:

  • Column = TF
  • row = different plots: 1. total count bias vs observed, 2. total count bias vs prediction, 3. total count prediction vs observed, 4. total count prediction vs observed - bias (on log scale), 5-8 same but for local counts

Also use a lower alpha value of, say, 0.1. Add the Spearman R to the top-left corner of each plot.

By having all the plots arranged in a grid we can more easily inspect the results.

ChIP-seq

In [4]:
# NOTE(review): hardcoded absolute path into a user home dir — consider a configurable base dir.
dataspec_path = '/users/amr1/basepair/src/chipnexus/train/seqmodel/ChIP-seq.dataspec.yml'
In [5]:
from basepair.datasets import get_StrandedProfile_datasets2

# Load train/valid splits of 1000bp regions; chrX/chrY excluded from training.
train, valid = get_StrandedProfile_datasets2(
    dataspec=dataspec_path,
    peak_width = 1000,  # width (bp) of the extracted peak regions
    seq_width=1000,     # width (bp) of the input sequence
    include_metadata = False,
    taskname_first = True,  # so that the output labels will be "{task}/profile"
    exclude_chr = ['chrX', 'chrY'])
In [6]:
# Keep only the dataset object of the first validation entry
# (presumably `valid` is a list of (name, dataset) pairs — TODO confirm).
valid = valid[0][1]
In [7]:
# Load the dataspec to get the per-TF task definitions.
ds = DataSpec.load(dataspec_path)
In [8]:
tasks = list(ds.task_specs)  # task (TF) names — one column per task in the plot grids below
In [9]:
# Materialize the whole training set into memory (dict with 'inputs' and 'targets' keys).
train_set = train.load_all(batch_size=32, num_workers=5)
100%|██████████| 1307/1307 [00:40<00:00, 32.13it/s]
In [10]:
binsize=50  # bin width (bp) for the "local counts" scatter-plots
In [11]:
# NOTE(review): `ddir` is not defined anywhere in this notebook — presumably provided
# by `basepair.imports`; confirm before a fresh-kernel Run-All.
fdir = Path(f"{ddir}/figures/bias-correction")
In [12]:
!mkdir -p {fdir}  # ensure the output figure directory exists
In [13]:
# Plot grid: columns = TFs; row 1 = total counts (observed vs bias), row 2 = local counts.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2/4*len(tasks), 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per region: sum over positions and strands (axes 1 and 2).
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2),  dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero-count regions remain visible on the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    # Label the y-axis only on the left-most column to keep the grid compact.
    if i == 0:
        ax.set_ylabel("Total bias")
    else:
        ax.set_ylabel("")
    ax.set_title(task)
    ax.set_ylim([10, 1500])
    ax.set_xlim([10, 200])
    
    # Local counts: sum within `binsize`-bp bins, then flatten across regions/bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    if i == 0:
        ax.set_ylabel("Local bias")
    else:
        ax.set_ylabel("")
# NOTE(review): `raster=True` is not a standard `Figure.savefig` kwarg (matplotlib uses
# per-artist `rasterized=True`) — confirm this matplotlib version accepts it.
fig.savefig(fdir / f'ChIP-seq.bias.scatterplots.binsize={binsize}.pdf', raster=True)
fig.savefig(fdir / f'ChIP-seq.bias.scatterplots.binsize={binsize}.png', raster=True)

ChIP-nexus

In [14]:
# NOTE(review): hardcoded absolute path into a user home dir — consider a configurable base dir.
dataspec_path = '/users/amr1/basepair/src/chipnexus/train/seqmodel/ChIP-nexus.dataspec.yml'
[autoreload of basepair.datasets failed: Traceback (most recent call last):
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 244, in check
    superreload(m, reload, self.old_objects)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 376, in superreload
    module = reload(module)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/imp.py", line 315, in reload
    return importlib.reload(module)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 618, in _exec
  File "<frozen importlib._bootstrap_external>", line 678, in exec_module
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/users/avsec/workspace/basepair/basepair/datasets.py", line 93, in <module>
    test_chr=test_chr):
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/gin/config.py", line 1129, in configurable
    return perform_decoration(decoration_target)
  File "/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/gin/config.py", line 1126, in perform_decoration
    return _make_configurable(fn_or_cls, name, module, whitelist, blacklist)
ValueError: A configurable matching 'basepair.datasets.chip_exo_nexus' already exists.
]
In [15]:
from basepair.datasets import get_StrandedProfile_datasets2

# Same loading recipe as for ChIP-seq above, now with the ChIP-nexus dataspec.
train, valid = get_StrandedProfile_datasets2(
    dataspec=dataspec_path,
    peak_width = 1000,  # width (bp) of the extracted peak regions
    seq_width=1000,     # width (bp) of the input sequence
    include_metadata = False,
    taskname_first = True,  # so that the output labels will be "{task}/profile"
    exclude_chr = ['chrX', 'chrY'])
In [16]:
# Keep only the dataset object of the first validation entry (see the ChIP-seq cell above).
valid = valid[0][1]
In [17]:
# Re-load the dataspec for the ChIP-nexus task definitions.
ds = DataSpec.load(dataspec_path)
In [18]:
tasks = list(ds.task_specs)  # ChIP-nexus task (TF) names
In [19]:
# Materialize the whole ChIP-nexus training set into memory.
train_set = train.load_all(batch_size=32, num_workers=5)
100%|██████████| 2843/2843 [01:55<00:00, 24.66it/s]
In [20]:
# Bias scatter-plot grid for ChIP-nexus: columns = TFs;
# row 1 = total counts (observed vs bias), row 2 = local counts.
binsize = 50
# BUG FIX: the figure was previously assigned to `fix` (typo) while `savefig` at the
# bottom used `fig` — which still pointed at the earlier ChIP-seq figure, so the
# ChIP-nexus plots were never the ones written to disk.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2/4*len(tasks), 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per region: sum over positions and strands (axes 1 and 2).
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2), dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero-count regions remain visible on the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    # Label the y-axis only on the left-most column to keep the grid compact.
    ax.set_ylabel("Total bias" if i == 0 else "")
    ax.set_title(task)
    ax.set_ylim([20, 5000])
    ax.set_xlim([2, 200])

    # Local counts: sum within `binsize`-bp bins, then flatten across regions/bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    ax.set_ylabel("Local bias" if i == 0 else "")
# NOTE(review): `raster=True` is not a standard `Figure.savefig` kwarg (matplotlib uses
# per-artist `rasterized=True`) — confirm this matplotlib version accepts it.
fig.savefig(fdir / f'ChIP-nexus.bias.scatterplots.binsize={binsize}.pdf', raster=True)
fig.savefig(fdir / f'ChIP-nexus.bias.scatterplots.binsize={binsize}.png', raster=True)

Other binsizes

20

In [34]:
# Same bias scatter-plot grid, with finer 20bp bins for the local counts.
# NOTE(review): this cell is a near-copy of the cells above — extracting a
# `plot_bias_grid(binsize, ...)` helper would remove the duplication.
binsize = 20
# Renamed `fix` -> `fig` (typo): keeps naming consistent with the other plotting
# cells and avoids a later `fig.savefig(...)` silently saving a stale figure.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2, 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per region: sum over positions and strands.
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2), dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero-count regions remain visible on the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    ax.set_ylabel("Total bias" if i == 0 else "")
    ax.set_title(task)
    ax.set_ylim([20, 5000])
    ax.set_xlim([2, 200])

    # Local counts: sum within `binsize`-bp bins, then flatten across regions/bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    ax.set_ylabel("Local bias" if i == 0 else "")

5

In [35]:
# Same bias scatter-plot grid, with very fine 5bp bins for the local counts.
# NOTE(review): near-copy of the cells above — a shared helper would remove duplication.
binsize = 5
# Renamed `fix` -> `fig` (typo): keeps naming consistent with the other plotting
# cells and avoids a later `fig.savefig(...)` silently saving a stale figure.
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(2, 2/len(tasks)))
for i, task in enumerate(train.tasks):
    # Total counts per region: sum over positions and strands.
    total_counts = np.sum(train_set['targets'][f'{task}/profile'], axis=(1, 2), dtype=np.float32)
    bias_total_counts = np.sum(train_set['inputs'][f'bias/{task}/profile'], axis=(1, 2), dtype=np.float32)
    ax = axes[0, i]
    # +1 pseudo-count so zero-count regions remain visible on the log-log axes.
    regression_eval(1+total_counts, 1+bias_total_counts, ax=ax, loglog=True, same_lim=True)
    ax.set_xlabel("Total observed")
    ax.set_ylabel("Total bias" if i == 0 else "")
    ax.set_title(task)
    ax.set_ylim([20, 5000])
    ax.set_xlim([2, 200])

    # Local counts: sum within `binsize`-bp bins, then flatten across regions/bins.
    local_observed = np.ravel(np.sum(bin_counts(train_set['targets'][f'{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    local_bias = np.ravel(np.sum(bin_counts(train_set['inputs'][f'bias/{task}/profile'], binsize=binsize), axis=-1, dtype=np.float32))
    ax = axes[1, i]
    regression_eval(1+local_observed, 1+local_bias, ax=ax, loglog=True, markersize=10, alpha=0.05)
    ax.set_xlabel("Local observed")
    ax.set_ylabel("Local bias" if i == 0 else "")