Goal

  • make Figure 2 of the paper
    • also add the upper bound (cross-replicate accuracy)

Tasks

  • [x] re-compute the old stats
  • [ ] add variance via bootstrapping (see the sketch below)
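
A minimal sketch of the open bootstrapping task, assuming the variance of interest is the spread of the count-correlation estimates (bootstrap_pearson is a hypothetical helper, not part of basepair):

import numpy as np
from scipy.stats import pearsonr

def bootstrap_pearson(y_true, y_pred, n_boot=1000, seed=42):
    # Resample regions with replacement and recompute Pearson R each time,
    # returning a 95% percentile confidence interval
    rng = np.random.RandomState(seed)
    n = len(y_true)
    stats = np.empty(n_boot)
    for b in range(n_boot):
        idx = rng.randint(0, n, n)
        stats[b] = pearsonr(y_true[idx], y_pred[idx])[0]
    return np.percentile(stats, [2.5, 97.5])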

Required files

  • {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/model.h5
  • {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/dataspec.yaml (plus all files listed in dataspec.yaml)
In [1]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.config import get_figsize, paper_config
from basepair.extractors import bw_extract

# Use matplotlib paper config
paper_config()
Using TensorFlow backend.
In [ ]:
# Common paths
model_dir = Path(f"{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/")
modisco_dir = model_dir / "modisco/all/profile/"
figures = f"{ddir}/figures/model-evaluation/chipnexus-bpnet"

# Parameters
model_file = model_dir / "model.h5"
dataspec_file = model_dir / "dataspec.yaml"
history_file = model_dir / "history.csv"
seq_width = 1000
num_workers = 10

Get predictions

In [4]:
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from pathlib import Path
from keras.models import load_model
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.losses import MultichannelMultinomialNLL
from basepair.config import valid_chr, test_chr
from basepair.plots import regression_eval, plot_loss
from basepair.cli.evaluate import eval_profile
from basepair import samplers 
from basepair.math import softmax
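
For orientation, basepair.math.softmax presumably turns the profile-head logits into per-region probabilities; a minimal stand-in, assuming it normalizes jointly over the position and strand axes of a (regions, positions, strands) array:

def softmax_sketch(x, axes=(-2, -1)):
    # subtract the max for numerical stability, then normalize over `axes`
    x = x - x.max(axis=axes, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axes, keepdims=True)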
In [8]:
# !mv {model_dir}/preds.test.pkl {model_dir}/preds.test.bak.pkl
In [5]:
ds = DataSpec.load(dataspec_file)
tasks = list(ds.task_specs)

# Cache predictions
create_tf_session(0)
bpnet = BPNetPredictor.from_mdir(model_dir)
In [9]:
if not os.path.exists(model_dir / "preds.test.pkl"):
    model = bpnet.model
    dl_test = StrandedProfile(ds,
                              incl_chromosomes=test_chr,
                              peak_width=seq_width,
                              shuffle=False,
                              target_transformer=AppendCounts())

    test = dl_test.load_all(num_workers=num_workers)
    y_pred = model.predict(test['inputs'], verbose=1)
    write_pkl((test, y_pred), model_dir / "preds.test.pkl")

# Load predictions
test, y_pred = read_pkl(model_dir / "preds.test.pkl")
y_true = test['targets']
100%|██████████| 566/566 [03:11<00:00,  2.95it/s]
18086/18086 [==============================] - 10s 550us/step

Predicted vs observed scatterplot of total counts (log-log scale)

In [84]:
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        # Spearman is rank-based, so the log transform leaves it unchanged
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log10(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    if loglog:
        plt_fn = ax.loglog
    else:
        plt_fn = ax.plot
        
    plt_fn(y_pred, y_true, ".", 
           markersize=markersize, 
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    rp = r"$R_{p}$"
    rs = r"$R_{s}$"
    ax.set_title(task)
    ax.text(.95, .2, f"{rp}={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
    ax.text(.95, .05, f"{rs}={spearman:.2f}", 
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
In [85]:
import matplotlib.ticker as ticker

for task in tasks:
    fig, ax = plt.subplots(figsize=get_figsize(frac=0.25, aspect=1))
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[ds.task2idx(task, 'counts')].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    
    regression_eval(yt, yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    plt.minorticks_off()
    # save the figure
    os.makedirs(f"{figures}/scatter", exist_ok=True)
    fig.savefig(f"{figures}/scatter/{task}.pdf")
    fig.savefig(f"{figures}/scatter/{task}.png")

All plots in one row

In [86]:
fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[ds.task2idx(task, 'counts')].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt, yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all.pdf")
fig.savefig(f"{figures}/scatter/all.png")

Profile plots (predicted vs observed)

  • for each TF, among its non-overlapping peaks in the test set (chr1, chr8, chr9), choose the two with the most total counts (see the top-k selection sketch below)
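
A plausible equivalent of the samplers.top_sum_count selection used below (hedged sketch; the actual implementation lives in basepair.samplers):

import numpy as np

def top_sum_count_sketch(profile, n, keep=None):
    # profile: (regions, positions, strands); pick the n regions with the
    # largest total counts, optionally restricted to a boolean `keep` mask
    total = profile.sum(axis=(1, 2)).astype(float)
    if keep is not None:
        total = np.where(keep, total, -np.inf)  # exclude masked regions
    return np.argsort(-total)[:n]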
In [59]:
from basepair.plots import plot_profiles
from basepair import samplers
from kipoi.data_utils import get_dataset_item
from kipoi.metadata import GenomicRanges
import pybedtools
from basepair.utils import flatten_list
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import dfint_no_intersection
from pybedtools import BedTool
In [60]:
# Figure out valid indices (non-overlapping)
df_ranges = pd.DataFrame(test['metadata']['range'])[['chr', 'start','end']]
df_ranges_tasks = {t: df_ranges[test['metadata']['interval_from_task'] == t] for t in bpnet.tasks}
all_intervals = list(BedTool.from_dataframe(df_ranges))

o = dict()
for i, t in enumerate(bpnet.tasks):
    dft = df_ranges[test['metadata']['interval_from_task'] == t]
    if i == 0:
        o[t] = dft
    else:
        df_existing = pd.concat(list(o.values()), axis=0)
        o[t] = dft[dfint_no_intersection(dft, df_existing)]
valid_idx = pd.concat(list(o.values()), axis=0).index

valid_idx_bool = pd.Series(np.arange(len(df_ranges))).isin(valid_idx)

print("Fraction of non-overlapping peaks:", valid_idx_bool.mean())
Fraction of non-overlapping peaks: 0.6114121419882782
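
For reference, dfint_no_intersection (used above) plausibly reduces to a pybedtools overlap count; a hedged sketch, not the actual basepair implementation:

from pybedtools import BedTool

def no_intersection_mask(df_a, df_b):
    # boolean mask over df_a rows (chr/start/end) that overlap nothing in df_b;
    # `intersect -c` appends the number of overlapping df_b intervals per row
    a = BedTool.from_dataframe(df_a[['chr', 'start', 'end']])
    b = BedTool.from_dataframe(df_b[['chr', 'start', 'end']])
    n_overlaps = a.intersect(b, c=True).to_dataframe().iloc[:, -1]
    return (n_overlaps == 0).values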
In [10]:
trim_edge = 300
xlim = [trim_edge, 1000 - trim_edge]
fig_width = get_figsize(0.5)[0]
rotate_y = 90
fig_height_per_track = 0.5
tasks = bpnet.tasks

for task in bpnet.tasks:
    print(task)
    for idx in samplers.top_sum_count(y_true[f'profile/{task}'], 2, keep=(test['metadata']['interval_from_task'] == task) & valid_idx_bool):
        # get the interval for that idx
        r = get_dataset_item(test['metadata']['range'], idx)
        interval = pybedtools.create_interval_from_list([r['chr'], int(r['start']), int(r['end'])])
        interval_str = f"{interval.chrom}:{interval.start + trim_edge}-{interval.end - trim_edge}"
        
        # make prediction
        pred = bpnet.predict([interval], compute_grads=False)[0]
        
        # compile the list of tracks to plot (observed + predicted for every task)
        viz_dict = flatten_list([[
            # Observed
            (f"{t}\nObs", y_true[f'profile/{t}'][idx]),
            # Predicted
            (f"\nPred", pred['preds']['scaled_profile'][t]),
        ] for t in tasks])
        
        fig = plot_tracks(filter_tracks(viz_dict, xlim),
                          title=interval_str,
                          fig_height_per_track=fig_height_per_track,
                          rotate_y=rotate_y,
                          fig_width=fig_width,
                          ylim=None,
                          legend=False)
        fig.align_ylabels()
        os.makedirs(f"{figures}/profiles", exist_ok=True)
        fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.pdf")
        fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.png")
Oct4
Sox2
Nanog
Klf4

Compute the same stats for the technical replicates

  • Oct4, Sox2 and Nanog have two or more technical replicates; Oct4's four replicates are pooled into two pseudo-replicates below, while Klf4 has a single replicate and is excluded from the replicate comparisons
In [89]:
from snakemake.io import glob_wildcards, expand
In [90]:
# Oct4-bigwigs
!ls /users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep*/*.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep0/neg.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep0/pos.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep1/neg.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep1/pos.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep2/neg.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep2/pos.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep3/neg.bw
/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep3/pos.bw

Number of reads:

  • rep0: 70,264,547
  • rep1: 38,522,714
  • rep2: 67,357,884
  • rep3: 71,419,776
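
These totals can be recovered from the bigWig headers; a small sketch using pyBigWig (an assumption — the counting pipeline itself is not shown in this notebook), where sumData equals the read count for base-resolution 5'-count tracks:

import pyBigWig

def total_reads(pos_bw, neg_bw):
    # sum of signal over both strand tracks; for 5'-count bigwigs this is
    # the number of reads (assumption about how the tracks were built)
    return sum(int(pyBigWig.open(f).header()['sumData'])
               for f in [pos_bw, neg_bw])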
In [91]:
# Sox2-bigwigs
!ls /users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep*/*.bw
/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep0/neg.bw
/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep0/pos.bw
/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep1/neg.bw
/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep1/pos.bw

Number of reads:

  • rep0: 128,664,465
  • rep1: 81,396,924
In [92]:
# Nanog-bigwigs
!ls /users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep*/*.bw
/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep0/neg.bw
/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep0/pos.bw
/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep1/neg.bw
/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep1/pos.bw

Number of reads:

  • rep0: 69,328,167
  • rep1: 52,908,450
In [93]:
# Klf4-bigwigs
!ls /users/avsec/workspace/basepair-workflow/data/Klf4/signal/raw/rep*/*.bw
/users/avsec/workspace/basepair-workflow/data/Klf4/signal/raw/rep0/neg.bw
/users/avsec/workspace/basepair-workflow/data/Klf4/signal/raw/rep0/pos.bw

Number of reads:

  • rep0: 80,649,356
In [94]:
wildcard = r"/users/avsec/workspace/basepair-workflow/data/{factor}/signal/raw/{rep}/{strand}.bw"
factors, reps, strands = glob_wildcards(wildcard)
# Remove DNase
factors, reps, strands = zip(*[(f,r,s) for f,r,s in zip(factors, reps, strands) if f != "DNase"])
files = expand(wildcard, zip, factor=factors, rep=reps, strand=strands)

files = {f"/{f}/{r}/{s}": fi for f, r, s, fi in zip(factors, reps, strands, files)}

files
Out[94]:
{'/Sox2/rep0/pos': '/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep0/pos.bw',
 '/Sox2/rep0/neg': '/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep0/neg.bw',
 '/Sox2/rep1/pos': '/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep1/pos.bw',
 '/Sox2/rep1/neg': '/users/avsec/workspace/basepair-workflow/data/Sox2/signal/raw/rep1/neg.bw',
 '/Klf4/rep0/pos': '/users/avsec/workspace/basepair-workflow/data/Klf4/signal/raw/rep0/pos.bw',
 '/Klf4/rep0/neg': '/users/avsec/workspace/basepair-workflow/data/Klf4/signal/raw/rep0/neg.bw',
 '/Oct4/rep3/pos': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep3/pos.bw',
 '/Oct4/rep3/neg': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep3/neg.bw',
 '/Oct4/rep2/pos': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep2/pos.bw',
 '/Oct4/rep2/neg': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep2/neg.bw',
 '/Oct4/rep0/pos': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep0/pos.bw',
 '/Oct4/rep0/neg': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep0/neg.bw',
 '/Oct4/rep1/pos': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep1/pos.bw',
 '/Oct4/rep1/neg': '/users/avsec/workspace/basepair-workflow/data/Oct4/signal/raw/rep1/neg.bw',
 '/Nanog/rep0/pos': '/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep0/pos.bw',
 '/Nanog/rep0/neg': '/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep0/neg.bw',
 '/Nanog/rep1/pos': '/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep1/pos.bw',
 '/Nanog/rep1/neg': '/users/avsec/workspace/basepair-workflow/data/Nanog/signal/raw/rep1/neg.bw'}
In [95]:
counts = {f: bw_extract(fp, all_intervals, lambda x: x) for f,fp in tqdm(files.items())}
100%|██████████| 18/18 [00:46<00:00,  2.60s/it]
In [96]:
# Pool the four Oct4 replicates into two pseudo-replicates: reps [0,1] -> rep0, reps [2,3] -> rep1
counts['/Oct4/rep0/pos'] = sum([counts['/Oct4/rep0/pos'], counts['/Oct4/rep1/pos']])
counts['/Oct4/rep0/neg'] = sum([counts['/Oct4/rep0/neg'], counts['/Oct4/rep1/neg']])
counts['/Oct4/rep1/pos'] = sum([counts['/Oct4/rep2/pos'], counts['/Oct4/rep3/pos']])
counts['/Oct4/rep1/neg'] = sum([counts['/Oct4/rep2/neg'], counts['/Oct4/rep3/neg']])

Replicates

Total counts

In [97]:
for task in tasks:
    if task == 'Klf4':
        continue
    fig, ax = plt.subplots(figsize=get_figsize(frac=0.25, aspect=1))
    rep0 = 1+sum([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']]).sum(1)
    rep1 = 1+sum([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']]).sum(1)
    
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    
    regression_eval(rep0, rep1, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    
    plt.minorticks_off()
    # save the figure
    os.makedirs(f"{figures}/scatter-replicates", exist_ok=True)
    fig.savefig(f"{figures}/scatter-replicates/{task}.pdf")
    fig.savefig(f"{figures}/scatter-replicates/{task}.png")
In [98]:
fig, axes = plt.subplots(1, len(tasks) - 1, figsize=get_figsize(frac=.75, aspect=1/(len(tasks) - 1)),
                         sharex=True, sharey=True)
# Klf4 (single replicate) is skipped; because it is the last task, zip() also
# drops it when pairing tasks with the len(tasks) - 1 axes
for i, (task, ax) in enumerate(zip(tasks, axes)):
    if task == 'Klf4':
        continue
    rep0 = 1+sum([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']]).sum(1)
    rep1 = 1+sum([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']]).sum(1)
    
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    
    regression_eval(rep0, rep1, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        ax.set_ylabel("")

fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter-replicates/all.pdf")
fig.savefig(f"{figures}/scatter-replicates/all.png")    

Per-base counts log10(counts + 1)

In [99]:
import holoviews as hv  # hv is used below (hv.extension, hv.Image, hv.Store)
import holoviews.operation.datashader as hd
hd.shade.cmap = ["lightblue", "darkblue"]
hv.extension("bokeh", "matplotlib")
In [100]:
import datashader as dsh
import datashader.transfer_functions as tf
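
The three per-base replicate scatter cells below (In [101], [103], [105]) repeat the same logic per task; it could be factored as sketched here (hypothetical helper relying on the notebook globals counts, figures, hd, hv, dsh):

def per_base_replicate_scatter(task, px_w=300):
    # log10(counts + 1) per base, pooled over strands, for each pseudo-replicate
    rep0 = np.log10(1 + sum([counts[f'/{task}/rep0/pos'],
                             counts[f'/{task}/rep0/neg']]).ravel())
    rep1 = np.log10(1 + sum([counts[f'/{task}/rep1/pos'],
                             counts[f'/{task}/rep1/neg']]).ravel())
    df = pd.DataFrame({"rep0": rep0, "rep1": rep1})
    img = hd.spread(hd.shade(hv.Image(
        dsh.Canvas(plot_width=px_w, plot_height=px_w).points(df, 'rep0', 'rep1'))), px=2)
    hv.Store.renderers['matplotlib'].save(
        img, f"{figures}/per-base-scatter-replicates/{task}", 'pdf')
    return img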
In [101]:
task = 'Sox2'
rep0 = np.log10(1+sum([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']]).ravel())
rep1 = np.log10(1+sum([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']]).ravel())

df = pd.DataFrame({"rep0": rep0, "rep1": rep1})
fig = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=300, plot_height=300).points(df, 'rep0' ,'rep1'))), px=2)
hv.Store.renderers['matplotlib'].save(fig, f"{figures}/per-base-scatter-replicates/{task}", 'pdf')
fig
Out[101]:
In [102]:
ypc = y_pred[ds.task2idx(task, 'counts')]
ypp = softmax(y_pred[ds.task2idx(task, 'profile')])

y_pred_profile = np.log10(1 + (ypp * (np.exp(ypc) - 1)[:, np.newaxis]).sum(axis=-1).ravel())
y_true_profile = np.log10(1 + y_true[f'profile/{task}'].sum(axis=-1).ravel())

df = pd.DataFrame({"Predicted": y_pred_profile, "Observed": y_true_profile})
a = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=600, plot_height=600).points(df, 'Predicted' ,'Observed'), label='Using predicted total counts')), px=2)

y_pred_profile = np.log10(1+(ypp * y_true[f'profile/{task}'].sum(axis=1)[:, np.newaxis]).sum(axis=-1).ravel())
y_true_profile = np.log10(1+y_true[f'profile/{task}'].sum(axis=-1).ravel())

df = pd.DataFrame({"Predicted": y_pred_profile, "Observed": y_true_profile})
b = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=600, plot_height=600).points(df, 'Predicted' ,'Observed'), label='Using observed total counts')), px=2)
fig = a + b
hv.Store.renderers['matplotlib'].save(fig, f"{figures}/per-base-scatter/{task}", 'pdf')
fig
Out[102]:
In [103]:
task = 'Nanog'
rep0 = np.log10(1+sum([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']]).ravel())
rep1 = np.log10(1+sum([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']]).ravel())

df = pd.DataFrame({"rep0": rep0, "rep1": rep1})
fig = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=300, plot_height=300).points(df, 'rep0' ,'rep1'))), px=2)
hv.Store.renderers['matplotlib'].save(fig, f"{figures}/per-base-scatter-replicates/{task}", 'pdf')
fig
Out[103]:
In [104]:
ypc = y_pred[ds.task2idx(task, 'counts')]
ypp = softmax(y_pred[ds.task2idx(task, 'profile')])

y_pred_profile = np.log10(1 + (ypp * (np.exp(ypc) - 1)[:, np.newaxis]).sum(axis=-1).ravel())
y_true_profile = np.log10(1 + y_true[f'profile/{task}'].sum(axis=-1).ravel())

df = pd.DataFrame({"Predicted": y_pred_profile, "Observed": y_true_profile})
a = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=600, plot_height=600).points(df, 'Predicted' ,'Observed'), label='Predicted total counts')), px=2)

y_pred_profile = np.log10(1+(ypp * y_true[f'profile/{task}'].sum(axis=1)[:, np.newaxis]).sum(axis=-1).ravel())
y_true_profile = np.log10(1+y_true[f'profile/{task}'].sum(axis=-1).ravel())

df = pd.DataFrame({"Predicted": y_pred_profile, "Observed": y_true_profile})
b = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=600, plot_height=600).points(df, 'Predicted' ,'Observed'), label='Observed total counts')), px=2)
fig = a + b
hv.Store.renderers['matplotlib'].save(fig, f"{figures}/per-base-scatter/{task}", 'pdf')
fig
Out[104]:
In [105]:
task = 'Oct4'
rep0 = np.log10(1+sum([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']]).ravel())
rep1 = np.log10(1+sum([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']]).ravel())

df = pd.DataFrame({"rep0": rep0, "rep1": rep1})
fig = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=300, plot_height=300).points(df, 'rep0' ,'rep1'))), px=2)
hv.Store.renderers['matplotlib'].save(fig, f"{figures}/per-base-scatter-replicates/{task}", 'pdf')
fig
Out[105]:
In [106]:
ypc = y_pred[ds.task2idx(task, 'counts')]
ypp = softmax(y_pred[ds.task2idx(task, 'profile')])

y_pred_profile = np.log10(1 + (ypp * (np.exp(ypc) - 1)[:, np.newaxis]).sum(axis=-1).ravel())
y_true_profile = np.log10(1 + y_true[f'profile/{task}'].sum(axis=-1).ravel())

df = pd.DataFrame({"Predicted": y_pred_profile, "Observed": y_true_profile})
a = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=600, plot_height=600).points(df, 'Predicted' ,'Observed'), label='Predicted total counts')), px=2)

y_pred_profile = np.log10(1+(ypp * y_true[f'profile/{task}'].sum(axis=1)[:, np.newaxis]).sum(axis=-1).ravel())
y_true_profile = np.log10(1+y_true[f'profile/{task}'].sum(axis=-1).ravel())

df = pd.DataFrame({"Predicted": y_pred_profile, "Observed": y_true_profile})
b = hd.spread(hd.shade(hv.Image(dsh.Canvas(plot_width=600, plot_height=600).points(df, 'Predicted' ,'Observed'), label='Observed total counts')), px=2)
fig = a + b
hv.Store.renderers['matplotlib'].save(fig, f"{figures}/per-base-scatter/{task}", 'pdf')
fig
Out[106]:

Profile evaluation

  • TODO - load multiple replicates and evaluate them against each other to estimate the best achievable (upper-bound) performance
In [30]:
task = tasks[0]
In [31]:
yp = softmax(y_pred[ds.task2idx(task, "profile")])
yt = y_true["profile/" + task]
In [32]:
x = np.ravel(yt / (1+yt.sum(axis=-2, keepdims=True)))
In [33]:
plt.figure(figsize=(12, 3))
plt.subplot(121)
plt.hist(x[x < 0.04], bins=100);
plt.subplot(122)
plt.hist(x[(x < 0.04) & (x > 0.0001)], bins=100);
In [34]:
np.mean(x>0.0001)
Out[34]:
0.15592939290058608
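
For orientation before the eval_profile calls below: a hedged sketch of what a binned profile auPRC plausibly computes (not the basepair implementation; it assumes the same per-strand normalization as the histograms above and the pos_min_threshold / neg_max_threshold labeling used below):

from sklearn.metrics import average_precision_score

def binned_auprc_sketch(yt, yp, binsize=10,
                        pos_min_threshold=0.015, neg_max_threshold=0.005):
    # yt: observed counts, yp: predicted probabilities, both (regions, positions, strands)
    yt_frac = yt / (1 + yt.sum(axis=-2, keepdims=True))  # per-strand fractions
    def binify(x):
        r, l, s = x.shape
        return x[:, :l - l % binsize].reshape(r, -1, binsize, s).sum(axis=2)
    yt_b, yp_b = binify(yt_frac).ravel(), binify(yp).ravel()
    pos = yt_b >= pos_min_threshold   # positive bins
    neg = yt_b <= neg_max_threshold   # negative bins; everything in between is ambiguous
    keep = pos | neg
    return average_precision_score(pos[keep], yp_b[keep])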
In [35]:
out_df = Parallel(n_jobs=len(tasks))(
    delayed(basepair.cli.evaluate.eval_profile)(y_true["profile/" + task],
                                                softmax(y_pred[ds.task2idx(task, "profile")]),
                                                binsizes=[1, 10],
                                                pos_min_threshold=0.015,
                                                neg_max_threshold=0.005)
    for task in tasks)
df = pd.concat([out_df[i].assign(task=task) for i, task in enumerate(tasks)])
In [36]:
df
Out[36]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task
0 0.1892 1 0.0697 0.0026 39337 0.0071 Oct4
1 0.5777 10 0.3550 0.0310 32782 0.1517 Oct4
0 0.3805 1 0.0750 0.0057 19474 0.0210 Sox2
... ... ... ... ... ... ... ...
1 0.7911 10 0.2340 0.0482 44206 0.3251 Nanog
0 0.2477 1 0.0680 0.0040 35562 0.0152 Klf4
1 0.7025 10 0.3055 0.0400 26373 0.2278 Klf4

8 rows × 7 columns

Rep0 -> rep1 and rep1 -> rep0

  • each replicate's normalized profile is scored as a "prediction" of the other replicate, giving the cross-replicate upper bound on profile auPRC

In [122]:
binsizes = [1, 2, 5, 10, 20, 50]
In [123]:
np.random.seed(42)
rep_tasks = [task for task in tasks if task != 'Klf4']  # Klf4 has no second replicate
out_df = Parallel(n_jobs=len(rep_tasks))(
    delayed(basepair.cli.evaluate.eval_profile)(
        np.concatenate([np.stack([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']], axis=-1),
                        np.stack([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']], axis=-1)],
                       axis=0),
        np.concatenate([softmax(y_pred[ds.task2idx(task, "profile")]),
                        softmax(y_pred[ds.task2idx(task, "profile")])], axis=0),
        binsizes=binsizes,
        pos_min_threshold=0.015,
        neg_max_threshold=0.005)
    for task in rep_tasks)
df = pd.concat([out_df[i].assign(task=task) for i, task in enumerate(rep_tasks)])
df['method'] = 'bpnet'
In [131]:
task = 'Oct4'
In [132]:
np.random.seed(42)
y_true_rep0 = np.stack([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']], axis=-1)
y_true_rep1 = np.stack([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']], axis=-1)
out_df = basepair.cli.evaluate.eval_profile(np.concatenate([y_true_rep0, 
                                                            y_true_rep1], axis=0), 
                                           np.concatenate([y_true_rep1 / y_true_rep1.sum(axis=1, keepdims=True), 
                                                           y_true_rep0 / y_true_rep0.sum(axis=1, keepdims=True), ], axis=0),
                                           binsizes=binsizes,
                                           pos_min_threshold=0.015,
                                           neg_max_threshold=0.005,)

out_df["task"] = task
out_df["method"] = "replicates"
# note: rep_dfs is initialized in cell In [125] below, which executed before this cell (In [132])
rep_dfs.append(out_df)
out_df
Out[132]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task method
0 0.1706 1 0.0718 0.0044 44806 0.0044 Oct4 replicates
1 0.2614 2 0.1238 0.0087 42020 0.0087 Oct4 replicates
2 0.4279 5 0.2323 0.0225 38022 0.0223 Oct4 replicates
3 0.6018 10 0.3360 0.0472 34479 0.0469 Oct4 replicates
4 0.7819 20 0.4366 0.0970 30063 0.0956 Oct4 replicates
5 0.9221 50 0.5380 0.2287 23256 0.2256 Oct4 replicates
In [133]:
df[df.task == task]
Out[133]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task method
0 0.2519 1 0.0718 0.0044 44806 0.0042 Oct4 bpnet
1 0.3677 2 0.1238 0.0087 42020 0.0082 Oct4 bpnet
2 0.5447 5 0.2323 0.0225 38022 0.0212 Oct4 bpnet
3 0.6999 10 0.3360 0.0472 34479 0.0440 Oct4 bpnet
4 0.8370 20 0.4366 0.0970 30063 0.0884 Oct4 bpnet
5 0.9364 50 0.5380 0.2287 23256 0.2107 Oct4 bpnet
In [124]:
task = 'Sox2'
In [125]:
rep_dfs = []
In [126]:
np.random.seed(42)
y_true_rep0 = np.stack([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']], axis=-1)
y_true_rep1 = np.stack([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']], axis=-1)
out_df = basepair.cli.evaluate.eval_profile(np.concatenate([y_true_rep0, 
                                                            y_true_rep1], axis=0), 
                                           np.concatenate([y_true_rep1 / y_true_rep1.sum(axis=1, keepdims=True), 
                                                           y_true_rep0 / y_true_rep0.sum(axis=1, keepdims=True), ], axis=0),
                                           binsizes=binsizes,
                                           pos_min_threshold=0.015,
                                           neg_max_threshold=0.005,)

out_df["task"] = task
out_df["method"] = "replicates"
rep_dfs.append(out_df)
out_df
Out[126]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task method
0 0.3163 1 0.0749 0.0075 11315 0.0075 Sox2 replicates
1 0.4142 2 0.1264 0.0142 10140 0.0141 Sox2 replicates
2 0.5721 5 0.2302 0.0339 8549 0.0340 Sox2 replicates
3 0.7024 10 0.3280 0.0657 7231 0.0652 Sox2 replicates
4 0.8228 20 0.4245 0.1238 5833 0.1231 Sox2 replicates
5 0.9285 50 0.5172 0.2602 4115 0.2601 Sox2 replicates
In [127]:
df[df.task == task]
Out[127]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task method
0 0.3881 1 0.0749 0.0075 11315 0.0074 Sox2 bpnet
1 0.5216 2 0.1264 0.0142 10140 0.0140 Sox2 bpnet
2 0.6969 5 0.2302 0.0339 8549 0.0333 Sox2 bpnet
3 0.8128 10 0.3280 0.0657 7231 0.0638 Sox2 bpnet
4 0.8932 20 0.4245 0.1238 5833 0.1228 Sox2 bpnet
5 0.9538 50 0.5172 0.2602 4115 0.2556 Sox2 bpnet
In [128]:
task = 'Nanog'
In [129]:
np.random.seed(42)
y_true_rep0 = np.stack([counts[f'/{task}/rep0/pos'], counts[f'/{task}/rep0/neg']], axis=-1)
y_true_rep1 = np.stack([counts[f'/{task}/rep1/pos'], counts[f'/{task}/rep1/neg']], axis=-1)
out_df = basepair.cli.evaluate.eval_profile(np.concatenate([y_true_rep0, 
                                                            y_true_rep1], axis=0), 
                                           np.concatenate([y_true_rep1 / y_true_rep1.sum(axis=1, keepdims=True), 
                                                           y_true_rep0 / y_true_rep0.sum(axis=1, keepdims=True), ], axis=0),
                                           binsizes=binsizes,
                                           pos_min_threshold=0.015,
                                           neg_max_threshold=0.005,)

out_df["task"] = task
out_df["method"] = "replicates"
rep_dfs.append(out_df)
out_df
Out[129]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task method
0 0.3447 1 0.0629 0.0079 80196 0.0079 Nanog replicates
1 0.4527 2 0.1023 0.0144 70520 0.0145 Nanog replicates
2 0.6177 5 0.1742 0.0317 56975 0.0316 Nanog replicates
3 0.7379 10 0.2368 0.0566 47069 0.0564 Nanog replicates
4 0.8380 20 0.2970 0.0988 37859 0.0983 Nanog replicates
5 0.9220 50 0.3617 0.1932 26878 0.1940 Nanog replicates
In [130]:
df[df.task == task]
Out[130]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task method
0 0.4531 1 0.0629 0.0079 80196 0.0079 Nanog bpnet
1 0.5544 2 0.1023 0.0144 70520 0.0146 Nanog bpnet
2 0.6979 5 0.1742 0.0317 56975 0.0320 Nanog bpnet
3 0.8023 10 0.2368 0.0566 47069 0.0577 Nanog bpnet
4 0.8823 20 0.2970 0.0988 37859 0.1010 Nanog bpnet
5 0.9450 50 0.3617 0.1932 26878 0.1984 Nanog bpnet

Plot

In [161]:
df_auprc = pd.concat([df] + rep_dfs, axis=0)
In [162]:
df_auprc
Out[162]:
auprc binsize frac_ambigous imbalance n_positives random_auprc task method
0 0.2519 1 0.0718 0.0044 44806 0.0042 Oct4 bpnet
1 0.3677 2 0.1238 0.0087 42020 0.0082 Oct4 bpnet
2 0.5447 5 0.2323 0.0225 38022 0.0212 Oct4 bpnet
... ... ... ... ... ... ... ... ...
3 0.6018 10 0.3360 0.0472 34479 0.0469 Oct4 replicates
4 0.7819 20 0.4366 0.0970 30063 0.0956 Oct4 replicates
5 0.9221 50 0.5380 0.2287 23256 0.2256 Oct4 replicates

36 rows × 8 columns

In [168]:
df_auprc_random = df.copy()
df_auprc_random['auprc'] = df_auprc_random['random_auprc']
df_auprc_random['method'] = 'random'
In [169]:
df_auprc_tidy = pd.concat([df_auprc, df_auprc_random], axis=0)
del df_auprc_tidy['random_auprc']
In [227]:
from plotnine import *
import plotnine

plotnine.__version__
Out[227]:
'0.5.1'
In [176]:
df_auprc_tidy['method'] = pd.Categorical(df_auprc_tidy['method'], categories=df_auprc_tidy['method'].unique())
df_auprc_tidy['task'] = pd.Categorical(df_auprc_tidy['task'], categories=df_auprc_tidy['task'].unique())
In [200]:
plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
fig = ggplot(aes(x='binsize', y='auprc', color='method'), data=df_auprc_tidy) + \
  scale_x_log10(breaks=binsizes) + \
  geom_point() + \
  geom_line() + \
  facet_grid(".~task") + \
  theme_classic(base_size=10, base_family='Arial') +  \
  theme(legend_position='top') + \
  scale_color_brewer('qual', 7)
fig
Out[200]:
<ggplot: (8767424596961)>
In [228]:
!mkdir -p {figures}/profile-metrics
In [201]:
fig.save(f"{figures}/profile-metrics/auprc.pdf")
fig.save(f"{figures}/profile-metrics/auprc.png", dpi=300)