# exp = 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE'
# With augmentation
exp = 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE'
augment_train = True
augment_test = True
gpu = 0

Goal¶

make the paper Figure 2
- add also the upper-bound (cross-replicate accuracy)

TODO¶

[X] allow for data augmentation in ChIP-seq in evaluation
[X] Why does a constant model yield the same NLL? (0-values when smoothing)
- shall we smooth the output signal first and then evealuate it using KL divergence or something like that?
[ ] Make the example plots for ChIP-seq

Required files¶

{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/model.h5
{ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/dataspec.yaml -> all files listed in dataspec.yaml

# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.config import get_figsize, paper_config
from basepair.extractors import bw_extract
from matplotlib import ticker

# Use matplotlib paper config
paper_config()

from basepair.exp.paper.config import *

# Common paths
model_dir = models_dir / exp
figures = f"{ddir}/figures/model-evaluation/chipseq-bpnet/{exp}"

# Parameters
model_file = model_dir / "model.h5"
dataspec_file = "../../chipnexus/train/seqmodel/ChIP-seq.dataspec.yml"
history_file = model_dir / "history.csv"
seq_width = 1000
num_workers = 10

Get predictions¶

import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from pathlib import Path
from keras.models import load_model
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.losses import MultichannelMultinomialNLL
from basepair.config import valid_chr, test_chr
from basepair.plots import regression_eval, plot_loss
from basepair.cli.evaluate import eval_profile
from basepair import samplers 
from basepair.math import softmax

# !mv {model_dir}/preds.test.pkl {model_dir}/preds.test.bak.pkl

ds = DataSpec.load(dataspec_file)
tasks = list(ds.task_specs)

# Cache predictions
create_tf_session(gpu)
#bpnet = BPNetPredictor.from_mdir(model_dir)

<tensorflow.python.client.session.Session at 0x7f8f10a2df60>

from basepair.seqmodel import SeqModel
bpnet = SeqModel.from_mdir(model_dir)

profile_bias_pool_size=[1,50]

from basepair.preproc import IntervalAugmentor
if augment_train:
    train_interval_transformer = IntervalAugmentor(max_shift=200, flip_strand=True) 
else:
    train_interval_transformer = lambda x: x
    
if augment_test:
    test_interval_transformer = IntervalAugmentor(max_shift=400, flip_strand=True) 
else:
    test_interval_transformer = lambda x: x

# if not os.path.exists(model_dir / "preds.test.pkl"):
    
model = bpnet.model
dl_test = StrandedProfile(ds, 
                          incl_chromosomes=test_chr, 
                          peak_width=seq_width,
                          shuffle=False,
                          target_transformer=AppendCounts(),
                          interval_transformer=test_interval_transformer,
                          profile_bias_pool_size=profile_bias_pool_size)

test = dl_test.load_all(num_workers=num_workers)
y_true = test["targets"]
y_pred = bpnet.predict(test['inputs'])
y_pred_seqonly = bpnet.predict(test['inputs']['seq'])
# write_pkl((test, y_pred), model_dir / "preds.test.pkl")

# Load predictions
# test, y_pred = read_pkl(model_dir / "preds.test.pkl")
# y_true = test['targets']

100%|██████████| 406/406 [00:11<00:00, 34.66it/s]

# Load the training data and compute the average profile
dl_train = StrandedProfile(ds, 
                           excl_chromosomes=test_chr + valid_chr, 
                           peak_width=seq_width,
                           interval_transformer=train_interval_transformer,
                           shuffle=False)
train_data = dl_train.load_all(num_workers=num_workers)
avg_profile = {k.split("/")[1]: v.mean(axis=0) for k,v in train_data['targets'].items()}

100%|██████████| 1354/1354 [01:00<00:00, 22.36it/s]

Predicted vs observed scatterplot of log10(counts + 1)¶

def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    if loglog:
        plt_fn = ax.loglog
    else:
        plt_fn = ax.plot
        
    plt_fn(y_pred, y_true, ".", 
           markersize=markersize, 
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    rp = r"$R_{p}$"
    rs = r"$R_{s}$"
    ax.set_title(task)
    ax.text(.95, .2, f"{rp}={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
    ax.text(.95, .05, f"{rs}={spearman:.2f}", 
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)

y_pred.keys()

import matplotlib.ticker as ticker
import warnings
warnings.filterwarnings("ignore")

for task in tasks:
    fig, ax= plt.subplots(figsize=get_figsize(frac=0.25, aspect=1))
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    
    regression_eval(yt, 
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    plt.minorticks_off()
    # save the figure
    os.makedirs(f"{figures}/scatter", exist_ok=True)
    fig.savefig(f"{figures}/scatter/{task}.pdf")
    fig.savefig(f"{figures}/scatter/{task}.png")

All plots in one row¶

fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt, 
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all.pdf")
fig.savefig(f"{figures}/scatter/all.png")

TODO¶

[X] Take into account bias counts

fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    
    yp_all = y_pred[f'{task}/counts'].mean(-1)
    yp_bias = yp_all - y_pred_seqonly[f'{task}/counts'].mean(-1)
    yt = np.exp(y_true[f'counts/{task}'].mean(-1) - yp_bias)
    yp = np.exp(y_pred_seqonly[f'{task}/counts'].mean(-1))
    xrange = [1, 2e2]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    
    
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt, 
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all-bias-corrected.pdf", raster=True)

Profile plots (predicted vs observed)¶

for each set of TF peaks (all non-overlapping) in the test set (chr1, chr8, chr9), choose two with most counts

from basepair.plots import plot_profiles
from basepair import samplers
from kipoi.data_utils import get_dataset_item
from kipoi.metadata import GenomicRanges
import pybedtools
from basepair.utils import flatten_list
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import dfint_no_intersection
from pybedtools import BedTool

# Figure out valid indices (non-overlapping)
df_ranges = pd.DataFrame(test['metadata']['range'])[['chr', 'start','end']]
df_ranges_tasks = {t: df_ranges[test['metadata']['interval_from_task'] == t] for t in bpnet.tasks}
all_intervals = list(BedTool.from_dataframe(df_ranges))

o = dict()
for i,t in enumerate(bpnet.tasks):
    dft = df_ranges.iloc[test['metadata']['interval_from_task'] == t]
    if i == 0:
        o[t] = dft
    else:
        df_existing = pd.concat(list(o.values()), axis=0)
        o[t] = dft[dfint_no_intersection(dft, df_existing)]
valid_idx = pd.concat(list(o.values()), axis=0).index

valid_idx_bool = pd.Series(np.arange(len(df_ranges))).isin(valid_idx)

print("Fraction of non-overlapping peaks:", valid_idx_bool.mean())

Fraction of non-overlapping peaks: 0.6635866635866636

from genomelake.extractors import FastaExtractor
from basepair.preproc import resize_interval

input_seqlen = 1000 - bpnet.body.get_len_change()  - bpnet.heads[0].net.get_len_change()

yt = sum([y_true[f'profile/{task}']
          for task in tasks])

(yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2)).max()

0.09090909

max_frac = (yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2))

max_pos = (yt ).max(axis=(-1, -2))
total_counts = (yt ).sum(axis=(-1, -2))

# trim_edge = 300
# xlim = [trim_edge, 1000 - trim_edge]
# fig_width=get_figsize(0.5)[0]
# rotate_y=90
# fig_height_per_track=0.5
# tasks = bpnet.tasks

# for task in bpnet.tasks:
#     print(task)
    
#     for i, idx in enumerate(samplers.top_sum_count(y_true[f'profile/{task}'],10, 
#                                                    keep=(test['metadata']['interval_from_task'] == task) & valid_idx_bool )):
#         #& ~blacklisted
# #     for i, idx in enumerate(samplers.top_sum_count(y_true[f'profile/{task}'],10, 
# #                                                    keep=(test['metadata']['interval_from_task'] == task) & valid_idx_bool & ~((max_pos > 400) & (n_zeros > 1000)))):
#     # for idx in samplers.random(y_true[f'profile/{task}'],2):        
#         # get the interval for that idx
#         r = get_dataset_item(test['metadata']['range'], idx)
#         interval = pybedtools.create_interval_from_list([r['chr'], int(r['start']), int(r['end'])])
#         interval_str = f"{interval.chrom}:{interval.start + trim_edge}-{interval.end - trim_edge}"
        
#         # make prediction
        
#         fe = FastaExtractor(ds.fasta_file)
#         seq = fe([resize_interval(interval, input_seqlen, ignore_strand=True)])
#         x = bpnet.neutral_bias_inputs(1000, 1000)
#         x['seq'] = seq
#         pred = bpnet.predict(x)

        
#         # compile the list of tracks to plot
#         viz_dict =flatten_list([[
#             # Observed
#             (f"{task}\nObs", y_true[f'profile/{task}'][idx]),
#             # Predicted
#             (f"\nPred", pred[f'{task}/profile'][0]),
#         ] for task_idx, task in enumerate(tasks)])
        
#         fig = plot_tracks(filter_tracks(viz_dict, xlim),
#                           title=interval_str,
#                           fig_height_per_track=fig_height_per_track,
#                           rotate_y=rotate_y,
#                           fig_width=fig_width,
#                           ylim=None,
#                           legend=False)
#         fig.align_ylabels()
#         os.makedirs(f"{figures}/profiles", exist_ok=True)
#         fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.pdf")
#         fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.png")

Smoothing¶

import matplotlib.pyplot as plt
from basepair.preproc import moving_average

inputs = y_true['profile/Oct4'][:2]
outputs = np.array([moving_average(inp, n=50) for inp in inputs])

inputs[0][:,0].shape

(1000,)

outputs.shape

(2, 1000, 2)

for inp in inputs:
    plt.plot(inp[:,0], color='blue')
    plt.plot(inp[:,1], color='orange')
plt.show()

for output in outputs:
    plt.plot(output[:,0], color='blue')
    plt.plot(output[:,1], color='orange')
plt.show()

Replicate task specs¶

rep_indices = [f'C{x:02d}' for x in np.arange(1, 8)]

reps = pd.read_csv("https://docs.google.com/spreadsheets/d/1PvHGy0P9_Yq0tZFw807bjadxaZHAYECE4RytlI9rdeQ/export?gid=0&format=csv")

reps = reps[reps.Mnemonic.isin(rep_indices)]

reps.head()

from basepair.cli.schemas import TaskSpec

def get_bigwig(row, strand):
    rep_n = int(row['Rep Number'])
    task = row['TF Name']
    rep_name = row['FTP Name'].split(".")[0]
    return f"/srv/scratch/amr1/chip-seq-pipeline2/cromwell-executions/chip/{task}/call-bam2ta/shard-{rep_n -1}/execution/{task}.{strand}.bw"

def get_taskspec(rows):
    task = rows.iloc[0]['TF Name'].capitalize()
    pos_counts=[get_bigwig(rows.iloc[i], 'positive') for i in range(len(rows))]
    neg_counts=[get_bigwig(rows.iloc[i], 'negative') for i in range(len(rows))]
    return TaskSpec(task=task, pos_counts=pos_counts, neg_counts=neg_counts)

dataspec_train = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == False)])
                                                 for task in tasks},
                  fasta_file="")

dataspec_test = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == True)])
                                                 for task in tasks},
                  fasta_file="")

dataspec_train.touch_all_files()
dataspec_test.touch_all_files()

Compute the same stats for the technical replicates¶

always have two technical replicates for each factor

# Augment the intervals
# test_interval_transformer = IntervalAugmentor(max_shift=600, flip_strand=True) 
shifted_intervals = [test_interval_transformer(interval) for interval in all_intervals]

train = dataspec_train.load_counts(shifted_intervals, progbar=True)
test = dataspec_test.load_counts(shifted_intervals, progbar=True)

100%|██████████| 12987/12987 [00:01<00:00, 6781.40it/s]
100%|██████████| 12987/12987 [00:01<00:00, 7415.04it/s]
100%|██████████| 12987/12987 [00:02<00:00, 6446.78it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5675.07it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5844.02it/s]
100%|██████████| 12987/12987 [00:01<00:00, 6730.13it/s]
100%|██████████| 12987/12987 [00:01<00:00, 7236.28it/s]
100%|██████████| 12987/12987 [00:02<00:00, 6015.07it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5845.01it/s]
100%|██████████| 12987/12987 [00:01<00:00, 6598.72it/s]
100%|██████████| 12987/12987 [00:01<00:00, 6496.49it/s]
100%|██████████| 12987/12987 [00:02<00:00, 6247.80it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5457.48it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5756.09it/s]

ds.touch_all_files()

bias = ds.load_bias_counts(shifted_intervals)

seqs = FastaExtractor(fasta_file, use_strand=True)(shifted_intervals)

y_pred = bpnet.predict(seqs)

Evaluate¶

from basepair.cli.evaluate import bin_counts_summary

class BinnedMetric:
    def __init__(self, metric, binsizes):
        self.metric = metric
        self.binsizes= binsizes

    def __call__(self, y_true, y_pred):
        return [{"binsize": binsize, **self.metric(
            bin_counts_summary(y_true, binsize=binsize, fn=np.sum),
            bin_counts_summary(y_pred, binsize=binsize, fn=np.sum))}
                for binsize in self.binsizes]

from basepair.exp.paper.config import peak_pred_metric
from basepair.metrics import *

def normalize(x):
    p = x / np.maximum(np.sum(x, axis=-1, keepdims=True), 1e-8)
    p += 1 / (x.shape[-1]*10)  # Should be 
    p = p / np.sum(p, axis=-1, keepdims=True)
    return p

# Override the binsizes for peak_pred_metric
def multinomial_nll(y_true, y_pred):
    from scipy.stats import multinomial
    # subset = y_true.sum(axis=-1) > 0
    subset = slice(None)
    return -multinomial.logpmf(x=y_true[subset],
                               n=y_true[subset].sum(axis=-1), 
                               p=normalize(y_pred[subset]))

def stranded_multinomial_nll(y_true, y_pred):
    return (np.nanmean(multinomial_nll(y_true[:, :, 0], y_pred[:, :, 0])) +
            np.nanmean(multinomial_nll(y_true[:, :, 1], y_pred[:, :, 1])))

from basepair.losses import multinomial_nll as k_multinomial_nll
import keras.backend as K
import tensorflow as tf

# y_pred = K.placeholder(ndim=2)
# y_true = K.placeholder(ndim=2)

# nll = K.function([y_true, y_pred], [k_multinomial_nll(y_true, K.log(y_pred / (1 - y_pred)))])

# nll([np.ones((100, 10)), np.ones((100, 10))/10])[0]

multinomial_nll(np.array([1,2,10]), np.array([1,2,10]))

(3,)

2.2401849431843814

multinomial_nll(np.array([1,2,10]), np.array([1,1,1]))

7.527355653197464

multinomial_nll(np.array([1,2,10]), np.array([3,2,1]))

14.05046367506539

# stranded_multinomial_nll(np.ones((100, 10, 2)), np.ones((100, 10, 2))/10)/2

from basepair.stats import symmetric_kl

def padded_symmetric_kl(y_true, y_pred):
    return (symmetric_kl(normalize(moving_average(y_true[:, :, 0], n=50)), normalize(y_pred[:, :, 0])) +
            symmetric_kl(normalize(moving_average(y_true[:, :, 1], n=50)), normalize(y_pred[:, :, 1])))

for output in outputs:
    plt.plot(train_reps['Oct4'][0][:,0], color='blue')
    plt.plot(train_reps['Oct4'][0][:,1], color='orange')
plt.show()

plt.plot(test['Nanog'][0][:, 0])
plt.plot(test['Nanog'][0][:, 1])

[<matplotlib.lines.Line2D at 0x7f8ee40699b0>]

import random

random.sample(range(100), 3)

[12, 38, 8]

TODO¶

[X] plot also the bias track
choose one footprint where we predict well
compute the log-likelihood for the multinomial NLL
compare first vs second replicate

train_reps = {}
for task in tasks:
    train_reps[task] = np.array([moving_average(train[task][i], n=50) for i in range(len(train[task]))])

dftm

# task = 'Nanog'

dft = pd.concat([pd.DataFrame({
    "idx": np.arange(len(test[task])),
    "rep": multinomial_nll(test[task][:,:,0], 
                           moving_average(train_reps[task][:,:,0], n=50)),
    "pred": multinomial_nll(test[task][:,:,0], 
                            y_pred[f"{task}/profile"][:,:,0]),
    "avg": multinomial_nll(test[task][:,:,0], 
                           np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0)[:,:,0]),
    "const": multinomial_nll(test[task][:,:,0], 
                             np.repeat(np.ones_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0)[:,:,0]),
    'task': task
}) for task in tasks]).reset_index()

dftm = pd.melt(dft.groupby('task').mean().reset_index(), id_vars='task', 
               value_vars=['rep', 'pred', 'avg', 'const'])

# 100
dft['rep'].mean()

99.72438733883433

# 50
dft['rep'].mean()

99.91845892760922

# 50
dft['pred'].mean()

91.43684157947176

# TODO - figure out the wors difference
dft.loc[dft.avg.argmax()]

index     6443
idx       6443
rep      822.2
pred     561.7
avg      919.8
const    723.9
task     Nanog
Name: 32417, dtype: object

# TODO - figure out the wors difference
dft.loc[dft.rep.argmax()]

index     6443
idx       6443
rep      822.2
pred     561.7
avg      919.8
const    723.9
task     Nanog
Name: 32417, dtype: object

Questions¶

[ ] how strongly are the ChIP-seq tracks correlated with each other?

i = 11
TF = 'Sox2'
plot_tracks({"obs1": train[TF][i],
             "obs2": test[TF][i],
             "bias": bias['input'][i],
             "smooth1": moving_average(train[TF][i], n=50),
             "smooth2": moving_average(test[TF][i], n=50),
             "bias smooth": moving_average(bias['input'][i], n=50),
             "avg": avg_profile[task],
             "pred": y_pred[f'{TF}/profile'][i],
            }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
dft[(dft.idx == i) & (dft.task == TF)]

i = 11
TF = 'Sox2'
plot_tracks({"obs1": train[TF][i],
             "obs2": test[TF][i],
             "bias": bias['input'][i],
             "smooth1": moving_average(train[TF][i], n=50),
             "smooth2": moving_average(test[TF][i], n=50),
             "bias smooth": moving_average(bias['input'][i], n=50),
             "avg": avg_profile[task],
             "pred": y_pred[f'{TF}/profile'][i],
            }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
dft[(dft.idx == i) & (dft.task == TF)]

def plot_track_w_rug(arr, ax, legend=False, ylim=None, color=None, track=None):
    """Plot a track
    """
    import collections
    seqlen = len(arr)
    if arr.shape[1] == 4:
        # plot seqlogo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # plot both strands
        if color is not None:
            assert isinstance(color, collections.Sequence)
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
        
        if 'Obs' in track:
            y_smooth = moving_average(arr, n=50)
            ax.plot(np.arange(1, seqlen + 1), y_smooth[:,0], label='pos', color=c1)
            ax.plot(np.arange(1, seqlen + 1), y_smooth[:,1], label='neg', color=c2)
            
            # Rugs
            rug_pos = np.where(arr[:, 0]>=1)[0] + 1
            rug_neg = np.where(arr[:, 1]>=1)[0] + 1
            sns.rugplot(rug_pos, color=c1, alpha=0.3, ax=ax, height=.1)
            sns.rugplot(rug_neg, color=c2, alpha=0.3, ax=ax, height=.1)
        else:
            ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1)
            ax.plot(np.arange(1, seqlen + 1), arr[:, 1], label='neg', color=c2)
        
        if legend:
            ax.legend()
    else:
        raise ValueError(f"Don't know how to plot array with shape[1] != {arr.shape[1]}. Valid values are: 1,2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.set_xlim([0, seqlen])

# chr1:136680205-136680605 (known Zfp281 enhancer) 
# chr1:180924752-180925152 (known Letfy1 enhancer) 
intervals = [
    ('known Zfp281 enhancer', 'chr1:136680205-136680605'),
    ('known Letfy1 enhancer', 'chr1:180924752-180925152'),
]

viz_dict = flatten_list([[
    ("Obs bias", bias['input'][i]),
    ("Obs 1", train[TF][i]),
    ("Obs 3", test[TF][i]),
    # "avg": avg_profile[task],
    ("pred", y_pred[f'{TF}/profile'][i]),
            ] for TF in tasks])

tasks

['Oct4', 'Sox2', 'Nanog']

OrderedDict(viz_dict).keys()

odict_keys(['Obs bias', 'Obs 1', 'Obs 3', 'pred'])

viz_dict.keys()

odict_keys(['Oct4 Obs bias', 'Oct4 Obs 1', 'Oct4 Obs 3', 'Oct4 pred', 'Sox2 Obs bias', 'Sox2 Obs 1', 'Sox2 Obs 3', 'Sox2 pred', 'Nanog Obs bias', 'Nanog Obs 1', 'Nanog Obs 3', 'Nanog pred'])

i = 11
TF = 'Sox2'
for i in random.sample(range(len(train[TF])), 20):
    viz_dict = OrderedDict([(f"Obs bias", bias['input'][i])] + flatten_list([[
        (f"{TF} Obs", train[TF][i] + test[TF][i]),
        # (f"{TF} Obs 2", test[TF][i]),
        # "avg": avg_profile[task],
        (f"{TF} pred", y_pred[f'{TF}/profile'][i]),
                ] for TF in tasks]))

    plot_tracks(viz_dict, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, 
                color=[['blue', 'orange']]*4*3,
                rotate_y=0, plot_track_fn=plot_track_w_rug);
    dft[(dft.idx == i) & (dft.task == TF)]
    sns.despine(top=True, bottom=True, right=True)

moving_average(train[TF][i], n=25)

moving_average(train[TF][i], n=50).min()

0.0

x = moving_average(train[TF][i], n=50)
x = x / x.sum(axis=0, keepdims=True)

x.min()

0.0

x.max()

0.005657530802112133

y_pred[f'{TF}/profile'][i].max()

0.009529398

y_pred[f'{TF}/profile'][i].min()

4.722764e-05

# 10-4
(ggplot(aes(x='task', fill='variable', y='value'), dftm) + 
 geom_bar(stat='identity', position='dodge')
)

<ggplot: (-9223363271124186515)>

# 10-3
(ggplot(aes(x='task', fill='variable', y='value'), dftm) + 
 geom_bar(stat='identity', position='dodge')
)

<ggplot: (8765716433584)>

# 10-4
(ggplot(aes(x='task', fill='variable', y='value'), dftm) + 
 geom_bar(stat='identity', position='dodge')
)

<ggplot: (8765713298303)>

Figure out which ones are off¶

Questions¶

why is log likelihood of the reps much worse than the constant model?
- is it because of many 0's?
- shall we increaase the biase size?
store the performance per example (binsize=1, positive strand)
- store predictive performance

TF = 'Sox2'
i = 0
for i in random.sample(range(len(train[TF])), 20):
    plot_tracks({"obs1": train[TF][i],
                 "obs2": test[TF][i],
                 "bias": bias['input'][i],
                 "smooth1": moving_average(train[TF][i], n=50),
                 "smooth2": moving_average(test[TF][i], n=50),
                 "bias smooth": moving_average(bias['input'][i], n=50),
                 "pred": y_pred[f'{TF}/profile'][i],       
                }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);

binsizes = [1, 2, 5, 10, 20, 50]  # 2, 5, 
peak_pred_metric.binsizes = binsizes

metric = BPNetMetricSingleProfile(
    count_metric=pearson_spearman,
    profile_metric=stranded_multinomial_nll
)

binned_metrics = BinnedMetric(metric, binsizes)

train_reps = {}
for task in tasks:
    train_reps[task] = np.array([moving_average(train[task][i], n=50) for i in range(len(train[task]))])

metrics = {task: binned_metrics(test[task], train_reps[task]) for task in tqdm(tasks)}

100%|██████████| 3/3 [13:27<00:00, 268.68s/it]

metrics_model = {task: binned_metrics(test[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis])) for task in tqdm(tasks)}

100%|██████████| 3/3 [14:37<00:00, 289.24s/it]

metrics_avg = {task: binned_metrics(test[task], 
                                    np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0)) 
               for task in tqdm(tasks)}

100%|██████████| 3/3 [13:54<00:00, 279.57s/it]

metrics_const = {task: binned_metrics(test[task], 
                                    np.repeat(np.ones_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0)) 
               for task in tqdm(tasks)}

100%|██████████| 3/3 [13:52<00:00, 276.89s/it]

def profile_metrics2df(m, norm=None):
    out = []
    for task,v in m.items():
        for bin_id in range(len(v)):
            if norm is None:
                profile = v[bin_id]['profile']
            else:
                profile = v[bin_id]['profile'] - norm[task][bin_id]['profile']
            out.append({"task": task,
                        "binsize": v[bin_id]['binsize'],
                        "profile": profile
                       })
    return pd.DataFrame(out)


def count_metrics2df(m):
    out = []
    for task,v in m.items():
        out.append({"task": task,
                    "pearsonr": v[0]['counts']['pearsonr']
                       })
    return pd.DataFrame(out)

dfm_profile = profile_metrics2df(metrics, metrics_const).assign(method='Replicates')
dfm_counts = count_metrics2df(metrics).assign(method='Replicates')
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_model, metrics_const).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_avg, metrics_const).assign(method='Average')])

dfm_counts = pd.concat([dfm_counts, count_metrics2df(metrics_model).assign(method='BPNet')])

dfm_profile['split'] = 'train->test'
dfm_profile1 = dfm_profile.copy()
dfm_counts['split'] = 'train->test'
dfm_counts1 = dfm_counts.copy()

# Swap train and test

test_reps = {}
for task in tasks:
    test_reps[task] = np.array([moving_average(test[task][i], n=50) for i in range(len(test[task]))])
    
shuff_metrics = {task: binned_metrics(train[task], test_reps[task]) for task in tqdm(tasks)}

100%|██████████| 3/3 [14:00<00:00, 279.89s/it]

shuff_metrics_model = {task: binned_metrics(train[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis])) 
                       for task in tqdm(tasks)}

100%|██████████| 3/3 [14:08<00:00, 282.10s/it]

shuff_metrics_avg = {task: binned_metrics(train[task],
                                          np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0)) 
                     for task in tqdm(tasks)}

100%|██████████| 3/3 [14:11<00:00, 283.76s/it]

shuff_metrics_const = {task: binned_metrics(train[task],
                                          np.repeat(np.zeros_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0)) 
                     for task in tqdm(tasks)}

100%|██████████| 3/3 [14:09<00:00, 283.33s/it]

dfm_profile = profile_metrics2df(shuff_metrics, shuff_metrics_const).assign(method='Replicates')
dfm_counts = count_metrics2df(shuff_metrics).assign(method='Replicates')
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(shuff_metrics_model, shuff_metrics_const).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(shuff_metrics_avg, shuff_metrics_const).assign(method='Average')])
dfm_counts = pd.concat([dfm_counts, count_metrics2df(shuff_metrics_model).assign(method='BPNet')])

dfm_profile['split'] = 'test->train'
dfm_profile2 = dfm_profile.copy()
dfm_counts['split'] = 'test->train'
dfm_counts2 = dfm_counts.copy()

df_nll_tidy = pd.concat([dfm_profile1, dfm_profile2])
df_nll_tidy['method'] = pd.Categorical(df_nll_tidy['method'], categories=["BPNet", "Replicates", "Average"])
df_nll_tidy['task'] = pd.Categorical(df_nll_tidy['task'], categories=df_nll_tidy['task'].unique())

df_counts_tidy = pd.concat([dfm_counts1, dfm_counts2])
df_counts_tidy['method'] = pd.Categorical(df_counts_tidy['method'], categories=["BPNet", "Replicates"])
df_counts_tidy['task'] = pd.Categorical(df_counts_tidy['task'], categories=df_counts_tidy['task'].unique())

Plot

from plotnine import *
import plotnine

plotnine.__version__

'0.5.1'

paper_config()

df_nll_tidy = df_nll_tidy[df_nll_tidy.binsize < 100]

def booms(inputs):
    import math
    out = []
    for key, item in inputs:
        [a, b] = item.values
        a = a[~np.isnan(a)]
        b = b[~np.isnan(b)]
        out.append([a.mean(), b.mean()])
    return out

Q: Why is there such a big gap between replicates and average?¶

investigate the big gap in performance on the per example basis
- which example deviate a lot?

df_nll_tidy['profile_ll'] = -df_nll_tidy['profile']

plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
fig = ggplot(aes(x='binsize', y='profile_ll', color='method'), data=df_nll_tidy.groupby(['binsize', 'task', 'method']).profile_ll.mean().reset_index()) + \
  scale_x_log10(breaks=binsizes) + \
  geom_point() + \
  geom_line() + \
  geom_abline(intercept=0, slope=0) + \
  facet_grid(".~task") + \
  theme_classic(base_size=10, base_family='Arial') +  \
  theme(legend_position='top') + \
  scale_color_brewer('qual', 7) + \
  ylab("Log-likelihood")
fig.save(f"{figures}/ll.pdf")
fig.save(f"{figures}/ll.png", dpi=300)
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 3.42519 x 1.027557 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/model-evaluation/chipseq-bpnet/seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE/ll.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 3.42519 x 1.027557 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/model-evaluation/chipseq-bpnet/seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE/ll.png
  warn('Filename: {}'.format(filename))

<ggplot: (-9223363271131884028)>

plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
fig = ggplot(aes(x='binsize', y='profile_ll', color='method'), data=df_nll_tidy.groupby(['binsize', 'task', 'method']).profile_ll.mean().reset_index()) + \
  scale_x_log10(breaks=binsizes) + \
  geom_point() + \
  geom_line() + \
  geom_abline(intercept=0, slope=0) + \
  facet_grid(".~task") + \
  theme_classic(base_size=10, base_family='Arial') +  \
  theme(legend_position='top') + \
  scale_color_brewer('qual', 7) + \
  ylab("Log-likelihood")
#fig.save(f"{figures}/nll.pdf")
#fig.save(f"{figures}/nll.png", dpi=300)
fig

<ggplot: (8765716610203)>

plotnine.options.figure_size = get_figsize(0.7, aspect=0.3)
fig = ggplot(aes(x='method', y='pearsonr', fill='method'), data=df_counts_tidy.groupby(['task', 'method']).pearsonr.mean().reset_index()) + \
  geom_bar(stat='identity', position='dodge') + \
  geom_line() + \
  facet_grid(".~task") + \
  theme_classic(base_size=10, base_family='Arial') +  \
  theme(legend_position='top') + \
  scale_fill_brewer('qual', 7)
fig.save(f"{figures}/counts.pdf")
fig.save(f"{figures}/counts.png", dpi=300)
fig

/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/geoms/geom_path.py:80: UserWarning: geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
  warn("geom_path: Each group consist of only one "
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/geoms/geom_path.py:80: UserWarning: geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
  warn("geom_path: Each group consist of only one "
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/geoms/geom_path.py:80: UserWarning: geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
  warn("geom_path: Each group consist of only one "

<ggplot: (8765716718705)>

def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    if loglog:
        plt_fn = ax.loglog
    else:
        plt_fn = ax.plot
        
    plt_fn(y_pred, y_true, ".", 
           markersize=markersize, 
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    
    ax.set_title(task)
    ax.text(0.95, .01, f"R={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            fontsize=8,
            transform=ax.transAxes)

fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(frac=.5, aspect=2/len(tasks)),
                         sharex=True, sharey=True)
for i, task in enumerate(tasks):
    for j in range(2):
        ax = axes[j,i]
        yt = test[task].sum(axis=1).mean(-1) + 1
        if j == 0:
            yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
        else:
            yp = train[task].sum(axis=1).mean(-1) + 1
        xrange = [2, 5e3]
        ax.set_ylim(xrange)
        ax.set_xlim(xrange)

        ax.plot(xrange, xrange, c='grey', alpha=0.2)
        regression_eval(yt, 
                        yp, alpha=.01, task=task if j == 0 else '', ax=ax, loglog=True)
        
        ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        if i > 0:
            ax.set_ylabel("")
        else:
            ax.set_ylabel("BPNet\nObserved" if j==0 else "Replicates\n")
fig.subplots_adjust(wspace=0, hspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/all.pdf")
fig.savefig(f"{figures}/all.png")

	Mnemonic	FTP Name	thenexus Name	Sample ID	Comments	Data Type	TF Name	Rep Number	Control Reps	QC report	Unique deduped reads	Held-out test	#Rep-IDRpeaks (N1, N2, ..)	#IDR-optimal peaks (Np)	Md5-hash-FASTQ	Md5-hash-IDRoptimal	Md5-bigwigs	Oak path peak	Oak path bw pos	Oak path bw neg
0	C01	mesc_oct4_chipseq_1	m_esc_1m_oct4_chipseq...	2222.0	NaN	chipseq	oct4	1.0	C14,C15	http://mitra.stanford...	27M	False	10770.0	19351.0	ca231a170bad4679ace07...	NaN	NaN	NaN	NaN	NaN
1	C02	mesc_oct4_chipseq_2	chipseq_mesc_10m_oct4_2	2383.0	NaN	chipseq	oct4	2.0	C14,C15	http://mitra.stanford...	18M	True	14402.0	19351.0	61280040ed745ef31c228...	NaN	NaN	NaN	NaN	NaN
2	C03	mesc_sox2_chipseq_1	m_esc_1m_sox2_chipseq...	2223.0	NaN	chipseq	sox2	1.0	C14,C15	http://mitra.stanford...	19M	False	255.0	9497.0	1d4228d99a87618c8813e...	NaN	NaN	NaN	NaN	NaN
3	C04	mesc_sox2_chipseq_2	mesc_sox2_chipseq_2	3659.0	NaN	chipseq	sox2	2.0	C14,C15	http://mitra.stanford...	49M	False	8847.0	9497.0	28bf183b8c906280844e8...	NaN	NaN	NaN	NaN	NaN
4	C05	mesc_sox2_chipseq_3	m_esc_10m_sox2_chipse...	2224.0	NaN	chipseq	sox2	3.0	C14,C15	http://mitra.stanford...	32M	True	4202.0	9497.0	NaN	NaN	NaN	NaN	NaN	NaN

	idx	task	variable	value
0	6493	Nanog	rep	161.2594
1	6493	Oct4	rep	111.4792
2	6493	Sox2	rep	78.5938
...	...	...	...	...
9	6493	Nanog	const	137.1863
10	6493	Oct4	const	92.7489
11	6493	Sox2	const	69.6990