In [136]:
# exp = 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE'
# With augmentation
exp = 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE'
augment_train = True
augment_test = True
gpu = 0

Goal

  • make the paper Figure 2
    • add also the upper-bound (cross-replicate accuracy)

TODO

  • [X] allow for data augmentation in ChIP-seq in evaluation
  • [X] Why does a constant model yield the same NLL? (0-values when smoothing)
    • shall we smooth the output signal first and then evealuate it using KL divergence or something like that?
  • [ ] Make the example plots for ChIP-seq

Required files

  • {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/model.h5
  • {ddir}/processed/chipnexus/exp/models/oct-sox-nanog-klf/models/n_dil_layers=9/dataspec.yaml -> all files listed in dataspec.yaml
In [498]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.plot.config import get_figsize, paper_config
from basepair.extractors import bw_extract
from matplotlib import ticker

# Use matplotlib paper config
paper_config()
In [166]:
from basepair.exp.paper.config import *
In [167]:
# Common paths
model_dir = models_dir / exp
figures = f"{ddir}/figures/model-evaluation/chipseq-bpnet/{exp}"

# Parameters
model_file = model_dir / "model.h5"
dataspec_file = "../../chipnexus/train/seqmodel/ChIP-seq.dataspec.yml"
history_file = model_dir / "history.csv"
seq_width = 1000
num_workers = 10

Get predictions

In [168]:
import basepair
import pandas as pd
import numpy as np
from basepair.cli.schemas import DataSpec, TaskSpec
from pathlib import Path
from keras.models import load_model
from basepair.datasets import StrandedProfile
from basepair.preproc import AppendCounts
from basepair.losses import MultichannelMultinomialNLL
from basepair.config import valid_chr, test_chr
from basepair.plots import regression_eval, plot_loss
from basepair.cli.evaluate import eval_profile
from basepair import samplers 
from basepair.math import softmax
In [169]:
# !mv {model_dir}/preds.test.pkl {model_dir}/preds.test.bak.pkl
In [170]:
ds = DataSpec.load(dataspec_file)
tasks = list(ds.task_specs)

# Cache predictions
create_tf_session(gpu)
#bpnet = BPNetPredictor.from_mdir(model_dir)
Out[170]:
<tensorflow.python.client.session.Session at 0x7f8f10a2df60>
In [493]:
from basepair.seqmodel import SeqModel
bpnet = SeqModel.from_mdir(model_dir)
In [9]:
profile_bias_pool_size=[1,50]
In [78]:
from basepair.preproc import IntervalAugmentor
if augment_train:
    train_interval_transformer = IntervalAugmentor(max_shift=200, flip_strand=True) 
else:
    train_interval_transformer = lambda x: x
    
if augment_test:
    test_interval_transformer = IntervalAugmentor(max_shift=400, flip_strand=True) 
else:
    test_interval_transformer = lambda x: x
In [494]:
# if not os.path.exists(model_dir / "preds.test.pkl"):
    
model = bpnet.model
dl_test = StrandedProfile(ds, 
                          incl_chromosomes=test_chr, 
                          peak_width=seq_width,
                          shuffle=False,
                          target_transformer=AppendCounts(),
                          interval_transformer=test_interval_transformer,
                          profile_bias_pool_size=profile_bias_pool_size)

test = dl_test.load_all(num_workers=num_workers)
y_true = test["targets"]
y_pred = bpnet.predict(test['inputs'])
y_pred_seqonly = bpnet.predict(test['inputs']['seq'])
# write_pkl((test, y_pred), model_dir / "preds.test.pkl")

# Load predictions
# test, y_pred = read_pkl(model_dir / "preds.test.pkl")
# y_true = test['targets']
100%|██████████| 406/406 [00:11<00:00, 34.66it/s]
In [12]:
# Load the training data and compute the average profile
dl_train = StrandedProfile(ds, 
                           excl_chromosomes=test_chr + valid_chr, 
                           peak_width=seq_width,
                           interval_transformer=train_interval_transformer,
                           shuffle=False)
train_data = dl_train.load_all(num_workers=num_workers)
avg_profile = {k.split("/")[1]: v.mean(axis=0) for k,v in train_data['targets'].items()}
100%|██████████| 1354/1354 [01:00<00:00, 22.36it/s]

Predicted vs observed scatterplot of log10(counts + 1)

In [496]:
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    if loglog:
        plt_fn = ax.loglog
    else:
        plt_fn = ax.plot
        
    plt_fn(y_pred, y_true, ".", 
           markersize=markersize, 
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    rp = r"$R_{p}$"
    rs = r"$R_{s}$"
    ax.set_title(task)
    ax.text(.95, .2, f"{rp}={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
    ax.text(.95, .05, f"{rs}={spearman:.2f}", 
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
In [ ]:
y_pred.keys()
In [15]:
import matplotlib.ticker as ticker
import warnings
warnings.filterwarnings("ignore")

for task in tasks:
    fig, ax= plt.subplots(figsize=get_figsize(frac=0.25, aspect=1))
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    
    regression_eval(yt, 
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    plt.minorticks_off()
    # save the figure
    os.makedirs(f"{figures}/scatter", exist_ok=True)
    fig.savefig(f"{figures}/scatter/{task}.pdf")
    fig.savefig(f"{figures}/scatter/{task}.png")

All plots in one row

In [16]:
fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    yt = np.exp(y_true[f'counts/{task}'].mean(-1))
    yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
    xrange = [10, 1e4]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt, 
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all.pdf")
fig.savefig(f"{figures}/scatter/all.png")

TODO

  • [X] Take into account bias counts
In [507]:
fig, axes = plt.subplots(1, len(tasks), figsize=get_figsize(frac=1, aspect=1/len(tasks)),
                         sharex=True, sharey=True)
for i, (task, ax) in enumerate(zip(tasks, axes)):
    
    yp_all = y_pred[f'{task}/counts'].mean(-1)
    yp_bias = yp_all - y_pred_seqonly[f'{task}/counts'].mean(-1)
    yt = np.exp(y_true[f'counts/{task}'].mean(-1) - yp_bias)
    yp = np.exp(y_pred_seqonly[f'{task}/counts'].mean(-1))
    xrange = [1, 2e2]
    ax.set_ylim(xrange)
    ax.set_xlim(xrange)
    
    
    ax.plot(xrange, xrange, c='grey', alpha=0.2)
    regression_eval(yt, 
                    yp, alpha=.1, task=task, ax=ax, loglog=True)
    ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=4))
    if i > 0:
        ax.set_ylabel("")
fig.subplots_adjust(wspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/scatter/all-bias-corrected.pdf", raster=True)

Profile plots (predicted vs observed)

  • for each set of TF peaks (all non-overlapping) in the test set (chr1, chr8, chr9), choose two with most counts
In [517]:
from basepair.plots import plot_profiles
from basepair import samplers
from kipoi.data_utils import get_dataset_item
from kipoi.metadata import GenomicRanges
import pybedtools
from basepair.utils import flatten_list
from basepair.plot.tracks import plot_tracks, filter_tracks
from basepair.preproc import dfint_no_intersection
from pybedtools import BedTool
In [518]:
# Figure out valid indices (non-overlapping)
df_ranges = pd.DataFrame(test['metadata']['range'])[['chr', 'start','end']]
df_ranges_tasks = {t: df_ranges[test['metadata']['interval_from_task'] == t] for t in bpnet.tasks}
all_intervals = list(BedTool.from_dataframe(df_ranges))

o = dict()
for i,t in enumerate(bpnet.tasks):
    dft = df_ranges.iloc[test['metadata']['interval_from_task'] == t]
    if i == 0:
        o[t] = dft
    else:
        df_existing = pd.concat(list(o.values()), axis=0)
        o[t] = dft[dfint_no_intersection(dft, df_existing)]
valid_idx = pd.concat(list(o.values()), axis=0).index

valid_idx_bool = pd.Series(np.arange(len(df_ranges))).isin(valid_idx)

print("Fraction of non-overlapping peaks:", valid_idx_bool.mean())
Fraction of non-overlapping peaks: 0.6635866635866636
In [519]:
from genomelake.extractors import FastaExtractor
from basepair.preproc import resize_interval
In [520]:
input_seqlen = 1000 - bpnet.body.get_len_change()  - bpnet.heads[0].net.get_len_change()
In [521]:
yt = sum([y_true[f'profile/{task}']
          for task in tasks])
In [522]:
(yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2)).max()
Out[522]:
0.09090909
In [523]:
max_frac = (yt / yt.sum(axis=1, keepdims=True)).max(axis=(-1, -2))
In [524]:
max_pos = (yt ).max(axis=(-1, -2))
total_counts = (yt ).sum(axis=(-1, -2))
In [24]:
# trim_edge = 300
# xlim = [trim_edge, 1000 - trim_edge]
# fig_width=get_figsize(0.5)[0]
# rotate_y=90
# fig_height_per_track=0.5
# tasks = bpnet.tasks

# for task in bpnet.tasks:
#     print(task)
    
#     for i, idx in enumerate(samplers.top_sum_count(y_true[f'profile/{task}'],10, 
#                                                    keep=(test['metadata']['interval_from_task'] == task) & valid_idx_bool )):
#         #& ~blacklisted
# #     for i, idx in enumerate(samplers.top_sum_count(y_true[f'profile/{task}'],10, 
# #                                                    keep=(test['metadata']['interval_from_task'] == task) & valid_idx_bool & ~((max_pos > 400) & (n_zeros > 1000)))):
#     # for idx in samplers.random(y_true[f'profile/{task}'],2):        
#         # get the interval for that idx
#         r = get_dataset_item(test['metadata']['range'], idx)
#         interval = pybedtools.create_interval_from_list([r['chr'], int(r['start']), int(r['end'])])
#         interval_str = f"{interval.chrom}:{interval.start + trim_edge}-{interval.end - trim_edge}"
        
#         # make prediction
        
#         fe = FastaExtractor(ds.fasta_file)
#         seq = fe([resize_interval(interval, input_seqlen, ignore_strand=True)])
#         x = bpnet.neutral_bias_inputs(1000, 1000)
#         x['seq'] = seq
#         pred = bpnet.predict(x)

        
#         # compile the list of tracks to plot
#         viz_dict =flatten_list([[
#             # Observed
#             (f"{task}\nObs", y_true[f'profile/{task}'][idx]),
#             # Predicted
#             (f"\nPred", pred[f'{task}/profile'][0]),
#         ] for task_idx, task in enumerate(tasks)])
        
#         fig = plot_tracks(filter_tracks(viz_dict, xlim),
#                           title=interval_str,
#                           fig_height_per_track=fig_height_per_track,
#                           rotate_y=rotate_y,
#                           fig_width=fig_width,
#                           ylim=None,
#                           legend=False)
#         fig.align_ylabels()
#         os.makedirs(f"{figures}/profiles", exist_ok=True)
#         fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.pdf")
#         fig.savefig(f"{figures}/profiles/max-count-{task}-{idx}-{interval_str}.png")

Smoothing

In [611]:
import matplotlib.pyplot as plt
from basepair.preproc import moving_average

inputs = y_true['profile/Oct4'][:2]
outputs = np.array([moving_average(inp, n=50) for inp in inputs])
In [26]:
inputs[0][:,0].shape
Out[26]:
(1000,)
In [27]:
outputs.shape
Out[27]:
(2, 1000, 2)
In [28]:
for inp in inputs:
    plt.plot(inp[:,0], color='blue')
    plt.plot(inp[:,1], color='orange')
plt.show()
In [29]:
for output in outputs:
    plt.plot(output[:,0], color='blue')
    plt.plot(output[:,1], color='orange')
plt.show()

Replicate task specs

In [526]:
rep_indices = [f'C{x:02d}' for x in np.arange(1, 8)]
In [527]:
reps = pd.read_csv("https://docs.google.com/spreadsheets/d/1PvHGy0P9_Yq0tZFw807bjadxaZHAYECE4RytlI9rdeQ/export?gid=0&format=csv")
In [528]:
reps = reps[reps.Mnemonic.isin(rep_indices)]
In [529]:
reps.head()
Out[529]:
Mnemonic FTP Name thenexus Name Sample ID Comments Data Type TF Name Rep Number Control Reps QC report Unique deduped reads Held-out test #Rep-IDRpeaks (N1, N2, ..) #IDR-optimal peaks (Np) Md5-hash-FASTQ Md5-hash-IDRoptimal Md5-bigwigs Oak path peak Oak path bw pos Oak path bw neg
0 C01 mesc_oct4_chipseq_1 m_esc_1m_oct4_chipseq... 2222.0 NaN chipseq oct4 1.0 C14,C15 http://mitra.stanford... 27M False 10770.0 19351.0 ca231a170bad4679ace07... NaN NaN NaN NaN NaN
1 C02 mesc_oct4_chipseq_2 chipseq_mesc_10m_oct4_2 2383.0 NaN chipseq oct4 2.0 C14,C15 http://mitra.stanford... 18M True 14402.0 19351.0 61280040ed745ef31c228... NaN NaN NaN NaN NaN
2 C03 mesc_sox2_chipseq_1 m_esc_1m_sox2_chipseq... 2223.0 NaN chipseq sox2 1.0 C14,C15 http://mitra.stanford... 19M False 255.0 9497.0 1d4228d99a87618c8813e... NaN NaN NaN NaN NaN
3 C04 mesc_sox2_chipseq_2 mesc_sox2_chipseq_2 3659.0 NaN chipseq sox2 2.0 C14,C15 http://mitra.stanford... 49M False 8847.0 9497.0 28bf183b8c906280844e8... NaN NaN NaN NaN NaN
4 C05 mesc_sox2_chipseq_3 m_esc_10m_sox2_chipse... 2224.0 NaN chipseq sox2 3.0 C14,C15 http://mitra.stanford... 32M True 4202.0 9497.0 NaN NaN NaN NaN NaN NaN
In [540]:
from basepair.cli.schemas import TaskSpec
In [548]:
def get_bigwig(row, strand):
    rep_n = int(row['Rep Number'])
    task = row['TF Name']
    rep_name = row['FTP Name'].split(".")[0]
    return f"/srv/scratch/amr1/chip-seq-pipeline2/cromwell-executions/chip/{task}/call-bam2ta/shard-{rep_n -1}/execution/{task}.{strand}.bw"

def get_taskspec(rows):
    task = rows.iloc[0]['TF Name'].capitalize()
    pos_counts=[get_bigwig(rows.iloc[i], 'positive') for i in range(len(rows))]
    neg_counts=[get_bigwig(rows.iloc[i], 'negative') for i in range(len(rows))]
    return TaskSpec(task=task, pos_counts=pos_counts, neg_counts=neg_counts)
In [549]:
dataspec_train = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == False)])
                                                 for task in tasks},
                  fasta_file="")
In [550]:
dataspec_test = DataSpec(task_specs={task: get_taskspec(reps[(reps['TF Name'].str.capitalize() == task) & (reps['Held-out test'] == True)])
                                                 for task in tasks},
                  fasta_file="")
In [551]:
dataspec_train.touch_all_files()
dataspec_test.touch_all_files()

Compute the same stats for the technical replicates

  • always have two technical replicates for each factor
In [552]:
# Augment the intervals
# test_interval_transformer = IntervalAugmentor(max_shift=600, flip_strand=True) 
shifted_intervals = [test_interval_transformer(interval) for interval in all_intervals]
In [553]:
train = dataspec_train.load_counts(shifted_intervals, progbar=True)
test = dataspec_test.load_counts(shifted_intervals, progbar=True)
100%|██████████| 12987/12987 [00:01<00:00, 6781.40it/s]
100%|██████████| 12987/12987 [00:01<00:00, 7415.04it/s]
100%|██████████| 12987/12987 [00:02<00:00, 6446.78it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5675.07it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5844.02it/s]
100%|██████████| 12987/12987 [00:01<00:00, 6730.13it/s]
100%|██████████| 12987/12987 [00:01<00:00, 7236.28it/s]
100%|██████████| 12987/12987 [00:02<00:00, 6015.07it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5845.01it/s]
100%|██████████| 12987/12987 [00:01<00:00, 6598.72it/s]
100%|██████████| 12987/12987 [00:01<00:00, 6496.49it/s]
100%|██████████| 12987/12987 [00:02<00:00, 6247.80it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5457.48it/s]
100%|██████████| 12987/12987 [00:02<00:00, 5756.09it/s]
In [554]:
ds.touch_all_files()
In [555]:
bias = ds.load_bias_counts(shifted_intervals)
In [556]:
seqs = FastaExtractor(fasta_file, use_strand=True)(shifted_intervals)
In [557]:
y_pred = bpnet.predict(seqs)

Evaluate

In [558]:
from basepair.cli.evaluate import bin_counts_summary

class BinnedMetric:
    def __init__(self, metric, binsizes):
        self.metric = metric
        self.binsizes= binsizes

    def __call__(self, y_true, y_pred):
        return [{"binsize": binsize, **self.metric(
            bin_counts_summary(y_true, binsize=binsize, fn=np.sum),
            bin_counts_summary(y_pred, binsize=binsize, fn=np.sum))}
                for binsize in self.binsizes]
In [559]:
from basepair.exp.paper.config import peak_pred_metric
from basepair.metrics import *

def normalize(x):
    p = x / np.maximum(np.sum(x, axis=-1, keepdims=True), 1e-8)
    p += 1 / (x.shape[-1]*10)  # Should be 
    p = p / np.sum(p, axis=-1, keepdims=True)
    return p

# Override the binsizes for peak_pred_metric
def multinomial_nll(y_true, y_pred):
    from scipy.stats import multinomial
    # subset = y_true.sum(axis=-1) > 0
    subset = slice(None)
    return -multinomial.logpmf(x=y_true[subset],
                               n=y_true[subset].sum(axis=-1), 
                               p=normalize(y_pred[subset]))

def stranded_multinomial_nll(y_true, y_pred):
    return (np.nanmean(multinomial_nll(y_true[:, :, 0], y_pred[:, :, 0])) +
            np.nanmean(multinomial_nll(y_true[:, :, 1], y_pred[:, :, 1])))
In [560]:
from basepair.losses import multinomial_nll as k_multinomial_nll
import keras.backend as K
import tensorflow as tf
In [382]:
# y_pred = K.placeholder(ndim=2)
# y_true = K.placeholder(ndim=2)

# nll = K.function([y_true, y_pred], [k_multinomial_nll(y_true, K.log(y_pred / (1 - y_pred)))])

# nll([np.ones((100, 10)), np.ones((100, 10))/10])[0]
In [428]:
multinomial_nll(np.array([1,2,10]), np.array([1,2,10]))
(3,)
Out[428]:
2.2401849431843814
In [371]:
multinomial_nll(np.array([1,2,10]), np.array([1,1,1]))
Out[371]:
7.527355653197464
In [372]:
multinomial_nll(np.array([1,2,10]), np.array([3,2,1]))
Out[372]:
14.05046367506539
In [48]:
# stranded_multinomial_nll(np.ones((100, 10, 2)), np.ones((100, 10, 2))/10)/2
In [144]:
from basepair.stats import symmetric_kl
In [145]:
def padded_symmetric_kl(y_true, y_pred):
    return (symmetric_kl(normalize(moving_average(y_true[:, :, 0], n=50)), normalize(y_pred[:, :, 0])) +
            symmetric_kl(normalize(moving_average(y_true[:, :, 1], n=50)), normalize(y_pred[:, :, 1])))
In [146]:
for output in outputs:
    plt.plot(train_reps['Oct4'][0][:,0], color='blue')
    plt.plot(train_reps['Oct4'][0][:,1], color='orange')
plt.show()
In [60]:
plt.plot(test['Nanog'][0][:, 0])
plt.plot(test['Nanog'][0][:, 1])
Out[60]:
[<matplotlib.lines.Line2D at 0x7f8ee40699b0>]
In [72]:
import random
In [74]:
random.sample(range(100), 3)
Out[74]:
[12, 38, 8]

TODO

  • [X] plot also the bias track
  • choose one footprint where we predict well
  • compute the log-likelihood for the multinomial NLL
  • compare first vs second replicate
In [513]:
train_reps = {}
for task in tasks:
    train_reps[task] = np.array([moving_average(train[task][i], n=50) for i in range(len(train[task]))])
In [571]:
dftm
Out[571]:
idx task variable value
0 6493 Nanog rep 161.2594
1 6493 Oct4 rep 111.4792
2 6493 Sox2 rep 78.5938
... ... ... ... ...
9 6493 Nanog const 137.1863
10 6493 Oct4 const 92.7489
11 6493 Sox2 const 69.6990

12 rows × 4 columns

In [606]:
# task = 'Nanog'

dft = pd.concat([pd.DataFrame({
    "idx": np.arange(len(test[task])),
    "rep": multinomial_nll(test[task][:,:,0], 
                           moving_average(train_reps[task][:,:,0], n=50)),
    "pred": multinomial_nll(test[task][:,:,0], 
                            y_pred[f"{task}/profile"][:,:,0]),
    "avg": multinomial_nll(test[task][:,:,0], 
                           np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0)[:,:,0]),
    "const": multinomial_nll(test[task][:,:,0], 
                             np.repeat(np.ones_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0)[:,:,0]),
    'task': task
}) for task in tasks]).reset_index()

dftm = pd.melt(dft.groupby('task').mean().reset_index(), id_vars='task', 
               value_vars=['rep', 'pred', 'avg', 'const'])
In [605]:
# 100
dft['rep'].mean()
Out[605]:
99.72438733883433
In [607]:
# 50
dft['rep'].mean()
Out[607]:
99.91845892760922
In [609]:
# 50
dft['pred'].mean()
Out[609]:
91.43684157947176
In [598]:
# TODO - figure out the wors difference
dft.loc[dft.avg.argmax()]
Out[598]:
index     6443
idx       6443
rep      822.2
pred     561.7
avg      919.8
const    723.9
task     Nanog
Name: 32417, dtype: object
In [599]:
# TODO - figure out the wors difference
dft.loc[dft.rep.argmax()]
Out[599]:
index     6443
idx       6443
rep      822.2
pred     561.7
avg      919.8
const    723.9
task     Nanog
Name: 32417, dtype: object

Questions

  • [ ] how strongly are the ChIP-seq tracks correlated with each other?
In [604]:
i = 11
TF = 'Sox2'
plot_tracks({"obs1": train[TF][i],
             "obs2": test[TF][i],
             "bias": bias['input'][i],
             "smooth1": moving_average(train[TF][i], n=50),
             "smooth2": moving_average(test[TF][i], n=50),
             "bias smooth": moving_average(bias['input'][i], n=50),
             "avg": avg_profile[task],
             "pred": y_pred[f'{TF}/profile'][i],
            }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
dft[(dft.idx == i) & (dft.task == TF)]
Out[604]:
index idx rep pred avg const task
12998 11 11 155.5408 141.7314 153.7959 159.5265 Sox2
In [602]:
i = 11
TF = 'Sox2'
plot_tracks({"obs1": train[TF][i],
             "obs2": test[TF][i],
             "bias": bias['input'][i],
             "smooth1": moving_average(train[TF][i], n=50),
             "smooth2": moving_average(test[TF][i], n=50),
             "bias smooth": moving_average(bias['input'][i], n=50),
             "avg": avg_profile[task],
             "pred": y_pred[f'{TF}/profile'][i],
            }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
dft[(dft.idx == i) & (dft.task == TF)]
Out[602]:
index idx rep pred avg const task
12998 11 11 153.7203 141.7314 153.7959 159.5265 Sox2
In [653]:
def plot_track_w_rug(arr, ax, legend=False, ylim=None, color=None, track=None):
    """Plot a track
    """
    import collections
    seqlen = len(arr)
    if arr.shape[1] == 4:
        # plot seqlogo
        seqlogo(arr, ax=ax)
    elif arr.shape[1] == 2:
        # plot both strands
        if color is not None:
            assert isinstance(color, collections.Sequence)
            c1 = color[0]
            c2 = color[1]
        else:
            c1, c2 = None, None
        
        if 'Obs' in track:
            y_smooth = moving_average(arr, n=50)
            ax.plot(np.arange(1, seqlen + 1), y_smooth[:,0], label='pos', color=c1)
            ax.plot(np.arange(1, seqlen + 1), y_smooth[:,1], label='neg', color=c2)
            
            # Rugs
            rug_pos = np.where(arr[:, 0]>=1)[0] + 1
            rug_neg = np.where(arr[:, 1]>=1)[0] + 1
            sns.rugplot(rug_pos, color=c1, alpha=0.3, ax=ax, height=.1)
            sns.rugplot(rug_neg, color=c2, alpha=0.3, ax=ax, height=.1)
        else:
            ax.plot(np.arange(1, seqlen + 1), arr[:, 0], label='pos', color=c1)
            ax.plot(np.arange(1, seqlen + 1), arr[:, 1], label='neg', color=c2)
        
        if legend:
            ax.legend()
    else:
        raise ValueError(f"Don't know how to plot array with shape[1] != {arr.shape[1]}. Valid values are: 1,2 or 4.")
    if ylim is not None:
        ax.set_ylim(ylim)
    ax.set_xlim([0, seqlen])
In [672]:
# chr1:136680205-136680605 (known Zfp281 enhancer) 
# chr1:180924752-180925152 (known Letfy1 enhancer) 
intervals = [
    ('known Zfp281 enhancer', 'chr1:136680205-136680605'),
    ('known Letfy1 enhancer', 'chr1:180924752-180925152'),
]
In [657]:
viz_dict = flatten_list([[
    ("Obs bias", bias['input'][i]),
    ("Obs 1", train[TF][i]),
    ("Obs 3", test[TF][i]),
    # "avg": avg_profile[task],
    ("pred", y_pred[f'{TF}/profile'][i]),
            ] for TF in tasks])
In [660]:
tasks
Out[660]:
['Oct4', 'Sox2', 'Nanog']
In [659]:
OrderedDict(viz_dict).keys()
Out[659]:
odict_keys(['Obs bias', 'Obs 1', 'Obs 3', 'pred'])
In [664]:
viz_dict.keys()
Out[664]:
odict_keys(['Oct4 Obs bias', 'Oct4 Obs 1', 'Oct4 Obs 3', 'Oct4 pred', 'Sox2 Obs bias', 'Sox2 Obs 1', 'Sox2 Obs 3', 'Sox2 pred', 'Nanog Obs bias', 'Nanog Obs 1', 'Nanog Obs 3', 'Nanog pred'])
In [671]:
i = 11
TF = 'Sox2'
for i in random.sample(range(len(train[TF])), 20):
    viz_dict = OrderedDict([(f"Obs bias", bias['input'][i])] + flatten_list([[
        (f"{TF} Obs", train[TF][i] + test[TF][i]),
        # (f"{TF} Obs 2", test[TF][i]),
        # "avg": avg_profile[task],
        (f"{TF} pred", y_pred[f'{TF}/profile'][i]),
                ] for TF in tasks]))

    plot_tracks(viz_dict, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, 
                color=[['blue', 'orange']]*4*3,
                rotate_y=0, plot_track_fn=plot_track_w_rug);
    dft[(dft.idx == i) & (dft.task == TF)]
    sns.despine(top=True, bottom=True, right=True)
In [ ]:
moving_average(train[TF][i], n=25)
In [581]:
moving_average(train[TF][i], n=50).min()
Out[581]:
0.0
In [587]:
x = moving_average(train[TF][i], n=50)
x = x / x.sum(axis=0, keepdims=True)
In [589]:
x.min()
Out[589]:
0.0
In [590]:
x.max()
Out[590]:
0.005657530802112133
In [578]:
y_pred[f'{TF}/profile'][i].max()
Out[578]:
0.009529398
In [579]:
y_pred[f'{TF}/profile'][i].min()
Out[579]:
4.722764e-05
In [378]:
# 10-4
(ggplot(aes(x='task', fill='variable', y='value'), dftm) + 
 geom_bar(stat='identity', position='dodge')
)
Out[378]:
<ggplot: (-9223363271124186515)>
In [331]:
# 10-3
(ggplot(aes(x='task', fill='variable', y='value'), dftm) + 
 geom_bar(stat='identity', position='dodge')
)
Out[331]:
<ggplot: (8765716433584)>
In [327]:
# 10-4
(ggplot(aes(x='task', fill='variable', y='value'), dftm) + 
 geom_bar(stat='identity', position='dodge')
)
Out[327]:
<ggplot: (8765713298303)>

Figure out which ones are off

Questions

  • why is log likelihood of the reps much worse than the constant model?
    • is it because of many 0's?
    • shall we increaase the biase size?
  • store the performance per example (binsize=1, positive strand)
    • store predictive performance
In [181]:
TF = 'Sox2'
i = 0
for i in random.sample(range(len(train[TF])), 20):
    plot_tracks({"obs1": train[TF][i],
                 "obs2": test[TF][i],
                 "bias": bias['input'][i],
                 "smooth1": moving_average(train[TF][i], n=50),
                 "smooth2": moving_average(test[TF][i], n=50),
                 "bias smooth": moving_average(bias['input'][i], n=50),
                 "pred": y_pred[f'{TF}/profile'][i],       
                }, fig_width=get_figsize(.5)[0], fig_height_per_track=.5, rotate_y=0);
In [467]:
binsizes = [1, 2, 5, 10, 20, 50]  # 2, 5, 
peak_pred_metric.binsizes = binsizes

metric = BPNetMetricSingleProfile(
    count_metric=pearson_spearman,
    profile_metric=stranded_multinomial_nll
)
In [468]:
binned_metrics = BinnedMetric(metric, binsizes)
In [469]:
train_reps = {}
for task in tasks:
    train_reps[task] = np.array([moving_average(train[task][i], n=50) for i in range(len(train[task]))])
In [470]:
metrics = {task: binned_metrics(test[task], train_reps[task]) for task in tqdm(tasks)}
100%|██████████| 3/3 [13:27<00:00, 268.68s/it]
In [471]:
metrics_model = {task: binned_metrics(test[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis])) for task in tqdm(tasks)}
100%|██████████| 3/3 [14:37<00:00, 289.24s/it]
In [472]:
metrics_avg = {task: binned_metrics(test[task], 
                                    np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0)) 
               for task in tqdm(tasks)}
100%|██████████| 3/3 [13:54<00:00, 279.57s/it]
In [473]:
metrics_const = {task: binned_metrics(test[task], 
                                    np.repeat(np.ones_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0)) 
               for task in tqdm(tasks)}
100%|██████████| 3/3 [13:52<00:00, 276.89s/it]
In [474]:
def profile_metrics2df(m, norm=None):
    out = []
    for task,v in m.items():
        for bin_id in range(len(v)):
            if norm is None:
                profile = v[bin_id]['profile']
            else:
                profile = v[bin_id]['profile'] - norm[task][bin_id]['profile']
            out.append({"task": task,
                        "binsize": v[bin_id]['binsize'],
                        "profile": profile
                       })
    return pd.DataFrame(out)


def count_metrics2df(m):
    out = []
    for task,v in m.items():
        out.append({"task": task,
                    "pearsonr": v[0]['counts']['pearsonr']
                       })
    return pd.DataFrame(out)
In [475]:
dfm_profile = profile_metrics2df(metrics, metrics_const).assign(method='Replicates')
dfm_counts = count_metrics2df(metrics).assign(method='Replicates')
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_model, metrics_const).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(metrics_avg, metrics_const).assign(method='Average')])

dfm_counts = pd.concat([dfm_counts, count_metrics2df(metrics_model).assign(method='BPNet')])
In [476]:
dfm_profile['split'] = 'train->test'
dfm_profile1 = dfm_profile.copy()
dfm_counts['split'] = 'train->test'
dfm_counts1 = dfm_counts.copy()
In [477]:
# Swap train and test
In [478]:
test_reps = {}
for task in tasks:
    test_reps[task] = np.array([moving_average(test[task][i], n=50) for i in range(len(test[task]))])
    
shuff_metrics = {task: binned_metrics(train[task], test_reps[task]) for task in tqdm(tasks)}
100%|██████████| 3/3 [14:00<00:00, 279.89s/it]
In [479]:
shuff_metrics_model = {task: binned_metrics(train[task], y_pred[f"{task}/profile"] * np.exp(y_pred[f"{task}/counts"][:, np.newaxis])) 
                       for task in tqdm(tasks)}
100%|██████████| 3/3 [14:08<00:00, 282.10s/it]
In [480]:
shuff_metrics_avg = {task: binned_metrics(train[task],
                                          np.repeat(avg_profile[task][np.newaxis], len(test[task]), axis=0)) 
                     for task in tqdm(tasks)}
100%|██████████| 3/3 [14:11<00:00, 283.76s/it]
In [481]:
shuff_metrics_const = {task: binned_metrics(train[task],
                                          np.repeat(np.zeros_like(avg_profile[task][np.newaxis]), len(test[task]), axis=0)) 
                     for task in tqdm(tasks)}
100%|██████████| 3/3 [14:09<00:00, 283.33s/it]
In [482]:
dfm_profile = profile_metrics2df(shuff_metrics, shuff_metrics_const).assign(method='Replicates')
dfm_counts = count_metrics2df(shuff_metrics).assign(method='Replicates')
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(shuff_metrics_model, shuff_metrics_const).assign(method='BPNet')])
dfm_profile = pd.concat([dfm_profile, profile_metrics2df(shuff_metrics_avg, shuff_metrics_const).assign(method='Average')])
dfm_counts = pd.concat([dfm_counts, count_metrics2df(shuff_metrics_model).assign(method='BPNet')])
In [483]:
dfm_profile['split'] = 'test->train'
dfm_profile2 = dfm_profile.copy()
dfm_counts['split'] = 'test->train'
dfm_counts2 = dfm_counts.copy()
In [484]:
df_nll_tidy = pd.concat([dfm_profile1, dfm_profile2])
df_nll_tidy['method'] = pd.Categorical(df_nll_tidy['method'], categories=["BPNet", "Replicates", "Average"])
df_nll_tidy['task'] = pd.Categorical(df_nll_tidy['task'], categories=df_nll_tidy['task'].unique())
In [485]:
df_counts_tidy = pd.concat([dfm_counts1, dfm_counts2])
df_counts_tidy['method'] = pd.Categorical(df_counts_tidy['method'], categories=["BPNet", "Replicates"])
df_counts_tidy['task'] = pd.Categorical(df_counts_tidy['task'], categories=df_counts_tidy['task'].unique())

Plot

In [486]:
from plotnine import *
import plotnine

plotnine.__version__
Out[486]:
'0.5.1'
In [487]:
paper_config()
In [488]:
df_nll_tidy = df_nll_tidy[df_nll_tidy.binsize < 100]
In [489]:
def booms(inputs):
    import math
    out = []
    for key, item in inputs:
        [a, b] = item.values
        a = a[~np.isnan(a)]
        b = b[~np.isnan(b)]
        out.append([a.mean(), b.mean()])
    return out

Q: Why is there such a big gap between replicates and average?

  • investigate the big gap in performance on the per example basis
    • which example deviate a lot?
In [490]:
df_nll_tidy['profile_ll'] = -df_nll_tidy['profile']
In [491]:
plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
fig = ggplot(aes(x='binsize', y='profile_ll', color='method'), data=df_nll_tidy.groupby(['binsize', 'task', 'method']).profile_ll.mean().reset_index()) + \
  scale_x_log10(breaks=binsizes) + \
  geom_point() + \
  geom_line() + \
  geom_abline(intercept=0, slope=0) + \
  facet_grid(".~task") + \
  theme_classic(base_size=10, base_family='Arial') +  \
  theme(legend_position='top') + \
  scale_color_brewer('qual', 7) + \
  ylab("Log-likelihood")
fig.save(f"{figures}/ll.pdf")
fig.save(f"{figures}/ll.png", dpi=300)
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 3.42519 x 1.027557 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/model-evaluation/chipseq-bpnet/seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE/ll.pdf
  warn('Filename: {}'.format(filename))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:706: UserWarning: Saving 3.42519 x 1.027557 in image.
  from_inches(height, units), units))
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/ggplot.py:707: UserWarning: Filename: /users/avsec/workspace/basepair/data/figures/model-evaluation/chipseq-bpnet/seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE/ll.png
  warn('Filename: {}'.format(filename))
Out[491]:
<ggplot: (-9223363271131884028)>
In [461]:
plotnine.options.figure_size = get_figsize(0.5, aspect=0.3)
fig = ggplot(aes(x='binsize', y='profile_ll', color='method'), data=df_nll_tidy.groupby(['binsize', 'task', 'method']).profile_ll.mean().reset_index()) + \
  scale_x_log10(breaks=binsizes) + \
  geom_point() + \
  geom_line() + \
  geom_abline(intercept=0, slope=0) + \
  facet_grid(".~task") + \
  theme_classic(base_size=10, base_family='Arial') +  \
  theme(legend_position='top') + \
  scale_color_brewer('qual', 7) + \
  ylab("Log-likelihood")
#fig.save(f"{figures}/nll.pdf")
#fig.save(f"{figures}/nll.png", dpi=300)
fig
Out[461]:
<ggplot: (8765716610203)>
In [465]:
plotnine.options.figure_size = get_figsize(0.7, aspect=0.3)
fig = ggplot(aes(x='method', y='pearsonr', fill='method'), data=df_counts_tidy.groupby(['task', 'method']).pearsonr.mean().reset_index()) + \
  geom_bar(stat='identity', position='dodge') + \
  geom_line() + \
  facet_grid(".~task") + \
  theme_classic(base_size=10, base_family='Arial') +  \
  theme(legend_position='top') + \
  scale_fill_brewer('qual', 7)
fig.save(f"{figures}/counts.pdf")
fig.save(f"{figures}/counts.png", dpi=300)
fig
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/geoms/geom_path.py:80: UserWarning: geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
  warn("geom_path: Each group consist of only one "
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/geoms/geom_path.py:80: UserWarning: geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
  warn("geom_path: Each group consist of only one "
/users/avsec/bin/anaconda3/envs/chipnexus/lib/python3.6/site-packages/plotnine/geoms/geom_path.py:80: UserWarning: geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
  warn("geom_path: Each group consist of only one "
Out[465]:
<ggplot: (8765716718705)>
In [75]:
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    if loglog:
        plt_fn = ax.loglog
    else:
        plt_fn = ax.plot
        
    plt_fn(y_pred, y_true, ".", 
           markersize=markersize, 
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Observed")
    
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    
    ax.set_title(task)
    ax.text(0.95, .01, f"R={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            fontsize=8,
            transform=ax.transAxes)
In [76]:
fig, axes = plt.subplots(2, len(tasks), figsize=get_figsize(frac=.5, aspect=2/len(tasks)),
                         sharex=True, sharey=True)
for i, task in enumerate(tasks):
    for j in range(2):
        ax = axes[j,i]
        yt = test[task].sum(axis=1).mean(-1) + 1
        if j == 0:
            yp = np.exp(y_pred[f'{task}/counts'].mean(-1))
        else:
            yp = train[task].sum(axis=1).mean(-1) + 1
        xrange = [2, 5e3]
        ax.set_ylim(xrange)
        ax.set_xlim(xrange)

        ax.plot(xrange, xrange, c='grey', alpha=0.2)
        regression_eval(yt, 
                        yp, alpha=.01, task=task if j == 0 else '', ax=ax, loglog=True)
        
        ax.xaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        ax.yaxis.set_major_locator(ticker.LogLocator(base=10.0, numticks=3))
        if i > 0:
            ax.set_ylabel("")
        else:
            ax.set_ylabel("BPNet\nObserved" if j==0 else "Replicates\n")
fig.subplots_adjust(wspace=0, hspace=0)
plt.minorticks_off()
# Save the figure
fig.savefig(f"{figures}/all.pdf")
fig.savefig(f"{figures}/all.png")