In [1]:
from basepair.utils import read_pkl
from basepair.imports import *
paper_config()
Using TensorFlow backend.
In [2]:
df_mpra = pd.read_csv('output/MPRA.csv')
In [3]:
from config import model_exps
In [4]:
model_exps_inf = {v:k for k,v in model_exps.items()}
In [5]:
df_mpra['model'] = df_mpra.exp.map(model_exps_inf)
In [45]:
with pd.option_context("display.max_colwidth", 100):
    print(df_mpra.query('data=="genomic"')[['model', 'metrics/genomic/spearmanr']].
          sort_values("metrics/genomic/spearmanr").
          to_string())
                                       model  metrics/genomic/spearmanr
18  nexus/profile.peaks-union.bias-corrected                     0.3900
14                             seq/binary.gw                     0.3982
16     seq/profile.peaks.bias-corrected.augm                     0.4133
15          seq/profile.peaks.bias-corrected                     0.4134
17      seq/profile.peaks.non-bias-corrected                     0.4199
19    seq/profile.peaks-union.bias-corrected                     0.4293
10                           nexus/binary.gw                     0.4294
11        nexus/profile.peaks.bias-corrected                     0.4389
12   nexus/profile.peaks.bias-corrected.augm                     0.4409
13    nexus/profile.peaks.non-bias-corrected                     0.4628
In [46]:
with pd.option_context("display.max_colwidth", 100):
    print(df_mpra.query('data=="synthetic"')[['model', 'metrics/synthetic/spearmanr']].
          sort_values("metrics/synthetic/spearmanr").
          to_string())
                                      model  metrics/synthetic/spearmanr
4                             seq/binary.gw                       0.6506
0                           nexus/binary.gw                       0.7786
3    nexus/profile.peaks.non-bias-corrected                       0.8456
5          seq/profile.peaks.bias-corrected                       0.8487
7      seq/profile.peaks.non-bias-corrected                       0.8518
9    seq/profile.peaks-union.bias-corrected                       0.8567
6     seq/profile.peaks.bias-corrected.augm                       0.8571
1        nexus/profile.peaks.bias-corrected                       0.8628
2   nexus/profile.peaks.bias-corrected.augm                       0.8742
8  nexus/profile.peaks-union.bias-corrected                       0.8749
In [49]:
df_act = pd.read_csv('../../../src/chipnexus/train/seqmodel/output/activity.csv')
In [51]:
df_act = df_act[df_act.split == 'test']
In [57]:
df_act[[
        'model_kwargs/bn', 
        'model_kwargs/n_hidden/0',
        'model_kwargs/pool_size',
        'model_kwargs/pool_type']].drop_duplicates()
Out[57]:
model_kwargs/bn model_kwargs/n_hidden/0 model_kwargs/pool_size model_kwargs/pool_type
2 NaN 64 50.0 max
32 NaN 64 NaN global_avg
62 True 64 NaN global_avg
In [60]:
with pd.option_context("display.max_colwidth", 100):
    print(df_act[['model',
        'model_kwargs/bn', 
        'model_kwargs/pool_size',
        'model_kwargs/pool_type', 'metrics/H3K27ac/spearmanr', 'metrics/PolII/spearmanr']].
          sort_values("metrics/PolII/spearmanr").
          to_string())
                                       model model_kwargs/bn  model_kwargs/pool_size model_kwargs/pool_type  metrics/H3K27ac/spearmanr  metrics/PolII/spearmanr
17          seq/profile.peaks.bias-corrected             NaN                    50.0                    max                     0.3094                   0.4672
80     seq/profile.peaks.bias-corrected.augm            True                     NaN             global_avg                     0.3126                   0.4686
26  nexus/profile.peaks-union.bias-corrected             NaN                    50.0                    max                     0.3205                   0.4733
77          seq/profile.peaks.bias-corrected            True                     NaN             global_avg                     0.3296                   0.4746
20     seq/profile.peaks.bias-corrected.augm             NaN                    50.0                    max                     0.3372                   0.4747
47          seq/profile.peaks.bias-corrected             NaN                     NaN             global_avg                     0.3195                   0.4750
65        nexus/profile.peaks.bias-corrected            True                     NaN             global_avg                     0.3466                   0.4786
23      seq/profile.peaks.non-bias-corrected             NaN                    50.0                    max                     0.3208                   0.4831
14                             seq/binary.gw             NaN                    50.0                    max                     0.3403                   0.4855
5         nexus/profile.peaks.bias-corrected             NaN                    50.0                    max                     0.3601                   0.4899
29    seq/profile.peaks-union.bias-corrected             NaN                    50.0                    max                     0.3366                   0.4911
11    nexus/profile.peaks.non-bias-corrected             NaN                    50.0                    max                     0.3657                   0.4919
2                            nexus/binary.gw             NaN                    50.0                    max                     0.3791                   0.4958
89    seq/profile.peaks-union.bias-corrected            True                     NaN             global_avg                     0.3597                   0.4998
41    nexus/profile.peaks.non-bias-corrected             NaN                     NaN             global_avg                     0.3734                   0.5014
8    nexus/profile.peaks.bias-corrected.augm             NaN                    50.0                    max                     0.3765                   0.5021
74                             seq/binary.gw            True                     NaN             global_avg                     0.3804                   0.5036
59    seq/profile.peaks-union.bias-corrected             NaN                     NaN             global_avg                     0.3641                   0.5045
50     seq/profile.peaks.bias-corrected.augm             NaN                     NaN             global_avg                     0.3640                   0.5062
53      seq/profile.peaks.non-bias-corrected             NaN                     NaN             global_avg                     0.3532                   0.5072
83      seq/profile.peaks.non-bias-corrected            True                     NaN             global_avg                     0.3613                   0.5077
44                             seq/binary.gw             NaN                     NaN             global_avg                     0.3790                   0.5082
56  nexus/profile.peaks-union.bias-corrected             NaN                     NaN             global_avg                     0.3547                   0.5084
35        nexus/profile.peaks.bias-corrected             NaN                     NaN             global_avg                     0.3758                   0.5117
86  nexus/profile.peaks-union.bias-corrected            True                     NaN             global_avg                     0.3708                   0.5195
71    nexus/profile.peaks.non-bias-corrected            True                     NaN             global_avg                     0.4038                   0.5214
38   nexus/profile.peaks.bias-corrected.augm             NaN                     NaN             global_avg                     0.3929                   0.5224
68   nexus/profile.peaks.bias-corrected.augm            True                     NaN             global_avg                     0.3968                   0.5234
32                           nexus/binary.gw             NaN                     NaN             global_avg                     0.4133                   0.5287
62                           nexus/binary.gw            True                     NaN             global_avg                     0.4231                   0.5288
In [54]:
df_act['model'] = df_act.exp.map(model_exps_inf)
In [52]:
df_act
Out[52]:
data/fasta_file data/intervals_file data/use_bigwigs/H3K27ac/0 data/use_bigwigs/H3K27ac/1 data/use_bigwigs/PolII/0 data/use_bigwigs/PolII/1 exp metrics/H3K27ac/mad metrics/H3K27ac/mse metrics/H3K27ac/pearsonr metrics/H3K27ac/spearmanr metrics/H3K27ac/var_explained metrics/PolII/mad metrics/PolII/mse metrics/PolII/pearsonr metrics/PolII/spearmanr metrics/PolII/var_explained model model_kwargs/bn model_kwargs/n_hidden/0 model_kwargs/pool_size model_kwargs/pool_type split tasks/0 tasks/1
2 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,gw,OSNK,1,0,0,F... 0.7601 0.8623 0.3791 0.3791 0.1391 0.6481 0.6676 0.5685 0.4958 0.3187 model_pool NaN 64 50.0 max test H3K27ac PolII
5 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,peaks,OSNK,0,10... 0.7813 0.8803 0.3599 0.3601 0.1290 0.6659 0.6777 0.5604 0.4899 0.3137 model_pool NaN 64 50.0 max test H3K27ac PolII
8 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,peaks,OSNK,0,10... 0.7785 0.8764 0.3759 0.3765 0.1408 0.6575 0.6613 0.5745 0.5021 0.3300 model_pool NaN 64 50.0 max test H3K27ac PolII
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
83 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... seq,peaks,OSN,0,10,1,... 0.7758 0.8756 0.3633 0.3613 0.1307 0.6452 0.6460 0.5861 0.5077 0.3430 model_pool True 64 NaN global_avg test H3K27ac PolII
86 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,nexus-seq-union... 0.7740 0.8697 0.3686 0.3708 0.1356 0.6495 0.6501 0.5821 0.5195 0.3389 model_pool True 64 NaN global_avg test H3K27ac PolII
89 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... seq,nexus-seq-union,O... 0.7802 0.8764 0.3622 0.3597 0.1311 0.6545 0.6567 0.5773 0.4998 0.3332 model_pool True 64 NaN global_avg test H3K27ac PolII

30 rows × 25 columns

In [50]:
df_act.head()
Out[50]:
data/fasta_file data/intervals_file data/use_bigwigs/H3K27ac/0 data/use_bigwigs/H3K27ac/1 data/use_bigwigs/PolII/0 data/use_bigwigs/PolII/1 exp metrics/H3K27ac/mad metrics/H3K27ac/mse metrics/H3K27ac/pearsonr metrics/H3K27ac/spearmanr metrics/H3K27ac/var_explained metrics/PolII/mad metrics/PolII/mse metrics/PolII/pearsonr metrics/PolII/spearmanr metrics/PolII/var_explained model model_kwargs/bn model_kwargs/n_hidden/0 model_kwargs/pool_size model_kwargs/pool_type split tasks/0 tasks/1
0 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,gw,OSNK,1,0,0,F... 0.7263 0.7783 0.4741 0.4736 0.2228 0.6365 0.6376 0.6022 0.5623 0.3625 model_pool NaN 64 50.0 max train H3K27ac PolII
1 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,gw,OSNK,1,0,0,F... 0.7674 0.8784 0.3644 0.3626 0.1246 0.6682 0.7267 0.5345 0.4726 0.2738 model_pool NaN 64 50.0 max valid H3K27ac PolII
2 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,gw,OSNK,1,0,0,F... 0.7601 0.8623 0.3791 0.3791 0.1391 0.6481 0.6676 0.5685 0.4958 0.3187 model_pool NaN 64 50.0 max test H3K27ac PolII
3 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,peaks,OSNK,0,10... 0.7480 0.8181 0.4290 0.4245 0.1824 0.6447 0.6527 0.5897 0.5461 0.3476 model_pool NaN 64 50.0 max train H3K27ac PolII
4 /mnt/data/pipeline_ge... /users/avsec/workspac... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... /srv/scratch/avsec/wo... nexus,peaks,OSNK,0,10... 0.7724 0.8703 0.3603 0.3579 0.1285 0.6659 0.6968 0.5430 0.4890 0.2923 model_pool NaN 64 50.0 max valid H3K27ac PolII
In [13]:
# mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
# figures = f"{ddir}/figures/model-evaluation/ChIP-nexus/MPRA"
# fig_genome = f"{ddir}/figures/activity/genome"
In [16]:
mdir = 'output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
figures = f"{ddir}/figures/model-evaluation/ChIP-nexus/MPRA"
fig_genome = f"{ddir}/figures/activity/genome"
In [18]:
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    if loglog:
        plt_fn = ax.loglog
    else:
        plt_fn = ax.plot
        
    plt_fn(y_pred, y_true, ".", 
           markersize=markersize, 
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted (FC)")
    ax.set_ylabel("Observed (FC)")
    
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    rp = r"$R_{p}$"
    rs = r"$R_{s}$"
    ax.set_title(task)
    ax.text(.95, .2, f"{rp}={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
    ax.text(.95, .05, f"{rs}={spearman:.2f}", 
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)

MPRA

Synthetic

In [24]:
preds = read_pkl(f"{mdir}/MPRA/synthetic/pool_size=15/RandomForestRegressor/n_estimators=100/out-of-fold.preds.pkl")
In [25]:
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds['y_pred'][:,0], 2**preds['y_true'][:,0], markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")
In [26]:
preds = read_pkl(f"{mdir}/MPRA/genomic/pool_size=15/RandomForestRegressor/n_estimators=100/out-of-fold.preds.pkl")
In [27]:
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds['y_pred'][:,0], 2**preds['y_true'][:,0], markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/gen.obs-vs-pred.pdf")
fig.savefig(f"{figures}/gen.obs-vs-pred.png")

Activity

In [50]:
preds = read_pkl(f"{mdir}/activity//peaks/H3K27ac;PolII/model_pool/pool_type=max;n_hidden=[64];pool_size=50/test.preds.pkl")
In [51]:
from sklearn.linear_model import LinearRegression
In [58]:
fig, axes = plt.subplots(1, 2, figsize=get_figsize(.7, 1/2))

for i, c in enumerate(preds['columns']):
    ax = axes[i]
    #lm = LinearRegression()
    #lm.fit(preds['y_pred'][:, i, np.newaxis], preds['y_true'][:, i])
    #y_pred = lm.predict(preds['y_pred'][:,i, np.newaxis])
    y_pred = preds['y_pred'][:, i]
    regression_eval(np.exp(y_pred), 
                    np.exp(preds['y_true'][:, i]),
                    same_lim=True,
                    markersize=2, alpha=0.4, ax=ax, loglog=True);
    ax.set_title(c)
    plt.tight_layout()
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.pdf")
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.png")
In [ ]:
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds, 2**dfs_syn.expression_fc_log2.values, markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")
In [116]:
ls {mdir}/'activity//peaks/H3K27ac;PolII/model_pool/pool_type=global_avg;n_hidden=[64];pool_size=null/test.preds.pkl'
kwargs.json  test.metrics.json   train.preds.pkl
model.h5     test.preds.pkl      valid.metrics.json
preproc.pkl  train.metrics.json  valid.preds.pkl