from basepair.utils import read_pkl
from basepair.imports import *
paper_config()

Using TensorFlow backend.

df_mpra = pd.read_csv('output/MPRA.csv')

from config import model_exps

model_exps_inf = {v:k for k,v in model_exps.items()}

df_mpra['model'] = df_mpra.exp.map(model_exps_inf)

with pd.option_context("display.max_colwidth", 100):
    print(df_mpra.query('data=="genomic"')[['model', 'metrics/genomic/spearmanr']].
          sort_values("metrics/genomic/spearmanr").
          to_string())

                                       model  metrics/genomic/spearmanr
18  nexus/profile.peaks-union.bias-corrected                     0.3900
14                             seq/binary.gw                     0.3982
16     seq/profile.peaks.bias-corrected.augm                     0.4133
15          seq/profile.peaks.bias-corrected                     0.4134
17      seq/profile.peaks.non-bias-corrected                     0.4199
19    seq/profile.peaks-union.bias-corrected                     0.4293
10                           nexus/binary.gw                     0.4294
11        nexus/profile.peaks.bias-corrected                     0.4389
12   nexus/profile.peaks.bias-corrected.augm                     0.4409
13    nexus/profile.peaks.non-bias-corrected                     0.4628

with pd.option_context("display.max_colwidth", 100):
    print(df_mpra.query('data=="synthetic"')[['model', 'metrics/synthetic/spearmanr']].
          sort_values("metrics/synthetic/spearmanr").
          to_string())

                                      model  metrics/synthetic/spearmanr
4                             seq/binary.gw                       0.6506
0                           nexus/binary.gw                       0.7786
3    nexus/profile.peaks.non-bias-corrected                       0.8456
5          seq/profile.peaks.bias-corrected                       0.8487
7      seq/profile.peaks.non-bias-corrected                       0.8518
9    seq/profile.peaks-union.bias-corrected                       0.8567
6     seq/profile.peaks.bias-corrected.augm                       0.8571
1        nexus/profile.peaks.bias-corrected                       0.8628
2   nexus/profile.peaks.bias-corrected.augm                       0.8742
8  nexus/profile.peaks-union.bias-corrected                       0.8749

df_act = pd.read_csv('../../../src/chipnexus/train/seqmodel/output/activity.csv')

df_act = df_act[df_act.split == 'test']

df_act[[
        'model_kwargs/bn', 
        'model_kwargs/n_hidden/0',
        'model_kwargs/pool_size',
        'model_kwargs/pool_type']].drop_duplicates()

with pd.option_context("display.max_colwidth", 100):
    print(df_act[['model',
        'model_kwargs/bn', 
        'model_kwargs/pool_size',
        'model_kwargs/pool_type', 'metrics/H3K27ac/spearmanr', 'metrics/PolII/spearmanr']].
          sort_values("metrics/PolII/spearmanr").
          to_string())

                                       model model_kwargs/bn  model_kwargs/pool_size model_kwargs/pool_type  metrics/H3K27ac/spearmanr  metrics/PolII/spearmanr
17          seq/profile.peaks.bias-corrected             NaN                    50.0                    max                     0.3094                   0.4672
80     seq/profile.peaks.bias-corrected.augm            True                     NaN             global_avg                     0.3126                   0.4686
26  nexus/profile.peaks-union.bias-corrected             NaN                    50.0                    max                     0.3205                   0.4733
77          seq/profile.peaks.bias-corrected            True                     NaN             global_avg                     0.3296                   0.4746
20     seq/profile.peaks.bias-corrected.augm             NaN                    50.0                    max                     0.3372                   0.4747
47          seq/profile.peaks.bias-corrected             NaN                     NaN             global_avg                     0.3195                   0.4750
65        nexus/profile.peaks.bias-corrected            True                     NaN             global_avg                     0.3466                   0.4786
23      seq/profile.peaks.non-bias-corrected             NaN                    50.0                    max                     0.3208                   0.4831
14                             seq/binary.gw             NaN                    50.0                    max                     0.3403                   0.4855
5         nexus/profile.peaks.bias-corrected             NaN                    50.0                    max                     0.3601                   0.4899
29    seq/profile.peaks-union.bias-corrected             NaN                    50.0                    max                     0.3366                   0.4911
11    nexus/profile.peaks.non-bias-corrected             NaN                    50.0                    max                     0.3657                   0.4919
2                            nexus/binary.gw             NaN                    50.0                    max                     0.3791                   0.4958
89    seq/profile.peaks-union.bias-corrected            True                     NaN             global_avg                     0.3597                   0.4998
41    nexus/profile.peaks.non-bias-corrected             NaN                     NaN             global_avg                     0.3734                   0.5014
8    nexus/profile.peaks.bias-corrected.augm             NaN                    50.0                    max                     0.3765                   0.5021
74                             seq/binary.gw            True                     NaN             global_avg                     0.3804                   0.5036
59    seq/profile.peaks-union.bias-corrected             NaN                     NaN             global_avg                     0.3641                   0.5045
50     seq/profile.peaks.bias-corrected.augm             NaN                     NaN             global_avg                     0.3640                   0.5062
53      seq/profile.peaks.non-bias-corrected             NaN                     NaN             global_avg                     0.3532                   0.5072
83      seq/profile.peaks.non-bias-corrected            True                     NaN             global_avg                     0.3613                   0.5077
44                             seq/binary.gw             NaN                     NaN             global_avg                     0.3790                   0.5082
56  nexus/profile.peaks-union.bias-corrected             NaN                     NaN             global_avg                     0.3547                   0.5084
35        nexus/profile.peaks.bias-corrected             NaN                     NaN             global_avg                     0.3758                   0.5117
86  nexus/profile.peaks-union.bias-corrected            True                     NaN             global_avg                     0.3708                   0.5195
71    nexus/profile.peaks.non-bias-corrected            True                     NaN             global_avg                     0.4038                   0.5214
38   nexus/profile.peaks.bias-corrected.augm             NaN                     NaN             global_avg                     0.3929                   0.5224
68   nexus/profile.peaks.bias-corrected.augm            True                     NaN             global_avg                     0.3968                   0.5234
32                           nexus/binary.gw             NaN                     NaN             global_avg                     0.4133                   0.5287
62                           nexus/binary.gw            True                     NaN             global_avg                     0.4231                   0.5288

df_act['model'] = df_act.exp.map(model_exps_inf)

df_act

df_act.head()

# mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
# figures = f"{ddir}/figures/model-evaluation/ChIP-nexus/MPRA"
# fig_genome = f"{ddir}/figures/activity/genome"

mdir = 'output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
figures = f"{ddir}/figures/model-evaluation/ChIP-nexus/MPRA"
fig_genome = f"{ddir}/figures/activity/genome"

def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
    if ax is None:
        fig, ax = plt.subplots(1)
    from scipy.stats import pearsonr, spearmanr
    xmax = max([y_true.max(), y_pred.max()])
    xmin = min([y_true.min(), y_pred.min()])
    
    if loglog:
        pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
        spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
    else:
        pearson, pearson_pval = pearsonr(y_true, y_pred)
        spearman, spearman_pval = spearmanr(y_true, y_pred)
    if loglog:
        plt_fn = ax.loglog
    else:
        plt_fn = ax.plot
        
    plt_fn(y_pred, y_true, ".", 
           markersize=markersize, 
           rasterized=True,
           alpha=alpha)
    ax.set_xlabel("Predicted (FC)")
    ax.set_ylabel("Observed (FC)")
    
    if same_lim:
        ax.set_xlim((xmin, xmax))
        ax.set_ylim((xmin, xmax))
    rp = r"$R_{p}$"
    rs = r"$R_{s}$"
    ax.set_title(task)
    ax.text(.95, .2, f"{rp}={pearson:.2f}",
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)
    ax.text(.95, .05, f"{rs}={spearman:.2f}", 
            verticalalignment='bottom',
            horizontalalignment='right',
            transform=ax.transAxes)

MPRA¶

Synthetic¶

preds = read_pkl(f"{mdir}/MPRA/synthetic/pool_size=15/RandomForestRegressor/n_estimators=100/out-of-fold.preds.pkl")

import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds['y_pred'][:,0], 2**preds['y_true'][:,0], markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")

preds = read_pkl(f"{mdir}/MPRA/genomic/pool_size=15/RandomForestRegressor/n_estimators=100/out-of-fold.preds.pkl")

import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds['y_pred'][:,0], 2**preds['y_true'][:,0], markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/gen.obs-vs-pred.pdf")
fig.savefig(f"{figures}/gen.obs-vs-pred.png")

Activity¶

preds = read_pkl(f"{mdir}/activity//peaks/H3K27ac;PolII/model_pool/pool_type=max;n_hidden=[64];pool_size=50/test.preds.pkl")

from sklearn.linear_model import LinearRegression

fig, axes = plt.subplots(1, 2, figsize=get_figsize(.7, 1/2))

for i, c in enumerate(preds['columns']):
    ax = axes[i]
    #lm = LinearRegression()
    #lm.fit(preds['y_pred'][:, i, np.newaxis], preds['y_true'][:, i])
    #y_pred = lm.predict(preds['y_pred'][:,i, np.newaxis])
    y_pred = preds['y_pred'][:, i]
    regression_eval(np.exp(y_pred), 
                    np.exp(preds['y_true'][:, i]),
                    same_lim=True,
                    markersize=2, alpha=0.4, ax=ax, loglog=True);
    ax.set_title(c)
    plt.tight_layout()
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.pdf")
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.png")

import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds, 2**dfs_syn.expression_fc_log2.values, markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")

ls {mdir}/'activity//peaks/H3K27ac;PolII/model_pool/pool_type=global_avg;n_hidden=[64];pool_size=null/test.preds.pkl'

kwargs.json  test.metrics.json   train.preds.pkl
model.h5     test.preds.pkl      valid.metrics.json
preproc.pkl  train.metrics.json  valid.preds.pkl

	data/fasta_file	data/intervals_file	data/use_bigwigs/H3K27ac/0	data/use_bigwigs/H3K27ac/1	data/use_bigwigs/PolII/0	data/use_bigwigs/PolII/1	exp	metrics/H3K27ac/mad	metrics/H3K27ac/mse	metrics/H3K27ac/pearsonr	metrics/H3K27ac/spearmanr	metrics/H3K27ac/var_explained	metrics/PolII/mad	metrics/PolII/mse	metrics/PolII/pearsonr	metrics/PolII/spearmanr	metrics/PolII/var_explained	model	model_kwargs/bn	model_kwargs/n_hidden/0	model_kwargs/pool_size	model_kwargs/pool_type	split	tasks/0	tasks/1
2	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,gw,OSNK,1,0,0,F...	0.7601	0.8623	0.3791	0.3791	0.1391	0.6481	0.6676	0.5685	0.4958	0.3187	model_pool	NaN	64	50.0	max	test	H3K27ac	PolII
5	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,peaks,OSNK,0,10...	0.7813	0.8803	0.3599	0.3601	0.1290	0.6659	0.6777	0.5604	0.4899	0.3137	model_pool	NaN	64	50.0	max	test	H3K27ac	PolII
8	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,peaks,OSNK,0,10...	0.7785	0.8764	0.3759	0.3765	0.1408	0.6575	0.6613	0.5745	0.5021	0.3300	model_pool	NaN	64	50.0	max	test	H3K27ac	PolII
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
83	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	seq,peaks,OSN,0,10,1,...	0.7758	0.8756	0.3633	0.3613	0.1307	0.6452	0.6460	0.5861	0.5077	0.3430	model_pool	True	64	NaN	global_avg	test	H3K27ac	PolII
86	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,nexus-seq-union...	0.7740	0.8697	0.3686	0.3708	0.1356	0.6495	0.6501	0.5821	0.5195	0.3389	model_pool	True	64	NaN	global_avg	test	H3K27ac	PolII
89	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	seq,nexus-seq-union,O...	0.7802	0.8764	0.3622	0.3597	0.1311	0.6545	0.6567	0.5773	0.4998	0.3332	model_pool	True	64	NaN	global_avg	test	H3K27ac	PolII

	data/fasta_file	data/intervals_file	data/use_bigwigs/H3K27ac/0	data/use_bigwigs/H3K27ac/1	data/use_bigwigs/PolII/0	data/use_bigwigs/PolII/1	exp	metrics/H3K27ac/mad	metrics/H3K27ac/mse	metrics/H3K27ac/pearsonr	metrics/H3K27ac/spearmanr	metrics/H3K27ac/var_explained	metrics/PolII/mad	metrics/PolII/mse	metrics/PolII/pearsonr	metrics/PolII/spearmanr	metrics/PolII/var_explained	model	model_kwargs/bn	model_kwargs/n_hidden/0	model_kwargs/pool_size	model_kwargs/pool_type	split	tasks/0	tasks/1
0	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,gw,OSNK,1,0,0,F...	0.7263	0.7783	0.4741	0.4736	0.2228	0.6365	0.6376	0.6022	0.5623	0.3625	model_pool	NaN	64	50.0	max	train	H3K27ac	PolII
1	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,gw,OSNK,1,0,0,F...	0.7674	0.8784	0.3644	0.3626	0.1246	0.6682	0.7267	0.5345	0.4726	0.2738	model_pool	NaN	64	50.0	max	valid	H3K27ac	PolII
2	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,gw,OSNK,1,0,0,F...	0.7601	0.8623	0.3791	0.3791	0.1391	0.6481	0.6676	0.5685	0.4958	0.3187	model_pool	NaN	64	50.0	max	test	H3K27ac	PolII
3	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,peaks,OSNK,0,10...	0.7480	0.8181	0.4290	0.4245	0.1824	0.6447	0.6527	0.5897	0.5461	0.3476	model_pool	NaN	64	50.0	max	train	H3K27ac	PolII
4	/mnt/data/pipeline_ge...	/users/avsec/workspac...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	/srv/scratch/avsec/wo...	nexus,peaks,OSNK,0,10...	0.7724	0.8703	0.3603	0.3579	0.1285	0.6659	0.6968	0.5430	0.4890	0.2923	model_pool	NaN	64	50.0	max	valid	H3K27ac	PolII