from basepair.utils import read_pkl
from basepair.imports import *
paper_config()
df_mpra = pd.read_csv('output/MPRA.csv')
from config import model_exps
model_exps_inf = {v:k for k,v in model_exps.items()}
df_mpra['model'] = df_mpra.exp.map(model_exps_inf)
with pd.option_context("display.max_colwidth", 100):
print(df_mpra.query('data=="genomic"')[['model', 'metrics/genomic/spearmanr']].
sort_values("metrics/genomic/spearmanr").
to_string())
with pd.option_context("display.max_colwidth", 100):
print(df_mpra.query('data=="synthetic"')[['model', 'metrics/synthetic/spearmanr']].
sort_values("metrics/synthetic/spearmanr").
to_string())
df_act = pd.read_csv('../../../src/chipnexus/train/seqmodel/output/activity.csv')
df_act = df_act[df_act.split == 'test']
df_act[[
'model_kwargs/bn',
'model_kwargs/n_hidden/0',
'model_kwargs/pool_size',
'model_kwargs/pool_type']].drop_duplicates()
with pd.option_context("display.max_colwidth", 100):
print(df_act[['model',
'model_kwargs/bn',
'model_kwargs/pool_size',
'model_kwargs/pool_type', 'metrics/H3K27ac/spearmanr', 'metrics/PolII/spearmanr']].
sort_values("metrics/PolII/spearmanr").
to_string())
df_act['model'] = df_act.exp.map(model_exps_inf)
df_act
df_act.head()
# mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
# figures = f"{ddir}/figures/model-evaluation/ChIP-nexus/MPRA"
# fig_genome = f"{ddir}/figures/activity/genome"
mdir = 'output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
figures = f"{ddir}/figures/model-evaluation/ChIP-nexus/MPRA"
fig_genome = f"{ddir}/figures/activity/genome"
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
if ax is None:
fig, ax = plt.subplots(1)
from scipy.stats import pearsonr, spearmanr
xmax = max([y_true.max(), y_pred.max()])
xmin = min([y_true.min(), y_pred.min()])
if loglog:
pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
else:
pearson, pearson_pval = pearsonr(y_true, y_pred)
spearman, spearman_pval = spearmanr(y_true, y_pred)
if loglog:
plt_fn = ax.loglog
else:
plt_fn = ax.plot
plt_fn(y_pred, y_true, ".",
markersize=markersize,
rasterized=True,
alpha=alpha)
ax.set_xlabel("Predicted (FC)")
ax.set_ylabel("Observed (FC)")
if same_lim:
ax.set_xlim((xmin, xmax))
ax.set_ylim((xmin, xmax))
rp = r"$R_{p}$"
rs = r"$R_{s}$"
ax.set_title(task)
ax.text(.95, .2, f"{rp}={pearson:.2f}",
verticalalignment='bottom',
horizontalalignment='right',
transform=ax.transAxes)
ax.text(.95, .05, f"{rs}={spearman:.2f}",
verticalalignment='bottom',
horizontalalignment='right',
transform=ax.transAxes)
preds = read_pkl(f"{mdir}/MPRA/synthetic/pool_size=15/RandomForestRegressor/n_estimators=100/out-of-fold.preds.pkl")
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds['y_pred'][:,0], 2**preds['y_true'][:,0], markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")
preds = read_pkl(f"{mdir}/MPRA/genomic/pool_size=15/RandomForestRegressor/n_estimators=100/out-of-fold.preds.pkl")
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds['y_pred'][:,0], 2**preds['y_true'][:,0], markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/gen.obs-vs-pred.pdf")
fig.savefig(f"{figures}/gen.obs-vs-pred.png")
preds = read_pkl(f"{mdir}/activity//peaks/H3K27ac;PolII/model_pool/pool_type=max;n_hidden=[64];pool_size=50/test.preds.pkl")
from sklearn.linear_model import LinearRegression
fig, axes = plt.subplots(1, 2, figsize=get_figsize(.7, 1/2))
for i, c in enumerate(preds['columns']):
ax = axes[i]
#lm = LinearRegression()
#lm.fit(preds['y_pred'][:, i, np.newaxis], preds['y_true'][:, i])
#y_pred = lm.predict(preds['y_pred'][:,i, np.newaxis])
y_pred = preds['y_pred'][:, i]
regression_eval(np.exp(y_pred),
np.exp(preds['y_true'][:, i]),
same_lim=True,
markersize=2, alpha=0.4, ax=ax, loglog=True);
ax.set_title(c)
plt.tight_layout()
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.pdf")
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.png")
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds, 2**dfs_syn.expression_fc_log2.values, markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")
ls {mdir}/'activity//peaks/H3K27ac;PolII/model_pool/pool_type=global_avg;n_hidden=[64];pool_size=null/test.preds.pkl'