import warnings
warnings.filterwarnings("ignore")
from basepair.exp.chipnexus.data import load_BarakCohnen_mESC_MPRA, load_BarakCohnen_mESC_MPRA_raw
from basepair.config import create_tf_session
create_tf_session(0)
figures = f"{ddir}/figures/model-evaluation/ChIP-nexus/MPRA"
fig_genome = f"{ddir}/figures/activity/genome"
!mkdir {fig_genome}
dfs_gen, dfs_syn = load_BarakCohnen_mESC_MPRA()
df_gen, df_syn = load_BarakCohnen_mESC_MPRA_raw()
dfs_gen.head()
dfs_syn.head()
df_gen.head()
df_gen.groupby("CRE_id").size().value_counts()
There are 8 different BC-seq per sequence
Genomic sequences were represented in the pool by 150 bp oligos with the following sequences:
GACTTACATTAGGGCCCGT[SEQ]AAGCTT[FILL]GAATTCTCTAGAC[BC]TGAGCTCGGACTACGATACTG
Where [SEQ] is either a reference (gWT) or mutated (gMUT) genomic sequence of 81-82 bps
CTTCTACTACTAGGGCCCA[SEQ]AAGCTT[FILL]GAATTCTCTAGAC[BC]TGAGCTCTACATGCTAGTTCATG
where [SEQ] is a 40-80 bp synthetic element comprised of concatenated 20 bp building blocks
of pluripotency sites, as described previously, with the fifth position of the KLF4 site changed to
‘T’ to facilitate cloning (Fiore and Cohen 2016) . [FILL] is a random filler sequence of variable
length to bring the total length of each sequence to 150 bp, and [BC] is a random 9 bp barcode.
from basepair.exp.chipnexus.simulate import generate_seq, random_seq
def gen_padded_sequence(sequence, barcode, seqlen=1000):
"""Generate MPRA construct sequence for Genomic sequences
"""
fill = random_seq(150 - len(sequence) - len(barcode) - 59)
core_seq = f"GACTTACATTAGGGCCCGT{sequence}AAGCTT{fill}GAATTCTCTAGAC{barcode}TGAGCTCGGACTACGATACTG"
assert len(core_seq) == 150
return generate_seq(core_seq, seqlen=seqlen)
def syn_padded_sequence(sequence, barcode, seqlen=1000):
"""Generate MPRA construct sequence for Synthetic sequences
"""
fill = random_seq(150 - len(sequence) - len(barcode) - 61)
core_seq = f'CTTCTACTACTAGGGCCCA{sequence}AAGCTT{fill}GAATTCTCTAGAC{barcode}TGAGCTCTACATGCTAGTTCATG'
assert len(core_seq) == 150
return generate_seq(core_seq, seqlen=seqlen)
def get_core_seq_ij(seqlen, core_seq_len=150):
start = (seqlen - core_seq_len) // 2
return start, start + core_seq_len
a = gen_padded_sequence("ACGTA", "ACGST")
assert len(a) == 1000
b = syn_padded_sequence("ACGTA", "ACGST")
assert len(a) == 1000
from basepair.seqmodel import SeqModel
mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
# mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE/'
# mdir = '../../../src/chipnexus/train/seqmodel/output/seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE/'
# mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2/'
bpnet = SeqModel.from_mdir(mdir)
first, don't average across multiple flanking sequences
from keras.models import Model, Sequential
import keras.layers as kl
from concise.preprocessing import encodeDNA
from concise.preprocessing.sequence import one_hot2string, DNA
bottleneck_model = bpnet.bottleneck_model()
bpnet_seq_syn = encodeDNA([syn_padded_sequence(s, "AAAGACGCG") for s in dfs_syn.Sequence.str.upper()])
bpnet_seq_gen = encodeDNA([gen_padded_sequence(s, "AAAGACGCG") for s in dfs_gen.Sequence.str.upper()])
import skimage.measure
%time bpnet_bottleneck_syn = bottleneck_model.predict(bpnet_seq_syn)
bpnet_bottleneck_syn.shape
%time bpnet_bottleneck_gen = bottleneck_model.predict(bpnet_seq_gen)
bpnet_bottleneck_syn.shape
# get the features
def pool_bottleneck(bottleneck, pool_size=37, outlen=80, agg_fn=np.max):
i,j = get_core_seq_ij(bottleneck.shape[1])
i += 19
j = i + outlen
out = skimage.measure.block_reduce(bottleneck[:,i:j, :, np.newaxis], (1, pool_size, 1, 1), agg_fn)
return out.reshape((out.shape[0], -1))
## train a simple model
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_predict
from basepair.plots import regression_eval
In the paper, they achieved R-squared of 0.87
bpnet_bottleneck_syn.shape
def regression_eval(y_true, y_pred, alpha=0.5, markersize=2, task="", ax=None, same_lim=False, loglog=False):
if ax is None:
fig, ax = plt.subplots(1)
from scipy.stats import pearsonr, spearmanr
xmax = max([y_true.max(), y_pred.max()])
xmin = min([y_true.min(), y_pred.min()])
if loglog:
pearson, pearson_pval = pearsonr(np.log10(y_true), np.log10(y_pred))
spearman, spearman_pval = spearmanr(np.log10(y_true), np.log(y_pred))
else:
pearson, pearson_pval = pearsonr(y_true, y_pred)
spearman, spearman_pval = spearmanr(y_true, y_pred)
if loglog:
plt_fn = ax.loglog
else:
plt_fn = ax.plot
plt_fn(y_pred, y_true, ".",
markersize=markersize,
rasterized=True,
alpha=alpha)
ax.set_xlabel("Predicted (FC)")
ax.set_ylabel("Observed (FC)")
if same_lim:
ax.set_xlim((xmin, xmax))
ax.set_ylim((xmin, xmax))
rp = r"$R_{p}$"
rs = r"$R_{s}$"
ax.set_title(task)
ax.text(.95, .2, f"{rp}={pearson:.2f}",
verticalalignment='bottom',
horizontalalignment='right',
transform=ax.transAxes)
ax.text(.95, .05, f"{rs}={spearman:.2f}",
verticalalignment='bottom',
horizontalalignment='right',
transform=ax.transAxes)
from basepair.imports import *
paper_config()
bpnet_bottleneck_syn_feat = pool_bottleneck(bpnet_bottleneck_syn, pool_size=15, outlen=80, agg_fn=np.max)
preds = cross_val_predict(RandomForestRegressor(100), bpnet_bottleneck_syn_feat, dfs_syn.expression_fc_log2.values, cv=5, n_jobs=-1)
!mkdir -p {figures}
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds, 2**dfs_syn.expression_fc_log2.values, markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 10, outlen=80)
# bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestRegressor(100), bpnet_bottleneck_gen_feat, dfs_gen.expression_fc_log2.values, cv=5, n_jobs=-1)
regression_eval(dfs_gen.expression_fc_log2, preds, alpha=0.4);
subset = dfs_gen.CRE_type=='Genomic'
regression_eval(dfs_gen.expression_fc_log2[subset], preds[subset], alpha=0.4);
rf_gen = RandomForestRegressor(100, n_jobs=-1)
rf_gen.fit(bpnet_bottleneck_gen_feat, dfs_gen.expression_fc_log2.values)
regression_eval(dfs_syn.expression_fc_log2, rf_gen.predict(bpnet_bottleneck_syn_feat), alpha=0.4);
rf_syn = RandomForestRegressor(100)
rf_syn.fit(bpnet_bottleneck_syn_feat, dfs_syn.expression_fc_log2.values)
regression_eval(dfs_gen.expression_fc_log2, rf_syn.predict(bpnet_bottleneck_gen_feat), alpha=0.4);
from basepair.stats import perc
from concise.eval_metrics import auc, auprc
genomic = dfs_gen.CRE_type=='Genomic'
neg = perc(dfs_gen.expression_fc_log2.values[genomic])< 25
pos = perc(dfs_gen.expression_fc_log2.values[genomic])> 75
subset = pos | neg
y = pos[subset]
y.mean()
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 5, outlen=80)
#bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 5, outlen=80)
bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])
bpnet_bottleneck_gen_feat = values_gen.values
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])
from basepair.utils import read_pkl
mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE/'
#mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE/'
# mdir = '../../../src/chipnexus/train/seqmodel/output/seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE/'
mdir = '../../../src/chipnexus/train/seqmodel/output/nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2/'
preds = read_pkl(f"{mdir}/activity//peaks/H3K27ac;PolII/model_pool/pool_type=max;n_hidden=[64];pool_size=50/test.preds.pkl")
from sklearn.linear_model import LinearRegression
fig, axes = plt.subplots(1, 2, figsize=get_figsize(.7, 1/2))
for i, c in enumerate(preds['columns']):
ax = axes[i]
#lm = LinearRegression()
#lm.fit(preds['y_pred'][:, i, np.newaxis], preds['y_true'][:, i])
#y_pred = lm.predict(preds['y_pred'][:,i, np.newaxis])
y_pred = preds['y_pred'][:, i]
regression_eval(np.exp(y_pred),
np.exp(preds['y_true'][:, i]),
same_lim=True,
markersize=2, alpha=0.4, ax=ax, loglog=True);
ax.set_title(c)
plt.tight_layout()
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.pdf")
fig.savefig(f"{fig_genome}/H3K27ac;PolII.obs-vs-pred.png")
import matplotlib.ticker as ticker
fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.25, aspect=1))
regression_eval(2**preds, 2**dfs_syn.expression_fc_log2.values, markersize=5, alpha=0.4, ax=ax);
xrange = [0.6, 2**4]
ax.set_ylim(xrange)
ax.set_xlim(xrange)
ax.plot(xrange, xrange, c='grey', alpha=0.2)
ax.set_xscale('log', basex=2)
ax.set_yscale('log', basey=2)
ax.xaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.xaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
ax.yaxis.set_major_locator(ticker.LogLocator(base=2, numticks=5))
ax.yaxis.set_major_formatter(ticker.FormatStrFormatter('%.0f'))
plt.minorticks_off()
fig.savefig(f"{figures}/syn.obs-vs-pred.pdf")
fig.savefig(f"{figures}/syn.obs-vs-pred.png")
ls {mdir}/'activity//peaks/H3K27ac;PolII/model_pool/pool_type=global_avg;n_hidden=[64];pool_size=null/test.preds.pkl'