import warnings
warnings.filterwarnings("ignore")
# Load the Barak Cohen lab mESC MPRA dataset:
# summarized per-CRE tables and raw per-barcode tables.
from basepair.exp.chipnexus.data import load_BarakCohnen_mESC_MPRA, load_BarakCohnen_mESC_MPRA_raw
# Summarized tables: genomic CREs (gen) and synthetic concatemer CREs (syn).
dfs_gen, dfs_syn = load_BarakCohnen_mESC_MPRA()
# Raw tables: one row per barcode measurement.
df_gen, df_syn = load_BarakCohnen_mESC_MPRA_raw()
dfs_gen.head()
dfs_syn.head()
df_gen.head()
# Distribution of barcode counts per CRE (expect 8 barcodes per sequence).
df_gen.groupby("CRE_id").size().value_counts()
There are 8 different barcode sequences (BC-seq) per CRE sequence.
Genomic sequences were represented in the pool by 150 bp oligos with the following sequences:
GACTTACATTAGGGCCCGT[SEQ]AAGCTT[FILL]GAATTCTCTAGAC[BC]TGAGCTCGGACTACGATACTG
Where [SEQ] is either a reference (gWT) or mutated (gMUT) genomic sequence of 81-82 bps
CTTCTACTACTAGGGCCCA[SEQ]AAGCTT[FILL]GAATTCTCTAGAC[BC]TGAGCTCTACATGCTAGTTCATG
where [SEQ] is a 40-80 bp synthetic element comprised of concatenated 20 bp building blocks
of pluripotency sites, as described previously, with the fifth position of the KLF4 site changed to
‘T’ to facilitate cloning (Fiore and Cohen 2016) . [FILL] is a random filler sequence of variable
length to bring the total length of each sequence to 150 bp, and [BC] is a random 9 bp barcode.
from basepair.exp.chipnexus.simulate import generate_seq, random_seq
def gen_padded_sequence(sequence, barcode, seqlen=1000):
    """Assemble the 150 bp genomic MPRA construct and embed it in a
    random background of length `seqlen`.

    Construct layout (see the paper's description above):
    adapter + [SEQ] + AAGCTT + [FILL] + GAATTCTCTAGAC + [BC] + adapter,
    where [FILL] is random sequence sized so the construct is exactly 150 bp.
    """
    # The constant adapter/linker segments below sum to 59 bp.
    constant_len = 59
    fill = random_seq(150 - len(sequence) - len(barcode) - constant_len)
    core_seq = ("GACTTACATTAGGGCCCGT" + sequence + "AAGCTT" + fill
                + "GAATTCTCTAGAC" + barcode + "TGAGCTCGGACTACGATACTG")
    assert len(core_seq) == 150
    return generate_seq(core_seq, seqlen=seqlen)
def syn_padded_sequence(sequence, barcode, seqlen=1000):
    """Assemble the 150 bp synthetic MPRA construct and embed it in a
    random background of length `seqlen`.

    Same layout as `gen_padded_sequence` but with the synthetic-library
    adapters (the 3' adapter is 2 bp longer, hence 61 constant bp).
    """
    constant_len = 61
    fill = random_seq(150 - len(sequence) - len(barcode) - constant_len)
    core_seq = ("CTTCTACTACTAGGGCCCA" + sequence + "AAGCTT" + fill
                + "GAATTCTCTAGAC" + barcode + "TGAGCTCTACATGCTAGTTCATG")
    assert len(core_seq) == 150
    return generate_seq(core_seq, seqlen=seqlen)
def get_core_seq_ij(seqlen, core_seq_len=150):
    """Return (start, end) indices of a centered `core_seq_len` window
    within a sequence of length `seqlen`."""
    offset = (seqlen - core_seq_len) // 2
    return offset, offset + core_seq_len
# Sanity-check that both construct generators pad to the default 1000 bp.
a = gen_padded_sequence("ACGTA", "ACGST")
assert len(a) == 1000
b = syn_padded_sequence("ACGTA", "ACGST")
# BUG FIX: the original re-asserted len(a) here, leaving the synthetic
# construct unchecked; assert len(b) instead.
assert len(b) == 1000
from basepair.BPNet import BPNetPredictor
# TODO - change to {ddir}/...
# Pre-trained BPNet model directory (Oct4/Sox2/Nanog/Klf tasks, per the path).
mdir = '/s/project/avsec/basepair/models/oct-sox-nanog-klf'
# IPython shell magic: list the model directory contents.
ls {mdir}
bpnet = BPNetPredictor.from_mdir(mdir)
# Inspect the model's task names (used throughout below).
bpnet.tasks
First pass: do not average predictions across multiple random flanking sequences.
from concise.preprocessing import encodeDNA
from concise.preprocessing.sequence import one_hot2string, DNA
from basepair.utils import write_pkl, read_pkl
from pathlib import Path
# BPNet predictions are expensive; cache them to disk.
cache_dir = Path("/s/project/avsec/basepair/BarakCohen-mESC-MPRA/cache")
# Set to False to force recomputation of the predictions.
cache=True
if cache and os.path.exists(cache_dir / "bpnet_preds.pkl"):
d = read_pkl(cache_dir / "bpnet_preds.pkl")
bpnet_seq_preds_gen, bpnet_seq_preds_syn = d['gen'], d['syn']
else:
bpnet_seq_syn = encodeDNA([syn_padded_sequence(s, "AAAGACGCG") for s in dfs_syn.Sequence.str.upper()])
bpnet_seq_gen = encodeDNA([gen_padded_sequence(s, "AAAGACGCG") for s in dfs_gen.Sequence.str.upper()])
bpnet_seq_preds_gen = bpnet.seq_predict(bpnet_seq_gen)
bpnet_seq_preds_syn = bpnet.seq_predict(bpnet_seq_syn)
write_pkl(dict(gen=bpnet_seq_preds_gen, syn=bpnet_seq_preds_syn), cache_dir / "bpnet_preds.pkl")
!du -sh {cache_dir}/
from basepair.plot.tracks import plot_tracks, filter_tracks
from collections import OrderedDict
def bpnet_format_output(obj, tasks, output_features=['p', 'i'], outlen=150):
    """Convert one BPNet prediction dict into plottable tracks.

    'p' tracks are the scaled profile predictions; 'i' tracks are
    importance scores (one-hot sequence times the mean of positive- and
    negative-strand profile gradients). Tracks are cropped to a centered
    window of `outlen` positions.
    """
    importance = {}
    for ti, task in enumerate(tasks):
        grads = obj['grads'][ti]['profile']
        importance[task] = obj['seq'] * (grads['pos'] + grads['neg']) / 2
    out = dict(p=obj['preds']['scaled_profile'], i=importance)
    to_plot = OrderedDict()
    for feat in output_features:
        for task in tasks:
            to_plot[f"{task}/{feat}"] = out[feat][task]
    seqlen = obj['seq'].shape[0]
    start = (seqlen - outlen) // 2
    return filter_tracks(to_plot, xlim=[start, start + outlen])
# Exploratory: maximum absolute importance score over all synthetic
# constructs and tasks (recomputed again below for ylim_max).
max([np.abs(bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_syn))])
# Same for the genomic constructs.
max([np.abs(bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_gen))])
# pre-compute the ranges for the importance scores
ylim_max = max(max([np.abs(bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_syn))]),
max([np.abs(bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_gen))]))
dfs_gen.head()
# NOTE(review): `i` below is not defined by any visible preceding statement
# (comprehension variables do not leak in Python 3), so mean_vals reflects a
# single arbitrary construct — confirm the intended index.
mean_vals = OrderedDict([(k,v.mean()) for k,v in bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks, output_features=['p', 'i']).items()])
# Plot per-task importance tracks for the 10 most expressed genomic sequences.
for i in dfs_gen.expression_fc.sort_values().tail(10).index:
    tracks = bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_gen.iloc[i].CRE_id);
# Plot per-task importance tracks for the 5 least expressed genomic sequences.
for i in dfs_gen.expression_fc.sort_values().head(5).index:
    tracks = bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_gen.iloc[i].CRE_id);
# Plot importance tracks for the 10 most expressed synthetic constructs
# (use a random sample of indices instead to browse the library).
for i in dfs_syn.expression_fc.sort_values().tail(10).index:
    tracks = bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_syn.iloc[i].CRE_elements);
# Plot importance tracks for the 5 least expressed synthetic constructs.
for i in dfs_syn.expression_fc.sort_values().head(5).index:
    tracks = bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_syn.iloc[i].CRE_elements);
# Per-construct summary features: the max and mean of every predicted
# profile (p) and importance (i) track. Note np.max.__name__ == 'amax',
# hence the '<task>/p/amax' style column names.
agg_fns = [np.max, np.mean]
values_gen = pd.DataFrame([
    OrderedDict([
        (k + "/" + fn.__name__, fn(v))
        for k, v in bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks,
                                        output_features=['p', 'i']).items()
        for fn in agg_fns
    ])
    for i in range(len(bpnet_seq_preds_gen))
])
values_syn = pd.DataFrame([
    OrderedDict([
        (k + "/" + fn.__name__, fn(v))
        for k, v in bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks,
                                        output_features=['p', 'i']).items()
        for fn in agg_fns
    ])
    for i in range(len(bpnet_seq_preds_syn))
])
# Task-aggregated features: sum each statistic across all TF tasks.
for f in ['p/amax', 'i/amax', 'p/mean', 'i/mean']:
    values_gen[f] = values_gen[[t + "/" + f for t in bpnet.tasks]].sum(axis=1)
for f in ['p/amax', 'i/amax', 'p/mean', 'i/mean']:
    values_syn[f] = values_syn[[t + "/" + f for t in bpnet.tasks]].sum(axis=1)
from basepair.modisco.motif_clustering import df_minmax_scale
# Min-max scale the features and attach them to the expression tables.
# NOTE(review): positional `1` is the axis argument to pd.concat; newer
# pandas deprecates positional axis — consider axis=1 explicitly.
dfs_gen_val = pd.concat([df_minmax_scale(values_gen), dfs_gen], 1)
dfs_syn_val = pd.concat([df_minmax_scale(values_syn), dfs_syn], 1)
dfs_gen_val.head()
# Long format: one row per (construct, feature) pair for faceted plotting.
dfm_gen = dfs_gen_val.melt(['expression_fc_log2', 'CRE_type', 'interval'], list(values_gen))
dfm_gen.head()
dfm_syn = dfs_syn_val.melt(['expression_fc_log2', 'CRE_elements'], list(values_syn))
dfm_syn.head()
import matplotlib as mpl
mpl.style.use('default')
# Spearman correlation of each feature with log2 expression; .iloc[0::2,-1]
# picks the expression-vs-value entry of each 2x2 correlation block.
dfm_gen.groupby(['CRE_type', 'variable'])[['expression_fc_log2', 'value']].corr('spearman').iloc[0::2,-1].plot.barh(figsize=(5, 8))
dfm_syn.groupby(['variable'])[['expression_fc_log2', 'value']].corr('spearman').iloc[0::2,-1].plot.barh(figsize=(3, 5))
# NOTE(review): `plt` (matplotlib.pyplot) is not imported in this file's
# visible portion — presumably imported in an unseen cell.
fig, ax = plt.subplots(figsize=(4,3))
dfs_gen_val.plot.scatter('i/amax', 'expression_fc_log2', alpha=0.2, ax=ax);
import plotnine
from plotnine import *
# Faceted scatter + linear fit of each feature vs. log2 expression,
# for genomic wild-type constructs...
plotnine.options.figure_size = (25, 4)
ggplot(aes(x='value', y='expression_fc_log2'), dfm_gen[dfm_gen.CRE_type=='Genomic']) + geom_point(alpha=0.1) + geom_smooth(alpha=0.1, color='grey', linetype='dashed', method='lm') + facet_wrap("~variable", ncol=10) +theme_classic()
# ...for the all-mutated genomic constructs...
plotnine.options.figure_size = (25, 4)
ggplot(aes(x='value', y='expression_fc_log2'), dfm_gen[dfm_gen.CRE_type=='All_Mutated']) + geom_point(alpha=0.1) + geom_smooth(alpha=0.1, color='grey', linetype='dashed', method='lm') + facet_wrap("~variable", ncol=10) +theme_classic()
# ...and for the synthetic constructs.
plotnine.options.figure_size = (25, 4)
ggplot(aes(x='value', y='expression_fc_log2'), dfm_syn) + geom_point(alpha=0.1) + geom_smooth(alpha=0.1, color='grey', linetype='dashed', method='lm') + facet_wrap("~variable", ncol=10) + theme_classic()
bpnet.model
from keras.models import Model, Sequential
import keras.layers as kl
# Tap the model at its "reshape_1" layer to extract bottleneck activations.
bpnet.model.get_layer("reshape_1").output
bottleneck_model = Model(bpnet.model.inputs, bpnet.model.get_layer("reshape_1").output)
# Re-encode the constructs (needed when the cached-predictions branch above
# skipped the encoding step).
bpnet_seq_syn = encodeDNA([syn_padded_sequence(s, "AAAGACGCG") for s in dfs_syn.Sequence.str.upper()])
bpnet_seq_gen = encodeDNA([gen_padded_sequence(s, "AAAGACGCG") for s in dfs_gen.Sequence.str.upper()])
import skimage.measure
# IPython %time magics: compute bottleneck activations for both libraries.
%time bpnet_bottleneck_syn = bottleneck_model.predict(bpnet_seq_syn)
%time bpnet_bottleneck_gen = bottleneck_model.predict(bpnet_seq_gen)
bpnet_bottleneck_syn.shape
# get the features
def pool_bottleneck(bottleneck, pool_size=37, outlen=80, agg_fn=np.max):
    """Pool bottleneck activations over the CRE region into flat features.

    Crops to `outlen` positions starting 19 bp into the centered 150 bp
    construct window (skipping the 19 bp constant 5' adapter), pools along
    the positional axis in blocks of `pool_size` with `agg_fn`, and
    flattens to shape (n_sequences, n_features).
    """
    start, _ = get_core_seq_ij(bottleneck.shape[1])
    start += 19  # skip the 19 bp constant 5' adapter
    end = start + outlen
    pooled = skimage.measure.block_reduce(bottleneck[:, start:end],
                                          (1, pool_size, 1, 1), agg_fn)
    return pooled.reshape((pooled.shape[0], -1))
# NOTE(review): `bpnet_bottleneck_syn_gen` is not defined anywhere in the
# visible file — likely a typo for `bpnet_bottleneck_syn`; as written this
# line raises NameError. Confirm the intended variable.
bpnet_bottleneck_syn_gen.shape
## Train a simple model on the pooled bottleneck features
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_predict
from basepair.plots import regression_eval
In the paper, the authors achieved an R-squared of 0.87.
# Pooled bottleneck features for the synthetic library.
bpnet_bottleneck_syn_feat = pool_bottleneck(bpnet_bottleneck_syn, pool_size=10, outlen=80, agg_fn=np.max)
# add also the individual features
bpnet_bottleneck_syn_feat = np.concatenate([values_syn.values, bpnet_bottleneck_syn_feat], 1)
# 5-fold out-of-fold predictions of log2 expression from BPNet features.
preds = cross_val_predict(RandomForestRegressor(100), bpnet_bottleneck_syn_feat, dfs_syn.expression_fc_log2.values, cv=5, n_jobs=-1)
# NOTE(review): arguments here are (preds, truth) but (truth, preds) below
# at the genomic evaluation — confirm regression_eval's expected order.
regression_eval(preds, dfs_syn.expression_fc_log2.values, alpha=0.4);
# Same pipeline for the genomic library.
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 10, outlen=80)
bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestRegressor(100), bpnet_bottleneck_gen_feat, dfs_gen.expression_fc_log2.values, cv=5, n_jobs=-1)
regression_eval(dfs_gen.expression_fc_log2, preds, alpha=0.4);
# Evaluate on the wild-type genomic subset only.
subset = dfs_gen.CRE_type=='Genomic'
regression_eval(dfs_gen.expression_fc_log2[subset], preds[subset], alpha=0.4);
# Cross-library transfer: train on genomic, evaluate on synthetic...
rf_gen = RandomForestRegressor(100, n_jobs=-1)
rf_gen.fit(bpnet_bottleneck_gen_feat, dfs_gen.expression_fc_log2.values)
regression_eval(dfs_syn.expression_fc_log2, rf_gen.predict(bpnet_bottleneck_syn_feat), alpha=0.4);
# ...and train on synthetic, evaluate on genomic.
rf_syn = RandomForestRegressor(100)
rf_syn.fit(bpnet_bottleneck_syn_feat, dfs_syn.expression_fc_log2.values)
regression_eval(dfs_gen.expression_fc_log2, rf_syn.predict(bpnet_bottleneck_gen_feat), alpha=0.4);
from basepair.stats import perc
from concise.eval_metrics import auc, auprc
# Binary task: distinguish the top expression quartile from the bottom
# quartile among wild-type genomic CREs.
genomic = dfs_gen.CRE_type=='Genomic'
# `perc` presumably returns percentile ranks of the values — TODO confirm.
neg = perc(dfs_gen.expression_fc_log2.values[genomic])< 25
pos = perc(dfs_gen.expression_fc_log2.values[genomic])> 75
subset = pos | neg
y = pos[subset]
# Class balance (should be ~0.5 by construction).
y.mean()
# Variant 1: pooled bottleneck features only.
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 5, outlen=80)
#bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])
# Variant 2: pooled bottleneck features plus the summary features.
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 5, outlen=80)
bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])
# Variant 3: summary features only.
bpnet_bottleneck_gen_feat = values_gen.values
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])