import warnings
warnings.filterwarnings("ignore")
# Load the Barak Cohen lab mESC MPRA dataset:
# summarized per-CRE tables and raw per-barcode tables.
from basepair.exp.chipnexus.data import load_BarakCohnen_mESC_MPRA, load_BarakCohnen_mESC_MPRA_raw
# Summarized tables: genomic CREs (gen) and synthetic concatemer CREs (syn).
dfs_gen, dfs_syn = load_BarakCohnen_mESC_MPRA()
# Raw tables: one row per barcode measurement.
df_gen, df_syn = load_BarakCohnen_mESC_MPRA_raw()
dfs_gen.head()
dfs_syn.head()
df_gen.head()
# Distribution of barcode counts per CRE (expect 8 barcodes per sequence).
df_gen.groupby("CRE_id").size().value_counts()
There are 8 different barcode sequences (BC-seq) per CRE sequence.
Genomic sequences were represented in the pool by 150 bp oligos with the following sequences:
GACTTACATTAGGGCCCGT[SEQ]AAGCTT[FILL]GAATTCTCTAGAC[BC]TGAGCTCGGACTACGATACTG
Where [SEQ] is either a reference (gWT) or mutated (gMUT) genomic sequence of 81-82 bps
CTTCTACTACTAGGGCCCA[SEQ]AAGCTT[FILL]GAATTCTCTAGAC[BC]TGAGCTCTACATGCTAGTTCATG
where [SEQ] is a 40-80 bp synthetic element comprised of concatenated 20 bp building blocks
of pluripotency sites, as described previously, with the fifth position of the KLF4 site changed to
‘T’ to facilitate cloning (Fiore and Cohen 2016) . [FILL] is a random filler sequence of variable
length to bring the total length of each sequence to 150 bp, and [BC] is a random 9 bp barcode.
from basepair.exp.chipnexus.simulate import generate_seq, random_seq
def gen_padded_sequence(sequence, barcode, seqlen=1000):
    """Assemble the 150 bp genomic MPRA construct and embed it in a
    random background of length `seqlen`.

    Construct layout (see the paper's description above):
    adapter + [SEQ] + AAGCTT + [FILL] + GAATTCTCTAGAC + [BC] + adapter,
    where [FILL] is random sequence sized so the construct is exactly 150 bp.
    """
    # The constant adapter/linker segments below sum to 59 bp.
    constant_len = 59
    fill = random_seq(150 - len(sequence) - len(barcode) - constant_len)
    core_seq = ("GACTTACATTAGGGCCCGT" + sequence + "AAGCTT" + fill
                + "GAATTCTCTAGAC" + barcode + "TGAGCTCGGACTACGATACTG")
    assert len(core_seq) == 150
    return generate_seq(core_seq, seqlen=seqlen)
def syn_padded_sequence(sequence, barcode, seqlen=1000):
    """Assemble the 150 bp synthetic MPRA construct and embed it in a
    random background of length `seqlen`.

    Same layout as `gen_padded_sequence` but with the synthetic-library
    adapters (the 3' adapter is 2 bp longer, hence 61 constant bp).
    """
    constant_len = 61
    fill = random_seq(150 - len(sequence) - len(barcode) - constant_len)
    core_seq = ("CTTCTACTACTAGGGCCCA" + sequence + "AAGCTT" + fill
                + "GAATTCTCTAGAC" + barcode + "TGAGCTCTACATGCTAGTTCATG")
    assert len(core_seq) == 150
    return generate_seq(core_seq, seqlen=seqlen)
def get_core_seq_ij(seqlen, core_seq_len=150):
    """Return (start, end) indices of a centered `core_seq_len` window
    within a sequence of length `seqlen`."""
    offset = (seqlen - core_seq_len) // 2
    return offset, offset + core_seq_len
# Sanity-check that both construct generators pad to the default 1000 bp.
a = gen_padded_sequence("ACGTA", "ACGST")
assert len(a) == 1000
b = syn_padded_sequence("ACGTA", "ACGST")
# BUG FIX: the original re-asserted len(a) here, leaving the synthetic
# construct unchecked; assert len(b) instead.
assert len(b) == 1000
from basepair.BPNet import BPNetPredictor
# TODO - change to {ddir}/...
# Pre-trained BPNet model directory (Oct4/Sox2/Nanog/Klf tasks, per the path).
mdir = '/s/project/avsec/basepair/models/oct-sox-nanog-klf'
# IPython shell magic: list the model directory contents.
ls {mdir}
bpnet = BPNetPredictor.from_mdir(mdir)
# Inspect the model's task names (used throughout below).
bpnet.tasks
First pass: do not average predictions across multiple random flanking sequences.
from concise.preprocessing import encodeDNA
from concise.preprocessing.sequence import one_hot2string, DNA
from basepair.utils import write_pkl, read_pkl
from pathlib import Path
# BPNet predictions are expensive; cache them to disk.
cache_dir = Path("/s/project/avsec/basepair/BarakCohen-mESC-MPRA/cache")
# Set to False to force recomputation of the predictions.
cache=True
if cache and os.path.exists(cache_dir / "bpnet_preds.pkl"):
d = read_pkl(cache_dir / "bpnet_preds.pkl")
bpnet_seq_preds_gen, bpnet_seq_preds_syn = d['gen'], d['syn']
else:
bpnet_seq_syn = encodeDNA([syn_padded_sequence(s, "AAAGACGCG") for s in dfs_syn.Sequence.str.upper()])
bpnet_seq_gen = encodeDNA([gen_padded_sequence(s, "AAAGACGCG") for s in dfs_gen.Sequence.str.upper()])
bpnet_seq_preds_gen = bpnet.seq_predict(bpnet_seq_gen)
bpnet_seq_preds_syn = bpnet.seq_predict(bpnet_seq_syn)
write_pkl(dict(gen=bpnet_seq_preds_gen, syn=bpnet_seq_preds_syn), cache_dir / "bpnet_preds.pkl")
!du -sh {cache_dir}/
from basepair.plot.tracks import plot_tracks, filter_tracks
from collections import OrderedDict
def bpnet_format_output(obj, tasks, output_features=['p', 'i'], outlen=150):
    """Convert one BPNet prediction dict into plottable tracks.

    'p' tracks are the scaled profile predictions; 'i' tracks are
    importance scores (one-hot sequence times the mean of positive- and
    negative-strand profile gradients). Tracks are cropped to a centered
    window of `outlen` positions.
    """
    importance = {}
    for ti, task in enumerate(tasks):
        grads = obj['grads'][ti]['profile']
        importance[task] = obj['seq'] * (grads['pos'] + grads['neg']) / 2
    out = dict(p=obj['preds']['scaled_profile'], i=importance)
    to_plot = OrderedDict()
    for feat in output_features:
        for task in tasks:
            to_plot[f"{task}/{feat}"] = out[feat][task]
    seqlen = obj['seq'].shape[0]
    start = (seqlen - outlen) // 2
    return filter_tracks(to_plot, xlim=[start, start + outlen])
# Exploratory: maximum absolute importance score over all synthetic
# constructs and tasks (recomputed again below for ylim_max).
max([np.abs(bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_syn))])
# Same for the genomic constructs.
max([np.abs(bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_gen))])
# pre-compute the ranges for the importance scores
ylim_max = max(max([np.abs(bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_syn))]),
max([np.abs(bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks, output_features=['i'])[f'{task}/i']).max() for task in bpnet.tasks for i in range(len(bpnet_seq_preds_gen))]))
dfs_gen.head()
# NOTE(review): `i` below is not defined by any visible preceding statement
# (comprehension variables do not leak in Python 3), so mean_vals reflects a
# single arbitrary construct — confirm the intended index.
mean_vals = OrderedDict([(k,v.mean()) for k,v in bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks, output_features=['p', 'i']).items()])
# Plot per-task importance tracks for the 10 most expressed genomic sequences.
for i in dfs_gen.expression_fc.sort_values().tail(10).index:
    tracks = bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_gen.iloc[i].CRE_id);
# Plot per-task importance tracks for the 5 least expressed genomic sequences.
for i in dfs_gen.expression_fc.sort_values().head(5).index:
    tracks = bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_gen.iloc[i].CRE_id);
# Plot importance tracks for the 10 most expressed synthetic constructs
# (use a random sample of indices instead to browse the library).
for i in dfs_syn.expression_fc.sort_values().tail(10).index:
    tracks = bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_syn.iloc[i].CRE_elements);
# Plot importance tracks for the 5 least expressed synthetic constructs.
for i in dfs_syn.expression_fc.sort_values().head(5).index:
    tracks = bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks,
                                 output_features=['i'])  # add 'p' to also show profiles
    plot_tracks(tracks,
                fig_width=14, fig_height_per_track=1.5,
                ylim=(-ylim_max, ylim_max),
                title=dfs_syn.iloc[i].CRE_elements);
# Per-construct summary features: the max and mean of every predicted
# profile (p) and importance (i) track. Note np.max.__name__ == 'amax',
# hence the '<task>/p/amax' style column names.
agg_fns = [np.max, np.mean]
values_gen = pd.DataFrame([
    OrderedDict([
        (k + "/" + fn.__name__, fn(v))
        for k, v in bpnet_format_output(bpnet_seq_preds_gen[i], bpnet.tasks,
                                        output_features=['p', 'i']).items()
        for fn in agg_fns
    ])
    for i in range(len(bpnet_seq_preds_gen))
])
values_syn = pd.DataFrame([
    OrderedDict([
        (k + "/" + fn.__name__, fn(v))
        for k, v in bpnet_format_output(bpnet_seq_preds_syn[i], bpnet.tasks,
                                        output_features=['p', 'i']).items()
        for fn in agg_fns
    ])
    for i in range(len(bpnet_seq_preds_syn))
])
# Task-aggregated features: sum each statistic across all TF tasks.
for f in ['p/amax', 'i/amax', 'p/mean', 'i/mean']:
    values_gen[f] = values_gen[[t + "/" + f for t in bpnet.tasks]].sum(axis=1)
for f in ['p/amax', 'i/amax', 'p/mean', 'i/mean']:
    values_syn[f] = values_syn[[t + "/" + f for t in bpnet.tasks]].sum(axis=1)
from basepair.modisco.motif_clustering import df_minmax_scale
# Min-max scale the features and attach them to the expression tables.
# NOTE(review): positional `1` is the axis argument to pd.concat; newer
# pandas deprecates positional axis — consider axis=1 explicitly.
dfs_gen_val = pd.concat([df_minmax_scale(values_gen), dfs_gen], 1)
dfs_syn_val = pd.concat([df_minmax_scale(values_syn), dfs_syn], 1)
dfs_gen_val.head()
# Long format: one row per (construct, feature) pair for faceted plotting.
dfm_gen = dfs_gen_val.melt(['expression_fc_log2', 'CRE_type', 'interval'], list(values_gen))
dfm_gen.head()
dfm_syn = dfs_syn_val.melt(['expression_fc_log2', 'CRE_elements'], list(values_syn))
dfm_syn.head()
import matplotlib as mpl
mpl.style.use('default')
# Spearman correlation of each feature with log2 expression; .iloc[0::2,-1]
# picks the expression-vs-value entry of each 2x2 correlation block.
dfm_gen.groupby(['CRE_type', 'variable'])[['expression_fc_log2', 'value']].corr('spearman').iloc[0::2,-1].plot.barh(figsize=(5, 8))
dfm_syn.groupby(['variable'])[['expression_fc_log2', 'value']].corr('spearman').iloc[0::2,-1].plot.barh(figsize=(3, 5))
# NOTE(review): `plt` (matplotlib.pyplot) is not imported in this file's
# visible portion — presumably imported in an unseen cell.
fig, ax = plt.subplots(figsize=(4,3))
dfs_gen_val.plot.scatter('i/amax', 'expression_fc_log2', alpha=0.2, ax=ax);
import plotnine
from plotnine import *
# Faceted scatter + linear fit of each feature vs. log2 expression,
# for genomic wild-type constructs...
plotnine.options.figure_size = (25, 4)
ggplot(aes(x='value', y='expression_fc_log2'), dfm_gen[dfm_gen.CRE_type=='Genomic']) + geom_point(alpha=0.1) + geom_smooth(alpha=0.1, color='grey', linetype='dashed', method='lm') + facet_wrap("~variable", ncol=10) +theme_classic()
# ...for the all-mutated genomic constructs...
plotnine.options.figure_size = (25, 4)
ggplot(aes(x='value', y='expression_fc_log2'), dfm_gen[dfm_gen.CRE_type=='All_Mutated']) + geom_point(alpha=0.1) + geom_smooth(alpha=0.1, color='grey', linetype='dashed', method='lm') + facet_wrap("~variable", ncol=10) +theme_classic()
# ...and for the synthetic constructs.
plotnine.options.figure_size = (25, 4)
ggplot(aes(x='value', y='expression_fc_log2'), dfm_syn) + geom_point(alpha=0.1) + geom_smooth(alpha=0.1, color='grey', linetype='dashed', method='lm') + facet_wrap("~variable", ncol=10) + theme_classic()
bpnet.model
from keras.models import Model, Sequential
import keras.layers as kl
# Tap the model at its "reshape_1" layer to extract bottleneck activations.
bpnet.model.get_layer("reshape_1").output
bottleneck_model = Model(bpnet.model.inputs, bpnet.model.get_layer("reshape_1").output)
# Re-encode the constructs (needed when the cached-predictions branch above
# skipped the encoding step).
bpnet_seq_syn = encodeDNA([syn_padded_sequence(s, "AAAGACGCG") for s in dfs_syn.Sequence.str.upper()])
bpnet_seq_gen = encodeDNA([gen_padded_sequence(s, "AAAGACGCG") for s in dfs_gen.Sequence.str.upper()])
import skimage.measure
# IPython %time magics: compute bottleneck activations for both libraries.
%time bpnet_bottleneck_syn = bottleneck_model.predict(bpnet_seq_syn)
%time bpnet_bottleneck_gen = bottleneck_model.predict(bpnet_seq_gen)
bpnet_bottleneck_syn.shape
# get the features
def pool_bottleneck(bottleneck, pool_size=37, outlen=80, agg_fn=np.max):
    """Pool bottleneck activations over the CRE region into flat features.

    Crops to `outlen` positions starting 19 bp into the centered 150 bp
    construct window (skipping the 19 bp constant 5' adapter), pools along
    the positional axis in blocks of `pool_size` with `agg_fn`, and
    flattens to shape (n_sequences, n_features).
    """
    start, _ = get_core_seq_ij(bottleneck.shape[1])
    start += 19  # skip the 19 bp constant 5' adapter
    end = start + outlen
    pooled = skimage.measure.block_reduce(bottleneck[:, start:end],
                                          (1, pool_size, 1, 1), agg_fn)
    return pooled.reshape((pooled.shape[0], -1))
# NOTE(review): `bpnet_bottleneck_syn_gen` is not defined anywhere in the
# visible file — likely a typo for `bpnet_bottleneck_syn`; as written this
# line raises NameError. Confirm the intended variable.
bpnet_bottleneck_syn_gen.shape
## Train a simple model on the pooled bottleneck features
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LassoCV
from sklearn.model_selection import cross_val_predict
from basepair.plots import regression_eval
In the paper, the authors achieved an R-squared of 0.87.
# Pooled bottleneck features for the synthetic library.
bpnet_bottleneck_syn_feat = pool_bottleneck(bpnet_bottleneck_syn, pool_size=10, outlen=80, agg_fn=np.max)
# add also the individual features
bpnet_bottleneck_syn_feat = np.concatenate([values_syn.values, bpnet_bottleneck_syn_feat], 1)
# 5-fold out-of-fold predictions of log2 expression from BPNet features.
preds = cross_val_predict(RandomForestRegressor(100), bpnet_bottleneck_syn_feat, dfs_syn.expression_fc_log2.values, cv=5, n_jobs=-1)
# NOTE(review): arguments here are (preds, truth) but (truth, preds) below
# at the genomic evaluation — confirm regression_eval's expected order.
regression_eval(preds, dfs_syn.expression_fc_log2.values, alpha=0.4);
# Same pipeline for the genomic library.
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 10, outlen=80)
bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestRegressor(100), bpnet_bottleneck_gen_feat, dfs_gen.expression_fc_log2.values, cv=5, n_jobs=-1)
regression_eval(dfs_gen.expression_fc_log2, preds, alpha=0.4);
# Evaluate on the wild-type genomic subset only.
subset = dfs_gen.CRE_type=='Genomic'
regression_eval(dfs_gen.expression_fc_log2[subset], preds[subset], alpha=0.4);
# Cross-library transfer: train on genomic, evaluate on synthetic...
rf_gen = RandomForestRegressor(100, n_jobs=-1)
rf_gen.fit(bpnet_bottleneck_gen_feat, dfs_gen.expression_fc_log2.values)
regression_eval(dfs_syn.expression_fc_log2, rf_gen.predict(bpnet_bottleneck_syn_feat), alpha=0.4);
# ...and train on synthetic, evaluate on genomic.
rf_syn = RandomForestRegressor(100)
rf_syn.fit(bpnet_bottleneck_syn_feat, dfs_syn.expression_fc_log2.values)
regression_eval(dfs_gen.expression_fc_log2, rf_syn.predict(bpnet_bottleneck_gen_feat), alpha=0.4);
from basepair.stats import perc
from concise.eval_metrics import auc, auprc
# Binary task: distinguish the top expression quartile from the bottom
# quartile among wild-type genomic CREs.
genomic = dfs_gen.CRE_type=='Genomic'
# `perc` presumably returns percentile ranks of the values — TODO confirm.
neg = perc(dfs_gen.expression_fc_log2.values[genomic])< 25
pos = perc(dfs_gen.expression_fc_log2.values[genomic])> 75
subset = pos | neg
y = pos[subset]
# Class balance (should be ~0.5 by construction).
y.mean()
# Variant 1: pooled bottleneck features only.
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 5, outlen=80)
#bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])
# Variant 2: pooled bottleneck features plus the summary features.
bpnet_bottleneck_gen_feat = pool_bottleneck(bpnet_bottleneck_gen, 5, outlen=80)
bpnet_bottleneck_gen_feat = np.concatenate([values_gen.values, bpnet_bottleneck_gen_feat], axis=1)
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])
# Variant 3: summary features only.
bpnet_bottleneck_gen_feat = values_gen.values
preds = cross_val_predict(RandomForestClassifier(1000), bpnet_bottleneck_gen_feat[genomic][subset], y, cv=3, n_jobs=-1, method='predict_proba')
print("auc, auprc")
auc(y, preds[:,1]), auprc(y, preds[:,1])