Compare different models:
python generate_annotated_dfi.pymodisco.results.csv and model.results.csv# # old config
# models = {
# 'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
# 'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
# 'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
# 'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
# 'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
# 'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE'
# }
models = {
'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
#'nexus/binary+profile': 'nexus,gw,OSNK,1,0.1,0.01,FALSE,same,0.5,64,25,0.001,9,FALSE',
#'nexus/profile.gw': 'nexus,gw,OSNK,0,10,1,FALSE,same,0.5,64,25,0.001,9,FALSE',
# 'nexus/profile.peaks.bias-corrected.augm': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE',
# 'nexus/profile.peaks.non-bias-corrected': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2',
'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
#'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE',
#'seq/profile.peaks.bias-corrected.augm': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
# 'seq/profile.peaks.non-bias-corrected': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE',
'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
# 'basset/binary': 'binary-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.6',
# 'factorized-basset/binary': 'factorized-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.5'
}
models_inv = {v:k for k,v in models.items()}
from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.utils import flatten
from basepair.modisco.core import Pattern
from basepair.config import test_chr
from plotnine import *
import plotnine
from config import experiments
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
from basepair.plot.utils import plt9_tilt_xlab
from basepair.exp.paper.config import models_dir
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()
import sys
sys.path.insert(0, '..')
fdir = Path(f'{ddir}/figures/method-comparison/modisco')
fdir.mkdir(exist_ok=True)
exp = models['nexus/profile']
model_dir = models_dir / exp
df_train = pd.read_csv(models_dir / "model.results.finished.csv")
df_train = df_train[df_train.exp.isin(models.values())]
df_train['exp_name'] = df_train.exp.map(models_inv)
list(df_train.exp_name)
df_train[df_train.exp_name == 'nexus/binary+profile']
tf_colors
gw_exp = ['nexus/binary', 'nexus/binary+profile']
dfp = df_train[df_train.exp_name.isin(gw_exp)][[f'valid-genome-wide/{task}/class/auPR' for task in tasks] + ['exp_name']].melt(id_vars='exp_name')
dfp['TF'] = dfp.variable.str.split("/", expand=True)[1]
dfp['TF'] = pd.Categorical(dfp['TF'], tasks)
dfp.groupby("exp_name").value.mean()
colors = {"nexus/binary": "#EE8030",
"nexus/profile": "#498AAD",
"nexus/profile.gw": "#586BA4",
"nexus/binary+profile": "#4E598C"
}
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='TF', fill='exp_name', y='value'), dfp) +
geom_bar(stat='identity', position='dodge') +
theme_classic(base_size=10, base_family='Arial') + \
scale_fill_manual([colors[x] for x in gw_exp]) + \
ylab("auPRC")
)
fig.save(fdir / 'auPRC.binary-vs-profile+binary.bar.pdf')
fig
dft = pd.DataFrame([{"training_time": pd.read_csv(f"{models_dir}/{exp}/train.smk-benchmark.tsv", sep='\t').iloc[0]['s'] / 60,
"Model": model_name,
"exp": exp}
for model_name, exp in models.items()])
plt_exp = ['nexus/binary', 'nexus/profile']
plotnine.options.figure_size = get_figsize(.18)
fig = (ggplot(aes(x='Model', fill='Model', y='training_time'), dft[dft.Model.isin(plt_exp)]) +
geom_bar(stat='identity') +
scale_fill_manual([colors[x] for x in plt_exp]) + \
theme_classic(base_size=10, base_family='Arial') + \
xlab("Model") +
ylab("Training time (min)") +
plt9_tilt_xlab(30)
)
fig.save(fdir / 'training-time.bar.pdf')
fig
from MPRA.config import model_exps
df_mpra = pd.read_csv('../MPRA/output/MPRA.csv')
model_exps_inf = {v:k for k,v in model_exps.items()}
df_mpra['model'] = df_mpra.exp.map(model_exps_inf)
df_mpra.exp = df_mpra.exp.str.rstrip("/")
df_mpra['exp_name'] = df_mpra.exp.map(models_inv)
df_mpra = df_mpra[~df_mpra.exp_name.isnull()]
# with pd.option_context("display.max_colwidth", 100):
# print(df_mpra.query('data=="genomic"')[['model', 'metrics/genomic/spearmanr']].
# sort_values("metrics/genomic/spearmanr").
# to_string())
df_mpra_subset = df_mpra[df_mpra.exp_name.isin(colors)]
df_mpra_subset['exp_name'] = pd.Categorical(df_mpra_subset.exp_name, categories=list(colors))
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/synthetic/spearmanr'), df_mpra_subset.query('data=="synthetic"')) +
geom_bar(stat='identity') +
theme_classic(base_size=10, base_family='Arial') + \
scale_fill_manual(list(colors.values())) + \
ylab("Spearman corr") +
plt9_tilt_xlab(30))
fig.save(fdir / 'MPRA.synthetic.bar.pdf')
fig
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/genomic/spearmanr'), df_mpra_subset.query('data=="genomic"')) +
geom_bar(stat='identity') +
theme_classic(base_size=10, base_family='Arial') + \
scale_fill_manual(list(colors.values())) + \
ylab("Spearman corr") +
plt9_tilt_xlab(30))
fig.save(fdir / 'MPRA.genomic.bar.pdf')
fig
dfa = pd.read_csv(models_dir / 'activity.csv')
dfa.exp = dfa.exp.str.rstrip("/")
dfa['exp_name'] = dfa.exp.map(models_inv)
dfa = dfa[~dfa.exp_name.isnull()].query("split=='test'")
dfa = dfa[dfa.exp_name.isin(colors)]
dfa['exp_name'] = pd.Categorical(dfa.exp_name, categories=list(colors))
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/H3K27ac/spearmanr'),
dfa[(dfa['model_kwargs/pool_type'] == 'global_avg') &
(dfa['model_kwargs/bn'].isnull()) &
~dfa.exp_name.str.contains("peaks-union")]) +
geom_bar(stat='identity', position='dodge') +
theme_classic(base_size=10, base_family='Arial') + \
scale_fill_manual(list(colors.values())) + \
ggtitle("H3K27ac") +
ylab("Spearman corr") +
plt9_tilt_xlab(30))
fig.save(fdir / 'H3K27ac.bar.pdf')
fig
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/PolII/spearmanr'),
dfa[(dfa['model_kwargs/pool_type'] == 'global_avg') &
(dfa['model_kwargs/bn'].isnull()) &
~dfa.exp_name.str.contains("peaks-union")]) +
ggtitle("Pol II") +
geom_bar(stat='identity', position='dodge') +
scale_fill_manual(list(colors.values())) + \
theme_classic(base_size=10, base_family='Arial') + \
ylab("Spearman corr") +
plt9_tilt_xlab(30))
fig.save(fdir / 'PolII.bar.pdf')
fig
models
main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4'] # NOTE this is a duplication from before
dfi = pd.read_parquet(models_dir / 'dfi_subset.annotated.v3.parq', engine='fastparquet')
dfi = dfi[dfi.example_chrom.isin(test_chr)] # look at only the test chromosomes
dfi = dfi[dfi.pattern_name.isin(main_motifs)]
# Remove duplicates
dfi = dfi.sort_values("imp_weighted", ascending=False)
duplicated = dfi[['model', 'pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfi = dfi[~duplicated]
# setup coordinates
dfi['Chromosome'] = dfi['example_chrom']
dfi['Start'] = dfi['pattern_start_abs']
dfi['End'] = dfi['pattern_end_abs']
dfi['Strand'] = dfi['pattern_end_abs']
dfi.head()
import pyranges as pr
pr.__version__
dfi_pr = pr.PyRanges(dfi)
model = 'nexus/binary'
dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])
dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)
len(dfi_pr_s_nonoverlapping)
len(dfi_pr_s)
model = 'seq/profile'
dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])
dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)
len(dfi_pr_s_nonoverlapping)
len(dfi_pr_s)
model = 'nexus/profile'
dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])
dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)
len(dfi_pr_s_nonoverlapping)
len(dfi_pr_s)
len(dfi_pr_s_nonoverlapping)
dfi_test = dfi[dfi.example_chrom.isin(test_chr)]
print(dfi.groupby(['pattern_name', 'model']).size().to_string())
def get_profile_score(dfi, score, profile_cls):
df = dfi[(~dfi[score].isnull())]
return df.sort_values(score, ascending=False)[profile_cls]
motif_profile_counts = [(motif,
{model: get_profile_score(dfi[(dfi.pattern_name == motif) &
(dfi.model == model) &
(dfi.example_chrom.isin(test_chr))],
'imp_weighted',
profile_mapping[motif]+ "/profile_counts_max_ref").values
for model in models})
for motif in main_motifs]
# Profile scores
model = 'nexus/profile'
thresholds = dict()
fig, axes = plt.subplots(len(main_motifs),1,figsize=get_figsize(0.25, 1), sharex=True)
for i, motif in enumerate(main_motifs):
ax = axes[i]
scores = motif_profile_counts[i][1][model]
thresholds[motif] = np.percentile(scores, 90)
#if len(scores) == 0:
# continue
ax.hist(scores, bins=np.arange(30))
ax.axvline(thresholds[motif], color='grey')
ax.set_ylabel(motif)
if i == len(main_motifs)-1:
ax.set_xlabel("Profile score (max value)");
# fig.savefig(fdir / 'profile-score.hist.pdf')
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))),
sharex=True,
sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
ax = axes[i]
# top_n = min([len(x) for x in res.values()])
top_n = 2000
for k,v in res.items():
if k in ['nexus/profile', 'nexus/binary']:
cutoff = thresholds[motif]
ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)
max_val = max(max_val, ax.get_ylim()[1])
if i == 0:
ax.set_ylabel("# motif instances\nwith high coverage")
if i == 1:
ax.set_xlabel("Top # of motif instances")
if i == 0:
leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
leg.get_frame().set_linewidth(0.0)
ax.set_title(motif)
for i, (motif, res) in enumerate(motif_profile_counts):
axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
sns.despine(top=True, right=True)
fig.savefig(fdir / 'profile-recall.nexus.profile-vs-binary.pdf')
fig, axes = plt.subplots(1, len(main_motifs)-1, figsize=get_figsize(0.75, 1/(1*len(main_motifs)-1)),
sharex=True,
sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
# fig, axes = plt.subplots(1, len(main_motifs)-1, figsize=get_figsize(.75, 1/(1*len(main_motifs)-1)),
# sharex=True,
# sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts[:-1]):
ax = axes[i]
# top_n = min([len(x) for x in res.values()])
top_n = 2000
for k,v in res.items():
if k in ['nexus/profile.peaks-union', 'seq/profile', 'seq/profile.peaks-union', 'seq/binary']:
cutoff = thresholds[motif]
ax.plot(np.cumsum(v[:top_n] > cutoff), label=k.replace("peaks-union", 'u'))
max_val = max(max_val, ax.get_ylim()[1])
if i == 0:
ax.set_ylabel("# motif instances\nwith high coverage")
if i == 1:
ax.set_xlabel("Top # of motif instances")
if i == 0:
leg = ax.legend(loc='bottom right', framealpha=1, borderpad=0)
leg.get_frame().set_linewidth(0.0)
ax.set_title(motif)
for i, (motif, res) in enumerate(motif_profile_counts[:-1]):
axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
sns.despine(top=True, right=True)
fig.savefig(fdir / 'profile-recall.nexus-vs-seq.pdf')
model = 'nexus/profile.peaks-union'
pattern = 'Sox2'
exp = models[model]
exp
isf = ImpScoreFile(models_dir / exp / 'deeplift.imp_score.h5', default_imp_score='profile/wn')
isf.cache()
thresholds['Sox2']
threshold = 1
dfi_ranges = dfi[(dfi.model == model) & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)
mr = ModiscoResult(f"{models_dir}/{exp}/deeplift/Sox2/out/profile/wn/modisco.h5")
dfi2 = pd.read_parquet(f"{models_dir}/{exp}/deeplift/dfi_subset.parq")
dfi2.head()
ls {models_dir}/{exp}/deeplift
subset = isf.get_ranges()['interval_from_task'] == "Sox2"
pname = longer_pattern(experiments[models[model]]['motifs'][pattern].split("/")[1])
# Question: why is the
seqlets = []
for s in mr._get_seqlets(pname, trim_frac=0.08):
s.start = s.start - 30
s.end = s.end + 30
seqlets.append(s)
# TODO - the issue is with determining the reference sequence
isf.include_samples = subset
ess_seqlets = isf.extract(seqlets, )
isf.include_samples = None
ess_seqlets.aggregate().plot();
p = ess.aggregate()
#pos = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[0], 0]
# neg = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[1], 1]
pos = ess.profile['Sox2'].max(axis=1)[:, 0]
neg = ess.profile['Sox2'].max(axis=1)[:, 1]
threshold = np.percentile((pos+neg)/2, 90)
print("threshold: ", threshold)
# plt.plot(pos)
# plt.plot(neg);
# plt.title("Sox2")
plt.scatter((pos+neg)/2, dfi_ranges['Sox2/profile_max'], s=2, alpha=0.5)
plt.figure()
plt.plot((pos+neg)/2)
plt.axhline(y=threshold, color='gray')
recall_nexus = np.cumsum((pos+neg)/2 > threshold)
dfi_ranges = dfi[(dfi.model == 'seq/profile.peaks-union') & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)
p = ess.aggregate()
# pos = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[0], 0]
# neg = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[1], 1]
pos = ess.profile['Sox2'].max(axis=1)[:, 0]
neg = ess.profile['Sox2'].max(axis=1)[:, 1]
plt.scatter((pos+neg)/2, dfi_ranges['Sox2/profile_max'], s=2, alpha=0.5)
plt.figure()
plt.plot((pos+neg)/2)
plt.axhline(y=threshold, color='gray')
recall_seq = np.cumsum((pos+neg)/2 > threshold)
plt.plot(recall_seq, label='seq', alpha=0.5);
plt.plot(recall_nexus, label='nexus', alpha=0.5);
plt.plot([0, 600], [0, 600], color='gray')
plt.legend()
ess.plot("profile");
ess.plot("seq");
dfi_ranges = dfi[(dfi.model == 'seq/profile.peaks-union') & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)
p = ess.aggregate()
pl = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0), [0, 1]]
plt.plot(pl[:, 0])
plt.plot(pl[:, 1]);
from basepair.modisco.core import Pattern
ess.aggregate().plot(rotate_y=0);
ess.plot("profile");
ess.plot("seq");
from basepair.preproc import dfint_intersects, dfint_no_intersection, dfint_overlap_idx
int_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
list(models)
# Example
pattern = 'Oct4-Sox2'
ma = 'nexus/profile'
mb = 'nexus/binary'
def extract_score(dfi, pattern, ma, mb, score='profile_counts_max_ref_log'):
"""Overlap different motif instances
"""
dfic = dfi[(dfi.pattern_name == pattern) & (dfi.model.isin([ma, mb]))]
dfa = dfic[(dfi.model == ma)][int_cols]
dfb = dfic[(dfi.model == mb)][int_cols]
int_ab = dfint_intersects(dfa, dfb)
int_ba = dfint_intersects(dfb, dfa)
# Add the set
dfic['set'] = ''
dfic.loc[int_ba[int_ba].index, 'set'] = 'both'
dfic.loc[int_ba[~int_ba].index, 'set'] = mb
dfic.loc[int_ab[~int_ab].index, 'set'] = ma
dfic.loc[int_ab[int_ab].index, 'set'] = 'both'
profile = profile_mapping[pattern]
dfic[f'{profile}/profile_counts_max_ref_log'] = np.log10(1+dfic[f'{profile}/profile_counts_max_ref'])
dfic['profile_score'] = dfic[f'{profile}/{score}']
dfic['pair'] = "<>".join([ma, mb])
dfic['score'] = 'score'
return dfic[['pattern_name', 'set', 'profile_score', 'pair', 'score']]
dfr = pd.concat([extract_score(dfi, pattern, ma, mb, score='profile_counts_max_ref_log')
for pattern in tqdm(main_motifs)
for ma,mb in [
('nexus/profile', 'nexus/binary'),
('seq/profile', 'seq/binary'),
('nexus/profile.peaks-union', 'seq/profile.peaks-union'),
] if not (pattern == 'Klf4' and 'seq' in mb)])
dfr['pair'] =dfr.pair.str.replace("peaks-union", 'u') # For shorter plot labels
plotnine.options.figure_size = get_figsize(1, 1)
fig = (ggplot(aes(x='set', y='profile_score'), dfr) +
facet_grid("pattern_name ~ pair", scales='free_x') +
geom_violin() +
# geom_jitter(alpha=0.1) +
# scale_y_log10() +
theme_classic() +
plt9_tilt_xlab() +
ylab(f"Profile score")
)
fig
plotnine.options.figure_size = get_figsize(1, 1)
fig = (ggplot(aes(x='set', y='profile_score'),
dfr.groupby(['pair', 'set', 'pattern_name']).profile_score.size().reset_index()) +
facet_grid("pattern_name ~ pair", scales='free_x') +
geom_bar(stat='identity') +
# geom_jitter(alpha=0.1) +
# scale_y_log10() +
theme_classic() +
plt9_tilt_xlab() +
ylab(f"Profile score")
)
fig
dfi.head()
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi
from basepair.modisco.periodicity import smooth
# dfi = pd.concat([pd.read_parquet(models_dir / exp / "deeplift/dfi_subset.parq", engine='fastparquet').assign(model=model_name)
# for model_name, exp in tqdm(models.items())])
# dict(dfi.groupby("model").size())
# create motif pairs
dfab = pd.concat([motif_pair_dfi(dfi[dfi.model == model], pair).assign(motif_pair="<>".join(pair)).assign(model=model)
for pair in [['Nanog', 'Nanog']] for model in tqdm(models)],
axis=0)
motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']
motif_pair_name = 'Nanog<>Nanog'
pairs = get_motif_pairs(motifs)
from matplotlib import ticker
frac_10bp = dict()
for model in models:
strand_comb = "++"
fig, ax = plt.subplots(figsize = get_figsize(.25))
center_diff = dfab[(dfab.strand_combination == strand_comb) &
(dfab.model == model) &
(dfab.motif_pair == motif_pair_name)].center_diff
counts, bin_edges = np.histogram(center_diff, bins=np.arange(150)+.5)
smooth_part = smooth(counts, 10)
plt.plot(bin_edges[1:], counts, label='raw')
plt.plot(bin_edges[1:], smooth_part, label='smoothed')
plt.legend()
plt.title(f"{model} Nanog<>Nanog ({strand_comb})")
plt.xlabel("Pairwise distance")
plt.ylabel("Frequency");
power = 0
for strand_comb in ['++', "-+", '+-', "--"]:
counts, bin_edges = np.histogram(dfab[(dfab.strand_combination == strand_comb) &
(dfab.model == model) &
(dfab.motif_pair == motif_pair_name)].center_diff,
bins=np.arange(150)+.5)
smooth_part = smooth(counts, 10)
power += np.abs(np.fft.fft(counts-smooth_part)[:49])**2
freq = np.fft.fftfreq(counts.shape[-1])
power = power[3:49]
x = 1 / freq[3:49]
fft_fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.5, 1/5))
# Store the peak height vs others
frac_10bp[model] = power[np.argmax(power)] / power.sum()
ax.plot(x, power)
ax.scatter(x, power, s=10)
ax.set_xlim([0, 50])
ax.xaxis.set_major_locator(ticker.MaxNLocator(10, integer=True))
# ax.grid(alpha=0.3)
t0_max = x[np.argmax(power)]
ax.axvline(x=t0_max, color='grey', alpha=0.5)
print("Max frequency:", t0_max)
ax.set_xlabel("1/Frequency [bp]")
ax.set_ylabel("Power spectrum");
ax.set_title(model)
# fft_fig.savefig(fdir / 'Nanog.p1.fft.pdf')
# fft_fig.savefig(fdir / 'Nanog.p1.fft.png')
df = pd.Series(frac_10bp).reset_index()
df.columns = ['model', '10bp periodicity']
(ggplot(aes(x='model', y='10bp periodicity'), df) +
geom_bar(stat='identity') +
theme_classic() +
plt9_tilt_xlab()
)