Goal¶

Compare different models:

Requirements¶

Run python generate_annotated_dfi.py

Compared models¶

ChIP-nexus:
- profile
- binary
ChIP-seq
- profile
- binary

Compared aspects¶

[X] Practicality
- [X] training time plot
[X] Importance score quality (in 7.3-cluster-motifs-modisco.ipynb)
[X] Predictiveness of the bottleneck activation layer
- [X] MPRA performance plot
- [X] H3K27ac plot
[X] Instance labelling quality (periodicity plot)
- [X] generate a function to generate fft @10bp periodicity from dfi_subset Nanog<>Nanog histogram
- [X] finish generating dfi_subset for all motifs

TODO¶

[X] have a mapping from name -> exp id
[X] Finish all the snakemake jobs
- [X] Get both the modisco.results.csv and model.results.csv
[X] Fix the dfi_subset issue
- append whether the thing is a TE
[X] Use the augmented version of the seq-model
[ ] Add error-bars

# # old config
# models = {
#         'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
#         'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
#         'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
#         'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
#         'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
#         'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE'
#     }

models = {
    'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
    'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
    #'nexus/binary+profile': 'nexus,gw,OSNK,1,0.1,0.01,FALSE,same,0.5,64,25,0.001,9,FALSE',
    #'nexus/profile.gw': 'nexus,gw,OSNK,0,10,1,FALSE,same,0.5,64,25,0.001,9,FALSE',
    # 'nexus/profile.peaks.bias-corrected.augm': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE',
    # 'nexus/profile.peaks.non-bias-corrected': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2',
    'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
    'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
    #'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE',
    #'seq/profile.peaks.bias-corrected.augm': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
    
    # 'seq/profile.peaks.non-bias-corrected': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE',
    'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
    'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
#     'basset/binary': 'binary-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.6',
#     'factorized-basset/binary': 'factorized-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.5'
}
models_inv = {v:k for k,v in models.items()}

from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.utils import flatten
from basepair.modisco.core import Pattern
from basepair.config import test_chr
from plotnine import *
import plotnine
from config import experiments
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
from basepair.plot.utils import plt9_tilt_xlab
from basepair.exp.paper.config import models_dir
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()

import sys
sys.path.insert(0, '..')

Using TensorFlow backend.

fdir = Path(f'{ddir}/figures/method-comparison/modisco')
fdir.mkdir(exist_ok=True)

Practicality¶

training time plot¶

exp = models['nexus/profile']

model_dir = models_dir / exp

df_train = pd.read_csv(models_dir / "model.results.finished.csv")

df_train = df_train[df_train.exp.isin(models.values())]

df_train['exp_name'] = df_train.exp.map(models_inv)

list(df_train.exp_name)

['nexus/binary',
 'nexus/binary+profile',
 'nexus/profile.gw',
 'seq/binary',
 'nexus/profile',
 'seq/profile']

df_train[df_train.exp_name == 'nexus/binary+profile']

tf_colors

{'Klf4': '#357C42',
 'Oct4': '#9F1D20',
 'Sox2': '#3A3C97',
 'Nanog': '#9F8A31',
 'Esrrb': '#30BDC4'}

gw_exp = ['nexus/binary', 'nexus/binary+profile']
dfp = df_train[df_train.exp_name.isin(gw_exp)][[f'valid-genome-wide/{task}/class/auPR' for task in tasks] + ['exp_name']].melt(id_vars='exp_name')
dfp['TF'] = dfp.variable.str.split("/", expand=True)[1]

dfp['TF'] = pd.Categorical(dfp['TF'], tasks)

dfp.groupby("exp_name").value.mean()

exp_name
nexus/binary            0.2495
nexus/binary+profile    0.3073
Name: value, dtype: float64

colors = {"nexus/binary": "#EE8030",
          "nexus/profile": "#498AAD",
          "nexus/profile.gw": "#586BA4",
          "nexus/binary+profile": "#4E598C"
         }

plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='TF', fill='exp_name', y='value'), dfp) + 
       geom_bar(stat='identity', position='dodge') + 
       theme_classic(base_size=10, base_family='Arial') +  \
       scale_fill_manual([colors[x] for x in gw_exp]) + \
       ylab("auPRC") 
      )
fig.save(fdir / 'auPRC.binary-vs-profile+binary.bar.pdf')
fig

<ggplot: (8770059255602)>

TODO¶

[ ] show all profile plots?

dft = pd.DataFrame([{"training_time": pd.read_csv(f"{models_dir}/{exp}/train.smk-benchmark.tsv", sep='\t').iloc[0]['s'] / 60,
              "Model": model_name,
              "exp": exp}
             for model_name, exp in models.items()])

plt_exp = ['nexus/binary', 'nexus/profile']
plotnine.options.figure_size = get_figsize(.18)
fig = (ggplot(aes(x='Model', fill='Model', y='training_time'), dft[dft.Model.isin(plt_exp)]) + 
 geom_bar(stat='identity') + 
 scale_fill_manual([colors[x] for x in plt_exp]) + \
 theme_classic(base_size=10, base_family='Arial') +  \
 xlab("Model") + 
 ylab("Training time (min)") +
 plt9_tilt_xlab(30)
)
fig.save(fdir / 'training-time.bar.pdf')
fig

<ggplot: (8770059135012)>

Predictiveness of the bottleneck activation layer¶

MPRA performance plot¶

from MPRA.config import model_exps

df_mpra = pd.read_csv('../MPRA/output/MPRA.csv')
model_exps_inf = {v:k for k,v in model_exps.items()}
df_mpra['model'] = df_mpra.exp.map(model_exps_inf)
df_mpra.exp = df_mpra.exp.str.rstrip("/")
df_mpra['exp_name'] = df_mpra.exp.map(models_inv)
df_mpra = df_mpra[~df_mpra.exp_name.isnull()]

# with pd.option_context("display.max_colwidth", 100):
#     print(df_mpra.query('data=="genomic"')[['model', 'metrics/genomic/spearmanr']].
#           sort_values("metrics/genomic/spearmanr").
#           to_string())

df_mpra_subset = df_mpra[df_mpra.exp_name.isin(colors)]
df_mpra_subset['exp_name'] = pd.Categorical(df_mpra_subset.exp_name, categories=list(colors))

plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/synthetic/spearmanr'), df_mpra_subset.query('data=="synthetic"')) + 
 geom_bar(stat='identity') + 
 theme_classic(base_size=10, base_family='Arial') +  \
 scale_fill_manual(list(colors.values())) + \
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'MPRA.synthetic.bar.pdf')
fig

<ggplot: (8761990578906)>

plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/genomic/spearmanr'), df_mpra_subset.query('data=="genomic"')) + 
 geom_bar(stat='identity') + 
 theme_classic(base_size=10, base_family='Arial') +  \
 scale_fill_manual(list(colors.values())) + \
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'MPRA.genomic.bar.pdf')
fig

<ggplot: (-9223363274802961236)>

H3K27ac plot¶

dfa = pd.read_csv(models_dir / 'activity.csv')

dfa.exp = dfa.exp.str.rstrip("/")
dfa['exp_name'] = dfa.exp.map(models_inv)

dfa = dfa[~dfa.exp_name.isnull()].query("split=='test'")

dfa = dfa[dfa.exp_name.isin(colors)]
dfa['exp_name'] = pd.Categorical(dfa.exp_name, categories=list(colors))

plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/H3K27ac/spearmanr'), 
        dfa[(dfa['model_kwargs/pool_type'] == 'global_avg') & 
            (dfa['model_kwargs/bn'].isnull()) &
            ~dfa.exp_name.str.contains("peaks-union")]) + 
 geom_bar(stat='identity', position='dodge') + 
 theme_classic(base_size=10, base_family='Arial') +  \
 scale_fill_manual(list(colors.values())) + \
 ggtitle("H3K27ac") + 
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'H3K27ac.bar.pdf')
fig

<ggplot: (8762051822386)>

plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/PolII/spearmanr'), 
        dfa[(dfa['model_kwargs/pool_type'] == 'global_avg') & 
            (dfa['model_kwargs/bn'].isnull()) &
            ~dfa.exp_name.str.contains("peaks-union")]) + 
 ggtitle("Pol II") + 
 geom_bar(stat='identity', position='dodge') + 
 scale_fill_manual(list(colors.values())) + \
 theme_classic(base_size=10, base_family='Arial') +  \
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'PolII.bar.pdf')
fig

<ggplot: (-9223363274802907858)>

Compare the high-quality peak recall¶

models

{'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
 'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
 'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
 'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
 'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
 'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE,0'}

main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']  # NOTE this is a duplication from before

dfi = pd.read_parquet(models_dir / 'dfi_subset.annotated.v3.parq', engine='fastparquet')
dfi = dfi[dfi.example_chrom.isin(test_chr)]  # look at only the test chromosomes
dfi = dfi[dfi.pattern_name.isin(main_motifs)]

# Remove duplicates
dfi = dfi.sort_values("imp_weighted", ascending=False)
duplicated = dfi[['model', 'pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfi = dfi[~duplicated]

# setup coordinates
dfi['Chromosome'] = dfi['example_chrom']
dfi['Start'] = dfi['pattern_start_abs']
dfi['End'] = dfi['pattern_end_abs']
dfi['Strand'] = dfi['pattern_end_abs']

dfi.head()

import pyranges as pr

pr.__version__

'0.0.51'

dfi_pr = pr.PyRanges(dfi)

model = 'nexus/binary'

dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])

dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)

len(dfi_pr_s_nonoverlapping)

3648

len(dfi_pr_s)

12063

model = 'seq/profile'

dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])

dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)

len(dfi_pr_s_nonoverlapping)

1186

len(dfi_pr_s)

9576

model = 'nexus/profile'

dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])

dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)

len(dfi_pr_s_nonoverlapping)

866

len(dfi_pr_s)

22688

len(dfi_pr_s_nonoverlapping)

dfi_test = dfi[dfi.example_chrom.isin(test_chr)]

print(dfi.groupby(['pattern_name', 'model']).size().to_string())

pattern_name  model                    
Klf4          nexus/binary                  8074
              nexus/profile                11715
Nanog         nexus/binary                 10089
              nexus/profile                 9653
              nexus/profile.peaks-union     5297
              seq/binary                    3816
              seq/profile                  11743
              seq/profile.peaks-union       7025
Oct4-Sox2     nexus/binary                  4324
              nexus/profile                 4947
              nexus/profile.peaks-union     2260
              seq/binary                    2148
              seq/profile                   2394
              seq/profile.peaks-union       2516
Sox2          nexus/binary                  2123
              nexus/profile                 4358
              nexus/profile.peaks-union     2119
              seq/binary                    1796
              seq/profile                   1822
              seq/profile.peaks-union       1862

def get_profile_score(dfi, score, profile_cls):
    df = dfi[(~dfi[score].isnull())]
    return df.sort_values(score, ascending=False)[profile_cls]

motif_profile_counts = [(motif, 
                         {model: get_profile_score(dfi[(dfi.pattern_name == motif) &
                                                       (dfi.model == model) & 
                                                       (dfi.example_chrom.isin(test_chr))],
                                                   'imp_weighted', 
                                                   profile_mapping[motif]+ "/profile_counts_max_ref").values
                         for model in models})
                        for motif in main_motifs]

# Profile scores
model = 'nexus/profile'
thresholds = dict()
fig, axes = plt.subplots(len(main_motifs),1,figsize=get_figsize(0.25, 1), sharex=True)
for i, motif in enumerate(main_motifs):
    ax = axes[i]
    scores = motif_profile_counts[i][1][model]
    thresholds[motif] = np.percentile(scores, 90)
    #if len(scores) == 0:
    #    continue
    ax.hist(scores, bins=np.arange(30))
    ax.axvline(thresholds[motif], color='grey')
    ax.set_ylabel(motif)
    if i == len(main_motifs)-1:
        ax.set_xlabel("Profile score (max value)");
    # fig.savefig(fdir / 'profile-score.hist.pdf')

fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 2000
    for k,v in res.items():
        if k in ['nexus/profile', 'nexus/binary']:
            cutoff = thresholds[motif]
            ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
sns.despine(top=True, right=True)    
fig.savefig(fdir / 'profile-recall.nexus.profile-vs-binary.pdf')

fig, axes = plt.subplots(1, len(main_motifs)-1, figsize=get_figsize(0.75, 1/(1*len(main_motifs)-1)), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
# fig, axes = plt.subplots(1, len(main_motifs)-1, figsize=get_figsize(.75, 1/(1*len(main_motifs)-1)), 
#                          sharex=True, 
#                          sharey=True, gridspec_kw=dict(wspace=0, hspace=0))

fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts[:-1]):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 2000
    for k,v in res.items():
        if k in ['nexus/profile.peaks-union', 'seq/profile', 'seq/profile.peaks-union', 'seq/binary']:
            cutoff = thresholds[motif]
            ax.plot(np.cumsum(v[:top_n] > cutoff), label=k.replace("peaks-union", 'u'))

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='bottom right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts[:-1]):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
sns.despine(top=True, right=True)   
fig.savefig(fdir / 'profile-recall.nexus-vs-seq.pdf')

TODO¶

[ ] setup nicer color for the comparison

TODO¶

visually inspect the Sox2 instances

model = 'nexus/profile.peaks-union'
pattern = 'Sox2'

exp = models[model]

exp

'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0'

isf = ImpScoreFile(models_dir / exp / 'deeplift.imp_score.h5', default_imp_score='profile/wn')

isf.cache()

<basepair.cli.imp_score.ImpScoreFile at 0x7f241adae908>

thresholds['Sox2']

1.0000019073486328

threshold = 1

dfi_ranges = dfi[(dfi.model == model) & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)

mr = ModiscoResult(f"{models_dir}/{exp}/deeplift/Sox2/out/profile/wn/modisco.h5")

dfi2 = pd.read_parquet(f"{models_dir}/{exp}/deeplift/dfi_subset.parq")

dfi2.head()

ls {models_dir}/{exp}/deeplift

dfi_subset.parq  Nanog/  Oct4/  Sox2/

subset = isf.get_ranges()['interval_from_task'] == "Sox2"

pname = longer_pattern(experiments[models[model]]['motifs'][pattern].split("/")[1])

# Question: why is the

seqlets = []
for s in mr._get_seqlets(pname, trim_frac=0.08):
    s.start = s.start - 30
    s.end = s.end + 30
    seqlets.append(s)

# TODO - the issue is with determining the reference sequence

isf.include_samples = subset
ess_seqlets = isf.extract(seqlets, )
isf.include_samples = None

ess_seqlets.aggregate().plot();

p = ess.aggregate()
 #pos = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[0], 0]
# neg = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[1], 1]
pos = ess.profile['Sox2'].max(axis=1)[:, 0]
neg = ess.profile['Sox2'].max(axis=1)[:, 1]
threshold = np.percentile((pos+neg)/2, 90)
print("threshold: ", threshold)
# plt.plot(pos)
# plt.plot(neg);
# plt.title("Sox2")
plt.scatter((pos+neg)/2, dfi_ranges['Sox2/profile_max'], s=2, alpha=0.5)

plt.figure()
plt.plot((pos+neg)/2)
plt.axhline(y=threshold, color='gray')

threshold:  5.5

<matplotlib.lines.Line2D at 0x7f23d6327c88>

recall_nexus = np.cumsum((pos+neg)/2 > threshold)

dfi_ranges = dfi[(dfi.model == 'seq/profile.peaks-union') & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)

p = ess.aggregate()
# pos = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[0], 0]
# neg = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[1], 1]
pos = ess.profile['Sox2'].max(axis=1)[:, 0]
neg = ess.profile['Sox2'].max(axis=1)[:, 1]
plt.scatter((pos+neg)/2, dfi_ranges['Sox2/profile_max'], s=2, alpha=0.5)
plt.figure()
plt.plot((pos+neg)/2)
plt.axhline(y=threshold, color='gray')

<matplotlib.lines.Line2D at 0x7f23d6386208>

recall_seq = np.cumsum((pos+neg)/2 > threshold)

plt.plot(recall_seq, label='seq', alpha=0.5);
plt.plot(recall_nexus, label='nexus', alpha=0.5);
plt.plot([0, 600], [0, 600], color='gray')
plt.legend()

<matplotlib.legend.Legend at 0x7f23d63662b0>

ess.plot("profile");

ess.plot("seq");

dfi_ranges = dfi[(dfi.model == 'seq/profile.peaks-union') & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)

p = ess.aggregate()
pl = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0), [0, 1]]
plt.plot(pl[:, 0])
plt.plot(pl[:, 1]);

from basepair.modisco.core import Pattern

ess.aggregate().plot(rotate_y=0);

ess.plot("profile");

ess.plot("seq");

TODO¶

[X] Intersect motif instances
- how much overlap do we see between instances?
? why is the recall so different for Sox2 motifs?
- generate stacked_seqlets

Analyzing instersecting / non-intersecting motifs¶

from basepair.preproc import dfint_intersects, dfint_no_intersection, dfint_overlap_idx

int_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']

list(models)

['nexus/profile',
 'nexus/binary',
 'seq/profile',
 'seq/binary',
 'nexus/profile.peaks-union',
 'seq/profile.peaks-union']

# Example
pattern = 'Oct4-Sox2'
ma = 'nexus/profile'
mb = 'nexus/binary'

def extract_score(dfi, pattern, ma, mb, score='profile_counts_max_ref_log'):
    """Overlap different motif instances
    """
    dfic = dfi[(dfi.pattern_name == pattern) & (dfi.model.isin([ma, mb]))]
    dfa = dfic[(dfi.model == ma)][int_cols]
    dfb = dfic[(dfi.model == mb)][int_cols]
    int_ab = dfint_intersects(dfa, dfb)
    int_ba = dfint_intersects(dfb, dfa)

    # Add the set
    dfic['set'] = ''
    dfic.loc[int_ba[int_ba].index, 'set'] = 'both'
    dfic.loc[int_ba[~int_ba].index, 'set'] = mb
    dfic.loc[int_ab[~int_ab].index, 'set'] = ma
    dfic.loc[int_ab[int_ab].index, 'set'] = 'both'

    profile = profile_mapping[pattern]
    dfic[f'{profile}/profile_counts_max_ref_log'] = np.log10(1+dfic[f'{profile}/profile_counts_max_ref'])
    dfic['profile_score'] = dfic[f'{profile}/{score}']
    dfic['pair'] = "<>".join([ma, mb])
    dfic['score'] = 'score'
    return dfic[['pattern_name', 'set', 'profile_score', 'pair', 'score']]

dfr = pd.concat([extract_score(dfi, pattern, ma, mb, score='profile_counts_max_ref_log')
    for pattern in tqdm(main_motifs)
    for ma,mb in [
        ('nexus/profile', 'nexus/binary'),
        ('seq/profile', 'seq/binary'),
        ('nexus/profile.peaks-union', 'seq/profile.peaks-union'),
    ] if not (pattern == 'Klf4' and 'seq' in mb)])

100%|██████████| 4/4 [00:59<00:00, 13.79s/it]

dfr['pair'] =dfr.pair.str.replace("peaks-union", 'u') # For shorter plot labels

plotnine.options.figure_size = get_figsize(1, 1)
fig = (ggplot(aes(x='set', y='profile_score'), dfr) + 
       facet_grid("pattern_name ~ pair", scales='free_x') +
       geom_violin() +
       # geom_jitter(alpha=0.1) +
       # scale_y_log10() + 
       theme_classic() +
       plt9_tilt_xlab() + 
       ylab(f"Profile score")
      )
fig

<ggplot: (-9223363257119676370)>

plotnine.options.figure_size = get_figsize(1, 1)
fig = (ggplot(aes(x='set', y='profile_score'), 
              dfr.groupby(['pair', 'set', 'pattern_name']).profile_score.size().reset_index()) + 
       facet_grid("pattern_name ~ pair", scales='free_x') +
       geom_bar(stat='identity') +
       # geom_jitter(alpha=0.1) +
       # scale_y_log10() + 
       theme_classic() +
       plt9_tilt_xlab() + 
       ylab(f"Profile score")
      )
fig

<ggplot: (-9223363257126602748)>

Questions¶

[X] What are some properties of the intersected ones
- When we are comparing A and B, plot: A only, A and B, B only
- [X] make a boxplot for that particular example
- [X] Shall we explicitly plot some of the ranges?
[X] Generate the same plots for all motifs and all method pairs
- nexus: binary vs profile
- seq: binary vs profile
- seq vs nexus
[X] How much overlap do we see for different methods?
[ ] How to best display the overlaps?

dfi.head()

Periodicity¶

Single example¶

from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi

from basepair.modisco.periodicity import smooth

# dfi = pd.concat([pd.read_parquet(models_dir / exp / "deeplift/dfi_subset.parq", engine='fastparquet').assign(model=model_name)
#                  for model_name, exp in tqdm(models.items())])

100%|██████████| 8/8 [00:14<00:00,  1.64s/it]

# dict(dfi.groupby("model").size())

{'basset/binary': 227803,
 'factorized-basset/binary': 282214,
 'nexus/binary': 269379,
 'nexus/profile': 392571,
 'nexus/profile.peaks-union': 1281958,
 'seq/binary': 93983,
 'seq/profile': 169231,
 'seq/profile.peaks-union': 94067}

# create motif pairs
dfab = pd.concat([motif_pair_dfi(dfi[dfi.model == model], pair).assign(motif_pair="<>".join(pair)).assign(model=model) 
                  for pair in [['Nanog', 'Nanog']] for model in tqdm(models)],
                 axis=0)

motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']

motif_pair_name = 'Nanog<>Nanog'

pairs = get_motif_pairs(motifs)

Periodicity¶

from matplotlib import ticker

frac_10bp = dict()
for model in models:
    strand_comb = "++"
    fig, ax = plt.subplots(figsize = get_figsize(.25))
    center_diff = dfab[(dfab.strand_combination == strand_comb) & 
                       (dfab.model == model) &
                       (dfab.motif_pair == motif_pair_name)].center_diff
    counts, bin_edges = np.histogram(center_diff, bins=np.arange(150)+.5)
    smooth_part = smooth(counts, 10)
    plt.plot(bin_edges[1:], counts, label='raw')
    plt.plot(bin_edges[1:], smooth_part, label='smoothed')
    plt.legend()
    plt.title(f"{model} Nanog<>Nanog ({strand_comb})")
    plt.xlabel("Pairwise distance")
    plt.ylabel("Frequency");
    
    power = 0
    for strand_comb in ['++', "-+", '+-', "--"]:
        counts, bin_edges = np.histogram(dfab[(dfab.strand_combination == strand_comb) & 
                                              (dfab.model == model) &
                                              (dfab.motif_pair == motif_pair_name)].center_diff,
                 bins=np.arange(150)+.5)
        smooth_part = smooth(counts, 10)
        power += np.abs(np.fft.fft(counts-smooth_part)[:49])**2

    freq = np.fft.fftfreq(counts.shape[-1])
    power = power[3:49]
    x = 1 / freq[3:49]    

    fft_fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.5, 1/5))
    # Store the peak height vs others
    frac_10bp[model] = power[np.argmax(power)] / power.sum()
    ax.plot(x, power)
    ax.scatter(x, power, s=10)
    ax.set_xlim([0, 50])
    ax.xaxis.set_major_locator(ticker.MaxNLocator(10, integer=True))
    # ax.grid(alpha=0.3)
    t0_max = x[np.argmax(power)]
    ax.axvline(x=t0_max, color='grey', alpha=0.5)
    print("Max frequency:", t0_max)
    ax.set_xlabel("1/Frequency [bp]")
    ax.set_ylabel("Power spectrum");
    ax.set_title(model)
    # fft_fig.savefig(fdir / 'Nanog.p1.fft.pdf')
    # fft_fig.savefig(fdir / 'Nanog.p1.fft.png')

Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142

df = pd.Series(frac_10bp).reset_index()
df.columns = ['model', '10bp periodicity']

(ggplot(aes(x='model', y='10bp periodicity'), df) + 
 geom_bar(stat='identity') + 
 theme_classic() + 
 plt9_tilt_xlab()
)

<ggplot: (-9223363274861830926)>

	Klf4/profile_counts	Klf4/profile_counts_max_ref	Klf4/profile_counts_max_ref_p	Klf4/profile_counts_p	Klf4/profile_match	Klf4/profile_match_p	Klf4/profile_max	Klf4/profile_max_p	Nanog/profile_counts	Nanog/profile_counts_max_ref	Nanog/profile_counts_max_ref_p	Nanog/profile_counts_p	Nanog/profile_match	Nanog/profile_match_p	Nanog/profile_max	Nanog/profile_max_p	Oct4/profile_counts	Oct4/profile_counts_max_ref	Oct4/profile_counts_max_ref_p	Oct4/profile_counts_p	Oct4/profile_match	Oct4/profile_match_p	Oct4/profile_max	Oct4/profile_max_p	Sox2/profile_counts	Sox2/profile_counts_max_ref	Sox2/profile_counts_max_ref_p	Sox2/profile_counts_p	Sox2/profile_match	Sox2/profile_match_p	Sox2/profile_max	Sox2/profile_max_p	example_chrom	example_end	example_idx	example_interval_from_task	example_start	example_strand	exp	id	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	imp_max	imp_max_task	imp_weighted	imp_weighted_cat	imp_weighted_p	is_te	match/Klf4	match/Nanog	match/Oct4	match/Sox2	match_max	match_max_task	match_weighted	match_weighted_cat	match_weighted_p	model	pattern	pattern_center	pattern_center_aln	pattern_end	pattern_end_abs	pattern_len	pattern_name	pattern_short	pattern_start	pattern_start_abs	pattern_strand_aln	seq_match	seq_match_cat	seq_match_p	strand	tf	Chromosome	Start	End	Strand
490400	80.0001	2.0000e-06	0.4574	0.5237	4.2121	0.4750	4.0	0.5602	311.0001	6.0	0.5589	0.5737	2.0192	0.4506	8.0	0.3694	286.0001	8.0	0.9608	0.9689	2.0613	0.0365	12.0	0.9648	108.0001	13.0	0.8796	0.7984	3.3154	0.2165	13.0	0.9039	chr1	186923485	27512	Sox2	186922485	.	nexus,gw,OSNK,1,0,0,F...	101176	NaN	NaN	NaN	38.9884	38.9884	Sox2	38.9884	high	0.9986	False	NaN	NaN	NaN	0.2261	0.2261	Sox2	0.2261	medium	0.4913	nexus/binary	Sox2/metacluster_0/pa...	500	491	529	186923014	58	Sox2	Sox2/m0_p3	471	186922956	-	5.4838	low	0.2572	+	Sox2	chr1	186922956	186923014	186923014
496468	37.0001	1.0000e+00	0.4574	0.2355	5.3695	0.7686	3.0	0.4100	234.0001	1.0	0.2476	0.4628	1.8828	0.4087	7.0	0.3342	469.0001	10.0	0.9743	0.9878	1.5551	0.0203	25.0	0.9959	131.0001	12.0	0.8593	0.8660	2.6023	0.0920	10.0	0.8417	chr8	60346612	46172	Nanog	60345612	.	nexus,gw,OSNK,1,0,0,F...	107244	NaN	NaN	NaN	38.0438	38.0438	Sox2	38.0438	high	0.9971	False	NaN	NaN	NaN	0.2344	0.2344	Sox2	0.2344	medium	0.5405	nexus/binary	Sox2/metacluster_0/pa...	463	454	492	60346104	58	Sox2	Sox2/m0_p3	434	60346046	-	2.8692	low	0.0968	+	Sox2	chr8	60346046	60346104	60346104
503835	203.0001	1.1000e+01	0.9405	0.8484	2.5899	0.1854	7.0	0.8051	1022.0001	8.0	0.6292	0.9188	0.9810	0.1827	47.0	0.9161	679.0001	14.0	0.9878	0.9973	1.1521	0.0149	58.0	1.0000	309.0001	35.0	0.9865	0.9824	1.6059	0.0230	31.0	0.9919	chr1	180925386	93820	Klf4	180924386	.	nexus,gw,OSNK,1,0,0,F...	114611	NaN	NaN	NaN	36.5360	36.5360	Sox2	36.5360	high	0.9971	False	NaN	NaN	NaN	0.2018	0.2018	Sox2	0.2018	medium	0.3483	nexus/binary	Sox2/metacluster_0/pa...	545	554	574	180924960	58	Sox2	Sox2/m0_p3	516	180924902	+	5.1639	low	0.2428	-	Sox2	chr1	180924902	180924960	180924960
494666	155.0001	2.0000e+00	0.6184	0.7713	2.9060	0.2206	6.0	0.7659	540.0001	43.0	0.9120	0.7591	0.7866	0.1245	24.0	0.7591	125.0001	8.0	0.9608	0.8566	3.7845	0.1894	8.0	0.9337	56.0001	5.0	0.6130	0.4885	4.4660	0.5264	5.0	0.5819	chr1	53581962	39532	Nanog	53580962	.	nexus,gw,OSNK,1,0,0,F...	105442	NaN	NaN	NaN	35.7210	35.7210	Sox2	35.7210	high	0.9971	True	NaN	NaN	NaN	0.3273	0.3273	Sox2	0.3273	high	0.9003	nexus/binary	Sox2/metacluster_0/pa...	502	511	531	53581493	58	Sox2	Sox2/m0_p3	473	53581435	+	26.7954	high	0.8887	-	Sox2	chr1	53581435	53581493	53581493
505904	106.0001	2.0000e-06	0.4574	0.6441	3.7177	0.3748	8.0	0.8471	1358.0000	20.0	0.8024	0.9540	0.8744	0.1570	78.0	0.9635	104.0001	5.0	0.9202	0.8160	3.8483	0.2003	4.0	0.7551	78.0001	4.0	0.5304	0.6549	3.7633	0.3180	3.0	0.3099	chr9	53681518	110858	Klf4	53680518	.	nexus,gw,OSNK,1,0,0,F...	116680	NaN	NaN	NaN	33.8227	33.8227	Sox2	33.8227	high	0.9942	False	NaN	NaN	NaN	0.1980	0.1980	Sox2	0.1980	low	0.3121	nexus/binary	Sox2/metacluster_0/pa...	517	526	546	53681064	58	Sox2	Sox2/m0_p3	488	53681006	+	2.6344	low	0.0809	-	Sox2	chr9	53681006	53681064	53681064

	Klf4/profile_counts	Klf4/profile_counts_max_ref	Klf4/profile_counts_max_ref_p	Klf4/profile_counts_p	Klf4/profile_match	Klf4/profile_match_p	Klf4/profile_max	Klf4/profile_max_p	Nanog/profile_counts	Nanog/profile_counts_max_ref	Nanog/profile_counts_max_ref_p	Nanog/profile_counts_p	Nanog/profile_match	Nanog/profile_match_p	Nanog/profile_max	Nanog/profile_max_p	Oct4/profile_counts	Oct4/profile_counts_max_ref	Oct4/profile_counts_max_ref_p	Oct4/profile_counts_p	Oct4/profile_match	Oct4/profile_match_p	Oct4/profile_max	Oct4/profile_max_p	Sox2/profile_counts	Sox2/profile_counts_max_ref	Sox2/profile_counts_max_ref_p	Sox2/profile_counts_p	Sox2/profile_match	Sox2/profile_match_p	Sox2/profile_max	Sox2/profile_max_p	example_chrom	example_end	example_idx	example_interval_from_task	example_start	example_strand	id	imp/Klf4	imp/Nanog	imp/Oct4	imp/Sox2	imp_max	imp_max_task	imp_weighted	imp_weighted_cat	imp_weighted_p	is_te	match/Klf4	match/Nanog	match/Oct4	match/Sox2	match_max	match_max_task	match_weighted	match_weighted_cat	match_weighted_p	model	pattern	pattern_center	pattern_center_aln	pattern_end	pattern_end_abs	pattern_len	pattern_name	pattern_short	pattern_start	pattern_start_abs	pattern_strand_aln	seq_match	seq_match_cat	seq_match_p	strand	tf
456394	60.0001	2.0000e-06	0.6441	0.4668	4.9265	0.5724	5.0	0.7199	374.0001	1.0	0.4614	0.7334	3.0232	0.4533	17.0	0.7510	597.0001	12.0	0.9283	0.9499	1.7144	0.1299	25.0	0.9459	136.0001	1.0000e+00	0.7835	0.9188	3.5819	0.0961	8.0	0.8972	chr15	84169370	486	Oct4	84168370	.	95269	NaN	NaN	NaN	44.4014	44.4014	Sox2	44.4014	high	1.0	False	NaN	NaN	NaN	0.2133	0.2133	Sox2	0.2133	medium	0.4147	nexus/binary	Sox2/metacluster_0/pa...	499	490	528	84168898	58	Sox2	Sox2/m0_p3	470	84168840	-	1.6304	low	0.0361	+	Sox2
465477	122.0001	1.0000e+00	0.6441	0.7375	3.8495	0.3410	6.0	0.7984	1091.0000	6.0	0.8160	0.9350	0.7184	0.0880	33.0	0.8796	453.0001	1.0	0.3978	0.9242	1.2164	0.0595	12.0	0.8268	157.0001	2.0000e-06	0.7835	0.9337	3.2870	0.0690	11.0	0.9378	chr15	30383836	36484	Nanog	30382836	.	104352	NaN	NaN	NaN	44.1754	44.1754	Sox2	44.1754	high	1.0	False	NaN	NaN	NaN	0.1896	0.1896	Sox2	0.1896	low	0.2601	nexus/binary	Sox2/metacluster_0/pa...	449	440	478	30383314	58	Sox2	Sox2/m0_p3	420	30383256	-	5.7461	low	0.2746	+	Sox2
456721	249.0001	3.0000e+00	0.8403	0.9134	1.9564	0.0785	7.0	0.8457	828.0001	6.0	0.8160	0.8985	1.0016	0.1475	42.0	0.8972	226.0001	3.0	0.6712	0.7456	2.2944	0.2084	8.0	0.6739	80.0001	4.0000e+00	0.9540	0.8146	4.5998	0.2260	5.0	0.7889	chr4	14487813	1485	Oct4	14486813	.	95596	NaN	NaN	NaN	40.7515	40.7515	Sox2	40.7515	high	1.0	True	NaN	NaN	NaN	0.3323	0.3323	Sox2	0.3323	high	0.9075	nexus/binary	Sox2/metacluster_0/pa...	498	507	527	14487340	58	Sox2	Sox2/m0_p3	469	14487282	+	26.7264	high	0.8844	-	Sox2
476844	125.0001	2.0000e+00	0.7794	0.7470	3.8106	0.3369	9.0	0.9012	2247.0000	35.0	0.9756	0.9770	0.7873	0.1042	205.0	0.9946	837.0001	4.0	0.7483	0.9783	1.1528	0.0528	54.0	0.9946	399.0001	7.0000e+00	0.9824	0.9959	2.2875	0.0149	21.0	0.9892	chr5	131527458	101668	Klf4	131526458	.	115719	NaN	NaN	NaN	40.7138	40.7138	Sox2	40.7138	high	1.0	False	NaN	NaN	NaN	0.2026	0.2026	Sox2	0.2026	medium	0.3526	nexus/binary	Sox2/metacluster_0/pa...	492	483	521	131526979	58	Sox2	Sox2/m0_p3	463	131526921	-	5.4391	low	0.2572	+	Sox2
469702	65.0001	2.0000e+00	0.7794	0.4939	4.6181	0.4858	4.0	0.6022	121.0001	4.0	0.7280	0.4668	3.9832	0.5792	6.0	0.4899	758.0001	19.0	0.9702	0.9689	1.4775	0.0893	52.0	0.9932	256.0001	9.0000e+00	0.9878	0.9811	2.9985	0.0474	20.0	0.9865	chr18	31569412	52302	Nanog	31568412	.	108577	NaN	NaN	NaN	40.0392	40.0392	Sox2	40.0392	high	1.0	True	NaN	NaN	NaN	0.1972	0.1972	Sox2	0.1972	low	0.3078	nexus/binary	Sox2/metacluster_0/pa...	551	542	580	31568992	58	Sox2	Sox2/m0_p3	522	31568934	-	2.3431	low	0.0636	+	Sox2