Goal

Compare different models:

Requirements

  • Run python generate_annotated_dfi.py

Compared models

  • ChIP-nexus:
    • profile
    • binary
  • ChIP-seq
    • profile
    • binary

Compared aspects

  • [X] Practicality
    • [X] training time plot
  • [X] Importance score quality (in 7.3-cluster-motifs-modisco.ipynb)
  • [X] Predictiveness of the bottleneck activation layer
    • [X] MPRA performance plot
    • [X] H3K27ac plot
  • [X] Instance labelling quality (periodicity plot)
    • [X] generate a function to generate fft @10bp periodicity from dfi_subset Nanog<>Nanog histogram
    • [X] finish generating dfi_subset for all motifs

TODO

  • [X] have a mapping from name -> exp id
  • [X] Finish all the snakemake jobs
    • [X] Get both the modisco.results.csv and model.results.csv
  • [X] Fix the dfi_subset issue
    • append whether the thing is a TE
  • [X] Use the augmented version of the seq-model
  • [ ] Add error-bars
In [3]:
# # old config
# models = {
#         'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
#         'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
#         'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
#         'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
#         'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
#         'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE'
#     }
In [1]:
models = {
    'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
    'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
    #'nexus/binary+profile': 'nexus,gw,OSNK,1,0.1,0.01,FALSE,same,0.5,64,25,0.001,9,FALSE',
    #'nexus/profile.gw': 'nexus,gw,OSNK,0,10,1,FALSE,same,0.5,64,25,0.001,9,FALSE',
    # 'nexus/profile.peaks.bias-corrected.augm': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE',
    # 'nexus/profile.peaks.non-bias-corrected': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2',
    'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
    'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
    #'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE',
    #'seq/profile.peaks.bias-corrected.augm': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
    
    # 'seq/profile.peaks.non-bias-corrected': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE',
    'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
    'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
#     'basset/binary': 'binary-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.6',
#     'factorized-basset/binary': 'factorized-basset,nexus,gw,OSNK,0.5,64,0.001,FALSE,0.5'
}
models_inv = {v:k for k,v in models.items()}
In [2]:
from basepair.imports import *
from basepair.exp.paper.config import *
from basepair.utils import flatten
from basepair.modisco.core import Pattern
from basepair.config import test_chr
from plotnine import *
import plotnine
from config import experiments
from basepair.utils import pd_first_cols, flatten
from basepair.exp.chipnexus.comparison import read_peakxus_dfi, read_chexmix_dfi, read_fimo_dfi, read_meme_motifs, read_transfac
from basepair.plot.utils import plt9_tilt_xlab
from basepair.exp.paper.config import models_dir
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
paper_config()

import sys
sys.path.insert(0, '..')
Using TensorFlow backend.
In [3]:
fdir = Path(f'{ddir}/figures/method-comparison/modisco')
fdir.mkdir(exist_ok=True)

Practicality

training time plot

In [4]:
exp = models['nexus/profile']
In [5]:
model_dir = models_dir / exp
In [6]:
df_train = pd.read_csv(models_dir / "model.results.finished.csv")
In [7]:
df_train = df_train[df_train.exp.isin(models.values())]
In [8]:
df_train['exp_name'] = df_train.exp.map(models_inv)
In [9]:
list(df_train.exp_name)
Out[9]:
['nexus/binary',
 'nexus/binary+profile',
 'nexus/profile.gw',
 'seq/binary',
 'nexus/profile',
 'seq/profile']
In [10]:
df_train[df_train.exp_name == 'nexus/binary+profile']
Out[10]:
Unnamed: 30 Unnamed: 31 Unnamed: 32 assay augment_interval batch_size best-epoch/Klf4/class_loss best-epoch/Klf4/counts_loss best-epoch/Klf4/profile_loss best-epoch/Nanog/class_loss best-epoch/Nanog/counts_loss best-epoch/Nanog/profile_loss best-epoch/Oct4/class_loss best-epoch/Oct4/counts_loss best-epoch/Oct4/profile_loss best-epoch/Sox2/class_loss best-epoch/Sox2/counts_loss best-epoch/Sox2/profile_loss best-epoch/epoch best-epoch/loss best-epoch/val_Klf4/class_loss best-epoch/val_Klf4/counts_loss best-epoch/val_Klf4/profile_loss best-epoch/val_Nanog/class_loss best-epoch/val_Nanog/counts_loss best-epoch/val_Nanog/profile_loss best-epoch/val_Oct4/class_loss best-epoch/val_Oct4/counts_loss best-epoch/val_Oct4/profile_loss best-epoch/val_Sox2/class_loss best-epoch/val_Sox2/counts_loss best-epoch/val_Sox2/profile_loss best-epoch/val_loss bias_pool binary_weight bn dataspec dropout exp filters gin_bindings gin_files imp_score lr merge_profile_reg modisco_tasks n_dil_layers note p_peak padding peak_width profile_weight r_modisco r_wandb region regression_weight run_modisco seed seq_width stats/train/h:m:s stats/train/io_in stats/train/io_out stats/train/max_pss stats/train/max_rss stats/train/max_uss stats/train/max_vms stats/train/mean_load stats/train/s tconv_kernel_size tfs train train-peaks/Klf4/counts/mad train-peaks/Klf4/counts/mse train-peaks/Klf4/counts/pearsonr train-peaks/Klf4/counts/spearmanr train-peaks/Klf4/counts/var_explained train-peaks/Klf4/profile/binsize=1/auprc train-peaks/Klf4/profile/binsize=1/frac_ambigous train-peaks/Klf4/profile/binsize=1/imbalance train-peaks/Klf4/profile/binsize=1/n_positives train-peaks/Klf4/profile/binsize=1/random_auprc train-peaks/Klf4/profile/binsize=10/auprc train-peaks/Klf4/profile/binsize=10/frac_ambigous train-peaks/Klf4/profile/binsize=10/imbalance train-peaks/Klf4/profile/binsize=10/n_positives train-peaks/Klf4/profile/binsize=10/random_auprc train-peaks/Nanog/counts/mad train-peaks/Nanog/counts/mse train-peaks/Nanog/counts/pearsonr train-peaks/Nanog/counts/spearmanr train-peaks/Nanog/counts/var_explained train-peaks/Nanog/profile/binsize=1/auprc train-peaks/Nanog/profile/binsize=1/frac_ambigous train-peaks/Nanog/profile/binsize=1/imbalance train-peaks/Nanog/profile/binsize=1/n_positives train-peaks/Nanog/profile/binsize=1/random_auprc train-peaks/Nanog/profile/binsize=10/auprc train-peaks/Nanog/profile/binsize=10/frac_ambigous train-peaks/Nanog/profile/binsize=10/imbalance train-peaks/Nanog/profile/binsize=10/n_positives train-peaks/Nanog/profile/binsize=10/random_auprc train-peaks/Oct4/counts/mad train-peaks/Oct4/counts/mse train-peaks/Oct4/counts/pearsonr train-peaks/Oct4/counts/spearmanr train-peaks/Oct4/counts/var_explained train-peaks/Oct4/profile/binsize=1/auprc train-peaks/Oct4/profile/binsize=1/frac_ambigous train-peaks/Oct4/profile/binsize=1/imbalance train-peaks/Oct4/profile/binsize=1/n_positives train-peaks/Oct4/profile/binsize=1/random_auprc train-peaks/Oct4/profile/binsize=10/auprc train-peaks/Oct4/profile/binsize=10/frac_ambigous train-peaks/Oct4/profile/binsize=10/imbalance train-peaks/Oct4/profile/binsize=10/n_positives train-peaks/Oct4/profile/binsize=10/random_auprc train-peaks/Sox2/counts/mad train-peaks/Sox2/counts/mse train-peaks/Sox2/counts/pearsonr train-peaks/Sox2/counts/spearmanr train-peaks/Sox2/counts/var_explained train-peaks/Sox2/profile/binsize=1/auprc train-peaks/Sox2/profile/binsize=1/frac_ambigous train-peaks/Sox2/profile/binsize=1/imbalance train-peaks/Sox2/profile/binsize=1/n_positives train-peaks/Sox2/profile/binsize=1/random_auprc train-peaks/Sox2/profile/binsize=10/auprc train-peaks/Sox2/profile/binsize=10/frac_ambigous train-peaks/Sox2/profile/binsize=10/imbalance train-peaks/Sox2/profile/binsize=10/n_positives train-peaks/Sox2/profile/binsize=10/random_auprc train-peaks/avg/counts/mad train-peaks/avg/counts/mse train-peaks/avg/counts/pearsonr train-peaks/avg/counts/spearmanr train-peaks/avg/counts/var_explained train-peaks/avg/profile/binsize=1/auprc train-peaks/avg/profile/binsize=1/frac_ambigous train-peaks/avg/profile/binsize=1/imbalance train-peaks/avg/profile/binsize=1/n_positives train-peaks/avg/profile/binsize=1/random_auprc train-peaks/avg/profile/binsize=10/auprc train-peaks/avg/profile/binsize=10/frac_ambigous train-peaks/avg/profile/binsize=10/imbalance train-peaks/avg/profile/binsize=10/n_positives train-peaks/avg/profile/binsize=10/random_auprc use_bias valid-genome-wide/Klf4/class/accuracy valid-genome-wide/Klf4/class/auPR valid-genome-wide/Klf4/class/auROC valid-genome-wide/Klf4/class/frac_positive valid-genome-wide/Klf4/class/n_negative valid-genome-wide/Klf4/class/n_positive valid-genome-wide/Nanog/class/accuracy valid-genome-wide/Nanog/class/auPR valid-genome-wide/Nanog/class/auROC valid-genome-wide/Nanog/class/frac_positive valid-genome-wide/Nanog/class/n_negative valid-genome-wide/Nanog/class/n_positive valid-genome-wide/Oct4/class/accuracy valid-genome-wide/Oct4/class/auPR valid-genome-wide/Oct4/class/auROC valid-genome-wide/Oct4/class/frac_positive valid-genome-wide/Oct4/class/n_negative valid-genome-wide/Oct4/class/n_positive valid-genome-wide/Sox2/class/accuracy valid-genome-wide/Sox2/class/auPR valid-genome-wide/Sox2/class/auROC valid-genome-wide/Sox2/class/frac_positive valid-genome-wide/Sox2/class/n_negative valid-genome-wide/Sox2/class/n_positive valid-genome-wide/avg/class/accuracy valid-genome-wide/avg/class/auPR valid-genome-wide/avg/class/auROC valid-genome-wide/avg/class/frac_positive valid-genome-wide/avg/class/n_negative valid-genome-wide/avg/class/n_positive valid-peaks/Klf4/counts/mad valid-peaks/Klf4/counts/mse valid-peaks/Klf4/counts/pearsonr valid-peaks/Klf4/counts/spearmanr valid-peaks/Klf4/counts/var_explained valid-peaks/Klf4/profile/binsize=1/auprc valid-peaks/Klf4/profile/binsize=1/frac_ambigous valid-peaks/Klf4/profile/binsize=1/imbalance valid-peaks/Klf4/profile/binsize=1/n_positives valid-peaks/Klf4/profile/binsize=1/random_auprc valid-peaks/Klf4/profile/binsize=10/auprc valid-peaks/Klf4/profile/binsize=10/frac_ambigous valid-peaks/Klf4/profile/binsize=10/imbalance valid-peaks/Klf4/profile/binsize=10/n_positives valid-peaks/Klf4/profile/binsize=10/random_auprc valid-peaks/Nanog/counts/mad valid-peaks/Nanog/counts/mse valid-peaks/Nanog/counts/pearsonr valid-peaks/Nanog/counts/spearmanr valid-peaks/Nanog/counts/var_explained valid-peaks/Nanog/profile/binsize=1/auprc valid-peaks/Nanog/profile/binsize=1/frac_ambigous valid-peaks/Nanog/profile/binsize=1/imbalance valid-peaks/Nanog/profile/binsize=1/n_positives valid-peaks/Nanog/profile/binsize=1/random_auprc valid-peaks/Nanog/profile/binsize=10/auprc valid-peaks/Nanog/profile/binsize=10/frac_ambigous valid-peaks/Nanog/profile/binsize=10/imbalance valid-peaks/Nanog/profile/binsize=10/n_positives valid-peaks/Nanog/profile/binsize=10/random_auprc valid-peaks/Oct4/counts/mad valid-peaks/Oct4/counts/mse valid-peaks/Oct4/counts/pearsonr valid-peaks/Oct4/counts/spearmanr valid-peaks/Oct4/counts/var_explained valid-peaks/Oct4/profile/binsize=1/auprc valid-peaks/Oct4/profile/binsize=1/frac_ambigous valid-peaks/Oct4/profile/binsize=1/imbalance valid-peaks/Oct4/profile/binsize=1/n_positives valid-peaks/Oct4/profile/binsize=1/random_auprc valid-peaks/Oct4/profile/binsize=10/auprc valid-peaks/Oct4/profile/binsize=10/frac_ambigous valid-peaks/Oct4/profile/binsize=10/imbalance valid-peaks/Oct4/profile/binsize=10/n_positives valid-peaks/Oct4/profile/binsize=10/random_auprc valid-peaks/Sox2/counts/mad valid-peaks/Sox2/counts/mse valid-peaks/Sox2/counts/pearsonr valid-peaks/Sox2/counts/spearmanr valid-peaks/Sox2/counts/var_explained valid-peaks/Sox2/profile/binsize=1/auprc valid-peaks/Sox2/profile/binsize=1/frac_ambigous valid-peaks/Sox2/profile/binsize=1/imbalance valid-peaks/Sox2/profile/binsize=1/n_positives valid-peaks/Sox2/profile/binsize=1/random_auprc valid-peaks/Sox2/profile/binsize=10/auprc valid-peaks/Sox2/profile/binsize=10/frac_ambigous valid-peaks/Sox2/profile/binsize=10/imbalance valid-peaks/Sox2/profile/binsize=10/n_positives valid-peaks/Sox2/profile/binsize=10/random_auprc valid-peaks/avg/counts/mad valid-peaks/avg/counts/mse valid-peaks/avg/counts/pearsonr valid-peaks/avg/counts/spearmanr valid-peaks/avg/counts/var_explained valid-peaks/avg/profile/binsize=1/auprc valid-peaks/avg/profile/binsize=1/frac_ambigous valid-peaks/avg/profile/binsize=1/imbalance valid-peaks/avg/profile/binsize=1/n_positives valid-peaks/avg/profile/binsize=1/random_auprc valid-peaks/avg/profile/binsize=10/auprc valid-peaks/avg/profile/binsize=10/frac_ambigous valid-peaks/avg/profile/binsize=10/imbalance valid-peaks/avg/profile/binsize=10/n_positives valid-peaks/avg/profile/binsize=10/random_auprc exp_name
56 NaN NaN NaN nexus False 128.0 0.2289 0.3573 544.6508 0.2288 0.3993 560.2367 0.193 0.2877 529.9826 0.0648 0.232 314.7775 10.0 20.3396 0.0896 0.3899 263.6964 0.1001 0.4628 306.3487 0.0445 0.3847 343.7712 0.01 0.3131 228.5908 11.8234 NaN 1.0 False ChIP-nexus.dataspec.yml NaN nexus,gw,OSNK,1,0.1,0... 64.0 b_loss_weight=1;c_los... problem-gw.gin,joint-... profile/wn 0.001 False NaN 9.0 binary + profile 0.5 same 1000 0.01 NaN wandb gw 0.1 False None 1000 1 day, 5:06:41 4398.99 333.52 14842.11 112137.71 7159.79 757713.31 0.0 104801.1711 25.0 OSNK True 0.4205 0.2918 0.7915 0.786 0.6165 0.1768 0.064 0.0026 189381.0 0.0026 0.5558 0.3183 0.0273 147401.0 0.0278 0.4115 0.2843 0.8327 0.7879 0.682 0.509 0.058 0.0054 409298.0 0.0057 0.781 0.2409 0.0398 245161.0 0.0406 0.2929 0.1514 0.8128 0.7909 0.6459 0.2171 0.0725 0.0029 154097.0 0.0029 0.5505 0.3639 0.0336 120478.0 0.0318 0.2629 0.1218 0.7796 0.722 0.6021 0.4434 0.0703 0.0063 55119.0 0.0063 0.8218 0.3199 0.0551 35175.0 0.0534 0.3469 0.2123 0.8041 0.7717 0.6366 0.3366 0.0662 0.0043 201973.75 0.0044 0.6773 0.3108 0.0389 137053.75 0.0384 False 0.9681 0.4768 0.9636 0.0126 9847187.0 125883.0 0.9631 0.2687 0.9427 0.0089 9884476.0 88594.0 0.9876 0.2564 0.9384 0.0059 9914050.0 59020.0 0.9972 0.2273 0.9447 0.0019 9954155.0 18915.0 0.979 0.3073 0.9474 0.0073 9899967.0 73103.0 0.6115 0.6334 0.5912 0.597 0.2593 0.1281 0.0636 0.0025 59593.0 0.0025 0.4717 0.3164 0.027 46678.0 0.0265 0.645 0.7514 0.571 0.5638 0.2581 0.4015 0.0581 0.0052 130301.0 0.0052 0.6922 0.243 0.0391 78059.0 0.0374 0.4488 0.3721 0.552 0.5236 0.2009 0.1544 0.0719 0.0029 48721.0 0.0028 0.4687 0.3615 0.0331 38842.0 0.0323 0.3781 0.2742 0.4927 0.4554 0.1687 0.323 0.0719 0.0058 17464.0 0.0058 0.7223 0.3297 0.0533 11634.0 0.0502 0.5209 0.5078 0.5517 0.535 0.2218 0.2518 0.0664 0.0041 64019.75 0.0041 0.5887 0.3126 0.0381 43803.25 0.0366 nexus/binary+profile
In [11]:
tf_colors
Out[11]:
{'Klf4': '#357C42',
 'Oct4': '#9F1D20',
 'Sox2': '#3A3C97',
 'Nanog': '#9F8A31',
 'Esrrb': '#30BDC4'}
In [12]:
gw_exp = ['nexus/binary', 'nexus/binary+profile']
dfp = df_train[df_train.exp_name.isin(gw_exp)][[f'valid-genome-wide/{task}/class/auPR' for task in tasks] + ['exp_name']].melt(id_vars='exp_name')
dfp['TF'] = dfp.variable.str.split("/", expand=True)[1]

dfp['TF'] = pd.Categorical(dfp['TF'], tasks)
In [13]:
dfp.groupby("exp_name").value.mean()
Out[13]:
exp_name
nexus/binary            0.2495
nexus/binary+profile    0.3073
Name: value, dtype: float64
In [16]:
colors = {"nexus/binary": "#EE8030",
          "nexus/profile": "#498AAD",
          "nexus/profile.gw": "#586BA4",
          "nexus/binary+profile": "#4E598C"
         }
In [17]:
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='TF', fill='exp_name', y='value'), dfp) + 
       geom_bar(stat='identity', position='dodge') + 
       theme_classic(base_size=10, base_family='Arial') +  \
       scale_fill_manual([colors[x] for x in gw_exp]) + \
       ylab("auPRC") 
      )
fig.save(fdir / 'auPRC.binary-vs-profile+binary.bar.pdf')
fig
Out[17]:
<ggplot: (8770059255602)>

TODO

  • [ ] show all profile plots?
In [18]:
dft = pd.DataFrame([{"training_time": pd.read_csv(f"{models_dir}/{exp}/train.smk-benchmark.tsv", sep='\t').iloc[0]['s'] / 60,
              "Model": model_name,
              "exp": exp}
             for model_name, exp in models.items()])
In [19]:
plt_exp = ['nexus/binary', 'nexus/profile']
plotnine.options.figure_size = get_figsize(.18)
fig = (ggplot(aes(x='Model', fill='Model', y='training_time'), dft[dft.Model.isin(plt_exp)]) + 
 geom_bar(stat='identity') + 
 scale_fill_manual([colors[x] for x in plt_exp]) + \
 theme_classic(base_size=10, base_family='Arial') +  \
 xlab("Model") + 
 ylab("Training time (min)") +
 plt9_tilt_xlab(30)
)
fig.save(fdir / 'training-time.bar.pdf')
fig
Out[19]:
<ggplot: (8770059135012)>

Predictiveness of the bottleneck activation layer

MPRA performance plot

In [89]:
from MPRA.config import model_exps

df_mpra = pd.read_csv('../MPRA/output/MPRA.csv')
model_exps_inf = {v:k for k,v in model_exps.items()}
df_mpra['model'] = df_mpra.exp.map(model_exps_inf)
df_mpra.exp = df_mpra.exp.str.rstrip("/")
df_mpra['exp_name'] = df_mpra.exp.map(models_inv)
df_mpra = df_mpra[~df_mpra.exp_name.isnull()]
In [114]:
# with pd.option_context("display.max_colwidth", 100):
#     print(df_mpra.query('data=="genomic"')[['model', 'metrics/genomic/spearmanr']].
#           sort_values("metrics/genomic/spearmanr").
#           to_string())
In [115]:
df_mpra_subset = df_mpra[df_mpra.exp_name.isin(colors)]
df_mpra_subset['exp_name'] = pd.Categorical(df_mpra_subset.exp_name, categories=list(colors))
In [116]:
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/synthetic/spearmanr'), df_mpra_subset.query('data=="synthetic"')) + 
 geom_bar(stat='identity') + 
 theme_classic(base_size=10, base_family='Arial') +  \
 scale_fill_manual(list(colors.values())) + \
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'MPRA.synthetic.bar.pdf')
fig
Out[116]:
<ggplot: (8761990578906)>
In [117]:
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/genomic/spearmanr'), df_mpra_subset.query('data=="genomic"')) + 
 geom_bar(stat='identity') + 
 theme_classic(base_size=10, base_family='Arial') +  \
 scale_fill_manual(list(colors.values())) + \
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'MPRA.genomic.bar.pdf')
fig
Out[117]:
<ggplot: (-9223363274802961236)>

H3K27ac plot

In [118]:
dfa = pd.read_csv(models_dir / 'activity.csv')

dfa.exp = dfa.exp.str.rstrip("/")
dfa['exp_name'] = dfa.exp.map(models_inv)

dfa = dfa[~dfa.exp_name.isnull()].query("split=='test'")
In [119]:
dfa = dfa[dfa.exp_name.isin(colors)]
dfa['exp_name'] = pd.Categorical(dfa.exp_name, categories=list(colors))
In [120]:
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/H3K27ac/spearmanr'), 
        dfa[(dfa['model_kwargs/pool_type'] == 'global_avg') & 
            (dfa['model_kwargs/bn'].isnull()) &
            ~dfa.exp_name.str.contains("peaks-union")]) + 
 geom_bar(stat='identity', position='dodge') + 
 theme_classic(base_size=10, base_family='Arial') +  \
 scale_fill_manual(list(colors.values())) + \
 ggtitle("H3K27ac") + 
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'H3K27ac.bar.pdf')
fig
Out[120]:
<ggplot: (8762051822386)>
In [121]:
plotnine.options.figure_size = get_figsize(.25)
fig = (ggplot(aes(x='exp_name', fill='exp_name', y='metrics/PolII/spearmanr'), 
        dfa[(dfa['model_kwargs/pool_type'] == 'global_avg') & 
            (dfa['model_kwargs/bn'].isnull()) &
            ~dfa.exp_name.str.contains("peaks-union")]) + 
 ggtitle("Pol II") + 
 geom_bar(stat='identity', position='dodge') + 
 scale_fill_manual(list(colors.values())) + \
 theme_classic(base_size=10, base_family='Arial') +  \
 ylab("Spearman corr") + 
 plt9_tilt_xlab(30))
fig.save(fdir / 'PolII.bar.pdf')
fig
Out[121]:
<ggplot: (-9223363274802907858)>

Compare the high-quality peak recall

In [7]:
models
Out[7]:
{'nexus/binary': 'nexus,gw,OSNK,1,0,0,FALSE,same,0.5,64,25,0.001,9,FALSE',
 'nexus/profile': 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE',
 'seq/profile': 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE',
 'seq/binary': 'seq,gw,OSN,1,0,0,FALSE,same,0.5,64,50,0.001,9,FALSE',
 'nexus/profile.peaks-union': 'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0',
 'seq/profile.peaks-union': 'seq,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE,[1,50],TRUE,TRUE,0'}
In [4]:
main_motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']  # NOTE this is a duplication from before
In [13]:
dfi = pd.read_parquet(models_dir / 'dfi_subset.annotated.v3.parq', engine='fastparquet')
dfi = dfi[dfi.example_chrom.isin(test_chr)]  # look at only the test chromosomes
dfi = dfi[dfi.pattern_name.isin(main_motifs)]
In [14]:
# Remove duplicates
dfi = dfi.sort_values("imp_weighted", ascending=False)
duplicated = dfi[['model', 'pattern_name', 'example_chrom', 'pattern_start_abs', 'pattern_end_abs', 'strand']].duplicated()
dfi = dfi[~duplicated]

# setup coordinates
dfi['Chromosome'] = dfi['example_chrom']
dfi['Start'] = dfi['pattern_start_abs']
dfi['End'] = dfi['pattern_end_abs']
dfi['Strand'] = dfi['pattern_end_abs']
In [15]:
dfi.head()
Out[15]:
Klf4/profile_counts Klf4/profile_counts_max_ref Klf4/profile_counts_max_ref_p Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_max_ref Nanog/profile_counts_max_ref_p Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p Nanog/profile_max Nanog/profile_max_p Oct4/profile_counts Oct4/profile_counts_max_ref Oct4/profile_counts_max_ref_p Oct4/profile_counts_p Oct4/profile_match Oct4/profile_match_p Oct4/profile_max Oct4/profile_max_p Sox2/profile_counts Sox2/profile_counts_max_ref Sox2/profile_counts_max_ref_p Sox2/profile_counts_p Sox2/profile_match Sox2/profile_match_p Sox2/profile_max Sox2/profile_max_p example_chrom example_end example_idx example_interval_from_task example_start example_strand exp id imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 imp_max imp_max_task imp_weighted imp_weighted_cat imp_weighted_p is_te match/Klf4 match/Nanog match/Oct4 match/Sox2 match_max match_max_task match_weighted match_weighted_cat match_weighted_p model pattern pattern_center pattern_center_aln pattern_end pattern_end_abs pattern_len pattern_name pattern_short pattern_start pattern_start_abs pattern_strand_aln seq_match seq_match_cat seq_match_p strand tf Chromosome Start End Strand
490400 80.0001 2.0000e-06 0.4574 0.5237 4.2121 0.4750 4.0 0.5602 311.0001 6.0 0.5589 0.5737 2.0192 0.4506 8.0 0.3694 286.0001 8.0 0.9608 0.9689 2.0613 0.0365 12.0 0.9648 108.0001 13.0 0.8796 0.7984 3.3154 0.2165 13.0 0.9039 chr1 186923485 27512 Sox2 186922485 . nexus,gw,OSNK,1,0,0,F... 101176 NaN NaN NaN 38.9884 38.9884 Sox2 38.9884 high 0.9986 False NaN NaN NaN 0.2261 0.2261 Sox2 0.2261 medium 0.4913 nexus/binary Sox2/metacluster_0/pa... 500 491 529 186923014 58 Sox2 Sox2/m0_p3 471 186922956 - 5.4838 low 0.2572 + Sox2 chr1 186922956 186923014 186923014
496468 37.0001 1.0000e+00 0.4574 0.2355 5.3695 0.7686 3.0 0.4100 234.0001 1.0 0.2476 0.4628 1.8828 0.4087 7.0 0.3342 469.0001 10.0 0.9743 0.9878 1.5551 0.0203 25.0 0.9959 131.0001 12.0 0.8593 0.8660 2.6023 0.0920 10.0 0.8417 chr8 60346612 46172 Nanog 60345612 . nexus,gw,OSNK,1,0,0,F... 107244 NaN NaN NaN 38.0438 38.0438 Sox2 38.0438 high 0.9971 False NaN NaN NaN 0.2344 0.2344 Sox2 0.2344 medium 0.5405 nexus/binary Sox2/metacluster_0/pa... 463 454 492 60346104 58 Sox2 Sox2/m0_p3 434 60346046 - 2.8692 low 0.0968 + Sox2 chr8 60346046 60346104 60346104
503835 203.0001 1.1000e+01 0.9405 0.8484 2.5899 0.1854 7.0 0.8051 1022.0001 8.0 0.6292 0.9188 0.9810 0.1827 47.0 0.9161 679.0001 14.0 0.9878 0.9973 1.1521 0.0149 58.0 1.0000 309.0001 35.0 0.9865 0.9824 1.6059 0.0230 31.0 0.9919 chr1 180925386 93820 Klf4 180924386 . nexus,gw,OSNK,1,0,0,F... 114611 NaN NaN NaN 36.5360 36.5360 Sox2 36.5360 high 0.9971 False NaN NaN NaN 0.2018 0.2018 Sox2 0.2018 medium 0.3483 nexus/binary Sox2/metacluster_0/pa... 545 554 574 180924960 58 Sox2 Sox2/m0_p3 516 180924902 + 5.1639 low 0.2428 - Sox2 chr1 180924902 180924960 180924960
494666 155.0001 2.0000e+00 0.6184 0.7713 2.9060 0.2206 6.0 0.7659 540.0001 43.0 0.9120 0.7591 0.7866 0.1245 24.0 0.7591 125.0001 8.0 0.9608 0.8566 3.7845 0.1894 8.0 0.9337 56.0001 5.0 0.6130 0.4885 4.4660 0.5264 5.0 0.5819 chr1 53581962 39532 Nanog 53580962 . nexus,gw,OSNK,1,0,0,F... 105442 NaN NaN NaN 35.7210 35.7210 Sox2 35.7210 high 0.9971 True NaN NaN NaN 0.3273 0.3273 Sox2 0.3273 high 0.9003 nexus/binary Sox2/metacluster_0/pa... 502 511 531 53581493 58 Sox2 Sox2/m0_p3 473 53581435 + 26.7954 high 0.8887 - Sox2 chr1 53581435 53581493 53581493
505904 106.0001 2.0000e-06 0.4574 0.6441 3.7177 0.3748 8.0 0.8471 1358.0000 20.0 0.8024 0.9540 0.8744 0.1570 78.0 0.9635 104.0001 5.0 0.9202 0.8160 3.8483 0.2003 4.0 0.7551 78.0001 4.0 0.5304 0.6549 3.7633 0.3180 3.0 0.3099 chr9 53681518 110858 Klf4 53680518 . nexus,gw,OSNK,1,0,0,F... 116680 NaN NaN NaN 33.8227 33.8227 Sox2 33.8227 high 0.9942 False NaN NaN NaN 0.1980 0.1980 Sox2 0.1980 low 0.3121 nexus/binary Sox2/metacluster_0/pa... 517 526 546 53681064 58 Sox2 Sox2/m0_p3 488 53681006 + 2.6344 low 0.0809 - Sox2 chr9 53681006 53681064 53681064
In [16]:
import pyranges as pr
In [17]:
pr.__version__
Out[17]:
'0.0.51'
In [75]:
dfi_pr = pr.PyRanges(dfi)
In [77]:
model = 'nexus/binary'
In [83]:
dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])
In [84]:
dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)
In [85]:
len(dfi_pr_s_nonoverlapping)
Out[85]:
3648
In [86]:
len(dfi_pr_s)
Out[86]:
12063
In [87]:
model = 'seq/profile'
In [88]:
dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])
In [89]:
dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)
In [90]:
len(dfi_pr_s_nonoverlapping)
Out[90]:
1186
In [91]:
len(dfi_pr_s)
Out[91]:
9576
In [92]:
model = 'nexus/profile'
In [93]:
dfi_pr_os = pr.PyRanges(dfi[((dfi.pattern_name == 'Oct4-Sox2') & (dfi.model == model ))])
dfi_pr_s = pr.PyRanges(dfi[((dfi.pattern_name == 'Sox2') & (dfi.model == model ))])
In [94]:
dfi_pr_s_nonoverlapping = dfi_pr_s.overlap(dfi_pr_os, strandedness=None)
In [95]:
len(dfi_pr_s_nonoverlapping)
Out[95]:
866
In [96]:
len(dfi_pr_s)
Out[96]:
22688
In [55]:
len(dfi_pr_s_nonoverlapping)
In [119]:
dfi_test = dfi[dfi.example_chrom.isin(test_chr)]
In [133]:
print(dfi.groupby(['pattern_name', 'model']).size().to_string())
pattern_name  model                    
Klf4          nexus/binary                  8074
              nexus/profile                11715
Nanog         nexus/binary                 10089
              nexus/profile                 9653
              nexus/profile.peaks-union     5297
              seq/binary                    3816
              seq/profile                  11743
              seq/profile.peaks-union       7025
Oct4-Sox2     nexus/binary                  4324
              nexus/profile                 4947
              nexus/profile.peaks-union     2260
              seq/binary                    2148
              seq/profile                   2394
              seq/profile.peaks-union       2516
Sox2          nexus/binary                  2123
              nexus/profile                 4358
              nexus/profile.peaks-union     2119
              seq/binary                    1796
              seq/profile                   1822
              seq/profile.peaks-union       1862
In [18]:
def get_profile_score(dfi, score, profile_cls):
    df = dfi[(~dfi[score].isnull())]
    return df.sort_values(score, ascending=False)[profile_cls]

motif_profile_counts = [(motif, 
                         {model: get_profile_score(dfi[(dfi.pattern_name == motif) &
                                                       (dfi.model == model) & 
                                                       (dfi.example_chrom.isin(test_chr))],
                                                   'imp_weighted', 
                                                   profile_mapping[motif]+ "/profile_counts_max_ref").values
                         for model in models})
                        for motif in main_motifs]
In [19]:
# Profile scores
model = 'nexus/profile'
thresholds = dict()
fig, axes = plt.subplots(len(main_motifs),1,figsize=get_figsize(0.25, 1), sharex=True)
for i, motif in enumerate(main_motifs):
    ax = axes[i]
    scores = motif_profile_counts[i][1][model]
    thresholds[motif] = np.percentile(scores, 90)
    #if len(scores) == 0:
    #    continue
    ax.hist(scores, bins=np.arange(30))
    ax.axvline(thresholds[motif], color='grey')
    ax.set_ylabel(motif)
    if i == len(main_motifs)-1:
        ax.set_xlabel("Profile score (max value)");
    # fig.savefig(fdir / 'profile-score.hist.pdf')
In [29]:
fig, axes = plt.subplots(1, len(main_motifs), figsize=get_figsize(1, 1/(1*len(main_motifs))), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 2000
    for k,v in res.items():
        if k in ['nexus/profile', 'nexus/binary']:
            cutoff = thresholds[motif]
            ax.plot(np.cumsum(v[:top_n] > cutoff), label=k)

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='lower right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
sns.despine(top=True, right=True)    
fig.savefig(fdir / 'profile-recall.nexus.profile-vs-binary.pdf')
In [31]:
fig, axes = plt.subplots(1, len(main_motifs)-1, figsize=get_figsize(0.75, 1/(1*len(main_motifs)-1)), 
                         sharex=True, 
                         sharey=True, gridspec_kw=dict(wspace=0, hspace=0))
# fig, axes = plt.subplots(1, len(main_motifs)-1, figsize=get_figsize(.75, 1/(1*len(main_motifs)-1)), 
#                          sharex=True, 
#                          sharey=True, gridspec_kw=dict(wspace=0, hspace=0))

fig.subplots_adjust(wspace=.5)#, wspace=0)
max_val = 0
for i, (motif, res) in enumerate(motif_profile_counts[:-1]):
    ax = axes[i]
    # top_n = min([len(x) for x in res.values()])
    top_n = 2000
    for k,v in res.items():
        if k in ['nexus/profile.peaks-union', 'seq/profile', 'seq/profile.peaks-union', 'seq/binary']:
            cutoff = thresholds[motif]
            ax.plot(np.cumsum(v[:top_n] > cutoff), label=k.replace("peaks-union", 'u'))

    max_val = max(max_val, ax.get_ylim()[1])
    if i == 0:
        ax.set_ylabel("# motif instances\nwith high coverage")
    if i == 1:
        ax.set_xlabel("Top # of motif instances")
    if i == 0:
        leg = ax.legend(loc='bottom right', framealpha=1, borderpad=0)
        leg.get_frame().set_linewidth(0.0)
    ax.set_title(motif)

for i, (motif, res) in enumerate(motif_profile_counts[:-1]):
    axes[i].plot([0, max_val], [0, max_val], '--', color='grey', label=None)
sns.despine(top=True, right=True)   
fig.savefig(fdir / 'profile-recall.nexus-vs-seq.pdf')

TODO

  • [ ] setup nicer color for the comparison

TODO

  • visually inspect the Sox2 instances
In [315]:
model = 'nexus/profile.peaks-union'
pattern = 'Sox2'
In [316]:
exp = models[model]
In [317]:
exp
Out[317]:
'nexus,nexus-seq-union,OSN,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE,[1,50],TRUE,TRUE,0'
In [16]:
isf = ImpScoreFile(models_dir / exp / 'deeplift.imp_score.h5', default_imp_score='profile/wn')
In [17]:
isf.cache()
Out[17]:
<basepair.cli.imp_score.ImpScoreFile at 0x7f241adae908>
In [214]:
thresholds['Sox2']
Out[214]:
1.0000019073486328
In [ ]:
threshold = 1
In [339]:
dfi_ranges = dfi[(dfi.model == model) & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)
In [318]:
mr = ModiscoResult(f"{models_dir}/{exp}/deeplift/Sox2/out/profile/wn/modisco.h5")
In [ ]:
dfi2 = pd.read_parquet(f"{models_dir}/{exp}/deeplift/dfi_subset.parq")
In [ ]:
dfi2.head()
In [347]:
ls {models_dir}/{exp}/deeplift
dfi_subset.parq  Nanog/  Oct4/  Sox2/
In [332]:
subset = isf.get_ranges()['interval_from_task'] == "Sox2"
In [333]:
pname = longer_pattern(experiments[models[model]]['motifs'][pattern].split("/")[1])
In [334]:
# Question: why is the 
In [335]:
seqlets = []
for s in mr._get_seqlets(pname, trim_frac=0.08):
    s.start = s.start - 30
    s.end = s.end + 30
    seqlets.append(s)
In [338]:
# TODO - the issue is with determining the reference sequence
In [336]:
isf.include_samples = subset
ess_seqlets = isf.extract(seqlets, )
isf.include_samples = None
In [337]:
ess_seqlets.aggregate().plot();
In [291]:
p = ess.aggregate()
 #pos = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[0], 0]
# neg = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[1], 1]
pos = ess.profile['Sox2'].max(axis=1)[:, 0]
neg = ess.profile['Sox2'].max(axis=1)[:, 1]
threshold = np.percentile((pos+neg)/2, 90)
print("threshold: ", threshold)
# plt.plot(pos)
# plt.plot(neg);
# plt.title("Sox2")
plt.scatter((pos+neg)/2, dfi_ranges['Sox2/profile_max'], s=2, alpha=0.5)

plt.figure()
plt.plot((pos+neg)/2)
plt.axhline(y=threshold, color='gray')
threshold:  5.5
Out[291]:
<matplotlib.lines.Line2D at 0x7f23d6327c88>
In [292]:
recall_nexus = np.cumsum((pos+neg)/2 > threshold)
In [293]:
dfi_ranges = dfi[(dfi.model == 'seq/profile.peaks-union') & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)
In [294]:
p = ess.aggregate()
# pos = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[0], 0]
# neg = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0)[1], 1]
pos = ess.profile['Sox2'].max(axis=1)[:, 0]
neg = ess.profile['Sox2'].max(axis=1)[:, 1]
plt.scatter((pos+neg)/2, dfi_ranges['Sox2/profile_max'], s=2, alpha=0.5)
plt.figure()
plt.plot((pos+neg)/2)
plt.axhline(y=threshold, color='gray')
Out[294]:
<matplotlib.lines.Line2D at 0x7f23d6386208>
In [295]:
recall_seq = np.cumsum((pos+neg)/2 > threshold)
In [296]:
plt.plot(recall_seq, label='seq', alpha=0.5);
plt.plot(recall_nexus, label='nexus', alpha=0.5);
plt.plot([0, 600], [0, 600], color='gray')
plt.legend()
Out[296]:
<matplotlib.legend.Legend at 0x7f23d63662b0>
In [19]:
ess.plot("profile");
In [20]:
ess.plot("seq");
In [55]:
dfi_ranges = dfi[(dfi.model == 'seq/profile.peaks-union') & (dfi.pattern_name == pattern)].copy()
dfi_ranges['pattern_start'] = dfi_ranges['pattern_start'] - 30
dfi_ranges['pattern_end'] = dfi_ranges['pattern_end'] + 30
ess = isf.extract_dfi(dfi_ranges)
In [56]:
p = ess.aggregate()
pl = ess.profile['Sox2'][:, p.profile['Sox2'].argmax(axis=0), [0, 1]]
plt.plot(pl[:, 0])
plt.plot(pl[:, 1]);
In [33]:
from basepair.modisco.core import Pattern
In [34]:
ess.aggregate().plot(rotate_y=0);
In [22]:
ess.plot("profile");
In [23]:
ess.plot("seq");

TODO

  • [X] Intersect motif instances
    • how much overlap do we see between instances?
  • ? why is the recall so different for Sox2 motifs?
    • generate stacked_seqlets

Analyzing instersecting / non-intersecting motifs

In [45]:
from basepair.preproc import dfint_intersects, dfint_no_intersection, dfint_overlap_idx

int_cols = ['example_chrom', 'pattern_start_abs', 'pattern_end_abs']
In [129]:
list(models)
Out[129]:
['nexus/profile',
 'nexus/binary',
 'seq/profile',
 'seq/binary',
 'nexus/profile.peaks-union',
 'seq/profile.peaks-union']
In [125]:
# Example
pattern = 'Oct4-Sox2'
ma = 'nexus/profile'
mb = 'nexus/binary'
In [146]:
def extract_score(dfi, pattern, ma, mb, score='profile_counts_max_ref_log'):
    """Overlap different motif instances
    """
    dfic = dfi[(dfi.pattern_name == pattern) & (dfi.model.isin([ma, mb]))]
    dfa = dfic[(dfi.model == ma)][int_cols]
    dfb = dfic[(dfi.model == mb)][int_cols]
    int_ab = dfint_intersects(dfa, dfb)
    int_ba = dfint_intersects(dfb, dfa)

    # Add the set
    dfic['set'] = ''
    dfic.loc[int_ba[int_ba].index, 'set'] = 'both'
    dfic.loc[int_ba[~int_ba].index, 'set'] = mb
    dfic.loc[int_ab[~int_ab].index, 'set'] = ma
    dfic.loc[int_ab[int_ab].index, 'set'] = 'both'

    profile = profile_mapping[pattern]
    dfic[f'{profile}/profile_counts_max_ref_log'] = np.log10(1+dfic[f'{profile}/profile_counts_max_ref'])
    dfic['profile_score'] = dfic[f'{profile}/{score}']
    dfic['pair'] = "<>".join([ma, mb])
    dfic['score'] = 'score'
    return dfic[['pattern_name', 'set', 'profile_score', 'pair', 'score']]
In [147]:
dfr = pd.concat([extract_score(dfi, pattern, ma, mb, score='profile_counts_max_ref_log')
    for pattern in tqdm(main_motifs)
    for ma,mb in [
        ('nexus/profile', 'nexus/binary'),
        ('seq/profile', 'seq/binary'),
        ('nexus/profile.peaks-union', 'seq/profile.peaks-union'),
    ] if not (pattern == 'Klf4' and 'seq' in mb)])
100%|██████████| 4/4 [00:59<00:00, 13.79s/it]
In [163]:
dfr['pair'] =dfr.pair.str.replace("peaks-union", 'u') # For shorter plot labels
In [165]:
plotnine.options.figure_size = get_figsize(1, 1)
fig = (ggplot(aes(x='set', y='profile_score'), dfr) + 
       facet_grid("pattern_name ~ pair", scales='free_x') +
       geom_violin() +
       # geom_jitter(alpha=0.1) +
       # scale_y_log10() + 
       theme_classic() +
       plt9_tilt_xlab() + 
       ylab(f"Profile score")
      )
fig
Out[165]:
<ggplot: (-9223363257119676370)>
In [166]:
plotnine.options.figure_size = get_figsize(1, 1)
fig = (ggplot(aes(x='set', y='profile_score'), 
              dfr.groupby(['pair', 'set', 'pattern_name']).profile_score.size().reset_index()) + 
       facet_grid("pattern_name ~ pair", scales='free_x') +
       geom_bar(stat='identity') +
       # geom_jitter(alpha=0.1) +
       # scale_y_log10() + 
       theme_classic() +
       plt9_tilt_xlab() + 
       ylab(f"Profile score")
      )
fig
Out[166]:
<ggplot: (-9223363257126602748)>

Questions

  • [X] What are some properties of the intersected ones
    • When we are comparing A and B, plot: A only, A and B, B only
    • [X] make a boxplot for that particular example
    • [X] Shall we explicitly plot some of the ranges?
  • [X] Generate the same plots for all motifs and all method pairs
    • nexus: binary vs profile
    • seq: binary vs profile
    • seq vs nexus
  • [X] How much overlap do we see for different methods?
  • [ ] How to best display the overlaps?
In [41]:
dfi.head()
Out[41]:
Klf4/profile_counts Klf4/profile_counts_max_ref Klf4/profile_counts_max_ref_p Klf4/profile_counts_p Klf4/profile_match Klf4/profile_match_p Klf4/profile_max Klf4/profile_max_p Nanog/profile_counts Nanog/profile_counts_max_ref Nanog/profile_counts_max_ref_p Nanog/profile_counts_p Nanog/profile_match Nanog/profile_match_p Nanog/profile_max Nanog/profile_max_p Oct4/profile_counts Oct4/profile_counts_max_ref Oct4/profile_counts_max_ref_p Oct4/profile_counts_p Oct4/profile_match Oct4/profile_match_p Oct4/profile_max Oct4/profile_max_p Sox2/profile_counts Sox2/profile_counts_max_ref Sox2/profile_counts_max_ref_p Sox2/profile_counts_p Sox2/profile_match Sox2/profile_match_p Sox2/profile_max Sox2/profile_max_p example_chrom example_end example_idx example_interval_from_task example_start example_strand id imp/Klf4 imp/Nanog imp/Oct4 imp/Sox2 imp_max imp_max_task imp_weighted imp_weighted_cat imp_weighted_p is_te match/Klf4 match/Nanog match/Oct4 match/Sox2 match_max match_max_task match_weighted match_weighted_cat match_weighted_p model pattern pattern_center pattern_center_aln pattern_end pattern_end_abs pattern_len pattern_name pattern_short pattern_start pattern_start_abs pattern_strand_aln seq_match seq_match_cat seq_match_p strand tf
456394 60.0001 2.0000e-06 0.6441 0.4668 4.9265 0.5724 5.0 0.7199 374.0001 1.0 0.4614 0.7334 3.0232 0.4533 17.0 0.7510 597.0001 12.0 0.9283 0.9499 1.7144 0.1299 25.0 0.9459 136.0001 1.0000e+00 0.7835 0.9188 3.5819 0.0961 8.0 0.8972 chr15 84169370 486 Oct4 84168370 . 95269 NaN NaN NaN 44.4014 44.4014 Sox2 44.4014 high 1.0 False NaN NaN NaN 0.2133 0.2133 Sox2 0.2133 medium 0.4147 nexus/binary Sox2/metacluster_0/pa... 499 490 528 84168898 58 Sox2 Sox2/m0_p3 470 84168840 - 1.6304 low 0.0361 + Sox2
465477 122.0001 1.0000e+00 0.6441 0.7375 3.8495 0.3410 6.0 0.7984 1091.0000 6.0 0.8160 0.9350 0.7184 0.0880 33.0 0.8796 453.0001 1.0 0.3978 0.9242 1.2164 0.0595 12.0 0.8268 157.0001 2.0000e-06 0.7835 0.9337 3.2870 0.0690 11.0 0.9378 chr15 30383836 36484 Nanog 30382836 . 104352 NaN NaN NaN 44.1754 44.1754 Sox2 44.1754 high 1.0 False NaN NaN NaN 0.1896 0.1896 Sox2 0.1896 low 0.2601 nexus/binary Sox2/metacluster_0/pa... 449 440 478 30383314 58 Sox2 Sox2/m0_p3 420 30383256 - 5.7461 low 0.2746 + Sox2
456721 249.0001 3.0000e+00 0.8403 0.9134 1.9564 0.0785 7.0 0.8457 828.0001 6.0 0.8160 0.8985 1.0016 0.1475 42.0 0.8972 226.0001 3.0 0.6712 0.7456 2.2944 0.2084 8.0 0.6739 80.0001 4.0000e+00 0.9540 0.8146 4.5998 0.2260 5.0 0.7889 chr4 14487813 1485 Oct4 14486813 . 95596 NaN NaN NaN 40.7515 40.7515 Sox2 40.7515 high 1.0 True NaN NaN NaN 0.3323 0.3323 Sox2 0.3323 high 0.9075 nexus/binary Sox2/metacluster_0/pa... 498 507 527 14487340 58 Sox2 Sox2/m0_p3 469 14487282 + 26.7264 high 0.8844 - Sox2
476844 125.0001 2.0000e+00 0.7794 0.7470 3.8106 0.3369 9.0 0.9012 2247.0000 35.0 0.9756 0.9770 0.7873 0.1042 205.0 0.9946 837.0001 4.0 0.7483 0.9783 1.1528 0.0528 54.0 0.9946 399.0001 7.0000e+00 0.9824 0.9959 2.2875 0.0149 21.0 0.9892 chr5 131527458 101668 Klf4 131526458 . 115719 NaN NaN NaN 40.7138 40.7138 Sox2 40.7138 high 1.0 False NaN NaN NaN 0.2026 0.2026 Sox2 0.2026 medium 0.3526 nexus/binary Sox2/metacluster_0/pa... 492 483 521 131526979 58 Sox2 Sox2/m0_p3 463 131526921 - 5.4391 low 0.2572 + Sox2
469702 65.0001 2.0000e+00 0.7794 0.4939 4.6181 0.4858 4.0 0.6022 121.0001 4.0 0.7280 0.4668 3.9832 0.5792 6.0 0.4899 758.0001 19.0 0.9702 0.9689 1.4775 0.0893 52.0 0.9932 256.0001 9.0000e+00 0.9878 0.9811 2.9985 0.0474 20.0 0.9865 chr18 31569412 52302 Nanog 31568412 . 108577 NaN NaN NaN 40.0392 40.0392 Sox2 40.0392 high 1.0 True NaN NaN NaN 0.1972 0.1972 Sox2 0.1972 low 0.3078 nexus/binary Sox2/metacluster_0/pa... 551 542 580 31568992 58 Sox2 Sox2/m0_p3 522 31568934 - 2.3431 low 0.0636 + Sox2

Periodicity

Single example

In [49]:
from basepair.exp.chipnexus.spacing import remove_edge_instances, get_motif_pairs, motif_pair_dfi

from basepair.modisco.periodicity import smooth
In [43]:
# dfi = pd.concat([pd.read_parquet(models_dir / exp / "deeplift/dfi_subset.parq", engine='fastparquet').assign(model=model_name)
#                  for model_name, exp in tqdm(models.items())])
100%|██████████| 8/8 [00:14<00:00,  1.64s/it]
In [45]:
# dict(dfi.groupby("model").size())
Out[45]:
{'basset/binary': 227803,
 'factorized-basset/binary': 282214,
 'nexus/binary': 269379,
 'nexus/profile': 392571,
 'nexus/profile.peaks-union': 1281958,
 'seq/binary': 93983,
 'seq/profile': 169231,
 'seq/profile.peaks-union': 94067}
In [50]:
# create motif pairs
dfab = pd.concat([motif_pair_dfi(dfi[dfi.model == model], pair).assign(motif_pair="<>".join(pair)).assign(model=model) 
                  for pair in [['Nanog', 'Nanog']] for model in tqdm(models)],
                 axis=0)
In [51]:
motifs = ['Oct4-Sox2', 'Sox2', 'Nanog', 'Klf4']

motif_pair_name = 'Nanog<>Nanog'

pairs = get_motif_pairs(motifs)

Periodicity

In [56]:
from matplotlib import ticker
In [57]:
frac_10bp = dict()
for model in models:
    strand_comb = "++"
    fig, ax = plt.subplots(figsize = get_figsize(.25))
    center_diff = dfab[(dfab.strand_combination == strand_comb) & 
                       (dfab.model == model) &
                       (dfab.motif_pair == motif_pair_name)].center_diff
    counts, bin_edges = np.histogram(center_diff, bins=np.arange(150)+.5)
    smooth_part = smooth(counts, 10)
    plt.plot(bin_edges[1:], counts, label='raw')
    plt.plot(bin_edges[1:], smooth_part, label='smoothed')
    plt.legend()
    plt.title(f"{model} Nanog<>Nanog ({strand_comb})")
    plt.xlabel("Pairwise distance")
    plt.ylabel("Frequency");
    
    power = 0
    for strand_comb in ['++', "-+", '+-', "--"]:
        counts, bin_edges = np.histogram(dfab[(dfab.strand_combination == strand_comb) & 
                                              (dfab.model == model) &
                                              (dfab.motif_pair == motif_pair_name)].center_diff,
                 bins=np.arange(150)+.5)
        smooth_part = smooth(counts, 10)
        power += np.abs(np.fft.fft(counts-smooth_part)[:49])**2

    freq = np.fft.fftfreq(counts.shape[-1])
    power = power[3:49]
    x = 1 / freq[3:49]    

    fft_fig, ax = plt.subplots(1, 1, figsize=get_figsize(0.5, 1/5))
    # Store the peak height vs others
    frac_10bp[model] = power[np.argmax(power)] / power.sum()
    ax.plot(x, power)
    ax.scatter(x, power, s=10)
    ax.set_xlim([0, 50])
    ax.xaxis.set_major_locator(ticker.MaxNLocator(10, integer=True))
    # ax.grid(alpha=0.3)
    t0_max = x[np.argmax(power)]
    ax.axvline(x=t0_max, color='grey', alpha=0.5)
    print("Max frequency:", t0_max)
    ax.set_xlabel("1/Frequency [bp]")
    ax.set_ylabel("Power spectrum");
    ax.set_title(model)
    # fft_fig.savefig(fdir / 'Nanog.p1.fft.pdf')
    # fft_fig.savefig(fdir / 'Nanog.p1.fft.png')
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
Max frequency: 10.642857142857142
In [58]:
df = pd.Series(frac_10bp).reset_index()
df.columns = ['model', '10bp periodicity']

(ggplot(aes(x='model', y='10bp periodicity'), df) + 
 geom_bar(stat='identity') + 
 theme_classic() + 
 plt9_tilt_xlab()
)
Out[58]:
<ggplot: (-9223363274861830926)>