import pandas as pd
import numpy as np
from plotnine import *
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42


# Load the per-experiment performance table (tab-separated values) and
# display its column names as the cell output.
experiments_performance_pd = pd.read_csv(
    "experiments_performance.tsv",
    delimiter="\t",
)
experiments_performance_pd.columns

Index(['entity:experiment_id', 'target', 'tissue_name', 'number_of_peaks',
       'auprc-run_id_3', 'auprc_wo_bias-run_id_3', 'auroc-run_id_3',
       'auroc_wo_bias-run_id_3', 'jsd-run_id_3', 'jsd_all_peaks-run_id_3',
       'jsd_all_peaks_wo_bias-run_id_3', 'jsd_wo_bias-run_id_3',
       'number_of_peaks_test_chroms_split0', 'pearson-run_id_3',
       'pearson_all_peaks-run_id_3', 'pearson_all_peaks_wo_bias-run_id_3',
       'pearson_with_control', 'pearson_with_control_all_peaks',
       'pearson_wo_bias-run_id_3', 'spearman-run_id_3',
       'spearman_all_peaks-run_id_3', 'spearman_all_peaks_wo_bias-run_id_3',
       'spearman_with_control', 'spearman_with_control_all_peaks',
       'spearman_wo_bias-run_id_3', 'protein_tag', 'sample_summary'],
      dtype='object')


# Write the pairwise Spearman correlation matrix of the metric columns to CSV.
# Only numeric columns are kept explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead, while
# older versions dropped them implicitly. select_dtypes() behaves the same on
# every pandas version.
(experiments_performance_pd
 .select_dtypes(include='number')
 .corr(method='spearman')
 .to_csv('model_performance_correlations.csv', sep=','))


# Empirical CDF of the 'pearson-run_id_3' column.
# The visible x range is restricted with coord_cartesian() instead of scale
# limits: scale limits turn out-of-range values into missing values *before*
# stat_ecdf is computed, so e.g. negative correlations would be dropped and
# the cumulative fractions would be computed on a censored sample.
(ggplot(experiments_performance_pd, aes('pearson-run_id_3'))
 + stat_ecdf(geom="line")
 + ylab("cdf")
 + labs(title="pearson scores")
 + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
 + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
 + coord_cartesian(xlim=(0, 1))
 + theme_classic())

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 54 rows containing non-finite values.

<ggplot: (-9223363297638964150)>


# Empirical CDF of the 'pearson_all_peaks-run_id_3' column.
# The x axis is zoomed with coord_cartesian() rather than scale limits so
# that any value outside [0, 1] (a Pearson correlation can be negative) is
# still counted by stat_ecdf instead of being discarded beforehand.
(ggplot(experiments_performance_pd, aes('pearson_all_peaks-run_id_3'))
 + stat_ecdf(geom="line")
 + ylab("cdf")
 + labs(title="pearson scores all regions")
 + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
 + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
 + coord_cartesian(xlim=(0, 1))
 + theme_classic())

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639094798)>


# Empirical CDF of the 'pearson_wo_bias-run_id_3' column.
# coord_cartesian() only zooms the view; using scale limits of (0, 1) here
# would remove every value outside that range (e.g. negative correlations)
# before stat_ecdf runs and so misstate the cumulative fractions.
(ggplot(experiments_performance_pd, aes('pearson_wo_bias-run_id_3'))
 + stat_ecdf(geom="line")
 + ylab("cdf")
 + labs(title="pearson scores without bias")
 + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
 + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
 + coord_cartesian(xlim=(0, 1))
 + theme_classic())

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 341 rows containing non-finite values.

<ggplot: (-9223363297639136354)>


# Empirical CDF of the 'pearson_all_peaks_wo_bias-run_id_3' column.
# coord_cartesian() only zooms the view; scale limits of (0, 1) would drop
# out-of-range values (e.g. negative correlations) before stat_ecdf is
# computed and distort the curve.
(ggplot(experiments_performance_pd, aes('pearson_all_peaks_wo_bias-run_id_3'))
 + stat_ecdf(geom="line")
 + ylab("cdf")
 + labs(title="pearson scores all regions without bias")
 + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
 + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
 + coord_cartesian(xlim=(0, 1))
 + theme_classic())

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 142 rows containing non-finite values.

<ggplot: (8739215560694)>


# Empirical CDF of the 'jsd-run_id_3' column, viewed on x in [0.25, 0.75].
# The window is applied with coord_cartesian() so that values outside it are
# still counted by stat_ecdf; scale limits would remove them first and the
# curve would no longer be the CDF of the full sample.
(ggplot(experiments_performance_pd, aes('jsd-run_id_3'))
 + stat_ecdf(geom="line")
 + ylab("cdf")
 + labs(title="jsd scores")
 + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
 + scale_x_continuous(breaks=(0.25, 0.4, 0.5, 0.6, 0.75))
 + coord_cartesian(xlim=(0.25, 0.75))
 + theme_classic())

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 52 rows containing non-finite values.

<ggplot: (-9223363297639258515)>


# Empirical CDF of the 'auprc-run_id_3' column on a [0, 1] x axis.
(
    ggplot(experiments_performance_pd, aes(x='auprc-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auprc", y="cdf")
    + scale_y_continuous(
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
        limits=(0, 1),
    )
    + scale_x_continuous(
        breaks=(0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
        limits=(0, 1),
    )
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639311381)>


# Empirical CDF of the 'auroc-run_id_3' column on a [0, 1] x axis.
(
    ggplot(experiments_performance_pd, aes(x='auroc-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auroc", y="cdf")
    + scale_y_continuous(
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
        limits=(0, 1),
    )
    + scale_x_continuous(
        breaks=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
        limits=(0, 1),
    )
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639355974)>


# Empirical CDF of the 'auprc_wo_bias-run_id_3' column on a [0, 1] x axis.
(
    ggplot(experiments_performance_pd, aes(x='auprc_wo_bias-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auprc_wo_bias", y="cdf")
    + scale_y_continuous(
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
        limits=(0, 1),
    )
    + scale_x_continuous(
        breaks=(0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
        limits=(0, 1),
    )
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639095885)>


# Empirical CDF of the 'auroc_wo_bias-run_id_3' column on a [0, 1] x axis.
(
    ggplot(experiments_performance_pd, aes(x='auroc_wo_bias-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auroc_wo_bias", y="cdf")
    + scale_y_continuous(
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
        limits=(0, 1),
    )
    + scale_x_continuous(
        breaks=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
        limits=(0, 1),
    )
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.

<ggplot: (8739215806654)>


from scipy.stats import *


# Scatter of 'jsd-run_id_3' against 'number_of_peaks' (log10 x axis), with the
# Spearman correlation of the two columns in the title.
# The coefficient is computed with Series.corr(), which excludes rows where
# either value is missing. Replacing NaNs with 0 (np.nan_to_num) would add
# artificial lowest-ranked points and bias the rank correlation, while the
# plot itself drops those rows. No log10 is applied for the statistic:
# Spearman is rank-based, so a monotonic transform does not change it.
(ggplot(experiments_performance_pd, aes('number_of_peaks', 'jsd-run_id_3'))
 + geom_point(alpha=0.2)
 + scale_x_log10()
 + lims(y=(0.2, 0.8))
 + theme_classic()
 + labs(title='jsd vs number of peaks. spearmanr=' + str(round(
     experiments_performance_pd['number_of_peaks'].corr(
         experiments_performance_pd['jsd-run_id_3'], method='spearman'), 3)))
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 47 rows containing missing values.

<ggplot: (8739215307938)>


# Scatter of 'pearson-run_id_3' against 'number_of_peaks' (log10 x axis), with
# the Spearman correlation of the two columns in the title.
# Series.corr() excludes rows where either value is missing; filling NaNs
# with 0 (np.nan_to_num) would inject artificial points and bias the rank
# correlation. Spearman is rank-based, so no log10 is needed for the statistic.
(ggplot(experiments_performance_pd, aes('number_of_peaks', 'pearson-run_id_3'))
 + geom_point(alpha=0.2)
 + scale_x_log10()
 + lims(y=(0, 1))
 + theme_classic()
 + labs(title='pearson vs number of peaks. spearmanr=' + str(round(
     experiments_performance_pd['number_of_peaks'].corr(
         experiments_performance_pd['pearson-run_id_3'], method='spearman'), 3)))
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 54 rows containing missing values.

<ggplot: (-9223363297639575587)>


# Scatter of 'pearson_wo_bias-run_id_3' against 'number_of_peaks' (log10 x
# axis), with the Spearman correlation of the two columns in the title.
# Series.corr() excludes rows where either value is missing; filling NaNs
# with 0 (np.nan_to_num) would inject artificial points and bias the rank
# correlation. Spearman is rank-based, so no log10 is needed for the statistic.
(ggplot(experiments_performance_pd, aes('number_of_peaks', 'pearson_wo_bias-run_id_3'))
 + geom_point(alpha=0.2)
 + scale_x_log10()
 + lims(y=(-1, 1))
 + theme_classic()
 + labs(title='pearson_wo_bias vs number of peaks. spearmanr=' + str(round(
     experiments_performance_pd['number_of_peaks'].corr(
         experiments_performance_pd['pearson_wo_bias-run_id_3'], method='spearman'), 3)))
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 43 rows containing missing values.

<ggplot: (8739215270369)>


# Scatter of 'jsd_wo_bias-run_id_3' against 'number_of_peaks' (log10 x axis),
# with the Spearman correlation of the two columns in the title.
# Series.corr() excludes rows where either value is missing; filling NaNs
# with 0 (np.nan_to_num) would inject artificial points and bias the rank
# correlation. Spearman is rank-based, so no log10 is needed for the statistic.
(ggplot(experiments_performance_pd, aes('number_of_peaks', 'jsd_wo_bias-run_id_3'))
 + geom_point(alpha=0.2)
 + scale_x_log10()
 + lims(y=(0.2, 0.8))
 + theme_classic()
 + labs(title='jsd_wo_bias vs number of peaks. spearmanr=' + str(round(
     experiments_performance_pd['number_of_peaks'].corr(
         experiments_performance_pd['jsd_wo_bias-run_id_3'], method='spearman'), 3)))
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 47 rows containing missing values.

<ggplot: (8739215066716)>


# Scatter of 'pearson_with_control' against 'pearson_wo_bias-run_id_3', with
# points filled by 'number_of_peaks' and the Spearman correlation of the two
# plotted columns in the title.
# Series.corr() excludes rows where either value is missing; filling NaNs
# with 0 (np.nan_to_num) would add artificial (0, 0)-style points and bias
# the coefficient, while the plot itself drops those rows.
(ggplot(experiments_performance_pd, aes('pearson_wo_bias-run_id_3', 'pearson_with_control'))
 + geom_point(aes(fill='number_of_peaks'), alpha=0.2)
 + lims(y=(-1, 1))
 + theme_classic()
 + labs(title='pearson_wo_bias vs pearson with control. spearmanr=' + str(round(
     experiments_performance_pd['pearson_wo_bias-run_id_3'].corr(
         experiments_performance_pd['pearson_with_control'], method='spearman'), 3)))
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 43 rows containing missing values.

<ggplot: (-9223363297639310192)>


# Scatter of 'pearson_with_control' against 'pearson-run_id_3', with points
# filled by 'number_of_peaks' and the Spearman correlation of the two plotted
# columns in the title.
# Series.corr() excludes rows where either value is missing; filling NaNs
# with 0 (np.nan_to_num) would add artificial (0, 0)-style points and bias
# the coefficient, while the plot itself drops those rows.
(ggplot(experiments_performance_pd, aes('pearson-run_id_3', 'pearson_with_control'))
 + geom_point(aes(fill='number_of_peaks'), alpha=0.2)
 + lims(y=(-1, 1))
 + theme_classic()
 + labs(title='pearson_ob_pred vs pearson with control. spearmanr=' + str(round(
     experiments_performance_pd['pearson-run_id_3'].corr(
         experiments_performance_pd['pearson_with_control'], method='spearman'), 3)))
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 43 rows containing missing values.

<ggplot: (8739211255837)>


# Subset of experiments with fewer than 10000 peaks, re-indexed from 0.
temp = (
    experiments_performance_pd
    .loc[lambda df: df['number_of_peaks'] < 10000]
    .reset_index(drop=True)
)


# Scatter of 'pearson_with_control' against 'pearson-run_id_3' for the `temp`
# subset, with the Spearman correlation of the two columns in the title.
# Series.corr() excludes rows where either value is missing; filling NaNs
# with 0 (np.nan_to_num) would add artificial (0, 0)-style points and bias
# the coefficient, while the plot itself drops those rows.
(ggplot(temp, aes('pearson-run_id_3', 'pearson_with_control'))
 + geom_point(alpha=0.2)
 + lims(y=(-1, 1))
 + theme_classic()
 + labs(title='pearson_ob_pred vs pearson with control. spearmanr=' + str(round(
     temp['pearson-run_id_3'].corr(
         temp['pearson_with_control'], method='spearman'), 3)))
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 11 rows containing missing values.

<ggplot: (-9223363297643540157)>


# Box plot of 'pearson_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='pearson_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(title="pearson scores peak only without bias", y="pearson")
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297643738673)>


# Box plot of 'auprc_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='auprc_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(title="auprc without bias", y="auprc")
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297643779447)>


# Box plot of 'auroc_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='auroc_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(title="auroc without bias", y="auroc")
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297643525134)>


# Box plot of 'pearson_all_peaks_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='pearson_all_peaks_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(title="pearson scores all regions without bias", y="pearson")
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639384369)>


# Box plot of 'auprc-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='auprc-run_id_3'),
    )
    + geom_boxplot()
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639283014)>


# Box plot of 'jsd-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='jsd-run_id_3'),
    )
    + geom_boxplot()
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639133857)>


# Box plot of 'pearson-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='pearson-run_id_3'),
    )
    + geom_boxplot()
    + theme_classic()
)

/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.

<ggplot: (-9223363297639095201)>

Model performance analysis for the second round of models¶

More than 2/3rds of models have pearson counts performance > 0.6¶

Model performance is even higher when calculated on both the peak and non peak regions¶

This performance drops a lot when no bias is provided during prediction¶

jsd is generally quite high¶

AUPRC is really good for most models, with more than 90% having AUPRC > 0.7¶

AUROC is also really good for most models, with more than 90% having AUROC > 0.9¶

Both AUPRC and AUROC drop when the bias is set to zero during prediction, as we saw with the Pearson scores¶

jsd and pearson both depend on the number of peaks in the experimental dataset¶

The Pearson score that we get without the bias depends even more strongly on the number of peaks in the experimental dataset¶

When the correlation between observed and control is high, we get low performance when bias is not provided during prediction¶

Looking at a subset of experiments with <10000 peaks: the correlation that we get between observed and predicted depends strongly on the correlation between WCE and ChIP¶