import pandas as pd
import numpy as np
from plotnine import *
import matplotlib
# Font type 42 is set for both the PDF and PostScript backends before any
# figure is drawn.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

# Load the per-experiment performance table (tab-separated).
experiments_performance_pd = pd.read_csv(
    "experiments_performance.tsv",
    sep="\t",
)

# Bare expression: shows the column index when run as a notebook cell.
experiments_performance_pd.columns
Index(['entity:experiment_id', 'target', 'tissue_name', 'number_of_peaks', 'auprc-run_id_3', 'auprc_wo_bias-run_id_3', 'auroc-run_id_3', 'auroc_wo_bias-run_id_3', 'jsd-run_id_3', 'jsd_all_peaks-run_id_3', 'jsd_all_peaks_wo_bias-run_id_3', 'jsd_wo_bias-run_id_3', 'number_of_peaks_test_chroms_split0', 'pearson-run_id_3', 'pearson_all_peaks-run_id_3', 'pearson_all_peaks_wo_bias-run_id_3', 'pearson_with_control', 'pearson_with_control_all_peaks', 'pearson_wo_bias-run_id_3', 'spearman-run_id_3', 'spearman_all_peaks-run_id_3', 'spearman_all_peaks_wo_bias-run_id_3', 'spearman_with_control', 'spearman_with_control_all_peaks', 'spearman_wo_bias-run_id_3', 'protein_tag', 'sample_summary'], dtype='object')
# Write the pairwise Spearman correlation matrix of the numeric columns to
# 'model_performance_correlations.csv'.
# The numeric/bool columns are selected explicitly instead of relying on
# DataFrame.corr to skip the others: the table also carries columns that are
# presumably text ('target', 'tissue_name', 'protein_tag', ...), and
# pandas >= 2.0 raises on non-numeric data rather than silently dropping it.
(
    experiments_performance_pd
    .select_dtypes(include=['number', 'bool'])
    .corr(method='spearman')
    .to_csv('model_performance_correlations.csv', sep=',')
)
# Empirical CDF of 'pearson-run_id_3' across experiments.
# The x window is applied with coord_cartesian instead of scale limits:
# limits on a scale discard out-of-range rows before stat_ecdf runs, so
# values outside [0, 1] (e.g. negative correlations) were excluded from the
# CDF. coord_cartesian only zooms the view; the CDF uses every finite value.
(
    ggplot(experiments_performance_pd, aes(x='pearson-run_id_3'))
    + stat_ecdf(geom="line")
    + ylab("cdf")
    + labs(title="pearson scores")
    + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
    + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
    + coord_cartesian(xlim=(0, 1))
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 54 rows containing non-finite values.
<ggplot: (-9223363297638964150)>
# Empirical CDF of 'pearson_all_peaks-run_id_3' across experiments.
# The x window is applied with coord_cartesian instead of scale limits:
# limits on a scale discard out-of-range rows before stat_ecdf runs, so any
# value outside [0, 1] (e.g. a negative correlation) would be silently left
# out of the CDF. coord_cartesian only zooms the view.
(
    ggplot(experiments_performance_pd, aes(x='pearson_all_peaks-run_id_3'))
    + stat_ecdf(geom="line")
    + ylab("cdf")
    + labs(title="pearson scores all regions")
    + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
    + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
    + coord_cartesian(xlim=(0, 1))
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639094798)>
# Empirical CDF of 'pearson_wo_bias-run_id_3' across experiments.
# The x window is applied with coord_cartesian instead of scale limits:
# limits on a scale discard out-of-range rows before stat_ecdf runs, so
# values outside [0, 1] (e.g. negative correlations) were excluded from the
# CDF. coord_cartesian only zooms the view; the CDF uses every finite value,
# so the curve may start above 0 at the left edge of the window.
(
    ggplot(experiments_performance_pd, aes(x='pearson_wo_bias-run_id_3'))
    + stat_ecdf(geom="line")
    + ylab("cdf")
    + labs(title="pearson scores without bias")
    + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
    + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
    + coord_cartesian(xlim=(0, 1))
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 341 rows containing non-finite values.
<ggplot: (-9223363297639136354)>
# Empirical CDF of 'pearson_all_peaks_wo_bias-run_id_3' across experiments.
# The x window is applied with coord_cartesian instead of scale limits:
# limits on a scale discard out-of-range rows before stat_ecdf runs, so
# values outside [0, 1] (e.g. negative correlations) were excluded from the
# CDF. coord_cartesian only zooms the view; the CDF uses every finite value,
# so the curve may start above 0 at the left edge of the window.
(
    ggplot(experiments_performance_pd, aes(x='pearson_all_peaks_wo_bias-run_id_3'))
    + stat_ecdf(geom="line")
    + ylab("cdf")
    + labs(title="pearson scores all regions without bias")
    + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
    + scale_x_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0))
    + coord_cartesian(xlim=(0, 1))
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 142 rows containing non-finite values.
<ggplot: (8739215560694)>
# Empirical CDF of 'jsd-run_id_3' across experiments, viewed on [0.25, 0.75].
# The x window is applied with coord_cartesian instead of scale limits:
# limits on a scale discard out-of-range rows before stat_ecdf runs, so
# values outside the window were excluded from the CDF. coord_cartesian only
# zooms the view; the CDF is computed from every finite value.
(
    ggplot(experiments_performance_pd, aes(x='jsd-run_id_3'))
    + stat_ecdf(geom="line")
    + ylab("cdf")
    + labs(title="jsd scores")
    + scale_y_continuous(breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0), limits=(0, 1))
    + scale_x_continuous(breaks=(0.25, 0.4, 0.5, 0.6, 0.75))
    + coord_cartesian(xlim=(0.25, 0.75))
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 52 rows containing non-finite values.
<ggplot: (-9223363297639258515)>
# Empirical CDF of 'auprc-run_id_3' across experiments.
(
    ggplot(experiments_performance_pd, aes(x='auprc-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auprc", y="cdf")
    + scale_x_continuous(
        limits=(0, 1),
        breaks=(0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    )
    + scale_y_continuous(
        limits=(0, 1),
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
    )
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639311381)>
# Empirical CDF of 'auroc-run_id_3' across experiments.
(
    ggplot(experiments_performance_pd, aes(x='auroc-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auroc", y="cdf")
    + scale_x_continuous(
        limits=(0, 1),
        breaks=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    )
    + scale_y_continuous(
        limits=(0, 1),
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
    )
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639355974)>
# Empirical CDF of 'auprc_wo_bias-run_id_3' across experiments.
(
    ggplot(experiments_performance_pd, aes(x='auprc_wo_bias-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auprc_wo_bias", y="cdf")
    + scale_x_continuous(
        limits=(0, 1),
        breaks=(0.25, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    )
    + scale_y_continuous(
        limits=(0, 1),
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
    )
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639095885)>
# Empirical CDF of 'auroc_wo_bias-run_id_3' across experiments.
(
    ggplot(experiments_performance_pd, aes(x='auroc_wo_bias-run_id_3'))
    + stat_ecdf(geom="line")
    + labs(title="auroc_wo_bias", y="cdf")
    + scale_x_continuous(
        limits=(0, 1),
        breaks=(0.5, 0.6, 0.7, 0.8, 0.9, 1.0),
    )
    + scale_y_continuous(
        limits=(0, 1),
        breaks=(0, 0.1, 0.25, 0.4, 0.5, 0.6, 0.75, 0.9, 1.0),
    )
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_ecdf : Removed 43 rows containing non-finite values.
<ggplot: (8739215806654)>
from scipy.stats import *
# Scatter of 'jsd-run_id_3' against 'number_of_peaks' (log10 x axis), with the
# Spearman correlation shown in the title.
# The correlation is computed only on rows where both columns are present.
# Filling NaN with 0 (np.nan_to_num) would count missing measurements as real
# zero-valued observations and bias the coefficient, while the plot itself
# drops those rows. Spearman is rank-based, so no log10 transform is needed.
pair_df = experiments_performance_pd[['number_of_peaks', 'jsd-run_id_3']].dropna()
rho = spearmanr(pair_df['number_of_peaks'], pair_df['jsd-run_id_3'])[0]
(ggplot(experiments_performance_pd,aes('number_of_peaks','jsd-run_id_3'))+
geom_point(alpha=0.2)+
scale_x_log10()+
lims(y=(0.2,0.8))+
theme_classic()+
labs(title='jsd vs number of peaks. spearmanr='+str(round(rho,3)))
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 47 rows containing missing values.
<ggplot: (8739215307938)>
# Scatter of 'pearson-run_id_3' against 'number_of_peaks' (log10 x axis), with
# the Spearman correlation shown in the title.
# The correlation is computed only on rows where both columns are present.
# Filling NaN with 0 (np.nan_to_num) would count missing measurements as real
# zero-valued observations and bias the coefficient, while the plot itself
# drops those rows. Spearman is rank-based, so no log10 transform is needed.
pair_df = experiments_performance_pd[['number_of_peaks', 'pearson-run_id_3']].dropna()
rho = spearmanr(pair_df['number_of_peaks'], pair_df['pearson-run_id_3'])[0]
(ggplot(experiments_performance_pd,aes('number_of_peaks','pearson-run_id_3'))+
geom_point(alpha=0.2)+
scale_x_log10()+
lims(y=(0,1))+
theme_classic()+
labs(title='pearson vs number of peaks. spearmanr='+str(round(rho,3)))
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 54 rows containing missing values.
<ggplot: (-9223363297639575587)>
# Scatter of 'pearson_wo_bias-run_id_3' against 'number_of_peaks' (log10 x
# axis), with the Spearman correlation shown in the title.
# The correlation is computed only on rows where both columns are present.
# Filling NaN with 0 (np.nan_to_num) would count missing measurements as real
# zero-valued observations and bias the coefficient, while the plot itself
# drops those rows. Spearman is rank-based, so no log10 transform is needed.
pair_df = experiments_performance_pd[['number_of_peaks', 'pearson_wo_bias-run_id_3']].dropna()
rho = spearmanr(pair_df['number_of_peaks'], pair_df['pearson_wo_bias-run_id_3'])[0]
(ggplot(experiments_performance_pd,aes('number_of_peaks','pearson_wo_bias-run_id_3'))+
geom_point(alpha=0.2)+
scale_x_log10()+
lims(y=(-1,1))+
theme_classic()+
labs(title='pearson_wo_bias vs number of peaks. spearmanr='+str(round(rho,3)))
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 43 rows containing missing values.
<ggplot: (8739215270369)>
# Scatter of 'jsd_wo_bias-run_id_3' against 'number_of_peaks' (log10 x axis),
# with the Spearman correlation shown in the title.
# The correlation is computed only on rows where both columns are present.
# Filling NaN with 0 (np.nan_to_num) would count missing measurements as real
# zero-valued observations and bias the coefficient, while the plot itself
# drops those rows. Spearman is rank-based, so no log10 transform is needed.
pair_df = experiments_performance_pd[['number_of_peaks', 'jsd_wo_bias-run_id_3']].dropna()
rho = spearmanr(pair_df['number_of_peaks'], pair_df['jsd_wo_bias-run_id_3'])[0]
(ggplot(experiments_performance_pd,aes('number_of_peaks','jsd_wo_bias-run_id_3'))+
geom_point(alpha=0.2)+
scale_x_log10()+
lims(y=(0.2,0.8))+
theme_classic()+
labs(title='jsd_wo_bias vs number of peaks. spearmanr='+str(round(rho,3)))
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 47 rows containing missing values.
<ggplot: (8739215066716)>
# Scatter of 'pearson_with_control' against 'pearson_wo_bias-run_id_3', points
# filled by 'number_of_peaks', with the Spearman correlation in the title.
# The correlation is computed only on rows where both columns are present.
# Filling NaN with 0 (np.nan_to_num) would count missing measurements as real
# zero-valued observations and bias the coefficient, while the plot itself
# drops those rows.
pair_df = experiments_performance_pd[['pearson_wo_bias-run_id_3', 'pearson_with_control']].dropna()
rho = spearmanr(pair_df['pearson_wo_bias-run_id_3'], pair_df['pearson_with_control'])[0]
(ggplot(experiments_performance_pd,aes('pearson_wo_bias-run_id_3','pearson_with_control'))+
geom_point(aes(fill='number_of_peaks'),alpha=0.2)+
lims(y=(-1,1))+
theme_classic()+
labs(title='pearson_wo_bias vs pearson with control. spearmanr='+str(round(rho,3)))
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 43 rows containing missing values.
<ggplot: (-9223363297639310192)>
# Scatter of 'pearson_with_control' against 'pearson-run_id_3', points filled
# by 'number_of_peaks', with the Spearman correlation in the title.
# The correlation is computed only on rows where both columns are present.
# Filling NaN with 0 (np.nan_to_num) would count missing measurements as real
# zero-valued observations and bias the coefficient, while the plot itself
# drops those rows.
pair_df = experiments_performance_pd[['pearson-run_id_3', 'pearson_with_control']].dropna()
rho = spearmanr(pair_df['pearson-run_id_3'], pair_df['pearson_with_control'])[0]
(ggplot(experiments_performance_pd,aes('pearson-run_id_3','pearson_with_control'))+
geom_point(aes(fill='number_of_peaks'),alpha=0.2)+
lims(y=(-1,1))+
theme_classic()+
labs(title='pearson_ob_pred vs pearson with control. spearmanr='+str(round(rho,3)))
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 43 rows containing missing values.
<ggplot: (8739211255837)>
# Same comparison as 'pearson-run_id_3' vs 'pearson_with_control', restricted
# to experiments with fewer than 10000 peaks.
temp=experiments_performance_pd[experiments_performance_pd['number_of_peaks']<10000].reset_index(drop=True)
# The correlation is computed only on rows where both columns are present.
# Filling NaN with 0 (np.nan_to_num) would count missing measurements as real
# zero-valued observations and bias the coefficient, while the plot itself
# drops those rows.
pair_df = temp[['pearson-run_id_3', 'pearson_with_control']].dropna()
rho = spearmanr(pair_df['pearson-run_id_3'], pair_df['pearson_with_control'])[0]
(ggplot(temp,aes('pearson-run_id_3','pearson_with_control'))+
geom_point(alpha=0.2)+
lims(y=(-1,1))+
theme_classic()+
labs(title='pearson_ob_pred vs pearson with control. spearmanr='+str(round(rho,3)))
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:401: PlotnineWarning: geom_point : Removed 11 rows containing missing values.
<ggplot: (-9223363297643540157)>
# Box plot of 'pearson_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='pearson_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(y="pearson", title="pearson scores peak only without bias")
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297643738673)>
# Box plot of 'auprc_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='auprc_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(y="auprc", title="auprc without bias")
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297643779447)>
# Box plot of 'auroc_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='auroc_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(y="auroc", title="auroc without bias")
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297643525134)>
# Box plot of 'pearson_all_peaks_wo_bias-run_id_3' grouped by 'protein_tag'.
(
    ggplot(
        experiments_performance_pd,
        aes(x='protein_tag', y='pearson_all_peaks_wo_bias-run_id_3'),
    )
    + geom_boxplot()
    + labs(y="pearson", title="pearson scores all regions without bias")
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639384369)>
# Box plot of 'auprc-run_id_3' grouped by 'protein_tag' (default axis labels).
(
    ggplot(experiments_performance_pd, aes(x='protein_tag', y='auprc-run_id_3'))
    + geom_boxplot()
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639283014)>
# Box plot of 'jsd-run_id_3' grouped by 'protein_tag' (default axis labels).
(
    ggplot(experiments_performance_pd, aes(x='protein_tag', y='jsd-run_id_3'))
    + geom_boxplot()
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639133857)>
# Box plot of 'pearson-run_id_3' grouped by 'protein_tag' (default axis labels).
(
    ggplot(experiments_performance_pd, aes(x='protein_tag', y='pearson-run_id_3'))
    + geom_boxplot()
    + theme_classic()
)
/users/vir/anaconda3/envs/basepairmodels_latest/lib/python3.7/site-packages/plotnine/layer.py:324: PlotnineWarning: stat_boxplot : Removed 43 rows containing non-finite values.
<ggplot: (-9223363297639095201)>