Goal

  • make the paper figures for the hyper-parameters

Tasks

  • [x] gather the experiment table

TODO

  • [x] Put the weighting into context
  • [~] Make all the plots and assemble them together in Illustrator
  • [ ] Use the same y and x axis span for all the hyper-parameter plots (except the multi-TF plots)
In [1]:
# Imports
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from basepair.imports import *
from basepair.exp.paper.config import tf_colors
from basepair.functions import mean
from basepair.cli.imp_score import ImpScoreFile
Using TensorFlow backend.
In [2]:
paper_config()
In [3]:
fig_nexus_hp = Path(f"{ddir}/figures/model-evaluation/ChIP-nexus/hyper-parameters")
fig_seq_hp = Path(f"{ddir}/figures/model-evaluation/ChIP-seq/hyper-parameters")
In [4]:
osn_tfs = ['Oct4', 'Sox2', 'Nanog']
osnk_tfs = ['Oct4', 'Sox2', 'Nanog', 'Klf4']
In [5]:
!mkdir -p {fig_seq_hp}
!mkdir -p {fig_nexus_hp}
In [6]:
df = pd.read_csv("output/model.results.finished.csv")
In [7]:
df.set_index('exp', inplace=True)
In [8]:
# Aggregate the per-TF validation profile losses into a single column.
# Single-task runs report NaN for the TFs they were not trained on, so treat
# missing values as 0 and sum whatever tasks each experiment actually has.
# (Replaces the previous chained `df[col][mask] += ...` assignment, which
# triggers SettingWithCopyWarning and is not guaranteed to write back.)
df['best-epoch/val_profile_loss'] = sum(
    df[f'best-epoch/val_{tf}/profile_loss'].fillna(0) for tf in osnk_tfs
)
In [9]:
# Aggregate the per-TF validation counts losses into a single column
# (comment previously said "profile loss" — copy-paste leftover).
# NaN for absent TFs is treated as 0, mirroring the profile-loss aggregation.
# (Replaces the previous chained `df[col][mask] += ...` assignment, which
# triggers SettingWithCopyWarning and is not guaranteed to write back.)
df['best-epoch/val_counts_loss'] = sum(
    df[f'best-epoch/val_{tf}/counts_loss'].fillna(0) for tf in osnk_tfs
)
In [10]:
len(df)
Out[10]:
14
In [11]:
nexus_metric_profile = 'valid-peaks/avg/profile/binsize=1/auprc' 
# nexus_metric_profile = 'best-epoch/val_loss' 
nexus_metric_counts = 'valid-peaks/avg/counts/spearmanr'
nexus_metric = 'best-epoch/val_loss'
seq_metric = 'best-epoch/val_loss'
seq_metric_profile = 'best-epoch/val_profile_loss'
seq_metric_profile2 = 'valid-peaks/avg/profile/binsize=1/auprc' 
seq_metric_counts = 'valid-peaks/avg/counts/spearmanr'
In [12]:
# Plot params
profile_auprc_name = 'Profile auPRC'
counts_spearman_name = r"Total counts $R_{s}$"

s_default = 20
In [13]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'nexus-single-task')])
In [14]:
exps
Out[14]:
['nexus,peaks,O,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE',
 'nexus,peaks,S,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE',
 'nexus,peaks,N,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE',
 'nexus,peaks,K,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE']
In [21]:
nexus_metric_profile
Out[21]:
'valid-peaks/avg/profile/binsize=1/auprc'
In [25]:
{tf: df.loc[nexus_default_exp][f'valid-peaks/{tf}/profile/binsize=1/auprc'] for tf in osnk_tfs}
Out[25]:
{'Oct4': 0.1833659351513462,
 'Sox2': 0.3802538585433689,
 'Nanog': 0.4486560937694963,
 'Klf4': 0.15679826648753378}

Multi-task

In [59]:
# Multi-task
v = {tf: df.loc[nexus_default_exp][f'valid-peaks/{tf}/profile/binsize=1/auprc'] for tf in osnk_tfs}
v
Out[59]:
{'Oct4': 0.1833659351513462,
 'Sox2': 0.3802538585433689,
 'Nanog': 0.4486560937694963,
 'Klf4': 0.15679826648753378}
In [60]:
mean(list(v.values()))
Out[60]:
0.2922685384879363
In [61]:
# Single task
v= {tf: dict(df.loc[exps][['tfs', nexus_metric_profile]].set_index("tfs").iloc[:,0])[tf[0]] for tf in osnk_tfs}
v
Out[61]:
{'Oct4': 0.20845759163615404,
 'Sox2': 0.4250493674411673,
 'Nanog': 0.4710824673004411,
 'Klf4': 0.17793632002159224}
In [62]:
mean(list(v.values()))
Out[62]:
0.32063143659983867

Single-task

In [63]:
# Multi-task
v = {tf: df.loc[nexus_default_exp][f'valid-peaks/{tf}/counts/spearmanr'] for tf in osnk_tfs}
v
Out[63]:
{'Oct4': 0.479076032353049,
 'Sox2': 0.44157320459105576,
 'Nanog': 0.6012117950669067,
 'Klf4': 0.5773766993655419}
In [64]:
mean(list(v.values()))
Out[64]:
0.5248094328441384
In [65]:
# Single task
v = {tf: dict(df.loc[exps][['tfs', f'valid-peaks/{tf}/counts/spearmanr']].set_index("tfs").iloc[:,0])[tf[0]] for tf in osnk_tfs}
v
Out[65]:
{'Oct4': 0.4984590360702481,
 'Sox2': 0.4642387531706982,
 'Nanog': 0.585168601772147,
 'Klf4': 0.5841399987556414}
In [66]:
mean(list(v.values()))
Out[66]:
0.5330015974421837

ChIP-nexus hyper-parameters

In [20]:
nexus_default_exp = 'nexus,peaks,OSNK,0,10,1,FALSE,same,0.5,64,25,0.004,9,FALSE-2'

Learning rate

In [356]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'lr-nexus')]) + [nexus_default_exp]
In [357]:
list(df.loc[exps].sort_values('lr')['lr'])
Out[357]:
[0.0005, 0.001, 0.002, 0.004, 0.005, 0.01, 0.02, 0.04, 0.05]
In [358]:
x_var = 'lr'
In [359]:
x_var = 'lr'
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter(dfs[x_var], dfs[nexus_metric_profile], s=s_default)
ax.set_ylabel(profile_auprc_name);
ax.set_xscale("log")
ax.set_xlim([3e-4, 0.1])
ax.set_xlabel("Learning rate");
fig.savefig(fig_nexus_hp / f'{x_var}.profile-auprc.pdf', bbox_inches='tight')
In [360]:
# Total-counts Spearman correlation as a function of the learning rate
# (ChIP-nexus models), saved alongside the profile-auPRC panel.
x_var = 'lr'
df_sorted = df.loc[exps].sort_values(x_var)

fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter(df_sorted[x_var], df_sorted['valid-peaks/avg/counts/spearmanr'], s=s_default)
ax.set_xscale("log")
ax.set_xlim([3e-4, 0.1])
ax.set_xlabel("Learning rate");
ax.set_ylabel(counts_spearman_name);
fig.savefig(fig_nexus_hp / f'{x_var}.counts-spearman.pdf', bbox_inches='tight')

De-conv size

In [334]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'deconv')]) + [nexus_default_exp]
In [335]:
x_var = 'tconv_kernel_size'
In [336]:
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
# ax.axvline(1, color='grey', alpha=0.2)
# ax.axvline(25, color='grey', alpha=0.2)

ax.scatter(dfs[x_var], dfs[nexus_metric_profile], s=s_default)
ax.set_ylabel(profile_auprc_name);
plt.xticks([1, 10, 25, 35])

ax.set_xlabel("De-convolution size");
fig.savefig(fig_nexus_hp / f'{x_var}.profile-auprc.pdf', bbox_inches='tight')
In [337]:
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.axvline(1, color='grey', alpha=0.2)
ax.axvline(25, color='grey', alpha=0.2)
plt.xticks([1, 10, 25, 35])
ax.scatter(dfs[x_var], dfs['valid-peaks/avg/counts/spearmanr'], s=s_default)
ax.set_ylabel(counts_spearman_name);

ax.set_xlabel("De-convolution size");
fig.savefig(fig_nexus_hp / f'{x_var}.counts-spearman.pdf', bbox_inches='tight')

Number of layers (same padding)

Compute the receptive field:

In [338]:
from basepair.models import seq_bpnet_cropped_extra_seqlen
In [348]:
# Receptive field as a function of the total number of conv layers:
# first conv (kernel 25) + (nl - 1) dilated layers, deconv kernel fixed at 1.
n_layers = np.arange(1, 14)
receptive_field = [seq_bpnet_cropped_extra_seqlen(conv1_kernel_size=25,
                               n_dil_layers=nl-1,
                               tconv_kernel_size=1,
                               target_seqlen=0) + 1
                   for nl in n_layers]
print(pd.DataFrame({"receptive_field": receptive_field, "n_layers": n_layers}).to_string())
    receptive_field  n_layers
0                25         1
1                29         2
2                37         3
3                53         4
4                85         5
5               149         6
6               277         7
7               533         8
8              1045         9
9              2069        10
10             4117        11
11             8213        12
12            16405        13
In [340]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'n layers')& (df.padding == 'same')]) + [nexus_default_exp]
In [341]:
x_var = 'n_dil_layers'
In [346]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osnk_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, dfs[f'valid-peaks/{tf}/profile/binsize=1/auprc'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xticks([1, 5, 10])
plt.ylabel(profile_auprc_name);
plt.xlabel("Number of Layers");
plt.savefig(fig_nexus_hp / f'{x_var}.profile-auprc.pdf', bbox_inches='tight')
In [347]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osnk_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, dfs[f'valid-peaks/{tf}/counts/spearmanr'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.xticks([1, 5, 10])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xlabel("Number of Layers");
plt.ylabel(counts_spearman_name);
ax.set_xlabel("Number of layers");
fig.savefig(fig_nexus_hp / f'{x_var}.counts-spearman.pdf', bbox_inches='tight')

Profile vs regression weight

In [138]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'regression_weight')]) + [nexus_default_exp]
In [139]:
x_var = 'regression_weight'
In [178]:
## Compute the median value of the total counts for each task
profiles = ImpScoreFile(f"output/{nexus_default_exp}/deeplift.imp_score.h5").get_profiles()

nexus_median_N = np.median(mean([p.sum(axis=-2).mean(axis=-1) for t,p in profiles.items()]))
print(nexus_median_N)
130.5
In [179]:
nexus_natural_weight = nexus_median_N // 2
In [141]:
midpoint = 1
dfs = df.loc[exps].sort_values(x_var)
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.35, 1), sharex=True, gridspec_kw=dict(hspace=0))
ax = axes[0]
ax.axvline(x=midpoint, color='grey', linestyle='--', alpha=0.5)
ax.scatter(dfs[x_var] / nexus_natural_weight, dfs[nexus_metric_profile], s=s_default)
ax.set_xscale('log')
ax.set_ylabel('Profile\nauPRC');
ax.set_xlim([5e-3, 5e2])
#ax.set_xticks([0.01, 0.1, 1, 10, 100, 1000]);

ax = axes[1]
ax.axvline(x=midpoint, color='grey', linestyle='--', alpha=0.5)
ax.scatter(dfs[x_var]  / nexus_natural_weight, dfs['valid-peaks/avg/counts/spearmanr'], s=s_default)
ax.set_xscale('log')
ax.set_ylabel('Tot. counts $R_{s}$');
ax.set_xlabel("Relative total count weight");
ax.set_xlim([5e-3, 5e2]);
#ax.set_xticks([0.01, 0.1, 1, 10, 100, 1000]);
fig.savefig(fig_nexus_hp / f'{x_var}.both-auprc-spearman.pdf', bbox_inches='tight')

ChIP-seq hyper-parameters

In [66]:
seq_default_exp = 'seq,peaks,OSN,0,10,1,FALSE,same,0.5,64,50,0.004,9,FALSE'

Learning rate

In [67]:
exps = list(df.index[(df.assay == 'seq') & (df.note == 'lr-nexus')]) + [seq_default_exp]
In [68]:
list(df.loc[exps].sort_values('lr')['lr'])
Out[68]:
[0.0005, 0.001, 0.002, 0.004, 0.005, 0.01, 0.02, 0.04, 0.05]
In [69]:
seq_metric_name = 'Profile LL'
In [70]:
x_var = 'lr'
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter(dfs[x_var], -dfs[seq_metric_profile], s=s_default)
ax.set_ylabel(seq_metric_name);
ax.set_xscale("log")
ax.set_xlim([3e-4, 0.1])
ax.set_xlabel("Learning rate");
fig.savefig(fig_seq_hp / f'{x_var}.profile-ll.pdf', bbox_inches='tight')
In [56]:
x_var = 'lr'
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter(dfs[x_var], dfs[seq_metric_counts], s=s_default)
ax.set_ylabel(counts_spearman_name);
ax.set_xscale("log")
ax.set_xlim([3e-4, 0.1])
ax.set_xlabel("Learning rate");
fig.savefig(fig_seq_hp / f'{x_var}.counts-spearman.pdf', bbox_inches='tight')

De-conv size

In [71]:
exps = list(df.index[(df.assay == 'seq') & (df.note == 'deconv')]) + [seq_default_exp]
In [72]:
x_var = 'tconv_kernel_size'
In [75]:
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
# ax.axvline(1, color='grey', alpha=0.2)
# ax.axvline(25, color='grey', alpha=0.2)

ax.scatter(dfs[x_var], -dfs[seq_metric_profile], s=s_default)
ax.set_ylabel(seq_metric_name);
plt.xticks([1, 20, 50, 100])

ax.set_xlabel("De-convolution size");
fig.savefig(fig_seq_hp / f'{x_var}.profile-ll.pdf', bbox_inches='tight')
In [76]:
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.axvline(1, color='grey', alpha=0.2)
# ax.axvline(50, color='grey', alpha=0.2)
plt.xticks([1, 20, 50, 100])
ax.scatter(dfs[x_var], dfs[seq_metric_counts], s=s_default)
ax.set_ylabel(counts_spearman_name);

ax.set_xlabel("De-convolution size");
fig.savefig(fig_seq_hp / f'{x_var}.counts-spearman.pdf', bbox_inches='tight')

Number of layers (same padding)

In [81]:
exps = list(df.index[(df.assay == 'seq') & (df.note == 'n layers')& (df.padding == 'same')]) + [seq_default_exp]
In [82]:
x_var = 'n_dil_layers'
In [84]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osn_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, -dfs[f'best-epoch/val_{tf}/profile_loss'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xticks([1, 5, 10])
plt.ylabel(seq_metric_name);
plt.xlabel("Number of Layers");
plt.savefig(fig_seq_hp / f'{x_var}.profile-ll.pdf', bbox_inches='tight')
In [85]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osn_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, dfs[f'valid-peaks/{tf}/counts/spearmanr'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xticks([1, 5, 10])
plt.ylabel(counts_spearman_name);
plt.xlabel("Number of Layers");
plt.savefig(fig_seq_hp / f'{x_var}.counts-spearman.pdf', bbox_inches='tight')

Profile vs regression weight

In [132]:
exps = list(df.index[(df.assay == 'seq') & (df.note == 'regression_weight')]) + [seq_default_exp]
In [133]:
x_var = 'regression_weight'
In [177]:
## Compute the median value of the total counts for each task
profiles = ImpScoreFile(f"output/{seq_default_exp}/deeplift.imp_score.h5").get_profiles()

seq_median_N = np.median(mean([p.sum(axis=-2).mean(axis=-1) for t,p in profiles.items()]))
print(seq_median_N)
49.0
In [134]:
seq_natural_weight = seq_median_N // 2
In [136]:
# Two-panel figure: profile LL and total-count Spearman vs. the relative
# total-count loss weight (ChIP-seq models). `midpoint` marks the "natural"
# weight (relative weight == 1).
midpoint = 1
dfs = df.loc[exps].sort_values(x_var)
fig, axes = plt.subplots(2, 1, figsize=get_figsize(.35, 1), sharex=True, gridspec_kw=dict(hspace=0))

# Top panel: profile log-likelihood (negated summed loss across all tasks).
# BUGFIX: previously plotted f'best-epoch/val_{tf}/profile_loss', where `tf`
# leaked from an earlier plotting loop — i.e. a single arbitrary task rather
# than the aggregate. Use the summed profile loss (seq_metric_profile),
# matching the ChIP-nexus version of this figure.
ax = axes[0]
ax.axvline(x=midpoint, color='grey', linestyle='--', alpha=0.5)
ax.scatter(dfs[x_var] / seq_natural_weight, -dfs[seq_metric_profile], s=s_default)
ax.set_xscale('log')
ax.set_ylabel('Profile LL');
ax.set_xlim([5e-3, 5e2])

# Bottom panel: average total-count Spearman correlation.
ax = axes[1]
ax.axvline(x=midpoint, color='grey', linestyle='--', alpha=0.5)
ax.scatter(dfs[x_var] / seq_natural_weight, dfs['valid-peaks/avg/counts/spearmanr'], s=s_default)
ax.set_xscale('log')
ax.set_ylabel('Tot. counts $R_{s}$');
ax.set_xlabel("Relative total count weight");
ax.set_xlim([5e-3, 5e2]);
fig.savefig(fig_seq_hp / f'{x_var}.both-profileLL-spearman.pdf', bbox_inches='tight')

Genome-wide models

Chip-nexus

Learning rate

In [151]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'binary - lr')])
In [152]:
list(df.loc[exps].sort_values('lr')['lr'])
Out[152]:
[0.0005, 0.001, 0.002, 0.004, 0.005, 0.01, 0.02, 0.04, 0.05]
In [153]:
gw_binary_metric = 'valid-genome-wide/avg/class/auPR'
In [154]:
x_var = 'lr'
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter(dfs[x_var], dfs[gw_binary_metric], s=s_default)
ax.set_ylabel('auPR');
ax.set_xscale("log")
ax.set_xlim([3e-4, 0.1])
ax.set_xlabel("Learning rate");
fig.savefig(fig_nexus_hp / f'{x_var}.gw-binary-auPR.pdf', bbox_inches='tight')
In [155]:
features = [f'valid-genome-wide/{tf}/class/{feature}'
           for tf in osnk_tfs for feature in ['frac_positive', 'n_negative', 'n_positive']]
dict(dfs[features].drop_duplicates().dropna().iloc[0])
Out[155]:
{'valid-genome-wide/Oct4/class/frac_positive': 0.005918009992432528,
 'valid-genome-wide/Oct4/class/n_negative': 9913927.0,
 'valid-genome-wide/Oct4/class/n_positive': 59020.0,
 'valid-genome-wide/Sox2/class/frac_positive': 0.001896630955724521,
 'valid-genome-wide/Sox2/class/n_negative': 9954032.0,
 'valid-genome-wide/Sox2/class/n_positive': 18915.0,
 'valid-genome-wide/Nanog/class/frac_positive': 0.008883432349535198,
 'valid-genome-wide/Nanog/class/n_negative': 9884353.0,
 'valid-genome-wide/Nanog/class/n_positive': 88594.0,
 'valid-genome-wide/Klf4/class/frac_positive': 0.012622447507241341,
 'valid-genome-wide/Klf4/class/n_negative': 9847064.0,
 'valid-genome-wide/Klf4/class/n_positive': 125883.0}

Profile importance

In [156]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'binary + profile')])
default = 'nexus,gw,OSNK,1,0,0,FALSE,valid,0.5,64,25,0.001,9,FALSE'
In [157]:
x_var = 'profile_weight'
In [158]:
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter((dfs[x_var]), dfs[f'valid-genome-wide/avg/class/auPR'], s=s_default)
ax.axhline(df.loc[default][f'valid-genome-wide/avg/class/auPR'])
# ax.set_ylim([0.07, 0.15])
ax.set_xlim([5e-3, 2])
ax.set_ylabel("auPR");
ax.set_xscale('log')
ax.set_xlabel("Profile importance");
fig.savefig(fig_nexus_hp / f'{x_var}.gw-binary.auprc.pdf', bbox_inches='tight')

Chip-seq

Learning rate

In [159]:
exps = list(df.index[(df.assay == 'seq') & (df.note == 'binary - lr')])
In [160]:
list(df.loc[exps].sort_values('lr')['lr'])
Out[160]:
[0.0005, 0.001, 0.002, 0.004, 0.005, 0.01, 0.02, 0.04, 0.05]
In [161]:
gw_binary_metric = 'valid-genome-wide/avg/class/auPR'
In [37]:
x_var = 'lr'
In [162]:
x_var = 'lr'
dfs = df.loc[exps].sort_values(x_var)

# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter(dfs[x_var], dfs[gw_binary_metric], s=s_default)
ax.set_ylabel('auPR');
ax.set_xscale("log")
ax.set_xlim([3e-4, 0.1])
ax.set_xlabel("Learning rate");
fig.savefig(fig_seq_hp / f'{x_var}.gw-binary-auPR.pdf', bbox_inches='tight')
In [163]:
features = [f'valid-genome-wide/{tf}/class/{feature}'
           for tf in osn_tfs for feature in ['frac_positive', 'n_negative', 'n_positive']]
dict(dfs[features].drop_duplicates().dropna().iloc[0])
Out[163]:
{'valid-genome-wide/Oct4/class/frac_positive': 0.001396404224027518,
 'valid-genome-wide/Oct4/class/n_negative': 9958831.0,
 'valid-genome-wide/Oct4/class/n_positive': 13926.0,
 'valid-genome-wide/Sox2/class/frac_positive': 0.0005940182840111315,
 'valid-genome-wide/Sox2/class/n_negative': 9966833.0,
 'valid-genome-wide/Sox2/class/n_positive': 5924.0,
 'valid-genome-wide/Nanog/class/frac_positive': 0.0033060065536541197,
 'valid-genome-wide/Nanog/class/n_negative': 9939787.0,
 'valid-genome-wide/Nanog/class/n_positive': 32970.0}

Note

ChIP-nexus has roughly 3x more peaks hence auPR is higher.

Adding profile weight

In [164]:
default = 'seq,gw,OSN,1,0,0,FALSE,valid,0.5,64,50,0.001,9,FALSE'
exps = list(df.index[(df.assay == 'seq') & (df.note == 'binary + profile')])
In [165]:
x_var = 'profile_weight'
In [166]:
dfs = df.loc[exps].sort_values(x_var)


# Make the plot
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
ax.grid(True, alpha=0.2)
ax.scatter((dfs[x_var]), dfs[f'valid-genome-wide/avg/class/auPR'], s=s_default)
ax.axhline(df.loc[default][f'valid-genome-wide/avg/class/auPR'])
ax.set_ylim([0.07, 0.15])
ax.set_xlim([5e-3, 2])
ax.set_ylabel("auPR");
ax.set_xscale('log')
ax.set_xlabel("Profile importance");
fig.savefig(fig_seq_hp / f'{x_var}.gw-binary.auprc.pdf', bbox_inches='tight')

Extra

ChIP-nexus

Number of layers (valid padding)

In [167]:
exps = list(df.index[(df.assay == 'nexus') & (df.note == 'n layers')& (df.padding == 'valid')])
In [168]:
x_var = 'n_dil_layers'
In [169]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osnk_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, dfs[f'valid-peaks/{tf}/profile/binsize=1/auprc'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xticks([1, 5, 10])
plt.ylabel(profile_auprc_name);
plt.xlabel("Number of Layers");
plt.savefig(fig_nexus_hp / f'{x_var}.valid-padding.profile-auprc.pdf', bbox_inches='tight')
In [170]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osnk_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, dfs[f'valid-peaks/{tf}/counts/spearmanr'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.xticks([1, 5, 10])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xlabel("Number of Layers");
plt.ylabel(counts_spearman_name);
ax.set_xlabel("Number of layers");
fig.savefig(fig_nexus_hp / f'{x_var}.valid-padding.counts-spearman.pdf', bbox_inches='tight')
In [171]:
seq_metric_profile
Out[171]:
'best-epoch/val_profile_loss'

ChIP-seq

Number of layers (valid padding)

In [173]:
exps = list(df.index[(df.assay == 'seq') & (df.note == 'n layers')& (df.padding == 'valid')])
In [174]:
x_var = 'n_dil_layers'
In [175]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osn_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, -dfs[f'best-epoch/val_{tf}/profile_loss'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xticks([1, 5, 10])
plt.ylabel(seq_metric_name);
plt.xlabel("Number of Layers");
plt.savefig(fig_seq_hp / f'{x_var}.valid-padding.profile-ll.pdf', bbox_inches='tight')
In [176]:
fig, ax = plt.subplots(1, 1, figsize=get_figsize(.2, 1))
for tf in osn_tfs:
    dfs = df.loc[exps].sort_values(x_var)
    plt.scatter(dfs[x_var]+1, dfs[f'valid-peaks/{tf}/counts/spearmanr'], 
                label=tf,
                color=tf_colors[tf],
                s=15,
               )
plt.xlim([0, 14])
plt.legend(loc="upper left", bbox_to_anchor=(1,1))
plt.xticks([1, 5, 10])
plt.ylabel(counts_spearman_name);
plt.xlabel("Number of Layers");
plt.savefig(fig_seq_hp / f'{x_var}.valid-padding.counts-spearman.pdf', bbox_inches='tight')