Goal

  • Explore the results of genome-wide training of binary classifiers

Conclusion

  • the core architecture of BPNet (9-dillated layers) seems to be well appropriate for the task
In [59]:
from basepair.imports import *
from plotnine import *
import warnings
warnings.filterwarnings("ignore")

pd.options.display.max_colwidth = 100
In [2]:
assays = ['chipnexus', 'chipseq']
subsets = ['genome-wide', 'accessible']
tasks = ['Oct4', 'Sox2']
In [3]:
exp_dir = f"{ddir}/processed/chipseq/labels"
In [19]:
def read_bpnet(exp_dir, assay, subset, tasks, model='BPNetClassifier', exp='default'):
    metric_list = []
    for task in tasks:
        metrics = read_json(f"{exp_dir}/{assay}/{subset}/{model}/{exp}/evaluation.valid.json")
        metric_list.append({"model": "BPNetClassifier",
                 "assay": assay,
                 "subset": subset,
                  "task": task,
                 "auprc": metrics[task]['auprc']
         })
    return metric_list
In [36]:
files = !ls {exp_dir}/*/*/*/*/evaluation.valid.json
In [61]:
from basepair.utils import flatten
from copy import deepcopy
from kipoi.utils import relative_path

def add_entry(d, k,v):
    d = deepcopy(d)
    d[k] = v
    return d
In [44]:
dfa = pd.DataFrame([add_entry(flatten(read_json(f)), 'exp', relative_path(os.path.dirname(f), exp_dir)) for f in files])
In [69]:
print(dfa[['Oct4_auprc', 'Sox2_auprc', 'exp']].sort_values("exp").to_string())
   Oct4_auprc  Sox2_auprc                                            exp
1      0.3067      0.1772          chipnexus/accessible/BPNet-transfer/1
0      0.3457      0.1931   chipnexus/accessible/BPNetClassifier/default
4      0.1474      0.0640         chipnexus/genome-wide/BPNet-transfer/1
2      0.2254      0.1347        chipnexus/genome-wide/BPNetClassifier/2
3      0.2270      0.1081  chipnexus/genome-wide/BPNetClassifier/default
6      0.2504      0.1425            chipseq/accessible/BPNet-transfer/1
5      0.2793      0.1518     chipseq/accessible/BPNetClassifier/default
8      0.1164      0.0712           chipseq/genome-wide/BPNet-transfer/1
7      0.1688      0.0895    chipseq/genome-wide/BPNetClassifier/default
In [31]:
files
Out[31]:
['zsh:1: no matches found: */*/*/*/evaluation.valid.json']
In [13]:
DATA='/srv/scratch/avsec/workspace/chipnexus/data/processed/chipseq/labels/'
In [18]:
!ls {DATA}/chipnexus/genome-wide/BPNet-transfer/1
cometml.json					   history.csv
config.gin					   log
config.gin.json					   model.h5
evaluation.valid.json				   note_params.json
events.out.tfevents.1544465819.surya.stanford.edu
In [20]:
!ls {DATA}/chipnexus/genome-wide/BPNetClassifier
2  default  default2  model.gin  model.gin~  problem.gin  problem.gin~
In [21]:
!cat {DATA}/chipnexus/genome-wide/BPNetClassifier/default/evaluation.valid.json
{
  "Oct4": {
    "auprc": 0.22703258357254222,
    "auc": 0.9562279867699116,
    "accuracy": 0.9956203994486766
  },
  "Sox2": {
    "auprc": 0.10812543227303097,
    "auc": 0.9406154817241277,
    "accuracy": 0.9996290341035693
  }
}
In [26]:
!diff {DATA}/chipnexus/genome-wide/BPNetClassifier/2/config.gin {DATA}/chipnexus/genome-wide/BPNetClassifier/default/config.gin
1,4d0
< import basepair
< import basepair.datasets
< import basepair.models
< 
39c35
<     '/srv/scratch/avsec/workspace/chipnexus/data/processed/chipseq/labels/chipnexus/genome-wide//oct4-sox2.intervals_file.tsv.gz'
---
>     '/srv/scratch/avsec/workspace/chipnexus/data/processed/chipseq/labels/chipnexus/genome-wide/oct4-sox2.intervals_file.tsv.gz'
54,55c50,51
< train.train_epoch_frac = 0.1
< train.valid_epoch_frac = 0.2
---
> train.train_epoch_frac = 0.02
> train.valid_epoch_frac = 0.04
In [23]:
!cat {DATA}/chipnexus/genome-wide/BPNetClassifier/2/evaluation.valid.json
{
  "Oct4": {
    "auprc": 0.22541109677282956,
    "auc": 0.9548992871837259,
    "accuracy": 0.9948975910226223
  },
  "Sox2": {
    "auprc": 0.13470876310067453,
    "auc": 0.9448285378554975,
    "accuracy": 0.999454391067912
  }
}
In [12]:
bpnet_results = [r for task in tasks for subset in subsets for assay in assays
                for r in read_bpnet(exp_dir, assay, subset, tasks) ]
In [11]:
print(pd.DataFrame(bpnet_results).to_string())
        assay   auprc            model       subset  task
0   chipnexus  0.2270  BPNetClassifier  genome-wide  Oct4
1   chipnexus  0.1081  BPNetClassifier  genome-wide  Sox2
2     chipseq  0.1688  BPNetClassifier  genome-wide  Oct4
3     chipseq  0.0895  BPNetClassifier  genome-wide  Sox2
4   chipnexus  0.3457  BPNetClassifier   accessible  Oct4
5   chipnexus  0.1931  BPNetClassifier   accessible  Sox2
6     chipseq  0.2793  BPNetClassifier   accessible  Oct4
7     chipseq  0.1518  BPNetClassifier   accessible  Sox2
8   chipnexus  0.2270  BPNetClassifier  genome-wide  Oct4
9   chipnexus  0.1081  BPNetClassifier  genome-wide  Sox2
10    chipseq  0.1688  BPNetClassifier  genome-wide  Oct4
11    chipseq  0.0895  BPNetClassifier  genome-wide  Sox2
12  chipnexus  0.3457  BPNetClassifier   accessible  Oct4
13  chipnexus  0.1931  BPNetClassifier   accessible  Sox2
14    chipseq  0.2793  BPNetClassifier   accessible  Oct4
15    chipseq  0.1518  BPNetClassifier   accessible  Sox2
In [6]:
# TODO - fill in the metrics manually
other_results = [
# ---------------- copied from tf-dragonn logs
 ['tfdragonn-default', 'chipnexus', 'accessible', 'Oct4', 0.302],
 ['tfdragonn-default', 'chipnexus', 'accessible', 'Sox2', 0.165],
 ['tfdragonn-default', 'chipnexus', 'genome-wide', 'Oct4', 0.180],
 ['tfdragonn-default', 'chipnexus', 'genome-wide', 'Sox2', 0.075],
 ['tfdragonn-default', 'chipseq', 'accessible', 'Oct4', 0.230],
 ['tfdragonn-default', 'chipseq', 'accessible', 'Sox2', 0.093],
 ['tfdragonn-default', 'chipseq', 'genome-wide', 'Oct4', 0.129],
 ['tfdragonn-default', 'chipseq', 'genome-wide', 'Sox2', 0.055],
 
 
 ['basset', 'chipnexus', 'accessible', 'Oct4', 0.298],
 ['basset', 'chipnexus', 'accessible', 'Sox2', 0.134],
 ['basset', 'chipnexus', 'genome-wide', 'Oct4', 0.194],
 ['basset', 'chipnexus', 'genome-wide', 'Sox2', 0.071],
 ['basset', 'chipseq', 'accessible', 'Oct4', 0.255],
 ['basset', 'chipseq', 'accessible', 'Sox2', 0.097],
 #['basset', 'chipseq', 'genome-wide', 'Oct4', 0.129],
 #['basset', 'chipseq', 'genome-wide', 'Sox2', 0.055],
    
# ---------------- copied from the notebook
 ['BPNet-transfer', 'chipnexus', 'accessible', 'Oct4', 0.38444],
 ['BPNet-transfer', 'chipnexus', 'accessible', 'Sox2', 0.2224],    
]
In [7]:
df = pd.DataFrame(bpnet_results)
dfo = pd.DataFrame(other_results, columns = ['model', 'assay', 'subset', 'task', 'auprc'])
df = pd.concat([df, dfo], sort=True)
In [8]:
fig = ggplot(aes(x='assay', fill='model', y='auprc'), df) + \
  geom_bar(position='dodge', stat='identity') + facet_grid(".~subset+task") + \
  scale_fill_brewer('qual', 'Paired') + \
  theme_classic() + \
  theme(axis_text_x=element_text(angle=20, hjust = 1))
fig
Out[8]:
<ggplot: (-9223363306820092142)>
In [129]:
# fig.save("binary-classifiers.png", dpi=300)
# fig.save("binary-classifiers.pdf", dpi=300)
In [10]:
# upload the figure to gdrive
gdrive_upload_fig(fig, 'genome-wide-training/binary-classifiers')