\documentclass[10pt]{article}
\usepackage[hmargin=1.5cm,top=2cm,bottom=2cm]{geometry}
\usepackage{multicol}
\setlength\columnsep{15pt}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage[auth-sc]{authblk}
\usepackage{longtable}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{enumerate}
\usepackage[labelfont=bf]{caption}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{mdframed}
\usepackage{graphics}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{array}
\usepackage{capt-of}
\usepackage{lscape}
\usepackage{caption}
\usepackage{breakurl}
\usepackage{todonotes}
\usepackage{hanging}
\usepackage{pagecolor}
\usepackage[final]{pdfpages}
\usepackage[leftFloats,CaptionAfterwards]{fltpage}
\usepackage[numbers,super,sort&compress]{natbib}
\setlength{\bibsep}{3pt}
\usepackage{abstract}
\usepackage{enumitem}
\usepackage{soul}
\usepackage{titlesec}
\titleformat{\section}[block]{\large\bfseries\filcenter}{\thesection.}{0.4em}{}
\titleformat{\subsection}[block]{\normalsize\sc\bfseries\filcenter}{\thesubsection.}{0.4em}{}
\setcounter{secnumdepth}{5}

\usepackage[hang]{footmisc}
\setlength\footnotemargin{0em}

\setlength{\skip\footins}{0.75cm}

\makeatletter
\renewcommand\footnoterule{%
  \kern-3\p@
  \hrule\@width \textwidth height 1.5pt
  \kern2.6\p@}
\makeatother

\newcommand{\filllastline}[1]{
\setlength\leftskip{0pt}
\setlength\rightskip{0pt}
\setlength\parfillskip{0pt}
#1}

\makeatletter
\def\@biblabel#1{\@ifnotempty{#1}{#1.}}
\makeatother

\newenvironment{Figure}
{\par\medskip\noindent\minipage{\linewidth}}
{\endminipage\par\medskip}

\title{\bf Biochemical signatures and enhancer activity of ENCODE candidate regulatory elements}
\renewcommand\Authfont{\scshape\normalsize}
%\author[*]{ALL OTHER AUTHORS I'VE MISSED}
\author[1]{Gilberto DeSalvo}
\author[6]{Georgi K. Marinov}
\author[2]{Christopher Partridge}
\author[4,7]{Christopher M. Vockley}
\author[3]{Nergiz Dogan}
\author[8,9]{Ricardo Ramirez}
%\author[1]{Gordon Kwan}
\author[4,5]{Timothy E. Reddy}
\author[8,9]{Ali Mortazavi}
\author[3]{Ross C. Hardison}
\author[2]{Richard M. Myers}
\author[1]{Barbara J. Wold}
\renewcommand\Affilfont{\itshape\normalsize}
%\affil[*]{other affiliations}
\affil[1]{Division of Biology and Biological Engineering, California Institute of Technology, Pasadena, California, USA.}
\affil[2]{HudsonAlpha Institute for Biotechnology, 601 Genome Way, Huntsville, AL 35806, USA}
\affil[3]{Dept. of Biochemistry and Molecular Biology, Penn State University, 304 Wartik Laboratory, University Park, PA 16802, USA}
\affil[4]{Center for Genomic \& Computational Biology, Duke University, Durham, NC 27708, USA}
\affil[5]{Department of Biostatistics \& Bioinformatics, Duke University, Durham, NC 27708, USA}
\affil[6]{Department of Genetics, Stanford University School of Medicine, Stanford, CA 94305}
\affil[7]{Department of Cell Biology, Duke University, Durham, NC 27708, USA}
\affil[8]{Department of Developmental and Cell Biology, University of California Irvine, Irvine, CA 92697-2300, USA}
\affil[9]{Center for Complex Biological Systems, University of California Irvine, Irvine, CA 92697-2280, USA}
\date{}

\begin{document}
\maketitle

% \let\thefootnote\relax\footnotetext{Corresponding author:
% Georgi K. Marinov. Division of Biology and Biological Engineering, California Institute of Technology. Pasadena, CA 91125.}
% \centerline{}
% \centerline{}
\begin{abstract}
\noindent {\normalsize \textbf{An aspiration for functional genomics is to identify all \textit{cis}-acting regulatory elements in the genome, and to define their activities.  Genome-wide transcription factor occupancy and chromatin state profiles are widely used to identify candidate regulatory elements (cREs) such as enhancers, promoters and insulators, based on their characteristic biochemical signatures. Multiple large-scale efforts, in particular the ENCODE and NIH Roadmap Epigenomic Mapping consortia, have generated such datasets from a wide variety of cell lines, tissues and developmental stages in both human and mouse, serving as the foundation for cataloging the cRE repertoire on a genome-wide scale. However, accurately predicting the actual regulatory activity of cREs based on functional genomic maps alone has turned out to be difficult, and it is not yet clear to what extent active regulatory elements \textit{in vivo} can be identified from biochemical signatures. To address these issues, we carried out large-scale tests for transcriptional enhancer activity for hundreds of candidate enhancer elements (cEnhs) from five human and mouse cell types, including both immortalized cell lines and models for  cell differentiated during developmental transitions. Our cEnh collections were selected using biochemical signature criteria ranging from simple individual transcription factor (TF) occupancy to integrative machine learning models over multiple functional genomic datasets, and were designed to include full-length cREs. We supplement these tests with a massively parallel reporter assay (MPRA) characterization of cEnhs in one of the cell lines we studied. We find that irrespective of the selection criteria used, $\sim$50\% of cEnh elements exhibit enhancer activity. Examination of the functional predictivity of biochemical signatures reveals that the majority of cEnhs genome-wide are likely to exhibit modest regulatory activity. In the same time, most active enhancers in the genome are characterized by relatively modest biochemical signature strength, due to the much lower abundance of cEnhs exhibit very strong biochemical signals. % We also uncovered a set of developmentally distinct distal elements marked in progenitor cells to predict future, rather than contemporaneous, enhancer activity upon differentiation. Finally, developmentally privileged loci possessing multiple, highly regulated genes were enriched for enhancer cREs with exceptionally strong individual activity despite having only modest biochemical signatures. 
Finally, we discuss our results in the context of current models of the regulatory effect of enhancers on their cognate genes. We expect our findings to help guide future efforts towards cataloging the functional repertoire of mammalian genomes.}}
\centerline{}
% \centerline{}
\end{abstract}

\renewcommand{\footnotesize}{\fontsize{10pt}{12pt}\selectfont}

\begin{multicols}{2}

\section*{Introduction}

The complete and accurate understanding of the relationship between the human genome and its corresponding phenotypes requires the comprehensive characterization of its compendium of functional elements. The results of the many genome-wide epigenomic and transcriptomic studies carried out over the last decade reveal a remarkable picture, in which non-coding regulatory elements constitute the bulk of such functional regions in the genome\cite{mouseENCODE2014,ENCODE2012}, with the expression of each gene (protein coding or non-coding) being controlled by the integrated output of multiple proximal and distal enhancer, insulator and silencing elements.

The genome-wide mapping and characterization of non-coding regulatory elements is thus a major goal of the field, and features prominently among the objectives of the \textbf{ENC}yclopedia \textbf{O}f \textbf{D}NA \textbf{E}lements (ENCODE) consortium\cite{ENCODE101}. However, achieving it, although greatly aided by the advent of high-throughput sequencing and epigenomic tools, is still not a simple task.

The biochemical activity associated with the function of regulatory elements results in certain biochemical signatures that can be captured by epigenomic assays. For example, active promoters in eukaryotes are classically associated with the trimethylation of lysine 4 on histone 3 (H3K4me3)\cite{Vermeulen2010}, as well as other biochemical signatures, such as DNAse hypersensitivity\cite{Thurman2012}. Active enhancer elements have been proposed to exhibit their own biochemical signature, featuring DNAse hypersensitivity, the H3K27ac and H3K4me1 histone marks, and occupancy by the p300 acetyltransferase\cite{Creyghton2010,RadaIglesias2011,Thurman2012}, as well as by sequence-specific transcription factors (Figure \ref{Fig1}A).

These biochemical signatures enable the compilation of lists of candidate functional elements (cREs), but they do not on their own allow the conclusive identification of any given element as functional. While functional regulatory elements exhibit characteristic biochemical signatures, the reverse (that the presence of a biochemical signature necessarily means function) cannot be inferred straightforwardly\cite{Kellis2014}. Such inferences are further complicated by the observation that biochemical signatures are not binary but instead exist on a continuum between strong outstanding features, on one hand, and what is probably biochemical noise, on the other. For example, it is far from clear that all transcription factor binding sites that can be reproducibly identified using ChIP-seq and related techniques are in fact functional REs\cite{Fisher2012}. Therefore, individual cREs in the lists compiled by efforts such as the ENCODE and mouseENCODE consortia\cite{ENCODE2012,mouseENCODE2014} have to be subsequently directly tested and functionally characterized in detail.

The ultimate functional characterization of cREs will involve a combination of loss-of-function assays and direct assays for activity. The former have been until recently technically challenging, but are becoming more commonplace with the advent of large-scale CRISPR/Cas9-mediated mutagenesis techniques\cite{Korkmaz2016,Fulco2016}. Nevertheless, most work in the field has been based on testing cREs for regulatory activity using an exogenous plasmid construct combining a cRE, a promoter and a reporter gene. Classically, such testing has been done by cloning individual cREs into plasmids (or other vectors) and then assaying the expression of the reporter gene (luciferase activity in cell lines or staining for LacZ activity in embryos\cite{Visel2007,Fisher2012}). Numerous developmental enhancers have been characterized following that approach\cite{Visel2007,Visel2009,May2011}, starting from lists of cREs complied based on comparisons of evolutionary conservation and/or biochemical signature derived from ChIP-seq datasets. However, such studies have often focused only on the most outstanding biochemical signatures\cite{Visel2007}, thus obtaining very high success rates that are likely to be nonrepresentative with respect to the genome-wide population of cREs. 

High-throughput sequencing has enabled the development of assays that go beyond the testing of individual cREs, one by one; instead, very large numbers of sequences are analyzed in parallel, with the readout being based on sequencing DNA tags associated with the cRE or of the cRE itself. These are usually referred to as MPRAs\cite{Inoue2015}, and in the last few years a number of variations of the principle have been successfully applied to a multitude of biological problems and systems \cite{Patwardhan2009,Kinney2010,Kwasnieski2012,Melnikov2012,Patwardhan2012,STARRseq2013,FIREWACh2014}, including the question of testing cREs for activity within the context of the ENCODE Consortium\cite{Kheradpour2013,Kwasnieski2014,Ernst2016}. However, several issues complicate the interpretations of MPRA experiments.

First, the nature of MPRA designs is such that the elements tested are very short, in the 80--250bp neighborhood. This is significantly shorter than tens of thousands of blocks of conserved noncoding sequence that can be identified in the human genome by comparative genomic analysis (Figure \ref{FigS1}). Thus to what extent complete REs are assayed and what the corresponding false negative rate is have always been an obvious concern regarding MPRAs.

Second, given that it is very difficult to control the number of constructs going into each individual cell in transfection experiments, and that an MPRA features large numbers of different cREs being tested in the same time, there is significant potential for crosstalk between active and inactive cREs that end up in the same cell, resulting in numerous false positives.

While the latter concern can be alleviated to an extent through the use of genome-integrated MPRA constructs \cite{Maricque2016,Inoue2016}, the short length of constructs tested remains a significant issue, and one that might be behind the low positive rates reported by MPRAs in the past\cite{Vockley2016}. There is therefore a major gap in the field that needs to be filled by testing a large number of individual constructs of large size (500-1000 bp) using a traditional luciferase assay, and using the resulting data to examine the performance of biochemical signature predictions.

To this end, as part of the ENCODE Project Consortium efforts towards functional validation of cREs, we tested the regulatory activity of hundreds of candidate enhancer elements (cEnh) using constructs of such lengths  (Figure \ref{FigS2}) in several diverse mammalian cell lines, including both mouse and human systems. These cEnhs were selected from a wide range of biochemical signature strengths (Figure \ref{FigS3}), using both TF-centric selection criteria (identifying cEnhs based on ChIP-seq data for individual TFs) and machine learning ``TF-agnostic'' approaches (designed to find combinatorial signatures of enhancer elements from multiple epigenomic maps of histone modifications and DNAse hypersensitivity) for defining cREs.

\end{multicols}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig1V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Biochemical signatures and functional testing of candidate enhancer elements (cEnhs) in mammalian genomes}. (A) Biochemical signatures of cEnhs and promoters. Active enhancers are characterized by DNAse hypersensitivity due to nucleosome depletion, by p300 occupancy and by H3K27ac, as well H3K4me1 (not shown). Promoter elements share some of these features, but also associate with components of the transcription and transcription initiation machineries, and are marked by H3K4me3 (not shown); (B) Genome-wide commonalities and differences between the biochemical signatures of enhancers and promoters. Shown is the average signal profile around TSS distal (right; defined as regions more than 1kb away from an annotated TSS) and TSS proximal (left) cEnhs (defined as statistically significant peaks in the respective datasets; see the Methods section for further detail) in mouse and human cells for TFs (myogenin in differentiating mouse muscle cell, GATA1 in erythroid mouse cells, and the glucocorticoid receptor upon Dexamethasone stimulation of human A549 cells), DNAse hypersensitivity and H3K27ac; (C) The distribution of biochemical signal strength varies over a large continuum. Shown are the signal distribution for myogenin, p300, DNAse-seq, and H3K27ac relative to the summits of the top 500, middle 500 and bottom 500 reproducible myogenin ChIP-seq sites (total $n = 32,278$) in differentiated C2C12 muscle cells, as well as the distribution of the cognate myogenin TF binding
\centerline{}
\rightline{\textit{(legend continued on next page)}}
} 
\label{Fig1}
\end{figure*}
\clearpage
\let\thefootnote\relax\footnotetext{ motif. (D) Different cell types share a small fraction of their distal cEnh elements, in contrast to promoter elements. Shown are the common and cell-type specific TSS proximal (within 1kb of an annotated TSS) and TSS distal DHSs between the human erythroid K562 and hepatocyte HepG2 immortalized cell lines; E) Outline of cENH selection approaches, biological systems, experimental design and functional assays used in this study. Sets of cEnhs for functional testing were compiled based on: TF ChIP-Seq occupancy measurements (of the master regulators of muscle differentiation, MyoD and myogenin) in differentiating mouse C2C12 cells; phylogenetic conservation patterns and TF occupancy measurements (of the regulators of erythropoesis GATA1 and TAL1) in differentiating mouse G1E-ER4 cells; TF occupancy (multiple TFs) in immortalized K562 cells; machine learning methods (Self-Organizing Maps, chromHMM and Segway) defining integrated chromatin states over multiple histone modification, DNAse and TF occupancy measurements. These cEnhs were tested using luciferase assays. In addition, DNA fragments from GR ChIP-seq experiments in Dex-stimulated A549 cells were cloned and assayed for activity using the STARR-seq assay. Active elements identified using these methods were then evaluated for the presence and distribution of various biochemical signatures.}

\begin{multicols}{2}

We find that in general $\sim$50\% of both TF-selected and TF-agnostic cEnhs showed significant enhancer activity in transfection assays, observing similar proportions across all cell lines and conditions examined. We observe that the presence of TF recognition motifs in cEnhs displays no correlation with enhancer activity. Our results indicate that DNAse and H3K27ac are generally more predictive of enhancer activity than TF occupancy alone, and that simple biochemical signatures such as the combination of chromatin accessibility and H3K27ac are specifically predictive of enhancer activity compared to regions of the genome that lack them. However, the presence of these signatures is not a strong indicator of enhancer activity as nearly half of cEnhs exhibiting them are not significantly active in transfection assays. 

We also observe a positive correlation between biochemical signal strength and enhancer activity in transfection assays, with the highest fraction of cEnhs exhibiting significant enhancer activity being found among the most strongly occupied cEnhs. However, first, even among that latter group a large fraction of cEnhs displays no discernible enhancer 	activity, and second, because the distribution of biochemical signal strength is highly skewed towards weaker ChIP-seq and DNAse-seq peaks, the bulk of active enhancers in the genome are expected to be found among the larger population of cEnhs with modest biochemical signatures. Of note, enhancer activity as measured by transfection assays also exhibits a skewed pattern, with a smaller number of very highly active enhancers and a larger number of weaker ones. 

Finally, we corroborated these findings by applying the STARR-seq MPRA to assay the activity of thousands of genomic regions occupied by the glucocorticoid receptor (GR) in stimulated A549 cells. 

We expect that our findings will help guide efforts towards the comprehensive cataloging of functional elements in the human genome, and we discuss the implications of our findings in the context of models of gene regulation mediated by the action of distal enhancers.

\section*{Results}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-09-08-Gigio-V7edit/Figure2a-fixed.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional testing of cEnh regulatory activity in mammalian cells}. (A) Functional assay testing of cEnh regulatory activity in the context of muscle differentiation. Shown is luciferase assay fold activity in differentiated C2C12 myocytes across technical replicates ($n = 4$). The red arrow corresponds to the mean fold activity threshold above which elements are considered active. In addition, for each cEnh DNAse hypersensitivity, H3K27ac status, and myogenin occupancy are shown, both as RPM (Read Per Million) signal intensity values and as binary peak calls, as well as the number of myogenin motif (RRCAGSTG, derived from myogenin ChIP-seq data) occurrences. Tested cEnhs are sorted by mean fold activity. (B) Functional assay testing of cEnh regulatory activity in the context of erythropoesis. Shown is luciferase assay fold activity in K562 cells across biological ($n \in [1:9]$) and technical replicates ($n = 4$ for each biological replicate). The red arrow corresponds to the mean fold activity threshold above which elements are considered active. In addition, for each cEnh DNAse hypersensitivity, H3K27ac status, and GATA1/TAL1 occupancy are shown, both as RPM (Read Per Million) signal intensity values and as binary peak calls, as well as the number of TAL1 (CAGMTG) and GATA1 (WGATAA) motif occurrences. Tested cEnhs are sorted by mean fold activity.}
\label{Fig2}
\end{figure*}

\subsection*{Large-scale functional activity testing of full-length cREs}

One of the major goals of the ENCODE Project is to identify all functional regulatory elements controlling gene expression in the genome, to which end it has carried out mapping of DNAse hypersensitivity, dozens of histone modifications, numerous sequence-specific transcription factors, and RNA transcripts in hundreds of cell types, in both human and mouse \cite{ENCODE101,mouseENCODE2014,ENCODE2012,ENCODE3}. As active regulatory regions typically exhibit certain biochemical signatures, these datasets provide a compendium of candidate regulatory elements. Such regions can be defined using highly multidimensional inputs incorporating multiple biochemical measurements, but because considerable redundancy exists between individual biochemical marks, cREs can also be identified following simpler rules. For example, active enhancers and promoters are typically occupied by sequence-specific transcription factors, marked by H3K27ac and exhibit DNA hypersensitivity (Figure \ref{Fig1}B), and cEnhs can therefore be identified using TF occupancy, by the overlap of DNAse hypersensitive sites (DHS) and regions marked by H3K27ac, or by all three. As of the writing of this manuscript, based on DNAse-seq maps and histone marks profiles, % 1.31M
more than a million and % 0.43M
nearly half a million cREs in humans and mouse, respectively, have been identified by the ENCODE consortium\cite{ENCODE3}. 

However, there need not be a strict one-to-one relationship between these biochemical signatures and functional REs \cite{Kellis2014}, as their presence alone does not necessarily imply that the cRE plays an active regulatory role. In addition, biochemical marks on themselves do not provide direct understanding of how exactly functional REs are specified or exercise their function. An additional highly useful criterion for assessing functionality is evolutionary conservation of cREs between the genomes of distant species, as functional elements are usually subject to selective constraint at the sequence level. However, the absence of conservation on its own does not imply nonfunctionality, as recently evolved lineage-specific REs do not appear as conserved in comparative genomic analyses. Therefore cREs identified using functional genomics tools have to be directly tested for function and subsequently dissected in detail if they are to be comprehensively understood.

In recent years, multiple high-throughput approaches for measuring regulatory activity have been devised, relying on a sequencing readout of the effect of a given cRE or custom-designed DNA sequence on the expression of a reporter gene \cite{Patwardhan2009,Kinney2010,Kwasnieski2012,Melnikov2012,Patwardhan2012,STARRseq2013,FIREWACh2014,Kheradpour2013,Kwasnieski2014,Ernst2016}. On their own MPRAs are very powerful, however, they suffer from several shortcoming when it comes to testing cREs, in particular the short length (typically 80 to 250 bp) of the segments of DNA tested by most of them, which is likely to be significantly shorter than the size of functional regulatory elements in mammalian genomes. 

To assess how prominent this issue might be, we examined the distribution of the lengths of conserved noncoding segments (i.e. excluding sequences overlapping with or in the vicinity of annotated exons) in the human genome (Supplementary Figure \ref{FigS1}), and found that tens to hundreds of thousands (depending on the definition) of such blocks fall outside the range of testing of MPRAs. It is thus possible that MPRAs using short sequences exhibit substantial numbers of false negatives as they cannot assay the activity of complete, full-length REs. An additional concern with MPRAs is the possibility of cross-talk between different REs when multiple episomal constructs end up being transfected in the same cell, resulting in false positives. Alternative approaches towards testing the functionality of cREs are therefore needed. 

To address these issues, we carried out large-scale testing of cEnhs identified on the basis of biochemical signatures in a variety of mammalian systems using luciferase activity assays, which allow for much larger segments of DNA to be tested for enhancer activity. We aimed at comparing the predictivity of multiple approaches for identifying functional cEnhs, and at incorporating in our analysis a diversity of biological systems (Figure \ref{Fig1}E), including immortalized cell lines (the human K562 and HepG2 cell lines), model systems for major developmental transitions (myogenesis and erythropoesis), and model systems for cellular response to exogenous signaling stimuli (activation of the glucocorticoid receptor in the prostate cancer cell line A549).

Immortalized cell lines were targeted as they have been extensively studied by the ENCODE consortium\cite{ENCODE101,ENCODE2012} and they are the source of a significant portion of ENCODE data, while differentiation and external signaling stimulation represent the two main types of dynamic transition of cellular states associated with regulatory alterations of chromatin states: the slower and typically irreversible differentiation of one cell type into another, and the much faster and reversible cellular response to signaling molecules.

The process of myogenesis transforms undifferentiated precursor myoblast cells into differentiated myocyte muscle cells, and is primarily regulated by four key bHLH TFs known as Myogenic Regulatory Factors (MRFs) along with numerous cofactors, such as MEF2, E2A, HEB, Pbx1 and others\cite{Taylor1979,Davis1987,Hasty1993,Wright1989}. The epigenomic and occupancy landscape of these and other factors involved in the process is illustrated for reference in Supplementary Figure \ref{FigApp3}. These factors were profiled in the mouse C2C12 cell line\cite{Yaffe1977}, which has been for decades the main model system for studying myogenesis, a wealth of functional genomic data has been generated for it, and it was thus naturally also the focus of our study. The key specification MRF and the one expressed at high levels in myoblasts is MyoD, while myogenin is the most important differentiation TF and its expression is induced upon the onset of the process; the other two MRFs are Myf5 and Myf6. MyoD and myogenin occupy a highly overlappings set of sites (Supplementary Figure \ref{FigApp3}), the majority of which contain the classical muscle E-box sequence motif CAGSTG often in the extended RRCAGSTG form\cite{Moncaut2013,Tapscott2005}. While the majority of ChIP-Seq peaks contain this motif, these peaks include only a tiny fraction of the occurrences of the motif in the genome, underscoring the highly selective nature of in vivo occupancy by MRF TFs.

Lineage commitment during the process of erythropoesis is accomplished through the so called GATA switch\cite{Kaneko2010}. The onset of terminal differentiation is marked by the replacement of the GATA2 transcription factor at thousands of occupied sites by the GATA1 TF, which then regulates the expression of genes involved in red blood cell development and functions, with SCL/TAL1 being an important cofactor of GATA1, often forming closely spaced heterodimers with it\cite{Han2015,Wu2014,Tripic2009,Yu2009}. GATA1 binds to a WGATAA consensus recognition motif, while TAL1 is a bHLH factor targeting a CAGMTG E-box\cite{Han2015}. Mouse G1E cells have served as the key model system for erythropoesis for many years \hl{cite}. In these cells, the endogenous GATA1 gene is knocked out, and GATA1 expression can be specifically induced by estradiol exposure \hl{details} (a condition referred to as ``G1E-ER4'' cells), allowing for differentiation to be triggered rapidly and in a controlled manner. The epigenomic and occupancy landscape of the key TFs involved is illustrated for reference in Supplementary Figure \ref{FigApp5}, and it served as the basis for our cRE selections. 

\hl{XXXX SECTION ON A549 CELLS XXX}

While the full catalog of REs in the genome includes promoters, insulators, enhancers, silencers, and others. for the purposes of this study we focused on candidate transcriptional enhancers. A major reason for this choice is that cEnhs constitute the bulk of cREs distinguishing different cell types from each other, in contrast to, for example, active promoters, a major fraction of which is shared (Figure \ref{Fig1}C, Supplementary Figure \ref{FigApp7}). 

\end{multicols}

\begin{FPfigure}
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig3.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Summary of cEnh activity predictions by different selection criteria}. (A) TF occupancy-centered selections. Tested eEnhs selected on the basis of TF occupancy in the context of mouse muscle differentiation and erythropoesis and in human K562 cells were further subselected with the additional requirement of exhibiting DNAse hypersensitivity and the H3K27ac histone mark. The fraction of active constructs in negative controls and cEnhs are shown on the left. The expected number of active cEnhs genome-wide is extrapolated on the left based on the number of TF$^+$/DNAse$^+$/H3K27ac$^+$ regions in the genome; (B) TF-occupancy agnostic selections. Tested eEnhs selected using Self-Organizing Maps in HepG2 cells, chromHMM in K562 cells, and evolutionary conservation of GATA1 motifs in G1E cells were further subselected with the additional requirement of exhibiting DNAse hypersensitivity and the H3K27ac histone mark. The fraction of active constructs in negative controls and cEnhs are shown on the left. The expected number of active cEnhs genome-wide is extrapolated on the left based on the number of DNAse$^+$/H3K27ac$^+$ (for HepG2 SOM and K562 chromHMM selections) DNAse$^+$/H3K27ac$^+$ regions with a conserved GATA1 motif (for GATA1 conservation selections) in the genome.} 
\label{Fig3}
\end{FPfigure}

\begin{multicols}{2}	

We also aimed to represent the full spectrum of cEnh biochemical signatures (Figure \ref{Fig1}D and Supplementary Figure \ref{FigS3}), as multiple studies have shown that the landscape of transcription factor occupancy, DNAse hypersensitivity and histone modification maps includes many more weaker sites than very strong peaks \cite{Landt2012,Kellis2014}. 

We applied several different strategies for compiling lists of cEnhs to be tested, broadly divided into two categories: TF-centric, based on TF ChIP-seq datasets, and TF-agnostic, based primarily on chromatin state signatures and evolutionary conservation. 

In the context of muscle differentiation, we selected cEnh regions based on myogenin ChIP-seq data in differentiated C2C12 myocytes. We randomly selected a set of regions ($n=89$) spanning the full range of myogenin occupancy levels, of which 88 contain a CAGSTG and 84 contain the extended RRCASGTG E-box. We also selected additional cEnh elements (\hl{XX n = ?? XX}) associated with genes playing a well characterized role in the muscle development (\hl{XXX list genes XXX}); the inclusion of these cEnhs allowed us to test whether cEnhs nearby known functionally relevant genes exhibit a higher levels of functionality that the genome-wide average. We also selected a group of negative controls ($n=23$) out of a set of well characterized enhancer elements active in T cells and in neurons and not being occupied by myogenin in C2C12 cells; 21 of them contained E-box motifs. These negative controls were selected in order to test the specificity of biochemical signatures for prediction of functional activity in a given cell type. A second set of negative control elements ($n=11$), 6 of them containing E-box motifs, were selected from regions without biochemical marks but located near genes highly expressed in C2C12 cells and near ChIP-seq positive regions. These were selected in order to assess the baseline functional activity of biochemically neutral regions.

TF-centric erythroid cEnhs ($n=114$) and negative controls ($n=74$) were selected on the basis of ChIP data for GATA1 and TAL1 in G1E-ER4 cells (see the Methods section for further details). Of note, even though they are not significantly occupied by GATA1 or TAL1, 45 of the negative control elements contain a WGATAA motif and 40 contain a CAGMTG E-box. Functional testing of cEnhs identified in G1E cells was then carried out in human K562 cells, which represent a similar developmental state but have the advantage of being much more easily transfectable. 

\hl{XXX K562 TF selection XXX}

TF-agnostic enhancer selection was based on evolutionary conservation and on the integration of measurements of multiple histone modifications and open chromatin. 

A set of evolutionarily conserved erythropoetic cEnhs ($n=46$) were selected by requiring both strong overall sequence conservation as assessed by multiple genome alignments of a collection of mammalian genomes and the conserved presence of WGATAA motifs. 

Multiple computational approaches for integrating high-dimensional collections of functional genomic datasets into a small set of chromatin states have been devised over the last few years and applied to the problem in ENCODE cell lines, including the Hidden Markov Model-based Segway\cite{Hoffman2012} and chromHMM\cite{Ernst2012}, as well as Self-Organizing Maps\cite{Mortazavi2013} (SOM). We selected cEnhs in K562 cells based on Segway and chromHMM chromatin state assignments and the presence of DNAse and H3K27ac ($n = 30$), with elements lacking both marks used as negative controls ($n = 21$). We also selected cEnhs based on SOMs trained DNAse and histone mark ChIP-seq data over multiple ENCODE cell types; these cEnh elements were picked so that they were specifically in an open chromatin state and marked by histone modifications associated with enhancer activity in HepG2 cells ($n=32$). Elements lacking both marks ($n=18$) were used as negative controls. 

The collection of cEnh elements that we tested includes primarily long fragments, between 500 and 1000 bp (Figure \ref{FigS2}), thus most likely encompassing complete functional regulatory elements provided that such are indeed present within the cEnh region. 

We tested these elements for functional activity using luciferase reporter assays \hl{XXX some details but not too many on how exactly this was done, refer to Methods for the rest XXX}. We subsequently compared measured activity with cEnh prediction based on various biochemical signatures to examine their predictivity of functional cEnhs.

\subsection*{Regulatory activity of TF-centric cEnh selections}

Figure \ref{Fig2} shows the measured enhancer activities for TF-centric cEnh selections and negative controls in myogenesis (Figure \ref{Fig2}A) and erythropoesis (Figure \ref{Fig2}B). We found \hl{XXX} out of \hl{XXX}, or \hl{XX}\% of muscle cEnhs to pass the threshold of activity in contrast to \hl{XXX} out of \hl{XXX}, or \hl{XXX}\% of muscle negative controls. Similarly, \hl{XXX} out of \hl{XXX}, or \hl{XX}\% erythropoetic cEnhs and \hl{XXX} out of \hl{XXX}, or \hl{XX}\% of negative controls, were found to be active. 

(Supplementary Figure \ref{eLifeFig9-K562_HepG2-funcassay})

\subsection*{Regulatory activity of TF-agnostic cEnh selections}

\end{multicols}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig4.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Enrichment of active cEnhs in different classes of cEnhs defined by the strength of their biochemical signatures}. (A) cEnhs (rectangle boxes) belonging to different signal classes (based on ChIP-seq data for myogenin in C2C12 myocytes; ``top'': RPM $\geq 10$; ``high'': RPM$\in [5,10]$; ``medium'': RPM$\in [2.5,5]$; ``low'' RPM$\leq 2.5$) in the neighborhood of the mouse \textit{Myog} gene; (B) Genome-wide distribution of cEnhs in different signal classes based on ChIP-seq data for myogenin in C2C12 myocytes; (C) Fraction of active enhancers in different cEnh signal classes (based on ChIP-seq data for myogenin in C2C12 myocytes; ``top'': $n = 66$; ``high'': $n = 49$; ``medium'': $n = 45$; ``low'' $n=27$) as well as in negative controls (with not myogenin occupancy; $n=34$). Only cEnhs positive for myogenin, DNAse and H3K27ac were included; (D) Extrapolated number of active enhancers in C2C12 belonging to each signal strength class
\centerline{}
\rightline{\textit{(legend continued on next page)}}
} 
\label{Fig4}
\end{figure*}
\clearpage
\let\thefootnote\relax\footnotetext{ based on the genome-wide number of myogenin$^+$/DNAse$^+$/H3K27ac$^+$ regions. (E) Genome-wide distribution of cEnhs in different signal classes based on the set of GR ChIP-STARR-seq cEnhs in A549 cells (``top'': A549 Dex GR ChIP-seq RPM $\geq 10$; ``high'': RPM$\in [5,10]$; ``medium'': RPM$\in [2.5,5]$; ``low'' RPM$\leq 2.5$). Only GR ChIP-seq regions significantly represented within STARR-seq libraries (i.e. with sufficiently many reads to score as active if they were in fact active) are shown for each signal class. (F) Fraction of cEnhs exhibiting significant activity in the GR ChIP-STARR-Seq assay in stimulated A549 cells for each signal strength class.}

\begin{multicols}{2}

\subsection*{The bulk of enhancers in a given cell type are marked by modest biochemical signatures}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig5.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Absence of general strong correlation between biochemical signal strength and enhancer activity of cEnhs}. (A) Distribution of tested cEnhs relative to the genome-wide DNAse and H3K27Ac signal distribution in C2C12 myocytes. Shown are DNAse and H3K27ac RPM values for all DNAse$^+$/H3K27ac$^+$ regions as well as for cEns tested for activity in C2C12 myocytes (outlined circles) and for occupancy negative control (outlined squares), with tested cEnhs separated into four classes based on their measured enhancer activity, from dark red (most active) to yellow (inactive). (B) Distribution of tested cEnhs relative to the genome-wide DNAse and H3K27Ac signal distribution in G1E-ER4 cells. Shown are DNAse and H3K27ac RPM values for all DNAse$^+$/H3K27ac$^+$ regions as well as for cEns tested for activity (outlined circles) and for occupancy negative control (outlined squares), with tested cEnhs separated into four classes based on their measured enhancer activity, from dark red (most active) to yellow (inactive). (C) Correlation between biochemical signals and measured enhancer activity in C2C12 and G1E cells. See also Supplementary Figures \ref{FigS4}, \ref{FigS7}, and \ref{FigS10} for more details.} 
\label{Fig5}
\end{figure*}

\subsection*{Biochemical signature strength is not strongly correlated with enhancer strength}

\subsection*{Temporal differences between biochemical mark profiles and their impact for cEnh prediction}

\section*{Discussion}

In this study, we have provided a comprehensive examination of cEnh activity in multiple mammalian systems and its relationship to biochemical signatures commonly used to select cEnh elements. Across cell types and methods for cEnh selection, approximately 50\% of cEnhs simultaneously exhibiting significant H3K27ac marking and DNase hypersensitivity appear to function as active enhancers. We also demonstrated that enhancer assays activity is specific to genomic regions that are distinguished by characteristic biochemical signatures. By studying cEnhs sampled across the full spectrum of ChIP occupancy for multiple transcription factors, we have demonstrated that the most strongly biochemically marked cEnhs are highly enriched for functionality. 

However, first, active functional enhancers are also present throughout the whole biochemical signal spectrum, and because of the very large number of the latter, the bulk of active enhancers in any given cell type in fact resides in the population of cEnhs with modest biochemical signatures, and second, we do not observe a particularly strong correlation between the magnitude of enhancer activity in functional assays and strength of biochemical marks as measured using functional genomic assays. 

These findings are in contrast to earlier studies, which reported over 80-90\% activity for cEnhs defined using, for example p300 ChIP-seq\cite{Visel2009}. This is most likely due to the fact the these studies only focused on elements selected among the most strongly enriched and likely to be functional cEnhs rather than the full spectrum of ChIP-seq signal.

We find a smaller fraction (15-25\%) of active cEnhs using a high-throughput ChIP-STARR-seq MPRA, but similar qualitative patterns across the spectrum of biochemical signatures defining cEnhs. The reasons for the lower activity rates returned by MPRAs are manifold, and include (but are likely not limited to) the fact that the DNA fragments used as input to the MPRA are shorter than the length of fully functional regions, and that ChIP-STARR-seq libraries do not provide deep and complex representation of the original pools of ChIP-seq fragments, leaving many modestly active enhancers with insufficiently many reads to cross the thresholds of statistical significance; both of these factors are expected to lead to high false negative rates.

Finally, we observe that biochemical marks can be decoupled from each other temporally, which can impact cEnh predictions based on their cooccurrence. For example, during muscle differentiation thousands of TF occupancy sites also exhibit DNAse hypersensitivity and p300 localization but are not yet robustly acetylated at H3K27, and conversely, in differentiated cells H3K27ac can remain for some time associated with sites previously TF-occupied DNAse hypersensitive sites even though they are no longer open.

% The population effect of ChIP-seq and connectivity measurements make it unclear if a subportion of elements found inactive in this study are simply marked for activity but would be active only in another cell context that utilizes myogenin; for example the XXX gene is involved in the formation of forelimb muscle while XXX gene is involved in the formation of limb. Both of these genes are used upstream of muscle and could control differential enhancers. (refs)
% During development of muscle, a highly dynamic process, there are thousands of TF occupied sites where EP300 is co-localized but H3K27 has not yet been acetylated, or conversely sites where the histone “history” has remained after the key TF and DNAseHS have disappeared. In these special contexts EP300 TF/EP300 co-marked elements that are negative for H3K27Ac will have been missed in the ENCODE encyclopedia annotation of candidate Enhancer elements. In the rapidly differentiation muscle landscape, many such sites score as enhancers in our assay. Although only a small number of elements were individually tested in immortalized cell lines, it would appear that in aggregate, there is little function in DNAse sites that are not co-marked with H3K27Ac once differentiated. Although only a small number was tested, H3K27Ac exclusive sites (no DNAseHS) do not appear to function as enhancers.
% Conversely, in GR the biochemical structure of DNAse and H3k27ac appears largely predestined; which would make these enhancers impossible to distinguish from A549 enhancers in an annotation like that of the encyclopedia. EP300 ChIP-seq proves a key differentiator for a large set of GR specific enhancers. Figure 6 
% YYYMore to tie in encyclopedia? We should consider a figure – once draft is more completeYYY

Promoter-enhancer specificity \cite{Zabidi2015}

integration in genome \cite{Maricque2016,Inoue2016,FIREWACh2014}

\section*{Methods}

Except where otherwise stated, all analyses were performed using custom-written python scripts. The GENCODE

\subsection*{Cell culture}

\subsubsection*{C2C12 cells}

C2C12 myoblasts were maintained and seeded for transfection in 20$\%$ FBS supplemented DMEM medium. Upon reaching $>$80$\%$ confluency, the cells were were differentiated using 2$\%$ horse serum and 1 $\mu$M insulin in DMEM medium.

\subsubsection*{G1E cells}

G1E cells were grown according to previously published protocols\cite{Wang2006,Cheng2008,Dogan2015}.

\subsubsection*{K562, HepG2 and A549 cells}

K562; HepG2 and A549 cells were grown according to the approved ENCODE cell culture protocols publicly available through the ENCODE portal (\burl{https://www.encodeproject.org/}).

\subsection*{Functional assays}

\subsection*{Cloning and DNA purification}

The specifics of each selection set, the promoters used for each cell line, and other details are publicly available through the ENCODE portal (\burl{https://www.encodeproject.org/}).

\subsubsection*{Functional assay testing of cEnhs in C2C12 cells}

Candidate REs and negative control regions were either PCR-amplified from female BALB/C purified mouse genomic DNA (Switchgear Genomics) or synthesized de novo (Genscript). The resulting DNA was cloned into a reporter vector 5$^\prime$ of a custom TK promoter (SwitchGear Genomics) driving a high-turnover sequence-optimized luciferase reporter gene. Plasmids were purified using Miniprep kits (Qiagen) and standardized to 30 ng/$\mu$L using the Qubit\textsuperscript{\textregistered} dsDNA HS (High Sensitivity) Assay Kit.

For the purpose of testing elements in the myoblast state, undifferentiated C2C12 cells were seeded in 96-well delta surface plates (NUNC) quadruplicates 12 hours before transfection at a concentration of 2500 cells/well. For the purpose of testing elements in the myocyte state, undifferentiated C2C12 cells were seeded at a density of 3500 cells/well. Transfections were carried out with 50 ng of DNA per construct in each replicate using Lipofectamine LTX, after a 5 minute incubation with a 1:16 dilution with the PLUS reagent (Thermo Fisher). Myoblast plates were lysed using a Steady-Glo\textsuperscript{\textregistered} kit, and luminescence was measured on a plate luminometer 24 hours post-transfection. Myocyte plates had their media exchanged with differentiation 12-16 hours post transfection and measured following the same procedure 24 hours later.

Aside from the plate reading step, the entirety of the transfection process was automated and carried out on a Tecan Freedom EVO 200 robot.

\subsubsection*{Functional assay testing of cEnhs in K562 and HepG2 cells}

The set of K562 and HepG2 cEnh regions was PCR-amplified and cloned 5$^\prime$ of the promoter of enhancer assay plasmids containing luciferase and renilla reporter genes; cloning was performed by SwitchGear Genomics. Each construct was quantified (using Qubit) and standardized to 30ng/$\mu$L before use in transfection assays. \hl{XXX details of transfection XXX}. \hl{Chris Partridge please review this}

\subsubsection*{Functional assay testing of erythropoetic cEnhs}

G1E candidate enhancer regions were tested in K562 cells according to protocols publicly available through the ENCODE portal \burl{https://www.encodeproject.org/}.

\subsection*{Functional Assay Data processing}

For each cEnh or negative control measurement, the ratio between its value and the corresponding basal promoter vector (relative assay activity) was calculated. Active cREs were discriminated from inactive using a $z$-score analysis, comparing the population of test element technical replicate values to the set of negative controls. \hl{XXX this could be stated more explicitly with the actual formulas XX}. 

\subsection*{ChIP-seq experiments}

Chromatin immunoprecipitation in A549 cells was performed as previously described (Reddy et al. 2009) using 2 $\times$ 10$^7$ A549 cells per replicate. Cells were sonicated using a Bioruptor XL (Diagenode) on the high setting until the resulting chromatin was fragmented to a median fragment size of 250 nt as assayed by agarose gel electrophoresis. GR ChIP was performed using 5 $\mu$g of a rabbit polyclonal $\alpha$-GR antibody (Santa Cruz Biotechnology sc-1003), and 200 $\mu$l of magnetic sheep anti-rabbit beads (Life Technologies M-280). H3K27ac ChIP was performed using \hl{XXX Ab source XXX}. After reversal of formaldehyde crosslinks at $65\,^{\circ}\mathrm{C}$ overnight, DNA was purified using MinElute DNA purification columns (QIAGEN). Illumina sequencing libraries were then generated using the Apollo 324 liquid handling platform according to manufacturer's specifications (Wafergen). 

ChIP-seq in C2C12 cells was performed using chromatin from 2 $\times$ 10$^7$ nuclei, which was fragmented using a Misonix probe tip sonicator and subjected to immunoprecipitation using a robotic ChIP pipeline described before. The resulting purified DNA was then converted into sequencing libraries and sequenced on an HiSeq 2500 (Illumina) as described previously\cite{Gasper2016}. The following antibodies were used: $\alpha$-myogenin (Santa Cruz Biotechnology SC-12732, lot K2311), $\alpha$-MyoD (Santa Cruz Biotechnology SC-32758, lot J3115), $\alpha$-MEF2 (Santa Cruz Biotechnology SC-17785, lot H1913), $\alpha$-p300 (Santa Cruz Biotechnology SC-585, lot H3115), $\alpha$-E2A (Santa Cruz Biotechnology SC-349X, lot B1207), $\alpha$-H2B (Santa Cruz Biotechnology SC-357, F2305), and $\alpha$-H3K27ac (Active Motif 39133, lot 34849). 

In addition, publicly available\cite{Pbx1} Pbx1 ChIP-seq and Control datasets were downloaded from GEO accession GSE76010. 

For G1E, K562 and HepG2 cells, previously publicly available\cite{ENCODE101,ENCODE2012,mouseENCODE2014} ChIP-seq datasets were downloaded from the ENCODE portal \burl{https://www.encodeproject.org/}. 

\subsection*{DNAse-seq experiments}

In C2C12 cells, DNAse-seq was carried out as follows: \hl{XXXXX DETAILS XXX}

In A549 cells, DNAse-seq was carried out as follows: \hl{XXXXX DETAILS XXX}. 

For G1E, K562 and HepG2 cells, previously publicly available\cite{ENCODE101,ENCODE2012,mouseENCODE2014} DNAse-seq datasets were downloaded from the ENCODE portal \burl{https://www.encodeproject.org/}. 

% \subsection*{RNA-seq experiments}

% \hl{XXX C2C12 RiboMinus XXX}

\subsection*{STARR-seq experiments}

The STARR-seq experiments previously published by Vockley et al.\cite{Vockley2016} were used in this study. 

\subsection*{Genomic coordinate conversion}

The regions to be tested using functional assays were designed based on the \verb|mm8| and \verb|mm9| verions of the mouse genome and the \verb|hg19| version of the human genomes. Conversion of the original coordinates to \verb|mm10| and \verb|hg20| coordinates was performed using the \verb|liftOver| tool from the UCSC Genome Browser Utilities\cite{UCSCGB}.

\subsection*{Conservation analysis}

Sequence conservation analysis were carried out using the \verb|phastCons60way| and \verb|phastCons100way| conservation tracks, which were downloaded from the UCSC Genome Browser\cite{UCSCGB}. 

\subsection*{ChIP-seq data processing and analysis}

ChIP-seq reads were trimmed down to 36 bp in length and mapped against the \verb|hg20| (for human samples; the male or female version depending on the sex of the cell line the sample originated from) and \verb|mm10| (for mouse samples) using Bowtie \cite{Bowtie} (version 1.0.1) with the following settings: \verb|-v 2| \verb|-k 2| \verb|-m 1| \verb|--best| \verb|--strata|. DNAse-seq reads were processed similarly except that they were trimmed down to 20bp for A549 samples and 36bp for C2C12 cells (due to differences in the experimental protocol used to generate the data).

Peak calling was carried out as follows. For DNAse and H3K27ac datasets, MACS2 \cite{Feng2012} (version 2.1.0) was run on individual replicates and on pseudoreplicates (generated by randomly splitting the pooled set of reads for both replicates into two) with relaxed settings (\verb|--to-large| \verb|-p 1e-1|). For H3K27ac control datasets were subjected to the same treatment (no background/control is available for DNAse data) The top 100,000 peaks from each replicate or pseudoreplicate (ranked by $q$-value) were then used as input into IDR \cite{Li2011}. The number of peaks above a given IDR threshold called as reproducible between true replicates ($N_t$) and between pseudoreplicates  ($N_p$) were recorded. Peak calling was then carried out on the pooled set of reads and the top $max(N_t,N_p)$ peaks were chosen as the final set of reproducible peaks. For point-source \cite{Pepke2009} datasets (transcription factors), peak calling was carried out following the same procedure but using SPP \cite{Kharchenko2008} (version 1.10.1), using the top 300,000 peaks as input to IDR.

The pooled sets of reads were also used to calculate RPM (reads per million) enrichment values over elements tested in functional assays.

% \subsection*{RNA-seq data processing}

% ENCODE paired-end human RNA-seq reads were trimmed down to 2 $\times$ 75mers. C2C12 RiboMinus reads were aligned as 1 $\times$ 50mers. Reads were initially aligned against human or mouse ribosomal RNA sequence, and the unmapped reads were then aligned against the \verb|hg20| and \verb|mm10| versions of the human genome using TopHat\cite{TopHat2} (version 2.0.8). 

\subsection*{STARR-seq data processing and analysis}

STARR-seq and STARR-seq control/input reads ($2 \times $25mers) were mapped as paired ends to the \verb|hg20| version of the human genome using Bowtie with the same settings as described above. Post-IDR peaks obtained from GR ChIP-seq were used as the list of candidate cEnhs to be scored using the STARR-seq data. For each STARR-seq and STARR-seq control/input replicate, raw fragment counts were obtained from every GR ChIP-seq peak; in addition, the rest of the genome (i.e. the regions that fall between the post-IDR GR ChIP-seq peaks) was split into bins of at most 50 kb length, and read counts were calculated for all such regions. The fragment counts for GR ChIP-seq peaks and for the intervening regions were combined together and used as input to DESeq2\cite{DESeq2} for estimating differentially represented regions between STARR-seq and control/input libraries (at FDR-adjusted $p \leq 0.05$). The lowest average fragment counts value which was scored as significantly significant by DESeq2 was identified for each comparison, and all GR ChIP-seq regions with average fragment counts lower than this value were excluded from subsequent analysis, as such regions were not sufficiently represented in the available sequencing data to be reliably scored as active or inactive. We also carried out a fragment-level analysis, in which read counts were calculated for each individual sequencing fragment (defined as the pair of positions $\{i,j\}$, where $i$ and $j$ are respectively the $5^{\prime}$ and $3^{\prime}$ ends of the first and the second sequencing reads in a pair), using the same DESeq2 framework.

\section*{Acknowledgments}

Library generation and high-throughput sequencing for C2C12 ChIP-Seq samples was performed by Igor Antoshechkin at the Millard and Muriel Jacobs Genetics and Genomics Laboratory. The authors would also like to thank Diane Trout and Henry Amrhein for technical assistance with maintaining the computational infrastructure used to carry out this study.

This material is based upon work supported by the National Science Foundation under Grant No. CNS-0521433.

\begin{thebibliography}{100}

\input{references}

\end{thebibliography}

\end{multicols}

\clearpage

\setcounter{table}{0}
\renewcommand{\tablename}{Supplementary Table}
\setcounter{figure}{0}
\renewcommand{\figurename}{Supplementary Figure}

\setcounter{page}{1}
\renewcommand\thepage{{SM }\arabic{page}}

\section*{Supplementary Materials}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{eLifeFigApp1-CNE-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf The length of thousands of conserved noncoding elements in mammalian genomes greatly exceeds the size range of MPRA constructs}. (A) The length distribution of conserved noncoding regions in the human genome. The \texttt{phastCons100way} conservation track for the \texttt{hg20} version of the human genome was downloaded from the UCSC Genome Browser. Blocks of conservation, in which all nucleotides have \texttt{phastCons} scores higher than the indicated minimum (\texttt{phCons}), were identified, and then merged into larger regions if the length of the gaps between them was smaller than the indicated \texttt{maxGap} parameter. The distribution of the lengths of the resulting sets of regions was plotted. This approach captures the properties of enhancer elements observed in the genome, which often consist of multiple blocks of highly conserved sequences separated by gaps of less conserved sequences, resulting in an enhancer element of up to a few hundred base pairs in length or more. (B) Such an example is shown for the \textit{Acta1} gene in mouse.} 
\label{FigS1}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{eLifeFigApp2-contruct-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Length distribution of functional assays constructs used to test cREs in this study}. (A) Distribution of functional assay construct lengths tested in this study in C2C12 cells. (B) Distribution of functional assay construct lengths tested in this study in G1E cells. (C) Distribution of functional assay construct lengths tested in this study in K562 and HepG2 cells } 
\label{FigS2}
\end{figure*}

\begin{figure*}
\begin{center}
\includegraphics[width=18cm]{2017-08-31-Figs/FigS3.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Distribution of biochemical signal in tested cEnhs and genome-wide}. Shown is the distribution of ChIP-seq or DNAse-seq RPM values for the set of cEnhs tested and for the genome-wide set of cEnh with similar biochemical signatures shown in Figure \ref{Fig3}.} 
\label{FigS3}
\end{figure*}


\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{eLifeFigApp7-C2C12-G1E-HepG2-K562-overlaps.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Differential marking of proximal and distal cREs by DNAse and H3K27ac between different cell types and cell states}. (A) Promoter-proximal (within $\leq$1 kb of an annotated TSS) sites in K562 and HepG2 cells; (A) Distal ($\geq$1 kb from an annotated TSS) sites in K562 and HepG2 cells; (C) Promoter-proximal (within $\leq$1 kb of an annotated TSS) sites in differentiated and undifferentiated C2C12 and G1E cells; (D) Distal ($\geq$1 kb from an annotated TSS) sites in differentiated and undifferentiated C2C12 and G1E cells. The overlap score ($O_{xy}$) shown in each cell $(x,y)$ indicates the fraction of peaks in the dataset on the $y$-axis that are also found in the dataset on the $x$-axis, i.e. $O_{xy} = |X \cap Y|/|Y|$.} 
\label{FigApp7}
\end{figure*}

\clearpage

\includegraphics[width=18cm]{eLifeFigApp3a-C2C12-heatmaps-MyoD-V2.png}

\clearpage

\includegraphics[width=18cm]{eLifeFigApp3b-C2C12-heatmaps-myogenin-only-V2.png}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18cm]{eLifeFigApp3c-C2C12-heatmaps-MEF2-only-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of muscle differentiation}. DNAse-seq and ChIP-seq experiments against H3K27ac, p300, the MRFs MyoD and myogenin, and cofactors (MEF2, E2A/TCF3, HEB/TCF12, and Pbx1) in undifferentiated (myoblast, or ``MB'') and differentiated (myocyte, or ``MC'') C2C12 cells were analyzed. Sites were split into multiple subgroups depending on regulatory factor occupancy (at IDR=0.05) -- MyoD-positive (in either condition) sites (A), myogenin-only sites (B), and MEF2-only sites (C) -- then sorted by MRF ChIP-seq signal (in the following order of priority: myoblast MyoD, myocyte MyoD, myocyte myogein, myoblast MEF2, myocyte MEF2); the signal in the 500bp-radius region around the ChIP-seq peak position is shown.} 
\label{FigApp3}
\end{figure*}

\clearpage


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig2-C2C12-DNAse-H3K27ac.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation during muscle differentiation}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in C2C12 myoblasts; (B) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in C2C12 myocytes; (C) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in C2C12 myoblasts; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in C2C12 myocytes; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black; (E) Dynamic changes in DNAse hypersensitivity and H3K27 acetylation upon differentiation for promoter-proximal and distal sites.}
\label{Fig2S}
\end{figure*}

\clearpage

\includegraphics[width=18.5cm]{eLifeFig3a-C2C12-funcassay-locus-V2.png}

\clearpage

\includegraphics[width=18.5cm]{eLifeFig3b-C2C12-funcassay-random-V2.png}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFigApp4c-C2C12-funcassay-negative-controls-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of cRE regulatory activity in C2C12 cells}. Fold activity in myocytes (top) and myoblasts (bottom) across biological replicates ($n = 4$) and technical replicates  ($n = 4$ for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity, H3K27ac status, p300, MyoD and myogenin occupancy are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs selected for their physical proximity to loci known for their importance to muscle development (``locus picks''); (B) randomly selected from the genome-wide set of MyoD/myogenin-occupied regions; (C) negative controls.} 
\label{Fig3S}
\end{figure*}

\clearpage

\begin{FPfigure}
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig4-C2C12-predictivity-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in C2C12 cells}. (A and B) Correlation between fold activity and DNAse hypersensitivity, H3K27ac, p300, myogenin, MyoD and MEF2 occupancy in myoblasts and myocytes; (C) ROC curves showing biochemical mark predictivity of cRE fold activity in myocytes; (D) AUROC (area under ROC curve) values for different biochemical marks in myocytes; (E) ROC curves showing biochemical mark predictivity of cRE fold activity in myoblasts; (F) AUROC values for different biochemical marks in myoblasts.} 
\label{FigS4}
\end{FPfigure}

\begin{FPfigure}
\begin{center}
\includegraphics[width=16cm]{eLifeFigApp5-G1E-TAL1-GATA-heatmaps-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of eryhtroid differentiation}. DNAse-seq and ChIP-seq experiments against H3K27ac, GATA1, TAL1 and GATA2 G1E and G1E-ER4 were analyzed. Sites were split into subgroups depending on GATA1 and TAL1 occupancy (IDR=0.05), then sorted by ChIP-seq signal (in the following order of priority: G1E-ER4 GATA1, G1E-ER4 TAL1); the signal in the 500bp-radius region around the ChIP-seq peak position is shown.} 
\label{FigApp5}
\end{FPfigure}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18cm]{eLifeFig5-G1E-DNAse-H3K27ac.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation during erythroid differentiation}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in G1E cells; (B) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in G1E-ER4 cells; (C) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in G1E cells; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in G1E-ER4 cells; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black; (E) Dynamic changes in DNAse hypersensitivity and H3K27 acetylation upon differentiation for promoter-proximal and distal sites.} 
\label{Fig5S}
\end{figure*}

\clearpage

% \begin{FPfigure}
% \begin{center}
% \includegraphics[width=18.5cm]{eLifeFig6-G1E-funcassay.png}
% \end{center}
% \captionsetup{singlelinecheck=off,justification=justified}
% \caption{
% {\bf Functional assay testing of the regulatory activity of erythroid cREs}. Fold activity in K562 cells across biological replicates ($n \in [1,9]$) and technical replicates ($n = 4$ for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity, H3K27ac status, GATA1, and TAL1 occupancy are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs randomly selected from the genome-wide set of GATA1/TAL1-occupied regions; (B) cREs selected among the set of highly evolutionarily constrained non-coding elements that contain a GATA1 motif (``regulatory potential selections'').} 
% \label{Fig6S}
% \end{FPfigure}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig7-G1E-predictivity.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in erythroid cells}. (A and B) Correlation between fold activity in K562 cells and DNAse hypersensitivity, H3K27ac, TAL1, and GATA1 occupancy in G1E and G1E-ER4 cells; (C) ROC curves showing biochemical mark predictivity of cRE fold activity; (D) AUROC (area under ROC curve) values for different biochemical marks.} 
\label{FigS7}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig8-K562_HepG2-DNAse-H3K27ac.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation in immortalized human cell lines}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in K562 cells; (B) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in K562 cells; (C) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in HepG2 cells; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in HepG2 cells; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black.} 
\label{Fig8}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig9-K562_HepG2-funcassay.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of cRE regulatory activity in human immortalized cell lines}. Fold activity across biological replicates ($n =$ \hl{???}) and technical replicates ($n = $  \hl{???} for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity and H3K27ac status are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs tested in K562 cells (B) cREs tested in HepG2 cells.} 
\label{eLifeFig9-K562_HepG2-funcassay}
\end{figure*}

\begin{FPfigure}
\begin{center}
\includegraphics[width=17.5cm]{eLifeFig10-K562-HepG2-A549-plots.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in human immortalized cell lines}. (A and B) Correlation between fold activity in K562 cells and DNAse hypersensitivity, and transcription factor occupancy in K562 and HepG2 cells; (C) ROC curves showing biochemical mark predictivity of cRE fold activity in K562 cells; (D) AUROC (area under ROC curve) values for different biochemical marks in K562 cells; (E) ROC curves showing biochemical mark predictivity of cRE fold activity in K562 cells; (F) AUROC (area under ROC curve) values for different biochemical marks in K562 cells.} 
\label{FigS10}
\end{FPfigure}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{eLifeFig11-predictivity-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CAPTION}. (A) CAPTION GOES HERE} 
\label{Fig11}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=13cm]{eLifeFig12-predictivity-vs-signal.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \hl{CAPTION} }. (A) CAPTION GOES HERE} 
\label{Fig12}
\end{figure*}

\begin{sidewaysfigure}
\begin{center}
\includegraphics[width=25cm]{eLifeFigApp6-A549-heatmaps-V2.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of GR response in A549 cells}. \hl{finish caption}} 
\label{FigApp6}
\end{sidewaysfigure}

\clearpage

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{eLifeFigApp8-A549-STARR-seq-activity.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Testing of cEnhs for activity using ChIP-STARR-seq for GR in A549 cells with and without Dexamethasone stimulation }. (A) Fraction of active cEnhs detected in each condition. Shown is the number of cEnhs that passed the minimum representation threshold (see the Methods section for more details) and were identified as active using DESeq2. (B) Fraction of significantly active (FDR-corrected $p$-value $\leq 0.05$) biochemically marked individually on in combinations by H3K27ac, DNAse, p300. (C) Length distribution of active and inactive STARR-seq fragments as defined by DESeq2. } 
\label{FigApp8}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig6.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Marking of common and cell state-specific active cEnhs by H32K7ac, DNAse and p300}. (A) STARR-seq data in A549 cells with and without Dexamethasone treatment (epigenomic datasets from the 3 hour time point were used); (B) Luciferase assay data in differentiated and undifferentiated C2C12 cells.} 
\label{Fig6}
\end{figure*}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/FigS14-STARR-seq-activity-distribution.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Distribution of STARR-seq activity in A549 cells}. Shown is the distribution of $log_2$(FoldChange) values (defined by DESeq2) for STARR-seq experiments in resting EtOH-treated (A) and Dexamethasone-treated (B) A549 cells.} 
\label{FigS14}
\end{figure*}

\end{document}

https://www.overleaf.com/11055589xrycntbhwjnf