\documentclass[10pt]{article}
\usepackage[hmargin=1.5cm,top=2cm,bottom=2cm]{geometry}
\usepackage{multicol}
\setlength\columnsep{15pt}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage[auth-sc]{authblk}
\usepackage{longtable}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{enumerate}
\usepackage[labelfont=bf]{caption}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{mdframed}
\usepackage{graphics}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{array}
\usepackage{capt-of}
\usepackage{lscape}
\usepackage{caption}
\usepackage{breakurl}
\usepackage{todonotes}
\usepackage{hanging}
\usepackage{pagecolor}
\usepackage[final]{pdfpages}
\usepackage[leftFloats,CaptionAfterwards]{fltpage}
\usepackage[numbers,super,sort&compress]{natbib}
\setlength{\bibsep}{3pt}
\usepackage{abstract}
\usepackage{enumitem}
\usepackage{soul}
\usepackage{titlesec}
\titleformat{\section}[block]{\large\bfseries\filcenter}{\thesection.}{0.4em}{}
\titleformat{\subsection}[block]{\normalsize\sc\bfseries\filcenter}{\thesubsection.}{0.4em}{}
\setcounter{secnumdepth}{5}

\usepackage[hang]{footmisc}
\setlength\footnotemargin{0em}

\setlength{\skip\footins}{0.75cm}

\makeatletter
\renewcommand\footnoterule{%
  \kern-3\p@
  \hrule\@width \textwidth height 1.5pt
  \kern2.6\p@}
\makeatother

\newcommand{\filllastline}[1]{
\setlength\leftskip{0pt}
\setlength\rightskip{0pt}
\setlength\parfillskip{0pt}
#1}

\makeatletter
\def\@biblabel#1{\@ifnotempty{#1}{#1.}}
\makeatother

\newenvironment{Figure}
{\par\medskip\noindent\minipage{\linewidth}}
{\endminipage\par\medskip}

\title{\bf Biochemical signatures and enhancer activity of ENCODE candidate regulatory elements}
\renewcommand\Authfont{\scshape\normalsize}
%\author[*]{ALL OTHER AUTHORS I'VE MISSED}
\author[1]{Gilberto DeSalvo}
\author[6]{Georgi K. Marinov}
\author[2]{Christopher Partridge}
\author[4,7]{Christopher M. Vockley}
\author[3]{Nergiz Dogan}
\author[8,9]{Ricardo Ramirez}
%\author[1]{Gordon Kwan}
\author[4,5]{Timothy E. Reddy}
\author[8,9]{Ali Mortazavi}
\author[3]{Ross C. Hardison}
\author[2]{Richard M. Myers}
\author[1]{Barbara J. Wold}
\renewcommand\Affilfont{\itshape\normalsize}
%\affil[*]{other affiliations}
\affil[1]{Division of Biology and Biological Engineering, California Institute of Technology, Pasadena, California, USA.}
\affil[2]{HudsonAlpha Institute for Biotechnology, 601 Genome Way, Huntsville, AL 35806, USA}
\affil[3]{Dept. of Biochemistry and Molecular Biology, Penn State University, 304 Wartik Laboratory, University Park, PA 16802, USA}
\affil[4]{Center for Genomic \& Computational Biology, Duke University, Durham, NC 27708, USA}
\affil[5]{Department of Biostatistics \& Bioinformatics, Duke University, Durham, NC 27708, USA}
\affil[6]{Department of Genetics, Stanford University School of Medicine, Stanford, CA 94305}
\affil[7]{Department of Cell Biology, Duke University, Durham, NC 27708, USA}
\affil[8]{Department of Developmental and Cell Biology, University of California Irvine, Irvine, CA 92697-2300, USA}
\affil[9]{Center for Complex Biological Systems, University of California Irvine, Irvine, CA 92697-2280, USA}
\date{}

\begin{document}
\maketitle

% \let\thefootnote\relax\footnotetext{Corresponding author:
% Georgi K. Marinov. Division of Biology and Biological Engineering, California Institute of Technology. Pasadena, CA 91125.}
% \centerline{}
% \centerline{}
\begin{abstract}
\noindent {\normalsize \textbf{An aspiration for functional genomics is to identify all \textit{cis}-acting regulatory elements in the genome, and to define their activities.  Genome-wide transcription factor occupancy and chromatin state profiles are widely used to identify candidate regulatory elements (cREs) such as enhancers, promoters and insulators, based on their characteristic biochemical signatures. Multiple large-scale efforts, in particular the ENCODE and NIH Roadmap Epigenomic Mapping consortia, have generated such datasets from a wide variety of cell lines, tissues and developmental stages in both human and mouse, serving as the foundation for cataloging the cRE repertoire on a genome-wide scale. However, accurately predicting the actual regulatory activity of cREs based on functional genomic maps alone has turned out to be difficult, and it is not yet clear to what extent active regulatory elements \textit{in vivo} can be identified from biochemical signatures. To address these issues, we carried out large-scale tests for transcriptional enhancer activity for hundreds of candidate enhancer elements (cEnhs) from five human and mouse cell types, including both immortalized cell lines and models for  cell differentiation during developmental transitions. Our cEnh collections were selected using biochemical signature criteria ranging from simple individual transcription factor (TF) occupancy to integrative machine learning models over multiple functional genomic datasets, and were designed to include full-length cREs. We supplement these tests with a massively parallel reporter assay (MPRA) characterization of cEnhs in one of the cell lines we studied. We find that irrespective of the selection criteria used, $\sim$50\% of cEnh elements showing enhancer activity. Examination of the functional predictivity of biochemical signatures reveals that the majority of cEnhs genome-wide are likely to exhibit modest regulatory activity. In the same time, most active enhancers in the genome are characterized by relatively modest biochemical signature strength, due to the much lower abundance of cEnhs characterized by very strong biochemical signals. % We also uncovered a set of developmentally distinct distal elements marked in progenitor cells to predict future, rather than contemporaneous, enhancer activity upon differentiation. Finally, developmentally privileged loci possessing multiple, highly regulated genes were enriched for enhancer cREs with exceptionally strong individual activity despite having only modest biochemical signatures. 
Finally, we discuss our results in the context of current models of the regulatory effect of enhancers on their cognate genes. We expect our findings to help guide future efforts towards cataloging the functional repertoires of mammalian genomes.}}
\centerline{}
% \centerline{}
\end{abstract}

\renewcommand{\footnotesize}{\fontsize{10pt}{12pt}\selectfont}

\begin{multicols}{2}

\section*{Introduction}

The complete and accurate understanding of the relationship between the human genome and its corresponding phenotypes requires the comprehensive characterization of its compendium of functional elements. The results of the many genome-wide epigenomic and transcriptomic studies carried out over the last decade reveal a remarkable picture, in which non-coding regulatory elements constitute the bulk of such functional regions in the genome\cite{mouseENCODE2014,ENCODE2012}, with the expression of each gene (protein coding or non-coding) being controlled by the integrated output of multiple proximal and distal enhancer, insulator and silencing elements.

The genome-wide mapping and characterization of non-coding regulatory elements is thus a major goal of the field, and features prominently among the objectives of the \textbf{ENC}yclopedia \textbf{O}f \textbf{D}NA \textbf{E}lements (ENCODE) consortium\cite{ENCODE101}. However, achieving it, although greatly aided by the advent of high-throughput sequencing and epigenomic tools, is still not a simple task.

The biochemical activity associated with the function of regulatory elements results in certain biochemical signatures that can be captured by epigenomic assays. For example, active promoters in eukaryotes are classically associated with the trimethylation of lysine 4 on histone 3 (H3K4me3)\cite{Vermeulen2010}, as well as other biochemical signatures, such as DNAse hypersensitivity\cite{Thurman2012}. Active enhancer elements have been proposed to exhibit their own biochemical signature, featuring DNAse hypersensitivity, the H3K27ac and H3K4me1 histone marks, and occupancy by the p300 acetyltransferase\cite{Creyghton2010,RadaIglesias2011,Thurman2012}, as well as by sequence-specific transcription factors (Figure \ref{Fig1}A).

These biochemical signatures enable the compilation of lists of candidate functional elements (cREs), but they do not on their own allow the conclusive identification of any given element as functional. While functional regulatory elements exhibit characteristic biochemical signatures, the reverse (that the presence of a biochemical signature necessarily means function) cannot be inferred straightforwardly\cite{Kellis2014}. Such inferences are further complicated by the observation that biochemical signatures are not binary but instead exist on a continuum between strong outstanding features, on one hand, and what is probably biochemical noise, on the other. For example, it is far from clear that all transcription factor binding sites that can be reproducibly identified using ChIP-seq and related techniques are in fact functional REs\cite{Fisher2012}. Therefore, individual cREs in the lists compiled by efforts such as the ENCODE and mouseENCODE consortia\cite{ENCODE2012,mouseENCODE2014} have to be subsequently directly tested and functionally characterized in detail.

The ultimate functional characterization of cREs will involve a combination of loss-of-function assays and direct assays for activity. The former have been until recently technically challenging, but are becoming more commonplace with the advent of large-scale CRISPR/Cas9-mediated mutagenesis techniques\cite{Korkmaz2016,Fulco2016}. Nevertheless, most work in the field has been based on testing cREs for regulatory activity using an exogenous plasmid construct combining a cRE, a promoter and a reporter gene. Classically, such testing has been done by cloning individual cREs into plasmids (or other vectors) and then assaying the expression of the reporter gene (luciferase activity in cell lines or staining for LacZ activity in embryos\cite{Visel2007,Fisher2012}). Numerous developmental enhancers have been characterized following that approach\cite{Visel2007,Visel2009,May2011}, starting from lists of cREs complied based on comparisons of evolutionary conservation and/or biochemical signatures derived from ChIP-seq datasets. However, such studies have often focused only on the most outstanding biochemical signatures\cite{Visel2007}, thus obtaining very high success rates that are likely to be nonrepresentative with respect to the genome-wide population of cREs. 

High-throughput sequencing has enabled the development of assays that go beyond the testing of individual cREs, one by one; instead, very large numbers of sequences are analyzed in parallel, with the readout being based on sequencing DNA tags associated with the cRE or of the cRE itself. These are usually referred to as MPRAs\cite{Inoue2015}, and in the last few years a number of variations of the principle have been successfully applied to a multitude of biological problems and systems \cite{Patwardhan2009,Kinney2010,Kwasnieski2012,Melnikov2012,Patwardhan2012,STARRseq2013,FIREWACh2014}, including the question of testing cREs for activity within the context of the ENCODE Consortium\cite{Kheradpour2013,Kwasnieski2014,Ernst2016}. However, several issues complicate the interpretations of MPRA experiments.

First, the nature of MPRA designs is such that the elements tested are very short, in the 80--250bp neighborhood. This is significantly shorter than tens of thousands of blocks of conserved noncoding sequence that can be identified in the human genome by comparative genomic analysis (Supplementary Figure \ref{FigS1}). Thus to what extent complete REs are assayed and what the corresponding false negative rate is have always been an obvious concerns regarding MPRAs.

Second, given that it is very difficult to control the number of constructs going into each individual cell in transfection experiments, and that an MPRA features large numbers of different cREs being tested in the same time, there is significant potential for crosstalk between active and inactive cREs that end up in the same cell, resulting in numerous false positives.

While the latter concern can be alleviated to an extent through the use of genome-integrated MPRA constructs \cite{Maricque2016,Inoue2016}, the short length of constructs tested remains a significant issue, and one that might be behind the low positive rates reported by MPRAs in the past\cite{Vockley2016}. There is therefore a major gap in the field that needs to be filled by testing a large number of individual constructs of large size (500-1000 bp) using a traditional luciferase assay, and examining the performance of biochemical signature predictions based on the resulting data.

To this end, as part of the ENCODE Project Consortium's efforts towards functional validation of cREs, we tested the regulatory activity of hundreds of candidate enhancer elements (cEnhs) using constructs of such lengths  (Supplementary Figure \ref{FigS2}) in several diverse mammalian cell lines, including both mouse and human systems. These cEnhs were selected from a wide range of biochemical signature strengths (Supplementary Figure \ref{FigS3}), using both TF-centric selection criteria (identifying cEnhs based on ChIP-seq data for individual TFs) and machine learning ``TF-agnostic'' approaches (designed to find combinatorial signatures of enhancer elements from multiple epigenomic maps of histone modifications and DNAse hypersensitivity) for defining cREs.

\end{multicols}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig1V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Biochemical signatures and functional testing of candidate enhancer elements (cEnhs) in mammalian genomes}. (A) Biochemical signatures of cEnhs and promoters. Active enhancers are characterized by DNAse hypersensitivity due to nucleosome depletion, by p300 occupancy and by H3K27ac, as well H3K4me1 (not shown). Promoter elements share some of these features, but also associate with components of the transcription and transcription initiation machineries, and are marked by H3K4me3 (not shown); (B) Genome-wide commonalities and differences between the biochemical signatures of enhancers and promoters. Shown is the average signal profile around TSS distal (right; defined as regions more than 1kb away from an annotated TSS) and TSS proximal (left) cEnhs (defined as statistically significant peaks in the respective datasets; see the Methods section for further detail) in mouse and human cells for TFs (myogenin in differentiating mouse muscle cell, GATA1 in erythroid mouse cells, and the glucocorticoid receptor GR upon dexamethasone stimulation of human A549 cells), DNAse hypersensitivity and H3K27ac; (C) Different cell types share a small fraction of their distal cEnh elements, in contrast to promoter elements. Shown are the common and cell-type specific TSS proximal (within 1kb of an annotated TSS) and TSS distal DHSs between the human erythroid K562 and hepatocyte HepG2 immortalized cell lines; (D) The distribution of biochemical 
\centerline{}
\rightline{\textit{(legend continued on next page)}}
} 
\label{Fig1}
\end{figure*}
\clearpage
\let\thefootnote\relax\footnotetext{signal strength varies over a large continuum. Shown are the signal distribution for myogenin, p300, DNAse-seq, and H3K27ac relative to the summits of the top 500, middle 500 and bottom 500 reproducible myogenin ChIP-seq sites (total $n = 32,278$) in differentiated C2C12 muscle cells, as well as the distribution of the cognate myogenin TF binding motif. (E) Outline of cEnh selection approaches, biological systems, experimental design and functional assays used in this study. Sets of cEnhs for functional testing were compiled based on: TF ChIP-seq occupancy measurements (of the master regulators of muscle differentiation, MyoD and myogenin) in differentiating mouse C2C12 cells; phylogenetic conservation patterns and TF occupancy measurements (of the regulators of erythropoesis GATA1 and TAL1) in differentiating mouse G1E-ER4 cells; TF occupancy (multiple TFs) in immortalized K562 cells; machine learning methods (Self-Organizing Maps, chromHMM and Segway) defining integrated chromatin states over multiple histone modification, DNAse and TF occupancy measurements in K562 and HepG2 cells. These cEnhs were tested using luciferase assays. In addition, DNA fragments from GR ChIP-seq experiments in Dex-stimulated A549 cells were cloned and assayed for activity using the STARR-seq assay. Active elements identified using these methods were then evaluated for the presence and distribution of various biochemical signatures.}

\begin{multicols}{2}

We find that in general $\sim$50\% of both TF-selected and TF-agnostic cEnhs showed significant enhancer activity in transfection assays, observing similar proportions across all cell lines and conditions examined. We observe that the presence of TF recognition motifs in cEnhs displays no correlation with enhancer activity. Our results indicate that DNAse and H3K27ac are generally more predictive of enhancer activity than TF occupancy alone, and that simple biochemical signatures such as the combination of chromatin accessibility and H3K27ac are specifically predictive of enhancer activity compared to regions of the genome that lack them. However, the presence of these signatures is not a strong indicator of enhancer activity as nearly half of cEnhs exhibiting them are not significantly active in transfection assays. 

We also observe a positive correlation between biochemical signal strength and enhancer activity in transfection assays, with the highest fraction of cEnhs exhibiting significant enhancer activity being found among the most strongly occupied cEnhs. However, first, even among that latter group a large fraction of cEnhs displays no discernible enhancer 	activity, and second, because the distribution of biochemical signal strength is highly skewed towards weaker ChIP-seq and DNAse-seq peaks, the bulk of active enhancers in the genome are expected to be found among the larger population of cEnhs with modest biochemical signatures. Of note, enhancer activity as measured by transfection assays also exhibits a skewed pattern, with a smaller number of very highly active enhancers and a larger number of weaker ones. 

Finally, we corroborated these findings by applying the STARR-seq MPRA to assay the activity of thousands of genomic regions occupied by the glucocorticoid receptor (GR) in stimulated A549 cells. 

We expect that our findings will help guide efforts towards the comprehensive cataloging of functional elements in the human genome, and we discuss the implications of our findings in the context of models of gene regulation mediated by the action of distal enhancers.

\section*{Results}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-09-08-Gigio-V7edit/Figure2a-fixed.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional testing of cEnh regulatory activity in mammalian cells}. (A) Functional assay testing of cEnh regulatory activity in the context of muscle differentiation. Shown is luciferase assay fold activity in differentiated C2C12 myocytes across technical replicates ($n = 4$). The red arrow corresponds to the mean fold activity threshold above which elements are considered active. In addition, for each cEnh element, DNAse hypersensitivity, H3K27ac status, and myogenin occupancy are shown, both as RPM (Read Per Million) signal intensity values and as binary peak calls, as well as the number of myogenin motif (RRCAGSTG, derived from myogenin ChIP-seq data) occurrences. Tested cEnhs are sorted by mean fold activity. (B) Functional assay testing of cEnh regulatory activity in the context of erythropoesis. Shown is luciferase assay fold activity in K562 cells across biological ($n \in [1:9]$) and technical replicates ($n = 4$ for each biological replicate). The red arrow corresponds to the mean fold activity threshold above which elements are considered active. In addition, for each cEnh element, DNAse hypersensitivity, H3K27ac status, and GATA1/TAL1 occupancy are shown, both as RPM (Read Per Million) signal intensity values and as binary peak calls, as well as the number of TAL1 (CAGMTG) and GATA1 (WGATAA) motif occurrences. Tested cEnhs are sorted by mean fold activity.}
\label{Fig2}
\end{figure*}

\subsection*{Large-scale functional activity testing of full-length cREs}

One of the major goals of the ENCODE Project is to identify all functional regulatory elements controlling gene expression in the genome, to which end it has carried out mapping of DNAse hypersensitivity, dozens of histone modifications, numerous sequence-specific transcription factors, and RNA transcripts in hundreds of cell types, in both human and mouse \cite{ENCODE101,mouseENCODE2014,ENCODE2012,ENCODE3}. As active regulatory regions typically exhibit certain biochemical signatures, these datasets provide a compendium of candidate regulatory elements. Such regions can be defined using highly multidimensional inputs incorporating multiple biochemical measurements, but because considerable redundancy exists between individual biochemical marks, cREs can also be identified following simpler rules. For example, active enhancers and promoters are typically occupied by sequence-specific transcription factors, marked by H3K27ac and exhibit DNA hypersensitivity (Figure \ref{Fig1}B), and cEnhs can therefore be identified using TF occupancy, by the overlap of DNAse hypersensitive sites (DHS) and regions marked by H3K27ac, or by all three. As of the writing of this manuscript, based on DNAse-seq maps and histone marks profiles, % 1.31M
more than a million and % 0.43M
nearly half a million cREs in humans and mouse, respectively, have been identified by the ENCODE consortium\cite{ENCODE3}. 

However, there need not be a strict one-to-one relationship between these biochemical signatures and functional REs \cite{Kellis2014}, as their presence alone does not necessarily imply that the cRE plays an active regulatory role. In addition, biochemical marks on themselves do not provide direct understanding of how exactly functional REs are specified or exercise their function. An additional highly useful criterion for assessing functionality is evolutionary conservation of cREs between the genomes of distant species, as functional elements are usually subject to selective constraint at the sequence level. However, the absence of conservation on its own does not imply nonfunctionality, as recently evolved lineage-specific REs do not appear as conserved in comparative genomic analyses. Therefore cREs identified using functional genomics tools have to be directly tested for function and subsequently dissected in detail if they are to be comprehensively understood.

In recent years, multiple high-throughput approaches for measuring regulatory activity have been devised, relying on a sequencing readout of the effect of a given cRE or custom-designed DNA sequence on the expression of a reporter gene \cite{Patwardhan2009,Kinney2010,Kwasnieski2012,Melnikov2012,Patwardhan2012,STARRseq2013,FIREWACh2014,Kheradpour2013,Kwasnieski2014,Ernst2016}. On their own MPRAs are very powerful, however, they suffer from several shortcoming when it comes to testing cREs, in particular the short length (typically 80 to 250 bp) of the segments of DNA tested by most of them, which is likely to be significantly shorter than the size of functional regulatory elements in mammalian genomes. 

To assess how prominent this issue might be, we examined the distribution of the lengths of conserved noncoding segments (i.e. excluding sequences overlapping with or in the vicinity of annotated exons) in the human genome (Supplementary Figure \ref{FigS1}), and found that tens to hundreds of thousands (depending on the definition) of such blocks fall outside the range of testing of MPRAs. It is thus possible that MPRAs using short sequences exhibit substantial numbers of false negatives as they cannot assay the activity of complete, full-length REs. An additional concern with MPRAs is the possibility of cross-talk between different REs when multiple episomal constructs end up being transfected in the same cell, resulting in false positives. Alternative approaches towards testing the functionality of cREs are therefore needed. 

To address these issues, we carried out large-scale testing of cEnhs identified on the basis of biochemical signatures in a variety of mammalian systems using luciferase activity assays, which allow for much larger segments of DNA to be tested for enhancer activity. We aimed at comparing the predictivity of multiple approaches for identifying functional cEnhs, and at incorporating in our analysis a diversity of biological systems (Figure \ref{Fig1}E), including immortalized cell lines (the human K562 and HepG2 cell lines), model systems for major developmental transitions (myogenesis and erythropoesis), and model systems for cellular response to exogenous signaling stimuli (activation of the glucocorticoid receptor in the prostate cancer cell line A549).

Immortalized cell lines were targeted as they have been extensively studied by the ENCODE consortium\cite{ENCODE101,ENCODE2012} and they are the source of a significant portion of ENCODE data, while differentiation and external signaling stimulation represent the two main types of dynamic transition of cellular states associated with regulatory alterations of chromatin states: the slower and typically irreversible differentiation of one cell type into another, and the much faster and reversible cellular response to signaling molecules.

The process of myogenesis transforms undifferentiated precursor myoblast cells into differentiated myocyte muscle cells, and is primarily regulated by four key bHLH TFs known as Myogenic Regulatory Factors (MRFs) along with numerous cofactors, such as MEF2, E2A, HEB, Pbx1 and others\cite{Taylor1979,Davis1987,Hasty1993,Wright1989}. The epigenomic and occupancy landscape of these and other factors involved in the process is illustrated for reference in Supplementary Figure \ref{FigApp3}. These factors were profiled in the mouse C2C12 cell line\cite{Yaffe1977}, which has been for decades the main model system for studying myogenesis, a wealth of functional genomic data has been generated for it, and it was thus naturally also the focus of our study. The key specification MRF and the one expressed at high levels in myoblasts is MyoD, while myogenin is the most important differentiation TF and its expression is induced upon the onset of the process; the other two MRFs are Myf5 and Myf6. MyoD and myogenin occupy a highly overlappings set of sites (Supplementary Figure \ref{FigApp3}), the majority of which contain the classical muscle E-box sequence motif CAGSTG often in the extended RRCAGSTG form\cite{Moncaut2013,Tapscott2005}. While the majority of ChIP-seq peaks contain this motif, these peaks include only a tiny fraction of the occurrences of the motif in the genome, underscoring the highly selective nature of in vivo occupancy by MRF TFs.

Lineage commitment during the process of erythropoesis is accomplished through the so called GATA switch\cite{Kaneko2010}. The onset of terminal differentiation is marked by the replacement of the GATA2 transcription factor at thousands of occupied sites by GATA1, which then regulates the expression of genes involved in red blood cell development and functions, with SCL/TAL1 being an important cofactor of GATA1, often forming closely spaced heterodimers with it\cite{Han2015,Wu2014,Tripic2009,Yu2009}. GATA1 binds to a WGATAA consensus recognition motif, while TAL1 is a bHLH factor targeting a CAGMTG E-box\cite{Han2015}. Mouse G1E cells have served as the key model system for erythropoesis for many years \cite{Weiss1997}. These cells are derived from in vitro differentiated mouse embryonic stem cells in which the endogenous GATA1 gene has been knocked out; a subclone of them, termed G1E-ER4, expresses constitutively a GATA1-ER fusion that can be specifically activated by estradiol exposure, allowing for differentiation to be triggered rapidly and in a controlled manner \cite{Tsang1997,Rylski2003}. The epigenomic and occupancy landscape of the key TFs involved is illustrated for reference in Supplementary Figure \ref{FigApp5}, and it served as the basis for our cRE selections. 

The transcriptional response to glucocorticoids, on the other hand, is characterized by a more rapid kinetics of gene expression activation, and by its general reversibility. We used the response of A549 cells to activation of the GR transcription factor by the cortisol analog dexamethasone (Dex) as a model system for our study. Upon activation by Dex GR rapidly associates with thousands of sites along the genome, both directly through its cognate motif and indirectly through association with cofactors such as AP-1 \cite{Reddy2009,So2007,Gertz2013}, leading to changes in the expression of hundreds of genes. The epigenomic landscape of A549 cells during GR activation is illustrated for reference in Supplementary Figure \ref{FigApp6}.

\end{multicols}

\begin{FPfigure}
\begin{center}
\includegraphics[width=18.5cm]{2017-10-13-Fig3-revision/Fig3V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Summary of cEnh activity predictions by different selection criteria}. (A) TF occupancy-centered selections. Tested eEnhs selected on the basis of TF occupancy in the context of mouse muscle differentiation and erythropoesis and in human K562 cells were further subselected with the additional requirement of exhibiting DNAse hypersensitivity and the H3K27ac histone mark. The fractions of active constructs in negative controls and cEnhs are shown on the left. The expected number of active cEnhs genome-wide is extrapolated on the left based on the number of TF$^+$/DNAse$^+$/H3K27ac$^+$ regions in the genome; (B) TF-occupancy agnostic selections. Tested eEnhs selected using Self-Organizing Maps in HepG2 cells, chromHMM in K562 cells, and evolutionary conservation of GATA1 motifs in G1E cells were further subselected with the additional requirement of exhibiting DNAse hypersensitivity and the H3K27ac histone mark. The fractions of active constructs in negative controls and cEnhs are shown on the left. The expected number of active cEnhs genome-wide is extrapolated on the left based on the number of DNAse$^+$/H3K27ac$^+$ (for HepG2 SOM and K562 chromHMM selections) or DNAse$^+$/H3K27ac$^+$ regions with a conserved GATA1 motif (for GATA1 conservation selections) in the genome.} 
\label{Fig3}
\end{FPfigure}

\begin{multicols}{2}	

While the full catalog of REs in the genome includes promoters, insulators, enhancers, silencers, and others. for the purposes of this study we focused on candidate transcriptional enhancers. A major reason for this choice is that cEnhs constitute the bulk of cREs distinguishing different cell types from each other, in contrast to, for example, active promoters, a major fraction of which is shared (Figure \ref{Fig1}C, Supplementary Figures \ref{FigApp7}, \ref{Fig2S}, \ref{Fig5S} and \ref{Fig8}). 

We also aimed to represent the full spectrum of cEnh biochemical signatures (Figure \ref{Fig1}D and Supplementary Figure \ref{FigS3}), as multiple studies have shown that the landscape of transcription factor occupancy, DNAse hypersensitivity and histone modification maps includes many more weaker sites than very strong peaks \cite{Landt2012,Kellis2014}. 

We applied several different strategies for compiling lists of cEnhs to be tested, broadly divided into two categories: TF-centric, based on TF ChIP-seq datasets, and TF-agnostic, based primarily on chromatin state signatures and evolutionary conservation. 

In the context of muscle differentiation, we selected cEnh regions based on myogenin ChIP-seq data in differentiated C2C12 myocytes. We randomly selected a set of regions ($n=89$) spanning the full range of myogenin occupancy levels, of which 88 contain a CAGSTG and 84 contain the extended RRCASGTG E-box. We also selected additional cEnh elements (\hl{XX n = ?? XX}) associated with genes playing a well characterized role in the muscle development (\hl{XXX list genes XXX}); the inclusion of these cEnhs allowed us to test whether cEnhs nearby known functionally relevant genes exhibit a higher levels of functionality that the genome-wide average. We also selected a group of negative controls ($n=23$) out of a set of well characterized enhancer elements active in T cells and in neurons and not being occupied by myogenin in C2C12 cells; 21 of them contained E-box motifs. These negative controls were selected in order to test the specificity of biochemical signatures for prediction of functional activity in a given cell type. A second set of negative control elements ($n=11$), 6 of them containing E-box motifs, were selected from regions without biochemical marks but located near genes highly expressed in C2C12 cells and near ChIP-seq positive regions. These were selected in order to assess the baseline functional activity of biochemically neutral regions.

TF-centric erythroid cEnhs ($n=114$) and negative controls ($n=74$) were selected on the basis of ChIP data for GATA1 and TAL1 in G1E-ER4 cells (see the Methods section for further details). Of note, even though they are not significantly occupied by GATA1 or TAL1, 45 of the negative control elements contain a WGATAA motif and 40 contain a CAGMTG E-box. Functional testing of cEnhs identified in G1E cells was then carried out in human K562 cells, which represent a similar developmental state but have the advantage of being much more easily transfectable. 

\hl{XXX K562 TF selection XXX}

TF-agnostic enhancer selection was based on evolutionary conservation and on the integration of measurements of multiple histone modifications and open chromatin. 

A set of evolutionarily conserved erythropoetic cEnhs ($n=46$) were selected by requiring both strong overall sequence conservation as assessed by multiple genome alignments of a collection of mammalian genomes and the conserved presence of WGATAA motifs. 

Multiple computational approaches for integrating high-dimensional collections of functional genomic datasets into a small set of chromatin states have been devised over the last few years and applied to the problem in ENCODE cell lines, including the Hidden Markov Model-based Segway\cite{Hoffman2012} and chromHMM\cite{Ernst2012}, as well as Self-Organizing Maps\cite{Mortazavi2013} (SOM). We selected cEnhs in K562 cells based on Segway and chromHMM chromatin state assignments and the presence of DNAse hypersensitivity and H3K27ac ($n = 30$), with elements lacking both marks used as negative controls ($n = 21$). We also selected cEnhs based on SOMs trained on DNAse-seq and histone mark ChIP-seq data over multiple ENCODE cell types; these cEnh elements were picked so that they were specifically in an open chromatin state and marked by histone modifications associated with enhancer activity in HepG2 cells ($n=32$). Elements lacking both marks ($n=18$) were used as negative controls. 

The collection of cEnh elements that we tested includes primarily long fragments, between 500 and 1000 bp (Supplementary Figure \ref{FigS2}), thus most likely encompassing complete functional regulatory elements provided that such are indeed present within the cEnh region. 

We tested these elements for functional activity using luciferase reporter assays \hl{XXX some details but not too many on how exactly this was done, refer to Methods for the rest XXX}. 

In addition to luciferase assays, we also incorporate ChIP-STARR-seq data from untreated and Dex-stimulated A549 cells \cite{Vockley2016}. The ChIP-STARR-seq libraries were generated by cloning GR ChIP-seq DNA fragments into a STARR-seq vector, then transfected into A549 cells, and RNA was sequenced from untreated and Dex-treated cells.

We subsequently compared measured activity with cEnh predictions based on various biochemical signatures to examine their predictivity of functional cEnhs.

\subsection*{Regulatory activity of TF-centric and TF-agnostic cEnh selections}

Figure \ref{Fig2} shows the measured enhancer activities for TF-centric cEnh selections and negative controls in myogenesis (Figure \ref{Fig2}A, and Supplementary Figure \ref{Fig3S}) and erythropoesis (Figure \ref{Fig2}B). We found \hl{XXX} out of \hl{XXX}, or \hl{XX}\% of muscle cEnhs to pass the threshold of activity in contrast to \hl{XXX} out of \hl{XXX}, or \hl{XXX}\% of muscle negative controls. Similarly, \hl{XXX} out of \hl{XXX}, or \hl{XX}\% erythropoetic cEnhs and \hl{XXX} out of \hl{XXX}, or \hl{XX}\% of negative controls, were found to be active. 

In K562 and HepG2 cells, we found \hl{XXX} out of \hl{XXX}, or \hl{XX}\%, and \hl{XXX} out of \hl{XXX}, or \hl{XX}\% of TF-centric cEnhs to be active, respectively, in contrast to \hl{XXX} out of \hl{XXX} (\hl{XX}\%) and \hl{XXX} out of \hl{XXX} (\hl{XX}\%) in the corresponding sets of negative controls (Supplementary Figure \ref{eLifeFig9-K562_HepG2-funcassay}). 

We then examined the subsets of cEnhs in each system bearing the simultaneous biochemical signature of TF ChIP-seq occupancy, H3K27ac demarcation and DNAse hypersensitivity, which would represent the most likely to be functional cEnhs. We observed 48.2\% active such cEnhs in myogenic cEnhs, 50\% among erythroid ones, and 49.5\% in the set of cEnhs selected from human immortalized cell lines (Figure \ref{Fig3}A). 

The same analysis for TF-agnostic cEnh selections showed that \hl{XXX} out of \hl{XXX} (\hl{XX}\%) of SOM picks were active in HepG2 cells, \hl{XXX} out of \hl{XXX} (\hl{XX}\%) of chromHMM/Segway picks were active in K562 cells \hl{SUPP FIGURE SIMILAR TO FIG.2 FOR TF-agnostic picks}, and \hl{XXX} out of \hl{XXX} (\hl{XX}\%) of GATA1 conservation selections exhibited significant activity \hl{SUPP FIGURE SIMILAR TO FIG.2 FOR TF-agnostic picks}. Restricting our analysis to cEnhs positive for H3K27Ac and DNAse hypersensitivity, we observed 56\% active cEnhs among HepG2 SOM picks, 46.6\% among K562 chromHMM/Segway picks, and 46.1\% for GATA1 conservation selections. We note that HepG2 SOM selections were biased towards more strongly H3K27ac/DNAse positive regions (Supplementary Figure \ref{FigS3}), which might explain the higher levels of observed activity within that set. 

Overall we find similar levels of activity in both the TF-centric and TF-agnostic sets of cEnhs predictions, around 50\%, \hl{XX calculate proportions p-values once numbers are final XXX}. Using the total number of cEnhs in each cell type and the observed proportions of cEnhs active in functional assays, we estimate that there are \hl{XXX} active myogenin$^+$/H3K27ac$^+$/DNAse$^+$ enhancers in C2C12 cells, \hl{XXX} GATA1$^+$/H3K27ac$^+$/DNAse$^+$ ones in erythroid cells, \hl{XXX} TF$^+$/H3K27ac$^+$/DNAse$^+$ ones in K562 cells. Self Organizing Maps combined with functional testing predict \hl{XXX} active enhancers in K562 cells while chromHMM/Segway predict \hl{XXX} ones. The combination of GATA1 conservation and the H3K27ac/DNAse biochemical signature predicts \hl{XXX} active enhancers in erythroid cells.

We also note that in all functional tests we carried out using luciferase assays, we find a skewed distribution of activity, similar what is observed for biochemical signal strength in ChIP-seq and other functional genomic experiments. A small number of cEnhs appear to be highly active, while the majority of even active cEnhs exhibits only modest activity (Figure \ref{Fig2}, Supplementary Figure \ref{Fig3S} and Supplementary Figure \ref{eLifeFig9-K562_HepG2-funcassay}). 

\subsection*{The bulk of enhancers in a given cell type are marked by modest biochemical signatures}

We next examined the relationship between biochemical signature strength and functional activity. Figure \ref{Fig4}A shows the myogenin occupancy landscape in the neighborhood of the mouse \textit{Myog} and \textit{Mybph} genes (both of these are well known muscle genes). A range of biochemical signal strengths is observed in cEnhs around these loci, from low to high, and this is typical for all biochemical marks. Figure \ref{Fig4}B shows the genome-wide distribution of biochemical strength for cEnhs defined by myogenin occupancy in C2C12 cells; the bulk of cEnhs are characterized by low-level occupancy by myogenin. 

In order to asses the relationship between occupancy strength and functional activity, we split the cEnhs we tested in C2C12s into four bins (``low'', ``medium'', ``high'', and ``top'') according to the level of myogenin ChIP-seq signal observed (Figure \ref{Fig4}C). We find that the fraction of active cEnhs increases steadily with the strength of myogenin signal, with only $\sim$20\% of low-myogenin cEnhs exhibiting significant activity in contrast to $\sim$55\% of the most strongly occupied ones. These observations superficially imply a close relationship between biochemical signal strength and functional activity, however, we note that even in the latter group a large fraction (nearly half) of cEnhs is inactive when directly functionally tested. 

We then asked how many active enhancers genome-wide are likely to be found among each portion of the signal strength distribution. Figure \ref{Fig4}D shows the extrapolated numbers of active enhancers in each bin of myogenin occupancy strength. While the strongest myogenin sites are mostly likely to be functionally active, the much greater numbers of weaker sites mean that the 75\% of active enhancers in muscle cells are expected to be found among the sites belonging to the ``low'' and ``medium'' bins. Similar observations apply to other biochemical marks (Supplementary Figure \ref{Fig12}).

We corroborated these finding by examining our A549 GR ChIP-STARR-seq data. Comparing STARR-seq reads to their input DNA libraries, and only including cEnhs with sufficiently deep representation in sequencing libraries (see the Methods section for details), we identified $\sim$15\% of GR cEngs to be significantly active in Dex-stimulated A549 cells and $\sim$10\% to be active in untreated cells (Supplementary Figure \ref{Fig12}A). This fraction is considerably lower than what is observed for cEnhs with luciferase assays, an observation that is explained by a combination of the generally lower sensitivity of MPRAs and the shorter fragments being represented in STARR-seq libraries, which likely do not capture complete regulatory elements (although we note that we also carried out an activity analysis at the level of individual DNA fragments and did not observe longer DNA fragments to be preferentially active compared to shorter ones; Supplementary Figure \ref{Fig12}C). We also note that, similar to luciferase functional assays, the majority of active enhancers in ChIP-STARR-seq datasets exhibit moderate levels of activity, with only a small minority of very highly active functional enhancers (Supplementary Figure \ref{FigS14}).

The distribution of GR ChIP-seq signal strength in cEnhs tested by ChIP-STARR-seq is not as skewed in favor of low-occupancy sites as it is in other contexts (Figure \ref{Fig4}E), which is due to the fact that representation in the ChIP-STARR-seq input libraries is biased towards stronger sites and that we excluded cEnhs with insufficient number of reads to evaluate their activity. Nevertheless, we do see more weaker sites than stronger ones, and we also observe much higher levels of activity ($\sim$40\%) in the ``top'' bin (Figure \ref{Fig4}F) than within the ``low'' and ``medium'' ones ($\sim$10\%). 

Thus we can conclude that even though the most visible biochemical signatures are most likely to correspond to active regulatory elements, in most biological contexts most functional enhancers in fact reside among the population of cEnhs characterized by only modest biochemical signatures, underscoring the complexity of the task of identifying active cEnhs from biochemical measurements alone.

\subsection*{Biochemical signature strength is not strongly correlated with enhancer strength}

Finally, we examined the quantitative correlation between biochemical signal and functional activity. Figure \ref{Fig5} shows the distribution of active and inactive muscle (Figure \ref{Fig5}A) and erythroid (Figure \ref{Fig5}B) cEnhs relative to the spectrum of DNAse and H3K27ac signal genome-wide. While highly active enhancers are more often found among the most strongly H3K27ac$^+$/DNAse$^+$ regions in muscle cells, overall there is only modest correlation between biochemical marking and functional activity, and it is even less apparent in the erythroid context. We calculated the correlation between TF occupancy and biochemical marks on one hand, and functional activity on the other and found only a small positive quantitative correlation (Pearson $r^2 \leq 0.10$; Spearman rank correlation $r \leq 0.40$) between each of these signatures and enhancer activity (Figure \ref{Fig5}C), observations that also hold in the other contexts we examined (Supplementary Figures \ref{FigS4}A-B, \ref{FigS7}A-B, \ref{FigS10}A-B, and \ref{FigApp8}B). 

We also evaluated the predictivity of biochemical signatures using a receiver operating characteristic curve analysis (Supplementary Figures \ref{FigS4}C-F, \ref{FigS7}C-D, and \ref{FigS10}C-F). With the exception of erythroid cEnhs, where the combination of GATA1 and TAL1 was most predictive of functional activity, we find that DNAse and H3K27ac are most often the best predictors of functional enhancers. Their predictivity, however, is not incredibly strong, with AUROC values only exceeding 0.8 in K562 and HepG2 cells. 

Overall, the combination of H3K27ac and DNAse hypersensitivity appears to be as reliable a predictor of functional activity as any other biochemical signature, however, even it is in no way absolutely predictive of function, with only approximately half of H3K27ac$^+$/DNAse$^+$ actually exhibiting significant enhancer activity. 

% \subsection*{Temporal differences between biochemical mark profiles and their impact for cEnh prediction}

\section*{Discussion}

In this study, we have provided a comprehensive examination of cEnh activity in multiple mammalian systems and its relationship to biochemical signatures commonly used to select cEnh elements. Across cell types and methods for cEnh selection, approximately 50\% of cEnhs simultaneously exhibiting significant H3K27ac marking and DNase hypersensitivity appear to function as active enhancers. We also demonstrated that enhancer assays activity is specific to genomic regions that are distinguished by characteristic biochemical signatures. By studying cEnhs sampled across the full spectrum of ChIP occupancy for multiple transcription factors, we have demonstrated that the most strongly biochemically marked cEnhs are highly enriched for functionality. 

However, first, active functional enhancers are also present throughout the whole biochemical signal spectrum, and because of the very large number of the latter, the bulk of active enhancers in any given cell type in fact resides in the population of cEnhs with modest biochemical signatures, and second, we do not observe a particularly strong correlation between the magnitude of enhancer activity in functional assays and strength of biochemical marks as measured using functional genomic assays. 

These findings are in contrast to earlier studies, which reported over 80-90\% activity for cEnhs defined using, for example p300 ChIP-seq\cite{Visel2009}. This is most likely due to the fact the these studies only focused on elements selected among the most strongly enriched and likely to be functional cEnhs rather than the full spectrum of ChIP-seq signal.

\end{multicols}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig4.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Enrichment of active cEnhs in different classes of cEnhs defined by the strength of their biochemical signatures}. (A) cEnhs (rectangle boxes) belonging to different signal classes (based on ChIP-seq data for myogenin in C2C12 myocytes; ``top'': RPM $\geq 10$; ``high'': RPM$\in [5,10]$; ``medium'': RPM$\in [2.5,5]$; ``low'' RPM$\leq 2.5$) in the neighborhood of the mouse \textit{Myog} gene; (B) Genome-wide distribution of cEnhs in different signal classes based on ChIP-seq data for myogenin in C2C12 myocytes; (C) Fraction of active enhancers in different cEnh signal classes (based on ChIP-seq data for myogenin in C2C12 myocytes; ``top'': $n = 66$; ``high'': $n = 49$; ``medium'': $n = 45$; ``low'' $n=27$) as well as in negative controls (with no myogenin occupancy; $n=34$). Only cEnhs positive for myogenin, DNAse and H3K27ac were included; (D) Extrapolated numbers of active enhancers in C2C12 belonging to each signal strength class
\centerline{}
\rightline{\textit{(legend continued on next page)}}
} 
\label{Fig4}
\end{figure*}
\clearpage
\let\thefootnote\relax\footnotetext{ based on the genome-wide numbers of myogenin$^+$/DNAse$^+$/H3K27ac$^+$ regions. (E) Genome-wide distribution of cEnhs in different signal classes based on the set of GR ChIP-STARR-seq cEnhs in A549 cells (``top'': A549 Dex GR ChIP-seq RPM $\geq 10$; ``high'': RPM$\in [5,10]$; ``medium'': RPM$\in [2.5,5]$; ``low'' RPM$\leq 2.5$). Only GR ChIP-seq regions significantly represented within STARR-seq libraries (i.e. with sufficiently many reads to score as active if they were in fact active) are shown for each signal class. (F) Fraction of cEnhs exhibiting significant activity in the GR ChIP-STARR-Seq assay in stimulated A549 cells for each signal strength class.}

\begin{multicols}{2}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/Fig5.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Absence of general strong correlation between biochemical signal strength and enhancer activity of cEnhs}. (A) Distribution of tested cEnhs relative to the genome-wide DNAse and H3K27Ac signal distribution in C2C12 myocytes. Shown are DNAse and H3K27ac RPM values for all DNAse$^+$/H3K27ac$^+$ regions as well as for cEns tested for activity in C2C12 myocytes (outlined circles) and for occupancy negative control (outlined squares), with tested cEnhs separated into four classes based on their measured enhancer activity, from dark red (most active) to yellow (inactive). (B) Distribution of tested cEnhs relative to the genome-wide DNAse and H3K27Ac signal distribution in G1E-ER4 cells. Shown are DNAse and H3K27ac RPM values for all DNAse$^+$/H3K27ac$^+$ regions as well as for cEnhs tested for activity (outlined circles) and for occupancy-negative controls (outlined squares), with tested cEnhs separated into four classes based on their measured enhancer activity, from dark red (most active) to yellow (inactive). (C) Correlation between biochemical signals and measured enhancer activity in C2C12 and G1E cells. See also Supplementary Figures \ref{FigS4}, \ref{FigS7}, and \ref{FigS10} for more details.} 
\label{Fig5}
\end{figure*}

We find a smaller fraction (15-25\%) of active cEnhs using a high-throughput ChIP-STARR-seq MPRA, but similar qualitative patterns across the spectrum of biochemical signatures defining cEnhs. The reasons for the lower activity rates returned by MPRAs are manifold, and include (but are likely not limited to) the fact that the DNA fragments used as input to the MPRA are shorter than the length of fully functional regions, and that ChIP-STARR-seq libraries do not provide deep and complex representation of the original pools of ChIP-seq fragments, leaving many modestly active enhancers with insufficiently many reads to cross the thresholds of statistical significance; both of these factors are expected to lead to high false negative rates.

% Finally, we observe that biochemical marks can be decoupled from each other temporally, which can impact cEnh predictions based on their cooccurrence. For example, during muscle differentiation thousands of TF occupancy sites also exhibit DNAse hypersensitivity and p300 localization but are not yet robustly acetylated at H3K27, and conversely, in differentiated cells H3K27ac can remain for some time associated with sites previously TF-occupied DNAse hypersensitive sites even though they are no longer open.

% A possible explanation for this result can be gleaned from an enhancer located proximal to the muscle creatine kinase gene. This enhancer, studied over the last three decades for its outstanding function, displays only a modest myogenin ChIP signal that does not correlate with the $\geq95\%$ physical occupancy measured invivo on the chromosome \cite{Wold1989}. 

% We did not observe a particularly strong correlation between the magnitude of enhancer activity in functional assays and strength of biochemical marks, as expected if the decoupling of the biochemical measurements from the true physical occupancy is a genome-wide phenomenon.

% These findings are in contrast to earlier studies, which reported over 80-90\% activity for cEnhs defined using, for example p300 ChIP-seq\cite{Visel2009}. This is most likely due to the fact the these studies only focused on elements selected among the most strongly enriched and likely to be functional cEnhs rather than the full spectrum of ChIP-seq signal.

% We find a smaller fraction (15-25\%) of active cEnhs using a high-throughput ChIP-STARR-seq MPRA, but similar qualitative patterns across the spectrum of biochemical signatures defining cEnhs. The reasons for the lower activity rates returned by MPRAs are manifold, and include (but are likely not limited to) the fact that the DNA fragments used as input to the MPRA are shorter than the length of fully functional regions, and that ChIP-STARR-seq libraries do not provide deep and complex representation of the original pools of ChIP-seq fragments, leaving many modestly active enhancers with insufficiently many reads to cross the thresholds of statistical significance; both of these factors are expected to lead to high false negative rate.

% We observe that biochemical marks can be decoupled from each other temporally, which can impact cEnh predictions based on their co-occurrence. For example, during muscle differentiation thousands of TF occupancy sites also exhibit DNAse hypersensitivity and p300 localization but are not yet robustly acetylated at H3K27, and conversely, in differentiated cells H3K27ac can remain for some time associated with sites previously TF-occupied DNAse hypersensitive sites even though they are no longer open.

% A possible mechanism of action for these orphaned H3K27Ac sites is demonstrated by GR stimulated A549 cells; where a significant subset of biologically active enhancers are from GR occupancy at these formerly orphaned H3K27ac locations from unstimulated A549 cells. These orphaned sites may thus function as a guide for different responses to external stimuli; or lack thereof in different tissues (Supplemental figure \ref{FigS21}).

% It is unclear if the many elements found inactive in this study, including some which have been shown to be physically connected to a gene, may not function as individuals, or they may require a specific enhancer-enhancer or promoter-enhancer pairing indicated by the physical interaction maps \cite{Zabidi2015}.

% \textbf{YYY This may be a good place to include a BTG2-MYBPH-MYOGENIN summary figure showing that while enhancers are present next to strong genes; others are being kept silent? It is a point worth making, and that locus illustrates it well YYY}

% Some of these elements found to be inactive may only able to function in specific cell contexts. For example, the formation of muscle across different compartments of the embryo commonly uses myogenin, but involves both different temporal domains and the exposure to different cofactors.  This would impart for an additional level of control to activate only a subset of the programming. 

% In order for ENCODE to fulfill its self imposed mission of defining the activity of the components regulating transcription, we will likely be required to test multiple enhancer-promoter; enhancer-enhancer in more than the limited number of tissue types currently being presented. This poses a difficult challenge, that will likely take decades to solve.

Promoter-enhancer specificity \cite{Zabidi2015}

integration in genome \cite{Maricque2016,Inoue2016,FIREWACh2014}

\section*{Methods}

Except where otherwise stated, all analyses were performed using custom-written python scripts. The v19 and vM4 versions of the GENCODE \cite{GENCODE} annotations for human and mouse, respectively, were used for all analyses.

\subsection*{Cell culture}

\subsubsection*{C2C12 cells}

C2C12 myoblasts were maintained and seeded for transfection in 20$\%$ FBS supplemented DMEM medium. Upon reaching $>$80$\%$ confluency, the cells were were differentiated using 2$\%$ horse serum and 1 $\mu$M insulin in DMEM medium.

\subsubsection*{G1E cells}

G1E cells were grown according to previously published protocols\cite{Wang2006,Cheng2008,Dogan2015}.

\subsubsection*{K562, HepG2 and A549 cells}

K562; HepG2 and A549 cells were grown according to the approved ENCODE cell culture protocols publicly available through the ENCODE portal (\burl{https://www.encodeproject.org/}).

\subsection*{Functional assays}

\subsection*{Cloning and DNA purification}

The specifics of each selection set, the promoters used for each cell line, and other details are publicly available through the ENCODE portal (\burl{https://www.encodeproject.org/}).

\subsubsection*{Functional assay testing of cEnhs in C2C12 cells}

Candidate REs and negative control regions were either PCR-amplified from female BALB/C purified mouse genomic DNA (Switchgear Genomics) or synthesized de novo (Genscript). The resulting DNA was cloned into a reporter vector 5$^\prime$ of a custom TK promoter (SwitchGear Genomics) driving a high-turnover sequence-optimized luciferase reporter gene. Plasmids were purified using Miniprep kits (Qiagen) and standardized to 30 ng/$\mu$L using the Qubit\textsuperscript{\textregistered} dsDNA HS (High Sensitivity) Assay Kit.

For the purpose of testing elements in the myoblast state, undifferentiated C2C12 cells were seeded in 96-well delta surface plates (NUNC) in quadruplicates 12 hours before transfection at a concentration of 2500 cells/well. For the purpose of testing elements in the myocyte state, undifferentiated C2C12 cells were seeded at a density of 3500 cells/well. Transfections were carried out with 50 ng of DNA per construct in each replicate using Lipofectamine LTX, after a 5 minute incubation with a 1:16 dilution with the PLUS reagent (Thermo Fisher). Myoblast plates were lysed using a Steady-Glo\textsuperscript{\textregistered} kit, and luminescence was measured on a plate luminometer 24 hours post-transfection. Myocyte plates had their media exchanged with differentiation 12-16 hours post transfection and measured following the same procedure 24 hours later.

Aside from the plate reading step, the entirety of the transfection process was automated and carried out on a Tecan Freedom EVO 200 robot.

\subsubsection*{Functional assay testing of cEnhs in K562 and HepG2 cells}

The set of K562 and HepG2 cEnh regions was PCR-amplified and cloned 5$^\prime$ of the promoter of enhancer assay plasmids containing luciferase and renilla reporter genes; cloning was performed by SwitchGear Genomics. Each construct was quantified (using Qubit) and standardized to 30ng/$\mu$L before use in transfection assays. \hl{XXX details of transfection XXX}. \hl{Chris Partridge please review this}

\subsubsection*{Functional assay testing of erythropoetic cEnhs}

G1E candidate enhancer regions were tested in K562 cells according to protocols publicly available through the ENCODE portal \burl{https://www.encodeproject.org/}.

\subsection*{Functional Assay Data processing}

For each cEnh or negative control measurement, the ratio between its value and the corresponding basal promoter vector (relative assay activity) was calculated. Active cREs were discriminated from inactive using a $z$-score analysis, comparing the population of test element technical replicate values to the set of negative controls. \hl{XXX this could be stated more explicitly with the actual formulas XX}. 

\subsection*{ChIP-seq experiments}

Chromatin immunoprecipitation in A549 cells was performed as previously described (Reddy et al. 2009) using 2 $\times$ 10$^7$ A549 cells per replicate. Cells were sonicated using a Bioruptor XL (Diagenode) on the high setting until the resulting chromatin was fragmented to a median fragment size of 250 nt as assayed by agarose gel electrophoresis. GR ChIP was performed using 5 $\mu$g of a rabbit polyclonal $\alpha$-GR antibody (Santa Cruz Biotechnology sc-1003), and 200 $\mu$l of magnetic sheep anti-rabbit beads (Life Technologies M-280). H3K27ac ChIP was performed using \hl{XXX Ab source XXX}. After reversal of formaldehyde crosslinks at $65\,^{\circ}\mathrm{C}$ overnight, DNA was purified using MinElute DNA purification columns (QIAGEN). Illumina sequencing libraries were then generated using the Apollo 324 liquid handling platform according to manufacturer's specifications (Wafergen). 

ChIP-seq in C2C12 cells was performed using chromatin from 2 $\times$ 10$^7$ nuclei, which was fragmented using a Misonix probe tip sonicator and subjected to immunoprecipitation using a robotic ChIP pipeline described before. The resulting purified DNA was then converted into sequencing libraries and sequenced on an HiSeq 2500 (Illumina) as described previously\cite{Gasper2016}. The following antibodies were used: $\alpha$-myogenin (Santa Cruz Biotechnology SC-12732, lot K2311), $\alpha$-MyoD (Santa Cruz Biotechnology SC-32758, lot J3115), $\alpha$-MEF2 (Santa Cruz Biotechnology SC-17785, lot H1913), $\alpha$-p300 (Santa Cruz Biotechnology SC-585, lot H3115), $\alpha$-E2A (Santa Cruz Biotechnology SC-349X, lot B1207), $\alpha$-H2B (Santa Cruz Biotechnology SC-357, F2305), and $\alpha$-H3K27ac (Active Motif 39133, lot 34849). 

In addition, publicly available\cite{Pbx1} Pbx1 ChIP-seq and Control datasets were downloaded from GEO accession GSE76010. 

For G1E, K562 and HepG2 cells, previously publicly available\cite{ENCODE101,ENCODE2012,mouseENCODE2014} ChIP-seq datasets were downloaded from the ENCODE portal \burl{https://www.encodeproject.org/}. 

\subsection*{DNAse-seq experiments}

In C2C12 cells, DNAse-seq was carried out as follows: \hl{XXXXX DETAILS XXX}

In A549 cells, DNAse-seq was carried out as follows: \hl{XXXXX DETAILS XXX}. 

For G1E, K562 and HepG2 cells, previously publicly available\cite{ENCODE101,ENCODE2012,mouseENCODE2014} DNAse-seq datasets were downloaded from the ENCODE portal \burl{https://www.encodeproject.org/}. 

% \subsection*{RNA-seq experiments}

% \hl{XXX C2C12 RiboMinus XXX}

\subsection*{STARR-seq experiments}

The STARR-seq experiments previously published by Vockley et al.\cite{Vockley2016} were used in this study. 

\subsection*{Genomic coordinate conversion}

The regions to be tested using functional assays were designed based on the \verb|mm8| and \verb|mm9| verions of the mouse genome and the \verb|hg19| version of the human genomes. Conversion of the original coordinates to \verb|mm10| and \verb|hg20| coordinates was performed using the \verb|liftOver| tool from the UCSC Genome Browser Utilities\cite{UCSCGB}.

\subsection*{Conservation analysis}

Sequence conservation analysis were carried out using the \verb|phastCons60way| and \verb|phastCons100way| conservation tracks, which were downloaded from the UCSC Genome Browser\cite{UCSCGB}. 

\subsection*{ChIP-seq data processing and analysis}

ChIP-seq reads were trimmed down to 36 bp in length and mapped against the \verb|hg20| (for human samples; the male or female version depending on the sex of the cell line the sample originated from) and \verb|mm10| (for mouse samples) using Bowtie \cite{Bowtie} (version 1.0.1) with the following settings: \verb|-v 2| \verb|-k 2| \verb|-m 1| \verb|--best| \verb|--strata|. DNAse-seq reads were processed similarly except that they were trimmed down to 20bp for A549 samples and 36bp for C2C12 cells (due to differences in the experimental protocol used to generate the data).

Peak calling was carried out as follows. For DNAse and H3K27ac datasets, MACS2 \cite{Feng2012} (version 2.1.0) was run on individual replicates and on pseudoreplicates (generated by randomly splitting the pooled set of reads for both replicates into two) with relaxed settings (\verb|--to-large| \verb|-p 1e-1|). The top 100,000 peaks from each replicate or pseudoreplicate (ranked by $q$-value) were then used as input into IDR \cite{Li2011}. The number of peaks above a given IDR threshold called as reproducible between true replicates ($N_t$) and between pseudoreplicates  ($N_p$) were recorded. Peak calling was then carried out on the pooled set of reads and the top $max(N_t,N_p)$ peaks were chosen as the final set of reproducible peaks. For point-source \cite{Pepke2009} datasets (transcription factors), peak calling was carried out following the same procedure but using SPP \cite{Kharchenko2008} (version 1.10.1), using the top 300,000 peaks as input to IDR.

The pooled sets of reads were also used to calculate RPM (reads per million) enrichment values over elements tested in functional assays.

% \subsection*{RNA-seq data processing}

% ENCODE paired-end human RNA-seq reads were trimmed down to 2 $\times$ 75mers. C2C12 RiboMinus reads were aligned as 1 $\times$ 50mers. Reads were initially aligned against human or mouse ribosomal RNA sequence, and the unmapped reads were then aligned against the \verb|hg20| and \verb|mm10| versions of the human genome using TopHat\cite{TopHat2} (version 2.0.8). 

\subsection*{STARR-seq data processing and analysis}

STARR-seq and STARR-seq control/input reads ($2 \times $25mers) were mapped as paired ends to the \verb|hg20| version of the human genome using Bowtie with the same settings as described above. Post-IDR peaks obtained from GR ChIP-seq were used as the list of candidate cEnhs to be scored using the STARR-seq data. For each STARR-seq and STARR-seq control/input replicate, raw fragment counts were obtained from every GR ChIP-seq peak; in addition, the rest of the genome (i.e. the regions that fall between the post-IDR GR ChIP-seq peaks) was split into bins of at most 50 kb length, and read counts were calculated for all such regions. The fragment counts for GR ChIP-seq peaks and for the intervening regions were combined together and used as input to DESeq2\cite{DESeq2} for estimating differentially represented regions between STARR-seq and control/input libraries (at FDR-adjusted $p \leq 0.05$). The lowest average fragment counts value which was scored as significantly significant by DESeq2 was identified for each comparison, and all GR ChIP-seq regions with average fragment counts lower than this value were excluded from subsequent analysis, as such regions were not sufficiently represented in the available sequencing data to be reliably scored as active or inactive. We also carried out a fragment-level analysis, in which read counts were calculated for each individual sequencing fragment (defined as the pair of positions $\{i,j\}$, where $i$ and $j$ are respectively the $5^{\prime}$ and $3^{\prime}$ ends of the first and the second sequencing reads in a pair), using the same DESeq2 framework.

\section*{Acknowledgments}

Library generation and high-throughput sequencing for C2C12 ChIP-seq samples was performed by Igor Antoshechkin at the Millard and Muriel Jacobs Genetics and Genomics Laboratory. The authors would also like to thank Diane Trout and Henry Amrhein for technical assistance with maintaining the computational infrastructure used to carry out this study.

This material is based upon work supported by the National Science Foundation under Grant No. CNS-0521433.

\begin{thebibliography}{100}

\input{references}

\end{thebibliography}

\end{multicols}

\clearpage

\setcounter{table}{0}
\renewcommand{\tablename}{Supplementary Table}
\setcounter{figure}{0}
\renewcommand{\figurename}{Supplementary Figure}

\setcounter{page}{1}
\renewcommand\thepage{{SM }\arabic{page}}

\section*{Supplementary Materials}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{eLifeFigApp1-CNE-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf The length of thousands of conserved noncoding elements in mammalian genomes greatly exceeds the size range of MPRA constructs}. (A) The length distribution of conserved noncoding regions in the human genome. The \texttt{phastCons100way} conservation track for the \texttt{hg20} version of the human genome was downloaded from the UCSC Genome Browser. Blocks of conservation, in which all nucleotides have \texttt{phastCons} scores higher than the indicated minimum (\texttt{phCons}), were identified, and then merged into larger regions if the length of the gaps between them was smaller than the indicated \texttt{maxGap} parameter. The distribution of the lengths of the resulting sets of regions was plotted. This approach captures the properties of enhancer elements observed in the genome, which often consist of multiple blocks of highly conserved sequences separated by gaps of less conserved sequences, resulting in an enhancer element of up to a few hundred base pairs in length or more. (B) Such an example is shown for the \textit{Acta1} gene in mouse.} 
\label{FigS1}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{eLifeFigApp2-contruct-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Length distribution of functional assays constructs used to test cREs in this study}. (A) Distribution of functional assay construct lengths tested in this study in C2C12 cells. (B) Distribution of functional assay construct lengths tested in this study in G1E cells. (C) Distribution of functional assay construct lengths tested in this study in K562 and HepG2 cells } 
\label{FigS2}
\end{figure*}

\begin{figure*}
\begin{center}
\includegraphics[width=18cm]{2017-08-31-Figs/FigS3.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Distribution of biochemical signal in tested cEnhs and genome-wide}. Shown is the distribution of ChIP-seq or DNAse-seq RPM values for the set of cEnhs tested and for the genome-wide set of cEnh with similar biochemical signatures shown in Figure \ref{Fig3}.} 
\label{FigS3}
\end{figure*}


\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{eLifeFigApp7-C2C12-G1E-HepG2-K562-overlaps.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Differential marking of proximal and distal cREs by DNAse and H3K27ac between different cell types and cell states}. (A) Promoter-proximal (within $\leq$1 kb of an annotated TSS) sites in K562 and HepG2 cells; (A) Distal ($\geq$1 kb from an annotated TSS) sites in K562 and HepG2 cells; (C) Promoter-proximal (within $\leq$1 kb of an annotated TSS) sites in differentiated and undifferentiated C2C12 and G1E cells; (D) Distal ($\geq$1 kb from an annotated TSS) sites in differentiated and undifferentiated C2C12 and G1E cells. The overlap score ($O_{xy}$) shown in each cell $(x,y)$ indicates the fraction of peaks in the dataset on the $y$-axis that are also found in the dataset on the $x$-axis, i.e. $O_{xy} = |X \cap Y|/|Y|$.} 
\label{FigApp7}
\end{figure*}

\clearpage

\includegraphics[width=18cm]{eLifeFigApp3a-C2C12-heatmaps-MyoD-V2.png}

\clearpage

\includegraphics[width=18cm]{eLifeFigApp3b-C2C12-heatmaps-myogenin-only-V2.png}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18cm]{eLifeFigApp3c-C2C12-heatmaps-MEF2-only-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of muscle differentiation}. DNAse-seq and ChIP-seq experiments against H3K27ac, p300, the MRFs MyoD and myogenin, and cofactors (MEF2, E2A/TCF3, HEB/TCF12, and Pbx1) in undifferentiated (myoblast, or ``MB'') and differentiated (myocyte, or ``MC'') C2C12 cells were analyzed. Sites were split into multiple subgroups depending on regulatory factor occupancy (at IDR=0.05) -- MyoD-positive (in either condition) sites (A), myogenin-only sites (B), and MEF2-only sites (C) -- then sorted by MRF ChIP-seq signal (in the following order of priority: myoblast MyoD, myocyte MyoD, myocyte myogein, myoblast MEF2, myocyte MEF2); the signal in the 500bp-radius region around the ChIP-seq peak position is shown.} 
\label{FigApp3}
\end{figure*}

\clearpage


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig2-C2C12-DNAse-H3K27ac.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation during muscle differentiation}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in C2C12 myoblasts; (B) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in C2C12 myocytes; (C) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in C2C12 myoblasts; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in C2C12 myocytes; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black; (E) Dynamic changes in DNAse hypersensitivity and H3K27 acetylation upon differentiation for promoter-proximal and distal sites.}
\label{Fig2S}
\end{figure*}

\clearpage

\includegraphics[width=18.5cm]{eLifeFig3a-C2C12-funcassay-locus-V2.png}

\clearpage

\includegraphics[width=18.5cm]{eLifeFig3b-C2C12-funcassay-random-V2.png}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFigApp4c-C2C12-funcassay-negative-controls-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of cRE regulatory activity in C2C12 cells}. Fold activity in myocytes (top) and myoblasts (bottom) across biological replicates ($n = 4$) and technical replicates  ($n = 4$ for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity, H3K27ac status, p300, MyoD and myogenin occupancy are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs selected for their physical proximity to loci known for their importance to muscle development (``locus picks''); (B) randomly selected from the genome-wide set of MyoD/myogenin-occupied regions; (C) negative controls.} 
\label{Fig3S}
\end{figure*}

\clearpage

\begin{FPfigure}
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig4-C2C12-predictivity-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in C2C12 cells}. (A and B) Correlation between fold activity and DNAse hypersensitivity, H3K27ac, p300, myogenin, MyoD and MEF2 occupancy in myoblasts and myocytes; (C) ROC curves showing biochemical mark predictivity of cRE fold activity in myocytes; (D) AUROC (area under ROC curve) values for different biochemical marks in myocytes; (E) ROC curves showing biochemical mark predictivity of cRE fold activity in myoblasts; (F) AUROC values for different biochemical marks in myoblasts.} 
\label{FigS4}
\end{FPfigure}

\begin{FPfigure}
\begin{center}
\includegraphics[width=16cm]{eLifeFigApp5-G1E-TAL1-GATA-heatmaps-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of eryhtroid differentiation}. DNAse-seq and ChIP-seq experiments against H3K27ac, GATA1, TAL1 and GATA2 G1E and G1E-ER4 were analyzed. Sites were split into subgroups depending on GATA1 and TAL1 occupancy (IDR=0.05), then sorted by ChIP-seq signal (in the following order of priority: G1E-ER4 GATA1, G1E-ER4 TAL1); the signal in the 500bp-radius region around the ChIP-seq peak position is shown.} 
\label{FigApp5}
\end{FPfigure}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18cm]{eLifeFig5-G1E-DNAse-H3K27ac.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation during erythroid differentiation}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in G1E cells; (B) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in G1E-ER4 cells; (C) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in G1E cells; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in G1E-ER4 cells; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black; (E) Dynamic changes in DNAse hypersensitivity and H3K27 acetylation upon differentiation for promoter-proximal and distal sites.} 
\label{Fig5S}
\end{figure*}

\clearpage

% \begin{FPfigure}
% \begin{center}
% \includegraphics[width=18.5cm]{eLifeFig6-G1E-funcassay.png}
% \end{center}
% \captionsetup{singlelinecheck=off,justification=justified}
% \caption{
% {\bf Functional assay testing of the regulatory activity of erythroid cREs}. Fold activity in K562 cells across biological replicates ($n \in [1,9]$) and technical replicates ($n = 4$ for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity, H3K27ac status, GATA1, and TAL1 occupancy are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs randomly selected from the genome-wide set of GATA1/TAL1-occupied regions; (B) cREs selected among the set of highly evolutionarily constrained non-coding elements that contain a GATA1 motif (``regulatory potential selections'').} 
% \label{Fig6S}
% \end{FPfigure}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig7-G1E-predictivity.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in erythroid cells}. (A and B) Correlation between fold activity in K562 cells and DNAse hypersensitivity, H3K27ac, TAL1, and GATA1 occupancy in G1E and G1E-ER4 cells; (C) ROC curves showing biochemical mark predictivity of cRE fold activity; (D) AUROC (area under ROC curve) values for different biochemical marks.} 
\label{FigS7}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig8-K562_HepG2-DNAse-H3K27ac.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation in immortalized human cell lines}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in K562 cells; (B) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in K562 cells; (C) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in HepG2 cells; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in HepG2 cells; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black.} 
\label{Fig8}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig9-K562_HepG2-funcassay.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of cRE regulatory activity in human immortalized cell lines}. Fold activity across biological replicates ($n =$ \hl{???}) and technical replicates ($n = $  \hl{???} for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity and H3K27ac status are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs tested in K562 cells (B) cREs tested in HepG2 cells.} 
\label{eLifeFig9-K562_HepG2-funcassay}
\end{figure*}

\begin{FPfigure}
\begin{center}
\includegraphics[width=17.5cm]{eLifeFig10-K562-HepG2-A549-plots.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in human immortalized cell lines}. (A and B) Correlation between fold activity in K562 cells and DNAse hypersensitivity, and transcription factor occupancy in K562 and HepG2 cells; (C) ROC curves showing biochemical mark predictivity of cRE fold activity in K562 cells; (D) AUROC (area under ROC curve) values for different biochemical marks in K562 cells; (E) ROC curves showing biochemical mark predictivity of cRE fold activity in K562 cells; (F) AUROC (area under ROC curve) values for different biochemical marks in K562 cells.} 
\label{FigS10}
\end{FPfigure}

% \begin{figure*}[!ht]
% \begin{center}
% \includegraphics[width=15.5cm]{eLifeFig11-predictivity-V2.png}
% \end{center}
% \captionsetup{singlelinecheck=off,justification=justified}
% \caption{
% {\bf CAPTION}. (A) CAPTION GOES HERE} 
% \label{Fig11}
% \end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=13cm]{eLifeFig12-predictivity-vs-signal.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Enrichment of active cEnhs in different classes of myogenic cEnhs defined by the strength of their biochemical signatures.} (A) According to myogenin ChIP-seq signal strength; (B) According to H3K27ac ChIP-seq signal strength; (C) According to DNAse-seq signal strength. Shown on the right are the total number of cEnhs biochemically marked my myogenin, H3K27ac or DNAse, and the extrapolated number of active enhancer elements among them.} 
\label{Fig12}
\end{figure*}

\begin{sidewaysfigure}
\begin{center}
\includegraphics[width=25cm]{eLifeFigApp6-A549-heatmaps-V2.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of GR response in A549 cells}. GR ChIP-seq peaks were split into promoter-proximal and promoter-distal sites, then ranked by ChIP-seq signal strength within each group. Shown is the distribution of p300 and H3K27ac ChIP-seq and DNAse-seq signal around each site.} 
\label{FigApp6}
\end{sidewaysfigure}

\clearpage

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{eLifeFigApp8-A549-STARR-seq-activity.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Testing of cEnhs for activity using ChIP-STARR-seq for GR in A549 cells with and without Dexamethasone stimulation }. (A) Fraction of active cEnhs detected in each condition. Shown is the number of cEnhs that passed the minimum representation threshold (see the Methods section for more details) and were identified as active using DESeq2. (B) Fraction of significantly active (FDR-corrected $p$-value $\leq 0.05$) biochemically marked individually on in combinations by H3K27ac, DNAse, p300. (C) Length distribution of active and inactive STARR-seq fragments as defined by DESeq2. } 
\label{FigApp8}
\end{figure*}

% \begin{figure*}[!ht]
% \begin{center}
% \includegraphics[width=18.5cm]{2017-08-31-Figs/Fig6.png}
% \end{center}
% \captionsetup{singlelinecheck=off,justification=justified}
% \caption{
% {\bf Marking of common and cell state-specific active cEnhs by H32K7ac, DNAse and p300}. (A) STARR-seq data in A549 cells with and without Dexamethasone treatment (epigenomic datasets from the 3 hour time point were used); (B) Luciferase assay data in differentiated and undifferentiated C2C12 cells.} 
% \label{Fig6}
% \end{figure*}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{2017-08-31-Figs/FigS14-STARR-seq-activity-distribution.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Distribution of STARR-seq activity in A549 cells}. Shown is the distribution of $log_2$(FoldChange) values (defined by DESeq2) for STARR-seq experiments in resting EtOH-treated (A) and Dexamethasone-treated (B) A549 cells.} 
\label{FigS14}
\end{figure*}

\end{document}

https://www.overleaf.com/11055589xrycntbhwjnf