\documentclass[10pt]{article}
\usepackage[hmargin=1.5cm,top=2cm,bottom=2cm]{geometry}
\usepackage{multicol}
\setlength\columnsep{15pt}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage[auth-sc]{authblk}
\usepackage{longtable}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{enumerate}
\usepackage[labelfont=bf]{caption}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{mdframed}
\usepackage{graphics}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{array}
\usepackage{lscape}
\usepackage{caption}
\usepackage{breakurl}
\usepackage{todonotes}
\usepackage{hanging}
\usepackage{pagecolor}
\usepackage[final]{pdfpages}
\usepackage[leftFloats,CaptionBefore]{fltpage}
\usepackage[numbers,super,sort&compress]{natbib}
\setlength{\bibsep}{3pt}
\usepackage{abstract}
\usepackage{enumitem}
\usepackage{soul}
\usepackage{titlesec}
\titleformat{\section}[block]{\large\bfseries\filcenter}{\thesection.}{0.4em}{}
\titleformat{\subsection}[block]{\normalsize\sc\bfseries\filcenter}{\thesubsection.}{0.4em}{}
\setcounter{secnumdepth}{5}
% \usepackage{fancyhdr}
% \pagestyle{fancy}
% \renewcommand{\sectionmark}[1]{
% \markright{\thesection\ #1}}
% \fancyhf{} % delete current header and footer
% \fancyhead[CO]{Third-generation sequencing and functional genomics}
% \fancyhead[CE]{Georgi K. Marinov}
% \fancyhead[RO,LE]{\thepage}
% \fancyhead[LO]{\bfseries\rightmark}
% \fancyhead[RE]{\bfseries\leftmark}
% \renewcommand{\headrulewidth}{0pt}
% \renewcommand{\footrulewidth}{0pt}
% \addtolength{\headheight}{0.5pt} % space for the rule
% \fancypagestyle{plain}{%
% }

\newcommand{\filllastline}[1]{
\setlength\leftskip{0pt}
\setlength\rightskip{0pt}
\setlength\parfillskip{0pt}
#1}

\makeatletter
\def\@biblabel#1{\@ifnotempty{#1}{#1.}}
\makeatother

\newenvironment{Figure}
{\par\medskip\noindent\minipage{\linewidth}}
{\endminipage\par\medskip}

\title{\bf Enhancer activity and biochemical signatures in ENCODE Encyclopedia Candidate Regulatory Elements}
\renewcommand\Authfont{\scshape\normalsize}
%\author[*]{ALL OTHER AUTHORS I'VE MISSED}
\author[1]{Gilberto DeSalvo}
\author[6]{Georgi K. Marinov}
\author[2]{Christopher Partridge}
\author[4,7]{Christopher M. Vockley}
\author[3]{Nergiz Dogan}
\author[8,9]{Ricardo Ramirez}
%\author[1]{Gordon Kwan}
\author[4,5]{Timothy E. Reddy}
\author[8,9]{Ali Mortazavi}
\author[3]{Ross C. Hardison}
\author[2]{Richard M. Myers}
\author[1]{Barbara J. Wold}
\renewcommand\Affilfont{\itshape\normalsize}
%\affil[*]{other affiliations}
\affil[1]{Division of Biology and Biological Engineering, California Institute of Technology, Pasadena, California, USA.}
\affil[2]{HudsonAlpha Institute for Biotechnology, 601 Genome Way, Huntsville, AL 35806, USA}
\affil[3]{Dept. of Biochemistry and Molecular Biology, Penn State University, 304 Wartik Laboratory, University Park, PA 16802, USA}
\affil[4]{Center for Genomic \& Computational Biology, Duke University, Durham, NC 27708, USA}
\affil[5]{Department of Biostatistics \& Bioinformatics, Duke University, Durham, NC 27708, USA.}
\affil[6]{Independent researcher; Zahari Zograf 24 V 8, Pleven, Bulgaria}
\affil[7]{Department of Cell Biology, Duke University, Durham, NC 27708, USA}
\affil[8]{Department of Developmental and Cell Biology, University of California Irvine, Irvine, CA 92697-2300, USA}
\affil[9]{Center for Complex Biological Systems, University of California Irvine, Irvine, CA 92697-2280, USA}
\date{}

\begin{document}
\maketitle

% \let\thefootnote\relax\footnotetext{Corresponding author:
% Georgi K. Marinov. Division of Biology and Biological Engineering, California Institute of Technology. Pasadena, CA 91125.}
% \centerline{}
% \centerline{}
\begin{abstract}
\noindent {\normalsize \textbf{An aspiration for functional genomics is to identify all cis-acting regulatory elements in the genome and define their activities.  Genome-wide transcription factor occupancy and chromatin state profiles are widely used to identify candidate regulatory elements (cREs), although accurately predicting their varied regulatory activities and cell type specificities has proved surprisingly difficult. Multiple large-scale efforts, including ENCODE and Epigenomic Roadmap projects, have generated rich standardized biochemical data from large numbers of diverse cell lines, tissues and developmental stages for human and mouse. These data collections are driving attempts to catalog the global cRE repertoire, including a new ENCODE Encyclopedia of Elements V1.0. In light of these expanded resources, we aimed to uniformly analyze a collection of pertinent enhancer tests for hundreds of individual constructs in five cell types, including both progenitor cell states and mature differentiated cell systems.  The elements were drawn from both TSS-proximal and far-distal locations; were designed to include the full-length cREs; and were selected using criteria that ranged from simple single-TF occupancy to integrative machine learning models. A few shared qualitative and quantitative themes emerged: roughly 50\% of elements occupied by at least one TF and possessing the basic ENCODE-style cRE signature (DNAse$^+$/H3K27ac$^+$) showed enhancer activity. Having sampled across the full range of TF/DHS/H3K27Ac signal, we could project that the vast majority of enhancers genome-wide have very modest activity levels by this assay. We compared these results with a massively parallel STARR-seq design for glucocorticoid sensitive cREs and found them largely similar.  We also uncovered a set of developmentally distinct distal elements marked in progenitor cells to predict future, rather than contemporaneous, enhancer activity upon differentiation. Finally, developmentally privileged loci possessing multiple, highly regulated genes were enriched for enhancer cREs with exceptionally strong individual activity despite having only modest biochemical signatures. }}
% \centerline{}
% \centerline{}
\end{abstract}

\begin{multicols}{2}

\section*{Introduction}

The complete and accurate understanding of the relationship between the human genome and its corresponding phenotypes requires the comprehensive characterization of its compendium of functional elements. The results of the many genome-wide epigenomic and transcriptomic studies carried out over the last decade reveal a remarkable picture, in which non-coding regulatory elements constitute the bulk of such functional regions in the genome\cite{mouseENCODE2014,ENCODE2012}, with the expression of each gene (protein coding or non-coding) being controlled by the integrated output of multiple proximal and distal enhancer, insulator and silencing elements.

The genome-wide mapping and characterization of non-coding regulatory elements is thus a major goal of the field, and features prominently among the objectives of the \textbf{ENC}yclopedia \textbf{O}f \textbf{D}NA \textbf{E}lements (ENCODE) consortium\cite{ENCODE101}. However, achieving it, although greatly aided by the advent of high-throughput sequencing and epigenomic tools, is still not a simple task.

The biochemical activity associated with the function of regulatory elements results in certain biochemical signatures that can be captured by epigenomic assays. For example, active promoters in eukaryotes are classically associated with the trimethylation of lysine 4 on histone 3 (H3K4me3)\cite{Vermeulen2010}, as well as other biochemical signatures, such as DNAse hypersensitivity\cite{Thurman2012}. Active enhancer elements have been proposed to exhibit their own biochemical signature, featuring DNAse hypersensitivity, H3K27ac, H3K4me1 and occupancy by the p300 acetyltransferase\cite{Creyghton2010,RadaIglesias2011,Thurman2012} as well as by sequence-specific transcription factors.

These biochemical signatures enable the compilation of lists of candidate functional elements (cREs), but they do not on their own allow the conclusive identification of any given element as functional. While functional regulatory elements exhibit characteristic biochemical signatures, the reverse (that the presence of a biochemical signature necessarily means function) cannot be inferred straightforwardly\cite{Kellis2014}. Such inferences are further complicated by the observation that biochemical signatures are not binary but instead exist on a continuum between strong outstanding features, about which little doubt can exist, on one hand, and what is probably biochemical noise on the other. For example, it is far from clear that all transcription factor binding sites that can be reproducibly identified using ChIP-seq and related techniques are in fact functional REs\cite{Fisher2012}. Therefore, individual cREs in the lists compiled by efforts such as the ENCODE and mouseENCODE consortia\cite{ENCODE2012,mouseENCODE2014} have to be directly tested for function and characterized in detail.

The ultimate functional characterization of cREs will involve a combination of loss-of-function assays and direct assays for activity. The former have been until recently technically challenging, but are becoming more commonplace with the advent of large-scale CRISPR/Cas9-mediated mutagenesis techniques\cite{Korkmaz2016,Fulco201}. Nevertheless, most work in the field has been based on testing cREs for regulatory activity using an exogenous plasmid construct combining a cRE, a promoter and a reporter % we need an old-school reference for transfectons. 

Classically, such testing has been done by cloning individual cREs into plasmids (or other vectors) and then assaying the expression of the reporter gene (luciferase activity in cell lines or staining for LacZ activity in embryos\cite{Visel2007,Fisher2012}). Numerous developmental enhancers have been characterized following that approach\cite{Visel2007,Visel2009,May2011}, usually starting from lists of cREs derived from p300 ChIP-seq datasets. However, \hl{XXX explanation of issues with system XXX}. 

High-throughput sequencing has enabled the development of assays that go beyond the testing of individual cREs, one by one; instead very large numbers of sequences are analyzed in parallel, with the readout being based on sequencing DNA tags associated with the cRE or of the cRE itself. These are usually referred to as Massively Parallel Reporter Assays (MPRA)\cite{Inoue2015}, and a number of variations of the principle have been successfully applied to a multitude of biological problems and systems in the last few years\cite{Patwardhan2009,Kinney2010,Kwasnieski2012,Melnikov2012,Patwardhan2012,FIREWACh2014}, including the question of testing cREs for activity within the context of the ENCODE Consortium\cite{Kheradpour2013,Kwasnieski2014,Ernst2016}. However, several issues complicate the interpretations of MPRA experiments.

First, the nature of MPRA designs is such that the elements tested are very short, in the 80-250bp neighborhood. This is significantly shorter than tens of thousands of blocks of conserved noncoding sequence that can be identified in the human genome by comparative genomic analysis (Figure \ref{Fig1}). Thus to what extent complete REs are assayed and what the corresponding false negative rate is have always been an obvious concern regarding MPRAs. 

Second, given that it is very difficult to control the number of constructs going into each individual cell in transfection experiments, and that an MPRA features large numbers of different cREs being tested in the same time, there is significant potential for crosstalk between active and inactive cREs that end up in the same cell, resulting in numerous false positives. 

While the latter concern can be alleviated to an extent through the use of genome-integrated MPRA constructs \cite{Maricque2016,Inoue2016}, the short length of constructs tested remains a significant issue, and one that might be behind the low positive rates reported by MPRAs in the past\cite{Vockley2016}. 

There is therefore a major gap in the field that needs to be filled by testing a large number of individual constructs of large size (500-1000 bp) using a traditional luciferase assay, and using the resulting daya to examine the performance of biochemical signature predictions.

To this end, we tested hundreds of constructs of such length in several mammalian cell line systems (Figures \ref{Fig2} and \ref{Fig3}). \hl{XXXX FINISH XXX}

\hl{XXX SUMMARY OF RESULTS AT END OF INTRO, TO BE WRITTEN XX}

% \cite{Vockley2016}

% \cite{STARRseq2013}

\section*{Results}

\subsection*{Large-scale functional assay testing of cREs}

As the ENCODE encyclopedia utilizes a combination of DNAseHS and H3K27Ac to annotate candidate Regulatory Elements, it is important to understand the relationship of these two measurements in the immortalized lines. 

After summing the biochemical marks of several thousands of cell lines and states much of the genome may be covered by cREs. Only a small portion of sites are shared across many cell types, CTCF being the most prominent contributor. What would have perhaps been more sensible is to create individual candidate Regulatory Elements for each tissue and state individually. While this merged cRE collection results in a genomic coverage that is at first glance absurdly high, it is important to understand the functional contribution in each tissue. To this extent we survey functional studies from  several tissues for the functional contribution of DNAseHS and H3K27Ac co-marked elements.

The candidate Regulatory Elements in this study were in most cases selected prior to DNAseHS data being available. cREs were picked by ChIP-Seq measured histone marks or Transcription Factor (TF) occupancy(\ref{Fig2}). 

For the human immortalized cell lines chromatin state measurements were ran through machine learning algorithms including chromHMM in K562 and Self-Organizing Maps (SOM) in HepG2 to guide the majority of picks. 

In Erythroid progenitor (G1E) and eythroblast (G1E$-$ER4) cells cREs were selected by GATA1 and TAL1 occupancy but were tested in human K562 cells. 

In C2C12 Myoblasts and Myocytes we selected elements on the basis of Muscle Regulatory Factor (MRF) occupancy and tested in both cell states. 

Glucocorticoid Responsive ChIP-Seq (GR) was cloned into a Starr-Seq reporter vector and assayed in A549 cells under vehicle (EtOH) and GR (Dex) conditions. 

These elements were scored for DNAseHS and H3K27Ac, upon which the new ENCODE encyclopedia is based, to extrapolate their biological function predictivity rate. Here we present the predictive rates of those key biochemical signatures of the systems from which the candidate elements were derived.\cite{ENCODE2017}

\subsection*{Regulatory activity of cREs in human immortalized cell lines}
Two of the most studied immortalized cell lines in encode are perhaps K562 and HepG2. These highly characterized cell lines allow us to provide an expectation of the relationship between DNAseHS and H3K27Ac; the two measurements used in combination to characterized cREs for the encyclopedia. 

DNAseHS and H3K27Ac are expected to strongly overlap at promoters. As we see in \ref{Fig3} both in K562 and in HepG2 these measurements highlight a set of 13,118 and 12,485 putative promoter regions respectively. In distal cREs we find a similar number of common sites, along with a much expanded set of DNAseHS only sites; which are likely comprised mainly of looping anchors and chromatin insulator CTCF sites. cREs that were tested in our assay are represented by the black dots in \ref{Fig3} and are mainly distal elements to avoid the issue promoter-promoter crosstalk as they were cloned 5' in a plasmid containing a basal promoter. 

In order to assay the expected rate of functional elements within the new ENCODE encyclopedia we annotated the assay output of machine learning selected cREs from ENCODE2 for modern (ENCODE3 Standard) IDR peak called DNAseHS and H3K27Ac \ref{Fig4}. 

While the HepG2 SOM picks are a good match to the ENCODE encyclopedia design, the chromHMM cREs did not all pass modern IDR calls for H3K27ac. Negative control elements are shown on the right half of each plot and are consistently DNAseHS negative sites, indicating either a lack of TF occupancy or generally restricted accessibility.

Nevertheless, 30 cREs in K562 and 32 cREs in HepG2 that were tested are positive for both DNAseHS and H3K27Ac. From these we are able to predict that we expect \hl{XX$\%$} of cREs in K562 and \hl{XX$\%$} of cREs in HepG2 to be active regulatory elements. 

A small set of nine H3K27Ac only sites were tested in K562 as part of the negative control set, but none scored as enhancers in the assay indicating that there may not be much positive regulatory function in the 5,900 H3K27Ac exclusive sites in \ref{Fig3}C. Similarly none of the 12 DNAseHS exclusive sites in K562 scored as enhancers in our assay, indicating that function is likely largely exclusive to the overlap of DNAseHS and H3K27Ac. 

The general correlation between regulatory activity and biochemical marks can be found in \ref{Fig5} including correlations of activity to several of the key TFs as measured by ChIP-Seq in each cell line.

\end{multicols}

\begin{FPfigure}
\begin{center}
\includegraphics[width=15.5cm]{eLifeFig1-cartoon.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Outline of experimental design, cRE selection and testing}. \hl{XXXX details XXXX} } 
\label{Fig1}
\end{FPfigure}

\begin{multicols}{2}

\subsection*{The regulatory landscape of muscle differentiation}

Early myogenesis, the differentiation from specified muscle precursor myoblast cells to differentiated myocytes can be studied by the model C2C12 mouse cell line. 

This process involves up-regulating the transcript level of 628 genes, down-regulating 824, with 8704 genes remaining stable. This is represented in the biochemical markings at the promoter regions which shift  only modestly. \ref{Fig6}

In contrast, the biochemistry at distal sites changes significantly across the differentiation resulting in the restructuring of over 4000 exclusive DNAse Hypersensitive sites; more than half of which are significantly H3K27Ac marked.\ref{Fig6}

The process of myogenesis is regulated by four key BHLH TFs known as Myogenic Regulatory Factors (MRFs) along with numerous cofactors. MyoD and Myf5 are most similar by peptide sequence and can both function as specification factors (Braun et al. 1989). These two TFs have almost completely overlapped ChIP-Seq occupancy however their target genes are at distinct indicating that the mechanism for targeting the MRFs to the right genomic position is likely shared.

MyoD1, the key specification factor, was discovered by treating 10T1/2 cells with 5-azacytidine; which turn into multipotential somitic precursors, some of which spontaneously differentiated into myocytes (Taylor et al, 1979; Davis et al. 1987). Overexpression of MyoD in fibroblast cells (MEF) results in high overlap with the native sites found in differentiated C2C12s. (Tapscott 2014) MyoD is recruited by PBX1/Meis to the myogenin promoter. (De La Serna IL 2005; Kyle L. MacQuarrie 2014; Toto PC 2016) This implies that MyoD, likely through the help of PBX1/Meis is able to pioneer sites for later direct nearby occupancy, a mechanism which may be shared for Myf5 targeting. Both myogenin and MyoD, which are co-expressed in the myocyte, recognize a RRCAGSTG motif and occupy a highly overlapped set of sites. (Moncaut et al., 2013, Tapscott, 2005) 
 
Myogenin is the key differentiation factor in myogenesis and is able to compensate for the loss of the terminal differentiation Myf6/MRF4. A Myogenin knock out leads to a severe malformation of all skeletal muscle, and is lethal at birth (Hasty et al. 1993; Wright et al. 1989). It is likely that the differentiation into myocytes is gated by TFs such as Zeb1 and snail occupying the promoter regions and maintaining effective repression on terminal genes.

The C2C12 cell line has been well-characterized and studied in the Wold lab over the last decade, including factor occupancy measurements for several of the key muscle differentiation factors (MEF2, MyoD, Myogenin, E12/HEB) as well as key co-activators such as EP300. Much of this distal biochemical shift in myocyte can be visualized in the Supplementary heatmap \ref{FigXX} panels B and C which show myocyte exclusive occupancy of myogenin, Mef2, and that of a small subset of MyoD sites in panel A. 

This system allowed us to probe several outstanding biological questions:
1) Is the binding of myogenin, the primary differentiating TF, a determinant or simply a permissive condition for biological function? 

2) What is the enhancer function predictivity of these elements when scored according to the encyclopedia criteria of combined DNAseHS and H3K27Ac marks?

3) Is ChIP-seq is equally sensitive for enhancers across different signal strengths? 

4) Are myogenin selected elements that are MyoD occupied in myoblast functional in both cell states?

\subsection{Regulatory activity of cREs in Muscle Cells}

As noted above; it is unlikely that elements which are DNAseHS exclusive or H3K27Ac exclusive will function as enhancer elements in transactional assays. 

Here we tested a set of negative controls which include T-cell and neuronal enhancer elements centered upon an internal MRF motif; as well as unoccupied motifs within highly transcribed loci.

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig2-C2C12-DNAse-H3K27ac.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation during muscle differentiation}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in C2C12 myoblasts; (B) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in C2C12 myocytes; (C) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in C2C12 myoblasts; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in C2C12 myocytes; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black; (E) Dynamic changes in DNAse hypersensitivity and H3K27 acetylation upon differentiation for promoter-proximal and distal sites.}
\label{Fig2}
\end{figure*}

\end{multicols}

\clearpage

\includegraphics[width=18.5cm]{eLifeFig3a-C2C12-funcassay-locus-V2.png}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=16.5cm]{eLifeFig3b-C2C12-funcassay-random-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of cRE regulatory activity in C2C12 cells}. Fold activity in myocytes (top) and myoblasts (bottom) across biological replicates ($n = 4$) and technical replicates  ($n = 4$ for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity, H3K27ac status, p300, MyoD and myogenin occupancy are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs selected for their physical proximity to loci known for their importance to muscle development (``locus picks''); (B) randomly selected from the genome-wide set of MyoD/myogenin-occupied regions.} 
\label{Fig3}
\end{figure*}

\begin{multicols}{2}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=17.5cm]{eLifeFig4-C2C12-predictivity-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in C2C12 cells}. (A and B) Correlation between fold activity and DNAse hypersensitivity, H3K27ac, p300, myogenin, MyoD and MEF2 occupancy in myoblasts and myocytes; (C) ROC curves showing biochemical mark predictivity of cRE fold activity in myocytes; (D) AUROC (area under ROC curve) values for different biochemical marks in myocytes; (E) ROC curves showing biochemical mark predictivity of cRE fold activity in myoblasts; (F) AUROC values for different biochemical marks in myoblasts.} 
\label{Fig4}
\end{figure*}

\subsection*{The regulatory landscape of erythroid differentiation}

Similarly to muscle, during the differentiation of red blood cells, only a modest number of genes are regulated. This is also reflected in the biochemistry (Figure 9A;B) in contrast the distal element landscape is again highly dynamic, showing a shift for thousands of sites for only a few hundred changing genes. (Figure 9 E)

GATA1 is the master regulator of differentiation and proliferation of red blood cells (6), which by knockout results in an anemic phenotype. TAL1, a BHLH protein, is required for multiple functions in hematopoiesis, including terminal differentiation of red blood cells (7). Much like Myogenin TAL1 makes an obligatory heterodimer with E2A basic helix-loop-helix proteins, such as E47. GATA1, much like the MEF2 with myogenin in muscle, forms a complex with Tal1 along with other key components. 

Much like PBX1 and MyoD in muscle, GATA2 is thought to guide TAL1 to the landing sites in the genome. The motifs of DNA recognized for erythropoiesis and myogenesis have a striking similarity, but distinguishing features which results in distinct regions of the genome being used in each tissue. (discussion)

Several ChIP-seq experiments have been performed and are summarized in a heatmap (Supplementary Figure 2). 

\subsection*{Regulatory activity of cREs in erythroid cells}

Elements tested were selected based on two criteria by ChIP-Seq factor occupancy by TAL1/GATA1 (Figure 10A) and by evolutionary constraint and TF motif (Figure 10B) Although the elements were selected based on mouse biochemistry; they were tested in the human K562 erythroid cell line. This precludes a similar comparison of the differential in function across early erythropoeisis but allows us to contrast the functional predictivity of DNAse and H3K27Ac marked sites picked by a similar criteria as in muscle (presence of a TF) as well as a small but interesting set of sites picked prior to ChIP-Seq data being available by conservation and motif presence.
 

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig5-G1E-DNAse-H3K27ac.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation during erythroid differentiation}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in G1E cells; (B) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in G1E-ER4 cells; (C) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in G1E cells; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in G1E-ER4 cells; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black; (E) Dynamic changes in DNAse hypersensitivity and H3K27 acetylation upon differentiation for promoter-proximal and distal sites.} 
\label{Fig5}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=17.5cm]{eLifeFig6-G1E-funcassay.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of the regulatory activity of erythroid cREs}. Fold activity in K562 cells across biological replicates ($n \in [1,9]$) and technical replicates ($n = 4$ for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity, H3K27ac status, GATA1, and TAL1 occupancy are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs randomly selected from the genome-wide set of GATA1/TAL1-occupied regions; (B) cREs selected among the set of highly evolutionarily constrained non-coding elements that contain a GATA1 motif (``regulatory potential selections'').} 
\label{Fig6}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=16.5cm]{eLifeFig7-G1E-predictivity.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in erythroid cells}. (A and B) Correlation between fold activity in K562 cells and DNAse hypersensitivity, H3K27ac, TAL1, and GATA1 occupancy in G1E and G1E-ER4 cells; (C) ROC curves showing biochemical mark predictivity of cRE fold activity; (D) AUROC (area under ROC curve) values for different biochemical marks.} 
\label{Fig7}
\end{figure*}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig8-K562_HepG2-DNAse-H3K27ac.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Relationship between DNAse hypersensitivity and H3K27 acetylation in immortalized human cell lines}. (A) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in K562 cells; (B) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in K562 cells; (C) Overlap between DNAse hypersensitive and H3K27ac-positive promoter-proximal regions in HepG2 cells; (D) Overlap between DNAse hypersensitive and H3K27ac-positive distal regions in HepG2 cells; the kernel density of the ChIP-seq/DNAse-seq signal distribution for each class of sites is overlaid over the scatter plots, and the distribution of tested cREs is shown in black.} 
\label{Fig8}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{eLifeFig9-K562_HepG2-funcassay.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of cRE regulatory activity in human immortalized cell lines}. Fold activity across biological replicates ($n =$ \hl{???}) and technical replicates ($n = $  \hl{???} for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity and H3K27ac status are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores. (A) cREs tested in K562 cells (B) cREs tested in HepG2 cells.} 
\label{Fig9}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=17.5cm]{eLifeFig10-K562-HepG2-A549-plots.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between regulatory activity and biochemical marks in human immortalized cell lines}. (A and B) Correlation between fold activity in K562 cells and DNAse hypersensitivity, and transcription factor occupancy in K562 and HepG2 cells; (C) ROC curves showing biochemical mark predictivity of cRE fold activity in K562 cells; (D) AUROC (area under ROC curve) values for different biochemical marks in K562 cells; (E) ROC curves showing biochemical mark predictivity of cRE fold activity in K562 cells; (F) AUROC (area under ROC curve) values for different biochemical marks in K562 cells.} 
\label{Fig10}
\end{figure*}

\subsection*{The regulatory landscape of glucocorticoid response in A549 cells}

% Fig16: STARR-seq
% Fig17: ROC vs STARR-seq

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=14cm]{eLifeFig11-predictivity-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CAPTION}. (A) CAPTION GOES HERE} 
\label{Fig11}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=14cm]{eLifeFig12-predictivity-vs-signal.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \hl{CAPTION} }. (A) CAPTION GOES HERE} 
\label{Fig12}
\end{figure*}

\section*{Discussion}

The orientation of cREs tested relative to the TSS has an impact on the assay output; although these elements are very promoter proximal.  \ref{Fig17}
There is a notable orientation effect for cREs tested in our transfection assay. Although this effect could be due to the elements being tested 5' relative to the test plasmid TSS, this seems unlikely as the effect is also observed in A549 STARR-SEq

Promoter-enhancer specificity \cite{Zabidi2015}

integration in genome \cite{Maricque2016,Inoue2016,FIREWACh2014}

\section*{Methods}

Except where otherwise stated, all analyses were performed using custom-written python scripts. The GENCODE

\subsection*{Cell culture}

\subsubsection*{C2C12 cells}

C2C12 myoblasts %and 10T1/2 fibroblasts 
were maintained and seeded for transfection in 20$\%$ FBS supplemented DMEM medium. Upon reaching $>$80$\%$ confluency, the cells were were differentiated %, or mock differentiated for 10T1/2s 
using 2$\%$ horse serum and 1 $\mu$M insulin in DMEM medium.

\subsubsection*{G1E cells}

\hl{XXXX DETAILS XXXX}

\subsubsection*{K562 cells}

\hl{XXXX DETAILS XXXX}

\subsubsection*{HepG2 cells}

\hl{XXXX DETAILS XXXX}

\subsubsection*{A549 cells}

\hl{XXXX DETAILS XXXX}

\subsection*{Cloning and DNA purification}

\subsection*{Functional assays}

\subsubsection*{C2C12 cells}

Candidate REs as well as a set of negative control regions were either PCR-amplified from female BALB/C purified mouse genomic DNA (Switchgear Genomics) or synthesized de novo (Genscript). The resulting DNA was cloned into a reporter vector 5$^\prime$ of a custom TK promoter (SwitchGear Genomics) driving a high-turnover sequence-optimized luciferase reporter gene. Plasmids were purified using Miniprep kits (Quiagen) and standardized to 30 ng/$\mu$L using fluorometry concentration measurements (Qubit\textsuperscript{\textregistered} dsDNA HS (High Sensitivity) Assay Kit).

For the purpose of testing elements in the myoblast state, undifferentiated C2C12 cells were seeded in 96-well delta surface plates (NUNC) quadruplicates 12 hours before transfection at a concentration of 2500 cells/well. For the purpose of testing elements in the myocyte state, undifferentiated C2C12 cells were seeded at a density of 3500 cells/well. %10T1/2 cells are seeded at 5000 cells per well. 
Transfections were carried out with 50ng of DNA per construct in each replicate using Lipofectamine LTX, after a 5 minute incubation with a 1:16 dilution with the PLUS reagent(Thermo Fisher). Myoblast plates were lysed using a Steady-Glo\textsuperscript{\textregistered} kit, and luminescence was measured on a plate luminometer 24 hour post-transfection. Myocyte plates %and mock differentiated 10T1/2 mesodermal precursor 
had their media exchanged with differentiation 12-16 hours post transfection and measured following the same procedure 24 hours later.

Aside from the plate reading step, the entirety of the trasnfection process was automated and carried out on a Tecan Freedom EVO 200 robot.

\subsubsection*{G1E/K562 cells}

\hl{XXX DETAILS XXX}

\subsubsection*{K562, HepG2 and A549 cells}

\hl{XXX DETAILS XXX}

\subsection*{Functional Assay Data processing}

Luminometer data was ratioed relative to the basal promoter vector (relative assay activity). Active cREs were discriminated from inactive using a Z-score analysis that compared the population of test element technical replicate values to the set of negative control vectors. \hl{XXX this could be stated more explicitly with the actual formulas XX}. 

\subsection*{ChIP-seq experiments}

Chromatin immunoprecipitation in A549 cells was performed as previously described (Reddy et al. 2009) using 2 $\times$ 10$^7$ A549 cells per replicate. Cells were sonicated using a Bioruptor XL (Diagenode) on the high setting until the resulting chromatin was fragmented to a median fragment size of 250 nt as assayed by agarose gel electrophoresis. GR ChIP was performed using 5 $\mu$g of a rabbit polyclonal $\alpha$-GR antibody (Santa Cruz Biotechnology sc-1003), and 200 $\mu$l of magnetic sheep anti-rabbit beads (Life Technologies M-280). H3K27ac ChIP was performed using \hl{XXX Ab source XXX}. After reversal of formaldehyde crosslinks at $65\,^{\circ}\mathrm{C}$ overnight, DNA was purified using MinElute DNA purification columns (QIAGEN). Illumina sequencing libraries were then generated using the Apollo 324 liquid handling platform according to manufacturer's specifications (Wafergen). 

ChIP-seq in C2C12 cells was performed using chromatin from 2 $\times$ 10$^7$ nuclei, which was fragmented using a Misonix probe tip sonicator and subjected to immunoprecipitation using a robotic ChIP pipeline described before. The resulting purified DNA was then converted into sequencing libraries and sequenced on an HiSeq 2500 (Illumina) as described previously\cite{Gasper2016}. The following antibodies were used: $\alpha$-myogenin (\hl{xxx}), $\alpha$-MyoD (\hl{xxx}), $\alpha$-MEF2 (\hl{xxx}), $\alpha$-p300 (\hl{xxx}), $\alpha$-E2A (\hl{xxx}), $\alpha$-H2B (\hl{xxx}), and $\alpha$-H3K27ac (\hl{xxx}). 

In addition, publicly available\cite{} Pbx1 ChIP-seq and Control datasets were downloaded from GEO accession GSE76010. 

For G1E, K562 and HepG2 cells, previously publicly available\cite{ENCODE101,ENCODE2012,mouseENCODE2014} ChIP-seq datasets were downloaded from the ENCODE portal \burl{https://www.encodeproject.org/}. 

\subsection*{DNAse-seq experiments}

In C2C12 cells, DNAse-seq was carried out as follows: \hl{XXXXX DETAILS XXX}

In A549 cells, DNAse-seq was carried out as follows: \hl{XXXXX DETAILS XXX}. 

For G1E, K562 and HepG2 cells, previously publicly available\cite{ENCODE101,ENCODE2012,mouseENCODE2014} DNAse-seq datasets were downloaded from the ENCODE portal \burl{https://www.encodeproject.org/}. 

\subsection*{RNA-seq experiments}

\hl{XXX C2C12 RiboMinus XXX}

\subsection*{STARR-seq experiments}

The STARR-seq experiments previously published by Vockley et al. \cite{Vockley2016} were used in this study. 

\subsection*{Genomic coordinate conversion}

The regions to be tested using functional assays were designed based on the \verb|mm8| and \verb|mm9| verions of the mouse genome and the \verb|hg19| version of the human genomes. Conversion of the original coordinates to \verb|mm10| and \verb|hg20| coordinates was performed using the \verb|liftOver| tool from the UCSC Genome Browser Utilities\cite{UCSCGB}.

\subsection*{Conservation analysis}

Sequence conservation analysis were carried out using the \verb|phastCons60way| and \verb|phastCons100way| conservation tracks, which were downloaded from the UCSC Genome Browser\cite{UCSCGB}. 

\subsection*{ChIP-seq data processing and analysis}

ChIP-seq reads were trimmed down to 36 bp in length and mapped against the \verb|hg20| (for human samples; the male or female version depending on the sex of the cell line the sample originated from) and \verb|mm10| (for mouse samples) using Bowtie \cite{Bowtie} (version 1.0.1) with the following settings: \verb|-v 2| \verb|-k 2| \verb|-m 1| \verb|--best| \verb|--strata|. DNAse-seq reads were processed similarly except that they were trimmed down to 20bp for A549 samples and 36bp for C2C12 cells (due to differences in the experimental protocol used to generate the data).

Peak calling was carried out as follows. For DNAse and H3K27ac datasets, MACS2 \cite{Feng2012} (version 2.1.0) was run on individual replicates and on pseudoreplicates (generated by randomly splitting the pooled set of reads for both replicates into two) with relaxed settings (\verb|--to-large| \verb|-p 1e-1|). For H3K27ac control datasets were subjected to the same treatment (no background/control is available for DNAse data) The top 100,000 peaks from each replicate or pseudoreplicate (ranked by $q$-value) were then used as input into IDR \cite{Li2011}. The number of peaks above a given IDR threshold called as reproducible between true replicates ($N_t$) and between pseudoreplicates  ($N_p$) were recorded. Peak calling was then carried out on the pooled set of reads and the top $max(N_t,N_p)$ peaks were chosen as the final set of reproducible peaks. For point-source \cite{Pepke2009} datasets (transcription factors), peak calling was carried out following the same procedure but using SPP \cite{Kharchenko2008} (version 1.10.1), using the top 300K peaks as input to IDR.

The pooled sets of reads were also used to calculate RPM (reads per million) enrichment values over elements tested in functional assays.

\subsection*{RNA-seq data processing}

ENCODE paired-end human RNA-seq reads were trimmed down to 2 $\times$ 75mers. C2C12 RiboMinus reads were aligned as 1 $\times$ 50mers. Reads were initially aligned against human or mouse ribosomal RNA sequence, and the unmapped reads were then aligned against the \verb|hg20| and \verb|mm10| versions of the human genome using TopHat\cite{TopHat2} (version 2.0.8). 

\subsection*{STARR-seq data processing and analysis}

% Estimation of Regulatory Activity in STARR-Seq Assayed Sites
% BAC STARR-seq output libraries were sequenced on an Illumina MiSeq Instrument using paired-end 25 bp reads. Reads were aligned to the hg19 reference genome sequence for the target BACs using Bowtie (Langmead and Salzberg 2012), paired end reads were converted to corresponding fragments, and the number of reads per fragment was counted for each replicate. The number of reads in each experiment was then normalized to the median of the number of reads per fragment divided by the geometric mean read count of that fragment. To estimate the significance of differences between EtOH and DEX treatment, a Wilcoxon signed-rank test was performed using a sliding 1 bp window across the target BACs. For each test, the normalized read counts for the fragments overlapping the window were compared between DEX and EtOH treatment conditions, and a p value was reported.

\section*{Acknowledgments}

Library generation and high-throughput sequencing for C2C12 ChIP-Seq samples was performed by Igor Antoshechkin at the Millard and Muriel Jacobs Genetics and Genomics Laboratory. The authors would also like to thank Diane Trout and Henry Amrhein for technical assistance with maintaining the computational infrastructure used to carry out this study.

This material is based upon work supported by the National Science Foundation under Grant No. CNS-0521433.

\begin{thebibliography}{100}

\input{references}

\end{thebibliography}

\end{multicols}

\setcounter{table}{0}
\renewcommand{\tablename}{Appendix-Table}
\renewcommand\thetable{\arabic{table}}    
\setcounter{figure}{0}
\renewcommand{\figurename}{Appendix-Figure}
\renewcommand\thefigure{\arabic{figure}}    

\section*{Appendix}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=14.5cm]{eLifeFigApp1-CNE-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf The length of thousands of conserved noncoding elements in mammalian genomes greatly exceeds the size range of MPRA constructs}. (A) The length distribution of conserved noncoding regions in the human genome. The \texttt{phastCons100way} conservation track for the \texttt{hg20} version of the human genome was downloaded from the UCSC Genome Browser. Blocks of conservation, in which all nucleotides have \texttt{phastCons} scores higher than the indicated minimum (\texttt{phCons}), were identified, and then merged into larger regions if the length of the gaps between them was smaller than the indicated \texttt{maxGap} parameter. The distribution of the lengths of the resulting sets of regions was plotted. This approach captures the properties of enhancer elements observed in the genome, which often consist of multiple blocks of highly conserved sequences separated by gaps of less conserved sequences, resulting in an enhancer element of up to a few hundred base pairs in length or more. (B) Such an example is shown for the \textit{Acta1} gene in mouse.} 
\label{FigApp1}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=14cm]{eLifeFigApp2-contruct-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Length distribution of functional assays constructs used to test cREs in this study}. (A) Distribution of functional assay construct lengths tested in this study in C2C12 cells. (B) Distribution of functional assay construct lengths tested in this study in G1E cells. (C) Distribution of functional assay construct lengths tested in this study in K562 and HepG2 cells } 
\label{FigApp2}
\end{figure*}

\clearpage

\includegraphics[width=17.5cm]{eLifeFigApp3a-C2C12-heatmaps-MyoD.png}

\clearpage

\includegraphics[width=17.5cm]{eLifeFigApp3b-C2C12-heatmaps-myogenin-only.png}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=17.5cm]{eLifeFigApp3c-C2C12-heatmaps-MEF2-only.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of muscle differentiation}. DNAse-seq and ChIP-seq experiments against H3K27ac, p300, the MRFs MyoD and myogenin, and cofactors (MEF2, E2A/TCF3, HEB/TCF12, and Pbx1) in undifferentiated (myoblast, or ``MB'') and differentiated (myocyte, or ``MC'') C2C12 cells were analyzed. Sites were split into multiple subgroups depending on regulatory factor occupancy (at IDR=0.05) -- MyoD-positive (in either condition) sites (A), myogenin-only sites (B), and MEF2-only sites (C) -- then sorted by MRF ChIP-seq signal (in the following order of priority: myoblast MyoD, myocyte MyoD, myocyte myogein, myoblast MEF2, myocyte MEF2); the signal in the 500bp-radius region around the ChIP-seq peak position is shown.} 
\label{FigApp3}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=16.5cm]{eLifeFigApp4c-C2C12-funcassay-negative-controls-V2.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional assay testing of cRE regulatory activity in C2C12 cells, negative controls}. Fold activity in myocytes (top) and myoblasts (bottom) across biological replicates ($n = 4$) and technical replicates  ($n = 4$ for each biological replicate) is shown. Candidate REs were sorted first by their DNAse status and then by their mean fold activity. The horizontal dotted line corresponds to the mean fold activity threshold above which elements are considered active. In addition, DNAse hypersensitivity, H3K27ac status, p300, MyoD and myogenin occupancy are shown for each cRE, both as binary (IDR=0.05) calls (red coloring indicates occupancy), and as RPM scores.} 
\label{FigApp4}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=17.5cm]{eLifeFigApp5-G1E-TAL1-GATA-heatmaps.png}
% \includegraphics[angle=90,origin=c,width=17.5cm]{Fig1.png}
% \includegraphics[width=23.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of eryhtroid differentiation}. DNAse-seq and ChIP-seq experiments against H3K27ac, GATA1, TAL1 and GATA2 G1E and G1E-ER4 were analyzed. Sites were split into subgroups depending on GATA1 and TAL1 occupancy (IDR=0.05), then sorted by ChIP-seq signal (in the following order of priority: G1E-ER4 GATA1, G1E-ER4 TAL1); the signal in the 500bp-radius region around the ChIP-seq peak position is shown.} 
\label{FigApp5}
\end{figure*}

\clearpage

\begin{sidewaysfigure}
\begin{center}
\includegraphics[width=24cm]{eLifeFigApp6-A549-heatmaps.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Regulatory landscape of GR response in A549 cells}. \hl{finish caption}} 
\label{FigApp6}
\end{sidewaysfigure}

\clearpage

\end{document}