\documentclass[10pt]{article}
\usepackage[hmargin=1.5cm,top=2cm,bottom=2cm]{geometry}
\usepackage{multicol}
\setlength\columnsep{15pt}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage[auth-sc]{authblk}
\usepackage{longtable}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{enumerate}
\usepackage[labelfont=bf]{caption}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{mdframed}
\usepackage{graphics}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{array}
\usepackage{lscape}
\usepackage{caption}
\usepackage{breakurl}
\usepackage{todonotes}
\usepackage{hanging}
\usepackage[final]{pdfpages}
\usepackage[leftFloats,CaptionAfterwards]{fltpage}
\usepackage[numbers,super,sort&compress]{natbib}
\setlength{\bibsep}{0pt plus 0.3ex}
\usepackage{abstract}
\usepackage{enumitem}
\usepackage{soul}
\usepackage{titlesec}
\titleformat{\section}[block]{\large\bfseries\filcenter}{\thesection.}{0.4em}{}
\titleformat{\subsection}[block]{\normalsize\sc\bfseries\filcenter}{\thesubsection.}{0.4em}{}
\titleformat{\subsubsection}[block]{\normalsize\sc\itshape\filright}{\thesubsection.}{0.4em}{}
\setcounter{secnumdepth}{5}

\makeatletter
\def\@biblabel#1{\@ifnotempty{#1}{#1.}}
\makeatother

\newcommand{\filllastline}[1]{
\setlength\leftskip{0pt}
\setlength\rightskip{0pt}
\setlength\parfillskip{0pt}
#1}

\newenvironment{Figure}
{\par\medskip\noindent\minipage{\linewidth}}
{\endminipage\par\medskip}

\title{\bf KAS-ATAC reveals the genome-wide single-stranded accessible chromatin landscape of the human genome}
\renewcommand\Authfont{\scshape\normalsize}
\author[1,*]{Samuel H. Kim}
\author[2,*,$\#$]{Georgi K. Marinov}
% \author[XXX]{XXXXX}
% \author[2,3]{Anshul Kundaje}
\author[2,3,4,5]{William J. Greenleaf}
\renewcommand\Affilfont{\itshape\small}
\affil[1]{Cancer Biology Programs, School of Medicine, Stanford University, Stanford, CA 94305, USA}
\affil[2]{Department of Genetics, School of Medicine, Stanford University, Stanford, CA 94305, USA}
% \affil[3]{Department of Computer Science, Stanford University, Stanford, CA 94305, USA}
\affil[3]{Department of Applied Physics, Stanford University, Stanford, CA 94305, USA}
\affil[4]{Center for Personal Dynamic Regulomes, Stanford University, Stanford, CA, 94305, USA}
\affil[5]{Chan Zuckerberg Biohub, San Francisco, California, USA}
\affil[*]{These authors contributed equally to this work}
\affil[$\#$]{Corresponding author}
\date{}

\begin{document}
\maketitle

\centerline{}
\centerline{}
\begin{abstract}

\noindent {\normalsize \textbf{Gene regulation in most eukaryotes involves two fundamental physical processes -- alterations in the packaging of the genome by nucleosomes, with active \textit{cis}-regulatory elements (CREs) generally characterized by an open-chromatin configuration, and the activation of transcription. Mapping these physical properties and biochemical activities genome-wide -- through profiling chromatin accessibility and active transcription -- are key tools used to understand the logic and mechanisms of transcription and its regulation. However, the relationship between these two states has until now not been accessible to simultaneous measurement. To address this, we developed KAS-ATAC, a combination of the KAS-seq (\textbf{K}ethoxal-\textbf{A}ssisted \textbf{S}sDNA \textbf{seq}uencing and ATAC-seq (\textbf{A}ssay for \textbf{T}ransposase-\textbf{A}ccessible \textbf{C}hromatin using \textbf{seq}uencing) methods for mapping single-stranded DNA (and thus active transcription) and chromatin accessibility, respectively, enabling the genome-wide identification of DNA fragments that are simultaneously accessible and contain ssDNA. We use KAS-ATAC to evaluate levels of active transcription over different classes of regulatory elements in the human genome, to estimate the absolute levels of transcribed accessible DNA over CREs, to map the nucleosomal configurations associated with RNA polymerase activities, and to assess transcription factor association with transcribed DNA through transcription factor binding site (TFBS) footprinting. We observe lower levels of transcription over distal enhancers compared to promoters, surprisingly high abundance of ssDNA immediately around/within CTCF occupancy footprints, and distinct nucleosomal configurations around transcription initiation sites associated with active transcription. Remarkably, most TFs associate equally with transcribed and non-transcribed DNA but a few factors specifically do not exhibit footprints over ssDNA-containing fragments. We anticipate KAS-ATAC to continue to derive useful insights into chromatin organization and transcriptional regulation in other contexts in the future.
% We anticipate that KAS-ATAC and analogous techniques will enable the derivation of further insights into the fine mechanisms of transcriptional regulation in the future.
} 
}
\centerline{}
\centerline{}
\end{abstract}

\begin{multicols}{2}

\section*{Introduction}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{Fig1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Overview of the KAS-ATAC assay}. 
(a) N$_3$-kethoxal specifically covalently labels unpaired guanine bases. 
(b) KAS-ATAC captures DNA fragments that are simultaneously single-stranded and accessible, by first labeling ssDNA with N$_3$-kethoxal, then carrying out an ATAC-seq reaction. After purification of DNA and a click reaction that conjugates biotin onto kethoxal-labeled DNA, accessible ssDNA is specifically pulled down using streptavidin beads and amplified. In parallel, a control ``biotin-ATAC'' library can be prepared using biotinylated Tn5 that then undergoes the same pull-down procedure as KAS-ATAC DNA fragments (if the goal is to quantify the relative abundances of accessible ssDNA and accessible DNA).
(c) KAS-seq, ATAC-seq, and KAS-ATAC mitochondrial genome profiles in human GM12878 cells.
(d) Fragment length distribution in biotin-ATAC-seq and KAS-ATAC libraries (GM12878 cells).
(e) Genome-wide TSS metaprofiles for biotin-ATAC-seq and KAS-ATAC libraries (GM12878 cells).
(f,g) Representative genome browser snapshots showing ATAC-seq, KAS-seq, and KAS-ATAC profiles around nuclear loci in GM12878 cells.
% (g) \hl{Reproducibility of technical replicate KAS-ATAC libraries}
% (h) \hl{Estimated library complexity (per 50,000) cells for biotin-ATAC-seq and KAS-ATAC libraries}
}
\label{Fig1}
\end{figure*}

Active \textit{cis}-regulatory elements in most eukaryotes are usually characterized by low levels of nucleosome occupancy, a unique property first appreciated more than four decades ago\cite{Wu1980,Keene1981,McGhe1981}. Therefore, active CREs are found in regions of increased chromatin accessibility. The genome-wide profiling of chromatin accessibility has been a key tool for identifying active regulator elements and assessing their dynamics across time and space. Methods for carrying out this mapping use the preferential cleavage or labeling of physically accessible DNA by various enzymes. Cleavage by DNase I was the first such approach to be widely adopted, initially coupled to microarray readouts\cite{Dorschner2004,Sabo2006} and later combined with high-throughput sequencing in the form of DNase-seq\cite{Crawford2006,Boyle2008,Thurman2012}. An alternative approach has been to use methylation and either short- or long-read readouts to map chromatin accessibility both genome-wide and at a single-molecule level\cite{Kelly2012,Krebs2017,Shipony2018}. The most widely used assay for mapping open chromatin is ATAC-seq\cite{Buenrostro2013}, which is based on the preferential insertion of the hyperactive Tn5 transposase into accessible DNA.

Apart from identifying putative regulatory elements, ATAC-seq can also be used to map nucleosome positioning in the vicinity of CREs\cite{Schep2015}, and both DNase-seq and ATAC-seq can also footprint the interactions between individual transcription factors (TFs) and DNA, as TFs protect DNA from cleavage/insertion\cite{Hesselberth2009,Pique2011,Neph2012a,Neph2012b,Stergachis2014,Vierstra2016,Vierstra2020}.

Gene regulatory inputs in the form of transcription factor occupancy and chromatin remodeler activity are integrated into changes in transcriptional activity at promoters. Therefore, mapping active transcription has also long been a key objective (contrasted with conventional RNA-seq profiling of steady-state polyadenylated transcripts\cite{Mortazavi2008,Nagalakshmi2008,Wilhelm2008,Sultan2008}, the levels of which reflect several additional layers of regulation of RNA stability). 

\end{multicols}

\begin{FPfigure}
\begin{center}
\includegraphics[width=17.5cm]{Fig2-differences.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf The single-stranded accessible chromatin landscape is distinct from total open chromatin genomic maps}. 
(a) DESeq2 analysis of GM12878 ATAC and KAS-ATAC libraries.
(b) DESeq2 analysis of HEK293 ATAC and KAS-ATAC libraries.
(c) Enrichment of KAS-ATAC/ATAC differential peaks over chromHMM chromatin states. Enrichment is calculated as the $log_2$ ratio of the observed versus expected fractions of peaks overlapping each state (based on the overlap of the total set of peaks with chromHMM states).
(d) ATAC-seq and KAS-ATAC profiles around promoter-proximal and distal peaks (GM12878 cells).
(e) ATAC-seq and KAS-ATAC profiles around occupied CTCF sites in GM12878 cells (excluding promoter-proximal CTCF peaks; CTCF ChIP-seq peak calls were obtained from ENCODE accession ENCFF827JRI).
(f) ATAC-seq, KAS-ATAC, and KAS-ATAC/ATAC differential V-plot\cite{Henikoff2011} around occupied CTCF sites.
(g) ATAC-seq, KAS-ATAC, and KAS-ATAC/ATAC differential V-plot\cite{Henikoff2011} around RAMPAGE\cite{Batut2013} peaks in promoter and in distal CRE regions (RAMPAGE peaks were obtained from ENCODE accession ID ENCSR000AEI).
}
\label{Fig2}
\end{FPfigure}

\begin{multicols}{2}

To this end, several methods for mapping nascent transcription, based on adaptations of the nuclear run-on techniques or capturing RNA polymerase molecules associated with DNA, such as NET-seq\cite{Churchman2011}, GRO-seq\cite{Core2008,Core2012}, PRO-seq\cite{Kwak2013}, and variations of\cite{Tome2018} have been developed. However, these methods often involve very elaborate experimental protocols and thus their deployment has been relatively limited. The recently developed KAS-seq assay\cite{KAS}, by contrast, provides a straightforward method for identifying RNA polymerase-associated DNA based on the highly specific covalent labeling of unpaired single-stranded guanine residues with N$_3$-kethoxal (Figure \ref{Fig1}a); N$_3$-kethoxal adducts can subsequently be subjected to a click reaction and the addition of biotin, which is then used to specifically pull down labeled DNA fragments. Most ssDNA in the genome is found in the context of the transcriptional bubbles of elongating and paused RNA polymerases, with the rest arising from active replication and secondary structures such as G-quadruplexes\cite{Bochman2012}.

Unlike RNA-based methods for mapping RNA polymerase activity, \hl{and unlike previous methods for mapping ssDNA, such as KMnO$_4$ footprinting}\cite{Hayatsu1967,Rubin1980,Sasse-Dwight1989,Mueller1989,Kouzine2013,Kouzine2017}, KAS-seq provides the unique opportunity to directly map the relationship between chromatin accessibility and polymerase activity (and potentially other ssDNA structures), i.e. to what extent are various active CREs subject to active transcription, what nucleosomal configurations is this activity associated with, and what transcription factors may be enriched or depleted in terms of their physical associations with DNA on actively transcribed DNA fragments, as kethoxal labels the same DNA molecule that accessibility is measured on rather than a separate RNA one. 

To address these questions, we developed KAS-ATAC, a method for jointly profiling ssDNA and chromatin accessibility (Figure \ref{Fig1}b). We applied KAS-ATAC together with appropriate controls to evaluate the relative degree of transcription in different classes of regulatory elements and chromatin states in the human genome as well as to estimate the absolute abundance of transcribed and accessible/single-stranded structures. In addition, we map nucleosomal and subnucleosomal structures associated with active transcription and comprehensively measure TF footprints on polymerase-bound DNA molecules.

\section*{Results}

\subsection*{KAS-ATAC maps identify genomic DNA fragments that are simultaneously accessible and contain single-stranded DNA}

We developed KAS-ATAC by taking advantage of the complementary properties of the Tn5 transposase, which preferentially inserts into open chromatin, and the N$_3$-kethoxal compound, which covalently attaches to unpaired G bases in ssDNA with very high specificity (Figure \ref{Fig1}a). N$_3$-kethoxal adducts can then be subjected to click chemistry and biotin addition, and DNA fragments thus labeled can be stringently purified using streptavidin pulldown. To capture DNA fragments that are simultaneously accessible and containing ssDNA, we combined these two properties into KAS-ATAC as follows (Figure \ref{Fig1}b). First, cells are subjected to N$_3$-kethoxal treatment to label ssDNA, after which the kethoxal is washed away. Next, cells are immediately processed through the standard ATAC-seq protocol, with Tn5 inserting into open chromatin regions. DNA is then isolated, and a click reaction is carried out, followed by streptavidin pull-down and library amplification. To control for sample loss and other potential biases associated with the pulldown procedure, we generated not just paired regular ATAC-seq libraries (from kethoxal-treated cells, but without a click reaction and biotin pulldown, in order to control for any effects of the kethoxal treatment), but also (for some key experiments) a ``biotin-ATAC'' control, in which no kethoxal labeling was carried out, but a transposase carrying biotinylated adapters was used, then these fragments were pulled down following the same procedure that was applied for KAS-ATAC (Figure \ref{Fig1}b).

We carried out KAS-ATAC (together with parallel KAS-seq and ATAC-seq) in GM12878 lymphoblastoid cells and in HEK293 embryonic kidney cells as these have long been among the main cell lines of the ENCODE Project Consortium\cite{ENCODE2012,ENCODE2020} and a wealth of other functional genomic data exists for them. Because we expected to recover less DNA than for regular ATAC libraries given that KAS-ATAC captures only a subset of the accessible genome and that the traditional ATAC-seq protocol only uses 50,000 cells as input, we pool at least two separate transposition reactions of 50,000 cells each for a given KAS-ATAC library.

As an initial assessment of the content of KAS-ATAC libraries, we examined KAS-ATAC, ATAC-seq and KAS-seq profiles over the mitochondrial genome (Figure \ref{Fig1}c; Supplementary Figure \ref{FigS1}); this was for two reasons: first, the well-known high abundance of mitochondrial reads in ATAC libraries due to the absence of nucleosomes in the mitochondrial nucleoid, leaving it highly susceptible to Tn5 insertion\cite{Buenrostro2013,Marinov2014}, and second, that the mitochondrial genome contains the most abundant ssDNA structure in cells -- the D-loop in its non-coding region\cite{Shadel1997}. The D-loop is the major site of replication and transcription initiation in mitochondria\cite{Cantatore1980,Montoya1982} and exists as a triple-stranded structure, i.e. one of these three strands has numerous unpaired guanines and is accordingly highly enriched in KAS-seq libraries. We observe that while regular ATAC-seq libraries are slightly depleted over the D-loop, KAS-ATAC libraries are highly enriched over this region of chrM, with a broadly similar, but also distinct in some places profile compared to KAS-seq libraries. Thus, we conclude that KAS-ATAC indeed captures DNA fragments that are both accessible and single-stranded. Based on KAS-ATAC and ATAC-seq coverage over the D-loop, we estimate that we obtain at least 30$\times$ enrichment in KAS-ATAC over the transposition background. The overall fraction of reads originating from mitochondria was similar between all our matching KAS-ATAC and ATAC control libraries (Supplementary Figure \ref{FigS8}).

Next, we examined the broad properties of KAS-ATAC datasets in the nuclear genome. Typical ATAC-seq libraries exhibit a characteristic periodic fragment length distribution\cite{Buenrostro2013}, featuring a prominent subnucleosomal peak, corresponding to DNA fragments originating from within CREs, a second peak of size roughly that of the protection footprint of single nucleosomes in the immediate vicinity of CREs (as well as less frequent transposition events throughout the genome), a third dinucleosomal peak, and so on. We observe a similar picture in KAS-ATAC libraries (Figure \ref{Fig1}d); however, while in ATAC-seq libraries the subnucleosomal peak is often (though not always) the strongest, all KAS-ATAC libraries we have examined feature relatively more prominent nucleosomal peaks (Supplementary Figure \ref{FigS7}). This is to a certain extent counterintuitive and unexpected, given that the strongest KAS-seq peaks are those in the promoter regions of genes, corresponding to paused and initiating RNA polymerases; therefore, one may expect subnucleosomal fragments to dominate KAS-ATAC libraries. However, on the other hand much of ssDNA globally originates from transcribed intragenic regions, where nucleosomes are mostly intact. Indeed, we compared fragment length distributions across ENCODE chromatin states (annotated by the chromHMM algorithm\cite{Ernst2012}), and we observed that the mononucleosomal peak is most prominent relative to the subnucleosomal one in the actively transcribed intragenic states (e.g. state 5 ``Tx''; Supplementary Figures \ref{FigS5} and \ref{FigS9}). 

KAS-ATAC libraries also display a distinct signature around transcription start sites (TSSs), where a sharper peak is observed right at the TSS in metaprofile plots (Figure \ref{Fig1}e; Supplementary Figure \ref{FigS10}), likely corresponding to fragments associated with initiating/paused RNA polymerase.

\subsection*{The single-stranded accessible chromatin landscape is distinct from total open chromatin genomic maps}

\begin{figure*}
\begin{center}
\begin{minipage}[c]{0.80\linewidth}
\includegraphics[width=14.5cm]{Fig3-absolute-levels-V2.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.20\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Estimating absolute levels of single-stranded accessible DNA}. 
(a) Estimates for total library complexity in KAS-ATAC and biotin-ATAC HEK293 samples.
(b) Distribution of the ratios between the number of KAS-ATAC molecules and biotin-ATAC molecules for ATAC-seq peaks found in different chromatin states and for CTCF occupancy sites.
}
\label{Fig3}
\end{minipage}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{Fig4-footprints.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Transcription factor footprints in KAS-ATAC and ATAC datasets}. Shown are the aggregate average Tn5 insertion profiles (see Methods for details) around the motifs for the indicated factors that are located within ENCODE ChIP-seq peaks for the TF in the given cell line.
(a) HEK293 CTCF; 
(b) GM12878 ATF3; 
(c) GM12878 NR2F1; 
(d) HEK293 YY1; 
(e) GM12878 USF1; 
(f) GM12878 E4F1; 
(g) GM12878 CREM; 
(h) GM12878 CREB1. 
}
\label{Fig4}
\end{figure*}

We then surveyed local single-stranded accessible profiles around the genome. Figures \ref{Fig1}f-g show representative KAS-seq, KAS-ATAC and ATAC-seq snapshots in GM12878 cells. Both ATAC-seq and KAS-ATAC exhibit enrichment over promoters and distal regulatory elements in the genome; however, in a number of cases KAS-ATAC peaks over promoter-distal CREs are considerably weaker than ATAC-seq signal over the same regions (e.g. highlighted areas in Figures \ref{Fig1}f-g).

We quantified global differences between ATAC and KAS-ATAC by carrying out differential accessibility analysis using DESeq2\cite{DESeq2} on the set of common peaks called for both assays. In GM12878 cells, 1,948 and 4,641 regions were identified as preferentially enriched in ATAC-seq and KAS-ATAC, respectively, and we identified 2,064 and 10,581 such peaks in HEK293 cells (Figure \ref{Fig2}a-b). We compared these differential regions against chromHMM\cite{Ernst2012} chromatin state annotations for these two cell lines and found that they are not uniformly and randomly distributed across chromatin states (Figure \ref{Fig2}c). KAS-ATAC libraries are relatively enriched over active TSSs (``TssA'' state) and very strongly enriched over transcribed regions (``Tx'' state), as well as over intragenic enhancer regions (``EnhG1/2''). They are depleted over active intergenic enhancers and over weak enhancers (``EnhG1'' and ``EnhWk''), and bivalent TSSs and enhancers. These observations generalized the anecdotal trend observed at individual loci -- accessible intergenic enhancers appear to be less frequently engaged with polymerase bubbles than promoter regions. ATAC-seq and KAS-ATAC profiles around promoters and distal CREs also corroborate this conclusion (Figure \ref{Fig2}d) -- the two assays exhibit broadly similar profiles within promoters but in distal CREs it is often the case that KAS-ATAC signal is weaker.

Next, we turned our attention to the properties of sites occupied by the CTCF protein, which is the main insulator factor in mammalian cells\cite{Bell1999}. We excluded promoter-proximal CTCF sites from our analysis and focused on distal CTCF peaks only. KAS-ATAC signal is notably weaker at CTCF sites than ATAC-seq (Figure \ref{Fig2}e). CTCF also has a very strong effect on nucleosome positioning through its very robust and persistent occupancy of its own binding sites, often fixing the positions of more than a dozen nucleosomes around it\cite{Fu2008}; CTCF sites thus display a characteristic V-plot\cite{Henikoff2011} signature in ATAC-seq datasets, with a strong subnucleosomal footprint corresponding to the binding of CTCF itself and larger fragments including neighboring nucleosomes surrounding it (Figure \ref{Fig2}f). This signature is also observed in KAS-ATAC data, although with a reduced magnitude. However, it has been previously reported that G-quadruplexes promote CTCF binding in mammalian cells\cite{Wulfridge2023}, and this could account for our observations of relatively weak KAS-ATAC signal in these datasets at CTCF sites. Alternatively, this signal might be the product of background DNA ``breathing'' or DNA replication intermediates that CTCF associates with, as there is only modest enrichment of CTCF sites in regular KAS-seq libraries (Supplementary Figure \ref{FigS15}).

We also analyzed fragment structure around transcription initiation sites. To this end, we took advantage of existing ENCODE TSS annotations for total RNA obtained from RAMPAGE\cite{Batut2013} datasets, which we divided into promoter-proximal and promoter-distal groups, and then analyzed ATAC and KAS-ATAC fragment distributions (in a transcription directionality-aware manner; Figure \ref{Fig2}g). ATAC-seq V-plots around promoters feature a prominent mass of subnucleosomal and some mononucleosomal fragments immediately upstream of the TSS (corresponding to occupancy by regulatory factors, and general transcription factors), another group of subnucleosomal fragments immediately around the TSS (likely corresponding to RNA polymerase initiation complexes), as well as +1 nucleosome and dinucleosomal-fragment masses downstream of the TSS. This signature is preserved in KAS-ATAC, but with important differences -- the upstream subnucleosomal fragments are depleted in KAS-ATAC, while the initiation complex ones are enriched, as is the +1 nucleosome and the downstream dinucleosome. While transcription initiation in mammals has long been demonstrated to be bidirectional\cite{Seila2008,Core2008}, these results suggest \hl{possible} preferential absolute abundance of polymerase bubbles in the primary direction of transcription \hl{(with a cautionary note that the upstream divergent TSS maybe not as well positioned relative to the primary TSS and thus the V-plot signal could be somewhat diluted)}. In addition, engaged polymerase at the promoter and immediately downstream of it is associated with intact nucleosomes. 

In contrast to promoters, transcribed distal CREs exhibit a similar picture in its broad structure in ATAC-seq data, but this signature is very weak in KAS-ATAC, and there is no preferential enrichment of KAS-ATAC fragments downstream of the transcription initiation site.

\hl{We also compared KAS, ATAC and KAS-ATAC datasets to existing Precision Run On (PRO-seq}\cite{Kwak2013})\hl{ maps of active transcription at the RNA level. We observe similar correlation between KAS and KAS-ATAC and PRO-seq, and better correlation between KAS/KAS-ATAC and PRO-seq over enhancer elements than over promoters} (Supplementary Figure \ref{FigS17}). \hl{Finally, we compared KAS, ATAC and KAS-ATAC in the context of the Activity-By-Contact (ABC) model\cite{Fulco2019} relating biochemical activity at enhancer elements with promoter output; we observe similar predictive performance for the different modalities in terms of their ABC scores} (Supplementary Figure \ref{FigS18}).

\subsection*{Estimating the absolute abundance of accessible ssDNA}

KAS-ATAC also offers an opportunity to address the question of how often accessible DNA fragments contain ssDNA inside them in absolute terms. To answer this question, we used a paired biotin-ATAC dataset so that we could control as closely as possible for the inevitable loss of material during sample handling and to obtain comparable in their absolute molecular content ATAC and KAS-ATAC libraries. To maximize sample recovery, we also combined 12 normal ATAC samples (of 50,000 cells each, for a total of 600,000 cells input) into each of these libraries, which were generated in HEK293 cells. We sequenced them to saturation, and estimated the total library complexity using the \verb|preseq| algorithm\cite{Daley2013} (Figure \ref{Fig3}a). As expected, the total molecular complexity of KAS-ATAC libraries is lower than that of ATAC-seq ones, by $\sim$50\%.

We used the total molecular complexity of the KAS-ATAC and biotin-ATAC samples to calculate the total number of fragments (that was derived from $\sim$600,000 cells) for each ATAC-seq peak found in a given chromHMMM state as well as for CTCF peaks, and then estimated the ratio between the two to evaluate how often accessible chromatin regions are also associated with ssDNA (Figure \ref{Fig3}b; Supplementary Figure \ref{FigS6}). The median estimate we obtain for active TSSs is that $\sim$45\% of ATAC fragments contain ssDNA. The highest such estimates are for peaks in the actively transcribed (``Tx'') and intragenic enhancer (``EnhG1'') states, at $\sim$50\%, but also often approaching 100\%. In contrast, for active intergenic enhancers the median fraction of ssDNA fragments is in the 20-25\% range (``EnhA1/A2''). For CTCF sites the median estimate is $\sim$30\%. These results \hl{should be taken with some caution because of the possibility of capturing some baseline DNA ``breathing'' during kethoxal labeling, but overall they} corroborate the observation of lower levels of transcription at intergenic enhancers, and of the presence of ssDNA structures at CTCF sites, while also quantifying their abundance in cells.

\subsection*{Transcription factor footprints in KAS-ATAC}

Finally, we asked whether transcription factors are associated with DNA in a similar way regardless of whether transcription (or other processes generating ssDNA) are occurring in the immediate vicinity. To this end, we calculated aggregate transcription factor footprints over TF motifs within ChIP-seq peaks for each TF in GM12878 and HEK293 cells, including all such TFs for which ChIP-seq data is available from the ENCODE Project Consortium\cite{ENCODE2012} and for which motifs are annotated in the CIS-BP database\cite{CIS-BP}. We applied bias-correction utilizing bias models trained using the ChromBPNet framework (see the Methods section for detail), as Tn5 insertion bias is a well-documented confounding issue in ATAC-seq datasets\cite{Sung2016,Zhang2021,Li2019,Baek2017,Bentsen2020}.

For most TFs we find similar depth of footprints in both ATAC and KAS-ATAC datasets (Figure \ref{Fig4}a-d; Supplementary Figures \ref{FigS16}--\ref{FigS14}). For example, CTCF footprints in HEK293 cells are largely the same in both the total ATAC library and within ssDNA-containing fragments (Figure \ref{Fig4}a), with concordant footprint depths and fine-scale features. Analogous conclusions apply to factors such as the bZIP family factor ATF3 (Figure \ref{Fig4}b), the nuclear receptor NR2F1 (Figure \ref{Fig4}c), the YY1 zing finger TF (Figure \ref{Fig4}d), and most others (Supplementary Figures \ref{FigS11}--\ref{FigS14}). Thus, most transcription factors appear to physically associate with DNA fragments where active transcription is ongoing in the immediate vicinity.

However, we also find several TFs for which strong footprints are only observed in ATAC-seq libraries and not in KAS-ATAC. Examples include the bHLH TF USF1 (Figure \ref{Fig4}e), the zinc finger E4F1 factor (Figure \ref{Fig4}f), and the bZIP family TFs CREM and CREB1  (Figure \ref{Fig4}g-h). These are factors which appear to preferentially associate with DNA fragments that are not actively undergoing transcription, although the precise mechanisms underlying this phenomenon are still unclear for each individual case.

\section*{Summary}

In this work, we describe the KAS-ATAC assay, which we developed to identify the subset of physically accessible DNA fragments in the genome that also contain embedded ssDNA structures, the bulk of which is the result of the opening of the DNA double strand associated with engaged RNA polymerase molecules. We use KAS-ATAC to answer several questions regarding the relationship between active transcription/ssDNA formation and fine-scale chromatin architecture. 

We show that the genomic regions that are jointly accessible and containing ssDNA are preferentially located in promoters and inside gene bodies. In comparison, distal enhancers, although often comparably accessible to promoters, are considerably less frequently transcribed. These two classes of elements are also characterized by subtle differences in their subnucleosomal and nucleosomal organization as it relates to the act of transcription. 

We evaluate the absolute fraction of accessible DNA that is engaged in transcription for different classes of regulatory elements, estimating that up to half of fragments within promoters contain ssDNA, while in distal enhancers this fraction is less than a quarter of fragments \hl{(with a cautionary note about the possibility of DNA breathing contributing some of these percentages, i.e. they are upper bounds)}. 

Finally, we demonstrate that transcription factors physically associate with actively transcribed DNA, and most of them do so with comparable frequency to non-ssDNA-containing DNA. However, a number of TFs are in fact depleted in KAS-ATAC libraries and are preferentially associated with non-ssDNA DNA fragments.

We expect KAS-ATAC to be a useful tool for further future exploration of the detailed relationship between transcription/ssDNA structures and the physical organization of chromatin in many other context, as will the combination of kethoxal labeling and other molecular readouts of the chromatin structure.


\section*{Methods}

\subsection*{Cell culture}

GM12878 and HEK293T cells were grown using the standard protocols for each cell lines -- RPMI 1640, 2 mM L-glutamine, 15\% fetal bovine serum and 1\% penicillin-streptomycin for GM12878 cells, and DMEM + sodium pyruvate, 6 mM L-glutamine, 10\% fetal bovine serum and 1\% penicillin-streptomycin for HEK293T cells.

\subsection*{KAS-seq}

KAS-seq experiments were executed as previously described\cite{Marinov2023}. 

For HEK293 cells, media was removed and room-temperature 1$\times$ PBS was used to wash them, then they were dissociated with trypsin, which was quenched with media. Cells were pelleted at room temperature, and then resuspended in 500 $\mu$L of media supplemented with 5 mM N$_3$-kethoxal (final concentration). GM12878 cells were pelleted down by centrifugation at room temperature and also resuspended in 500 $\mu$L of media supplemented with 5 mM N$_3$-kethoxal (final concentration). Cells were incubated for 10 minutes at 37$\,^{\circ}\mathrm{C}$ with shaking at 500 rpm in a Thermomixer, then pelleted by centrifuging at 500 $g$ for 5 minutes at 4$\,^{\circ}\mathrm{C}$. Genomic DNA was extracted using the Monarch gDNA Purification Kit (NEB T3010S) following the standard protocol but with elution using 175 $\mu$L 25 mM K$_3$BO$_3$ at pH 7.0. 

The click reaction was carried out by combining transposed DNA, 5 $\mu$L 20 mM DBCO-PEG4-biotin (DMSO solution, Sigma 760749), 20 mM K$_3$BO$_3$ (pH 7.0) and 20 $\mu$L 10$\times$ PBS in a final volume of 200 $\mu$L. 

Biotinylated DNA was purified using AMPure XP beads and sheared on a Covaris E220 instrument down to $\sim$150-200 bp size.

For streptavidin pulldown of biotin-labeled DNA, 10 $\mu$L of 10 mg/mL Dynabeads MyOne Streptavidin T1 beads (Life Technologies, 65602) were separated on a magnetic stand, then washed with 300 $\mu$L of 1$\times$ TWB (Tween Washing Buffer; 5 mM Tris-HCl pH 7.5; 0.5 mM EDTA; 1 M NaCl; 0.05\% Tween 20). The beads were resuspended in 300 $\mu$L of 2$\times$ Binding Buffer (10 mM Tris-HCl pH 7.5, 1 mM EDTA; 2 M NaCl), the sonicated DNA was added (diluted to a final volume of 300 $\mu$L if necessary), and the beads were incubated for $\geq$15 minutes at room temperature on a rotator. After separation on a magnetic stand, the beads were washed with 300 $\mu$L of 1$\times$ TWB, and heated at 55$\,^{\circ}\mathrm{C}$ in a Thermomixer with shaking for 2 minutes. After removal of the supernatant on a magnetic stand, the TWB wash and 55$\,^{\circ}\mathrm{C}$ incubation were repeated. 

Final libraries were prepared on beads using the NEBNext Ultra II DNA Library Prep Kit (NEB, $\#$E7645) as follows. End repair was carried out by resuspending beads in 50 $\mu$L 1$\times$ EB buffer, and adding 3 $\mu$L NEB Ultra End Repair Enzyme and 7 $\mu$L NEB Ultra End Repair Enzyme, followed by incubation at 20$\,^{\circ}\mathrm{C}$ for 30 minutes (in a Thermomixer, with shaking at 1,000 rpm) and then at 65$\,^{\circ}\mathrm{C}$ for 30 minutes. 

Adapters were ligated to DNA fragments by adding 30 $\mu$L Blunt Ligation mix, 1 $\mu$L Ligation Enhancer and 2.5 $\mu$L NEB Adapter, incubating at 20$\,^{\circ}\mathrm{C}$ for 20 minutes, adding 3 $\mu$L USER enzyme, and incubating at 37$\,^{\circ}\mathrm{C}$ for 15 minutes (in a Thermomixer, with shaking at 1,000 rpm) . 

Beads were then separated on a magnetic stand, and washed with 300 $\mu$L TWB for 2 minutes at 55$\,^{\circ}\mathrm{C}$, 1000 rpm in a Thermomixer. After separation on a magnetic stand, beads were washed in 100 $\mu$L 0.1 $\times$ TE buffer, then resuspended in 15 $\mu$L 0.1 $\times$ TE buffer, and heated at 98$\,^{\circ}\mathrm{C}$ for 10 minutes. 

For PCR, 5 $\mu$L of each of the i5 and i7 NEB Next sequencing adapters were added together with 25 $\mu$L 2$\times$ NEB Ultra PCR Mater Mix. PCR was carried out with a 98$\,^{\circ}\mathrm{C}$ incubation for 30 seconds and 12 cycles of 98$\,^{\circ}\mathrm{C}$ for 10 seconds, 65$\,^{\circ}\mathrm{C}$ for 30 seconds, and 72$\,^{\circ}\mathrm{C}$ for 1 minute, followed by incubation at 72$\,^{\circ}\mathrm{C}$ for 5 minutes. 

Beads were separated on a magnetic stand, and the supernatant was cleaned up using 1.8$\times$ AMPure XP beads. 

Libraries were sequenced in a paired-end format on an Illumina NextSeq instrument using NextSeq 500/550 high output kits (2$\times$38 cycles).

\subsection*{Tn5 production}

Tn5 transposase was produced as previously described\cite{Picelli2014}, with the following modifications of the adapter oligos for biotin-ATAC experiments:

Tn5MErev:

\begin{verbatim}
/5Phos/CTGTCTCTTATACACATCT
\end{verbatim}

Tn5ME-A:

\begin{verbatim}
/5Biotin/TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG
\end{verbatim}

Tn5ME-B:

\begin{verbatim}
/5Biotin/GTCTCGTGGGCTCGGAGATGTGTATAAGAGACAG
\end{verbatim}

% \hl{XXX DESCRIBE DETAILS XXXXX}

% \hl{XXX LIST EXACT POSITION OF BIOTIN XXXXX}

\subsection*{ATAC-seq experiments}

ATAC-seq experiments were carried out following the Omni-ATAC protocol as previously described \cite{Corces2017,Marinov2023ATAC}. Briefly, $\sim$50,000 cells (with or without prior kethoxal treatment) were centrifuged at 500 $g$, then resuspended in 500 $\mu$L 1$\times$ PBS and centrifuged again. Cells were then resuspended in 50 $\mu$L ATAC-RSB-Lysis buffer (10 mM Tris-HCl pH 7.4, 10 mM NaCl, 3 mM MgCl$_2$, 0.1\% IGEPAL CA-630, 0.1\% Tween-20, 0.01\% Digitonin) and incubated on ice for 3 minutes. Subsequently 1 mL ATAC-RSB-Wash buffer (10 mM Tris-HCl pH 7.4, 10 mM NaCl, 3 mM MgCl$_2$, 0.1\% Tween-20, 0.01\% Digitonin) were added, the tubes were inverted several times, and nuclei were centrifuged at 500 $g$ for 5 min at $4\,^{\circ}\mathrm{C}$. 

Transposition was carried out by resuspending nuclei in a mix of 25 $\mu$L 2$\times$ TD buffer (20 mM Tris-HCl pH 7.6, 10 mM MgCl$_2$, 20\% Dimethyl Formamide), 2.5 $\mu$L Tn5 transposase (custom produced) and 22.5 $\mu$L nuclease-free H$_2$O, and incubating at $37\,^{\circ}\mathrm{C}$ for 30 min in a Thermomixer at 1000 RPM. 

Transposed DNA was isolated using the MinElute PCR Purification Kit (Qiagen Cat\# 28004/28006), and PCR amplified as previously described\cite{Corces2017,Marinov2023ATAC} ($72\,^{\circ}\mathrm{C}$ for 3 minutes, $98\,^{\circ}\mathrm{C}$ for 30 seconds, 10 cycles of $98\,^{\circ}\mathrm{C}$ for 10 seconds, $63\,^{\circ}\mathrm{C}$ for 30 seconds, $72\,^{\circ}\mathrm{C}$ for 30 seconds). Libraries were purified using the MinElute kit, then sequenced on an Illumina NextSeq 550 as 2$\times$38mers.

\subsection*{KAS-ATAC experiments}

For KAS-ATAC experiments, cells were processed and subjected to N$_3$-kethoxal treatment as described above for KAS-seq.

Subsequently they were washed by centrifugation and removal of media, resuspension in 500 $\mu$L 1$\times$ PBS, and centrifugation. The ATAC-seq step was then carried out, by resuspending cells in 50 $\mu$L ATAC-RSB-Lysis buffer, incubation on ice for 3 minutes, addition of 1 mL ATAC-RSB-Wash buffer, centrifugation at 500 $g$ for 5 min at $4\,^{\circ}\mathrm{C}$, and transposition in a total volume of 50 $\mu$L. DNA purification was carried out using the MinElute PCR Purification Kit.

Multiple parallel ATAC reactions were carried out, including $\sim$50,000 cells each, then were pooled together for the click reaction and biotin pull down.

Click reaction was carried out as described above, by combining 175 $\mu$L  transposed DNA, 5 $\mu$L 20 mM DBCO-PEG4-biotin (DMSO solution, Sigma 760749), 20 mM K$_3$BO$_3$ (pH 7.0) and 20 $\mu$L 10$\times$ PBS in a final volume of 200 $\mu$L. Biotinylated DNA was purified using the MinElute PCR Purification Kit.

For streptavidin pulldown of biotin-labeled DNA, 10 $\mu$L of 10 mg/mL Dynabeads MyOne Streptavidin T1 beads (Life Technologies, 65602) were separated on a magnetic stand, then washed with 300 $\mu$L of 1$\times$ TWB (Tween Washing Buffer; 5 mM Tris-HCl pH 7.5; 0.5 mM EDTA; 1 M NaCl; 0.05\% Tween 20). The beads were resuspended in 300 $\mu$L of 2$\times$ Binding Buffer (10 mM Tris-HCl pH 7.5, 1 mM EDTA; 2 M NaCl), the biotinylated DNA was added (diluted to a final volume of 300 $\mu$L if necessary), and the beads were incubated for $\geq$15 minutes at room temperature on a rotator. After separation on a magnetic stand, the beads were washed with 300 $\mu$L of 1$\times$ TWB, and heated at 55$\,^{\circ}\mathrm{C}$ in a Thermomixer with shaking for 2 minutes. After removal of the supernatant on a magnetic stand, the TWB wash and 55$\,^{\circ}\mathrm{C}$ incubation were repeated. 

Libraries were prepared on beads using the same settings as described above ($72\,^{\circ}\mathrm{C}$ for 3 minutes, $98\,^{\circ}\mathrm{C}$ for 30 seconds, 10 cycles of $98\,^{\circ}\mathrm{C}$ for 10 seconds, $63\,^{\circ}\mathrm{C}$ for 30 seconds, $72\,^{\circ}\mathrm{C}$ for 30 seconds). Beads were separated on a magnetic stand, and the supernatant was cleaned up using the MinElute PCR Purification Kit.

\subsection*{Biotin-ATAC experiments}

ATAC-seq experiments were carried out as described above but using a custom-produced biotinylated Tn5 transposase to tagment chromatin. 

Purified transposed DNA was then subjected to streptavidin pulldown and PCR amplification on beads as described above.

\subsection*{ATAC-seq, KAS-seq, and KAS-ATAC data processing}

Computational processing was carried out as previously described\cite{Marinov2023,Marinov2021}

Demultipexed FASTQ files were mapped to the \verb|hg38| assembly of the human genome as 2$\times$36mers using Bowtie\cite{Bowtie2009} with the following settings: \verb|-v 2| \verb|-k 2| \verb|-m 1| \verb|--best| \verb|--strata|. Mitochondrial mapping reads were filtered out and duplicate reads were removed using \verb|picard|\verb|-tools| (version 1.99). 

Reads were also mapped separately to the mitochondrial genome using the following settings: \verb|-v 2| \verb|-a| \verb|--best| \verb|--strata|, both in order to estimate the extent of mitochondrial contamination and to generate mitochondrial genome coverage tracks.

Browser tracks generation, fragment length estimation, TSS enrichment calculations, and other analyses were carried out using custom-written Python scripts\burl{https://github.com/georgimarinov/GeorgiScripts}. The refSeq annotation was used for evaluation of enrichment around TSSs.

Peak calling was carried out using MACS2\cite{MACS2}, with the following settings: \verb|-g| \verb|hs| \verb|-f| \verb|BAMPE| \verb|--to-large| \verb|--keep-dup| \verb|all| \verb|--nomodel|. 

Absolute library complexity was estimated using the \verb|lc_extrap| function in the \verb|preseq| package\cite{Daley2013}.

For differential accessibility analysis, a set of unified peaks was compiled across all libraries and conditions being compared, and read counts were calculated for each peak. Differential analysis was the carried out using DESeq2\cite{DESeq2}.

\subsection*{Transcription factor motif analyses}

As a first step, motif locations for all human TF motifs in the CIS-BP database\cite{CIS-BP} were identified genome-wide using FIMO\cite{FIMO}. These maps were then intersected with ChIP-seq peaks from the ENCODE Project Consortium (ENCODE IDs: GM12878 ARID3A: ENCFF531CLQ; GM12878 ARNT: ENCFF596RIT; GM12878 ATF3: ENCFF882AEU; GM12878 ATF7: ENCFF277FJJ; GM12878 BACH1: ENCFF631RZH; GM12878 BATF: ENCFF728KFD; GM12878 BCL11A: ENCFF824QXX; GM12878 BHLHE40: ENCFF445XYV; GM12878 BRCA1: ENCFF303GOQ; GM12878 CBFB: ENCFF876CUT; GM12878 CEBPB: ENCFF864EON; GM12878 CEBPZ: ENCFF639KLW; GM12878 CREB1: ENCFF053UZX; GM12878 CREM: ENCFF294IGP; GM12878 CTCF: ENCFF796WRU; GM12878 CUX1: ENCFF451AII; GM12878 E2F4: ENCFF744QAC; GM12878 E2F8: ENCFF961ZFW; GM12878 E4F1: ENCFF855PXT; GM12878 EBF1: ENCFF895MHN; GM12878 EGR1: ENCFF612EIU; GM12878 ELF1: ENCFF146SYU; GM12878 ELK1: ENCFF164MPE; GM12878 ESRRA: ENCFF660PIB; GM12878 ETS1: ENCFF568AZT; GM12878 ETV6: ENCFF151UJT; GM12878 FOS: ENCFF571DGT; GM12878 FOXM1: ENCFF549GKZ; GM12878 GABPA: ENCFF093KLR; GM12878 HSF1: ENCFF663ODL; GM12878 IRF3: ENCFF785MSW; GM12878 IRF4: ENCFF113VGD; GM12878 IRF5: ENCFF478SRO; GM12878 JUNB: ENCFF912OPT; GM12878 JUND: ENCFF023QWA; GM12878 KLF5: ENCFF992IBT; GM12878 MAFK: ENCFF436SJS; GM12878 MAX: ENCFF361EVH; GM12878 MAZ: ENCFF250FJC; GM12878 MEF2A: ENCFF826GQU; GM12878 MEF2B: ENCFF884QQW; GM12878 MEF2C: ENCFF238UKB; GM12878 MXI1: ENCFF376AEL; GM12878 MYB: ENCFF705CGM; GM12878 MYC: ENCFF214XPD; GM12878 NFATC1: ENCFF172KBM; GM12878 NFATC3: ENCFF002XEC; GM12878 NFYA: ENCFF937QTP; GM12878 NFYB: ENCFF156MUM; GM12878 NR2C1: ENCFF626EEU; GM12878 NR2C2: ENCFF782KRV; GM12878 NR2F1: ENCFF118UKC; GM12878 NRF1: ENCFF864EBU; GM12878 PAX5: ENCFF192WNJ; GM12878 PAX8: ENCFF934NVD; GM12878 PBX3: ENCFF402DQD; GM12878 PKNOX1: ENCFF946JFA; GM12878 POU2F2: ENCFF934JFA; GM12878 RELA: ENCFF141SMI; GM12878 RELB: ENCFF968MBN; GM12878 REST: ENCFF262MRD; GM12878 RFX5: ENCFF601WHE; GM12878 RUNX3: ENCFF346JDW; GM12878 RXRA: ENCFF063KLF; GM12878 SIX5: ENCFF878FBM; GM12878 SMAD1: ENCFF033MGN; GM12878 SP1: ENCFF038AVV; GM12878 SPI1: ENCFF492ZRZ; GM12878 SREBF1: ENCFF397YRO; GM12878 SREBF2: ENCFF896MPE; GM12878 SRF: ENCFF089YRX; GM12878 STAT1: ENCFF896SEI; GM12878 STAT3: ENCFF029PAA; GM12878 STAT5A: ENCFF006GSY; GM12878 TBX21: ENCFF170IZO; GM12878 TCF12: ENCFF068YYR; GM12878 TCF3: ENCFF700TAS; GM12878 TCF7: ENCFF900GTF; GM12878 USF1: ENCFF879TPT; GM12878 USF2: ENCFF196JQW; GM12878 YBX1: ENCFF488IGF; GM12878 YY1: ENCFF258UEW; GM12878 ZBED1: ENCFF753UAX; GM12878 ZBTB33: ENCFF845VSZ; GM12878 ZBTB4: ENCFF880BEK; GM12878 ZEB1: ENCFF703XCL; GM12878 ZNF274: ENCFF034UEJ; GM12878 ZNF384: ENCFF955FRU; HEK293 BCL11A: ENCFF972WFM; HEK293 BCL6B: ENCFF884UOX; HEK293 CTCF: ENCFF071FSR; HEK293 EGR2: ENCFF501WRO; HEK293 ELK4: ENCFF226QHL; HEK293 GFI1B: ENCFF199YAZ; HEK293 GLI2: ENCFF576SDC; HEK293 GLIS1: ENCFF628APG; HEK293 GLIS2: ENCFF311QUH; HEK293 HIC1: ENCFF979QGX; HEK293 HOXA7: ENCFF521UKW; HEK293 HOXB7: ENCFF610SSK; HEK293 HOXC10: ENCFF722GPK; HEK293 HOXD13: ENCFF223BOC; HEK293 KLF16: ENCFF585OJO; HEK293 KLF1: ENCFF103JPK; HEK293 KLF8: ENCFF790TTV; HEK293 MAZ: ENCFF529XTF; HEK293 MEIS1: ENCFF628KKY; HEK293 MZF1: ENCFF669FUB; HEK293 PBX3: ENCFF851ZEB; HEK293 PRDM1: ENCFF753PSU; HEK293 PRDM4: ENCFF351OYP; HEK293 REST: ENCFF437IAR; HEK293 SCRT1: ENCFF424QVD; HEK293 SCRT2: ENCFF916ITG; HEK293 SP2: ENCFF444VWF; HEK293 SP3: ENCFF663PCO; HEK293 TCF7L2: ENCFF136SOA; HEK293 WT1: ENCFF006ZKN; HEK293 YY1: ENCFF430PDB; HEK293 YY2: ENCFF755NLD; HEK293 ZBTB49: ENCFF066ZQN; HEK293 ZBTB6: ENCFF139IGE; HEK293 ZBTB7A: ENCFF114DGV; HEK293 ZEB1: ENCFF079WZH; HEK293 ZFHX2: ENCFF153XOB; HEK293 ZIC2: ENCFF359GIY; HEK293 ZNF148: ENCFF410SFG; HEK293 ZNF263: ENCFF108NXV; HEK293 ZNF274: ENCFF315BAF; HEK293 ZNF350: ENCFF064ZJO; HEK293 ZNF423: ENCFF222HPT; HEK293 ZSCAN16: ENCFF180POM; HEK293 ZSCAN4: ENCFF952OOU). 

Aggregate footprints were generated for each TF by calculating the average number of Tn5 insertions (shifted by +4/-4 bp) normalized to RPM (Read Per Million). Bias correction was carried out using a genome-wide bias prediction track for a ChromBPNet model (\burl{https://github.com/kundajelab/chrombpnet}) trained on the very deeply sequenced main ENCODE ATAC-seq dataset for GM12878 cells by subtracting the rescaled predicted bias profile from the observed ATAC/KAS-ATAC insertion profile.

\section*{Data availability}

Sequencing reads for libraries included in this manuscript have been submitted to the Gene Expression Omnibus and are available under GEO accession GSE264534.

\section*{Acknowledgments}

% This work was supported by NIH grants (P50HG007735, RO1 HG008140, U19AI057266 and UM1HG009442 to W.J.G., 1UM1HG009436 to W.J.G. and A.K., 1DP2OD022870-01 and 1U01HG009431 to A.K., the Rita Allen Foundation (to W.J.G.), the Baxter Foundation Faculty Scholar Grant, and the Human Frontiers Science Program grant RGY006S (to W.J.G). W.J.G. also acknowledges support by grants 2017-174468 and 2018-182817 from the Chan Zuckerberg Initiative. S.K. was supported by MSTP training grant T32GM007365 and the Paul and Daisy Soros Fellowship. Fellowship support was also provided by the Stanford School of Medicine Dean's Fellowship (G.K.M.).

This work was supported by NIH grants (P50HG007735, RO1 HG008140, U19AI057266, UM1HG009442 and 1UM1HG009436 to W.J.G., the Rita Allen Foundation (to W.J.G.), the Baxter Foundation Faculty Scholar Grant, and the Human Frontiers Science Program grant RGY006S (to W.J.G). W.J.G. also acknowledges support by grants 2017-174468 and 2018-182817 from the Chan Zuckerberg Initiative. S.H.K. was supported by MSTP training grant T32GM007365 and the Paul and Daisy Soros Fellowship. Fellowship support was also provided by the Stanford School of Medicine Dean's Fellowship (G.K.M.).

The authors would like to thank Anusri Pampari, Zohar Shipony, and Anna Shcherbina for technical assistance, and members of the Greenleaf, and Kundaje labs for helpful discussions and suggestions.

\section*{Author contributions}

G.K.M. and S.H.K. conceived the project. S.H.K. and G.K.M. carried out experiments. G.K.M. analyzed data. G.K.M. and S.H.K. wrote the manuscript with input from all authors.

\section*{Competing interests}

The authors declare no competing interests.

\begin{thebibliography}{100}

\input{references-V2}

\end{thebibliography}

\end{multicols}

\clearpage

\setcounter{table}{0}
\renewcommand{\tablename}{Supplementary Table}
\setcounter{figure}{0}
\renewcommand{\figurename}{Supplementary Figure}

\setcounter{page}{1}
\renewcommand\thepage{{SM }\arabic{page}}

\begin{center}
% {\LARGE \textbf{\begin{spacing}{1.1}XXXX. \\ Supplementary Materials\end{spacing} }}
{\LARGE \textbf{Supplementary Materials}}
\end{center}

\section*{Supplementary Tables}

\begin{center}
\begin{longtable}{m{6.50cm}m{1.60cm}m{1.50cm}m{1.50cm}m{1.5cm}m{0.7cm}m{0.75cm}}
\caption{Mapping statistics for datasets in this study}\\
\hline
Library & Raw fragments & Unique non-chrM reads & Unique non-chrM reads after dedup & Complexity & chrM fraction & TSS ratio \\
\hline
\endfirsthead
\multicolumn{7}{c}%
{\tablename\ \thetable\ -- \textit{Continued from previous page}} \\
\hline
Library & Raw fragments & Unique non-chrM reads & Unique non-chrM reads after dedup & Complexity & chrM fraction & TSS ratio \\
\hline
\endhead
\hline \multicolumn{7}{r}{\textit{Continued on next page}} \\
\endfoot
\hline
\endlastfoot
SK-L001-GM12878-ATAC-1 & 25,204,077 & 39,693,750 & 33,191,786 & 0.79 & 0.22 & 9.23 \\
SK-L002-GM12878-ATAC-2 & 21,019,294 & 32,301,936 & 24,852,942 & 0.70 & 0.32 & 12.80 \\
SK-L005-GM12878-ATAC\_Kethoxal-1 & 26,967,412 & 42,349,494 & 34,720,194 & 0.76 & 0.22 & 11.92 \\
SK-L006-GM12878-ATAC\_Kethoxal-2 & 28,083,530 & 43,996,380 & 35,745,648 & 0.75 & 0.23 & 12.35 \\
SK-L007-GM12878-ATAC\_Kethoxal\_biotin\_pulldown-1 & 30,391,595 & 48,649,648 & 3,367,986 & 0.06 & 0.24 & 9.84 \\
SK-L008-GM12878-ATAC\_Kethoxal\_biotin\_pulldown-2 & 36,320,602 & 58,638,602 & 3,595,220 & 0.05 & 0.25 & 8.92 \\
SK-L009-GM12878-ATAC\_Kethoxal\_input-1 & 13,002,248 & 20,324,336 & 7,296,094 & 0.33 & 0.24 & 13.81 \\
SK-L010-GM12878-ATAC\_Kethoxal\_input-2 & 11,984,129 & 18,941,686 & 8,168,106 & 0.40 & 0.22 & 11.07 \\
SK-L011-HEK293T-ATAC\_Kethoxal-1 & 12,576,111 & 19,216,260 & 14,670,352 & 0.69 & 0.41 & 15.20 \\
SK-L012-HEK293T-ATAC\_Kethoxal-2 & 13,567,064 & 20,905,448 & 16,161,544 & 0.70 & 0.40 & 14.42 \\
SK-L013-HEK293T-ATAC\_Kethoxal\_with\_biotin\_enriched-1 & 46,013,562 & 72,318,598 & 12,153,228 & 0.14 & 0.39 & 11.95 \\
SK-L014-HEK293T-ATAC\_Kethoxal\_with\_biotin\_enriched-2 & 61,750,300 & 97,499,704 & 13,715,198 & 0.12 & 0.39 & 11.04 \\
SK-L015-HEK293T-ATAC\_Kethoxal\_pulldown\_supernatant-1 & 11,382,288 & 17,461,940 & 13,582,016 & 0.70 & 0.40 & 15.20 \\
SK-L016-HEK293T-ATAC\_Kethoxal\_pulldown\_supernatant-2 & 7,632,787 & 11,717,616 & 9,312,228 & 0.72 & 0.41 & 14.96 \\
SK-L017-HEK293T-ATAC\_Kethoxal\_input-1 & 6,738,330 & 10,606,360 & 8,108,918 & 0.69 & 0.36 & 15.09 \\
SK-L018-HEK293T-ATAC\_Kethoxal\_input-2 & 7,382,103 & 11,648,988 & 8,897,006 & 0.70 & 0.35 & 13.88 \\
SK-L019-HEK293T-ATAC-Kethoxal-1 & 5,662,131 & 7,893,954 & 5,029,326 & 0.56 & 0.52 & 16.02 \\
SK-L020-HEK293T-ATAC-Kethoxal-2 & 9,081,011 & 12,362,796 & 6,974,112 & 0.49 & 0.54 & 15.36 \\
SK-L024-HEK293T-Biotin\_ATAC\_600k\_pulldown & 224,738,071 & 310,871,120 & 27,191,952 & 0.07 & 0.52 & 17.25 \\
SK-L025-HEK293T-KAS\_ATAC-600k\_supernatant & 17,108,823 & 23,408,630 & 12,589,982 & 0.46 & 0.53 & 23.35 \\
SK-L027-HEK293T-KAS\_ATAC-600k\_input & 175,596,606 & 255,157,430 & 59,741,052 & 0.20 & 0.49 & 19.18 \\
SK-L028-HEK293T-KAS\_ATAC-600k\_Kethoxal\_supernatant & 35,921,381 & 50,662,014 & 28,053,546 & 0.48 & 0.52 & 18.66 \\
SK-L029-HEK293T-KAS\_ATAC-600k\_Kethoxal\_pulldown & 66,158,532 & 100,347,548 & 13,280,000 & 0.11 & 0.49 & 23.81
\label{TableS1}
\end{longtable}
\end{center}
% \end{scriptsize}

\clearpage

\section*{Supplementary Figures}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf KAS-seq, ATAC-seq, and  KAS-ATAC mitochondrial genome profiles in human GM12878 cells}. 
(a) Shown are KAS-seq, ATAC-seq, and  KAS-ATAC mitochondrial genome profiles 
(b) Shown are KAS-seq, ATAC-seq, and  KAS-ATAC mitochondrial genome profiles with signal capped so that differences outside the D-loop can be visualized.
}
\label{FigS1}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=10cm]{FigS8-chrM-fraction.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Fraction of mitochondrial reads in ATAC-seq and KAS-ATAC libraries}. 
}
\label{FigS8}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS7-fragment-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Fragment length distribution in ATAC-seq and KAS-ATAC}. 
(a) GM12878 cells, replicate 1; 
(b) GM12878 cells, replicate 2; 
(c) HEK293 cells, replicate 1; 
(d) HEK293 cells, replicate 2; 
(e) HEK293 cells, replicate 3.
}
\label{FigS7}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=16.5cm]{FigS9-insertLenDist_GM12878_18_segments.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Distribution of KAS-ATAC and ATAC-seq fragment lengths in different chromatin states in GM12878 (replicate 1) cells}. 
}
\label{FigS9}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=16.5cm]{FigS5-insertLenDist_Hek293_18_segments.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Distribution of KAS-ATAC and ATAC-seq fragment lengths in different chromatin states in HEK293 (replicate 3) cells}. 
}
\label{FigS5}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS7-fragment-length.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Genome-wide TSS metaprofiles for ATAC-seq and KAS-ATAC libraries}. 
(a) GM12878 cells, replicate 1; 
(b) GM12878 cells, replicate 2; 
(c) HEK293 cells, replicate 1; 
(d) HEK293 cells, replicate 2; 
(e) HEK293 cells, replicate 3.
}
\label{FigS10}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS17-PRO-seq-correlations.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Correlation between KAS-seq, ATAC-seq, KAS-ATAC and PRO-seq datasets}. Enhancer elements were defined as the outersection of ENCODE DNase peaks for each cell line (ENCODE ID ENCFF759OLD for GM12878 and ENCODE ID ENCFF285OXK for HEK293) and annotated TSSs (refSeq). PRO-seq datasets were obtained from the indicated GEO accession IDs.
}
\label{FigS17}
\end{figure*}

\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=12cm]{FigS18-ABC-correlations.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Comparison of Activity-By-Contact (ABC) modeling of promoter output using different data modalities}. Shown are Spearman $r$ correlation scores between ABC scores and RNA-seq output for GM12878 cells using ENCODE datasets (accession IDs for RNA-seq are indicated in the figure). ABC scores were calculated for each human gene following the previously described approach\cite{Fulco2019} and using all ENCODE Hi-C datasets for GM12878 cells (accession IDs ENCFF194JDI, ENCFF955UVZ, ENCFF387IXM, ENCFF668TPL, ENCFF938ACB, ENCFF160PIP, ENCFF475GWB, ENCFF229MEW, ENCFF410FUO, ENCFF592GYT, ENCFF779SSO, ENCFF717LOV, ENCFF982XTF, ENCFF037PYX, ENCFF639IHA, ENCFF925SSJ, ENCFF005XHG, ENCFF457BVX, ENCFF131MFY, ENCFF512XPV, ENCFF837MNZ, ENCFF908NNJ, ENCFF558LDH, ENCFF558LDH, ENCFF991GHV, ENCFF075YWD, ENCFF290KPE, ENCFF930OVW, ENCFF515UDW, ENCFF916NSQ, ENCFF613NZE, ENCFF104TDF, ENCFF811VCC, ENCFF665QXT, ENCFF682LLL, ENCFF789MTX, ENCFF993IGW, ENCFF784LIP, ENCFF372XCU, ENCFF949VQC, ENCFF275RDF, ENCFF409YJA, ENCFF800UMS, ENCFF905EYO, ENCFF856SNL, ENCFF103ZBU, ENCFF064AOF, ENCFF693AYH, ENCFF825CNB, ENCFF825CNB, ENCFF591GZA, ENCFF361ECG, ENCFF625ZDB, ENCFF649NLE, ENCFF238SIE, ENCFF688XZY, ENCFF416CVP, ENCFF997OUB, ENCFF170SLP, ENCFF989YDB, ENCFF127QEU, ENCFF791APL, ENCFF572CHW, ENCFF346VTM, ENCFF690OEX, ENCFF360RWN, ENCFF849BZT, ENCFF610VEI, ENCFF748NRZ, ENCFF610VEI, ENCFF748NRZ, ENCFF847SEP, ENCFF568AYH, ENCFF149TDN, ENCFF871DEB, ENCFF814SHU, ENCFF907SUU, ENCFF659EAO, ENCFF677FWQ, ENCFF975DWR, ENCFF030WIP, ENCFF975DWR, ENCFF030WIP, ENCFF652INK, ENCFF551PHE, ENCFF021GFH, ENCFF931LZN, ENCFF377VFH, ENCFF779SSO, ENCFF717LOV, ENCFF982XTF, and ENCFF037PYX), which were processed separately and merged into a single map for maximum coverage using Jucer\cite{Durand2016}. ABC scores were calculated for each data modality for all elements within a radius of 2.5 Mbp of a gene promoter as $ABC(P_i) = \displaystyle \sum_{j} A(E_j) \times C(P_i,E_j)$ where $A(E_j)$ is the ATAC/KAS/KAS-ATAC/PRO-seq signal at enhancer element $E_j$ within the specified neighborhood of the promoter and $C(P_i,E_j)$ is the Hi-C interaction density between each enhancer $E_j$ and the promoter $P_i$. 
}
\label{FigS18}
\end{figure*}

\clearpage


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{FigS15-CTCF-sites.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf KAS-seq, KAS-ATAC and KAS levels (normalized to RPM) around intergenic ($\geq$ 1,000 bp away from genes) CTCF sites in HEK293 cells}. 
(a) All three modalities. KAS-seq libraries used are from Marinov \& Kim et al. 2023\cite{Marinov2023}
(b) Representative KAS-seq dataset only.
}
\label{FigS15}
\end{figure*}

\clearpage


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{FigS6-absolute-levels.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Estimating absolute levels of single-stranded accessible DNA}. Distribution of the ratios between the number of KAS-ATAC molecules and ATAC molecules for ATAC-seq peaks found in different chromatin states and for CTCF occupancy sites.
}
\label{FigS6}
\end{figure*}

\clearpage


\begin{figure*}
\begin{center}
\begin{minipage}[c]{0.7\linewidth}
\includegraphics[width=11.75cm]{FigS16-footprints-scores.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.30\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Transcription factor footprint scores in KAS-ATAC and ATAC datasets}. Footprint scores were calculated as $FPS = \cfrac{\displaystyle\sum_{i=a}^{b} C_i}{ \displaystyle\sum_{i \leq c} C_i + \displaystyle\sum_{i \geq d} C_i}$, where $C_i$ is the coverage over a given position in the footprint metaprofile, and in this case $a = -20$, $b = +20$, $c = -30$, and $d = +30$.
}
\label{Fig16}
\end{minipage}
\end{center}
\end{figure*}


\clearpage

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS11-footprints-1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Transcription factor footprints in KAS-ATAC and ATAC datasets}. Shown are the average Tn5 insertion profiles (see Methods for details) around the motifs for the indicated factors that are located within ENCODE ChIP-seq peaks for the TF in the given cell line.
}
\label{FigS11}
\end{figure*}

\clearpage


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS12-footprints-2.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Transcription factor footprints in KAS-ATAC and ATAC datasets}. Shown are the average Tn5 insertion profiles (see Methods for details) around the motifs for the indicated factors that are located within ENCODE ChIP-seq peaks for the TF in the given cell line.
}
\label{FigS12}
\end{figure*}

\clearpage


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS13-footprints-3.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Transcription factor footprints in KAS-ATAC and ATAC datasets}. Shown are the average Tn5 insertion profiles (see Methods for details) around the motifs for the indicated factors that are located within ENCODE ChIP-seq peaks for the TF in the given cell line.
}
\label{FigS13}
\end{figure*}

\clearpage

\begin{FPfigure}
\begin{center}
\includegraphics[width=18.5cm]{FigS14-footprints-4.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Transcription factor footprints in KAS-ATAC and ATAC datasets}. Shown are the average Tn5 insertion profiles (see Methods for details) around the motifs for the indicated factors that are located within ENCODE ChIP-seq peaks for the TF in the given cell line.
}
\label{FigS14}
\end{FPfigure}

\clearpage

% \begin{figure*}[!ht]
% \begin{center}
% \includegraphics[width=16.5cm]{FigS3-DESeq-states.png}
% \end{center}
% \captionsetup{singlelinecheck=off,justification=justified}
% \caption{
% {\bf Differential abundance in KAS-ATAC and ATAC libraries for peaks located in different chromHMM chromatin states}. 
% }
% \label{FigS3}
% \end{figure*}

% \begin{figure*}[!ht]
% \begin{center}
% \includegraphics[width=18.5cm]{FigS4-DESeq-states-enrichment.png}
% \end{center}
% \captionsetup{singlelinecheck=off,justification=justified}
% \caption{
% {\bf XXXXX}. 
% }
% \label{FigS4}
% \end{figure*}

\clearpage

\end{document}
