\documentclass[10pt]{article}
\usepackage[hmargin=1.5cm,top=2cm,bottom=2cm]{geometry}
\usepackage{multicol}
\setlength\columnsep{15pt}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage[auth-sc]{authblk}
\usepackage{longtable}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{enumerate}
\usepackage[labelfont=bf]{caption}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{mdframed}
\usepackage{graphicx}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{array}
\usepackage{lscape}
\usepackage{caption}
\usepackage{breakurl}
\usepackage{todonotes}
\usepackage{hanging}
\usepackage[final]{pdfpages}
\usepackage[leftFloats,CaptionAfterwards]{fltpage}
\usepackage[numbers,super,sort&compress]{natbib}
\setlength{\bibsep}{0pt plus 0.3ex}
\usepackage{abstract}
\usepackage{enumitem}
\usepackage{soul}
\usepackage{titlesec}
\titleformat{\section}[block]{\large\bfseries\filcenter}{\thesection.}{0.4em}{}
\titleformat{\subsection}[block]{\normalsize\sc\bfseries\filcenter}{\thesubsection.}{0.4em}{}
\titleformat{\subsubsection}[block]{\normalsize\sc\itshape\filright}{\thesubsubsection.}{0.4em}{}
\setcounter{secnumdepth}{5}

\makeatletter
\def\@biblabel#1{\@ifnotempty{#1}{#1.}}
\makeatother

\newcommand{\filllastline}[1]{
\setlength\leftskip{0pt}
\setlength\rightskip{0pt}
\setlength\parfillskip{0pt}
#1}

\newenvironment{Figure}
{\par\medskip\noindent\minipage{\linewidth}}
{\endminipage\par\medskip}

\title{\bf Direct profiling of genome-wide dCas9 and Cas9 specificity using ssDNA mapping (CasKAS)}
\renewcommand\Authfont{\scshape\normalsize}
\author[1,$\#$]{Georgi K. Marinov}
\author[2]{Samuel H. Kim}
\author[1]{S. Tansu Bagdatli}
\author[3,4]{Alexandro E. Trevino}
\author[1]{Josh Tycko}
\author[5]{Tong Wu}
\author[4]{Lacramioara Bintu}
\author[1,6]{Michael C. Bassik}
\author[5,7,8]{Chuan He}
\author[1,9]{Anshul Kundaje}
\author[1,10,11,12,$\#$]{William J. Greenleaf}
\renewcommand\Affilfont{\itshape\small}
\affil[1]{Department of Genetics, School of Medicine, Stanford University, Stanford, CA 94305, USA}
\affil[2]{Cancer Biology Programs, School of Medicine, Stanford University, Stanford, CA 94305, USA}
\affil[3]{Center for Personal Dynamic Regulomes, Stanford University, Stanford, CA 94305, USA}
\affil[4]{Department of Bioengineering, Stanford University, Stanford, CA 94305, USA}
\affil[5]{Department of Chemistry and Institute for Biophysical Dynamics, The University of Chicago, Chicago, IL, 60637, USA}
\affil[6]{Chemistry, Engineering, and Medicine for Human Health (ChEM-H), Stanford University, Stanford, CA, 94305, USA}
\affil[7]{Department of Biochemistry and Molecular Biology and Institute for Biophysical Dynamics, The University of Chicago, Chicago, IL, 60637, USA}
\affil[8]{Howard Hughes Medical Institute, The University of Chicago, Chicago, IL, 60637, USA}
\affil[9]{Department of Computer Science, Stanford University, Stanford, CA 94305, USA}
\affil[10]{Department of Applied Physics, Stanford University, Stanford, CA 94305, USA}
\affil[11]{Center for Personal Dynamic Regulomes, Stanford University, Stanford, CA, 94305, USA}
\affil[12]{Chan Zuckerberg Biohub, San Francisco, California, USA}
% \affil[*]{These authors contributed equally to this work}
\affil[$\#$]{Corresponding author}
\date{}

\begin{document}
\maketitle

% \centerline{}
% \centerline{}
% \begin{abstract}

% \noindent {\normalsize \textbf{ABSTRACT GOES HERE} 
% }
% \centerline{}
% \centerline{}
% \end{abstract}

\textbf{The practical application of CRISPR-mediated genome and epigenome editing has often been plagued by off-target activity. While numerous methods have been developed to map sgRNA specificity genome-wide, they are generally cumbersome and/or expensive, and often not applicable to catalytically dead CRISPR enzymes. We have developed a rapid, inexpensive, and facile assay for identifying off-target CRISPR binding and cleavage based on directly mapping the single-stranded DNA structures formed upon CRISPR binding (``CasKAS''). We demonstrate this method in both \textit{in vitro} and \textit{in vivo} contexts.}

CRISPR-based methods for editing the genome and epigenome have emerged as a highly versatile means of manipulating the genetic makeup and regulatory states of cells. CRISPR technology may have the potential to transform medical practice by enabling direct elimination of pathogenic sequence variants. CRISPR has also become a standard tool for discovery in fundamental biomedical research, for example in its use in high-throughput, massively parallel CRISPR screens\cite{Wang2014}. 

However, the presence of significant off-target effects for many guide RNAs (sgRNAs), wherein the guide-CRISPR complex likely has biochemical activities at genomic sites that are not perfect matches to the sgRNA, presents a major hurdle to fully realizing this potential. Off-target effects are particularly problematic for medical applications, where risks of negative consequences for a patient's health must be minimized as fully as possible.

To this end, numerous approaches have been developed to experimentally map off-target effects genome-wide. Methods such as Digenome-seq\cite{Kim2015} look for particular types of cut sites around target sequences in whole-genome sequencing data; however, deep whole-genome sequencing is still quite expensive to carry out. Assays such as BLESS\cite{BLESS}, GUIDE-seq\cite{Tsai2015}, HTGTS\cite{HTGTS}, DSBCapture\cite{DSBCapture}, BLISS\cite{Yan2017}, SITE-seq\cite{Cameron2017}, CIRCLE-seq\cite{Tsai2017}, TTISS\cite{TTISS}, INDUCE-seq\cite{INDUCE-seq}, and CHANGE-seq\cite{CHANGE-seq} aim to instead directly map Cas9 cleavage events; however, they all involve some combination of complex and laborious molecular biology protocols and non-standard reagents, and have not been widely adopted as a result. Other methods, such as DISCOVER-seq\cite{Wienert2019}, which maps DNA repair activity by applying ChIP-seq against the MRE11 protein, as well as earlier applications of ChIP-seq to map catalytically dead dCas9 occupancy sites genome-wide\cite{Wu2014,Kuscu2014}, suffer from background and specificity issues associated with the ChIP procedure. Most recently, long-read sequencing has been adapted to the problem of Cas9 specificity profiling, in the form of SMRT-OTS and Nano-OTS\cite{OTS}, but the cost of these methods is relatively high while their throughput is comparatively low.

Various computational models have also been trained to predict off-targets genome-wide\cite{Doench2016,Perez2017}. However, these exhibit far from perfect accuracy, and thus in many situations, especially within clinical contexts, direct experimental evidence is needed to accurately identify potential unintended effects of CRISPR-based reagents.

Therefore a faster, more accessible, and versatile method for mapping CRISPR off targets is still a critical need in the field. When a Cas9-sgRNA ribonucleoprotein (RNP) is engaged with its target site, the sgRNA invades the DNA double helix, forming a ssDNA structure on the other strand (Fig. \ref{Fig1}a). We thus reasoned that mapping ssDNA-containing regions should be a sensitive biochemical signal of productive Cas9 binding. The recently developed KAS-seq\cite{KAS} assay for mapping single-stranded DNA (ssDNA) structures (\textbf{k}ethoxal-\textbf{a}ssisted \textbf{s}sDNA sequencing\cite{KAS}) is ideally suited for the purpose of identifying ssDNA structures generated by CRISPR protein binding to DNA (Fig. \ref{Fig1}a-b). KAS-seq is based on the specific covalent labeling of unpaired guanine bases with N$_3$-kethoxal, generating an adduct to which biotin can then be added using click chemistry. After shearing, biotinylated DNA, corresponding to regions containing ssDNA structure, can be specifically enriched for and sequenced. 

To determine if KAS-seq can be used to map regions of ssDNA generated by Cas9 binding, we carried out an initial \textit{in vitro} experiment using mouse genomic DNA (gDNA), purified dCas9 and two sgRNAs targeting the \textit{Hoxa} locus.

Strikingly, we observed strong and highly specific peaks at the expected target sites for each sgRNA (Fig. \ref{Fig1}c). Detailed examination of dCas9 CasKAS profiles around the predicted sgRNA target sites revealed strand coverage asymmetry patterns similar to those observed for ChIP-seq around transcription factor binding sites\cite{Landt2012}  (Fig. \ref{Fig1}d), indicating that enrichment derives from the sgRNA target site itself and confirming the utility of N$_3$-kethoxal for mapping dCas9 occupancy sites. We term the assay ``CasKAS''. 

We then reasoned that CasKAS should also capture active Cas9 complexed with DNA, as the enzyme is thought to remain associated with DNA for some time after cleavage\cite{Richardson2016}. We carried out Cas9 CasKAS experiments with the same sgRNAs and again observed enrichment at the expected on-target sites (Fig. \ref{Fig1}e). Remarkably, examination of Cas9 CasKAS read profiles around the on-target site showed that the 5' ends of reads are precisely positioned around the expected cut site, with one cut position on one strand and two to three such positions on the other (Fig. \ref{Fig1}f), consistent with the previously known patterns of Cas9 cleavage \cite{Gisler2019,Jones2021}. CasKAS therefore provides target specificity profiles for both active and catalytically dead Cas9 versions.

\textit{In vitro} CasKAS data is highly reproducible between replicates  (Fig. \ref{Fig1}g), and a modest sequencing depth of between 10 and 20 million mapped reads is generally sufficient to capture off-target specificity profiles (Fig. \ref{Fig1}h). 

We observed similar results with two mouse sgRNAs targeting the \textit{Nanog} locus (Supplementary Fig. \ref{FigS7}) and with two human sgRNAs (``EMX1'' and ``VEGFA''; Supplementary Fig. \ref{FigS4} and \ref{FigS5}). We find no enrichment using components of the RNP in isolation -- sgRNAs, dCas9 or Cas9 (Supplementary Fig. \ref{FigS4}). 

Next we tested the application of CasKAS \textit{in vivo}. Living cells contain substantial ssDNA due to active transcription and other processes\cite{KAS}, so the \textit{in vivo} CasKAS signal is a mixture of signals from ssDNA associated with the Cas9 RNP and endogenous processes that generate ssDNA. We carried out KAS-seq experiments using both dCas9 and Cas9 in HEK293 cells transfected with EMX1 or VEGFA RNPs, as well as negative, no-guide controls, which provide a map of background endogenous ssDNA profiles. At the EMX1 gene, which is not active in HEK293 cells, we observe strong peaks at the expected target site (Fig. \ref{Fig1}i), an asymmetric read profile around it for dCas9 (Fig. \ref{Fig1}j), and a substantial degree of 5' end clustering at the cut site, similar to what is observed \textit{in vitro} for active Cas9 (Fig. \ref{Fig1}k). The VEGFA gene is active in HEK293 cells, but the dCas9/Cas9 CasKAS signal is still readily identifiable as an addition to the endogenous ssDNA enrichment pattern (Supplementary Fig. \ref{FigS18}). These results demonstrate the utility of CasKAS for profiling CRISPR specificity both \textit{in vitro} and \textit{in vivo}.

We next examined the genome-wide specificity of sgRNAs as measured by CasKAS. We focus on the mouse sgRNA \#1 as it displayed a substantial number of off-targets yet that number was also sufficiently small for all of them to be examined directly. We first called peaks \textit{de novo} (see Methods for details) without relying on off-target prediction algorithms, then manually curated the resulting peak set (Fig. \ref{Fig2}a). Remarkably, while we find 32 peaks at predicted off-target sites, we also find 192 (i.e. $\sim$6$\times$ as many) additional manually curated peaks; while these peaks exhibit generally lower CasKAS signal (Fig. \ref{Fig2}b), they all appear to be genuine sites of occupancy as they display proper peak shape characteristics (see Supplementary Fig. \ref{FigS8} for details). Most of the predicted (in total $\sim$7,500) off-target sites for this sgRNA do not show substantial occupancy by dCas9 CasKAS (Fig. \ref{Fig2}c-d). 

Sequence comparison of the occupied predicted off-target sites allowed us to evaluate determinants of Cas9 specificity (Fig. \ref{Fig2}e). Consistent with previous reports \cite{Hsu2013,Semenova2011}, the PAM-distal region is much less sequence-constrained than the PAM-proximal one. We observed a similar pattern with the other sgRNAs we profiled, in both mouse and human (Supplementary Fig. \ref{FigS13}-\ref{FigS16} and Supplementary Fig. \ref{FigS9}-\ref{FigS12}). 

When analysing peaks not associated with predicted off-target sites (Supplementary Fig. \ref{FigS17}) we observed other telling patterns -- at numerous sites with strong dCas9 CasKAS signal, we observe a large number of mismatches to the sgRNA sequence as well as ``bulge'' regions wherein indels are observed in the target sequence. These mismatches and bulges were in general much larger than what is considered permissible by off-target prediction algorithms; we speculate that the lack of consideration of potential target sequences with large numbers of mismatches or substantial insertions likely explains the much larger number of such sites compared to the set of occupied predicted off-targets. 

We next devised a simple metric for evaluating the degree of read clustering at cut sites (a ``$C$-score''; see Methods for details), and used it to estimate the degree of cutting by Cas9. Strikingly, while the on-target site exhibits the second highest dCas9 CasKAS signal, and even though all off-target sites show binding by CasKAS, only the on-target site displays strong cutting activity (Fig. \ref{Fig2}f). The behavior of other sgRNAs varies (Supplementary Fig. \ref{FigS13}-\ref{FigS16} and \ref{FigS21}), with some showing multiple cut sites. Thus combining dCas9 and Cas9 CasKAS (or even Cas9 CasKAS alone) provides a powerful tool for detecting both binding specificity and the promiscuity of catalytic activity for arbitrary sgRNAs.

Finally, we compared \textit{in vitro} and \textit{in vivo} CasKAS profiles (Fig. \ref{Fig2}g-h). We find many fewer strongly enriched sites in \textit{in vivo} datasets than \textit{in vitro}, with the on-target site being either the top (for dCas9) or among the top (for Cas9) sites \textit{in vivo}. A potential explanation for this difference is the previously reported impediment of Cas9/dCas9 binding to DNA by the presence of nucleosomes\cite{Horlbeck2016}; this inhibitory effect need not be complete to generate the observed patterns as CasKAS measures the physical occupancy of DNA by CRISPR proteins at the moment of harvesting cells.
% , i.e. Cas9/dCas9 could still bind nucleosome-protected DNA but much more transiently than \textit{in vitro}.
As their effect, in particular cutting, but also base editing (in the case of dCas9 fused with base editing enzymes) is not necessarily dependent on constant physical association with DNA, the optimal strategy for off-target identification might include a combination of \textit{in vitro} experimentation on purified gDNA (generating a maximally permissible set of sites) combined with an \textit{in vivo} occupancy map (providing an estimation of the potentially most relevant sites \textit{in vivo}).

In conclusion, we have presented CasKAS, a new, simple and robust method for mapping the specificity of active and catalytically dead versions of CRISPR enzymes. CasKAS has numerous advantages over existing tools while also opening up new possibilities for studying CRISPR biology. CasKAS requires no specialized molecular biology protocols, takes just a few hours \textit{in vitro} (and a similar amount of time after harvesting cells \textit{in vivo}), and is inexpensive as it actively and strongly enriches for off-targets. It measures strand invasion by CRISPR rather than association with DNA, a biochemically more specific event. We compared \textit{de novo} called CasKAS peaks to those generated by other means, and while we found large sets of peaks unique to each method, those found only by CasKAS contained much higher fractions of predicted off-target sites than those unique to other methods (Supplementary Fig. \ref{FigS19}). CasKAS can be used to profile the specificity of all types of DNA-targeting CRISPR proteins as it does not rely on measuring DNA cleavage or modification. CasKAS may be applied in primary cells as what is measured is physical association with DNA and not the outcome of CRISPR activity that may only be detectable after cell division. A limitation of CasKAS is the requirement that a G nucleotide is present within the sgRNA sequence, as without it there would be no kethoxal labeling; however, only a small fraction ($\leq$5\%) of sgRNAs in the human genome lack any Gs (Supplementary Fig. \ref{FigS3}). Another minor limitation of the current \textit{in vitro} protocol is that labeling is carried out on high molecular weight (HMW) DNA and samples must be sheared serially. 
We have explored using pre-sheared and end-repaired DNA (to minimize kethoxal labeling of Gs on sticky ends generated by sonication), with comparable results to using HMW DNA; we anticipate that further optimization should allow the parallel high-throughput plate-based profiling of the specificity of very large numbers of sgRNAs. 

In addition to being highly valuable for off-target profiling \textit{in vitro} and in previously difficult to assay settings such as primary cells, we expect CasKAS to provide fruitful insights into the mechanisms and dynamics of \textit{in vivo} CRISPR action (taking advantage of finely controllable CRISPR systems such as vfCRISPR\cite{vfCRISPR}), and the influence of transcriptional, regulatory, epigenetic, and other functional genomic contexts on CRISPR activity.

\begin{thebibliography}{100}

\input{references-V3}

\end{thebibliography}

\clearpage

\section*{Figures}

\begin{FPfigure}
\begin{center}
\includegraphics[width=18.5cm]{Fig1V3.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CasKAS maps dCas9- and Cas9-mediated strand invasion and cleavage events genome-wide \textit{in vitro} and \textit{in vivo}}. 
(a) CasKAS is based on the KAS-seq assay for mapping ssDNA structures. N$_3$-kethoxal covalently modifies unpaired guanine bases (while having no activity for G bases paired within dsDNA). Strand invasion by Cas9/dCas9 carrying an sgRNA results in the formation of a ssDNA structure, which can be directly identified using N$_3$-kethoxal. 
(b) Outline of \textit{in vivo} and \textit{in vitro} CasKAS. For \textit{in vitro} CasKAS, gDNA is incubated with a dCas9/Cas9 RNP, then N$_3$-kethoxal is added to the reaction; for \textit{in vivo} CasKAS, cells are transfected with an RNP, then treated with kethoxal. DNA is then purified, click chemistry is carried out, DNA is sheared, labeled fragments are pulled down with streptavidin beads, and sequenced.
(c and d) Mapping of dCas9 targets \textit{in vitro}. 
(c) Mouse gDNA was incubated with dCas9 RNPs carrying one of two sgRNAs targeting the mouse \textit{Hoxa} locus. Highly specific labeling is observed at the expected target location of each sgRNA. 
(d) Asymmetric strand distribution of \textit{in vitro} dCas9 CasKAS reads around the sgRNA target site. 
(e and f) Mapping of Cas9 targets \textit{in vitro}. 
(e) Mouse gDNA was incubated with Cas9 RNPs carrying one of the same two sgRNAs targeting the mouse \textit{Hoxa} locus. 
(f) The distribution of 5' read ends around target sites in \textit{in vitro} CasKAS datasets shows direct capture of the intermediate cleavage state.
(g) Reproducibility of \textit{in vitro} dCas9 CasKAS datasets. Shown are RPM values for 500 bp windows centered on the top $\sim$7,000 predicted target sites for the ``sgRNA \#1'' in two \textit{in vitro} CasKAS experiments. Off-target sites are color-coded by the number of mismatches relative to the sgRNA.
(h) CasKAS requires a moderate sequencing depth of 10-20 $\times$ 10$^6$ reads to accurately rank potential off-targets.
(i-k) \textit{In vivo} CasKAS maps Cas9 and dCas9 target sites. 
(i) Shown are CasKAS experiments with Cas9 and dCas9 and with the EMX1 sgRNA or with no sgRNA (negative control).
(j) Asymmetric 5' end distribution around target sites in dCas9 \textit{in vivo} CasKAS. 
(k) In \textit{in vivo} Cas9 CasKAS, a mixture distribution is observed between phased cleavage sites and broader ssDNA labeling.
} 
\label{Fig1}
\end{FPfigure}
\clearpage
\let\thefootnote\relax\footnotetext{}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{Fig2-V4.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CasKAS profiles sgRNA specificity genome-wide}. 
(a) Summary of \textit{de novo} peak calls for sgRNA \#1 (using MACS2).
(b) CasKAS signal is stronger over predicted off-target sites, but legitimate interactions are also found elsewhere in the genome.
(c) CasKAS profile over predicted (by Cas-OFFinder) off-target sites for sgRNA \#1 with dCas9 (all such sites and focusing only on the top 100 ranked by dCas9 CasKAS signal).
(d) CasKAS profile over peak calls outside predicted (by Cas-OFFinder) off-target sites for sgRNA \#1 with dCas9.
(e) Determinants of sequence specificity as measured by dCas9 CasKAS (for sgRNA \#1). PAM-distal regions of the sgRNA are less constrained than its PAM-proximal parts. The on-target sgRNA is highlighted in yellow. 
(f) Active Cas9 signal read profiles can be used to distinguish off-targets associated with cutting from those where only binding occurs. Shown are the same off-target sites as in (e) and the plus- and minus-strand active Cas9 5' end profiles around the sgRNA. In this case (sgRNA \#1), only the on-target site shows a Cas9 CasKAS pattern indicating cleavage; at the other sites even active Cas9 likely only binds but does not cut. A simple cutting score metric (``$C$-score'') based on multiplying the 5' end forward- and reverse-strand profiles can be used to quantify cutting vs. binding. 
(g and h) Comparison between \textit{in vitro} and \textit{in vivo} CasKAS signal over predicted off-target sites for the EMX1 sgRNA. \textit{In vivo} CasKAS is quantified as the difference in reads per million ($\pm$500 bp of the sgRNA site) between the sgRNA KAS-seq and the no-guide control KAS-seq (``RPM$_{diff}$''). The on-target site is shown in blue.
}
\label{Fig2}
\end{figure*}


\clearpage

\section*{Methods}

\subsection*{Guide RNA sequences}

Guide RNAs were obtained from IDT (``sgRNA \#1'' and ``sgRNA \#2'') or from Synthego (all others).

The following sgRNA sequences were used in this study:

\begin{enumerate}
\item ``sgRNA \#1'': \verb|GCTTAATTAAGGTAAACGTC|
\item ``sgRNA \#2'': \verb|CCAACCTGGCGGCTCGTTGG| 
\item ``EMX1\_Tsai'': \verb|GAGTCCGAGCAGAAGAAGAA|
\item ``VEGFA-site1'': \verb|GGGTGGGGGGAGTTTGCTCC|
\item ``Nanog-sg2'': \verb|GATCTCTAGTGGGAAGTTTC|
\item ``Nanog-sg3'': \verb|GTCTGTAGAAAGAATGGAAG|
\end{enumerate}

Guide RNAs were dissolved to a concentration of 100 $\mu$M using nuclease-free 1$\times$ TE buffer and stored at --20$\,^{\circ}\mathrm{C}$.

\subsection*{\textit{In vitro} CasKAS}

\textit{In vitro} CasKAS experiments were executed as follows. 

First, 1 $\mu$L of each synthetic sgRNA was incubated at room temperature with 1 $\mu$L of recombinant purified dCas9 (MCLab dCAS9B-200) for 20 minutes. The RNP was then incubated with 1 $\mu$g of gDNA at 37$\,^{\circ}\mathrm{C}$ for 10 minutes. 

The KAS reaction was then carried out by adding 1 $\mu$L of 500 mM N$_3$-kethoxal (ApeXBio A8793). DNA was immediately purified using the MinElute PCR Purification Kit (Qiagen 28006), and eluted in 87.5 or 175 $\mu$L 25 mM K$_3$BO$_3$.

\subsection*{\textit{In vivo} CasKAS}

For \textit{in vivo} CasKAS experiments, HEK293T cells were seeded at 400,000 cells/well into a 6-well plate the day before RNP transfection. Media was exchanged 2 hours before transfection. For each well, 6,250 ng of Cas9 (MCLAB CAS9-200) or dCas9 (MCLAB dCAS9B-200) and 1,200 ng sgRNA were complexed with CRISPRMAX reagent in Opti-MEM following the manufacturer's protocol. After incubation at room temperature for 15 minutes, the RNP solution was directly added to each well and gently mixed. The cells were incubated with the RNP complex for 14 hours at 37$\,^{\circ}\mathrm{C}$. To harvest and perform kethoxal labeling, media was removed and room temperature 1$\times$ PBS was used to wash the cells. Cells were then dissociated with trypsin, trypsin was quenched with media, cells were pelleted at room temperature, and then resuspended in 100 $\mu$L of media supplemented with 5 mM N$_3$-kethoxal. Cells were incubated for 10 minutes at 37$\,^{\circ}\mathrm{C}$ with shaking at 500 rpm in a Thermomixer. Cells were then pelleted by centrifuging at 500 $g$ for 5 minutes at 4$\,^{\circ}\mathrm{C}$. Genomic DNA was then extracted using the Monarch gDNA Purification Kit (NEB T3010S) following the standard protocol but with elution using 85 $\mu$L 25 mM K$_3$BO$_3$ at pH 7.0. 

\subsection*{Click reaction, biotin pull down and library generation}

The click reaction was carried out by combining 175 $\mu$L purified and sheared DNA, 5 $\mu$L 20 mM DBCO-PEG4-biotin (DMSO solution, Sigma 760749), and 20 $\mu$L 10$\times$ PBS in a final volume of 200 $\mu$L or 87.5 $\mu$L purified and sheared DNA, 2.5 $\mu$L 20 mM DBCO-PEG4-biotin (DMSO solution, Sigma 760749), and 10 $\mu$L 10$\times$ PBS in a final volume of 100 $\mu$L. The reaction was incubated at 37$\,^{\circ}\mathrm{C}$ for 90 minutes.

DNA was purified using AMPure XP beads (50 $\mu$L for a 100 $\mu$L reaction or 100 $\mu$L for a 200 $\mu$L reaction), beads were washed on a magnetic stand twice with 80\% EtOH, and eluted in 130 $\mu$L 25 mM K$_3$BO$_3$.

Purified DNA was then sheared on a Covaris E220 instrument down to $\sim$150-400 bp size.

For streptavidin pulldown of biotin-labeled DNA, 10 $\mu$L of 10 mg/mL Dynabeads MyOne Streptavidin T1 beads (Life Technologies, 65602) were separated on a magnetic stand, then washed with 300 $\mu$L of 1$\times$ TWB (Tween Washing Buffer; 5 mM Tris-HCl pH 7.5; 0.5 mM EDTA; 1 M NaCl; 0.05\% Tween 20). The beads were resuspended in 300 $\mu$L of 2$\times$ Binding Buffer (10 mM Tris-HCl pH 7.5, 1 mM EDTA; 2 M NaCl), the sonicated DNA was added (diluted to a final volume of 300 $\mu$L if necessary), and the beads were incubated for $\geq$15 minutes at room temperature on a rotator. After separation on a magnetic stand, the beads were washed with 300 $\mu$L of 1$\times$ TWB, and heated at 55$\,^{\circ}\mathrm{C}$ in a Thermomixer with shaking for 2 minutes. After removal of the supernatant on a magnetic stand, the TWB wash and 55$\,^{\circ}\mathrm{C}$ incubation were repeated. 

Final libraries were prepared on beads using the NEBNext Ultra II DNA Library Prep Kit (NEB, $\#$E7645) as follows. End repair was carried out by resuspending beads in 50 $\mu$L 1$\times$ EB buffer, and adding 3 $\mu$L NEB Ultra End Repair Enzyme and 7 $\mu$L NEB Ultra End Repair Buffer, followed by incubation at 20$\,^{\circ}\mathrm{C}$ for 30 minutes (in a Thermomixer, with shaking at 1,000 rpm) and then at 65$\,^{\circ}\mathrm{C}$ for 30 minutes. 

Adapters were ligated to DNA fragments by adding 30 $\mu$L Blunt Ligation mix, 1 $\mu$L Ligation Enhancer and 2.5 $\mu$L NEB Adapter, incubating at 20$\,^{\circ}\mathrm{C}$ for 20 minutes, adding 3 $\mu$L USER enzyme, and incubating at 37$\,^{\circ}\mathrm{C}$ for 15 minutes (in a Thermomixer, with shaking at 1,000 rpm). 

Beads were then separated on a magnetic stand, and washed with 300 $\mu$L TWB for 2 minutes at 55$\,^{\circ}\mathrm{C}$, 1000 rpm in a Thermomixer. After separation on a magnetic stand, beads were washed in 100 $\mu$L 0.1 $\times$ TE buffer, then resuspended in 15 $\mu$L 0.1 $\times$ TE buffer, and heated at 98$\,^{\circ}\mathrm{C}$ for 10 minutes. 

For PCR, 5 $\mu$L of each of the i5 and i7 NEB Next sequencing adapters were added together with 25 $\mu$L 2$\times$ NEB Ultra PCR Master Mix. PCR was carried out with a 98$\,^{\circ}\mathrm{C}$ incubation for 30 seconds and 12 cycles of 98$\,^{\circ}\mathrm{C}$ for 10 seconds, 65$\,^{\circ}\mathrm{C}$ for 30 seconds, and 72$\,^{\circ}\mathrm{C}$ for 1 minute, followed by incubation at 72$\,^{\circ}\mathrm{C}$ for 5 minutes. 

Beads were separated on a magnetic stand, and the supernatant was cleaned up using 1.8$\times$ AMPure XP beads. 

Libraries were sequenced in a paired-end format on an Illumina NextSeq instrument using NextSeq 500/550 high output kits (2$\times$36 cycles). 

\subsection*{Data processing}

Demultiplexed fastq files were mapped to the \verb|hg38| assembly of the human genome or the \verb|mm10| version of the mouse genome as 2$\times$36mers using Bowtie\cite{Bowtie2009} with the following settings: \verb|-v 2| \verb|-k 2| \verb|-m 1| \verb|--best| \verb|--strata| \verb|-X 1000|. Duplicate reads were removed using \verb|picard|\verb|-tools| (version 1.99). 

Browser track generation, fragment length estimation, TSS enrichment calculations, and other analyses were carried out using custom-written Python scripts (\burl{https://github.com/georgimarinov/GeorgiScripts}). The \verb|refSeq| set of annotations was used for evaluation of enrichment around TSSs.

\subsection*{Peak calling}

Peak calling on \textit{in vitro} binding datasets was carried out using version 2.1.0 of MACS2\cite{MACS2} with default settings.

Peaks were then compared against the ENCODE set of ``blacklisted'' regions\cite{BL2019} to filter out likely artifacts.

\subsection*{Sequence analysis}

Guide RNA off-target predictions were obtained from Cas-OFFinder\cite{CasOFFinder}.

Multiple sequence alignments of sgRNA sequences and their off-targets were generated using MUSCLE\cite{MUSCLE} and visualized using JalView\cite{JalView}.

\subsection*{Quantification}

\subsection*{Cutting score calculation}

The Cas9 cutting $C$-score was calculated as follows. 

First, basepair-level Read-Per-Million (RPM) profiles for mapped read 5' ends were generated separately for the forward and reverse strands. Then the $C$-score was calculated by multiplying the forward and reverse strand profiles (each summed over a running window of 3 bp):
\begin{equation}
C\text{-score}_{c,i} = \left(\sum_{j=i-1}^{i+1} \mathrm{RPM}^{+}_{c,j}\right) \times \left(\sum_{j=i-1}^{i+1} \mathrm{RPM}^{-}_{c,j}\right),
\end{equation}
where $c$ and $i$ indicate the coordinates by chromosome and position, respectively.

\section*{Data availability}

Sequencing reads for the datasets described in this study are available from GEO accession XXXX TO BE SUBMITTED XXX.

\section*{Author contributions}

G.K.M. conceptualized the study, performed initial \textit{in vitro} CasKAS experiments, analyzed data, and wrote the manuscript with input from all authors. S.H.K. developed the \textit{in vivo} CasKAS protocol and performed \textit{in vivo} CasKAS experiments. S.T.B. carried out \textit{in vitro} CasKAS optimization. A.E.T. and J.T. supplied sgRNAs and designed off-target profiling experiments. A.E.T. carried out off-target analysis for mouse sgRNAs. T.W. provided key reagents. W.J.G., A.K., C.H., M.C.B. and L.B. supervised the study. 

\section*{Acknowledgments}

This work was supported by NIH grants (P50HG007735, R01 HG008140, U19AI057266 and UM1HG009442 to W.J.G., 1UM1HG009436 to W.J.G. and A.K., 1DP2OD022870-01 and 1U01HG009431 to A.K.), the Rita Allen Foundation (to W.J.G.), the Baxter Foundation Faculty Scholar Grant, and the Human Frontiers Science Program grant RGY006S (to W.J.G.). W.J.G. is a Chan Zuckerberg Biohub investigator and acknowledges grants 2017-174468 and 2018-182817 from the Chan Zuckerberg Initiative. J.T. is supported by the NIDDK F99/K00 fellowship of the National Institutes of Health (F99DK126120). M.C.B. is supported by a grant from Stanford ChEM-H and an NIH Director's New Innovator Award (1DP2HD08406901). Fellowship support was also provided by the Stanford School of Medicine Dean's Fellowship (G.K.M.), the Siebel Scholars, the Enhancing Diversity in Graduate Education Program and the Weiland Family Fellowship (A.E.T.). 

The authors would like to thank Zohar Shipony and members of the Greenleaf, Kundaje, and Bassik labs for helpful discussion and suggestions regarding this work.

\section*{Competing interests}

The authors declare no competing interests.

\clearpage

\setcounter{table}{0}
\renewcommand{\tablename}{Supplementary Table}
\setcounter{figure}{0}
\renewcommand{\figurename}{Supplementary Figure}

\setcounter{page}{1}
\renewcommand\thepage{{SM }\arabic{page}}

\begin{center}
% {\LARGE \textbf{\begin{spacing}{1.1}XXXX. \\ Supplementary Materials\end{spacing} }}
{\LARGE \textbf{Supplementary Materials}}
\end{center}

% \section*{Supplementary Tables}

\section*{Supplementary Figures}

% \begin{figure*}[!ht]
% \begin{center}
% \includegraphics[width=8cm]{FigS1-correlations-V2.png}
% \end{center}
% \captionsetup{singlelinecheck=off,justification=justified}
% \caption{
% {\bf Correspondence between \textit{in vitro} dCas9 and active Cas9 CasKAS profiles for the mouse sgRNA \#1 guide}. 
% }
% \label{FigS1}
% \end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS7-Nanog.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \textit{In vitro} dCas9 and Cas9 CasKAS profiles around the mouse \textit{Nanog} locus using the ``Nanog-sg2'' and ``Nanog-sg3'' sgRNAs}. 
}
\label{FigS7}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS4-EMX1-in-vitro-negative-controls.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CasKAS signal \textit{in vitro} is specific to the activity of the dCas9/Cas9 protein combined with its sgRNA}. CasKAS was carried out with the EMX1 sgRNA and with the following combinations of protein and sgRNA: dCas9 + sgRNA, Cas9 + sgRNA, dCas9 alone, Cas9 alone, or sgRNA alone.
}
\label{FigS4}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS5-VEGFA-in-vitro.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CasKAS signal \textit{in vitro} around the \textit{VEGFA} gene with the VEGFA sgRNA}. 
}
\label{FigS5}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS18-VEGFA-in-vivo.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CasKAS signal \textit{in vivo} around the \textit{VEGFA} gene with the VEGFA sgRNA}. 
}
\label{FigS18}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS13-heatmap-Nanog-sg2.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \textit{In vitro} dCas9 and Cas9 CasKAS profiles for the ``Nanog-sg2'' sgRNA}. CasKAS profiles are shown for all off-target sites predicted by Cas-OFFinder as well as for the top 1000 sites (ranked by CasKAS RPM values over the $\pm$500bp region around the sgRNA target site). 
}
\label{FigS13}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS14-heatmap-Nanog-sg3.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \textit{In vitro} dCas9 and Cas9 CasKAS profiles for the ``Nanog-sg3'' sgRNA}. CasKAS profiles are shown for all off-target sites predicted by Cas-OFFinder as well as for the top 1000 sites (ranked by CasKAS RPM values over the $\pm$500bp region around the sgRNA target site). 
}
\label{FigS14}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS15-heatmap-EMX1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \textit{In vitro} dCas9 and Cas9 CasKAS profiles for the ``EMX1\_Tsai'' sgRNA}. CasKAS profiles are shown for all off-target sites predicted by Cas-OFFinder as well as for the top 1000 sites (ranked by CasKAS RPM values over the $\pm$500bp region around the sgRNA target site). 
}
\label{FigS15}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS16-heatmap-VEGFA.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \textit{In vitro} dCas9 and Cas9 CasKAS profiles for the ``VEGFA-site1'' sgRNA}. CasKAS profiles are shown for all off-target sites predicted by Cas-OFFinder as well as for the top 1000 sites (ranked by CasKAS RPM values over the $\pm$500bp region around the sgRNA target site). 
}
\label{FigS16}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\begin{minipage}[c]{0.70\linewidth}
\includegraphics[width=11.75cm]{FigS9-sgRNA-off-targets-Nanog-sg2.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.30\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Multiple sequence alignment of off-target sites identified by \textit{in vitro} dCas9 and Cas9 CasKAS for the ``Nanog-sg2'' sgRNA}. Shown are the top 100 off-target sites as predicted by Cas-OFFinder and ranked by CasKAS signal. The on-target site (if within the top 100) is highlighted in yellow. 
}
\label{FigS9}
\end{minipage}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\begin{minipage}[c]{0.70\linewidth}
\includegraphics[width=11.75cm]{FigS10-sgRNA-off-targets-Nanog-sg3.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.30\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Multiple sequence alignment of off-target sites identified by \textit{in vitro} dCas9 and Cas9 CasKAS for the ``Nanog-sg3'' sgRNA}. Shown are the top 100 off-target sites as predicted by Cas-OFFinder and ranked by CasKAS signal. The on-target site (if within the top 100) is highlighted in yellow. 
}
\label{FigS10}
\end{minipage}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\begin{minipage}[c]{0.70\linewidth}
\includegraphics[width=11.75cm]{FigS11-sgRNA-off-targets-EMX1.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.30\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Multiple sequence alignment of off-target sites identified by \textit{in vitro} dCas9 and Cas9 CasKAS for the ``EMX1\_Tsai'' sgRNA}. Shown are the top 100 off-target sites as predicted by Cas-OFFinder and ranked by CasKAS signal. The on-target site (if within the top 100) is highlighted in yellow. 
}
\label{FigS11}
\end{minipage}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\begin{minipage}[c]{0.70\linewidth}
\includegraphics[width=11.75cm]{FigS12-sgRNA-off-targets-VEGFA.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.30\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Multiple sequence alignment of off-target sites identified by \textit{in vitro} dCas9 and Cas9 CasKAS for the ``VEGFA-site1'' sgRNA}. Shown are all target sites with RPM $\geq$ 1.5 as predicted by Cas-OFFinder and ranked by CasKAS signal. The on-target site (if within the top 100) is highlighted in yellow. 
}
\label{FigS12}
\end{minipage}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS8-peak-shape.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CasKAS identifies proper off-target sites that are missed by sgRNA prediction algorithms}. Shown is \textit{in vitro} dCas9 CasKAS for the ``sgRNA \#1'' sgRNA. Peaks were called \textit{de novo} using MACS2, then intersected with Cas-OFFinder off-target prediction, and the outersect was manually filtered to exclude obvious artifacts based on peak shape (e.g. arising from repetitive elements in the genome). 
(a) Aggregate forward- and reverse-strand profiles around off-target sites predicted by Cas-OFFinder (centered on the sgRNA);
(b) Aggregate forward- and reverse-strand profiles around sites not predicted by Cas-OFFinder (centered on the MACS2 peak summit);
(c) Example UCSC Genome Browser snapshot of a CasKAS read profile around an off-target site predicted by Cas-OFFinder;
(d) Example UCSC Genome Browser snapshot of a CasKAS read profile around an off-target site not predicted by Cas-OFFinder. Both the predicted sites and the sites identified through peak calling exhibit the expected asymmetric read distribution around a fixed occupancy point (the sgRNA-dCas9 RNP complexed with DNA).
}
\label{FigS8}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\begin{minipage}[c]{0.50\linewidth}
\includegraphics[width=5.25cm]{FigS17-sgRNA-off-targets-sgRNA1-other-peak-calls.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.50\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Multiple sequence alignment of off-target sites identified by \textit{in vitro} dCas9 and Cas9 CasKAS for the ``sgRNA \#1'' sgRNA outside the list of predicted off-targets by Cas-OFFinder}. MACS2 peak calls were manually filtered to exclude artifactual peaks, then the sequence of the $\pm$50-bp region around the peak summit was used as input to the multiple sequence alignment, together with the sgRNA itself.
\label{FigS17}}
\end{minipage}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\begin{minipage}[c]{0.70\linewidth}
\includegraphics[width=12cm]{FigS21-VEGFA-cutting.png}
\end{minipage}\hfill
\begin{minipage}[c]{0.30\linewidth}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Cutting profiles around on- and off-target sites for the VEGFA sgRNA}. Four sites where cleavage is observed are identified within the list of predicted off-targets.
}
\label{FigS21}
\end{minipage}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS19-GUIDE-seq-ChIP-seq-comparison.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Comparing \textit{in vitro} dCas9 results to ChIP-seq and CHANGE-seq for off-target profiling}. Shown is the overlap between MACS2 peak calls for the Nanog-sg3 sgRNA with a Nanog ChIP-seq dataset (SRR1168384 from GEO accession ID GSE54745) in (a) and the EMX1 sgRNA with EMX1 CHANGE-seq (SRA accession SRX8227890) in (b). The fraction of peaks common or unique to each assay that are predicted to be off-targets for each sgRNA by Cas-OFFinder is shown in (c).
}
\label{FigS19}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS3-sgRNA-G-content.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Most sgRNAs in the human genome contain multiple G nucleotides and are thus subject to labeling by N$_3$-kethoxal}. Statistics were calculated for all valid sgRNAs as defined by GuideScan\cite{Perez2017}.
(a) Cumulative fraction of sgRNAs.
(b) Absolute number of sgRNAs.
}
\label{FigS3}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{FigS6-pre-sheared-EMX1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf CasKAS can be performed on pre-sheared DNA}. CasKAS was performed \textit{in vitro} using the EMX1 sgRNA, first, conventionally, by carrying out the CasKAS reaction, then isolating and shearing genomic DNA, and also by pre-shearing the DNA and carrying out the CasKAS reaction on the fragmented DNA. The concern in that case is that the presence of sticky ends that contain Gs and are unprotected from the action of the N$_3$-kethoxal would raise the background. This problem can be addressed by carrying out end repair on the sheared DNA prior to the CasKAS reaction. 
}
\label{FigS6}
\end{figure*}

\end{document}