\documentclass[10pt]{article}
\usepackage[hmargin=1.5cm,top=1.8cm,bottom=1.8cm]{geometry}
\usepackage{multicol}
\setlength\columnsep{15pt}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{array}
\usepackage{booktabs}
\usepackage{tabularx}
\usepackage[auth-sc]{authblk}
\usepackage{longtable}
\usepackage{multirow}
\usepackage{hyperref}
\usepackage{enumerate}
\usepackage[labelfont=bf]{caption}
\usepackage[usenames,dvipsnames]{xcolor}
\usepackage{mdframed}
\usepackage{graphics}
\usepackage{multirow}
\usepackage{rotating}
\usepackage{array}
\usepackage{capt-of}
\usepackage{lscape}
\usepackage{caption}
\usepackage{breakurl}
\usepackage{todonotes}
\usepackage{hanging}
\usepackage{pagecolor}
\usepackage[final]{pdfpages}
\usepackage[leftFloats,CaptionAfterwards]{fltpage}
\usepackage[numbers,super,sort&compress]{natbib}
\setlength{\bibsep}{0pt plus 0.3ex}
\usepackage{abstract}
\usepackage{enumitem}
\usepackage{soul}
\usepackage{titlesec}
\titleformat{\section}[block]{\large\bfseries\filcenter}{\thesection.}{0.4em}{}
\titleformat{\subsection}[block]{\normalsize\sc\bfseries\filcenter}{\thesubsection.}{0.4em}{}
\titleformat{\subsubsection}[block]{\normalsize\sc\itshape\filright}{\thesubsection.}{0.4em}{}
\setcounter{secnumdepth}{5}

\usepackage[hang]{footmisc}
\setlength\footnotemargin{0em}

\setlength{\skip\footins}{0.75cm}

\makeatletter
\renewcommand\footnoterule{%
  \kern-3\p@
  \hrule\@width \textwidth height 1.5pt
  \kern2.6\p@}
\makeatother

\makeatletter
\def\@biblabel#1{\@ifnotempty{#1}{#1.}}
\makeatother

\newcommand{\filllastline}[1]{
\setlength\leftskip{0pt}
\setlength\rightskip{0pt}
\setlength\parfillskip{0pt}
#1}

\newenvironment{Figure}
{\par\medskip\noindent\minipage{\linewidth}}
{\endminipage\par\medskip}


\title{\bf Reconstructing the \textit{cis}-regulatory landscape of archaic hominids}
\renewcommand\Authfont{\scshape\normalsize}
\author[1] {Aman Patel}
\author[2]{Georgi K. Marinov}
\author[1,2]{Anshul Kundaje}
\renewcommand\Affilfont{\itshape\normalsize}
\affil[1]{Department of Computer Science, Stanford University, Stanford, CA 94305, USA}
\affil[2]{Department of Genetics, Stanford University, Stanford, CA 94305, USA}
% \affil[$\#$]{Corresponding author}
% \affil[*]{These authors contributed equally}
\date{}

\begin{document}
\maketitle

% \centerline{}
% \centerline{}
\begin{abstract}

\noindent {\normalsize \textbf{The ability to sequence genomes from samples dating back many thousands of years has transformed our understanding of human evolution and migration. However, the wealth of information contained in the sequences obtained from ancient samples has not yet been fully utilized, as the impact of the variation observed in these genomes relative to that in modern humans is still poorly understood. In particular, despite the crucial role of \textit{cis}-regulatory elements in the regulation of gene expression, variants associated with them have so far received very little attention. In this study we apply interpretable deep learning models trained on a vast collection of human chromatin accessibility datasets to examine the cell-type specific functional effects of sequence variants unique to Neanderthals and Denisovans. We identify 4,081 putative functionally variants, nearby 2,063 genes that are enriched for metabolic, neural development, skeletal morphology, and numerous other functions and phenotypes, thus greatly expanding the list of genes that might underly the phenotypic differences between ancient and modern humans. We utilize feature attributions to identify the likely transcription factor binding motifs/sites (TFBMs/TFBSs) altered by ancient human variants, observing instances of both \textit{de novo} motif creation and disruption. These variants are predicted to both increase/decrease and create/abrogate cRE activity, and we also observe numerous cases of complex synergistic combined effects for multiple clustered sequence variants, with positive, negative or mutually buffering effects on chromatin accessibility. Our efforts demonstrate the utility of interpretable deep learning methods in the study of ancient genomes, and provide a comprehensive dataset of candidate functionally relevant archaic non-coding sequence variants for future experimental investigation.}.
}
\centerline{}
\centerline{}
\end{abstract}

\begin{multicols}{2}

\section*{Introduction}

A series of technological advances over the last few decades have enabled the recovery of first, the sequences of hort and abundant DNA molecules, in particular mitochondrial genomes\cite{Krings1997,Krings2000,Ovchinnikov2000,Schmitz2002,Caramelli2003,Serre2004,Paabo2004,Briggs2007}, then partial whole genome sequences\cite{Burbano2010,Green2010}, and eventually high-coverage genomes\cite{Prufer2014,Meyer2012,Prufer2017} from ancient DNA, dating as far back as several tens of thousands years ago. Thousands of ancient genomes are now available from more recent historical samples together with a handful of archaic human genomes, and the wealth of information contained in them has dramatically reshaped our understanding of human evolution and migration patterns\cite{Reich2018}.

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=16cm]{Fig1V6.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicting the \textit{cis}-regulatory effects of archaic human variants}. 
(A) Phylogenetic relationships between modern humans and the archaic humans analyzed in this study\cite{Stringer2012,Schlebusch2017,Patterson2006}. The star indicates the point after which archaic-specific variants diverge from the modern human lineage.
(B) Most archaic human sequence variants are non-coding.
(C) Predicting the \textit{cis}-regulatory of sequence variation using ChromBPNet.
(D) Predicting the effects of archaic human variants on chromatin accessibility based on a large collection of \hl{1,078} human ChromBPNet models.
}
\label{Fig1}
\end{center}
\end{figure*}

Multiple partial and high-coverage genomes have been reported for Neanderthals\cite{Burbano2010,Green2010,Prufer2014,Mafessoni2020,Prufer2017,Slon2018,Petr2020} while in the process of sequencing further archaic skeletal remains the Denisovans, a separate lineage of archaic humans that went extinct around the same time Neanderthals did (40-50,000 years ago) were discovered\cite{Reich2010,Meyer2012,Sawyer2015,Slon2017,Slon2018,Petr2020}. A number of other human genomes of similar age but belonging to the modern human lineage have also been sequenced\cite{Fu2014,Meyer2016,Fu2016}.

Analysis of these sequences has uncovered previously unexpected levels of interbreeding and bidirectional genetic material flow between the different hominid lineages that were contemporaneous in Eurasia 40--60,000 years ago\cite{Abi-Rached2011,Reich2011,Kuhlwilm2016,Vernot2016,Slon2018,Jacobs2019,Massilani2020,Larena2021,Sankararaman2016}, an ancestry that still persists in some modern human populations, which posses an appreciable amount of Neanderthal and Denisovan genetic material. Most non-African humans have 1-4\% Neanderthal DNA. Denisovan DNA is found in several Asian populations, particularly Melanesians, in which it comprises approximately 4-6\% of the genome. Neanderthal and Denisovan ancestry has likely had substantial phenotypic effect on modern humans. Neanderthal-derived sequences have been suggested to impact neurological development and disease, metabolism, the immune system, hair and skin features\cite{Simonti2016,McCoy2017,Sankararaman2014,Khrameeva2014,Castellano2014}, and even susceptibility to severe COVID-19\cite{Zeberg2020}. Denisovan-derived genes have been proposed to convey essential hypoxia resistance to the Tibetan population of the Himalayas\cite{Huerta2014}.

However, the wealth of information contained in ancient human sequences has not yet been fully utilized. Most studies have focused on charting the phylogenetic relationships between and the migration patterns of various archaic and ancient human groups. The impact of genetic variation that is either shared with moderns humans or unique to extinct hominid groups has primarily been investigated at the level of protein coding genes. The effects of that variation on gene expression and organism-level phenotypic features are still a largely unexplored area even though they hold the key to answering the crucial questions regarding the genetic basis for the differences between distinct hominid lineages. As a longer-term goal, if these phenotypes can eventually be reliably predicted from genomic sequence alone, it might even be possible to infer the phenotypic features of ancient and archaic humans that are not readily observable in available paleontoligcal remains\cite{Brand2022b}, which are also often partial and incomplete (e.g. Denisovans are known only from extremely fragmentary material consisting of individual bones and teeth\cite{Reich2010,Bennett2019,Krause2010,Sawyer2015,Slon2017,Demeter2022,Brown2016,Chen2019}).

To fill this gap, it will be essential to integrate both coding and non-coding variation, as the expression of mammalian genes is regulated in a cell-type specific manner by the activity a large number of cognate \textit{cis}-regulatory elements (CREs)\cite{ENCODE2011,ENCODE2012,ENCODE2020}. However, up until now little attention has been paid to the variation in ancient and archaic humans associated with CREs, in large part due to the difficulty of interpreting the impact of such sequence variants from sequence alone. As it is impossible to obtain live biological material from ancients samples, and therefore measuring gene expression, chromatin accessibility, and histone modifications using common assays such as RNA-seq, ATAC-seq and ChIP-seq is not an option\cite{Gokhman2014,Gokhman2019,Colbran2019}, the only way to assess such effects is either through directly installing the relevant mutations into cell lines using genomic engineering and then applying the above-mentioned techniques, or by using synthetic constructs and carrying out functional assays that measure a reporter gene expression\cite{Weyer2016,Weiss2021}. However, these methods are some combination of expensive and low-throughput, and, most importantly, they can only be applied to cell lines in culture, while the phenotypic impacts of the sequence variants in question may only manifest themselves in a small fraction of the hundreds of cell types in the human body. Thus it is largely impossible to reconstruct most archaic human phenotypes through experimental methods alone. Reliable computational predictions of such impacts are therefore an invaluable tool for dissecting regulatory sequence variation.

Some work in this direction has been carried out in recent years. Gokhman et al.\cite{Gokhman2014} utilized the characteristic decay patterns of methylated and unmethylated cytosines to regenerate DNA methylation maps of Neanderthal skeletal tissue. When considering regions that are differentially methylated between Neanderthals and modern humans, associations with certain enhancer regions and disease-mediated genes were found. Combined with computational predictions based on the effect of loss-of-function mutations in modern humans, this method was later used to infer the face of the first Denisovan specimen ever discovered\cite{Gokhman2019}.

More recently, deep learning methods that have been developed and trained on experimental data for modern humans have been used to infer the 3D genome organization\cite{McArthur2022,Gilbertson2023} and the alternative splicing landscape\cite{Brand2022} in archaic humans. 

Some work has also been directed at \textit{cis}-regulatory variation. PrediXcan, a linear model that uses a gene's nearby SNPs to predict its expression levels was used to predict associations between regions of differential expression and traits such as bone morphology, mental disorders, skin pigmentation, and brain development\cite{Colbran2019}. Another direction has been to leverage eQTL results from modern human cohorts to identify the phenotypic associations of Neandertal variants\cite{Yermakovich2021}.

However, the full power of currently existing tools has not yet been applied to the topic, and past studies suffer from several shortcomings. First, no approach has shown the ability to finely dissect the mechanisms of action of genetic variation. Even for variants identified to convey an effect on regulation, the actual biological function of this variant -- e.g. disrupting the binding site of a particular transcription factor (TF) -- has so far remained unknown. Second, there is significant room for improvement in the predictive power of current models as e.g. linear models are severely limited in the complexity of data they can effectively model. Finally, such work has not yet been conducted on a comprehensive list of human cell types, thus is limited in its ability to identify all phenotypically relevant impacts of genetic variation.

Interpretable deep learning methods have now proven themselves as extremely powerful tools for dissecting the \textit{cis}-regulatory effects of sequence variation\cite{Zhou2015,Kelley2016,Zhou2018,Kelley2018,Zhou2019,Maslova2020,Avsec2021,Nair2019,Avsec2021B,Ameen2022,Yuan2022,Kim2021,Trevino2021,Kelley2020,Fudenberg2020}. More recently, ChromBPNet -- a residual convolutional neural network which predicts chromatin accessibility profiles from DNA sequence -- was shown to be extremely effective at both predicting and interpreting the sequence determinants of chromatin accessibility, and has been applied to thousands of cell lines samples, tissues and cell types forming the most extensive compendium of candidate \textit{cis}-regulatory element (cCRE) predictions available to date \hl{XXX CITE ChromBPNet PAPER/preprint XXX}. Combined with deep learning interpretability tools\cite{deeplift,shap}, ChromBPNet can accurately identify the TF binding motifs that drive chromatin accessibility and predict the effects of sequence variation within these motifs. 

In this study, we apply ChromBPNet models to probe the cell type-specific effects of 44,507 fixed single nucleotide polymorphisms (SNPs) that differentiate archaic humans from modern human lineages. We calculate the impacts that each variant has on predicted chromatin accessibility, and we analyze feature attributions to evaluate the biological mechanisms by which each variant conveys its effects. Finally, by examining genes nearby to important variants, we associate alterations at these loci with potential phenotypic consequences. Overall, we seek to create a comprehensive catalog for the cell-type specific regulatory effects for the genetic positions that differ between modern and archaic humans. 

\section*{Results}

\subsection*{Predicting the \textit{cis}-regulatory effects of archaic human variants}

\begin{figure*}
\begin{center}
\includegraphics[width=18cm]{Fig2-V8.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Sequence variants predicted to affect chromatin accessibility in archaic humans relative to modern humans}. 
(A) Hierarchical clustering of signed --$log$10($p$-value) scores for predicted accessibility levels in archaic variants in all \hl{1,078} ChromBPNet atlas models. The tissue and cell types to which the samples on which  ChromBPNet models were trained are shown on top.
(B) Distribution predicted accessibility effects along the genome. Outer ring shows the largest predicted effect for each SNV (as a --$log$10($p$-value) score). Middle ring shows enrichment/depletion of SNVs with minimum (across all models) --$log$10($p$-value) $\leq 0.001$ normalized to chromosome size. Inner ring shows enrichment/depletion of SNVs normalized to the number of all variants per chromosome.
(C) Distribution of the largest predicted effects of all \hl{44,507} archaic SNVs.
(D) Distribution of --$log$10($p$-value) $\leq 0.001$ variants with respect to ENCODE annotated cCREs.
(E) Number of nearest genes to a significant SNV (including genes for which effects in both directions are predicted).
(F) Number of significant SNVs per gene.
(G) Number of cCREs with significant SNVs (cCREs are defined by combining SNVs if $\leq$ 500 bp from each other) per gene.
}
\label{Fig2}
\end{center}
\end{figure*}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{Fig9-coding-non-coding-overlap.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Archaic and modern genomes are distinguished by more putative regulatory variants than nonsynonymous coding variants}. 
(A) Number of non-coding variants with strong regulatory effects predicted by ChromBPNet by gene (defined as the nearest TSS to the variant) versus the number of nonsynonymous coding variants (archaic derived and modern derived, as defined previously by Pr\"ufer et al.\cite{Prufer2014});
(B) Overlap between genes associated with putative regulatory variants and genes containing archaic derived nonsynonymous coding variants (as defined previously by Pr\"ufer et al.\cite{Prufer2014});
(C) Overlap between genes associated with putative regulatory variants and genes containing modern human derived nonsynonymous coding variants (as defined previously by Pr\"ufer et al.\cite{Prufer2014}).
}
\label{Fig9}
\end{center}
\end{figure*}

We began our analysis of the \textit{cis}-regulatory effects of archaic human variants by compiling a set of % 51,109 
\hl{44,507} single-nucleotide variants (SNVs) that differ between archaic and modern humans (Figure \ref{Fig1}A). These variants were defined as the set for which the four high-coverage archaic genomes - the Altai Denisovan\cite{Meyer2012} and the Altai\cite{Prufer2014}, Vindija\cite{Prufer2017}, and Chagyrskaya\cite{Mafessoni2020} Neanderthals -- are all homozygous for a common minor allele that was absent from all genomes in the 1000 Genomes Project\cite{1000Genomes2012}. Notably, 95.3\% of such variants lie outside coding regons in the genome  (Figure \ref{Fig1}B), underscoring the importance of analyzing non-coding variation and its potential regulatory effects

In parallel, we created a vast collection of interpretable deep learning chromatin accessibility models using the ChromBPNet framework (described in detail elsewhere \hl{XXX CITE XXX}). In brief, the rationale behind ChromBPNet is based on the fact that DNase-seq and ATAC-seq datasets contain not only information about the general location of regions of open chromatin, with nucleosomal DNA being refractory to DNase I cleavage and Tn5 insertion, but are also much deeper and richer as TF occupancy of DNA also blocks enzymatic cleavage, and as a result TF footprints are identifiable in DNase-seq and ATAC-seq data when the insertion/cleavage maps are examined\cite{Hesselberth2009,Neph2012,Buenrostro2013}. ChromBPNet takes advantage of this property by modeling the detailed insertion/cleavage profile as a function of DNA sequence analogously to the way BPNet\cite{Avsec2021} models ChIP-exo/ChIP-seq profiles around TF binding sites (TFBSs) as a function of sequence, but with an additional very important DNase/Tn5 bias modeling component (Figure \ref{Fig1}C). See the Methods section for details on the exact ChromBPNet implementation.

The ENCODE Consortium Project has generated a vast collection of DNase-seq/ATAC-seq datasets, providning nearly complete coverage of human tissues and cell types\cite{ENCODE2011,ENCODE2012,ENCODE2020}. We built a human chromatin accessibility atlas of interpretable ChromBPNet models for all these datasets (in total, \hl{1,078} individual models, corresponding to \hl{439} unique cell types/tissues), which in turn allowed us to evaluate the predicted regulatory effects of archaic human variants (Figure \ref{Fig1}C). To this end, we generated ChromBPNet predictions for both overall accessibility levels and detailed insertion/cleavage profiles for all ENCODE tissues/cell types for each pair of archaic and modern human alleles. We also generated scores for each pair of alleles based on the level of difference between the predicted profiles for the modern and archaic sequences.  

Predictions and scores were calculated in two ways. First, each variant was considered in isolation, meaning all scores calculated were the result of substituting the modern and archaic alleles at only the specific position of the variant. This method resulted in % 51,109 
\hl{44,506} sets of scores and predictions, one for each variant. Second, to test for additive and combinatorial effects, we made substitutions not only for the position in question but also for all other variants that happened to fall within the same model input window. \hl{We then aggregated scores across each unique combination of variants to share a window, thus resulting in scores for a total of 41,592 unique windows - \textbf{can delete this part - we are not doing this anymore}}.

Hierarchical clustering of the predicted effects on overall accessibility levels (Figure \ref{Fig2}A) revealed several distinct patterns of regulatory variation across cell types and tissues. Importantly, it also returned the expected structure in cell type/tissue space, i.e. variants exhibit similar predicted effects in the same or highly similar cell types, underscoring the robustness of our analysis framework.

Archaic human sequence variants are predicted to both increase and decrease chromatin accessibility, in roughly equal proportions (Figure \ref{Fig2}C). We set a threshold of $min(-log_{10}p) \geq 2.7$ (or $max(p) \leq 0.002$) over all ChromBPNet models as one signifying a variant with a strong effect on accessibility.

Next, we examined the genomic distribution of regulatory divergence between archaic and modern humans (Figure \ref{Fig2}C). Individual variants with strong predicted effects on accessibility are found on all chromosomes, but are particularly enriched on chrX, and also on chr8; this pattern reflects the general enrichment of ancient human variants over these chromosomes.

We also compared the distribution of archaic variants with strong predictive effects with respect to the ENCODE collection of annotated cCREs\cite{ENCODE2020}. Somewhat surprisingly, most such variants are found outside the main classes of annotated ENCODE cCREs (Figure \ref{Fig2}D). The remaining are found predominantly in predicted distal enhancer elements, reflecting the overall abundance of such elements in the genome.

We associated archaic cCRE variants with their likely cognate gene based on physical proximity, resulting in a set of 2,213 such genes. Intriguingly, in the vicinity of about a quarter of these genes both variants that are predicted to increase and other variants predicted to decrease accessibility are found  (Figure \ref{Fig2}E). For $\sim$40\% of genes, more than one significant variant is found nearby, with 11.7\% associated with $\geq$ 4 such variants (Figure \ref{Fig2}F).

We also observe frequent proximal clustering of significant variants on the scale of a single cCREs (Figure \ref{Fig2}G). These observations we discus in more detail further below.

\subsection*{Archaic \textit{cis}-regulatory variant create and disrupt TFBSs}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{Fig4.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Archaic \textit{cis}-regulatory variants create and disrupt TFBSs}. 
}
(A) \hl{describe each -- need to list which model this is, and which cell line/tissue it is from}
(B) 
(C) 
(D) 
(E) 
(F) 
(G) 
(H) 
(I) 
(J) 
(K) 
(L) 
(M) 
(N) 
(O) 
(P) 
(Q) \hl{describe what is shown}
\label{Fig3}
\end{center}
\end{figure*}

\begin{figure*}
\begin{center}
\includegraphics[width=18.5cm]{Fig5-effect-types.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf \hl{Predicted profiles and importance scores for notable multiple-variant neighborhoods}}. 
(A) 
(B) 
(C) 
}
\label{Fig5}
\end{center}
\end{figure*}

In order to understand in finer detail the mechanisms underlying the predicted changes in chromatin accessibility, we leveraged the interpretability  of ChromBPNet models. \hl{ChromBPNet is designed to learn the set of transcription factor binding motifs that drive chromatin accessibility in the system in question. Motifs driving individual predictions can be detected through the use of deep learning feature attribution methods like DeepSHAP, which assign scores to each input nucleotide based on its importance in producing the output prediction. Nucleotides comprising motifs are therefore assigned high scores, while all other nucleotides are assigned low scores. For each variant, we obtained importance tracks for both alleles, using the model for which the variant produced the lowest p-value. We then classified the motif found in the importance tracks by comparing it to an established list of motifs found in the cell type in question. These comparisons are conducted using Finemo, a tool which utilizes convolutions to scan an input sequence for instances of each supplied motif. Using these steps, we can not only determine the identity of the motif in question, but also whether the binding site exists in the modern or archaic sequence}. 

We found \hl{{2,050}} instances of TFBMs that appear to be disrupted in ancient humans relative to modern populations and another \hl{{2,184}} instances of TFBMs that are created as a result of sequence variation in archaic humans and that are predicted to be associated with corresponding alterations in chromatin accessibility. We show representative examples for the CTCF (Figure \ref{Fig4}A-D), Ets  (Figure \ref{Fig4}E-H), IRF  (Figure \ref{Fig4}I-L), and Sp1/2 (Figure \ref{Fig4}M-P)  transcription factors. Overall, we can readily map relevant for chromatin accessibility sequence variation to alterations in $\sim$ 50 known TF motifs. Importantly, these can be grouped into functionally coherent clusters (Figure \ref{Fig4}Q). While the creation or disruption of CTCF motifs is predicted to affect chromatin accessibility in all cell lines and tissues, consistent with its ubiquitous role as the major insulator protein in mammalian genomes, and Sp1 factors, also acting nearly universally across many cell types, follow an analogously broad fashion, other TFs are associated with particular cell lineages (i.e. IRF, ETV, Ets, KLF, RUNX SPI, and other in T/B-cell lineages, SOX motifs in brain samples, and others).

\subsection*{Functional enrichment of archaic \textit{cis}-regulatory variants}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{Fig3.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Functional enrichment of archaic \textit{cis}-regulatory variants}. 
(A) GREAT\cite{GREAT} enrichment of Human Phenotype categories over all significant archaic variants.
(B) K-means clustering ($k=10$) of predicted variant effects.
(C) GREAT\cite{GREAT} enrichment of Human Phenotype categories over each cluster.
}
\label{Fig3}
\end{center}
\end{figure*}

\hl{XXXXXXX CURRENTLY UP TO HERE XXXXX}

For the purposes of our functional studies, we defined a variant to appreciably affect chromatin accessibility if, for any cell type, the variant produced a Jensen-Shannon Divergence (JSD) or log fold-change (LogFC) that fell within the top 0.1\% of all values. In practice, this method resulted in thresholds of 0.128 for JSD and 0.718 for LogFC. Of the 51,109 SNPs we examined, a total of 5,285 passed at least one of these thresholds in at least one tissue type in our single-variant score dataset (Figure \ref{Fig1}D). These variants are mostly uniformly distributed along the genome, with a few exceptions (Figure \ref{Fig2}A-B) -- particularly strong enrichment for such variants was seen over chromosome X and also on chromosome 8, while an under-enrichment was observed on chromosome 14. 

Of these 5,285 variants, we find that 2,743 increase accessibility in archaic humans relative to modern humans, and 2,709 result in lower accessibility in archaic humans. From these numbers, we can also determine that 167 variants cause opposite effects in different cell types. (Figure \ref{Fig2}C-D). 

We first asked which genes putative functionally relevant variant are associated with. We identified 5,617 unique protein-coding genes located in proximity to -- and therefore possibly regulated by -- the CREs containing the 5,285 variants evaluated as significant by us. Functional enrichment was evaluated using the gProfiler tool\cite{gProfiler} (see Methods for details) utilizing two different background distributions. 

\hl{XXX I have not edited further below, it needs figures}

\hl{3A(B,C,etc.) -- show functionally enriched categories} 

\hl{rest of figure 3 -- show the motifs that are enriched; make a clustergram with motifs enriched in each tissue/cell type; See Figures 2I, S3A, etc. from the heart scATAC paper}

First, we considered the background of all human genes. Enrichment against this background would indicate the overall list of functions or pathways for which the relevant variants convey a disproportionate effect. The genes appeared to be heavily enriched for several ontology terms corresponding to the nervous system, particularly relating to neurogenesis and development. This result helps confirm the findings of several prior studies, which have already identified key nervous system differences between modern and archaic humans \cite{Colbran2019, Simonti2016, Weiss2021, McCoy2017, kochiyama_brain_morphology}. Several terms concerning metabolism were returned as well, including metabolism of a variety of different organic compounds. These terms are perhaps indicative of dietary differences between archaic and modern hominids. The general process of anatomic structure development was also heavily enriched, likely corresponding to the known morphological differences between modern and archaic humans. Finally, the analysis returned several terms concerning assorted cellular and regulatory processes, particularly relating to transcriptional regulation.   \\

The second background consisted of the list of genes in proximity to all 51,109 archaic variants. Enrichment detected against this background indicates whether high-scoring loci exhibit different patterns of localization than the overall set of archaic variants. With this query, enrichment was localized in two main areas. First, as before, several terms related to brain and nervous system development were returned, once again indicating clear differences in these areas between modern and archaic humans. Enrichment was also detected for a variety phenotypes relating to skeletal morphology. These phenotypes were particularly concentrated in the head region, including the forehead, face, nose, skull, and mouth, but also encompassed several other areas of the body. Of course, it is well known that archaic and modern humans possessed significant morphological differences, particularly regarding skeletal structure \cite{neanderthal_skeletal_review}. Thus, these findings fit perfectly with expectation. The strong presence of lips in the list is also notable, as it indicates a morphological difference that may be less evident from the skeletal record.    

Next, to uncover the global patterns of functional effect distribution with respect to cell and tissue types, we performed hierarchical clustering on the predicted logFC effects for our 41,592 unique input windows. (Figure \ref{Fig2}A). The clustering uncovered clear groupings on both axes. \\
We theorized that these clusters are largely driven by variants located within motifs for the same groups of transcription factors, many of which exhibit distinct cell type-specific activity. To verify this assertion, we utilized model importance scores to label the motif occupied by each variant, if applicable. This process was conducted by first applying an automated method to locate the motif within the overall sequence, followed by using the open-source program TOMTOM to query against a list of labeled motifs learned by the model from which the importance scores were derived.\\
Indeed, the motifs enriched in each cluster largely followed established knowledge about transcription factor cell-type specificity. For example, multiple clusters exist in which the highest effect scores are observed in various immune cell types, including T cells, B cells, and lymphoblast cell lines. The most prominent predicted binding motifs in these clusters include those for the RUNX, IRF, and SPI families, which are all known to have specialized effects in the immune system. Conversely, the lowest effects are observed in neural cell types, in which these transcription factors are not expected to act. Other clusters demonstrate the opposite pattern, in which nervous system cells show the greatest effect and immune cells the least. Prominent motifs in these clusters include the SP1/2, NRF, and CTCF families, for which the cell type profile is once again intuitive. Clusters are also observed with the highest scores in placenta and trophoblast cells, indicating potentially interesting differences in the reproductive system between modern and archaic humans. These clusters are characterized by the presence of TEAD, AP2A, GCM, and GATA binding domains. The latter two families are especially known to have important roles in placental functions and trophoblast development. The next set of clusters involves high effects in stem cells, most prominently through transcription factors like SOX and NANOG, which are known to play vital roles in stem cell development and proliferation. Another cluster emphasizes effects in muscle cells and is enriched for the NFIX family of transcription factors, which are known to mediate muscle regeneration through myostatin interactions. Finally, a cluster is observed with focus on the digestive system and metabolism. Two of the most prominent transcription factors in this cluster are FOXA and HNF1B, both of which are known to have roles in metabolism and the development of the digestive tract.  \hl{(make supplement figures showing this in detail)}. \\
By inspecting these clusters, we obtain a powerful method for systematically characterizing the cell types and body systems most strongly affected by genetic differences between modern and archaic humans. Indeed, many of the systems uncovered, including the nervous, immune, reproductive, metabolic, and muscular, closely follow the existing literature. The presence of stem cell-related clusters is particularly interesting, as it indicates potential fundamental developmental differences between the two groups. By identifying the transcription factors driving each cluster, we are also able to suggest the mechanisms by which each system is affected, once again demonstrating the power of interpretable deep learning in the study of the noncoding genome.    

\hl{XXX MAKE A BLOW-UP FIGURE OF THE CLUSTERS AND THEIR SAMPLE IDENTITY EITHER FOR FIGURE2 OR IN FIGURE 3}


\subsection*{Important archaic SNPs are highly enriched on the X chromosome}

We next examined the chromosome-level distribution of the variants in our dataset. When the entire variant set was considered, we observed a massive enrichment in variants located on the X chromosome, as shown in \textbf{THIS FIGURE}. As also displayed, this trend was maintained when restricting the analysis to only high-scoring variants. This observation is perhaps not unexpected due to the higher rates of X-linked divergence predicted by the faster-X effect \cite{fasterx}. However, applying gProfiler on the set of genes nearby all high-scoring variants on the X chromosome produced highly distinctive results. When considering the background of all human genes, significant enrichment was observed for a variety of phenotypes in three major categories. First, many brain-related phenotypes were present, particularly relating to intellectual disability and abnormal activity. Of course, it is well known that archaic and modern humans possessed significant neurological differences, so this result fits well with expectation. Second, the results showed a variety of phenotypes related to skeletal morphology, particularly localized to the skull and the face. Once again, these findings reflect prior knowledge \cite{neanderthal_skeletal_review}. Additionally, as might be expected on the X chromosome, a variety of phenotypes related to the reproductive system were also returned. These three trends were only intensified when the background was changed to the list of genes near significant variants from all chromosomes. The skeletal morphology terms extended to more parts of the body, while the neurological and reproductive terms were very similar to before.  \\
Overall, even with respect to similar variants on other chromosomes, the high-scoring X chromosome variants clearly show enrichment for many of the most salient differences between modern and archaic humans. The X chromosome therefore appears to be an integral piece of the puzzle in understanding the evolutionary relationships between humans and our closest neighbors. 

It is interesting, however, to juxtapose these results with the established knowledge that archaic introgression in humans is actually drastically reduced on the X chromosome \cite{Sankararaman2014}. The most likely explanation is that while differences between the two groups on the X chromosome are frequent, they also, as shown, convey massive effects for many species-specific traits. Thus, admixture events of human and archaic X chromosomes are heavily selected against. \\
Our results therefore characterize and explain the great importance of inter-species variation on the X chromosome as a driver for species-specific differentiation.
  
\subsection*{Archaic SNPs convey important effects on a variety of disparate processes}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{Fig6-metabolic.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Examples of archaic cRE-affecting variants putatively influencing the expression of key metabolic genes}. 
(A) \textit{AMY2A} and \textit{AMY2B} \hl{XXX};
(B) \textit{UGT2B4} \hl{XXX}.
}
\label{Fig3}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{Fig7-developmental.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Examples of archaic cRE-affecting variants putatively influencing the expression of key developmental genes}. 
(A) \textit{ARHGAP29} \hl{XXX};
(B) \textit{HIPK1} \hl{XXX}.
}
\label{Fig3}
\end{center}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=18.5cm]{Fig8-developmental.png}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Examples of archaic cRE-affecting variants putatively influencing the expression of key developmental genes}. 
(A) \textit{NR0B1}/DAX1 \hl{XXX};
(B) \textit{SPRY3} \hl{XXX}.
}
\label{Fig3}
\end{center}
\end{figure*}

While our analysis methods allow us to extract powerful dataset-level insights, as described in the previous sections, they also allow us to probe effects at individual-variant resolution. For each variant, we can query our model attribution scores to identify the binding site containing it, we can analyze model predictions to determine how the presence of this binding site affects the local accessibility profile, and we can utilize gene annotations to determine the genes putatively impacted by the variant. In this manner, we can produce a clear characterization of each variant's mechanism of action. Here, we present several notable examples. \\

At position 104129642 on chromosome 1, we observe a $T\rightarrow{C}$ substitution in all archaic genomes as compared to the human reference. From querying the model attribution scores, we observe the creation of a binding motif for CTCF, one of the most prevalent transcription factors in human cells. The corresponding accessibility profile exhibits a peak, while none was present with the reference sequence. The nearest genes to this variant are AMY2A and AMY2B, two genes which encode a form of amylase - the most common human enzyme responsible for carbohydrate digestion - that is produced in the pancreas \cite{AMY2Paper}. As might be expected given the ubiquitous expression of CTCF, this substitution is predicted to produce a strong effect in a wide variety of cell types, including pancreatic cells. Thus, we observe creation of a transcription factor binding motif near an important dietary gene in the appropriate cell type. It is therefore highly possible that the effects on gene expression conveyed by this variant partially explain the dietary differences between modern and archaic humans. \\

As previously stated, modern and archaic humans are known to exhibit several key differences concerning the brain and central nervous system. We find one potential candidate at position 154994954 on the X chromosome. According to model attribution scores, the $A\rightarrow{T}$ substitution found in the archaic genomes results in the activation of two adjacent leucine zipper motifs, which trigger a significantly higher accessibility peak. This peak is predicted to exist in a wide variety of cell types, including several found in the brain, thus indicating its likely importance in transcriptional regulation in these tissues. SPRY3 - the nearest gene to this variant - has been implicated in intellectual development disorders, most notably autism \cite{SPRY32015, SPRY32019}. Another nearby gene, TMLHE, has known associations with multiple neurodevelopmental disorders, once again including Autism \cite{TMLHEAutism}. In summary, we once again observe the creation of a binding motif near important genes in the cell type of interest. This region may therefore contribute to the differences in nervous system development between modern and archaic humans.\\
Bone development and morphology represents another category with great divergence between the two groups. On position 116360038 of chromosome 8, the archaic genomes exhibit a $G\rightarrow{A}$ substitution, which results in the suppression of a leucine zipper binding motif. As a result, the predicted accessibility peak is correspondingly reduced. The nearest gene to this variant is TRPS1, which serves as a repressor for GATA-regulated transcription. As part of this process, TRPS1 is known to regulate development of bone and cartilage, the tissues that comprise the human skeleton \cite{TRPS1Paper}. The accessibility change is predicted to occur in a variety of connective tissues in the dataset. Therefore, we once again observe a potential alteration in regulation of a gene connected to a previously identified function.  
\textbf{(CCDC87 fertility, RFXAP immune, PLS3 bone, FLNA bone) 
}

\subsection*{Groups of SNPs exhibit synergistic effects in motif creation}

In most hits in our dataset, regardless of the number of variants present in a neighborhood, it is a single variant that primarily drives creation or deletion of a binding motif. However, in certain interesting cases, multiple variants in close proximity are required to produce the same effect. The presence and preservation of mutiple effectual variants is a likely indicator of the importance of the functions conveyed by these sites. 

A striking example can be found on chromosome 19, where archaic-specific variants are present at three distinct positions within four base pairs - 57186733, 57186734, and 57186736. When the archaic forms of all three variants are considered together, model interpretations indicate the creation of a binding motif for SPI1 - an important transcription factor in immune cell development \cite{SPI1Paper}. Indeed, variant scores are highest for models trained on a wide variety of immune cells, including several types of B cells, T cells, and macrophages. However, we do not observe motif creation when only one of these variants is substituted for its archaic form, indicating that multiple archaic variants are necessary to convey the observed effect. Among the genes nearby to this locus are two Zinc finger transcription factors (ZNF835 and ZNF71), indicating a possible role for the locus in transcriptional regulation. Therefore, we observe a likely example of multiple archaic variants acting synergistically to perform an important function in immune cells. 

Another example occurs on chromosome 10, where we find four mutations in a span of six base pairs - at positions 2682468, 2682471, 2682472, and 2682473. Once again, these variants convey negligible effects when considered individually. However, when considered together, model interpretation indicates the creation of a second BCL1 motif in addition to the one already present in the modern human sequence. As a result, the predicted accessibility profile possesses a much higher peak. Nearby genes include PITRM1, which has been implicated in the presense of neurodegenerative disorders including Spinocerebellar Ataxia, thus furthering the established narrative regarding brain-related differences between modern and archaic humans \cite{PITRM1Paper1, PITRM1Paper2, PITRM1Paper3}. 

We observe multiple further examples of this phenomenon, including strings of three consecutive variants on chromosome 9 and chromosome 16. It is of course highly unlikely for such clusters of variants to arise and become fixed by chance. Instead, these loci likely carry functional importance, a hypothesis furthered by our variant scoring and interpretation methods. Thus, these variant clusters serve as fascinating examples of divergent evolution between modern and archaic humans. 


\section*{Discussion}

In this study, we have utilized a large atlas of over 1,500 deep learning models to create a comprehensive set of archaic-specific chromatin accessibility tracks from virtually every cell type present in the ENCODE dataset. Through \textit{in silico} mutagenesis experiments, we have determined the effects putatively conveyed by archaic-specific variants in each of these cell types. Through application of deep learning interpretability procedures, we probed the specific mechanisms of action of each variant. These steps allowed for the analysis of archaic regulation across all bodily systems and functions while still achieving single-nucleotide resolution. \\
The application of these analysis methods uncovered several notable insights. When hierarchical clustering was performed on mutagenesis-based variant scores, the resulting tissue groupings resembled known organ systems. Enrichment analyses on genes near highly ranked SNPs returned terms corresponding to known differences between modern and archaic humans, including the nervous system, metabolism, and skeletal morphology, among many others. These differences were found to be pronounced on the X chromosome. We then utilized our base-level importance scores to probe the mechanisms of action of certain notable variants, and we isolated some with putative physiologically relevant functions. In addition, we discover multiple cases of synergistic effects between multiple variants in close proximity. In addition to our own findings, we hope the data we have produced can serve as a springboard for further studies of archaic human regulation. \\
This study serves as an illustration of the immense power of interpretable machine learning in uncovering genomic insights and relating them to epigenomic features. Of course, in the case of archaic humans or other ancient genomes, no epigenetic data exists at all. Imputation is the next best solution, and it can be performed effectively using state-of-the-art deep learning models trained on modern human data. These models are also ideal to perform \textit{in silico} mutagenesis experiments to gauge the effects of one of more archaic variants. Finally, through application of state-of-the-art deep learning interpretability methods, one can probe the genetic drivers behind regulation on a single-nucleotide level. In this manner, utilizing deep learning can vastly enhance the amount of information available regarding archaic genetics and epigenetics. Instead of simply having the genome available, we now have the ability to generate: accurate genome-wide cross cell-type epigenetic tracks, genome-wide variant-effect scores, and maps of the sequence motifs driving accessibility at each locus. Of course, this information will enable far more powerful insights than would be possible otherwise. For this reason, deep learning surely represents an essential tool for future ancient genome studies along with the general discipline of imputation of epigenetic data from sequence. \\
The same methodology can be applied to answer a wide variety of interesting related questions. Neanderthals and denisovans are not the only hominids for which we possess genomes but no corresponding regulatory data. The same can be said for \textit{Homo sapiens} from various points through history - in most cases, while DNA can be extracted from extant skeletal matter, it is similarly impossible for any epigenetic data to be preserved. However, through deep learning-based approaches, we can analyze in detail the epigenetic changes present between two groups of humans separated by time or geography. We can also identify common genetic changes between the two groups and utilize feature attribution to determine the effects these changes might convey. In this manner, we can more fully uncover the effects certain major events or population trends have imparted on the process of human evolution, thus answering fundamental questions about the nature of our species and our history. \\
More generally, with the appropriate models, this framework can be used as a powerful tool to compare population-level epigenetic differences between any two related groups, as well as the effects of genetic differences between these groups. Models can be trained on one group with known genetic and epigenetic data, and the analysis steps described in this study can then be used to compare the two groups. \\
Exploring various modeling approaches also represents another area of future exploration. Although the models utilized in this study are trained on modern \textit{Homo sapiens}, Neanderthals and Denisovans are sufficiently in-distribution that these models can be applied accurately. However, out-of-distribution concerns may be relevant when applying this framework to more distantly related species or groups. In this case, training schemes to account for this distribution shift will likely be necessary. Further study regarding the optimal training method for this task may be highly useful. \\
Overall, this study has uncovered a variety of insights regarding archaic-specific regulation, produced a rich dataset for further analysis, and suggested a variety of directions for future exploration. 

\section*{Methods}

\subsection*{Selection of archaic human-specific sequence variants}

\hl{CHECK IF STILL CORRECT}

The set of variants that differ between archaic and modern humans was obtained by utilizing all four high-coverage archaic genomes available at the time -- the Altai, Vindija, and Chagyrskaya Neanderthals, along with the Altai Denisovan\cite{Mafessoni2020,Prufer2017,Meyer2012,Prufer2014}. The variants found in these genomes were compared to the collection of all modern \textit{Homo sapiens} genome variation compiled by the 1000 Genomes Project\cite{1000Genomes2012}. The following two criteria for filtering were applied. First, all archaic genomes were required to be homozygous for the same minor allele. Second, that minor allele was required to not be present in the available modern genomes. The resulting variant set is likely to have arisen after the divergence between modern and archaic humans; it is therefore a potentially significant finding if substitution of the human version of a variant with the archaic one results in predicted modulation of chromatin accessibility.


\subsection*{ChromBPNet models}

ChromBPNet\hl{XXXCITEXXX} is an extension of the previously published BPNet architecture\cite{Avsec2021} designed to predict basepair-level chromatin accessibility profiles (ATAC-seq or DNse-seq) of a particular region given its input DNA sequence. These models specifically accept a 2,114 BP sequence input and predict the accessibility track over the middle 1,000 BP for the particular cell type the model was trained on.

The set of models utilized for this study consisted \hl{1,078} unique ChromBPNet models, each trained on a different ATAC-seq or DNase-seq sample from the most recent ENCODE Consortium collection\cite{ENCODE2020}. These models accept a one-hot encoded DNA sequence as input, first passing it through a single non-residual convolution layer followed by eight dilated residual convolution layers of increasing kernel size. The model then utilizes the output of this subnetwork to make two predictions. First, the convolution output is passed through a Global Average Pooling (GAP) layer followed by a fully connected layer to predict the total ATAC-seq or DNAse-seq read counts within the output region. Second, the convolution output is passed through another convolution layer with a large kernel size and only one output channel, thus producing the logits for the predicted probability profile of reads in the output region. To obtain the full predicted track for the output region, one can multiply the total predicted counts by the predicted probability at each position. The counts prediction is trained using mean squared error loss, while the profile prediction is trained using log-likelihood loss based on a multinomial distribution. 

A major difference between the standard BPNet framework and ChromBPNet is that the latter takes into account certain factors specific to DNAse-seq or ATAC-seq datasets. In both data types, the relevant enzymes do not cleave/insert uniformly into all DNA sequences; rather, substantial sequence bias exists\cite{He2014,Zhang2021,Li2019,Karabacak2019,Baek2017}. To uncover the true accessibility profile, it is necessary to correct for this bias. The correction is accomplished by pre-training another BPNet-style model on regions outside accessibility peaks, with the assumption that the read distribution in these regions is purely determined by enzyme bias. When training ChromBPNet models, we obtain predicted profile logits from this ``bias model'', and we add these to the predicted logits from the model being trained. The loss function for profile prediction is then calculated on this sum. In this manner, the bias model accounts for the contribution of enzyme bias to the overall observed accessibility profile, thus allowing the ChromBPNet model to learn the true unbiased profile. 

\subsection*{Variant scoring using ChromBPNet}

Due to its targeted nature and base-resolution input and output, the ChromBPNet framework is ideal for performing \textit{in silico} mutagenesis experiments. Specifically, by making alterations to the input sequence (e.g. variant substitution) and observing the changes to the predicted accessibility profile, we can make inferences about the importance of the altered positions as drivers of accessibility.  

To score the difference in predicted regulatory activity between archaic and modern alleles for each variant, we implemented the following procedure. For each variant, we extracted a sequence of length 2,114 bp centered around its position in the human reference genome. We then created another version of this sequence with the archaic allele substituted in for the modern one. All \hl{1,078} ChromBPNet models were then used to make accessibility profile predictions for both sequences. As stated, models specifically make two predictions -- the total ATAC-seq or DNAse-seq read \textit{counts} for a region, and the predicted probability distribution (or \textit{profile}) of reads across the region. We then compared predictions on the two sequences using the following metrics:

\begin{itemize}
    \item Log fold-change between total predicted counts
    \item Jensen-Shannon Divergence (JSD) between predicted profiles
\end{itemize}

Variants which scored high for either metric were flagged for further analysis. 

We also performed a modified version of this scoring scheme, where if other variants in our set fell within the 2,114 bp window, we substituted those as well to account for synergistic effects. We further aggregated these scores across each unique input window, as defined by the set of variants found in the window. Specifically, we took the value with the greatest magnitude from the scores for each variant in the window. In this manner, we obtained neighborhood-level scores both for each variant and, after aggregation, each window. 

\subsection*{Interpretation of ChromBPNet predictions}

We next employed a feature attribution-based approach to investigate putative mechanisms of action of high-scoring variants. For each variant, we applied SHAP, a deep learning interpretability method which scores each input feature based on its importance in producing the output prediction, on the corresponding modern and archaic sequences \cite{deeplift,shap}. In our problem setting, this corresponds to the importance of each nucleotide in influencing the predicted ATAC-seq readout. By comparing SHAP scores from both sequences, we can gain significant insight about how the variant affects chromatin accessibility. One common mechanism, for example, is that one allele at a certain locus is part of a functionally important TFBS. Substituting for the other allele breaks the motif, thus abrogating accessibility. Intact such motif are expected to show high SHAP scores, while for disrupted TFBMs the scores are expected to decrease considerably.

For high-scoring variants displaying such mechanisms, we utilized the open-source tool TOMTOM\cite{Bailey2009,Tanaka2011} to identify which TF matched each observed TFBM. % Thus, for each variant, we were able to produce the following information:

% In this manner, our feature-attribution based approach produces a comprehensive picture of the mechanisms by which human-specific SNPs convey an effect on regulation across a diverse array of cell types. 

\subsection*{Gene Linking and Enrichment Analysis}

We compiled a list of the genes whose regulation might be affected by \textit{cis}-regulatory variants in archaic human using a simple method -- for each variant, we considered the three nearest protein-coding genes within a neighborhood of 1 Mbp.

Functional enrichment was carried out by utilizing the publicly available tool gProfiler\cite{gProfiler}. This tool queries a variety of public ontology tools, including Gene Ontology Analysis (GO)\cite{GOConsortium2021} and Human Phenotype Ontology (HP)\cite{HPO} among many others\cite{gProfiler,Ashburner2000,PANTHERGO2019}, and aggregates the results for the given input gene set. % By applying these two tools, we gained a comprehensive picture of the bodily functions impacted by interesting gene sets, which could include the full list of archaic SNP-associated genes, genes associated with SNPs affecting notable tissue types, or genes associated with SNPs on particular chromosomes. \\

We utilized two different background distributions for our analysis. The first was the list of all human genes, which was useful to determine the generals functional areas of enrichment for archaic variant-adjacent genes. The second was the list of genes in proximity to all archaic variants, which allowed us to determine whether high-scoring variants disproportionately affected certain functions or pathways as compared to the total set of archaic variants. 

To determine trends related to gene proximity, we utilized the same list of nearby genes as in the ontology enrichment analysis -- for each variant, we considered the three nearest protein-coding genes within a neighborhood of 1 Mb. To calculate whether effective variant presence was enriched around certain genes, we first calculated the total number of genes nearby to all high-scoring variants in the dataset. We next created a background distribution for this value by taking 1,000 random samples of the same size from the total set of archaic variants and calculating the total number of nearby genes for each one. To obtain a $p$-value between the observed and background values, we fitted a normal distribution to the background and calculated the CDF at the observed value. 

To determine which variants intersected with peaks in each cell type, we utilized the open-source tool \verb|bedtools|\cite{Quinlan2010}, and in particular, the \verb|bedtools| \verb|intersect| command. For each variant-peak pair found to intersect, we then calculated the distance from the variant to the peak summit, and we utilized these values to conduct our peak occupancy analyses. 

\subsection*{Variant clustering and Intepretation}

Hierarchical clustering was applied to both axes of the variant score matrix, thus producing clusterings for both variants and models. ENCODE IDs were categorized by function and cell type using the ENCODE online portal, and these categories were utilized to determine patterns in the model clustering. 

For the variant axis, it was first necessary to categorize each variant by the TFBM it comprised, if applicable. We automated this task as follows. First, we identified the model which produced the highest $log$FC for each variant. We then calculated sequence-wide importance scores for both alleles, retaining the sequence with the highest maximum score. For these importance scores, the 99th percentile score was calculated from the flanking region, where no motif would be expected, then, starting from the center of the sequence (where the variant in question was located), all positions in each direction where the importance score was above this quantile were identified, ceasing the search when two consecutive positions with scores below the threshold were encountered. When the search was complete in both directions, the resulting positions comprised our putative motif. We then utilized TOMTOM to classify this sequence by comparison with the sequences of known motifs identified by the relevant model. If TOMTOM failed to identify a motif, we retried the procedure with the 98th and 95th percentiles, along with an exit condition of one rather than two positions. If no combination of parameters resulted in identification of a valid motif, we concluded that the variant in question did not comprise a motif. 

To determine if motifs were enriched within certain variant clusters, we first divided the variant set into equal-length blocks based on the ordering produced by the hierarchical clustering. We then calculated the frequency of each motif in each block, and we determined if any outliers existed in these values. Finally, we calculated aggregate statistics across the variants comprising these outliers -- most notably, average score for each model -- and determined which cell types were most prominently represented. A variety of literature and data was used to compare these findings to the known activity profiles of the relevant TFs. 

\subsection*{Genome versions and annotations}

The set of variants we worked with  was defined within \verb|hg19| version of the human genomes. ChromBPNet models were trained in \verb|hg20| space, which in the case of their use for predicting the effects of variants is an irrelevant distinction, because sequences of length 2,114 bp are passed to the model as input, i.e. outside of any coordinate system.

For the purpose of comparing to ENCODE annotations, variant coordinates were translated from \verb|hg19| to \verb|hg38| spaces using the \verb|liftOver| tool from the UCSC Genome Browser\cite{UCSC} utilities package.

ENCODE cCRE annotations were downloaded from the UCSC Genome Browser (\burl{http://hgdownload.soe.ucsc.edu/gbdb/hg38/encode3/ccre/encodeCcreCombined.bb}).

The refSeq\cite{refSeq} annotations for the \verb|hg19| and \verb|hg38| versions of the human assembly were used for associating variants with genes.

\section*{Author contributions}

A.P. carried out data curation, analysis and interpretation. G.K.M., A.K. and A.P. conceptualized the project. A.P., G.K.M. and A.K. wrote the manuscript.

\section*{Acknowledgments}

\hl{LIST GRANTS}

\begin{thebibliography}{100}

\input{references}

\end{thebibliography}

% \bibliographystyle{plainnat}
% \bibliography{references}

\end{multicols}

\clearpage

\setcounter{table}{0}
\renewcommand{\tablename}{Supplementary Table}
\setcounter{figure}{0}
\renewcommand{\figurename}{Supplementary Figure}

\setcounter{page}{1}
\renewcommand\thepage{{SM }\arabic{page}}

\begin{center}
% {\LARGE \textbf{\begin{spacing}{1.1}XXXX. \\ Supplementary Materials\end{spacing} }}
{\LARGE \textbf{Supplementary Materials}}
\end{center}

% \section*{Supplementary Tables}

\section*{Supplementary Figures}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS41-all-significant-variants-0.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS41}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{FigS1-all-significant-variants-1.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS1}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS2-all-significant-variants-2.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS2}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS3-all-significant-variants-3.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS3}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS4-all-significant-variants-4.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS4}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS5-all-significant-variants-5.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS5}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS6-all-significant-variants-6.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS6}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS7-all-significant-variants-7.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS7}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS8-all-significant-variants-8.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS8}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS9-all-significant-variants-9.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS9}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS10-all-significant-variants-10.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS10}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS11-all-significant-variants-11.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS11}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS12-all-significant-variants-12.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS12}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS13-all-significant-variants-13.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS13}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS14-all-significant-variants-14.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS14}
\end{figure*}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS15-all-significant-variants-15.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS15}
\end{figure*}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS16-all-significant-variants-16.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS16}
\end{figure*}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS17-all-significant-variants-17.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS17}
\end{figure*}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS18-all-significant-variants-18.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS18}
\end{figure*}


\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS19-all-significant-variants-19.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS19}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS20-all-significant-variants-20.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS20}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS21-all-significant-variants-21.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS21}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS22-all-significant-variants-22.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS22}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS23-all-significant-variants-23.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS23}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS24-all-significant-variants-24.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS24}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS25-all-significant-variants-25.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS25}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS26-all-significant-variants-26.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS26}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS27-all-significant-variants-27.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS27}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS28-all-significant-variants-28.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS28}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS29-all-significant-variants-29.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS29}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15cm]{FigS30-all-significant-variants-30.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS30}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS31-all-significant-variants-31.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS31}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS32-all-significant-variants-32.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS32}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS33-all-significant-variants-33.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS33}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS34-all-significant-variants-34.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS34}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS35-all-significant-variants-35.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS35}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS36-all-significant-variants-36.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS36}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS37-all-significant-variants-37.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS37}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS38-all-significant-variants-38.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS3}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS39-all-significant-variants-39.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS39}
\end{figure*}

\begin{figure*}[!ht]
\begin{center}
\includegraphics[width=15.5cm]{FigS40-all-significant-variants-40.png}
\end{center}
\captionsetup{singlelinecheck=off,justification=justified}
\caption{
{\bf Predicted effects of ancient human variants on chromatin accessibility across human cell types and tissues}. Shown are the nearest genes for each variant/region together with the distance to the TSS of the nearest gene and the predicted accessibility values from Figure \ref{Fig2}A.
}
\label{FigS40}
\end{figure*}

\end{document}