1 Dataset overview

Added by James: This Rmd file is adapted from 04-overview.ipynb, but uses cell type annotations after subclustering.

library(tidyr)
library(dplyr)
library(purrr)
library(readr)
library(glue)
library(ggplot2)
library(Signac)
library(Seurat)
source("/users/guozy/psychencode/functions_r/scRNAseq.R")

theme_set(theme_min())
# load labels and color scheme
labels <- readr::read_tsv("label_map.tsv") %>% 
  tibble::rowid_to_column(var = "Order")
## Rows: 60 Columns: 5
## ── Column specification ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Class, Label, Color, Label_harmonized, Label_chrombpnet
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
palette_labels <- labels %>% dplyr::select(Label, Color) %>% tibble::deframe()
palette_labels_harmonized <- labels %>% distinct(Label_harmonized, Color) %>% 
  tibble::deframe()
palette_condition <- c("Control" = "gray70",
                       "Bipolar" = "darkolivegreen",
                       "Schizophrenia" = "darkorchid4")

labels

1.1 Load Data

merged_dir <- "/oak/stanford/groups/akundaje/projects/psychencode/outs/20231009_multiome_analysis/03-integration/merged_objects"
# added by James: use cell type annotations from after the subclustering
dacc <- readRDS(file.path(merged_dir, "dACC.integrated.batch.analyzed.cell_metadata.labeled.rds"))
dlpfc <- readRDS(file.path(merged_dir, "DLPFC.integrated.batch.analyzed.cell_metadata.labeled.rds"))
hippocampus <- readRDS(file.path(merged_dir, "Hippocampus.integrated.batch.analyzed.cell_metadata.labeled.rds"))

1.2 QC

Here let’s perform a global overview of QC.

qc_table_file <- readr::read_tsv('/oak/stanford/groups/akundaje/projects/psychencode/outs/20231009_multiome_analysis/metadata/multiome.qc.tsv', show_col_types=FALSE)
length(unique(qc_table_file$subject_id))
## [1] 40
head(qc_table_file)
all_metadata <- bind_rows(dacc, dlpfc, hippocampus)

# deal with a naming issue in region columns in seurat objects
all_metadata$region <- all_metadata$region.x
all_metadata$region.x <- NULL
all_metadata$region.y <- NULL

all_metadata <- all_metadata %>% 
  arrange(region) %>% 
  mutate(sample = factor(sample, levels = unique(.$sample)))
colnames(all_metadata) |> 
  sort()
##   [1] "age"                                                  
##   [2] "all_doublets"                                         
##   [3] "atac_barcode"                                         
##   [4] "atac_chimeric_reads"                                  
##   [5] "atac_doublets"                                        
##   [6] "atac_dup_reads"                                       
##   [7] "atac_fragments"                                       
##   [8] "atac_lowmapq"                                         
##   [9] "atac_mitochondrial_reads"                             
##  [10] "atac_peak_region_cutsites"                            
##  [11] "atac_peak_region_fragments"                           
##  [12] "atac_raw_reads"                                       
##  [13] "atac_TSS_fragments"                                   
##  [14] "atac_unmapped_reads"                                  
##  [15] "ATAC.weight"                                          
##  [16] "barcode"                                              
##  [17] "batch"                                                
##  [18] "cell_type_broad"                                      
##  [19] "cell_type_granular"                                   
##  [20] "Clusters_all"                                         
##  [21] "Clusters_subset_GABAergic"                            
##  [22] "Clusters_subset_glutamatergic"                        
##  [23] "condition"                                            
##  [24] "DF.classifications_0.25_0.01_112"                     
##  [25] "DF.classifications_0.25_0.01_113"                     
##  [26] "DF.classifications_0.25_0.01_114"                     
##  [27] "DF.classifications_0.25_0.01_115"                     
##  [28] "DF.classifications_0.25_0.01_117"                     
##  [29] "DF.classifications_0.25_0.01_118"                     
##  [30] "DF.classifications_0.25_0.01_12"                      
##  [31] "DF.classifications_0.25_0.01_120"                     
##  [32] "DF.classifications_0.25_0.01_122"                     
##  [33] "DF.classifications_0.25_0.01_123"                     
##  [34] "DF.classifications_0.25_0.01_126"                     
##  [35] "DF.classifications_0.25_0.01_127"                     
##  [36] "DF.classifications_0.25_0.01_134"                     
##  [37] "DF.classifications_0.25_0.01_135"                     
##  [38] "DF.classifications_0.25_0.01_138"                     
##  [39] "DF.classifications_0.25_0.01_140"                     
##  [40] "DF.classifications_0.25_0.01_141"                     
##  [41] "DF.classifications_0.25_0.01_142"                     
##  [42] "DF.classifications_0.25_0.01_144"                     
##  [43] "DF.classifications_0.25_0.01_182"                     
##  [44] "DF.classifications_0.25_0.01_183"                     
##  [45] "DF.classifications_0.25_0.01_184"                     
##  [46] "DF.classifications_0.25_0.01_185"                     
##  [47] "DF.classifications_0.25_0.01_186"                     
##  [48] "DF.classifications_0.25_0.01_188"                     
##  [49] "DF.classifications_0.25_0.01_193"                     
##  [50] "DF.classifications_0.25_0.01_195"                     
##  [51] "DF.classifications_0.25_0.01_196"                     
##  [52] "DF.classifications_0.25_0.01_198"                     
##  [53] "DF.classifications_0.25_0.01_199"                     
##  [54] "DF.classifications_0.25_0.01_201"                     
##  [55] "DF.classifications_0.25_0.01_203"                     
##  [56] "DF.classifications_0.25_0.01_209"                     
##  [57] "DF.classifications_0.25_0.01_213"                     
##  [58] "DF.classifications_0.25_0.01_215"                     
##  [59] "DF.classifications_0.25_0.01_268"                     
##  [60] "DF.classifications_0.25_0.01_269"                     
##  [61] "DF.classifications_0.25_0.01_272"                     
##  [62] "DF.classifications_0.25_0.01_280"                     
##  [63] "DF.classifications_0.25_0.01_283"                     
##  [64] "DF.classifications_0.25_0.01_285"                     
##  [65] "DF.classifications_0.25_0.01_287"                     
##  [66] "DF.classifications_0.25_0.01_289"                     
##  [67] "DF.classifications_0.25_0.01_293"                     
##  [68] "DF.classifications_0.25_0.01_295"                     
##  [69] "DF.classifications_0.25_0.01_30"                      
##  [70] "DF.classifications_0.25_0.01_31"                      
##  [71] "DF.classifications_0.25_0.01_35"                      
##  [72] "DF.classifications_0.25_0.01_36"                      
##  [73] "DF.classifications_0.25_0.01_37"                      
##  [74] "DF.classifications_0.25_0.01_370"                     
##  [75] "DF.classifications_0.25_0.01_374"                     
##  [76] "DF.classifications_0.25_0.01_379"                     
##  [77] "DF.classifications_0.25_0.01_39"                      
##  [78] "DF.classifications_0.25_0.01_403"                     
##  [79] "DF.classifications_0.25_0.01_408"                     
##  [80] "DF.classifications_0.25_0.01_409"                     
##  [81] "DF.classifications_0.25_0.01_410"                     
##  [82] "DF.classifications_0.25_0.01_413"                     
##  [83] "DF.classifications_0.25_0.01_493"                     
##  [84] "DF.classifications_0.25_0.01_500"                     
##  [85] "DF.classifications_0.25_0.01_503"                     
##  [86] "DF.classifications_0.25_0.01_6"                       
##  [87] "DF.classifications_0.25_0.01_60"                      
##  [88] "DF.classifications_0.25_0.01_61"                      
##  [89] "DF.classifications_0.25_0.01_62"                      
##  [90] "DF.classifications_0.25_0.01_64"                      
##  [91] "DF.classifications_0.25_0.01_65"                      
##  [92] "DF.classifications_0.25_0.01_652"                     
##  [93] "DF.classifications_0.25_0.01_68"                      
##  [94] "DF.classifications_0.25_0.01_69"                      
##  [95] "DF.classifications_0.25_0.01_71"                      
##  [96] "DF.classifications_0.25_0.01_73"                      
##  [97] "DF.classifications_0.25_0.01_75"                      
##  [98] "DF.classifications_0.25_0.01_76"                      
##  [99] "DF.classifications_0.25_0.01_77"                      
## [100] "DF.classifications_0.25_0.01_8"                       
## [101] "DF.classifications_0.25_0.01_83"                      
## [102] "donor"                                                
## [103] "excluded_reason"                                      
## [104] "gex_barcode"                                          
## [105] "gex_conf_exonic_antisense_reads"                      
## [106] "gex_conf_exonic_dup_reads"                            
## [107] "gex_conf_exonic_reads"                                
## [108] "gex_conf_exonic_unique_reads"                         
## [109] "gex_conf_intergenic_reads"                            
## [110] "gex_conf_intronic_antisense_reads"                    
## [111] "gex_conf_intronic_dup_reads"                          
## [112] "gex_conf_intronic_reads"                              
## [113] "gex_conf_intronic_unique_reads"                       
## [114] "gex_conf_txomic_unique_reads"                         
## [115] "gex_exonic_umis"                                      
## [116] "gex_genes_count"                                      
## [117] "gex_intronic_umis"                                    
## [118] "gex_mapped_reads"                                     
## [119] "gex_raw_reads"                                        
## [120] "gex_umis_count"                                       
## [121] "is_cell"                                              
## [122] "mapping.score"                                        
## [123] "nCount_ATAC"                                          
## [124] "nCount_refAssay"                                      
## [125] "nCount_RNA"                                           
## [126] "nCount_SCT"                                           
## [127] "nFeature_ATAC"                                        
## [128] "nFeature_refAssay"                                    
## [129] "nFeature_RNA"                                         
## [130] "nFeature_SCT"                                         
## [131] "nucleosome_percentile"                                
## [132] "nucleosome_signal"                                    
## [133] "orig.ident"                                           
## [134] "pANN_0.25_0.01_112"                                   
## [135] "pANN_0.25_0.01_113"                                   
## [136] "pANN_0.25_0.01_114"                                   
## [137] "pANN_0.25_0.01_115"                                   
## [138] "pANN_0.25_0.01_117"                                   
## [139] "pANN_0.25_0.01_118"                                   
## [140] "pANN_0.25_0.01_12"                                    
## [141] "pANN_0.25_0.01_120"                                   
## [142] "pANN_0.25_0.01_122"                                   
## [143] "pANN_0.25_0.01_123"                                   
## [144] "pANN_0.25_0.01_126"                                   
## [145] "pANN_0.25_0.01_127"                                   
## [146] "pANN_0.25_0.01_134"                                   
## [147] "pANN_0.25_0.01_135"                                   
## [148] "pANN_0.25_0.01_138"                                   
## [149] "pANN_0.25_0.01_140"                                   
## [150] "pANN_0.25_0.01_141"                                   
## [151] "pANN_0.25_0.01_142"                                   
## [152] "pANN_0.25_0.01_144"                                   
## [153] "pANN_0.25_0.01_182"                                   
## [154] "pANN_0.25_0.01_183"                                   
## [155] "pANN_0.25_0.01_184"                                   
## [156] "pANN_0.25_0.01_185"                                   
## [157] "pANN_0.25_0.01_186"                                   
## [158] "pANN_0.25_0.01_188"                                   
## [159] "pANN_0.25_0.01_193"                                   
## [160] "pANN_0.25_0.01_195"                                   
## [161] "pANN_0.25_0.01_196"                                   
## [162] "pANN_0.25_0.01_198"                                   
## [163] "pANN_0.25_0.01_199"                                   
## [164] "pANN_0.25_0.01_201"                                   
## [165] "pANN_0.25_0.01_203"                                   
## [166] "pANN_0.25_0.01_209"                                   
## [167] "pANN_0.25_0.01_213"                                   
## [168] "pANN_0.25_0.01_215"                                   
## [169] "pANN_0.25_0.01_268"                                   
## [170] "pANN_0.25_0.01_269"                                   
## [171] "pANN_0.25_0.01_272"                                   
## [172] "pANN_0.25_0.01_280"                                   
## [173] "pANN_0.25_0.01_283"                                   
## [174] "pANN_0.25_0.01_285"                                   
## [175] "pANN_0.25_0.01_287"                                   
## [176] "pANN_0.25_0.01_289"                                   
## [177] "pANN_0.25_0.01_293"                                   
## [178] "pANN_0.25_0.01_295"                                   
## [179] "pANN_0.25_0.01_30"                                    
## [180] "pANN_0.25_0.01_31"                                    
## [181] "pANN_0.25_0.01_35"                                    
## [182] "pANN_0.25_0.01_36"                                    
## [183] "pANN_0.25_0.01_37"                                    
## [184] "pANN_0.25_0.01_370"                                   
## [185] "pANN_0.25_0.01_374"                                   
## [186] "pANN_0.25_0.01_379"                                   
## [187] "pANN_0.25_0.01_39"                                    
## [188] "pANN_0.25_0.01_403"                                   
## [189] "pANN_0.25_0.01_408"                                   
## [190] "pANN_0.25_0.01_409"                                   
## [191] "pANN_0.25_0.01_410"                                   
## [192] "pANN_0.25_0.01_413"                                   
## [193] "pANN_0.25_0.01_493"                                   
## [194] "pANN_0.25_0.01_500"                                   
## [195] "pANN_0.25_0.01_503"                                   
## [196] "pANN_0.25_0.01_6"                                     
## [197] "pANN_0.25_0.01_60"                                    
## [198] "pANN_0.25_0.01_61"                                    
## [199] "pANN_0.25_0.01_62"                                    
## [200] "pANN_0.25_0.01_64"                                    
## [201] "pANN_0.25_0.01_65"                                    
## [202] "pANN_0.25_0.01_652"                                   
## [203] "pANN_0.25_0.01_68"                                    
## [204] "pANN_0.25_0.01_69"                                    
## [205] "pANN_0.25_0.01_71"                                    
## [206] "pANN_0.25_0.01_73"                                    
## [207] "pANN_0.25_0.01_75"                                    
## [208] "pANN_0.25_0.01_76"                                    
## [209] "pANN_0.25_0.01_77"                                    
## [210] "pANN_0.25_0.01_8"                                     
## [211] "pANN_0.25_0.01_83"                                    
## [212] "pct_frags_in_peaks"                                   
## [213] "percent.mt"                                           
## [214] "predicted.cell_class"                                 
## [215] "predicted.cell_class_percluster"                      
## [216] "predicted.cell_type_percluster"                       
## [217] "predicted.class"                                      
## [218] "predicted.class__humancortexref"                      
## [219] "predicted.class.score"                                
## [220] "predicted.class.score__humancortexref"                
## [221] "predicted.cluster"                                    
## [222] "predicted.cluster__humancortexref"                    
## [223] "predicted.cluster_broad"                              
## [224] "predicted.cluster_broad.score"                        
## [225] "predicted.cluster.score"                              
## [226] "predicted.cluster.score__humancortexref"              
## [227] "predicted.cross_species_cluster"                      
## [228] "predicted.cross_species_cluster__humancortexref"      
## [229] "predicted.cross_species_cluster.score"                
## [230] "predicted.cross_species_cluster.score__humancortexref"
## [231] "predicted.subclass"                                   
## [232] "predicted.subclass__humancortexref"                   
## [233] "predicted.subclass.score"                             
## [234] "predicted.subclass.score__humancortexref"             
## [235] "race"                                                 
## [236] "region"                                               
## [237] "rna_doublets"                                         
## [238] "sample"                                               
## [239] "SCT_snn_res.0.8"                                      
## [240] "SCT.weight"                                           
## [241] "seurat_clusters"                                      
## [242] "sex"                                                  
## [243] "source"                                               
## [244] "TSS.enrichment"                                       
## [245] "TSS.percentile"                                       
## [246] "wsnn_batch_res.0.6"                                   
## [247] "wsnn_res.0.6"
# number of cells
all_metadata %>%
  ggplot(aes(x = sample)) +
  # geom_violin(scale = "width", aes(fill = region)) +
  geom_bar(aes(fill = region)) +
  coord_cartesian(ylim = c(0, 7500)) +
  ggtitle("# cells per sample") +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 15, angle = 90, hjust = 1, vjust = 0.5),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

all_metadata %>%
   ggplot(aes(x = sample, y = nFeature_RNA)) +
   # geom_violin(scale = "width", aes(fill = region)) +
   geom_boxplot(outlier.shape = NA, aes(fill = region)) +
   coord_cartesian(ylim = c(0, 6500)) +
   ggtitle("# genes detected across samples")  + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 10, angle = 90, hjust = 1, vjust = 0.5),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

all_metadata %>% ggplot(aes(x = sample, y = nCount_RNA)) +
   # geom_violin(scale = "width", aes(fill = region)) +
   geom_boxplot(outlier.shape = NA, aes(fill = region)) +
   coord_cartesian(ylim = c(0, 35000)) +
   ggtitle("# UMIs across samples") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 10, angle = 90, hjust = 1, vjust = 0.5),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

all_metadata %>% ggplot(aes(x = sample, y = percent.mt)) +
   geom_boxplot(outlier.shape = NA, aes(fill = region)) +
   ggtitle("% mitochondrial reads") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 10, angle = 90, hjust = 1, vjust = 0.5),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

all_metadata %>% ggplot(aes(x = sample, y = TSS.enrichment)) +
   # geom_violin(scale = "width", aes(fill = region)) +
   geom_boxplot(outlier.shape = NA, aes(fill = region)) +
   coord_cartesian(ylim = c(1, 8)) +
   ggtitle("TSS enrichment") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 17),                     
    axis.text.x = element_text(size = 14, angle = 90, vjust = 0.5, hjust=1),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

all_metadata %>% 
  ggplot(aes(x = sample, y = nCount_ATAC)) +
   # geom_violin(scale = "width", aes(fill = region)) +
   geom_boxplot(outlier.shape = NA, aes(fill = region)) +
   coord_cartesian(ylim = c(0, 20000)) +
   ggtitle("# cut sites") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 15, angle = 90, vjust = 0.5, hjust=1),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

all_metadata %>% ggplot(aes(x = predicted.cell_type_percluster)) +
   geom_bar(aes(fill = batch), position = "fill") +
   ggtitle("batch") + NoLegend() + ylab("proportion") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 14, angle = 90, vjust = 0.5, hjust=1),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

all_metadata %>% ggplot(aes(x = predicted.cell_type_percluster)) +
   geom_bar(aes(fill = sample), position = "fill") +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
         axis.text.y = element_text(size = 15)) +
   ggtitle("sample") + NoLegend() + ylab("proportion") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 14),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

summary(all_metadata$nCount_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     501    1592    3401    8473    8654  185099
summary(all_metadata$nFeature_RNA)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     251     953    1653    2325    3060   11890
dim(all_metadata)
## [1] 417487    247

1.3 Global

head(all_metadata)
# get final cell types
# added by James

all_metadata2 <- all_metadata %>% 
    select(barcode, sample, cell_type_broad, condition, region) %>%
  mutate(cell_type_broad = as.factor(cell_type_broad)) 

head(all_metadata2)
all_metadata2 %>% ggplot(aes(x = sample)) +
    geom_bar(aes(fill = cell_type_broad)) +
    facet_wrap(~ region, nrow = 1, scales = "free_x") +
    scale_fill_manual(values = palette_labels_harmonized) +
    rotate_x() + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

all_metadata2 %>% ggplot(aes(x = sample)) +
    geom_bar(aes(fill = cell_type_broad), position = "fill") +
    facet_wrap(~ region, nrow = 1, scales = "free_x") +
    scale_fill_manual(values = palette_labels_harmonized) +
    rotate_x() + 
  ylab("proportion")  + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p1 <- all_metadata2 %>% ggplot(aes(x = cell_type_broad)) +
    geom_bar(aes(fill = cell_type_broad)) +
    facet_wrap(~ region, nrow = 1, scales = "free_x") +
    scale_fill_manual(values = palette_labels_harmonized) +
    rotate_x() +
    theme(axis.text.x = element_blank(), axis.title.x = element_blank()) +
    ylab("# cells") +
    hide_legend()  + 
  theme(
    plot.title = element_text(size = 20),        
    # axis.title.x = element_text(size = 16),                     
    # axis.text.x = element_text(size = 14),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )
  

p2 <- all_metadata2 %>% ggplot(aes(x = cell_type_broad)) +
    geom_bar(aes(fill = condition), position = "fill") +
    facet_wrap(~ region, nrow = 1, scales = "free_x") +
    rotate_x() +
    theme(strip.text.x = element_blank()) +
    ylab("% cells") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

cowplot::plot_grid(p1, p2, ncol = 1, rel_heights = c(0.3, 0.7), align = "v", axis = "rl")

Added by James:

1.4 Compare cell count per cell type with previously merged objects

old_merged_dir <- "/oak/stanford/groups/akundaje/projects/psychencode/outs-archive/20231009_multiome_analysis/03-integration/merged_objects"
dacc_metadata_old <- readRDS(file.path(old_merged_dir, "dACC/dACC.integrated.batch.analyzed.cell_metadata.labeled.rds"))
dlpfc_metadata_old <- readRDS(file.path(old_merged_dir, "DLPFC/DLPFC.integrated.batch.analyzed.cell_metadata.labeled.rds"))
hippocampus_metadata_old <-  readRDS(file.path(old_merged_dir, "Hippocampus/Hippocampus.integrated.batch.analyzed.cell_metadata.labeled.rds"))

cell_type_count <- function(metadata) {
  return(metadata %>%
           group_by(cell_type_broad) %>%
           summarise(n = n()) %>%
           arrange(cell_type_broad))
}
dacc_old_cell_type_count <- dacc_metadata_old %>% cell_type_count()
dacc_new_cell_type_count <- dacc %>% cell_type_count()
dacc_cell_type_count_long <- full_join(dacc_old_cell_type_count, 
                                  dacc_new_cell_type_count, 
                                  by = "cell_type_broad") %>%
  rename(archived = n.x, new = n.y) %>%
  pivot_longer(cols = c(archived, new), names_to = "time_point", values_to = "count") %>%
  replace_na(list(count = 0))

ggplot(dacc_cell_type_count_long, aes(x = cell_type_broad, 
                                      y = count, fill = time_point)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "dACC Cell Type Count Comparison: Archived and New", x = "Cell Type", 
       y = "Count") + theme(legend.title = element_blank()) + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 13, angle = 90, vjust = 0.5, hjust=1),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

dlpfc_old_cell_type_count <- dlpfc_metadata_old %>% cell_type_count()
dlpfc_new_cell_type_count <- dlpfc %>% cell_type_count()
dlpfc_cell_type_count_long <- full_join(dlpfc_old_cell_type_count, 
                                  dlpfc_new_cell_type_count, 
                                  by = "cell_type_broad") %>%
  rename(archived = n.x, new = n.y) %>%
  pivot_longer(cols = c(archived, new), names_to = "time_point", values_to = "count") %>%
  replace_na(list(count = 0))

ggplot(dlpfc_cell_type_count_long, aes(x = cell_type_broad, 
                                       y = count, fill = time_point)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "DLPFC Cell Type Count Comparison: Archived and New", x = "Cell Type", 
       y = "Count") + theme(legend.title = element_blank()) + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 13 , angle = 90, vjust = 0.5, hjust=1),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

hippocampus_old_cell_type_count <- hippocampus_metadata_old %>% cell_type_count()
hippocampus_new_cell_type_count <- hippocampus %>% cell_type_count()
hippocampus_cell_type_count_long <- full_join(hippocampus_old_cell_type_count, 
                                  hippocampus_new_cell_type_count, 
                                  by = "cell_type_broad") %>%
  rename(archived = n.x, new = n.y) %>%
  pivot_longer(cols = c(archived, new), names_to = "time_point", values_to = "count") %>%
  replace_na(list(count = 0))

ggplot(hippocampus_cell_type_count_long, aes(x = cell_type_broad, 
                                             y = count, fill = time_point)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Hippocampus Cell Type Count Comparison: Archived and New", 
       x = "Cell Type", y = "Count") + theme(legend.title = element_blank())  + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 16),                     
    axis.text.x = element_text(size = 13, angle = 90, vjust = 0.5, hjust=1),                     
    axis.title.y = element_text(size = 16),                     
    axis.text.y = element_text(size = 14),                      
    legend.title = element_text(size = 16),      
    legend.text = element_text(size = 14)                      
  )

1.5 Neuron vs. Non-Neuron

# collect cell_type_broad from all regions
cell_type_broad_all_regions <- rbind(data.frame(cell_type_broad = dacc$cell_type_broad,
                                        region = 'dACC'), 
                             data.frame(cell_type_broad = dlpfc$cell_type_broad, 
                                        region = 'DLPFC'),
                             data.frame(cell_type_broad = 
                                          hippocampus$cell_type_broad,
                                        region = 'Hippocampus')) |>
  setNames(c("Label_harmonized", "region"))

# remove duplicate labels
labels_distinct <- distinct(labels, Label_harmonized, .keep_all = TRUE)

# map by Label_harmonized
cell_type_broad_all_regions <- full_join(cell_type_broad_all_regions, 
                                         labels_distinct, 
                                         by = "Label_harmonized") |>
  select(Label_harmonized, Class, region) |>
  mutate(Class = if_else(Class %in% c("Glutamatergic", "GABAergic"), 
                         "Neuronal", Class))

class_per_region <- cell_type_broad_all_regions %>%
  group_by(region, Class) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(region) %>%
  mutate(total_count = sum(count)) %>%
  mutate(percentage = count / total_count * 100) %>%
  ungroup() %>%
  select(-total_count) 

region_per_class <- cell_type_broad_all_regions %>%
  group_by(Class, region) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(Class) %>%
  mutate(total_count = sum(count)) %>%
  mutate(percentage = count / total_count * 100) %>%
  ungroup() %>%
  select(-total_count) 

1.6 Cell Class (Neuronal vs. Non-Neuronal) per Region

p3 <- ggplot(cell_type_broad_all_regions, aes(x = region, fill = Class)) +
  geom_bar() +
  labs(title = "Stacked Barplot of Cell Class per Region",
       x = "Region",
       y = "Count") +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p4 <- ggplot(class_per_region, aes(x = region, y = percentage, fill = Class)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Stacked Percentage Barplot of Cell Class per Region",
       x = "Region",
       y = "Percentage") +
  scale_y_continuous(labels = scales::percent)  + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p3 | p4

1.7 Region Per Cell Class (Neuronal vs. Non-Neuronal)

p5 <- ggplot(cell_type_broad_all_regions, aes(x = Class, fill = region)) +
  geom_bar() +
  labs(title = "Stacked Barplot of Region per Cell Class",
       x = "Cell Class",
       y = "Count") +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )


p6 <- ggplot(region_per_class, aes(x = Class, y = percentage, fill = region)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Stacked Percentage Barplot of Region per Cell Class",
       x = "Cell Class",
       y = "Percentage") +
  scale_y_continuous(labels = scales::percent) + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )


p5 | p6

1.8 Compare Cell Class per Region with previously merged objects

cell_type_broad_all_regions_old <- rbind(
  data.frame(cell_type_broad = dacc_metadata_old$cell_type_broad, region = 'dACC'), 
  data.frame(cell_type_broad = dlpfc_metadata_old$cell_type_broad, region = 'DLPFC'),
  data.frame(cell_type_broad = hippocampus_metadata_old$cell_type_broad, region = 'Hippocampus')) |>
  setNames(c("Label_harmonized", "region")) 


# map by Label_harmonized
cell_type_broad_all_regions_old <- full_join(cell_type_broad_all_regions_old, 
                                             labels_distinct, 
                                             by = "Label_harmonized") |> 
  select(Label_harmonized, Class, region) |>
  mutate(Class = if_else(Class %in% c("Glutamatergic", "GABAergic"), 
                         "Neuronal", Class))

class_per_region_old <- cell_type_broad_all_regions_old %>%
  group_by(region, Class) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(region) %>%
  mutate(total_count = sum(count)) %>%
  mutate(percentage = count / total_count * 100) %>%
  ungroup() %>%
  select(-total_count) 

region_per_class_old <- cell_type_broad_all_regions_old %>%
  group_by(Class, region) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(Class) %>%
  mutate(total_count = sum(count)) %>%
  mutate(percentage = count / total_count * 100) %>%
  ungroup() %>%
  select(-total_count) 

1.8.1 Count of Cell Class per Region

p7 <- ggplot(cell_type_broad_all_regions, aes(x = region, fill = Class)) +
  geom_bar() +
  labs(title = "New Stacked Barplot of Cell Class per Region", x = "Region", y = "Count") + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p8 <- ggplot(cell_type_broad_all_regions_old, aes(x = region, fill = Class)) +
  geom_bar() +
  labs(title = "Archived Stacked Barplot of Cell Class per Region",
       x = "Region",
       y = "Count") +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )


p7 | p8

The distribution of cell class count per region look similar.

1.8.2 Percentage of Cell Class per Region

p9 <- ggplot(class_per_region, aes(x = region, y = percentage, fill = Class)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "New Stacked Percentage Barplot of Cell Class per Region",
       x = "Region",
       y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )


p10 <- ggplot(class_per_region_old, aes(x = region, y = percentage, fill = Class)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Archived Stacked Percentage Barplot of Cell Class per Region",
       x = "Region",
       y = "Percentage") +
  scale_y_continuous(labels = scales::percent) +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p9 | p10

There are more unidentified cells that do not fall into either the neuronal or non-neuronal categories in the new datasets.

1.9 Compare Region per Cell Class with previously merged objects

1.9.1 Count of Region per Cell Class

p11 <- ggplot(cell_type_broad_all_regions, aes(x = Class, fill = region)) +
  geom_bar() +
  labs(title = "New Stacked Barplot of Region per Cell Class",
       x = "Cell Class",
       y = "Count") +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p12 <- ggplot(cell_type_broad_all_regions_old, aes(x = Class, fill = region)) +
  geom_bar() +
  labs(title = "Archived Stacked Barplot of Region per Cell Class",
       x = "Cell Class",
       y = "Count") +
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p11 | p12

There are more unidentified cells in the new datasets. The other two cell classes had a similar distributions in the new and archived datasets.

1.9.2 Percentage of Region per Cell Class

p13 <- ggplot(region_per_class, aes(x = Class, y = percentage, fill = region)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "New Stacked Percentage Barplot of Region per Cell Class",
       x = "Cell Class",
       y = "Percentage") +
  scale_y_continuous(labels = scales::percent)  + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )


p14 <- ggplot(region_per_class_old, aes(x = Class, y = percentage, fill = region)) +
  geom_bar(stat = "identity", position = "fill") +
  labs(title = "Archived Stacked Percentage Barplot of Region per Cell Class",
       x = "Cell Class",
       y = "Percentage") +
  scale_y_continuous(labels = scales::percent) + 
  theme(
    plot.title = element_text(size = 20),        
    axis.title.x = element_text(size = 18),                     
    axis.text.x = element_text(size = 16),                     
    axis.title.y = element_text(size = 18),                     
    axis.text.y = element_text(size = 16),                      
    legend.title = element_text(size = 18),      
    legend.text = element_text(size = 16)                      
  )

p13 | p14

The unidentified cells from the DLPFC region make up a larger percentage, compared to the archived datasets.

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/libf77blas.so.3.10.3
## LAPACK: /users/guozy/miniconda3/envs/psych/lib/libopenblasp-r0.3.27.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3 viridis_0.6.5      viridisLite_0.4.2  SeuratObject_5.0.0
##  [5] Seurat_4.3.0       Signac_1.13.0      ggplot2_3.5.1      glue_1.7.0        
##  [9] readr_2.1.5        purrr_1.0.2        dplyr_1.1.4        tidyr_1.3.0       
## 
## loaded via a namespace (and not attached):
##   [1] spatstat.univar_3.0-0  spam_2.10-0            fastmatch_1.1-4       
##   [4] plyr_1.8.9             igraph_2.0.3           lazyeval_0.2.2        
##   [7] sp_2.1-4               splines_4.2.0          BiocParallel_1.32.6   
##  [10] listenv_0.9.1          scattermore_1.2        GenomeInfoDb_1.34.9   
##  [13] digest_0.6.36          htmltools_0.5.8.1      fansi_1.0.6           
##  [16] magrittr_2.0.3         tensor_1.5             cluster_2.1.6         
##  [19] ROCR_1.0-11            tzdb_0.4.0             globals_0.16.3        
##  [22] Biostrings_2.66.0      matrixStats_1.3.0      vroom_1.6.5           
##  [25] spatstat.sparse_3.1-0  colorspace_2.1-1       ggrepel_0.9.5         
##  [28] xfun_0.46              crayon_1.5.3           RCurl_1.98-1.16       
##  [31] jsonlite_1.8.8         progressr_0.14.0       spatstat.data_3.1-2   
##  [34] survival_3.7-0         zoo_1.8-12             polyclip_1.10-6       
##  [37] gtable_0.3.5           zlibbioc_1.44.0        XVector_0.38.0        
##  [40] leiden_0.4.3.1         future.apply_1.11.2    BiocGenerics_0.44.0   
##  [43] abind_1.4-8            scales_1.3.0           spatstat.random_3.3-1 
##  [46] miniUI_0.1.1.1         Rcpp_1.0.11            xtable_1.8-4          
##  [49] reticulate_1.38.0      bit_4.0.5              dotCall64_1.1-1       
##  [52] stats4_4.2.0           htmlwidgets_1.6.4      httr_1.4.7            
##  [55] ica_1.0-3              farver_2.1.2           pkgconfig_2.0.3       
##  [58] sass_0.4.9             uwot_0.2.2             deldir_2.0-4          
##  [61] utf8_1.2.4             labeling_0.4.3         tidyselect_1.2.1      
##  [64] rlang_1.1.4            reshape2_1.4.4         later_1.3.2           
##  [67] munsell_0.5.1          tools_4.2.0            cachem_1.1.0          
##  [70] cli_3.6.3              generics_0.1.3         ggridges_0.5.6        
##  [73] evaluate_0.24.0        stringr_1.5.1          fastmap_1.2.0         
##  [76] yaml_2.3.9             goftest_1.2-3          bit64_4.0.5           
##  [79] knitr_1.48             fitdistrplus_1.2-1     RANN_2.6.1            
##  [82] pbapply_1.7-2          future_1.33.2          nlme_3.1-165          
##  [85] mime_0.12              RcppRoll_0.3.1         compiler_4.2.0        
##  [88] plotly_4.10.4          png_0.1-8              spatstat.utils_3.0-5  
##  [91] tibble_3.2.1           bslib_0.7.0            stringi_1.8.4         
##  [94] highr_0.11             lattice_0.22-6         Matrix_1.6-1.1        
##  [97] vctrs_0.6.5            pillar_1.9.0           lifecycle_1.0.4       
## [100] spatstat.geom_3.3-2    lmtest_0.9-40          jquerylib_0.1.4       
## [103] RcppAnnoy_0.0.22       data.table_1.15.4      cowplot_1.1.3         
## [106] bitops_1.0-7           irlba_2.3.5.1          httpuv_1.6.15         
## [109] patchwork_1.2.0        GenomicRanges_1.50.2   R6_2.5.1              
## [112] promises_1.3.0         KernSmooth_2.23-24     gridExtra_2.3         
## [115] IRanges_2.32.0         parallelly_1.37.1      codetools_0.2-20      
## [118] MASS_7.3-56            withr_3.0.0            sctransform_0.4.0     
## [121] Rsamtools_2.14.0       S4Vectors_0.36.2       GenomeInfoDbData_1.2.9
## [124] parallel_4.2.0         hms_1.1.3              grid_4.2.0            
## [127] rmarkdown_2.27         Rtsne_0.17             spatstat.explore_3.3-1
## [130] shiny_1.8.1.1