exp_id     <- "20231009_multiome_analysis/03-integration" # determines the name of the cache folder
doc_id     <- "05c" # determines name of the subfolder of `figures` where pngs/pdfs are saved
out        <- here::here("outs", exp_id, doc_id, "/"); dir.create(out, recursive = TRUE)
figout     <- here::here("outs", exp_id, doc_id, "figures/")
cache      <- file.path("~/scratch/cache", exp_id, doc_id, "/")

1 Overview

Here, we’ll assemble the final annotations for each dataset. Consulting several resources:

Hodge, R.D., Bakken, T.E., Miller, J.A., Smith, K.A., Barkan, E.R., Graybuck, L.T., Close, J.L., Long, B., Johansen, N., Penn, O., et al. (2019). Conserved cell types with divergent features in human versus mouse cortex. Nature 573, 61–68. 10.1038/s41586-019-1506-7.

Bakken, T.E., Jorstad, N.L., Hu, Q., Lake, B.B., Tian, W., Kalmbach, B.E., Crow, M., Hodge, R.D., Krienen, F.M., Sorensen, S.A., et al. (2021). Comparative cellular analysis of motor cortex in human, marmoset and mouse. Nature 598, 111–119. 10.1038/s41586-021-03465-8.

Franjic, D., Skarica, M., Ma, S., Arellano, J.I., Tebbenkamp, A.T.N., Choi, J., Xu, C., Li, Q., Morozov, Y.M., Andrijevic, D., et al. (2022). Transcriptomic taxonomy and neurogenic trajectories of adult human, macaque, and pig hippocampal and entorhinal cells. Neuron 110, 452–469.e14. 10.1016/j.neuron.2021.10.036.

In general, to finalize cell type annotation, we will subcluster glutamatergic and GABAergic neurons separately.

2 Set up

library(here)
library(tidyr)
library(dplyr)
library(purrr)
library(readr)
library(glue)
library(Seurat)
library(ggplot2)
library(Signac)

source(here("functions_r/scRNAseq.R"))

ggplot2::theme_set(theme_min())

3 Load data

merged_dir <- "/oak/stanford/groups/akundaje/projects/psychencode/outs/20231009_multiome_analysis/03-integration/merged_objects"
hippocampus <- readRDS(file.path(merged_dir, 
                                 "Hippocampus.integrated.batch.analyzed.rds"))

4 Cell type annotation

We manually assigned colors for each broad label, taken from the following studies to approximately match the color scheme used in the Allen Institute papers.

Table 3 in Zhang et al, Nature, 2021 (https://www.nature.com/articles/s41586-021-03705-x)
Table 3 in Yao et al, Cell, 2021 (https://www.sciencedirect.com/science/article/pii/S0092867421005018?via%3Dihub#app2)

Conventions:

predicted.cluster_broad is the per-cell automated annotation from Azimuth, for the hippocampus reference
predicted.cell_type_percluster is the majority vote of the per-cell label, within each cluster

# load labels
labels <- readr::read_tsv(
  here("exps/20231009_multiome_analysis_v2/03-integration/label_map.tsv")) %>% 
  tibble::rowid_to_column(var = "Order")

## Rows: 60 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (5): Class, Label, Color, Label_harmonized, Label_chrombpnet
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

palette_labels <- labels %>% dplyr::select(Label, Color) %>% tibble::deframe()
palette_labels_harmonized <- labels %>% distinct(Label_harmonized, Color) %>% 
  tibble::deframe()

labels

Idents(hippocampus) <- "seurat_clusters"
p0 <- plot_dr(hippocampus, color_by = "wsnn_batch_res.0.6", point_size = 0.2, 
              label = TRUE, reduction = "harm.wnn.umap_batch",
    label_size = 5, legend = FALSE,
    title = glue("Hippocampus, n = {ncol(hippocampus)}")) + xlab("UMAP 1") + 
  ylab("UMAP 2")

Idents(hippocampus) <- "predicted.cluster_broad"
p1 <- plot_dr(hippocampus, color_by = "predicted.subclass__humancortexref", 
              point_size = 0.2, label = TRUE, reduction = "harm.wnn.umap_batch", 
              colors = palette_labels,
    label_size = 5, legend = FALSE,
    title = glue("Hippocampus, n = {ncol(hippocampus)}, cortex ref")) + 
  xlab("UMAP 1") + ylab("UMAP 2")

p2 <- plot_dr(hippocampus, color_by = "predicted.cluster_broad", 
              point_size = 0.2, label = TRUE, reduction = "harm.wnn.umap_batch", 
              colors = palette_labels, label_size = 5, legend = FALSE, 
              title = glue("Hippocampus, n = {ncol(hippocampus)}, hippocampus ref"))

p0 + p1 + p2

Tabulate predictions with different references:

# tabulate predictions
df_compare_refs <- hippocampus@meta.data %>%
    dplyr::select(Human_ctx = predicted.subclass__humancortexref, 
                  Human_hippocampus = predicted.cluster_broad) %>% 
    group_by(Human_ctx) %>%
    mutate(N_per_ctx_label = n()) %>%
    ungroup() %>% 
    group_by(Human_ctx, Human_hippocampus, N_per_ctx_label) %>%
    summarize(N_per_hipp_label = n()) %>%
    mutate(Prop_per_ctx_label = N_per_hipp_label / N_per_ctx_label) %>%
    arrange(desc(Prop_per_ctx_label)) %>% 
    dplyr::select(-N_per_ctx_label, -N_per_hipp_label)

## `summarise()` has grouped output by 'Human_ctx', 'Human_hippocampus'. You can
## override using the `.groups` argument.

df_compare_refs

# make into a matrix
df_compare_refs <- df_compare_refs %>% 
    dplyr::select(Human_ctx, Human_hippocampus, Prop_per_ctx_label) %>% 
    pivot_wider(names_from = Human_hippocampus, values_from = Prop_per_ctx_label, 
                values_fill = 0) %>% as.data.frame()
rownames(df_compare_refs) <- df_compare_refs$Human_ctx
df_compare_refs <- df_compare_refs[, -1]
df_compare_refs

Plot comparison between references:

# plot -- where rows indicate the cortex ref and cols indicate the hippocampus ref
pheatmap::pheatmap(df_compare_refs,
    cluster_rows = FALSE,
    cluster_cols = FALSE,
    scale = "none", 
    cellwidth = 30,
    cellheight = 30,
    fontsize_row = 15,
    fontsize_col = 15,
    display_numbers = TRUE,
    number_color = "white",
    main = "Proportion of cells per cortex reference class (rows) assigned to each hippocampus reference class (cols)",
    )

plot_dr(hippocampus, color_by = "predicted.cell_type_percluster", 
        point_size = 0.2, label = TRUE, reduction = "harm.wnn.umap_batch", 
        colors = palette_labels, label_size = 5, legend = FALSE, 
        title = glue("Hippocampus, n = {ncol(hippocampus)} (majority per cluster, using hippocampus ref)")) + 
  xlab("UMAP 1") + ylab("UMAP 2")

p1 <- hippocampus@meta.data %>%
    ggplot(aes(x = seurat_clusters)) +
    geom_bar(aes(fill = predicted.subclass__humancortexref), position = "fill") +
    scale_fill_manual(values = palette_labels) +
    ggtitle("Cluster breakdown by cortex ref predictions")

p2 <- hippocampus@meta.data %>%
    ggplot(aes(x = seurat_clusters)) +
    geom_bar(aes(fill = predicted.cluster_broad), position = "fill") +
    scale_fill_manual(values = palette_labels) +
    ggtitle("Cluster breakdown by hippocampus ref predictions")

p3 <- hippocampus@meta.data %>%
    ggplot(aes(x = seurat_clusters)) +
    geom_bar(aes(fill = predicted.cell_type_percluster), position = "fill") +
    scale_fill_manual(values = palette_labels) +
    ggtitle("Cluster breakdown by hippocampus ref predictions, majority per cluster")

cowplot::plot_grid(p1, p2, p3, ncol = 1, axis = "rl", align = "v")

Add in broader labels:

# add in broader cell class from labels dataframe column
hippocampus@meta.data$predicted.cell_class <- hippocampus@meta.data$predicted.cluster_broad
hippocampus@meta.data$predicted.cell_class <- plyr::mapvalues(
  hippocampus@meta.data$predicted.cell_class, from = labels$Label, 
  to = labels$Class)

## The following `from` values were not present in `x`: Micro-PVM, VLMC_VSMC, Vip, Sst, Sst Chodl, Pvalb, Lamp5, Sncg, ExN, L2-5 IT, L2-6 IT, L2/3 IT, L5 IT, L5 ET, L5/6 NP, L6b, L6 CT and L6b, L6 IT, L6 IT Car3, L6 CT, DG, CA1-3_and_subiculum, SUB, Unknown

# get the most common cell class for each cluster
df_cluster_celltype <- hippocampus@meta.data %>% 
  dplyr::select(seurat_clusters, predicted.cell_class) %>% 
  group_by(seurat_clusters, predicted.cell_class) %>%
  summarise(n = n()) %>% 
  top_n(1, n)

## `summarise()` has grouped output by 'seurat_clusters'. You can override using
## the `.groups` argument.

# add it back to the object
hippocampus@meta.data$predicted.cell_class_percluster <- plyr::mapvalues(
  hippocampus@meta.data$seurat_clusters, from = df_cluster_celltype$seurat_clusters, 
  to = df_cluster_celltype$predicted.cell_class)

## The following `from` values were not present in `x`: 31

4.1 Glutamatergic

plot_dr(hippocampus, color_by = "predicted.cluster_broad", point_size = 0.2, 
        label = FALSE, reduction = "harm.wnn.umap_batch", colors = palette_labels, 
        cells = WhichCells(hippocampus, 
                           expr = predicted.cell_class == "Glutamatergic"), 
        label_size = 5, legend = FALSE, 
        title = "Hippocampus, highlighting glutamatergic cells")

hippocampus_glutamatergic <- subset(hippocampus, cells = WhichCells(
  hippocampus, expr = predicted.cell_class_percluster == "Glutamatergic"))
table(hippocampus_glutamatergic$predicted.cell_class_percluster)

## 
##  Non-neuronal Glutamatergic     GABAergic 
##             0         27666             0

# call subcluster_seurat() function in functions_r/scRNAseq.R
hippocampus_glutamatergic <- subcluster_seurat(hippocampus_glutamatergic, 
                                               key = "Glutamatergic", 
                                               resolution = 1)

## Performing TF-IDF normalization

## Running SVD

## Scaling cell embeddings

cowplot::plot_grid(umap(hippocampus_glutamatergic, 
                        color_by = "predicted.cluster_broad", 
                        colors = palette_labels), 
                   umap(hippocampus_glutamatergic, 
                        color_by = "Clusters_subset_Glutamatergic"))

# added by James: bar plot for cell type proportion per cluster
hippocampus_glutamatergic@meta.data %>%
  ggplot(aes(x = seurat_clusters)) +
  geom_bar(aes(fill = predicted.cluster_broad), position = "fill") +
  scale_fill_manual(values = palette_labels) +
  ylab('proportion')+
  ggtitle("hippocampus cluster breakdown for glutamatergic")

table(hippocampus_glutamatergic@meta.data$seurat_clusters)

## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 3756 1811 1799 1710 1462 1455 1346 1314 1200 1129 1009  888  862  708  643  550 
##   17   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32 
##  545  526  505  458  394  392  371  365  358  350  328  324  280  260  165  161 
##   33   34   35 
##  130   62   50

# for each subcluster, first get the majority vote. Then, if any clusters have
# the same majority, add a numeric suffix (1, 2, etc)

# get the most common cell class for each cluster
df_glutamatergic <- hippocampus_glutamatergic@meta.data %>% 
  dplyr::select(Clusters_subset_Glutamatergic, predicted.cluster_broad) %>% 
  group_by(Clusters_subset_Glutamatergic, predicted.cluster_broad) %>%
  dplyr::summarise(n_per = dplyr::n()) %>% 
  top_n(1, n_per) %>%
  # add suffixes to clusters with the same majority vote, only if there's more than one cluster
  group_by(predicted.cluster_broad) %>%
  dplyr::mutate(n_per_label = dplyr::n()) %>% 
  dplyr::mutate(new_label = case_when(
    n_per_label > 1 ~ paste0(predicted.cluster_broad, "_", row_number()),
    TRUE ~ predicted.cluster_broad
  ))

## `summarise()` has grouped output by 'Clusters_subset_Glutamatergic'. You can
## override using the `.groups` argument.

df_glutamatergic

hippocampus_glutamatergic@meta.data$cell_type_final <- plyr::mapvalues(
  hippocampus_glutamatergic@meta.data$Clusters_subset_Glutamatergic,
  from = df_glutamatergic$Clusters_subset_Glutamatergic,
  to = df_glutamatergic$new_label)

Idents(hippocampus_glutamatergic) <- "cell_type_final"

plot_dr(hippocampus_glutamatergic, color_by = "predicted.cluster_broad", 
        point_size = 0.2, label = TRUE, reduction = "harm.wnn.umap_batch", 
        colors = palette_labels, label_size = 5, legend = TRUE, 
        title = glue("Hippocampus glutamatergic neurons, final cell type labels, \nn = {ncol(hippocampus_glutamatergic)}"))

saveRDS(hippocampus_glutamatergic, file = file.path(
  merged_dir, "Hippocampus.subset_glutamatergic.seurat.Rds"))

# call markers on subsetted object
markers_glutamatergic <- FindAllMarkers(hippocampus_glutamatergic, 
                                        only.pos = TRUE, min.pct = 0.25, 
                                        logfc.threshold = 0.25)

## Calculating cluster CA3_1

## Calculating cluster CA3_2

## Calculating cluster DG GC_1

## Calculating cluster DG GC_2

## Calculating cluster DG GC_3

## Calculating cluster DG GC_4

## Calculating cluster CA3_3

## Calculating cluster DG GC_5

## Calculating cluster DG GC_6

## Calculating cluster CA1 dorsal_1

## Calculating cluster DG GC_7

## Calculating cluster DG GC_8

## Calculating cluster CA1 dorsal_2

## Calculating cluster Oligo

## Calculating cluster CA3_4

## Calculating cluster DG GC_9

## Calculating cluster DG GC_10

## Calculating cluster DG GC_11

## Calculating cluster DG GC_12

## Calculating cluster DG GC_13

## Calculating cluster DG GC_14

## Calculating cluster DG GC_15

## Calculating cluster DG GC_16

## Calculating cluster EC L2_1

## Calculating cluster DG GC_17

## Calculating cluster EC L2_2

## Calculating cluster EC L2_3

## Calculating cluster SUB proximal

## Calculating cluster CA1 dorsal_3

## Calculating cluster CA1 ventral

## Calculating cluster DG MC

## Calculating cluster SUB distal

## Calculating cluster DG GC_18

## Calculating cluster EC L6

## Calculating cluster EC L3

write_tsv(markers_glutamatergic, file.path(
  merged_dir, "Hippocampus.subset_glutamatergic.markers.tsv"))

top5 <- markers_glutamatergic %>%
    group_by(cluster) %>%
    slice_max(n = 5, order_by = avg_log2FC)
DoHeatmap(subset(x = hippocampus_glutamatergic, downsample = 100), 
          features = top5$gene) + viridis::scale_fill_viridis()

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

hippocampus_glutamatergic@meta.data %>%
    ggplot(aes(x = seurat_clusters)) +
    geom_bar(aes(fill = condition), position = "fill")

hippocampus_glutamatergic@meta.data %>%
    ggplot(aes(x = seurat_clusters)) +
    geom_bar(aes(fill = sample), position = "fill")

4.2 GABAergic

plot_dr(hippocampus, color_by = "predicted.cluster_broad", point_size = 0.2, 
        label = FALSE, reduction = "harm.wnn.umap_batch", colors = palette_labels,
    cells = WhichCells(hippocampus, expr = predicted.cell_class == "GABAergic"),
    label_size = 5, legend = FALSE,
    title = "Hippocampus, highlighting GABAergic cells")

hippocampus_gabaergic <- subset(hippocampus, cells = WhichCells(
  hippocampus, expr = predicted.cell_class_percluster == "GABAergic"))
table(hippocampus_gabaergic$predicted.cell_class_percluster)

## 
##  Non-neuronal Glutamatergic     GABAergic 
##             0             0          3244

hippocampus_gabaergic <- subcluster_seurat(hippocampus_gabaergic, key = "GABAergic", resolution = 1)

## Performing TF-IDF normalization

## Running SVD

## Scaling cell embeddings

cowplot::plot_grid(umap(hippocampus_gabaergic, 
                        color_by = "predicted.cluster_broad", 
                        colors = palette_labels), 
                   umap(hippocampus_gabaergic, 
                        color_by = "Clusters_subset_GABAergic"))

# added by James: bar plot for cell type percentage per cluster
hippocampus_gabaergic@meta.data %>%
  ggplot(aes(x = seurat_clusters)) +
  geom_bar(aes(fill = predicted.cluster_broad), position = "fill") +
  scale_fill_manual(values = palette_labels) +
  ylab('proportion')+
  ggtitle("hippocampus cluster breakdown for gabaergic")

table(hippocampus_gabaergic@meta.data$seurat_clusters)

## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17 
## 402 309 277 265 245 228 196 192 183 176 172 162 120 100  80  80  57

# for each subcluster, first get the majority vote. Then, if any clusters have
# the same majority, add a numeric suffix (1, 2, etc)

# get the most common cell class for each cluster
df_gabaergic <- hippocampus_gabaergic@meta.data %>% 
  dplyr::select(Clusters_subset_GABAergic, predicted.cluster_broad) %>% 
  group_by(Clusters_subset_GABAergic, predicted.cluster_broad) %>%
  dplyr::summarise(n_per = dplyr::n()) %>% 
  top_n(1, n_per) %>%
  # add suffixes to clusters with the same majority vote, only if there's more than one cluster
  group_by(predicted.cluster_broad) %>%
  dplyr::mutate(n_per_label = dplyr::n()) %>% 
  dplyr::mutate(new_label = case_when(
    n_per_label > 1 ~ paste0(predicted.cluster_broad, "_", row_number()),
    TRUE ~ predicted.cluster_broad
  ))

## `summarise()` has grouped output by 'Clusters_subset_GABAergic'. You can
## override using the `.groups` argument.

df_gabaergic

hippocampus_gabaergic@meta.data$cell_type_final <- plyr::mapvalues(
  hippocampus_gabaergic@meta.data$Clusters_subset_GABAergic,
  from = df_gabaergic$Clusters_subset_GABAergic,
  to = df_gabaergic$new_label)

Idents(hippocampus_gabaergic) <- "cell_type_final"

saveRDS(hippocampus_gabaergic, file = file.path(
  merged_dir, "Hippocampus.subset_gabaergic.seurat.Rds"))

plot_dr(hippocampus_gabaergic, color_by = "predicted.cluster_broad", 
        point_size = 0.2, label = TRUE, reduction = "harm.wnn.umap_batch", 
        colors = palette_labels,
    label_size = 5, legend = TRUE,
    title = glue("Hippocampus GABAergic neurons, final cell type labels, \nn = {ncol(hippocampus_gabaergic)}"))

# call markers on subsetted object
markers_gabaergic <- FindAllMarkers(hippocampus_gabaergic, only.pos = TRUE, 
                                    min.pct = 0.25, logfc.threshold = 0.25)

## Calculating cluster InN LAMP5_1

## Calculating cluster InN PVALB_1

## Calculating cluster InN LAMP5_2

## Calculating cluster InN SST_1

## Calculating cluster InN VIP_1

## Calculating cluster InN VIP_2

## Calculating cluster InN SST_2

## Calculating cluster InN NR2F2_1

## Calculating cluster InN VIP_3

## Calculating cluster InN LAMP5_3

## Calculating cluster InN VIP_4

## Calculating cluster InN VIP_5

## Calculating cluster InN SST_3

## Calculating cluster InN NR2F2_2

## Calculating cluster InN NR2F2_3

## Calculating cluster InN PVALB_2

## Calculating cluster InN VIP_6

write_tsv(markers_gabaergic, 
          file.path(merged_dir, "Hippocampus.subset_gabaergic.markers.tsv"))

top5 <- markers_gabaergic %>%
    group_by(cluster) %>%
    slice_max(n = 5, order_by = avg_log2FC)
DoHeatmap(subset(x = hippocampus_gabaergic, downsample = 100), 
          features = top5$gene) + viridis::scale_fill_viridis()

## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

hippocampus_gabaergic@meta.data %>%
    ggplot(aes(x = seurat_clusters)) +
    geom_bar(aes(fill = condition), position = "fill")

hippocampus_gabaergic@meta.data %>%
    ggplot(aes(x = seurat_clusters)) +
    geom_bar(aes(fill = sample), position = "fill")

4.3 Assemble final annotations

Specify certain subclusters to label as “unknown” due to lack of clear markers or mixed annotations.

# define clusters to drop, then convert the column back to character; and replace those labels
Idents(hippocampus_glutamatergic) <- "seurat_clusters"
clusters_unk_glut <- c('14', '20', '22', '24', '26', '27', '28')
cells_unk_glut <- WhichCells(hippocampus_glutamatergic, idents = clusters_unk_glut)
hippocampus_glutamatergic$cell_type_final <- as.character(
  hippocampus_glutamatergic$cell_type_final)
hippocampus_glutamatergic$cell_type_final[cells_unk_glut] <- "Unknown"

Idents(hippocampus_gabaergic) <- "seurat_clusters"
clusters_unk_gaba <- c("4", "11", "17")
cells_unk_gaba <- WhichCells(hippocampus_gabaergic, idents = clusters_unk_gaba)
hippocampus_gabaergic$cell_type_final <- as.character(hippocampus_gabaergic$cell_type_final)
hippocampus_gabaergic$cell_type_final[cells_unk_gaba] <- "Unknown"

# as a default, set the final cell type to the majority cell type per cluster
hippocampus$cell_type_final <- as.character(
  hippocampus$predicted.cell_type_percluster)

hippocampus@meta.data$Clusters_all <- hippocampus@meta.data$seurat_clusters
hippocampus@meta.data$Clusters_subset_glutamatergic <- NA
hippocampus@meta.data$Clusters_subset_GABAergic <- NA

# add glutamatergic subclusters into main object
cells_glut <- Cells(hippocampus_glutamatergic)
hippocampus@meta.data[cells_glut, ]$Clusters_subset_glutamatergic <- hippocampus_glutamatergic@meta.data$Clusters_subset_Glutamatergic
hippocampus@meta.data[cells_glut, ]$cell_type_final <- as.character(hippocampus_glutamatergic@meta.data$cell_type_final)

# add gabaergic subclusters into main object
cells_gab <- Cells(hippocampus_gabaergic)
hippocampus@meta.data[cells_gab, ]$Clusters_subset_GABAergic <- hippocampus_gabaergic@meta.data$Clusters_subset_GABAergic
hippocampus@meta.data[cells_gab, ]$cell_type_final <- as.character(hippocampus_gabaergic@meta.data$cell_type_final)

# relabel with harmonized cell type labels
label_map_new <- data.frame(Label_granular = unique(hippocampus@meta.data$cell_type_final)) %>% 
  mutate(Label_granular_old = Label_granular) %>% 
  separate(Label_granular, into = c("Label_wo_suffix", "suffix"), sep = "_") %>% 
  left_join(labels, by = c("Label_wo_suffix" = "Label")) %>% 
  mutate(Label_granular = case_when(
    !is.na(suffix) ~ paste0(Label_harmonized, "_", suffix),
    TRUE ~ Label_harmonized
  )) %>% 
  arrange(Order)

hippocampus@meta.data$cell_type_granular <- plyr::mapvalues(
  hippocampus@meta.data$cell_type_final, 
  from = label_map_new$Label_granular_old, to = label_map_new$Label_granular)

hippocampus@meta.data$cell_type_broad <- plyr::mapvalues(
  hippocampus@meta.data$cell_type_final, from = label_map_new$Label_granular_old, 
  to = label_map_new$Label_harmonized)

hippocampus@meta.data$cell_type_final <- NULL

# put the labels in the cell type order from our label map, to get a rational order
hippocampus_order <- label_map_new %>%
  dplyr::filter(Label_granular %in% hippocampus$cell_type_granular) %>%
  pull(Label_granular) %>% base::unique()
hippocampus_order

##  [1] "OPC"                     "Oligo"                  
##  [3] "Astro"                   "Microglia PVM"          
##  [5] "Endo"                    "Vascular leptomeningeal"
##  [7] "VIP_3"                   "VIP_5"                  
##  [9] "VIP_1"                   "VIP_2"                  
## [11] "NR2F2_1"                 "NR2F2_3"                
## [13] "NR2F2_2"                 "SST_3"                  
## [15] "SST_2"                   "PVALB_1"                
## [17] "PVALB_2"                 "LAMP5_1"                
## [19] "LAMP5_3"                 "LAMP5_2"                
## [21] "L3"                      "L6"                     
## [23] "DG granule_7"            "DG granule_5"           
## [25] "DG granule_10"           "DG granule_9"           
## [27] "DG granule_6"            "DG granule_11"          
## [29] "DG granule_1"            "DG granule_4"           
## [31] "DG granule_12"           "DG granule_14"          
## [33] "DG granule_16"           "DG granule_8"           
## [35] "DG granule_17"           "DG granule_3"           
## [37] "DG granule_2"            "DG granule_18"          
## [39] "DG mossy"                "CA3_4"                  
## [41] "CA3_2"                   "CA3_1"                  
## [43] "CA3_3"                   "CA1 dorsal_1"           
## [45] "CA1 dorsal_2"            "CA1 dorsal_3"           
## [47] "CA1 ventral"             "Subiculum distal"       
## [49] "Unknown"

hippocampus_order_broad <- labels %>%
  dplyr::filter(Label_harmonized %in% unique(hippocampus$cell_type_broad)) %>%
  pull(Label_harmonized) %>%
  base::unique()
hippocampus_order_broad

##  [1] "OPC"                     "Oligo"                  
##  [3] "Astro"                   "Microglia PVM"          
##  [5] "Endo"                    "Vascular leptomeningeal"
##  [7] "VIP"                     "NR2F2"                  
##  [9] "SST"                     "PVALB"                  
## [11] "LAMP5"                   "L3"                     
## [13] "L6"                      "DG granule"             
## [15] "DG mossy"                "CA3"                    
## [17] "CA1 dorsal"              "CA1 ventral"            
## [19] "Subiculum distal"        "Unknown"

# put the labels in the cell type order from our label map, to get a rational order
hippocampus$cell_type_broad <- factor(hippocampus$cell_type_broad, 
                                      levels = hippocampus_order_broad)
hippocampus$cell_type_granular <- factor(hippocampus$cell_type_granular, 
                                         levels = hippocampus_order)
Idents(hippocampus) <- "cell_type_granular"

plot_dr(hippocampus, color_by = "cell_type_broad", point_size = 0.2, label = TRUE,
        reduction = "harm.wnn.umap_batch", colors = palette_labels_harmonized,
    label_size = 5, legend = TRUE,
    title = glue("DLPFC AD, final cell type labels, n = {ncol(hippocampus)}"))

Idents(hippocampus) <- "cell_type_broad"
plot_dr(hippocampus, color_by = "cell_type_broad", point_size = 0.2, label = TRUE,
        reduction = "harm.wnn.umap_batch", colors = palette_labels_harmonized,
        label_size = 5, legend = TRUE,
        title = glue("hippocampus, final cell type labels, broad, n = {ncol(hippocampus)}"))

Plot cluster numbers:

hippocampus@meta.data %>% 
  group_by(cell_type_broad, cell_type_granular) %>% 
  count() %>% 
  ggplot(aes(x = cell_type_granular, y = n)) +
  geom_bar(aes(fill = cell_type_broad), stat = "identity") +
  geom_text(aes(label = n, color = cell_type_broad), hjust = -0.1, angle = 90) +
  scale_fill_manual(values = palette_labels_harmonized) +
  scale_color_manual(values = palette_labels_harmonized) +
  rotate_x() +
  ggtitle("Number of cells per granular cluster")

hippocampus@meta.data %>% 
  group_by(cell_type_broad) %>% 
  count() %>% 
  ggplot(aes(x = cell_type_broad, y = log10(n))) +
  geom_bar(aes(fill = cell_type_broad), stat = "identity") +
  geom_text(aes(label = n, color = cell_type_broad), hjust = -0.1, angle = 90) +
  scale_fill_manual(values = palette_labels_harmonized) +
  scale_color_manual(values = palette_labels_harmonized) +
  rotate_x() +
  ggtitle("Number of cells per broad cluster")

hippocampus@meta.data %>% 
  group_by(cell_type_broad) %>% 
  summarize(sum_ncount = sum(nCount_ATAC)) %>% 
  ggplot(aes(x = cell_type_broad, y = log10(sum_ncount))) +
  geom_bar(aes(fill = cell_type_broad), stat = "identity") +
  geom_text(aes(label = sum_ncount, color = cell_type_broad), hjust = -0.1, 
            angle = 90) +
  scale_fill_manual(values = palette_labels_harmonized) +
  scale_color_manual(values = palette_labels_harmonized) +
  rotate_x() +
  ggtitle("Number of fragments in peaks, per broad cluster") +
  coord_cartesian(ylim = c(0, 10))

DotPlot(hippocampus, assay = "RNA", group.by = "cell_type_granular", features = rev(c(
  "PDGFRA",
    "PLP1",
    "AQP4",
    "LY86",
    "SKAP1",
    "VWF",
    "GAD1",
    "VIP",
    "NR2F2",
    "SST",
  "PVALB",
    "LAMP5",
    "SLC17A6",
    "PROX1", # DG
  "FGF5",
  "POU3F1"
))) +
    ylab("Cell type") +
    coord_flip() +
    rotate_x()

DotPlot(hippocampus, assay = "RNA", group.by = "cell_type_broad", features = rev(c(
  "PDGFRA",
    "PLP1",
    "AQP4",
    "LY86",
    "SKAP1",
    "VWF",
    "GAD1",
    "VIP",
    "NR2F2",
    "SST",
  "PVALB",
    "LAMP5",
    "SLC17A6",
    "PROX1", # DG
  "FGF5",
  "POU3F1"
))) +
  ylab("Cell type") +
  coord_flip() +
  rotate_x()

hippocampus_meta <- hippocampus@meta.data
saveRDS(hippocampus_meta, file = file.path(
  merged_dir, "Hippocampus.integrated.batch.analyzed.cell_metadata.labeled.rds"))

5 Call markers

Idents(hippocampus) <- "cell_type_broad"
markers_hippocampus <- FindAllMarkers(hippocampus, only.pos = TRUE, 
                                      logfc.threshold = 0.25, 
                                      max.cells.per.ident = 1000) %>% 
  mutate(pct.diff = pct.1 - pct.2)

## Calculating cluster OPC

## Calculating cluster Oligo

## Calculating cluster Astro

## Calculating cluster Microglia PVM

## Calculating cluster Endo

## Calculating cluster Vascular leptomeningeal

## Calculating cluster VIP

## Calculating cluster NR2F2

## Calculating cluster SST

## Calculating cluster PVALB

## Calculating cluster LAMP5

## Calculating cluster L2

## Calculating cluster L3

## Calculating cluster L6

## Calculating cluster DG granule

## Calculating cluster DG mossy

## Calculating cluster CA3

## Calculating cluster CA1 dorsal

## Calculating cluster CA1 ventral

## Calculating cluster Subiculum distal

## Calculating cluster Subiculum proximal

## Calculating cluster Unknown

write_tsv(markers_hippocampus, file.path(merged_dir, "Hippocampus.markers.cell_type_broad.tsv"))

6 Session info

sessionInfo()

## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/libf77blas.so.3.10.3
## LAPACK: /users/guozy/miniconda3/envs/psych/lib/libopenblasp-r0.3.27.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=C             
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3 viridis_0.6.5      viridisLite_0.4.2  Signac_1.13.0     
##  [5] ggplot2_3.5.1      SeuratObject_5.0.0 Seurat_4.3.0       glue_1.7.0        
##  [9] readr_2.1.5        purrr_1.0.2        dplyr_1.1.4        tidyr_1.3.0       
## [13] here_1.0.1         Cairo_1.6-2       
## 
## loaded via a namespace (and not attached):
##   [1] spatstat.univar_3.0-0  spam_2.10-0            fastmatch_1.1-4       
##   [4] plyr_1.8.9             igraph_2.0.3           lazyeval_0.2.2        
##   [7] sp_2.1-4               splines_4.2.0          BiocParallel_1.32.6   
##  [10] listenv_0.9.1          scattermore_1.2        GenomeInfoDb_1.34.9   
##  [13] digest_0.6.36          htmltools_0.5.8.1      fansi_1.0.6           
##  [16] magrittr_2.0.3         tensor_1.5             cluster_2.1.6         
##  [19] ROCR_1.0-11            limma_3.54.2           tzdb_0.4.0            
##  [22] Biostrings_2.66.0      globals_0.16.3         matrixStats_1.3.0     
##  [25] vroom_1.6.5            spatstat.sparse_3.1-0  colorspace_2.1-0      
##  [28] rappdirs_0.3.3         ggrepel_0.9.5          xfun_0.46             
##  [31] crayon_1.5.3           RCurl_1.98-1.16        jsonlite_1.8.8        
##  [34] progressr_0.14.0       spatstat.data_3.1-2    survival_3.7-0        
##  [37] zoo_1.8-12             polyclip_1.10-6        gtable_0.3.5          
##  [40] zlibbioc_1.44.0        XVector_0.38.0         leiden_0.4.3.1        
##  [43] future.apply_1.11.2    BiocGenerics_0.44.0    abind_1.4-5           
##  [46] scales_1.3.0           pheatmap_1.0.12        spatstat.random_3.3-1 
##  [49] miniUI_0.1.1.1         Rcpp_1.0.11            xtable_1.8-4          
##  [52] reticulate_1.38.0      bit_4.0.5              dotCall64_1.1-1       
##  [55] stats4_4.2.0           htmlwidgets_1.6.4      httr_1.4.7            
##  [58] ica_1.0-3              farver_2.1.2           pkgconfig_2.0.3       
##  [61] sass_0.4.9             uwot_0.2.2             deldir_2.0-4          
##  [64] utf8_1.2.4             labeling_0.4.3         tidyselect_1.2.1      
##  [67] rlang_1.1.4            reshape2_1.4.4         later_1.3.2           
##  [70] munsell_0.5.1          tools_4.2.0            cachem_1.1.0          
##  [73] cli_3.6.3              generics_0.1.3         ggridges_0.5.6        
##  [76] evaluate_0.24.0        stringr_1.5.1          fastmap_1.2.0         
##  [79] yaml_2.3.9             goftest_1.2-3          bit64_4.0.5           
##  [82] knitr_1.48             fitdistrplus_1.2-1     RANN_2.6.1            
##  [85] pbapply_1.7-2          future_1.33.2          nlme_3.1-165          
##  [88] mime_0.12              RcppRoll_0.3.1         compiler_4.2.0        
##  [91] plotly_4.10.4          png_0.1-8              spatstat.utils_3.0-5  
##  [94] tibble_3.2.1           bslib_0.7.0            stringi_1.8.4         
##  [97] highr_0.11             lattice_0.22-6         Matrix_1.6-1.1        
## [100] vctrs_0.6.5            pillar_1.9.0           lifecycle_1.0.4       
## [103] spatstat.geom_3.3-2    lmtest_0.9-40          jquerylib_0.1.4       
## [106] RcppAnnoy_0.0.22       data.table_1.15.4      cowplot_1.1.3         
## [109] bitops_1.0-7           irlba_2.3.5.1          httpuv_1.6.15         
## [112] patchwork_1.2.0        GenomicRanges_1.50.2   R6_2.5.1              
## [115] promises_1.3.0         KernSmooth_2.23-24     gridExtra_2.3         
## [118] IRanges_2.32.0         parallelly_1.37.1      codetools_0.2-20      
## [121] MASS_7.3-56            rprojroot_2.0.4        withr_3.0.0           
## [124] sctransform_0.4.0      Rsamtools_2.14.0       S4Vectors_0.36.2      
## [127] GenomeInfoDbData_1.2.9 parallel_4.2.0         hms_1.1.3             
## [130] grid_4.2.0             rmarkdown_2.27         Rtsne_0.17            
## [133] spatstat.explore_3.3-1 shiny_1.8.1.1

Subclustering hippocampus

19 September, 2024