Step 10: Cell Type Annotation

Setup: Environment and Data

In this final step of the single-sample pipeline, we assign biological identities to our mathematical clusters and define the marker genes for these overarching cell types. First, we load the necessary libraries and the clustered dataset.

library(Seurat)
library(dplyr)
library(ggplot2)

# Pick up where the previous step left off: read the clustered Seurat object
# that was serialised to disk as an RDS file.
import_path <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026/scripts/SO_05_Clustered.rds"
SO <- readRDS(file = import_path)

Step 10A: Annotating Clusters

We map the numeric clusters to their biological cell types based on the classical marker genes we reviewed in Step 9. Several clusters often merge into a single broad lineage (e.g., Clusters 1 and 4 are both labelled Astrocyte, and Clusters 14 and 15 are both labelled Neuron), while clusters without convincing markers are left as "Unassigned".

# Lookup table from cluster number (as a string) to cell-type label.
# The trailing comment on each row records the evidence noted for that call.
new.cluster.ids <- c(
  "0"  = "Unassigned",              # low logFC, mostly lncRNAs
  "1"  = "Astrocyte",               # S100B, GPD1
  "2"  = "Oligodendrocyte",         # OPALIN, CD9, PLLP
  "3"  = "Unassigned",              # low logFC, ADAMTS18 overlap
  "4"  = "Astrocyte",               # SLC14A1 (ependymal-like)
  "5"  = "Stressed Cell",           # HSPA1A, HSPA1B, CRYAB
  "6"  = "Microglia / Macrophage",  # SYK, CD86, MS4A6A
  "7"  = "OPCs",                    # PDGFRA, PCDH15
  "8"  = "Unassigned",              # extremely low logFC, ADAMTS18
  "9"  = "Endothelial Cell",        # CLDN5, EMCN, ABCB1
  "10" = "Inhibitory Neuron",       # GAD2, EBF3, HOXB8
  "11" = "Fibroblast / Pericyte",   # COL1A2, FOXD1, NOTCH3
  "12" = "Reactive Astrocyte",      # CHI3L1, TNC
  "13" = "Unassigned Glial",        # CYP4F12
  "14" = "Neuron",                  # MCHR2, CBLN2
  "15" = "Neuron",                  # ST8SIA2
  "16" = "Excitatory Neuron",       # NEUROD1, GRM4, UNC13C
  "17" = "T Cell / NK Cell",        # CD3E, CD2, IL7R, GZMA
  "18" = "Fibroblast"               # COL6A3, CEMIP
)

# Replace the numeric identities on the Seurat object with the labels above;
# clusters that share a label end up under the same identity.
SO <- SO |>
  Seurat::RenameIdents(new.cluster.ids)

# UMAP plot used as a visual check of the annotation: groups are labelled
# directly on the plot (with repelled labels), so the legend is dropped.
clusters_annot <- Seurat::DimPlot(
  object = SO,
  reduction = "umap",
  pt.size = 0.5,
  label = TRUE,
  repel = TRUE
) + Seurat::NoLegend()

clusters_annot

Step 10B: Characterizing Final Cell Types

Because we have merged multiple sub-clusters, we perform differential expression analysis one final time to identify the core marker genes that define these broad cell type categories.

# 1. Differential expression for every current identity (the annotated cell
#    types): positive markers only, detected in at least 25% of cells.
markers <- Seurat::FindAllMarkers(
  object = SO,
  only.pos = TRUE,
  min.pct = 0.25,
  verbose = FALSE
)

# 2. Add a human-readable gene symbol next to each Ensembl ID
library(AnnotationDbi)
library(org.Hs.eg.db)

# When an Ensembl ID maps to several symbols, keep the first one.
markers$symbol <- mapIds(
  org.Hs.eg.db,
  keys = markers$gene,
  column = "SYMBOL",
  keytype = "ENSEMBL",
  multiVals = "first"
)

# IDs without a symbol in the annotation database keep their Ensembl ID so
# that no row ends up with a missing label.
symbol_missing <- is.na(markers$symbol)
markers$symbol[symbol_missing] <- markers$gene[symbol_missing]

# 3. For each cell type, keep the 5 markers with the largest average log2
#    fold change.
#    - with_ties = FALSE caps every cell type at exactly 5 rows; the default
#      (ties kept) can return more than 5 when fold changes are equal.
#    - ungroup() drops the grouping so later operations on `top_fc` are not
#      silently applied per cell type.
top_fc <- markers |>
  dplyr::group_by(cluster) |>
  dplyr::slice_max(order_by = avg_log2FC, n = 5, with_ties = FALSE) |>
  dplyr::ungroup()

# 4. Downsample (downsample = 500) to prevent memory crashes during heatmap
#    generation.
SO_subset <- subset(SO, downsample = 500)

# 5. Create Heatmap
# The object's features are keyed by Ensembl ID: `top_fc$gene` holds the IDs
# that were translated with keytype "ENSEMBL", while `top_fc$symbol` is only a
# display label. DoHeatmap() looks features up by the object's own feature
# names, so it must receive the IDs; passing symbols would leave (almost)
# nothing to plot. The symbols are applied afterwards as axis labels.
symbol_by_id <- stats::setNames(top_fc$symbol, top_fc$gene)

# Translate the Ensembl IDs on the axis to symbols, falling back to the ID
# itself for anything that is not in the lookup.
label_with_symbol <- function(ids) {
  labels <- unname(symbol_by_id[ids])
  ifelse(is.na(labels), ids, labels)
}

Seurat::DoHeatmap(SO_subset,
                  features = unique(top_fc$gene),
                  label = TRUE,
                  angle = 45) +
  ggplot2::scale_y_discrete(labels = label_with_symbol) +
  Seurat::NoLegend()

Heatmap of the top 5 marker genes defining each cell type.

Final Save: Single Sample Pipeline Complete

# Destination of the fully annotated object (end of the single-sample
# pipeline).
save_path <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026/scripts/SO_07_Annotated.RDS"

# Serialise the annotated Seurat object to disk, then report completion.
saveRDS(SO, file = save_path)
print("Analysis Complete! Final object saved successfully.")
[1] "Analysis Complete! Final object saved successfully."