Step 13A: Microglia Pre-processing, Harmony Integration & Clustering

Extract microglia from the Fernando’s object, run Harmony integration by sample, and cluster

1 Setup and Load Seurat Object

Show code
library(Seurat)
library(tidyverse)
library(patchwork)
library(harmony)
library(ggrepel)
set.seed(1234)

project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"

fernando_path <- file.path(
  project_dir,
  "lncRNAs_PD_Fer",
  "snRNAseq",
  "seurat_object_annotated.rds"
)

results_dir <- file.path(
  project_dir,
  "results",
  "Step13_microglia_harmony_reclustering"
)

dir.create(results_dir, recursive = TRUE, showWarnings = FALSE)

if (!file.exists(fernando_path)) {
  stop("Fernando-annotated Seurat object not found at: ", fernando_path)
}

SO <- readRDS(fernando_path)
SO
An object of class Seurat 
87674 features across 46066 samples within 2 assays 
Active assay: SCT (39901 features, 3000 variable features)
 3 layers present: counts, data, scale.data
 1 other assay present: RNA
 3 dimensional reductions calculated: pca, umap, harmony

2 Inspect Full-Object UMAP

Show code
available_reductions <- Reductions(SO)
available_reductions

full_umap_reduction <- case_when(
  "umap_rpca" %in% available_reductions ~ "umap_rpca",
  "umap" %in% available_reductions ~ "umap",
  TRUE ~ NA_character_
)

if (!is.na(full_umap_reduction)) {
  DimPlot(
    SO,
    reduction = full_umap_reduction,
    group.by = "Cell_types",
    label = TRUE,
    repel = TRUE,
    pt.size = 0.4
  ) +
    labs(title = "Full Object: Cell-Type Annotations")
} else {
  message("No existing UMAP reduction found. Skipping full-object DimPlot.")
}

3 Check Metadata

Show code
required_metadata <- c("Cell_types", "condition", "Sample")

missing_metadata <- setdiff(
  required_metadata,
  colnames(SO@meta.data)
)

if (length(missing_metadata) > 0) {
  stop(
    "Missing required metadata columns: ",
    paste(missing_metadata, collapse = ", ")
  )
}

# Create standardised columns
SO$fernando_celltype_annotation <- as.character(SO$Cell_types)
SO$condition_use <- as.character(SO$condition)
SO$sample_use   <- as.character(SO$Sample)

# Infer sex from sample name suffix
SO$sex <- case_when(
  stringr::str_detect(SO$sample_use, "_F$") ~ "Female",
  stringr::str_detect(SO$sample_use, "_M$") ~ "Male",
  TRUE ~ "Unknown"
)
SO$sex <- factor(SO$sex, levels = c("Female", "Male", "Unknown"))

table(SO$fernando_celltype_annotation)

      Astrocytes       Fibroblast        Microglia          Neurons 
            5259              100             3439             2616 
Oligodendrocytes             OPCs        Pericytes           T-cell 
           28717             2799             1114              130 
        Vascular 
            1892 
Show code
table(SO$condition_use)

    C    PD 
23527 22539 
Show code
table(SO$condition, SO$sex)
    
     Female  Male Unknown
  C   13933  9594       0
  PD  14064  8475       0
Show code
table(SO$sample_use)

NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F 
    3422      954     9557     4977     3460     1157     3592     6604 
PDSN04_F PDSN06_M PDSN07_M PDSN09_M 
    3868     3130     1002     4343 
Show code
# Save metadata summary tables
write.csv(
  as.data.frame(table(SO$fernando_celltype_annotation)),
  file.path(results_dir, "Step13A_fernando_celltype_annotation_table.csv"),
  row.names = FALSE
)
write.csv(
  as.data.frame(table(SO$condition_use)),
  file.path(results_dir, "Step13A_condition_table.csv"),
  row.names = FALSE
)
write.csv(
  as.data.frame(table(SO$sample_use)),
  file.path(results_dir, "Step13A_sample_table.csv"),
  row.names = FALSE
)

4 Extract Microglia

Show code
all_fernando_celltypes <- sort(unique(SO$fernando_celltype_annotation))

target_microglia <- all_fernando_celltypes[
  stringr::str_detect(
    all_fernando_celltypes,
    regex("micro", ignore_case = TRUE)
  )
]
target_microglia
[1] "Microglia"
Show code
if (length(target_microglia) == 0) {
  stop("No Fernando-defined microglia labels found. Check table(SO$fernando_celltype_annotation).")
}

SO_micro <- subset(
  SO,
  subset = fernando_celltype_annotation %in% target_microglia
)

if (ncol(SO_micro) == 0) {
  stop("No microglia cells remained after subsetting.")
}

# Transfer standardised metadata
SO_micro$micro_original_identity <- SO_micro$fernando_celltype_annotation
SO_micro$condition <- SO_micro$condition_use
SO_micro$sample   <- SO_micro$sample_use

table(SO_micro$micro_original_identity)

Microglia 
     3439 
Show code
table(SO_micro$condition)

   C   PD 
1748 1691 
Show code
table(SO_micro$condition, SO_micro$sex)
    
     Female Male Unknown
  C     889  859       0
  PD    909  782       0
Show code
table(SO_micro$sample)

NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F 
     357      120      412      628      202       29      120      182 
PDSN04_F PDSN06_M PDSN07_M PDSN09_M 
     607      378      124      280 
Show code
cat("Microglia cells retained:", ncol(SO_micro), "\n")
Microglia cells retained: 3439 

5 Sample-Level Microglia Fraction

The fraction is computed from the full object so that the denominator is total cells per sample, not microglia-only cells.

Show code
# Flag microglia in the full object
SO$is_microglia <- SO$fernando_celltype_annotation %in% target_microglia

# Determine which gene-count column exists
feature_col <- case_when(
  "nFeature_RNA" %in% colnames(SO@meta.data) ~ "nFeature_RNA",
  "nFeature_SCT" %in% colnames(SO@meta.data) ~ "nFeature_SCT",
  TRUE ~ NA_character_
)

if (is.na(feature_col)) {
  stop("Neither nFeature_RNA nor nFeature_SCT found in metadata.")
}

sample_microglia_qc <- SO@meta.data %>%
  mutate(
    is_microglia = SO$is_microglia,
    nFeature_use = .data[[feature_col]]
  ) %>%
  group_by(
    sample_use,
    condition_use,
    sex = case_when(
      stringr::str_detect(sample_use, "_F$") ~ "Female",
      stringr::str_detect(sample_use, "_M$") ~ "Male",
      TRUE ~ "Unknown"
    )
  ) %>%
  summarise(
    total_cells        = n(),
    microglia_cells    = sum(is_microglia),
    microglia_percent  = 100 * microglia_cells / total_cells,
    mean_gene_number   = mean(nFeature_use, na.rm = TRUE),
    median_gene_number = median(nFeature_use, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(microglia_percent))

write.csv(
  sample_microglia_qc,
  file.path(results_dir, "Step13A_sample_microglia_fraction_vs_gene_number.csv"),
  row.names = FALSE
)

5.1 Simple bar plot of microglia fraction per sample

Show code
plot_microglia_fraction <- function(sort_method) {
  if (sort_method == "microglia_percent") {
    sample_levels <- sample_microglia_qc %>% arrange(microglia_percent) %>% pull(sample_use)
  } else if (sort_method == "microglia_count") {
    sample_levels <- sample_microglia_qc %>% arrange(microglia_cells) %>% pull(sample_use)
  } else {
    # sample_name (coord_flipで上が若番になるよう逆順に並べる)
    sample_levels <- sort(unique(sample_microglia_qc$sample_use), decreasing = TRUE)
  }

  sample_microglia_qc_ordered <- sample_microglia_qc %>%
    mutate(sample_use = factor(sample_use, levels = sample_levels))

  sample_microglia_fraction_long <- sample_microglia_qc_ordered %>%
    mutate(other_cells = total_cells - microglia_cells) %>%
    select(
      sample_use,
      condition_use,
      total_cells,
      microglia_cells,
      other_cells,
      microglia_percent
    ) %>%
    pivot_longer(
      cols = c(microglia_cells, other_cells),
      names_to = "cell_category",
      values_to = "n_cells"
    ) %>%
    mutate(
      cell_category = case_when(
        cell_category == "microglia_cells" ~ "Microglia",
        cell_category == "other_cells" ~ "Other cells"
      ),
      cell_category = factor(
        cell_category,
        levels = c("Other cells", "Microglia")
      )
    )

  p <- ggplot(
    sample_microglia_fraction_long,
    aes(
      x = sample_use,
      y = n_cells,
      fill = cell_category
    )
  ) +
    geom_col(width = 0.75) +
    geom_text(
      data = sample_microglia_qc_ordered %>%
        mutate(
          label = sprintf(
            "%.1f%%  (%d / %d cells)",
            microglia_percent,
            microglia_cells,
            total_cells
          )
        ),
      aes(
        x = sample_use,
        y = total_cells,
        label = label
      ),
      inherit.aes = FALSE,
      hjust = -0.05,
      size = 3.2
    ) +
    coord_flip(clip = "off") +
    scale_y_continuous(
      expand = expansion(mult = c(0, 0.35))
    ) +
    labs(
      title = "Microglia Fraction per Sample",
      subtitle = "Each bar represents total cell count; label shows microglia fraction",
      x = "Sample",
      y = "Total cell count",
      fill = "Cell category"
    ) +
    theme_minimal() +
    theme(
      plot.margin = margin(5.5, 80, 5.5, 5.5)
    )
    
  return(p)
}
Show code
plot_microglia_fraction("microglia_percent")

Show code
plot_microglia_fraction("microglia_count")

Show code
plot_microglia_fraction("sample_name")

5.2 Sample-Level Microglia QC Table

Show code
DT::datatable(
  sample_microglia_qc,
  filter = "top",
  rownames = FALSE,
  options = list(pageLength = 10, scrollX = TRUE)
)

5.3 Save microglia input tables

6 Preprocess Microglia

PCA is re-computed within the microglia subset. This PCA is used as input for Harmony.

Show code
DefaultAssay(SO_micro) <- "RNA"

if ("Assay5" %in% class(SO_micro[["RNA"]])) {
  SO_micro <- JoinLayers(SO_micro, assay = "RNA")
}

SO_micro <- NormalizeData(SO_micro, verbose = FALSE)

SO_micro <- FindVariableFeatures(
  SO_micro,
  selection.method = "vst",
  nfeatures = 2000,
  verbose = FALSE
)

SO_micro <- ScaleData(SO_micro, verbose = FALSE)

SO_micro <- RunPCA(
  SO_micro,
  npcs = 40,
  reduction.name = "pca_micro",
  reduction.key = "microPCA_",
  verbose = FALSE
)

ElbowPlot(SO_micro, reduction = "pca_micro", ndims = 40) +
  labs(title = "Microglia PCA Elbow Plot")

7 Run Harmony

Harmony is applied using sample identity as the grouping variable. Condition is not used as a Harmony variable because disease status is biologically relevant and should not be regressed out.

Show code
harmony_group_var <- "sample"

if (!harmony_group_var %in% colnames(SO_micro@meta.data)) {
  stop("Harmony grouping variable not found in metadata: ", harmony_group_var)
}

SO_micro <- RunHarmony(
  object = SO_micro,
  group.by.vars = harmony_group_var,
  reduction = "pca_micro",
  reduction.save = "harmony_micro",
  verbose = TRUE
)

8 Clustering and UMAP on Harmony Embeddings

Show code
set.seed(1234)

dims_use_harmony <- 1:30
resolution_use_harmony <- 0.5

SO_micro <- FindNeighbors(
  SO_micro,
  reduction = "harmony_micro",
  dims = dims_use_harmony,
  graph.name = "micro_harmony_snn",
  verbose = FALSE
)

SO_micro <- FindClusters(
  SO_micro,
  graph.name = "micro_harmony_snn",
  resolution = resolution_use_harmony,
  verbose = FALSE
)

SO_micro$seurat_clusters_harmony <- as.character(SO_micro$seurat_clusters)

SO_micro <- RunUMAP(
  SO_micro,
  reduction = "harmony_micro",
  dims = dims_use_harmony,
  reduction.name = "umap_micro_harmony",
  reduction.key = "microHarmonyUMAP_",
  verbose = FALSE
)

# Run UMAP on unintegrated PCA for comparison
SO_micro <- RunUMAP(
  SO_micro,
  reduction = "pca_micro",
  dims = dims_use_harmony,
  reduction.name = "umap_micro_unintegrated",
  reduction.key = "microUnintegratedUMAP_",
  verbose = FALSE
)

table(SO_micro$seurat_clusters_harmony)

   0    1    2    3    4    5 
1523 1191  406  153  142   24 

9 Microglia Cluster Composition per Sample

Show code
if (!"seurat_clusters_harmony" %in% colnames(SO_micro@meta.data)) {
  stop("seurat_clusters_harmony not found. Run clustering first.")
}

cluster_fraction_by_sample <- SO_micro@meta.data %>%
  dplyr::count(
    sample,
    condition,
    sex,
    seurat_clusters_harmony,
    name = "n_cells"
  ) %>%
  dplyr::group_by(sample) %>%
  dplyr::mutate(
    total_microglia = sum(n_cells),
    cluster_percent = 100 * n_cells / total_microglia
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    sample_label = paste0(sample, "\n(N = ", total_microglia, ")"),
    percent_label = ifelse(
      cluster_percent >= 5,
      paste0(round(cluster_percent, 1), "%"),
      ""
    ),
    hover_text = paste0("Cells in cluster: ", n_cells)
  )


write.csv(
  cluster_fraction_by_sample,
  file.path(results_dir, "Step13A_cluster_composition_per_sample.csv"),
  row.names = FALSE
)

p_cluster_fraction <- ggplot(
  cluster_fraction_by_sample,
  aes(
    x = sample_label,
    y = cluster_percent,
    fill = seurat_clusters_harmony,
    text = hover_text
  )
) +
  geom_col(width = 0.75, position = position_stack(reverse = TRUE)) +
  geom_text(
    aes(label = percent_label),
    position = position_stack(vjust = 0.5, reverse = TRUE),
    size = 3
  ) +
  coord_flip() +
  labs(
    title = "Microglia Cluster Composition per Sample",
    subtitle = "Each bar represents the composition of microglia subclusters within each sample",
    x = NULL,
    y = "Percent of microglia cells",
    fill = "Harmony cluster"
  ) +
  theme_classic()

plotly::ggplotly(
  p_cluster_fraction,
  tooltip = "text"
)

9.1 Cluster Composition Data Table

Show code
DT::datatable(
  cluster_fraction_by_sample,
  filter = "top",
  rownames = FALSE,
  options = list(pageLength = 10, scrollX = TRUE)
)

10 Visualise Clusters

10.1 Before and After Harmony Integration

Show code
p_before <- DimPlot(
  SO_micro,
  reduction = "umap_micro_unintegrated",
  group.by = "sample_use",
  shuffle = TRUE,
  pt.size = 0.4
) +
  labs(title = "Before Harmony (PCA-based UMAP)") +
  theme(legend.position = "none")

p_after <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "sample_use",
  shuffle = TRUE,
  pt.size = 0.4
) +
  labs(title = "After Harmony")

p_before | p_after

10.2 Cluster Annotations

Show code
p_harmony_cluster <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.5
) +
  labs(title = "Harmony-Corrected Microglia Subclusters")

p_harmony_condition <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "condition",
  pt.size = 0.4
) +
  labs(title = "Condition Distribution")

p_harmony_sex <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "sex",
  pt.size = 0.4
) +
  labs(title = "Sex Distribution")

p_harmony_sample <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "sample",
  pt.size = 0.4
) +
  labs(title = "Sample Distribution")

p_split_condition <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "condition",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.35
) +
  labs(title = "Clusters Split by Condition")

p_split_sex <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "sex",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.35
) +
  labs(title = "Clusters Split by Sex")

p_split_sample <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "sample",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.25,
  ncol = 4
) +
  labs(title = "Clusters Split by Sample")
Show code
p_harmony_cluster | p_harmony_condition

Show code
table(SO_micro$condition)

   C   PD 
1748 1691 
Show code
p_split_condition

Show code
p_harmony_cluster | p_harmony_sex

Show code
table(SO_micro$sex)

 Female    Male Unknown 
   1798    1641       0 
Show code
p_split_sex

Show code
p_harmony_cluster | p_harmony_sample

Show code
table(SO_micro$sample)

NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F 
     357      120      412      628      202       29      120      182 
PDSN04_F PDSN06_M PDSN07_M PDSN09_M 
     607      378      124      280 
Show code
p_split_sample

11 Save Step 13A Object