Step 13B: PD vs Control Differential Expression, Pathway Enrichment & State Annotation

Sex-adjusted DE in all microglia and per cluster, GO enrichment, module scoring, and conservative state labeling

13B.1 Setup and Load Step 13A Object

The primary differential expression analysis is PD vs Control across all microglia, with sex included as a latent variable.

Harmony-based clusters are not used as DE groups. Instead, clusters are used after PD-vs-Control DE to identify which microglia subpopulations carry PD-associated transcriptional changes.

All expression analyses use the RNA assay. Harmony is used only for clustering and UMAP (Step 13A).

Show code
library(Seurat)
library(tidyverse)
library(patchwork)
library(ggrepel)
library(pheatmap)
set.seed(1234)

project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"

results_dir <- file.path(
  project_dir,
  "results",
  "Step13_microglia_harmony_reclustering"
)

input_path <- file.path(
  results_dir,
  "SO_micro_step13A_harmony_reclustered.rds"
)

if (!file.exists(input_path)) {
  stop("Step 13A object not found. Run Step13A first: ", input_path)
}

SO_micro <- readRDS(input_path)

DefaultAssay(SO_micro) <- "RNA"

if ("Assay5" %in% class(SO_micro[["RNA"]])) {
  SO_micro <- JoinLayers(SO_micro, assay = "RNA")
}

cat("Loaded Step 13A object:", ncol(SO_micro), "cells\n")
Loaded Step 13A object: 3439 cells
Show code
cat("Clusters:", paste(sort(unique(SO_micro$seurat_clusters_harmony)), collapse = ", "), "\n")
Clusters: 0, 1, 2, 3, 4, 5 

13B.2 Verify Metadata

Show code
# condition must be a factor with C first
SO_micro$condition <- factor(SO_micro$condition, levels = c("C", "PD"))

# sex must exist
if (!"sex" %in% colnames(SO_micro@meta.data)) {
  stop("Sex column is missing from metadata.")
}

# Summarise
table(SO_micro$condition)

   C   PD 
1748 1691 
Show code
table(SO_micro$sex)

 Female    Male Unknown 
   1798    1641       0 
Show code
table(SO_micro$condition, SO_micro$sex)
    
     Female Male Unknown
  C     889  859       0
  PD    909  782       0
Show code
# Check for Unknown sex
n_unknown_sex <- sum(SO_micro$sex == "Unknown", na.rm = TRUE)
cat("Cells with Unknown sex:", n_unknown_sex, "\n")
Cells with Unknown sex: 0 
Show code
# If all cells have Unknown sex, MAST cannot use sex as covariate
sex_usable <- n_unknown_sex < ncol(SO_micro) &&
  length(unique(SO_micro$sex[SO_micro$sex != "Unknown"])) >= 2

cat("Sex usable as covariate:", sex_usable, "\n")
Sex usable as covariate: TRUE 

13B.3 Overall DE: PD vs Control (Sex-Adjusted)

We use MAST with latent.vars = "sex" to account for sex differences. If MAST is unavailable or sex is not usable, we fall back to Wilcoxon.

Show code
# Helper function for DE with optional MAST + sex covariate
run_de_safe <- function(object, ident1, ident2, use_sex = TRUE,
                        min_pct = 0.05, logfc_threshold = 0.10) {

  mast_available <- requireNamespace("MAST", quietly = TRUE)

  # Check sex distribution
  sex_vals <- unique(object$sex)
  sex_ok <- use_sex && length(setdiff(sex_vals, "Unknown")) >= 2

  if (mast_available && sex_ok) {
    cat("  → MAST with latent.vars = 'sex'\n")
    de <- FindMarkers(
      object,
      ident.1 = ident1,
      ident.2 = ident2,
      assay = "RNA",
      test.use = "MAST",
      latent.vars = "sex",
      min.pct = min_pct,
      logfc.threshold = logfc_threshold,
      verbose = FALSE
    )
    attr(de, "de_method") <- "MAST_sex"
  } else {
    if (!mast_available) cat("  → MAST not installed; using Wilcoxon\n")
    if (!sex_ok) cat("  → Sex not usable as covariate; using Wilcoxon\n")
    de <- FindMarkers(
      object,
      ident.1 = ident1,
      ident.2 = ident2,
      assay = "RNA",
      test.use = "wilcox",
      min.pct = min_pct,
      logfc.threshold = logfc_threshold,
      verbose = FALSE
    )
    attr(de, "de_method") <- "wilcox"
  }

  return(de)
}

# Set identity to condition
Idents(SO_micro) <- "condition"

cat("Running overall PD vs Control DE...\n")
Running overall PD vs Control DE...
Show code
de_overall <- run_de_safe(SO_micro, ident1 = "PD", ident2 = "C", use_sex = sex_usable)
  → MAST with latent.vars = 'sex'
Show code
de_overall_method <- attr(de_overall, "de_method")
cat("Method used:", de_overall_method, "\n")
Method used: MAST_sex 
Show code
cat("Total genes tested:", nrow(de_overall), "\n")
Total genes tested: 5493 
Show code
# Add gene column
de_overall <- de_overall %>%
  tibble::rownames_to_column("gene")

# Significant genes
de_sig <- de_overall %>%
  filter(p_val_adj < 0.05) %>%
  arrange(desc(abs(avg_log2FC)))

cat("Significant genes (adj.p < 0.05):", nrow(de_sig), "\n")
Significant genes (adj.p < 0.05): 985 
Show code
pd_up_genes   <- de_sig %>% filter(avg_log2FC > 0) %>% pull(gene)
ctrl_up_genes <- de_sig %>% filter(avg_log2FC < 0) %>% pull(gene)

cat("PD-up:", length(pd_up_genes), " | Control-up:", length(ctrl_up_genes), "\n")
PD-up: 635  | Control-up: 350 
Show code
# Save
write.csv(
  de_overall,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_microglia_all.csv"),
  row.names = FALSE
)

write.csv(
  de_sig,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_microglia_significant.csv"),
  row.names = FALSE
)

13B.4 Volcano Plot: Overall PD vs Control

Show code
de_plot <- de_overall %>%
  mutate(
    direction = case_when(
      p_val_adj < 0.05 & avg_log2FC > 0   ~ "PD_up",
      p_val_adj < 0.05 & avg_log2FC < 0   ~ "Control_up",
      TRUE                                 ~ "NS"
    ),
    label_gene = ifelse(
      p_val_adj < 0.05 & abs(avg_log2FC) > 0.5,
      gene,
      NA_character_
    )
  )

ggplot(de_plot, aes(x = avg_log2FC, y = -log10(p_val_adj), color = direction)) +
  geom_point(alpha = 0.5, size = 1) +
  scale_color_manual(
    values = c("PD_up" = "#D32F2F", "Control_up" = "#1976D2", "NS" = "grey70")
  ) +
  geom_text_repel(
    aes(label = label_gene),
    size = 2.5,
    max.overlaps = 20,
    show.legend = FALSE
  ) +
  labs(
    title = paste0("Overall PD vs Control DE in Microglia (", de_overall_method, ")"),
    x = "Log2 Fold Change (PD / Control)",
    y = "-log10(Adjusted P-value)",
    color = "Direction"
  ) +
  theme_classic()

13B.5 DotPlot of Top DE Genes Across Harmony Clusters

This identifies which clusters carry the PD-associated transcriptional signature.

Show code
top_pd_genes   <- head(pd_up_genes, 15)
top_ctrl_genes <- head(ctrl_up_genes, 15)
top_de_genes   <- intersect(c(top_pd_genes, top_ctrl_genes), rownames(SO_micro))

if (length(top_de_genes) > 0) {
  DotPlot(
    SO_micro,
    features = top_de_genes,
    assay = "RNA",
    group.by = "seurat_clusters_harmony",
    cols = c("blue", "red"),
    dot.scale = 6
  ) +
    RotatedAxis() +
    labs(
      title = "Top PD-up & Control-up DE Genes Across Harmony Clusters",
      subtitle = "Genes ranked by |log2FC| from overall PD vs Control DE",
      y = "Harmony cluster"
    )
} else {
  message("No significant DE genes to plot.")
}

13B.6 Pathway Enrichment (GO Biological Process)

GO enrichment is run separately for PD-upregulated and Control-upregulated genes.

Show code
go_available <- requireNamespace("clusterProfiler", quietly = TRUE) &&
                requireNamespace("org.Hs.eg.db", quietly = TRUE)

if (go_available) {
  library(clusterProfiler)
  library(org.Hs.eg.db)

  run_go_bp <- function(gene_list, label) {
    if (length(gene_list) < 5) {
      cat("Skipping GO for", label, "- fewer than 5 genes.\n")
      return(NULL)
    }
    ego <- enrichGO(
      gene         = gene_list,
      OrgDb        = org.Hs.eg.db,
      keyType      = "SYMBOL",
      ont          = "BP",
      pAdjustMethod = "BH",
      qvalueCutoff = 0.2,
      readable     = TRUE
    )
    return(ego)
  }

  ego_pd   <- run_go_bp(pd_up_genes,   "PD_up")
  ego_ctrl <- run_go_bp(ctrl_up_genes,  "Control_up")

  if (!is.null(ego_pd)) {
    write.csv(
      as.data.frame(ego_pd),
      file.path(results_dir, "Step13B_GO_BP_PD_up_microglia.csv"),
      row.names = FALSE
    )
  }
  if (!is.null(ego_ctrl)) {
    write.csv(
      as.data.frame(ego_ctrl),
      file.path(results_dir, "Step13B_GO_BP_Control_up_microglia.csv"),
      row.names = FALSE
    )
  }
} else {
  cat("clusterProfiler or org.Hs.eg.db not installed. Skipping GO enrichment.\n")
  ego_pd   <- NULL
  ego_ctrl <- NULL
}

GO Dot Plots

Show code
if (!is.null(ego_pd) && nrow(as.data.frame(ego_pd)) > 0) {
  dotplot(ego_pd, showCategory = 10) +
    labs(title = "GO BP: PD-Upregulated Genes in Microglia")
} else {
  message("No significant GO terms for PD-up genes.")
}

Show code
if (!is.null(ego_ctrl) && nrow(as.data.frame(ego_ctrl)) > 0) {
  dotplot(ego_ctrl, showCategory = 10) +
    labs(title = "GO BP: Control-Upregulated Genes in Microglia")
} else {
  message("No significant GO terms for Control-up genes.")
}

13B.7 Cluster-Wise PD vs Control DE

Within each Harmony cluster, we compare PD vs Control cells. Sex is included as a covariate when both sexes are present and each group has ≥10 cells.

Show code
clusters <- sort(unique(SO_micro$seurat_clusters_harmony))
cluster_de_list   <- list()
method_summary_df <- tibble(
  cluster       = character(),
  n_cells_PD    = integer(),
  n_cells_C     = integer(),
  n_sexes       = integer(),
  method        = character(),
  n_sig_genes   = integer()
)

min_cells_per_group <- 10

for (cl in clusters) {
  cat("\n--- Cluster", cl, "---\n")

  cells_cl <- colnames(SO_micro)[SO_micro$seurat_clusters_harmony == cl]
  sub_obj  <- subset(SO_micro, cells = cells_cl)
  Idents(sub_obj) <- "condition"

  cond_tbl <- table(sub_obj$condition)
  n_pd <- as.integer(cond_tbl["PD"])
  n_c  <- as.integer(cond_tbl["C"])

  if (is.na(n_pd)) n_pd <- 0L
  if (is.na(n_c))  n_c  <- 0L

  cat("  PD:", n_pd, " C:", n_c, "\n")

  if (n_pd < min_cells_per_group || n_c < min_cells_per_group) {
    cat("  Skipped: insufficient cells.\n")
    method_summary_df <- method_summary_df %>%
      add_row(
        cluster     = cl,
        n_cells_PD  = n_pd,
        n_cells_C   = n_c,
        n_sexes     = NA_integer_,
        method      = "SKIPPED",
        n_sig_genes = NA_integer_
      )
    next
  }

  # Check sex distribution in this cluster
  sex_in_cluster <- unique(sub_obj$sex[sub_obj$sex != "Unknown"])
  n_sexes <- length(sex_in_cluster)

  de_cl <- run_de_safe(
    sub_obj,
    ident1 = "PD",
    ident2 = "C",
    use_sex = (n_sexes >= 2)   
  )

  de_method <- attr(de_cl, "de_method")

  de_cl <- de_cl %>%
    tibble::rownames_to_column("gene") %>%
    mutate(cluster = cl)

  n_sig <- sum(de_cl$p_val_adj < 0.05, na.rm = TRUE)
  cat("  Method:", de_method, " | Sig genes:", n_sig, "\n")

  # Save per-cluster CSV
  write.csv(
    de_cl,
    file.path(results_dir, paste0("Step13B_DE_PD_vs_Control_cluster_", cl, ".csv")),
    row.names = FALSE
  )

  cluster_de_list[[cl]] <- de_cl

  method_summary_df <- method_summary_df %>%
    add_row(
      cluster     = cl,
      n_cells_PD  = n_pd,
      n_cells_C   = n_c,
      n_sexes     = n_sexes,
      method      = de_method,
      n_sig_genes = n_sig
    )
}

--- Cluster 0 ---
  PD: 778  C: 745 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 275 

--- Cluster 1 ---
  PD: 626  C: 565 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 382 

--- Cluster 2 ---
  PD: 143  C: 263 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 29 

--- Cluster 3 ---
  PD: 57  C: 96 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 1 

--- Cluster 4 ---
  PD: 76  C: 66 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 4 

--- Cluster 5 ---
  PD: 11  C: 13 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 0 
Show code
# Combine all cluster-wise DE results
combined_cluster_de <- bind_rows(cluster_de_list)

write.csv(
  combined_cluster_de,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_clusterwise_all.csv"),
  row.names = FALSE
)

write.csv(
  method_summary_df,
  file.path(results_dir, "Step13B_clusterwise_DE_method_summary.csv"),
  row.names = FALSE
)

method_summary_df
Show code
sex_de_list <- list()

for (sex_group in c("Female", "Male")) {
  cat("\n--- Sex-stratified DE:", sex_group, "---\n")

  cells_sex <- colnames(SO_micro)[SO_micro$sex == sex_group]
  sub_sex <- subset(SO_micro, cells = cells_sex)

  cond_tbl <- table(sub_sex$condition)
  n_pd <- as.integer(cond_tbl["PD"])
  n_c  <- as.integer(cond_tbl["C"])

  if (is.na(n_pd)) n_pd <- 0L
  if (is.na(n_c))  n_c <- 0L

  cat("PD:", n_pd, " C:", n_c, "\n")

  if (n_pd < 10 || n_c < 10) {
    cat("Skipped:", sex_group, "- insufficient cells.\n")
    next
  }

  Idents(sub_sex) <- "condition"

  de_sex <- FindMarkers(
    sub_sex,
    ident.1 = "PD",
    ident.2 = "C",
    assay = "RNA",
    test.use = "wilcox",
    min.pct = 0.05,
    logfc.threshold = 0.10,
    verbose = FALSE
  ) %>%
    tibble::rownames_to_column("gene") %>%
    dplyr::mutate(sex = sex_group)

  sex_de_list[[sex_group]] <- de_sex

  write.csv(
    de_sex,
    file.path(results_dir, paste0("Step13B_DE_PD_vs_Control_", sex_group, "_microglia.csv")),
    row.names = FALSE
  )
}

--- Sex-stratified DE: Female ---
PD: 909  C: 889 

--- Sex-stratified DE: Male ---
PD: 782  C: 859 
Show code
sex_de_combined <- dplyr::bind_rows(sex_de_list)

write.csv(
  sex_de_combined,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_sex_stratified_microglia_all.csv"),
  row.names = FALSE
)

13B.7.5 GO Enrichment for Cluster 0 and Cluster 1 PD-vs-Control DE

GO enrichment is performed separately for PD-upregulated and Control-upregulated genes within cluster 0 and cluster 1.
This helps determine whether PD-associated transcriptional programs differ between homeostatic-like and DAM-like microglia states.

Show code
go_available <- requireNamespace("clusterProfiler", quietly = TRUE) &&
                requireNamespace("org.Hs.eg.db", quietly = TRUE)

if (go_available) {
  library(clusterProfiler)
  library(org.Hs.eg.db)

  run_go_bp_cluster <- function(gene_list, label) {
    gene_list <- unique(gene_list)
    
    if (length(gene_list) < 5) {
      message("Skipping GO for ", label, ": fewer than 5 genes.")
      return(NULL)
    }

    ego <- clusterProfiler::enrichGO(
      gene          = gene_list,
      OrgDb         = org.Hs.eg.db::org.Hs.eg.db,
      keyType       = "SYMBOL",
      ont           = "BP",
      pAdjustMethod = "BH",
      qvalueCutoff  = 0.2,
      readable      = TRUE
    )

    return(ego)
  }

  target_clusters_for_go <- c("0", "1")
  cluster_go_results <- list()

  for (cl in target_clusters_for_go) {
    message("Running GO enrichment for cluster ", cl, "...")

    de_cl <- combined_cluster_de %>%
      dplyr::filter(cluster == cl)

    if (nrow(de_cl) == 0) {
      message("No DE results found for cluster ", cl)
      next
    }

    cl_pd_up_genes <- de_cl %>%
      dplyr::filter(p_val_adj < 0.05, avg_log2FC > 0) %>%
      dplyr::pull(gene) %>%
      unique()

    cl_ctrl_up_genes <- de_cl %>%
      dplyr::filter(p_val_adj < 0.05, avg_log2FC < 0) %>%
      dplyr::pull(gene) %>%
      unique()

    message("Cluster ", cl, " PD-up genes: ", length(cl_pd_up_genes))
    message("Cluster ", cl, " Control-up genes: ", length(cl_ctrl_up_genes))

    ego_cl_pd <- run_go_bp_cluster(
      cl_pd_up_genes,
      paste0("Cluster_", cl, "_PD_up")
    )

    ego_cl_ctrl <- run_go_bp_cluster(
      cl_ctrl_up_genes,
      paste0("Cluster_", cl, "_Control_up")
    )

    cluster_go_results[[paste0("cluster_", cl, "_PD_up")]] <- ego_cl_pd
    cluster_go_results[[paste0("cluster_", cl, "_Control_up")]] <- ego_cl_ctrl

    if (!is.null(ego_cl_pd)) {
      write.csv(
        as.data.frame(ego_cl_pd),
        file.path(results_dir, paste0("Step13B_GO_BP_cluster_", cl, "_PD_up_microglia.csv")),
        row.names = FALSE
      )
    }

    if (!is.null(ego_cl_ctrl)) {
      write.csv(
        as.data.frame(ego_cl_ctrl),
        file.path(results_dir, paste0("Step13B_GO_BP_cluster_", cl, "_Control_up_microglia.csv")),
        row.names = FALSE
      )
    }
  }

} else {
  message("clusterProfiler or org.Hs.eg.db not installed. Skipping cluster 0/1 GO enrichment.")
  cluster_go_results <- list()
}

GO Dot Plots for Cluster 0 and Cluster 1

Show code
if (exists("cluster_go_results") && length(cluster_go_results) > 0) {

  for (go_name in names(cluster_go_results)) {
    ego_obj <- cluster_go_results[[go_name]]

    if (!is.null(ego_obj) && nrow(as.data.frame(ego_obj)) > 0) {
      print(
        clusterProfiler::dotplot(ego_obj, showCategory = 10) +
          labs(title = paste0("GO BP: ", go_name))
      )
    } else {
      message("No significant GO terms for ", go_name)
    }
  }

} else {
  message("No cluster-specific GO results available.")
}

13B.8 Cluster-Wise DE Gene Count Bar Plot

Show code
if (nrow(combined_cluster_de) > 0) {
  cluster_de_counts <- combined_cluster_de %>%
    filter(p_val_adj < 0.05) %>%
    mutate(direction = ifelse(avg_log2FC > 0, "PD_up", "Control_up")) %>%
    count(cluster, direction, name = "n_genes")

  if (nrow(cluster_de_counts) > 0) {
    ggplot(cluster_de_counts, aes(x = cluster, y = n_genes, fill = direction)) +
      geom_col(position = "dodge") +
      scale_fill_manual(values = c("PD_up" = "#D32F2F", "Control_up" = "#1976D2")) +
      labs(
        title = "Significant DE Genes per Cluster (PD vs Control)",
        x = "Harmony Cluster",
        y = "Number of Significant Genes",
        fill = "Direction"
      ) +
      theme_minimal()
  } else {
    message("No cluster-wise significant DE genes found.")
  }
} else {
  message("No cluster-wise DE results available.")
}

13B.9 DotPlot of Top PD-Up Genes Across Clusters

Show code
# From cluster-wise results, find top PD-up genes per cluster
if (nrow(combined_cluster_de) > 0) {
  top_pd_per_cluster <- combined_cluster_de %>%
    filter(p_val_adj < 0.05, avg_log2FC > 0) %>%
    group_by(cluster) %>%
    slice_max(order_by = avg_log2FC, n = 5) %>%
    ungroup()

  top_pd_cluster_genes <- intersect(
    unique(top_pd_per_cluster$gene),
    rownames(SO_micro)
  )

  if (length(top_pd_cluster_genes) > 0) {
    DotPlot(
      SO_micro,
      features = top_pd_cluster_genes,
      assay = "RNA",
      group.by = "seurat_clusters_harmony",
      cols = c("blue", "red"),
      dot.scale = 6
    ) +
      RotatedAxis() +
      labs(
        title = "Top PD-Up Genes (Cluster-Wise DE) Across Harmony Clusters",
        y = "Harmony cluster"
      )
  } else {
    message("No PD-up cluster-wise DE genes for DotPlot.")
  }
}

13B.10 Canonical Microglia & DAM Marker Visualization

These markers are used to interpret cluster identity after the DE analysis. They are not used to define the primary DE comparison.

Show code
homeostatic_markers <- c("CX3CR1", "P2RY12", "TMEM119", "AIF1", "CSF1R")
dam_markers         <- c("APOE", "TREM2", "CTSB", "CTSD", "LGALS3", "LPL", "GPNMB", "SPP1")
inflammatory_markers <- c("IL1B", "TNF", "IL6", "CXCL10", "CCL2")

all_markers <- c(homeostatic_markers, dam_markers, inflammatory_markers)
markers_present <- intersect(all_markers, rownames(SO_micro))

if (length(markers_present) > 0) {
  DotPlot(
    SO_micro,
    features = markers_present,
    assay = "RNA",
    group.by = "seurat_clusters_harmony",
    cols = c("blue", "red"),
    dot.scale = 6
  ) +
    RotatedAxis() +
    labs(
      title = "Canonical Microglia Markers Across Harmony Clusters",
      subtitle = "Homeostatic | DAM | Inflammatory",
      y = "Harmony cluster"
    )
}

Show code
# FeaturePlot for a subset of key markers
key_markers <- intersect(
  c("CX3CR1", "P2RY12", "TMEM119", "APOE", "TREM2", "SPP1"),
  rownames(SO_micro)
)

if (length(key_markers) > 0) {
  FeaturePlot(
    SO_micro,
    features = key_markers,
    reduction = "umap_micro_harmony",
    ncol = 3,
    pt.size = 0.3
  )
}

13B.11 Module Score Calculation

Module scores provide a cell-level summary of microglia state activation.

Show code
DefaultAssay(SO_micro) <- "RNA"

module_list <- list(
  Homeostatic = c(
    "CX3CR1", "P2RY12", "TMEM119", "CSF1R", "HEXB", "TGFBR1"
  ),

  DAM = c(
    "APOE", "TREM2", "CTSB", "CTSD", "LGALS3", "LPL", "GPNMB", "SPP1"
  ),

  Inflammatory_NFkB = c(
    "NFKBIA", "NFKBIZ", "TNFAIP3", "IL1B", "TNF", "CCL2", "CXCL8", "CXCL10"
  ),

  Complement = c(
    "C1QA", "C1QB", "C1QC", "C3", "C4A", "C4B", "CFH", "CD74"
  ),

  Antigen_Presentation = c(
    "HLA-DRA", "HLA-DRB1", "HLA-DPA1", "HLA-DPB1", "HLA-A", "HLA-B", "HLA-C", "CD74"
  ),

  Lysosomal_Phagosome = c(
    "CTSB", "CTSD", "CTSL", "LAMP1", "LAMP2", "CD68", "TYROBP", "FCGR3A"
  ),

  Oxidative_Stress = c(
    "HMOX1", "NQO1", "SOD1", "SOD2", "GPX1", "PRDX1", "PRDX2", "TXN"
  ),

  Mitochondrial_Stress = c(
    "PINK1", "PRKN", "PARK7", "BNIP3", "BNIP3L", "SQSTM1", "OPTN"
  ),

  Proteostasis_HSR = c(
    "HSPA1A", "HSPA1B", "HSPA6", "HSPH1", "HSP90AA1", "DNAJB1", "BAG3", "HSPD1"
  ),

  Interferon = c(
    "IFIT1", "IFIT2", "IFIT3", "ISG15", "MX1", "OAS1", "STAT1", "IRF7"
  )
)

for (mod_name in names(module_list)) {
  genes_present <- intersect(module_list[[mod_name]], rownames(SO_micro))
  if (length(genes_present) >= 2) {
    SO_micro <- AddModuleScore(
      SO_micro,
      features = list(genes_present),
      name = paste0(mod_name, "_Score"),
      assay = "RNA",
      verbose = FALSE
    )
    cat(mod_name, ": used", length(genes_present), "of", length(module_list[[mod_name]]), "genes\n")
  } else {
    cat(mod_name, ": skipped (fewer than 2 genes present)\n")
  }
}
Homeostatic : used 6 of 6 genes
DAM : used 8 of 8 genes
Inflammatory_NFkB : used 8 of 8 genes
Complement : used 8 of 8 genes
Antigen_Presentation : used 8 of 8 genes
Lysosomal_Phagosome : used 8 of 8 genes
Oxidative_Stress : used 8 of 8 genes
Mitochondrial_Stress : used 7 of 7 genes
Proteostasis_HSR : used 8 of 8 genes
Interferon : used 8 of 8 genes

Module Score Visualisation

Show code
score_cols <- grep("_Score1$", colnames(SO_micro@meta.data), value = TRUE)

if (length(score_cols) > 0) {
  FeaturePlot(
    SO_micro,
    features = score_cols,
    reduction = "umap_micro_harmony",
    ncol = 3,
    pt.size = 0.3
  )
}

Show code
if (length(score_cols) > 0) {
  plots <- lapply(score_cols, function(sc) {
    VlnPlot(
      SO_micro,
      features = sc,
      group.by = "seurat_clusters_harmony",
      pt.size = 0
    ) + NoLegend()
  })
  wrap_plots(plots, ncol = 3)
}

13B.11.5 Module Scores by Condition

Show code
# Module score columns
score_cols <- grep("_Score1$", colnames(SO_micro@meta.data), value = TRUE)

score_cols
 [1] "Homeostatic_Score1"          "DAM_Score1"                 
 [3] "Inflammatory_NFkB_Score1"    "Complement_Score1"          
 [5] "Antigen_Presentation_Score1" "Lysosomal_Phagosome_Score1" 
 [7] "Oxidative_Stress_Score1"     "Mitochondrial_Stress_Score1"
 [9] "Proteostasis_HSR_Score1"     "Interferon_Score1"          
Show code
# Long-format metadata table
module_score_long <- SO_micro@meta.data %>%
  tibble::as_tibble(rownames = "barcode") %>%
  dplyr::select(
    barcode,
    condition,
    sex,
    sample,
    seurat_clusters_harmony,
    dplyr::all_of(score_cols)
  ) %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(score_cols),
    names_to = "module",
    values_to = "score"
  ) %>%
    dplyr::mutate(
    module = stringr::str_replace(module, "_Score1$", "")
  )

# Violin plot: all microglia, PD vs Control
ggplot(
  module_score_long,
  aes(x = condition, y = score, fill = condition)
) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.6) +
  facet_wrap(~ module, scales = "free_y", ncol = 3) +
  labs(
    title = "Module Scores by Condition in All Microglia",
    subtitle = "PD vs Control comparison across all microglia cells",
    x = "Condition",
    y = "Module score"
  ) +
  theme_classic() +
  theme(legend.position = "none")

Show code
# Overall PD vs Control module score comparison
module_condition_stats <- module_score_long %>%
  dplyr::group_by(module) %>%
  dplyr::summarise(
    n_C = sum(condition == "C"),
    n_PD = sum(condition == "PD"),
    mean_C = mean(score[condition == "C"], na.rm = TRUE),
    mean_PD = mean(score[condition == "PD"], na.rm = TRUE),
    median_C = median(score[condition == "C"], na.rm = TRUE),
    median_PD = median(score[condition == "PD"], na.rm = TRUE),
    wilcox_p = wilcox.test(score ~ condition)$p.value,
    .groups = "drop"
  ) %>%
  dplyr::mutate(
    wilcox_p_adj = p.adjust(wilcox_p, method = "BH"),
    delta_mean_PD_minus_C = mean_PD - mean_C,
    delta_median_PD_minus_C = median_PD - median_C
  ) %>%
  dplyr::arrange(wilcox_p_adj)

module_condition_stats
Show code
write.csv(
  module_condition_stats,
  file.path(results_dir, "Step13B_module_score_PD_vs_Control_overall_stats.csv"),
  row.names = FALSE
)

# Cluster-wise PD vs Control module score comparison
module_cluster_condition_stats <- module_score_long %>%
  dplyr::group_by(seurat_clusters_harmony, module) %>%
  dplyr::filter(
    sum(condition == "C") >= 10,
    sum(condition == "PD") >= 10
  ) %>%
  dplyr::summarise(
    n_C = sum(condition == "C"),
    n_PD = sum(condition == "PD"),
    mean_C = mean(score[condition == "C"], na.rm = TRUE),
    mean_PD = mean(score[condition == "PD"], na.rm = TRUE),
    median_C = median(score[condition == "C"], na.rm = TRUE),
    median_PD = median(score[condition == "PD"], na.rm = TRUE),
    wilcox_p = wilcox.test(score ~ condition)$p.value,
    .groups = "drop"
  ) %>%
  dplyr::group_by(module) %>%
  dplyr::mutate(
    wilcox_p_adj = p.adjust(wilcox_p, method = "BH")
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    delta_mean_PD_minus_C = mean_PD - mean_C,
    delta_median_PD_minus_C = median_PD - median_C
  ) %>%
  dplyr::arrange(seurat_clusters_harmony, wilcox_p_adj)

module_cluster_condition_stats
Show code
write.csv(
  module_cluster_condition_stats,
  file.path(results_dir, "Step13B_module_score_PD_vs_Control_by_cluster_stats.csv"),
  row.names = FALSE
)

13B.11.6 Module Scores by Cluster and Condition

Show code
module_score_long_01 <- module_score_long %>%
  dplyr::filter(seurat_clusters_harmony %in% c("0", "1")) %>%
  dplyr::mutate(
    cluster_condition = paste0(
      "Cl",
      seurat_clusters_harmony,
      "_",
      condition
    ),
    cluster_condition = factor(
      cluster_condition,
      levels = c("Cl0_C", "Cl0_PD", "Cl1_C", "Cl1_PD")
    )
  )

ggplot(
  module_score_long_01,
  aes(x = cluster_condition, y = score, fill = condition)
) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.6) +
  facet_wrap(~ module, scales = "free_y", ncol = 3) +
  labs(
    title = "Module Scores by Cluster and Condition",
    subtitle = "Testing whether PD changes module scores within clusters 0 and 1",
    x = "Cluster / condition",
    y = "Module score"
  ) +
  theme_classic()

13B.11.7 Sample-Level Module Scores by Condition

Show code
sample_module_score <- module_score_long %>%
  dplyr::group_by(sample, condition, sex, module) %>%
  dplyr::summarise(
    mean_score = mean(score, na.rm = TRUE),
    median_score = median(score, na.rm = TRUE),
    n_cells = dplyr::n(),
    .groups = "drop"
  )

sample_module_score
Show code
write.csv(
  sample_module_score,
  file.path(results_dir, "Step13B_sample_level_module_scores_by_condition.csv"),
  row.names = FALSE
)

ggplot(
  sample_module_score,
  aes(x = condition, y = mean_score)
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    aes(shape = sex),
    width = 0.15,
    size = 2.5,
    alpha = 0.8
  ) +
  facet_wrap(~ module, scales = "free_y", ncol = 3) +
  labs(
    title = "Sample-Level Module Scores by Condition",
    subtitle = "Each point represents one sample/donor",
    x = "Condition",
    y = "Mean module score per sample"
  ) +
  theme_classic()

13B.12 Cluster Summary Table

Show code
meta <- SO_micro@meta.data %>%
  as_tibble(rownames = "barcode")

cluster_summary <- meta %>%
  group_by(cluster = seurat_clusters_harmony) %>%
  summarise(
    n_cells   = n(),
    n_PD      = sum(condition == "PD"),
    n_C       = sum(condition == "C"),
    pct_PD    = 100 * n_PD / n_cells,
    across(
      all_of(score_cols),
      list(mean = ~mean(., na.rm = TRUE)),
      .names = "{.col}_mean"
    ),
    .groups = "drop"
  ) %>%
  arrange(cluster)

cluster_summary
Show code
write.csv(
  cluster_summary,
  file.path(results_dir, "Step13B_harmony_cluster_summary.csv"),
  row.names = FALSE
)

13B.13 Conservative Microglia State Labels

State labels are assigned conservatively based on convergent evidence from:

  1. PD vs Control DE gene expression across clusters
  2. Module scores
  3. Canonical marker patterns
Show code
# Conservative state labels are assigned manually based on
# PD-vs-Control DE, canonical marker expression, and module-score interpretation.
#
# Rationale:
# - Cluster 1 shows the strongest DAM-like / activated microglia signature.
# - Cluster 0 is used as the relatively homeostatic / baseline microglia population.
# - Clusters 2–5 are retained as Other because they are smaller, less central,
#   or may include non-core / mixed microglia-like states.

SO_micro$micro_state_harmony <- dplyr::case_when(
  SO_micro$seurat_clusters_harmony == "1" ~ "DAM_like",
  SO_micro$seurat_clusters_harmony == "0" ~ "Homeostatic_like",
  SO_micro$seurat_clusters_harmony %in% c("2", "3", "4", "5") ~ "Other",
  TRUE ~ "Unassigned"
)

SO_micro$micro_state_harmony <- factor(
  SO_micro$micro_state_harmony,
  levels = c("Homeostatic_like", "DAM_like", "Other", "Unassigned")
)

table(SO_micro$micro_state_harmony)

Homeostatic_like         DAM_like            Other       Unassigned 
            1523             1191              725                0 
Show code
state_summary <- SO_micro@meta.data %>%
  tibble::as_tibble(rownames = "barcode") %>%
  dplyr::count(
    seurat_clusters_harmony,
    micro_state_harmony,
    condition,
    sex,
    sample,
    name = "n_cells"
  ) %>%
  dplyr::arrange(seurat_clusters_harmony, condition, sample)

write.csv(
  state_summary,
  file.path(results_dir, "Step13B_microglia_state_summary_manual_labels.csv"),
  row.names = FALSE
)

state_summary
Show code
DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "micro_state_harmony",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.5
) +
  labs(title = "Conservative Microglia State Labels")

13B.14 Export Microglia State Barcodes

Show code
barcode_export <- SO_micro@meta.data %>%
  tibble::as_tibble(rownames = "barcode") %>%
  dplyr::select(
    barcode,
    seurat_clusters_harmony,
    condition,
    sex,
    sample,
    micro_state_harmony
  )
write.csv(
  barcode_export,
  file.path(results_dir, "Step13B_microglia_state_barcodes_all.csv"),
  row.names = FALSE
)

# Also save per-state barcode files
for (state_name in levels(SO_micro$micro_state_harmony)) {
  barcodes <- barcode_export %>%
    filter(micro_state_harmony == state_name) %>%
    pull(barcode)
  write.csv(
    data.frame(barcode = barcodes),
    file.path(results_dir, paste0("Step13B_barcodes_", state_name, ".csv")),
    row.names = FALSE
  )
}

13B.15 Save Step 13B Object

Show code
output_path <- file.path(
  results_dir,
  "SO_micro_step13B_DE_pathway_state_annotated.rds"
)

saveRDS(SO_micro, output_path)

write.csv(
  data.frame(
    setting = c(
      "input_object",
      "overall_DE_method",
      "overall_DE_covariate",
      "n_overall_sig_genes",
      "n_PD_up",
      "n_Control_up",
      "n_clusters_tested",
      "module_list"
    ),
    value = c(
      input_path,
      de_overall_method,
      ifelse(grepl("sex", de_overall_method), "sex", "none"),
      nrow(de_sig),
      length(pd_up_genes),
      length(ctrl_up_genes),
      sum(method_summary_df$method != "SKIPPED"),
      paste(names(module_list), collapse = ", ")
    )
  ),
  file.path(results_dir, "Step13B_analysis_settings.csv"),
  row.names = FALSE
)

cat("Step 13B complete.\n")
Step 13B complete.
Show code
cat("Saved:", output_path, "\n")
Saved: /Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026/results/Step13_microglia_harmony_reclustering/SO_micro_step13B_DE_pathway_state_annotated.rds 
Show code
cat("Cells:", ncol(SO_micro), "\n")
Cells: 3439 
Show code
cat("States:", paste(levels(SO_micro$micro_state_harmony), collapse = ", "), "\n")
States: Homeostatic_like, DAM_like, Other, Unassigned 

Narrative summary: The primary differential expression analysis was performed between PD and control microglia while accounting for sex. Harmony-based clusters were not used as the primary DE groups. Instead, clusters were used after PD-vs-Control DE to identify which microglia subpopulations carry PD-associated transcriptional changes.