Step 13B: PD vs Control Differential Expression, Pathway Enrichment & State Annotation

Sex-adjusted DE in all microglia and per cluster, GO enrichment, module scoring, and conservative state labeling

1 Load Step 13A Object

All expression analyses use the RNA assay. Harmony is used only for clustering and UMAP (Step 13A).

Show code
library(Seurat)
library(tidyverse)
library(patchwork)
library(ggrepel)
library(pheatmap)
set.seed(1234)

project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"

results_dir <- file.path(
  project_dir,
  "results",
  "Step13_microglia_harmony_reclustering"
)

input_path <- file.path(
  results_dir,
  "SO_micro_step13A_harmony_reclustered.rds"
)

if (!file.exists(input_path)) {
  stop("Step 13A object not found. Run Step13A first: ", input_path)
}

SO_micro <- readRDS(input_path)

DefaultAssay(SO_micro) <- "RNA"

if ("Assay5" %in% class(SO_micro[["RNA"]])) {
  SO_micro <- JoinLayers(SO_micro, assay = "RNA")
}

cat("Loaded Step 13A object:", ncol(SO_micro), "cells\n")
Loaded Step 13A object: 3439 cells
Show code
# cat("Clusters:", paste(sort(unique(SO_micro$seurat_clusters_harmony)), collapse = ", "), "\n")
# cat("\nNumber of cells per cluster:\n")
table(SO_micro$seurat_clusters_harmony)

   0    1    2    3    4    5 
1523 1191  406  153  142   24 
Show code
DimPlot(
  SO_micro, 
  reduction = "umap_micro_harmony", 
  group.by = "seurat_clusters_harmony", 
  label = TRUE
) +
  ggtitle("Harmony-Corrected Microglia Subclusters")

2 Verify Metadata

Show code
# condition must be a factor with C first
SO_micro$condition <- factor(SO_micro$condition, levels = c("C", "PD"))

# sex must exist
if (!"sex" %in% colnames(SO_micro@meta.data)) {
  stop("Sex column is missing from metadata.")
}

# Summarise
table(SO_micro$condition)
## 
##    C   PD 
## 1748 1691
table(SO_micro$sex)
## 
## Female   Male 
##   1798   1641
table(SO_micro$condition, SO_micro$sex)
##     
##      Female Male
##   C     889  859
##   PD    909  782

# Check for Unknown sex
n_unknown_sex <- sum(SO_micro$sex == "Unknown", na.rm = TRUE)
cat("Cells with Unknown sex:", n_unknown_sex, "\n")
## Cells with Unknown sex: 0

# If all cells have Unknown sex, MAST cannot use sex as covariate
sex_usable <- n_unknown_sex < ncol(SO_micro) &&
  length(unique(SO_micro$sex[SO_micro$sex != "Unknown"])) >= 2

cat("Sex usable as covariate:", sex_usable, "\n")
## Sex usable as covariate: TRUE

3 Overall DE: PD vs Control (Sex-Adjusted)

We use MAST with latent.vars = "sex" to account for sex differences. If MAST is unavailable or sex is not usable, we fall back to Wilcoxon.

Harmony-based clusters are not used as DE groups. Instead, clusters are used after PD-vs-Control DE to identify which microglia subpopulations carry PD-associated transcriptional changes.

Show code
# Helper function for DE with optional MAST + sex covariate
run_de_safe <- function(object, ident1, ident2, use_sex = TRUE,
                        min_pct = 0.05, logfc_threshold = 0.10) {

  mast_available <- requireNamespace("MAST", quietly = TRUE)

  # Check sex distribution
  sex_vals <- unique(object$sex)
  sex_ok <- use_sex && length(setdiff(sex_vals, "Unknown")) >= 2

  if (mast_available && sex_ok) {
    de <- FindMarkers(
      object,
      ident.1 = ident1,
      ident.2 = ident2,
      assay = "RNA",
      test.use = "MAST",
      latent.vars = "sex",
      min.pct = min_pct,
      logfc.threshold = logfc_threshold,
      verbose = FALSE
    )
    attr(de, "de_method") <- "MAST_sex"
  } else {
    de <- FindMarkers(
      object,
      ident.1 = ident1,
      ident.2 = ident2,
      assay = "RNA",
      test.use = "wilcox",
      min.pct = min_pct,
      logfc.threshold = logfc_threshold,
      verbose = FALSE
    )
    attr(de, "de_method") <- "wilcox"
  }

  return(de)
}

# Set identity to condition
Idents(SO_micro) <- "condition"

de_overall <- run_de_safe(SO_micro, ident1 = "PD", ident2 = "C", use_sex = sex_usable)

de_overall_method <- attr(de_overall, "de_method")
cat("Total genes tested:", nrow(de_overall), "\n")
Total genes tested: 5493 
Show code
# Add gene column
de_overall <- de_overall %>%
  tibble::rownames_to_column("gene")

# Significant genes
de_sig <- de_overall %>%
  filter(p_val_adj < 0.05) %>%
  arrange(desc(abs(avg_log2FC)))

cat("Significant genes (adj.p < 0.05):", nrow(de_sig), "\n")
Significant genes (adj.p < 0.05): 985 
Show code
pd_up_genes   <- de_sig %>% filter(avg_log2FC > 0) %>% pull(gene)
ctrl_up_genes <- de_sig %>% filter(avg_log2FC < 0) %>% pull(gene)

cat("PD-up:", length(pd_up_genes), " | Control-up:", length(ctrl_up_genes), "\n")
PD-up: 635  | Control-up: 350 
Show code
# Save
write.csv(
  de_overall,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_microglia_all.csv"),
  row.names = FALSE
)

write.csv(
  de_sig,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_microglia_significant.csv"),
  row.names = FALSE
)

4 Volcano Plot: Overall PD vs Control

Show code
de_plot <- de_overall %>%
  mutate(
    direction = case_when(
      p_val_adj < 0.05 & avg_log2FC > 0   ~ "PD_up",
      p_val_adj < 0.05 & avg_log2FC < 0   ~ "Control_up",
      TRUE                                 ~ "NS"
    ),
    label_gene = ifelse(
      p_val_adj < 0.05 & abs(avg_log2FC) > 0.5,
      gene,
      NA_character_
    )
  )

ggplot(de_plot, aes(x = avg_log2FC, y = -log10(p_val_adj), color = direction)) +
  geom_point(alpha = 0.5, size = 1) +
  scale_color_manual(
    values = c("PD_up" = "#D32F2F", "Control_up" = "#1976D2", "NS" = "grey70")
  ) +
  geom_text_repel(
    aes(label = label_gene),
    size = 2.5,
    max.overlaps = 20,
    show.legend = FALSE
  ) +
  labs(
    title = paste0("Overall PD vs Control DE in Microglia (", de_overall_method, ")"),
    x = "Log2 Fold Change (PD / Control)",
    y = "-log10(Adjusted P-value)",
    color = "Direction"
  ) +
  theme_classic()

5 DotPlot of Top DE Genes Across Harmony Clusters

This identifies which clusters carry the PD-associated transcriptional signature.

Show code
top_pd_genes   <- head(pd_up_genes, 15)
top_ctrl_genes <- head(ctrl_up_genes, 15)
top_de_genes   <- intersect(c(top_pd_genes, top_ctrl_genes), rownames(SO_micro))

if (length(top_de_genes) > 0) {
  DotPlot(
    SO_micro,
    features = top_de_genes,
    assay = "RNA",
    group.by = "seurat_clusters_harmony",
    cols = c("blue", "red"),
    dot.scale = 6
  ) +
    RotatedAxis() +
    labs(
      title = "Top PD-up & Control-up DE Genes Across Harmony Clusters",
      subtitle = "Genes ranked by |log2FC| from overall PD vs Control DE",
      y = "Harmony cluster"
    )
} else {
  message("No significant DE genes to plot.")
}

6 Pathway Enrichment (GO Biological Process)

GO enrichment is run separately for PD-upregulated and Control-upregulated genes.

Show code
go_available <- requireNamespace("clusterProfiler", quietly = TRUE) &&
                requireNamespace("org.Hs.eg.db", quietly = TRUE)

if (go_available) {
  library(clusterProfiler)
  library(org.Hs.eg.db)

  run_go_bp <- function(gene_list, label) {
    if (length(gene_list) < 5) {
      cat("Skipping GO for", label, "- fewer than 5 genes.\n")
      return(NULL)
    }
    ego <- enrichGO(
      gene         = gene_list,
      OrgDb        = org.Hs.eg.db,
      keyType      = "SYMBOL",
      ont          = "BP",
      pAdjustMethod = "BH",
      qvalueCutoff = 0.2,
      readable     = TRUE
    )
    return(ego)
  }

  ego_pd   <- run_go_bp(pd_up_genes,   "PD_up")
  ego_ctrl <- run_go_bp(ctrl_up_genes,  "Control_up")

  if (!is.null(ego_pd)) {
    write.csv(
      as.data.frame(ego_pd),
      file.path(results_dir, "Step13B_GO_BP_PD_up_microglia.csv"),
      row.names = FALSE
    )
  }
  if (!is.null(ego_ctrl)) {
    write.csv(
      as.data.frame(ego_ctrl),
      file.path(results_dir, "Step13B_GO_BP_Control_up_microglia.csv"),
      row.names = FALSE
    )
  }
} else {
  cat("clusterProfiler or org.Hs.eg.db not installed. Skipping GO enrichment.\n")
  ego_pd   <- NULL
  ego_ctrl <- NULL
}

6.1 GO Dot Plots

Show code
if (!is.null(ego_pd) && nrow(as.data.frame(ego_pd)) > 0) {
  dotplot(ego_pd, showCategory = 10) +
    labs(title = "GO BP: PD-Upregulated Genes in Microglia")
} else {
  message("No significant GO terms for PD-up genes.")
}

Show code
if (!is.null(ego_ctrl) && nrow(as.data.frame(ego_ctrl)) > 0) {
  dotplot(ego_ctrl, showCategory = 10) +
    labs(title = "GO BP: Control-Upregulated Genes in Microglia")
} else {
  message("No significant GO terms for Control-up genes.")
}

7 Cluster-Wise PD vs Control DE

Within each Harmony cluster, we compare PD vs Control cells. Sex is included as a covariate when both sexes are present and each group has ≥10 cells.

Show code
clusters <- sort(unique(SO_micro$seurat_clusters_harmony))
cluster_de_list   <- list()
method_summary_df <- tibble(
  cluster       = character(),
  n_cells_PD    = integer(),
  n_cells_C     = integer(),
  n_sexes       = integer(),
  method        = character(),
  n_sig_genes   = integer()
)

min_cells_per_group <- 10

for (cl in clusters) {
  cat("\n--- Cluster", cl, "---\n")

  cells_cl <- colnames(SO_micro)[SO_micro$seurat_clusters_harmony == cl]
  sub_obj  <- subset(SO_micro, cells = cells_cl)
  Idents(sub_obj) <- "condition"

  cond_tbl <- table(sub_obj$condition)
  n_pd <- as.integer(cond_tbl["PD"])
  n_c  <- as.integer(cond_tbl["C"])

  if (is.na(n_pd)) n_pd <- 0L
  if (is.na(n_c))  n_c  <- 0L

  cat("  PD:", n_pd, " C:", n_c, "\n")

  if (n_pd < min_cells_per_group || n_c < min_cells_per_group) {
    cat("  Skipped: insufficient cells.\n")
    method_summary_df <- method_summary_df %>%
      add_row(
        cluster     = cl,
        n_cells_PD  = n_pd,
        n_cells_C   = n_c,
        n_sexes     = NA_integer_,
        method      = "SKIPPED",
        n_sig_genes = NA_integer_
      )
    next
  }

  # Check sex distribution in this cluster
  sex_in_cluster <- unique(sub_obj$sex[sub_obj$sex != "Unknown"])
  n_sexes <- length(sex_in_cluster)

  de_cl <- run_de_safe(
    sub_obj,
    ident1 = "PD",
    ident2 = "C",
    use_sex = (n_sexes >= 2)   
  )

  de_method <- attr(de_cl, "de_method")

  de_cl <- de_cl %>%
    tibble::rownames_to_column("gene") %>%
    mutate(cluster = cl)

  n_sig <- sum(de_cl$p_val_adj < 0.05, na.rm = TRUE)
  cat("  Method:", de_method, " | Sig genes:", n_sig, "\n")

  # Save per-cluster CSV
  write.csv(
    de_cl,
    file.path(results_dir, paste0("Step13B_DE_PD_vs_Control_cluster_", cl, ".csv")),
    row.names = FALSE
  )

  cluster_de_list[[cl]] <- de_cl

  method_summary_df <- method_summary_df %>%
    add_row(
      cluster     = cl,
      n_cells_PD  = n_pd,
      n_cells_C   = n_c,
      n_sexes     = n_sexes,
      method      = de_method,
      n_sig_genes = n_sig
    )
}

--- Cluster 0 ---
  PD: 778  C: 745 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 275 

--- Cluster 1 ---
  PD: 626  C: 565 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 382 

--- Cluster 2 ---
  PD: 143  C: 263 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 29 

--- Cluster 3 ---
  PD: 57  C: 96 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 1 

--- Cluster 4 ---
  PD: 76  C: 66 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 4 

--- Cluster 5 ---
  PD: 11  C: 13 
  → MAST with latent.vars = 'sex'
  Method: MAST_sex  | Sig genes: 0 
Show code
# Combine all cluster-wise DE results
combined_cluster_de <- bind_rows(cluster_de_list)

write.csv(
  combined_cluster_de,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_clusterwise_all.csv"),
  row.names = FALSE
)

write.csv(
  method_summary_df,
  file.path(results_dir, "Step13B_clusterwise_DE_method_summary.csv"),
  row.names = FALSE
)

method_summary_df
Show code
sex_de_list <- list()

for (sex_group in c("Female", "Male")) {
  cat("\n--- Sex-stratified DE:", sex_group, "---\n")

  cells_sex <- colnames(SO_micro)[SO_micro$sex == sex_group]
  sub_sex <- subset(SO_micro, cells = cells_sex)

  cond_tbl <- table(sub_sex$condition)
  n_pd <- as.integer(cond_tbl["PD"])
  n_c  <- as.integer(cond_tbl["C"])

  if (is.na(n_pd)) n_pd <- 0L
  if (is.na(n_c))  n_c <- 0L

  cat("PD:", n_pd, " C:", n_c, "\n")

  if (n_pd < 10 || n_c < 10) {
    cat("Skipped:", sex_group, "- insufficient cells.\n")
    next
  }

  Idents(sub_sex) <- "condition"

  de_sex <- FindMarkers(
    sub_sex,
    ident.1 = "PD",
    ident.2 = "C",
    assay = "RNA",
    test.use = "wilcox",
    min.pct = 0.05,
    logfc.threshold = 0.10,
    verbose = FALSE
  ) %>%
    tibble::rownames_to_column("gene") %>%
    dplyr::mutate(sex = sex_group)

  sex_de_list[[sex_group]] <- de_sex

  write.csv(
    de_sex,
    file.path(results_dir, paste0("Step13B_DE_PD_vs_Control_", sex_group, "_microglia.csv")),
    row.names = FALSE
  )
}

--- Sex-stratified DE: Female ---
PD: 909  C: 889 

--- Sex-stratified DE: Male ---
PD: 782  C: 859 
Show code
sex_de_combined <- dplyr::bind_rows(sex_de_list)

write.csv(
  sex_de_combined,
  file.path(results_dir, "Step13B_DE_PD_vs_Control_sex_stratified_microglia_all.csv"),
  row.names = FALSE
)

8 GO Enrichment for Cluster 0 and Cluster 1 PD-vs-Control DE

GO enrichment is performed separately for PD-upregulated and Control-upregulated genes within cluster 0 and cluster 1.
This helps determine whether PD-associated transcriptional programs differ between homeostatic-like and DAM-like microglia states.

Show code
go_available <- requireNamespace("clusterProfiler", quietly = TRUE) &&
                requireNamespace("org.Hs.eg.db", quietly = TRUE)

if (go_available) {
  library(clusterProfiler)
  library(org.Hs.eg.db)

  run_go_bp_cluster <- function(gene_list, label) {
    gene_list <- unique(gene_list)
    
    if (length(gene_list) < 5) {
      message("Skipping GO for ", label, ": fewer than 5 genes.")
      return(NULL)
    }

    ego <- clusterProfiler::enrichGO(
      gene          = gene_list,
      OrgDb         = org.Hs.eg.db::org.Hs.eg.db,
      keyType       = "SYMBOL",
      ont           = "BP",
      pAdjustMethod = "BH",
      qvalueCutoff  = 0.2,
      readable      = TRUE
    )

    return(ego)
  }

  target_clusters_for_go <- c("0", "1")
  cluster_go_results <- list()

  for (cl in target_clusters_for_go) {
    message("Running GO enrichment for cluster ", cl, "...")

    de_cl <- combined_cluster_de %>%
      dplyr::filter(cluster == cl)

    if (nrow(de_cl) == 0) {
      message("No DE results found for cluster ", cl)
      next
    }

    cl_pd_up_genes <- de_cl %>%
      dplyr::filter(p_val_adj < 0.05, avg_log2FC > 0) %>%
      dplyr::pull(gene) %>%
      unique()

    cl_ctrl_up_genes <- de_cl %>%
      dplyr::filter(p_val_adj < 0.05, avg_log2FC < 0) %>%
      dplyr::pull(gene) %>%
      unique()

    message("Cluster ", cl, " PD-up genes: ", length(cl_pd_up_genes))
    message("Cluster ", cl, " Control-up genes: ", length(cl_ctrl_up_genes))

    ego_cl_pd <- run_go_bp_cluster(
      cl_pd_up_genes,
      paste0("Cluster_", cl, "_PD_up")
    )

    ego_cl_ctrl <- run_go_bp_cluster(
      cl_ctrl_up_genes,
      paste0("Cluster_", cl, "_Control_up")
    )

    cluster_go_results[[paste0("cluster_", cl, "_PD_up")]] <- ego_cl_pd
    cluster_go_results[[paste0("cluster_", cl, "_Control_up")]] <- ego_cl_ctrl

    if (!is.null(ego_cl_pd)) {
      write.csv(
        as.data.frame(ego_cl_pd),
        file.path(results_dir, paste0("Step13B_GO_BP_cluster_", cl, "_PD_up_microglia.csv")),
        row.names = FALSE
      )
    }

    if (!is.null(ego_cl_ctrl)) {
      write.csv(
        as.data.frame(ego_cl_ctrl),
        file.path(results_dir, paste0("Step13B_GO_BP_cluster_", cl, "_Control_up_microglia.csv")),
        row.names = FALSE
      )
    }
  }

} else {
  message("clusterProfiler or org.Hs.eg.db not installed. Skipping cluster 0/1 GO enrichment.")
  cluster_go_results <- list()
}

8.1 GO Dot Plots for Cluster 0 and Cluster 1

Show code
if (exists("cluster_go_results") && length(cluster_go_results) > 0) {

  for (go_name in names(cluster_go_results)) {
    ego_obj <- cluster_go_results[[go_name]]

    if (!is.null(ego_obj) && nrow(as.data.frame(ego_obj)) > 0) {
      print(
        clusterProfiler::dotplot(ego_obj, showCategory = 10) +
          labs(title = paste0("GO BP: ", go_name))
      )
    } else {
      message("No significant GO terms for ", go_name)
    }
  }

} else {
  message("No cluster-specific GO results available.")
}

9 Cluster-Wise DE Gene Count Bar Plot

Show code
if (nrow(combined_cluster_de) > 0) {
  cluster_de_counts <- combined_cluster_de %>%
    filter(p_val_adj < 0.05) %>%
    mutate(direction = ifelse(avg_log2FC > 0, "PD_up", "Control_up")) %>%
    count(cluster, direction, name = "n_genes")

  if (nrow(cluster_de_counts) > 0) {
    ggplot(cluster_de_counts, aes(x = cluster, y = n_genes, fill = direction)) +
      geom_col(position = "dodge") +
      scale_fill_manual(values = c("PD_up" = "#D32F2F", "Control_up" = "#1976D2")) +
      labs(
        title = "Significant DE Genes per Cluster (PD vs Control)",
        x = "Harmony Cluster",
        y = "Number of Significant Genes",
        fill = "Direction"
      ) +
      theme_minimal()
  } else {
    message("No cluster-wise significant DE genes found.")
  }
} else {
  message("No cluster-wise DE results available.")
}

10 DotPlot of Top PD-Up Genes Across Clusters

Show code
# From cluster-wise results, find top PD-up genes per cluster
if (nrow(combined_cluster_de) > 0) {
  top_pd_per_cluster <- combined_cluster_de %>%
    filter(p_val_adj < 0.05, avg_log2FC > 0) %>%
    group_by(cluster) %>%
    slice_max(order_by = avg_log2FC, n = 5) %>%
    ungroup()

  top_pd_cluster_genes <- intersect(
    unique(top_pd_per_cluster$gene),
    rownames(SO_micro)
  )

  if (length(top_pd_cluster_genes) > 0) {
    DotPlot(
      SO_micro,
      features = top_pd_cluster_genes,
      assay = "RNA",
      group.by = "seurat_clusters_harmony",
      cols = c("blue", "red"),
      dot.scale = 6
    ) +
      RotatedAxis() +
      labs(
        title = "Top PD-Up Genes (Cluster-Wise DE) Across Harmony Clusters",
        y = "Harmony cluster"
      )
  } else {
    message("No PD-up cluster-wise DE genes for DotPlot.")
  }
}

11 Canonical Microglia & DAM Marker Visualization

These markers are used to interpret cluster identity after the DE analysis. They are not used to define the primary DE comparison.

Show code
homeostatic_markers <- c("CX3CR1", "P2RY12", "TMEM119", "AIF1", "CSF1R")
dam_markers         <- c("APOE", "TREM2", "CTSB", "CTSD", "LGALS3", "LPL", "GPNMB", "SPP1")
inflammatory_markers <- c("IL1B", "TNF", "IL6", "CXCL10", "CCL2")

all_markers <- c(homeostatic_markers, dam_markers, inflammatory_markers)
markers_present <- intersect(all_markers, rownames(SO_micro))

if (length(markers_present) > 0) {
  DotPlot(
    SO_micro,
    features = markers_present,
    assay = "RNA",
    group.by = "seurat_clusters_harmony",
    cols = c("blue", "red"),
    dot.scale = 6
  ) +
    RotatedAxis() +
    labs(
      title = "Canonical Microglia Markers Across Harmony Clusters",
      subtitle = "Homeostatic | DAM | Inflammatory",
      y = "Harmony cluster"
    )
}

Show code
# FeaturePlot for a subset of key markers
key_markers <- intersect(
  c("CX3CR1", "P2RY12", "TMEM119", "APOE", "TREM2", "SPP1"),
  rownames(SO_micro)
)

if (length(key_markers) > 0) {
  FeaturePlot(
    SO_micro,
    features = key_markers,
    reduction = "umap_micro_harmony",
    ncol = 3,
    pt.size = 0.3
  )
}

12 Module Score Calculation

Module scores provide a cell-level summary of microglia state activation.

Show code
DefaultAssay(SO_micro) <- "RNA"

module_list <- list(
  Homeostatic = c(
    "CX3CR1", "P2RY12", "TMEM119", "CSF1R", "HEXB", "TGFBR1"
  ),

  DAM = c(
    "APOE", "TREM2", "CTSB", "CTSD", "LGALS3", "LPL", "GPNMB", "SPP1"
  ),

  Inflammatory_NFkB = c(
    "NFKBIA", "NFKBIZ", "TNFAIP3", "IL1B", "TNF", "CCL2", "CXCL8", "CXCL10"
  ),

  Complement = c(
    "C1QA", "C1QB", "C1QC", "C3", "C4A", "C4B", "CFH", "CD74"
  ),

  Antigen_Presentation = c(
    "HLA-DRA", "HLA-DRB1", "HLA-DPA1", "HLA-DPB1", "HLA-A", "HLA-B", "HLA-C", "CD74"
  ),

  Lysosomal_Phagosome = c(
    "CTSB", "CTSD", "CTSL", "LAMP1", "LAMP2", "CD68", "TYROBP", "FCGR3A"
  ),

  Oxidative_Stress = c(
    "HMOX1", "NQO1", "SOD1", "SOD2", "GPX1", "PRDX1", "PRDX2", "TXN"
  ),

  Mitochondrial_Stress = c(
    "PINK1", "PRKN", "PARK7", "BNIP3", "BNIP3L", "SQSTM1", "OPTN"
  ),

  Proteostasis_HSR = c(
    "HSPA1A", "HSPA1B", "HSPA6", "HSPH1", "HSP90AA1", "DNAJB1", "BAG3", "HSPD1"
  ),

  Interferon = c(
    "IFIT1", "IFIT2", "IFIT3", "ISG15", "MX1", "OAS1", "STAT1", "IRF7"
  )
)

for (mod_name in names(module_list)) {
  genes_present <- intersect(module_list[[mod_name]], rownames(SO_micro))
  if (length(genes_present) >= 2) {
    SO_micro <- AddModuleScore(
      SO_micro,
      features = list(genes_present),
      name = paste0(mod_name, "_Score"),
      assay = "RNA",
      verbose = FALSE
    )
    cat(mod_name, ": used", length(genes_present), "of", length(module_list[[mod_name]]), "genes\n")
  } else {
    cat(mod_name, ": skipped (fewer than 2 genes present)\n")
  }
}
Homeostatic : used 6 of 6 genes
DAM : used 8 of 8 genes
Inflammatory_NFkB : used 8 of 8 genes
Complement : used 8 of 8 genes
Antigen_Presentation : used 8 of 8 genes
Lysosomal_Phagosome : used 8 of 8 genes
Oxidative_Stress : used 8 of 8 genes
Mitochondrial_Stress : used 7 of 7 genes
Proteostasis_HSR : used 8 of 8 genes
Interferon : used 8 of 8 genes

12.1 Module Score Visualisation

Show code
score_cols <- grep("_Score1$", colnames(SO_micro@meta.data), value = TRUE)

if (length(score_cols) > 0) {
  FeaturePlot(
    SO_micro,
    features = score_cols,
    reduction = "umap_micro_harmony",
    ncol = 3,
    pt.size = 0.3
  )
}

Show code
if (length(score_cols) > 0) {
  plots <- lapply(score_cols, function(sc) {
    VlnPlot(
      SO_micro,
      features = sc,
      group.by = "seurat_clusters_harmony",
      pt.size = 0
    ) + NoLegend()
  })
  wrap_plots(plots, ncol = 3)
}

13 Module Scores by Condition

Show code
# Module score columns
score_cols <- grep("_Score1$", colnames(SO_micro@meta.data), value = TRUE)

score_cols
 [1] "Homeostatic_Score1"          "DAM_Score1"                 
 [3] "Inflammatory_NFkB_Score1"    "Complement_Score1"          
 [5] "Antigen_Presentation_Score1" "Lysosomal_Phagosome_Score1" 
 [7] "Oxidative_Stress_Score1"     "Mitochondrial_Stress_Score1"
 [9] "Proteostasis_HSR_Score1"     "Interferon_Score1"          
Show code
# Long-format metadata table
module_score_long <- SO_micro@meta.data %>%
  tibble::as_tibble(rownames = "barcode") %>%
  dplyr::select(
    barcode,
    condition,
    sex,
    sample,
    seurat_clusters_harmony,
    dplyr::all_of(score_cols)
  ) %>%
  tidyr::pivot_longer(
    cols = dplyr::all_of(score_cols),
    names_to = "module",
    values_to = "score"
  ) %>%
    dplyr::mutate(
    module = stringr::str_replace(module, "_Score1$", "")
  )

# Violin plot: all microglia, PD vs Control
ggplot(
  module_score_long,
  aes(x = condition, y = score, fill = condition)
) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.6) +
  facet_wrap(~ module, scales = "free_y", ncol = 3) +
  labs(
    title = "Module Scores by Condition in All Microglia",
    subtitle = "PD vs Control comparison across all microglia cells",
    x = "Condition",
    y = "Module score"
  ) +
  theme_classic() +
  theme(legend.position = "none")

Show code
# Overall PD vs Control module score comparison
module_condition_stats <- module_score_long %>%
  dplyr::group_by(module) %>%
  dplyr::summarise(
    n_C = sum(condition == "C"),
    n_PD = sum(condition == "PD"),
    mean_C = mean(score[condition == "C"], na.rm = TRUE),
    mean_PD = mean(score[condition == "PD"], na.rm = TRUE),
    median_C = median(score[condition == "C"], na.rm = TRUE),
    median_PD = median(score[condition == "PD"], na.rm = TRUE),
    wilcox_p = wilcox.test(score ~ condition)$p.value,
    .groups = "drop"
  ) %>%
  dplyr::mutate(
    wilcox_p_adj = p.adjust(wilcox_p, method = "BH"),
    delta_mean_PD_minus_C = mean_PD - mean_C,
    delta_median_PD_minus_C = median_PD - median_C
  ) %>%
  dplyr::arrange(wilcox_p_adj)

module_condition_stats
Show code
write.csv(
  module_condition_stats,
  file.path(results_dir, "Step13B_module_score_PD_vs_Control_overall_stats.csv"),
  row.names = FALSE
)

# Cluster-wise PD vs Control module score comparison
module_cluster_condition_stats <- module_score_long %>%
  dplyr::group_by(seurat_clusters_harmony, module) %>%
  dplyr::filter(
    sum(condition == "C") >= 10,
    sum(condition == "PD") >= 10
  ) %>%
  dplyr::summarise(
    n_C = sum(condition == "C"),
    n_PD = sum(condition == "PD"),
    mean_C = mean(score[condition == "C"], na.rm = TRUE),
    mean_PD = mean(score[condition == "PD"], na.rm = TRUE),
    median_C = median(score[condition == "C"], na.rm = TRUE),
    median_PD = median(score[condition == "PD"], na.rm = TRUE),
    wilcox_p = wilcox.test(score ~ condition)$p.value,
    .groups = "drop"
  ) %>%  
  dplyr::group_by(module) %>%
  dplyr::mutate(
    wilcox_p_adj = p.adjust(wilcox_p, method = "BH")
  ) %>%
  dplyr::ungroup() %>%
  dplyr::mutate(
    delta_mean_PD_minus_C = mean_PD - mean_C,
    delta_median_PD_minus_C = median_PD - median_C
  ) %>%
  dplyr::arrange(seurat_clusters_harmony, wilcox_p_adj)

module_cluster_condition_stats
Show code
write.csv(
  module_cluster_condition_stats,
  file.path(results_dir, "Step13B_module_score_PD_vs_Control_by_cluster_stats.csv"),
  row.names = FALSE
)

14 Module Scores by Cluster and Condition

Show code
module_score_long_01 <- module_score_long %>%
  dplyr::filter(seurat_clusters_harmony %in% c("0", "1")) %>%
  dplyr::mutate(
    cluster_condition = paste0(
       "Cl",
      seurat_clusters_harmony,
      "_",
      condition
    ),
    cluster_condition = factor(
      cluster_condition,
      levels = c("Cl0_C", "Cl0_PD", "Cl1_C", "Cl1_PD")
    )
  )

ggplot(
  module_score_long_01,
  aes(x = cluster_condition, y = score, fill = condition)
) +
  geom_violin(scale = "width", trim = TRUE) +
  geom_boxplot(width = 0.12, outlier.shape = NA, alpha = 0.6) +
  facet_wrap(~ module, scales = "free_y", ncol = 3) +
  labs(
    title = "Module Scores by Cluster and Condition",
    subtitle = "Testing whether PD changes module scores within clusters 0 and 1",
    x = "Cluster / condition",
    y = "Module score"
  ) +
  theme_classic()

15 Sample-Level Module Scores by Condition

Show code
sample_module_score <- module_score_long %>%
  dplyr::group_by(sample, condition, sex, module) %>%
  dplyr::summarise(
    mean_score = mean(score, na.rm = TRUE),
    median_score = median(score, na.rm = TRUE),
    n_cells = dplyr::n(),
    .groups = "drop"
  )

sample_module_score
Show code
write.csv(
  sample_module_score,
  file.path(results_dir, "Step13B_sample_level_module_scores_by_condition.csv"),
  row.names = FALSE
)

ggplot(
  sample_module_score,
  aes(x = condition, y = mean_score)
) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    aes(shape = sex),
    width = 0.15,
    size = 2.5,
    alpha = 0.8
  ) +
  facet_wrap(~ module, scales = "free_y", ncol = 3) +
  labs(
    title = "Sample-Level Module Scores by Condition",
    subtitle = "Each point represents one sample/donor",
    x = "Condition",
    y = "Mean module score per sample"
  ) +
  theme_classic()

16 Cluster Summary Table

Show code
meta <- SO_micro@meta.data %>%
  as_tibble(rownames = "barcode")

cluster_summary <- meta %>%
  group_by(cluster = seurat_clusters_harmony) %>%
  summarise(
    n_cells   = n(),
    n_PD      = sum(condition == "PD"),
    n_C       = sum(condition == "C"),
    pct_PD    = 100 * n_PD / n_cells,
    across(
      all_of(score_cols),
      list(mean = ~mean(., na.rm = TRUE)),
      .names = "{.col}_mean"
    ),
    .groups = "drop"
  ) %>%
  arrange(cluster)

cluster_summary
Show code
write.csv(
  cluster_summary,
  file.path(results_dir, "Step13B_harmony_cluster_summary.csv"),
  row.names = FALSE
)

17 Conservative Microglia State Labels

State labels are assigned conservatively based on convergent evidence from:

  1. PD vs Control DE gene expression across clusters
  2. Module scores
  3. Canonical marker patterns
Show code
# Conservative state labels are assigned manually based on
# PD-vs-Control DE, canonical marker expression, and module-score interpretation.
#
# Rationale:
# - Cluster 1 shows the strongest DAM-like / activated microglia signature.
# - Cluster 0 is used as the relatively homeostatic / baseline microglia population.
# - Clusters 2–5 are retained as Other because they are smaller, less central,
#   or may include non-core / mixed microglia-like states.

SO_micro$micro_state_harmony <- dplyr::case_when(
  SO_micro$seurat_clusters_harmony == "1" ~ "DAM_like",
  SO_micro$seurat_clusters_harmony == "0" ~ "Homeostatic_like",
  SO_micro$seurat_clusters_harmony %in% c("2", "3", "4", "5") ~ "Other",
  TRUE ~ "Unassigned"
)

SO_micro$micro_state_harmony <- factor(
  SO_micro$micro_state_harmony,
  levels = c("Homeostatic_like", "DAM_like", "Other", "Unassigned")
)

table(SO_micro$micro_state_harmony)

Homeostatic_like         DAM_like            Other       Unassigned 
            1523             1191              725                0 
Show code
state_summary <- SO_micro@meta.data %>%
  tibble::as_tibble(rownames = "barcode") %>%
  dplyr::count(
    seurat_clusters_harmony,
    micro_state_harmony,
    condition,
    sex,
    sample,
    name = "n_cells"
  ) %>%
  dplyr::arrange(seurat_clusters_harmony, condition, sample)

write.csv(
  state_summary,
  file.path(results_dir, "Step13B_microglia_state_summary_manual_labels.csv"),
  row.names = FALSE
)

state_summary
Show code
DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "micro_state_harmony",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.5
) +
  labs(title = "Conservative Microglia State Labels")

18 Export Microglia State Barcodes

Show code
barcode_export <- SO_micro@meta.data %>%
  tibble::as_tibble(rownames = "barcode") %>%
  dplyr::select(
    barcode,
    seurat_clusters_harmony,
    condition,
    sex,
    sample,
    micro_state_harmony
  )
write.csv(
  barcode_export,
  file.path(results_dir, "Step13B_microglia_state_barcodes_all.csv"),
  row.names = FALSE
)

# Also save per-state barcode files
for (state_name in levels(SO_micro$micro_state_harmony)) {
  barcodes <- barcode_export %>%
    filter(micro_state_harmony == state_name) %>%
    pull(barcode)
  write.csv(
    data.frame(barcode = barcodes),
    file.path(results_dir, paste0("Step13B_barcodes_", state_name, ".csv")),
    row.names = FALSE
  )
}

19 Save Step 13B Object

Show code
output_path <- file.path(
  results_dir,
  "SO_micro_step13B_DE_pathway_state_annotated.rds"
)

saveRDS(SO_micro, output_path)

write.csv(
  data.frame(
    setting = c(
      "input_object",
      "overall_DE_method",
      "overall_DE_covariate",
      "n_overall_sig_genes",
      "n_PD_up",
      "n_Control_up",
      "n_clusters_tested",
      "module_list"
    ),
    value = c(
      input_path,
      de_overall_method,
      ifelse(grepl("sex", de_overall_method), "sex", "none"),
      nrow(de_sig),
      length(pd_up_genes),
      length(ctrl_up_genes),
      sum(method_summary_df$method != "SKIPPED"),
      paste(names(module_list), collapse = ", ")
    )
  ),
  file.path(results_dir, "Step13B_analysis_settings.csv"),
  row.names = FALSE
)

cat("Step 13B complete.\n")
Step 13B complete.
Show code
cat("Saved:", output_path, "\n")
Saved: /Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026/results/Step13_microglia_harmony_reclustering/SO_micro_step13B_DE_pathway_state_annotated.rds 
Show code
cat("Cells:", ncol(SO_micro), "\n")
Cells: 3439 
Show code
cat("States:", paste(levels(SO_micro$micro_state_harmony), collapse = ", "), "\n")
States: Homeostatic_like, DAM_like, Other, Unassigned 

Narrative summary: The primary differential expression analysis was performed between PD and control microglia while accounting for sex. Harmony-based clusters were not used as the primary DE groups. Instead, clusters were used after PD-vs-Control DE to identify which microglia subpopulations carry PD-associated transcriptional changes.