Step 13: Microglia State Analysis

Harmony-based reclustering, activated/DAM state assessment, and lncRNA expression profiling in microglia

1 Setup: Environment and Data

Harmony correction is applied at the microglia-only level using sample identity as the grouping variable. Harmony-corrected embeddings are used for clustering and visualization, whereas RNA assay expression values are retained for marker visualization, module scoring, and downstream differential expression.

The biological goal is:

Microglia
→ homeostatic microglia
→ activated / disease-associated microglia (DAM)
→ inflammatory microglia state

Show code

library(Seurat)
library(tidyverse)
library(patchwork)
library(harmony)
library(pheatmap)
library(Matrix)
library(rtracklayer)
library(DT)

set.seed(1234)

project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"

fernando_path <- file.path(
  project_dir,
  "lncRNAs_PD_Fer",
  "snRNAseq",
  "seurat_object_annotated.rds"
)

results_dir <- file.path(
  project_dir,
  "results",
  "Step13_microglia_harmony_reclustering"
)

dir.create(
  results_dir,
  recursive = TRUE,
  showWarnings = FALSE
)

SO <- readRDS(fernando_path)

SO

An object of class Seurat 
87674 features across 46066 samples within 2 assays 
Active assay: SCT (39901 features, 3000 variable features)
 3 layers present: counts, data, scale.data
 1 other assay present: RNA
 3 dimensional reductions calculated: pca, umap, harmony

2 Step 13.1: Check Metadata in Object

This analysis requires three metadata columns:

Cell_types = cell-type annotation
condition  = disease condition, such as C or PD
Sample     = sample identity used for Harmony correction

Show code

required_metadata <- c("Cell_types", "condition", "Sample")

missing_metadata <- setdiff(
  required_metadata,
  colnames(SO@meta.data)
)

if (length(missing_metadata) > 0) {
  stop(
    paste(
      "Missing required metadata columns:",
      paste(missing_metadata, collapse = ", ")
    )
  )
}

SO$fernando_celltype_annotation <- as.character(SO$Cell_types)
SO$condition_use <- as.character(SO$condition)
SO$sample_use <- as.character(SO$Sample)

table(SO$fernando_celltype_annotation)
table(SO$condition_use)
table(SO$sample_use)

Show code

write.csv(
  as.data.frame(table(SO$fernando_celltype_annotation)),
  file.path(results_dir, "Step13_fernando_celltype_annotation_table.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO$condition_use)),
  file.path(results_dir, "Step13_condition_table.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO$sample_use)),
  file.path(results_dir, "Step13_sample_table.csv"),
  row.names = FALSE
)

3 Step 13.2: Inspect Full-Object UMAP

The downstream microglia analysis uses newly computed microglia-only Harmony embeddings.

Show code

available_reductions <- Reductions(SO)
available_reductions

[1] "pca"     "umap"    "harmony"

Show code

full_umap_reduction <- case_when(
  "umap_rpca" %in% available_reductions ~ "umap_rpca",
  "umap" %in% available_reductions ~ "umap",
  TRUE ~ NA_character_
)

full_umap_reduction

[1] "umap"

Show code

if (!is.na(full_umap_reduction)) {
  DimPlot(
    SO,
    reduction = full_umap_reduction,
    group.by = "fernando_celltype_annotation",
    label = TRUE,
    repel = TRUE,
    pt.size = 0.4
  ) +
    labs(title = "Annotated Cell Types")
} else {
  message("No existing UMAP reduction found in Fernando object. Skipping full-object DimPlot.")
}

4 Step 13.3: Extract Microglia

Show code

all_fernando_celltypes <- sort(unique(SO$fernando_celltype_annotation))

target_microglia <- all_fernando_celltypes[
  stringr::str_detect(
    all_fernando_celltypes,
    regex("micro", ignore_case = TRUE)
  )
]

target_microglia

if (length(target_microglia) == 0) {
  stop("No Fernando-defined microglia labels found. Check table(SO$fernando_celltype_annotation).")
}

SO_micro <- subset(
  SO,
  subset = fernando_celltype_annotation %in% target_microglia
)

if (ncol(SO_micro) == 0) {
  stop("No microglia cells remained after subsetting.")
}

SO_micro$micro_original_identity <- SO_micro$fernando_celltype_annotation
SO_micro$condition <- SO_micro$condition_use
SO_micro$sample <- SO_micro$sample_use
SO_micro$sex <- case_when(
  stringr::str_detect(SO_micro$sample, "_F$") ~ "Female",
  stringr::str_detect(SO_micro$sample, "_M$") ~ "Male",
  TRUE ~ "Unknown"
)

SO_micro$sex <- factor(
  SO_micro$sex,
  levels = c("Female", "Male", "Unknown")
)

table(SO_micro$sex)
table(SO_micro$condition, SO_micro$sex)

table(SO_micro$micro_original_identity)
table(SO_micro$condition)
table(SO_micro$sample)

SO_micro

Show code

write.csv(
  data.frame(cell = Cells(SO_micro)),
  file.path(results_dir, "Step13_fernando_defined_microglia_cells.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO_micro$micro_original_identity, SO_micro$condition)),
  file.path(results_dir, "Step13_microglia_label_condition_table.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO_micro$micro_original_identity, SO_micro$sample)),
  file.path(results_dir, "Step13_microglia_label_sample_table.csv"),
  row.names = FALSE
)

Show code

if (!is.na(full_umap_reduction)) {
  p_original_label <- DimPlot(
    SO_micro,
    reduction = full_umap_reduction,
    group.by = "micro_original_identity",
    label = TRUE,
    repel = TRUE,
    pt.size = 0.5
  ) +
    labs(title = "Fernando-Defined Microglia Subset")

  p_original_condition <- DimPlot(
    SO_micro,
    reduction = full_umap_reduction,
    group.by = "condition",
    pt.size = 0.5
  ) +
    labs(title = "Condition Distribution in Fernando-Defined Microglia")

  p_original_condition$layers[[1]]$aes_params$alpha <- 0.3

  p_original_label | p_original_condition
} else {
  message("No existing UMAP reduction found. Skipping pre-reclustering microglia UMAP.")
}

Show code

rm(SO)
gc()

           used  (Mb) gc trigger (Mb) limit (Mb)  max used   (Mb)
Ncells 10728295 573.0   16926111  904         NA  16926111  904.0
Vcells 71436918 545.1  917104496 6997     102400 870565854 6641.9

5 Step 13.4: Preprocess Microglia

After subsetting microglia, PCA is re-computed within the microglia subset. This PCA is used as input for Harmony correction.

Show code

DefaultAssay(SO_micro) <- "RNA"

if ("Assay5" %in% class(SO_micro[["RNA"]])) {
  SO_micro <- JoinLayers(SO_micro, assay = "RNA")
}

SO_micro <- NormalizeData(
  SO_micro,
  verbose = FALSE
)

SO_micro <- FindVariableFeatures(
  SO_micro,
  selection.method = "vst",
  nfeatures = 2000,
  verbose = FALSE
)

SO_micro <- ScaleData(
  SO_micro,
  verbose = FALSE
)

SO_micro <- RunPCA(
  SO_micro,
  npcs = 40,
  reduction.name = "pca_micro",
  reduction.key = "microPCA_",
  verbose = FALSE
)

ElbowPlot(
  SO_micro,
  reduction = "pca_micro",
  ndims = 40
) +
  labs(title = "Fernando-Defined Microglia PCA Elbow Plot")

6 Step 13.5: Run Harmony Within Microglia

Harmony is applied within the microglia subset using sample identity as the grouping variable.

Condition is not used as the Harmony grouping variable because disease condition is biologically relevant and should not be directly regressed out at this step.

Show code

harmony_group_var <- "sample"

if (!harmony_group_var %in% colnames(SO_micro@meta.data)) {
  stop(
    paste(
      "Harmony grouping variable not found in metadata:",
      harmony_group_var
    )
  )
}

SO_micro <- RunHarmony(
  object = SO_micro,
  group.by.vars = harmony_group_var,
  reduction = "pca_micro",
  reduction.save = "harmony_micro",
  verbose = TRUE
)

Initializing centroids

Show code

SO_micro

An object of class Seurat 
87674 features across 3439 samples within 2 assays 
Active assay: RNA (47773 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 1 other assay present: SCT
 5 dimensional reductions calculated: pca, umap, harmony, pca_micro, harmony_micro

7 Step 13.6: Harmony-Based Microglia Clustering and UMAP

Harmony-corrected embeddings are used for neighbor graph construction, clustering, and UMAP visualization.

Show code

set.seed(1234)

dims_use_harmony <- 1:30
resolution_use_harmony <- 0.5

SO_micro <- FindNeighbors(
  SO_micro,
  reduction = "harmony_micro",
  dims = dims_use_harmony,
  graph.name = "micro_harmony_snn",
  verbose = FALSE
)

SO_micro <- FindClusters(
  SO_micro,
  graph.name = "micro_harmony_snn",
  resolution = resolution_use_harmony,
  verbose = FALSE
)

SO_micro$seurat_clusters_harmony <- as.character(SO_micro$seurat_clusters)

SO_micro <- RunUMAP(
  SO_micro,
  reduction = "harmony_micro",
  dims = dims_use_harmony,
  reduction.name = "umap_micro_harmony",
  reduction.key = "microHarmonyUMAP_",
  verbose = FALSE
)

table(SO_micro$seurat_clusters_harmony)


   0    1    2    3    4    5 
1523 1191  406  153  142   24

Show code

p_harmony_cluster <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.5
) +
  labs(title = "Harmony-Corrected Microglia Subclusters")

p_harmony_condition <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "condition",
  pt.size = 0.4
) +
  labs(title = "Condition Distribution")

p_harmony_sex <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "sex",
  pt.size = 0.4
) +
  labs(title = "Sex Distribution")

p_harmony_sample <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "sample",
  pt.size = 0.4
) +
  labs(title = "Sample Distribution")

p_split_condition <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "condition",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.35
) +
  labs(title = "Harmony Microglia Clusters Split by Condition")

p_split_sex <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "sex",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.35
) +
  labs(title = "Harmony Microglia Clusters Split by Sex")

p_split_sample <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "sample",
  label = FALSE,
  pt.size = 0.25,
  ncol = 4
) +
  labs(title = "Harmony Microglia Clusters Split by Sample")

p_harmony_cluster$layers[[1]]$aes_params$alpha <- 0.5
p_harmony_condition$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_sex$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_sample$layers[[1]]$aes_params$alpha <- 0.4

Show code

p_harmony_cluster | p_harmony_condition

Show code

p_split_condition

Show code

p_harmony_cluster | p_harmony_sex

Show code

p_split_sex

Show code

p_harmony_cluster | p_harmony_sample

Show code

p_split_sample

8 Optional: Resolution Sensitivity Check

Show code

resolution_grid <- c(0.3, 0.4, 0.5, 0.6)

SO_micro <- FindClusters(
  SO_micro,
  graph.name = "micro_harmony_snn",
  resolution = resolution_grid,
  verbose = FALSE
)

resolution_cols <- paste0("micro_harmony_snn_res.", resolution_grid)

p_resolution_list <- lapply(seq_along(resolution_grid), function(i) {
  res <- resolution_grid[i]
  res_col <- resolution_cols[i]

  DimPlot(
    SO_micro,
    reduction = "umap_micro_harmony",
    group.by = res_col,
    label = TRUE,
    repel = TRUE,
    pt.size = 0.35
  ) +
    labs(
      title = paste0("Resolution = ", res),
      x = "UMAP 1",
      y = "UMAP 2"
    )
})

wrap_plots(p_resolution_list, ncol = 2)

9 Step 13.6.5: Compare Unintegrated and Harmony-Corrected Microglia UMAPs

This comparison shows whether Harmony correction reduces sample- or condition-driven separation within the microglia subset.

The unintegrated UMAP is computed from microglia-only RNA PCA, whereas the Harmony UMAP is computed from Harmony-corrected microglia embeddings.

Show code

set.seed(1234)

dims_use_unintegrated <- dims_use_harmony
resolution_use_unintegrated <- resolution_use_harmony

SO_micro <- FindNeighbors(
  SO_micro,
  reduction = "pca_micro",
  dims = dims_use_unintegrated,
  graph.name = "micro_unintegrated_snn",
  verbose = FALSE
)

SO_micro <- FindClusters(
  SO_micro,
  graph.name = "micro_unintegrated_snn",
  resolution = resolution_use_unintegrated,
  cluster.name = "seurat_clusters_unintegrated",
  verbose = FALSE
)

SO_micro$seurat_clusters_unintegrated <- as.character(
  SO_micro$seurat_clusters_unintegrated
)

SO_micro <- RunUMAP(
  SO_micro,
  reduction = "pca_micro",
  dims = dims_use_unintegrated,
  reduction.name = "umap_micro_unintegrated",
  reduction.key = "microUnintegratedUMAP_",
  verbose = FALSE
)

table(SO_micro$seurat_clusters_unintegrated)


  0   1   2   3   4   5   6   7   8   9 
659 540 491 491 396 367 334 109  50   2

Show code

p_unintegrated_condition <- DimPlot(
  SO_micro,
  reduction = "umap_micro_unintegrated",
  group.by = "condition",
  pt.size = 0.4
) +
  labs(
    title = "Before Harmony",
    subtitle = "Microglia-only RNA PCA UMAP",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_harmony_condition_compare <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "condition",
  pt.size = 0.4
) +
  labs(
    title = "After Harmony",
    subtitle = "Microglia-only Harmony UMAP",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_unintegrated_condition$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_condition_compare$layers[[1]]$aes_params$alpha <- 0.4

p_unintegrated_condition | p_harmony_condition_compare

Show code

p_unintegrated_sample <- DimPlot(
  SO_micro,
  reduction = "umap_micro_unintegrated",
  group.by = "sample",
  pt.size = 0.35
) +
  labs(
    title = "Before Harmony",
    subtitle = "Colored by sample",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_harmony_sample_compare <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "sample",
  pt.size = 0.35
) +
  labs(
    title = "After Harmony",
    subtitle = "Colored by sample",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_unintegrated_sample$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_sample_compare$layers[[1]]$aes_params$alpha <- 0.4

p_unintegrated_sample | p_harmony_sample_compare

Show code

p_unintegrated_cluster <- DimPlot(
  SO_micro,
  reduction = "umap_micro_unintegrated",
  group.by = "seurat_clusters_unintegrated",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.4
) +
  labs(
    title = "Before Harmony",
    subtitle = "Unintegrated RNA PCA clusters",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_harmony_cluster_compare <- DimPlot(
  SO_micro,
  reduction = "umap_micro_harmony",
  group.by = "seurat_clusters_harmony",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.4
) +
  labs(
    title = "After Harmony",
    subtitle = "Harmony-based clusters",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_unintegrated_cluster$layers[[1]]$aes_params$alpha <- 0.5
p_harmony_cluster_compare$layers[[1]]$aes_params$alpha <- 0.5

p_unintegrated_cluster | p_harmony_cluster_compare

Show code

cluster_comparison_table <- table(
  unintegrated_cluster = SO_micro$seurat_clusters_unintegrated,
  harmony_cluster = SO_micro$seurat_clusters_harmony
)

write.csv(
  as.data.frame.matrix(cluster_comparison_table),
  file.path(results_dir, "Step13_unintegrated_vs_harmony_cluster_table.csv")
)

cluster_comparison_table

                    harmony_cluster
unintegrated_cluster   0   1   2   3   4   5
                   0 155 449  24   9  19   3
                   1 262 260   9   6   2   1
                   2 263  46  20 107  37  18
                   3 224 237   5   7  18   0
                   4 358  15   2  12   7   2
                   5 219 119  15   5   9   0
                   6   4   1 324   4   1   0
                   7  35  58   6   2   8   0
                   8   2   5   1   1  41   0
                   9   1   1   0   0   0   0

10 Step 13.7: Validate Microglia and Activated Microglia Markers

Here we check expression, not only differential expression.

The point is:

canonical microglia marker expression = confirms broad microglia identity
top DE marker after microglia reclustering = identifies microglia state/subtype differences

After subsetting microglia, canonical microglia markers may not appear as top differential markers between microglia subclusters because they are expected to be broadly expressed across microglia.

Show code

micro_marker_genes <- c(
  "CX3CR1", "P2RY12", "TMEM119", "AIF1", "CSF1R",
  "TREM2", "TYROBP", "CD68", "SPP1", "APOE", "LPL"
)

micro_marker_present <- intersect(
  micro_marker_genes,
  rownames(SO_micro)
)

if (length(micro_marker_present) == 0) {
  stop("None of the microglia marker genes were found in SO_micro.")
}

micro_marker_present

 [1] "CX3CR1"  "P2RY12"  "TMEM119" "AIF1"    "CSF1R"   "TREM2"   "TYROBP" 
 [8] "CD68"    "SPP1"    "APOE"    "LPL"

Show code

FeaturePlot(
  SO_micro,
  features = micro_marker_present,
  reduction = "umap_micro_harmony",
  ncol = 4,
  pt.size = 0.5
)

Show code

DotPlot(
  SO_micro,
  features = micro_marker_present,
  group.by = "seurat_clusters_harmony"
) +
  RotatedAxis() +
  labs(title = "Canonical and Activated Microglia Marker Expression by Harmony Cluster")

11 Step 13.8: Define Microglia Activation Modules

Activated and disease-associated microglia (DAM) states are inferred inside the microglia subset using marker expression and module scores.

Show code

micro_homeostatic_genes <- c(
  "CX3CR1", "P2RY12", "TMEM119", "CSF1R", "HEXB", "SALL1", "GPR34"
)

dam_genes <- c(
  "TREM2", "TYROBP", "SPP1", "APOE", "LPL", "CD9", "CST7", "GPNMB",
  "CTSD", "ITGAX", "CLEC7A", "LGALS3"
)

inflammatory_genes <- c(
  "IL1B", "TNF", "IL6", "CCL2", "CXCL10", "NFKBIA", "TNFAIP3", "SOCS3",
  "HLA-DRA", "HLA-DRB1", "CD74"
)

phagocytic_genes <- c(
  "CD68", "LAMP1", "CTSD", "CTSB", "CTSL", "LGALS3", "FCGR3A", "MSR1"
)

interferon_response_genes <- c(
  "IFIT1", "IFIT2", "IFIT3", "ISG15", "MX1", "OAS1", "STAT1", "IRF7", "IFI6"
)

module_gene_list <- list(
  Micro_homeostatic = intersect(micro_homeostatic_genes, rownames(SO_micro)),
  DAM = intersect(dam_genes, rownames(SO_micro)),
  Inflammatory = intersect(inflammatory_genes, rownames(SO_micro)),
  Phagocytic = intersect(phagocytic_genes, rownames(SO_micro)),
  Interferon_response = intersect(interferon_response_genes, rownames(SO_micro))
)

module_gene_list

$Micro_homeostatic
[1] "CX3CR1"  "P2RY12"  "TMEM119" "CSF1R"   "HEXB"    "SALL1"   "GPR34"  

$DAM
 [1] "TREM2"  "TYROBP" "SPP1"   "APOE"   "LPL"    "CD9"    "CST7"   "GPNMB" 
 [9] "CTSD"   "ITGAX"  "CLEC7A" "LGALS3"

$Inflammatory
 [1] "IL1B"     "TNF"      "IL6"      "CCL2"     "CXCL10"   "NFKBIA"  
 [7] "TNFAIP3"  "SOCS3"    "HLA-DRA"  "HLA-DRB1" "CD74"    

$Phagocytic
[1] "CD68"   "LAMP1"  "CTSD"   "CTSB"   "CTSL"   "LGALS3" "FCGR3A" "MSR1"  

$Interferon_response
[1] "IFIT1" "IFIT2" "IFIT3" "ISG15" "MX1"   "OAS1"  "STAT1" "IRF7"  "IFI6"

Show code

empty_modules <- names(module_gene_list)[lengths(module_gene_list) == 0]

if (length(empty_modules) > 0) {
  warning(
    paste(
      "These modules have no genes present in the object:",
      paste(empty_modules, collapse = ", ")
    )
  )
}

Show code

for (module_name in names(module_gene_list)) {
  genes_use <- module_gene_list[[module_name]]

  if (length(genes_use) > 0) {
    SO_micro <- AddModuleScore(
      SO_micro,
      features = list(genes_use),
      name = paste0(module_name, "_score")
    )
  }
}

score_features <- c(
  "Micro_homeostatic_score1",
  "DAM_score1",
  "Inflammatory_score1",
  "Phagocytic_score1",
  "Interferon_response_score1"
)

score_features_present <- intersect(
  score_features,
  colnames(SO_micro@meta.data)
)

if (length(score_features_present) == 0) {
  stop("No module score columns found.")
}

score_features_present

[1] "Micro_homeostatic_score1"   "DAM_score1"                
[3] "Inflammatory_score1"        "Phagocytic_score1"         
[5] "Interferon_response_score1"

12 Step 13.9: Visualize Microglia Activation Modules

Show code

FeaturePlot(
  SO_micro,
  features = score_features_present,
  reduction = "umap_micro_harmony",
  ncol = 3,
  pt.size = 0.5
)

Show code

activation_score_features <- intersect(
  c(
    "DAM_score1",
    "Inflammatory_score1",
    "Phagocytic_score1",
    "Interferon_response_score1"
  ),
  colnames(SO_micro@meta.data)
)

VlnPlot(
  SO_micro,
  features = activation_score_features,
  group.by = "seurat_clusters_harmony",
  pt.size = 0
)

13 Step 13.10: Summarize Microglia Clusters

We summarize each microglia cluster by:

cell number
module scores
condition composition
sample composition

This helps distinguish robust microglia states from clusters dominated by a single sample.

Show code

harmony_cluster_condition_table <- table(
  harmony_cluster = SO_micro$seurat_clusters_harmony,
  condition = SO_micro$condition
)

harmony_cluster_sample_table <- table(
  harmony_cluster = SO_micro$seurat_clusters_harmony,
  sample = SO_micro$sample
)

harmony_cluster_sex_table <- table(
  harmony_cluster = SO_micro$seurat_clusters_harmony,
  sex = SO_micro$sex
)

harmony_cluster_condition_table

               condition
harmony_cluster   C  PD
              0 745 778
              1 565 626
              2 263 143
              3  96  57
              4  66  76
              5  13  11

Show code

harmony_cluster_sample_table

               sample
harmony_cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F
              0      243       19      218      162      101        2       62
              1       28       37      113      370       17        0       22
              2       57       45       62       57       28       14       13
              3       15       16        9       14       32       10       13
              4        9        1       10       21       22        3        9
              5        5        2        0        4        2        0        1
               sample
harmony_cluster PDSN02_F PDSN04_F PDSN06_M PDSN07_M PDSN09_M
              0      123      285      172       35      101
              1       15      260      144       59      126
              2       26       36       36       16       16
              3       10       11        7        5       11
              4        5       13       15        9       25
              5        3        2        4        0        1

Show code

harmony_cluster_sex_table

               sex
harmony_cluster Female Male Unknown
              0    950  573       0
              1    475  716       0
              2    239  167       0
              3     74   79       0
              4     47   95       0
              5     13   11       0

Show code

cluster_sample_composition <- SO_micro@meta.data %>%
  mutate(
    harmony_cluster = as.character(seurat_clusters_harmony)
  ) %>%
  count(
    harmony_cluster,
    sample,
    condition,
    sex,
    name = "n_cells"
  ) %>%
  group_by(harmony_cluster) %>%
  mutate(
    cluster_total_cells = sum(n_cells),
    percent_of_cluster = 100 * n_cells / cluster_total_cells
  ) %>%
  ungroup() %>%
  arrange(
    harmony_cluster,
    desc(n_cells)
  ) %>%
  mutate(
    percent_of_cluster = round(percent_of_cluster, 2)
  )

DT::datatable(
  cluster_sample_composition,
  rownames = FALSE,
  filter = "top",
  options = list(
    pageLength = 25,
    autoWidth = TRUE,
    scrollX = TRUE
  )
)

Show code

dominant_sample_summary <- cluster_sample_composition %>%
  group_by(harmony_cluster) %>%
  slice_max(
    order_by = percent_of_cluster,
    n = 1,
    with_ties = FALSE
  ) %>%
  ungroup() %>%
  select(
    harmony_cluster,
    dominant_sample = sample,
    dominant_condition = condition,
    dominant_sex = sex,
    dominant_sample_cells = n_cells,
    cluster_total_cells,
    dominant_sample_percent = percent_of_cluster
  ) %>%
  arrange(desc(dominant_sample_percent))

DT::datatable(
  dominant_sample_summary,
  rownames = FALSE,
  filter = "top",
  options = list(
    pageLength = 10,
    autoWidth = TRUE,
    scrollX = TRUE
  )
)

Show code

write.csv(
  cluster_sample_composition,
  file.path(results_dir, "Step13_cluster_sample_composition_long.csv"),
  row.names = FALSE
)

write.csv(
  dominant_sample_summary,
  file.path(results_dir, "Step13_dominant_sample_summary_by_cluster.csv"),
  row.names = FALSE
)

Show code

harmony_cluster_summary <- SO_micro@meta.data %>%
  mutate(
    harmony_cluster = as.character(seurat_clusters_harmony)
  ) %>%
  group_by(harmony_cluster) %>%
  summarise(
    n_cells = n(),
    mean_micro_homeostatic = mean(Micro_homeostatic_score1, na.rm = TRUE),
    mean_dam = mean(DAM_score1, na.rm = TRUE),
    mean_inflammatory = mean(Inflammatory_score1, na.rm = TRUE),
    mean_phagocytic = mean(Phagocytic_score1, na.rm = TRUE),
    mean_interferon = mean(Interferon_response_score1, na.rm = TRUE),
    percent_PD = mean(condition == "PD", na.rm = TRUE),
    percent_C = mean(condition == "C", na.rm = TRUE),
    percent_female = mean(sex == "Female", na.rm = TRUE),
    percent_male = mean(sex == "Male", na.rm = TRUE),
    dominant_sex = names(sort(table(sex), decreasing = TRUE))[1],
    n_samples = n_distinct(sample),
    dominant_condition = names(sort(table(condition), decreasing = TRUE))[1],
    dominant_sample = names(sort(table(sample), decreasing = TRUE))[1],
    dominant_sample_percent = max(table(sample)) / n(),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_dam))

DT::datatable(
  harmony_cluster_summary,
  rownames = FALSE,
  filter = "top",
  options = list(
    pageLength = 10,
    autoWidth = TRUE,
    scrollX = TRUE
  )
)

Show code

harmony_cluster_summary

14 Step 13.11: Define Microglia States

Harmony-corrected clusters are used to define microglia states conservatively.

State labels are intentionally left unassigned at this stage and should be manually defined after inspecting:

module scores
canonical and activated marker expression
condition composition
sample composition
cluster robustness

Show code

SO_micro$micro_cluster_harmony <- as.character(SO_micro$seurat_clusters_harmony)

dam_clusters <- c("1")
homeostatic_clusters <- c("0")
other_clusters <- c("2", "3", "4", "5")

if (length(c(dam_clusters, homeostatic_clusters, other_clusters)) == 0) {
  message("No microglia states assigned yet. Assign cluster IDs after inspecting the results above.")
  SO_micro$micro_state_harmony <- "Unassigned"
} else {
  SO_micro$micro_state_harmony <- case_when(
    SO_micro$micro_cluster_harmony %in% dam_clusters ~ "DAM",
    SO_micro$micro_cluster_harmony %in% homeostatic_clusters ~ "Homeostatic",
    SO_micro$micro_cluster_harmony %in% other_clusters ~ "Other",
    TRUE ~ "Unassigned"
  )
}

SO_micro$micro_state_harmony <- factor(
  SO_micro$micro_state_harmony,
  levels = c(
    "DAM",
    "Homeostatic",
    "Other",
    "Unassigned"
  )
)

table(SO_micro$micro_cluster_harmony, SO_micro$micro_state_harmony)

   
     DAM Homeostatic Other Unassigned
  0    0        1523     0          0
  1 1191           0     0          0
  2    0           0   406          0
  3    0           0   153          0
  4    0           0   142          0
  5    0           0    24          0

Show code

table(SO_micro$micro_state_harmony, SO_micro$condition)

             
                C  PD
  DAM         565 626
  Homeostatic 745 778
  Other       438 287
  Unassigned    0   0

Show code

micro_state_barcodes <- SO_micro@meta.data %>%
  tibble::rownames_to_column("barcode") %>%
  select(
    barcode,
    micro_cluster_harmony,
    micro_state_harmony,
    condition,
    sample,
    sex
  ) %>%
  arrange(
    micro_state_harmony,
    micro_cluster_harmony,
    sample
  )

write.csv(
  micro_state_barcodes,
  file.path(results_dir, "Step13_microglia_state_barcodes_all.csv"),
  row.names = FALSE
)

# Export one barcode file per microglia state
micro_states_use <- levels(SO_micro$micro_state_harmony)

for (state in micro_states_use) {
  state_safe <- stringr::str_replace_all(state, "[^A-Za-z0-9]+", "_")
  
  state_barcodes <- micro_state_barcodes %>%
    filter(micro_state_harmony == state)
  
  write.csv(
    state_barcodes,
    file.path(
      results_dir,
      paste0("Step13_barcodes_", state_safe, ".csv")
    ),
    row.names = FALSE
  )
}

table(micro_state_barcodes$micro_state_harmony)


        DAM Homeostatic       Other  Unassigned 
       1191        1523         725           0

15 Step 13.12: Add External GENCODE Annotation for Context-Based lncRNA Subtype Classification

Recent GENCODE releases use a broad lncRNA gene biotype and do not always preserve older lncRNA subtypes such as lincRNA, antisense, sense_intronic, or sense_overlapping.

Therefore, lncRNAs are classified here by genomic context relative to protein-coding genes using the comprehensive GENCODE annotation.

The context-based categories are:

LINC
= lncRNA genes without overlap with protein-coding genes

Antisense
= lncRNA genes overlapping protein-coding genes on the opposite strand

Sense_gene_body_contained
= lncRNA genes fully contained within protein-coding genes on the same strand

Sense_overlapping
= lncRNA genes overlapping protein-coding genes on the same strand, but not fully intronic

Other_lncRNA_context
= lncRNA genes that do not clearly fit the above categories

Show code

reference_dir <- file.path(project_dir, "reference")
dir.create(reference_dir, recursive = TRUE, showWarnings = FALSE)

gencode_comprehensive_gtf_path <- file.path(
  project_dir,
  "reference",
  "gencode.v49.annotation.gtf.gz"
)

if (!file.exists(gencode_comprehensive_gtf_path)) {
  stop("GENCODE comprehensive GTF file not found. Download it from Terminal first.")
}

gencode_all_gtf <- rtracklayer::import(gencode_comprehensive_gtf_path)

gencode_all_genes <- as.data.frame(gencode_all_gtf) %>%
  filter(type == "gene") %>%
  as_tibble() %>%
  transmute(
    seqnames = as.character(seqnames),
    start = start,
    end = end,
    strand = as.character(strand),
    gene_id = gene_id,
    gene_id_clean = stringr::str_replace(gene_id, "\\..*$", ""),
    gene_name = gene_name,
    gene_type = gene_type
  ) %>%
  distinct()

gencode_gene_type_summary <- gencode_all_genes %>%
  count(gene_type, name = "n_genes") %>%
  arrange(desc(n_genes))

write.csv(
  gencode_gene_type_summary,
  file.path(results_dir, "Step13_GENCODE_gene_type_summary.csv"),
  row.names = FALSE
)

gencode_gene_type_summary %>%
  filter(gene_type %in% c("protein_coding", "lncRNA"))

Show code

library(GenomicRanges)

lncRNA_gr <- gencode_all_genes %>%
  filter(gene_type == "lncRNA") %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end",
    strand.field = "strand"
  )

protein_coding_gr <- gencode_all_genes %>%
  filter(gene_type == "protein_coding") %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end",
    strand.field = "strand"
  )

# Any overlap with protein-coding genes, ignoring strand.
any_pc_overlap_hits <- findOverlaps(
  lncRNA_gr,
  protein_coding_gr,
  ignore.strand = TRUE
)

overlap_pairs <- tibble(
  lncRNA_gene_id = mcols(lncRNA_gr)$gene_id_clean[queryHits(any_pc_overlap_hits)],
  lncRNA_gene_name = mcols(lncRNA_gr)$gene_name[queryHits(any_pc_overlap_hits)],
  lncRNA_strand = as.character(strand(lncRNA_gr))[queryHits(any_pc_overlap_hits)],
  lncRNA_start = start(lncRNA_gr)[queryHits(any_pc_overlap_hits)],
  lncRNA_end = end(lncRNA_gr)[queryHits(any_pc_overlap_hits)],
  pc_gene_id = mcols(protein_coding_gr)$gene_id_clean[subjectHits(any_pc_overlap_hits)],
  pc_gene_name = mcols(protein_coding_gr)$gene_name[subjectHits(any_pc_overlap_hits)],
  pc_strand = as.character(strand(protein_coding_gr))[subjectHits(any_pc_overlap_hits)],
  pc_start = start(protein_coding_gr)[subjectHits(any_pc_overlap_hits)],
  pc_end = end(protein_coding_gr)[subjectHits(any_pc_overlap_hits)]
)

# Opposite-strand protein-coding overlap.
antisense_lncRNA_ids <- overlap_pairs %>%
  filter(
    lncRNA_strand != pc_strand,
    lncRNA_strand %in% c("+", "-"),
    pc_strand %in% c("+", "-")
  ) %>%
  pull(lncRNA_gene_id) %>%
  unique()

# Same-strand protein-coding overlap.
same_strand_pairs <- overlap_pairs %>%
  filter(
    lncRNA_strand == pc_strand,
    lncRNA_strand %in% c("+", "-"),
    pc_strand %in% c("+", "-")
  )

# Sense-intronic:
# lncRNA is fully contained within a protein-coding gene on the same strand.
sense_intronic_lncRNA_ids <- same_strand_pairs %>%
  filter(
    lncRNA_start >= pc_start,
    lncRNA_end <= pc_end
  ) %>%
  pull(lncRNA_gene_id) %>%
  unique()

# Sense-overlapping:
# lncRNA overlaps a protein-coding gene on the same strand,
# but is not fully contained within it.
sense_overlapping_lncRNA_ids <- same_strand_pairs %>%
  filter(
    !(lncRNA_start >= pc_start & lncRNA_end <= pc_end)
  ) %>%
  pull(lncRNA_gene_id) %>%
  unique()

# LINC:
# lncRNA has no overlap with any protein-coding gene.
pc_overlapping_lncRNA_ids <- overlap_pairs %>%
  pull(lncRNA_gene_id) %>%
  unique()

linc_lncRNA_ids <- setdiff(
  mcols(lncRNA_gr)$gene_id_clean,
  pc_overlapping_lncRNA_ids
)

# Priority:
# 1. Antisense
# 2. Sense_gene_body_contained
# 3. Sense_overlapping
# 4. LINC
# 5. Other_lncRNA_context
#
# If one lncRNA overlaps multiple protein-coding genes in multiple ways,
# the most interpretable overlap class is assigned by this priority.
gencode_lncRNA_context_anno <- tibble(
  gene_id_clean = mcols(lncRNA_gr)$gene_id_clean,
  gencode_gene_id = mcols(lncRNA_gr)$gene_id,
  gencode_gene_name = mcols(lncRNA_gr)$gene_name,
  gencode_gene_type = mcols(lncRNA_gr)$gene_type,
  lncRNA_type = case_when(
  gene_id_clean %in% antisense_lncRNA_ids ~ "Antisense",
  gene_id_clean %in% sense_intronic_lncRNA_ids ~ "Sense_gene_body_contained",
  gene_id_clean %in% sense_overlapping_lncRNA_ids ~ "Sense_overlapping",
  gene_id_clean %in% linc_lncRNA_ids ~ "LINC",
  TRUE ~ "Other_lncRNA_context"
)
)

table(gencode_lncRNA_context_anno$lncRNA_type)

write.csv(
  gencode_lncRNA_context_anno,
  file.path(results_dir, "Step13_GENCODE_lncRNA_context_annotation.csv"),
  row.names = FALSE
)

write.csv(
  overlap_pairs,
  file.path(results_dir, "Step13_GENCODE_lncRNA_protein_coding_overlap_pairs.csv"),
  row.names = FALSE
)

Show code

gene_meta <- SO_micro[["RNA"]][[]] %>%
  tibble::rownames_to_column("gene") %>%
  mutate(
    gene_id_clean = stringr::str_replace(gene, "\\..*$", "")
  )

# Match by Ensembl gene ID.
gene_meta_context_ensembl <- gene_meta %>%
  left_join(
    gencode_lncRNA_context_anno,
    by = "gene_id_clean"
  )

n_context_matched_by_ensembl <- sum(!is.na(gene_meta_context_ensembl$lncRNA_type))

# Match by gene symbol.
# Keep both the Seurat gene name and the GENCODE gene name.
gene_meta_context_symbol <- gene_meta %>%
  left_join(
    gencode_lncRNA_context_anno %>%
      mutate(gencode_gene_name_join = gencode_gene_name) %>%
      select(
        gencode_gene_name_join,
        gencode_gene_id,
        gencode_gene_name,
        gene_id_clean_gencode = gene_id_clean,
        lncRNA_type
      ),
    by = c("gene" = "gencode_gene_name_join")
  )

n_context_matched_by_symbol <- sum(!is.na(gene_meta_context_symbol$lncRNA_type))

n_context_matched_by_ensembl

[1] 19045

Show code

n_context_matched_by_symbol

[1] 23746

Show code

if (n_context_matched_by_ensembl >= n_context_matched_by_symbol) {
  gene_meta <- gene_meta_context_ensembl
  lncRNA_context_match_method <- "Ensembl_gene_id"
} else {
  gene_meta <- gene_meta_context_symbol
  lncRNA_context_match_method <- "gene_symbol"
}

lncRNA_context_match_method

[1] "gene_symbol"

Show code

gene_meta <- gene_meta %>%
  mutate(
    lncRNA_type = if_else(
      is.na(lncRNA_type),
      "Not_lncRNA_or_unmatched",
      lncRNA_type
    ),
    is_lncRNA = lncRNA_type != "Not_lncRNA_or_unmatched"
  )

lncRNA_genes <- gene_meta %>%
  filter(is_lncRNA) %>%
  pull(gene) %>%
  intersect(rownames(SO_micro))

length(lncRNA_genes)

[1] 23730

Show code

table(gene_meta$lncRNA_type)


                Antisense                      LINC   Not_lncRNA_or_unmatched 
                     9465                     12339                     24043 
Sense_gene_body_contained         Sense_overlapping 
                     1498                       444

Show code

write.csv(
  gene_meta %>%
    filter(is_lncRNA),
  file.path(results_dir, "Step13_microglia_lncRNA_gene_annotation_context_based.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(gene_meta$lncRNA_type)),
  file.path(results_dir, "Step13_lncRNA_type_counts_context_based.csv"),
  row.names = FALSE
)

Show code

lncRNA_type_detection_summary <- gene_meta %>%
  filter(is_lncRNA) %>%
  count(lncRNA_type, name = "n_genes") %>%
  arrange(desc(n_genes))

lncRNA_type_detection_summary

Show code

write.csv(
  lncRNA_type_detection_summary,
  file.path(results_dir, "Step13_lncRNA_context_type_summary.csv"),
  row.names = FALSE
)

Show code

get_assay_data_safe <- function(object, assay = "RNA", layer_or_slot = "counts") {
  tryCatch(
    GetAssayData(object, assay = assay, layer = layer_or_slot),
    error = function(e) {
      GetAssayData(object, assay = assay, slot = layer_or_slot)
    }
  )
}

counts_mat <- get_assay_data_safe(
  SO_micro,
  assay = "RNA",
  layer_or_slot = "counts"
)

gene_detection_summary <- tibble(
  gene = rownames(counts_mat),
  n_cells_detected = Matrix::rowSums(counts_mat > 0),
  pct_cells_detected = Matrix::rowSums(counts_mat > 0) / ncol(counts_mat)
) %>%
  left_join(
    gene_meta %>%
      select(
        gene,
        gencode_gene_id,
        gencode_gene_name,
        lncRNA_type,
        is_lncRNA
      ),
    by = "gene"
  )

lncRNA_detection_summary <- gene_detection_summary %>%
  filter(is_lncRNA) %>%
  arrange(desc(n_cells_detected))

lncRNA_detection_summary %>%
  slice_head(n = 20)

Show code

write.csv(
  lncRNA_detection_summary,
  file.path(results_dir, "Step13_microglia_lncRNA_detection_summary.csv"),
  row.names = FALSE
)

16 Step 13.13: Global lncRNA Expression Fraction in Microglia

This section adds a per-cell lncRNA expression fraction. This helps check whether lncRNA expression is broadly distributed or concentrated in specific clusters, samples, or conditions.

Show code

if (length(lncRNA_genes) == 0) {
  stop("No lncRNA genes were identified from the available gene annotation.")
}

SO_micro[["percent_lncRNA"]] <- PercentageFeatureSet(
  SO_micro,
  features = lncRNA_genes,
  assay = "RNA"
)

summary(SO_micro$percent_lncRNA)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  4.512   9.968  11.515  11.935  13.430  35.765

Show code

FeaturePlot(
  SO_micro,
  features = "percent_lncRNA",
  reduction = "umap_micro_harmony",
  pt.size = 0.5
) +
  labs(title = "Percent lncRNA Expression in Microglia")

Show code

p_lnc_cluster <- VlnPlot(
  SO_micro,
  features = "percent_lncRNA",
  group.by = "seurat_clusters_harmony",
  pt.size = 0
) +
  labs(title = "Percent lncRNA by Cluster")

p_lnc_condition <- VlnPlot(
  SO_micro,
  features = "percent_lncRNA",
  group.by = "condition",
  pt.size = 0
) +
  labs(title = "Percent lncRNA by Condition")

p_lnc_cluster | p_lnc_condition

17 Step 13.14: Identify lncRNA Markers of Microglia Clusters

Here we identify lncRNAs that are enriched in each microglia cluster.
Harmony is used only to define clusters. Differential expression is performed on the RNA assay.

Show code

DefaultAssay(SO_micro) <- "RNA"
Idents(SO_micro) <- "seurat_clusters_harmony"

micro_harmony_markers <- FindAllMarkers(
  SO_micro,
  assay = "RNA",
  only.pos = TRUE,
  min.pct = 0.05,
  logfc.threshold = 0.25,
  verbose = FALSE
)

micro_lncRNA_markers <- micro_harmony_markers %>%
  left_join(
    gene_meta %>%
      select(
        gene,
        gencode_gene_id,
        gencode_gene_name,
        lncRNA_type,
        is_lncRNA
      ),
    by = "gene"
  ) %>%
  filter(
    is_lncRNA,
    p_val_adj < 0.05
  ) %>%
  mutate(
    pct_diff = pct.1 - pct.2
  ) %>%
  arrange(
    cluster,
    desc(avg_log2FC),
    desc(pct_diff),
    p_val_adj
  )

micro_lncRNA_markers_top_strict <- micro_lncRNA_markers %>%
  filter(pct_diff > 0.05) %>%
  group_by(cluster) %>%
  slice_head(n = 5) %>%
  ungroup()

micro_lncRNA_markers_top_per_cluster <- micro_lncRNA_markers %>%
  group_by(cluster) %>%
  slice_head(n = 5) %>%
  ungroup()

write.csv(
  micro_harmony_markers,
  file.path(results_dir, "Step13_harmony_cluster_all_markers.csv"),
  row.names = FALSE
)

write.csv(
  micro_lncRNA_markers,
  file.path(results_dir, "Step13_harmony_cluster_lncRNA_markers.csv"),
  row.names = FALSE
)

write.csv(
  micro_lncRNA_markers_top_strict,
  file.path(results_dir, "Step13_harmony_cluster_top_lncRNA_markers_strict_pctdiff.csv"),
  row.names = FALSE
)

write.csv(
  micro_lncRNA_markers_top_per_cluster,
  file.path(results_dir, "Step13_harmony_cluster_top_lncRNA_markers_per_cluster.csv"),
  row.names = FALSE
)

Show code

lncRNA_marker_type_summary <- micro_lncRNA_markers %>%
  count(cluster, lncRNA_type, name = "n_marker_lncRNAs") %>%
  arrange(cluster, desc(n_marker_lncRNAs))

lncRNA_marker_type_summary

Show code

write.csv(
  lncRNA_marker_type_summary,
  file.path(results_dir, "Step13_lncRNA_marker_type_summary_by_cluster.csv"),
  row.names = FALSE
)

18 Step 13.15: Visualize lncRNA Marker Expression Patterns

These plots directly address the question of where lncRNAs are expressed across microglia clusters. The heatmap and dot plot should be interpreted as expression-pattern summaries, not as proof of function.

Show code

top_lncRNA_features <- micro_lncRNA_markers_top_per_cluster %>%
  pull(gene) %>%
  unique() %>%
  intersect(rownames(SO_micro))

# Fallback: if few marker lncRNAs pass the marker threshold, use the most detected lncRNAs.
if (length(top_lncRNA_features) < 2) {
  top_lncRNA_features <- lncRNA_detection_summary %>%
    slice_head(n = 20) %>%
    pull(gene) %>%
    unique() %>%
    intersect(rownames(SO_micro))
}

top_lncRNA_features

 [1] "DNAJC6-AS1"      "ENSG00000281120" "LINC02473"       "ENSG00000240687"
 [5] "LINC02882"       "ZFPM2-AS1"       "LINC02712"       "ENSG00000287335"
 [9] "LINC01091"       "ENSG00000302619" "TBC1D22A-DT"     "ENSG00000301432"
[13] "HLA-DRB6"        "ENSG00000296449" "ENSG00000293795" "ENSG00000253557"
[17] "LINC02698"       "LINC00937"       "ENSG00000287255" "ENSG00000308577"
[21] "LINC00499"       "ADAMTS9-AS2"     "ENSG00000286610" "TESHL"          
[25] "SLC44A3-AS1"     "ENSG00000274769" "FENDRR"          "ENSG00000250863"
[29] "ENSG00000245008" "ENSG00000224477"

Show code

if (length(top_lncRNA_features) > 0) {
  DotPlot(
    SO_micro,
    features = top_lncRNA_features,
    assay = "RNA",
    group.by = "seurat_clusters_harmony"
  ) +
    RotatedAxis() +
    guides(
      color = guide_colorbar(title = "Average\nExpression"),
      size = guide_legend(title = "Percent\nExpressed")
    ) +
    labs(
      title = "Top lncRNA Marker Expression by Harmony-Based Microglia Cluster",
      x = "lncRNA",
      y = "Harmony cluster"
    )
} else {
  message("No lncRNA features available for DotPlot.")
}

Show code

if (length(top_lncRNA_features) > 1) {
  SO_micro <- ScaleData(
    SO_micro,
    features = top_lncRNA_features,
    assay = "RNA",
    verbose = FALSE
  )

  DoHeatmap(
    SO_micro,
    features = top_lncRNA_features,
    assay = "RNA",
    group.by = "seurat_clusters_harmony"
  ) +
    NoLegend() +
    labs(title = "Top lncRNA Marker Heatmap by Harmony-Based Microglia Cluster")
} else {
  message("Not enough lncRNA features for Seurat heatmap.")
}

Warning: Different features in new layer data than already exists for
scale.data

Show code

DefaultAssay(SO_micro) <- "RNA"

expr_mat <- GetAssayData(
  SO_micro,
  assay = "RNA",
  layer = "data"
)

lncRNA_genes_use <- intersect(lncRNA_genes, rownames(expr_mat))

lncRNA_detected <- lncRNA_genes_use[
  Matrix::rowSums(expr_mat[lncRNA_genes_use, ] > 0) / ncol(expr_mat) >= 0.01
]

if (length(lncRNA_detected) < 2) {
  stop("Not enough detected lncRNAs for variable lncRNA heatmap.")
}

lncRNA_variance <- Matrix::rowMeans(expr_mat[lncRNA_detected, ]^2) -
  Matrix::rowMeans(expr_mat[lncRNA_detected, ])^2

top_variable_lncRNAs <- names(sort(lncRNA_variance, decreasing = TRUE)) %>%
  head(n = min(1000, length(.)))

# Exclude clusters 2, 3, 4, and 5 to reduce noise in the heatmap
heatmap_clusters <- setdiff(unique(SO_micro$seurat_clusters_harmony), c("2", "3", "4", "5"))
SO_micro_heatmap <- subset(
  SO_micro,
  subset = seurat_clusters_harmony %in% heatmap_clusters
)

SO_micro_heatmap$cluster_condition_sex <- paste(
  "Cl",
  SO_micro_heatmap$seurat_clusters_harmony,
  SO_micro_heatmap$condition,
  SO_micro_heatmap$sex,
  sep = "-"
)

avg_variable_lncRNA <- AverageExpression(
  SO_micro_heatmap,
  assays = "RNA",
  features = top_variable_lncRNAs,
  group.by = "cluster_condition_sex",
  slot = "data",
  verbose = FALSE
)$RNA %>%
  as.matrix()

avg_variable_lncRNA_z <- t(scale(t(avg_variable_lncRNA)))
avg_variable_lncRNA_z[is.na(avg_variable_lncRNA_z)] <- 0
avg_variable_lncRNA_z <- pmax(pmin(avg_variable_lncRNA_z, 2), -2)

# Build annotation directly from metadata
column_annotation <- SO_micro_heatmap@meta.data %>%
  mutate(
    cluster_condition_sex = paste(
      "Cl",
      seurat_clusters_harmony,
      condition,
      sex,
      sep = "-"
    ),
    cluster = as.character(seurat_clusters_harmony),
    condition = as.character(condition),
    sex = as.character(sex)
  ) %>%
  distinct(cluster_condition_sex, cluster, condition, sex) %>%
  filter(cluster_condition_sex %in% colnames(avg_variable_lncRNA_z)) %>%
  arrange(match(cluster_condition_sex, colnames(avg_variable_lncRNA_z)))

column_annotation <- as.data.frame(column_annotation)

rownames(column_annotation) <- column_annotation$cluster_condition_sex
column_annotation$cluster_condition_sex <- NULL

# Force same order as heatmap matrix
column_annotation <- column_annotation[colnames(avg_variable_lncRNA_z), , drop = FALSE]

# Safety check
stopifnot(identical(rownames(column_annotation), colnames(avg_variable_lncRNA_z)))
stopifnot(!any(is.na(column_annotation$cluster)))
stopifnot(!any(is.na(column_annotation$condition)))
stopifnot(!any(is.na(column_annotation$sex)))

column_annotation$cluster <- factor(column_annotation$cluster)
column_annotation$condition <- factor(
  column_annotation$condition,
  levels = c("C", "PD")
)
column_annotation$sex <- factor(
  column_annotation$sex,
  levels = c("Female", "Male", "Unknown")
)

pheatmap::pheatmap(
  avg_variable_lncRNA_z,
  annotation_col = column_annotation,
  show_rownames = FALSE,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  breaks = seq(-2, 2, length.out = 101),
  main = "Top Variable lncRNA Expression Across Microglia Clusters",
  fontsize_col = 6,
  border_color = NA
)

Show code

rm(SO_micro_heatmap)

write.csv(
  top_variable_lncRNAs,
  file.path(results_dir, "Step13_top_variable_lncRNAs_for_heatmap.csv"),
  row.names = FALSE
)

write.csv(
  avg_variable_lncRNA,
  file.path(results_dir, "Step13_average_variable_lncRNA_expression_by_cluster_condition.csv")
)

write.csv(
  avg_variable_lncRNA_z,
  file.path(results_dir, "Step13_average_variable_lncRNA_expression_by_cluster_condition_zscore.csv")
)

19 Step 13.16: Save Microglia Object and Results

Show code

saveRDS(
  SO_micro,
  file.path(results_dir, "SO_micro_step13_fernando_harmony_reclustered_scored.rds")
)

write.csv(
  harmony_cluster_summary,
  file.path(results_dir, "Step13_harmony_cluster_summary.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame.matrix(harmony_cluster_condition_table),
  file.path(results_dir, "Step13_harmony_cluster_condition_table.csv")
)

write.csv(
  as.data.frame.matrix(harmony_cluster_sample_table),
  file.path(results_dir, "Step13_harmony_cluster_sample_table.csv")
)

write.csv(
  as.data.frame.matrix(harmony_cluster_sex_table),
  file.path(results_dir, "Step13_harmony_cluster_sex_table.csv")
)

write.csv(
  data.frame(
    setting = c(
      "input_object",
      "annotation_column",
      "condition_column",
      "sample_column",
      "microglia_labels_used",
      "normalization",
      "variable_features",
      "pca_dimensions_computed",
      "harmony_group_var",
      "dims_use_harmony",
      "resolution_use_harmony",
      "primary_cluster_column",
      "primary_umap_reduction",
      "optional_unintegrated_dims",
      "optional_unintegrated_resolution",
      "lncRNA_annotation_method",
      "lncRNA_context_match_method",
      "n_lncRNA_genes_detected_in_microglia",
      "n_microglia_cells"
    ),
    value = as.character(c(
      fernando_path,
      "Cell_types",
      "condition",
      "Sample",
      paste(target_microglia, collapse = "; "),
      "LogNormalize",
      2000,
      40,
      harmony_group_var,
      paste(dims_use_harmony, collapse = ","),
      resolution_use_harmony,
      "seurat_clusters_harmony",
      "umap_micro_harmony",
      paste(dims_use_unintegrated, collapse = ","),
      resolution_use_unintegrated,
      "GENCODE_v49_genomic_context",
      lncRNA_context_match_method,
      length(lncRNA_genes),
      ncol(SO_micro)
    ))
  ),
  file.path(results_dir, "Step13_analysis_settings.csv"),
  row.names = FALSE
)

20 Interpretation Note

This analysis intentionally separates three concepts:

canonical microglia marker expression

microglia subcluster structure

homeostatic / DAM / inflammatory / phagocytic / interferon response module scores

lncRNA marker expression and lncRNA subtype annotation

After subsetting microglia, canonical microglia markers may not appear as top subcluster markers because they are expected to be broadly expressed across microglia.

Therefore:

CX3CR1 / P2RY12 / TMEM119 / AIF1 / CSF1R expression confirms microglia identity.
DAM / inflammatory / phagocytic / interferon response scores help interpret microglia states.
Differential expression should be performed using the RNA assay, not Harmony-corrected embeddings.

Recent GENCODE releases use a broad lncRNA biotype. Therefore, lncRNA candidates were annotated by genomic context relative to protein-coding genes using GENCODE v49. LncRNAs without protein-coding gene overlap were labeled LINC. LncRNAs overlapping protein-coding genes on the opposite strand were labeled Antisense. Same-strand lncRNAs fully contained within protein-coding gene bodies were labeled Sense_gene_body_contained, and same-strand partial overlaps were labeled Sense_overlapping. These labels are context-based annotations, not original GENCODE subtype labels.