Setup: Environment and Data
Harmony correction is applied at the microglia-only level using sample identity as the grouping variable. Harmony-corrected embeddings are used for clustering and visualization, whereas RNA assay expression values are retained for marker visualization, module scoring, and downstream differential expression.
The biological goal is:
Microglia
→ homeostatic microglia
→ activated / disease-associated microglia (DAM)
→ inflammatory microglia state
Show code
library (Seurat)
library (tidyverse)
library (patchwork)
library (harmony)
library (pheatmap)
library (Matrix)
library (rtracklayer)
library (DT)
set.seed (1234 )
project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"
fernando_path <- file.path (
project_dir,
"lncRNAs_PD_Fer" ,
"snRNAseq" ,
"seurat_object_annotated.rds"
)
results_dir <- file.path (
project_dir,
"results" ,
"Step13_microglia_harmony_reclustering"
)
dir.create (
results_dir,
recursive = TRUE ,
showWarnings = FALSE
)
SO <- readRDS (fernando_path)
SO
An object of class Seurat
87674 features across 46066 samples within 2 assays
Active assay: SCT (39901 features, 3000 variable features)
3 layers present: counts, data, scale.data
1 other assay present: RNA
3 dimensional reductions calculated: pca, umap, harmony
Step 13.2: Inspect Full-Object UMAP
The downstream microglia analysis uses newly computed microglia-only Harmony embeddings.
Show code
available_reductions <- Reductions (SO)
available_reductions
[1] "pca" "umap" "harmony"
Show code
full_umap_reduction <- case_when (
"umap_rpca" %in% available_reductions ~ "umap_rpca" ,
"umap" %in% available_reductions ~ "umap" ,
TRUE ~ NA_character_
)
full_umap_reduction
Show code
if (! is.na (full_umap_reduction)) {
DimPlot (
SO,
reduction = full_umap_reduction,
group.by = "fernando_celltype_annotation" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.4
) +
labs (title = "Annotated Cell Types" )
} else {
message ("No existing UMAP reduction found in Fernando object. Skipping full-object DimPlot." )
}
Step 13.4: Preprocess Microglia
After subsetting microglia, PCA is re-computed within the microglia subset. This PCA is used as input for Harmony correction.
Show code
DefaultAssay (SO_micro) <- "RNA"
if ("Assay5" %in% class (SO_micro[["RNA" ]])) {
SO_micro <- JoinLayers (SO_micro, assay = "RNA" )
}
SO_micro <- NormalizeData (
SO_micro,
verbose = FALSE
)
SO_micro <- FindVariableFeatures (
SO_micro,
selection.method = "vst" ,
nfeatures = 2000 ,
verbose = FALSE
)
SO_micro <- ScaleData (
SO_micro,
verbose = FALSE
)
SO_micro <- RunPCA (
SO_micro,
npcs = 40 ,
reduction.name = "pca_micro" ,
reduction.key = "microPCA_" ,
verbose = FALSE
)
ElbowPlot (
SO_micro,
reduction = "pca_micro" ,
ndims = 40
) +
labs (title = "Fernando-Defined Microglia PCA Elbow Plot" )
Step 13.5: Run Harmony Within Microglia
Harmony is applied within the microglia subset using sample identity as the grouping variable.
Condition is not used as the Harmony grouping variable because disease condition is biologically relevant and should not be directly regressed out at this step.
Show code
harmony_group_var <- "sample"
if (! harmony_group_var %in% colnames (SO_micro@ meta.data)) {
stop (
paste (
"Harmony grouping variable not found in metadata:" ,
harmony_group_var
)
)
}
SO_micro <- RunHarmony (
object = SO_micro,
group.by.vars = harmony_group_var,
reduction = "pca_micro" ,
reduction.save = "harmony_micro" ,
verbose = TRUE
)
Show code
An object of class Seurat
87674 features across 3439 samples within 2 assays
Active assay: RNA (47773 features, 2000 variable features)
3 layers present: counts, data, scale.data
1 other assay present: SCT
5 dimensional reductions calculated: pca, umap, harmony, pca_micro, harmony_micro
Step 13.6: Harmony-Based Microglia Clustering and UMAP
Harmony-corrected embeddings are used for neighbor graph construction, clustering, and UMAP visualization.
Show code
set.seed (1234 )
dims_use_harmony <- 1 : 30
resolution_use_harmony <- 0.5
SO_micro <- FindNeighbors (
SO_micro,
reduction = "harmony_micro" ,
dims = dims_use_harmony,
graph.name = "micro_harmony_snn" ,
verbose = FALSE
)
SO_micro <- FindClusters (
SO_micro,
graph.name = "micro_harmony_snn" ,
resolution = resolution_use_harmony,
verbose = FALSE
)
SO_micro$ seurat_clusters_harmony <- as.character (SO_micro$ seurat_clusters)
SO_micro <- RunUMAP (
SO_micro,
reduction = "harmony_micro" ,
dims = dims_use_harmony,
reduction.name = "umap_micro_harmony" ,
reduction.key = "microHarmonyUMAP_" ,
verbose = FALSE
)
table (SO_micro$ seurat_clusters_harmony)
0 1 2 3 4 5
1523 1191 406 153 142 24
Show code
p_harmony_cluster <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "seurat_clusters_harmony" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.5
) +
labs (title = "Harmony-Corrected Microglia Subclusters" )
p_harmony_condition <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "condition" ,
pt.size = 0.4
) +
labs (title = "Condition Distribution" )
p_harmony_sex <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "sex" ,
pt.size = 0.4
) +
labs (title = "Sex Distribution" )
p_harmony_sample <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "sample" ,
pt.size = 0.4
) +
labs (title = "Sample Distribution" )
p_split_condition <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "seurat_clusters_harmony" ,
split.by = "condition" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.35
) +
labs (title = "Harmony Microglia Clusters Split by Condition" )
p_split_sex <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "seurat_clusters_harmony" ,
split.by = "sex" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.35
) +
labs (title = "Harmony Microglia Clusters Split by Sex" )
p_split_sample <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "seurat_clusters_harmony" ,
split.by = "sample" ,
label = FALSE ,
pt.size = 0.25 ,
ncol = 4
) +
labs (title = "Harmony Microglia Clusters Split by Sample" )
p_harmony_cluster$ layers[[1 ]]$ aes_params$ alpha <- 0.5
p_harmony_condition$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_sex$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_sample$ layers[[1 ]]$ aes_params$ alpha <- 0.4
Show code
p_harmony_cluster | p_harmony_condition
Show code
p_harmony_cluster | p_harmony_sex
Show code
p_harmony_cluster | p_harmony_sample
Optional: Resolution Sensitivity Check
Show code
resolution_grid <- c (0.3 , 0.4 , 0.5 , 0.6 )
SO_micro <- FindClusters (
SO_micro,
graph.name = "micro_harmony_snn" ,
resolution = resolution_grid,
verbose = FALSE
)
resolution_cols <- paste0 ("micro_harmony_snn_res." , resolution_grid)
p_resolution_list <- lapply (seq_along (resolution_grid), function (i) {
res <- resolution_grid[i]
res_col <- resolution_cols[i]
DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = res_col,
label = TRUE ,
repel = TRUE ,
pt.size = 0.35
) +
labs (
title = paste0 ("Resolution = " , res),
x = "UMAP 1" ,
y = "UMAP 2"
)
})
wrap_plots (p_resolution_list, ncol = 2 )
Step 13.6.5: Compare Unintegrated and Harmony-Corrected Microglia UMAPs
This comparison shows whether Harmony correction reduces sample- or condition-driven separation within the microglia subset.
The unintegrated UMAP is computed from microglia-only RNA PCA, whereas the Harmony UMAP is computed from Harmony-corrected microglia embeddings.
Show code
set.seed (1234 )
dims_use_unintegrated <- dims_use_harmony
resolution_use_unintegrated <- resolution_use_harmony
SO_micro <- FindNeighbors (
SO_micro,
reduction = "pca_micro" ,
dims = dims_use_unintegrated,
graph.name = "micro_unintegrated_snn" ,
verbose = FALSE
)
SO_micro <- FindClusters (
SO_micro,
graph.name = "micro_unintegrated_snn" ,
resolution = resolution_use_unintegrated,
cluster.name = "seurat_clusters_unintegrated" ,
verbose = FALSE
)
SO_micro$ seurat_clusters_unintegrated <- as.character (
SO_micro$ seurat_clusters_unintegrated
)
SO_micro <- RunUMAP (
SO_micro,
reduction = "pca_micro" ,
dims = dims_use_unintegrated,
reduction.name = "umap_micro_unintegrated" ,
reduction.key = "microUnintegratedUMAP_" ,
verbose = FALSE
)
table (SO_micro$ seurat_clusters_unintegrated)
0 1 2 3 4 5 6 7 8 9
659 540 491 491 396 367 334 109 50 2
Show code
p_unintegrated_condition <- DimPlot (
SO_micro,
reduction = "umap_micro_unintegrated" ,
group.by = "condition" ,
pt.size = 0.4
) +
labs (
title = "Before Harmony" ,
subtitle = "Microglia-only RNA PCA UMAP" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_harmony_condition_compare <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "condition" ,
pt.size = 0.4
) +
labs (
title = "After Harmony" ,
subtitle = "Microglia-only Harmony UMAP" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_unintegrated_condition$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_condition_compare$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_unintegrated_condition | p_harmony_condition_compare
Show code
p_unintegrated_sample <- DimPlot (
SO_micro,
reduction = "umap_micro_unintegrated" ,
group.by = "sample" ,
pt.size = 0.35
) +
labs (
title = "Before Harmony" ,
subtitle = "Colored by sample" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_harmony_sample_compare <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "sample" ,
pt.size = 0.35
) +
labs (
title = "After Harmony" ,
subtitle = "Colored by sample" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_unintegrated_sample$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_sample_compare$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_unintegrated_sample | p_harmony_sample_compare
Show code
p_unintegrated_cluster <- DimPlot (
SO_micro,
reduction = "umap_micro_unintegrated" ,
group.by = "seurat_clusters_unintegrated" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.4
) +
labs (
title = "Before Harmony" ,
subtitle = "Unintegrated RNA PCA clusters" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_harmony_cluster_compare <- DimPlot (
SO_micro,
reduction = "umap_micro_harmony" ,
group.by = "seurat_clusters_harmony" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.4
) +
labs (
title = "After Harmony" ,
subtitle = "Harmony-based clusters" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_unintegrated_cluster$ layers[[1 ]]$ aes_params$ alpha <- 0.5
p_harmony_cluster_compare$ layers[[1 ]]$ aes_params$ alpha <- 0.5
p_unintegrated_cluster | p_harmony_cluster_compare
Show code
cluster_comparison_table <- table (
unintegrated_cluster = SO_micro$ seurat_clusters_unintegrated,
harmony_cluster = SO_micro$ seurat_clusters_harmony
)
write.csv (
as.data.frame.matrix (cluster_comparison_table),
file.path (results_dir, "Step13_unintegrated_vs_harmony_cluster_table.csv" )
)
cluster_comparison_table
harmony_cluster
unintegrated_cluster 0 1 2 3 4 5
0 155 449 24 9 19 3
1 262 260 9 6 2 1
2 263 46 20 107 37 18
3 224 237 5 7 18 0
4 358 15 2 12 7 2
5 219 119 15 5 9 0
6 4 1 324 4 1 0
7 35 58 6 2 8 0
8 2 5 1 1 41 0
9 1 1 0 0 0 0
Step 13.7: Validate Microglia and Activated Microglia Markers
Here we check expression, not only differential expression.
The point is:
canonical microglia marker expression = confirms broad microglia identity
top DE marker after microglia reclustering = identifies microglia state/subtype differences
After subsetting microglia, canonical microglia markers may not appear as top differential markers between microglia subclusters because they are expected to be broadly expressed across microglia.
Show code
micro_marker_genes <- c (
"CX3CR1" , "P2RY12" , "TMEM119" , "AIF1" , "CSF1R" ,
"TREM2" , "TYROBP" , "CD68" , "SPP1" , "APOE" , "LPL"
)
micro_marker_present <- intersect (
micro_marker_genes,
rownames (SO_micro)
)
if (length (micro_marker_present) == 0 ) {
stop ("None of the microglia marker genes were found in SO_micro." )
}
micro_marker_present
[1] "CX3CR1" "P2RY12" "TMEM119" "AIF1" "CSF1R" "TREM2" "TYROBP"
[8] "CD68" "SPP1" "APOE" "LPL"
Show code
FeaturePlot (
SO_micro,
features = micro_marker_present,
reduction = "umap_micro_harmony" ,
ncol = 4 ,
pt.size = 0.5
)
Show code
DotPlot (
SO_micro,
features = micro_marker_present,
group.by = "seurat_clusters_harmony"
) +
RotatedAxis () +
labs (title = "Canonical and Activated Microglia Marker Expression by Harmony Cluster" )
Step 13.8: Define Microglia Activation Modules
Activated and disease-associated microglia (DAM) states are inferred inside the microglia subset using marker expression and module scores.
Show code
micro_homeostatic_genes <- c (
"CX3CR1" , "P2RY12" , "TMEM119" , "CSF1R" , "HEXB" , "SALL1" , "GPR34"
)
dam_genes <- c (
"TREM2" , "TYROBP" , "SPP1" , "APOE" , "LPL" , "CD9" , "CST7" , "GPNMB" ,
"CTSD" , "ITGAX" , "CLEC7A" , "LGALS3"
)
inflammatory_genes <- c (
"IL1B" , "TNF" , "IL6" , "CCL2" , "CXCL10" , "NFKBIA" , "TNFAIP3" , "SOCS3" ,
"HLA-DRA" , "HLA-DRB1" , "CD74"
)
phagocytic_genes <- c (
"CD68" , "LAMP1" , "CTSD" , "CTSB" , "CTSL" , "LGALS3" , "FCGR3A" , "MSR1"
)
interferon_response_genes <- c (
"IFIT1" , "IFIT2" , "IFIT3" , "ISG15" , "MX1" , "OAS1" , "STAT1" , "IRF7" , "IFI6"
)
module_gene_list <- list (
Micro_homeostatic = intersect (micro_homeostatic_genes, rownames (SO_micro)),
DAM = intersect (dam_genes, rownames (SO_micro)),
Inflammatory = intersect (inflammatory_genes, rownames (SO_micro)),
Phagocytic = intersect (phagocytic_genes, rownames (SO_micro)),
Interferon_response = intersect (interferon_response_genes, rownames (SO_micro))
)
module_gene_list
$Micro_homeostatic
[1] "CX3CR1" "P2RY12" "TMEM119" "CSF1R" "HEXB" "SALL1" "GPR34"
$DAM
[1] "TREM2" "TYROBP" "SPP1" "APOE" "LPL" "CD9" "CST7" "GPNMB"
[9] "CTSD" "ITGAX" "CLEC7A" "LGALS3"
$Inflammatory
[1] "IL1B" "TNF" "IL6" "CCL2" "CXCL10" "NFKBIA"
[7] "TNFAIP3" "SOCS3" "HLA-DRA" "HLA-DRB1" "CD74"
$Phagocytic
[1] "CD68" "LAMP1" "CTSD" "CTSB" "CTSL" "LGALS3" "FCGR3A" "MSR1"
$Interferon_response
[1] "IFIT1" "IFIT2" "IFIT3" "ISG15" "MX1" "OAS1" "STAT1" "IRF7" "IFI6"
Show code
empty_modules <- names (module_gene_list)[lengths (module_gene_list) == 0 ]
if (length (empty_modules) > 0 ) {
warning (
paste (
"These modules have no genes present in the object:" ,
paste (empty_modules, collapse = ", " )
)
)
}
Show code
for (module_name in names (module_gene_list)) {
genes_use <- module_gene_list[[module_name]]
if (length (genes_use) > 0 ) {
SO_micro <- AddModuleScore (
SO_micro,
features = list (genes_use),
name = paste0 (module_name, "_score" )
)
}
}
score_features <- c (
"Micro_homeostatic_score1" ,
"DAM_score1" ,
"Inflammatory_score1" ,
"Phagocytic_score1" ,
"Interferon_response_score1"
)
score_features_present <- intersect (
score_features,
colnames (SO_micro@ meta.data)
)
if (length (score_features_present) == 0 ) {
stop ("No module score columns found." )
}
score_features_present
[1] "Micro_homeostatic_score1" "DAM_score1"
[3] "Inflammatory_score1" "Phagocytic_score1"
[5] "Interferon_response_score1"
Step 13.9: Visualize Microglia Activation Modules
Show code
FeaturePlot (
SO_micro,
features = score_features_present,
reduction = "umap_micro_harmony" ,
ncol = 3 ,
pt.size = 0.5
)
Show code
activation_score_features <- intersect (
c (
"DAM_score1" ,
"Inflammatory_score1" ,
"Phagocytic_score1" ,
"Interferon_response_score1"
),
colnames (SO_micro@ meta.data)
)
VlnPlot (
SO_micro,
features = activation_score_features,
group.by = "seurat_clusters_harmony" ,
pt.size = 0
)
Step 13.10: Summarize Microglia Clusters
We summarize each microglia cluster by:
cell number
module scores
condition composition
sample composition
This helps distinguish robust microglia states from clusters dominated by a single sample.
Show code
harmony_cluster_condition_table <- table (
harmony_cluster = SO_micro$ seurat_clusters_harmony,
condition = SO_micro$ condition
)
harmony_cluster_sample_table <- table (
harmony_cluster = SO_micro$ seurat_clusters_harmony,
sample = SO_micro$ sample
)
harmony_cluster_sex_table <- table (
harmony_cluster = SO_micro$ seurat_clusters_harmony,
sex = SO_micro$ sex
)
harmony_cluster_condition_table
condition
harmony_cluster C PD
0 745 778
1 565 626
2 263 143
3 96 57
4 66 76
5 13 11
Show code
harmony_cluster_sample_table
sample
harmony_cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F
0 243 19 218 162 101 2 62
1 28 37 113 370 17 0 22
2 57 45 62 57 28 14 13
3 15 16 9 14 32 10 13
4 9 1 10 21 22 3 9
5 5 2 0 4 2 0 1
sample
harmony_cluster PDSN02_F PDSN04_F PDSN06_M PDSN07_M PDSN09_M
0 123 285 172 35 101
1 15 260 144 59 126
2 26 36 36 16 16
3 10 11 7 5 11
4 5 13 15 9 25
5 3 2 4 0 1
Show code
harmony_cluster_sex_table
sex
harmony_cluster Female Male Unknown
0 950 573 0
1 475 716 0
2 239 167 0
3 74 79 0
4 47 95 0
5 13 11 0
Show code
cluster_sample_composition <- SO_micro@ meta.data %>%
mutate (
harmony_cluster = as.character (seurat_clusters_harmony)
) %>%
count (
harmony_cluster,
sample,
condition,
sex,
name = "n_cells"
) %>%
group_by (harmony_cluster) %>%
mutate (
cluster_total_cells = sum (n_cells),
percent_of_cluster = 100 * n_cells / cluster_total_cells
) %>%
ungroup () %>%
arrange (
harmony_cluster,
desc (n_cells)
) %>%
mutate (
percent_of_cluster = round (percent_of_cluster, 2 )
)
DT:: datatable (
cluster_sample_composition,
rownames = FALSE ,
filter = "top" ,
options = list (
pageLength = 25 ,
autoWidth = TRUE ,
scrollX = TRUE
)
)
Show code
dominant_sample_summary <- cluster_sample_composition %>%
group_by (harmony_cluster) %>%
slice_max (
order_by = percent_of_cluster,
n = 1 ,
with_ties = FALSE
) %>%
ungroup () %>%
select (
harmony_cluster,
dominant_sample = sample,
dominant_condition = condition,
dominant_sex = sex,
dominant_sample_cells = n_cells,
cluster_total_cells,
dominant_sample_percent = percent_of_cluster
) %>%
arrange (desc (dominant_sample_percent))
DT:: datatable (
dominant_sample_summary,
rownames = FALSE ,
filter = "top" ,
options = list (
pageLength = 10 ,
autoWidth = TRUE ,
scrollX = TRUE
)
)
Show code
write.csv (
cluster_sample_composition,
file.path (results_dir, "Step13_cluster_sample_composition_long.csv" ),
row.names = FALSE
)
write.csv (
dominant_sample_summary,
file.path (results_dir, "Step13_dominant_sample_summary_by_cluster.csv" ),
row.names = FALSE
)
Show code
harmony_cluster_summary <- SO_micro@ meta.data %>%
mutate (
harmony_cluster = as.character (seurat_clusters_harmony)
) %>%
group_by (harmony_cluster) %>%
summarise (
n_cells = n (),
mean_micro_homeostatic = mean (Micro_homeostatic_score1, na.rm = TRUE ),
mean_dam = mean (DAM_score1, na.rm = TRUE ),
mean_inflammatory = mean (Inflammatory_score1, na.rm = TRUE ),
mean_phagocytic = mean (Phagocytic_score1, na.rm = TRUE ),
mean_interferon = mean (Interferon_response_score1, na.rm = TRUE ),
percent_PD = mean (condition == "PD" , na.rm = TRUE ),
percent_C = mean (condition == "C" , na.rm = TRUE ),
percent_female = mean (sex == "Female" , na.rm = TRUE ),
percent_male = mean (sex == "Male" , na.rm = TRUE ),
dominant_sex = names (sort (table (sex), decreasing = TRUE ))[1 ],
n_samples = n_distinct (sample),
dominant_condition = names (sort (table (condition), decreasing = TRUE ))[1 ],
dominant_sample = names (sort (table (sample), decreasing = TRUE ))[1 ],
dominant_sample_percent = max (table (sample)) / n (),
.groups = "drop"
) %>%
arrange (desc (mean_dam))
DT:: datatable (
harmony_cluster_summary,
rownames = FALSE ,
filter = "top" ,
options = list (
pageLength = 10 ,
autoWidth = TRUE ,
scrollX = TRUE
)
)
Show code
Step 13.11: Define Microglia States
Harmony-corrected clusters are used to define microglia states conservatively.
State labels are intentionally left unassigned at this stage and should be manually defined after inspecting:
module scores
canonical and activated marker expression
condition composition
sample composition
cluster robustness
Show code
SO_micro$ micro_cluster_harmony <- as.character (SO_micro$ seurat_clusters_harmony)
dam_clusters <- c ("1" )
homeostatic_clusters <- c ("0" )
other_clusters <- c ("2" , "3" , "4" , "5" )
if (length (c (dam_clusters, homeostatic_clusters, other_clusters)) == 0 ) {
message ("No microglia states assigned yet. Assign cluster IDs after inspecting the results above." )
SO_micro$ micro_state_harmony <- "Unassigned"
} else {
SO_micro$ micro_state_harmony <- case_when (
SO_micro$ micro_cluster_harmony %in% dam_clusters ~ "DAM" ,
SO_micro$ micro_cluster_harmony %in% homeostatic_clusters ~ "Homeostatic" ,
SO_micro$ micro_cluster_harmony %in% other_clusters ~ "Other" ,
TRUE ~ "Unassigned"
)
}
SO_micro$ micro_state_harmony <- factor (
SO_micro$ micro_state_harmony,
levels = c (
"DAM" ,
"Homeostatic" ,
"Other" ,
"Unassigned"
)
)
table (SO_micro$ micro_cluster_harmony, SO_micro$ micro_state_harmony)
DAM Homeostatic Other Unassigned
0 0 1523 0 0
1 1191 0 0 0
2 0 0 406 0
3 0 0 153 0
4 0 0 142 0
5 0 0 24 0
Show code
table (SO_micro$ micro_state_harmony, SO_micro$ condition)
C PD
DAM 565 626
Homeostatic 745 778
Other 438 287
Unassigned 0 0
Show code
micro_state_barcodes <- SO_micro@ meta.data %>%
tibble:: rownames_to_column ("barcode" ) %>%
select (
barcode,
micro_cluster_harmony,
micro_state_harmony,
condition,
sample,
sex
) %>%
arrange (
micro_state_harmony,
micro_cluster_harmony,
sample
)
write.csv (
micro_state_barcodes,
file.path (results_dir, "Step13_microglia_state_barcodes_all.csv" ),
row.names = FALSE
)
# Export one barcode file per microglia state
micro_states_use <- levels (SO_micro$ micro_state_harmony)
for (state in micro_states_use) {
state_safe <- stringr:: str_replace_all (state, "[^A-Za-z0-9]+" , "_" )
state_barcodes <- micro_state_barcodes %>%
filter (micro_state_harmony == state)
write.csv (
state_barcodes,
file.path (
results_dir,
paste0 ("Step13_barcodes_" , state_safe, ".csv" )
),
row.names = FALSE
)
}
table (micro_state_barcodes$ micro_state_harmony)
DAM Homeostatic Other Unassigned
1191 1523 725 0
Step 13.12: Add External GENCODE Annotation for Context-Based lncRNA Subtype Classification
Recent GENCODE releases use a broad lncRNA gene biotype and do not always preserve older lncRNA subtypes such as lincRNA, antisense, sense_intronic, or sense_overlapping.
Therefore, lncRNAs are classified here by genomic context relative to protein-coding genes using the comprehensive GENCODE annotation.
The context-based categories are:
LINC
= lncRNA genes without overlap with protein-coding genes
Antisense
= lncRNA genes overlapping protein-coding genes on the opposite strand
Sense_gene_body_contained
= lncRNA genes fully contained within protein-coding genes on the same strand
Sense_overlapping
= lncRNA genes overlapping protein-coding genes on the same strand, but not fully intronic
Other_lncRNA_context
= lncRNA genes that do not clearly fit the above categories
Show code
reference_dir <- file.path (project_dir, "reference" )
dir.create (reference_dir, recursive = TRUE , showWarnings = FALSE )
gencode_comprehensive_gtf_path <- file.path (
project_dir,
"reference" ,
"gencode.v49.annotation.gtf.gz"
)
if (! file.exists (gencode_comprehensive_gtf_path)) {
stop ("GENCODE comprehensive GTF file not found. Download it from Terminal first." )
}
gencode_all_gtf <- rtracklayer:: import (gencode_comprehensive_gtf_path)
gencode_all_genes <- as.data.frame (gencode_all_gtf) %>%
filter (type == "gene" ) %>%
as_tibble () %>%
transmute (
seqnames = as.character (seqnames),
start = start,
end = end,
strand = as.character (strand),
gene_id = gene_id,
gene_id_clean = stringr:: str_replace (gene_id, " \\ ..*$" , "" ),
gene_name = gene_name,
gene_type = gene_type
) %>%
distinct ()
gencode_gene_type_summary <- gencode_all_genes %>%
count (gene_type, name = "n_genes" ) %>%
arrange (desc (n_genes))
write.csv (
gencode_gene_type_summary,
file.path (results_dir, "Step13_GENCODE_gene_type_summary.csv" ),
row.names = FALSE
)
gencode_gene_type_summary %>%
filter (gene_type %in% c ("protein_coding" , "lncRNA" ))
Show code
library (GenomicRanges)
lncRNA_gr <- gencode_all_genes %>%
filter (gene_type == "lncRNA" ) %>%
makeGRangesFromDataFrame (
keep.extra.columns = TRUE ,
seqnames.field = "seqnames" ,
start.field = "start" ,
end.field = "end" ,
strand.field = "strand"
)
protein_coding_gr <- gencode_all_genes %>%
filter (gene_type == "protein_coding" ) %>%
makeGRangesFromDataFrame (
keep.extra.columns = TRUE ,
seqnames.field = "seqnames" ,
start.field = "start" ,
end.field = "end" ,
strand.field = "strand"
)
# Any overlap with protein-coding genes, ignoring strand.
any_pc_overlap_hits <- findOverlaps (
lncRNA_gr,
protein_coding_gr,
ignore.strand = TRUE
)
overlap_pairs <- tibble (
lncRNA_gene_id = mcols (lncRNA_gr)$ gene_id_clean[queryHits (any_pc_overlap_hits)],
lncRNA_gene_name = mcols (lncRNA_gr)$ gene_name[queryHits (any_pc_overlap_hits)],
lncRNA_strand = as.character (strand (lncRNA_gr))[queryHits (any_pc_overlap_hits)],
lncRNA_start = start (lncRNA_gr)[queryHits (any_pc_overlap_hits)],
lncRNA_end = end (lncRNA_gr)[queryHits (any_pc_overlap_hits)],
pc_gene_id = mcols (protein_coding_gr)$ gene_id_clean[subjectHits (any_pc_overlap_hits)],
pc_gene_name = mcols (protein_coding_gr)$ gene_name[subjectHits (any_pc_overlap_hits)],
pc_strand = as.character (strand (protein_coding_gr))[subjectHits (any_pc_overlap_hits)],
pc_start = start (protein_coding_gr)[subjectHits (any_pc_overlap_hits)],
pc_end = end (protein_coding_gr)[subjectHits (any_pc_overlap_hits)]
)
# Opposite-strand protein-coding overlap.
antisense_lncRNA_ids <- overlap_pairs %>%
filter (
lncRNA_strand != pc_strand,
lncRNA_strand %in% c ("+" , "-" ),
pc_strand %in% c ("+" , "-" )
) %>%
pull (lncRNA_gene_id) %>%
unique ()
# Same-strand protein-coding overlap.
same_strand_pairs <- overlap_pairs %>%
filter (
lncRNA_strand == pc_strand,
lncRNA_strand %in% c ("+" , "-" ),
pc_strand %in% c ("+" , "-" )
)
# Sense-intronic:
# lncRNA is fully contained within a protein-coding gene on the same strand.
sense_intronic_lncRNA_ids <- same_strand_pairs %>%
filter (
lncRNA_start >= pc_start,
lncRNA_end <= pc_end
) %>%
pull (lncRNA_gene_id) %>%
unique ()
# Sense-overlapping:
# lncRNA overlaps a protein-coding gene on the same strand,
# but is not fully contained within it.
sense_overlapping_lncRNA_ids <- same_strand_pairs %>%
filter (
! (lncRNA_start >= pc_start & lncRNA_end <= pc_end)
) %>%
pull (lncRNA_gene_id) %>%
unique ()
# LINC:
# lncRNA has no overlap with any protein-coding gene.
pc_overlapping_lncRNA_ids <- overlap_pairs %>%
pull (lncRNA_gene_id) %>%
unique ()
linc_lncRNA_ids <- setdiff (
mcols (lncRNA_gr)$ gene_id_clean,
pc_overlapping_lncRNA_ids
)
# Priority:
# 1. Antisense
# 2. Sense_gene_body_contained
# 3. Sense_overlapping
# 4. LINC
# 5. Other_lncRNA_context
#
# If one lncRNA overlaps multiple protein-coding genes in multiple ways,
# the most interpretable overlap class is assigned by this priority.
gencode_lncRNA_context_anno <- tibble (
gene_id_clean = mcols (lncRNA_gr)$ gene_id_clean,
gencode_gene_id = mcols (lncRNA_gr)$ gene_id,
gencode_gene_name = mcols (lncRNA_gr)$ gene_name,
gencode_gene_type = mcols (lncRNA_gr)$ gene_type,
lncRNA_type = case_when (
gene_id_clean %in% antisense_lncRNA_ids ~ "Antisense" ,
gene_id_clean %in% sense_intronic_lncRNA_ids ~ "Sense_gene_body_contained" ,
gene_id_clean %in% sense_overlapping_lncRNA_ids ~ "Sense_overlapping" ,
gene_id_clean %in% linc_lncRNA_ids ~ "LINC" ,
TRUE ~ "Other_lncRNA_context"
)
)
table (gencode_lncRNA_context_anno$ lncRNA_type)
write.csv (
gencode_lncRNA_context_anno,
file.path (results_dir, "Step13_GENCODE_lncRNA_context_annotation.csv" ),
row.names = FALSE
)
write.csv (
overlap_pairs,
file.path (results_dir, "Step13_GENCODE_lncRNA_protein_coding_overlap_pairs.csv" ),
row.names = FALSE
)
Show code
gene_meta <- SO_micro[["RNA" ]][[]] %>%
tibble:: rownames_to_column ("gene" ) %>%
mutate (
gene_id_clean = stringr:: str_replace (gene, " \\ ..*$" , "" )
)
# Match by Ensembl gene ID.
gene_meta_context_ensembl <- gene_meta %>%
left_join (
gencode_lncRNA_context_anno,
by = "gene_id_clean"
)
n_context_matched_by_ensembl <- sum (! is.na (gene_meta_context_ensembl$ lncRNA_type))
# Match by gene symbol.
# Keep both the Seurat gene name and the GENCODE gene name.
gene_meta_context_symbol <- gene_meta %>%
left_join (
gencode_lncRNA_context_anno %>%
mutate (gencode_gene_name_join = gencode_gene_name) %>%
select (
gencode_gene_name_join,
gencode_gene_id,
gencode_gene_name,
gene_id_clean_gencode = gene_id_clean,
lncRNA_type
),
by = c ("gene" = "gencode_gene_name_join" )
)
n_context_matched_by_symbol <- sum (! is.na (gene_meta_context_symbol$ lncRNA_type))
n_context_matched_by_ensembl
Show code
n_context_matched_by_symbol
Show code
if (n_context_matched_by_ensembl >= n_context_matched_by_symbol) {
gene_meta <- gene_meta_context_ensembl
lncRNA_context_match_method <- "Ensembl_gene_id"
} else {
gene_meta <- gene_meta_context_symbol
lncRNA_context_match_method <- "gene_symbol"
}
lncRNA_context_match_method
Show code
gene_meta <- gene_meta %>%
mutate (
lncRNA_type = if_else (
is.na (lncRNA_type),
"Not_lncRNA_or_unmatched" ,
lncRNA_type
),
is_lncRNA = lncRNA_type != "Not_lncRNA_or_unmatched"
)
lncRNA_genes <- gene_meta %>%
filter (is_lncRNA) %>%
pull (gene) %>%
intersect (rownames (SO_micro))
length (lncRNA_genes)
Show code
table (gene_meta$ lncRNA_type)
Antisense LINC Not_lncRNA_or_unmatched
9465 12339 24043
Sense_gene_body_contained Sense_overlapping
1498 444
Show code
write.csv (
gene_meta %>%
filter (is_lncRNA),
file.path (results_dir, "Step13_microglia_lncRNA_gene_annotation_context_based.csv" ),
row.names = FALSE
)
write.csv (
as.data.frame (table (gene_meta$ lncRNA_type)),
file.path (results_dir, "Step13_lncRNA_type_counts_context_based.csv" ),
row.names = FALSE
)
Show code
lncRNA_type_detection_summary <- gene_meta %>%
filter (is_lncRNA) %>%
count (lncRNA_type, name = "n_genes" ) %>%
arrange (desc (n_genes))
lncRNA_type_detection_summary
Show code
write.csv (
lncRNA_type_detection_summary,
file.path (results_dir, "Step13_lncRNA_context_type_summary.csv" ),
row.names = FALSE
)
Show code
get_assay_data_safe <- function (object, assay = "RNA" , layer_or_slot = "counts" ) {
tryCatch (
GetAssayData (object, assay = assay, layer = layer_or_slot),
error = function (e) {
GetAssayData (object, assay = assay, slot = layer_or_slot)
}
)
}
counts_mat <- get_assay_data_safe (
SO_micro,
assay = "RNA" ,
layer_or_slot = "counts"
)
gene_detection_summary <- tibble (
gene = rownames (counts_mat),
n_cells_detected = Matrix:: rowSums (counts_mat > 0 ),
pct_cells_detected = Matrix:: rowSums (counts_mat > 0 ) / ncol (counts_mat)
) %>%
left_join (
gene_meta %>%
select (
gene,
gencode_gene_id,
gencode_gene_name,
lncRNA_type,
is_lncRNA
),
by = "gene"
)
lncRNA_detection_summary <- gene_detection_summary %>%
filter (is_lncRNA) %>%
arrange (desc (n_cells_detected))
lncRNA_detection_summary %>%
slice_head (n = 20 )
Show code
write.csv (
lncRNA_detection_summary,
file.path (results_dir, "Step13_microglia_lncRNA_detection_summary.csv" ),
row.names = FALSE
)
Step 13.13: Global lncRNA Expression Fraction in Microglia
This section adds a per-cell lncRNA expression fraction. This helps check whether lncRNA expression is broadly distributed or concentrated in specific clusters, samples, or conditions.
Show code
if (length (lncRNA_genes) == 0 ) {
stop ("No lncRNA genes were identified from the available gene annotation." )
}
SO_micro[["percent_lncRNA" ]] <- PercentageFeatureSet (
SO_micro,
features = lncRNA_genes,
assay = "RNA"
)
summary (SO_micro$ percent_lncRNA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4.512 9.968 11.515 11.935 13.430 35.765
Show code
FeaturePlot (
SO_micro,
features = "percent_lncRNA" ,
reduction = "umap_micro_harmony" ,
pt.size = 0.5
) +
labs (title = "Percent lncRNA Expression in Microglia" )
Show code
p_lnc_cluster <- VlnPlot (
SO_micro,
features = "percent_lncRNA" ,
group.by = "seurat_clusters_harmony" ,
pt.size = 0
) +
labs (title = "Percent lncRNA by Cluster" )
p_lnc_condition <- VlnPlot (
SO_micro,
features = "percent_lncRNA" ,
group.by = "condition" ,
pt.size = 0
) +
labs (title = "Percent lncRNA by Condition" )
p_lnc_cluster | p_lnc_condition
Step 13.14: Identify lncRNA Markers of Microglia Clusters
Here we identify lncRNAs that are enriched in each microglia cluster.
Harmony is used only to define clusters. Differential expression is performed on the RNA assay.
Show code
DefaultAssay (SO_micro) <- "RNA"
Idents (SO_micro) <- "seurat_clusters_harmony"
micro_harmony_markers <- FindAllMarkers (
SO_micro,
assay = "RNA" ,
only.pos = TRUE ,
min.pct = 0.05 ,
logfc.threshold = 0.25 ,
verbose = FALSE
)
micro_lncRNA_markers <- micro_harmony_markers %>%
left_join (
gene_meta %>%
select (
gene,
gencode_gene_id,
gencode_gene_name,
lncRNA_type,
is_lncRNA
),
by = "gene"
) %>%
filter (
is_lncRNA,
p_val_adj < 0.05
) %>%
mutate (
pct_diff = pct.1 - pct.2
) %>%
arrange (
cluster,
desc (avg_log2FC),
desc (pct_diff),
p_val_adj
)
micro_lncRNA_markers_top_strict <- micro_lncRNA_markers %>%
filter (pct_diff > 0.05 ) %>%
group_by (cluster) %>%
slice_head (n = 5 ) %>%
ungroup ()
micro_lncRNA_markers_top_per_cluster <- micro_lncRNA_markers %>%
group_by (cluster) %>%
slice_head (n = 5 ) %>%
ungroup ()
write.csv (
micro_harmony_markers,
file.path (results_dir, "Step13_harmony_cluster_all_markers.csv" ),
row.names = FALSE
)
write.csv (
micro_lncRNA_markers,
file.path (results_dir, "Step13_harmony_cluster_lncRNA_markers.csv" ),
row.names = FALSE
)
write.csv (
micro_lncRNA_markers_top_strict,
file.path (results_dir, "Step13_harmony_cluster_top_lncRNA_markers_strict_pctdiff.csv" ),
row.names = FALSE
)
write.csv (
micro_lncRNA_markers_top_per_cluster,
file.path (results_dir, "Step13_harmony_cluster_top_lncRNA_markers_per_cluster.csv" ),
row.names = FALSE
)
Show code
lncRNA_marker_type_summary <- micro_lncRNA_markers %>%
count (cluster, lncRNA_type, name = "n_marker_lncRNAs" ) %>%
arrange (cluster, desc (n_marker_lncRNAs))
lncRNA_marker_type_summary
Show code
write.csv (
lncRNA_marker_type_summary,
file.path (results_dir, "Step13_lncRNA_marker_type_summary_by_cluster.csv" ),
row.names = FALSE
)
Step 13.15: Visualize lncRNA Marker Expression Patterns
These plots directly address the question of where lncRNAs are expressed across microglia clusters. The heatmap and dot plot should be interpreted as expression-pattern summaries, not as proof of function.
Show code
top_lncRNA_features <- micro_lncRNA_markers_top_per_cluster %>%
pull (gene) %>%
unique () %>%
intersect (rownames (SO_micro))
# Fallback: if few marker lncRNAs pass the marker threshold, use the most detected lncRNAs.
if (length (top_lncRNA_features) < 2 ) {
top_lncRNA_features <- lncRNA_detection_summary %>%
slice_head (n = 20 ) %>%
pull (gene) %>%
unique () %>%
intersect (rownames (SO_micro))
}
top_lncRNA_features
[1] "DNAJC6-AS1" "ENSG00000281120" "LINC02473" "ENSG00000240687"
[5] "LINC02882" "ZFPM2-AS1" "LINC02712" "ENSG00000287335"
[9] "LINC01091" "ENSG00000302619" "TBC1D22A-DT" "ENSG00000301432"
[13] "HLA-DRB6" "ENSG00000296449" "ENSG00000293795" "ENSG00000253557"
[17] "LINC02698" "LINC00937" "ENSG00000287255" "ENSG00000308577"
[21] "LINC00499" "ADAMTS9-AS2" "ENSG00000286610" "TESHL"
[25] "SLC44A3-AS1" "ENSG00000274769" "FENDRR" "ENSG00000250863"
[29] "ENSG00000245008" "ENSG00000224477"
Show code
if (length (top_lncRNA_features) > 0 ) {
DotPlot (
SO_micro,
features = top_lncRNA_features,
assay = "RNA" ,
group.by = "seurat_clusters_harmony"
) +
RotatedAxis () +
guides (
color = guide_colorbar (title = "Average \n Expression" ),
size = guide_legend (title = "Percent \n Expressed" )
) +
labs (
title = "Top lncRNA Marker Expression by Harmony-Based Microglia Cluster" ,
x = "lncRNA" ,
y = "Harmony cluster"
)
} else {
message ("No lncRNA features available for DotPlot." )
}
Show code
if (length (top_lncRNA_features) > 1 ) {
SO_micro <- ScaleData (
SO_micro,
features = top_lncRNA_features,
assay = "RNA" ,
verbose = FALSE
)
DoHeatmap (
SO_micro,
features = top_lncRNA_features,
assay = "RNA" ,
group.by = "seurat_clusters_harmony"
) +
NoLegend () +
labs (title = "Top lncRNA Marker Heatmap by Harmony-Based Microglia Cluster" )
} else {
message ("Not enough lncRNA features for Seurat heatmap." )
}
Warning: Different features in new layer data than already exists for
scale.data
Show code
DefaultAssay (SO_micro) <- "RNA"
expr_mat <- GetAssayData (
SO_micro,
assay = "RNA" ,
layer = "data"
)
lncRNA_genes_use <- intersect (lncRNA_genes, rownames (expr_mat))
lncRNA_detected <- lncRNA_genes_use[
Matrix:: rowSums (expr_mat[lncRNA_genes_use, ] > 0 ) / ncol (expr_mat) >= 0.01
]
if (length (lncRNA_detected) < 2 ) {
stop ("Not enough detected lncRNAs for variable lncRNA heatmap." )
}
lncRNA_variance <- Matrix:: rowMeans (expr_mat[lncRNA_detected, ]^ 2 ) -
Matrix:: rowMeans (expr_mat[lncRNA_detected, ])^ 2
top_variable_lncRNAs <- names (sort (lncRNA_variance, decreasing = TRUE )) %>%
head (n = min (1000 , length (.)))
# Exclude clusters 2, 3, 4, and 5 to reduce noise in the heatmap
heatmap_clusters <- setdiff (unique (SO_micro$ seurat_clusters_harmony), c ("2" , "3" , "4" , "5" ))
SO_micro_heatmap <- subset (
SO_micro,
subset = seurat_clusters_harmony %in% heatmap_clusters
)
SO_micro_heatmap$ cluster_condition_sex <- paste (
"Cl" ,
SO_micro_heatmap$ seurat_clusters_harmony,
SO_micro_heatmap$ condition,
SO_micro_heatmap$ sex,
sep = "-"
)
avg_variable_lncRNA <- AverageExpression (
SO_micro_heatmap,
assays = "RNA" ,
features = top_variable_lncRNAs,
group.by = "cluster_condition_sex" ,
slot = "data" ,
verbose = FALSE
)$ RNA %>%
as.matrix ()
avg_variable_lncRNA_z <- t (scale (t (avg_variable_lncRNA)))
avg_variable_lncRNA_z[is.na (avg_variable_lncRNA_z)] <- 0
avg_variable_lncRNA_z <- pmax (pmin (avg_variable_lncRNA_z, 2 ), - 2 )
# Build annotation directly from metadata
column_annotation <- SO_micro_heatmap@ meta.data %>%
mutate (
cluster_condition_sex = paste (
"Cl" ,
seurat_clusters_harmony,
condition,
sex,
sep = "-"
),
cluster = as.character (seurat_clusters_harmony),
condition = as.character (condition),
sex = as.character (sex)
) %>%
distinct (cluster_condition_sex, cluster, condition, sex) %>%
filter (cluster_condition_sex %in% colnames (avg_variable_lncRNA_z)) %>%
arrange (match (cluster_condition_sex, colnames (avg_variable_lncRNA_z)))
column_annotation <- as.data.frame (column_annotation)
rownames (column_annotation) <- column_annotation$ cluster_condition_sex
column_annotation$ cluster_condition_sex <- NULL
# Force same order as heatmap matrix
column_annotation <- column_annotation[colnames (avg_variable_lncRNA_z), , drop = FALSE ]
# Safety check
stopifnot (identical (rownames (column_annotation), colnames (avg_variable_lncRNA_z)))
stopifnot (! any (is.na (column_annotation$ cluster)))
stopifnot (! any (is.na (column_annotation$ condition)))
stopifnot (! any (is.na (column_annotation$ sex)))
column_annotation$ cluster <- factor (column_annotation$ cluster)
column_annotation$ condition <- factor (
column_annotation$ condition,
levels = c ("C" , "PD" )
)
column_annotation$ sex <- factor (
column_annotation$ sex,
levels = c ("Female" , "Male" , "Unknown" )
)
pheatmap:: pheatmap (
avg_variable_lncRNA_z,
annotation_col = column_annotation,
show_rownames = FALSE ,
cluster_rows = TRUE ,
cluster_cols = TRUE ,
breaks = seq (- 2 , 2 , length.out = 101 ),
main = "Top Variable lncRNA Expression Across Microglia Clusters" ,
fontsize_col = 6 ,
border_color = NA
)
Show code
rm (SO_micro_heatmap)
write.csv (
top_variable_lncRNAs,
file.path (results_dir, "Step13_top_variable_lncRNAs_for_heatmap.csv" ),
row.names = FALSE
)
write.csv (
avg_variable_lncRNA,
file.path (results_dir, "Step13_average_variable_lncRNA_expression_by_cluster_condition.csv" )
)
write.csv (
avg_variable_lncRNA_z,
file.path (results_dir, "Step13_average_variable_lncRNA_expression_by_cluster_condition_zscore.csv" )
)
Step 13.16: Save Microglia Object and Results
Show code
saveRDS (
SO_micro,
file.path (results_dir, "SO_micro_step13_fernando_harmony_reclustered_scored.rds" )
)
write.csv (
harmony_cluster_summary,
file.path (results_dir, "Step13_harmony_cluster_summary.csv" ),
row.names = FALSE
)
write.csv (
as.data.frame.matrix (harmony_cluster_condition_table),
file.path (results_dir, "Step13_harmony_cluster_condition_table.csv" )
)
write.csv (
as.data.frame.matrix (harmony_cluster_sample_table),
file.path (results_dir, "Step13_harmony_cluster_sample_table.csv" )
)
write.csv (
as.data.frame.matrix (harmony_cluster_sex_table),
file.path (results_dir, "Step13_harmony_cluster_sex_table.csv" )
)
write.csv (
data.frame (
setting = c (
"input_object" ,
"annotation_column" ,
"condition_column" ,
"sample_column" ,
"microglia_labels_used" ,
"normalization" ,
"variable_features" ,
"pca_dimensions_computed" ,
"harmony_group_var" ,
"dims_use_harmony" ,
"resolution_use_harmony" ,
"primary_cluster_column" ,
"primary_umap_reduction" ,
"optional_unintegrated_dims" ,
"optional_unintegrated_resolution" ,
"lncRNA_annotation_method" ,
"lncRNA_context_match_method" ,
"n_lncRNA_genes_detected_in_microglia" ,
"n_microglia_cells"
),
value = as.character (c (
fernando_path,
"Cell_types" ,
"condition" ,
"Sample" ,
paste (target_microglia, collapse = "; " ),
"LogNormalize" ,
2000 ,
40 ,
harmony_group_var,
paste (dims_use_harmony, collapse = "," ),
resolution_use_harmony,
"seurat_clusters_harmony" ,
"umap_micro_harmony" ,
paste (dims_use_unintegrated, collapse = "," ),
resolution_use_unintegrated,
"GENCODE_v49_genomic_context" ,
lncRNA_context_match_method,
length (lncRNA_genes),
ncol (SO_micro)
))
),
file.path (results_dir, "Step13_analysis_settings.csv" ),
row.names = FALSE
)
Interpretation Note
This analysis intentionally separates three concepts:
canonical microglia marker expression
microglia subcluster structure
homeostatic / DAM / inflammatory / phagocytic / interferon response module scores
lncRNA marker expression and lncRNA subtype annotation
After subsetting microglia, canonical microglia markers may not appear as top subcluster markers because they are expected to be broadly expressed across microglia.
Therefore:
CX3CR1 / P2RY12 / TMEM119 / AIF1 / CSF1R expression confirms microglia identity.
DAM / inflammatory / phagocytic / interferon response scores help interpret microglia states.
Differential expression should be performed using the RNA assay, not Harmony-corrected embeddings.
Recent GENCODE releases use a broad lncRNA biotype. Therefore, lncRNA candidates were annotated by genomic context relative to protein-coding genes using GENCODE v49. LncRNAs without protein-coding gene overlap were labeled LINC. LncRNAs overlapping protein-coding genes on the opposite strand were labeled Antisense. Same-strand lncRNAs fully contained within protein-coding gene bodies were labeled Sense_gene_body_contained, and same-strand partial overlaps were labeled Sense_overlapping. These labels are context-based annotations, not original GENCODE subtype labels.