Step 11.5: Astrocyte State Analysis

Harmony-based reclustering, reactive/stress-like state assessment, and lncRNA expression profiling in astrocytes

1 Setup: Environment and Data

The purpose is to make the astrocyte analysis reproducible from the lab reference object, instead of starting from an independently re-annotated object. After subsetting astrocytes, dimensional reduction and clustering are re-run within the astrocyte compartment.

Harmony correction is applied at the astrocyte-only level using sample identity as the grouping variable. Harmony-corrected embeddings are used for clustering and visualization, whereas RNA assay expression values are retained for marker visualization, module scoring, and downstream differential expression.

The biological goal remains:

Astrocyte
→ reactive astrocyte
→ stressed / inflammatory astrocyte state

This trajectory should be interpreted as an astrocyte reactivity trajectory, not a neuronal degeneration trajectory.

Show code
library(Seurat)
library(tidyverse)
library(patchwork)
library(harmony)
library(pheatmap)
library(Matrix)
library(rtracklayer)

set.seed(1234)

project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"

fernando_path <- file.path(
  project_dir,
  "lncRNAs_PD_Fer",
  "snRNAseq",
  "seurat_object_annotated.rds"
)

results_dir <- file.path(
  project_dir,
  "results",
  "Step11_5_fernando_astrocyte_harmony_reclustering"
)

dir.create(
  results_dir,
  recursive = TRUE,
  showWarnings = FALSE
)

SO <- readRDS(fernando_path)

SO
An object of class Seurat 
87674 features across 46066 samples within 2 assays 
Active assay: SCT (39901 features, 3000 variable features)
 3 layers present: counts, data, scale.data
 1 other assay present: RNA
 3 dimensional reductions calculated: pca, umap, harmony

2 Step 11.1: Check Metadata in Object

This analysis requires three metadata columns:

Cell_types = cell-type annotation
condition  = disease condition, such as C or PD
Sample     = sample identity used for Harmony correction
Show code
required_metadata <- c("Cell_types", "condition", "Sample")

missing_metadata <- setdiff(
  required_metadata,
  colnames(SO@meta.data)
)

if (length(missing_metadata) > 0) {
  stop(
    paste(
      "Missing required metadata columns:",
      paste(missing_metadata, collapse = ", ")
    )
  )
}

SO$fernando_celltype_annotation <- as.character(SO$Cell_types)
SO$condition_use <- as.character(SO$condition)
SO$sample_use <- as.character(SO$Sample)

table(SO$fernando_celltype_annotation)
table(SO$condition_use)
table(SO$sample_use)
Show code
write.csv(
  as.data.frame(table(SO$fernando_celltype_annotation)),
  file.path(results_dir, "Step11_5_fernando_celltype_annotation_table.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO$condition_use)),
  file.path(results_dir, "Step11_5_condition_table.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO$sample_use)),
  file.path(results_dir, "Step11_5_sample_table.csv"),
  row.names = FALSE
)

3 Step 11.2: Inspect Full-Object UMAP

The downstream astrocyte analysis uses newly computed astrocyte-only Harmony embeddings.

Show code
available_reductions <- Reductions(SO)
available_reductions
[1] "pca"     "umap"    "harmony"
Show code
full_umap_reduction <- case_when(
  "umap_rpca" %in% available_reductions ~ "umap_rpca",
  "umap" %in% available_reductions ~ "umap",
  TRUE ~ NA_character_
)

full_umap_reduction
[1] "umap"
Show code
if (!is.na(full_umap_reduction)) {
  DimPlot(
    SO,
    reduction = full_umap_reduction,
    group.by = "fernando_celltype_annotation",
    label = TRUE,
    repel = TRUE,
    pt.size = 0.4
  ) +
    labs(title = "Annotated Cell Types")
} else {
  message("No existing UMAP reduction found in Fernando object. Skipping full-object DimPlot.")
}

4 Step 11.3: Extract Astrocytes

Show code
all_fernando_celltypes <- sort(unique(SO$fernando_celltype_annotation))

target_astrocytes <- all_fernando_celltypes[
  stringr::str_detect(
    all_fernando_celltypes,
    regex("astro", ignore_case = TRUE)
  )
]

target_astrocytes

if (length(target_astrocytes) == 0) {
  stop("No Fernando-defined astrocyte labels found. Check table(SO$fernando_celltype_annotation).")
}

SO_astro <- subset(
  SO,
  subset = fernando_celltype_annotation %in% target_astrocytes
)

if (ncol(SO_astro) == 0) {
  stop("No astrocyte cells remained after subsetting.")
}

SO_astro$astro_original_identity <- SO_astro$fernando_celltype_annotation
SO_astro$condition <- SO_astro$condition_use
SO_astro$sample <- SO_astro$sample_use
SO_astro$sex <- case_when(
  stringr::str_detect(SO_astro$sample, "_F$") ~ "Female",
  stringr::str_detect(SO_astro$sample, "_M$") ~ "Male",
  TRUE ~ "Unknown"
)

SO_astro$sex <- factor(
  SO_astro$sex,
  levels = c("Female", "Male", "Unknown")
)

table(SO_astro$sex)
table(SO_astro$condition, SO_astro$sex)

table(SO_astro$astro_original_identity)
table(SO_astro$condition)
table(SO_astro$sample)

SO_astro
Show code
write.csv(
  data.frame(cell = Cells(SO_astro)),
  file.path(results_dir, "Step11_5_fernando_defined_astrocyte_cells.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO_astro$astro_original_identity, SO_astro$condition)),
  file.path(results_dir, "Step11_5_astrocyte_label_condition_table.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(SO_astro$astro_original_identity, SO_astro$sample)),
  file.path(results_dir, "Step11_5_astrocyte_label_sample_table.csv"),
  row.names = FALSE
)
Show code
if (!is.na(full_umap_reduction)) {
  p_original_label <- DimPlot(
    SO_astro,
    reduction = full_umap_reduction,
    group.by = "astro_original_identity",
    label = TRUE,
    repel = TRUE,
    pt.size = 0.5
  ) +
    labs(title = "Fernando-Defined Astrocyte Subset")

  p_original_condition <- DimPlot(
    SO_astro,
    reduction = full_umap_reduction,
    group.by = "condition",
    pt.size = 0.5
  ) +
    labs(title = "Condition Distribution in Fernando-Defined Astrocytes")

  p_original_condition$layers[[1]]$aes_params$alpha <- 0.3

  p_original_label | p_original_condition
} else {
  message("No existing UMAP reduction found. Skipping pre-reclustering astrocyte UMAP.")
}

Show code
rm(SO)
gc()
            used  (Mb) gc trigger (Mb) limit (Mb)  max used   (Mb)
Ncells  10727734 573.0   16925414  904         NA  16925414  904.0
Vcells 104340638 796.1  917098640 6997     102400 905595467 6909.2

5 Step 11.4: Preprocess Astrocytes

After subsetting astrocytes, PCA is re-computed within the astrocyte subset. This PCA is used as input for Harmony correction.

Show code
DefaultAssay(SO_astro) <- "RNA"

if ("Assay5" %in% class(SO_astro[["RNA"]])) {
  SO_astro <- JoinLayers(SO_astro, assay = "RNA")
}

SO_astro <- NormalizeData(
  SO_astro,
  verbose = FALSE
)

SO_astro <- FindVariableFeatures(
  SO_astro,
  selection.method = "vst",
  nfeatures = 2000,
  verbose = FALSE
)

SO_astro <- ScaleData(
  SO_astro,
  verbose = FALSE
)

SO_astro <- RunPCA(
  SO_astro,
  npcs = 40,
  reduction.name = "pca_astro",
  reduction.key = "astroPCA_",
  verbose = FALSE
)

ElbowPlot(
  SO_astro,
  reduction = "pca_astro",
  ndims = 40
) +
  labs(title = "Fernando-Defined Astrocyte PCA Elbow Plot")

6 Step 11.5: Run Harmony Within Astrocytes

Harmony is applied within the astrocyte subset using sample identity as the grouping variable.

Condition is not used as the Harmony grouping variable because disease condition is biologically relevant and should not be directly regressed out at this step.

Show code
harmony_group_var <- "sample"

if (!harmony_group_var %in% colnames(SO_astro@meta.data)) {
  stop(
    paste(
      "Harmony grouping variable not found in metadata:",
      harmony_group_var
    )
  )
}

SO_astro <- RunHarmony(
  object = SO_astro,
  group.by.vars = harmony_group_var,
  reduction = "pca_astro",
  reduction.save = "harmony_astro",
  verbose = TRUE
)
Initializing centroids
Show code
SO_astro
An object of class Seurat 
87674 features across 5259 samples within 2 assays 
Active assay: RNA (47773 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 1 other assay present: SCT
 5 dimensional reductions calculated: pca, umap, harmony, pca_astro, harmony_astro

7 Step 11.6: Harmony-Based Astrocyte Clustering and UMAP

Harmony-corrected embeddings are used for neighbor graph construction, clustering, and UMAP visualization.

Show code
set.seed(1234)

dims_use_harmony <- 1:30
resolution_use_harmony <- 0.5

SO_astro <- FindNeighbors(
  SO_astro,
  reduction = "harmony_astro",
  dims = dims_use_harmony,
  graph.name = "astro_harmony_snn",
  verbose = FALSE
)

SO_astro <- FindClusters(
  SO_astro,
  graph.name = "astro_harmony_snn",
  resolution = resolution_use_harmony,
  verbose = FALSE
)

SO_astro$seurat_clusters_harmony <- as.character(SO_astro$seurat_clusters)

SO_astro <- RunUMAP(
  SO_astro,
  reduction = "harmony_astro",
  dims = dims_use_harmony,
  reduction.name = "umap_astro_harmony",
  reduction.key = "astroHarmonyUMAP_",
  verbose = FALSE
)

table(SO_astro$seurat_clusters_harmony)

   0    1    2    3    4    5    6    7    8    9 
1258 1248 1217  526  413  235  179  102   57   24 
Show code
p_harmony_cluster <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "seurat_clusters_harmony",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.5
) +
  labs(title = "Harmony-Corrected Astrocyte Subclusters")

p_harmony_condition <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "condition",
  pt.size = 0.4
) +
  labs(title = "Condition Distribution")

p_harmony_sex <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "sex",
  pt.size = 0.4
) +
  labs(title = "Sex Distribution")

p_harmony_sample <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "sample",
  pt.size = 0.4
) +
  labs(title = "Sample Distribution")

p_split_condition <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "condition",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.35
) +
  labs(title = "Harmony Astrocyte Clusters Split by Condition")

p_split_sex <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "sex",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.35
) +
  labs(title = "Harmony Astrocyte Clusters Split by Sex")

p_split_sample <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "seurat_clusters_harmony",
  split.by = "sample",
  label = FALSE,
  pt.size = 0.25,
  ncol = 4
) +
  labs(title = "Harmony Astrocyte Clusters Split by Sample")

p_harmony_cluster$layers[[1]]$aes_params$alpha <- 0.5
p_harmony_condition$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_sex$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_sample$layers[[1]]$aes_params$alpha <- 0.4
Show code
p_harmony_cluster | p_harmony_condition

Show code
p_split_condition

Show code
p_harmony_cluster | p_harmony_sex

Show code
p_split_sex

Show code
p_harmony_cluster | p_harmony_sample

Show code
p_split_sample

8 Optional: Resolution Sensitivity Check

Show code
resolution_grid <- c(0.3, 0.4, 0.5, 0.6)

SO_astro <- FindClusters(
  SO_astro,
  graph.name = "astro_harmony_snn",
  resolution = resolution_grid,
  verbose = FALSE
)

resolution_cols <- paste0("astro_harmony_snn_res.", resolution_grid)

p_resolution_list <- lapply(seq_along(resolution_grid), function(i) {
  res <- resolution_grid[i]
  res_col <- resolution_cols[i]

  DimPlot(
    SO_astro,
    reduction = "umap_astro_harmony",
    group.by = res_col,
    label = TRUE,
    repel = TRUE,
    pt.size = 0.35
  ) +
    labs(
      title = paste0("Resolution = ", res),
      x = "UMAP 1",
      y = "UMAP 2"
    )
})

wrap_plots(p_resolution_list, ncol = 2)

9 Step 11.6.5: Compare Unintegrated and Harmony-Corrected Astrocyte UMAPs

This comparison shows whether Harmony correction reduces sample- or condition-driven separation within the astrocyte subset.

The unintegrated UMAP is computed from astrocyte-only RNA PCA, whereas the Harmony UMAP is computed from Harmony-corrected astrocyte embeddings.

Show code
set.seed(1234)

dims_use_unintegrated <- dims_use_harmony
resolution_use_unintegrated <- resolution_use_harmony

SO_astro <- FindNeighbors(
  SO_astro,
  reduction = "pca_astro",
  dims = dims_use_unintegrated,
  graph.name = "astro_unintegrated_snn",
  verbose = FALSE
)

SO_astro <- FindClusters(
  SO_astro,
  graph.name = "astro_unintegrated_snn",
  resolution = resolution_use_unintegrated,
  cluster.name = "seurat_clusters_unintegrated",
  verbose = FALSE
)

SO_astro$seurat_clusters_unintegrated <- as.character(
  SO_astro$seurat_clusters_unintegrated
)

SO_astro <- RunUMAP(
  SO_astro,
  reduction = "pca_astro",
  dims = dims_use_unintegrated,
  reduction.name = "umap_astro_unintegrated",
  reduction.key = "astroUnintegratedUMAP_",
  verbose = FALSE
)

table(SO_astro$seurat_clusters_unintegrated)

  0   1  10  11   2   3   4   5   6   7   8   9 
910 829  61  38 719 512 480 458 425 409 248 170 
Show code
p_unintegrated_condition <- DimPlot(
  SO_astro,
  reduction = "umap_astro_unintegrated",
  group.by = "condition",
  pt.size = 0.4
) +
  labs(
    title = "Before Harmony",
    subtitle = "Astrocyte-only RNA PCA UMAP",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_harmony_condition_compare <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "condition",
  pt.size = 0.4
) +
  labs(
    title = "After Harmony",
    subtitle = "Astrocyte-only Harmony UMAP",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_unintegrated_condition$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_condition_compare$layers[[1]]$aes_params$alpha <- 0.4

p_unintegrated_condition | p_harmony_condition_compare

Show code
p_unintegrated_sample <- DimPlot(
  SO_astro,
  reduction = "umap_astro_unintegrated",
  group.by = "sample",
  pt.size = 0.35
) +
  labs(
    title = "Before Harmony",
    subtitle = "Colored by sample",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_harmony_sample_compare <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "sample",
  pt.size = 0.35
) +
  labs(
    title = "After Harmony",
    subtitle = "Colored by sample",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_unintegrated_sample$layers[[1]]$aes_params$alpha <- 0.4
p_harmony_sample_compare$layers[[1]]$aes_params$alpha <- 0.4

p_unintegrated_sample | p_harmony_sample_compare

Show code
p_unintegrated_cluster <- DimPlot(
  SO_astro,
  reduction = "umap_astro_unintegrated",
  group.by = "seurat_clusters_unintegrated",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.4
) +
  labs(
    title = "Before Harmony",
    subtitle = "Unintegrated RNA PCA clusters",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_harmony_cluster_compare <- DimPlot(
  SO_astro,
  reduction = "umap_astro_harmony",
  group.by = "seurat_clusters_harmony",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.4
) +
  labs(
    title = "After Harmony",
    subtitle = "Harmony-based clusters",
    x = "UMAP 1",
    y = "UMAP 2"
  )

p_unintegrated_cluster$layers[[1]]$aes_params$alpha <- 0.5
p_harmony_cluster_compare$layers[[1]]$aes_params$alpha <- 0.5

p_unintegrated_cluster | p_harmony_cluster_compare

Show code
cluster_comparison_table <- table(
  unintegrated_cluster = SO_astro$seurat_clusters_unintegrated,
  harmony_cluster = SO_astro$seurat_clusters_harmony
)

write.csv(
  as.data.frame.matrix(cluster_comparison_table),
  file.path(results_dir, "Step11_5_unintegrated_vs_harmony_cluster_table.csv")
)

cluster_comparison_table
                    harmony_cluster
unintegrated_cluster   0   1   2   3   4   5   6   7   8   9
                  0    6 209 231 434   1  21   0   0   2   6
                  1   50 295 352  64  11   8   5  29   4  11
                  10   1   0   9   0  50   0   0   0   0   1
                  11   0   0   0   1   0   1   0   0  36   0
                  2  225  92 210   7   0   9 167   7   2   0
                  3    2  61 103   1 333   4   0   4   2   2
                  4  339   0  70   3   6   5   1  53   2   1
                  5  215 164  42   2  12  11   5   3   3   1
                  6  285   1 124   7   0   3   0   1   3   1
                  7    0 384   5   7   0   8   0   2   2   1
                  8  135  40  68   0   0   4   1   0   0   0
                  9    0   2   3   0   0 161   0   3   1   0

10 Step 11.7: Validate Astrocyte and Reactive Astrocyte Markers

Here we check expression, not only differential expression.

The point is:

canonical astrocyte marker expression = confirms broad astrocyte identity
top DE marker after astrocyte reclustering = identifies astrocyte state/subtype differences

After subsetting astrocytes, canonical astrocyte markers may not appear as top differential markers between astrocyte subclusters because they are expected to be broadly expressed across astrocytes.

Show code
astro_marker_genes <- c(
  "AQP4", "GFAP", "ALDH1L1", "SLC1A3", "S100B",
  "CHI3L1", "TNC", "C3", "SERPINA3", "HSPB1", "VIM"
)

astro_marker_present <- intersect(
  astro_marker_genes,
  rownames(SO_astro)
)

if (length(astro_marker_present) == 0) {
  stop("None of the astrocyte marker genes were found in SO_astro.")
}

astro_marker_present
 [1] "AQP4"     "GFAP"     "ALDH1L1"  "SLC1A3"   "S100B"    "CHI3L1"  
 [7] "TNC"      "C3"       "SERPINA3" "HSPB1"    "VIM"     
Show code
FeaturePlot(
  SO_astro,
  features = astro_marker_present,
  reduction = "umap_astro_harmony",
  ncol = 4,
  pt.size = 0.5
)

Show code
DotPlot(
  SO_astro,
  features = astro_marker_present,
  group.by = "seurat_clusters_harmony"
) +
  RotatedAxis() +
  labs(title = "Canonical and Reactive Astrocyte Marker Expression by Harmony Cluster")

11 Step 11.8: Define Astrocyte Reactivity Modules

Reactive-like states are inferred inside the astrocyte subset using marker expression and module scores.

Show code
astro_identity_genes <- c(
  "AQP4", "GFAP", "ALDH1L1", "SLC1A3", "S100B"
)

reactive_astrocyte_genes <- c(
  "CHI3L1", "TNC", "C3", "SERPINA3", "HSPB1", "VIM"
)

inflammatory_genes <- c(
  "IL6", "CXCL10", "CCL2", "CXCL8", "NFKBIA", "TNFAIP3", "SOCS3"
)

stress_genes <- c(
  "HSPA1A", "HSPA1B", "HSPB1", "DDIT3", "FOS", "JUN", "ATF3"
)

ecm_remodeling_genes <- c(
  "TNC", "VIM", "SERPINA3", "COL4A1", "COL4A2", "MMP2", "MMP9"
)

module_gene_list <- list(
  Astro_identity = intersect(astro_identity_genes, rownames(SO_astro)),
  Reactive_astro = intersect(reactive_astrocyte_genes, rownames(SO_astro)),
  Inflammatory = intersect(inflammatory_genes, rownames(SO_astro)),
  Stress = intersect(stress_genes, rownames(SO_astro)),
  ECM = intersect(ecm_remodeling_genes, rownames(SO_astro))
)

module_gene_list
$Astro_identity
[1] "AQP4"    "GFAP"    "ALDH1L1" "SLC1A3"  "S100B"  

$Reactive_astro
[1] "CHI3L1"   "TNC"      "C3"       "SERPINA3" "HSPB1"    "VIM"     

$Inflammatory
[1] "IL6"     "CXCL10"  "CCL2"    "CXCL8"   "NFKBIA"  "TNFAIP3" "SOCS3"  

$Stress
[1] "HSPA1A" "HSPA1B" "HSPB1"  "DDIT3"  "FOS"    "JUN"    "ATF3"  

$ECM
[1] "TNC"      "VIM"      "SERPINA3" "COL4A1"   "COL4A2"   "MMP2"     "MMP9"    
Show code
empty_modules <- names(module_gene_list)[lengths(module_gene_list) == 0]

if (length(empty_modules) > 0) {
  warning(
    paste(
      "These modules have no genes present in the object:",
      paste(empty_modules, collapse = ", ")
    )
  )
}
Show code
for (module_name in names(module_gene_list)) {
  genes_use <- module_gene_list[[module_name]]

  if (length(genes_use) > 0) {
    SO_astro <- AddModuleScore(
      SO_astro,
      features = list(genes_use),
      name = paste0(module_name, "_score")
    )
  }
}

score_features <- c(
  "Astro_identity_score1",
  "Reactive_astro_score1",
  "Inflammatory_score1",
  "Stress_score1",
  "ECM_score1"
)

score_features_present <- intersect(
  score_features,
  colnames(SO_astro@meta.data)
)

if (length(score_features_present) == 0) {
  stop("No module score columns found.")
}

score_features_present
[1] "Astro_identity_score1" "Reactive_astro_score1" "Inflammatory_score1"  
[4] "Stress_score1"         "ECM_score1"           

12 Step 11.9: Visualize Astrocyte Reactivity Modules

Show code
FeaturePlot(
  SO_astro,
  features = score_features_present,
  reduction = "umap_astro_harmony",
  ncol = 3,
  pt.size = 0.5
)

Show code
reactivity_score_features <- intersect(
  c(
    "Reactive_astro_score1",
    "Inflammatory_score1",
    "Stress_score1",
    "ECM_score1"
  ),
  colnames(SO_astro@meta.data)
)

VlnPlot(
  SO_astro,
  features = reactivity_score_features,
  group.by = "seurat_clusters_harmony",
  pt.size = 0
)

13 Step 11.10: Summarize Astrocyte Clusters

We summarize each astrocyte cluster by:

cell number
module scores
condition composition
sample composition

This helps distinguish robust astrocyte states from clusters dominated by a single sample.

Show code
harmony_cluster_condition_table <- table(
  harmony_cluster = SO_astro$seurat_clusters_harmony,
  condition = SO_astro$condition
)

harmony_cluster_sample_table <- table(
  harmony_cluster = SO_astro$seurat_clusters_harmony,
  sample = SO_astro$sample
)

harmony_cluster_sex_table <- table(
  harmony_cluster = SO_astro$seurat_clusters_harmony,
  sex = SO_astro$sex
)

harmony_cluster_condition_table
               condition
harmony_cluster   C  PD
              0 624 634
              1 742 506
              2 449 768
              3  30 496
              4  52 361
              5 135 100
              6 177   2
              7  78  24
              8  37  20
              9  12  12
Show code
harmony_cluster_sample_table
               sample
harmony_cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F
              0       33        8      214      222      120       27      189
              1       54       17      165      457       40        9       90
              2       68       31       54      216       66       14       71
              3        7        0        2       19        2        0        1
              4        8       22       12        0        6        4        7
              5       22       34       15       28       17       19       11
              6        2        5        6      162        2        0        0
              7        6       17        5       16       17       17        4
              8        3        6        6       18        3        1        4
              9        4        1        2        1        2        2        1
               sample
harmony_cluster PDSN02_F PDSN04_F PDSN06_M PDSN07_M PDSN09_M
              0        3        6      274      136       26
              1       62      210       27       40       77
              2      111      238      164       78      106
              3        1      422       10        3       59
              4      351        1        1        0        1
              5        8       27       36        9        9
              6        0        0        0        1        1
              7        5        7        3        1        4
              8        3        6        2        1        4
              9        2        6        2        0        1
Show code
harmony_cluster_sex_table
               sex
harmony_cluster Female Male Unknown
              0    453  805       0
              1    598  650       0
              2    573  644       0
              3    433   93       0
              4    401   12       0
              5    117  118       0
              6     13  166       0
              7     44   58       0
              8     28   29       0
              9     16    8       0
Show code
harmony_cluster_summary <- SO_astro@meta.data %>%
  mutate(
    harmony_cluster = as.character(seurat_clusters_harmony)
  ) %>%
  group_by(harmony_cluster) %>%
  summarise(
    n_cells = n(),
    mean_astro_identity = mean(Astro_identity_score1, na.rm = TRUE),
    mean_reactive = mean(Reactive_astro_score1, na.rm = TRUE),
    mean_inflammatory = mean(Inflammatory_score1, na.rm = TRUE),
    mean_stress = mean(Stress_score1, na.rm = TRUE),
    mean_ecm = mean(ECM_score1, na.rm = TRUE),
    percent_PD = mean(condition == "PD", na.rm = TRUE),
    percent_C = mean(condition == "C", na.rm = TRUE),
    percent_female = mean(sex == "Female", na.rm = TRUE),
    percent_male = mean(sex == "Male", na.rm = TRUE),
    dominant_sex = names(sort(table(sex), decreasing = TRUE))[1],
    n_samples = n_distinct(sample),
    dominant_condition = names(sort(table(condition), decreasing = TRUE))[1],
    dominant_sample = names(sort(table(sample), decreasing = TRUE))[1],
    .groups = "drop"
  ) %>%
  arrange(desc(mean_reactive))

harmony_cluster_summary

14 Step 11.11: Define Astrocyte States

Harmony-corrected clusters are used to define astrocyte states conservatively.

State labels are intentionally left unassigned at this stage and should be manually defined after inspecting:

  1. module scores
  2. canonical and reactive marker expression
  3. condition composition
  4. sample composition
  5. cluster robustness
Show code
SO_astro$astro_cluster_harmony <- as.character(SO_astro$seurat_clusters_harmony)

pd_reactive <- c("3")
homeostatic <- c("0", "1", "2", "5")
other <- c("4", "6", "7", "8", "9")

SO_astro$astro_state_harmony <- case_when(
  SO_astro$astro_cluster_harmony %in% pd_reactive ~ "PD Reactive",
  SO_astro$astro_cluster_harmony %in% homeostatic ~ "Homeostatic",
  SO_astro$astro_cluster_harmony %in% other ~ "Other",
  TRUE ~ "Unassigned"
)

SO_astro$astro_state_harmony <- factor(
  SO_astro$astro_state_harmony,
  levels = c(
    "PD Reactive",
    "Homeostatic",
    "Other",
    "Unassigned"
  )
)

table(SO_astro$astro_cluster_harmony, SO_astro$astro_state_harmony)
   
    PD Reactive Homeostatic Other Unassigned
  0           0        1258     0          0
  1           0        1248     0          0
  2           0        1217     0          0
  3         526           0     0          0
  4           0           0   413          0
  5           0         235     0          0
  6           0           0   179          0
  7           0           0   102          0
  8           0           0    57          0
  9           0           0    24          0
Show code
table(SO_astro$astro_state_harmony, SO_astro$condition)
             
                 C   PD
  PD Reactive   30  496
  Homeostatic 1950 2008
  Other        356  419
  Unassigned     0    0

15 Step 11.12: Add External GENCODE Annotation for Context-Based lncRNA Subtype Classification

Recent GENCODE releases use a broad lncRNA gene biotype and do not always preserve older lncRNA subtypes such as lincRNA, antisense, sense_intronic, or sense_overlapping.

Therefore, lncRNAs are classified here by genomic context relative to protein-coding genes using the comprehensive GENCODE annotation.

The context-based categories are:

LINC
= lncRNA genes without overlap with protein-coding genes

Antisense
= lncRNA genes overlapping protein-coding genes on the opposite strand

Sense_gene_body_contained
= lncRNA genes fully contained within protein-coding genes on the same strand

Sense_overlapping
= lncRNA genes overlapping protein-coding genes on the same strand, but not fully intronic

Other_lncRNA_context
= lncRNA genes that do not clearly fit the above categories
Show code
reference_dir <- file.path(project_dir, "reference")
dir.create(reference_dir, recursive = TRUE, showWarnings = FALSE)

gencode_comprehensive_gtf_path <- file.path(
  project_dir,
  "reference",
  "gencode.v49.annotation.gtf.gz"
)

if (!file.exists(gencode_comprehensive_gtf_path)) {
  stop("GENCODE comprehensive GTF file not found. Download it from Terminal first.")
}

gencode_all_gtf <- rtracklayer::import(gencode_comprehensive_gtf_path)

gencode_all_genes <- as.data.frame(gencode_all_gtf) %>%
  filter(type == "gene") %>%
  as_tibble() %>%
  transmute(
    seqnames = as.character(seqnames),
    start = start,
    end = end,
    strand = as.character(strand),
    gene_id = gene_id,
    gene_id_clean = stringr::str_replace(gene_id, "\\..*$", ""),
    gene_name = gene_name,
    gene_type = gene_type
  ) %>%
  distinct()

gencode_gene_type_summary <- gencode_all_genes %>%
  count(gene_type, name = "n_genes") %>%
  arrange(desc(n_genes))

write.csv(
  gencode_gene_type_summary,
  file.path(results_dir, "Step11_5_GENCODE_gene_type_summary.csv"),
  row.names = FALSE
)

gencode_gene_type_summary %>%
  filter(gene_type %in% c("protein_coding", "lncRNA"))
Show code
library(GenomicRanges)

lncRNA_gr <- gencode_all_genes %>%
  filter(gene_type == "lncRNA") %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end",
    strand.field = "strand"
  )

protein_coding_gr <- gencode_all_genes %>%
  filter(gene_type == "protein_coding") %>%
  makeGRangesFromDataFrame(
    keep.extra.columns = TRUE,
    seqnames.field = "seqnames",
    start.field = "start",
    end.field = "end",
    strand.field = "strand"
  )

# Any overlap with protein-coding genes, ignoring strand.
any_pc_overlap_hits <- findOverlaps(
  lncRNA_gr,
  protein_coding_gr,
  ignore.strand = TRUE
)

overlap_pairs <- tibble(
  lncRNA_gene_id = mcols(lncRNA_gr)$gene_id_clean[queryHits(any_pc_overlap_hits)],
  lncRNA_gene_name = mcols(lncRNA_gr)$gene_name[queryHits(any_pc_overlap_hits)],
  lncRNA_strand = as.character(strand(lncRNA_gr))[queryHits(any_pc_overlap_hits)],
  lncRNA_start = start(lncRNA_gr)[queryHits(any_pc_overlap_hits)],
  lncRNA_end = end(lncRNA_gr)[queryHits(any_pc_overlap_hits)],
  pc_gene_id = mcols(protein_coding_gr)$gene_id_clean[subjectHits(any_pc_overlap_hits)],
  pc_gene_name = mcols(protein_coding_gr)$gene_name[subjectHits(any_pc_overlap_hits)],
  pc_strand = as.character(strand(protein_coding_gr))[subjectHits(any_pc_overlap_hits)],
  pc_start = start(protein_coding_gr)[subjectHits(any_pc_overlap_hits)],
  pc_end = end(protein_coding_gr)[subjectHits(any_pc_overlap_hits)]
)

# Opposite-strand protein-coding overlap.
antisense_lncRNA_ids <- overlap_pairs %>%
  filter(
    lncRNA_strand != pc_strand,
    lncRNA_strand %in% c("+", "-"),
    pc_strand %in% c("+", "-")
  ) %>%
  pull(lncRNA_gene_id) %>%
  unique()

# Same-strand protein-coding overlap.
same_strand_pairs <- overlap_pairs %>%
  filter(
    lncRNA_strand == pc_strand,
    lncRNA_strand %in% c("+", "-"),
    pc_strand %in% c("+", "-")
  )

# Sense-intronic:
# lncRNA is fully contained within a protein-coding gene on the same strand.
sense_intronic_lncRNA_ids <- same_strand_pairs %>%
  filter(
    lncRNA_start >= pc_start,
    lncRNA_end <= pc_end
  ) %>%
  pull(lncRNA_gene_id) %>%
  unique()

# Sense-overlapping:
# lncRNA overlaps a protein-coding gene on the same strand,
# but is not fully contained within it.
sense_overlapping_lncRNA_ids <- same_strand_pairs %>%
  filter(
    !(lncRNA_start >= pc_start & lncRNA_end <= pc_end)
  ) %>%
  pull(lncRNA_gene_id) %>%
  unique()

# LINC:
# lncRNA has no overlap with any protein-coding gene.
pc_overlapping_lncRNA_ids <- overlap_pairs %>%
  pull(lncRNA_gene_id) %>%
  unique()

linc_lncRNA_ids <- setdiff(
  mcols(lncRNA_gr)$gene_id_clean,
  pc_overlapping_lncRNA_ids
)

# Priority:
# 1. Antisense
# 2. Sense_gene_body_contained
# 3. Sense_overlapping
# 4. LINC
# 5. Other_lncRNA_context
#
# If one lncRNA overlaps multiple protein-coding genes in multiple ways,
# the most interpretable overlap class is assigned by this priority.
gencode_lncRNA_context_anno <- tibble(
  gene_id_clean = mcols(lncRNA_gr)$gene_id_clean,
  gencode_gene_id = mcols(lncRNA_gr)$gene_id,
  gencode_gene_name = mcols(lncRNA_gr)$gene_name,
  gencode_gene_type = mcols(lncRNA_gr)$gene_type,
  lncRNA_type = case_when(
  gene_id_clean %in% antisense_lncRNA_ids ~ "Antisense",
  gene_id_clean %in% sense_intronic_lncRNA_ids ~ "Sense_gene_body_contained",
  gene_id_clean %in% sense_overlapping_lncRNA_ids ~ "Sense_overlapping",
  gene_id_clean %in% linc_lncRNA_ids ~ "LINC",
  TRUE ~ "Other_lncRNA_context"
)
)

table(gencode_lncRNA_context_anno$lncRNA_type)

write.csv(
  gencode_lncRNA_context_anno,
  file.path(results_dir, "Step11_5_GENCODE_lncRNA_context_annotation.csv"),
  row.names = FALSE
)

write.csv(
  overlap_pairs,
  file.path(results_dir, "Step11_5_GENCODE_lncRNA_protein_coding_overlap_pairs.csv"),
  row.names = FALSE
)
Show code
gene_meta <- SO_astro[["RNA"]][[]] %>%
  tibble::rownames_to_column("gene") %>%
  mutate(
    gene_id_clean = stringr::str_replace(gene, "\\..*$", "")
  )

# Match by Ensembl gene ID.
gene_meta_context_ensembl <- gene_meta %>%
  left_join(
    gencode_lncRNA_context_anno,
    by = "gene_id_clean"
  )

n_context_matched_by_ensembl <- sum(!is.na(gene_meta_context_ensembl$lncRNA_type))

# Match by gene symbol.
# Keep both the Seurat gene name and the GENCODE gene name.
gene_meta_context_symbol <- gene_meta %>%
  left_join(
    gencode_lncRNA_context_anno %>%
      mutate(gencode_gene_name_join = gencode_gene_name) %>%
      select(
        gencode_gene_name_join,
        gencode_gene_id,
        gencode_gene_name,
        gene_id_clean_gencode = gene_id_clean,
        lncRNA_type
      ),
    by = c("gene" = "gencode_gene_name_join")
  )

n_context_matched_by_symbol <- sum(!is.na(gene_meta_context_symbol$lncRNA_type))

n_context_matched_by_ensembl
[1] 19045
Show code
n_context_matched_by_symbol
[1] 23746
Show code
if (n_context_matched_by_ensembl >= n_context_matched_by_symbol) {
  gene_meta <- gene_meta_context_ensembl
  lncRNA_context_match_method <- "Ensembl_gene_id"
} else {
  gene_meta <- gene_meta_context_symbol
  lncRNA_context_match_method <- "gene_symbol"
}

lncRNA_context_match_method
[1] "gene_symbol"
Show code
gene_meta <- gene_meta %>%
  mutate(
    lncRNA_type = if_else(
      is.na(lncRNA_type),
      "Not_lncRNA_or_unmatched",
      lncRNA_type
    ),
    is_lncRNA = lncRNA_type != "Not_lncRNA_or_unmatched"
  )

lncRNA_genes <- gene_meta %>%
  filter(is_lncRNA) %>%
  pull(gene) %>%
  intersect(rownames(SO_astro))

length(lncRNA_genes)
[1] 23730
Show code
table(gene_meta$lncRNA_type)

                Antisense                      LINC   Not_lncRNA_or_unmatched 
                     9465                     12339                     24043 
Sense_gene_body_contained         Sense_overlapping 
                     1498                       444 
Show code
write.csv(
  gene_meta %>%
    filter(is_lncRNA),
  file.path(results_dir, "Step11_5_astrocyte_lncRNA_gene_annotation_context_based.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame(table(gene_meta$lncRNA_type)),
  file.path(results_dir, "Step11_5_lncRNA_type_counts_context_based.csv"),
  row.names = FALSE
)
Show code
lncRNA_type_detection_summary <- gene_meta %>%
  filter(is_lncRNA) %>%
  count(lncRNA_type, name = "n_genes") %>%
  arrange(desc(n_genes))

lncRNA_type_detection_summary
Show code
write.csv(
  lncRNA_type_detection_summary,
  file.path(results_dir, "Step11_5_lncRNA_context_type_summary.csv"),
  row.names = FALSE
)
Show code
get_assay_data_safe <- function(object, assay = "RNA", layer_or_slot = "counts") {
  tryCatch(
    GetAssayData(object, assay = assay, layer = layer_or_slot),
    error = function(e) {
      GetAssayData(object, assay = assay, slot = layer_or_slot)
    }
  )
}

counts_mat <- get_assay_data_safe(
  SO_astro,
  assay = "RNA",
  layer_or_slot = "counts"
)

gene_detection_summary <- tibble(
  gene = rownames(counts_mat),
  n_cells_detected = Matrix::rowSums(counts_mat > 0),
  pct_cells_detected = Matrix::rowSums(counts_mat > 0) / ncol(counts_mat)
) %>%
  left_join(
    gene_meta %>%
      select(
        gene,
        gencode_gene_id,
        gencode_gene_name,
        lncRNA_type,
        is_lncRNA
      ),
    by = "gene"
  )

lncRNA_detection_summary <- gene_detection_summary %>%
  filter(is_lncRNA) %>%
  arrange(desc(n_cells_detected))

lncRNA_detection_summary %>%
  slice_head(n = 20)
Show code
write.csv(
  lncRNA_detection_summary,
  file.path(results_dir, "Step11_5_astrocyte_lncRNA_detection_summary.csv"),
  row.names = FALSE
)

16 Step 11.13: Global lncRNA Expression Fraction in Astrocytes

This section adds a per-cell lncRNA expression fraction. This helps check whether lncRNA expression is broadly distributed or concentrated in specific clusters, samples, or conditions.

Show code
if (length(lncRNA_genes) == 0) {
  stop("No lncRNA genes were identified from the available gene annotation.")
}

SO_astro[["percent_lncRNA"]] <- PercentageFeatureSet(
  SO_astro,
  features = lncRNA_genes,
  assay = "RNA"
)

summary(SO_astro$percent_lncRNA)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  1.812  12.814  14.756  15.742  17.681  47.784 
Show code
FeaturePlot(
  SO_astro,
  features = "percent_lncRNA",
  reduction = "umap_astro_harmony",
  pt.size = 0.5
) +
  labs(title = "Percent lncRNA Expression in Astrocytes")

Show code
p_lnc_cluster <- VlnPlot(
  SO_astro,
  features = "percent_lncRNA",
  group.by = "seurat_clusters_harmony",
  pt.size = 0
) +
  labs(title = "Percent lncRNA by Cluster")

p_lnc_condition <- VlnPlot(
  SO_astro,
  features = "percent_lncRNA",
  group.by = "condition",
  pt.size = 0
) +
  labs(title = "Percent lncRNA by Condition")

p_lnc_cluster | p_lnc_condition

17 Step 11.14: Identify lncRNA Markers of Astrocyte Clusters

Here we identify lncRNAs that are enriched in each astrocyte cluster.
Harmony is used only to define clusters. Differential expression is performed on the RNA assay.

Show code
DefaultAssay(SO_astro) <- "RNA"
Idents(SO_astro) <- "seurat_clusters_harmony"

astro_harmony_markers <- FindAllMarkers(
  SO_astro,
  assay = "RNA",
  only.pos = TRUE,
  min.pct = 0.05,
  logfc.threshold = 0.25,
  verbose = FALSE
)

astro_lncRNA_markers <- astro_harmony_markers %>%
  left_join(
    gene_meta %>%
      select(
        gene,
        gencode_gene_id,
        gencode_gene_name,
        lncRNA_type,
        is_lncRNA
      ),
    by = "gene"
  ) %>%
  filter(
    is_lncRNA,
    p_val_adj < 0.05
  ) %>%
  mutate(
    pct_diff = pct.1 - pct.2
  ) %>%
  arrange(
    cluster,
    desc(avg_log2FC),
    desc(pct_diff),
    p_val_adj
  )

astro_lncRNA_markers_top_strict <- astro_lncRNA_markers %>%
  filter(pct_diff > 0.05) %>%
  group_by(cluster) %>%
  slice_head(n = 5) %>%
  ungroup()

astro_lncRNA_markers_top_per_cluster <- astro_lncRNA_markers %>%
  group_by(cluster) %>%
  slice_head(n = 5) %>%
  ungroup()

write.csv(
  astro_harmony_markers,
  file.path(results_dir, "Step11_5_harmony_cluster_all_markers.csv"),
  row.names = FALSE
)

write.csv(
  astro_lncRNA_markers,
  file.path(results_dir, "Step11_5_harmony_cluster_lncRNA_markers.csv"),
  row.names = FALSE
)

write.csv(
  astro_lncRNA_markers_top_strict,
  file.path(results_dir, "Step11_5_harmony_cluster_top_lncRNA_markers_strict_pctdiff.csv"),
  row.names = FALSE
)

write.csv(
  astro_lncRNA_markers_top_per_cluster,
  file.path(results_dir, "Step11_5_harmony_cluster_top_lncRNA_markers_per_cluster.csv"),
  row.names = FALSE
)
Show code
lncRNA_marker_type_summary <- astro_lncRNA_markers %>%
  count(cluster, lncRNA_type, name = "n_marker_lncRNAs") %>%
  arrange(cluster, desc(n_marker_lncRNAs))

lncRNA_marker_type_summary
Show code
write.csv(
  lncRNA_marker_type_summary,
  file.path(results_dir, "Step11_5_lncRNA_marker_type_summary_by_cluster.csv"),
  row.names = FALSE
)

18 Step 11.15: Visualize lncRNA Marker Expression Patterns

These plots directly address the question of where lncRNAs are expressed across astrocyte clusters. The heatmap and dot plot should be interpreted as expression-pattern summaries, not as proof of function.

Show code
top_lncRNA_features <- astro_lncRNA_markers_top_per_cluster %>%
  pull(gene) %>%
  unique() %>%
  intersect(rownames(SO_astro))

# Fallback: if few marker lncRNAs pass the marker threshold, use the most detected lncRNAs.
if (length(top_lncRNA_features) < 2) {
  top_lncRNA_features <- lncRNA_detection_summary %>%
    slice_head(n = 20) %>%
    pull(gene) %>%
    unique() %>%
    intersect(rownames(SO_astro))
}

top_lncRNA_features
 [1] "ENSG00000234147" "LINC02277"       "ENSG00000302932" "ENSG00000300891"
 [5] "ENSG00000233928" "ENSG00000291054" "FAM182B"         "OXA1L-DT"       
 [9] "UGDH-AS1"        "ATP1A1-AS1"      "ENSG00000305928" "ENSG00000302585"
[13] "ENSG00000302111" "ENSG00000230258" "LINC01497"       "LINC02552"      
[17] "ENSG00000295757" "TRD-AS1"         "ENSG00000237480" "LINC03077"      
[21] "ENSG00000289591" "SH3TC2-DT"       "LINC01608"       "SAMMSON"        
[25] "ENSG00000289279" "FOXG1-AS1"       "LINC01551"       "LINC02984"      
[29] "LINC02715"       "LINC00609"       "ENSG00000287684" "ENSG00000307137"
[33] "ENSG00000290441" "ENSG00000257258" "ENSG00000302619" "ENSG00000294326"
[37] "ENSG00000303315" "ENSG00000309644" "LINC02381"       "ENSG00000258711"
[41] "ENSG00000303264" "ENSG00000231873" "ENSG00000258520" "ENSG00000305782"
[45] "LINC01141"       "ENSG00000307068" "ENSG00000272783" "ENSG00000287470"
[49] "PCAT19"          "ENSG00000289404"
Show code
if (length(top_lncRNA_features) > 0) {
  DotPlot(
    SO_astro,
    features = top_lncRNA_features,
    assay = "RNA",
    group.by = "seurat_clusters_harmony"
  ) +
    RotatedAxis() +
    guides(
      color = guide_colorbar(title = "Average\nExpression"),
      size = guide_legend(title = "Percent\nExpressed")
    ) +
    labs(
      title = "Top lncRNA Marker Expression by Harmony-Based Astrocyte Cluster",
      x = "lncRNA",
      y = "Harmony cluster"
    )
} else {
  message("No lncRNA features available for DotPlot.")
}

Show code
if (length(top_lncRNA_features) > 1) {
  SO_astro <- ScaleData(
    SO_astro,
    features = top_lncRNA_features,
    assay = "RNA",
    verbose = FALSE
  )

  DoHeatmap(
    SO_astro,
    features = top_lncRNA_features,
    assay = "RNA",
    group.by = "seurat_clusters_harmony"
  ) +
    NoLegend() +
    labs(title = "Top lncRNA Marker Heatmap by Harmony-Based Astrocyte Cluster")
} else {
  message("Not enough lncRNA features for Seurat heatmap.")
}
Warning: Different features in new layer data than already exists for
scale.data

Show code
DefaultAssay(SO_astro) <- "RNA"

expr_mat <- GetAssayData(
  SO_astro,
  assay = "RNA",
  layer = "data"
)

lncRNA_genes_use <- intersect(lncRNA_genes, rownames(expr_mat))

lncRNA_detected <- lncRNA_genes_use[
  Matrix::rowSums(expr_mat[lncRNA_genes_use, ] > 0) / ncol(expr_mat) >= 0.01
]

if (length(lncRNA_detected) < 2) {
  stop("Not enough detected lncRNAs for variable lncRNA heatmap.")
}

lncRNA_variance <- Matrix::rowMeans(expr_mat[lncRNA_detected, ]^2) -
  Matrix::rowMeans(expr_mat[lncRNA_detected, ])^2

top_variable_lncRNAs <- names(sort(lncRNA_variance, decreasing = TRUE)) %>%
  head(n = min(1000, length(.)))

SO_astro$cluster_condition <- paste0(
  "Cl",
  SO_astro$seurat_clusters_harmony,
  "_",
  SO_astro$condition
)

avg_variable_lncRNA <- AverageExpression(
  SO_astro,
  assays = "RNA",
  features = top_variable_lncRNAs,
  group.by = "cluster_condition",
  slot = "data",
  verbose = FALSE
)$RNA %>%
  as.matrix()

avg_variable_lncRNA_z <- t(scale(t(avg_variable_lncRNA)))
avg_variable_lncRNA_z[is.na(avg_variable_lncRNA_z)] <- 0
avg_variable_lncRNA_z <- pmax(pmin(avg_variable_lncRNA_z, 2), -2)

heatmap_breaks <- seq(-2, 2, length.out = 101)

column_annotation <- data.frame(
  group = colnames(avg_variable_lncRNA_z)
)

column_annotation$cluster <- stringr::str_match(
  column_annotation$group,
  "^Cl([^-]+)-"
)[, 2]

column_annotation$condition <- stringr::str_match(
  column_annotation$group,
  "-([^-]+)$"
)[, 2]

rownames(column_annotation) <- column_annotation$group
column_annotation$group <- NULL

column_annotation$cluster <- factor(column_annotation$cluster)
column_annotation$condition <- factor(
  column_annotation$condition,
  levels = c("C", "PD")
)

column_annotation
Show code
pheatmap::pheatmap(
  avg_variable_lncRNA_z,
  annotation_col = column_annotation,
  show_rownames = FALSE,
  cluster_rows = TRUE,
  cluster_cols = TRUE,
  breaks = heatmap_breaks,
  main = "Top Variable lncRNA Average Expression by Astrocyte Cluster and Condition",
  fontsize_col = 8,
  border_color = NA
)

Show code
write.csv(
  top_variable_lncRNAs,
  file.path(results_dir, "Step11_5_top_variable_lncRNAs_for_heatmap.csv"),
  row.names = FALSE
)

write.csv(
  avg_variable_lncRNA,
  file.path(results_dir, "Step11_5_average_variable_lncRNA_expression_by_cluster_condition.csv")
)

write.csv(
  avg_variable_lncRNA_z,
  file.path(results_dir, "Step11_5_average_variable_lncRNA_expression_by_cluster_condition_zscore.csv")
)

19 Step 11.16: Save Astrocyte Object and Results

Show code
saveRDS(
  SO_astro,
  file.path(results_dir, "SO_astro_step11_5_fernando_harmony_reclustered_scored.rds")
)

write.csv(
  harmony_cluster_summary,
  file.path(results_dir, "Step11_5_harmony_cluster_summary.csv"),
  row.names = FALSE
)

write.csv(
  as.data.frame.matrix(harmony_cluster_condition_table),
  file.path(results_dir, "Step11_5_harmony_cluster_condition_table.csv")
)

write.csv(
  as.data.frame.matrix(harmony_cluster_sample_table),
  file.path(results_dir, "Step11_5_harmony_cluster_sample_table.csv")
)

write.csv(
  as.data.frame.matrix(harmony_cluster_sex_table),
  file.path(results_dir, "Step11_5_harmony_cluster_sex_table.csv")
)

write.csv(
  data.frame(
    setting = c(
      "input_object",
      "annotation_column",
      "condition_column",
      "sample_column",
      "astrocyte_labels_used",
      "normalization",
      "variable_features",
      "pca_dimensions_computed",
      "harmony_group_var",
      "dims_use_harmony",
      "resolution_use_harmony",
      "primary_cluster_column",
      "primary_umap_reduction",
      "optional_unintegrated_dims",
      "optional_unintegrated_resolution",
      "lncRNA_annotation_method",
      "lncRNA_context_match_method",
      "n_lncRNA_genes_detected_in_astrocytes",
      "n_astrocyte_cells"
    ),
    value = as.character(c(
      fernando_path,
      "Cell_types",
      "condition",
      "Sample",
      paste(target_astrocytes, collapse = "; "),
      "LogNormalize",
      2000,
      40,
      harmony_group_var,
      paste(dims_use_harmony, collapse = ","),
      resolution_use_harmony,
      "seurat_clusters_harmony",
      "umap_astro_harmony",
      paste(dims_use_unintegrated, collapse = ","),
      resolution_use_unintegrated,
      "GENCODE_v49_genomic_context",
      lncRNA_context_match_method,
      length(lncRNA_genes),
      ncol(SO_astro)
    ))
  ),
  file.path(results_dir, "Step11_5_analysis_settings.csv"),
  row.names = FALSE
)

20 Interpretation Note

This analysis intentionally separates three concepts:

canonical astrocyte marker expression
astrocyte subcluster structure
reactive / inflammatory / stress / ECM module scores
lncRNA marker expression and lncRNA subtype annotation

After subsetting astrocytes, canonical astrocyte markers may not appear as top subcluster markers because they are expected to be broadly expressed across astrocytes.

Therefore:

AQP4 / GFAP / ALDH1L1 / SLC1A3 expression confirms astrocyte identity.
Reactive / inflammatory / stress / ECM scores help interpret astrocyte states..
Differential expression should be performed using the RNA assay, not Harmony-corrected embeddings.

Recent GENCODE releases use a broad lncRNA biotype. Therefore, lncRNA candidates were annotated by genomic context relative to protein-coding genes using GENCODE v49. LncRNAs without protein-coding gene overlap were labeled LINC. LncRNAs overlapping protein-coding genes on the opposite strand were labeled Antisense. Same-strand lncRNAs fully contained within protein-coding gene bodies were labeled Sense_gene_body_contained, and same-strand partial overlaps were labeled Sense_overlapping. These labels are context-based annotations, not original GENCODE subtype labels.