Setup: Environment and Data
The purpose is to make the astrocyte analysis reproducible from the lab reference object, instead of starting from an independently re-annotated object. After subsetting astrocytes, dimensional reduction and clustering are re-run within the astrocyte compartment.
Harmony correction is applied at the astrocyte-only level using sample identity as the grouping variable. Harmony-corrected embeddings are used for clustering and visualization, whereas RNA assay expression values are retained for marker visualization, module scoring, and downstream differential expression.
The biological goal remains:
Astrocyte
→ reactive astrocyte
→ stressed / inflammatory astrocyte state
This trajectory should be interpreted as an astrocyte reactivity trajectory , not a neuronal degeneration trajectory.
Show code
library (Seurat)
library (tidyverse)
library (patchwork)
library (harmony)
library (pheatmap)
library (Matrix)
library (rtracklayer)
set.seed (1234 )
project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"
fernando_path <- file.path (
project_dir,
"lncRNAs_PD_Fer" ,
"snRNAseq" ,
"seurat_object_annotated.rds"
)
results_dir <- file.path (
project_dir,
"results" ,
"Step11_5_fernando_astrocyte_harmony_reclustering"
)
dir.create (
results_dir,
recursive = TRUE ,
showWarnings = FALSE
)
SO <- readRDS (fernando_path)
SO
An object of class Seurat
87674 features across 46066 samples within 2 assays
Active assay: SCT (39901 features, 3000 variable features)
3 layers present: counts, data, scale.data
1 other assay present: RNA
3 dimensional reductions calculated: pca, umap, harmony
Step 11.2: Inspect Full-Object UMAP
The downstream astrocyte analysis uses newly computed astrocyte-only Harmony embeddings.
Show code
available_reductions <- Reductions (SO)
available_reductions
[1] "pca" "umap" "harmony"
Show code
full_umap_reduction <- case_when (
"umap_rpca" %in% available_reductions ~ "umap_rpca" ,
"umap" %in% available_reductions ~ "umap" ,
TRUE ~ NA_character_
)
full_umap_reduction
Show code
if (! is.na (full_umap_reduction)) {
DimPlot (
SO,
reduction = full_umap_reduction,
group.by = "fernando_celltype_annotation" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.4
) +
labs (title = "Annotated Cell Types" )
} else {
message ("No existing UMAP reduction found in Fernando object. Skipping full-object DimPlot." )
}
Step 11.4: Preprocess Astrocytes
After subsetting astrocytes, PCA is re-computed within the astrocyte subset. This PCA is used as input for Harmony correction.
Show code
DefaultAssay (SO_astro) <- "RNA"
if ("Assay5" %in% class (SO_astro[["RNA" ]])) {
SO_astro <- JoinLayers (SO_astro, assay = "RNA" )
}
SO_astro <- NormalizeData (
SO_astro,
verbose = FALSE
)
SO_astro <- FindVariableFeatures (
SO_astro,
selection.method = "vst" ,
nfeatures = 2000 ,
verbose = FALSE
)
SO_astro <- ScaleData (
SO_astro,
verbose = FALSE
)
SO_astro <- RunPCA (
SO_astro,
npcs = 40 ,
reduction.name = "pca_astro" ,
reduction.key = "astroPCA_" ,
verbose = FALSE
)
ElbowPlot (
SO_astro,
reduction = "pca_astro" ,
ndims = 40
) +
labs (title = "Fernando-Defined Astrocyte PCA Elbow Plot" )
Step 11.5: Run Harmony Within Astrocytes
Harmony is applied within the astrocyte subset using sample identity as the grouping variable.
Condition is not used as the Harmony grouping variable because disease condition is biologically relevant and should not be directly regressed out at this step.
Show code
harmony_group_var <- "sample"
if (! harmony_group_var %in% colnames (SO_astro@ meta.data)) {
stop (
paste (
"Harmony grouping variable not found in metadata:" ,
harmony_group_var
)
)
}
SO_astro <- RunHarmony (
object = SO_astro,
group.by.vars = harmony_group_var,
reduction = "pca_astro" ,
reduction.save = "harmony_astro" ,
verbose = TRUE
)
Show code
An object of class Seurat
87674 features across 5259 samples within 2 assays
Active assay: RNA (47773 features, 2000 variable features)
3 layers present: counts, data, scale.data
1 other assay present: SCT
5 dimensional reductions calculated: pca, umap, harmony, pca_astro, harmony_astro
Step 11.6: Harmony-Based Astrocyte Clustering and UMAP
Harmony-corrected embeddings are used for neighbor graph construction, clustering, and UMAP visualization.
Show code
set.seed (1234 )
dims_use_harmony <- 1 : 30
resolution_use_harmony <- 0.5
SO_astro <- FindNeighbors (
SO_astro,
reduction = "harmony_astro" ,
dims = dims_use_harmony,
graph.name = "astro_harmony_snn" ,
verbose = FALSE
)
SO_astro <- FindClusters (
SO_astro,
graph.name = "astro_harmony_snn" ,
resolution = resolution_use_harmony,
verbose = FALSE
)
SO_astro$ seurat_clusters_harmony <- as.character (SO_astro$ seurat_clusters)
SO_astro <- RunUMAP (
SO_astro,
reduction = "harmony_astro" ,
dims = dims_use_harmony,
reduction.name = "umap_astro_harmony" ,
reduction.key = "astroHarmonyUMAP_" ,
verbose = FALSE
)
table (SO_astro$ seurat_clusters_harmony)
0 1 2 3 4 5 6 7 8 9
1258 1248 1217 526 413 235 179 102 57 24
Show code
p_harmony_cluster <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "seurat_clusters_harmony" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.5
) +
labs (title = "Harmony-Corrected Astrocyte Subclusters" )
p_harmony_condition <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "condition" ,
pt.size = 0.4
) +
labs (title = "Condition Distribution" )
p_harmony_sex <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "sex" ,
pt.size = 0.4
) +
labs (title = "Sex Distribution" )
p_harmony_sample <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "sample" ,
pt.size = 0.4
) +
labs (title = "Sample Distribution" )
p_split_condition <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "seurat_clusters_harmony" ,
split.by = "condition" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.35
) +
labs (title = "Harmony Astrocyte Clusters Split by Condition" )
p_split_sex <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "seurat_clusters_harmony" ,
split.by = "sex" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.35
) +
labs (title = "Harmony Astrocyte Clusters Split by Sex" )
p_split_sample <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "seurat_clusters_harmony" ,
split.by = "sample" ,
label = FALSE ,
pt.size = 0.25 ,
ncol = 4
) +
labs (title = "Harmony Astrocyte Clusters Split by Sample" )
p_harmony_cluster$ layers[[1 ]]$ aes_params$ alpha <- 0.5
p_harmony_condition$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_sex$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_sample$ layers[[1 ]]$ aes_params$ alpha <- 0.4
Show code
p_harmony_cluster | p_harmony_condition
Show code
p_harmony_cluster | p_harmony_sex
Show code
p_harmony_cluster | p_harmony_sample
Optional: Resolution Sensitivity Check
Show code
resolution_grid <- c (0.3 , 0.4 , 0.5 , 0.6 )
SO_astro <- FindClusters (
SO_astro,
graph.name = "astro_harmony_snn" ,
resolution = resolution_grid,
verbose = FALSE
)
resolution_cols <- paste0 ("astro_harmony_snn_res." , resolution_grid)
p_resolution_list <- lapply (seq_along (resolution_grid), function (i) {
res <- resolution_grid[i]
res_col <- resolution_cols[i]
DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = res_col,
label = TRUE ,
repel = TRUE ,
pt.size = 0.35
) +
labs (
title = paste0 ("Resolution = " , res),
x = "UMAP 1" ,
y = "UMAP 2"
)
})
wrap_plots (p_resolution_list, ncol = 2 )
Step 11.6.5: Compare Unintegrated and Harmony-Corrected Astrocyte UMAPs
This comparison shows whether Harmony correction reduces sample- or condition-driven separation within the astrocyte subset.
The unintegrated UMAP is computed from astrocyte-only RNA PCA, whereas the Harmony UMAP is computed from Harmony-corrected astrocyte embeddings.
Show code
set.seed (1234 )
dims_use_unintegrated <- dims_use_harmony
resolution_use_unintegrated <- resolution_use_harmony
SO_astro <- FindNeighbors (
SO_astro,
reduction = "pca_astro" ,
dims = dims_use_unintegrated,
graph.name = "astro_unintegrated_snn" ,
verbose = FALSE
)
SO_astro <- FindClusters (
SO_astro,
graph.name = "astro_unintegrated_snn" ,
resolution = resolution_use_unintegrated,
cluster.name = "seurat_clusters_unintegrated" ,
verbose = FALSE
)
SO_astro$ seurat_clusters_unintegrated <- as.character (
SO_astro$ seurat_clusters_unintegrated
)
SO_astro <- RunUMAP (
SO_astro,
reduction = "pca_astro" ,
dims = dims_use_unintegrated,
reduction.name = "umap_astro_unintegrated" ,
reduction.key = "astroUnintegratedUMAP_" ,
verbose = FALSE
)
table (SO_astro$ seurat_clusters_unintegrated)
0 1 10 11 2 3 4 5 6 7 8 9
910 829 61 38 719 512 480 458 425 409 248 170
Show code
p_unintegrated_condition <- DimPlot (
SO_astro,
reduction = "umap_astro_unintegrated" ,
group.by = "condition" ,
pt.size = 0.4
) +
labs (
title = "Before Harmony" ,
subtitle = "Astrocyte-only RNA PCA UMAP" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_harmony_condition_compare <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "condition" ,
pt.size = 0.4
) +
labs (
title = "After Harmony" ,
subtitle = "Astrocyte-only Harmony UMAP" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_unintegrated_condition$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_condition_compare$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_unintegrated_condition | p_harmony_condition_compare
Show code
p_unintegrated_sample <- DimPlot (
SO_astro,
reduction = "umap_astro_unintegrated" ,
group.by = "sample" ,
pt.size = 0.35
) +
labs (
title = "Before Harmony" ,
subtitle = "Colored by sample" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_harmony_sample_compare <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "sample" ,
pt.size = 0.35
) +
labs (
title = "After Harmony" ,
subtitle = "Colored by sample" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_unintegrated_sample$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_harmony_sample_compare$ layers[[1 ]]$ aes_params$ alpha <- 0.4
p_unintegrated_sample | p_harmony_sample_compare
Show code
p_unintegrated_cluster <- DimPlot (
SO_astro,
reduction = "umap_astro_unintegrated" ,
group.by = "seurat_clusters_unintegrated" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.4
) +
labs (
title = "Before Harmony" ,
subtitle = "Unintegrated RNA PCA clusters" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_harmony_cluster_compare <- DimPlot (
SO_astro,
reduction = "umap_astro_harmony" ,
group.by = "seurat_clusters_harmony" ,
label = TRUE ,
repel = TRUE ,
pt.size = 0.4
) +
labs (
title = "After Harmony" ,
subtitle = "Harmony-based clusters" ,
x = "UMAP 1" ,
y = "UMAP 2"
)
p_unintegrated_cluster$ layers[[1 ]]$ aes_params$ alpha <- 0.5
p_harmony_cluster_compare$ layers[[1 ]]$ aes_params$ alpha <- 0.5
p_unintegrated_cluster | p_harmony_cluster_compare
Show code
cluster_comparison_table <- table (
unintegrated_cluster = SO_astro$ seurat_clusters_unintegrated,
harmony_cluster = SO_astro$ seurat_clusters_harmony
)
write.csv (
as.data.frame.matrix (cluster_comparison_table),
file.path (results_dir, "Step11_5_unintegrated_vs_harmony_cluster_table.csv" )
)
cluster_comparison_table
harmony_cluster
unintegrated_cluster 0 1 2 3 4 5 6 7 8 9
0 6 209 231 434 1 21 0 0 2 6
1 50 295 352 64 11 8 5 29 4 11
10 1 0 9 0 50 0 0 0 0 1
11 0 0 0 1 0 1 0 0 36 0
2 225 92 210 7 0 9 167 7 2 0
3 2 61 103 1 333 4 0 4 2 2
4 339 0 70 3 6 5 1 53 2 1
5 215 164 42 2 12 11 5 3 3 1
6 285 1 124 7 0 3 0 1 3 1
7 0 384 5 7 0 8 0 2 2 1
8 135 40 68 0 0 4 1 0 0 0
9 0 2 3 0 0 161 0 3 1 0
Step 11.7: Validate Astrocyte and Reactive Astrocyte Markers
Here we check expression, not only differential expression.
The point is:
canonical astrocyte marker expression = confirms broad astrocyte identity
top DE marker after astrocyte reclustering = identifies astrocyte state/subtype differences
After subsetting astrocytes, canonical astrocyte markers may not appear as top differential markers between astrocyte subclusters because they are expected to be broadly expressed across astrocytes.
Show code
astro_marker_genes <- c (
"AQP4" , "GFAP" , "ALDH1L1" , "SLC1A3" , "S100B" ,
"CHI3L1" , "TNC" , "C3" , "SERPINA3" , "HSPB1" , "VIM"
)
astro_marker_present <- intersect (
astro_marker_genes,
rownames (SO_astro)
)
if (length (astro_marker_present) == 0 ) {
stop ("None of the astrocyte marker genes were found in SO_astro." )
}
astro_marker_present
[1] "AQP4" "GFAP" "ALDH1L1" "SLC1A3" "S100B" "CHI3L1"
[7] "TNC" "C3" "SERPINA3" "HSPB1" "VIM"
Show code
FeaturePlot (
SO_astro,
features = astro_marker_present,
reduction = "umap_astro_harmony" ,
ncol = 4 ,
pt.size = 0.5
)
Show code
DotPlot (
SO_astro,
features = astro_marker_present,
group.by = "seurat_clusters_harmony"
) +
RotatedAxis () +
labs (title = "Canonical and Reactive Astrocyte Marker Expression by Harmony Cluster" )
Step 11.8: Define Astrocyte Reactivity Modules
Reactive-like states are inferred inside the astrocyte subset using marker expression and module scores.
Show code
astro_identity_genes <- c (
"AQP4" , "GFAP" , "ALDH1L1" , "SLC1A3" , "S100B"
)
reactive_astrocyte_genes <- c (
"CHI3L1" , "TNC" , "C3" , "SERPINA3" , "HSPB1" , "VIM"
)
inflammatory_genes <- c (
"IL6" , "CXCL10" , "CCL2" , "CXCL8" , "NFKBIA" , "TNFAIP3" , "SOCS3"
)
stress_genes <- c (
"HSPA1A" , "HSPA1B" , "HSPB1" , "DDIT3" , "FOS" , "JUN" , "ATF3"
)
ecm_remodeling_genes <- c (
"TNC" , "VIM" , "SERPINA3" , "COL4A1" , "COL4A2" , "MMP2" , "MMP9"
)
module_gene_list <- list (
Astro_identity = intersect (astro_identity_genes, rownames (SO_astro)),
Reactive_astro = intersect (reactive_astrocyte_genes, rownames (SO_astro)),
Inflammatory = intersect (inflammatory_genes, rownames (SO_astro)),
Stress = intersect (stress_genes, rownames (SO_astro)),
ECM = intersect (ecm_remodeling_genes, rownames (SO_astro))
)
module_gene_list
$Astro_identity
[1] "AQP4" "GFAP" "ALDH1L1" "SLC1A3" "S100B"
$Reactive_astro
[1] "CHI3L1" "TNC" "C3" "SERPINA3" "HSPB1" "VIM"
$Inflammatory
[1] "IL6" "CXCL10" "CCL2" "CXCL8" "NFKBIA" "TNFAIP3" "SOCS3"
$Stress
[1] "HSPA1A" "HSPA1B" "HSPB1" "DDIT3" "FOS" "JUN" "ATF3"
$ECM
[1] "TNC" "VIM" "SERPINA3" "COL4A1" "COL4A2" "MMP2" "MMP9"
Show code
empty_modules <- names (module_gene_list)[lengths (module_gene_list) == 0 ]
if (length (empty_modules) > 0 ) {
warning (
paste (
"These modules have no genes present in the object:" ,
paste (empty_modules, collapse = ", " )
)
)
}
Show code
for (module_name in names (module_gene_list)) {
genes_use <- module_gene_list[[module_name]]
if (length (genes_use) > 0 ) {
SO_astro <- AddModuleScore (
SO_astro,
features = list (genes_use),
name = paste0 (module_name, "_score" )
)
}
}
score_features <- c (
"Astro_identity_score1" ,
"Reactive_astro_score1" ,
"Inflammatory_score1" ,
"Stress_score1" ,
"ECM_score1"
)
score_features_present <- intersect (
score_features,
colnames (SO_astro@ meta.data)
)
if (length (score_features_present) == 0 ) {
stop ("No module score columns found." )
}
score_features_present
[1] "Astro_identity_score1" "Reactive_astro_score1" "Inflammatory_score1"
[4] "Stress_score1" "ECM_score1"
Step 11.9: Visualize Astrocyte Reactivity Modules
Show code
FeaturePlot (
SO_astro,
features = score_features_present,
reduction = "umap_astro_harmony" ,
ncol = 3 ,
pt.size = 0.5
)
Show code
reactivity_score_features <- intersect (
c (
"Reactive_astro_score1" ,
"Inflammatory_score1" ,
"Stress_score1" ,
"ECM_score1"
),
colnames (SO_astro@ meta.data)
)
VlnPlot (
SO_astro,
features = reactivity_score_features,
group.by = "seurat_clusters_harmony" ,
pt.size = 0
)
Step 11.10: Summarize Astrocyte Clusters
We summarize each astrocyte cluster by:
cell number
module scores
condition composition
sample composition
This helps distinguish robust astrocyte states from clusters dominated by a single sample.
Show code
harmony_cluster_condition_table <- table (
harmony_cluster = SO_astro$ seurat_clusters_harmony,
condition = SO_astro$ condition
)
harmony_cluster_sample_table <- table (
harmony_cluster = SO_astro$ seurat_clusters_harmony,
sample = SO_astro$ sample
)
harmony_cluster_sex_table <- table (
harmony_cluster = SO_astro$ seurat_clusters_harmony,
sex = SO_astro$ sex
)
harmony_cluster_condition_table
condition
harmony_cluster C PD
0 624 634
1 742 506
2 449 768
3 30 496
4 52 361
5 135 100
6 177 2
7 78 24
8 37 20
9 12 12
Show code
harmony_cluster_sample_table
sample
harmony_cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F
0 33 8 214 222 120 27 189
1 54 17 165 457 40 9 90
2 68 31 54 216 66 14 71
3 7 0 2 19 2 0 1
4 8 22 12 0 6 4 7
5 22 34 15 28 17 19 11
6 2 5 6 162 2 0 0
7 6 17 5 16 17 17 4
8 3 6 6 18 3 1 4
9 4 1 2 1 2 2 1
sample
harmony_cluster PDSN02_F PDSN04_F PDSN06_M PDSN07_M PDSN09_M
0 3 6 274 136 26
1 62 210 27 40 77
2 111 238 164 78 106
3 1 422 10 3 59
4 351 1 1 0 1
5 8 27 36 9 9
6 0 0 0 1 1
7 5 7 3 1 4
8 3 6 2 1 4
9 2 6 2 0 1
Show code
harmony_cluster_sex_table
sex
harmony_cluster Female Male Unknown
0 453 805 0
1 598 650 0
2 573 644 0
3 433 93 0
4 401 12 0
5 117 118 0
6 13 166 0
7 44 58 0
8 28 29 0
9 16 8 0
Show code
harmony_cluster_summary <- SO_astro@ meta.data %>%
mutate (
harmony_cluster = as.character (seurat_clusters_harmony)
) %>%
group_by (harmony_cluster) %>%
summarise (
n_cells = n (),
mean_astro_identity = mean (Astro_identity_score1, na.rm = TRUE ),
mean_reactive = mean (Reactive_astro_score1, na.rm = TRUE ),
mean_inflammatory = mean (Inflammatory_score1, na.rm = TRUE ),
mean_stress = mean (Stress_score1, na.rm = TRUE ),
mean_ecm = mean (ECM_score1, na.rm = TRUE ),
percent_PD = mean (condition == "PD" , na.rm = TRUE ),
percent_C = mean (condition == "C" , na.rm = TRUE ),
percent_female = mean (sex == "Female" , na.rm = TRUE ),
percent_male = mean (sex == "Male" , na.rm = TRUE ),
dominant_sex = names (sort (table (sex), decreasing = TRUE ))[1 ],
n_samples = n_distinct (sample),
dominant_condition = names (sort (table (condition), decreasing = TRUE ))[1 ],
dominant_sample = names (sort (table (sample), decreasing = TRUE ))[1 ],
.groups = "drop"
) %>%
arrange (desc (mean_reactive))
harmony_cluster_summary
Step 11.11: Define Astrocyte States
Harmony-corrected clusters are used to define astrocyte states conservatively.
State labels are intentionally left unassigned at this stage and should be manually defined after inspecting:
module scores
canonical and reactive marker expression
condition composition
sample composition
cluster robustness
Show code
SO_astro$ astro_cluster_harmony <- as.character (SO_astro$ seurat_clusters_harmony)
pd_reactive <- c ("3" )
homeostatic <- c ("0" , "1" , "2" , "5" )
other <- c ("4" , "6" , "7" , "8" , "9" )
SO_astro$ astro_state_harmony <- case_when (
SO_astro$ astro_cluster_harmony %in% pd_reactive ~ "PD Reactive" ,
SO_astro$ astro_cluster_harmony %in% homeostatic ~ "Homeostatic" ,
SO_astro$ astro_cluster_harmony %in% other ~ "Other" ,
TRUE ~ "Unassigned"
)
SO_astro$ astro_state_harmony <- factor (
SO_astro$ astro_state_harmony,
levels = c (
"PD Reactive" ,
"Homeostatic" ,
"Other" ,
"Unassigned"
)
)
table (SO_astro$ astro_cluster_harmony, SO_astro$ astro_state_harmony)
PD Reactive Homeostatic Other Unassigned
0 0 1258 0 0
1 0 1248 0 0
2 0 1217 0 0
3 526 0 0 0
4 0 0 413 0
5 0 235 0 0
6 0 0 179 0
7 0 0 102 0
8 0 0 57 0
9 0 0 24 0
Show code
table (SO_astro$ astro_state_harmony, SO_astro$ condition)
C PD
PD Reactive 30 496
Homeostatic 1950 2008
Other 356 419
Unassigned 0 0
Step 11.12: Add External GENCODE Annotation for Context-Based lncRNA Subtype Classification
Recent GENCODE releases use a broad lncRNA gene biotype and do not always preserve older lncRNA subtypes such as lincRNA, antisense, sense_intronic, or sense_overlapping.
Therefore, lncRNAs are classified here by genomic context relative to protein-coding genes using the comprehensive GENCODE annotation.
The context-based categories are:
LINC
= lncRNA genes without overlap with protein-coding genes
Antisense
= lncRNA genes overlapping protein-coding genes on the opposite strand
Sense_gene_body_contained
= lncRNA genes fully contained within protein-coding genes on the same strand
Sense_overlapping
= lncRNA genes overlapping protein-coding genes on the same strand, but not fully intronic
Other_lncRNA_context
= lncRNA genes that do not clearly fit the above categories
Show code
reference_dir <- file.path (project_dir, "reference" )
dir.create (reference_dir, recursive = TRUE , showWarnings = FALSE )
gencode_comprehensive_gtf_path <- file.path (
project_dir,
"reference" ,
"gencode.v49.annotation.gtf.gz"
)
if (! file.exists (gencode_comprehensive_gtf_path)) {
stop ("GENCODE comprehensive GTF file not found. Download it from Terminal first." )
}
gencode_all_gtf <- rtracklayer:: import (gencode_comprehensive_gtf_path)
gencode_all_genes <- as.data.frame (gencode_all_gtf) %>%
filter (type == "gene" ) %>%
as_tibble () %>%
transmute (
seqnames = as.character (seqnames),
start = start,
end = end,
strand = as.character (strand),
gene_id = gene_id,
gene_id_clean = stringr:: str_replace (gene_id, " \\ ..*$" , "" ),
gene_name = gene_name,
gene_type = gene_type
) %>%
distinct ()
gencode_gene_type_summary <- gencode_all_genes %>%
count (gene_type, name = "n_genes" ) %>%
arrange (desc (n_genes))
write.csv (
gencode_gene_type_summary,
file.path (results_dir, "Step11_5_GENCODE_gene_type_summary.csv" ),
row.names = FALSE
)
gencode_gene_type_summary %>%
filter (gene_type %in% c ("protein_coding" , "lncRNA" ))
Show code
library (GenomicRanges)
lncRNA_gr <- gencode_all_genes %>%
filter (gene_type == "lncRNA" ) %>%
makeGRangesFromDataFrame (
keep.extra.columns = TRUE ,
seqnames.field = "seqnames" ,
start.field = "start" ,
end.field = "end" ,
strand.field = "strand"
)
protein_coding_gr <- gencode_all_genes %>%
filter (gene_type == "protein_coding" ) %>%
makeGRangesFromDataFrame (
keep.extra.columns = TRUE ,
seqnames.field = "seqnames" ,
start.field = "start" ,
end.field = "end" ,
strand.field = "strand"
)
# Any overlap with protein-coding genes, ignoring strand.
any_pc_overlap_hits <- findOverlaps (
lncRNA_gr,
protein_coding_gr,
ignore.strand = TRUE
)
overlap_pairs <- tibble (
lncRNA_gene_id = mcols (lncRNA_gr)$ gene_id_clean[queryHits (any_pc_overlap_hits)],
lncRNA_gene_name = mcols (lncRNA_gr)$ gene_name[queryHits (any_pc_overlap_hits)],
lncRNA_strand = as.character (strand (lncRNA_gr))[queryHits (any_pc_overlap_hits)],
lncRNA_start = start (lncRNA_gr)[queryHits (any_pc_overlap_hits)],
lncRNA_end = end (lncRNA_gr)[queryHits (any_pc_overlap_hits)],
pc_gene_id = mcols (protein_coding_gr)$ gene_id_clean[subjectHits (any_pc_overlap_hits)],
pc_gene_name = mcols (protein_coding_gr)$ gene_name[subjectHits (any_pc_overlap_hits)],
pc_strand = as.character (strand (protein_coding_gr))[subjectHits (any_pc_overlap_hits)],
pc_start = start (protein_coding_gr)[subjectHits (any_pc_overlap_hits)],
pc_end = end (protein_coding_gr)[subjectHits (any_pc_overlap_hits)]
)
# Opposite-strand protein-coding overlap.
antisense_lncRNA_ids <- overlap_pairs %>%
filter (
lncRNA_strand != pc_strand,
lncRNA_strand %in% c ("+" , "-" ),
pc_strand %in% c ("+" , "-" )
) %>%
pull (lncRNA_gene_id) %>%
unique ()
# Same-strand protein-coding overlap.
same_strand_pairs <- overlap_pairs %>%
filter (
lncRNA_strand == pc_strand,
lncRNA_strand %in% c ("+" , "-" ),
pc_strand %in% c ("+" , "-" )
)
# Sense-intronic:
# lncRNA is fully contained within a protein-coding gene on the same strand.
sense_intronic_lncRNA_ids <- same_strand_pairs %>%
filter (
lncRNA_start >= pc_start,
lncRNA_end <= pc_end
) %>%
pull (lncRNA_gene_id) %>%
unique ()
# Sense-overlapping:
# lncRNA overlaps a protein-coding gene on the same strand,
# but is not fully contained within it.
sense_overlapping_lncRNA_ids <- same_strand_pairs %>%
filter (
! (lncRNA_start >= pc_start & lncRNA_end <= pc_end)
) %>%
pull (lncRNA_gene_id) %>%
unique ()
# LINC:
# lncRNA has no overlap with any protein-coding gene.
pc_overlapping_lncRNA_ids <- overlap_pairs %>%
pull (lncRNA_gene_id) %>%
unique ()
linc_lncRNA_ids <- setdiff (
mcols (lncRNA_gr)$ gene_id_clean,
pc_overlapping_lncRNA_ids
)
# Priority:
# 1. Antisense
# 2. Sense_gene_body_contained
# 3. Sense_overlapping
# 4. LINC
# 5. Other_lncRNA_context
#
# If one lncRNA overlaps multiple protein-coding genes in multiple ways,
# the most interpretable overlap class is assigned by this priority.
gencode_lncRNA_context_anno <- tibble (
gene_id_clean = mcols (lncRNA_gr)$ gene_id_clean,
gencode_gene_id = mcols (lncRNA_gr)$ gene_id,
gencode_gene_name = mcols (lncRNA_gr)$ gene_name,
gencode_gene_type = mcols (lncRNA_gr)$ gene_type,
lncRNA_type = case_when (
gene_id_clean %in% antisense_lncRNA_ids ~ "Antisense" ,
gene_id_clean %in% sense_intronic_lncRNA_ids ~ "Sense_gene_body_contained" ,
gene_id_clean %in% sense_overlapping_lncRNA_ids ~ "Sense_overlapping" ,
gene_id_clean %in% linc_lncRNA_ids ~ "LINC" ,
TRUE ~ "Other_lncRNA_context"
)
)
table (gencode_lncRNA_context_anno$ lncRNA_type)
write.csv (
gencode_lncRNA_context_anno,
file.path (results_dir, "Step11_5_GENCODE_lncRNA_context_annotation.csv" ),
row.names = FALSE
)
write.csv (
overlap_pairs,
file.path (results_dir, "Step11_5_GENCODE_lncRNA_protein_coding_overlap_pairs.csv" ),
row.names = FALSE
)
Show code
gene_meta <- SO_astro[["RNA" ]][[]] %>%
tibble:: rownames_to_column ("gene" ) %>%
mutate (
gene_id_clean = stringr:: str_replace (gene, " \\ ..*$" , "" )
)
# Match by Ensembl gene ID.
gene_meta_context_ensembl <- gene_meta %>%
left_join (
gencode_lncRNA_context_anno,
by = "gene_id_clean"
)
n_context_matched_by_ensembl <- sum (! is.na (gene_meta_context_ensembl$ lncRNA_type))
# Match by gene symbol.
# Keep both the Seurat gene name and the GENCODE gene name.
gene_meta_context_symbol <- gene_meta %>%
left_join (
gencode_lncRNA_context_anno %>%
mutate (gencode_gene_name_join = gencode_gene_name) %>%
select (
gencode_gene_name_join,
gencode_gene_id,
gencode_gene_name,
gene_id_clean_gencode = gene_id_clean,
lncRNA_type
),
by = c ("gene" = "gencode_gene_name_join" )
)
n_context_matched_by_symbol <- sum (! is.na (gene_meta_context_symbol$ lncRNA_type))
n_context_matched_by_ensembl
Show code
n_context_matched_by_symbol
Show code
if (n_context_matched_by_ensembl >= n_context_matched_by_symbol) {
gene_meta <- gene_meta_context_ensembl
lncRNA_context_match_method <- "Ensembl_gene_id"
} else {
gene_meta <- gene_meta_context_symbol
lncRNA_context_match_method <- "gene_symbol"
}
lncRNA_context_match_method
Show code
gene_meta <- gene_meta %>%
mutate (
lncRNA_type = if_else (
is.na (lncRNA_type),
"Not_lncRNA_or_unmatched" ,
lncRNA_type
),
is_lncRNA = lncRNA_type != "Not_lncRNA_or_unmatched"
)
lncRNA_genes <- gene_meta %>%
filter (is_lncRNA) %>%
pull (gene) %>%
intersect (rownames (SO_astro))
length (lncRNA_genes)
Show code
table (gene_meta$ lncRNA_type)
Antisense LINC Not_lncRNA_or_unmatched
9465 12339 24043
Sense_gene_body_contained Sense_overlapping
1498 444
Show code
write.csv (
gene_meta %>%
filter (is_lncRNA),
file.path (results_dir, "Step11_5_astrocyte_lncRNA_gene_annotation_context_based.csv" ),
row.names = FALSE
)
write.csv (
as.data.frame (table (gene_meta$ lncRNA_type)),
file.path (results_dir, "Step11_5_lncRNA_type_counts_context_based.csv" ),
row.names = FALSE
)
Show code
lncRNA_type_detection_summary <- gene_meta %>%
filter (is_lncRNA) %>%
count (lncRNA_type, name = "n_genes" ) %>%
arrange (desc (n_genes))
lncRNA_type_detection_summary
Show code
write.csv (
lncRNA_type_detection_summary,
file.path (results_dir, "Step11_5_lncRNA_context_type_summary.csv" ),
row.names = FALSE
)
Show code
get_assay_data_safe <- function (object, assay = "RNA" , layer_or_slot = "counts" ) {
tryCatch (
GetAssayData (object, assay = assay, layer = layer_or_slot),
error = function (e) {
GetAssayData (object, assay = assay, slot = layer_or_slot)
}
)
}
counts_mat <- get_assay_data_safe (
SO_astro,
assay = "RNA" ,
layer_or_slot = "counts"
)
gene_detection_summary <- tibble (
gene = rownames (counts_mat),
n_cells_detected = Matrix:: rowSums (counts_mat > 0 ),
pct_cells_detected = Matrix:: rowSums (counts_mat > 0 ) / ncol (counts_mat)
) %>%
left_join (
gene_meta %>%
select (
gene,
gencode_gene_id,
gencode_gene_name,
lncRNA_type,
is_lncRNA
),
by = "gene"
)
lncRNA_detection_summary <- gene_detection_summary %>%
filter (is_lncRNA) %>%
arrange (desc (n_cells_detected))
lncRNA_detection_summary %>%
slice_head (n = 20 )
Show code
write.csv (
lncRNA_detection_summary,
file.path (results_dir, "Step11_5_astrocyte_lncRNA_detection_summary.csv" ),
row.names = FALSE
)
Step 11.13: Global lncRNA Expression Fraction in Astrocytes
This section adds a per-cell lncRNA expression fraction. This helps check whether lncRNA expression is broadly distributed or concentrated in specific clusters, samples, or conditions.
Show code
if (length (lncRNA_genes) == 0 ) {
stop ("No lncRNA genes were identified from the available gene annotation." )
}
SO_astro[["percent_lncRNA" ]] <- PercentageFeatureSet (
SO_astro,
features = lncRNA_genes,
assay = "RNA"
)
summary (SO_astro$ percent_lncRNA)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.812 12.814 14.756 15.742 17.681 47.784
Show code
FeaturePlot (
SO_astro,
features = "percent_lncRNA" ,
reduction = "umap_astro_harmony" ,
pt.size = 0.5
) +
labs (title = "Percent lncRNA Expression in Astrocytes" )
Show code
p_lnc_cluster <- VlnPlot (
SO_astro,
features = "percent_lncRNA" ,
group.by = "seurat_clusters_harmony" ,
pt.size = 0
) +
labs (title = "Percent lncRNA by Cluster" )
p_lnc_condition <- VlnPlot (
SO_astro,
features = "percent_lncRNA" ,
group.by = "condition" ,
pt.size = 0
) +
labs (title = "Percent lncRNA by Condition" )
p_lnc_cluster | p_lnc_condition
Step 11.14: Identify lncRNA Markers of Astrocyte Clusters
Here we identify lncRNAs that are enriched in each astrocyte cluster.
Harmony is used only to define clusters. Differential expression is performed on the RNA assay.
Show code
DefaultAssay (SO_astro) <- "RNA"
Idents (SO_astro) <- "seurat_clusters_harmony"
astro_harmony_markers <- FindAllMarkers (
SO_astro,
assay = "RNA" ,
only.pos = TRUE ,
min.pct = 0.05 ,
logfc.threshold = 0.25 ,
verbose = FALSE
)
astro_lncRNA_markers <- astro_harmony_markers %>%
left_join (
gene_meta %>%
select (
gene,
gencode_gene_id,
gencode_gene_name,
lncRNA_type,
is_lncRNA
),
by = "gene"
) %>%
filter (
is_lncRNA,
p_val_adj < 0.05
) %>%
mutate (
pct_diff = pct.1 - pct.2
) %>%
arrange (
cluster,
desc (avg_log2FC),
desc (pct_diff),
p_val_adj
)
astro_lncRNA_markers_top_strict <- astro_lncRNA_markers %>%
filter (pct_diff > 0.05 ) %>%
group_by (cluster) %>%
slice_head (n = 5 ) %>%
ungroup ()
astro_lncRNA_markers_top_per_cluster <- astro_lncRNA_markers %>%
group_by (cluster) %>%
slice_head (n = 5 ) %>%
ungroup ()
write.csv (
astro_harmony_markers,
file.path (results_dir, "Step11_5_harmony_cluster_all_markers.csv" ),
row.names = FALSE
)
write.csv (
astro_lncRNA_markers,
file.path (results_dir, "Step11_5_harmony_cluster_lncRNA_markers.csv" ),
row.names = FALSE
)
write.csv (
astro_lncRNA_markers_top_strict,
file.path (results_dir, "Step11_5_harmony_cluster_top_lncRNA_markers_strict_pctdiff.csv" ),
row.names = FALSE
)
write.csv (
astro_lncRNA_markers_top_per_cluster,
file.path (results_dir, "Step11_5_harmony_cluster_top_lncRNA_markers_per_cluster.csv" ),
row.names = FALSE
)
Show code
lncRNA_marker_type_summary <- astro_lncRNA_markers %>%
count (cluster, lncRNA_type, name = "n_marker_lncRNAs" ) %>%
arrange (cluster, desc (n_marker_lncRNAs))
lncRNA_marker_type_summary
Show code
write.csv (
lncRNA_marker_type_summary,
file.path (results_dir, "Step11_5_lncRNA_marker_type_summary_by_cluster.csv" ),
row.names = FALSE
)
Step 11.15: Visualize lncRNA Marker Expression Patterns
These plots directly address the question of where lncRNAs are expressed across astrocyte clusters. The heatmap and dot plot should be interpreted as expression-pattern summaries, not as proof of function.
Show code
top_lncRNA_features <- astro_lncRNA_markers_top_per_cluster %>%
pull (gene) %>%
unique () %>%
intersect (rownames (SO_astro))
# Fallback: if few marker lncRNAs pass the marker threshold, use the most detected lncRNAs.
if (length (top_lncRNA_features) < 2 ) {
top_lncRNA_features <- lncRNA_detection_summary %>%
slice_head (n = 20 ) %>%
pull (gene) %>%
unique () %>%
intersect (rownames (SO_astro))
}
top_lncRNA_features
[1] "ENSG00000234147" "LINC02277" "ENSG00000302932" "ENSG00000300891"
[5] "ENSG00000233928" "ENSG00000291054" "FAM182B" "OXA1L-DT"
[9] "UGDH-AS1" "ATP1A1-AS1" "ENSG00000305928" "ENSG00000302585"
[13] "ENSG00000302111" "ENSG00000230258" "LINC01497" "LINC02552"
[17] "ENSG00000295757" "TRD-AS1" "ENSG00000237480" "LINC03077"
[21] "ENSG00000289591" "SH3TC2-DT" "LINC01608" "SAMMSON"
[25] "ENSG00000289279" "FOXG1-AS1" "LINC01551" "LINC02984"
[29] "LINC02715" "LINC00609" "ENSG00000287684" "ENSG00000307137"
[33] "ENSG00000290441" "ENSG00000257258" "ENSG00000302619" "ENSG00000294326"
[37] "ENSG00000303315" "ENSG00000309644" "LINC02381" "ENSG00000258711"
[41] "ENSG00000303264" "ENSG00000231873" "ENSG00000258520" "ENSG00000305782"
[45] "LINC01141" "ENSG00000307068" "ENSG00000272783" "ENSG00000287470"
[49] "PCAT19" "ENSG00000289404"
Show code
if (length (top_lncRNA_features) > 0 ) {
DotPlot (
SO_astro,
features = top_lncRNA_features,
assay = "RNA" ,
group.by = "seurat_clusters_harmony"
) +
RotatedAxis () +
guides (
color = guide_colorbar (title = "Average \n Expression" ),
size = guide_legend (title = "Percent \n Expressed" )
) +
labs (
title = "Top lncRNA Marker Expression by Harmony-Based Astrocyte Cluster" ,
x = "lncRNA" ,
y = "Harmony cluster"
)
} else {
message ("No lncRNA features available for DotPlot." )
}
Show code
if (length (top_lncRNA_features) > 1 ) {
SO_astro <- ScaleData (
SO_astro,
features = top_lncRNA_features,
assay = "RNA" ,
verbose = FALSE
)
DoHeatmap (
SO_astro,
features = top_lncRNA_features,
assay = "RNA" ,
group.by = "seurat_clusters_harmony"
) +
NoLegend () +
labs (title = "Top lncRNA Marker Heatmap by Harmony-Based Astrocyte Cluster" )
} else {
message ("Not enough lncRNA features for Seurat heatmap." )
}
Warning: Different features in new layer data than already exists for
scale.data
Show code
DefaultAssay (SO_astro) <- "RNA"
expr_mat <- GetAssayData (
SO_astro,
assay = "RNA" ,
layer = "data"
)
lncRNA_genes_use <- intersect (lncRNA_genes, rownames (expr_mat))
lncRNA_detected <- lncRNA_genes_use[
Matrix:: rowSums (expr_mat[lncRNA_genes_use, ] > 0 ) / ncol (expr_mat) >= 0.01
]
if (length (lncRNA_detected) < 2 ) {
stop ("Not enough detected lncRNAs for variable lncRNA heatmap." )
}
lncRNA_variance <- Matrix:: rowMeans (expr_mat[lncRNA_detected, ]^ 2 ) -
Matrix:: rowMeans (expr_mat[lncRNA_detected, ])^ 2
top_variable_lncRNAs <- names (sort (lncRNA_variance, decreasing = TRUE )) %>%
head (n = min (1000 , length (.)))
SO_astro$ cluster_condition <- paste0 (
"Cl" ,
SO_astro$ seurat_clusters_harmony,
"_" ,
SO_astro$ condition
)
avg_variable_lncRNA <- AverageExpression (
SO_astro,
assays = "RNA" ,
features = top_variable_lncRNAs,
group.by = "cluster_condition" ,
slot = "data" ,
verbose = FALSE
)$ RNA %>%
as.matrix ()
avg_variable_lncRNA_z <- t (scale (t (avg_variable_lncRNA)))
avg_variable_lncRNA_z[is.na (avg_variable_lncRNA_z)] <- 0
avg_variable_lncRNA_z <- pmax (pmin (avg_variable_lncRNA_z, 2 ), - 2 )
heatmap_breaks <- seq (- 2 , 2 , length.out = 101 )
column_annotation <- data.frame (
group = colnames (avg_variable_lncRNA_z)
)
column_annotation$ cluster <- stringr:: str_match (
column_annotation$ group,
"^Cl([^-]+)-"
)[, 2 ]
column_annotation$ condition <- stringr:: str_match (
column_annotation$ group,
"-([^-]+)$"
)[, 2 ]
rownames (column_annotation) <- column_annotation$ group
column_annotation$ group <- NULL
column_annotation$ cluster <- factor (column_annotation$ cluster)
column_annotation$ condition <- factor (
column_annotation$ condition,
levels = c ("C" , "PD" )
)
column_annotation
Show code
pheatmap:: pheatmap (
avg_variable_lncRNA_z,
annotation_col = column_annotation,
show_rownames = FALSE ,
cluster_rows = TRUE ,
cluster_cols = TRUE ,
breaks = heatmap_breaks,
main = "Top Variable lncRNA Average Expression by Astrocyte Cluster and Condition" ,
fontsize_col = 8 ,
border_color = NA
)
Show code
write.csv (
top_variable_lncRNAs,
file.path (results_dir, "Step11_5_top_variable_lncRNAs_for_heatmap.csv" ),
row.names = FALSE
)
write.csv (
avg_variable_lncRNA,
file.path (results_dir, "Step11_5_average_variable_lncRNA_expression_by_cluster_condition.csv" )
)
write.csv (
avg_variable_lncRNA_z,
file.path (results_dir, "Step11_5_average_variable_lncRNA_expression_by_cluster_condition_zscore.csv" )
)
Step 11.16: Save Astrocyte Object and Results
Show code
saveRDS (
SO_astro,
file.path (results_dir, "SO_astro_step11_5_fernando_harmony_reclustered_scored.rds" )
)
write.csv (
harmony_cluster_summary,
file.path (results_dir, "Step11_5_harmony_cluster_summary.csv" ),
row.names = FALSE
)
write.csv (
as.data.frame.matrix (harmony_cluster_condition_table),
file.path (results_dir, "Step11_5_harmony_cluster_condition_table.csv" )
)
write.csv (
as.data.frame.matrix (harmony_cluster_sample_table),
file.path (results_dir, "Step11_5_harmony_cluster_sample_table.csv" )
)
write.csv (
as.data.frame.matrix (harmony_cluster_sex_table),
file.path (results_dir, "Step11_5_harmony_cluster_sex_table.csv" )
)
write.csv (
data.frame (
setting = c (
"input_object" ,
"annotation_column" ,
"condition_column" ,
"sample_column" ,
"astrocyte_labels_used" ,
"normalization" ,
"variable_features" ,
"pca_dimensions_computed" ,
"harmony_group_var" ,
"dims_use_harmony" ,
"resolution_use_harmony" ,
"primary_cluster_column" ,
"primary_umap_reduction" ,
"optional_unintegrated_dims" ,
"optional_unintegrated_resolution" ,
"lncRNA_annotation_method" ,
"lncRNA_context_match_method" ,
"n_lncRNA_genes_detected_in_astrocytes" ,
"n_astrocyte_cells"
),
value = as.character (c (
fernando_path,
"Cell_types" ,
"condition" ,
"Sample" ,
paste (target_astrocytes, collapse = "; " ),
"LogNormalize" ,
2000 ,
40 ,
harmony_group_var,
paste (dims_use_harmony, collapse = "," ),
resolution_use_harmony,
"seurat_clusters_harmony" ,
"umap_astro_harmony" ,
paste (dims_use_unintegrated, collapse = "," ),
resolution_use_unintegrated,
"GENCODE_v49_genomic_context" ,
lncRNA_context_match_method,
length (lncRNA_genes),
ncol (SO_astro)
))
),
file.path (results_dir, "Step11_5_analysis_settings.csv" ),
row.names = FALSE
)
Interpretation Note
This analysis intentionally separates three concepts:
canonical astrocyte marker expression
astrocyte subcluster structure
reactive / inflammatory / stress / ECM module scores
lncRNA marker expression and lncRNA subtype annotation
After subsetting astrocytes, canonical astrocyte markers may not appear as top subcluster markers because they are expected to be broadly expressed across astrocytes.
Therefore:
AQP4 / GFAP / ALDH1L1 / SLC1A3 expression confirms astrocyte identity.
Reactive / inflammatory / stress / ECM scores help interpret astrocyte states..
Differential expression should be performed using the RNA assay, not Harmony-corrected embeddings.
Recent GENCODE releases use a broad lncRNA biotype. Therefore, lncRNA candidates were annotated by genomic context relative to protein-coding genes using GENCODE v49. LncRNAs without protein-coding gene overlap were labeled LINC. LncRNAs overlapping protein-coding genes on the opposite strand were labeled Antisense. Same-strand lncRNAs fully contained within protein-coding gene bodies were labeled Sense_gene_body_contained, and same-strand partial overlaps were labeled Sense_overlapping. These labels are context-based annotations, not original GENCODE subtype labels.