Step 7-8: Clustering & UMAP

Setup: Environment and Data

library(Seurat)
library(tidyverse)
library(ggplot2)

# Catch the baton: Load the SO from Step 6 (PCA Completed)
import_path <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026/scripts/SO_04_PCA_Completed.rds"
SO <- readRDS(import_path)

Step 7: Clustering Cells

After dimensionality reduction, we mathematically group the cells based on their similar gene expression profiles. This is a two-step process in Seurat: first, we construct a K-nearest neighbor (KNN) graph, and second, we apply a modularity optimization algorithm to identify the actual clusters.

7.1 Memory Allocation

Single-cell clustering requires significant computational memory. To prevent the R session from crashing due to memory limits during parallel processing, we increase the maximum allowable global object size to 4 GB.

# Increase future.globals.maxSize to 4 GB
options(future.globals.maxSize = 4 * 1024^3)

7.2 Finding Neighbors

We use the FindNeighbors function to construct the KNN graph based on the euclidean distance in PCA space. We use the first 35 Principal Components, as determined by the Elbow Plot in the previous step.

# Find neighbors using the first 35 PCs
SO <- FindNeighbors(SO, dims = 1:35)

7.3 Finding Clusters

Next, we identify the actual clusters using the Louvain algorithm. We set the resolution parameter to 0.5. A higher resolution will yield more, smaller clusters, while a lower resolution will yield fewer, larger clusters.

Note: We set random.seed = 123 to ensure this exact clustering result is mathematically reproducible.

# Find clusters at resolution 0.5
SO <- FindClusters(SO, 
                   resolution = 0.5, 
                   random.seed = 123)

Step 8: Visualizing Cell Clusters (UMAP)

Now that our cells are mathematically clustered in high-dimensional space, we need a way to visualize them.

For this, we use UMAP. Unlike PCA, which is a linear dimensionality reduction method, UMAP is a non-linear technique. It excels at taking the complex, high-dimensional neighborhood graph we built in the previous step and flattening it into 2D space while preserving both the local clustering structure and the broader global relationships between different cell types.

Cells that have highly similar gene expression profiles will cluster closely together, forming distinct visual “islands.”

# 1. Calculate UMAP using the same 35 Principal Components
set.seed(123)
SO <- RunUMAP(object = SO, dims = 1:35, seed.use = 123)

# 2. Generate the UMAP plot with cluster labels
umap_clusters <- DimPlot(SO, 
                         reduction = "umap", 
                         label = TRUE) + NoLegend()

# 3. Display the plot
umap_clusters

UMAP projection of single cells. Each dot represents a single cell, colored by its mathematically assigned cluster.

Step 8.1: Saving the Clustered Data

# Pass the baton: Save the SO for Step 9 (Marker Discovery)
save_path_05 <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026/scripts/SO_05_Clustered.rds"
saveRDS(SO, file = save_path_05)

print("Step 7-8 Complete! Clusters have been identified and UMAP visualized.")
[1] "Step 7-8 Complete! Clusters have been identified and UMAP visualized."

Step 8.5: UMAP by Sample and Harmony Decision

p_sample <- DimPlot(
  SO,
  reduction = "umap",
  group.by = "Sample",
  pt.size = 0.3
) +
  ggtitle("UMAP colored by Sample")

p_sample

p_split <- DimPlot(
  SO,
  reduction = "umap",
  split.by = "Sample",
  pt.size = 0.3,
  ncol = 4
) +
  ggtitle("UMAP split by Sample")

p_split

sample_cluster_table <- table(
  Cluster = SO$seurat_clusters,
  Sample = SO$Sample
)

print("Raw cell counts: cluster x sample")
[1] "Raw cell counts: cluster x sample"
print(sample_cluster_table)
       Sample
Cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F
     0      1702      119     2208      341     1590      363      534      450
     1        35       27     2138      847       83        4      766        6
     2        17       12     2752      699       25        9      214       22
     3        12        0        8        6      111       19       60     3240
     4       174       98      470     1083      260       54      368       13
     5         1        0        5       18        7        1        7        5
     6       329      106      386      602      185       17      115      174
     7       287       27      482      255      191      101      141      124
     8       140        9      226       75      433      102      156     1307
     9       176       33      171       68      208      132      410      115
     10       13      120       24      503       35       58       12       50
     11       34        7       95       42       80       74      354      105
     12        2        0        0       10        0        0        1        0
     13        9       29        2        0        0       26        0      528
     14       17      296        4        3       15       93        0       81
     15      221        1       85        1       18        0      126        0
     16       31       29       15      108        6       44        1        8
     17       10        4       17       26        9        4        6        1
     18       11        1        8        7        9        4       13        6
       Sample
Cluster PDSN04_F PDSN06_M PDSN07_M PDSN09_M
     0       450      177       28      771
     1       475       22        3      102
     2       301       12        0      146
     3        38       29        2       17
     4        10      480      252      157
     5        53     1119      377     1759
     6       587      356      117      274
     7       303      356       96      340
     8        96       41        6       83
     9       165      191        9       99
     10      167       36       23       16
     11       77       81       42       54
     12      880       11        4      117
     13        0        1        1        0
     14        2        0        0        0
     15        1       28        0        0
     16       39        1        1       73
     17        8       11        3       25
     18       11        5        1       12
sample_cluster_prop <- prop.table(sample_cluster_table, margin = 1)

print("Proportion within each cluster:")
[1] "Proportion within each cluster:"
print(round(sample_cluster_prop, 3))
       Sample
Cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F
     0     0.195    0.014    0.253    0.039    0.182    0.042    0.061    0.052
     1     0.008    0.006    0.474    0.188    0.018    0.001    0.170    0.001
     2     0.004    0.003    0.654    0.166    0.006    0.002    0.051    0.005
     3     0.003    0.000    0.002    0.002    0.031    0.005    0.017    0.915
     4     0.051    0.029    0.137    0.317    0.076    0.016    0.108    0.004
     5     0.000    0.000    0.001    0.005    0.002    0.000    0.002    0.001
     6     0.101    0.033    0.119    0.185    0.057    0.005    0.035    0.054
     7     0.106    0.010    0.178    0.094    0.071    0.037    0.052    0.046
     8     0.052    0.003    0.085    0.028    0.162    0.038    0.058    0.489
     9     0.099    0.019    0.096    0.038    0.117    0.074    0.231    0.065
     10    0.012    0.114    0.023    0.476    0.033    0.055    0.011    0.047
     11    0.033    0.007    0.091    0.040    0.077    0.071    0.339    0.100
     12    0.002    0.000    0.000    0.010    0.000    0.000    0.001    0.000
     13    0.015    0.049    0.003    0.000    0.000    0.044    0.000    0.886
     14    0.033    0.579    0.008    0.006    0.029    0.182    0.000    0.159
     15    0.459    0.002    0.177    0.002    0.037    0.000    0.262    0.000
     16    0.087    0.081    0.042    0.303    0.017    0.124    0.003    0.022
     17    0.081    0.032    0.137    0.210    0.073    0.032    0.048    0.008
     18    0.125    0.011    0.091    0.080    0.102    0.045    0.148    0.068
       Sample
Cluster PDSN04_F PDSN06_M PDSN07_M PDSN09_M
     0     0.052    0.020    0.003    0.088
     1     0.105    0.005    0.001    0.023
     2     0.072    0.003    0.000    0.035
     3     0.011    0.008    0.001    0.005
     4     0.003    0.140    0.074    0.046
     5     0.016    0.334    0.112    0.525
     6     0.181    0.110    0.036    0.084
     7     0.112    0.132    0.036    0.126
     8     0.036    0.015    0.002    0.031
     9     0.093    0.107    0.005    0.056
     10    0.158    0.034    0.022    0.015
     11    0.074    0.078    0.040    0.052
     12    0.859    0.011    0.004    0.114
     13    0.000    0.002    0.002    0.000
     14    0.004    0.000    0.000    0.000
     15    0.002    0.058    0.000    0.000
     16    0.110    0.003    0.003    0.205
     17    0.065    0.089    0.024    0.202
     18    0.125    0.057    0.011    0.136
sample_cluster_df <- as.data.frame(sample_cluster_table)

ggplot(sample_cluster_df, aes(x = Cluster, y = Freq, fill = Sample)) +
  geom_bar(stat = "identity", position = "fill", color = "black", linewidth = 0.2) +
  theme_classic() +
  labs(
    title = "Sample composition within each cluster",
    x = "Cluster",
    y = "Proportion"
  )

max_sample_prop_by_cluster <- apply(sample_cluster_prop, 1, max)

decision_df <- data.frame(
  cluster = names(max_sample_prop_by_cluster),
  max_sample_fraction = round(as.numeric(max_sample_prop_by_cluster), 3)
)

print("Maximum sample dominance per cluster:")
[1] "Maximum sample dominance per cluster:"
print(decision_df)
   cluster max_sample_fraction
1        0               0.253
2        1               0.474
3        2               0.654
4        3               0.915
5        4               0.317
6        5               0.525
7        6               0.185
8        7               0.178
9        8               0.489
10       9               0.231
11      10               0.476
12      11               0.339
13      12               0.859
14      13               0.886
15      14               0.579
16      15               0.459
17      16               0.303
18      17               0.210
19      18               0.148
biased_clusters <- decision_df %>%
  filter(max_sample_fraction >= 0.70)

print("Clusters dominated by one sample >= 70%:")
[1] "Clusters dominated by one sample >= 70%:"
print(biased_clusters)
  cluster max_sample_fraction
1       3               0.915
2      12               0.859
3      13               0.886
if (nrow(biased_clusters) >= 2) {
  cat("\nDecision: Harmony is recommended.\n")
  cat("Reason: Multiple clusters are strongly dominated by one sample, suggesting sample/batch-driven structure.\n")
} else if (nrow(biased_clusters) == 1) {
  cat("\nDecision: Harmony may be useful, but inspect biologically first.\n")
  cat("Reason: One cluster is sample-dominated. It could be batch effect, rare biology, or poor-quality cells.\n")
} else {
  cat("\nDecision: Harmony is not strictly necessary from this UMAP/sample check.\n")
  cat("Reason: Clusters are not strongly dominated by single samples.\n")
}

Decision: Harmony is recommended.
Reason: Multiple clusters are strongly dominated by one sample, suggesting sample/batch-driven structure.

Step 8.6: UMAP Before and After Harmony

if (!requireNamespace("harmony", quietly = TRUE)) {
  install.packages("harmony")
}

library(harmony)
p_before_no_label <- DimPlot(
  SO,
  reduction = "umap",
  group.by = "seurat_clusters",
  label = FALSE,
  pt.size = 0.3
) +
  ggtitle("Before Harmony: clusters without labels") +
  NoLegend()

p_before_label <- DimPlot(
  SO,
  reduction = "umap",
  group.by = "seurat_clusters",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.3
) +
  ggtitle("Before Harmony: clusters with labels") +
  NoLegend()

p_before_no_label | p_before_label

# Check PCA elbow numerically
stdev <- SO[["pca"]]@stdev
pct_var <- stdev^2 / sum(stdev^2) * 100
cum_var <- cumsum(pct_var)

pc_df <- data.frame(
  PC = 1:length(stdev),
  pct_var = round(pct_var, 2),
  cum_var = round(cum_var, 2)
)

print(head(pc_df, 50))
   PC pct_var cum_var
1   1   20.11   20.11
2   2   12.74   32.85
3   3   11.48   44.33
4   4    8.48   52.81
5   5    7.36   60.16
6   6    5.38   65.55
7   7    3.59   69.14
8   8    2.52   71.65
9   9    1.94   73.59
10 10    1.87   75.46
11 11    1.69   77.15
12 12    1.47   78.62
13 13    1.28   79.90
14 14    1.23   81.13
15 15    1.11   82.24
16 16    1.02   83.26
17 17    0.94   84.20
18 18    0.84   85.03
19 19    0.80   85.83
20 20    0.69   86.52
21 21    0.67   87.19
22 22    0.63   87.82
23 23    0.57   88.39
24 24    0.56   88.95
25 25    0.55   89.50
26 26    0.53   90.03
27 27    0.52   90.55
28 28    0.51   91.06
29 29    0.50   91.57
30 30    0.49   92.06
31 31    0.48   92.54
32 32    0.47   93.01
33 33    0.45   93.46
34 34    0.44   93.90
35 35    0.42   94.32
36 36    0.42   94.74
37 37    0.41   95.16
38 38    0.41   95.56
39 39    0.40   95.96
40 40    0.39   96.35
41 41    0.39   96.74
42 42    0.39   97.13
43 43    0.37   97.50
44 44    0.37   97.87
45 45    0.37   98.24
46 46    0.37   98.60
47 47    0.36   98.96
48 48    0.35   99.31
49 49    0.35   99.66
50 50    0.34  100.00
# Suggested PC number:
pcs_80 <- which(cum_var >= 80)[1]
pcs_90 <- which(cum_var >= 90)[1]

cat("\nPCs for 80% variance:", pcs_80, "\n")

PCs for 80% variance: 14 
cat("PCs for 90% variance:", pcs_90, "\n")
PCs for 90% variance: 26 
ElbowPlot(SO, ndims = 50)

p_before_condition <- DimPlot(
  SO,
  reduction = "umap",
  group.by = "condition",
  pt.size = 0.3
) +
  ggtitle("Before Harmony: UMAP colored by Condition")

# make points transparent
p_before_condition$layers[[1]]$aes_params$alpha <- 0.3

p_before_condition_split <- DimPlot(
  SO,
  reduction = "umap",
  split.by = "condition",
  pt.size = 0.3,
  ncol = 2
) +
  ggtitle("Before Harmony: UMAP split by Condition")

p_before_condition | p_before_condition_split

SO <- harmony::RunHarmony(
  object = SO,
  group.by.vars = "Sample",
  reduction.use = "pca",
  dims.use = 1:35
)
Initializing centroids
SO <- FindNeighbors(
  SO,
  reduction = "harmony",
  dims = 1:35
)

SO <- FindClusters(
  SO,
  resolution = 0.5,
  random.seed = 123,
  cluster.name = "harmony_clusters"
)
Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 43448
Number of edges: 1721473

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.8981
Number of communities: 17
Elapsed time: 9 seconds
set.seed(123)

SO <- RunUMAP(
  SO,
  reduction = "harmony",
  dims = 1:35,
  reduction.name = "umap_harmony",
  reduction.key = "UMAPHARMONY_",
  seed.use = 123
)
p_after_no_label <- DimPlot(
  SO,
  reduction = "umap_harmony",
  group.by = "harmony_clusters",
  label = FALSE,
  pt.size = 0.3
) +
  ggtitle("After Harmony: clusters without labels") +
  NoLegend()

p_after_label <- DimPlot(
  SO,
  reduction = "umap_harmony",
  group.by = "harmony_clusters",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.3
) +
  ggtitle("After Harmony: clusters with labels") +
  NoLegend()

p_after_no_label | p_after_label

p_after_sample <- DimPlot(
  SO,
  reduction = "umap_harmony",
  group.by = "Sample",
  pt.size = 0.3
) +
  ggtitle("After Harmony: UMAP colored by Sample")

p_after_sample_split <- DimPlot(
  SO,
  reduction = "umap_harmony",
  split.by = "Sample",
  pt.size = 0.3,
  ncol = 4
) +
  ggtitle("After Harmony: UMAP split by Sample")

p_after_sample

p_after_sample_split

p_after_condition <- DimPlot(
  SO,
  reduction = "umap_harmony",
  group.by = "condition",
  pt.size = 0.3
) +
  ggtitle("After Harmony: UMAP colored by Condition")

p_after_condition_split <- DimPlot(
  SO,
  reduction = "umap_harmony",
  split.by = "condition",
  pt.size = 0.3,
  ncol = 2
) +
  ggtitle("After Harmony: UMAP split by Condition")

p_after_condition | p_after_condition_split

harmony_sample_cluster_table <- table(
  Cluster = SO$harmony_clusters,
  Sample = SO$Sample
)

print("After Harmony: raw cell counts, cluster x sample")
[1] "After Harmony: raw cell counts, cluster x sample"
print(harmony_sample_cluster_table)
       Sample
Cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F
     0       748       70     2579      600      677      198      438     1813
     1       404       44     1815      700      458       46      612      712
     2       186      130      466     1086      259       96      367      540
     3       246       31     1140      364      319      103      244      839
     4       383       27      935      193      318      103      243      529
     5       334      105      381      606      186       21      113      180
     6        81       26      882      102      444       40      193      868
     7       287       43      481      258      189       98      140      127
     8       180       34      169       74      209      135      410      116
     9        46      137       39      457       60       57       44       85
     10       48       11      103       51       93       80      368      111
     11      185       44       65       59       18       22       92      194
     12       31       30       15      107        9       43       11        9
     13       12      177        4        4       14       55        0       79
     14       39        5        5        7        3        3        3       31
     15       11        4       17       26        9        5        6        2
       Sample
Cluster PDSN04_F PDSN06_M PDSN07_M PDSN09_M
     0       432      442      128      848
     1       472      309       92      938
     2       890      508      255      277
     3       248      274       62      412
     4       163      121       69      270
     5       588      357      118      275
     6        61      223       41      290
     7       304      356       96      339
     8       167      192       10      100
     9       148       50       21       30
     10       90       86       44       66
     11       39       26       24       44
     12       40        1        2       72
     13        3        0        0        0
     14       10        1        0       60
     15        8       11        3       24
harmony_sample_cluster_prop <- prop.table(
  harmony_sample_cluster_table,
  margin = 1
)

print("After Harmony: proportion within each cluster")
[1] "After Harmony: proportion within each cluster"
print(round(harmony_sample_cluster_prop, 3))
       Sample
Cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F
     0     0.083    0.008    0.287    0.067    0.075    0.022    0.049    0.202
     1     0.061    0.007    0.275    0.106    0.069    0.007    0.093    0.108
     2     0.037    0.026    0.092    0.215    0.051    0.019    0.073    0.107
     3     0.057    0.007    0.266    0.085    0.074    0.024    0.057    0.196
     4     0.114    0.008    0.279    0.058    0.095    0.031    0.072    0.158
     5     0.102    0.032    0.117    0.186    0.057    0.006    0.035    0.055
     6     0.025    0.008    0.271    0.031    0.137    0.012    0.059    0.267
     7     0.106    0.016    0.177    0.095    0.070    0.036    0.052    0.047
     8     0.100    0.019    0.094    0.041    0.116    0.075    0.228    0.065
     9     0.039    0.117    0.033    0.389    0.051    0.049    0.037    0.072
     10    0.042    0.010    0.089    0.044    0.081    0.070    0.320    0.096
     11    0.228    0.054    0.080    0.073    0.022    0.027    0.113    0.239
     12    0.084    0.081    0.041    0.289    0.024    0.116    0.030    0.024
     13    0.034    0.509    0.011    0.011    0.040    0.158    0.000    0.227
     14    0.234    0.030    0.030    0.042    0.018    0.018    0.018    0.186
     15    0.087    0.032    0.135    0.206    0.071    0.040    0.048    0.016
       Sample
Cluster PDSN04_F PDSN06_M PDSN07_M PDSN09_M
     0     0.048    0.049    0.014    0.095
     1     0.071    0.047    0.014    0.142
     2     0.176    0.100    0.050    0.055
     3     0.058    0.064    0.014    0.096
     4     0.049    0.036    0.021    0.081
     5     0.180    0.109    0.036    0.084
     6     0.019    0.069    0.013    0.089
     7     0.112    0.131    0.035    0.125
     8     0.093    0.107    0.006    0.056
     9     0.126    0.043    0.018    0.026
     10    0.078    0.075    0.038    0.057
     11    0.048    0.032    0.030    0.054
     12    0.108    0.003    0.005    0.195
     13    0.009    0.000    0.000    0.000
     14    0.060    0.006    0.000    0.359
     15    0.063    0.087    0.024    0.190
harmony_sample_cluster_df <- as.data.frame(harmony_sample_cluster_table)

ggplot(harmony_sample_cluster_df, aes(x = Cluster, y = Freq, fill = Sample)) +
  geom_bar(stat = "identity", position = "fill", color = "black", linewidth = 0.2) +
  theme_classic() +
  labs(
    title = "After Harmony: sample composition within each cluster",
    x = "Harmony cluster",
    y = "Proportion"
  )

SO$pre_harmony_cluster <- as.character(SO$seurat_clusters)
SO$was_pre_cluster5 <- SO$pre_harmony_cluster == "5"

cluster5_cells <- colnames(SO)[SO$was_pre_cluster5]

cat("Number of pre-Harmony cluster 5 cells:\n")
Number of pre-Harmony cluster 5 cells:
print(length(cluster5_cells))
[1] 3352
cat("\nWhere did pre-Harmony cluster 5 cells go after Harmony?\n")

Where did pre-Harmony cluster 5 cells go after Harmony?
tab_c5 <- table(
  HarmonyCluster = SO$harmony_clusters[cluster5_cells]
)
print(tab_c5)
HarmonyCluster
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
 874 1243   22  594   94    3  397    0    3   23    2   42    0    0   55    0 
cat("\nProportion:\n")

Proportion:
print(round(prop.table(tab_c5), 3))
HarmonyCluster
    0     1     2     3     4     5     6     7     8     9    10    11    12 
0.261 0.371 0.007 0.177 0.028 0.001 0.118 0.000 0.001 0.007 0.001 0.013 0.000 
   13    14    15 
0.000 0.016 0.000 
cat("\nSample / condition / sex composition of old cluster 5:\n")

Sample / condition / sex composition of old cluster 5:
c5_metadata <- SO@meta.data[cluster5_cells, ] |>
  dplyr::count(Sample, condition, Sex, harmony_clusters, sort = TRUE)

print(c5_metadata)
     Sample condition Sex harmony_clusters   n
1  PDSN09_M        PD   M                1 851
2  PDSN09_M        PD   M                0 370
3  PDSN06_M        PD   M                0 355
4  PDSN06_M        PD   M                1 280
5  PDSN09_M        PD   M                3 256
6  PDSN06_M        PD   M                3 249
7  PDSN06_M        PD   M                6 198
8  PDSN09_M        PD   M                6 157
9  PDSN07_M        PD   M                0 117
10 PDSN07_M        PD   M                1  88
11 PDSN07_M        PD   M                3  59
12 PDSN09_M        PD   M               14  53
13 PDSN07_M        PD   M                4  52
14 PDSN07_M        PD   M                6  38
15 PDSN09_M        PD   M                4  35
16 PDSN09_M        PD   M               11  22
17 PDSN04_F        PD   F                3  19
18 PDSN07_M        PD   M               11  18
19 PDSN04_F        PD   F                0  17
20 PDSN06_M        PD   M                2  16
21 PDSN04_F        PD   F                1  12
22 PDSN06_M        PD   M                9  12
23 PDSN09_M        PD   M                9  11
24 NOSN04_M         C   M                0   8
25 PDSN06_M        PD   M                4   7
26 NOSN04_M         C   M                3   5
27 NOSN03_F         C   F                3   3
28 NOSN04_M         C   M                1   3
29 NOSN05_M         C   M                1   3
30 PDSN01_F        PD   F                1   3
31 PDSN09_M        PD   M                2   3
32 NOSN05_M         C   M                0   2
33 PDSN01_F        PD   F                0   2
34 PDSN01_F        PD   F                3   2
35 PDSN02_F        PD   F                0   2
36 PDSN04_F        PD   F               11   2
37 PDSN07_M        PD   M                2   2
38 NOSN01_F         C   F                1   1
39 NOSN03_F         C   F                0   1
40 NOSN03_F         C   F                1   1
41 NOSN04_M         C   M                6   1
42 NOSN04_M         C   M               14   1
43 NOSN05_M         C   M                6   1
44 NOSN05_M         C   M               10   1
45 NOSN06_M         C   M                2   1
46 PDSN02_F        PD   F                1   1
47 PDSN02_F        PD   F                3   1
48 PDSN02_F        PD   F                6   1
49 PDSN04_F        PD   F                5   1
50 PDSN04_F        PD   F                6   1
51 PDSN04_F        PD   F               14   1
52 PDSN06_M        PD   M                5   1
53 PDSN06_M        PD   M                8   1
54 PDSN07_M        PD   M                5   1
55 PDSN07_M        PD   M                8   1
56 PDSN07_M        PD   M               10   1
57 PDSN09_M        PD   M                8   1
p1 <- DimPlot(
  SO,
  reduction = "umap",
  cells.highlight = cluster5_cells,
  cols.highlight = "red",
  cols = "grey80",
  pt.size = 0.3
) +
  ggtitle("Before Harmony: pre-Harmony cluster 5 highlighted") +
  NoLegend()

p2 <- DimPlot(
  SO,
  reduction = "umap_harmony",
  cells.highlight = cluster5_cells,
  cols.highlight = "red",
  cols = "grey80",
  pt.size = 0.3
) +
  ggtitle("After Harmony: same cells highlighted") +
  NoLegend()

p1 | p2

FeaturePlot(
  SO,
  reduction = "umap_harmony",
  features = c("HSPA1A", "HSPA1B", "CRYAB"),
  ncol = 3,
  pt.size = 0.3
)

VlnPlot(
  SO,
  features = c("HSPA1A", "HSPA1B", "CRYAB"),
  group.by = "harmony_clusters",
  pt.size = 0
)

VlnPlot(
  SO,
  features = c("HSPA1A", "HSPA1B", "CRYAB"),
  group.by = "Sex",
  pt.size = 0
)

VlnPlot(
  SO,
  features = c("HSPA1A", "HSPA1B", "CRYAB"),
  group.by = "condition",
  split.by = "Sex",
  pt.size = 0
)

save_path_harmony <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026/scripts/SO_06_Harmony_Compared.rds"

saveRDS(SO, file = save_path_harmony)

print("Step 8.6 complete: before/after Harmony UMAPs generated and saved.")
[1] "Step 8.6 complete: before/after Harmony UMAPs generated and saved."