Step 8.7: RPCA Integration and Comparison with Harmony

library(Seurat)
library(tidyverse)
library(ggplot2)
library(patchwork)

set.seed(123)

project_dir <- "/Users/yoshimurasouhei/Downloads/010_school/4年生/bioinfomaticsリサーチクラークシップ/PD_2026"
scripts_dir <- file.path(project_dir, "scripts")

# Load pre-Harmony clustered object
pre_path <- file.path(scripts_dir, "SO_05_Clustered.rds")
SO_pre <- readRDS(pre_path)

# Load your Harmony object for comparison
harmony_path <- file.path(scripts_dir, "SO_05_Harmony_Compared.rds")
SO_harmony <- readRDS(harmony_path)

dims_use <- 1:26
resolution_use <- 0.5

print(SO_pre)

An object of class Seurat 
47773 features across 43448 samples within 1 assay 
Active assay: RNA (47773 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 2 dimensional reductions calculated: pca, umap

print(SO_harmony)

An object of class Seurat 
47773 features across 43448 samples within 1 assay 
Active assay: RNA (47773 features, 2000 variable features)
 3 layers present: counts, data, scale.data
 4 dimensional reductions calculated: pca, umap, harmony, umap_harmony

Your PCA table showed 90% variance at PC26, so I recommend using 1:26 here instead of 1:35. This is already a more gentle choice.

RPCA Integration

# Keep old cluster identity before integration
SO_pre$pre_harmony_cluster <- as.character(SO_pre$seurat_clusters)
SO_pre$was_pre_cluster5 <- SO_pre$pre_harmony_cluster == "5"

DefaultAssay(SO_pre) <- "RNA"

# Split by sample
SO_list <- SplitObject(SO_pre, split.by = "Sample")

# Normalize and run PCA per sample
SO_list <- lapply(SO_list, function(x) {
  x <- NormalizeData(x, verbose = FALSE)
  x <- FindVariableFeatures(x, selection.method = "vst", nfeatures = 2000, verbose = FALSE)
  x <- ScaleData(x, verbose = FALSE)
  x <- RunPCA(x, npcs = max(dims_use), verbose = FALSE)
  return(x)
})

# Select shared integration features
rpca_features <- SelectIntegrationFeatures(
  object.list = SO_list,
  nfeatures = 2000
)

# Prepare objects for RPCA
SO_list <- lapply(SO_list, function(x) {
  x <- ScaleData(x, features = rpca_features, verbose = FALSE)
  x <- RunPCA(x, features = rpca_features, npcs = max(dims_use), verbose = FALSE)
  return(x)
})

# Find RPCA anchors
rpca_anchors <- FindIntegrationAnchors(
  object.list = SO_list,
  anchor.features = rpca_features,
  reduction = "rpca",
  dims = dims_use
)

# Integrate data
SO_rpca <- IntegrateData(
  anchorset = rpca_anchors,
  dims = dims_use
)

RPCA Clustering and UMAP

DefaultAssay(SO_rpca) <- "integrated"

SO_rpca <- ScaleData(SO_rpca, verbose = FALSE)

SO_rpca <- RunPCA(
  SO_rpca,
  npcs = max(dims_use),
  verbose = FALSE
)

SO_rpca <- FindNeighbors(
  SO_rpca,
  dims = dims_use
)

SO_rpca <- FindClusters(
  SO_rpca,
  resolution = resolution_use,
  random.seed = 123
)

Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck

Number of nodes: 43448
Number of edges: 1878475

Running Louvain algorithm...
Maximum modularity in 10 random starts: 0.9159
Number of communities: 16
Elapsed time: 9 seconds

SO_rpca$rpca_clusters <- SO_rpca$seurat_clusters

set.seed(123)

SO_rpca <- RunUMAP(
  SO_rpca,
  dims = dims_use,
  reduction = "pca",
  reduction.name = "umap_rpca",
  reduction.key = "UMAPRPCA_",
  seed.use = 123
)

Compare Pre-Harmony, Harmony, and RPCA UMAPs

p_pre <- DimPlot(
  SO_pre,
  reduction = "umap",
  group.by = "seurat_clusters",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.3
) +
  ggtitle("Before Integration") +
  NoLegend()

p_harmony <- DimPlot(
  SO_harmony,
  reduction = "umap_harmony",
  group.by = "harmony_clusters",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.3
) +
  ggtitle("Harmony") +
  NoLegend()

p_rpca <- DimPlot(
  SO_rpca,
  reduction = "umap_rpca",
  group.by = "rpca_clusters",
  label = TRUE,
  repel = TRUE,
  pt.size = 0.3
) +
  ggtitle("RPCA") +
  NoLegend()

p_pre | p_harmony | p_rpca

Compare by Sample

p_pre_sample <- DimPlot(
  SO_pre,
  reduction = "umap",
  group.by = "Sample",
  pt.size = 0.3
) +
  ggtitle("Before Integration: Sample")

p_harmony_sample <- DimPlot(
  SO_harmony,
  reduction = "umap_harmony",
  group.by = "Sample",
  pt.size = 0.3
) +
  ggtitle("Harmony: Sample")

p_rpca_sample <- DimPlot(
  SO_rpca,
  reduction = "umap_rpca",
  group.by = "Sample",
  pt.size = 0.3
) +
  ggtitle("RPCA: Sample")

p_pre_sample | p_harmony_sample | p_rpca_sample

Compare by Condition

p_pre_condition <- DimPlot(
  SO_pre,
  reduction = "umap",
  group.by = "condition",
  pt.size = 0.3
) +
  ggtitle("Before Integration: Condition")

p_harmony_condition <- DimPlot(
  SO_harmony,
  reduction = "umap_harmony",
  group.by = "condition",
  pt.size = 0.3
) +
  ggtitle("Harmony: Condition")

p_rpca_condition <- DimPlot(
  SO_rpca,
  reduction = "umap_rpca",
  group.by = "condition",
  pt.size = 0.3
) +
  ggtitle("RPCA: Condition")

p_pre_condition$layers[[1]]$aes_params$alpha <- 0.3
p_harmony_condition$layers[[1]]$aes_params$alpha <- 0.3
p_rpca_condition$layers[[1]]$aes_params$alpha <- 0.3

p_pre_condition | p_harmony_condition | p_rpca_condition

Sample Composition After RPCA

rpca_sample_cluster_table <- table(
  Cluster = SO_rpca$rpca_clusters,
  Sample = SO_rpca$Sample
)

print("RPCA: raw cell counts, cluster x sample")

[1] "RPCA: raw cell counts, cluster x sample"

print(rpca_sample_cluster_table)

       Sample
Cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F
     0       362       49     2374      645      472      139      413     1454
     1       434       34     1747      502      515       21      473      771
     2       368       22      868      476      474       79      353      737
     3       289       27     1059      190      412      129      208     1402
     4       422       21     1232      157      335      114      280      617
     5       336      103      401      610      190       21      117      180
     6       109       70      237      682       90       24      150      483
     7       285       29      482      255      190       97      140      128
     8        79       64      238      416      180       66      220       68
     9       268      409      147      505       89      149      141      158
     10      183       36      174       69      215      138      411      116
     11       37       10       95       43       78       73      350      105
     12       31       27       15      105        6       38        0        8
     13        9        5       17       24        9        6        8        0
     14        9        2        7        7        9        4       14        6
     15        0       10        3        8        1        7        6        2
       Sample
Cluster PDSN04_F PDSN06_M PDSN07_M PDSN09_M
     0       482      725      168      696
     1       236      237       43      645
     2       451      217      115      737
     3       119       85       38      440
     4       115      120       44      350
     5       590      361      118      277
     6       507      108      106      213
     7       303      356       96      340
     8       388      390      153       70
     9       166       68       26       18
     10      164      190       10       99
     11       77       83       42       54
     12       35        1        2       70
     13        8       11        3       24
     14       13        5        1       12
     15        9        0        0        0

rpca_sample_cluster_prop <- prop.table(
  rpca_sample_cluster_table,
  margin = 1
)

print("RPCA: proportion within each cluster")

[1] "RPCA: proportion within each cluster"

print(round(rpca_sample_cluster_prop, 3))

       Sample
Cluster NOSN01_F NOSN02_F NOSN03_F NOSN04_M NOSN05_M NOSN06_M PDSN01_F PDSN02_F
     0     0.045    0.006    0.298    0.081    0.059    0.017    0.052    0.182
     1     0.077    0.006    0.309    0.089    0.091    0.004    0.084    0.136
     2     0.075    0.004    0.177    0.097    0.097    0.016    0.072    0.151
     3     0.066    0.006    0.241    0.043    0.094    0.029    0.047    0.319
     4     0.111    0.006    0.324    0.041    0.088    0.030    0.074    0.162
     5     0.102    0.031    0.121    0.185    0.058    0.006    0.035    0.054
     6     0.039    0.025    0.085    0.245    0.032    0.009    0.054    0.174
     7     0.106    0.011    0.178    0.094    0.070    0.036    0.052    0.047
     8     0.034    0.027    0.102    0.178    0.077    0.028    0.094    0.029
     9     0.125    0.191    0.069    0.236    0.042    0.069    0.066    0.074
     10    0.101    0.020    0.096    0.038    0.119    0.076    0.228    0.064
     11    0.035    0.010    0.091    0.041    0.074    0.070    0.334    0.100
     12    0.092    0.080    0.044    0.311    0.018    0.112    0.000    0.024
     13    0.073    0.040    0.137    0.194    0.073    0.048    0.065    0.000
     14    0.101    0.022    0.079    0.079    0.101    0.045    0.157    0.067
     15    0.000    0.217    0.065    0.174    0.022    0.152    0.130    0.043
       Sample
Cluster PDSN04_F PDSN06_M PDSN07_M PDSN09_M
     0     0.060    0.091    0.021    0.087
     1     0.042    0.042    0.008    0.114
     2     0.092    0.044    0.023    0.151
     3     0.027    0.019    0.009    0.100
     4     0.030    0.032    0.012    0.092
     5     0.179    0.109    0.036    0.084
     6     0.182    0.039    0.038    0.077
     7     0.112    0.132    0.036    0.126
     8     0.166    0.167    0.066    0.030
     9     0.077    0.032    0.012    0.008
     10    0.091    0.105    0.006    0.055
     11    0.074    0.079    0.040    0.052
     12    0.104    0.003    0.006    0.207
     13    0.065    0.089    0.024    0.194
     14    0.146    0.056    0.011    0.135
     15    0.196    0.000    0.000    0.000

rpca_sample_cluster_df <- as.data.frame(rpca_sample_cluster_table)

ggplot(rpca_sample_cluster_df, aes(x = Cluster, y = Freq, fill = Sample)) +
  geom_bar(stat = "identity", position = "fill", color = "black", linewidth = 0.2) +
  theme_classic() +
  labs(
    title = "RPCA: sample composition within each cluster",
    x = "RPCA cluster",
    y = "Proportion"
  )

Track Pre-Harmony Cluster 5 After RPCA

cluster5_cells <- colnames(SO_pre)[SO_pre$seurat_clusters == "5"]

cat("Number of pre-Harmony cluster 5 cells:\n")

Number of pre-Harmony cluster 5 cells:

print(length(cluster5_cells))

[1] 3352

# Make sure cells exist in all objects
cluster5_cells_harmony <- intersect(cluster5_cells, colnames(SO_harmony))
cluster5_cells_rpca <- intersect(cluster5_cells, colnames(SO_rpca))

cat("\nWhere did pre-Harmony cluster 5 cells go after Harmony?\n")


Where did pre-Harmony cluster 5 cells go after Harmony?

tab_harmony_c5 <- table(
  HarmonyCluster = SO_harmony$harmony_clusters[cluster5_cells_harmony]
)
print(tab_harmony_c5)

HarmonyCluster
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
 834 1311  672   24   86    1    0  195    3    2    5   29   47  142    0    0 
  16 
   1

print(round(prop.table(tab_harmony_c5), 3))

HarmonyCluster
    0     1     2     3     4     5     6     7     8     9    10    11    12 
0.249 0.391 0.200 0.007 0.026 0.000 0.000 0.058 0.001 0.001 0.001 0.009 0.014 
   13    14    15    16 
0.042 0.000 0.000 0.000

cat("\nWhere did pre-Harmony cluster 5 cells go after RPCA?\n")


Where did pre-Harmony cluster 5 cells go after RPCA?

tab_rpca_c5 <- table(
  RPCACluster = SO_rpca$rpca_clusters[cluster5_cells_rpca]
)
print(tab_rpca_c5)

RPCACluster
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
1434  877  844   93   67    8    3    0   16    8    1    1    0    0    0    0

print(round(prop.table(tab_rpca_c5), 3))

RPCACluster
    0     1     2     3     4     5     6     7     8     9    10    11    12 
0.428 0.262 0.252 0.028 0.020 0.002 0.001 0.000 0.005 0.002 0.000 0.000 0.000 
   13    14    15 
0.000 0.000 0.000

p_pre_c5 <- DimPlot(
  SO_pre,
  reduction = "umap",
  cells.highlight = cluster5_cells,
  cols.highlight = "red",
  cols = "grey80",
  pt.size = 0.3
) +
  ggtitle("Before Integration: pre-Harmony cluster 5") +
  NoLegend()

p_harmony_c5 <- DimPlot(
  SO_harmony,
  reduction = "umap_harmony",
  cells.highlight = cluster5_cells_harmony,
  cols.highlight = "red",
  cols = "grey80",
  pt.size = 0.3
) +
  ggtitle("Harmony: same cells") +
  NoLegend()

p_rpca_c5 <- DimPlot(
  SO_rpca,
  reduction = "umap_rpca",
  cells.highlight = cluster5_cells_rpca,
  cols.highlight = "red",
  cols = "grey80",
  pt.size = 0.3
) +
  ggtitle("RPCA: same cells") +
  NoLegend()

p_pre_c5 | p_harmony_c5 | p_rpca_c5

Stress Marker Check After RPCA

DefaultAssay(SO_rpca) <- "RNA"

FeaturePlot(
  SO_rpca,
  reduction = "umap_rpca",
  features = c("HSPA1A", "HSPA1B", "CRYAB"),
  ncol = 3,
  pt.size = 0.3
)

VlnPlot(
  SO_rpca,
  features = c("HSPA1A", "HSPA1B", "CRYAB"),
  group.by = "rpca_clusters",
  pt.size = 0
)

VlnPlot(
  SO_rpca,
  features = c("HSPA1A", "HSPA1B", "CRYAB"),
  group.by = "condition",
  split.by = "Sex",
  pt.size = 0
)

Save RPCA Object

save_path_rpca <- file.path(
  scripts_dir,
  "SO_05_RPCA_Compared.rds"
)

saveRDS(SO_rpca, file = save_path_rpca)

print("RPCA integration complete and saved.")

[1] "RPCA integration complete and saved."