# Load the dataset
data("USArrests")
# Run PCA with scaling to prevent dominance by large-range variables
pca_result <- prcomp(USArrests, scale. = TRUE)
# Compute the proportion of variance explained (PVE) from the component standard deviations
var_explained <- pca_result$sdev^2
pve_percent <- 100 * var_explained / sum(var_explained)
pve_percent
## [1] 62.006039 24.744129 8.914080 4.335752
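As a cross-check, summary() reports the same proportions of variance directly (output omitted here):

# Cross-check: summary() prints the proportion of variance per component
summary(pca_result)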
# Manual PVE check: standardize the data and project it onto the loadings (this reproduces pca_result$x)
scaled_data <- scale(USArrests)
component_scores <- scaled_data %*% pca_result$rotation
# Numerators: sum of squared component scores
num_vals <- apply(component_scores^2, 2, sum)
# Denominator: total sum of squares of the scaled data
denom_val <- sum(scaled_data^2)
# PVE calculation
manual_pve <- 100 * num_vals / denom_val
manual_pve
## PC1 PC2 PC3 PC4
## 62.006039 24.744129 8.914080 4.335752
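A quick equality check, not in the original output, confirms the two computations agree:

# The manual PVE should match the sdev-based PVE exactly
all.equal(as.numeric(manual_pve), pve_percent)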
# Hierarchical clustering on the raw (unscaled) data with complete linkage
hc_raw <- hclust(dist(USArrests), method = "complete")
plot(hc_raw, main = "Dendrogram: USArrests (Unscaled)", xlab = "States")
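The three-cluster cut can also be outlined on the dendrogram itself; this optional line uses rect.hclust() from base R:

# Draw boxes around the K = 3 branches on the dendrogram just plotted
rect.hclust(hc_raw, k = 3)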
# Cut tree into 3 clusters
raw_clusters <- cutree(hc_raw, k = 3)
# Show cluster membership
table(raw_clusters)
## raw_clusters
## 1 2 3
## 16 14 20
raw_clusters
## Alabama Alaska Arizona Arkansas California
## 1 1 1 2 1
## Colorado Connecticut Delaware Florida Georgia
## 2 3 1 1 2
## Hawaii Idaho Illinois Indiana Iowa
## 3 3 1 3 3
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 3 1
## Massachusetts Michigan Minnesota Mississippi Missouri
## 2 1 3 1 2
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 1 3 2
## New Mexico New York North Carolina North Dakota Ohio
## 1 1 1 3 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 2 2 3 2 1
## South Dakota Tennessee Texas Utah Vermont
## 3 2 2 3 3
## Virginia Washington West Virginia Wisconsin Wyoming
## 2 2 3 3 2
# Scale then cluster
hc_scaled <- hclust(dist(scale(USArrests)), method = "complete")
plot(hc_scaled, main = "Dendrogram: USArrests (Scaled)", xlab = "States")
# Cut into 3
scaled_clusters <- cutree(hc_scaled, k = 3)
scaled_clusters
## Alabama Alaska Arizona Arkansas California
## 1 1 2 3 2
## Colorado Connecticut Delaware Florida Georgia
## 2 3 3 2 1
## Hawaii Idaho Illinois Indiana Iowa
## 3 3 2 3 3
## Kansas Kentucky Louisiana Maine Maryland
## 3 3 1 3 2
## Massachusetts Michigan Minnesota Mississippi Missouri
## 3 2 3 1 3
## Montana Nebraska Nevada New Hampshire New Jersey
## 3 3 2 3 3
## New Mexico New York North Carolina North Dakota Ohio
## 2 2 1 3 3
## Oklahoma Oregon Pennsylvania Rhode Island South Carolina
## 3 3 3 3 1
## South Dakota Tennessee Texas Utah Vermont
## 3 1 2 3 3
## Virginia Washington West Virginia Wisconsin Wyoming
## 3 3 3 3 3
# Cross-tabulate scaled vs. unscaled cluster assignments
table(Scaled = scaled_clusters, Unscaled = raw_clusters)
## Unscaled
## Scaled 1 2 3
## 1 6 2 0
## 2 9 2 0
## 3 1 10 20
Without scaling, the variable with by far the largest variance, Assault, dominates the Euclidean distance calculations, so the unscaled clustering is driven almost entirely by assault rates. After scaling, all four variables contribute comparably, and the cluster assignments change substantially, as the cross-tabulation above shows.
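The size of that imbalance is easy to verify from the raw column variances (a quick check added here; Assault's variance is roughly 6,945 against about 19 for Murder):

# Column variances of the raw data; Assault dwarfs the other variables
apply(USArrests, 2, var)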
set.seed(2025)
# Simulate 3 distinct clusters (means: 0, 10, 20), each with 20 samples and 50 features
groupA <- matrix(rnorm(1000, 0, 3), nrow = 20)
groupB <- matrix(rnorm(1000, 10, 3), nrow = 20)
groupC <- matrix(rnorm(1000, 20, 3), nrow = 20)
sim_data <- rbind(groupA, groupB, groupC)
labels <- rep(1:3, each = 20)
df_sim <- as.data.frame(sim_data)
library(ggplot2)
# Run PCA
pca_sim <- prcomp(df_sim, scale. = TRUE)
pc_data <- data.frame(PC1 = pca_sim$x[,1], PC2 = pca_sim$x[,2], Group = factor(labels))
# Plot
ggplot(pc_data, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 3) +
  labs(title = "PC1 vs PC2: Simulated Clusters", color = "True Group")
set.seed(2025)
k3_model <- kmeans(df_sim, centers = 3, nstart = 25)
# Compare predicted vs actual
table(Actual = labels, Clustered = k3_model$cluster)
## Clustered
## Actual 1 2 3
## 1 0 20 0
## 2 20 0 0
## 3 0 0 20
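Note that the cluster numbers returned by kmeans() are arbitrary labels: true group 1 maps to cluster 2 and group 2 to cluster 1 here, but each true group lands in exactly one cluster, so the recovery is perfect.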
set.seed(2025)
k2_model <- kmeans(df_sim, centers = 2, nstart = 25)
table(Actual = labels, Clustered = k2_model$cluster)
## Clustered
## Actual 1 2
## 1 0 20
## 2 0 20
## 3 20 0
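With only two centers available, K-means is forced to merge two of the three true groups; here groups 1 and 2 end up in the same cluster.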
set.seed(2025)
k4_model <- kmeans(df_sim, centers = 4, nstart = 25)
table(Actual = labels, Clustered = k4_model$cluster)
## Clustered
## Actual 1 2 3 4
## 1 0 10 10 0
## 2 0 0 0 20
## 3 20 0 0 0
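Conversely, K = 4 splits true group 1 into two clusters of 10. A small elbow check, not part of the original analysis, makes the natural choice of K visible:

# Total within-cluster SS for K = 1..6; the curve should flatten after K = 3
set.seed(2025)
wss <- sapply(1:6, function(k) kmeans(df_sim, centers = k, nstart = 25)$tot.withinss)
plot(1:6, wss, type = "b", xlab = "Number of clusters K", ylab = "Total within-cluster SS")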
# K-means on the first two principal component scores only
pc_two <- pca_sim$x[, 1:2]
set.seed(2025)
kmeans_pc <- kmeans(pc_two, centers = 3, nstart = 25)
table(Actual = labels, Clustered = kmeans_pc$cluster)
## Clustered
## Actual 1 2 3
## 1 0 20 0
## 2 20 0 0
## 3 0 0 20
Conclusion: clustering on the first two principal component scores recovers the true groups perfectly, matching the full 50-dimensional K-means solution while using only 2 of the 50 dimensions.
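This is expected, since most of the between-group separation is captured by the leading components; a quick check of cumulative PVE (output omitted):

# Cumulative proportion of variance explained by the leading components
cumsum(pca_sim$sdev^2) / sum(pca_sim$sdev^2)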
set.seed(2025)
kmeans_scaled <- kmeans(scale(df_sim), centers = 3, nstart = 25)
# Compare with unscaled K=3 clustering
table(Scaled = kmeans_scaled$cluster, Unscaled = k3_model$cluster)
## Unscaled
## Scaled 1 2 3
## 1 20 0 0
## 2 0 20 0
## 3 0 0 20
Conclusion: because every feature was simulated with the same standard deviation, scaling leaves the clustering here unchanged. In real data, where variables are often measured on very different scales, scaling before K-means is generally recommended.
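To see scaling matter, it is enough to add a single high-variance, uninformative feature. The sketch below is hypothetical; the noise column and its standard deviation are illustrative choices, not part of the original exercise:

# Hypothetical: append one pure-noise feature with a huge scale
set.seed(2025)
df_noisy <- cbind(df_sim, noise = rnorm(60, 0, 500))
# Unscaled distances are dominated by the noise column, which typically
# degrades the recovered clusters; scaling restores equal footing
table(Actual = labels, Unscaled = kmeans(df_noisy, centers = 3, nstart = 25)$cluster)
table(Actual = labels, Scaled = kmeans(scale(df_noisy), centers = 3, nstart = 25)$cluster)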