LAB_11

Question-8

(a)

# Load the dataset
data("USArrests")

# Run PCA with scaling to prevent dominance by large-range variables
pca_result <- prcomp(USArrests, scale. = TRUE)

# Compute the proportion of variance explained (PVE) from the component standard deviations
var_explained <- pca_result$sdev^2
pve_percent <- 100 * var_explained / sum(var_explained)
pve_percent
## [1] 62.006039 24.744129  8.914080  4.335752
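
As a quick sanity check, summary() reports the same proportions as fractions of 1, so these should agree with pve_percent/100 up to rounding:

# Cross-check against prcomp's own summary
summary(pca_result)$importance["Proportion of Variance", ]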

(b)

# Scale the data and compute the PC scores via the loading matrix
scaled_data <- scale(USArrests)
component_scores <- scaled_data %*% pca_result$rotation

# Numerators: sum of squared component scores
num_vals <- colSums(component_scores^2)

# Denominator: total sum of squares of the scaled data
denom_val <- sum(scaled_data^2)

# PVE calculation
manual_pve <- 100 * num_vals / denom_val
manual_pve
##       PC1       PC2       PC3       PC4 
## 62.006039 24.744129  8.914080  4.335752
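
Both routes agree. In fact, the hand-computed scores are exactly the scores that prcomp() stores in $x:

# Verify the manual scores against prcomp's stored scores
all.equal(component_scores, pca_result$x)
## [1] TRUE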

Question-9

(a)

hc_raw <- hclust(dist(USArrests), method = "complete")
plot(hc_raw, main = "Dendrogram: USArrests (Unscaled)", xlab = "States")
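
If the dendrogram is still the active plot, base R's rect.hclust() can outline where a three-cluster cut falls:

# Draw boxes around the three clusters on the current dendrogram
rect.hclust(hc_raw, k = 3, border = "red")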

(b)

# Cut tree into 3 clusters
raw_clusters <- cutree(hc_raw, k = 3)

# Show cluster membership
table(raw_clusters)
## raw_clusters
##  1  2  3 
## 16 14 20
raw_clusters
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              1              2              1 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              1              1              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              3              1              3              3 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              3              1              3              1 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              2              1              3              1              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              1              3              2 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              1              1              1              3              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              2              2              3              2              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              3              2              2              3              3 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              2              2              3              3              2
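
The same assignments can be listed cluster by cluster, which is easier to read than the long named vector (output omitted):

# Group the state names by cluster label
split(names(raw_clusters), raw_clusters)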

(c)

# Scale then cluster
hc_scaled <- hclust(dist(scale(USArrests)), method = "complete")
plot(hc_scaled, main = "Dendrogram: USArrests (Scaled)", xlab = "States")

# Cut into 3
scaled_clusters <- cutree(hc_scaled, k = 3)
scaled_clusters
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              1              2              3              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              3              2              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              3              2              3              3 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              3              1              3              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              3              1              3 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              3              3              2              3              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              1              3              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              3              1              2              3              3 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              3              3              3

(d)

# Cross-tabulate the scaled vs. unscaled cluster assignments
table(Scaled = scaled_clusters, Unscaled = raw_clusters)
##       Unscaled
## Scaled  1  2  3
##      1  6  2  0
##      2  9  2  0
##      3  1 10 20

Without scaling, the high-variance Assault variable dominates the Euclidean distances, so the unscaled clusters largely reflect assault rates alone. After scaling, all four variables contribute comparably and many states change clusters. Because Murder, Assault, and Rape are rates per 100,000 while UrbanPop is a percentage, the variables are not on comparable scales, and scaling before computing dissimilarities is the better choice here.
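
A look at the raw variances backs this up: Assault's variance dwarfs the others'.

# Per-variable variances on the original scale
apply(USArrests, 2, var)
##     Murder    Assault   UrbanPop       Rape 
##   18.97047 6945.16571  209.51878   87.72916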

Question-10

(a)

set.seed(2025)

# Simulate 3 well-separated classes (means 0, 10, 20; sd 3), each with 20 observations on 50 features
groupA <- matrix(rnorm(1000, 0, 3), nrow = 20)
groupB <- matrix(rnorm(1000, 10, 3), nrow = 20)
groupC <- matrix(rnorm(1000, 20, 3), nrow = 20)
sim_data <- rbind(groupA, groupB, groupC)

labels <- rep(1:3, each = 20)
df_sim <- as.data.frame(sim_data)
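
A quick dimension check confirms the design: 60 observations (3 classes of 20) on 50 features.

dim(sim_data)
## [1] 60 50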

(b)

library(ggplot2)
# Run PCA
pca_sim <- prcomp(df_sim, scale. = TRUE)
pc_data <- data.frame(PC1 = pca_sim$x[,1], PC2 = pca_sim$x[,2], Group = factor(labels))

# Plot
ggplot(pc_data, aes(x = PC1, y = PC2, color = Group)) +
  geom_point(size = 3) +
  labs(title = "PC1 vs PC2: Simulated Clusters", color = "True Group")
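
Given how the classes were generated, the first component should capture most of the between-class variation. The exact figures depend on the simulated draw and are not reproduced here:

# PVE for the simulated data (values vary with the seed)
pve_sim <- 100 * pca_sim$sdev^2 / sum(pca_sim$sdev^2)
head(pve_sim, 2)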

(c)

set.seed(2025)
k3_model <- kmeans(df_sim, centers = 3, nstart = 25)

# Compare predicted vs actual
table(Actual = labels, Clustered = k3_model$cluster)
##       Clustered
## Actual  1  2  3
##      1  0 20  0
##      2 20  0  0
##      3  0  0 20
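
The match is perfect: every observation lands with its true group. The cluster numbers themselves are arbitrary labels, so the permutation (true group 1 mapping to cluster 2, and so on) is irrelevant.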

(d)

set.seed(2025)
k2_model <- kmeans(df_sim, centers = 2, nstart = 25)
table(Actual = labels, Clustered = k2_model$cluster)
##       Clustered
## Actual  1  2
##      1  0 20
##      2  0 20
##      3 20  0
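
With only two centers available, K-means merges two of the true groups: groups 1 and 2 end up in the same cluster, while group 3 keeps its own.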

(e)

set.seed(2025)
k4_model <- kmeans(df_sim, centers = 4, nstart = 25)
table(Actual = labels, Clustered = k4_model$cluster)
##       Clustered
## Actual  1  2  3  4
##      1  0 10 10  0
##      2  0  0  0 20
##      3 20  0  0  0
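
With four centers, K-means has one cluster too many and splits a true group: group 1 is divided evenly between clusters 2 and 3, while groups 2 and 3 each remain intact.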

(f)

pc_two <- pca_sim$x[, 1:2]

set.seed(2025)
kmeans_pc <- kmeans(pc_two, centers = 3, nstart = 25)
table(Actual = labels, Clustered = kmeans_pc$cluster)
##       Clustered
## Actual  1  2  3
##      1  0 20  0
##      2 20  0  0
##      3  0  0 20

Conclusion: K-means on just the first two principal component scores recovers exactly the same three groups as clustering on all 50 features, because PC1 and PC2 already capture the between-class separation.

(g)

set.seed(2025)
kmeans_scaled <- kmeans(scale(df_sim), centers = 3, nstart = 25)

# Compare with unscaled K=3 clustering
table(Scaled = kmeans_scaled$cluster, Unscaled = k3_model$cluster)
##       Unscaled
## Scaled  1  2  3
##      1 20  0  0
##      2  0 20  0
##      3  0  0 20

Conclusion: Because every feature was simulated with the same standard deviation (3), standardizing to unit variance barely changes the distances, and the cluster assignments agree exactly with the unscaled fit. With real data, where variables often sit on different scales, scaling before K-means is usually advisable.
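
For a single-number summary of agreement with the true labels, the adjusted Rand index is handy; a minimal sketch, assuming the mclust package is installed:

# Adjusted Rand index: 1 means perfect agreement with the true labels
# (assumes install.packages("mclust") has been run)
library(mclust)
adjustedRandIndex(labels, kmeans_scaled$cluster)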