pca1 <- prcomp(USArrests, scale. = TRUE)
pve_a <- pca1$sdev^2 / sum(pca1$sdev^2)
round(pve_a, 3)
## [1] 0.620 0.247 0.089 0.043
Xsc <- scale(USArrests)
Z <- Xsc %*% pca1$rotation
pve_b <- colSums(Z^2) / sum(Xsc^2)
round(pve_b, 3)
## PC1 PC2 PC3 PC4
## 0.620 0.247 0.089 0.043
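The two estimates agree because the scores Z are an orthogonal rotation of the scaled data, so sum(Z^2) equals sum(Xsc^2) and each PVE is one score column's share of that total. A quick check that the two vectors match (pve_b merely carries the PC names):
all.equal(as.numeric(pve_a), as.numeric(pve_b))  # expected TRUE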
hc_raw <- hclust(dist(USArrests), method = "complete")
plot(hc_raw, main = "9(a): Dendrogram (raw USArrests)")
b.)
cl_raw3 <- cutree(hc_raw, k = 3)
split(rownames(USArrests), cl_raw3)
## $`1`
## [1] "Alabama" "Alaska" "Arizona" "California"
## [5] "Delaware" "Florida" "Illinois" "Louisiana"
## [9] "Maryland" "Michigan" "Mississippi" "Nevada"
## [13] "New Mexico" "New York" "North Carolina" "South Carolina"
##
## $`2`
## [1] "Arkansas" "Colorado" "Georgia" "Massachusetts"
## [5] "Missouri" "New Jersey" "Oklahoma" "Oregon"
## [9] "Rhode Island" "Tennessee" "Texas" "Virginia"
## [13] "Washington" "Wyoming"
##
## $`3`
## [1] "Connecticut" "Hawaii" "Idaho" "Indiana"
## [5] "Iowa" "Kansas" "Kentucky" "Maine"
## [9] "Minnesota" "Montana" "Nebraska" "New Hampshire"
## [13] "North Dakota" "Ohio" "Pennsylvania" "South Dakota"
## [17] "Utah" "Vermont" "West Virginia" "Wisconsin"
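As a visual check, the three-cluster cut can be drawn directly on the dendrogram from part (a); rect.hclust() boxes the k clusters on the current plot:
plot(hc_raw, main = "9(b): three-cluster cut")
rect.hclust(hc_raw, k = 3, border = "red")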
c.)
hc_scl <- hclust(dist(scale(USArrests)), method = "complete")
plot(hc_scl, main = "9(c): Dendrogram (scaled USArrests)")
cl_scl3 <- cutree(hc_scl, k = 3)
split(rownames(USArrests), cl_scl3)
## $`1`
## [1] "Alabama" "Alaska" "Georgia" "Louisiana"
## [5] "Mississippi" "North Carolina" "South Carolina" "Tennessee"
##
## $`2`
## [1] "Arizona" "California" "Colorado" "Florida" "Illinois"
## [6] "Maryland" "Michigan" "Nevada" "New Mexico" "New York"
## [11] "Texas"
##
## $`3`
## [1] "Arkansas" "Connecticut" "Delaware" "Hawaii"
## [5] "Idaho" "Indiana" "Iowa" "Kansas"
## [9] "Kentucky" "Maine" "Massachusetts" "Minnesota"
## [13] "Missouri" "Montana" "Nebraska" "New Hampshire"
## [17] "New Jersey" "North Dakota" "Ohio" "Oklahoma"
## [21] "Oregon" "Pennsylvania" "Rhode Island" "South Dakota"
## [25] "Utah" "Vermont" "Virginia" "Washington"
## [29] "West Virginia" "Wisconsin" "Wyoming"
d.)
Scaling gives each variable unit standard deviation, so no single variable (Assault in particular, whose variance is far larger than the others') dominates the Euclidean distances used by complete linkage. As a result the cluster memberships change: the scaled solution reflects all four variables roughly equally instead of essentially tracking Assault. Yes, the variables should be scaled before the inter-observation dissimilarities are computed here, because Murder, Assault, and Rape are arrest rates per 100,000 while UrbanPop is a percentage of the population, so the raw variables sit on very different scales.
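Two quick checks support this: the raw variances show how strongly Assault dominates, and cross-tabulating the two cuts shows how many states change cluster between 9(b) and 9(c).
apply(USArrests, 2, var)                # Assault's variance dwarfs the other three
table(raw = cl_raw3, scaled = cl_scl3)  # raw vs. scaled three-cluster memberships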
set.seed(2025)
n <- 20; p <- 50
#a.) Simulate three mean-shifted classes of n = 20 observations on p = 50 variables
mu1 <- rep( 0, p)
mu2 <- c(rep( 3, 5), rep(0, p-5))
mu3 <- c(rep(-3, 5), rep(0, p-5))
X1 <- matrix(rnorm(n*p, mean = mu1), n, p)
X2 <- matrix(rnorm(n*p, mean = mu2), n, p)
X3 <- matrix(rnorm(n*p, mean = mu3), n, p)
X <- rbind(X1, X2, X3)
true.labels <- rep(1:3, each = n)
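One caveat on the construction above: rnorm() recycles a length-p mean vector element by element while matrix() fills column-wise, so the +/-3 shift does not fall cleanly on the first five variables of every observation. A more direct way to impose the intended per-observation shift (an alternative sketch only; X2_alt and X3_alt are not the data used for the results below) is to add a byrow mean matrix to standard-normal noise:
X2_alt <- matrix(rnorm(n * p), n, p) + matrix(mu2, n, p, byrow = TRUE)  # every row shifted by mu2
X3_alt <- matrix(rnorm(n * p), n, p) + matrix(mu3, n, p, byrow = TRUE)  # every row shifted by mu3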
#b.) PCA & plot of PC1 vs PC2
pcaX <- prcomp(X, scale. = TRUE)
scores <- pcaX$x[,1:2]
plot(scores,
col = true.labels, pch = true.labels,
xlab = "PC1", ylab = "PC2",
main = "10(b): PCA score plot")
#c.) K-means with K = 3 on the raw data
km3_raw <- kmeans(X, centers = 3, nstart = 20)
table(true.labels, km3_raw$cluster)
##
## true.labels 1 2 3
## 1 0 0 20
## 2 0 0 20
## 3 5 5 10
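Because k-means cluster numbers are arbitrary, the confusion matrix should be read up to a relabelling of the columns. A small base-R sketch of the agreement under the best relabelling (perms and acc3 are illustrative names, not objects from the original analysis):
perms <- list(c(1,2,3), c(1,3,2), c(2,1,3), c(2,3,1), c(3,1,2), c(3,2,1))
acc3  <- sapply(perms, function(pm) mean(pm[km3_raw$cluster] == true.labels))
max(acc3)   # best achievable agreement with the true labels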
#d.) K-means with K = 2
km2_raw <- kmeans(X, centers = 2, nstart = 20)
table(true.labels, km2_raw$cluster)
##
## true.labels 1 2
## 1 0 20
## 2 0 20
## 3 10 10
#e.) K-means with K = 4
km4_raw <- kmeans(X, centers = 4, nstart = 20)
table(true.labels, km4_raw$cluster)
##
## true.labels 1 2 3 4
## 1 0 0 0 20
## 2 0 0 5 15
## 3 5 5 0 10
#f.) K-means with K = 3 on the first two principal component score vectors
km3_pcs <- kmeans(scores, centers = 3, nstart = 20)
table(true.labels, km3_pcs$cluster)
##
## true.labels 1 2 3
## 1 0 0 20
## 2 0 0 20
## 3 5 5 10
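To see whether clustering on the first two score vectors moves any observations relative to clustering on all 50 variables, the two partitions can be cross-tabulated (again, the cluster numbers themselves are arbitrary):
table(full.data = km3_raw$cluster, pc.scores = km3_pcs$cluster)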
#g.) K-means with K = 3 on the scaled (standardized) data
Xsc2 <- scale(X)
km3_scl <- kmeans(Xsc2, centers = 3, nstart = 20)
table(true.labels, km3_scl$cluster)
##
## true.labels 1 2 3
## 1 0 0 20
## 2 0 0 20
## 3 5 5 10
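The same comparison against the raw-data fit from part (c) shows whether scaling moved any observations; here every variable was generated with standard deviation 1, though scale() divides by the overall sample standard deviation, which is inflated for the mean-shifted variables:
table(raw = km3_raw$cluster, scaled = km3_scl$cluster)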