data(USArrests)
pca_result <- prcomp(USArrests, center = TRUE, scale. = TRUE)
pve_sdev <- pca_result$sdev^2 / sum(pca_result$sdev^2)
cat("(a) PVE using sdev:\n")
## (a) PVE using sdev:
print(pve_sdev)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
# Center and scale the data manually (same preprocessing as in part a)
X <- scale(USArrests)
loadings <- pca_result$rotation # Use loadings from prcomp
# Calculate PVE using Equation 12.10
pve_eq_12_10 <- numeric(ncol(USArrests))
denominator <- sum(X^2)
for (m in 1:ncol(USArrests)) {
# Calculate scores for component m
scores_m <- X %*% loadings[, m]
# Apply equation 12.10: sum(scores_m^2) / sum(X^2)
pve_eq_12_10[m] <- sum(scores_m^2) / denominator
}
cat("\n(b) PVE using Equation 12.10:\n")
##
## (b) PVE using Equation 12.10:
print(pve_eq_12_10)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
# Compare the results
cat("\nAre the results approximately equal? ", all.equal(pve_sdev, pve_eq_12_10), "\n")
##
## Are the results approximately equal? TRUE
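Equivalently (an optional check), since prcomp() stores the score matrix in pca_result$x by default (retx = TRUE), the same PVE can be computed directly from those stored scores without redoing the projection:
# PVE from the stored scores; should agree with the two results above
pve_from_scores <- colSums(pca_result$x^2) / sum(X^2)
all.equal(pve_sdev, unname(pve_from_scores))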
data(USArrests)
# Compute distance matrix
dist_matrix <- dist(USArrests, method = "euclidean")
# Perform hierarchical clustering
hc_complete <- hclust(dist_matrix, method = "complete")
# Plot the dendrogram
plot(hc_complete, main = "Hierarchical Clustering of USArrests",
xlab = "States", sub = "", cex = 0.6)
# Cut the dendrogram to obtain 3 clusters
clusters <- cutree(hc_complete, k = 3)
# View the number of states in each cluster
table(clusters)
## clusters
##  1  2  3
## 16 14 20
# See which states belong to which clusters
states_clusters <- split(rownames(USArrests), clusters)
# Print the states in each cluster
cat("Cluster 1 states:\n")
## Cluster 1 states:
print(states_clusters[[1]])
## [1] "Alabama" "Alaska" "Arizona" "California"
## [5] "Delaware" "Florida" "Illinois" "Louisiana"
## [9] "Maryland" "Michigan" "Mississippi" "Nevada"
## [13] "New Mexico" "New York" "North Carolina" "South Carolina"
cat("\nCluster 2 states:\n")
##
## Cluster 2 states:
print(states_clusters[[2]])
## [1] "Arkansas" "Colorado" "Georgia" "Massachusetts"
## [5] "Missouri" "New Jersey" "Oklahoma" "Oregon"
## [9] "Rhode Island" "Tennessee" "Texas" "Virginia"
## [13] "Washington" "Wyoming"
cat("\nCluster 3 states:\n")
##
## Cluster 3 states:
print(states_clusters[[3]])
## [1] "Connecticut" "Hawaii" "Idaho" "Indiana"
## [5] "Iowa" "Kansas" "Kentucky" "Maine"
## [9] "Minnesota" "Montana" "Nebraska" "New Hampshire"
## [13] "North Dakota" "Ohio" "Pennsylvania" "South Dakota"
## [17] "Utah" "Vermont" "West Virginia" "Wisconsin"
# Scale the data
USArrests_scaled <- scale(USArrests)
# Compute distance matrix on scaled data
dist_matrix_scaled <- dist(USArrests_scaled, method = "euclidean")
# Perform hierarchical clustering
hc_complete_scaled <- hclust(dist_matrix_scaled, method = "complete")
# Plot the dendrogram
plot(hc_complete_scaled, main = "Hierarchical Clustering of Scaled USArrests",
xlab = "States", sub = "", cex = 0.6)
# Cut the scaled dendrogram to get 3 clusters
clusters_scaled <- cutree(hc_complete_scaled, k = 3)
# View the number of states in each cluster
table(clusters_scaled)
## clusters_scaled
##  1  2  3
##  8 11 31
# See which states belong to which clusters after scaling
states_clusters_scaled <- split(rownames(USArrests), clusters_scaled)
# Print the states in each cluster after scaling
cat("\nScaled Cluster 1 states:\n")
##
## Scaled Cluster 1 states:
print(states_clusters_scaled[[1]])
## [1] "Alabama" "Alaska" "Georgia" "Louisiana"
## [5] "Mississippi" "North Carolina" "South Carolina" "Tennessee"
cat("\nScaled Cluster 2 states:\n")
##
## Scaled Cluster 2 states:
print(states_clusters_scaled[[2]])
## [1] "Arizona" "California" "Colorado" "Florida" "Illinois"
## [6] "Maryland" "Michigan" "Nevada" "New Mexico" "New York"
## [11] "Texas"
cat("\nScaled Cluster 3 states:\n")
##
## Scaled Cluster 3 states:
print(states_clusters_scaled[[3]])
## [1] "Arkansas" "Connecticut" "Delaware" "Hawaii"
## [5] "Idaho" "Indiana" "Iowa" "Kansas"
## [9] "Kentucky" "Maine" "Massachusetts" "Minnesota"
## [13] "Missouri" "Montana" "Nebraska" "New Hampshire"
## [17] "New Jersey" "North Dakota" "Ohio" "Oklahoma"
## [21] "Oregon" "Pennsylvania" "Rhode Island" "South Dakota"
## [25] "Utah" "Vermont" "Virginia" "Washington"
## [29] "West Virginia" "Wisconsin" "Wyoming"
# Create a table to compare cluster assignments
comparison_table <- table(clusters, clusters_scaled)
print(comparison_table)
##          clusters_scaled
## clusters  1  2  3
##        1  6  9  1
##        2  2  2 10
##        3  0  0 20
Effect of scaling the variables on hierarchical clustering:

1. The unscaled clustering is heavily influenced by the variable with the largest absolute values (Assault).
2. After scaling, all variables contribute equally to the distance calculations.
3. This results in different cluster compositions, as seen in the comparison table above.
Regarding whether variables should be scaled:

- For the USArrests dataset, scaling is appropriate because the variables have very different ranges (illustrated by the quick check below):
  * Murder: approximately 0.8-17.4
  * Assault: approximately 45-337
  * UrbanPop: approximately 32-91
  * Rape: approximately 7.3-46
- Without scaling, Assault would dominate the clustering due to its much larger magnitude.
- Scaling ensures each variable contributes equally to the analysis.
- In general, scaling should be applied when variables are measured in different units or have substantially different ranges, unless there is a specific reason to give some variables more weight.
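As a quick illustration of these scale differences (not required by the exercise), the per-variable ranges and variances of the raw data can be inspected directly:
# Ranges and variances of the raw USArrests variables; Assault's variance
# is far larger than the others, which is why it dominates unscaled distances
apply(USArrests, 2, range)
apply(USArrests, 2, var)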
set.seed(123)
# Parameters
n_per_class <- 20
num_classes <- 3
num_vars <- 50
# Generate data for each class with mean shifts
class1 <- matrix(rnorm(n_per_class * num_vars, mean = 0, sd = 1), nrow = n_per_class)
class2 <- matrix(rnorm(n_per_class * num_vars, mean = 3, sd = 1), nrow = n_per_class)
class3 <- matrix(rnorm(n_per_class * num_vars, mean = 6, sd = 1), nrow = n_per_class)
# Combine data
data <- rbind(class1, class2, class3)
# Create class labels
true_labels <- c(rep(1, n_per_class), rep(2, n_per_class), rep(3, n_per_class))
# Convert to data frame
sim_data <- data.frame(data)
colnames(sim_data) <- paste0("Var", 1:num_vars)
sim_data$Class <- true_labels
# View the first few rows
head(sim_data[, c(1:5, 51)])
##          Var1       Var2       Var3       Var4         Var5 Class
## 1 -0.56047565 -1.0678237 -0.6947070  0.3796395  0.005764186     1
## 2 -0.23017749 -0.2179749 -0.2079173 -0.5023235  0.385280401     1
## 3  1.55870831 -1.0260044 -1.2653964 -0.3332074 -0.370660032     1
## 4  0.07050839 -0.7288912  2.1689560 -1.0185754  0.644376549     1
## 5  0.12928774 -0.6250393  1.2079620 -1.0717912 -0.220486562     1
## 6  1.71506499 -1.6866933 -1.1231086  0.3035286  0.331781964     1
pca_result <- prcomp(sim_data[, 1:num_vars], center = TRUE, scale. = FALSE)
pca_scores <- pca_result$x[, 1:2]
# Plot the first two principal components with class colors
plot(pca_scores,
col = c("red", "green", "blue")[true_labels],
pch = 16,
xlab = "PC1", ylab = "PC2",
main = "PCA: First Two Principal Components")
legend("topright", legend = paste("Class", 1:3),
col = c("red", "green", "blue"), pch = 16)
set.seed(123)
kmeans_3 <- kmeans(sim_data[, 1:num_vars], centers = 3)
clusters_3 <- kmeans_3$cluster
# Compare clusters to true class labels using a contingency table
contingency_3 <- table(Cluster = clusters_3, "True Class" = true_labels)
print("K-means with K=3 vs True Classes:")
## [1] "K-means with K=3 vs True Classes:"
print(contingency_3)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 20  0  0
##       3  0  0 20
The K-means clustering with K = 3 perfectly recovers the true class structure: each cluster corresponds exactly to one of the true classes (the cluster numbers themselves are arbitrary, so the match is up to relabelling).
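A quick way to verify this from the contingency table (an optional check, not part of the exercise) is to count how many distinct true classes appear in each cluster; a vector of all 1s confirms the one-to-one match up to relabelling:
# Number of true classes represented in each K = 3 cluster (all should be 1)
apply(contingency_3, 1, function(counts) sum(counts > 0))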
set.seed(123)
kmeans_2 <- kmeans(sim_data[, 1:num_vars], centers = 2)
clusters_2 <- kmeans_2$cluster
# Compare clusters to true class labels
contingency_2 <- table(Cluster = clusters_2, "True Class" = true_labels)
print("\nK-means with K=2 vs True Classes:")
## [1] "\nK-means with K=2 vs True Classes:"
print(contingency_2)
##        True Class
## Cluster  1  2  3
##       1  0 20 20
##       2 20  0  0
With K = 2, the algorithm merges classes 2 and 3 into a single cluster, while class 1 forms its own cluster. Since the three class means are equally spaced, forcing K = 2 necessarily lumps two of the true classes together; in this run it is classes 2 and 3 that are combined.
set.seed(123)
kmeans_4 <- kmeans(sim_data[, 1:num_vars], centers = 4)
clusters_4 <- kmeans_4$cluster
# Compare clusters to true class labels
contingency_4 <- table(Cluster = clusters_4, "True Class" = true_labels)
print("\nK-means with K=4 vs True Classes:")
## [1] "\nK-means with K=4 vs True Classes:"
print(contingency_4)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 12  0  0
##       3  0  0 20
##       4  8  0  0
With K = 4, class 1 is split across two clusters (12 and 8 observations), while classes 2 and 3 each remain in a single cluster. Because there are only three true groups, forcing K = 4 must subdivide one of them; the split of class 1 reflects this extra, unnecessary cluster rather than genuine additional structure within that class.
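To put the three choices of K side by side, one can also compare the total within-cluster sum of squares that kmeans() returns (a rough, illustrative comparison only, since this quantity always decreases as K grows):
# Total within-cluster sum of squares for K = 2, 3, 4
c(K2 = kmeans_2$tot.withinss,
  K3 = kmeans_3$tot.withinss,
  K4 = kmeans_4$tot.withinss)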
set.seed(123)
kmeans_3_pca <- kmeans(pca_scores, centers = 3)
clusters_3_pca <- kmeans_3_pca$cluster
# Compare clusters to true class labels
contingency_3_pca <- table(Cluster = clusters_3_pca, "True Class" = true_labels)
print("\nK-means with K=3 on PCA scores vs True Classes:")
## [1] "\nK-means with K=3 on PCA scores vs True Classes:"
print(contingency_3_pca)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 20  0  0
##       3  0  0 20
K-means on just the first two principal component scores still recovers the true classes perfectly. The first two components capture essentially all of the between-class separation, so clustering in this two-dimensional representation is as accurate as, and much cheaper than, clustering on all 50 original variables.
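As an optional check of this claim, the variance explained by the leading components can be read off the prcomp summary (output omitted here):
# Proportion of variance explained by the first few PCs of the simulated data
summary(pca_result)$importance[, 1:3]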
scaled_data <- scale(sim_data[, 1:num_vars])
set.seed(123)
kmeans_3_scaled <- kmeans(scaled_data, centers = 3)
clusters_3_scaled <- kmeans_3_scaled$cluster
# Compare clusters to true class labels
contingency_3_scaled <- table(Cluster = clusters_3_scaled, "True Class" = true_labels)
print("\nK-means with K=3 on scaled data vs True Classes:")
## [1] "\nK-means with K=3 on scaled data vs True Classes:"
print(contingency_3_scaled)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 20  0  0
##       3  0  0 20
Scaling the data does not change the clustering results in this case, because every simulated variable was generated with the same standard deviation (sd = 1), so the variables are already on a common scale. In real datasets with variables measured in different units or on very different scales, scaling would be important to prevent the variables with the largest variances from dominating the distance calculations; it ensures each variable contributes comparably to the clustering.
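As a quick illustration of why scaling is essentially a no-op here, the sample standard deviations of the 50 simulated variables can be summarised; they are all close to the generating value of 1:
# Per-variable standard deviations of the simulated data (all near 1)
summary(apply(sim_data[, 1:num_vars], 2, sd))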