Lab 11

8.

data(USArrests)

a. Using the sdev output of the prcomp() function

pca_result <- prcomp(USArrests, center = TRUE, scale. = TRUE)
pve_sdev <- pca_result$sdev^2 / sum(pca_result$sdev^2)
cat("(a) PVE using sdev:\n")
## (a) PVE using sdev:
print(pve_sdev)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752

b. Using Equation 12.10 directly
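
For reference, the proportion of variance explained (PVE) by the mth principal component, as given in Equation 12.10, is

$$\mathrm{PVE}_m = \frac{\sum_{i=1}^{n}\left(\sum_{j=1}^{p}\phi_{jm}x_{ij}\right)^{2}}{\sum_{j=1}^{p}\sum_{i=1}^{n}x_{ij}^{2}}$$

where x_ij are the centered (and here also scaled) observations and phi_jm is the loading of variable j on component m. The code below applies this formula directly.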

# Center and scale the data manually (same preprocessing as in part a)
X <- scale(USArrests)
loadings <- pca_result$rotation  # Use loadings from prcomp

# Calculate PVE using Equation 12.10
pve_eq_12_10 <- numeric(ncol(USArrests))
denominator <- sum(X^2)
for (m in 1:ncol(USArrests)) {
  # Calculate scores for component m
  scores_m <- X %*% loadings[, m]
  # Apply equation 12.10: sum(scores_m^2) / sum(X^2)
  pve_eq_12_10[m] <- sum(scores_m^2) / denominator
}

cat("\n(b) PVE using Equation 12.10:\n")
## 
## (b) PVE using Equation 12.10:
print(pve_eq_12_10)
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
# Compare the results
cat("\nAre the results approximately equal? ", all.equal(pve_sdev, pve_eq_12_10), "\n")
## 
## Are the results approximately equal?  TRUE
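
As an optional visual check (not part of the required output), the PVE values computed above can be shown as a scree plot alongside the cumulative PVE:

# Optional: scree plot and cumulative PVE (uses pve_sdev from part a)
par(mfrow = c(1, 2))
plot(pve_sdev, type = "b", xlab = "Principal Component",
     ylab = "Proportion of Variance Explained", ylim = c(0, 1))
plot(cumsum(pve_sdev), type = "b", xlab = "Principal Component",
     ylab = "Cumulative PVE", ylim = c(0, 1))
par(mfrow = c(1, 1))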

9.

data(USArrests)

a. Hierarchical clustering with complete linkage and Euclidean distance

# Compute distance matrix
dist_matrix <- dist(USArrests, method = "euclidean")

# Perform hierarchical clustering
hc_complete <- hclust(dist_matrix, method = "complete")

# Plot the dendrogram
plot(hc_complete, main = "Hierarchical Clustering of USArrests",
     xlab = "States", sub = "", cex = 0.6)

b. Cut the dendrogram to get 3 clusters

clusters <- cutree(hc_complete, k = 3)
# View the number of states in each cluster
table(clusters)
## clusters
##  1  2  3 
## 16 14 20
# See which states belong to which clusters
states_clusters <- split(rownames(USArrests), clusters)
# Print the states in each cluster
cat("Cluster 1 states:\n")
## Cluster 1 states:
print(states_clusters[[1]])
##  [1] "Alabama"        "Alaska"         "Arizona"        "California"    
##  [5] "Delaware"       "Florida"        "Illinois"       "Louisiana"     
##  [9] "Maryland"       "Michigan"       "Mississippi"    "Nevada"        
## [13] "New Mexico"     "New York"       "North Carolina" "South Carolina"
cat("\nCluster 2 states:\n")
## 
## Cluster 2 states:
print(states_clusters[[2]])
##  [1] "Arkansas"      "Colorado"      "Georgia"       "Massachusetts"
##  [5] "Missouri"      "New Jersey"    "Oklahoma"      "Oregon"       
##  [9] "Rhode Island"  "Tennessee"     "Texas"         "Virginia"     
## [13] "Washington"    "Wyoming"
cat("\nCluster 3 states:\n")
## 
## Cluster 3 states:
print(states_clusters[[3]])
##  [1] "Connecticut"   "Hawaii"        "Idaho"         "Indiana"      
##  [5] "Iowa"          "Kansas"        "Kentucky"      "Maine"        
##  [9] "Minnesota"     "Montana"       "Nebraska"      "New Hampshire"
## [13] "North Dakota"  "Ohio"          "Pennsylvania"  "South Dakota" 
## [17] "Utah"          "Vermont"       "West Virginia" "Wisconsin"

c. Hierarchical clustering after scaling the variables

# Scale the data
USArrests_scaled <- scale(USArrests)

# Compute distance matrix on scaled data
dist_matrix_scaled <- dist(USArrests_scaled, method = "euclidean")

# Perform hierarchical clustering
hc_complete_scaled <- hclust(dist_matrix_scaled, method = "complete")

# Plot the dendrogram
plot(hc_complete_scaled, main = "Hierarchical Clustering of Scaled USArrests",
     xlab = "States", sub = "", cex = 0.6)

# Cut the scaled dendrogram to get 3 clusters
clusters_scaled <- cutree(hc_complete_scaled, k = 3)

# View the number of states in each cluster
table(clusters_scaled)
## clusters_scaled
##  1  2  3 
##  8 11 31
# See which states belong to which clusters after scaling
states_clusters_scaled <- split(rownames(USArrests), clusters_scaled)

# Print the states in each cluster after scaling
cat("\nScaled Cluster 1 states:\n")
## 
## Scaled Cluster 1 states:
print(states_clusters_scaled[[1]])
## [1] "Alabama"        "Alaska"         "Georgia"        "Louisiana"     
## [5] "Mississippi"    "North Carolina" "South Carolina" "Tennessee"
cat("\nScaled Cluster 2 states:\n")
## 
## Scaled Cluster 2 states:
print(states_clusters_scaled[[2]])
##  [1] "Arizona"    "California" "Colorado"   "Florida"    "Illinois"  
##  [6] "Maryland"   "Michigan"   "Nevada"     "New Mexico" "New York"  
## [11] "Texas"
cat("\nScaled Cluster 3 states:\n")
## 
## Scaled Cluster 3 states:
print(states_clusters_scaled[[3]])
##  [1] "Arkansas"      "Connecticut"   "Delaware"      "Hawaii"       
##  [5] "Idaho"         "Indiana"       "Iowa"          "Kansas"       
##  [9] "Kentucky"      "Maine"         "Massachusetts" "Minnesota"    
## [13] "Missouri"      "Montana"       "Nebraska"      "New Hampshire"
## [17] "New Jersey"    "North Dakota"  "Ohio"          "Oklahoma"     
## [21] "Oregon"        "Pennsylvania"  "Rhode Island"  "South Dakota" 
## [25] "Utah"          "Vermont"       "Virginia"      "Washington"   
## [29] "West Virginia" "Wisconsin"     "Wyoming"

d. Compare the clusters before and after scaling

# Create a table to compare cluster assignments
comparison_table <- table(clusters, clusters_scaled)
print(comparison_table)
##         clusters_scaled
## clusters  1  2  3
##        1  6  9  1
##        2  2  2 10
##        3  0  0 20

Effect of scaling the variables on hierarchical clustering:

  1. The unscaled clustering is heavily influenced by the variable with the largest values and variance (Assault); see the variance check sketched below.

  2. After scaling, all four variables contribute comparably to the distance calculations.

  3. This results in different cluster compositions, as seen in the comparison table above.
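
A quick look at the per-variable variances of the raw data (not part of the original output) makes the first point concrete:

# Variances of the unscaled variables; Assault's variance is far larger than the
# others, so it dominates the Euclidean distances when the data are not scaled
apply(USArrests, 2, var)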

Regarding whether the variables should be scaled:

  • For the USArrests data, scaling is appropriate because the variables have very different ranges:
      * Murder: approximately 0.8-17.4
      * Assault: approximately 45-337
      * UrbanPop: approximately 32-91
      * Rape: approximately 7.3-46

  • Without scaling, Assault would dominate the clustering due to its much larger magnitude.

  • Scaling ensures each variable contributes comparably to the analysis.

  • In general, variables should be scaled when they are measured in different units or have substantially different ranges, unless there is a specific reason to give some variables more weight.


10.

a. Generate a simulated data set with 3 classes and 50 variables

set.seed(123)

# Parameters
n_per_class <- 20
num_classes <- 3
num_vars <- 50

# Generate data for each class with mean shifts
class1 <- matrix(rnorm(n_per_class * num_vars, mean = 0, sd = 1), nrow = n_per_class)
class2 <- matrix(rnorm(n_per_class * num_vars, mean = 3, sd = 1), nrow = n_per_class)
class3 <- matrix(rnorm(n_per_class * num_vars, mean = 6, sd = 1), nrow = n_per_class)

# Combine data
data <- rbind(class1, class2, class3)

# Create class labels
true_labels <- c(rep(1, n_per_class), rep(2, n_per_class), rep(3, n_per_class))

# Convert to data frame
sim_data <- data.frame(data)
colnames(sim_data) <- paste0("Var", 1:num_vars)
sim_data$Class <- true_labels

# View the first few rows
head(sim_data[, c(1:5, 51)])
##          Var1       Var2       Var3       Var4         Var5 Class
## 1 -0.56047565 -1.0678237 -0.6947070  0.3796395  0.005764186     1
## 2 -0.23017749 -0.2179749 -0.2079173 -0.5023235  0.385280401     1
## 3  1.55870831 -1.0260044 -1.2653964 -0.3332074 -0.370660032     1
## 4  0.07050839 -0.7288912  2.1689560 -1.0185754  0.644376549     1
## 5  0.12928774 -0.6250393  1.2079620 -1.0717912 -0.220486562     1
## 6  1.71506499 -1.6866933 -1.1231086  0.3035286  0.331781964     1

b. Perform PCA and plot first two principal component score vectors

pca_result <- prcomp(sim_data[, 1:num_vars], center = TRUE, scale. = FALSE)
pca_scores <- pca_result$x[, 1:2]

# Plot the first two principal components with class colors
plot(pca_scores, 
     col = c("red", "green", "blue")[true_labels],
     pch = 16, 
     xlab = "PC1", ylab = "PC2",
     main = "PCA: First Two Principal Components")
legend("topright", legend = paste("Class", 1:3), 
       col = c("red", "green", "blue"), pch = 16)

c. Perform K-means clustering with K=3

set.seed(123)
kmeans_3 <- kmeans(sim_data[, 1:num_vars], centers = 3)
clusters_3 <- kmeans_3$cluster

# Compare clusters to true class labels using a contingency table
contingency_3 <- table(Cluster = clusters_3, "True Class" = true_labels)
print("K-means with K=3 vs True Classes:")
## [1] "K-means with K=3 vs True Classes:"
print(contingency_3)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 20  0  0
##       3  0  0 20
  • The K-means clustering with K=3 perfectly recovers the true class structure.

  • Each cluster corresponds exactly to one of the true classes.

d. Perform K-means clustering with K=2

set.seed(123)
kmeans_2 <- kmeans(sim_data[, 1:num_vars], centers = 2)
clusters_2 <- kmeans_2$cluster

# Compare clusters to true class labels
contingency_2 <- table(Cluster = clusters_2, "True Class" = true_labels)
print("\nK-means with K=2 vs True Classes:")
## [1] "\nK-means with K=2 vs True Classes:"
print(contingency_2)
##        True Class
## Cluster  1  2  3
##       1  0 20 20
##       2 20  0  0
  • With K=2, classes 2 and 3 are merged into a single cluster, while class 1 forms its own cluster.

  • With only two clusters available, K-means must combine two of the three true classes; in this run the classes with the larger mean shifts (classes 2 and 3) end up together.

e. Perform K-means clustering with K=4

set.seed(123)
kmeans_4 <- kmeans(sim_data[, 1:num_vars], centers = 4)
clusters_4 <- kmeans_4$cluster

# Compare clusters to true class labels
contingency_4 <- table(Cluster = clusters_4, "True Class" = true_labels)
print("\nK-means with K=4 vs True Classes:")
## [1] "\nK-means with K=4 vs True Classes:"
print(contingency_4)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 12  0  0
##       3  0  0 20
##       4  8  0  0
  • With K=4, class 1 is split across two clusters (12 and 8 observations), while classes 2 and 3 each remain intact.

  • Because K exceeds the number of true groups, K-means is forced to subdivide one of them; in this run it is class 1 that gets partitioned into two sub-clusters.

f. Perform K-means clustering with K=3 on the first two principal component score vectors

set.seed(123)
kmeans_3_pca <- kmeans(pca_scores, centers = 3)
clusters_3_pca <- kmeans_3_pca$cluster

# Compare clusters to true class labels
contingency_3_pca <- table(Cluster = clusters_3_pca, "True Class" = true_labels)
print("\nK-means with K=3 on PCA scores vs True Classes:")
## [1] "\nK-means with K=3 on PCA scores vs True Classes:"
print(contingency_3_pca)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 20  0  0
##       3  0  0 20
  • K-means on just the first two PC scores still perfectly recovers the true classes.

  • This indicates that the first two principal components capture enough of the class separation structure to allow perfect clustering, which is more efficient than using all 50 variables.
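
As a supporting check (not part of the original output), the share of the total variance captured by the first two components can be read from the PCA summary:

# Cumulative proportion of variance explained by PC1 and PC2 (uses pca_result from part b)
summary(pca_result)$importance["Cumulative Proportion", 1:2]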

g. Perform K-means clustering with K=3 on scaled data

scaled_data <- scale(sim_data[, 1:num_vars])
set.seed(123)
kmeans_3_scaled <- kmeans(scaled_data, centers = 3)
clusters_3_scaled <- kmeans_3_scaled$cluster

# Compare clusters to true class labels
contingency_3_scaled <- table(Cluster = clusters_3_scaled, "True Class" = true_labels)
print("\nK-means with K=3 on scaled data vs True Classes:")
## [1] "\nK-means with K=3 on scaled data vs True Classes:"
print(contingency_3_scaled)
##        True Class
## Cluster  1  2  3
##       1  0 20  0
##       2 20  0  0
##       3  0  0 20
  • Scaling the data doesn’t change the clustering results in this case.

  • This is because every simulated variable was generated with the same standard deviation and the same mean shifts, so the variables are already on a comparable scale (a quick check is sketched after this list).

  • In real datasets with variables on different scales, scaling would be important to prevent variables with larger scales from dominating the distance calculations.

  • Scaling ensures each variable contributes equally to the clustering process.
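
A quick check behind the first two points (not part of the original output): since every simulated variable was generated in the same way, the per-variable standard deviations should all be similar, which is why scaling has little effect here.

# Range of the per-variable standard deviations across the 50 simulated variables
range(apply(sim_data[, 1:num_vars], 2, sd))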