Introduction

In this analysis, we cluster the USArrests dataset using two approaches, K-Means and hierarchical clustering, and visualize the results with PCA projections and dendrograms to compare the outcomes.

# Load and preview the dataset
data(USArrests)
head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
dim(USArrests)
## [1] 50  4
summary(USArrests)
##      Murder          Assault         UrbanPop          Rape      
##  Min.   : 0.800   Min.   : 45.0   Min.   :32.00   Min.   : 7.30  
##  1st Qu.: 4.075   1st Qu.:109.0   1st Qu.:54.50   1st Qu.:15.07  
##  Median : 7.250   Median :159.0   Median :66.00   Median :20.10  
##  Mean   : 7.788   Mean   :170.8   Mean   :65.54   Mean   :21.23  
##  3rd Qu.:11.250   3rd Qu.:249.0   3rd Qu.:77.75   3rd Qu.:26.18  
##  Max.   :17.400   Max.   :337.0   Max.   :91.00   Max.   :46.00

Data Scaling

Before applying clustering algorithms, we standardize the dataset so that all features contribute equally to the distance calculations. This matters here because the variables sit on very different scales: Assault ranges up to 337 while Murder tops out at 17.4, so unscaled Euclidean distances would be dominated by Assault.

# Scale the data
scaled_data <- scale(USArrests)
head(scaled_data)
##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207
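
As a quick sanity check, each standardized column should now have mean 0 and standard deviation 1:

# Verify the standardization: column means ~ 0, standard deviations = 1
round(colMeans(scaled_data), 10)
apply(scaled_data, 2, sd)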

K-Means Clustering

We first apply the K-Means clustering algorithm to the scaled dataset and use the elbow method to choose the number of clusters: we plot the total within-cluster sum of squares against the number of clusters k and look for the "elbow" where additional clusters give only marginal improvement.

# Elbow method to determine the number of clusters
set.seed(123)  # fix the random starts so the elbow curve is reproducible
wss <- (nrow(scaled_data) - 1) * sum(apply(scaled_data, 2, var))  # k = 1: total sum of squares
for (i in 2:10) wss[i] <- kmeans(scaled_data, centers = i, nstart = 25)$tot.withinss

# Plot the Elbow Method
plot(1:10, wss, type = "b", pch = 19, frame = FALSE, 
     xlab = "Number of Clusters", ylab = "Total Within-Cluster Sum of Squares")
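
The same elbow curve can also be produced with factoextra's fviz_nbclust, which wraps this loop; a minimal equivalent sketch:

# Alternative elbow plot via factoextra (same WSS criterion)
library(factoextra)
fviz_nbclust(scaled_data, kmeans, method = "wss", k.max = 10, nstart = 25)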

# Perform K-Means clustering with k = 4, chosen from the elbow plot
set.seed(123)
kmeans_result <- kmeans(scaled_data, centers = 4, nstart = 25)
print(kmeans_result)
## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
## 
## Cluster means:
##       Murder    Assault   UrbanPop        Rape
## 1  1.4118898  0.8743346 -0.8145211  0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001  0.5758298 -0.26165379
## 4  0.6950701  1.0394414  0.7226370  1.27693964
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              4              4              1              4 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              4              3              3              4              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              2              4              3              2 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              2              1              2              4 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              4              2              1              4 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              2              2              4              2              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              4              4              1              2              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              2              1              4              3              2 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              2              2              3 
## 
## Within cluster sum of squares by cluster:
## [1]  8.316061 11.952463 16.212213 19.922437
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

PCA for Visualization

We use Principal Component Analysis (PCA) to visualize the clusters in a two-dimensional space.

# Perform PCA for visualization
pca <- prcomp(scaled_data)
pca_data <- data.frame(pca$x[, 1:2], Cluster = as.factor(kmeans_result$cluster))

# Visualize the clusters
library(ggplot2)
ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3) +
  labs(title = "K-Means Clustering of USArrests Data", x = "Principal Component 1", y = "Principal Component 2") +
  theme_minimal()
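
Before reading too much into the 2-D view, it is worth checking how much variance the first two components actually capture; for the scaled USArrests data they account for roughly 87% of the total:

# Proportion of variance explained by each principal component
summary(pca)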

Hierarchical Clustering

Next, we apply hierarchical clustering using Ward’s method and visualize the results with a dendrogram.

# Perform hierarchical clustering
res.dist <- dist(scaled_data, method = "euclidean")
res.hc <- hclust(res.dist, method = "ward.D2")

# Plot dendrogram
library(factoextra)
fviz_dend(res.hc, cex = 0.5) + 
  theme(legend.position = "none")  # Remove the legend

Cutting the Dendrogram

We cut the dendrogram into 4 clusters to compare the hierarchical clustering results with K-Means.

# Cutting the dendrogram into 4 clusters
nc <- cutree(res.hc, k = 4)
table(nc)
## nc
##  1  2  3  4 
##  7 12 19 12
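
Because both methods were run with four clusters, a quick cross-tabulation shows how well the hierarchical assignments line up with the K-Means labels:

# Cross-tabulate hierarchical vs. K-Means cluster assignments
table(Hierarchical = nc, KMeans = kmeans_result$cluster)
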
# Visualizing with colored dendrogram
fviz_dend(res.hc, k = 4, 
          cex = 0.5, 
          k_colors = c("red", "green", "blue", "yellow"),
          rect = TRUE, rect_border = "gray", rect_fill = TRUE)

Comparing Dendrograms

We compare dendrograms built with different linkage methods (average, complete, and centroid) to see how the choice of linkage changes the resulting tree.

# Perform clustering using different methods
hc1 <- hclust(res.dist, method = "average")
hc2 <- hclust(res.dist, method = "complete")
hc3 <- hclust(res.dist^2, method = "centroid")  # centroid linkage expects squared Euclidean distances

# Convert to dendrograms and compare
suppressPackageStartupMessages(library(dendextend))
den1 <- as.dendrogram(hc1)
den2 <- as.dendrogram(hc2)
den3 <- as.dendrogram(hc3)

# Compare using tanglegram
tanglegram(den1, den2)
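
The visual comparison can be quantified with dendextend's entanglement measure, which ranges from 0 (trees perfectly aligned) to 1 (fully entangled):

# Lower entanglement means more similar leaf orderings between the trees
entanglement(den1, den2)
entanglement(den1, den3)  # centroid tree, not shown in the tanglegram above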

Davies-Bouldin Index

We compute the Davies-Bouldin Index to assess the quality of the K-Means clustering. Lower values indicate more compact, better-separated clusters.

# Load necessary library
library(clusterSim)
# Compute Davies-Bouldin Index for K-Means result
db_index <- index.DB(scaled_data, kmeans_result$cluster)$DB
print(paste("Davies-Bouldin Index: ", db_index))
## [1] "Davies-Bouldin Index:  1.05733784441662"

Conclusion

This analysis of the USArrests dataset employed both K-Means and hierarchical clustering. Scaling the data gave each feature an equal contribution, and the elbow method pointed to four clusters for K-Means. PCA visualizations showed clear group separation. Hierarchical clustering with Ward’s method, complemented by tanglegram comparisons, revealed how different linkage choices produce different groupings. The Davies-Bouldin Index of 1.057 indicated satisfactory cluster separation. Overall, this study illustrates the effectiveness of clustering techniques in identifying patterns in multivariate data.