Cluster Analysis

library(factoextra)

## Warning: package 'factoextra' was built under R version 4.3.3

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

Data preparation

The built-in R dataset USArrests is used:

# Load and scale the dataset
data("USArrests")
df <- scale(USArrests)
head(df)

##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207

Distance matrix computation and visualization

library(factoextra)
# Correlation-based distance method
res.dist <- get_dist(df, method = "pearson")
head(round(as.matrix(res.dist), 2))[, 1:6]

##            Alabama Alaska Arizona Arkansas California Colorado
## Alabama       0.00   0.71    1.45     0.09       1.87     1.69
## Alaska        0.71   0.00    0.83     0.37       0.81     0.52
## Arizona       1.45   0.83    0.00     1.18       0.29     0.60
## Arkansas      0.09   0.37    1.18     0.00       1.59     1.37
## California    1.87   0.81    0.29     1.59       0.00     0.11
## Colorado      1.69   0.52    0.60     1.37       0.11     0.00

# Visualize the dissimilarity matrix
fviz_dist(res.dist, lab_size = 8)

Enhanced clustering analysis

The standard R code for computing hierarchical clustering looks like this:

# Load and scale the dataset
data("USArrests")
df <- scale(USArrests)

# Compute dissimilarity matrix
res.dist <- dist(df, method = "euclidean")

# Compute hierarchical clustering
res.hc <- hclust(res.dist, method = "ward.D2")

# Visualize
plot(res.hc, cex = 0.5)

In this section we’ll describe the eclust() function [factoextra package] to simplify the workflow. The format is as follow:

In the following R code, we’ll show some examples for enhanced k-means clustering and hierarchical clustering. Note that the same analysis can be done for PAM, CLARA, FANNY, AGNES and DIANA.

library("factoextra")

# Enhanced k-means clustering
res.km <- eclust(df, "kmeans", nstart = 25)

# Gap statistic plot
fviz_gap_stat(res.km$gap_stat)

# Silhouette plot
fviz_silhouette(res.km)

##   cluster size ave.sil.width
## 1       1   13          0.37
## 2       2   20          0.26
## 3       3   17          0.32

# Optimal number of clusters using gap statistics
res.km$nbclust

## [1] 3

# Print result
 res.km

## K-means clustering with 3 clusters of sizes 13, 20, 17
## 
## Cluster means:
##       Murder    Assault   UrbanPop       Rape
## 1 -0.9615407 -1.1066010 -0.9301069 -0.9667633
## 2  1.0049340  1.0138274  0.1975853  0.8469650
## 3 -0.4469795 -0.3465138  0.4788049 -0.2571398
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              2              2              2              3              2 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              2              3              3              2              2 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              1              2              3              1 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              1              2              1              2 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              2              1              2              2 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              1              1              2              1              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              2              2              2              1              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              2 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              1              2              2              3              1 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              1              1              3 
## 
## Within cluster sum of squares by cluster:
## [1] 11.95246 46.74796 19.62285
##  (between_SS / total_SS =  60.0 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "clust_plot"  
## [11] "silinfo"      "nbclust"      "data"         "gap_stat"

 # Enhanced hierarchical clustering
 res.hc <- eclust(df, "hclust") # compute hclust

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

 fviz_dend(res.hc, rect = TRUE) # dendrogam

fviz_silhouette(res.hc) # silhouette plot

##   cluster size ave.sil.width
## 1       1   19          0.26
## 2       2   19          0.28
## 3       3   12          0.43

 fviz_cluster(res.hc) # scatter plot

eclust(df, "kmeans", k = 4)

## K-means clustering with 4 clusters of sizes 8, 13, 16, 13
## 
## Cluster means:
##       Murder    Assault   UrbanPop        Rape
## 1  1.4118898  0.8743346 -0.8145211  0.01927104
## 2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
## 3 -0.4894375 -0.3826001  0.5758298 -0.26165379
## 4  0.6950701  1.0394414  0.7226370  1.27693964
## 
## Clustering vector:
##        Alabama         Alaska        Arizona       Arkansas     California 
##              1              4              4              1              4 
##       Colorado    Connecticut       Delaware        Florida        Georgia 
##              4              3              3              4              1 
##         Hawaii          Idaho       Illinois        Indiana           Iowa 
##              3              2              4              3              2 
##         Kansas       Kentucky      Louisiana          Maine       Maryland 
##              3              2              1              2              4 
##  Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
##              3              4              2              1              4 
##        Montana       Nebraska         Nevada  New Hampshire     New Jersey 
##              2              2              4              2              3 
##     New Mexico       New York North Carolina   North Dakota           Ohio 
##              4              4              1              2              3 
##       Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
##              3              3              3              3              1 
##   South Dakota      Tennessee          Texas           Utah        Vermont 
##              2              1              4              3              2 
##       Virginia     Washington  West Virginia      Wisconsin        Wyoming 
##              3              3              2              2              3 
## 
## Within cluster sum of squares by cluster:
## [1]  8.316061 11.952463 16.212213 19.922437
##  (between_SS / total_SS =  71.2 %)
## 
## Available components:
## 
##  [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
##  [6] "betweenss"    "size"         "iter"         "ifault"       "clust_plot"  
## [11] "silinfo"      "nbclust"      "data"

Cluster Analysis

DEVINE FATHY MAE S. GRINO

2024-05-17

Data preparation

Distance matrix computation and visualization

Enhanced clustering analysis