Cluster Analysis

Cluster Analysis in R Simplified and Enhanced

Required packages

library(factoextra)
library(cluster)

Data Preparation

# Load the built-in USArrests data and standardize every column:
# each variable is centered on its mean and divided by its standard deviation
data("USArrests")
df <- scale(USArrests, center = TRUE, scale = TRUE)
# Preview the first rows of the standardized matrix
head(df)

               Murder   Assault   UrbanPop         Rape
Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
Arizona    0.07163341 1.4788032  0.9989801  1.042878388
Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144  1.7589234  2.067820292
Colorado   0.02571456 0.3988593  0.8608085  1.864967207

Distance matrix computation and visualization

# Pairwise dissimilarity between states, using the correlation-based
# ("pearson") method of get_dist()
res.dist <- get_dist(df, method = "pearson")
# Preview: first six states against the first six states, rounded to 2 decimals
rounded_dist <- round(as.matrix(res.dist), 2)
head(rounded_dist)[, 1:6]

           Alabama Alaska Arizona Arkansas California Colorado
Alabama       0.00   0.71    1.45     0.09       1.87     1.69
Alaska        0.71   0.00    0.83     0.37       0.81     0.52
Arizona       1.45   0.83    0.00     1.18       0.29     0.60
Arkansas      0.09   0.37    1.18     0.00       1.59     1.37
California    1.87   0.81    0.29     1.59       0.00     0.11
Colorado      1.69   0.52    0.60     1.37       0.11     0.00

# Visualize the dissimilarity matrix (state-by-state distances in res.dist);
# lab_size = 8 sets the text size of the state labels
fviz_dist(res.dist, lab_size = 8)

Enhanced clustering analysis

# Load and scale the dataset
# (same steps as in "Data Preparation": df is the standardized USArrests matrix)
data("USArrests")
df <- scale(USArrests)

# Compute dissimilarity matrix
# NOTE: this replaces the Pearson-based res.dist from the previous section
# with Euclidean distances between the scaled observations
res.dist <- dist(df, method = "euclidean")

# Compute hierarchical clustering on those distances with Ward's method ("ward.D2")
res.hc <- hclust(res.dist, method = "ward.D2")

# Visualize the resulting tree as a dendrogram; cex = 0.5 shrinks the label text
plot(res.hc, cex = 0.5)

# Enhanced k-means clustering
# k is not supplied, so eclust() estimates the number of clusters itself; the
# result carries that choice in res.km$nbclust and the gap statistic in
# res.km$gap_stat. nstart = 20 is presumably forwarded to kmeans() as the
# number of random starts -- confirm against the eclust() documentation.
res.km <- eclust(df, "kmeans", nstart = 20)

# Gap statistic plot, drawn from the gap statistic stored on the eclust() result
fviz_gap_stat(res.km$gap_stat)

# Silhouette plot for the k-means result; the call also prints a per-cluster
# summary (cluster size and average silhouette width)
fviz_silhouette(res.km)

  cluster size ave.sil.width
1       1   13          0.37
2       2   13          0.27
3       3   16          0.34
4       4    8          0.39

# Optimal number of clusters using gap statistics
# (the value eclust() stored on the result as nbclust)
res.km$nbclust

[1] 4

# Print result: cluster sizes, cluster means (in the scaled units of df),
# the cluster assigned to each state, and the within-cluster sums of squares
 res.km

K-means clustering with 4 clusters of sizes 13, 13, 16, 8

Cluster means:
      Murder    Assault   UrbanPop        Rape
1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
2  0.6950701  1.0394414  0.7226370  1.27693964
3 -0.4894375 -0.3826001  0.5758298 -0.26165379
4  1.4118898  0.8743346 -0.8145211  0.01927104

Clustering vector:
       Alabama         Alaska        Arizona       Arkansas     California 
             4              2              2              4              2 
      Colorado    Connecticut       Delaware        Florida        Georgia 
             2              3              3              2              4 
        Hawaii          Idaho       Illinois        Indiana           Iowa 
             3              1              2              3              1 
        Kansas       Kentucky      Louisiana          Maine       Maryland 
             3              1              4              1              2 
 Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
             3              2              1              4              2 
       Montana       Nebraska         Nevada  New Hampshire     New Jersey 
             1              1              2              1              3 
    New Mexico       New York North Carolina   North Dakota           Ohio 
             2              2              4              1              3 
      Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
             3              3              3              3              4 
  South Dakota      Tennessee          Texas           Utah        Vermont 
             1              4              2              3              1 
      Virginia     Washington  West Virginia      Wisconsin        Wyoming 
             3              3              1              1              3 

Within cluster sum of squares by cluster:
[1] 11.952463 19.922437 16.212213  8.316061
 (between_SS / total_SS =  71.2 %)

Available components:

 [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
 [6] "betweenss"    "size"         "iter"         "ifault"       "clust_plot"  
[11] "silinfo"      "nbclust"      "data"         "gap_stat"

 # Enhanced hierarchical clustering
 # k is not supplied, so eclust() chooses the number of clusters itself.
 # NOTE: this overwrites the res.hc object created earlier with hclust().
 res.hc <- eclust(df, "hclust") # compute hclust

 fviz_dend(res.hc, rect = TRUE) # dendrogram, with rect = TRUE to box the clusters

fviz_silhouette(res.hc) # silhouette plot; also prints the per-cluster summary

  cluster size ave.sil.width
1       1   19          0.26
2       2   19          0.28
3       3   12          0.43

 fviz_cluster(res.hc) # scatter plot of the clustered observations

# K-means with the number of clusters fixed at k = 4. The result is not
# assigned, so it is printed directly. Cluster numbers are arbitrary labels:
# the output shows the same four cluster means as res.km, numbered differently.
eclust(df, "kmeans", k = 4)

K-means clustering with 4 clusters of sizes 8, 13, 16, 13

Cluster means:
      Murder    Assault   UrbanPop        Rape
1  1.4118898  0.8743346 -0.8145211  0.01927104
2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
3 -0.4894375 -0.3826001  0.5758298 -0.26165379
4  0.6950701  1.0394414  0.7226370  1.27693964

Clustering vector:
       Alabama         Alaska        Arizona       Arkansas     California 
             1              4              4              1              4 
      Colorado    Connecticut       Delaware        Florida        Georgia 
             4              3              3              4              1 
        Hawaii          Idaho       Illinois        Indiana           Iowa 
             3              2              4              3              2 
        Kansas       Kentucky      Louisiana          Maine       Maryland 
             3              2              1              2              4 
 Massachusetts       Michigan      Minnesota    Mississippi       Missouri 
             3              4              2              1              4 
       Montana       Nebraska         Nevada  New Hampshire     New Jersey 
             2              2              4              2              3 
    New Mexico       New York North Carolina   North Dakota           Ohio 
             4              4              1              2              3 
      Oklahoma         Oregon   Pennsylvania   Rhode Island South Carolina 
             3              3              3              3              1 
  South Dakota      Tennessee          Texas           Utah        Vermont 
             2              1              4              3              2 
      Virginia     Washington  West Virginia      Wisconsin        Wyoming 
             3              3              2              2              3 

Within cluster sum of squares by cluster:
[1]  8.316061 11.952463 16.212213 19.922437
 (between_SS / total_SS =  71.2 %)

Available components:

 [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
 [6] "betweenss"    "size"         "iter"         "ifault"       "clust_plot"  
[11] "silinfo"      "nbclust"      "data"

Cluster Analysis

JORIELYN S. MANLA

2024-05-18

Cluster Analysis in R Simplified and Enhanced

Required packages

Data Preparation

Distance matrix computation and visualization

Enhanced clustering analysis