Data Preparation
# Standardize USArrests: scale() centers each column and divides it by its
# standard deviation, returning a numeric matrix that keeps the state names
# as row names.
data(USArrests)
df <- scale(USArrests, center = TRUE, scale = TRUE)
# Show the first six standardized rows
head(df, n = 6L)
Murder Assault UrbanPop Rape
Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
Arizona 0.07163341 1.4788032 0.9989801 1.042878388
Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144 1.7589234 2.067820292
Colorado 0.02571456 0.3988593 0.8608085 1.864967207
Distance matrix computation and visualization
# Pairwise distances between the rows of df using the "pearson"
# (correlation-based) method
res.dist <- get_dist(x = df, method = "pearson")
# Preview the 6 x 6 top-left corner of the distance matrix, rounded to 2 digits
head(round(as.matrix(res.dist), digits = 2), n = 6L)[, seq_len(6)]
Alabama Alaska Arizona Arkansas California Colorado
Alabama 0.00 0.71 1.45 0.09 1.87 1.69
Alaska 0.71 0.00 0.83 0.37 0.81 0.52
Arizona 1.45 0.83 0.00 1.18 0.29 0.60
Arkansas 0.09 0.37 1.18 0.00 1.59 1.37
California 1.87 0.81 0.29 1.59 0.00 0.11
Colorado 1.69 0.52 0.60 1.37 0.11 0.00
# Plot the dissimilarity matrix, with label size set to 8
fviz_dist(dist.obj = res.dist, lab_size = 8)

Enhanced clustering analysis
# Standardize the USArrests columns (mean 0, unit standard deviation)
data(USArrests)
df <- scale(USArrests, center = TRUE, scale = TRUE)
# Euclidean distances between every pair of rows
res.dist <- dist(x = df, method = "euclidean")
# Agglomerative clustering with the "ward.D2" linkage
res.hc <- hclust(d = res.dist, method = "ward.D2")
# Draw the dendrogram; cex = 0.5 shrinks the leaf labels
plot(x = res.hc, cex = 0.5)

# Enhanced k-means clustering: no k is supplied in this call, and 20 random
# starts are requested
res.km <- eclust(df, FUNcluster = "kmeans", nstart = 20)

# Plot the gap statistic stored on the result
fviz_gap_stat(res.km[["gap_stat"]])

# Silhouette plot for the k-means partition
fviz_silhouette(res.km)
cluster size ave.sil.width
1 1 13 0.37
2 2 13 0.27
3 3 16 0.34
4 4 8 0.39

# Number of clusters selected via the gap statistic
res.km[["nbclust"]]
[1] 4
# Display the full k-means result
print(res.km)
K-means clustering with 4 clusters of sizes 13, 13, 16, 8
Cluster means:
Murder Assault UrbanPop Rape
1 -0.9615407 -1.1066010 -0.9301069 -0.96676331
2 0.6950701 1.0394414 0.7226370 1.27693964
3 -0.4894375 -0.3826001 0.5758298 -0.26165379
4 1.4118898 0.8743346 -0.8145211 0.01927104
Clustering vector:
Alabama Alaska Arizona Arkansas California
4 2 2 4 2
Colorado Connecticut Delaware Florida Georgia
2 3 3 2 4
Hawaii Idaho Illinois Indiana Iowa
3 1 2 3 1
Kansas Kentucky Louisiana Maine Maryland
3 1 4 1 2
Massachusetts Michigan Minnesota Mississippi Missouri
3 2 1 4 2
Montana Nebraska Nevada New Hampshire New Jersey
1 1 2 1 3
New Mexico New York North Carolina North Dakota Ohio
2 2 4 1 3
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
3 3 3 3 4
South Dakota Tennessee Texas Utah Vermont
1 4 2 3 1
Virginia Washington West Virginia Wisconsin Wyoming
3 3 1 1 3
Within cluster sum of squares by cluster:
[1] 11.952463 19.922437 16.212213 8.316061
(between_SS / total_SS = 71.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault" "clust_plot"
[11] "silinfo" "nbclust" "data" "gap_stat"
# Enhanced hierarchical clustering
# Compute hclust
res.hc <- eclust(df, FUNcluster = "hclust")
# Dendrogram, with rectangles drawn around the clusters
fviz_dend(res.hc, rect = TRUE)

# Silhouette plot
fviz_silhouette(res.hc)
cluster size ave.sil.width
1 1 19 0.26
2 2 19 0.28
3 3 12 0.43

# Scatter plot of the hierarchical clustering result
fviz_cluster(object = res.hc)

# k-means again, this time with the number of clusters fixed at 4
eclust(df, FUNcluster = "kmeans", k = 4)

K-means clustering with 4 clusters of sizes 8, 13, 16, 13
Cluster means:
Murder Assault UrbanPop Rape
1 1.4118898 0.8743346 -0.8145211 0.01927104
2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
3 -0.4894375 -0.3826001 0.5758298 -0.26165379
4 0.6950701 1.0394414 0.7226370 1.27693964
Clustering vector:
Alabama Alaska Arizona Arkansas California
1 4 4 1 4
Colorado Connecticut Delaware Florida Georgia
4 3 3 4 1
Hawaii Idaho Illinois Indiana Iowa
3 2 4 3 2
Kansas Kentucky Louisiana Maine Maryland
3 2 1 2 4
Massachusetts Michigan Minnesota Mississippi Missouri
3 4 2 1 4
Montana Nebraska Nevada New Hampshire New Jersey
2 2 4 2 3
New Mexico New York North Carolina North Dakota Ohio
4 4 1 2 3
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
3 3 3 3 1
South Dakota Tennessee Texas Utah Vermont
2 1 4 3 2
Virginia Washington West Virginia Wisconsin Wyoming
3 3 2 2 3
Within cluster sum of squares by cluster:
[1] 8.316061 11.952463 16.212213 19.922437
(between_SS / total_SS = 71.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault" "clust_plot"
[11] "silinfo" "nbclust" "data"