library(factoextra)
# Standardise every USArrests column (centre and scale), keep 4 decimals,
# then preview the first rows of the resulting matrix
data("USArrests")
df <- round(scale(USArrests), digits = 4)
head(df)
Murder Assault UrbanPop Rape
Alabama 1.2426 0.7828 -0.5209 -0.0034
Alaska 0.5079 1.1068 -1.2118 2.4842
Arizona 0.0716 1.4788 0.9990 1.0429
Arkansas 0.2323 0.2309 -1.0736 -0.1849
California 0.2783 1.2628 1.7589 2.0678
Colorado 0.0257 0.3989 0.8608 1.8650
library(factoextra)
# Pearson correlation-based distance between the rows (states) of df
res.dist <- get_dist(df, method = "pearson")
# Show the 6 x 6 top-left corner of the distance matrix, rounded to 2 decimals
round(as.matrix(res.dist)[1:6, 1:6], 2)
Alabama Alaska Arizona Arkansas California Colorado
Alabama 0.00 0.71 1.45 0.09 1.87 1.69
Alaska 0.71 0.00 0.83 0.37 0.81 0.52
Arizona 1.45 0.83 0.00 1.18 0.29 0.60
Arkansas 0.09 0.37 1.18 0.00 1.59 1.37
California 1.87 0.81 0.29 1.59 0.00 0.11
Colorado 1.69 0.52 0.60 1.37 0.11 0.00
# Plot the dissimilarity matrix, with the label size set to 8
fviz_dist(dist.obj = res.dist, lab_size = 8)

# Reload the data and standardise each column (centre and scale)
data("USArrests")
df <- scale(USArrests, center = TRUE, scale = TRUE)

# Euclidean dissimilarities between states, then agglomerative clustering
# with the "ward.D2" linkage
res.dist <- dist(df, method = "euclidean")
res.hc <- hclust(res.dist, method = "ward.D2")

# Draw the dendrogram; cex = 0.5 shrinks the leaf labels
plot(res.hc, cex = 0.5)

library(factoextra)
library(cluster)
# Enhanced k-means: k is left as NULL rather than fixed by hand, and
# nstart = 25 is forwarded along with the call
res.km <- eclust(df, FUNcluster = "kmeans", k = NULL, nstart = 25)

# Plot the gap statistic stored on the result
fviz_gap_stat(res.km[["gap_stat"]])

# Silhouette plot for the fitted clusters (also prints a per-cluster summary)
fviz_silhouette(res.km)
cluster size ave.sil.width
1 1 13 0.37
2 2 20 0.26
3 3 17 0.32

# Number of clusters selected via the gap statistic
res.km[["nbclust"]]
[1] 3
# Display the full k-means result (sizes, centres, assignments, components)
print(res.km)
K-means clustering with 3 clusters of sizes 13, 20, 17
Cluster means:
Murder Assault UrbanPop Rape
1 -0.9615407 -1.1066010 -0.9301069 -0.9667633
2 1.0049340 1.0138274 0.1975853 0.8469650
3 -0.4469795 -0.3465138 0.4788049 -0.2571398
Clustering vector:
Alabama Alaska Arizona Arkansas California
2 2 2 3 2
Colorado Connecticut Delaware Florida Georgia
2 3 3 2 2
Hawaii Idaho Illinois Indiana Iowa
3 1 2 3 1
Kansas Kentucky Louisiana Maine Maryland
3 1 2 1 2
Massachusetts Michigan Minnesota Mississippi Missouri
3 2 1 2 2
Montana Nebraska Nevada New Hampshire New Jersey
1 1 2 1 3
New Mexico New York North Carolina North Dakota Ohio
2 2 2 1 3
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
3 3 3 3 2
South Dakota Tennessee Texas Utah Vermont
1 2 2 3 1
Virginia Washington West Virginia Wisconsin Wyoming
3 3 1 1 3
Within cluster sum of squares by cluster:
[1] 11.95246 46.74796 19.62285
(between_SS / total_SS = 60.0 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault" "clust_plot"
[11] "silinfo" "nbclust" "data" "gap_stat"
# Enhanced hierarchical clustering
# Compute the hclust-based result
res.hc <- eclust(df, FUNcluster = "hclust")
# Dendrogram with rectangles drawn around the clusters
fviz_dend(res.hc, rect = TRUE)

# Silhouette plot
fviz_silhouette(res.hc)
cluster size ave.sil.width
1 1 19 0.26
2 2 19 0.28
3 3 12 0.43

# Scatter plot of the hierarchical clustering result
fviz_cluster(res.hc)

# Enhanced k-means with the number of clusters fixed at 4; the result is
# not assigned, so it is printed directly
eclust(df, FUNcluster = "kmeans", k = 4)

K-means clustering with 4 clusters of sizes 8, 13, 16, 13
Cluster means:
Murder Assault UrbanPop Rape
1 1.4118898 0.8743346 -0.8145211 0.01927104
2 -0.9615407 -1.1066010 -0.9301069 -0.96676331
3 -0.4894375 -0.3826001 0.5758298 -0.26165379
4 0.6950701 1.0394414 0.7226370 1.27693964
Clustering vector:
Alabama Alaska Arizona Arkansas California
1 4 4 1 4
Colorado Connecticut Delaware Florida Georgia
4 3 3 4 1
Hawaii Idaho Illinois Indiana Iowa
3 2 4 3 2
Kansas Kentucky Louisiana Maine Maryland
3 2 1 2 4
Massachusetts Michigan Minnesota Mississippi Missouri
3 4 2 1 4
Montana Nebraska Nevada New Hampshire New Jersey
2 2 4 2 3
New Mexico New York North Carolina North Dakota Ohio
4 4 1 2 3
Oklahoma Oregon Pennsylvania Rhode Island South Carolina
3 3 3 3 1
South Dakota Tennessee Texas Utah Vermont
2 1 4 3 2
Virginia Washington West Virginia Wisconsin Wyoming
3 3 2 2 3
Within cluster sum of squares by cluster:
[1] 8.316061 11.952463 16.212213 19.922437
(between_SS / total_SS = 71.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss"
[6] "betweenss" "size" "iter" "ifault" "clust_plot"
[11] "silinfo" "nbclust" "data"