The term cluster validation refers to the procedure of evaluating the goodness (quality) of clustering algorithm results.
Internal Validation. Loading Packages
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.2
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(fpc)
## Warning: package 'fpc' was built under R version 3.6.2
library(NbClust)
Data Preparation
# Keep every column of iris except the "Species" label (the 5th column)
df <- iris[, names(iris) != "Species"]
Standardise
# Center each column and divide it by its standard deviation
# (these are the defaults of scale(), written out explicitly)
df <- scale(df, center = TRUE, scale = TRUE)
Creating K Means cluster
# Partition the standardized data into 3 groups with k-means,
# trying 25 random starts and suppressing the automatic plot
km.res <- eclust(
  x = df,
  FUNcluster = "kmeans",
  k = 3,
  nstart = 25,
  graph = FALSE
)
Creating Hierarchical cluster
# Agglomerative hierarchical clustering cut into 3 groups:
# Euclidean distances, Ward (ward.D2) linkage, no automatic plot
hc.res <- eclust(
  x = df,
  FUNcluster = "hclust",
  k = 3,
  hc_metric = "euclidean",
  hc_method = "ward.D2",
  graph = FALSE
)
Silhouette plot
# Draw the silhouette plot for the k-means result ("jco" palette, classic theme)
fviz_silhouette(
  sil.obj = km.res,
  palette = "jco",
  ggtheme = ggplot2::theme_classic()
)
## cluster size ave.sil.width
## 1 1 50 0.64
## 2 2 53 0.39
## 3 3 47 0.35
The silhouette width (sil_width) should be close to 1 for a good fit; a value close to -1 indicates a poor fit, i.e. the observation is probably placed in the wrong cluster. Silhouette Information
# Pull the silhouette summary out of the k-means result and list its components
silinfo <- km.res[["silinfo"]]
names(silinfo)
## [1] "widths" "clus.avg.widths" "avg.width"
# First 10 rows of the per-observation silhouette table
# (assigned cluster, neighbor cluster, silhouette width)
head(silinfo[["widths"]][, c("cluster", "neighbor", "sil_width")], n = 10)
## cluster neighbor sil_width
## 1 1 2 0.7341949
## 41 1 2 0.7333345
## 8 1 2 0.7308169
## 18 1 2 0.7287522
## 5 1 2 0.7284741
## 40 1 2 0.7247047
## 38 1 2 0.7244191
## 12 1 2 0.7217939
## 28 1 2 0.7215103
## 29 1 2 0.7145192
# Mean silhouette width within each cluster
silinfo[["clus.avg.widths"]]
## [1] 0.6363162 0.3933772 0.3473922
# Overall average: the mean of all individual silhouette widths
silinfo[["avg.width"]]
## [1] 0.4599482
# Number of observations in each cluster
km.res[["size"]]
## [1] 50 53 47
Observations with a negative silhouette coefficient are probably not in the right cluster; we can find the cluster they are closer to (the neighbor cluster).
# Per-observation silhouette table (first three columns of the widths table)
sil <- km.res[["silinfo"]][["widths"]][, seq_len(3)]
# Row positions of the observations whose silhouette width is below zero
neg_sil_index <- which(sil[, "sil_width"] < 0)
# Show those rows, keeping the table shape even if only one row matches
sil[neg_sil_index, , drop = FALSE]
## cluster neighbor sil_width
## 112 3 2 -0.01058434
## 128 3 2 -0.02489394
The Dunn index is another internal validation measure. It should be maximised.
library(fpc)
# Clustering validation statistics for the k-means partition,
# computed from the pairwise distance matrix of the standardized data
km_stats <- cluster.stats(d = dist(df), clustering = km.res[["cluster"]])
# Dunn index
km_stats[["dunn"]]
## [1] 0.02649665
External Validation: It needs an external result to validate.