# K-means clustering applied to the iris dataset.
# Load the data
# NOTE: the former `rm(list = ls())` was dropped on purpose: a script should
# not wipe the workspace of whoever runs or sources it.
library(tidyverse)
library(magrittr)

# Fix the RNG seed. hopkins() draws a random sample and kmeans() uses random
# starting centres, so without a seed the results change from run to run.
# The outputs recorded in the `##` lines of this file come from an earlier,
# unseeded run and may therefore differ slightly.
set.seed(42)

# Interactive preview of the raw data. datatable() lives in the DT package,
# which is not attached anywhere in the visible part of this script, so it
# is called through its namespace.
DT::datatable(iris)

# Keep only the four numeric measurements; the species label is held back
# and used later to judge the clustering.
df <- iris %>% select(-Species)
library(clustertend)
# Hopkins statistic of the measurements, sampling all rows but one.
# NOTE(review): in clustertend, H well below 0.5 is presumably the sign of
# clusterable data -- confirm against the package documentation.
df %>% hopkins(n = nrow(.) - 1)
## $H
## [1] 0.1823657
# Largest number of clusters to try: sqrt(n / 2), rounded to the nearest
# integer, where n is the number of observations.
k.max <- df %>%
  nrow() %>%
  divide_by(2) %>%
  sqrt() %>%
  round()
k.max
## [1] 9
# Mean-normalise every column: subtract the column mean and divide by the
# column range (max - min), so all variables are on a comparable scale
# before distances are computed.
# vapply() is used instead of sapply() so the result is guaranteed to be a
# numeric matrix with one row per observation and one column per variable;
# sapply() silently changes its return type on unexpected input.
data <- vapply(
  df,
  function(x) (x - mean(x)) / (max(x) - min(x)),
  FUN.VALUE = numeric(nrow(df))
)
head(data)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,] -0.2064815 0.18444444 -0.3996610 -0.4163889
## [2,] -0.2620370 -0.02388889 -0.3996610 -0.4163889
## [3,] -0.3175926 0.05944444 -0.4166102 -0.4163889
## [4,] -0.3453704 0.01777778 -0.3827119 -0.4163889
## [5,] -0.2342593 0.22611111 -0.3996610 -0.4163889
## [6,] -0.1231481 0.35111111 -0.3488136 -0.3330556
# Total within-cluster sum of squares for k = 1, ..., k.max (input to the
# elbow plot). Each k is fitted with 10 random starts.
# seq_len() avoids the 1:0 trap of `1:k.max`, and vapply() pins the result
# to exactly one number per k.
wss <- vapply(
  seq_len(k.max),
  function(k) kmeans(data, centers = k, nstart = 10)$tot.withinss,
  FUN.VALUE = numeric(1)
)
wss
## [1] 41.166110 12.127791 6.982216 5.516933 4.580323 3.923095 3.506278
## [8] 3.147428 2.814362
# The elbow of the within-cluster sum of squares curve suggests k = 3
# Elbow plot: total within-cluster sum of squares against the number of
# clusters, with the k = 3 point drawn larger and in red.
theme_set(theme_minimal())
k <- data.frame(k = seq_along(wss), wss = wss)
ggplot(k, aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  geom_point(data = filter(k, k == 3), color = "red", size = 4) +
  scale_x_continuous(breaks = seq(1, k.max, by = 1))
# Final model: k-means on the normalised data with 3 centres and 20 random
# starts, then print the fitted object.
km.res3 <- kmeans(x = data, centers = 3, nstart = 20)
km.res3
## K-means clustering with 3 clusters of sizes 50, 39, 61
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 -0.23259259 0.15444444 -0.3891525 -0.39722222
## 2 0.27856125 0.01029915 0.3295871 0.36673077
## 3 0.01255313 -0.13317851 0.1082578 0.09112477
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [71] 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 2 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 3 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3
##
## Within cluster sum of squares by cluster:
## [1] 1.829062 2.073324 3.079830
## (between_SS / total_SS = 83.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Cross-tabulate the k-means clusters against the true species.
# k-means numbers its clusters arbitrarily and the numbering can change from
# run to run, so a hard-coded cluster -> species mapping mislabels the rows
# (in the run recorded above, cluster 1 holds all 50 setosa flowers).
# Instead, each cluster is labelled with the species that occurs most often
# among its members. If two clusters share the same majority species they
# are merged into one row of the table.
species <- as.character(iris$Species)
cluster_species <- vapply(
  split(species, km.res3$cluster),
  function(members) names(which.max(table(members))),
  FUN.VALUE = character(1)
)
table(
  cluster_species[as.character(km.res3$cluster)],
  species,
  dnn = c("predicted", "actual")
)
## actual
## predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 14
## virginica 0 3 36