Data

The iris dataset is used to demonstrate K-means clustering.

# Load the data
# tidyverse supplies the dplyr verbs and ggplot2 used below; magrittr supplies
# the %>% pipe.
library(tidyverse)
library(magrittr)
# Preview of the raw data. `datatable()` lives in the DT package, which is not
# attached here, so it is called through its namespace.
DT::datatable(iris)
# Keep only the four numeric measurements; the Species column is excluded from
# the clustering input.
df <- iris %>% select(-Species)

Evaluate the clustering tendency of the dataset (Hopkins statistic)

library(clustertend)
# Hopkins statistic of the numeric columns, sampling all rows but one.
n_sampled <- nrow(df) - 1
hopkins(df, n = n_sampled)
## $H
## [1] 0.1823657

Find the optimal K

# Largest number of clusters to try: sqrt(n / 2) rounded to the nearest
# integer, where n is the number of observations. The surrounding parentheses
# print the value as well as assigning it.
(k.max <- round(sqrt(nrow(df) / 2)))
## [1] 9

Normalisation

# Mean-normalise every column: subtract the column mean and divide by the
# column range (max - min), so each feature is centred on 0 with a range of 1.
# vapply() pins each per-column result to a numeric vector of length nrow(df),
# so the result is always a numeric matrix with one column per feature
# (sapply() may silently return a different type on unexpected input).
data <- vapply(
  df,
  function(x) (x - mean(x)) / (max(x) - min(x)),
  FUN.VALUE = numeric(nrow(df))
)
head(data)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## [1,]   -0.2064815  0.18444444   -0.3996610  -0.4163889
## [2,]   -0.2620370 -0.02388889   -0.3996610  -0.4163889
## [3,]   -0.3175926  0.05944444   -0.4166102  -0.4163889
## [4,]   -0.3453704  0.01777778   -0.3827119  -0.4163889
## [5,]   -0.2342593  0.22611111   -0.3996610  -0.4163889
## [6,]   -0.1231481  0.35111111   -0.3488136  -0.3330556

Total within-cluster sum of squares (wss)

# Total within-cluster sum of squares for every k from 1 to k.max.
# kmeans() is run with nstart = 10 random starts for each k and keeps the best
# one. seq_len() is safe even if k.max were 0 (1:0 would count backwards), and
# vapply() guarantees a plain numeric vector with exactly one value per k.
wss <- vapply(
  seq_len(k.max),
  function(k) kmeans(data, centers = k, nstart = 10)$tot.withinss,
  FUN.VALUE = numeric(1)
)

wss
## [1] 41.166110 12.127791  6.982216  5.516933  4.580323  3.923095  3.506278
## [8]  3.147428  2.814362

Plot wss against k (elbow plot)

# Elbow plot: total within-cluster sum of squares against the number of
# clusters. k = 3 is the suggested value and is highlighted with a larger red
# point.

theme_set(theme_minimal())

# One row per candidate k with its wss value.
k <- data.frame(k = 1:k.max, wss = wss)

ggplot(k, aes(x = k, y = wss)) +
  geom_line() +
  geom_point() +
  # Inside filter(), `k` refers to the column of the data frame `k`.
  geom_point(data = filter(k, k == 3), color = "red", size = 4) +
  scale_x_continuous(breaks = seq(1, k.max, by = 1))

Perform clustering with k = 3

# Fit the final model with three centres, keeping the best of 20 random
# starts, then print the fitted object.
km.res3 <- kmeans(x = data, centers = 3, nstart = 20)
print(km.res3)
## K-means clustering with 3 clusters of sizes 50, 39, 61
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1  -0.23259259  0.15444444   -0.3891525 -0.39722222
## 2   0.27856125  0.01029915    0.3295871  0.36673077
## 3   0.01255313 -0.13317851    0.1082578  0.09112477
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 2 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 3 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3
## 
## Within cluster sum of squares by cluster:
## [1] 1.829062 2.073324 3.079830
##  (between_SS / total_SS =  83.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Evaluate how well the clusters agree with the true species

# kmeans() numbers its clusters arbitrarily, so a hard-coded
# cluster-number -> species mapping can attach the wrong name to a cluster
# (in the fit printed above, cluster 1 holds the 50 setosa flowers). Instead,
# name each cluster after the species that is most frequent among its members.
actual <- as.character(iris$Species)
majority_species <- vapply(
  split(actual, km.res3$cluster),
  function(species) names(which.max(table(species))),
  FUN.VALUE = character(1)
)
# Look up each observation's cluster (by its number as a name) to get a label.
predicted <- unname(majority_species[as.character(km.res3$cluster)])

# Confusion table: rows are the cluster-derived labels, columns the true
# species.
table(predicted, actual)
##             actual
## predicted    setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47        14
##   virginica       0          3        36