Load a sample data set

library(cluster)
library(cluster.datasets)
# Load the 1966 birth/death-rate data set and keep a working copy under a
# shorter name; the first rows are shown as a sanity check.
data("birth.death.rates.1966")
birth.death <- birth.death.rates.1966
head(birth.death, n = 6L)
##       country birth death
## 1     Algeria  36.4  14.6
## 2       Congo  37.3   8.0
## 3       Egypt  42.1  15.3
## 4       Ghana  55.8  25.6
## 5 Ivory Coast  56.1  33.1
## 6    Malagasy  41.8  15.8

Remove the country labels (keep only the numeric columns)

# Keep only the numeric columns for clustering. Selecting them by name (rather
# than dropping column 1 by position) still gives the right input if
# birth.death has gained extra columns, e.g. when this step is re-run after a
# Cluster column has been added to it.
bd <- birth.death[, c("birth", "death")]
head(bd)
##   birth death
## 1  36.4  14.6
## 2  37.3   8.0
## 3  42.1  15.3
## 4  55.8  25.6
## 5  56.1  33.1
## 6  41.8  15.8

Select the number of clusters (elbow method)

# Elbow plot: total within-cluster sum of squares (SSE) for k = 1..max_k.
library(ggvis)

max_k <- 15L

# One k-means fit per candidate k, collected without growing a vector in a
# loop. nstart = 25 keeps the best of 25 random initialisations for each k, so
# a single unlucky start does not put a spurious bump in the curve.
wss <- vapply(
  seq_len(max_k),
  function(k) kmeans(bd, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

sse <- data.frame(Clusters = seq_len(max_k), SSE = wss)

sse %>%
  ggvis(~Clusters, ~SSE) %>%
  layer_points(fill := "blue") %>%
  layer_lines() %>%
  set_options(height = 300, width = 400)

Create Clusters

# Partition the observations into six groups. Only a cluster count is passed,
# so kmeans() chooses its own starting centres; no seed is set in this script,
# so cluster numbering and sizes may differ from run to run.
clusters <- kmeans(bd, centers = 6)
print(clusters)
## K-means clustering with 6 clusters of sizes 15, 14, 13, 8, 7, 13
## 
## Cluster means:
##      birth     death
## 1 37.84667  8.813333
## 2 16.87857  8.471429
## 3 21.78462  8.300000
## 4 27.17500  7.262500
## 5 17.98571 13.471429
## 6 46.60000 16.376923
## 
## Clustering vector:
##  [1] 1 1 6 6 6 6 6 1 6 1 1 1 4 3 4 3 4 2 1 2 1 1 4 4 1 3 4 6 1 6 6 6 1 1 3
## [36] 3 2 6 1 6 6 4 3 1 5 5 5 2 2 5 2 5 5 5 2 2 3 2 3 2 2 3 2 3 2 2 3 3 3 4
## 
## Within cluster sum of squares by cluster:
## [1] 320.13467  71.21214  63.65692  65.63375  52.38286 838.00308
##  (between_SS / total_SS =  87.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

Assign Labels

# Attach each row's cluster assignment to the labelled data frame.
birth.death[["Cluster"]] <- clusters[["cluster"]]
head(birth.death, n = 6L)
##       country birth death Cluster
## 1     Algeria  36.4  14.6       1
## 2       Congo  37.3   8.0       1
## 3       Egypt  42.1  15.3       6
## 4       Ghana  55.8  25.6       6
## 5 Ivory Coast  56.1  33.1       6
## 6    Malagasy  41.8  15.8       6

Plot Clusters

# Plot the clustered observations, coloured by cluster, without shading,
# point labels or connecting lines. TRUE/FALSE are spelled out because T and F
# are ordinary variables that can be reassigned.
clusplot(
  bd,
  clusters$cluster,
  color = TRUE,
  shade = FALSE,
  labels = 0,
  lines = 0,
  main = "k-Means Cluster Analysis"
)

Summary