Load a sample data set
library(cluster)
library(cluster.datasets)
# Load the birth.death.rates.1966 data set and keep a working copy
# under a shorter name.
data("birth.death.rates.1966")
birth.death <- birth.death.rates.1966
# Preview the first rows.
head(birth.death)
## country birth death
## 1 Algeria 36.4 14.6
## 2 Congo 37.3 8.0
## 3 Egypt 42.1 15.3
## 4 Ghana 55.8 25.6
## 5 Ivory Coast 56.1 33.1
## 6 Malagasy 41.8 15.8
Remove the labels
# Drop the first column (the country label) so that only the numeric
# birth and death columns remain for clustering.
bd <- birth.death[-1]
head(bd)
## birth death
## 1 36.4 14.6
## 2 37.3 8.0
## 3 42.1 15.3
## 4 55.8 25.6
## 5 56.1 33.1
## 6 41.8 15.8
Select number of clusters
- Total within-cluster sum of squares for a single cluster (centers = 1)
# Start the SSE vector with the total within-cluster sum of squares of
# the single-cluster solution.
wss <- kmeans(bd, centers = 1)[["tot.withinss"]]
- Compute the SSE for 2 to 15 clusters
# Fill in the SSE for k = 2..15 clusters.
# k-means starts from randomly chosen centres, so fix the seed to make
# the curve reproducible and use several restarts (nstart) so that a
# poor local optimum does not distort the elbow plot.
set.seed(123)
wss[2:15] <- vapply(
  2:15,
  function(k) kmeans(bd, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
- Plot SSE vs. Number of Clusters
library(ggvis)
# Pair each cluster count with its SSE so the elbow curve can be plotted.
sse <- data.frame(Clusters = 1:15, SSE = wss)

# Draw SSE against the number of clusters as blue points joined by a line.
sse %>%
  ggvis(~Clusters, ~SSE) %>%
  layer_points(fill := "blue") %>%
  layer_lines() %>%
  set_options(height = 300, width = 400)
Create Clusters
# Fit the final 6-cluster model.
# Seed the RNG and use multiple random starts so the clustering is
# reproducible and is not a poor local optimum.
# NOTE: cluster numbering and membership depend on the random starts, so
# the printed result may differ from output captured in an unseeded run.
set.seed(123)
clusters <- kmeans(bd, centers = 6, nstart = 25)
clusters
## K-means clustering with 6 clusters of sizes 15, 14, 13, 8, 7, 13
##
## Cluster means:
## birth death
## 1 37.84667 8.813333
## 2 16.87857 8.471429
## 3 21.78462 8.300000
## 4 27.17500 7.262500
## 5 17.98571 13.471429
## 6 46.60000 16.376923
##
## Clustering vector:
## [1] 1 1 6 6 6 6 6 1 6 1 1 1 4 3 4 3 4 2 1 2 1 1 4 4 1 3 4 6 1 6 6 6 1 1 3
## [36] 3 2 6 1 6 6 4 3 1 5 5 5 2 2 5 2 5 5 5 2 2 3 2 3 2 2 3 2 3 2 2 3 3 3 4
##
## Within cluster sum of squares by cluster:
## [1] 320.13467 71.21214 63.65692 65.63375 52.38286 838.00308
## (between_SS / total_SS = 87.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
Assign Labels
# Attach each row's cluster id to the labelled data frame and preview it.
birth.death[["Cluster"]] <- clusters[["cluster"]]
head(birth.death)
## country birth death Cluster
## 1 Algeria 36.4 14.6 1
## 2 Congo 37.3 8.0 1
## 3 Egypt 42.1 15.3 6
## 4 Ghana 55.8 25.6 6
## 5 Ivory Coast 56.1 33.1 6
## 6 Malagasy 41.8 15.8 6
Plot Clusters
# Plot the fitted clusters with cluster::clusplot.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned; labels = 0 and lines = 0 suppress point labels and
# the lines between clusters.
clusplot(
  bd,
  clusters$cluster,
  color = TRUE,
  shade = FALSE,
  labels = 0,
  lines = 0,
  main = "k-Means Cluster Analysis"
)

Summary
- Our findings support the popular adage that “birds of a feather flock together.”
- We used machine learning methods to cluster countries based on their birth and death rates.
- These same methods can be applied to other contexts with similar results.
- k-means is a very mature machine learning method, so there are myriad variants of the algorithm, as well as many alternatives that bring their own biases and heuristics to the task.
- Based on what you have learned here, you will be able to understand and apply other clustering methods to new problems.