Four Clustering Techniques on Irises.

k-means

iris2 <- iris
iris2$Species <- NULL
head(iris2)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1          5.1         3.5          1.4         0.2
## 2          4.9         3.0          1.4         0.2
## 3          4.7         3.2          1.3         0.2
## 4          4.6         3.1          1.5         0.2
## 5          5.0         3.6          1.4         0.2
## 6          5.4         3.9          1.7         0.4
(kmeans.result <- kmeans(iris2, 3)) # random starts: results vary between runs unless you call set.seed() first
## K-means clustering with 3 clusters of sizes 62, 50, 38
## 
## Cluster means:
##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.901613    2.748387     4.393548    1.433871
## 2     5.006000    3.428000     1.462000    0.246000
## 3     6.850000    3.073684     5.742105    2.071053
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3
## [106] 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3
## [141] 3 3 1 3 3 3 1 3 3 1
## 
## Within cluster sum of squares by cluster:
## [1] 39.82097 15.15100 23.87947
##  (between_SS / total_SS =  88.4 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
table(iris$Species, kmeans.result$cluster)
##             
##               1  2  3
##   setosa      0 50  0
##   versicolor 48  0  2
##   virginica  14  0 36
plot(iris2[c("Sepal.Length", "Sepal.Width")], col = kmeans.result$cluster)
points(kmeans.result$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex = 2)
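
kmeans() needs k up front, and there is no single right way to pick it. A minimal sketch of the elbow heuristic, assuming the same iris2 data: compute the total within-cluster sum of squares for a range of k and look for the bend in the curve.

# Hedged sketch: total within-cluster SS for k = 1..8 (elbow heuristic)
set.seed(123) # k-means uses random starts
wss <- sapply(1:8, function(k) kmeans(iris2, centers = k, nstart = 20)$tot.withinss)
plot(1:8, wss, type = "b", xlab = "number of clusters k", ylab = "total within-cluster SS")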

k-medoids (the medoid is the object closest to the centre of its cluster; because medoids are actual observations, k-medoids is more robust to noise and outliers than k-means; a small robustness check follows the PAM example below)

Algorithms for k-medoids: PAM (better for small data) and CLARA (better for large data; see the sketch after the PAM examples below)

version 1

#install.packages("fpc")
library(fpc)
pamk.result <- pamk(iris2)
pamk.result$nc # number of clusters, estimated by pamk (by default via optimum average silhouette width)
## [1] 2
table(pamk.result$pamobject$clustering, iris$Species)
##    
##     setosa versicolor virginica
##   1     50          1         0
##   2      0         49        50
layout(matrix(c(1,2),1,2))
plot(pamk.result$pamobject) 

layout(matrix(1))

In the clusplot (left), the lines show the distances between clusters; the silhouette plot (right) reads as follows: s_i close to 1 means the observation is well clustered, s_i near 0 means it lies between two clusters, and negative s_i suggests it was probably placed in the wrong cluster.
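
The average silhouette width is also stored on the pam object, so you can read it off numerically; a quick sketch using the pamk result from above (silinfo is populated whenever 1 < k < n):

pamk.result$pamobject$silinfo$avg.width        # overall average silhouette width
pamk.result$pamobject$silinfo$clus.avg.widths  # per-cluster averages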

library(cluster)
pam.result <- pam(iris2, 3)
table(pam.result$clustering, iris$Species)
##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         48        14
##   3      0          2        36
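
To see the robustness claim from above in action, here is a small hedged sketch: append one extreme, made-up observation (50, 50, 50, 50) and compare what the two methods fit. The exact k-means centres depend on the random start, but one of them tends to be pulled toward (or land on) the outlier, while PAM's medoids stay on real observations.

# Hedged sketch: one extreme outlier, k-means centres vs PAM medoids
iris.out <- rbind(iris2, c(50, 50, 50, 50)) # hypothetical extreme outlier
set.seed(42)
kmeans(iris.out, 3)$centers # a centre is pulled toward (or onto) the outlier
pam(iris.out, 3)$medoids    # medoids remain actual, typical observations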

version 2

layout(matrix(c(1,2),1,2))
plot(pam.result)

layout(matrix(1))
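
CLARA, mentioned above as the option for large data, runs PAM on repeated subsamples and keeps the best set of medoids. A minimal sketch on the same data (iris is tiny, so this is purely illustrative; samples = 50 is an arbitrary choice):

# CLARA: PAM on repeated subsamples, suited to large data sets
clara.result <- clara(iris2, 3, samples = 50)
table(clara.result$clustering, iris$Species)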

Hierarchical clustering

Average linkage

idx <- sample(1:nrow(iris), 40) # 40 irises for a clearer picture
irisSample <- iris[idx,]
irisSample$Species <- NULL
hc <- hclust(dist(irisSample), method = "average")
plot(hc, hang = -1, labels = iris$Species[idx])
rect.hclust(hc, k = 3)

groups <- cutree(hc, k = 3)
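
As with k-means above, the cut assignments can be cross-tabulated against the true species of the sampled irises:

# Compare the three cut clusters with the species of the sampled irises
table(groups, iris$Species[idx])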

Density-based clustering (DBSCAN) for numeric data (connects objects that lie in densely populated areas):

eps = radius of the neighbourhood;

MinPts = minimum number of points;

a point is dense (a core point) if its eps-neighbourhood contains at least MinPts points;

Advantage: DBSCAN can discover clusters of various shapes and sizes and is insensitive to noise (cf. k-means, which produces sphere-shaped clusters of similar size).
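
Choosing eps is the tricky part. A common heuristic (a sketch, not part of the original example) is to sort each point's distance to its MinPts-th nearest neighbour and look for a knee in the curve:

# Hedged sketch: sorted distances to the 5th nearest neighbour (MinPts = 5)
d <- as.matrix(dist(iris[-5]))
kdist <- apply(d, 1, function(row) sort(row)[6]) # 6th smallest skips the self-distance of 0
plot(sort(kdist), type = "l", ylab = "distance to 5th nearest neighbour")
abline(h = 0.42, lty = 2) # the eps used in the dbscan() call below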

library(fpc)
iris2 <- iris[-5]
ds <- dbscan(iris2, eps = 0.42, MinPts = 5)
table(ds$cluster, iris$Species)
##    
##     setosa versicolor virginica
##   0      2         10        17
##   1     48          0         0
##   2      0         37         0
##   3      0          3        33

Cluster 0 (the first row) holds noise points or outliers (in ML terms: data = true signal + noise; outliers are still part of the data).

plot(ds, iris2)

plot(ds, iris2[c(1,4)])

plotcluster(iris2, ds$cluster)

set.seed(345)
idx <- sample(1:nrow(iris), 10)
newData <- iris[idx, -5]
newData <- newData + matrix(runif(10*4, min = 0, max = 0.2), nrow = 10, ncol = 4) # add small random noise
myPred <- predict(ds, iris2, newData)
plot(iris2[c(1,4)], col = 1 + ds$cluster)
points(newData[c(1,4)], pch = "*", col = 1 + myPred, cex = 3)

But there is more! A fitted clustering model can label new data; compare the predicted clusters for newData with the true species:

table(myPred, iris$Species[idx])
##       
## myPred setosa versicolor virginica
##      0      0          1         1
##      1      3          0         0
##      2      0          4         0
##      3      0          0         1

Source: Yanchang Zhao, R and Data Mining: Examples and Case Studies, 2012.

Visualization of Cluster Analysis

Dendrograms among us

library(knitr)
knitr::include_graphics('https://img1.etsystatic.com/184/0/15772964/il_570xN.1316025687_7a55.jpg')

library(ggplot2)
#install.packages("dendextend")
library(dendextend)
#install.packages("ggdendro")
library(ggdendro)
library(magrittr)
dend <- USArrests %>% dist %>% hclust %>% as.dendrogram 
plot(dend)

ggdendrogram(hc, rotate = TRUE, labels = TRUE, theme_dendro = TRUE) # note: hc is still the iris-sample tree fitted above

dd <- dist(scale(USArrests), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
plot(hc, hang = -1, cex = 0.6)

hang: The fraction of the plot height by which labels should hang below the rest of the plot. A negative value will cause the labels to hang down from 0.
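
A quick side-by-side sketch of the effect (the value 0.2 is arbitrary):

# hang > 0: labels hang below their branches; hang < 0: all labels align at height 0
layout(matrix(c(1, 2), 1, 2))
plot(hc, hang = 0.2, cex = 0.6, main = "hang = 0.2")
plot(hc, hang = -1, cex = 0.6, main = "hang = -1")
layout(matrix(1))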

Convert hclust into a dendrogram and plot

hcd <- as.dendrogram(hc)
# Default plot
plot(hcd, type = "rectangle", ylab = "Height")

Zoom in to the first dendrogram

plot(hcd, xlim = c(1, 20), ylim = c(1,8))

More colours and customisation

dend <- iris[1:30,-5] %>% scale %>% dist %>% 
   hclust %>% as.dendrogram %>%
   set("branches_k_color", k=3) %>% set("branches_lwd", 1.5) %>%
   set("labels_colors") %>% set("labels_cex", c(.8, 1, 1.2)) %>% 
   set("leaves_pch", 19) %>% set("leaves_col", c("purple", "pink"))
plot(dend)

ggd1 <- as.ggdend(dend) #very easy, if you already have a dendrogram
ggplot(ggd1)
## Warning: Removed 29 rows containing missing values (geom_point).

ggd1 <- as.ggdend(dend)
ggplot(ggd1, horiz = TRUE, theme = theme_minimal())
## Warning: Removed 29 rows containing missing values (geom_point).

ggplot(ggd1, labels = FALSE) + 
  scale_y_reverse(expand = c(0.2, 0)) +
  coord_polar(theta="x")
## Warning: Removed 29 rows containing missing values (geom_point).

plot(hcd, type = "triangle", ylab = "Height", horiz = FALSE)

The above dendrogram can be customized using the arguments:

nodePar: a list of plotting parameters to use for the nodes (Default is NULL).

edgePar: list of plotting parameters to use for the edge segments (col, lty and lwd).

leaflab: a string specifying how leaves are labeled (the default “perpendicular” writes text vertically; “textlike” writes it horizontally; “none” suppresses leaf labels).
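
The “textlike” option is the one not shown below; a one-liner sketch:

plot(hcd, leaflab = "textlike") # horizontal leaf labels in small boxes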

# Define nodePar
nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), 
                cex = 0.7, col = "salmon")
# Customized plot; remove labels, make nodes salmon pink
plot(hcd, ylab = "Height", nodePar = nodePar, leaflab = "none")

Horizontal plot

plot(hcd,  xlab = "Height",
     nodePar = nodePar, horiz = TRUE)

Change edge color

plot(hcd,  xlab = "Height", nodePar = nodePar, 
     edgePar = list(col = 5:6, lwd = c(2, 1.5)))

Dendrogram Extensions

Toy example (extra)

dend <- USArrests[1:7,] %>%  scale %>% 
        dist %>% hclust %>% as.dendrogram
dend %>% plot

# Change the labels, and then plot:
dend %>% set("labels", c("a", "b", "c", "d", "e", "f", "z")) %>% plot

# Change color and size for labels
dend %>% set("labels_col", c("red", "yellow", "green")) %>% # change color
  set("labels_cex", .66) %>% # Change size
  plot(main = "Change the color and size") # plot

# Color labels by specifying the number of cluster (k)
dend %>% set("labels_col", value = c("purple", "pink"), k=2) %>% 
          plot(main = "Color labels \n by cluster")
abline(h = 2, lty = 2)

# Change the type, the color and the size of node points
dend %>% set("nodes_pch", 19) %>%  # node point type
  set("nodes_cex", 1.5) %>%  # node point size
  set("nodes_col", "steelblue") %>% # node point color
  set("labels_cex", .7) %>% 
  plot(main = "Node points", horiz = F)

# Change the type, the color and the size of leaf points
dend %>% set("leaves_pch", 16) %>%  # leaf point type
  set("leaves_cex", 1.2) %>%  # leaf point size
  set("leaves_col", "blue") %>% # leaf point color
  set("labels_cex", .7) %>%
  plot(main = "Leaf points")

# Specify different point types and colors for each leaf
dend %>% set("leaves_pch", c(16, 19, 17)) %>%  # leaf point type
  set("leaves_cex", 1.1) %>%  # leaf point size
  set("leaves_col", c("purple", "magenta", "green")) %>% # leaf point color
  set("labels_cex", .7) %>%
  plot(main = "Leaf points")

dend %>% set("branches_k_color", k = 2) %>% 
  set("labels_cex", .7) %>%
  plot(main = "Default colors")

Customized colors

dend %>% set("branches_k_color", 
             value = c("red", "blue"), k = 2) %>% 
   plot(main = "Customized colors")

Vertical plot

dend %>% set("branches_k_color", k = 3) %>% plot
dend %>% rect.dendrogram(k=3, border = 8, lty = 5, lwd = 2)

Horizontal plot

dend %>% set("branches_k_color", k = 3) %>% plot(horiz = TRUE)
dend %>% rect.dendrogram(k = 3, horiz = TRUE, border = 8, lty = 5, lwd = 2)

Adapted from: http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning