iris2 <- iris
iris2$Species <- NULL # drop the class label; k-means works on numeric columns only
head(iris2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
(kmeans.result <- kmeans(iris2, 3)) # results depend on random starts; use set.seed() for reproducibility
## K-means clustering with 3 clusters of sizes 62, 50, 38
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.901613 2.748387 4.393548 1.433871
## 2 5.006000 3.428000 1.462000 0.246000
## 3 6.850000 3.073684 5.742105 2.071053
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 1 3 3 3
## [106] 3 1 3 3 3 3 3 3 1 1 3 3 3 3 1 3 1 3 1 3 3 1 1 3 3 3 3 3 1 3 3 3 3 1 3
## [141] 3 3 1 3 3 3 1 3 3 1
##
## Within cluster sum of squares by cluster:
## [1] 39.82097 15.15100 23.87947
## (between_SS / total_SS = 88.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
table(iris$Species, kmeans.result$cluster)
##
## 1 2 3
## setosa 0 50 0
## versicolor 48 0 2
## virginica 14 0 36
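A single number can summarize how well the partition matches the species. A minimal sketch (assuming the fpc package, which is loaded later for pamk() anyway) using cluster.stats() to compute the corrected (adjusted) Rand index, where 1 means perfect agreement:
library(fpc)
cluster.stats(dist(iris2), kmeans.result$cluster,
              as.integer(iris$Species))$corrected.rand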
plot(iris2[c("Sepal.Length", "Sepal.Width")], col = kmeans.result$cluster)
points(kmeans.result$centers[,c("Sepal.Length", "Sepal.Width")], col = 1:3, pch = 8, cex = 2)
Algorithms for k-medoids: PAM (better suited to small data sets) and CLARA (better suited to large data sets; see the clara() sketch after the PAM examples).
Version 1: pamk() from the fpc package, which also estimates the number of clusters.
#install.packages("fpc")
library(fpc)
pamk.result <- pamk(iris2)
pamk.result$nc # estimated number of clusters
## [1] 2
table(pamk.result$pamobject$clustering, iris$Species)
##
## setosa versicolor virginica
## 1 50 1 0
## 2 0 49 50
layout(matrix(c(1,2),1,2))
plot(pamk.result$pamobject)
layout(matrix(1))
The lines in the left panel show the distances between clusters. In the silhouette plot, each observation i has a width s(i): values near 1 indicate a well-clustered observation, values near 0 indicate an observation lying between two clusters, and negative values suggest it may be in the wrong cluster.
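To see why pamk() settled on two clusters, here is a minimal sketch (assuming the cluster package) that computes the average silhouette width for several values of k; pamk() picks the k that maximizes it:
library(cluster)
d <- dist(iris2)
avg.width <- sapply(2:6, function(k) pam(d, k)$silinfo$avg.width)
names(avg.width) <- 2:6
avg.width # the largest value marks the preferred k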
Version 2: pam() from the cluster package, with the number of clusters specified manually.
library(cluster)
pam.result <- pam(iris2, 3)
table(pam.result$clustering, iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 14
## 3 0 2 36
layout(matrix(c(1,2),1,2))
plot(pam.result)
layout(matrix(1))
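For truly large data, CLARA applies PAM to repeated subsamples instead of the full dissimilarity matrix. A minimal sketch with clara() from the cluster package (the samples value is illustrative):
library(cluster)
clara.result <- clara(iris2, 3, samples = 50)
table(clara.result$clustering, iris$Species)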
Hierarchical clustering with average linkage
idx <- sample(1:nrow(iris), 40) # subsample 40 irises for a readable dendrogram
irisSample <- iris[idx,]
irisSample$Species <- NULL
hc <- hclust(dist(irisSample), method = "average")
plot(hc, hang = -1, labels = iris$Species[idx])
rect.hclust(hc, k = 3)
groups <- cutree(hc, k = 3)
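The cluster memberships from cutree() can be checked against the species of the sampled irises:
table(groups, iris$Species[idx]) # rows = tree clusters, columns = true species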
eps = radius of the neighbourhood;
MinPts = minimum number of points;
a point is a dense (core) point if its eps-neighbourhood contains at least MinPts points;
Advantage: DBSCAN can discover clusters of various shapes and sizes and is insensitive to noise (cf. k-means, which finds sphere-shaped clusters of similar size).
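A common heuristic for choosing eps is to plot the sorted distances to each point's (MinPts - 1)-th nearest neighbour and look for an "elbow". A minimal sketch in base R (the elbow should sit roughly at the eps = 0.42 used below):
d <- as.matrix(dist(iris[-5]))
knn4 <- apply(d, 1, function(row) sort(row)[5]) # 5th smallest includes the point itself, so this is the 4th-NN distance
plot(sort(knn4), type = "l", ylab = "4-NN distance")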
library(fpc)
iris2 <- iris[-5]
ds <- dbscan(iris2, eps = 0.42, MinPts = 5)
table(ds$cluster, iris$Species)
##
## setosa versicolor virginica
## 0 2 10 17
## 1 48 0 0
## 2 0 37 0
## 3 0 3 33
Cluster 0 (the first row) stands for noise points, i.e. outliers (in ML terms: data = true signal + noise; outliers are still part of the data).
plot(ds, iris2) # scatterplot matrix coloured by cluster
plot(ds, iris2[c(1, 4)]) # Sepal.Length vs Petal.Width only
plotcluster(iris2, ds$cluster) # projection onto discriminant coordinates
set.seed(345)
idx <- sample(1:nrow(iris), 10)
newData <- iris[idx, -5]
newData <- newData + matrix(runif(10*4, min = 0, max = 0.2), nrow = 10, ncol = 4)
myPred <- predict(ds, iris2, newData)
plot(iris2[c(1, 4)], col = 1 + ds$cluster)
points(newData[c(1, 4)], pch = "*", col = 1 + myPred, cex = 3) # new points marked with asterisks
But there is more! The clustering model has labelled the new data, and the labels can be checked against the true species:
table(myPred, iris$Species[idx])
##
## myPred setosa versicolor virginica
## 0 0 1 1
## 1 3 0 0
## 2 0 4 0
## 3 0 0 1
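fpc supplies the predict() method for dbscan objects; base R has no equivalent for kmeans objects, but a minimal sketch (the helper name is mine) labels new data by assigning each row to the nearest cluster center:
nearest.center <- function(centers, newdata) {
  apply(newdata, 1, function(x) which.min(colSums((t(centers) - x)^2)))
}
nearest.center(kmeans.result$centers, newData) # k-means labels for the perturbed rows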
Source: R and Data Mining: Examples and Case Studies. Zhao, 2012.
library(knitr)
knitr::include_graphics('https://img1.etsystatic.com/184/0/15772964/il_570xN.1316025687_7a55.jpg')
library(ggplot2)
#install.packages("dendextend")
library(dendextend)
#install.packages("ggdendro")
library(ggdendro)
library(magrittr)
dend <- USArrests %>% dist %>% hclust %>% as.dendrogram
plot(dend)
ggdendrogram(hc, rotate = TRUE, labels = TRUE, theme_dendro = TRUE) # hc is still the iris average-linkage tree from above
dd <- dist(scale(USArrests), method = "euclidean")
hc <- hclust(dd, method = "ward.D2")
plot(hc, hang = -1, cex = 0.6)
hang: The fraction of the plot height by which labels should hang below the rest of the plot. A negative value will cause the labels to hang down from 0.
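A quick side-by-side sketch of the effect:
op <- par(mfrow = c(1, 2))
plot(hc, cex = 0.6, main = "default hang")
plot(hc, hang = -1, cex = 0.6, main = "hang = -1")
par(op)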
hcd <- as.dendrogram(hc)
# Default plot
plot(hcd, type = "rectangle", ylab = "Height")
plot(hcd, xlim = c(1, 20), ylim = c(1,8))
dend <- iris[1:30,-5] %>% scale %>% dist %>%
hclust %>% as.dendrogram %>%
set("branches_k_color", k=3) %>% set("branches_lwd", 1.5) %>%
set("labels_colors") %>% set("labels_cex", c(.8, 1, 1.2)) %>%
set("leaves_pch", 19) %>% set("leaves_col", c("purple", "pink"))
plot(dend)
ggd1 <- as.ggdend(dend) #very easy, if you already have a dendrogram
ggplot(ggd1)
## Warning: Removed 29 rows containing missing values (geom_point).
ggd1 <- as.ggdend(dend)
ggplot(ggd1, horiz = TRUE, theme = theme_minimal())
## Warning: Removed 29 rows containing missing values (geom_point).
ggplot(ggd1, labels = FALSE) +
scale_y_reverse(expand = c(0.2, 0)) +
coord_polar(theta="x")
## Warning: Removed 29 rows containing missing values (geom_point).
plot(hcd, type = "triangle", ylab = "Height", horiz = F)
The above dendrogram can be customized using the arguments:
nodePar: a list of plotting parameters to use for the nodes (Default is NULL).
edgePar: list of plotting parameters to use for the edge segments (col, lty and lwd).
leaflab: a string specifying how leaves are labelled (the default “perpendicular” writes text vertically; “textlike” writes it horizontally; “none” suppresses leaf labels).
# Define nodePar
nodePar <- list(lab.cex = 0.6, pch = c(NA, 19),
cex = 0.7, col = "salmon")
# Customized plot; remove labels, make nodes salmon pink
plot(hcd, ylab = "Height", nodePar = nodePar, leaflab = "none")
plot(hcd, xlab = "Height",
nodePar = nodePar, horiz = TRUE)
plot(hcd, xlab = "Height", nodePar = nodePar,
     edgePar = list(col = 5:6, lwd = c(2, 1.5))) # lwd = 2:1.5 would collapse to a single width
dend <- USArrests[1:7,] %>% scale %>%
dist %>% hclust %>% as.dendrogram
dend %>% plot
# Change the labels, and then plot:
dend %>% set("labels", c("a", "b", "c", "d", "e", "f", "z")) %>% plot
# Change color and size for labels
dend %>% set("labels_col", c("red", "yellow", "green")) %>% # change color
set("labels_cex", .66) %>% # Change size
plot(main = "Change the color and size") # plot
# Color labels by specifying the number of cluster (k)
dend %>% set("labels_col", value = c("purple", "pink"), k=2) %>%
plot(main = "Color labels \n by cluster")
abline(h = 2, lty = 2)
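The dashed line at height 2 corresponds to the two-cluster cut; dendextend lets cutree() work directly on dendrograms (a minimal sketch):
cutree(dend, k = 2) # cluster membership of the 7 states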
# Change the type, the color and the size of node points
dend %>% set("nodes_pch", 19) %>% # node point type
set("nodes_cex", 1.5) %>% # node point size
set("nodes_col", "steelblue") %>% # node point color
set("labels_cex", .7) %>%
plot(main = "Node points", horiz = F)
# Change the type, the color and the size of leaf points
dend %>% set("leaves_pch", 16) %>% # leaf point type
set("leaves_cex", 1.2) %>% # leaf point size
set("leaves_col", "blue") %>% # leaf point color
set("labels_cex", .7) %>%
plot(main = "Leaf points")
# Specify different point types and colors for each leaf
dend %>% set("leaves_pch", c(16, 19, 17)) %>% # leaf point types
set("leaves_cex", 1.1) %>% # leaf point size
set("leaves_col", c("purple", "magenta", "green")) %>% # leaf point colors
set("labels_cex", .7) %>%
plot(main = "Leaf points")
dend %>% set("branches_k_color", k = 2) %>%
set("labels_cex", .7) %>%
plot(main = "Default colors")
dend %>% set("branches_k_color",
value = c("red", "blue"), k = 2) %>%
plot(main = "Customized colors")
dend %>% set("branches_k_color", k = 3) %>% plot
dend %>% rect.dendrogram(k=3, border = 8, lty = 5, lwd = 2)
dend %>% set("branches_k_color", k = 3) %>% plot(horiz = TRUE)
dend %>% rect.dendrogram(k = 3, horiz = TRUE, border = 8, lty = 5, lwd = 2)
Adapted from: http://www.sthda.com/english/wiki/beautiful-dendrogram-visualizations-in-r-5-must-known-methods-unsupervised-machine-learning