This article describes several clustering algorithms implemented in R.
#install.packages("factoextra")
library(Rtsne)
library(tsne)
library(tidyverse)
library(rio)
library(RColorBrewer)
library(knitr)
library("factoextra")
library(cluster)
## calling the installed package
train<- import("/Users/nanaakwasiabayieboateng/Documents/memphisclassesbooks/DataMiningscience/Clustering/Crowdsourced Mapping/training.csv")
head(train)
train$class=as.factor(train$class)
str(train)
'data.frame': 10545 obs. of 29 variables:
$ class : Factor w/ 6 levels "farm","forest",..: 6 6 6 6 6 2 6 6 6 6 ...
$ max_ndvi : num 998 914 3801 952 1232 ...
$ 20150720_N: num 637.6 634.2 1671.3 58 72.5 ...
$ 20150602_N: num 659 594 1207 -1599 -1221 ...
$ 20150517_N: num -1882 -1626 450 211 380 ...
$ 20150501_N: num -1924 -1672 1071 -1053 -1257 ...
$ 20150415_N: num 998 914 546 579 516 ...
$ 20150330_N: num -1740 -692 1078 -1565 -1413 ...
$ 20150314_N: num 630 708 215 -858 -803 ...
$ 20150226_N: num -1628 -1671 850 730 683 ...
$ 20150210_N: num -1326 -1409 1284 -3162 -2829 ...
$ 20150125_N: num -944 -989 1305 -1522 -1268 ...
$ 20150109_N: num 277 214 542 433 461 ...
$ 20141117_N: num -206.8 -75.6 922.6 228.2 317.5 ...
$ 20141101_N: num 536 893 890 555 405 ...
$ 20141016_N: num 749 401 836 531 564 ...
$ 20140930_N: num -483 -390 1824 952 1232 ...
$ 20140813_N: num 492 394 1670 -1075 -118 ...
$ 20140626_N: num 656 667 2307 546 683 ...
$ 20140610_N: num -921 -955 1562 -1026 -1814 ...
$ 20140525_N: num -1043 -934 1566 369 156 ...
$ 20140509_N: num -1942 -625 2208 -1787 -1190 ...
$ 20140423_N: num 267 120 1057 -1228 -924 ...
$ 20140407_N: num 367 365 385 305 432 ...
$ 20140322_N: num 452 477 301 291 283 ...
$ 20140218_N: num 211 221 294 369 298 ...
$ 20140202_N: num -2203 -2250 2763 -2202 -2197 ...
$ 20140117_N: num -1180 -1361 151 600 626 ...
$ 20140101_N: num 434 524 3801 -1344 -827 ...
# Keep the predictor columns numeric: the k-means and distance computations below require numeric input
# train <- train %>% mutate_if(is.numeric, as.factor)
# K-Means Clustering with 3 clusters
fit <- kmeans(iris[,1:4], 3)
# Cluster Plot against 1st 2 principal components
clusplot(iris[,1:4], fit$cluster, color=TRUE, shade=TRUE, labels=3, lines=0)
fit%>%head()
$cluster
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[51] 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[101] 3 2 3 3 3 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 3 3 3 3 2 3 3 3 2 3 3 3 2 3 3 2
$centers
Sepal.Length Sepal.Width Petal.Length Petal.Width
1 5.006000 3.428000 1.462000 0.246000
2 5.901613 2.748387 4.393548 1.433871
3 6.850000 3.073684 5.742105 2.071053
$totss
[1] 681.3706
$withinss
[1] 15.15100 39.82097 23.87947
$tot.withinss
[1] 78.85144
$betweenss
[1] 602.5192
# K-Means Clustering with 6 clusters
fit <- kmeans(train[,-1], 6)
# Cluster Plot against 1st 2 principal components
clusplot(train[,-1], fit$cluster, color=TRUE, shade=TRUE, labels=0, lines=0)  # labels must be an integer in 0:5; 0 = no labels
Compute the distances.
distances = dist(train[,-1], method = "euclidean")
There are various techniques in hierarchical clustering. Here, we consider Ward's method, which is based on distances between centroids and within-cluster variances.
fit_hc<-hclust(distances, method = "ward.D")
plot(fit_hc)
plot(fit_hc)
rect.hclust(fit_hc, k = 6, border = "blue")
plot(fit_hc) # display dendrogram
groups <- cutree(fit_hc, k=3) # cut tree into 3 clusters
# draw dendrogram with red borders around the 3 clusters
rect.hclust(fit_hc, k=3, border="red")
#library(rpud)
install.packages("rpud")
Installing package into ‘/Users/nanaakwasiabayieboateng/Library/R/3.4/library’
(as ‘lib’ is unspecified)
Warning in install.packages :
package ‘rpud’ is not available (for R version 3.4.1)
fit_km = kmeans(train[,-1], centers = 6, nstart = 20)
names(fit_km)
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"
[7] "size" "iter" "ifault"
attributes(fit_km)
$names
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"
[7] "size" "iter" "ifault"
$class
[1] "kmeans"
fit_km$centers
max_ndvi 20150720_N 20150602_N 20150517_N 20150501_N 20150415_N 20150330_N 20150314_N 20150226_N
1 7951.808 7041.744 2756.644 6396.5180 7148.348 496.4735 3529.513 1894.856 5252.8405
2 7837.537 6484.468 5362.508 4835.0892 5917.055 2556.1776 5433.398 3954.655 5455.9754
3 7087.216 4875.583 3801.549 2716.2724 3789.584 1977.7132 4893.552 3816.139 3876.7899
4 8092.030 6286.156 7405.038 5494.5289 5926.027 6785.9893 6858.562 3294.386 6597.1688
5 2597.921 1809.370 1268.958 774.4369 1201.813 1194.7143 1217.718 1173.303 818.8006
6 7827.469 6075.783 6587.107 4709.2019 4970.326 3796.6643 5575.058 4506.093 5715.9119
20150210_N 20150125_N 20150109_N 20141117_N 20141101_N 20141016_N 20140930_N 20140813_N 20140626_N
1 1802.241 2585.466 933.6302 1996.766 637.8239 1166.279 991.5442 1696.6322 2487.878
2 4945.251 7267.725 3781.3251 2021.533 2081.7024 1490.661 1126.3265 2631.8926 3210.604
3 3210.029 4261.117 1591.0804 2337.748 2037.0799 2068.411 1849.9389 1740.9452 3408.682
4 6696.053 7282.415 3473.8132 6943.557 4778.3507 4707.625 4102.8139 856.0087 2819.742
5 1211.962 1266.042 747.1180 1459.684 1316.5520 1319.166 1329.8296 1002.7528 1474.555
6 6364.351 5979.931 1616.2231 4447.159 4534.2848 5606.437 4798.5566 862.5837 3728.317
20140610_N 20140525_N 20140509_N 20140423_N 20140407_N 20140322_N 20140218_N 20140202_N 20140117_N
1 3649.289 4759.695 2913.354 4144.2253 739.5823 2886.8984 629.7423 6746.569 711.201
2 4903.869 4069.703 2923.284 3250.8815 1398.3357 4663.7203 1562.9058 6764.144 2582.180
3 3916.864 3163.083 3675.098 3216.8030 1818.2729 1760.8143 1161.0218 5557.568 1512.822
4 6802.208 2639.935 2615.503 2147.5663 3605.1606 3875.4071 4591.3772 7274.335 5304.153
5 1538.597 1599.596 1361.412 938.4758 827.3844 755.0112 611.2895 1458.597 1121.484
6 6591.674 4675.166 3608.978 3292.8240 3420.2253 1362.4768 3479.9115 6689.611 3818.138
20140101_N
1 1214.008
2 4851.483
3 2573.937
4 2779.081
5 622.777
6 2004.545
#fit_km%>%head()
table(fit_km$cluster, train$class)
farm forest grass impervious orchard water
1 97 1604 2 2 17 4
2 136 1393 198 54 18 5
3 102 1867 31 13 6 4
4 17 18 20 672 0 165
5 1 1686 17 5 3 1
6 1088 863 178 223 9 26
fit_km$cluster <- as.factor(fit_km$cluster )
ggplot(train, aes(train[,3],train[,2], color = fit_km$cluster)) + geom_point()
ggplot(train, aes(train[,4],train[,5], color = fit_km$cluster)) + geom_point()
names(train)
[1] "class" "max_ndvi" "20150720_N" "20150602_N" "20150517_N" "20150501_N" "20150415_N"
[8] "20150330_N" "20150314_N" "20150226_N" "20150210_N" "20150125_N" "20150109_N" "20141117_N"
[15] "20141101_N" "20141016_N" "20140930_N" "20140813_N" "20140626_N" "20140610_N" "20140525_N"
[22] "20140509_N" "20140423_N" "20140407_N" "20140322_N" "20140218_N" "20140202_N" "20140117_N"
[29] "20140101_N"
A fundamental question is how to determine the value of the parameter k. One approach is to look at the percentage of variance explained as a function of the number of clusters: choose a number of clusters so that adding another cluster doesn't give much better modeling of the data. More precisely, if one plots the percentage of variance explained by the clusters against the number of clusters, the first clusters will add much information (explain a lot of variance), but at some point the marginal gain will drop, giving an angle in the graph. The number of clusters is chosen at this point, hence the "elbow criterion".
wssplot <- function(data, nc=15, seed=148){
  wss <- (nrow(data)-1)*sum(apply(data, 2, var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}
wssplot(train[,-1], nc=8)
library(cluster)
clusplot(train[,-1], fit_km$cluster, main='2D representation of the Cluster solution',
color=TRUE, shade=TRUE,
labels=2, lines=0)
o=order(fit_km$cluster)
data.frame(train$class[o],fit_km$cluster[o])%>%head()
#foodagg=agnes(train,diss=FALSE,metric="euclidian")
#plot(foodagg, main='Dendrogram') ## dendrogram
The classification of objects, into clusters, requires some methods for measuring the distance or the (dis)similarity between the objects. Chapter Clustering Distance Measures Essentials covers the common distance measures used for assessing similarity between observations.
It’s simple to compute and visualize distance matrix using the functions get_dist() and fviz_dist() [factoextra R package]:
get_dist(): computes a distance matrix between the rows of a data matrix. Compared to the standard dist() function, it supports correlation-based distance measures, including the "pearson", "kendall" and "spearman" methods.
fviz_dist(): visualizes a distance matrix.
res.dist <- get_dist(USArrests, stand = TRUE, method = "pearson")
fviz_dist(res.dist,
gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
library(FactoMineR)
#works with qualitative variables
#MCA(train[,-1], ncp = 5, graph = TRUE)
#res.mca <- MCA(train[,-1],quanti.sup=19,quali.sup=20:36)
#summary(res.mca)
# Load data
data("USArrests")
my_data1 <- USArrests
# Remove any missing value (i.e, NA values for not available)
my_data1 <- na.omit(my_data1)
# Scale variables
my_data1 <- scale(my_data1)
# View the firt 3 rows
head(my_data1, n = 5)
Murder Assault UrbanPop Rape
Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
Arizona 0.07163341 1.4788032 0.9989801 1.042878388
Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
California 0.27826823 1.2628144 1.7589234 2.067820292
Three popular methods for determining the optimal number of clusters are the elbow method, the average silhouette method and the gap statistic method.
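For instance, the elbow and the average silhouette criteria can both be obtained with fviz_nbclust() by changing the method argument; a quick sketch (the gap statistic version follows next):
fviz_nbclust(my_data1, kmeans, method = "wss")        # elbow: total within-cluster sum of squares
fviz_nbclust(my_data1, kmeans, method = "silhouette") # average silhouette width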
fviz_nbclust(my_data1, kmeans, method = "gap_stat")
install.packages("NbClust")
library("NbClust")
res.nbclust <- NbClust(my_data1, distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "complete", index ="all")
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 9 proposed 2 as the best number of clusters
* 4 proposed 3 as the best number of clusters
* 6 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 1 proposed 10 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 2
*******************************************************************
Visualize using factoextra:
factoextra::fviz_nbclust(res.nbclust) + theme_minimal()
Among all indices:
===================
* 2 proposed 0 as the best number of clusters
* 1 proposed 1 as the best number of clusters
* 9 proposed 2 as the best number of clusters
* 4 proposed 3 as the best number of clusters
* 6 proposed 4 as the best number of clusters
* 2 proposed 5 as the best number of clusters
* 1 proposed 8 as the best number of clusters
* 1 proposed 10 as the best number of clusters
Conclusion
=========================
* According to the majority rule, the best number of clusters is 2 .
The term clustering validation designates the procedure of evaluating the results of a clustering algorithm. Clustering validation statistics can be computed with fpc::cluster.stats().
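A minimal sketch of cluster.stats(), assuming a k-means partition of the scaled iris measurements (the object names df_val, km_val and cs are illustrative):
library(fpc)
df_val <- scale(iris[, -5])
km_val <- kmeans(df_val, centers = 3, nstart = 25)
cs <- fpc::cluster.stats(dist(df_val), km_val$cluster,
                         alt.clustering = as.integer(iris$Species))
cs$avg.silwidth   # average silhouette width
cs$dunn           # Dunn index
cs$corrected.rand # corrected Rand index against the known species labels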
my_data <- scale(iris[, -5])
# Enhanced hierarchical clustering, cut in 3 groups
library("factoextra")
res.hc <- eclust(my_data, "hclust", k = 3, graph = FALSE)
# Visualize
fviz_dend(res.hc, rect = TRUE, show_labels = FALSE)
fviz_nbclust(my_data1, kmeans, method = "gap_stat")
#Compute and visualize k-means clustering
km.res <- kmeans(my_data1, 4, nstart = 25)
# Visualize
fviz_cluster(km.res, data = my_data1, ellipse.type = "convex") +
  theme_minimal()
An alternative to k-means clustering is the K-medoids clustering or PAM (Partitioning Around Medoids, Kaufman & Rousseeuw, 1990), which is less sensitive to outliers compared to k-means.
pam.res <- pam(my_data1, 4)
# Visualize
fviz_cluster(pam.res)
Estimate the number of clusters in the data using gap statistics : factoextra::fviz_nbclust()
fviz_nbclust(my_data, kmeans, method = "gap_stat")
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 100): ... done
(During the bootstrap, kmeans issued several "did not converge in 10 iterations" and "Quick-TRANSfer stage steps exceeded maximum (= 527250)" warnings.)
library("NbClust")
set.seed(123)
res.nbclust <- NbClust(my_data, distance = "euclidean",
min.nc = 2, max.nc = 10,
method = "complete", index ="all")
Visualize using factoextra:
factoextra::fviz_nbclust(res.nbclust) + theme_minimal()
#Compute and visualize k-means clustering
km.res <- kmeans(my_data, 6, nstart = 25)
# Visualize
fviz_cluster(km.res, data = my_data, ellipse.type = "convex") +
  theme_minimal()
# Compute hierarchical clustering and cut into 6 clusters
res <- hcut(my_data, k = 6, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("#00AFBB", "#2E9FDF", "#E7B800", "#FC4E07", "#99D594", "#3288BD"))
PAM clustering is an alternative to k-means that is less sensitive to outliers.
pam.res <- pam(my_data, 4)
# Visualize
fviz_cluster(pam.res)
# 2. Compute dissimilarity matrix
d <- dist(my_data1, method = "euclidean")
# Hierarchical clustering using Ward's method
res.hc <- hclust(d, method = "ward.D2" )
# Cut tree into 4 groups
rp <- cutree(res.hc, k = 4)
# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 4, border = 2:5) # add rectangle
Clustering validation includes three main tasks:
1. clustering tendency assesses whether applying clustering is suitable to your data.
2. clustering evaluation assesses the goodness or quality of the clustering.
3. clustering stability seeks to understand the sensitivity of the clustering result to various algorithmic parameters, for example, the number of clusters.
• Hopkins statistic: if the value of the Hopkins statistic is close to zero (far below 0.5), then we can conclude that the dataset is significantly clusterable.
• VAT (Visual Assessment of cluster Tendency): VAT detects the clustering tendency in a visual form by counting the number of square-shaped dark (or colored) blocks along the diagonal in a VAT image.
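As a sketch of the VAT idea, the dissimilarity image can be drawn with fviz_dist() (introduced earlier) on the scaled iris data used just below:
fviz_dist(dist(scale(iris[, -5])))   # blocks of low distance along the diagonal suggest clusters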
my_data2 <- scale(iris[, -5])
get_clust_tendency(my_data2, n = 50,
                   gradient = list(low = "steelblue", high = "white"))
$hopkins_stat
[1] 0.2002686
$plot
# 1. Loading and preparing data
data("USArrests")
my_data <- scale(USArrests)
# 2. Compute dissimilarity matrix
d <- dist(my_data, method = "euclidean")
# Hierarchical clustering using Ward's method
res.hc <- hclust(d, method = "ward.D2" )
grp <- cutree(res.hc, k = 4)
# Visualize
plot(res.hc, cex = 0.6) # plot tree
rect.hclust(res.hc, k = 4, border = 2:5) # add rectangle
# Compute hierarchical clustering and cut into 4 clusters
res <- hcut(my_data1, k = 4, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("#D53E4F", "#FC8D59", "#E6F598", "#99D594"))  # one color per cluster
my_data3 <- scale(iris[, -5])
# Enhanced hierarchical clustering, cut in 3 groups
library("factoextra")
res.hc <- eclust(my_data3, "hclust", k = 3, graph = FALSE)
# Visualize
fviz_dend(res.hc, rect = TRUE, show_labels = FALSE)
#Compute hierarchical clustering and cut into 4 clusters
res <- hcut(my_data3, k = 3, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
          k_colors = c("#D53E4F", "#E6F598", "#99D594"))  # one color per cluster
Recall that the silhouette \(S_i\) measures how similar an object \(i\) is to the other objects in its own cluster versus those in the neighbor cluster. \(S_i\) values range from -1 to 1: • A value of \(S_i\) close to 1 indicates that the object is well clustered; in other words, the object \(i\) is similar to the other objects in its group. • A value of \(S_i\) close to -1 indicates that the object is poorly clustered, and that assignment to some other cluster would probably improve the overall results.
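In formula form (the standard definition), with \(a_i\) the average dissimilarity of object \(i\) to the other members of its own cluster and \(b_i\) the lowest average dissimilarity of \(i\) to any other cluster, \( S_i = (b_i - a_i) / \max(a_i, b_i) \).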
# Visualize the silhouette plot
fviz_silhouette(res.hc)
cluster size ave.sil.width
1 1 49 0.63
2 2 30 0.44
3 3 71 0.32
Which samples have a negative silhouette width? To which cluster are they closer?
# Silhouette width of observations
sil <- res.hc$silinfo$widths[, 1:3]
# Objects with negative silhouette
neg_sil_index <- which(sil[, 'sil_width'] < 0)
sil[neg_sil_index, , drop = FALSE]
res.hc$dist.method
[1] "euclidean"
Which clustering algorithm and how many clusters are the best fit for your data? The clValid package compares clustering algorithms using internal validation measures.
install.packages("clValid")
# Compute clValid
library("clValid")
intern <- clValid(my_data1, nClust = 2:6,
clMethods = c("hierarchical","kmeans","pam"),
validation = "internal")
# Summary
summary(intern)
Clustering Methods:
hierarchical kmeans pam
Cluster sizes:
2 3 4 5 6
Validation Measures:
2 3 4 5 6
hierarchical Connectivity 6.6437 9.5615 13.9563 22.5782 31.2873
Dunn 0.2214 0.2214 0.2224 0.2046 0.2126
Silhouette 0.4085 0.3486 0.3637 0.3213 0.2720
kmeans Connectivity 6.6437 13.6484 16.2413 24.6639 33.7194
Dunn 0.2214 0.2224 0.2224 0.1983 0.2231
Silhouette 0.4085 0.3668 0.3573 0.3377 0.3079
pam Connectivity 6.6437 13.8302 20.4421 29.5726 38.2643
Dunn 0.2214 0.1376 0.1849 0.1849 0.2019
Silhouette 0.4085 0.3144 0.3390 0.3105 0.2630
Optimal Scores:
Score Method Clusters
Connectivity 6.6437 hierarchical 2
Dunn 0.2231 kmeans 6
Silhouette 0.4085 hierarchical 2
How can the uncertainty of a cluster tree be assessed in R? The R package pvclust (Suzuki et al., 2004) uses bootstrap resampling techniques to compute a p-value for each cluster.
#install.packages("pvclust")
library(pvclust)
# Data preparation
set.seed(123)
data("lung")
ss <- sample(1:73, 30) # randomly select 30 of the 73 samples
my_data4 <- lung[, ss]
my_data4%>%head()
# Compute pvclust
res.pv <- pvclust(my_data4, method.dist="cor",
method.hclust="average", nboot = 10)
Bootstrap (r = 0.5)... Done.
Bootstrap (r = 0.6)... Done.
Bootstrap (r = 0.7)... Done.
Bootstrap (r = 0.8)... Done.
Bootstrap (r = 0.9)... Done.
Bootstrap (r = 1.0)... Done.
Bootstrap (r = 1.1)... Done.
Bootstrap (r = 1.2)... Done.
Bootstrap (r = 1.3)... Done.
Bootstrap (r = 1.4)... Done.
# Default plot
plot(res.pv, hang = -1, cex = 0.5)
pvrect(res.pv)
Fuzzy clustering is also known as soft clustering. Standard clustering approaches (k-means, PAM) produce partitions in which each observation belongs to only one cluster; this is known as hard clustering. In fuzzy clustering, items can be members of more than one cluster: each item has a set of membership coefficients corresponding to the degree of being in a given cluster. The fuzzy c-means method is the most popular fuzzy clustering algorithm. Read more: Fuzzy clustering analysis.
Model-based clustering
In model-based clustering, the data are viewed as coming from a distribution that is a mixture of two or more clusters. It finds the best fit of models to the data and estimates the number of clusters. Read more: Model-based clustering.
K-means (Chapter @ref(kmeans-clustering)) is one of the most popular clustering algorithms. However, it has some limitations: it requires the user to specify the number of clusters in advance, and it selects the initial centroids randomly. The final k-means clustering solution is very sensitive to this initial random selection of cluster centers, so the result might be (slightly) different each time you compute k-means.
# Compute hierarchical k-means clustering
library(factoextra)
res.hk <-hkmeans(my_data1, 4)
# Elements returned by hkmeans()
names(res.hk)
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"
[7] "size" "iter" "ifault" "data" "hclust"
# Print the results
res.hk
Hierarchical K-means clustering with 4 clusters of sizes 8, 13, 16, 13
Cluster means:
Murder Assault UrbanPop Rape
1 1.4118898 0.8743346 -0.8145211 0.01927104
2 0.6950701 1.0394414 0.7226370 1.27693964
3 -0.4894375 -0.3826001 0.5758298 -0.26165379
4 -0.9615407 -1.1066010 -0.9301069 -0.96676331
Clustering vector:
Alabama Alaska Arizona Arkansas California Colorado
1 2 2 1 2 2
Connecticut Delaware Florida Georgia Hawaii Idaho
3 3 2 1 3 4
Illinois Indiana Iowa Kansas Kentucky Louisiana
2 3 4 3 4 1
Maine Maryland Massachusetts Michigan Minnesota Mississippi
4 2 3 2 4 1
Missouri Montana Nebraska Nevada New Hampshire New Jersey
2 4 4 2 4 3
New Mexico New York North Carolina North Dakota Ohio Oklahoma
2 2 1 4 3 3
Oregon Pennsylvania Rhode Island South Carolina South Dakota Tennessee
3 3 3 1 4 1
Texas Utah Vermont Virginia Washington West Virginia
2 3 4 3 3 4
Wisconsin Wyoming
4 3
Within cluster sum of squares by cluster:
[1] 8.316061 19.922437 16.212213 11.952463
(between_SS / total_SS = 71.2 %)
Available components:
[1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss"
[7] "size" "iter" "ifault" "data" "hclust"
# Visualize the tree
fviz_dend(res.hk, cex = 0.6, palette = "jco",
rect = TRUE, rect_border = "jco", rect_fill = TRUE)
# Visualize the hkmeans final clusters
fviz_cluster(res.hk, palette = "jco", repel = TRUE,
ggtheme = theme_classic())
This is different from k-means and k-medoids clustering, where each object is assigned to exactly one cluster. K-means and k-medoids clustering are known as hard or non-fuzzy clustering.
In fuzzy clustering, points close to the center of a cluster may belong to the cluster to a higher degree than points at the edge of the cluster. The degree to which an element belongs to a given cluster is a numerical value varying from 0 to 1.
The fuzzy c-means (FCM) algorithm is one of the most widely used fuzzy clustering algorithms. The centroid of a cluster is calculated as the mean of all points, weighted by their degree of belonging to the cluster:
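In symbols (a sketch of the usual FCM formulation, where \(u_{ik}\) is the membership of point \(x_i\) in cluster \(k\) and \(m > 1\) is the fuzzifier): \( c_k = \frac{\sum_i u_{ik}^{m} x_i}{\sum_i u_{ik}^{m}} \).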
The function fanny() [cluster R package] can be used to compute fuzzy clustering. FANNY stands for fuzzy analysis clustering. A simplified format is:
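(Sketched here with only the arguments described below; the remaining arguments keep their defaults.)
# fanny(x, k, metric = "euclidean", stand = FALSE)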
x: a data matrix, data frame or dissimilarity matrix
k: the desired number of clusters to be generated
metric: the metric used for calculating dissimilarities between observations
stand: if TRUE, variables are standardized before calculating the dissimilarities
The function fanny() returns an object including the following components:
membership: matrix containing the degree to which each observation belongs to a given cluster. Column names are the clusters and rows are observations.
coeff: Dunn's partition coefficient F(k) of the clustering, where k is the number of clusters. F(k) is the sum of all squared membership coefficients, divided by the number of observations. Its value is between 1/k and 1. The normalized form of the coefficient is also given; it is defined as (F(k) − 1/k)/(1 − 1/k) and ranges between 0 and 1. A low value of Dunn's coefficient indicates a very fuzzy clustering, whereas a value close to 1 indicates a near-crisp clustering.
clustering: the clustering vector containing the nearest crisp grouping of observations
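In symbols, Dunn's partition coefficient described above is \( F(k) = \frac{1}{n} \sum_{i=1}^{n} \sum_{c=1}^{k} u_{ic}^{2} \), where the \(u_{ic}\) are the membership coefficients and \(n\) is the number of observations.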
library(cluster)
df <- scale(USArrests) # Standardize the data
res.fanny <- fanny(df, 2) # Compute fuzzy clustering with k = 2
head(res.fanny$membership, 3) # Membership coefficients
[,1] [,2]
Alabama 0.6641977 0.3358023
Alaska 0.6098062 0.3901938
Arizona 0.6862278 0.3137722
res.fanny$coeff # Dunn's partition coefficient
dunn_coeff normalized
0.5547365 0.1094731
head(res.fanny$clustering) # Observation groups
Alabama Alaska Arizona Arkansas California Colorado
1 1 1 2 1 1
To visualize observation groups, use the function fviz_cluster() [factoextra package]:
fviz_cluster(res.fanny, ellipse.type = "norm", repel = TRUE,
palette = "jco", ggtheme = theme_minimal(),
legend = "right")
To evaluate the goodnesss of the clustering results, plot the silhouette coefficient as follow:
fviz_silhouette(res.fanny, palette = "jco",
ggtheme = theme_minimal())
cluster size ave.sil.width
1 1 22 0.32
2 2 28 0.44
The traditional clustering methods, such as hierarchical clustering (Chapter @ref(agglomerative-clustering)) and k-means clustering (Chapter @ref(kmeans-clustering)), are heuristic and are not based on formal models. Furthermore, the k-means algorithm is commonly randomly initialized, so different runs of k-means will often yield different results. Additionally, k-means requires the user to specify the optimal number of clusters.
An alternative is model-based clustering, which considers the data as coming from a distribution that is a mixture of two or more clusters (Fraley and Raftery 2002, Fraley et al. (2012)). Unlike k-means, model-based clustering uses a soft assignment, where each data point has a probability of belonging to each cluster.
Concept of model-based clustering
In model-based clustering, the data are considered as coming from a mixture of densities.
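In symbols, the mixture density of an observation \(x\) can be written as \( f(x) = \sum_{k=1}^{K} \pi_k \, \phi(x; \mu_k, \Sigma_k) \), where the mixing proportions \(\pi_k\) are non-negative and sum to one, and \(\phi(\cdot; \mu_k, \Sigma_k)\) is a multivariate Gaussian density.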
Each component (i.e. cluster) k is modeled by the normal or Gaussian distribution which is characterized by the parameters:
μk: the mean vector; ∑k: the covariance matrix; and an associated probability in the mixture. Each point has a probability of belonging to each cluster. For example, consider the "old faithful geyser" data [in the MASS R package], which can be illustrated as follows using the ggpubr R package:
# Load the data
library("MASS")
data("geyser")
# Scatter plot
library("ggpubr")
ggscatter(geyser, x = "duration", y = "waiting")+
geom_density2d() # Add 2D density
The plot above suggests at least 3 clusters in the mixture. The shape of each of the 3 clusters appears to be approximately elliptical, suggesting three bivariate normal distributions. As the 3 ellipses seem to be similar in terms of volume, shape and orientation, we might anticipate that the three components of this mixture have homogeneous covariance matrices.
The model parameters can be estimated using the Expectation-Maximization (EM) algorithm initialized by hierarchical model-based clustering. Each cluster k is centered at the means μk , with increased density for points near the mean.
Geometric features (shape, volume, orientation) of each cluster are determined by the covariance matrix ∑k .
Different possible parameterizations of ∑k are available in the R package mclust (see ?mclustModelNames).
The available model options, in mclust package, are represented by identifiers including: EII, VII, EEI, VEI, EVI, VVI, EEE, EEV, VEV and VVV.
The first identifier refers to volume, the second to shape and the third to orientation. E stands for “equal”, V for “variable” and I for “coordinate axes”.
For example:
EVI denotes a model in which the volumes of all clusters are equal (E), the shapes of the clusters may vary (V), and the orientation is the identity, i.e. the coordinate axes (I).
EEE means that the clusters have the same volume, shape and orientation in p-dimensional space.
VEI means that the clusters have variable volume, the same shape, and orientation equal to the coordinate axes.
Choosing the best model
The mclust package uses maximum likelihood to fit all of these models, with different covariance matrix parameterizations, for a range of k components.
The best model is selected using the Bayesian Information Criterion or BIC. A large BIC score indicates strong evidence for the corresponding model.
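In mclust's convention, \( \mathrm{BIC} = 2 \log L - d \log(n) \), where \(\log L\) is the maximized log-likelihood, \(d\) is the number of estimated parameters and \(n\) the number of observations, so larger values are better. As a check against the summary output below: \( 2(-169.09) - 29 \log(145) \approx -482.5 \), which matches the reported BIC.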
library("mclust")
data("diabetes")
head(diabetes, 3)
df <- scale(diabetes[, -1]) # Standardize the data
mc <- Mclust(df) # Model-based-clustering
summary(mc) # Print a summary
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 3 components:
log.likelihood n df BIC ICL
-169.0918 145 29 -482.5089 -501.4368
Clustering table:
1 2 3
81 36 28
For this data, it can be seen that model-based clustering selected a model with three components (i.e. clusters). The optimal selected model is the VVV model; that is, the three components are ellipsoidal with varying volume, shape, and orientation. The summary also contains the clustering table, specifying the number of observations in each cluster.
You can access the results as follows:
mc$modelName # Optimal selected model ==> "VVV"
[1] "VVV"
mc$G # Optimal number of cluster => 3
[1] 3
head(mc$z, 30) # Probability of belonging to a given cluster
[,1] [,2] [,3]
1 0.9907874 0.008870671 3.419770e-04
2 0.9824097 0.017586177 4.107625e-06
3 0.9779945 0.021948580 5.690665e-05
4 0.9777076 0.022077703 2.147003e-04
5 0.9217797 0.078151587 6.868319e-05
6 0.9864768 0.012835588 6.876134e-04
7 0.9436635 0.056211528 1.249607e-04
8 0.9771351 0.022689680 1.752408e-04
9 0.9742319 0.025580223 1.878661e-04
10 0.9842886 0.015700736 1.061526e-05
11 0.9608016 0.039063321 1.350300e-04
12 0.9820004 0.017881233 1.184090e-04
13 0.9818349 0.018011976 1.531207e-04
14 0.9751132 0.024576982 3.097991e-04
15 0.9676354 0.032185044 1.795065e-04
16 0.9757508 0.020466961 3.782239e-03
17 0.9867246 0.013089300 1.861052e-04
18 0.9719509 0.027856308 1.927908e-04
19 0.9854911 0.014497880 1.099339e-05
20 0.9878355 0.012130965 3.352455e-05
21 0.9844490 0.015337570 2.134332e-04
22 0.9873609 0.012616432 2.266848e-05
23 0.9799708 0.019894253 1.349581e-04
24 0.9803321 0.019540366 1.275444e-04
25 0.6609733 0.339015667 1.108037e-05
26 0.9555813 0.038816298 5.602419e-03
27 0.9780961 0.021784401 1.195137e-04
28 0.9761452 0.023700606 1.542353e-04
29 0.9848905 0.014928363 1.810939e-04
30 0.9499927 0.049921182 8.607427e-05
head(mc$classification, 10) # Cluster assignment of each observation
1 2 3 4 5 6 7 8 9 10
1 1 1 1 1 1 1 1 1 1
Model-based clustering results can be drawn using the base function plot.Mclust() [in mclust package]. Here we’ll use the function fviz_mclust() [in factoextra package] to create beautiful plots based on ggplot2.
In the situation where the data contain more than two variables, fviz_mclust() uses principal component analysis to reduce the dimensionality of the data, and the first two principal components are used to produce a scatter plot of the data. However, if you want to plot the data using only two variables of interest, say c("insulin", "sspg") here, you can specify that in the fviz_mclust() function using the argument choose.vars = c("insulin", "sspg").
library(factoextra)
# BIC values used for choosing the number of clusters
fviz_mclust(mc, "BIC", palette = "jco")
# Classification: plot showing the clustering
fviz_mclust(mc, "classification", geom = "point",
pointsize = 1.5, palette = "jco")
# Classification uncertainty
fviz_mclust(mc, "uncertainty", palette = "jco")
DBSCAN (Density-Based Spatial Clustering of Applications with Noise) is a density-based clustering algorithm, introduced in Ester et al. (1996), which can be used to identify clusters of any shape in a data set containing noise and outliers.
The basic idea behind the density-based clustering approach is derived from a human intuitive clustering method. For instance, by looking at the figure below, one can easily identify four clusters along with several points of noise, because of the differences in the density of points.
Clusters are dense regions in the data space, separated by regions of lower density of points. The DBSCAN algorithm is based on this intuitive notion of “clusters” and “noise”. The key idea is that for each point of a cluster, the neighborhood of a given radius has to contain at least a minimum number of points.
Why DBSCAN?
Partitioning methods (K-means, PAM clustering) and hierarchical clustering are suitable for finding spherical-shaped clusters or convex clusters. In other words, they work well only for compact and well separated clusters. Moreover, they are also severely affected by the presence of noise and outliers in the data.
Unfortunately, real life data can contain: i) clusters of arbitrary shape such as those shown in the figure below (oval, linear and “S” shape clusters); ii) many outliers and noise.
The figure below shows a data set containing nonconvex clusters and outliers/noises. The simulated data set multishapes [in factoextra package] is used.
data("multishapes")
df <- multishapes[, 1:2]
set.seed(123)
km.res <- kmeans(df, 5, nstart = 25)
fviz_cluster(km.res, df, geom = "point",
ellipse= FALSE, show.clust.cent = FALSE,
palette = "jco", ggtheme = theme_classic())
Here, we'll use the R package fpc to compute DBSCAN. It's also possible to use the package dbscan, which provides a faster re-implementation of the DBSCAN algorithm compared to the fpc package.
We’ll also use the factoextra package for visualizing clusters.
First, install the packages as follow:
#install.packages("fpc")
install.packages("dbscan")
# Load the data
data("multishapes", package = "factoextra")
df <- multishapes[, 1:2]
# Compute DBSCAN using fpc package
library("fpc")
set.seed(123)
db <- fpc::dbscan(df, eps = 0.15, MinPts = 5)
# Plot DBSCAN results
library("factoextra")
fviz_cluster(db, data = df, stand = FALSE,
ellipse = FALSE, show.clust.cent = FALSE,
geom = "point",palette = "jco", ggtheme = theme_classic())
print(db)
dbscan Pts=1100 MinPts=5 eps=0.15
0 1 2 3 4 5
border 31 24 1 5 7 1
seed 0 386 404 99 92 50
total 31 410 405 104 99 51
In the table above, the column names are the cluster numbers; cluster 0 corresponds to outliers (black points in the DBSCAN plot). The function print.dbscan() shows, for each cluster, how many points are seed (core) points and how many are border points.
# Cluster membership. Noise/outlier observations are coded as 0
# A random subset is shown
db$cluster[sample(1:1089, 20)]
[1] 3 2 4 0 1 2 4 2 2 0 2 2 2 1 4 1 1 1 0 4
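As mentioned earlier, the dbscan package offers a faster interface with essentially the same arguments (note the parameter is spelled minPts there); a sketch on the same data:
db2 <- dbscan::dbscan(df, eps = 0.15, minPts = 5)
db2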
The method proposed here consists of computing the k-nearest neighbor distances in a matrix of points.
The idea is to calculate the average of the distances of every point to its k nearest neighbors. The value of k is specified by the user and corresponds to MinPts.
Next, these k-distances are plotted in an ascending order. The aim is to determine the “knee”, which corresponds to the optimal eps parameter.
A knee corresponds to a threshold where a sharp change occurs along the k-distance curve.
The function kNNdistplot() [in dbscan package] can be used to draw the k-distance plot:
dbscan::kNNdistplot(df, k = 5)
abline(h = 0.15, lty = 2)
Clustering is one of the important data mining methods for discovering knowledge in multivariate data sets. The goal is to identify groups (i.e. clusters) of similar objects within a data set of interest. To learn more about clustering, you can read our book entitled “Practical Guide to Cluster Analysis in R” (https://goo.gl/DmJ5y5).
Briefly, the two most common clustering strategies are:
Hierarchical clustering, used for identifying groups of similar observations in a data set.
Partitioning clustering, such as the k-means algorithm, used for splitting a data set into several groups.
The HCPC (Hierarchical Clustering on Principal Components) approach allows us to combine the three standard methods used in multivariate data analyses (Husson, Josse, and Pagès 2010):
Principal component methods (PCA, CA, MCA, FAMD, MFA), Hierarchical clustering and Partitioning clustering, particularly the k-means method.
Why HCPC
Combining principal component methods and clustering methods is useful in at least three situations.
Case 1: Continuous variables
In the situation where you have a multidimensional data set containing multiple continuous variables, the principal component analysis (PCA) can be used to reduce the dimension of the data into few continuous variables containing the most important information in the data. Next, you can perform cluster analysis on the PCA results.
The PCA step can be considered as a denoising step which can lead to a more stable clustering. This might be very useful if you have a large data set with multiple variables, such as in gene expression data.
Case 2: Clustering on categorical data
In order to perform clustering analysis on categorical data, the correspondence analysis (CA, for analyzing contingency table) and the multiple correspondence analysis (MCA, for analyzing multidimensional categorical variables) can be used to transform categorical variables into a set of few continuous variables (the principal components). The cluster analysis can be then applied on the (M)CA results.
In this case, the (M)CA method can be considered as a pre-processing step that makes it possible to compute clustering on categorical data.
Case 3: Clustering on mixed data
When you have a mixed data of continuous and categorical variables, you can first perform FAMD (factor analysis of mixed data) or MFA (multiple factor analysis). Next, you can apply cluster analysis on the FAMD/MFA outputs.
Algorithm of the HCPC method
The algorithm of the HCPC method, as implemented in the FactoMineR package, can be summarized as follows:
Compute principal component methods: PCA, (M)CA or MFA depending on the types of variables in the data set and the structure of the data set. At this step, you can choose the number of dimensions to be retained in the output by specifying the argument ncp. The default value is 5.
Compute hierarchical clustering: Hierarchical clustering is performed using the Ward’s criterion on the selected principal components. Ward criterion is used in the hierarchical clustering because it is based on the multidimensional variance like principal component analysis.
Choose the number of clusters based on the hierarchical tree: An initial partitioning is performed by cutting the hierarchical tree.
Perform K-means clustering to improve the initial partition obtained from hierarchical clustering. The final partitioning solution, obtained after consolidation with k-means, can be (slightly) different from the one obtained with the hierarchical clustering.
Computation
R packages
We’ll use two R packages: i) FactoMineR for computing HCPC and ii) factoextra for visualizing the results.
To load the packages, type this:
library(factoextra)
library(FactoMineR)
The function HCPC() [in FactoMineR package] can be used to compute hierarchical clustering on principal components.
A simplified format is:
#HCPC(res, nb.clust = 0, min = 3, max = NULL, graph = TRUE)
res: either the result of a factor analysis or a data frame.
nb.clust: an integer specifying the number of clusters. Possible values are: 0, the tree is cut at the level the user clicks on; -1, the tree is automatically cut at the suggested level; any positive integer, the tree is cut with nb.clust clusters.
min, max: the minimum and the maximum number of clusters to be generated, respectively.
graph: if TRUE, graphics are displayed.
Case of continuous variables
We start by computing again the principal component analysis (PCA). The argument ncp = 3 is used in the function PCA() to keep only the first three principal components. Next, the HCPC is applied on the result of the PCA.
# Compute PCA with ncp = 3
res.pca <- PCA(USArrests, ncp = 3, graph = FALSE)
# Compute hierarchical clustering on principal components
res.hcpc <- HCPC(res.pca, graph = FALSE)
To visualize the dendrogram generated by the hierarchical clustering, we’ll use the function fviz_dend() [factoextra package]:
fviz_dend(res.hcpc,
cex = 0.7, # Label size
palette = "jco", # Color palette see ?ggpubr::ggpar
rect = TRUE, rect_fill = TRUE, # Add rectangle around groups
rect_border = "jco", # Rectangle color
labels_track_height = 0.8 # Augment the room for labels
)
It’s possible to visualize individuals on the principal component map and to color individuals according to the cluster they belong to. The function fviz_cluster() [in factoextra] can be used to visualize individuals clusters.
fviz_cluster(res.hcpc,
             repel = TRUE,            # Avoid label overlapping
             show.clust.cent = TRUE,  # Show cluster centers
             palette = "jco",         # Color palette (see ?ggpubr::ggpar)
             main = "Factor map")
You can also draw a three dimensional plot combining the hierarchical clustering and the factorial map using the R base function plot():
# Principal components + tree
plot(res.hcpc, choice = "3D.map")
The function HCPC() returns a list containing:
data.clust: the original data with a supplementary column called class containing the partition.
desc.var: the variables describing the clusters.
desc.ind: the most typical individuals of each cluster.
desc.axes: the axes describing the clusters.
To display the original data with cluster assignments, type this:
head(res.hcpc$data.clust, 10)
In the table above, the last column contains the cluster assignments.
To display the quantitative variables that best describe each cluster, type this:
res.hcpc$desc.var$quanti
$`1`
v.test Mean in category Overall mean sd in category Overall sd p.value
UrbanPop -3.898420 52.07692 65.540 9.691087 14.329285 9.682222e-05
Murder -4.030171 3.60000 7.788 2.269870 4.311735 5.573624e-05
Rape -4.052061 12.17692 21.232 3.130779 9.272248 5.076842e-05
Assault -4.638172 78.53846 170.760 24.700095 82.500075 3.515038e-06
$`2`
v.test Mean in category Overall mean sd in category Overall sd p.value
UrbanPop 2.793185 73.87500 65.540 8.652131 14.329285 0.005219187
Murder -2.374121 5.65625 7.788 1.594902 4.311735 0.017590794
$`3`
v.test Mean in category Overall mean sd in category Overall sd p.value
Murder 4.357187 13.9375 7.788 2.433587 4.311735 1.317449e-05
Assault 2.698255 243.6250 170.760 46.540137 82.500075 6.970399e-03
UrbanPop -2.513667 53.7500 65.540 7.529110 14.329285 1.194833e-02
$`4`
v.test Mean in category Overall mean sd in category Overall sd p.value
Rape 5.352124 33.19231 21.232 6.996643 9.272248 8.692769e-08
Assault 4.356682 257.38462 170.760 41.850537 82.500075 1.320491e-05
UrbanPop 3.028838 76.00000 65.540 10.347798 14.329285 2.454964e-03
Murder 2.913295 10.81538 7.788 2.001863 4.311735 3.576369e-03
From the output above, it can be seen that:
the variables UrbanPop, Murder, Rape and Assault are most significantly associated with cluster 1. For example, the mean value of the Assault variable in cluster 1 is 78.54, which is less than its overall mean (170.76) across all clusters. Therefore, it can be concluded that cluster 1 is characterized by a low rate of Assault compared to all clusters.
the variables UrbanPop and Murder are most significantly associated with the cluster 2.
…and so on …
res.hcpc$desc.axes$quanti
$`1`
v.test Mean in category Overall mean sd in category Overall sd p.value
Dim.1 -5.175764 -1.964502 -5.639933e-16 0.6192556 1.574878 2.269806e-07
$`2`
v.test Mean in category Overall mean sd in category Overall sd p.value
Dim.2 3.585635 0.7428712 -5.369316e-16 0.6137936 0.9948694 0.0003362596
$`3`
v.test Mean in category Overall mean sd in category Overall sd p.value
Dim.1 2.058338 1.0610731 -5.639933e-16 0.5146613 1.5748783 3.955769e-02
Dim.3 2.028887 0.3965588 3.535366e-17 0.3714503 0.5971291 4.246985e-02
Dim.2 -4.536594 -1.4773302 -5.369316e-16 0.5750284 0.9948694 5.717010e-06
$`4`
v.test Mean in category Overall mean sd in category Overall sd p.value
Dim.1 4.986474 1.892656 -5.639933e-16 0.6126035 1.574878 6.149115e-07
Finally, representative individuals of each cluster can be extracted as follow:
res.hcpc$desc.ind$para
Cluster: 1
Idaho South Dakota Maine Iowa New Hampshire
0.3674381 0.4993032 0.5012072 0.5533105 0.5891145
-----------------------------------------------------------------------------
Cluster: 2
Ohio Oklahoma Pennsylvania Kansas Indiana
0.2796100 0.5047549 0.5088363 0.6039091 0.7100820
-----------------------------------------------------------------------------
Cluster: 3
Alabama South Carolina Georgia Tennessee Louisiana
0.3553460 0.5335189 0.6136865 0.8522640 0.8780872
-----------------------------------------------------------------------------
Cluster: 4
Michigan Arizona New Mexico Maryland Texas
0.3246254 0.4532480 0.5176322 0.9013514 0.9239792
For categorical variables, compute CA or MCA and then apply the function HCPC() on the results as described above.
Here, we’ll use the tea data [in FactoMineR] as demo data set: Rows represent the individuals and columns represent categorical variables.
We start by performing an MCA on the individuals. We keep the first 20 axes of the MCA, which retain 87% of the information.
# Loading data
library(FactoMineR)
data(tea)
# Performing MCA
res.mca <- MCA(tea,
ncp = 20, # Number of components kept
quanti.sup = 19, # Quantitative supplementary variables
quali.sup = c(20:36), # Qualitative supplementary variables
graph=FALSE)
Next, we apply hierarchical clustering on the results of the MCA:
res.hcpc <- HCPC (res.mca, graph = FALSE, max = 3)
(Several warnings that the chi-squared approximation may be incorrect are produced at this step.)
# Dendrogram
fviz_dend(res.hcpc, show_labels = FALSE)
# Individuals factor map
fviz_cluster(res.hcpc, geom = "point", main = "Factor map")
As mentioned above, clusters can be described by i) variables and/or categories, ii) principal axes and iii) individuals. In the example below, we display only a subset of the results.
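These three descriptions correspond to the following elements of the HCPC result:
res.hcpc$desc.var    # i) description by variables and/or categories
res.hcpc$desc.axes   # ii) description by principal axes
res.hcpc$desc.ind    # iii) description by individuals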
Description by variables and categories
# Description by variables
res.hcpc$desc.var$test.chi2
p.value df
where 8.465616e-79 4
how 3.144675e-47 4
price 1.862462e-28 10
tearoom 9.624188e-19 2
pub 8.539893e-10 2
friends 6.137618e-08 2
resto 3.537876e-07 2
How 3.616532e-06 6
Tea 1.778330e-03 4
sex 1.789593e-03 2
frequency 1.973274e-03 6
work 3.052988e-03 2
tea.time 3.679599e-03 2
lunch 1.052478e-02 2
dinner 2.234313e-02 2
always 3.600913e-02 2
sugar 3.685785e-02 2
sophisticated 4.077297e-02 2
# Description by variable categories
res.hcpc$desc.var$category
$`1`
Cla/Mod Mod/Cla Global p.value v.test
where=chain store 85.937500 93.750000 64.000000 2.094419e-40 13.307475
how=tea bag 84.117647 81.250000 56.666667 1.478564e-25 10.449142
tearoom=Not.tearoom 70.661157 97.159091 80.666667 1.082077e-18 8.826287
price=p_branded 83.157895 44.886364 31.666667 1.631861e-09 6.030764
pub=Not.pub 67.088608 90.340909 79.000000 1.249296e-08 5.692859
friends=Not.friends 76.923077 45.454545 34.666667 2.177180e-06 4.736242
resto=Not.resto 64.705882 81.250000 73.666667 4.546462e-04 3.506146
price=p_private label 90.476190 10.795455 7.000000 1.343844e-03 3.206448
tea.time=Not.tea time 67.938931 50.568182 43.666667 4.174032e-03 2.864701
How=alone 64.102564 71.022727 65.000000 9.868387e-03 2.580407
work=Not.work 63.380282 76.704545 71.000000 1.036429e-02 2.563432
sugar=sugar 66.206897 54.545455 48.333333 1.066744e-02 2.553408
always=Not.always 63.959391 71.590909 65.666667 1.079912e-02 2.549133
price=p_unknown 91.666667 6.250000 4.000000 1.559798e-02 2.418189
frequency=1 to 2/week 75.000000 18.750000 14.666667 1.649092e-02 2.397866
frequency=1/day 68.421053 36.931818 31.666667 1.958790e-02 2.334149
age_Q=15-24 68.478261 35.795455 30.666667 2.179803e-02 2.293869
price=p_cheap 100.000000 3.977273 2.333333 2.274539e-02 2.277684
lunch=Not.lunch 61.328125 89.204545 85.333333 2.681490e-02 2.214202
SPC=senior 42.857143 8.522727 11.666667 4.813710e-02 -1.976156
lunch=lunch 43.181818 10.795455 14.666667 2.681490e-02 -2.214202
always=always 48.543689 28.409091 34.333333 1.079912e-02 -2.549133
sugar=No.sugar 51.612903 45.454545 51.666667 1.066744e-02 -2.553408
work=work 47.126437 23.295455 29.000000 1.036429e-02 -2.563432
tea.time=tea time 51.479290 49.431818 56.333333 4.174032e-03 -2.864701
How=lemon 30.303030 5.681818 11.000000 5.943089e-04 -3.434198
resto=resto 41.772152 18.750000 26.333333 4.546462e-04 -3.506146
How=other 0.000000 0.000000 3.000000 2.952904e-04 -3.619397
price=p_variable 44.642857 28.409091 37.333333 1.595638e-04 -3.775692
frequency=+2/day 45.669291 32.954545 42.333333 9.872288e-05 -3.893709
friends=friends 48.979592 54.545455 65.333333 2.177180e-06 -4.736242
how=unpackaged 19.444444 3.977273 12.000000 4.328211e-07 -5.053925
pub=pub 26.984127 9.659091 21.000000 1.249296e-08 -5.692859
where=tea shop 6.666667 1.136364 10.000000 4.770573e-10 -6.226471
price=p_upscale 18.867925 5.681818 17.666667 9.472539e-11 -6.475138
how=tea bag+unpackaged 27.659574 14.772727 31.333333 1.927326e-13 -7.353743
tearoom=tearoom 8.620690 2.840909 19.333333 1.082077e-18 -8.826287
where=chain store+tea shop 11.538462 5.113636 26.000000 1.133459e-23 -10.029275
$`2`
Cla/Mod Mod/Cla Global p.value v.test
where=tea shop 90.000000 84.375 10.00000 3.703402e-30 11.410559
how=unpackaged 66.666667 75.000 12.00000 5.346850e-20 9.156781
price=p_upscale 49.056604 81.250 17.66667 2.392655e-17 8.472945
Tea=green 27.272727 28.125 11.00000 4.436713e-03 2.845318
sophisticated=sophisticated 13.488372 90.625 71.66667 8.080918e-03 2.648670
sex=M 16.393443 62.500 40.66667 9.511848e-03 2.593088
resto=Not.resto 13.122172 90.625 73.66667 1.587879e-02 2.411690
dinner=dinner 28.571429 18.750 7.00000 1.874042e-02 2.350655
escape.exoticism=Not.escape-exoticism 14.556962 71.875 52.66667 2.177458e-02 2.294277
how=tea bag+unpackaged 5.319149 15.625 31.33333 3.876799e-02 -2.066641
escape.exoticism=escape-exoticism 6.338028 28.125 47.33333 2.177458e-02 -2.294277
dinner=Not.dinner 9.318996 81.250 93.00000 1.874042e-02 -2.350655
resto=resto 3.797468 9.375 26.33333 1.587879e-02 -2.411690
Tea=Earl Grey 7.253886 43.750 64.33333 1.314753e-02 -2.479748
sex=F 6.741573 37.500 59.33333 9.511848e-03 -2.593088
sophisticated=Not.sophisticated 3.529412 9.375 28.33333 8.080918e-03 -2.648670
where=chain store+tea shop 2.564103 6.250 26.00000 3.794134e-03 -2.894789
price=p_variable 3.571429 12.500 37.33333 1.349384e-03 -3.205264
age_Q=15-24 2.173913 6.250 30.66667 6.100227e-04 -3.427119
price=p_branded 2.105263 6.250 31.66667 4.024289e-04 -3.538486
how=tea bag 1.764706 9.375 56.66667 5.537403e-09 -5.830161
where=chain store 1.562500 9.375 64.00000 1.664577e-11 -6.732775
$`3`
Cla/Mod Mod/Cla Global p.value v.test
where=chain store+tea shop 85.897436 72.826087 26.00000 5.730651e-34 12.150084
how=tea bag+unpackaged 67.021277 68.478261 31.33333 1.382641e-19 9.053653
tearoom=tearoom 77.586207 48.913043 19.33333 1.252051e-16 8.278053
pub=pub 63.492063 43.478261 21.00000 1.126679e-09 6.090345
friends=friends 41.836735 89.130435 65.33333 1.429181e-09 6.052158
price=p_variable 51.785714 63.043478 37.33333 1.572243e-09 6.036775
resto=resto 54.430380 46.739130 26.33333 2.406386e-07 5.164845
How=other 100.000000 9.782609 3.00000 1.807938e-05 4.287379
frequency=+2/day 41.732283 57.608696 42.33333 4.237330e-04 3.524844
tea.time=tea time 38.461538 70.652174 56.33333 8.453564e-04 3.337500
work=work 44.827586 42.391304 29.00000 9.079377e-04 3.317602
sex=F 37.078652 71.739130 59.33333 3.494245e-03 2.920541
lunch=lunch 50.000000 23.913043 14.66667 3.917102e-03 2.884762
How=lemon 51.515152 18.478261 11.00000 8.747530e-03 2.621767
sugar=No.sugar 36.129032 60.869565 51.66667 3.484061e-02 2.110206
home=home 31.615120 100.000000 97.00000 3.506563e-02 2.107600
home=Not.home 0.000000 0.000000 3.00000 3.506563e-02 -2.107600
sugar=sugar 24.827586 39.130435 48.33333 3.484061e-02 -2.110206
price=p_private label 9.523810 2.173913 7.00000 2.370629e-02 -2.261856
how=unpackaged 13.888889 5.434783 12.00000 1.645107e-02 -2.398752
How=alone 25.128205 53.260870 65.00000 5.300881e-03 -2.788157
lunch=Not.lunch 27.343750 76.086957 85.33333 3.917102e-03 -2.884762
sex=M 21.311475 28.260870 40.66667 3.494245e-03 -2.920541
Tea=green 9.090909 3.260870 11.00000 2.545816e-03 -3.017842
frequency=1 to 2/week 11.363636 5.434783 14.66667 1.604219e-03 -3.155139
work=Not.work 24.882629 57.608696 71.00000 9.079377e-04 -3.317602
tea.time=Not.tea time 20.610687 29.347826 43.66667 8.453564e-04 -3.337500
where=tea shop 3.333333 1.086957 10.00000 1.466234e-04 -3.796720
price=p_branded 14.736842 15.217391 31.66667 2.746948e-05 -4.193490
resto=Not.resto 22.171946 53.260870 73.66667 2.406386e-07 -5.164845
friends=Not.friends 9.615385 10.869565 34.66667 1.429181e-09 -6.052158
pub=Not.pub 21.940928 56.521739 79.00000 1.126679e-09 -6.090345
how=tea bag 14.117647 26.086957 56.66667 1.082059e-12 -7.119644
tearoom=Not.tearoom 19.421488 51.086957 80.66667 1.252051e-16 -8.278053
where=chain store 12.500000 26.086957 64.00000 1.711522e-19 -9.030332
Description by principal components
res.hcpc$desc.axes
$quanti.var
Eta2 P-value
Dim.2 0.66509105 2.828937e-71
Dim.1 0.63497903 1.009707e-65
Dim.4 0.11231020 2.073924e-08
Dim.14 0.03141943 8.732913e-03
Dim.6 0.02358138 2.890373e-02
$quanti
$quanti$`1`
v.test Mean in category Overall mean sd in category Overall sd p.value
Dim.6 2.647552 0.03433626 -2.721637e-17 0.2655618 0.2671712 8.107689e-03
Dim.2 -7.796641 -0.13194656 -6.207419e-17 0.1813156 0.3486355 6.357699e-15
Dim.1 -12.409741 -0.23196088 -1.678981e-16 0.2143767 0.3850642 2.314001e-35
$quanti$`2`
v.test Mean in category Overall mean sd in category Overall sd p.value
Dim.2 13.918285 0.81210870 -6.207419e-17 0.2340345 0.3486355 4.905356e-44
Dim.4 4.350620 0.20342610 7.249699e-19 0.3700048 0.2793822 1.357531e-05
Dim.14 2.909073 0.10749165 4.512196e-17 0.2161509 0.2207818 3.625025e-03
Dim.13 2.341566 0.08930402 1.266782e-17 0.1606616 0.2278809 1.920305e-02
Dim.3 2.208179 0.11087544 -4.161891e-17 0.2449710 0.3000159 2.723180e-02
Dim.11 -2.234447 -0.08934293 6.504924e-17 0.2066708 0.2389094 2.545367e-02
$quanti$`3`
v.test Mean in category Overall mean sd in category Overall sd p.value
Dim.1 13.485906 0.45155993 -1.678981e-16 0.2516544 0.3850642 1.893256e-41
Dim.6 -2.221728 -0.05161581 -2.721637e-17 0.2488566 0.2671712 2.630166e-02
Dim.4 -4.725270 -0.11479621 7.249699e-19 0.2924881 0.2793822 2.298093e-06
attr(,"class")
[1] "catdes" "list "
Description by individuals
res.hcpc$desc.ind$para
Cluster: 1
285 152 166 143 71
0.5884476 0.6242123 0.6242123 0.6244176 0.6478185
-----------------------------------------------------------------------------
Cluster: 2
31 95 53 182 202
0.6620553 0.7442013 0.7610437 0.7948663 0.8154826
-----------------------------------------------------------------------------
Cluster: 3
172 33 233 18 67
0.7380497 0.7407711 0.7503006 0.7572188 0.7701598
Summary
We described how to compute hierarchical clustering on principal components (HCPC). This approach is useful in several situations, including the following:
When you have a large data set containing continuous variables, a principal component analysis can be used to reduce the dimension of the data before the hierarchical clustering analysis.
When you have a data set containing categorical variables, a (multiple) correspondence analysis can be used to transform the categorical variables into a few continuous principal components, which can then be used as the input for the cluster analysis.
We used the FactoMineR package to compute the HCPC and the factoextra R package for elegant, ggplot2-based data visualization.
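Both workflows can be recapped in a few lines. This is only a sketch: it assumes the USArrests and tea data sets analyzed above, the ncp value for the PCA is illustrative, and res.hcpc.cont / res.hcpc.cat are just placeholder names.
# Continuous variables: PCA for dimension reduction, then HCPC
res.pca       <- PCA(USArrests, ncp = 3, graph = FALSE)   # PCA standardizes by default
res.hcpc.cont <- HCPC(res.pca, graph = FALSE)
# Categorical variables: MCA first, then HCPC (tea example above)
res.mca       <- MCA(tea, ncp = 20, quanti.sup = 19, quali.sup = c(20:36), graph = FALSE)
res.hcpc.cat  <- HCPC(res.mca, graph = FALSE, max = 3)
# Visualize with factoextra
fviz_dend(res.hcpc.cont, show_labels = TRUE)
fviz_cluster(res.hcpc.cat, geom = "point", main = "Factor map")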