if(!require(devtools)) install.packages("devtools")
## Loading required package: devtools
devtools::install_github("kassambara/factoextra")
## Skipping install of 'factoextra' from a github remote, the SHA1 (718b68fe) has not changed since last install.
## Use `force = TRUE` to force installation
pkgs <- c("cluster", "NbClust")
install.packages(pkgs)
## Installing packages into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.4'
## (as 'lib' is unspecified)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(NbClust)
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Remove species column (5) and scale the data
iris.scaled <- scale(iris[, -5])
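# Optional sanity check (a minimal sketch reusing iris.scaled): after scaling,
# each column should have mean ~0 and standard deviation 1
round(colMeans(iris.scaled), 6)
apply(iris.scaled, 2, sd)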
# K-means clustering
set.seed(123)
km.res <- kmeans(iris.scaled, 3, nstart = 25)
# k-means group number of each observation
km.res$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 3 3
## [71] 2 3 3 3 3 2 2 2 3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 2 2 2 2 2 2 3 3 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3
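# One way to read the cluster vector (a quick check only; the labels are not
# used in the clustering) is to cross-tabulate it against the known species:
table(iris$Species, km.res$cluster)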
# Visualize k-means clusters
fviz_cluster(km.res, data = iris.scaled, geom = "point",
             stand = FALSE, ellipse.type = "norm")

# PAM clustering
library("cluster")
pam.res <- pam(iris.scaled, 3)
pam.res$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 3 3
## [71] 3 3 3 3 3 2 2 2 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 2 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 3 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3
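# PAM also stores the medoids (the observations used as cluster centres);
# cross-tabulating the two partitions shows how closely PAM and k-means agree:
pam.res$medoids
table(kmeans = km.res$cluster, pam = pam.res$cluster)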
# Visualize PAM clusters
fviz_cluster(pam.res, stand = FALSE, geom = "point",
             ellipse.type = "norm")

# Compute the pairwise distance matrix
dist.res <- dist(iris.scaled, method = "euclidean")
# Hierarchical clustering results
hc <- hclust(dist.res, method = "complete")
# Visualization of hclust
plot(hc, labels = FALSE, hang = -1)
# Add rectangle around 3 groups
rect.hclust(hc, k = 3, border = 2:4)

# Cut into 3 groups
hc.cut <- cutree(hc, k = 3)
head(hc.cut, 20)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
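# A sketch of an alternative dendrogram view with factoextra (assuming a
# version that provides fviz_dend()), plus a comparison of the hierarchical
# partition with the k-means one:
fviz_dend(hc, k = 3, rect = TRUE, show_labels = FALSE)
table(hclust = hc.cut, kmeans = km.res$cluster)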
# Elbow method
set.seed(123)
# Compute and plot wss for k = 1 to k = 15
k.max <- 15 # Maximal number of clusters
data <- iris.scaled
wss <- sapply(1:k.max,
              function(k){kmeans(data, k, nstart = 10)$tot.withinss})
plot(1:k.max, wss,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-cluster sum of squares")
abline(v = 3, lty =2)

fviz_nbclust(iris.scaled, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

fviz_nbclust(iris.scaled, pam, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

fviz_nbclust(iris.scaled, hcut, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

library(cluster)
k.max <- 15
data <- iris.scaled
sil <- rep(0, k.max)
# Compute the average silhouette width for
# k = 2 to k = 15
for(i in 2:k.max){
  km <- kmeans(data, centers = i, nstart = 25)  # local name, so km.res above is not overwritten
  ss <- silhouette(km$cluster, dist(data))
  sil[i] <- mean(ss[, 3])
}
# Plot the average silhouette width
plot(1:k.max, sil, type = "b", pch = 19,
     frame = FALSE, xlab = "Number of clusters k",
     ylab = "Average silhouette width")
abline(v = which.max(sil), lty = 2)
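# A minimal sketch (reusing data from above) of the per-observation silhouette
# widths behind the average at k = 3, via the plot method from cluster:
km3 <- kmeans(data, centers = 3, nstart = 25)
sil3 <- silhouette(km3$cluster, dist(data))
plot(sil3, main = "Silhouette plot, k-means with k = 3", border = NA)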

require(cluster)
fviz_nbclust(iris.scaled, kmeans, method = "silhouette")

require(cluster)
fviz_nbclust(iris.scaled, pam, method = "silhouette")

require(cluster)
fviz_nbclust(iris.scaled, hcut, method = "silhouette",
             hc_method = "complete")

# Gap statistic method
# Compute gap statistic
library(cluster)
set.seed(123)
gap_stat <- clusGap(iris.scaled, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)
# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = iris.scaled, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'firstmax'): 3
## logW E.logW gap SE.sim
## [1,] 4.534565 4.754595 0.2200304 0.02504585
## [2,] 4.021316 4.489687 0.4683711 0.02742112
## [3,] 3.806577 4.295715 0.4891381 0.02384746
## [4,] 3.699263 4.143675 0.4444115 0.02093871
## [5,] 3.589284 4.052262 0.4629781 0.02036366
## [6,] 3.519726 3.972254 0.4525278 0.02049566
## [7,] 3.448288 3.905945 0.4576568 0.02106987
## [8,] 3.398210 3.850807 0.4525967 0.01969193
## [9,] 3.334279 3.802315 0.4680368 0.01905974
## [10,] 3.250246 3.759661 0.5094149 0.01928183
# Base plot of gap statistic
plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
abline(v = 3, lty = 2)

fviz_gap_stat(gap_stat)

# Print
print(gap_stat, method = "Tibs2001SEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = iris.scaled, FUNcluster = kmeans, K.max = 10, B = 50, nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
## --> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 2
## logW E.logW gap SE.sim
## [1,] 4.534565 4.754595 0.2200304 0.02504585
## [2,] 4.021316 4.489687 0.4683711 0.02742112
## [3,] 3.806577 4.295715 0.4891381 0.02384746
## [4,] 3.699263 4.143675 0.4444115 0.02093871
## [5,] 3.589284 4.052262 0.4629781 0.02036366
## [6,] 3.519726 3.972254 0.4525278 0.02049566
## [7,] 3.448288 3.905945 0.4576568 0.02106987
## [8,] 3.398210 3.850807 0.4525967 0.01969193
## [9,] 3.334279 3.802315 0.4680368 0.01905974
## [10,] 3.250246 3.759661 0.5094149 0.01928183
# Plot
fviz_gap_stat(gap_stat,
              maxSE = list(method = "Tibs2001SEmax"))

# Relax the gap test to be within two standard errors
fviz_gap_stat(gap_stat,
              maxSE = list(method = "Tibs2001SEmax", SE.factor = 2))
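# These selection rules are implemented by maxSE() in the cluster package,
# which works directly on the gap values and their simulation standard errors
# stored in gap_stat$Tab; a minimal sketch:
maxSE(gap_stat$Tab[, "gap"], gap_stat$Tab[, "SE.sim"],
      method = "Tibs2001SEmax", SE.factor = 1)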

# Compute gap statistic
set.seed(123)
gap_stat <- clusGap(iris.scaled, FUN = pam, K.max = 10, B = 50)
# Plot gap statistic
fviz_gap_stat(gap_stat)

# Compute gap statistic
set.seed(123)
gap_stat <- clusGap(iris.scaled, FUN = hcut, K.max = 10, B = 50)
# Plot gap statistic
fviz_gap_stat(gap_stat)
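# hcut() from factoextra runs the hierarchical clustering and cuts the tree in
# one step; a sketch (with its default Euclidean metric) comparing its k = 3
# partition with the manual complete-linkage partition computed earlier:
hc.res <- hcut(iris.scaled, k = 3, hc_method = "complete")
table(hcut = hc.res$cluster, manual = hc.cut)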

### NbClust with the gap index
library("NbClust")
set.seed(123)
res.nb <- NbClust(iris.scaled, distance = "euclidean",
                  min.nc = 2, max.nc = 10,
                  method = "complete", index = "gap")
res.nb # print the results
## $All.index
## 2 3 4 5 6 7 8 9 10
## -0.2899 -0.2303 -0.6915 -0.8606 -1.0506 -1.3223 -1.3303 -1.4759 -1.5551
##
## $All.CriticalValues
## 2 3 4 5 6 7 8 9 10
## -0.0539 0.4694 0.1787 0.2009 0.2848 0.0230 0.1631 0.0988 0.1708
##
## $Best.nc
## Number_clusters Value_Index
## 3.0000 -0.2303
##
## $Best.partition
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2
## [71] 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
# The elements returned by NbClust() are accessible using the R code below:
# All gap statistic values
res.nb$All.index
## 2 3 4 5 6 7 8 9 10
## -0.2899 -0.2303 -0.6915 -0.8606 -1.0506 -1.3223 -1.3303 -1.4759 -1.5551
# Best number of clusters
res.nb$Best.nc
## Number_clusters Value_Index
## 3.0000 -0.2303
# Best partition
res.nb$Best.partition
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2
## [71] 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
### NbClust with all indices
nb <- NbClust(iris.scaled, distance = "euclidean", min.nc = 2,
              max.nc = 10, method = "complete", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 2 proposed 2 as the best number of clusters
## * 18 proposed 3 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Print the result
nb
## $All.index
## KL CH Hartigan CCC Scott Marriot TrCovW TraceW
## 2 1.1854 151.6332 137.0327 -1.7090 206.2459 4042610.9 11446.7289 294.3866
## 3 9.8471 213.0817 33.2185 3.3712 444.9163 1852776.5 1065.2914 152.8569
## 4 1.3651 183.9682 24.9638 2.0682 543.6783 1705121.4 900.8342 124.6818
## 5 2.2951 166.6596 10.6281 0.4146 592.5766 1923092.4 562.5608 106.4760
## 6 2.7125 144.2243 12.7058 -0.7426 633.8818 2102673.6 558.6742 99.2046
## 7 0.0462 131.9862 39.5776 -1.2574 662.7020 2361686.4 432.8320 91.1610
## 8 2.7803 149.0463 19.9304 1.3079 781.5413 1396790.2 430.6392 71.3999
## 9 0.5276 150.1464 32.3094 2.0036 848.5792 1130685.4 285.0416 62.6120
## 10 2.6428 166.4470 16.6788 4.1139 922.7090 851586.6 199.1249 50.9395
## Friedman Rubin Cindex DB Silhouette Duda Pseudot2 Beale
## 2 38.3959 2.0245 0.2978 0.9735 0.4408 0.2934 170.9649 5.7326
## 3 53.5029 3.8991 0.3247 0.8581 0.4496 0.7005 32.0629 1.0185
## 4 63.1994 4.7802 0.3085 0.9581 0.4106 0.5462 39.0434 1.9637
## 5 64.5874 5.5975 0.3486 0.9262 0.3521 0.6102 14.0540 1.4752
## 6 71.9579 6.0078 0.3922 0.9569 0.3107 0.3089 20.1316 4.8602
## 7 72.7720 6.5379 0.4288 0.8597 0.3076 0.6359 36.6526 1.3613
## 8 80.5472 8.3474 0.3937 0.9451 0.3303 0.6792 19.8416 1.1140
## 9 85.6861 9.5189 0.3828 0.9766 0.3421 0.4147 56.4663 3.3249
## 10 89.4399 11.7002 0.3895 0.9858 0.3266 0.5931 18.5213 1.5969
## Ratkowsky Ball Ptbiserial Frey McClain Dunn Hubert SDindex
## 2 0.4606 147.1933 0.5970 0.0616 0.5265 0.0412 0.0023 2.0088
## 3 0.4958 50.9523 0.7169 0.7401 0.6296 0.0580 0.0028 1.5641
## 4 0.4431 31.1704 0.7030 0.8948 0.7655 0.0623 0.0031 1.8915
## 5 0.4047 21.2952 0.6892 1.3879 0.8368 0.0740 0.0033 1.5738
## 6 0.3722 16.5341 0.6817 0.4503 0.8664 0.0842 0.0033 1.8071
## 7 0.3476 13.0230 0.6815 1.1435 0.8708 0.0927 0.0034 1.6904
## 8 0.3314 8.9250 0.5977 1.0644 1.2698 0.0990 0.0035 1.9843
## 9 0.3151 6.9569 0.5491 0.8307 1.5619 0.1044 0.0036 2.4947
## 10 0.3023 5.0939 0.4938 0.5605 1.9793 0.1185 0.0038 2.3764
## Dindex SDbw
## 2 1.2595 1.1052
## 3 0.8925 0.3975
## 4 0.8236 0.4585
## 5 0.7703 0.3046
## 6 0.7485 0.2503
## 7 0.7183 0.1602
## 8 0.6356 0.1434
## 9 0.5981 0.1335
## 10 0.5367 0.1031
##
## $All.CriticalValues
## CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2 0.6044 46.4793 0.0002
## 3 0.6106 47.8328 0.3979
## 4 0.5522 38.1140 0.1017
## 5 0.4284 29.3527 0.2166
## 6 0.2316 29.8538 0.0031
## 7 0.5921 44.0831 0.2478
## 8 0.5362 36.3229 0.3517
## 9 0.5291 35.6039 0.0120
## 10 0.4656 30.9841 0.1804
##
## $Best.nc
## KL CH Hartigan CCC Scott Marriot TrCovW
## Number_clusters 3.0000 3.0000 3.0000 10.0000 3.0000 3 3.00
## Value_Index 9.8471 213.0817 103.8142 4.1139 238.6703 2042179 10381.44
## TraceW Friedman Rubin Cindex DB Silhouette Duda
## Number_clusters 3.0000 3.000 3.0000 2.0000 3.0000 3.0000 3.0000
## Value_Index 113.3546 15.107 -0.9934 0.2978 0.8581 0.4496 0.7005
## PseudoT2 Beale Ratkowsky Ball PtBiserial Frey McClain
## Number_clusters 3.0000 3.0000 3.0000 3.000 3.0000 1 2.0000
## Value_Index 32.0629 1.0185 0.4958 96.241 0.7169 NA 0.5265
## Dunn Hubert SDindex Dindex SDbw
## Number_clusters 10.0000 0 3.0000 0 10.0000
## Value_Index 0.1185 0 1.5641 0 0.1031
##
## $Best.partition
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2
## [71] 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
# It's possible to visualize the result using the function fviz_nbclust() [in factoextra], as follows:
fviz_nbclust(nb) + theme_minimal()
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 2 proposed 2 as the best number of clusters
## * 18 proposed 3 as the best number of clusters
## * 3 proposed 10 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 3 .
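# The vote behind the majority rule can also be tallied by hand from the
# Best.nc matrix returned by NbClust() (its first row holds the number of
# clusters proposed by each index):
table(nb$Best.nc["Number_clusters", ])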
