if(!require(devtools)) install.packages("devtools")
## Loading required package: devtools
devtools::install_github("kassambara/factoextra")
## Skipping install of 'factoextra' from a github remote, the SHA1 (718b68fe) has not changed since last install.
##   Use `force = TRUE` to force installation
pkgs <- c("cluster", "NbClust")
install.packages(pkgs)
## Installing packages into '/home/rstudio-user/R/x86_64-pc-linux-gnu-library/3.4'
## (as 'lib' is unspecified)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(NbClust)

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Remove the Species column (5) and scale the data
iris.scaled <- scale(iris[, -5])
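
# Quick sanity check: after scale(), each variable should have
# mean 0 and standard deviation 1
round(colMeans(iris.scaled), 2)
apply(iris.scaled, 2, sd)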

# K-means clustering
set.seed(123)
km.res <- kmeans(iris.scaled, 3, nstart = 25)
# Cluster number assigned to each observation
km.res$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 3 3
##  [71] 2 3 3 3 3 2 2 2 3 3 3 3 3 3 3 2 2 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 2 2 2 2 2 2 3 3 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3
# Visualize k-means clusters
fviz_cluster(km.res, data = iris.scaled, geom = "point",
             stand = FALSE, ellipse.type = "norm")
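
# Since iris ships with known species labels, a quick (informal) check
# of the k-means partition is to cross-tabulate clusters against species:
table(iris$Species, km.res$cluster)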

# PAM clustering
library("cluster")
pam.res <- pam(iris.scaled, 3)
pam.res$cluster
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 3 3 3 2 3 3 3 3 3 3 3 3 2 3 3 3 3
##  [71] 3 3 3 3 3 2 2 2 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2
## [106] 2 3 2 2 2 2 2 2 3 2 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 3 2 2 2 3 2
## [141] 2 2 3 2 2 2 3 2 2 3
# Visualize PAM clusters
fviz_cluster(pam.res, stand = FALSE, geom = "point",
             ellipse.type = "norm")

# Compute the pairwise distance matrix
dist.res <- dist(iris.scaled, method = "euclidean")
# Hierarchical clustering results
hc <- hclust(dist.res, method = "complete")
# Visualization of hclust
plot(hc, labels = FALSE, hang = -1)
# Add rectangle around 3 groups
rect.hclust(hc, k = 3, border = 2:4)

# Cut into 3 groups
hc.cut <- cutree(hc, k = 3)
head(hc.cut, 20)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
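
# The cluster labels are arbitrary, so to compare the hierarchical cut
# with the earlier k-means partition, cross-tabulate the two (only the
# pattern of the table matters):
table(hc.cut, km.res$cluster)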
# Elbow method
set.seed(123)
# Compute and plot wss for k = 1 to k = 15
k.max <- 15 # Maximum number of clusters
data <- iris.scaled
wss <- sapply(1:k.max, 
              function(k){kmeans(data, k, nstart=10 )$tot.withinss})

plot(1:k.max, wss,
     type="b", pch = 19, frame = FALSE, 
     xlab="Number of clusters K",
     ylab="Total within-clusters sum of squares")
abline(v = 3, lty =2)

fviz_nbclust(iris.scaled, kmeans, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

fviz_nbclust(iris.scaled, pam, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

fviz_nbclust(iris.scaled, hcut, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2)

# Average silhouette method
k.max <- 15
data <- iris.scaled
sil <- rep(0, k.max)

# Compute the average silhouette width for 
# k = 2 to k = 15
for(i in 2:k.max){
  km.res <- kmeans(data, centers = i, nstart = 25)
  ss <- silhouette(km.res$cluster, dist(data))
  sil[i] <- mean(ss[, 3])
}

# Plot the average silhouette width
plot(1:k.max, sil, type = "b", pch = 19,
     frame = FALSE, xlab = "Number of clusters k",
     ylab = "Average silhouette width")
abline(v = which.max(sil), lty = 2)
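
# To report the selected k and its silhouette value explicitly
# (a small addition to the loop above):
best.k <- which.max(sil)
cat("Best k:", best.k, "- average silhouette width:", round(sil[best.k], 3), "\n")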

fviz_nbclust(iris.scaled, kmeans, method = "silhouette")

fviz_nbclust(iris.scaled, pam, method = "silhouette")

fviz_nbclust(iris.scaled, hcut, method = "silhouette",
             hc_method = "complete")

# Gap statistic method
# Compute the gap statistic
set.seed(123)
gap_stat <- clusGap(iris.scaled, FUN = kmeans, nstart = 25,
                    K.max = 10, B = 50)

# Print the result
print(gap_stat, method = "firstmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = iris.scaled, FUNcluster = kmeans, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'firstmax'): 3
##           logW   E.logW       gap     SE.sim
##  [1,] 4.534565 4.754595 0.2200304 0.02504585
##  [2,] 4.021316 4.489687 0.4683711 0.02742112
##  [3,] 3.806577 4.295715 0.4891381 0.02384746
##  [4,] 3.699263 4.143675 0.4444115 0.02093871
##  [5,] 3.589284 4.052262 0.4629781 0.02036366
##  [6,] 3.519726 3.972254 0.4525278 0.02049566
##  [7,] 3.448288 3.905945 0.4576568 0.02106987
##  [8,] 3.398210 3.850807 0.4525967 0.01969193
##  [9,] 3.334279 3.802315 0.4680368 0.01905974
## [10,] 3.250246 3.759661 0.5094149 0.01928183
# Base plot of gap statistic
plot(gap_stat, frame = FALSE, xlab = "Number of clusters k")
abline(v = 3, lty = 2)

fviz_gap_stat(gap_stat)

# Print the result with the Tibs2001SEmax rule
print(gap_stat, method = "Tibs2001SEmax")
## Clustering Gap statistic ["clusGap"] from call:
## clusGap(x = iris.scaled, FUNcluster = kmeans, K.max = 10, B = 50,     nstart = 25)
## B=50 simulated reference sets, k = 1..10; spaceH0="scaledPCA"
##  --> Number of clusters (method 'Tibs2001SEmax', SE.factor=1): 2
##           logW   E.logW       gap     SE.sim
##  [1,] 4.534565 4.754595 0.2200304 0.02504585
##  [2,] 4.021316 4.489687 0.4683711 0.02742112
##  [3,] 3.806577 4.295715 0.4891381 0.02384746
##  [4,] 3.699263 4.143675 0.4444115 0.02093871
##  [5,] 3.589284 4.052262 0.4629781 0.02036366
##  [6,] 3.519726 3.972254 0.4525278 0.02049566
##  [7,] 3.448288 3.905945 0.4576568 0.02106987
##  [8,] 3.398210 3.850807 0.4525967 0.01969193
##  [9,] 3.334279 3.802315 0.4680368 0.01905974
## [10,] 3.250246 3.759661 0.5094149 0.01928183
# Plot with the Tibs2001SEmax rule
fviz_gap_stat(gap_stat, 
              maxSE = list(method = "Tibs2001SEmax"))

# Relax the gap test to within two standard errors
fviz_gap_stat(gap_stat, 
              maxSE = list(method = "Tibs2001SEmax", SE.factor = 2))

# Compute gap statistic
set.seed(123)
gap_stat <- clusGap(iris.scaled, FUN = pam, K.max = 10, B = 50)
# Plot gap statistic
fviz_gap_stat(gap_stat)

# Compute gap statistic
set.seed(123)
gap_stat <- clusGap(iris.scaled, FUN = hcut, K.max = 10, B = 50)
# Plot gap statistic
fviz_gap_stat(gap_stat)
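
# clusGap() accepts any clustering function that returns a list with a
# "cluster" component, so a hand-rolled hierarchical wrapper also works.
# A minimal sketch (hclusCut and gap_stat_hc are names chosen here for
# illustration):
hclusCut <- function(x, k, ...) {
  # complete-linkage tree on Euclidean distances, cut into k groups
  list(cluster = cutree(hclust(dist(x), method = "complete"), k = k))
}
set.seed(123)
gap_stat_hc <- clusGap(iris.scaled, FUN = hclusCut, K.max = 10, B = 50)
fviz_gap_stat(gap_stat_hc)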

# NbClust: choosing the number of clusters with many validity indices
set.seed(123)
res.nb <- NbClust(iris.scaled, distance = "euclidean",
                  min.nc = 2, max.nc = 10,
                  method = "complete", index = "gap")
res.nb # print the results
## $All.index
##       2       3       4       5       6       7       8       9      10 
## -0.2899 -0.2303 -0.6915 -0.8606 -1.0506 -1.3223 -1.3303 -1.4759 -1.5551 
## 
## $All.CriticalValues
##       2       3       4       5       6       7       8       9      10 
## -0.0539  0.4694  0.1787  0.2009  0.2848  0.0230  0.1631  0.0988  0.1708 
## 
## $Best.nc
## Number_clusters     Value_Index 
##          3.0000         -0.2303 
## 
## $Best.partition
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2
##  [71] 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
# The elements returned by NbClust() are accessible using the R code below:

# All gap statistic values
res.nb$All.index
##       2       3       4       5       6       7       8       9      10 
## -0.2899 -0.2303 -0.6915 -0.8606 -1.0506 -1.3223 -1.3303 -1.4759 -1.5551
# Best number of clusters
res.nb$Best.nc
## Number_clusters     Value_Index 
##          3.0000         -0.2303
# Best partition
res.nb$Best.partition
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2
##  [71] 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
# Run NbClust over many indices at once and apply the majority rule

nb <- NbClust(iris.scaled, distance = "euclidean", min.nc = 2,
              max.nc = 10, method = "complete", index = "all")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 2 proposed 2 as the best number of clusters 
## * 18 proposed 3 as the best number of clusters 
## * 3 proposed 10 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
# Print the result
nb
## $All.index
##        KL       CH Hartigan     CCC    Scott   Marriot     TrCovW   TraceW
## 2  1.1854 151.6332 137.0327 -1.7090 206.2459 4042610.9 11446.7289 294.3866
## 3  9.8471 213.0817  33.2185  3.3712 444.9163 1852776.5  1065.2914 152.8569
## 4  1.3651 183.9682  24.9638  2.0682 543.6783 1705121.4   900.8342 124.6818
## 5  2.2951 166.6596  10.6281  0.4146 592.5766 1923092.4   562.5608 106.4760
## 6  2.7125 144.2243  12.7058 -0.7426 633.8818 2102673.6   558.6742  99.2046
## 7  0.0462 131.9862  39.5776 -1.2574 662.7020 2361686.4   432.8320  91.1610
## 8  2.7803 149.0463  19.9304  1.3079 781.5413 1396790.2   430.6392  71.3999
## 9  0.5276 150.1464  32.3094  2.0036 848.5792 1130685.4   285.0416  62.6120
## 10 2.6428 166.4470  16.6788  4.1139 922.7090  851586.6   199.1249  50.9395
##    Friedman   Rubin Cindex     DB Silhouette   Duda Pseudot2  Beale
## 2   38.3959  2.0245 0.2978 0.9735     0.4408 0.2934 170.9649 5.7326
## 3   53.5029  3.8991 0.3247 0.8581     0.4496 0.7005  32.0629 1.0185
## 4   63.1994  4.7802 0.3085 0.9581     0.4106 0.5462  39.0434 1.9637
## 5   64.5874  5.5975 0.3486 0.9262     0.3521 0.6102  14.0540 1.4752
## 6   71.9579  6.0078 0.3922 0.9569     0.3107 0.3089  20.1316 4.8602
## 7   72.7720  6.5379 0.4288 0.8597     0.3076 0.6359  36.6526 1.3613
## 8   80.5472  8.3474 0.3937 0.9451     0.3303 0.6792  19.8416 1.1140
## 9   85.6861  9.5189 0.3828 0.9766     0.3421 0.4147  56.4663 3.3249
## 10  89.4399 11.7002 0.3895 0.9858     0.3266 0.5931  18.5213 1.5969
##    Ratkowsky     Ball Ptbiserial   Frey McClain   Dunn Hubert SDindex
## 2     0.4606 147.1933     0.5970 0.0616  0.5265 0.0412 0.0023  2.0088
## 3     0.4958  50.9523     0.7169 0.7401  0.6296 0.0580 0.0028  1.5641
## 4     0.4431  31.1704     0.7030 0.8948  0.7655 0.0623 0.0031  1.8915
## 5     0.4047  21.2952     0.6892 1.3879  0.8368 0.0740 0.0033  1.5738
## 6     0.3722  16.5341     0.6817 0.4503  0.8664 0.0842 0.0033  1.8071
## 7     0.3476  13.0230     0.6815 1.1435  0.8708 0.0927 0.0034  1.6904
## 8     0.3314   8.9250     0.5977 1.0644  1.2698 0.0990 0.0035  1.9843
## 9     0.3151   6.9569     0.5491 0.8307  1.5619 0.1044 0.0036  2.4947
## 10    0.3023   5.0939     0.4938 0.5605  1.9793 0.1185 0.0038  2.3764
##    Dindex   SDbw
## 2  1.2595 1.1052
## 3  0.8925 0.3975
## 4  0.8236 0.4585
## 5  0.7703 0.3046
## 6  0.7485 0.2503
## 7  0.7183 0.1602
## 8  0.6356 0.1434
## 9  0.5981 0.1335
## 10 0.5367 0.1031
## 
## $All.CriticalValues
##    CritValue_Duda CritValue_PseudoT2 Fvalue_Beale
## 2          0.6044            46.4793       0.0002
## 3          0.6106            47.8328       0.3979
## 4          0.5522            38.1140       0.1017
## 5          0.4284            29.3527       0.2166
## 6          0.2316            29.8538       0.0031
## 7          0.5921            44.0831       0.2478
## 8          0.5362            36.3229       0.3517
## 9          0.5291            35.6039       0.0120
## 10         0.4656            30.9841       0.1804
## 
## $Best.nc
##                     KL       CH Hartigan     CCC    Scott Marriot   TrCovW
## Number_clusters 3.0000   3.0000   3.0000 10.0000   3.0000       3     3.00
## Value_Index     9.8471 213.0817 103.8142  4.1139 238.6703 2042179 10381.44
##                   TraceW Friedman   Rubin Cindex     DB Silhouette   Duda
## Number_clusters   3.0000    3.000  3.0000 2.0000 3.0000     3.0000 3.0000
## Value_Index     113.3546   15.107 -0.9934 0.2978 0.8581     0.4496 0.7005
##                 PseudoT2  Beale Ratkowsky   Ball PtBiserial Frey McClain
## Number_clusters   3.0000 3.0000    3.0000  3.000     3.0000    1  2.0000
## Value_Index      32.0629 1.0185    0.4958 96.241     0.7169   NA  0.5265
##                    Dunn Hubert SDindex Dindex    SDbw
## Number_clusters 10.0000      0  3.0000      0 10.0000
## Value_Index      0.1185      0  1.5641      0  0.1031
## 
## $Best.partition
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 3 3 3 2 3 2 3 2 3 2 2 3 2 3 3 3 3 2 2 2
##  [71] 3 3 3 3 3 3 3 3 3 2 2 2 2 3 3 3 3 2 3 2 2 3 2 2 2 3 3 3 2 2 3 3 3 3 3
## [106] 3 2 3 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [141] 3 3 3 3 3 3 3 3 3 3
# It's possible to visualize the result using the function fviz_nbclust() [in factoextra], as follows:

fviz_nbclust(nb) + theme_minimal()
## Among all indices: 
## ===================
## * 2 proposed  0 as the best number of clusters
## * 1 proposed  1 as the best number of clusters
## * 2 proposed  2 as the best number of clusters
## * 18 proposed  3 as the best number of clusters
## * 3 proposed  10 as the best number of clusters
## 
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is  3 .
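
# With k = 3 favored by the majority of indices, a final k-means model
# can be refit and visualized (a short wrap-up sketch; final.km is a
# name chosen here):
set.seed(123)
final.km <- kmeans(iris.scaled, centers = 3, nstart = 25)
fviz_cluster(final.km, data = iris.scaled, geom = "point",
             stand = FALSE, ellipse.type = "norm")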