Gap Statistic for Estimating the Number of Clusters
# Two Gaussian clusters in 2-D (means 0 and 1, sd 0.3, 50 points each).
# The data-generation recipe is adapted from the example in R's kmeans help.
library(cluster)
set.seed(1492)
x <- rbind(
  matrix(rnorm(100, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2)
)
colnames(x) <- c("x", "y")
# Gap statistic for k = 1..4, using B = 200 simulated reference sets.
gsP.Z <- clusGap(x, FUN = kmeans, K.max = 4, B = 200)
gsP.Z
## Clustering Gap statistic ["clusGap"].
## B=200 simulated reference sets, k = 1..4
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 2
## logW E.logW gap SE.sim
## [1,] 3.278851 3.360794 0.08194353 0.03818589
## [2,] 2.581850 2.936698 0.35484750 0.03298729
## [3,] 2.449657 2.760230 0.31057308 0.03530315
## [4,] 2.316659 2.600011 0.28335147 0.03766585
# Pick k with the same rule the printed summary reports ('firstSEmax',
# SE.factor = 1) rather than the raw global maximum of the gap column, so the
# clustering drawn below always agrees with the reported number of clusters.
k_best <- maxSE(
  f = gsP.Z$Tab[, "gap"],
  SE.f = gsP.Z$Tab[, "SE.sim"],
  method = "firstSEmax",
  SE.factor = 1
)
cl <- kmeans(x, k_best)
# Scatter plot coloured by cluster membership, then the gap curve itself.
plot(x, col = cl$cluster)
plot(gsP.Z, main = "Gap statistic for the data")
Now let’s repeat the analysis with three clusters:
# Three Gaussian clusters in 2-D (means 0, 2 and 4, sd 0.3, 50 points each).
# The data-generation recipe is adapted from the example in R's kmeans help.
library(cluster)
x <- rbind(
  matrix(rnorm(100, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 4, sd = 0.3), ncol = 2)
)
colnames(x) <- c("x", "y")
# Gap statistic for k = 1..4, using B = 200 simulated reference sets.
gsP.Z <- clusGap(x, FUN = kmeans, K.max = 4, B = 200)
gsP.Z
## Clustering Gap statistic ["clusGap"].
## B=200 simulated reference sets, k = 1..4
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 3
## logW E.logW gap SE.sim
## [1,] 4.619495 4.538478 -0.08101687 0.03355308
## [2,] 3.851696 3.945970 0.09427358 0.03069327
## [3,] 2.923698 3.653423 0.72972526 0.03022670
## [4,] 2.862678 3.481741 0.61906272 0.03999245
# Pick k with the same rule the printed summary reports ('firstSEmax',
# SE.factor = 1) rather than the raw global maximum of the gap column, so the
# clustering drawn below always agrees with the reported number of clusters.
k_best <- maxSE(
  f = gsP.Z$Tab[, "gap"],
  SE.f = gsP.Z$Tab[, "SE.sim"],
  method = "firstSEmax",
  SE.factor = 1
)
cl <- kmeans(x, k_best)
# Scatter plot coloured by cluster membership.
plot(x, col = cl$cluster)
Now let’s move the three clusters closer to each other:
# Three Gaussian clusters in 2-D (means 0, 1 and 2, sd 0.3, 50 points each).
# The data-generation recipe is adapted from the example in R's kmeans help.
library(cluster)
x <- rbind(
  matrix(rnorm(100, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 2, sd = 0.3), ncol = 2)
)
colnames(x) <- c("x", "y")
# Gap statistic for k = 1..4, using B = 200 simulated reference sets.
gsP.Z <- clusGap(x, FUN = kmeans, K.max = 4, B = 200)
gsP.Z
## Clustering Gap statistic ["clusGap"].
## B=200 simulated reference sets, k = 1..4
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 3
## logW E.logW gap SE.sim
## [1,] 4.010358 4.024755 0.01439744 0.03133673
## [2,] 3.420587 3.520125 0.09953857 0.02710607
## [3,] 2.973097 3.299115 0.32601730 0.03109743
## [4,] 2.886289 3.172393 0.28610451 0.03022591
# Pick k with the same rule the printed summary reports ('firstSEmax',
# SE.factor = 1) rather than the raw global maximum of the gap column, so the
# clustering drawn below always agrees with the reported number of clusters.
k_best <- maxSE(
  f = gsP.Z$Tab[, "gap"],
  SE.f = gsP.Z$Tab[, "SE.sim"],
  method = "firstSEmax",
  SE.factor = 1
)
cl <- kmeans(x, k_best)
# Scatter plot coloured by cluster membership, then the gap curve itself.
plot(x, col = cl$cluster)
plot(gsP.Z, main = "Gap statistic for the data")
Now let’s try with the three clusters even closer together:
# Three Gaussian clusters in 2-D (means 0, 0.5 and 0.7, sd 0.3, 50 points
# each). The data-generation recipe is adapted from the example in R's kmeans
# help.
library(cluster)
x <- rbind(
  matrix(rnorm(100, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 0.5, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 0.7, sd = 0.3), ncol = 2)
)
colnames(x) <- c("x", "y")
# Gap statistic for k = 1..4, using B = 200 simulated reference sets.
gsP.Z <- clusGap(x, FUN = kmeans, K.max = 4, B = 200)
gsP.Z
## Clustering Gap statistic ["clusGap"].
## B=200 simulated reference sets, k = 1..4
## --> Number of clusters (method 'firstSEmax', SE.factor=1): 2
## logW E.logW gap SE.sim
## [1,] 3.296782 3.618456 0.3216737 0.02624460
## [2,] 2.892258 3.325615 0.4333574 0.03529877
## [3,] 2.767049 3.095116 0.3280673 0.03255138
## [4,] 2.621527 2.898401 0.2768739 0.03169599
# Pick k with the same rule the printed summary reports ('firstSEmax',
# SE.factor = 1) rather than the raw global maximum of the gap column, so the
# clustering drawn below always agrees with the reported number of clusters.
k_best <- maxSE(
  f = gsP.Z$Tab[, "gap"],
  SE.f = gsP.Z$Tab[, "SE.sim"],
  method = "firstSEmax",
  SE.factor = 1
)
cl <- kmeans(x, k_best)
# Scatter plot coloured by cluster membership, then the gap curve itself.
plot(x, col = cl$cluster)
plot(gsP.Z, main = "Gap statistic for the data")