test

# Attach graphics for plot()/points(). library() stops with an error if the
# package cannot be attached, whereas require() only returns FALSE with a
# warning and lets the failure surface later at the first plotting call.
library(graphics)

# a 2-dimensional example: two Gaussian clouds of 50 points each (sd = 0.3),
# the first centred at 0 and the second centred at 1 in both coordinates
x <- do.call(
  rbind,
  lapply(c(0, 1), function(centre) {
    matrix(rnorm(100, mean = centre, sd = 0.3), ncol = 2)
  })
)
colnames(x) <- c("x", "y")

# fit two clusters, then print the fitted object
cl <- kmeans(x, centers = 2)
cl

## K-means clustering with 2 clusters of sizes 51, 49
## 
## Cluster means:
##             x           y
## 1  1.03931844  0.96130188
## 2 -0.04955299 -0.01873649
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1] 8.827633 8.828323
##  (between_SS / total_SS =  75.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

# Scatter plot of the observations, coloured by the cluster each was assigned to.
plot(x, col = cl$cluster)
# Overlay the cluster centres as large stars (pch = 8). The colour index is
# derived from the number of fitted centres instead of being hard-coded, so
# centre k always gets the same colour as the points assigned to cluster k,
# even if the number of clusters requested from kmeans() is changed.
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8, cex = 2)

# Sum of squared deviations of `x` from its column means.
ss <- function(x) {
  centred <- scale(x, center = TRUE, scale = FALSE)
  sum(centred^2)
}

## Replace every observation by the centre of the cluster it belongs to,
## then show the first few rows.
fitted.x <- fitted(cl, method = "centers")
head(fitted.x)

##             x           y
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649

## Residuals: each observation minus the centre of its cluster.
resid.x <- x - fitted.x

## Equalities : ----------------------------------
## Left column: sums of squares reported by kmeans();
## right column: the same quantities recomputed with ss().
cbind(
  cl[c("betweenss", "tot.withinss", "totss")],
  c(ss(fitted.x), ss(resid.x), ss(x))
)

##              [,1]     [,2]    
## betweenss    53.63144 53.63144
## tot.withinss 17.65596 17.65596
## totss        71.2874  71.2874

stopifnot(
  all.equal(cl$totss, ss(x)),
  all.equal(cl$tot.withinss, ss(resid.x)),
  ## three equivalent ways of obtaining the between-cluster sum of squares:
  all.equal(cl$betweenss, ss(fitted.x)),
  all.equal(cl$betweenss, cl$totss - cl$tot.withinss),
  ## which gives the decomposition total = between + within
  all.equal(ss(x), ss(fitted.x) + ss(resid.x))
)

# trivial one-cluster fit: its within-cluster SS equals ss(x)
kmeans(x, centers = 1)[["withinss"]]

## [1] 71.2874

## With more clusters than the data really contain, a single random start can
## give a poor solution; several random starts help (and are often recommended
## anyway). Fit five clusters from 25 starts, then print the fit.
cl <- kmeans(x, centers = 5, nstart = 25)
cl

## K-means clustering with 5 clusters of sizes 15, 25, 26, 20, 14
## 
## Cluster means:
##             x          y
## 1  0.01340044 -0.4138815
## 2  0.82411477  0.7936244
## 3  1.24624505  1.1225302
## 4  0.12777577  0.1828706
## 5 -0.37032989  0.1166231
## 
## Clustering vector:
##   [1] 1 5 1 4 1 4 1 1 5 5 1 4 1 2 4 4 4 1 4 5 4 4 1 4 4 1 4 1 4 1 5 1 1 5 5
##  [36] 4 5 4 4 5 5 5 4 5 5 1 4 4 4 5 3 3 2 3 2 3 2 3 3 3 2 3 3 2 3 2 2 2 3 3
##  [71] 3 3 3 2 3 3 2 2 3 3 2 3 2 2 3 3 2 2 2 2 2 2 3 3 2 2 2 3 3 2
## 
## Within cluster sum of squares by cluster:
## [1] 1.334008 1.790710 3.387071 1.049642 0.904233
##  (between_SS / total_SS =  88.1 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

# Scatter plot of the observations, coloured by the cluster each was assigned to.
plot(x, col = cl$cluster)
# Overlay the cluster centres as stars (pch = 8). The colour index is derived
# from the number of fitted centres instead of being hard-coded, so centre k
# always gets the same colour as the points assigned to cluster k.
points(cl$centers, col = seq_len(nrow(cl$centers)), pch = 8)

test

Matthew Curcio

June 1, 2019