require(graphics)
# A 2-dimensional example: 100 simulated points, stacked as two groups of
# 50 rows. The first group is drawn around 0 and the second around 1 in
# both coordinates, each with standard deviation 0.3.
x <- rbind(
  matrix(rnorm(100, mean = 0, sd = 0.3), nrow = 50),
  matrix(rnorm(100, mean = 1, sd = 0.3), nrow = 50)
)
dimnames(x) <- list(NULL, c("x", "y"))
# Fit k-means with two clusters; the outer parentheses auto-print the fit.
(cl <- kmeans(x, centers = 2))
## K-means clustering with 2 clusters of sizes 51, 49
##
## Cluster means:
##             x           y
## 1  1.03931844  0.96130188
## 2 -0.04955299 -0.01873649
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 8.827633 8.828323
## (between_SS / total_SS = 75.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of the data, coloured by assigned cluster, with the two
# cluster centres overlaid as enlarged star symbols (pch = 8).
plot(x, col = cl[["cluster"]])
points(cl[["centers"]], pch = 8, cex = 2, col = seq_len(2))

# Sum of squares of `x` about its column means: scale() centres each
# column (without rescaling) and all squared deviations are then summed.
ss <- function(x) {
  centred <- scale(x, center = TRUE, scale = FALSE)
  sum(centred^2)
}
## The "fitted" value of each observation is the centre of the cluster it
## was assigned to; show the first few rows.
fitted.x <- fitted(cl)
head(fitted.x)
##             x           y
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
## 2 -0.04955299 -0.01873649
# Residuals: each observation minus the centre of its assigned cluster.
resid.x <- x - fitted(cl, method = "centers")
## Equalities : ----------------------------------
# Column 1 holds the sums of squares reported by kmeans(); column 2 holds
# the same quantities recomputed with ss(). The two columns should agree.
cbind(
  cl[c("betweenss", "tot.withinss", "totss")],
  c(ss(fitted.x), ss(resid.x), ss(x))
)
##              [,1]     [,2]
## betweenss    53.63144 53.63144
## tot.withinss 17.65596 17.65596
## totss        71.2874  71.2874
# Check, up to numerical tolerance, that the sums of squares stored in the
# kmeans fit match the ones recomputed with ss().
stopifnot(
  all.equal(cl$totss, ss(x)),
  all.equal(cl$tot.withinss, ss(resid.x)),
  ## betweenss, ss(fitted.x) and totss - tot.withinss are all the same:
  all.equal(cl$betweenss, ss(fitted.x)),
  all.equal(cl$betweenss, cl$totss - cl$tot.withinss),
  ## and hence the total SS splits into fitted + residual parts:
  all.equal(ss(x), ss(fitted.x) + ss(resid.x))
)
# Trivial one-cluster fit: its within-cluster SS equals ss(x).
kmeans(x, centers = 1)$withinss
## [1] 71.2874
## With more clusters than the data supports, multiple random starts help
## (and are often recommended anyway): nstart = 25 requests 25 of them.
## The outer parentheses auto-print the fit.
(cl <- kmeans(x, centers = 5, nstart = 25))
## K-means clustering with 5 clusters of sizes 15, 25, 26, 20, 14
##
## Cluster means:
##             x          y
## 1  0.01340044 -0.4138815
## 2  0.82411477  0.7936244
## 3  1.24624505  1.1225302
## 4  0.12777577  0.1828706
## 5 -0.37032989  0.1166231
##
## Clustering vector:
## [1] 1 5 1 4 1 4 1 1 5 5 1 4 1 2 4 4 4 1 4 5 4 4 1 4 4 1 4 1 4 1 5 1 1 5 5
## [36] 4 5 4 4 5 5 5 4 5 5 1 4 4 4 5 3 3 2 3 2 3 2 3 3 3 2 3 3 2 3 2 2 2 3 3
## [71] 3 3 3 2 3 3 2 2 3 3 2 3 2 2 3 3 2 2 2 2 2 2 3 3 2 2 2 3 3 2
##
## Within cluster sum of squares by cluster:
## [1] 1.334008 1.790710 3.387071 1.049642 0.904233
## (between_SS / total_SS = 88.1 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
# Scatter plot of the data, coloured by the five-cluster assignment, with
# the five cluster centres overlaid as star symbols (pch = 8).
plot(x, col = cl[["cluster"]])
points(cl[["centers"]], pch = 8, col = seq_len(5))
