k-means example

Generate data as the mixture of 5 Gaussians (well separated for this example).

library(MASS)
library(ggplot2)
suppressPackageStartupMessages(library(dplyr))

# Debugging helper: print a value and hand it back unchanged, so it can be
# dropped into the middle of a pipeline without altering the result.
spy <- function(x) {
  print(x)
  x
}

# Fix the random seed (42 - 2) so the simulated data are reproducible.
set.seed(40)

Now we generate some data to work with.

# True cluster centres: the four corners of a square plus a fifth point
# inside it.
centers <- data.frame(x = c(-2, 2, -2, 2, 1), y = c(2, 2, -2, -2, 0))

# Draw 30 points around each centre from a bivariate Gaussian with covariance
# 0.1 * I, then stack the per-centre matrices into a single data frame.
data <- lapply(
  seq_len(nrow(centers)),  # seq_len() is the safe way to index rows
  function(idx) {
    mvrnorm(30, mu = as.numeric(centers[idx, ]), Sigma = diag(0.1, 2))
  }
) %>%
  do.call(rbind, .) %>%    # one rbind over all pieces, not pairwise copies
  as.data.frame()
names(data) <- c("x", "y")

# Scatter plot of the raw (unlabelled) sample.
ggplot(data, aes(x = x, y = y)) + geom_point()

plot of chunk generateData

Finally we cluster.

# Fit k-means with k = 5, using 20 random starts.
km <- kmeans(x = data, centers = 5, nstart = 20)
print(as.data.frame(km[["centers"]]))

##         x        y
## 1  0.8644 -0.03806
## 2 -1.9391 -1.94921
## 3  2.0141  1.99618
## 4 -2.1282  1.96316
## 5  1.9767 -2.06286

# Overlay the fitted centroids on the raw points. The colour is a fixed
# setting, so it belongs outside aes(); inside aes() the string "red" is
# treated as data, which yields a legend and a default palette colour
# rather than red points.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")

plot of chunk findCentroids

Now, to show the instability of k-means, we also show the results of running kmeans with a single start. Here you see that two out of the three runs give very bad cluster centers. These won’t improve with more iterations; they are local optima.

# Single-start k-means (nstart is left at its default), run 1 of 3.
km <- kmeans(x = data, centers = 5)
print(as.data.frame(km[["centers"]]))

##        x       y
## 1  1.439  0.9791
## 2 -2.128  1.9632
## 3 -1.831 -2.0101
## 4 -2.481 -1.6448
## 5  1.977 -2.0629

# Overlay this run's centroids on the raw points. The colour is a fixed
# setting, so it belongs outside aes(); inside aes() the string "red" is
# treated as data, which yields a legend and a default palette colour
# rather than red points.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")

plot of chunk unnamed-chunk-1

# Single-start k-means (nstart is left at its default), run 2 of 3.
km <- kmeans(x = data, centers = 5)
print(as.data.frame(km[["centers"]]))

##         x        y
## 1 -2.1282  1.96316
## 2 -1.9391 -1.94921
## 3  1.9767 -2.06286
## 4  2.0141  1.99618
## 5  0.8644 -0.03806

# Overlay this run's centroids on the raw points. The colour is a fixed
# setting, so it belongs outside aes(); inside aes() the string "red" is
# treated as data, which yields a legend and a default palette colour
# rather than red points.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")

plot of chunk unnamed-chunk-1

# Single-start k-means (nstart is left at its default), run 3 of 3.
km <- kmeans(x = data, centers = 5)
print(as.data.frame(km[["centers"]]))

##        x      y
## 1 -2.128  1.963
## 2 -2.354 -1.755
## 3  1.979  1.901
## 4 -1.788 -2.020
## 5  1.420 -1.103

# Overlay this run's centroids on the raw points. The colour is a fixed
# setting, so it belongs outside aes(); inside aes() the string "red" is
# treated as data, which yields a legend and a default palette colour
# rather than red points.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")

plot of chunk unnamed-chunk-1