library(MASS)
library(ggplot2)
suppressPackageStartupMessages(library(dplyr))
# debugging helper: print a value, then return it unchanged
spy <- function(x) {print(x); x}
set.seed(42-2)
Generate some data to play with. For this data we saw before that many initial choices are bad; let's see whether choosing the initial points iteratively, each as far as possible from the ones already chosen, solves the problem. Certainly this is not the most challenging set of points to find initial centroids for, but it is a start.
centers <- data.frame(x=c(-2, 2, -2, 2, 1), y=c(2, 2, -2, -2, 0))
data <- lapply(seq(nrow(centers)),
               function(idx)
                 mvrnorm(30,
                         mu=as.numeric(centers[idx,]),
                         Sigma=matrix(c(0.1, 0, 0, 0.1),
                                      nrow=2))) %>%
  Reduce(rbind, .) %>% as.data.frame
names(data) <- c("x", "y")
ggplot(data, aes(x=x, y=y)) + geom_point()
initial.centroids <- function(data, k) {
  stopifnot(k > 1)
  data.ct <- nrow(data)
  stopifnot(data.ct > k)
  centroids <- data.frame(x=rep(0, k), y=rep(0, k))
  # first centroid is a random data point
  centroids[1,] <- data[sample.int(data.ct, size=1),]
  # each remaining centroid maximises the minimum distance to the centroids
  # chosen so far
  for(idx in 2:k) {
    dists <- sapply(seq(nrow(data)),
                    function(dat.idx) {
                      min(sapply(1:(idx-1),
                                 function(c.idx)
                                   dist(rbind(centroids[c.idx,],
                                              data[dat.idx,]))))
                    })
    best.idx <- which.max(dists)
    centroids[idx,] <- data[best.idx,]
  }
  centroids
}
centroids <- initial.centroids(data, 5)
km <- kmeans(data, centroids)
ggplot(data, aes(x=x, y=y)) + geom_point() +
  geom_point(data=centroids[-1,], aes(x=x, y=y, color="later")) +
  geom_point(data=centroids[1,], aes(x=x, y=y, color="first")) +
  geom_point(data=as.data.frame(km$centers), aes(x=x, y=y, color="result"))
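As a quick numerical check to go with the plot (just a sketch; tot.withinss is the total within-cluster sum of squares that kmeans reports), we can look at how spread out the chosen starting points are and how tight the final clustering is.
# sketch of a quick numerical check on the result above
min(dist(centroids))   # smallest pairwise distance between initial centroids
km$tot.withinss        # total within-cluster sum of squares after kmeans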
I reran this strategy many times, and the kmeans clustering always found the optimal centroids afterwards.
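A minimal sketch of that repeated experiment (the reference value from kmeans with nstart=50 random restarts and the tolerance of 1e-6 are my own choices here, not part of the original run):
# rerun maximin initialisation + kmeans many times and compare the resulting
# total within-cluster sum of squares to the best of many random restarts
reference <- kmeans(data, centers=5, nstart=50)$tot.withinss
results <- replicate(100, kmeans(data, initial.centroids(data, 5))$tot.withinss)
mean(abs(results - reference) < 1e-6)  # fraction of runs reaching that optimum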