library(MASS)
library(ggplot2)
suppressPackageStartupMessages(library(dplyr))
# debugging helper: print a value, then return it unchanged
spy <- function(x) {print(x); x}
set.seed(42-2)
Generate some data to play with. For this data we saw before that many initial choices are bad; let's see whether choosing the initial points iteratively, each as far as possible from the ones already chosen, solves the problem. Certainly this is not the most challenging set of points to find initial centroids for, but it is a start.
centers <- data.frame(x=c(-2, 2, -2, 2, 1), y=c(2, 2, -2, -2, 0))
data <- lapply(seq(nrow(centers)),
               function(idx)
                 mvrnorm(30,
                         mu=as.numeric(centers[idx,]),
                         Sigma=matrix(c(0.1, 0, 0, 0.1),
                                      nrow=2))) %>%
  Reduce(rbind, .) %>% as.data.frame
names(data) <- c("x", "y")
ggplot(data, aes(x=x, y=y)) + geom_point()
initial.centroids <- function(data, k) {
  stopifnot(k > 1)
  data.ct <- nrow(data)
  stopifnot(data.ct > k)
  centroids <- data.frame(x=rep(0, k), y=rep(0, k))
  # first centroid is a random data point
  centroids[1,] <- data[sample.int(data.ct, size=1),]
  # each remaining centroid maximises the minimum distance to the centroids
  # chosen so far
  for(idx in 2:k) {
    dists <- sapply(seq(nrow(data)),
                    function(dat.idx) {
                      min(sapply(1:(idx-1),
                                 function(c.idx)
                                   dist(rbind(centroids[c.idx,],
                                              data[dat.idx,]))))
                    })
    best.idx <- which.max(dists)
    centroids[idx,] <- data[best.idx,]
  }
  centroids
}
centroids <- initial.centroids(data, 5)
km <- kmeans(data, centroids)
ggplot(data, aes(x=x, y=y)) + geom_point() +
  geom_point(data=centroids[-1,], aes(x=x, y=y, color="later")) +
  geom_point(data=centroids[1,], aes(x=x, y=y, color="first")) +
  geom_point(data=as.data.frame(km$centers), aes(x=x, y=y, color="result"))
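As a quick numerical check to go with the plot (just a sketch; tot.withinss is the total within-cluster sum of squares that kmeans reports), we can look at how spread out the chosen starting points are and how tight the final clustering is.
# sketch of a quick numerical check on the result above
min(dist(centroids))   # smallest pairwise distance between initial centroids
km$tot.withinss        # total within-cluster sum of squares after kmeans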
I reran this strategy many times, and the kmeans clustering always found the optimal centroids afterwards.
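A minimal sketch of that repeated experiment (the reference value from kmeans with nstart=50 random restarts and the tolerance of 1e-6 are my own choices here, not part of the original run):
# rerun maximin initialisation + kmeans many times and compare the resulting
# total within-cluster sum of squares to the best of many random restarts
reference <- kmeans(data, centers=5, nstart=50)$tot.withinss
results <- replicate(100, kmeans(data, initial.centroids(data, 5))$tot.withinss)
mean(abs(results - reference) < 1e-6)  # fraction of runs reaching that optimum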