Generate data as a mixture of five Gaussians (well separated for this example).
library(MASS)
library(ggplot2)
suppressPackageStartupMessages(library(dplyr))
# Debugging helper: print a value, then return that same value unchanged.
spy <- function(x) {
  print(x)
  x
}
# Fix the random seed so the sampled data are reproducible from run to run.
set.seed(42 - 2)
Now we generate some data to work with.
# True cluster centres: the four points (+/-2, +/-2) plus one at (1, 0).
centers <- data.frame(x = c(-2, 2, -2, 2, 1), y = c(2, 2, -2, -2, 0))
# Draw 30 bivariate-normal points around each centre (covariance 0.1 * I).
# Centres are visited in row order so the random stream is consumed in a
# fixed sequence. seq_len() is used instead of seq(n) because seq(0) would
# yield c(1, 0) for an empty table of centres.
data <- lapply(
  seq_len(nrow(centers)),
  function(idx) {
    mvrnorm(
      30,
      mu = as.numeric(centers[idx, ]),
      Sigma = matrix(c(0.1, 0, 0, 0.1), nrow = 2)
    )
  }
) %>%
  # Stack all per-centre sample matrices in one call rather than pairwise.
  do.call(rbind, .) %>%
  as.data.frame()
names(data) <- c("x", "y")
ggplot(data, aes(x = x, y = y)) +
  geom_point()
Finally we cluster.
# Fit k-means with 5 clusters using 20 random starts (nstart = 20).
km <- kmeans(data, centers = 5, nstart = 20)
print(as.data.frame(km$centers))
## x y
## 1 0.8644 -0.03806
## 2 -1.9391 -1.94921
## 3 2.0141 1.99618
## 4 -2.1282 1.96316
## 5 1.9767 -2.06286
# Overlay the fitted centres on the data. The colour is a fixed attribute, so
# it is set outside aes(); inside aes() the string "red" would be treated as
# a data value, adding a legend and taking its colour from the default scale.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")
Now, to show the instability of k-means, we also show the results of running kmeans with a single start. Here two out of the three runs give very bad cluster centers. These won’t improve with more iterations; they are local optima.
# Single-start k-means, run 1: nstart is left at its default.
km <- kmeans(data, centers = 5)
print(as.data.frame(km$centers))
## x y
## 1 1.439 0.9791
## 2 -2.128 1.9632
## 3 -1.831 -2.0101
## 4 -2.481 -1.6448
## 5 1.977 -2.0629
# Overlay the fitted centres on the data. The colour is a fixed attribute, so
# it is set outside aes(); inside aes() the string "red" would be treated as
# a data value, adding a legend and taking its colour from the default scale.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")
# Single-start k-means, run 2: nstart is left at its default.
km <- kmeans(data, centers = 5)
print(as.data.frame(km$centers))
## x y
## 1 -2.1282 1.96316
## 2 -1.9391 -1.94921
## 3 1.9767 -2.06286
## 4 2.0141 1.99618
## 5 0.8644 -0.03806
# Overlay the fitted centres on the data. The colour is a fixed attribute, so
# it is set outside aes(); inside aes() the string "red" would be treated as
# a data value, adding a legend and taking its colour from the default scale.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")
# Single-start k-means, run 3: nstart is left at its default.
km <- kmeans(data, centers = 5)
print(as.data.frame(km$centers))
## x y
## 1 -2.128 1.963
## 2 -2.354 -1.755
## 3 1.979 1.901
## 4 -1.788 -2.020
## 5 1.420 -1.103
# Overlay the fitted centres on the data. The colour is a fixed attribute, so
# it is set outside aes(); inside aes() the string "red" would be treated as
# a data value, adding a legend and taking its colour from the default scale.
ggplot(data, aes(x = x, y = y)) +
  geom_point() +
  geom_point(data = as.data.frame(km$centers), aes(x = x, y = y),
             color = "red")