sapply(iris[,-5], var)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 0.6856935 0.1899794 3.1162779 0.5810063
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
ggplot(iris,aes(x = Sepal.Length, y = Sepal.Width, col= Species)) + geom_point()
ggplot(iris,aes(x = Petal.Length, y = Petal.Width, col= Species)) + geom_point()
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
set.seed(200)
k.max <- 10
wss<- sapply(1:k.max,function(k){kmeans(iris[,3:4],k,nstart = 20,iter.max = 20)$tot.withinss})
wss
## [1] 550.895333 86.390220 31.371359 19.465989 13.916909 11.025145
## [7] 9.236596 7.674414 6.456495 5.550520
plot(1:k.max,wss, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")
icluster <- kmeans(iris[,3:4],3,nstart = 20)
table(icluster$cluster,iris$Species)
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 4
## 3 0 2 46
#kmeans from scratch
# Sample data
set.seed(100)
xval <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
yval <- rnorm(12, mean = rep(c(1,2,1), each = 4), sd = 0.2)
# Kmeans function with random.seed for initialization
kclus <- function(x, y, nclus, random.seed=123) {
set.seed(random.seed)
# start with random cluster centers
xcen <- runif(n = nclus, min = min(x), max = max(x))
ycen <- runif(n = nclus, min = min(y), max = max(y))
# data points and cluster assignment in "data"
# cluster coordinates in "clus"
data <- data.frame(xval = x, yval = y, clus = NA)
clus <- data.frame(name = 1:nclus, xcen = xcen, ycen = ycen)
finish <- FALSE
round = 1
while(finish == FALSE) {
# assign cluster with minimum distance to each data point
for(i in 1:length(x)) {
dist <- sqrt((x[i]-clus$xcen)^2 + (y[i]-clus$ycen)^2)
#print(i)
#print(dist)
#print(clus$xcen)
#print(round)
data$clus[i] <- which.min(dist)
}
xcen_old <- clus$xcen
ycen_old <- clus$ycen
# calculate new cluster centers
for(i in 1:nclus) {
clus[i,2] <- mean(subset(data$xval, data$clus == i))
clus[i,3] <- mean(subset(data$yval, data$clus == i))
}
# stop the loop if there is no change in cluster coordinates
if(identical(xcen_old, clus$xcen) & identical(ycen_old, clus$ycen))
finish <- TRUE
round = round + 1
}
data
}
# with default random seed 123, you should be able to reproduce the result
# as you can see, in this case, no data points were assigned to the 4th cluster
cluster <- kclus(as.numeric(iris[,1]), as.numeric(iris[,2]), 3)
cluster.centers <- aggregate(.~clus, cluster, mean)
ggplot(cluster, aes(xval, yval, color = as.factor(clus))) +
geom_point(size=5) +
geom_point(data=cluster.centers, aes(xval, yval, col=as.factor(clus)), pch=8, size=5)