Iris

sapply(iris[,-5], var)

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##    0.6856935    0.1899794    3.1162779    0.5810063

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

ggplot(iris,aes(x = Sepal.Length, y = Sepal.Width, col= Species)) + geom_point()

ggplot(iris,aes(x = Petal.Length, y = Petal.Width, col= Species)) + geom_point()

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

set.seed(200)
k.max <- 10
wss<- sapply(1:k.max,function(k){kmeans(iris[,3:4],k,nstart = 20,iter.max = 20)$tot.withinss})
wss

##  [1] 550.895333  86.390220  31.371359  19.465989  13.916909  11.025145
##  [7]   9.236596   7.674414   6.456495   5.550520

plot(1:k.max,wss, type= "b", xlab = "Number of clusters(k)", ylab = "Within cluster sum of squares")

icluster <- kmeans(iris[,3:4],3,nstart = 20)
table(icluster$cluster,iris$Species)

##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         48         4
##   3      0          2        46

#kmeans from scratch

# Sample data    
set.seed(100)
xval <- rnorm(12, mean = rep(1:3, each = 4), sd = 0.2)
yval <- rnorm(12, mean = rep(c(1,2,1), each = 4), sd = 0.2)

# Kmeans function with random.seed for initialization
kclus <- function(x, y, nclus, random.seed=123) {

  set.seed(random.seed)
  # start with random cluster centers
  xcen <- runif(n = nclus, min = min(x), max = max(x))   
  ycen <- runif(n = nclus, min = min(y), max = max(y))
  
  # data points and cluster assignment in "data"
  # cluster coordinates in "clus"
  data <- data.frame(xval = x, yval = y, clus = NA)
  clus <- data.frame(name = 1:nclus, xcen = xcen, ycen = ycen)
  
  finish <- FALSE
  
  round = 1

  while(finish == FALSE) {
    
    # assign cluster with minimum distance to each data point
    for(i in 1:length(x)) {
      dist <- sqrt((x[i]-clus$xcen)^2 + (y[i]-clus$ycen)^2)
      #print(i)
      #print(dist)
      #print(clus$xcen)
      #print(round)
      data$clus[i] <- which.min(dist)
    }
    
    xcen_old <- clus$xcen
    ycen_old <- clus$ycen
    
    # calculate new cluster centers
    for(i in 1:nclus) {
      clus[i,2] <- mean(subset(data$xval, data$clus == i))
      clus[i,3] <- mean(subset(data$yval, data$clus == i))
    }
    
    # stop the loop if there is no change in cluster coordinates
    if(identical(xcen_old, clus$xcen) & identical(ycen_old, clus$ycen))
      finish <- TRUE
    
    round = round + 1
  }
  data
}

# with default random seed 123, you should be able to reproduce the result
# as you can see, in this case, no data points were assigned to the 4th cluster
cluster <- kclus(as.numeric(iris[,1]), as.numeric(iris[,2]), 3)
cluster.centers <- aggregate(.~clus, cluster, mean)
ggplot(cluster, aes(xval, yval, color = as.factor(clus))) + 
  geom_point(size=5) + 
  geom_point(data=cluster.centers, aes(xval, yval, col=as.factor(clus)), pch=8, size=5)

Iris

Yung Kuo

4/19/2020