Cluster analysis (k-means)

Load data: Fisher's iris data

# Load the built-in iris data set and print summary statistics per column.
data("iris")
summary(iris)
  Sepal.Length   Sepal.Width    Petal.Length   Petal.Width        Species  
 Min.   :4.30   Min.   :2.00   Min.   :1.00   Min.   :0.1   setosa    :50  
 1st Qu.:5.10   1st Qu.:2.80   1st Qu.:1.60   1st Qu.:0.3   versicolor:50  
 Median :5.80   Median :3.00   Median :4.35   Median :1.3   virginica :50  
 Mean   :5.84   Mean   :3.06   Mean   :3.76   Mean   :1.2                  
 3rd Qu.:6.40   3rd Qu.:3.30   3rd Qu.:5.10   3rd Qu.:1.8                  
 Max.   :7.90   Max.   :4.40   Max.   :6.90   Max.   :2.5                  

Scatter plot: Sepal.Length vs Sepal.Width

# Base-graphics scatter plot of sepal width (y) against sepal length (x).
plot(Sepal.Width ~ Sepal.Length, data = iris, pch = 16)

plot of chunk unnamed-chunk-3


library(ggplot2)
# Same scatter plot drawn with ggplot2.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
    geom_point()

plot of chunk unnamed-chunk-3

Run k-means for each number of clusters from 1 to 10

# Fit k-means on the two sepal measurements for k = 1, ..., 10.
# kmeans() chooses its initial centers at random, so the RNG seed is fixed to
# make the fits (and everything derived from them) reproducible. Several
# random starts are used per k so that one unlucky initialization does not
# leave a fit in a poor local optimum and distort the within-SS curve.
set.seed(1)
res.kmeans <- lapply(1:10, function(k) {
    kmeans(iris[, c("Sepal.Length", "Sepal.Width")], centers = k, nstart = 25)
})

## Within-cluster SS of every cluster, for each fit (1 cluster to 10 clusters)
lapply(res.kmeans, function(fit) fit$withinss)
[[1]]
[1] 130.48

[[2]]
[1] 30.044 28.404

[[3]]
[1] 13.129 12.622 11.300

[[4]]
[1] 10.6341  4.4517  4.6300  8.2506

[[5]]
[1] 3.8373 3.6096 3.1929 4.8527 6.0237

[[6]]
[1] 3.1440 3.8262 4.3523 1.2296 3.1762 2.3745

[[7]]
[1] 2.30828 2.54500 1.55520 3.09286 0.64875 2.28316 3.05517

[[8]]
[1] 1.42960 3.13687 0.58571 0.74800 1.31048 1.99529 1.72375 2.27067

[[9]]
[1] 0.83524 1.99529 2.37455 0.47429 0.90476 0.39333 0.42200 3.13687 1.50000

[[10]]
 [1] 1.16917 1.00381 0.53143 0.60278 0.48800 0.81882 1.11130 2.37455 1.01444 1.51389


## Total within-cluster SS per fit (sum over that fit's clusters).
## vapply() guarantees a plain numeric vector with one value per fit,
## whatever the length of the input list.
res.within.ss <- vapply(res.kmeans, function(fit) sum(fit$withinss), numeric(1))

Number of clusters vs. within-cluster sum of squares (within SS)

# Elbow plot: total within-cluster SS against the number of clusters,
# drawn as points joined by lines (type = "b").
plot(x = 1:10, y = res.within.ss, type = "b",
     xlab = "Number of clusters", ylab = "Within SS")

plot of chunk unnamed-chunk-5


# Same elbow plot drawn with ggplot2, with an axis break at every integer.
ggplot(data.frame(cluster = 1:10, within.ss = res.within.ss),
       aes(x = cluster, y = within.ss)) +
    geom_point() +
    geom_line() +
    scale_x_continuous(breaks = 0:10)

plot of chunk unnamed-chunk-5

Coloring by clusters

# Cluster assignment of every observation, one integer vector per fit.
cluster.colors <- lapply(res.kmeans, function(fit) fit$cluster)

library(plyr)
# Draw one base-graphics scatter plot per fit. Points are colored by cluster
# id and the title shows the number of distinct clusters in that fit.
l_ply(cluster.colors,
      function(assignment) {
          plot(Sepal.Width ~ Sepal.Length, data = iris,
               col = assignment,
               main = paste(nlevels(factor(assignment))),
               pch = 16)
      })

plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6



# Draw one ggplot2 scatter plot per fit, colored by cluster.
l_ply(cluster.colors,
      function(assignment) {
          # Store the cluster id as a factor so it maps to discrete colors.
          plot.dat <- cbind(iris, cluster = factor(assignment))
          n.clusters <- nlevels(plot.dat$cluster)

          gg.obj <- ggplot(plot.dat,
                           aes(x = Sepal.Length, y = Sepal.Width, color = cluster)) +
              geom_point() +
              labs(title = paste(n.clusters))

          # ggplot objects are not auto-printed inside a function; print explicitly.
          print(gg.obj)
      })

plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6 plot of chunk unnamed-chunk-6

True grouping

# Scatter plot colored by the actual species, for comparison with the clusters.
# Species is a factor, so its integer codes select the colors.
plot(Sepal.Width ~ Sepal.Length, data = iris, col = Species, pch = 16)

plot of chunk unnamed-chunk-7


library(ggplot2)
# ggplot2 version of the species-colored scatter plot.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Species)) +
    geom_point()

plot of chunk unnamed-chunk-7