Introduction

This document demonstrates a basic k-means clustering technique. The analysis uses the built-in iris dataset.

# Global knitr chunk options applied to every chunk of the rendered document.
library(knitr)
knitr::opts_chunk$set(
  tidy = TRUE, # spelled out: `T` is an ordinary variable that can be reassigned
  fig.width = 8,
  fig.height = 6,
  fig.align = "center",
  warning = FALSE,
  message = FALSE,
  echo = TRUE
)
# Widen console output so long printed vectors wrap at 120 characters.
options(width = 120)
# Load iris explicitly. It is deliberately not attach()ed: the analysis code
# reaches columns through `iris` itself, and attaching would place the columns
# on the search path where they can mask, or be masked by, other objects.
data(iris)

K-means Clustering and Visualization

# Preview the first six observations of the built-in iris data.
library(datasets)
head(x = iris, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Scatter plot of petal length against petal width, one colour per species.
library(ggplot2)
ggplot(
  data = iris,
  mapping = aes(x = Petal.Length, y = Petal.Width, colour = Species)
) +
  geom_point()

# Fix the RNG so the random starting centres (and hence the result) repeat.
set.seed(20)
# Fit k-means with three clusters on the two petal measurements only,
# keeping the best of 20 random starts.
irisCluster <- kmeans(
  x = iris[, c("Petal.Length", "Petal.Width")],
  centers = 3,
  nstart = 20
)
print(irisCluster)
## K-means clustering with 3 clusters of sizes 50, 52, 48
## 
## Cluster means:
##   Petal.Length Petal.Width
## 1     1.462000    0.246000
## 2     4.269231    1.342308
## 3     5.595833    2.037500
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
##  [58] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3
## [115] 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1]  2.02200 13.05769 16.29167
##  (between_SS / total_SS =  94.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"    "size"        
## [8] "iter"         "ifault"
# Cross-tabulate assigned cluster (rows) against actual species (columns).
table(irisCluster[["cluster"]], iris[["Species"]])
##    
##     setosa versicolor virginica
##   1     50          0         0
##   2      0         48         4
##   3      0          2        46
# Store the cluster labels as a factor so they are treated as categories.
irisCluster$cluster <- as.factor(irisCluster$cluster)
# Copy the fitted centres into a data frame, one row per cluster.
centers <- as.data.frame(irisCluster$centers)
# NOTE: despite its name, `Species` holds the numeric cluster id taken from
# the row names, not an iris species.
centers$Species <- as.numeric(rownames(centers))
colnames(centers)
## [1] "Petal.Length" "Petal.Width"  "Species"
# Show the centres data frame.
print(centers)
##   Petal.Length Petal.Width Species
## 1     1.462000    0.246000       1
## 2     4.269231    1.342308       2
## 3     5.595833    2.037500       3
library(cluster)
# Draw the clusters over the same two petal measurements that kmeans() was
# fitted on. Passing the whole `iris` data frame would bring the Species
# factor (the true labels, coerced to numbers) and the unused sepal columns
# into the projection, so the plot would not reflect the fitted clustering.
clusplot(iris[, 3:4], irisCluster$cluster,
  color = TRUE, shade = TRUE,
  labels = 2, lines = 0
)

# Cluster centres as returned by kmeans(): one row per cluster.
irisCluster[["centers"]]
##   Petal.Length Petal.Width
## 1     1.462000    0.246000
## 2     4.269231    1.342308
## 3     5.595833    2.037500
# Same centres in long form: one row per (cluster, variable) pair, with the
# value in the `Freq` column.
as.data.frame(
  as.table(irisCluster[["centers"]]),
  responseName = "Freq"
)
##   Var1         Var2     Freq
## 1    1 Petal.Length 1.462000
## 2    2 Petal.Length 4.269231
## 3    3 Petal.Length 5.595833
## 4    1  Petal.Width 0.246000
## 5    2  Petal.Width 1.342308
## 6    3  Petal.Width 2.037500

Programming Environment

# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] cluster_2.0.6 ggplot2_2.2.1 knitr_1.15.1 
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.10     digest_0.6.12    rprojroot_1.2    plyr_1.8.4       grid_3.3.3       gtable_0.2.0    
##  [7] backports_1.0.5  formatR_1.4      magrittr_1.5     evaluate_0.10    scales_0.4.1     stringi_1.1.3   
## [13] lazyeval_0.2.0   rmarkdown_1.4    labeling_0.3     tools_3.3.3      stringr_1.2.0    munsell_0.4.3   
## [19] yaml_2.1.14      colorspace_1.3-2 htmltools_0.3.5  tibble_1.3.0