This document demonstrates the basic k-means clustering technique, using the iris dataset as the worked example.
# Global knitr configuration: defaults applied to every code chunk below.
library(knitr)
knitr::opts_chunk$set(
  tidy = TRUE,           # spelled out; `T` is an ordinary variable that can be reassigned
  fig.width = 8,         # figure size
  fig.height = 6,
  fig.align = "center",
  warning = FALSE,       # keep warnings and messages out of the rendered output
  message = FALSE,
  echo = TRUE            # show the source code of each chunk
)
# Allow wider printed output so long vectors and tables wrap less.
options(width = 120)
# Load the iris data set from the base `datasets` package.
# The original line fused `attach(iris)` and `library(datasets)` into a single
# unparsable statement. `attach()` is intentionally not restored: every later
# use reaches the columns through `iris` explicitly (e.g. `iris$Species`), so
# nothing needs to be placed on the search path.
library(datasets)
data(iris)

# Preview the first six rows of the data.
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Scatter plot of petal length against petal width, one colour per species.
library(ggplot2)
ggplot(iris, aes(Petal.Length, Petal.Width, color = Species)) +
  geom_point()

# Seed the random number generator so the code that follows is reproducible.
# (This call was previously fused onto the end of the plotting expression,
# which made the line unparsable.)
set.seed(20)
# Fit k-means with three centres on the two petal measurements only
# (columns 3 and 4 of iris); nstart = 20 tries 20 random starts.
irisCluster <- kmeans(
  x = iris[, c("Petal.Length", "Petal.Width")],
  centers = 3,
  nstart = 20
)

# Show the fitted object.
irisCluster
## K-means clustering with 3 clusters of sizes 50, 52, 48
##
## Cluster means:
## Petal.Length Petal.Width
## 1 1.462000 0.246000
## 2 4.269231 1.342308
## 3 5.595833 2.037500
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2
## [58] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3
## [115] 3 3 3 3 3 2 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3
##
## Within cluster sum of squares by cluster:
## [1] 2.02200 13.05769 16.29167
## (between_SS / total_SS = 94.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss" "betweenss" "size"
## [8] "iter" "ifault"
# Cross-tabulate the assigned cluster (rows) against the species (columns).
table(irisCluster[["cluster"]], iris[["Species"]])
##
## setosa versicolor virginica
## 1 50 0 0
## 2 0 48 4
## 3 0 2 46
# Store the cluster assignment as a factor (categorical) instead of numbers.
irisCluster$cluster <- as.factor(irisCluster$cluster)

# Cluster centres as a data frame, with the cluster number added as a column.
# `<-` is used throughout for assignment, matching the line above.
centers <- as.data.frame(irisCluster$centers)
# NOTE: this column is named `Species` but holds the numeric cluster id taken
# from the row names (1, 2, 3), not a species name. The name is kept because
# the rendered output below depends on it.
centers$Species <- as.numeric(rownames(centers))

colnames(centers)
## [1] "Petal.Length" "Petal.Width" "Species"
centers
## Petal.Length Petal.Width Species
## 1 1.462000 0.246000 1
## 2 4.269231 1.342308 2
## 3 5.595833 2.037500 3
# Draw the clusters with cluster::clusplot().
# NOTE(review): the whole `iris` data frame, including the `Species` label
# column, is passed as the data, whereas the clusters were fitted on columns
# 3:4 only. Confirm this is intended.
library(cluster)
clusplot(
  iris, irisCluster$cluster,
  color = TRUE, shade = TRUE,
  labels = 2, lines = 0
)

# Print the matrix of cluster centres. (This expression was previously fused
# onto the closing parenthesis of the clusplot() call, which made the line
# unparsable.)
irisCluster$centers
## Petal.Length Petal.Width
## 1 1.462000 0.246000
## 2 4.269231 1.342308
## 3 5.595833 2.037500
# Reshape the matrix of cluster centres into long format: one row per
# (cluster, variable) pair, with the centre value in `Freq`.
as.data.frame(
  as.table(irisCluster[["centers"]])
)
## Var1 Var2 Freq
## 1 1 Petal.Length 1.462000
## 2 2 Petal.Length 4.269231
## 3 3 Petal.Length 5.595833
## 4 1 Petal.Width 0.246000
## 5 2 Petal.Width 1.342308
## 6 3 Petal.Width 2.037500
# Record the R version, platform and package versions used for this run.
utils::sessionInfo()
## R version 3.3.3 (2017-03-06)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Sierra 10.12.3
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] cluster_2.0.6 ggplot2_2.2.1 knitr_1.15.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.10 digest_0.6.12 rprojroot_1.2 plyr_1.8.4 grid_3.3.3 gtable_0.2.0
## [7] backports_1.0.5 formatR_1.4 magrittr_1.5 evaluate_0.10 scales_0.4.1 stringi_1.1.3
## [13] lazyeval_0.2.0 rmarkdown_1.4 labeling_0.3 tools_3.3.3 stringr_1.2.0 munsell_0.4.3
## [19] yaml_2.1.14 colorspace_1.3-2 htmltools_0.3.5 tibble_1.3.0