## Set working directory
# NOTE(review): setwd() changes the working directory for the whole session,
# so the script only runs where this path exists. The read.csv() call further
# down already passes a full "~/CST-425/..." path -- confirm whether anything
# else depends on the working directory before removing this call.
setwd("~/CST-425")
##clean data
Here are the first 10 rows of the data set, which has 14 columns (V1 to V14). There are no headers in this data set.
# Read the wine data. The file has no header row, so header = FALSE keeps the
# first line as data and R assigns default column names.
data <- read.csv(file = "~/CST-425/wine.data.csv", header = FALSE)
# Preview the first 10 rows.
head(data, n = 10)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14
## 1 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
## 2 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
## 3 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
## 4 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
## 5 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93 735
## 6 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05 2.85 1450
## 7 1 14.39 1.87 2.45 14.6 96 2.50 2.52 0.30 1.98 5.25 1.02 3.58 1290
## 8 1 14.06 2.15 2.61 17.6 121 2.60 2.51 0.31 1.25 5.05 1.06 3.58 1295
## 9 1 14.83 1.64 2.17 14.0 97 2.80 2.98 0.29 1.98 5.20 1.08 2.85 1045
## 10 1 13.86 1.35 2.27 16.0 98 2.98 3.15 0.22 1.85 7.22 1.01 3.55 1045
##plot data
Here are the plots of the example data, clustered with k-means. The first plot shows the data split into two clusters; the second shows the same data split into five clusters.
# NOT RUN {
# Load graphics with library() rather than require(): library() raises an
# error if the package cannot be loaded, whereas require() only returns FALSE
# and lets the script carry on until a later, more confusing failure.
library(graphics)
# A 2-dimensional example: two groups of 50 points each, the first drawn with
# mean 0 and the second with mean 1 (standard deviation 0.3 in both).
x <- rbind(
  matrix(rnorm(100, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2)
)
colnames(x) <- c("x", "y")
# Fit k-means with 2 clusters; the outer parentheses print the fitted object.
(cl <- kmeans(x, 2))
## K-means clustering with 2 clusters of sizes 48, 52
##
## Cluster means:
## x y
## 1 1.03058307 0.99792776
## 2 -0.06195971 0.06808572
##
## Clustering vector:
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## Within cluster sum of squares by cluster:
## [1] 8.884806 10.510227
## (between_SS / total_SS = 72.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Scatter plot of the points, coloured by the cluster each one was assigned to.
plot(x, col = cl[["cluster"]])
# Overlay the two cluster centres, drawn larger (cex = 2) with symbol pch = 8.
points(cl[["centers"]], col = seq_len(2), pch = 8, cex = 2)
# Sum of squares: centre each column of x on its mean (without rescaling),
# then add up the squared deviations.
ss <- function(x) {
  centered <- scale(x, center = TRUE, scale = FALSE)
  sum(centered^2)
}
## Cluster centre "fitted" to each observation.
fitted.x <- fitted(cl)
# Show the first few rows.
head(fitted.x)
## x y
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
# Residuals: each observation minus the centre fitted to it.
resid.x <- x - fitted(cl)
## Equalities : ----------------------------------
# The sums of squares stored on the fit (first column) next to the same
# quantities recomputed with ss() (second column); the two columns agree.
cbind(
  cl[c("betweenss", "tot.withinss", "totss")],
  c(ss(fitted.x), ss(resid.x), ss(x))
)
## [,1] [,2]
## betweenss 51.37407 51.37407
## tot.withinss 19.39503 19.39503
## totss 70.7691 70.7691
# Assert that the sums of squares stored on the fit match those recomputed
# with ss().
stopifnot(
  all.equal(cl$totss, ss(x)),
  all.equal(cl$tot.withinss, ss(resid.x)),
  ## the next two checks equate three expressions for the same quantity:
  all.equal(cl$betweenss, ss(fitted.x)),
  all.equal(cl$betweenss, cl$totss - cl$tot.withinss),
  ## and hence also
  all.equal(ss(x), ss(fitted.x) + ss(resid.x))
)
# Trivial one-cluster fit: its within-cluster sum of squares equals ss(x).
kmeans(x, centers = 1)$withinss
## [1] 70.7691
## With too many clusters requested, random starts do help here
## (and are often recommended anyway), so use 25 of them.
## The outer parentheses print the fitted object.
(cl <- kmeans(x, centers = 5, nstart = 25))
## K-means clustering with 5 clusters of sizes 21, 14, 26, 26, 13
##
## Cluster means:
## x y
## 1 1.1083290 0.7012580
## 2 -0.2768202 0.3615325
## 3 -0.1434266 -0.1895129
## 4 0.9927783 1.2452862
## 5 0.3664240 0.3233086
##
## Clustering vector:
## [1] 3 2 3 3 5 2 2 5 3 3 5 3 5 3 5 5 3 3 2 3 3 2 2 3 3 3 2 5 3 2 2 3 5 3 3 2 3
## [38] 5 5 3 2 2 3 2 3 3 3 3 2 3 4 4 1 1 1 1 1 4 4 1 4 1 4 5 1 1 4 4 4 4 5 4 4 4
## [75] 1 1 4 4 4 4 1 4 5 4 1 1 4 1 1 4 4 4 4 1 4 1 1 1 1 4
##
## Within cluster sum of squares by cluster:
## [1] 1.3539250 0.9415432 2.2498703 3.4650101 1.0635544
## (between_SS / total_SS = 87.2 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Scatter plot of the points, coloured by their cluster in the 5-cluster fit.
plot(x, col = cl[["cluster"]])
# Overlay the five cluster centres with symbol pch = 8.
points(cl[["centers"]], col = seq_len(5), pch = 8)
# }