test kmeans

##set working directory

# Switch the R session's working directory to the course folder.
# NOTE(review): the read.csv() call further down spells out the path from "~",
# so it does not appear to rely on this working directory - confirm whether
# this call is still needed.
setwd("~/CST-425")

##clean data

Here are the first 10 rows of the data set, which has 14 columns. The file has no header row, so R names the columns V1 through V14.

# Read the wine measurements; the file has no header row, so header = FALSE
# makes read.csv() treat the first line as data rather than column names.
data <- read.csv(file = "~/CST-425/wine.data.csv", header = FALSE)

# Preview the first ten rows.
head(data, n = 10)

##    V1    V2   V3   V4   V5  V6   V7   V8   V9  V10  V11  V12  V13  V14
## 1   1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.64 1.04 3.92 1065
## 2   1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.38 1.05 3.40 1050
## 3   1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.68 1.03 3.17 1185
## 4   1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.80 0.86 3.45 1480
## 5   1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.32 1.04 2.93  735
## 6   1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.75 1.05 2.85 1450
## 7   1 14.39 1.87 2.45 14.6  96 2.50 2.52 0.30 1.98 5.25 1.02 3.58 1290
## 8   1 14.06 2.15 2.61 17.6 121 2.60 2.51 0.31 1.25 5.05 1.06 3.58 1295
## 9   1 14.83 1.64 2.17 14.0  97 2.80 2.98 0.29 1.98 5.20 1.08 2.85 1045
## 10  1 13.86 1.35 2.27 16.0  98 2.98 3.15 0.22 1.85 7.22 1.01 3.55 1045

##plot data

Here are the plots of the data. The first plot shows the simulated data grouped into two clusters, and the second shows the same data grouped into five clusters.

# NOT RUN {
# graphics supplies plot()/points() used further down. library() stops with an
# error if the package cannot be attached, whereas require() only returns
# FALSE and lets the script carry on.
library(graphics)

# A 2-dimensional example: 50 points scattered around (0, 0) and 50 around
# (1, 1), every coordinate drawn with standard deviation 0.3.
# No seed is set in this section, so the numbers differ from run to run.
x <- rbind(
  matrix(rnorm(100, sd = 0.3), ncol = 2),
  matrix(rnorm(100, mean = 1, sd = 0.3), ncol = 2)
)
colnames(x) <- c("x", "y")

# Fit k-means with two centres; the outer parentheses print the fitted object.
(cl <- kmeans(x, centers = 2))

## K-means clustering with 2 clusters of sizes 48, 52
## 
## Cluster means:
##             x          y
## 1  1.03058307 0.99792776
## 2 -0.06195971 0.06808572
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 
## Within cluster sum of squares by cluster:
## [1]  8.884806 10.510227
##  (between_SS / total_SS =  72.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Scatter plot of the simulated points, coloured by assigned cluster.
plot(x, col = cl$cluster)
# Mark the two cluster centres with enlarged pch = 8 symbols, one colour each.
points(cl$centers, col = seq_len(2), pch = 8, cex = 2)

# Sum of squares of `x` about its column means: centre every column
# (without rescaling it) and add up the squared deviations.
ss <- function(x) {
  centred <- scale(x, center = TRUE, scale = FALSE)
  sum(centred^2)
}

## Cluster centres "fitted" to each observation: one row per observation,
## holding the centre of the cluster it was assigned to.
fitted.x <- fitted(cl)
head(fitted.x)

##             x          y
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572
## 2 -0.06195971 0.06808572

# Residuals: each observation minus the centre of its assigned cluster.
resid.x <- x - fitted(cl)

## Equalities : ----------------------------------
# Side-by-side table: the sums of squares stored in the kmeans object next to
# the same quantities recomputed with ss(). The two columns should agree.
cbind(
  cl[c("betweenss", "tot.withinss", "totss")],
  c(ss(fitted.x), ss(resid.x), ss(x))
)

##              [,1]     [,2]    
## betweenss    51.37407 51.37407
## tot.withinss 19.39503 19.39503
## totss        70.7691  70.7691

# Stop if the sums of squares stored in the kmeans object disagree
# (beyond all.equal()'s tolerance) with the values recomputed via ss().
stopifnot(
  all.equal(cl$totss, ss(x)),
  all.equal(cl$tot.withinss, ss(resid.x)),
  ## these three are the same:
  all.equal(cl$betweenss, ss(fitted.x)),
  all.equal(cl$betweenss, cl$totss - cl$tot.withinss),
  ## and hence also
  all.equal(ss(x), ss(fitted.x) + ss(resid.x))
)

# Trivial one-cluster fit: its within-cluster sum of squares equals ss(x).
kmeans(x, centers = 1)$withinss

## [1] 70.7691

## Asking for too many clusters (five here): multiple random starts do help
## in this situation, and are often recommended anyway, so use 25 of them.
## The outer parentheses print the refitted object.
(cl <- kmeans(x, centers = 5, nstart = 25))

## K-means clustering with 5 clusters of sizes 21, 14, 26, 26, 13
## 
## Cluster means:
##            x          y
## 1  1.1083290  0.7012580
## 2 -0.2768202  0.3615325
## 3 -0.1434266 -0.1895129
## 4  0.9927783  1.2452862
## 5  0.3664240  0.3233086
## 
## Clustering vector:
##   [1] 3 2 3 3 5 2 2 5 3 3 5 3 5 3 5 5 3 3 2 3 3 2 2 3 3 3 2 5 3 2 2 3 5 3 3 2 3
##  [38] 5 5 3 2 2 3 2 3 3 3 3 2 3 4 4 1 1 1 1 1 4 4 1 4 1 4 5 1 1 4 4 4 4 5 4 4 4
##  [75] 1 1 4 4 4 4 1 4 5 4 1 1 4 1 1 4 4 4 4 1 4 1 1 1 1 4
## 
## Within cluster sum of squares by cluster:
## [1] 1.3539250 0.9415432 2.2498703 3.4650101 1.0635544
##  (between_SS / total_SS =  87.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

# Redraw the scatter plot, now coloured by the five-cluster assignment.
plot(x, col = cl$cluster)
# Add the five cluster centres as pch = 8 symbols, one colour per cluster.
points(cl$centers, col = seq_len(5), pch = 8)

# }

test kmeans

Ben Hebbel

9/22/2020