From Data Smart by John Foreman
http://www.wiley.com/WileyCDA/WileyTitle/productCd-111866146X.html.
First check if skmeans is available, install if not.
if('skmeans' %in% rownames(installed.packages()) == FALSE) {
install.packages('skmeans', repos='http://cran.us.r-project.org', dependencies=TRUE)
}
library(skmeans)
## Warning: package 'skmeans' was built under R version 3.1.2
Now get crackin
#getwd()
winedata <- read.csv('data/WineKMC.csv')
ncol(winedata)
## [1] 107
nrow(winedata)
## [1] 32
winedata[is.na(winedata)] <- 0
winedata[1:5,1:10]
## Offer Mth Varietal MinQty Disc Origin PastPeak Adams Allen Anders
## 1 1 Jan Malbec 72 56 France FALSE 0 0 0
## 2 2 Jan Pinot Noir 72 17 France FALSE 0 0 0
## 3 3 Feb Espumante 144 32 Oregon TRUE 0 0 0
## 4 4 Feb Champagne 72 48 France TRUE 0 0 0
## 5 5 Feb Cab. Sauv. 144 44 NZ TRUE 0 0 0
winedata.transposed <- t(winedata[,8:107])
winedata.transposed[1:10, 1:10]
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## Adams 0 0 0 0 0 0 0 0 0 0
## Allen 0 0 0 0 0 0 0 0 1 0
## Anders 0 0 0 0 0 0 0 0 0 0
## Bailey 0 0 0 0 0 0 1 0 0 0
## Baker 0 0 0 0 0 0 1 0 0 1
## Barnes 0 0 0 0 0 0 0 0 0 1
## Bell 0 1 0 0 0 0 0 0 0 0
## Bennett 0 0 0 0 0 0 0 1 0 0
## Brooks 0 0 1 0 0 0 0 1 0 0
## Brown 0 0 0 0 0 0 1 0 0 0
Call skmeans on datatset, specifying five means, and use of genetic algorithm.
winedata.clusters <- skmeans(winedata.transposed, 5, method='genetic')
winedata.clusters
## A hard spherical k-means partition of 100 objects into 5 classes.
## Class sizes: 17, 17, 34, 16, 16
## Call: skmeans(x = winedata.transposed, k = 5, method = "genetic")
Using str, see that actual cluster assignments are stored within ‘cluster’ list of object.
str(winedata.clusters)
## List of 7
## $ prototypes: num [1:5, 1:32] 0.1295 0.09 0 0.0974 0.1458 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:5] "1" "2" "3" "4" ...
## .. ..$ : NULL
## $ membership: NULL
## $ cluster : Named int [1:100] 3 5 2 3 4 4 2 3 1 3 ...
## ..- attr(*, "names")= chr [1:100] "Adams" "Allen" "Anders" "Bailey" ...
## $ family :List of 7
## ..$ description: chr "spherical k-means"
## ..$ D :function (x, prototypes)
## ..$ C :function (x, weights, control)
## ..$ init :function (x, k)
## ..$ e : num 1
## ..$ .modify : NULL
## ..$ .subset : NULL
## ..- attr(*, "class")= chr "pclust_family"
## $ m : num 1
## $ value : num 38.3
## $ call : language skmeans(x = winedata.transposed, k = 5, method = "genetic")
## - attr(*, "class")= chr [1:2] "skmeans" "pclust"
winedata.clusters$cluster[4]
## Bailey
## 3
winedata.clusters$cluster[which(row.names(winedata.transposed) == 'Wright')]
## Wright
## 5
aggregate(winedata.transposed, by=list(winedata.clusters$cluster), sum)
## Group.1 V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18
## 1 1 3 3 4 3 0 3 2 2 0 0 8 3 0 1 1 0 0 0
## 2 2 2 7 0 0 0 0 0 0 0 1 0 1 0 0 0 1 7 0
## 3 3 0 0 1 0 0 1 15 16 0 2 0 1 6 0 0 0 0 13
## 4 4 2 0 1 3 0 5 2 2 2 4 2 0 0 2 2 1 0 1
## 5 5 3 0 0 6 4 3 0 0 8 0 3 0 0 6 3 3 0 0
## V19 V20 V21 V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32
## 1 2 1 1 13 0 0 4 0 3 4 0 4 1 1
## 2 0 0 0 0 1 12 0 12 1 0 0 0 0 0
## 3 0 0 1 0 0 0 0 0 1 1 16 16 0 0
## 4 3 2 1 6 1 0 0 0 1 1 1 1 16 2
## 5 0 3 1 2 3 0 2 3 3 0 0 1 0 1
winedata.clustercounts <- t(aggregate(winedata.transposed,
by=list(winedata.clusters$cluster),
sum) [,2:33])
winedata.clustercounts
## [,1] [,2] [,3] [,4] [,5]
## V1 3 2 0 2 3
## V2 3 7 0 0 0
## V3 4 0 1 1 0
## V4 3 0 0 3 6
## V5 0 0 0 0 4
## V6 3 0 1 5 3
## V7 2 0 15 2 0
## V8 2 0 16 2 0
## V9 0 0 0 2 8
## V10 0 1 2 4 0
## V11 8 0 0 2 3
## V12 3 1 1 0 0
## V13 0 0 6 0 0
## V14 1 0 0 2 6
## V15 1 0 0 2 3
## V16 0 1 0 1 3
## V17 0 7 0 0 0
## V18 0 0 13 1 0
## V19 2 0 0 3 0
## V20 1 0 0 2 3
## V21 1 0 1 1 1
## V22 13 0 0 6 2
## V23 0 1 0 1 3
## V24 0 12 0 0 0
## V25 4 0 0 0 2
## V26 0 12 0 0 3
## V27 3 1 1 1 3
## V28 4 0 1 1 0
## V29 0 0 16 1 0
## V30 4 0 16 1 1
## V31 1 0 0 16 0
## V32 1 0 0 2 1
winedata.desc.plus.counts <- cbind(winedata[,1:7], winedata.clustercounts)
winedata.desc.plus.counts
## Offer Mth Varietal MinQty Disc Origin PastPeak 1 2 3 4 5
## V1 1 Jan Malbec 72 56 France FALSE 3 2 0 2 3
## V2 2 Jan Pinot Noir 72 17 France FALSE 3 7 0 0 0
## V3 3 Feb Espumante 144 32 Oregon TRUE 4 0 1 1 0
## V4 4 Feb Champagne 72 48 France TRUE 3 0 0 3 6
## V5 5 Feb Cab. Sauv. 144 44 NZ TRUE 0 0 0 0 4
## V6 6 Mar Prosecco 144 86 Chile FALSE 3 0 1 5 3
## V7 7 Mar Prosecco 6 40 Australia TRUE 2 0 15 2 0
## V8 8 Mar Espumante 6 45 S. Africa FALSE 2 0 16 2 0
## V9 9 Apr Chardonnay 144 57 Chile FALSE 0 0 0 2 8
## V10 10 Apr Prosecco 72 52 CA FALSE 0 1 2 4 0
## V11 11 May Champagne 72 85 France FALSE 8 0 0 2 3
## V12 12 May Prosecco 72 83 Australia FALSE 3 1 1 0 0
## V13 13 May Merlot 6 43 Chile FALSE 0 0 6 0 0
## V14 14 Jun Merlot 72 64 Chile FALSE 1 0 0 2 6
## V15 15 Jun Cab. Sauv. 144 19 Italy FALSE 1 0 0 2 3
## V16 16 Jun Merlot 72 88 CA FALSE 0 1 0 1 3
## V17 17 Jul Pinot Noir 12 47 Germany FALSE 0 7 0 0 0
## V18 18 Jul Espumante 6 50 Oregon FALSE 0 0 13 1 0
## V19 19 Jul Champagne 12 66 Germany FALSE 2 0 0 3 0
## V20 20 Aug Cab. Sauv. 72 82 Italy FALSE 1 0 0 2 3
## V21 21 Aug Champagne 12 50 CA FALSE 1 0 1 1 1
## V22 22 Aug Champagne 72 63 France FALSE 13 0 0 6 2
## V23 23 Sept Chardonnay 144 39 S. Africa FALSE 0 1 0 1 3
## V24 24 Sept Pinot Noir 6 34 Italy FALSE 0 12 0 0 0
## V25 25 Oct Cab. Sauv. 72 59 Oregon TRUE 4 0 0 0 2
## V26 26 Oct Pinot Noir 144 83 Australia FALSE 0 12 0 0 3
## V27 27 Oct Champagne 72 88 NZ FALSE 3 1 1 1 3
## V28 28 Nov Cab. Sauv. 12 56 France TRUE 4 0 1 1 0
## V29 29 Nov P. Grigio 6 87 France FALSE 0 0 16 1 0
## V30 30 Dec Malbec 6 54 France FALSE 4 0 16 1 1
## V31 31 Dec Champagne 72 89 France FALSE 1 0 0 16 0
## V32 32 Dec Cab. Sauv. 72 45 Germany TRUE 1 0 0 2 1
winedata.desc.plus.counts[order(-winedata.desc.plus.counts[,8]),]
## Offer Mth Varietal MinQty Disc Origin PastPeak 1 2 3 4 5
## V22 22 Aug Champagne 72 63 France FALSE 13 0 0 6 2
## V11 11 May Champagne 72 85 France FALSE 8 0 0 2 3
## V3 3 Feb Espumante 144 32 Oregon TRUE 4 0 1 1 0
## V25 25 Oct Cab. Sauv. 72 59 Oregon TRUE 4 0 0 0 2
## V28 28 Nov Cab. Sauv. 12 56 France TRUE 4 0 1 1 0
## V30 30 Dec Malbec 6 54 France FALSE 4 0 16 1 1
## V1 1 Jan Malbec 72 56 France FALSE 3 2 0 2 3
## V2 2 Jan Pinot Noir 72 17 France FALSE 3 7 0 0 0
## V4 4 Feb Champagne 72 48 France TRUE 3 0 0 3 6
## V6 6 Mar Prosecco 144 86 Chile FALSE 3 0 1 5 3
## V12 12 May Prosecco 72 83 Australia FALSE 3 1 1 0 0
## V27 27 Oct Champagne 72 88 NZ FALSE 3 1 1 1 3
## V7 7 Mar Prosecco 6 40 Australia TRUE 2 0 15 2 0
## V8 8 Mar Espumante 6 45 S. Africa FALSE 2 0 16 2 0
## V19 19 Jul Champagne 12 66 Germany FALSE 2 0 0 3 0
## V14 14 Jun Merlot 72 64 Chile FALSE 1 0 0 2 6
## V15 15 Jun Cab. Sauv. 144 19 Italy FALSE 1 0 0 2 3
## V20 20 Aug Cab. Sauv. 72 82 Italy FALSE 1 0 0 2 3
## V21 21 Aug Champagne 12 50 CA FALSE 1 0 1 1 1
## V31 31 Dec Champagne 72 89 France FALSE 1 0 0 16 0
## V32 32 Dec Cab. Sauv. 72 45 Germany TRUE 1 0 0 2 1
## V5 5 Feb Cab. Sauv. 144 44 NZ TRUE 0 0 0 0 4
## V9 9 Apr Chardonnay 144 57 Chile FALSE 0 0 0 2 8
## V10 10 Apr Prosecco 72 52 CA FALSE 0 1 2 4 0
## V13 13 May Merlot 6 43 Chile FALSE 0 0 6 0 0
## V16 16 Jun Merlot 72 88 CA FALSE 0 1 0 1 3
## V17 17 Jul Pinot Noir 12 47 Germany FALSE 0 7 0 0 0
## V18 18 Jul Espumante 6 50 Oregon FALSE 0 0 13 1 0
## V23 23 Sept Chardonnay 144 39 S. Africa FALSE 0 1 0 1 3
## V24 24 Sept Pinot Noir 6 34 Italy FALSE 0 12 0 0 0
## V26 26 Oct Pinot Noir 144 83 Australia FALSE 0 12 0 0 3
## V29 29 Nov P. Grigio 6 87 France FALSE 0 0 16 1 0