EPI 288 Lecture 2: Cluster analysis

Cluster analysis

Load nutrition dataset

library(gdata)
dat <- read.xls("./nutrffq_dat.xls")

## Change variable names to lower case
names(dat) <- tolower(names(dat))

## Data manipulation
dat <- within(dat, {
    bmi25 <- factor(bmi25)
})

## Extract variables for analysis
vars <- names(dat)[c(2,13:37)]
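
The FFQ variables enter the analysis on their original scales. Distance-based clustering is sensitive to measurement units, so a standardized copy of the variables is sometimes used instead (a hedged sketch, not done in this lecture; dat.std is an illustrative name):

## Optional: standardize the analysis variables to mean 0, SD 1
dat.std <- as.data.frame(scale(dat[,vars]))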

K-means clustering using kmeans()

## Try 1 to 20 clusters
num.clusters <- 1:20

## Try K-means clustering with 1 to 20 centers
res.kmeans <- lapply(num.clusters, function(i) {
    kmeans(x = dat[,vars], centers = i)
})
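
Note that kmeans() starts from randomly chosen centers, so the classification can change between runs. For reproducible and more stable results, a seed can be fixed and multiple random starts requested per k (a minimal sketch; set.seed() and the nstart argument are standard, and the seed value is arbitrary):

## Reproducible variant: fix the seed and use 25 random starts per k
set.seed(288)
res.kmeans <- lapply(num.clusters, function(i) {
    kmeans(x = dat[,vars], centers = i, nstart = 25)
})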

## Get within cluster sum of squares
res.within.ss <- sapply(res.kmeans, function(x) sum(x$withinss))

## Plot them against number of clusters
plot(num.clusters, res.within.ss, type = "b", xlab = "Number of clusters", ylab = "Within cluster sum of squares")

[Figure: within-cluster sum of squares vs. number of clusters]


## Put the 3-cluster K-means classification in the dataset
dat$kmeans3 <- res.kmeans[[3]]$cluster

K-medoids clustering using cluster::pam()

The usage of pam(), from its help page:

pam(x, k, diss = inherits(x, "dist"), metric = "euclidean",
    medoids = NULL, stand = FALSE, cluster.only = FALSE,
    do.swap = TRUE,
    keep.diss = !diss && !cluster.only && n < 100,
    keep.data = !diss && !cluster.only,
    pamonce = FALSE, trace.lev = 0)

## Load cluster package
library(cluster)

## Perform 3-cluster K-medoids clustering using Manhattan distance
res.pam <- pam(x = dat[,vars], k = 3, metric = "manhattan")

## Put the result in the dataset
dat$kmedoid3 <- res.pam$clustering
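
A quick check on the quality of the partition is the average silhouette width, which pam() stores in its result (a minimal sketch; silhouette() and its plot method are part of the cluster package):

## Average silhouette width of the 3-cluster solution
res.pam$silinfo$avg.width
## Per-observation silhouette plot
plot(silhouette(res.pam))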

EM clustering using mclust::Mclust()

Classify observations by fitting a mixture of multivariate normal distributions; each fitted component corresponds to one cluster.

Reference: http://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Clustering/Expectation_Maximization_(EM)
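
By default Mclust() compares models with 1 through 9 components and picks the best by BIC. If a fixed number of clusters is wanted, for example to compare with the 3-cluster solutions above, the G argument can be set explicitly (a hedged sketch; res.Mclust3 is an illustrative name):

## Hypothetical variant: force exactly 3 components
res.Mclust3 <- Mclust(dat[,vars], G = 3)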

library(mclust)

## The number of clusters is selected automatically (by BIC)
res.Mclust <- Mclust(dat[,vars])
res.Mclust
'Mclust' model object:
 best model: ellipsoidal, varying volume, shape, and orientation (VVV) with 5 components

## Show number of clusters
summary(res.Mclust)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm 
----------------------------------------------------

Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 5 components:

 log.likelihood    n   df     BIC
        -225900 2143 1889 -466288

Clustering table:
   1    2    3    4    5 
 399  323   32  119 1270 
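
As with the other methods, the most likely component for each observation can be stored in the dataset (a minimal sketch; res.Mclust$classification is part of the Mclust result, while the column name mclust5 is illustrative):

## Put the 5-component EM classification in the dataset
dat$mclust5 <- res.Mclust$classification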

plot(res.Mclust, what = "BIC")

[Figure: BIC for each model type by number of components]


plot(res.Mclust, what = "classification", dimens = c(1,2))

[Figure: classification plotted on the first two variables]


plot(res.Mclust, what = "uncertainty", dimens = c(1,2))

[Figure: classification uncertainty plotted on the first two variables]

plot(res.Mclust, what = "density")

[Figure: estimated mixture density]

Hierarchical clustering with hclust()

From the hclust() help page:

  method: the agglomeration method to be used. This should be (an
          unambiguous abbreviation of) one of ‘"ward"’, ‘"single"’,
          ‘"complete"’, ‘"average"’, ‘"mcquitty"’, ‘"median"’ or
          ‘"centroid"’.

Classification of observations

## Create a distance matrix
dist.mat <- dist(dat[,vars], method = "euclidean", diag = FALSE, upper = FALSE, p = 2)

## Perform hierarchical clustering with Ward, single, and complete methods
res.hclust <- lapply(c("ward","single","complete"),
                     function(METHOD) {
                         hclust(dist.mat, method = METHOD)
                     })
res.hclust
[[1]]

Call:
hclust(d = dist.mat, method = METHOD)

Cluster method   : ward 
Distance         : euclidean 
Number of objects: 2143 


[[2]]

Call:
hclust(d = dist.mat, method = METHOD)

Cluster method   : single 
Distance         : euclidean 
Number of objects: 2143 


[[3]]

Call:
hclust(d = dist.mat, method = METHOD)

Cluster method   : complete 
Distance         : euclidean 
Number of objects: 2143 

## Plot
junk <- lapply(res.hclust, function(res) plot(res, cex = 0.001, lwd = 0.1))

[Figure: dendrograms for the Ward, single, and complete linkage trees]


## Get 3-cluster classification using cutree()
res.hclust.k3        <- sapply(res.hclust, function(res) cutree(res, k = 3))
res.hclust.k3        <- data.frame(res.hclust.k3)
names(res.hclust.k3) <- c("ward","single","complete")
dat                  <- cbind(dat, res.hclust.k3)
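
The chosen cut can also be drawn directly on a dendrogram with rect.hclust() (a minimal sketch using the Ward tree, res.hclust[[1]]):

## Outline the 3-cluster solution on the Ward dendrogram
plot(res.hclust[[1]], cex = 0.001, lwd = 0.1)
rect.hclust(res.hclust[[1]], k = 3)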

Classification of variables

dist.mat.t <- dist(t(as.matrix(dat[,vars])), method = "euclidean", diag = FALSE, upper = FALSE, p = 2)

res.hclust.t <- lapply(c("ward","single","complete"),
                       function(METHOD) {
                           hclust(dist.mat.t, method = METHOD)
                       })

junk <- lapply(res.hclust.t, function(res) plot(res))

[Figure: variable dendrograms for the Ward, single, and complete methods]
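
Euclidean distance between variables is driven largely by their scales. For clustering variables, a correlation-based dissimilarity is a common alternative, placing variables that move together close to each other regardless of units (a hedged sketch; dist.mat.cor is an illustrative name):

## Hypothetical alternative: 1 - correlation as the dissimilarity
dist.mat.cor <- as.dist(1 - cor(as.matrix(dat[,vars])))
plot(hclust(dist.mat.cor, method = "complete"))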

Comparison of classifications

kmeans3 and ward (left column, 2nd from top) appear to agree well.

The single and complete methods tend to place most people in one cluster, so they are not very useful here.

## Method names
clusters <- c("kmeans3", "kmedoid3", "ward", "single", "complete")

## ## All possible pairs
## cluster.pairs <- t(combn(clusters, 2))
## cluster.pairs
## ## Create formulas
## formulas <- apply(cluster.pairs, 1,
##                   function(X) paste("~", paste(X, collapse = " + ")))

## Create matrix of pairs (this is simpler for two-element combinations)
cluster.pairs <- outer(clusters, clusters, paste, sep = " + ")
## Take upper triangular matrix elements only, and create formulas
formulas      <- paste("~", cluster.pairs[upper.tri(cluster.pairs, diag = FALSE)])
formulas
 [1] "~ kmeans3 + kmedoid3"  "~ kmeans3 + ward"      "~ kmedoid3 + ward"     "~ kmeans3 + single"   
 [5] "~ kmedoid3 + single"   "~ ward + single"       "~ kmeans3 + complete"  "~ kmedoid3 + complete"
 [9] "~ ward + complete"     "~ single + complete"  

## Create 3x3 tables from formulas
list.of.xtabs <- lapply(formulas,
                        function(formula) {
                            xtabs(as.formula(formula), data = dat)
                        })

## Plot them
layout(matrix(1:10, ncol = 2))
par(mar = c(2,2,2,2))
junk <- lapply(list.of.xtabs, mosaicplot, color = 3:5 , main = "")

[Figure: mosaic plots for each pair of cluster assignments]
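
The visual comparison can be backed by a numeric agreement measure. The mclust package provides adjustedRandIndex(), which equals 1 for identical partitions and is near 0 for chance-level agreement (a minimal sketch for one pair):

## Agreement between the K-means and Ward classifications
adjustedRandIndex(dat$kmeans3, dat$ward)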