library(gdata)
dat <- read.xls("./nutrffq_dat.xls")
## Change variables names to lower cases
names(dat) <- tolower(names(dat))
## Data manipulation
dat <- within(dat, {
bmi25 <- factor(bmi25)
})
## Extract variables for analysis
vars <- names(dat)[c(2,13:37)]
## Try 1 to 20 clusters
num.clusters <- 1:20
## Try K-means clustering with 1 to 20 centers
res.kmeans <- lapply(num.clusters, function(i) {
kmeans(x = dat[,vars], centers = i)
})
## Get within cluster sum of squares
res.within.ss <- sapply(res.kmeans, function(x) sum(x$withinss))
## Plot them against number of clusters
plot(num.clusters, res.within.ss, type = "b", xlab = "Number of clusters", ylab = "Within cluster sum of squares")
## Put the clustering result in the dataset
dat$kmeans3 <- res.kmeans[[3]]$cluster
pam(x, k, diss = inherits(x, "dist"), metric = "euclidean",
medoids = NULL, stand = FALSE, cluster.only = FALSE,
do.swap = TRUE,
keep.diss = !diss && !cluster.only && n < 100,
keep.data = !diss && !cluster.only,
pamonce = FALSE, trace.lev = 0)
## Load cluster package
library(cluster)
## Perform 3-cluster K-medoid using manhattan distance
res.pam <- pam(x = dat[,vars], k = 3, metric = "manhattan")
## Put the result in the dataset
dat$kmedoid3 <- res.pam$clustering
Classify into mixture of normal distributions.
Reference: http://en.wikibooks.org/wiki/Data_Mining_Algorithms_In_R/Clustering/Expectation_Maximization_(EM)
library(mclust)
## Number of cluster is automatically decided?
res.Mclust <- Mclust(dat[,vars])
res.Mclust
'Mclust' model object:
best model: ellipsoidal, varying volume, shape, and orientation (VVV) with 5 components
## Show number of clusters
summary(res.Mclust)
----------------------------------------------------
Gaussian finite mixture model fitted by EM algorithm
----------------------------------------------------
Mclust VVV (ellipsoidal, varying volume, shape, and orientation) model with 5 components:
log.likelihood n df BIC
-225900 2143 1889 -466288
Clustering table:
1 2 3 4 5
399 323 32 119 1270
plot(res.Mclust, what = "BIC")
plot(res.Mclust, what = "classification", dimens = c(1,2))
plot(res.Mclust, what = "uncertainty", dimens = c(1,2))
plot(res.Mclust, what = "density")
method: the agglomeration method to be used. This should be (an
unambiguous abbreviation of) one of ‘"ward"’, ‘"single"’,
‘"complete"’, ‘"average"’, ‘"mcquitty"’, ‘"median"’ or
‘"centroid"’.
Classification of observations
## Create a distance matrix
dist.mat <- dist(dat[,vars], method = "euclidean", diag = FALSE, upper = FALSE, p = 2)
## Perform hierarchical clustering with Ward, single, and complete methods
res.hclust <- lapply(c("ward","single","complete"),
function(METHOD) {
hclust(dist.mat, method = METHOD)
})
res.hclust
[[1]]
Call:
hclust(d = dist.mat, method = METHOD)
Cluster method : ward
Distance : euclidean
Number of objects: 2143
[[2]]
Call:
hclust(d = dist.mat, method = METHOD)
Cluster method : single
Distance : euclidean
Number of objects: 2143
[[3]]
Call:
hclust(d = dist.mat, method = METHOD)
Cluster method : complete
Distance : euclidean
Number of objects: 2143
## Plot
junk <- lapply(res.hclust, function(res) plot(res, cex = 0.001, lwd = 0.1))
## Get 3-cluster classification using cutree()
res.hclust.k3 <- sapply(res.hclust, function(res) cutree(res, k = 3))
res.hclust.k3 <- data.frame(res.hclust.k3)
names(res.hclust.k3) <- c("ward","single","complete")
dat <- cbind(dat, res.hclust.k3)
Classification of variables
dist.mat.t <- dist(t(as.matrix(dat[,vars])), method = "euclidean", diag = FALSE, upper = FALSE, p = 2)
res.hclust.t <- lapply(c("ward","single","complete"),
function(METHOD) {
hclust(dist.mat.t, method = METHOD)
})
junk <- lapply(res.hclust.t, function(res) plot(res))
kmeans3 and ward (left column, 2nd from top) appear to have good correlation.
single and complete tend to classify most people in one cluster, and not very useful here.
## Method names
clusters <- c("kmeans3", "kmedoid3", "ward", "single", "complete")
## ## All possible pairs
## cluster.pairs <- t(combn(clusters, 2))
## cluster.pairs
## ## Create formulas
## formulas <- apply(cluster.pairs, 1,
## function(X) paste("~", paste(X, collapse = " + ")))
## Create matrix of pairs (this is simpler for two-element combinations)
cluster.pairs <- outer(clusters, clusters, paste, sep = " + ")
## Take upper triangular matrix elements only, and create formulas
formulas <- paste("~", cluster.pairs[upper.tri(cluster.pairs, diag = F)])
formulas
[1] "~ kmeans3 + kmedoid3" "~ kmeans3 + ward" "~ kmedoid3 + ward" "~ kmeans3 + single"
[5] "~ kmedoid3 + single" "~ ward + single" "~ kmeans3 + complete" "~ kmedoid3 + complete"
[9] "~ ward + complete" "~ single + complete"
## Create 3x3 tables from formulas
list.of.xtabs <- lapply(formulas,
function(formula) {
formula <- as.formula(formula)
xtabs <- xtabs(formula, data = dat)
})
## Plot them
layout(matrix(1:10, ncol = 2))
par(mar = c(2,2,2,2))
junk <- lapply(list.of.xtabs, mosaicplot, color = 3:5 , main = "")