Cluster Analysis

# NOTE: machine-specific absolute path; point this at the folder that holds
# protein.csv before running the practical on another computer.
setwd("C:/Users/01438475/Google Drive/UCTcourses/STA3022F/Practicals/2017/prac6 Cluster MDS")

# library() stops immediately if 'cluster' is not installed, whereas require()
# only returns FALSE and lets the script fail later at the first daisy() call.
library(cluster)

# The first column of protein.csv supplies the row labels; all remaining
# columns form the data matrix that is clustered.
protein <- read.csv("protein.csv")
X <- as.matrix(protein[, -1])   # or standardised data:  X <- as.matrix(scale(protein[, -1]))
rownames(X) <- protein[, 1]

CLUSTER ANALYSIS: Single linkage

Euclidean Distance

# Single-linkage hierarchical clustering of the rows of X, using Euclidean
# dissimilarities; the dendrogram is drawn to help choose a cut.
out.single.euc <- hclust(
  d = daisy(X, metric = "euclidean"),
  method = "single"
)
plot(out.single.euc)

Decide to cut the tree at height 9

# Cut the dendrogram at height 9. The hclust object is overwritten by the
# resulting vector of cluster memberships.
out.single.euc <- cutree(tree = out.single.euc, h = 9)

View cluster allocation

# Label every membership with the first column of the data file, then list
# the observations ordered by cluster number.
names(out.single.euc) <- protein[[1]]
sort(out.single.euc)
##        Albania        Austria        Belgium        Denmark      E Germany 
##              1              2              2              2              2 
##         France        Ireland    Netherlands         Norway         Sweden 
##              2              2              2              2              2 
##    Switzerland             UK      W Germany       Bulgaria        Romania 
##              2              2              2              3              3 
##     Yugoslavia Czechoslovakia         Greece          Italy         Poland 
##              3              4              4              4              4 
##           USSR        Finland        Hungary       Portugal          Spain 
##              4              5              6              7              7

Manhattan Distance

# Single-linkage hierarchical clustering of the rows of X, using Manhattan
# (city-block) dissimilarities; the dendrogram is drawn to help choose a cut.
out.single.city <- hclust(
  d = daisy(X, metric = "manhattan"),
  method = "single"
)
plot(out.single.city)

Decide to cut the tree into 6 clusters

# Cut the dendrogram into 6 clusters. The hclust object is overwritten by
# the resulting vector of cluster memberships.
out.single.city <- cutree(tree = out.single.city, k = 6)

View cluster allocation

# Label every membership with the first column of the data file, then list
# the observations ordered by cluster number.
names(out.single.city) <- protein[[1]]
sort(out.single.city)
##        Albania        Austria        Belgium Czechoslovakia        Denmark 
##              1              2              2              2              2 
##      E Germany        Finland         France         Greece        Ireland 
##              2              2              2              2              2 
##          Italy    Netherlands         Norway         Poland         Sweden 
##              2              2              2              2              2 
##    Switzerland             UK      W Germany       Bulgaria        Romania 
##              2              2              2              3              3 
##     Yugoslavia        Hungary       Portugal          Spain           USSR 
##              3              4              5              5              6

Correlation Distance

# Single-linkage hierarchical clustering on correlation-based dissimilarities
# between the rows of X (t(X) puts the observations in columns for cor()).
# A correlation is a similarity (1 = identical profile), so it is turned into
# a dissimilarity with 1 - r. Passing the raw correlations to as.dist() would
# treat the most alike observations as the furthest apart.
out.single.cor <- hclust(as.dist(1 - cor(t(X))), method = "single")
plot(out.single.cor)

CLUSTER ANALYSIS: Complete linkage

Euclidean Distance

# Complete-linkage hierarchical clustering of the rows of X, using Euclidean
# dissimilarities; the dendrogram is drawn to help choose a cut.
out.complete.euc <- hclust(
  d = daisy(X, metric = "euclidean"),
  method = "complete"
)
plot(out.complete.euc)

Decide to cut the tree into 5 clusters

# Cut the dendrogram into 5 clusters. The hclust object is overwritten by
# the resulting vector of cluster memberships.
out.complete.euc <- cutree(tree = out.complete.euc, k = 5)

View cluster allocation

# Label every membership with the first column of the data file, then list
# the observations ordered by cluster number.
names(out.complete.euc) <- protein[[1]]
sort(out.complete.euc)
##        Albania Czechoslovakia         Greece        Hungary          Italy 
##              1              1              1              1              1 
##         Poland           USSR        Austria        Belgium         France 
##              1              1              2              2              2 
##        Ireland    Netherlands    Switzerland             UK      W Germany 
##              2              2              2              2              2 
##       Bulgaria        Romania     Yugoslavia        Denmark        Finland 
##              3              3              3              4              4 
##         Norway         Sweden      E Germany       Portugal          Spain 
##              4              4              5              5              5

Manhattan Distance

# Complete-linkage hierarchical clustering of the rows of X, using Manhattan
# (city-block) dissimilarities; the dendrogram is drawn to help choose a cut.
out.complete.city <- hclust(
  d = daisy(X, metric = "manhattan"),
  method = "complete"
)
plot(out.complete.city)

Decide to cut the tree at a height of 40

# Cut the dendrogram at height 40. The hclust object is overwritten by the
# resulting vector of cluster memberships.
out.complete.city <- cutree(tree = out.complete.city, h = 40)

View cluster allocation

# Label every membership with the first column of the data file, then list
# the observations ordered by cluster number.
names(out.complete.city) <- protein[[1]]
sort(out.complete.city)
##        Albania       Bulgaria        Hungary        Romania     Yugoslavia 
##              1              1              1              1              1 
##        Austria        Belgium        Denmark        Finland         France 
##              2              2              2              2              2 
##        Ireland    Netherlands         Norway         Sweden    Switzerland 
##              2              2              2              2              2 
##             UK      W Germany Czechoslovakia      E Germany         Poland 
##              2              2              3              3              3 
##         Greece          Italy           USSR       Portugal          Spain 
##              4              4              4              5              5

Correlation Distance

# Complete-linkage hierarchical clustering on correlation-based
# dissimilarities (1 - r) between the rows of X; t(X) puts the observations
# in columns so that cor() correlates observations rather than variables.
out.complete.cor <- hclust(
  d = as.dist(1 - cor(t(X))),
  method = "complete"
)
plot(out.complete.cor)

Decide to cut the tree into 5 clusters

# Cut the dendrogram into 5 clusters. The hclust object is overwritten by
# the resulting vector of cluster memberships.
out.complete.cor <- cutree(tree = out.complete.cor, k = 5)

View cluster allocation

# Label every membership with the first column of the data file, then list
# the observations ordered by cluster number.
names(out.complete.cor) <- protein[[1]]
sort(out.complete.cor)
##        Albania       Bulgaria         Greece        Hungary          Italy 
##              1              1              1              1              1 
##        Romania          Spain           USSR     Yugoslavia        Austria 
##              1              1              1              1              2 
## Czechoslovakia      E Germany         Poland        Belgium         France 
##              2              2              2              3              3 
##        Ireland    Netherlands    Switzerland             UK      W Germany 
##              3              3              3              3              3 
##        Denmark        Finland         Norway         Sweden       Portugal 
##              4              4              4              4              5

CLUSTER ANALYSIS: Centroid Clustering

Euclidean Distance

# Centroid-linkage hierarchical clustering of the rows of X, using Euclidean
# dissimilarities.
# NOTE(review): the hclust() help page pairs centroid linkage with *squared*
# Euclidean distances and warns that centroid dendrograms can contain
# inversions - confirm that unsquared distances are intended here.
out.centroid.euc <- hclust(
  d = daisy(X, metric = "euclidean"),
  method = "centroid"
)
plot(out.centroid.euc)

Manhattan Distance

# Centroid-linkage hierarchical clustering of the rows of X, using Manhattan
# (city-block) dissimilarities.
# NOTE(review): the hclust() help page pairs centroid linkage with squared
# Euclidean distances - confirm that combining it with Manhattan
# dissimilarities is intended.
out.centroid.city <- hclust(
  d = daisy(X, metric = "manhattan"),
  method = "centroid"
)
plot(out.centroid.city)

Correlation Distance

# Centroid-linkage hierarchical clustering on correlation-based
# dissimilarities (1 - r) between the rows of X.
# NOTE(review): the hclust() help page pairs centroid linkage with squared
# Euclidean distances - confirm that combining it with correlation
# dissimilarities is intended.
out.centroid.cor <- hclust(
  d = as.dist(1 - cor(t(X))),
  method = "centroid"
)
plot(out.centroid.cor)

K-Means Clustering

Set number of clusters

# Number of clusters requested from k-means.
r <- 6

# kmeans() starts from randomly chosen centres, so the allocation (and the
# arbitrary cluster numbering) can change from run to run. Fix the seed so
# the result is reproducible, and keep the best of several random starts
# rather than relying on a single, possibly poor, initialisation.
set.seed(1)
out.kmeans <- kmeans(X, centers = r, nstart = 25)$cluster

View cluster allocation

# Label every membership with the first column of the data file, then list
# the observations ordered by cluster number.
names(out.kmeans) <- protein[[1]]
sort(out.kmeans)
##        Austria        Belgium         France        Ireland    Netherlands 
##              1              1              1              1              1 
##    Switzerland             UK      W Germany      E Germany       Portugal 
##              1              1              1              2              2 
##          Spain        Denmark         Norway         Sweden        Finland 
##              2              3              3              3              4 
##        Albania Czechoslovakia         Greece        Hungary          Italy 
##              5              5              5              5              5 
##         Poland           USSR       Bulgaria        Romania     Yugoslavia 
##              5              5              6              6              6

CLUSTER PROFILING

# Cluster memberships to profile: the complete-linkage / Euclidean solution.
# Assign a different membership vector here to profile another solution.
clusvec <- out.complete.euc

Calculate means

# Mean of every variable (column of X) within each cluster of clusvec;
# the result has one row per cluster and one column per variable.
class.means <- apply(
  X,
  MARGIN = 2,
  FUN = function(column) tapply(column, INDEX = clusvec, FUN = mean)
)
class.means
##     RedMeat WhiteMeat     Eggs      Milk      Fish  Cereals   Starch
## 1  8.642857  6.871429 2.385714 14.042857 2.5428571 39.27143 3.742857
## 2 13.212500 10.637500 3.987500 21.162500 3.3750000 24.70000 4.650000
## 3  6.133333  5.766667 1.433333  9.633333 0.9333333 54.06667 2.400000
## 4  9.850000  7.050000 3.150000 26.675000 8.2250000 22.67500 4.550000
## 5  7.233333  6.233333 2.633333  8.200000 8.8666667 26.93333 6.033333
##       Nuts    FrVeg
## 1 4.214286 4.657143
## 2 2.062500 4.175000
## 3 4.900000 3.400000
## 4 1.175000 2.125000
## 5 3.800000 6.233333

Plot means

# Profile plot: one line per cluster, tracing that cluster's mean across the
# variables. Start with an empty frame spanning all the means; the default
# x axis is suppressed so the variable names can be drawn perpendicular to it.
plot(c(1, ncol(X)), range(class.means), type = "n", xlab = "",
     ylab = "Average proportion of protein intake", xaxt = "n")
axis(side = 1, at = seq_len(ncol(X)), labels = colnames(X), las = 2)

# One colour per cluster. Indexing past the end of the hand-picked colours
# would give NA, and a line with col = NA is not drawn, so switch to a
# generated palette of the right length when there are more clusters.
colvec <- c("green", "gold", "blue", "red", "black")
if (nrow(class.means) > length(colvec)) {
  colvec <- grDevices::rainbow(nrow(class.means))
}
for (i in seq_len(nrow(class.means))) {
  lines(seq_len(ncol(X)), class.means[i, ], col = colvec[i])
}