# Setup: load the clustering package and read the protein data set ----
# library() stops with an error if 'cluster' is not installed, whereas
# require() only returns FALSE and lets the script fail later at daisy().
library(cluster)

# NOTE: machine-specific absolute path; point it at the folder that holds
# protein.csv before running this script on another computer.
setwd("C:/Users/01438475/Google Drive/UCTcourses/STA3022F/Practicals/2017/prac6 Cluster MDS")
protein <- read.csv("protein.csv")

# Data matrix: every column except the first; the first column supplies the
# row labels (the observation names shown in the cluster allocations).
# Or standardised data: X <- as.matrix(scale(datasetname))
X <- as.matrix(protein[, -1])
rownames(X) <- protein[, 1]
# Single linkage on Euclidean distances ----
# Build the tree and draw the dendrogram.
out.single.euc <- hclust(daisy(X, metric = "euclidean"), method = "single")
plot(out.single.euc)
# Decide to cut the tree at height 9.
# From here on out.single.euc holds the cluster labels, not the tree.
out.single.euc <- cutree(out.single.euc, h = 9)
# View cluster allocation
names(out.single.euc) <- protein[, 1]
sort(out.single.euc)
## Albania Austria Belgium Denmark E Germany
## 1 2 2 2 2
## France Ireland Netherlands Norway Sweden
## 2 2 2 2 2
## Switzerland UK W Germany Bulgaria Romania
## 2 2 2 3 3
## Yugoslavia Czechoslovakia Greece Italy Poland
## 3 4 4 4 4
## USSR Finland Hungary Portugal Spain
## 4 5 6 7 7
# Single linkage on Manhattan (city-block) distances ----
# Build the tree and draw the dendrogram.
out.single.city <- hclust(daisy(X, metric = "manhattan"), method = "single")
plot(out.single.city)
# Decide to cut the tree into 6 clusters.
# From here on out.single.city holds the cluster labels, not the tree.
out.single.city <- cutree(out.single.city, k = 6)
# View cluster allocation
names(out.single.city) <- protein[, 1]
sort(out.single.city)
## Albania Austria Belgium Czechoslovakia Denmark
## 1 2 2 2 2
## E Germany Finland France Greece Ireland
## 2 2 2 2 2
## Italy Netherlands Norway Poland Sweden
## 2 2 2 2 2
## Switzerland UK W Germany Bulgaria Romania
## 2 2 2 3 3
## Yugoslavia Hungary Portugal Spain USSR
## 3 4 5 5 6
# Single linkage on a correlation-based dissimilarity ----
# cor(t(X)) correlates the rows of X with each other. A correlation is a
# similarity, so it is converted to a dissimilarity with 1 - r; passing the
# raw correlations to as.dist() would treat the most similar rows as the
# furthest apart. This matches the complete and centroid linkage versions.
out.single.cor <- hclust(as.dist(1 - cor(t(X))), method = "single")
plot(out.single.cor)
# Complete linkage on Euclidean distances ----
# Build the tree and draw the dendrogram.
out.complete.euc <- hclust(daisy(X, metric = "euclidean"), method = "complete")
plot(out.complete.euc)
# Decide to cut the tree into 5 clusters.
# From here on out.complete.euc holds the cluster labels, not the tree.
out.complete.euc <- cutree(out.complete.euc, k = 5)
# View cluster allocation
names(out.complete.euc) <- protein[, 1]
sort(out.complete.euc)
## Albania Czechoslovakia Greece Hungary Italy
## 1 1 1 1 1
## Poland USSR Austria Belgium France
## 1 1 2 2 2
## Ireland Netherlands Switzerland UK W Germany
## 2 2 2 2 2
## Bulgaria Romania Yugoslavia Denmark Finland
## 3 3 3 4 4
## Norway Sweden E Germany Portugal Spain
## 4 4 5 5 5
# Complete linkage on Manhattan (city-block) distances ----
# Build the tree and draw the dendrogram.
out.complete.city <- hclust(daisy(X, metric = "manhattan"), method = "complete")
plot(out.complete.city)
# Decide to cut the tree at a height of 40.
# From here on out.complete.city holds the cluster labels, not the tree.
out.complete.city <- cutree(out.complete.city, h = 40)
# View cluster allocation
names(out.complete.city) <- protein[, 1]
sort(out.complete.city)
## Albania Bulgaria Hungary Romania Yugoslavia
## 1 1 1 1 1
## Austria Belgium Denmark Finland France
## 2 2 2 2 2
## Ireland Netherlands Norway Sweden Switzerland
## 2 2 2 2 2
## UK W Germany Czechoslovakia E Germany Poland
## 2 2 3 3 3
## Greece Italy USSR Portugal Spain
## 4 4 4 5 5
# Complete linkage on a correlation-based dissimilarity ----
# 1 - cor(t(X)) turns the row-by-row correlations into dissimilarities.
out.complete.cor <- hclust(as.dist(1 - cor(t(X))), method = "complete")
plot(out.complete.cor)
# Decide to cut the tree into 5 clusters.
# From here on out.complete.cor holds the cluster labels, not the tree.
out.complete.cor <- cutree(out.complete.cor, k = 5)
# View cluster allocation
names(out.complete.cor) <- protein[, 1]
sort(out.complete.cor)
## Albania Bulgaria Greece Hungary Italy
## 1 1 1 1 1
## Romania Spain USSR Yugoslavia Austria
## 1 1 1 1 2
## Czechoslovakia E Germany Poland Belgium France
## 2 2 2 3 3
## Ireland Netherlands Switzerland UK W Germany
## 3 3 3 3 3
## Denmark Finland Norway Sweden Portugal
## 4 4 4 4 5
# Centroid linkage ----
# Three dendrograms, one per dissimilarity: Euclidean, Manhattan and
# 1 - correlation between rows. The trees are only plotted, not cut.
# NOTE(review): the hclust() help page's own example pairs centroid linkage
# with squared Euclidean distances - confirm whether that is wanted here.

# Euclidean
out.centroid.euc <- hclust(
  d = daisy(X, metric = "euclidean"),
  method = "centroid"
)
plot(out.centroid.euc)

# Manhattan (city-block)
out.centroid.city <- hclust(
  d = daisy(X, metric = "manhattan"),
  method = "centroid"
)
plot(out.centroid.city)

# Correlation-based
out.centroid.cor <- hclust(
  d = as.dist(1 - cor(t(X))),
  method = "centroid"
)
plot(out.centroid.cor)
# K-means clustering ----
# Set number of clusters
r <- 6
# kmeans() picks its starting centres at random, so fix the seed to make the
# allocation reproducible from run to run. The listing below was recorded
# from an unseeded run, so cluster numbering (and possibly membership) may
# differ from what this seed produces.
set.seed(1)
out.kmeans <- kmeans(X, centers = r)$cluster
# View cluster allocation
names(out.kmeans) <- protein[, 1]
sort(out.kmeans)
## Austria Belgium France Ireland Netherlands
## 1 1 1 1 1
## Switzerland UK W Germany E Germany Portugal
## 1 1 1 2 2
## Spain Denmark Norway Sweden Finland
## 2 3 3 3 4
## Albania Czechoslovakia Greece Hungary Italy
## 5 5 5 5 5
## Poland USSR Bulgaria Romania Yugoslavia
## 5 5 6 6 6
# Cluster profiles ----
# Choose the allocation to summarise; swap in another out.* label vector to
# profile a different solution.
clusvec <- out.complete.euc
# Calculate means: for each column of X, the mean within each cluster.
# The result has one row per cluster and one column per variable.
class.means <- apply(X, 2, function(x) tapply(x, clusvec, mean))
class.means
## RedMeat WhiteMeat Eggs Milk Fish Cereals Starch
## 1 8.642857 6.871429 2.385714 14.042857 2.5428571 39.27143 3.742857
## 2 13.212500 10.637500 3.987500 21.162500 3.3750000 24.70000 4.650000
## 3 6.133333 5.766667 1.433333 9.633333 0.9333333 54.06667 2.400000
## 4 9.850000 7.050000 3.150000 26.675000 8.2250000 22.67500 4.550000
## 5 7.233333 6.233333 2.633333 8.200000 8.8666667 26.93333 6.033333
## Nuts FrVeg
## 1 4.214286 4.657143
## 2 2.062500 4.175000
## 3 4.900000 3.400000
## 4 1.175000 2.125000
## 5 3.800000 6.233333
# Plot means ----
# One line per cluster (row of class.means) across the variables (columns
# of X), drawn on an empty frame sized to the range of the means.

# ensure you list enough colours for the number of clusters
colvec <- c("green", "gold", "blue", "red", "black")
# Without this check a missing colour is NA and that cluster's line is
# silently not drawn.
if (nrow(class.means) > length(colvec)) {
  stop("colvec lists ", length(colvec), " colours but there are ",
       nrow(class.means), " clusters; add more colours to colvec",
       call. = FALSE)
}

plot(c(1, ncol(X)), range(class.means), type = "n", xlab = "",
     ylab = "Average proportion of protein intake", xaxt = "n")
# Label the x axis with the variable names, written perpendicular to the axis.
axis(side = 1, at = seq_len(ncol(X)), labels = colnames(X), las = 2)
for (i in seq_len(nrow(class.means))) {
  lines(seq_len(ncol(X)), class.means[i, ], col = colvec[i])
}