# R in Action (2nd ed): Chapter 16                      
# Cluster analysis                                      
# requires packaged NbClust, flexclust, rattle          
# install.packages(c("NbClust", "flexclust", "rattle")) 
par(ask=TRUE)
opar <- par(no.readonly=FALSE)

# Calculating Distances
data(nutrient, package="flexclust")
head(nutrient, 2)
##              energy protein fat calcium iron
## BEEF BRAISED    340      20  28       9  2.6
## HAMBURGER       245      21  17       9  2.7
d <- dist(nutrient)
as.matrix(d)[1:4,1:4]
##              BEEF BRAISED HAMBURGER BEEF ROAST BEEF STEAK
## BEEF BRAISED      0.00000   95.6400   80.93429   35.24202
## HAMBURGER        95.64000    0.0000  176.49218  130.87784
## BEEF ROAST       80.93429  176.4922    0.00000   45.76418
## BEEF STEAK       35.24202  130.8778   45.76418    0.00000
# Listing 16.1 - Average linkage clustering of nutrient data
data(nutrient, package="flexclust")
row.names(nutrient) <- tolower(row.names(nutrient))
nutrient.scaled <- scale(nutrient)                                  
d <- dist(nutrient.scaled)                                          
fit.average <- hclust(d, method="average")                          
plot(fit.average, hang=-1, cex=.8, main="Average Linkage Clustering")

# Listing 16.2 - Selecting the number of clusters
library(NbClust)
nc <- NbClust(nutrient.scaled, distance="euclidean", 
              min.nc=2, max.nc=15, method="average")
## Warning in pf(beale, pp, df2): NaNs produced

## Warning in pf(beale, pp, df2): NaNs produced

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 4 proposed 2 as the best number of clusters 
## * 4 proposed 3 as the best number of clusters 
## * 2 proposed 4 as the best number of clusters 
## * 4 proposed 5 as the best number of clusters 
## * 1 proposed 9 as the best number of clusters 
## * 1 proposed 10 as the best number of clusters 
## * 2 proposed 13 as the best number of clusters 
## * 1 proposed 14 as the best number of clusters 
## * 4 proposed 15 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
par(opar)
## Warning in par(opar): graphical parameter "cin" cannot be set
## Warning in par(opar): graphical parameter "cra" cannot be set
## Warning in par(opar): graphical parameter "csi" cannot be set
## Warning in par(opar): graphical parameter "cxy" cannot be set
## Warning in par(opar): graphical parameter "din" cannot be set
## Warning in par(opar): graphical parameter "page" cannot be set
table(nc$Best.n[1,])
## 
##  0  1  2  3  4  5  9 10 13 14 15 
##  2  1  4  4  2  4  1  1  2  1  4
barplot(table(nc$Best.n[1,]), 
        xlab="Numer of Clusters", ylab="Number of Criteria",
        main="Number of Clusters Chosen by 26 Criteria") 

# Listing 16.3 - Obtaining the final cluster solution
clusters <- cutree(fit.average, k=5) 
table(clusters)
## clusters
##  1  2  3  4  5 
##  7 16  1  2  1
aggregate(nutrient, by=list(cluster=clusters), median) 
##   cluster energy protein fat calcium iron
## 1       1  340.0      19  29       9 2.50
## 2       2  170.0      20   8      13 1.45
## 3       3  160.0      26   5      14 5.90
## 4       4   57.5       9   1      78 5.70
## 5       5  180.0      22   9     367 2.50
aggregate(as.data.frame(nutrient.scaled), by=list(cluster=clusters),
          median)
##   cluster     energy    protein        fat    calcium        iron
## 1       1  1.3101024  0.0000000  1.3785620 -0.4480464  0.08110456
## 2       2 -0.3696099  0.2352002 -0.4869384 -0.3967868 -0.63743114
## 3       3 -0.4684165  1.6464016 -0.7534384 -0.3839719  2.40779157
## 4       4 -1.4811842 -2.3520023 -1.1087718  0.4361807  2.27092763
## 5       5 -0.2708033  0.7056007 -0.3981050  4.1396825  0.08110456
plot(fit.average, hang=-1, cex=.8,  
     main="Average Linkage Clustering\n5 Cluster Solution")
rect.hclust(fit.average, k=5)

# Plot function for within groups sum of squares by number of clusters
wssplot <- function(data, nc=15, seed=1234){
  wss <- (nrow(data)-1)*sum(apply(data,2,var))
  for (i in 2:nc){
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers=i)$withinss)}
  plot(1:nc, wss, type="b", xlab="Number of Clusters",
       ylab="Within groups sum of squares")}

## Avoiding non-existent clusters
library(fMultivar)
## Loading required package: timeDate
## Loading required package: timeSeries
## Loading required package: fBasics
set.seed(1234)
df <- rnorm2d(1000, rho=.5)
df <- as.data.frame(df)
plot(df, main="Bivariate Normal Distribution with rho=0.5")

wssplot(df)

library(NbClust)
nc <- NbClust(df, min.nc=2, max.nc=15, method="kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 8 proposed 2 as the best number of clusters 
## * 5 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 1 proposed 8 as the best number of clusters 
## * 4 proposed 10 as the best number of clusters 
## * 1 proposed 12 as the best number of clusters 
## * 2 proposed 13 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  2 
##  
##  
## *******************************************************************
par(opar)
## Warning in par(opar): graphical parameter "cin" cannot be set
## Warning in par(opar): graphical parameter "cra" cannot be set
## Warning in par(opar): graphical parameter "csi" cannot be set
## Warning in par(opar): graphical parameter "cxy" cannot be set
## Warning in par(opar): graphical parameter "din" cannot be set
## Warning in par(opar): graphical parameter "page" cannot be set
barplot(table(nc$Best.n[1,]), 
        xlab="Numer of Clusters", ylab="Number of Criteria",
        main  ="Number of Clusters Chosen by 26 Criteria")

library(ggplot2)

library(cluster)
fit <- pam(df, k=2)
df$clustering <- factor(fit$clustering)
ggplot(data=df, aes(x=V1, y=V2, color=clustering, shape=clustering)) +
  geom_point() + ggtitle("Clustering of Bivariate Normal Data")

plot(nc$All.index[,4], type="o", ylab="CCC",
     xlab="Number of clusters", col="blue")