#
#Hierarchical Clustering in R.
#PREPARED BY Richel alabado
homedir <- "C:/Users/LENOVO/OneDrive/Documents/thesis"
setwd(homedir)

library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)

#load data
df <- USArrests

#remove rows with missing values
df <- na.omit(df)

#scale each variable to have a mean of 0 and sd of 1
df <- scale(df)

#define linkage methods
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")

#function to compute agglomerative coefficient
ac <- function(x) {
  agnes(df, method = x)$ac
}

#calculate agglomerative coefficient for each clustering linkage method
sapply(m, ac)
##   average    single  complete      ward 
## 0.7379371 0.6276128 0.8531583 0.9346210
#perform hierarchical clustering using Ward's minimum variance
clust <- agnes(df, method = "ward")

#produce dendrogram
pltree(clust, cex = 0.6, hang = -1, main = "Dendrogram") 

#calculate gap statistic for each number of clusters (up to 10 clusters)
gap_stat <- clusGap(df, FUN = hcut, nstart = 25, K.max = 10, B = 50)

#produce plot of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

#compute distance matrix
d <- dist(df, method = "euclidean")

#perform hierarchical clustering using Ward's method
final_clust <- hclust(d, method = "ward.D2" )

#cut the dendrogram into 4 clusters
groups <- cutree(final_clust, k=4)

# Number of members in each cluster
table(groups)
## groups
##  1  2  3  4 
##  7 12 19 12
#append cluster labels to original data
final_data <- cbind(USArrests, cluster = groups)

#display first six rows of final data
head(final_data)
##            Murder Assault UrbanPop Rape cluster
## Alabama      13.2     236       58 21.2       1
## Alaska       10.0     263       48 44.5       2
## Arizona       8.1     294       80 31.0       2
## Arkansas      8.8     190       50 19.5       3
## California    9.0     276       91 40.6       2
## Colorado      7.9     204       78 38.7       2
#find mean values for each cluster
aggregate(final_data, by=list(cluster=final_data$cluster), mean)
##   cluster    Murder  Assault UrbanPop     Rape cluster
## 1       1 14.671429 251.2857 54.28571 21.68571       1
## 2       2 10.966667 264.0000 76.50000 33.60833       2
## 3       3  6.210526 142.0526 71.26316 19.18421       3
## 4       4  3.091667  76.0000 52.08333 11.83333       4