# Question: Implement the hierarchical clustering algorithm using the USArrests dataset
# Load the built-in dataset
data("USArrests")

# Remove any missing values (i.e., NA, for "not available")
# that might be present in the data
df <- na.omit(USArrests)

# Preview the dataset
head(df)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
# Check the structure and dimensions
str(df)
## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
dim(df)
## [1] 50  4
# Before hierarchical clustering, we can compute some descriptive statistics
# ---
#
desc_stats <- data.frame(
  Min = apply(df, 2, min),    # minimum
  Med = apply(df, 2, median), # median
  Mean = apply(df, 2, mean),  # mean
  SD = apply(df, 2, sd),      # Standard deviation
  Max = apply(df, 2, max)     # Maximum
)
desc_stats <- round(desc_stats, 1) # Round to 1 decimal place

head(desc_stats)
##           Min   Med  Mean   SD   Max
## Murder    0.8   7.2   7.8  4.4  17.4
## Assault  45.0 159.0 170.8 83.3 337.0
## UrbanPop 32.0  66.0  65.5 14.5  91.0
## Rape      7.3  20.1  21.2  9.4  46.0
# The above can also be done using describe()
library(psych) # psych helps with descriptive statistical analysis

describe(df)
##          vars  n   mean    sd median trimmed    mad  min   max range  skew
## Murder      1 50   7.79  4.36   7.25    7.53   5.41  0.8  17.4  16.6  0.37
## Assault     2 50 170.76 83.34 159.00  168.48 110.45 45.0 337.0 292.0  0.22
## UrbanPop    3 50  65.54 14.47  66.00   65.88  17.79 32.0  91.0  59.0 -0.21
## Rape        4 50  21.23  9.37  20.10   20.36   8.60  7.3  46.0  38.7  0.75
##          kurtosis    se
## Murder      -0.95  0.62
## Assault     -1.15 11.79
## UrbanPop    -0.87  2.05
## Rape         0.08  1.32
# We note that the variables have very different means and variances.
# This is explained by the fact that the variables are measured in different
# units: Murder, Rape, and Assault are measured as the number of occurrences per 100,000 people,
# while UrbanPop is the percentage of the state's population that lives in an urban area.
# They must be standardized (i.e., scaled) to make them comparable. Recall that
# standardization consists of transforming the variables such that
# they have mean zero and standard deviation one.
# Since we do not want the hierarchical clustering result to depend on an arbitrary
# variable unit, we start by scaling the data using the R function scale() as follows.

#Scale the values
df <- scale(df)
head(df)
##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207
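# Quick sanity check: after scale(), every column should have mean ~0 and sd 1
round(colMeans(df), 10)
apply(df, 2, sd)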
# We now use the R function hclust() for hierarchical clustering

# First we use the dist() function to compute the Euclidean distance between observations;
# the resulting dissimilarity matrix d will be the first argument to hclust()
# ---
#
d <- dist(df, method = "euclidean") # you can also use other distances, e.g. "manhattan"

# Display the distances among the first 6 observations as a matrix
as.matrix(d)[1:6, 1:6]
##             Alabama   Alaska  Arizona Arkansas California Colorado
## Alabama    0.000000 2.703754 2.293520 1.289810   3.263110 2.651067
## Alaska     2.703754 0.000000 2.700643 2.826039   3.012541 2.326519
## Arizona    2.293520 2.700643 0.000000 2.717758   1.310484 1.365031
## Arkansas   1.289810 2.826039 2.717758 0.000000   3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641   0.000000 1.287619
## Colorado   2.651067 2.326519 1.365031 2.831051   1.287619 0.000000
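# Any entry can be verified by hand, e.g. the Euclidean distance between
# Alabama and Alaska on the scaled data (should match the 2.703754 shown above)
sqrt(sum((df["Alabama", ] - df["Alaska", ])^2))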
# 1. We then perform hierarchical clustering using Ward's method
# ---
#
res.hc <- hclust(d, method = "ward.D2" )
res.hc
## 
## Call:
## hclust(d = d, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 50
# Lastly, we plot the obtained dendrogram
# ---
plot(res.hc, cex = 0.6, hang = -1)
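# Base R can also outline the k = 4 clusters directly on the plot just drawn
rect.hclust(res.hc, k = 4, border = 2:5)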

# Display the clusters using hullplot(), which comes from the dbscan package
# install.packages("dbscan")
# Convert to dendrogram and color branches
library(dendextend)
## 
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags: 
##   https://stackoverflow.com/questions/tagged/dendextend
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
## 
##     cutree
# dend <- as.dendrogram(res.hc)
# dend <- color_branches(dend, k = 4)
# dend <- set(dend, "branches_k_color", k = 4)

# Get cluster assignments
# clusters <- cutree(dend, k = 4)

# Use hullplot() with the scaled data and cluster assignments
# hullplot(df, clusters)
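# A runnable version of the sketch above, assuming the dbscan package
# (which provides hullplot()) is installed:
library(dbscan)
dend <- as.dendrogram(res.hc)
dend <- color_branches(dend, k = 4)  # colour branches by cluster (dendextend)
plot(dend)                           # coloured dendrogram
clusters <- cutree(res.hc, k = 4)    # cluster assignments from the Ward tree
hullplot(df, clusters)               # 2-D scatterplot with convex hulls per cluster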

Enhanced visualization of the dendrogram

# We can add colours to the 4 clusters
# Visualization with factoextra
library(factoextra)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("ggplot2")

fviz_dend(res.hc, cex = 0.5, k = 4, color_labels_by_k = TRUE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Cut the tree
library(factoextra) # library to help us cut the tree

# Cut the tree into 4 groups
# Don't color labels, add rectangles
fviz_dend(res.hc, cex = 0.5, k = 4,
          color_labels_by_k = FALSE, rect = TRUE) # customise colours with k_colors = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A")

# Color labels using k-means clusters (kmeans() uses random starts, so set a
# seed with set.seed() first if you need reproducible colours)
km.clust <- kmeans(df, 4)$cluster
fviz_dend(res.hc, k = 4,
          k_colors = c("blue", "green3", "red", "black"),
          label_cols = km.clust[res.hc$order], cex = 0.6)

Verify the cluster tree

After linking the objects in a data set into a hierarchical cluster tree, you might want to assess whether the distances (i.e., heights) in the tree reflect the original distances accurately.

One way to measure how well the cluster tree generated by the hclust() function reflects your data is to compute the correlation between the cophenetic distances and the original distance data generated by the dist() function. If the clustering is valid, the linking of objects in the cluster tree should have a strong correlation with the distances between objects in the original distance matrix.

The closer the value of the correlation coefficient is to 1, the more accurately the clustering solution reflects your data. Values above 0.75 are generally considered good. The "average" linkage method tends to produce high values of this statistic, which may be one reason it is so popular.

The R base function cophenetic() can be used to compute the cophenetic distances for hierarchical clustering.

# Compute the cophenetic distances for the Ward (ward.D2) clustering
res.coph <- cophenetic(res.hc)

# Correlation between cophenetic distance and
# the original distance
cor(d, res.coph)
## [1] 0.6975266
# 2. Hierarchical clustering using the single linkage method
hc2 <- hclust(d, method = "single")

#cut tree to 4 clusters
grp <- cutree(hc2, k = 4)

#Check the Number of members in each cluster
table(grp)
## grp
##  1  2  3  4 
## 46  1  2  1
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
##  [1] "Alabama"        "Arizona"        "Arkansas"       "Colorado"      
##  [5] "Connecticut"    "Delaware"       "Georgia"        "Hawaii"        
##  [9] "Idaho"          "Illinois"       "Indiana"        "Iowa"          
## [13] "Kansas"         "Kentucky"       "Louisiana"      "Maine"         
## [17] "Maryland"       "Massachusetts"  "Michigan"       "Minnesota"     
## [21] "Mississippi"    "Missouri"       "Montana"        "Nebraska"      
## [25] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [29] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [33] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [37] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [41] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [45] "Wisconsin"      "Wyoming"
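# Single linkage tends to "chain" observations, which is why 46 of the 50
# states fall into one cluster here; to list every cluster's members at once:
split(rownames(df), grp)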
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
fviz_dend(hc2, cex = 0.5, k = 4, type = "phylogenic")

Execute the hclust() function again using the single linkage method. Next, call cophenetic() to evaluate the clustering solution.

# Using the single linkage method
hc2 <- hclust(d, method = "single")

cor(d, cophenetic(hc2))
## [1] 0.541272
# 3. Hierarchical clustering using the complete linkage method
hc3 <- hclust(d, method = "complete")

#cut tree to 4 clusters
grp <- cutree(hc3, k = 4)

#Check the Number of members in each cluster
table(grp)
## grp
##  1  2  3  4 
##  8 11 21 10
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama"        "Alaska"         "Georgia"        "Louisiana"     
## [5] "Mississippi"    "North Carolina" "South Carolina" "Tennessee"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc3, cex = 0.5, k = 4, type = "circular")

Execute the hclust() function again using the complete linkage method. Next, call cophenetic() to evaluate the clustering solution.

# Using the complete linkage method
hc3 <- hclust(d, method = "complete")

cor(d, cophenetic(hc3))
## [1] 0.6979437
# 4. Hierarchical clustering using the average linkage method
hc4 <- hclust(d, method = "average")

# Cut the tree (hc4, the average-linkage result) into 4 clusters
grp <- cutree(hc4, k = 4)

# Check the number of members in each cluster
table(grp)

# Get the names for the members of cluster 1
rownames(df)[grp == 1]
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc4, cex = 0.5, k = 4, color_labels_by_k = TRUE)

Execute the hclust() function again using the average linkage method. Next, call cophenetic() to evaluate the clustering solution.

#using Average method
hc4 <- hclust(d, method = "average")

cor(d, cophenetic(hc4))
## [1] 0.7180382
# 5. Hierarchical clustering using the centroid method
# (note: per ?hclust, the centroid and median methods are intended for
# squared Euclidean distances, so read these results with care)
hc5 <- hclust(d, method = "centroid")

#cut tree to 4 clusters
grp <- cutree(hc5, k = 4)

#Check the Number of members in each cluster
table(grp)
## grp
##  1  2  3  4 
## 47  1  1  1
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
##  [1] "Alabama"        "Arizona"        "Arkansas"       "California"    
##  [5] "Colorado"       "Connecticut"    "Delaware"       "Florida"       
##  [9] "Georgia"        "Hawaii"         "Idaho"          "Illinois"      
## [13] "Indiana"        "Iowa"           "Kansas"         "Kentucky"      
## [17] "Louisiana"      "Maine"          "Maryland"       "Massachusetts" 
## [21] "Michigan"       "Minnesota"      "Mississippi"    "Missouri"      
## [25] "Montana"        "Nebraska"       "Nevada"         "New Hampshire" 
## [29] "New Jersey"     "New Mexico"     "New York"       "North Dakota"  
## [33] "Ohio"           "Oklahoma"       "Oregon"         "Pennsylvania"  
## [37] "Rhode Island"   "South Carolina" "South Dakota"   "Tennessee"     
## [41] "Texas"          "Utah"           "Virginia"       "Washington"    
## [45] "West Virginia"  "Wisconsin"      "Wyoming"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc5, cex = 0.5, k = 4, color_labels_by_k = TRUE)
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled

#Get cophenetic cor using centroid method
hc5 <- hclust(d, method = "centroid")

cor(d, cophenetic(hc5))
## [1] 0.6074717
#6. Hierarchical clustering using the median method
hc6 <- hclust(d, method = "median")

#cut tree to 4 clusters
grp <- cutree(hc6, k = 4)

#Check the Number of members in each cluster
table(grp)
## grp
##  1  2  3  4 
## 27  1 19  3
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
##  [1] "Alabama"        "Arkansas"       "Connecticut"    "Delaware"      
##  [5] "Georgia"        "Hawaii"         "Idaho"          "Indiana"       
##  [9] "Iowa"           "Kansas"         "Kentucky"       "Louisiana"     
## [13] "Maine"          "Minnesota"      "Mississippi"    "Montana"       
## [17] "Nebraska"       "New Hampshire"  "North Carolina" "Oklahoma"      
## [21] "Rhode Island"   "South Carolina" "South Dakota"   "Tennessee"     
## [25] "Virginia"       "Wisconsin"      "Wyoming"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc6, cex = 0.5, k = 4, color_labels_by_k = TRUE)
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled

#Get cophenetic cor using median method
hc6 <- hclust(d, method = "median")

cor(d, cophenetic(hc6))
## [1] 0.436442
# 7. Hierarchical clustering using the mcquitty (WPGMA) method
hc7 <- hclust(d, method = "mcquitty")

#cut tree to 4 clusters
grp <- cutree(hc7, k = 4)

#Check the Number of members in each cluster
table(grp)
## grp
##  1  2  3  4 
##  9 13 21  7
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama"        "Arkansas"       "Georgia"        "Kentucky"      
## [5] "Louisiana"      "Mississippi"    "North Carolina" "South Carolina"
## [9] "Tennessee"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc7, cex = 0.5, k = 4, color_labels_by_k = TRUE)

#Get cophenetic cor using mcquitty method
hc7 <- hclust(d, method = "mcquitty")

cor(d, cophenetic(hc7))
## [1] 0.6212635
# 8. Hierarchical clustering using the ward.D method
hc8 <- hclust(d, method = "ward.D")

# Cut the tree (hc8, the ward.D result) into 4 clusters
grp <- cutree(hc8, k = 4)

# Check the number of members in each cluster
table(grp)

# Get the names for the members of cluster 1
rownames(df)[grp == 1]
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc8, cex = 0.5, k = 4, color_labels_by_k = TRUE)

# Get cophenetic correlation using the ward.D method
hc8 <- hclust(d, method = "ward.D")

cor(d, cophenetic(hc8))
## [1] 0.6844016
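Before drawing a conclusion, the whole comparison can be reproduced in one pass. The sketch below simply loops cor() over the cophenetic distances for every linkage method accepted by hclust():

methods <- c("ward.D", "ward.D2", "single", "complete",
             "average", "mcquitty", "median", "centroid")
sort(sapply(methods, function(m) cor(d, cophenetic(hclust(d, method = m)))),
     decreasing = TRUE)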

Conclusion: the best method here is average linkage, which has the highest cophenetic correlation coefficient (0.72); the worst is the median method, which has the lowest (0.44). Even the best value falls just short of the 0.75 threshold mentioned above, so the tree is only a moderately faithful representation of the original distances.
