# Question: Implement the hierarchical clustering algorithm using the Arrests dataset
#Load inbuild Dataset
data("USArrests")
# Remove any missing value (i.e, NA values for not available)
# That might be present in the data
df <- na.omit(USArrests)
#Previewing dataset
head(df)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
#check the structure
str(df)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
dim(df)
## [1] 50 4
## Before hierarchical clustering, we can compute some descriptive statistics
# ---
#
desc_stats <- data.frame(
Min = apply(df, 2, min), # minimum
Med = apply(df, 2, median), # median
Mean = apply(df, 2, mean), # mean
SD = apply(df, 2, sd), # Standard deviation
Max = apply(df, 2, max) # Maximum
)
desc_stats <- round(desc_stats, 1) # Round to 1 decimal place
head(desc_stats)
## Min Med Mean SD Max
## Murder 0.8 7.2 7.8 4.4 17.4
## Assault 45.0 159.0 170.8 83.3 337.0
## UrbanPop 32.0 66.0 65.5 14.5 91.0
## Rape 7.3 20.1 21.2 9.4 46.0
#the above can be done using describe() as well
library(psych) #help in stat analysis
describe(df)
## vars n mean sd median trimmed mad min max range skew
## Murder 1 50 7.79 4.36 7.25 7.53 5.41 0.8 17.4 16.6 0.37
## Assault 2 50 170.76 83.34 159.00 168.48 110.45 45.0 337.0 292.0 0.22
## UrbanPop 3 50 65.54 14.47 66.00 65.88 17.79 32.0 91.0 59.0 -0.21
## Rape 4 50 21.23 9.37 20.10 20.36 8.60 7.3 46.0 38.7 0.75
## kurtosis se
## Murder -0.95 0.62
## Assault -1.15 11.79
## UrbanPop -0.87 2.05
## Rape 0.08 1.32
# We note that the variables have a large different means and variances.
# This is explained by the fact that the variables are measured in different
# units; Murder, Rape, and Assault are measured as the number of occurrences per 100 000 people,
# and UrbanPop is the percentage of the stateâs population that lives in an urban area.
# They must be standardized (i.e., scaled) to make them comparable. Recall that,
# standardization consists of transforming the variables such that
# they have mean zero and standard deviation one.
#We want the hierarchical clustering result to depend to an arbitrary variable unit,
# we start by scaling the data using the R function scale() as follows
#Scale the values
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
# We now use the R function hclust() for hierarchical clustering
# First we use the dist() function to compute the Euclidean distance between observations,
# d will be the first argument in the hclust() function dissimilarity matrix
# ---
#
d <- dist(df, method = "euclidean") # u can even use other distanaces like manhattan
#display the result as a matrix for first 6 result
as.matrix(d)[1:6, 1:6]
## Alabama Alaska Arizona Arkansas California Colorado
## Alabama 0.000000 2.703754 2.293520 1.289810 3.263110 2.651067
## Alaska 2.703754 0.000000 2.700643 2.826039 3.012541 2.326519
## Arizona 2.293520 2.700643 0.000000 2.717758 1.310484 1.365031
## Arkansas 1.289810 2.826039 2.717758 0.000000 3.763641 2.831051
## California 3.263110 3.012541 1.310484 3.763641 0.000000 1.287619
## Colorado 2.651067 2.326519 1.365031 2.831051 1.287619 0.000000
#1. We then hierarchical clustering using the Ward's method
# ---
#
res.hc <- hclust(d, method = "ward.D2" )
res.hc
##
## Call:
## hclust(d = d, method = "ward.D2")
##
## Cluster method : ward.D2
## Distance : euclidean
## Number of objects: 50
# Lastly, we plot the obtained dendrogram
# ---
plot(res.hc, cex = 0.6, hang = -1)
#display the above using hullplot
#install.packages("hullplot")
# Convert to dendrogram and color branches
library(dendextend)
##
## ---------------------
## Welcome to dendextend version 1.19.0
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
## Attaching package: 'dendextend'
## The following object is masked from 'package:stats':
##
## cutree
#dend <- as.dendrogram(res.hc)
#dend <- color_branches(dend, k = 4)
#dend <- set(dend, "branches_k_color", k = 4)
# Get cluster assignments
#clusters <- cutree(dend, k = 4)
# Use hullplot with the original data and cluster assignments
#hullplot(data, clusters
#)
#we can add colours to the 4 clusters
#visualization
library(factoextra)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library("ggplot2")
fviz_dend(res.hc, cex = 0.5, k = 4, color_labels_by_k = TRUE)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
## Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Cut the tree
library(factoextra)#library to help us cat the tree
# Cut the tree into 4 groups
# Don't color labels, add rectangles
fviz_dend(res.hc, cex = 0.5, k = 4,
color_labels_by_k = FALSE, rect = TRUE) #we can customise colour using k_colors = c("#1B9E77", "#D95F02", "#7570B3", "#E7298A"))
# Color labels using k-means clusters
km.clust <- kmeans(df, 4)$cluster
fviz_dend(res.hc, k = 4,
k_colors = c("blue", "green3", "red", "black"),
label_cols = km.clust[res.hc$order], cex = 0.6)
Verify the cluster tree After linking the objects in a data set into a hierarchical cluster tree, you might want to assess that the distances (i.e., heights) in the tree reflect the original distances accurately.
One way to measure how well the cluster tree generated by the hclust() function reflects your data is to compute the correlation between the cophenetic distances and the original distance data generated by the dist() function. If the clustering is valid, the linking of objects in the cluster tree should have a strong correlation with the distances between objects in the original distance matrix.
The closer the value of the correlation coefficient is to 1, the more accurately the clustering solution reflects your data. Values above 0.75 are felt to be good. The “average” linkage method appears to produce high values of this statistic. This may be one reason that it is so popular.
The R base function cophenetic() can be used to compute the cophenetic distances for hierarchical clustering.
# Compute cophentic distance using ward method
res.coph <- cophenetic(res.hc)
# Correlation between cophenetic distance and
# the original distance
cor(d, res.coph)
## [1] 0.6975266
#2. Hierarchical clustering using the single method
hc2 <- hclust(d, method = "single")
#cut tree to 4 clusters
grp <- cutree(hc2, k = 4)
#Check the Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 46 1 2 1
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Arizona" "Arkansas" "Colorado"
## [5] "Connecticut" "Delaware" "Georgia" "Hawaii"
## [9] "Idaho" "Illinois" "Indiana" "Iowa"
## [13] "Kansas" "Kentucky" "Louisiana" "Maine"
## [17] "Maryland" "Massachusetts" "Michigan" "Minnesota"
## [21] "Mississippi" "Missouri" "Montana" "Nebraska"
## [25] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [29] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [33] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [37] "South Dakota" "Tennessee" "Texas" "Utah"
## [41] "Vermont" "Virginia" "Washington" "West Virginia"
## [45] "Wisconsin" "Wyoming"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
fviz_dend(hc2, cex = 0.5, k = 4, type = "phylogenic")
Execute the hclust() function again using the single linkage method. Next, call cophenetic() to evaluate the clustering solution.
#using Average method
hc2 <- hclust(d, method = "single")
cor(d, cophenetic(hc2))
## [1] 0.541272
#3. Hierarchical clustering using the Complete method
hc3 <- hclust(d, method = "complete")
#cut tree to 4 clusters
grp <- cutree(hc3, k = 4)
#Check the Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 8 11 21 10
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Alaska" "Georgia" "Louisiana"
## [5] "Mississippi" "North Carolina" "South Carolina" "Tennessee"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc3, cex = 0.5, k = 4, type = "circular")
Execute the hclust() function again using the complete linkage method.
Next, call cophenetic() to evaluate the clustering solution.
#using Average method
hc3 <- hclust(d, method = "complete")
cor(d, cophenetic(hc3))
## [1] 0.6979437
#4. Hierarchical clustering using the Average method
hc4 <- hclust(d, method = "average")
#cut tree to 4 clusters
grp <- cutree(hc3, k = 4)
#Check the Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 8 11 21 10
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Alaska" "Georgia" "Louisiana"
## [5] "Mississippi" "North Carolina" "South Carolina" "Tennessee"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc4, cex = 0.5, k = 4, color_labels_by_k = TRUE)
Execute the hclust() function again using the average linkage method. Next, call cophenetic() to evaluate the clustering solution.
#using Average method
hc4 <- hclust(d, method = "average")
cor(d, cophenetic(hc4))
## [1] 0.7180382
#5. Hierarchical clustering using the Average method
hc5 <- hclust(d, method = "centroid")
#cut tree to 4 clusters
grp <- cutree(hc5, k = 4)
#Check the Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 47 1 1 1
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Arizona" "Arkansas" "California"
## [5] "Colorado" "Connecticut" "Delaware" "Florida"
## [9] "Georgia" "Hawaii" "Idaho" "Illinois"
## [13] "Indiana" "Iowa" "Kansas" "Kentucky"
## [17] "Louisiana" "Maine" "Maryland" "Massachusetts"
## [21] "Michigan" "Minnesota" "Mississippi" "Missouri"
## [25] "Montana" "Nebraska" "Nevada" "New Hampshire"
## [29] "New Jersey" "New Mexico" "New York" "North Dakota"
## [33] "Ohio" "Oklahoma" "Oregon" "Pennsylvania"
## [37] "Rhode Island" "South Carolina" "South Dakota" "Tennessee"
## [41] "Texas" "Utah" "Virginia" "Washington"
## [45] "West Virginia" "Wisconsin" "Wyoming"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc5, cex = 0.5, k = 4, color_labels_by_k = TRUE)
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
#Get cophenetic cor using centroid method
hc5 <- hclust(d, method = "centroid")
cor(d, cophenetic(hc5))
## [1] 0.6074717
#6. Hierarchical clustering using the median method
hc6 <- hclust(d, method = "median")
#cut tree to 4 clusters
grp <- cutree(hc6, k = 4)
#Check the Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 27 1 19 3
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Arkansas" "Connecticut" "Delaware"
## [5] "Georgia" "Hawaii" "Idaho" "Indiana"
## [9] "Iowa" "Kansas" "Kentucky" "Louisiana"
## [13] "Maine" "Minnesota" "Mississippi" "Montana"
## [17] "Nebraska" "New Hampshire" "North Carolina" "Oklahoma"
## [21] "Rhode Island" "South Carolina" "South Dakota" "Tennessee"
## [25] "Virginia" "Wisconsin" "Wyoming"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc6, cex = 0.5, k = 4, color_labels_by_k = TRUE)
## Warning in get_col(col, k): Length of color vector was shorter than the number
## of clusters - color vector was recycled
#Get cophenetic cor using median method
hc6 <- hclust(d, method = "median")
cor(d, cophenetic(hc6))
## [1] 0.436442
#5. Hierarchical clustering using the mcquitty method
hc7 <- hclust(d, method = "mcquitty")
#cut tree to 4 clusters
grp <- cutree(hc7, k = 4)
#Check the Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 9 13 21 7
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Arkansas" "Georgia" "Kentucky"
## [5] "Louisiana" "Mississippi" "North Carolina" "South Carolina"
## [9] "Tennessee"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc7, cex = 0.5, k = 4, color_labels_by_k = TRUE)
#Get cophenetic cor using mcquitty method
hc7 <- hclust(d, method = "mcquitty")
cor(d, cophenetic(hc7))
## [1] 0.6212635
#5. Hierarchical clustering using the ward.D method
hc8 <- hclust(d, method = "ward.D")
#cut tree to 4 clusters
grp <- cutree(hc5, k = 4)
#Check the Number of members in each cluster
table(grp)
## grp
## 1 2 3 4
## 47 1 1 1
# Get the names for the members of cluster 1
rownames(df)[grp == 1]
## [1] "Alabama" "Arizona" "Arkansas" "California"
## [5] "Colorado" "Connecticut" "Delaware" "Florida"
## [9] "Georgia" "Hawaii" "Idaho" "Illinois"
## [13] "Indiana" "Iowa" "Kansas" "Kentucky"
## [17] "Louisiana" "Maine" "Maryland" "Massachusetts"
## [21] "Michigan" "Minnesota" "Mississippi" "Missouri"
## [25] "Montana" "Nebraska" "Nevada" "New Hampshire"
## [29] "New Jersey" "New Mexico" "New York" "North Dakota"
## [33] "Ohio" "Oklahoma" "Oregon" "Pennsylvania"
## [37] "Rhode Island" "South Carolina" "South Dakota" "Tennessee"
## [41] "Texas" "Utah" "Virginia" "Washington"
## [45] "West Virginia" "Wisconsin" "Wyoming"
#Visual
library(igraph) # this library helps in plotting the phylogenic tree
fviz_dend(hc8, cex = 0.5, k = 4, color_labels_by_k = TRUE)
#Get cophenetic cor using ward.D method
hc5 <- hclust(d, method = "ward.D")
cor(d, cophenetic(hc8))
## [1] 0.6844016
Conclusion: the best method to use is the average method as it has the highest cophenetic correlation coefficient of 0.72, and the worst method to use is the median method as it has the lowest cophenetic correlation coefficient of 0.44.
Note: the closer the value of the correlation coefficient is to 1, the more accurately the clustering solution reflects your data. Values above 0.75 are felt to be good. The “average” linkage method appears to produce high values of this statistic. This may be one reason that it is so popular.