Hierarchical Clustering

Example

Implement the hierarchical clustering algorithm using the Arrests data set

data("USArrests")

Remove any missing value (i.e, NA values for not available) That might be present in the data

df <- na.omit(USArrests)
head(df)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

Before hierarchical clustering, we can compute some descriptive statistics

desc_stats <- data.frame(
  Min = apply(df, 2, min),    # minimum
  Med = apply(df, 2, median), # median
  Mean = apply(df, 2, mean),  # mean
  SD = apply(df, 2, sd),      # Standard deviation
  Max = apply(df, 2, max)     # Maximum
)
desc_stats <- round(desc_stats, 1)
head(desc_stats)
##           Min   Med  Mean   SD   Max
## Murder    0.8   7.2   7.8  4.4  17.4
## Assault  45.0 159.0 170.8 83.3 337.0
## UrbanPop 32.0  66.0  65.5 14.5  91.0
## Rape      7.3  20.1  21.2  9.4  46.0

We note that the variables have a large different means and variances. This is explained by the fact that the variables are measured in different units; Murder, Rape, and Assault are measured as the number of occurrences per 100 000 people, and UrbanPop is the percentage of the state’s population that lives in an urban area. They must be standardized (i.e., scaled) to make them comparable. Recall that, standardization consists of transforming the variables such that they have mean zero and standard deviation one.

As we don’t want the hierarchical clustering result to depend to an arbitrary variable unit, we start by scaling the data using the R function scale() as follows

df <- scale(df)
head(df)
##                Murder   Assault   UrbanPop         Rape
## Alabama    1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska     0.50786248 1.1068225 -1.2117642  2.484202941
## Arizona    0.07163341 1.4788032  0.9989801  1.042878388
## Arkansas   0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144  1.7589234  2.067820292
## Colorado   0.02571456 0.3988593  0.8608085  1.864967207

First we use the dist() function to compute the Euclidean distance between observations, d will be the first argument in the hclust() function dissimilarity matrix

d <- dist(df, method = "euclidean")

We then hierarchical clustering using the Ward’s method

res.hc <- hclust(d, method = "ward.D2" )

Lastly, we plot the obtained dendrogram

plot(res.hc, cex = 0.6, hang = -1)

Challenge 1

Using the US Arrests data sets in the above example, compute hierarchical clustering with other linkage methods, such as single, median, average, centro id, Ward’s and McQuitty’s.

  1. single method
res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)

  1. median method
res.hc <- hclust(d, method = "median" )
plot(res.hc, cex = 0.6, hang = -1)

  1. average method
res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)

  1. centroid method
res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)

  1. macquitty method
res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)

Challenge 2

Perform hierarchical clustering using the mtcars dataset

df <- mtcars
head(df)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Checking for nulls

anyNA(df)
## [1] FALSE

Descriptive statistics

desc_stats <- data.frame(
  Min = apply(df, 2, min),    # minimum
  Med = apply(df, 2, median), # median
  Mean = apply(df, 2, mean),  # mean
  SD = apply(df, 2, sd),      # Standard deviation
  Max = apply(df, 2, max)     # Maximum
)
desc_stats <- round(desc_stats, 1)
head(desc_stats)
##       Min   Med  Mean    SD   Max
## mpg  10.4  19.2  20.1   6.0  33.9
## cyl   4.0   6.0   6.2   1.8   8.0
## disp 71.1 196.3 230.7 123.9 472.0
## hp   52.0 123.0 146.7  68.6 335.0
## drat  2.8   3.7   3.6   0.5   4.9
## wt    1.5   3.3   3.2   1.0   5.4

Scaling

df <- scale(df)
head(df)
##                          mpg        cyl        disp         hp       drat
## Mazda RX4          0.1508848 -0.1049878 -0.57061982 -0.5350928  0.5675137
## Mazda RX4 Wag      0.1508848 -0.1049878 -0.57061982 -0.5350928  0.5675137
## Datsun 710         0.4495434 -1.2248578 -0.99018209 -0.7830405  0.4739996
## Hornet 4 Drive     0.2172534 -0.1049878  0.22009369 -0.5350928 -0.9661175
## Hornet Sportabout -0.2307345  1.0148821  1.04308123  0.4129422 -0.8351978
## Valiant           -0.3302874 -0.1049878 -0.04616698 -0.6080186 -1.5646078
##                             wt       qsec         vs         am       gear
## Mazda RX4         -0.610399567 -0.7771651 -0.8680278  1.1899014  0.4235542
## Mazda RX4 Wag     -0.349785269 -0.4637808 -0.8680278  1.1899014  0.4235542
## Datsun 710        -0.917004624  0.4260068  1.1160357  1.1899014  0.4235542
## Hornet 4 Drive    -0.002299538  0.8904872  1.1160357 -0.8141431 -0.9318192
## Hornet Sportabout  0.227654255 -0.4637808 -0.8680278 -0.8141431 -0.9318192
## Valiant            0.248094592  1.3269868  1.1160357 -0.8141431 -0.9318192
##                         carb
## Mazda RX4          0.7352031
## Mazda RX4 Wag      0.7352031
## Datsun 710        -1.1221521
## Hornet 4 Drive    -1.1221521
## Hornet Sportabout -0.5030337
## Valiant           -1.1221521
d <- dist(df, method = "euclidean")
res.hc <- hclust(d, method = "ward.D2" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)

Challenge 3

Perform hierarchical clustering using the iris dataset

df <- iris
head(df)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
anyNA(df)
## [1] FALSE
df$Species <- unclass(df$Species)
head(df)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2       1
## 2          4.9         3.0          1.4         0.2       1
## 3          4.7         3.2          1.3         0.2       1
## 4          4.6         3.1          1.5         0.2       1
## 5          5.0         3.6          1.4         0.2       1
## 6          5.4         3.9          1.7         0.4       1
desc_stats <- data.frame(
  Min = apply(df, 2, min),    # minimum
  Med = apply(df, 2, median), # median
  Mean = apply(df, 2, mean),  # mean
  SD = apply(df, 2, sd),      # Standard deviation
  Max = apply(df, 2, max)     # Maximum
)
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA

## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA
## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA

## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA
head(desc_stats)
##              Min Med Mean        SD Max
## Sepal.Length 4.3  NA   NA 0.8280661 7.9
## Sepal.Width  2.0  NA   NA 0.4358663 4.4
## Petal.Length 1.0  NA   NA 1.7652982 6.9
## Petal.Width  0.1  NA   NA 0.7622377 2.5
## Species        1  NA   NA 0.8192319   3
head(df)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2       1
## 2          4.9         3.0          1.4         0.2       1
## 3          4.7         3.2          1.3         0.2       1
## 4          4.6         3.1          1.5         0.2       1
## 5          5.0         3.6          1.4         0.2       1
## 6          5.4         3.9          1.7         0.4       1
d <- dist(df, method = "euclidean")
res.hc <- hclust(d, method = "ward.D2" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)

Challenge 4

Perform hierarchical cluster analysis on the given protein consumption. Dataset url = http://bit.ly/HierarchicalClusteringDataset

library(data.table)
## Warning: package 'data.table' was built under R version 4.1.2
data <- fread('http://bit.ly/HierarchicalClusteringDataset')
head(data)
##           Country RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr&Veg
## 1:        Albania    10.1       1.4  0.5  8.9  0.2    42.3    0.6  5.5    1.7
## 2:        Austria     8.9      14.0  4.3 19.9  2.1    28.0    3.6  1.3    4.3
## 3:        Belgium    13.5       9.3  4.1 17.5  4.5    26.6    5.7  2.1    4.0
## 4:       Bulgaria     7.8       6.0  1.6  8.3  1.2    56.7    1.1  3.7    4.2
## 5: Czechoslovakia     9.7      11.4  2.8 12.5  2.0    34.3    5.0  1.1    4.0
## 6:        Denmark    10.6      10.8  3.7 25.0  9.9    21.9    4.8  0.7    2.4
data$Country<- NULL
head(data)
##    RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr&Veg
## 1:    10.1       1.4  0.5  8.9  0.2    42.3    0.6  5.5    1.7
## 2:     8.9      14.0  4.3 19.9  2.1    28.0    3.6  1.3    4.3
## 3:    13.5       9.3  4.1 17.5  4.5    26.6    5.7  2.1    4.0
## 4:     7.8       6.0  1.6  8.3  1.2    56.7    1.1  3.7    4.2
## 5:     9.7      11.4  2.8 12.5  2.0    34.3    5.0  1.1    4.0
## 6:    10.6      10.8  3.7 25.0  9.9    21.9    4.8  0.7    2.4
desc_stats <- data.frame(
  Min = apply(data, 2, min),    # minimum
  Med = apply(data, 2, median), # median
  Mean = apply(data, 2, mean),  # mean
  SD = apply(data, 2, sd),      # Standard deviation
  Max = apply(data, 2, max)     # Maximum
)
desc_stats <- round(desc_stats, 1)
head(desc_stats)
##            Min  Med Mean   SD  Max
## RedMeat    4.4  9.5  9.8  3.3 18.0
## WhiteMeat  1.4  7.8  7.9  3.7 14.0
## Eggs       0.5  2.9  2.9  1.1  4.7
## Milk       4.9 17.6 17.1  7.1 33.7
## Fish       0.2  3.4  4.3  3.4 14.2
## Cereals   18.6 28.0 32.2 11.0 56.7
data <- scale(data)
head(df)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2       1
## 2          4.9         3.0          1.4         0.2       1
## 3          4.7         3.2          1.3         0.2       1
## 4          4.6         3.1          1.5         0.2       1
## 5          5.0         3.6          1.4         0.2       1
## 6          5.4         3.9          1.7         0.4       1
d <- dist(data, method = "euclidean")
res.hc <- hclust(d, method = "ward.D2" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)

res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)