Implement the hierarchical clustering algorithm using the Arrests data set
data("USArrests")
Remove any missing value (i.e, NA values for not available) That might be present in the data
df <- na.omit(USArrests)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 13.2 236 58 21.2
## Alaska 10.0 263 48 44.5
## Arizona 8.1 294 80 31.0
## Arkansas 8.8 190 50 19.5
## California 9.0 276 91 40.6
## Colorado 7.9 204 78 38.7
Before hierarchical clustering, we can compute some descriptive statistics
desc_stats <- data.frame(
Min = apply(df, 2, min), # minimum
Med = apply(df, 2, median), # median
Mean = apply(df, 2, mean), # mean
SD = apply(df, 2, sd), # Standard deviation
Max = apply(df, 2, max) # Maximum
)
desc_stats <- round(desc_stats, 1)
head(desc_stats)
## Min Med Mean SD Max
## Murder 0.8 7.2 7.8 4.4 17.4
## Assault 45.0 159.0 170.8 83.3 337.0
## UrbanPop 32.0 66.0 65.5 14.5 91.0
## Rape 7.3 20.1 21.2 9.4 46.0
We note that the variables have a large different means and variances. This is explained by the fact that the variables are measured in different units; Murder, Rape, and Assault are measured as the number of occurrences per 100 000 people, and UrbanPop is the percentage of the state’s population that lives in an urban area. They must be standardized (i.e., scaled) to make them comparable. Recall that, standardization consists of transforming the variables such that they have mean zero and standard deviation one.
As we don’t want the hierarchical clustering result to depend to an arbitrary variable unit, we start by scaling the data using the R function scale() as follows
df <- scale(df)
head(df)
## Murder Assault UrbanPop Rape
## Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
## Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
## Arizona 0.07163341 1.4788032 0.9989801 1.042878388
## Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
## California 0.27826823 1.2628144 1.7589234 2.067820292
## Colorado 0.02571456 0.3988593 0.8608085 1.864967207
First we use the dist() function to compute the Euclidean distance between observations, d will be the first argument in the hclust() function dissimilarity matrix
d <- dist(df, method = "euclidean")
We then hierarchical clustering using the Ward’s method
res.hc <- hclust(d, method = "ward.D2" )
Lastly, we plot the obtained dendrogram
plot(res.hc, cex = 0.6, hang = -1)
Using the US Arrests data sets in the above example, compute hierarchical clustering with other linkage methods, such as single, median, average, centro id, Ward’s and McQuitty’s.
res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "median" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)
Perform hierarchical clustering using the mtcars dataset
df <- mtcars
head(df)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
Checking for nulls
anyNA(df)
## [1] FALSE
Descriptive statistics
desc_stats <- data.frame(
Min = apply(df, 2, min), # minimum
Med = apply(df, 2, median), # median
Mean = apply(df, 2, mean), # mean
SD = apply(df, 2, sd), # Standard deviation
Max = apply(df, 2, max) # Maximum
)
desc_stats <- round(desc_stats, 1)
head(desc_stats)
## Min Med Mean SD Max
## mpg 10.4 19.2 20.1 6.0 33.9
## cyl 4.0 6.0 6.2 1.8 8.0
## disp 71.1 196.3 230.7 123.9 472.0
## hp 52.0 123.0 146.7 68.6 335.0
## drat 2.8 3.7 3.6 0.5 4.9
## wt 1.5 3.3 3.2 1.0 5.4
Scaling
df <- scale(df)
head(df)
## mpg cyl disp hp drat
## Mazda RX4 0.1508848 -0.1049878 -0.57061982 -0.5350928 0.5675137
## Mazda RX4 Wag 0.1508848 -0.1049878 -0.57061982 -0.5350928 0.5675137
## Datsun 710 0.4495434 -1.2248578 -0.99018209 -0.7830405 0.4739996
## Hornet 4 Drive 0.2172534 -0.1049878 0.22009369 -0.5350928 -0.9661175
## Hornet Sportabout -0.2307345 1.0148821 1.04308123 0.4129422 -0.8351978
## Valiant -0.3302874 -0.1049878 -0.04616698 -0.6080186 -1.5646078
## wt qsec vs am gear
## Mazda RX4 -0.610399567 -0.7771651 -0.8680278 1.1899014 0.4235542
## Mazda RX4 Wag -0.349785269 -0.4637808 -0.8680278 1.1899014 0.4235542
## Datsun 710 -0.917004624 0.4260068 1.1160357 1.1899014 0.4235542
## Hornet 4 Drive -0.002299538 0.8904872 1.1160357 -0.8141431 -0.9318192
## Hornet Sportabout 0.227654255 -0.4637808 -0.8680278 -0.8141431 -0.9318192
## Valiant 0.248094592 1.3269868 1.1160357 -0.8141431 -0.9318192
## carb
## Mazda RX4 0.7352031
## Mazda RX4 Wag 0.7352031
## Datsun 710 -1.1221521
## Hornet 4 Drive -1.1221521
## Hornet Sportabout -0.5030337
## Valiant -1.1221521
d <- dist(df, method = "euclidean")
res.hc <- hclust(d, method = "ward.D2" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)
Perform hierarchical clustering using the iris dataset
df <- iris
head(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
anyNA(df)
## [1] FALSE
df$Species <- unclass(df$Species)
head(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 1
## 2 4.9 3.0 1.4 0.2 1
## 3 4.7 3.2 1.3 0.2 1
## 4 4.6 3.1 1.5 0.2 1
## 5 5.0 3.6 1.4 0.2 1
## 6 5.4 3.9 1.7 0.4 1
desc_stats <- data.frame(
Min = apply(df, 2, min), # minimum
Med = apply(df, 2, median), # median
Mean = apply(df, 2, mean), # mean
SD = apply(df, 2, sd), # Standard deviation
Max = apply(df, 2, max) # Maximum
)
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA
## Warning in mean.default(sort(x, partial = half + 0L:1L)[half + 0L:1L]): argument
## is not numeric or logical: returning NA
## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA
## Warning in mean.default(newX[, i], ...): argument is not numeric or logical:
## returning NA
head(desc_stats)
## Min Med Mean SD Max
## Sepal.Length 4.3 NA NA 0.8280661 7.9
## Sepal.Width 2.0 NA NA 0.4358663 4.4
## Petal.Length 1.0 NA NA 1.7652982 6.9
## Petal.Width 0.1 NA NA 0.7622377 2.5
## Species 1 NA NA 0.8192319 3
head(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 1
## 2 4.9 3.0 1.4 0.2 1
## 3 4.7 3.2 1.3 0.2 1
## 4 4.6 3.1 1.5 0.2 1
## 5 5.0 3.6 1.4 0.2 1
## 6 5.4 3.9 1.7 0.4 1
d <- dist(df, method = "euclidean")
res.hc <- hclust(d, method = "ward.D2" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)
Perform hierarchical cluster analysis on the given protein consumption. Dataset url = http://bit.ly/HierarchicalClusteringDataset
library(data.table)
## Warning: package 'data.table' was built under R version 4.1.2
data <- fread('http://bit.ly/HierarchicalClusteringDataset')
head(data)
## Country RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr&Veg
## 1: Albania 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7
## 2: Austria 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3
## 3: Belgium 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1 4.0
## 4: Bulgaria 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7 4.2
## 5: Czechoslovakia 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0
## 6: Denmark 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4
data$Country<- NULL
head(data)
## RedMeat WhiteMeat Eggs Milk Fish Cereals Starch Nuts Fr&Veg
## 1: 10.1 1.4 0.5 8.9 0.2 42.3 0.6 5.5 1.7
## 2: 8.9 14.0 4.3 19.9 2.1 28.0 3.6 1.3 4.3
## 3: 13.5 9.3 4.1 17.5 4.5 26.6 5.7 2.1 4.0
## 4: 7.8 6.0 1.6 8.3 1.2 56.7 1.1 3.7 4.2
## 5: 9.7 11.4 2.8 12.5 2.0 34.3 5.0 1.1 4.0
## 6: 10.6 10.8 3.7 25.0 9.9 21.9 4.8 0.7 2.4
desc_stats <- data.frame(
Min = apply(data, 2, min), # minimum
Med = apply(data, 2, median), # median
Mean = apply(data, 2, mean), # mean
SD = apply(data, 2, sd), # Standard deviation
Max = apply(data, 2, max) # Maximum
)
desc_stats <- round(desc_stats, 1)
head(desc_stats)
## Min Med Mean SD Max
## RedMeat 4.4 9.5 9.8 3.3 18.0
## WhiteMeat 1.4 7.8 7.9 3.7 14.0
## Eggs 0.5 2.9 2.9 1.1 4.7
## Milk 4.9 17.6 17.1 7.1 33.7
## Fish 0.2 3.4 4.3 3.4 14.2
## Cereals 18.6 28.0 32.2 11.0 56.7
data <- scale(data)
head(df)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 1
## 2 4.9 3.0 1.4 0.2 1
## 3 4.7 3.2 1.3 0.2 1
## 4 4.6 3.1 1.5 0.2 1
## 5 5.0 3.6 1.4 0.2 1
## 6 5.4 3.9 1.7 0.4 1
d <- dist(data, method = "euclidean")
res.hc <- hclust(d, method = "ward.D2" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "single" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "average" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "centroid" )
plot(res.hc, cex = 0.6, hang = -1)
res.hc <- hclust(d, method = "mcquitty" )
plot(res.hc, cex = 0.6, hang = -1)