Hierarchical clustering is, alongside K-means, one of the most widely used forms of unsupervised learning: a family of machine learning algorithms used to draw inferences from unlabeled data. Unlike K-means, this approach does not require specifying the number of clusters in advance.
data("USArrests") # Load the data set
head(USArrests, 5)
str(USArrests)
'data.frame': 50 obs. of 4 variables:
$ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
$ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
$ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
$ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
any(is.na(USArrests)) # check for missing values
[1] FALSE
summary(USArrests)
Murder Assault UrbanPop Rape
Min. : 0.800 Min. : 45.0 Min. :32.00 Min. : 7.30
1st Qu.: 4.075 1st Qu.:109.0 1st Qu.:54.50 1st Qu.:15.07
Median : 7.250 Median :159.0 Median :66.00 Median :20.10
Mean : 7.788 Mean :170.8 Mean :65.54 Mean :21.23
3rd Qu.:11.250 3rd Qu.:249.0 3rd Qu.:77.75 3rd Qu.:26.18
Max. :17.400 Max. :337.0 Max. :91.00 Max. :46.00
library(cluster)
library(dendextend)
library(factoextra)
dim(USArrests)
[1] 50 4
# Standardize the variables: they are on very different scales (e.g. Assault runs to 337 while Murder tops out at 17.4)
US_df <- scale(USArrests)
head(US_df, 4)
Murder Assault UrbanPop Rape
Alabama 1.24256408 0.7828393 -0.5209066 -0.003416473
Alaska 0.50786248 1.1068225 -1.2117642 2.484202941
Arizona 0.07163341 1.4788032 0.9989801 1.042878388
Arkansas 0.23234938 0.2308680 -1.0735927 -0.184916602
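As a quick check, scale() should have centered each column to mean 0 and unit standard deviation:
round(colMeans(US_df), 10) # column means are ~0 after centering
apply(US_df, 2, sd)        # column standard deviations are all 1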
We can perform agglomerative hierarchical clustering with hclust. First we compute the dissimilarity values with dist, then feed them into hclust, specifying the agglomeration (linkage) method to use (e.g. “complete”, “average”, “single”, “ward.D”). We can then plot the dendrogram.
dist_us <- dist(US_df, method = "euclidean")
head(dist_us)
[1] 2.703754 2.293520 1.289810 3.263110 2.651067 3.215297
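Note that dist stores only the lower triangle of the distance matrix, one value per pair of states:
length(dist_us) # choose(50, 2) = 1225 pairwise distances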
hlink <- hclust(dist_us, method = "complete")
plot(hlink, cex = 0.7, hang = -2) # dendrogram; a negative hang drops all labels to height 0
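Since the choice of linkage method affects the resulting tree, it can be worth comparing methods before committing to one. A quick sketch using the agglomerative coefficient reported by agnes from the cluster package (values closer to 1 suggest a stronger clustering structure):
# Agglomerative coefficient for several linkage methods
link_methods <- c("average", "single", "complete", "ward")
sapply(link_methods, function(m) agnes(US_df, method = m)$ac)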
Working with Dendrograms
# Ward's method
hlink2 <- hclust(dist_us, method = "ward.D2")
# Cut tree into 4 groups
sub_grp <- cutree(hlink2, k = 4)
# Number of members in each cluster
table(sub_grp)
sub_grp
1 2 3 4
7 12 19 12
plot(hlink2, cex = 0.6)
rect.hclust(hlink2, k = 4, border = 2:5) # draw a colored border around each of the 4 clusters
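Instead of asking for a fixed number of groups, cutree can also cut the tree at a given height. A small sketch (the height of 6 is illustrative, not from the original analysis; read a sensible value off the dendrogram's vertical axis):
sub_grp_h <- cutree(hlink2, h = 6) # h = 6 is an assumed, illustrative cut height
table(sub_grp_h)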
library(dplyr)
USArrests %>% mutate(cluster = sub_grp) %>% head()
As we saw in the k-means tutorial, we can also use the fviz_cluster function from the factoextra package to visualize the result in a scatter plot (with more than two variables, fviz_cluster plots the observations on the first two principal components).
fviz_cluster(list(data = US_df, cluster = sub_grp))
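The choice of k = 4 above was made for illustration. As a sketch of one way to assess it, factoextra's fviz_nbclust can be paired with hcut (hierarchical cutting) as the clustering function to plot the usual elbow and silhouette heuristics:
# Elbow (total within-cluster sum of squares) and average silhouette heuristics
fviz_nbclust(US_df, FUNcluster = hcut, method = "wss")
fviz_nbclust(US_df, FUNcluster = hcut, method = "silhouette")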
# Agglomerative clustering with agnes() from the cluster package, using Ward's method
hagnes <- agnes(US_df, method = "ward")
cutree(as.hclust(hagnes), k = 4)
Alabama Alaska Arizona Arkansas California Colorado Connecticut
1 2 2 3 2 2 3
Delaware Florida Georgia Hawaii Idaho Illinois Indiana
3 2 1 3 4 2 3
Iowa Kansas Kentucky Louisiana Maine Maryland Massachusetts
4 3 3 1 4 2 3
Michigan Minnesota Mississippi Missouri Montana Nebraska Nevada
2 4 1 3 4 4 2
New Hampshire New Jersey New Mexico New York North Carolina North Dakota Ohio
4 3 2 2 1 4 3
Oklahoma Oregon Pennsylvania Rhode Island South Carolina South Dakota Tennessee
3 3 3 3 1 4 1
Texas Utah Vermont Virginia Washington West Virginia Wisconsin
2 3 4 3 3 4 4
Wyoming
3
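The cluster package can also plot the agnes tree directly with pltree:
pltree(hagnes, cex = 0.6, hang = -1, main = "Dendrogram of agnes") # dendrogram of the agnes fit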
# Create two dendrograms and compare them with a tanglegram
dend1 <- as.dendrogram(hlink)
dend2 <- as.dendrogram(hlink2)
den <- dendlist(dend1, dend2)
tanglegram(dend1, dend2)
tanglegram(dend1, dend2,
           highlight_distinct_edges = FALSE,
           common_subtrees_color_lines = FALSE,
           common_subtrees_color_branches = TRUE,
           main = paste("Entanglement =", round(entanglement(den), 2)))
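For a numeric counterpart to the tanglegram, dendextend's cor.dendlist computes the correlation between the trees; a sketch using the cophenetic correlation (1 means the trees are identical):
# Cophenetic correlation matrix between the two dendrograms
cor.dendlist(den, method = "cophenetic")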