Learning Objectives

In this lesson students will …

  • Implement the hierarchical clustering algorithm
  • Visualize the model

Resources:

The following example comes from:

Hierarchical clustering

Step 0: Load Data

# Read bdiag.csv from the course GitHub repository;
# character columns are converted to factors on import.
bdiag <- read.csv(
  file = "https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/bdiag.csv",
  stringsAsFactors = TRUE
)

Step 1: Select

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.1 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.3.0      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# Keep only the two columns that will be used for clustering.
bdiag.2vars <- bdiag %>%
  select(radius_mean, texture_mean)

Step 2: Distances

# Pairwise Euclidean distances between the observations (rows).
bdiag.dist <- dist(x = bdiag.2vars, method = "euclidean")

#### CHECK: what is dist() doing?

# The first entry is the distance between obs 1 and obs 2.
bdiag.dist[1]
## [1] 7.82742

# Show observations 1 and 2.
bdiag.2vars[seq_len(2), ]
##   radius_mean texture_mean
## 1       17.99        10.38
## 2       20.57        17.77

# Euclidean distance between obs 1 and obs 2, computed by hand:
# square root of the sum of squared column-wise differences.
sqrt(sum((bdiag.2vars[1, ] - bdiag.2vars[2, ])^2))
## [1] 7.82742

Step 3: Complete Linkage

# Build the dendrogram with agglomerative clustering, complete linkage.
bdiag.ddgram <- hclust(d = bdiag.dist, method = "complete")

Step 4: Plot

# Plot the dendrogram.
# A negative hang (here -1) makes the labels hang down from 0,
# so all leaf labels line up along the bottom of the plot.
plot(x = bdiag.ddgram, hang = -1, cex = 0.4)

Step 5: Cut

# Redraw the dendrogram and mark a cut at height 20
# with a dashed horizontal line.
plot(x = bdiag.ddgram, hang = -1, cex = 0.4)
abline(h = 20, lty = "dashed")

Step 6: Cluster

# Cut the tree into 3 clusters; group3 holds one cluster label per observation.
group3 <- cutree(tree = bdiag.ddgram, k = 3)

# Redraw the dendrogram and box the 3 clusters.
plot(x = bdiag.ddgram, hang = -1, cex = 0.4)
rect.hclust(tree = bdiag.ddgram, k = 3, border = 2:5)

# Number of observations in each cluster.
table(group3)
## group3
##   1   2   3 
## 274 224  71

Step 7: Visualize

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Plot the observations grouped by their cluster assignment.
fviz_cluster(
  object = list(data = bdiag.2vars, cluster = group3)
)