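In this lesson students will compute pairwise Euclidean distances between observations, build a hierarchical clustering dendrogram using complete linkage, cut the tree into three clusters, and visualize the resulting groups.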
The following example comes from:
# Attach the tidyverse up front: select() used below comes from it.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()

# Download the bdiag data set; any text columns are read in as factors.
bdiag <- read.csv(
  file = "https://raw.githubusercontent.com/kitadasmalley/DATA252/main/Data/bdiag.csv",
  stringsAsFactors = TRUE
)

# Restrict the data to the two measurements that will be clustered on.
bdiag.2vars <- select(bdiag, radius_mean, texture_mean)
# Euclidean distance between every pair of observations (rows).
bdiag.dist <- dist(x = bdiag.2vars, method = "euclidean")

#### CHECK: what is dist() doing?
# The first stored entry of the "dist" object is the distance between
# observation 1 and observation 2.
bdiag.dist[1]
## [1] 7.82742

# The two observations being compared.
head(bdiag.2vars, n = 2)
##   radius_mean texture_mean
## 1       17.99        10.38
## 2       20.57        17.77

# The same Euclidean distance by hand: square root of the summed squared
# coordinate differences between rows 1 and 2.
with(
  bdiag.2vars,
  sqrt(diff(radius_mean[1:2])^2 + diff(texture_mean[1:2])^2)
)
## [1] 7.82742
# Hierarchical clustering of the pairwise distances using complete linkage.
bdiag.ddgram <- hclust(d = bdiag.dist, method = "complete")

# Draw the dendrogram. hang = -1 makes the labels hang down from 0,
# and cex = .4 shrinks the label text.
plot(x = bdiag.ddgram, hang = -1, cex = .4)
## Step 5: Cut
# Redraw the dendrogram and overlay a dashed horizontal line at height 20
# to mark a candidate cut.
plot(x = bdiag.ddgram, hang = -1, cex = .4)
abline(h = 20, lty = "dashed")

# Redraw once more and box the branches of the three-cluster solution.
plot(x = bdiag.ddgram, hang = -1, cex = .4)
rect.hclust(tree = bdiag.ddgram, k = 3, border = 2:5)

# Cluster label for each observation when the tree is cut into three groups,
# followed by the number of observations in each group.
group3 <- cutree(tree = bdiag.ddgram, k = 3)
table(group3)
## group3
##   1   2   3
## 274 224  71
# factoextra supplies fviz_cluster() for plotting cluster assignments.
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Plot the observations coloured by their three-cluster assignment.
# fviz_cluster() is given a list bundling the data with the cluster labels.
fviz_cluster(
  object = list(data = bdiag.2vars, cluster = group3)
)