Here are the steps on how to create a Hierarchical cluster analysis:
setwd("~/Documents/CT 2023 prescription counts by town")
library(tidyverse) # data manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster) # clustering algorithms
library(factoextra) # clustering visualization
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dendextend) # for comparing two dendrograms
##
## ---------------------
## Welcome to dendextend version 1.17.1
## Type citation('dendextend') for how to cite the package.
##
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
##
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## You may ask questions at stackoverflow, use the r and dendextend tags:
## https://stackoverflow.com/questions/tagged/dendextend
##
## To suppress this message use: suppressPackageStartupMessages(library(dendextend))
## ---------------------
##
##
## Attaching package: 'dendextend'
##
## The following object is masked from 'package:stats':
##
## cutree
df_PMP<- read.csv("Prescription_Monitoring_Program_CT_2023.csv", row.names = 1)
head(df_PMP, 5) # View the first 5 rows of the data
## Naloxone Opiate.Agonist Opiate.Partial.Agonist Benzodiazepine
## Andover 10 1256 131 1217
## Ansonia 185 9629 1645 6559
## Ashford 28 1678 182 1490
## Avon 45 4425 421 6723
## Barkhamsted 12 1490 80 1142
## Stimulant Gabapentin
## Andover 939 817
## Ansonia 5425 4459
## Ashford 1562 1185
## Avon 6803 3313
## Barkhamsted 1194 862
df_PMP <- na.omit(df_PMP)
df <- scale(df_PMP)
head(df, 5) # View the first 5 rows of the data
## Naloxone Opiate.Agonist Opiate.Partial.Agonist Benzodiazepine
## Andover -0.4316241 -0.7170122 -0.6779584 -0.84984926
## Ansonia 0.1340374 0.2623889 0.1031724 -0.11800769
## Ashford -0.3734418 -0.6676502 -0.6516455 -0.81244890
## Avon -0.3184918 -0.3463300 -0.5283362 -0.09554007
## Barkhamsted -0.4251594 -0.6896409 -0.7042712 -0.86012409
## Stimulant Gabapentin
## Andover -0.90373498 -0.70622561
## Ansonia -0.17529756 -0.02757079
## Ashford -0.80257214 -0.63765203
## Avon 0.04846231 -0.24111786
## Barkhamsted -0.86232804 -0.69784025
d <- dist(df, method = "euclidean") # Compute distance matrix
hc1 <- hclust(d, method = "complete" ) # Hierarchical clustering using 'complete' linkage
# Plot the obtained dendrogram
plot(hc1, cex = 0.6, hang = -1, main = "Dendrogram: Prescription by town of residence using complete linkage method, 2023")
hc2 <- agnes(df, method = "complete") # Compute with agnes
hc2$ac # Agglomerative coefficient
## [1] 0.9700968
# methods to assess
m <- c( "average", "single", "complete", "ward")
names(m) <- c( "average", "single", "complete", "ward")
# function to compute coefficient
ac <- function(x) {
agnes(df, method = x)$ac }
map_dbl(m, ac)
## average single complete ward
## 0.9577365 0.9045642 0.9700968 0.9883423
hc3 <- agnes(df, method = "ward")
pltree(hc3, cex = 0.6, hang = -1, main = "Dendrogram: Prescription by town of residence using Ward method, 2023")
The diana() function provided by the {cluster} package allows us to perform divisive hierarchical clustering. In addition, the diana() function works similar to the agnes() function; however, there is no method provided.
hc4 <- diana(df)
hc4$dc # Amount of clustering structure found
## [1] 0.967901
pltree(hc4, cex = 0.6, hang = -1, main = "Dendrogram: Prescription by town of residence using divisive hierarchical clustering, 2023")
d <- dist(df, method = "euclidean") # Compute distance matrix
hc5 <- hclust(d, method = "ward.D2" ) # Ward's method
sub_grp <- cutree(hc5, k = 9) # Cut tree into 9 groups
table(sub_grp) # Number of members in each cluster
## sub_grp
## 1 2 3 4 5 6 7 8 9
## 73 39 24 13 3 3 9 4 1
plot(hc5, cex = 0.6, main = "Dendrogram: Prescription by town of residence using Ward.D2 method divided with nine clusters, 2023")
rect.hclust(hc5, k = 9, border = 2:5)
ite, inflammate omnia