# PAC2: Dendrogram ----

# Load libraries
library(knitr)
library(ggplot2)
library(ggdendro)
library(dendextend)
library(readr)

# Load the dataset
hdi_data <- read_csv("human-development-index.csv")

# Keep only the 2022 observations that actually carry an HDI value: a missing
# value would propagate NAs into the distance matrix and make hclust() fail.
hdi_2022 <- subset(
  hdi_data,
  Year == 2022 & !is.na(`Human Development Index`)
)
stopifnot(nrow(hdi_2022) >= 2)  # clustering needs at least two observations

# Standardize HDI values for clustering (scale() returns a one-column matrix)
hdi_scaled <- scale(hdi_2022$`Human Development Index`)

# Name the rows after the entities so that dist() and hclust() carry the
# labels along instead of falling back to bare row indices
rownames(hdi_scaled) <- hdi_2022$Entity

# Perform hierarchical clustering
d <- dist(hdi_scaled)  # Euclidean distance matrix between entities
hc <- hclust(d, method = "ward.D2")  # hierarchical clustering with Ward's method

# Convert the hclust object to a dendrogram
dend <- as.dendrogram(hc)

# Labels are assigned to the leaves in leaf order, not in the row order of the
# data, so the entity names must be permuted with the leaf ordering first;
# passing hdi_2022$Entity as-is would attach names to the wrong leaves.
leaf_labels <- hdi_2022$Entity[order.dendrogram(dend)]

# Use dendextend to customize the dendrogram
dend <- dend %>%
  set("labels", leaf_labels) %>%
  set("labels_cex", 0.7) %>%
  set("branches_k_color", k = 5) %>%
  set("branches_lwd", 0.6) %>%
  set("labels_colors", value = "blue")

# Plot the dendrogram
plot(dend, main = "Dendrogram of Countries Based on HDI (2022)", horiz = TRUE)

# Convert hclust object to dendrogram data for ggplot
dend_data <- as.dendrogram(hc)
dend_data <- ggdendro::dendro_data(dend_data, type = "rectangle")

# Label every leaf with its entity name. hc$order lists the data rows in leaf
# order and each label's x is its leaf position, so indexing by x picks the
# matching entity whether or not the clustering itself carried labels.
leaf_entities <- hdi_2022$Entity[hc$order]
leaf_position <- as.integer(round(dend_data$labels$x))
dend_data$labels$label <- leaf_entities[leaf_position]

# Plot using ggplot2
# The merge height (distance) is mapped to y and the leaf position to x;
# coord_flip() only swaps where they are drawn, so the "Distance" title
# belongs to y.
ggplot() +
  geom_segment(
    data = dend_data$segments,
    aes(x = x, y = y, xend = xend, yend = yend)
  ) +
  geom_text(
    data = dend_data$labels,
    aes(x = x, y = y, label = label),
    hjust = 1,
    size = 3
  ) +
  coord_flip() +
  labs(
    title = "Dendrogram of Countries Based on HDI (2022)",
    x = NULL,
    y = "Distance"
  ) +
  theme_minimal()