Activity 4.1 - Hierarchical clustering

SUBMISSION INSTRUCTIONS

  1. Render to html
  2. Publish your html to RPubs
  3. Submit a link to your published solutions
#Loading packages needed for this assignment
library(tidyverse)
library(cluster)
library(factoextra)
library(patchwork)

Question 1

Consider three data sets below. Each data set contains three “clusters” in two dimensions. In the first, the clusters are three convex spheres. (A convex cluster is one where all points in the cluster can be connected with a straight line that does not leave the cluster.) In the second, one cluster is a sphere; one is a ring; and one is a half-moon. In the third, two clusters are spirals and one is a sphere. Our goal is to compare the performance of various hierarchical methods in clustering different cluster shapes.

three_spheres <- read.csv('Data/cluster_data1.csv')
ring_moon_sphere <- read.csv('Data/cluster_data2.csv')
two_spirals_sphere <- read.csv('Data/cluster_data3.csv')

Perform agglomerative clustering with single, complete, average, and Ward linkages. Cut each tree to produce three clusters. Produce 12 scatterplots, one per data set/linkage combination, showing the 3-cluster solution. Title each graph with the linkage used, as well as the average silhouette width (\(\bar s\)) for that clustering solution. Use patchwork to create a nice 4x3 grid of your plots.

Discuss the following:

  1. Which linkage works best for which scenario?
  2. Does the average silhouette width always do a good job of measuring the quality of the clustering solution?

(Hint: you have a lot of repetitive code to write. You may find it helpful to write a function that takes a data set and a linkage method as arguments, does the clustering and computes average silhouette width, and produces the desired plot.)

library(tidyverse)
library(cluster)
library(factoextra)
library(patchwork)

three_spheres <- read.csv('Data/cluster_data1.csv')
ring_moon_sphere <- read.csv('Data/cluster_data2.csv')
two_spirals_sphere <- read.csv('Data/cluster_data3.csv')

cluster_and_plot <- function(data, linkage_method, dataset_name) {
  data_numeric <- data %>% select(where(is.numeric))
  
  dist_mat <- dist(data_numeric)
  hc <- hclust(dist_mat, method = linkage_method)
  
  clusters <- cutree(hc, k = 3)
  
  sil <- silhouette(clusters, dist_mat)
  avg_sil <- mean(sil[, 3])
  
  plot_data <- data_numeric %>%
    mutate(cluster = factor(clusters))
  
  p <- ggplot(plot_data, aes(x = x, y = y, color = cluster)) +
    geom_point(size = 1.5, alpha = 0.7) +
    scale_color_manual(values = c("#E41A1C", "#377EB8", "#4DAF4A")) +
    labs(title = paste0(str_to_title(linkage_method), " Linkage"),
         subtitle = bquote(bar(s) == .(round(avg_sil, 3)))) +
    theme_minimal() +
    theme(legend.position = "none",
          plot.title = element_text(size = 11, face = "bold"),
          plot.subtitle = element_text(size = 9),
          axis.title = element_text(size = 9),
          axis.text = element_text(size = 8))
  
  return(p)
}

linkage_methods <- c("single", "complete", "average", "ward.D2")

plots_spheres <- map(linkage_methods, 
                     ~cluster_and_plot(three_spheres, .x, "Three Spheres"))

plots_ring <- map(linkage_methods, 
                  ~cluster_and_plot(ring_moon_sphere, .x, "Ring-Moon-Sphere"))
plots_spirals <- map(linkage_methods, 
                     ~cluster_and_plot(two_spirals_sphere, .x, "Two Spirals-Sphere"))

combined_plot <- (plots_spheres[[1]] | plots_ring[[1]] | plots_spirals[[1]]) /
                 (plots_spheres[[2]] | plots_ring[[2]] | plots_spirals[[2]]) /
                 (plots_spheres[[3]] | plots_ring[[3]] | plots_spirals[[3]]) /
                 (plots_spheres[[4]] | plots_ring[[4]] | plots_spirals[[4]]) +
  plot_annotation(
    title = "Hierarchical Clustering Comparison Across Linkage Methods",
    theme = theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5))
  )
  print(combined_plot)

summary_results <- tibble(
  Dataset = rep(c("Three Spheres", "Ring-Moon-Sphere", "Two Spirals-Sphere"), each = 4),
  Linkage = rep(c("Single", "Complete", "Average", "Ward"), 3),
  Avg_Silhouette = numeric(12)
)

idx <- 1
for (dataset in list(three_spheres, ring_moon_sphere, two_spirals_sphere)) {
  for (method in linkage_methods) {
    data_numeric <- dataset %>% select(where(is.numeric))
    dist_mat <- dist(data_numeric)
    hc <- hclust(dist_mat, method = method)
    clusters <- cutree(hc, k = 3)
    sil <- silhouette(clusters, dist_mat)
    summary_results$Avg_Silhouette[idx] <- mean(sil[, 3])
    idx <- idx + 1
  }
}
print(summary_results %>% 
        pivot_wider(names_from = Dataset, values_from = Avg_Silhouette) %>%
        mutate(across(where(is.numeric), ~round(.x, 3))))
# A tibble: 4 × 4
  Linkage  `Three Spheres` `Ring-Moon-Sphere` `Two Spirals-Sphere`
  <chr>              <dbl>              <dbl>                <dbl>
1 Single             0.442              0.407                0.158
2 Complete           0.727              0.533                0.35 
3 Average            0.727              0.575                0.432
4 Ward               0.727              0.485                0.389

Question 2

Consider the data set below on milk content of 25 mammals. The variables have been pre-scaled to z-scores, hence no additional standardizing is necessary. (Data source: Everitt et al. Cluster analysis 4ed)

mammals <- read.csv('Data/mammal_milk.csv') %>% 
  column_to_rownames('Mammal')

A)

Perform agglomerative clustering with single, complete, average, and Ward linkages. Which has the best agglomerative coefficient?

library(tidyverse)
library(cluster)

mammals <- read.csv('Data/mammal_milk.csv') %>%
  column_to_rownames('Mammal')

linkage_methods <- c("single", "complete", "average", "ward")

compute_agglom_coef <- function(data, method) {
  ag <- agnes(data, method = method)
  return(ag$ac)
}

agglom_results <- tibble(
  Linkage_Method = linkage_methods,
  Agglomerative_Coefficient = map_dbl(linkage_methods, 
                                       ~compute_agglom_coef(mammals, .x))
) %>%
  arrange(desc(Agglomerative_Coefficient))

print(agglom_results, n = Inf)
# A tibble: 4 × 2
  Linkage_Method Agglomerative_Coefficient
  <chr>                              <dbl>
1 ward                               0.941
2 complete                           0.899
3 average                            0.871
4 single                             0.788
cat("\n\nBest Method: ", agglom_results$Linkage_Method[1], 
    " (AC = ", round(agglom_results$Agglomerative_Coefficient[1], 4), ")\n", sep = "")


Best Method: ward (AC = 0.9414)
p <- ggplot(agglom_results, aes(x = reorder(Linkage_Method, Agglomerative_Coefficient), 
                                 y = Agglomerative_Coefficient, 
                                 fill = Linkage_Method)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = round(Agglomerative_Coefficient, 4)), 
            vjust = -0.5, size = 4) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Agglomerative Coefficients by Linkage Method",
       subtitle = "Mammal Milk Data",
       x = "Linkage Method",
       y = "Agglomerative Coefficient") +
  theme_minimal() +
  theme(plot.title = element_text(size = 14, face = "bold", hjust = 0.5),
        plot.subtitle = element_text(size = 11, hjust = 0.5),
        axis.title = element_text(size = 11),
        axis.text = element_text(size = 10)) +
  ylim(0, 1)

print(p)

B)

Plot a dendrogram of the method with the highest AC. Which mammals cluster together first?

par(mfrow = c(2, 2), mar = c(5, 4, 4, 2))
for (method in linkage_methods) {
  ag <- agnes(mammals, method = method)
  plot(ag, which.plots = 2, 
       main = paste0(str_to_title(method), " Linkage (AC = ", 
                     round(ag$ac, 4), ")"),
       xlab = "", sub = "")
}

par(mfrow = c(1, 1))

C)

If the tree is cut at a height of 4, how many clusters will form? Which cluster will have the fewest mammals, and which mammals will they be?

ag_ward <- agnes(mammals, method = "ward")
hc_ward <- as.hclust(ag_ward)

clusters_h4 <- cutree(hc_ward, h = 4)

n_clusters <- max(clusters_h4)



cluster_counts <- table(clusters_h4)
cat("Cluster sizes:\n")
Cluster sizes:
print(cluster_counts)
clusters_h4
 1  2  3  4 
11  6  6  2 
smallest_cluster <- which.min(cluster_counts)
cat("\n\nSmallest cluster: Cluster", smallest_cluster, "\n")


Smallest cluster: Cluster 4 
cat("Number of mammals:", cluster_counts[smallest_cluster], "\n\n")
Number of mammals: 2 
mammals_in_smallest <- names(clusters_h4[clusters_h4 == smallest_cluster])
cat("Mammals in smallest cluster:\n")
Mammals in smallest cluster:
cat(paste(mammals_in_smallest, collapse = "\n"), "\n")
Dolphin
Seal 
plot(hc_ward, main = "Ward Linkage Dendrogram with Cut at Height 4",
     xlab = "", sub = "")
abline(h = 4, col = "red", lty = 2, lwd = 2)
text(x = 2, y = 4.3, labels = "Cut at h=4", col = "red", cex = 0.9)

D)

Use WSS and average silhouette method to suggest the optimal number of clusters. Re-create the dendrogram with the cluster memberships indicated.

library(factoextra)

 wss_plot <- fviz_nbclust(mammals, 
                         FUN = hcut, 
                         method = "wss",
                         hc_method = "ward.D2",
                         k.max = 10) +
  labs(title = "Elbow Method (WSS)",
       subtitle = "Optimal number of clusters") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))
sil_plot <- fviz_nbclust(mammals, 
                         FUN = hcut, 
                         method = "silhouette",
                         hc_method = "ward.D2",
                         k.max = 10) +
  labs(title = "Silhouette Method",
       subtitle = "Optimal number of clusters") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))

library(patchwork)
combined_opt <- wss_plot | sil_plot
print(combined_opt)

 sil_values <- sapply(2:10, function(k) {
  clusters <- cutree(hc_ward, k = k)
  sil <- silhouette(clusters, dist(mammals))
  mean(sil[, 3])
})
optimal_k <- which.max(sil_values) + 1
cat("\nOptimal k by silhouette method:", optimal_k, "\n")

Optimal k by silhouette method: 3 
clusters_optimal <- cutree(hc_ward, k = optimal_k)
fviz_dend(hc_ward, k = optimal_k, 
          rect = TRUE, 
          rect_border = "steelblue",
          rect_fill = TRUE,
          cex = 0.6,
          main = paste0("Ward Linkage Dendrogram with ", optimal_k, " Clusters"),
          xlab = "",
          sub = "") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5))
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.

cluster_membership <- data.frame(
  Mammal = names(clusters_optimal),
  Cluster = clusters_optimal
) %>%
  arrange(Cluster, Mammal)
print(cluster_membership, row.names = FALSE)
     Mammal Cluster
      Bison       1
    Buffalo       1
      Camel       1
        Cat       1
     Donkey       1
   Elephant       1
        Fox       1
 Guinea pig       1
      Hippo       1
      Horse       1
      Llama       1
     Monkey       1
       Mule       1
  Orangutan       1
        Pig       1
      Sheep       1
      Zebra       1
       Deer       2
        Dog       2
     Rabbit       2
        Rat       2
   Reindeer       2
      Whale       2
    Dolphin       3
       Seal       3

E)

Use suitable visualizations, including dimension reduction techniques, to explore the different milk characteristics of the assigned clusters. Discuss.

mammals_clustered <- mammals %>%
  mutate(Cluster = factor(clusters_optimal))

  cluster_summary <- mammals_clustered %>%
  group_by(Cluster) %>%
  summarise(
    n = n(),
    across(where(is.numeric), list(mean = mean, sd = sd), .names = "{.col}_{.fn}"),
    .groups = "drop"
  )
print(cluster_summary)
# A tibble: 3 × 14
  Cluster     n Water_mean Water_sd Protein_mean Protein_sd Fat_mean Fat_sd
  <fct>   <int>      <dbl>    <dbl>        <dbl>      <dbl>    <dbl>  <dbl>
1 1          17      0.617   0.264        -0.541      0.702   -0.508  0.368
2 2           6     -0.693   0.374         1.17       0.319    0.548  0.469
3 3           2     -2.53    0.0827        1.08       0.174    2.68   0.477
# ℹ 6 more variables: Lactose_mean <dbl>, Lactose_sd <dbl>, Ash_mean <dbl>,
#   Ash_sd <dbl>, n_mean <dbl>, n_sd <dbl>
 pca_result <- prcomp(mammals, scale. = FALSE)  # Already scaled
pca_data <- as.data.frame(pca_result$x) %>%
  mutate(Cluster = factor(clusters_optimal),
         Mammal = rownames(mammals))

var_explained <- summary(pca_result)$importance[2, 1:2] * 100
pca_plot <- ggplot(pca_data, aes(x = PC1, y = PC2, color = Cluster, label = Mammal)) +
  geom_point(size = 3, alpha = 0.7) +
  geom_text(size = 2.5, hjust = 0, vjust = 0, nudge_x = 0.1, nudge_y = 0.1) +
  labs(title = "PCA Biplot of Mammal Milk Clusters",
       x = paste0("PC1 (", round(var_explained[1], 1), "% variance)"),
       y = paste0("PC2 (", round(var_explained[2], 1), "% variance)")) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        legend.position = "right")
  print(pca_plot)

 loadings_df <- as.data.frame(pca_result$rotation[, 1:2]) %>%
  rownames_to_column("Variable")

loading_plot <- ggplot(loadings_df, aes(x = PC1, y = PC2)) +
  geom_segment(aes(xend = 0, yend = 0), 
               arrow = arrow(length = unit(0.3, "cm")),
               color = "darkred", size = 1) +
  geom_text(aes(label = Variable), hjust = 0, vjust = 0, 
            nudge_x = 0.02, nudge_y = 0.02, size = 4, fontface = "bold") +
  labs(title = "PCA Loading Plot",
       subtitle = "variable contributions to principal components") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5))
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
print(loading_plot)

 library(tidyr)
mammals_long <- mammals_clustered %>%
  rownames_to_column("Mammal") %>%
  pivot_longer(cols = -c(Mammal, Cluster), 
               names_to = "Variable", 
               values_to = "Value")

boxplot_grid <- ggplot(mammals_long, aes(x = Cluster, y = Value, fill = Cluster)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~Variable, scales = "free_y", ncol = 2) +
  labs(title = "Milk Composition by Cluster",
       subtitle = "Standardized values (z-scores)") +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", hjust = 0.5),
        plot.subtitle = element_text(hjust = 0.5),
        legend.position = "bottom",
        strip.text = element_text(face = "bold"))
print(boxplot_grid)