Activity 4.1 - Hierarchical clustering

SUBMISSION INSTRUCTIONS

  1. Render to html
  2. Publish your html to RPubs
  3. Submit a link to your published solutions
#Loading packages needed for this assignment
library(tidyverse)
library(cluster)
library(factoextra)
library(patchwork)

Question 1

Consider three data sets below. Each data set contains three “clusters” in two dimensions. In the first, the clusters are three convex spheres. (A convex cluster is one where all points in the cluster can be connected with a straight line that does not leave the cluster.) In the second, one cluster is a sphere; one is a ring; and one is a half-moon. In the third, two clusters are spirals and one is a sphere. Our goal is to compare the performance of various hierarchical methods in clustering different cluster shapes.

three_spheres <- read.csv('Data/cluster_data1.csv')
ring_moon_sphere <- read.csv('Data/cluster_data2.csv')
two_spirals_sphere <- read.csv('Data/cluster_data3.csv')

Perform agglomerative clustering with single, complete, average, and Ward linkages. Cut each tree to produce three clusters. Produce 12 scatterplots, one per data set/linkage combination, showing the 3-cluster solution. Title each graph with the linkage used, as well as the average silhouette width (\(\bar s\)) for that clustering solution. Use patchwork to create a nice 4x3 grid of your plots.

three_spheres_single <- agnes(three_spheres, metric = 'euclidean', method = 'single')
hc_three_spheres_single <- as.hclust(three_spheres_single)
three_spheres_clusters_single <- cutree(hc_three_spheres_single, k = 3)
three_spheres_single_silhouette <- silhouette(three_spheres_clusters_single, dist(three_spheres))
avg_silhouette_three_spheres_single <- mean(three_spheres_single_silhouette[, 3])

p1 <- ggplot(three_spheres, aes(x, y, color = factor(three_spheres_clusters_single))) +
  geom_point(size = 2) +
  labs(
    title = paste0("Three Spheres — Single Linkage \n(avg sil = ",
                   round(avg_silhouette_three_spheres_single, 3), ")")
  ) +
  theme_minimal()
three_spheres_complete <- agnes(three_spheres, metric = 'euclidean', method = 'complete')
hc_three_spheres_complete <- as.hclust(three_spheres_complete)
three_spheres_clusters_complete <- cutree(hc_three_spheres_complete, k = 3)
three_spheres_complete_silhouette <- silhouette(three_spheres_clusters_complete, dist = dist(three_spheres))
avg_silhouette_three_spheres_complete <- mean(three_spheres_complete_silhouette[, 3])

p2 <- ggplot(three_spheres, aes(x, y, color = factor(three_spheres_clusters_complete))) +
  geom_point(size = 2) +
  labs(title = paste0("Three Spheres — Complete Linkage \n(avg sil = ",
                      round(avg_silhouette_three_spheres_complete, 3), ")")) +
  theme_minimal()
three_spheres_average <- agnes(three_spheres, metric = 'euclidean', method = 'average')
hc_three_spheres_average <- as.hclust(three_spheres_average)
three_spheres_clusters_average <- cutree(hc_three_spheres_average, k = 3)
three_spheres_average_silhouette <- silhouette(three_spheres_clusters_average, dist = dist(three_spheres))
avg_silhouette_three_spheres_average <- mean(three_spheres_average_silhouette[, 3])

p3 <- ggplot(three_spheres, aes(x, y, color = factor(three_spheres_clusters_average))) +
  geom_point(size = 2) +
  labs(title = paste0("Three Spheres — Average Linkage \n(avg sil = ",
                      round(avg_silhouette_three_spheres_average, 3), ")")) +
  theme_minimal()
three_spheres_ward <- agnes(three_spheres, metric = 'euclidean', method = 'ward')
hc_three_spheres_ward <- as.hclust(three_spheres_ward)
three_spheres_clusters_ward <- cutree(hc_three_spheres_ward, k = 3)
three_spheres_ward_silhouette <- silhouette(three_spheres_clusters_ward, dist = dist(three_spheres))
avg_silhouette_three_spheres_ward <- mean(three_spheres_ward_silhouette[, 3])

p4 <- ggplot(three_spheres, aes(x, y, color = factor(three_spheres_clusters_ward))) +
  geom_point(size = 2) +
  labs(title = paste0("Three Spheres — Ward Linkage \n(avg sil = ",
                      round(avg_silhouette_three_spheres_ward, 3), ")")) +
  theme_minimal()
ring_moon_sphere_single <- agnes(ring_moon_sphere, metric = 'euclidean', method = 'single')
hc_ring_moon_sphere_single <- as.hclust(ring_moon_sphere_single)
ring_moon_sphere_clusters_single <- cutree(hc_ring_moon_sphere_single, k = 3)
ring_moon_sphere_single_silhouette <- silhouette(ring_moon_sphere_clusters_single, dist = dist(ring_moon_sphere))
avg_silhouette_ring_moon_sphere_single <- mean(ring_moon_sphere_single_silhouette[, 3])

p5 <- ggplot(ring_moon_sphere, aes(x, y, color = factor(ring_moon_sphere_clusters_single))) +
  geom_point(size = 2) +
  labs(title = paste0("Ring+Moon+Sphere — Single Linkage \n(avg sil = ",
                      round(avg_silhouette_ring_moon_sphere_single, 3), ")")) +
  theme_minimal()
ring_moon_sphere_complete <- agnes(ring_moon_sphere, metric = 'euclidean', method = 'complete')
hc_ring_moon_sphere_complete <- as.hclust(ring_moon_sphere_complete)
ring_moon_sphere_clusters_complete <- cutree(hc_ring_moon_sphere_complete, k = 3)
ring_moon_sphere_complete_silhouette <- silhouette(ring_moon_sphere_clusters_complete, dist = dist(ring_moon_sphere))
avg_silhouette_ring_moon_sphere_complete <- mean(ring_moon_sphere_complete_silhouette[, 3])

p6 <- ggplot(ring_moon_sphere, aes(x, y, color = factor(ring_moon_sphere_clusters_complete))) +
  geom_point(size = 2) +
  labs(title = paste0("Ring+Moon+Sphere — Complete Linkage \n(avg sil = ",
                      round(avg_silhouette_ring_moon_sphere_complete, 3), ")")) +
  theme_minimal()
ring_moon_sphere_average <- agnes(ring_moon_sphere, metric = 'euclidean', method = 'average')
hc_ring_moon_sphere_average <- as.hclust(ring_moon_sphere_average)
ring_moon_sphere_clusters_average <- cutree(hc_ring_moon_sphere_average, k = 3)
ring_moon_sphere_average_silhouette <- silhouette(ring_moon_sphere_clusters_average, dist = dist(ring_moon_sphere))
avg_silhouette_ring_moon_sphere_average <- mean(ring_moon_sphere_average_silhouette[, 3])

p7 <- ggplot(ring_moon_sphere, aes(x, y, color = factor(ring_moon_sphere_clusters_average))) +
  geom_point(size = 2) +
  labs(title = paste0("Ring+Moon+Sphere — Average Linkage \n(avg sil = ",
                      round(avg_silhouette_ring_moon_sphere_average, 3), ")")) +
  theme_minimal()
ring_moon_sphere_ward <- agnes(ring_moon_sphere, metric = 'euclidean', method = 'ward')
hc_ring_moon_sphere_ward <- as.hclust(ring_moon_sphere_ward)
ring_moon_sphere_clusters_ward <- cutree(hc_ring_moon_sphere_ward, k = 3)
ring_moon_sphere_ward_silhouette <- silhouette(ring_moon_sphere_clusters_ward, dist = dist(ring_moon_sphere))
avg_silhouette_ring_moon_sphere_ward <- mean(ring_moon_sphere_ward_silhouette[, 3])

p8 <- ggplot(ring_moon_sphere, aes(x, y, color = factor(ring_moon_sphere_clusters_ward))) +
  geom_point(size = 2) +
  labs(title = paste0("Ring+Moon+Sphere — Ward Linkage \n(avg sil = ",
                      round(avg_silhouette_ring_moon_sphere_ward, 3), ")")) +
  theme_minimal()
two_spirals_sphere_single <- agnes(two_spirals_sphere, metric = 'euclidean', method = 'single')
hc_two_spirals_sphere_single <- as.hclust(two_spirals_sphere_single)
two_spirals_sphere_clusters_single <- cutree(hc_two_spirals_sphere_single, k = 3)
two_spirals_sphere_single_silhouette <- silhouette(two_spirals_sphere_clusters_single, dist = dist(two_spirals_sphere))
avg_silhouette_two_spirals_sphere_single <- mean(two_spirals_sphere_single_silhouette[, 3])

p9 <- ggplot(two_spirals_sphere, aes(x, y, color = factor(two_spirals_sphere_clusters_single))) +
  geom_point(size = 2) +
  labs(title = paste0("Two Spirals+Sphere — Single Linkage \n(avg sil = ",
                      round(avg_silhouette_two_spirals_sphere_single, 3), ")")) +
  theme_minimal()
two_spirals_sphere_complete <- agnes(two_spirals_sphere, metric = 'euclidean', method = 'complete')
hc_two_spirals_sphere_complete <- as.hclust(two_spirals_sphere_complete)
two_spirals_sphere_clusters_complete <- cutree(hc_two_spirals_sphere_complete, k = 3)
two_spirals_sphere_complete_silhouette <- silhouette(two_spirals_sphere_clusters_complete, dist = dist(two_spirals_sphere))
avg_silhouette_two_spirals_sphere_complete <- mean(two_spirals_sphere_complete_silhouette[, 3])

p10 <- ggplot(two_spirals_sphere, aes(x, y, color = factor(two_spirals_sphere_clusters_complete))) +
  geom_point(size = 2) +
  labs(title = paste0("Two Spirals+Sphere — Complete Linkage \n(avg sil = ",
                      round(avg_silhouette_two_spirals_sphere_complete, 3), ")")) +
  theme_minimal()
two_spirals_sphere_average <- agnes(two_spirals_sphere, metric = 'euclidean', method = 'average')
hc_two_spirals_sphere_average <- as.hclust(two_spirals_sphere_average)
two_spirals_sphere_clusters_average <- cutree(hc_two_spirals_sphere_average, k = 3)
two_spirals_sphere_average_silhouette <- silhouette(two_spirals_sphere_clusters_average, dist = dist(two_spirals_sphere))
avg_silhouette_two_spirals_sphere_average <- mean(two_spirals_sphere_average_silhouette[, 3])

p11 <- ggplot(two_spirals_sphere, aes(x, y, color = factor(two_spirals_sphere_clusters_average))) +
  geom_point(size = 2) +
  labs(title = paste0("Two Spirals+Sphere — Average Linkage \n(avg sil = ",
                      round(avg_silhouette_two_spirals_sphere_average, 3), ")")) +
  theme_minimal()
two_spirals_sphere_ward <- agnes(two_spirals_sphere, metric = 'euclidean', method = 'ward')
hc_two_spirals_sphere_ward <- as.hclust(two_spirals_sphere_ward)
two_spirals_sphere_clusters_ward <- cutree(hc_two_spirals_sphere_ward, k = 3)
two_spirals_sphere_ward_silhouette <- silhouette(two_spirals_sphere_clusters_ward, dist = dist(two_spirals_sphere))
avg_silhouette_two_spirals_sphere_ward <- mean(two_spirals_sphere_ward_silhouette[, 3])

p12 <- ggplot(two_spirals_sphere, aes(x, y, color = factor(two_spirals_sphere_clusters_ward))) +
  geom_point(size = 2) +
  labs(title = paste0("Two Spirals+Sphere — Ward Linkage \n(avg sil = ",
                      round(avg_silhouette_two_spirals_sphere_ward, 3), ")")) +
  theme_minimal()
p1 <- p1 + theme(plot.title = element_text(size = 5))
p2 <- p2 + theme(plot.title = element_text(size = 5))
p3 <- p3 + theme(plot.title = element_text(size = 5))
p4 <- p4 + theme(plot.title = element_text(size = 5))
p5 <- p5 + theme(plot.title = element_text(size = 5))
p6 <- p6 + theme(plot.title = element_text(size = 5))
p7 <- p7 + theme(plot.title = element_text(size = 5))
p8 <- p8 + theme(plot.title = element_text(size = 5))
p9 <- p9 + theme(plot.title = element_text(size = 5))
p10 <- p10 + theme(plot.title = element_text(size = 5))
p11 <- p11 + theme(plot.title = element_text(size = 5))
p12 <- p12 + theme(plot.title = element_text(size = 5))
combined_plot <- (p1 | p2 | p3 | p4) /
                 (p5 | p6 | p7 | p8) /
                 (p9 | p10 | p11 | p12) &
  theme(legend.position = "none")
combined_plot

Discuss the following:

  1. Which linkage works best for which scenario?

    For the three sphere the complete, average, and ward linkage all have the same average silhouette of .727.

    For the ring moon sphere the average linkage was the best with an average silhouette width of .575.

    For the two spirals sphere the average linkage was the best. The average silhouette width was .432.

  2. Does the average silhouette width always do a good job of measuring the quality of the clustering solution?

    No it doesn’t work the best for all the clustering solutions. The ones that it does the best in are the ones where the clusters are compact and convex. Those being the three spheres data set. It doesn’t do the best with spiral data sets because the euclidean distance is far apart.

(Hint: you have a lot of repetitive code to write. You may find it helpful to write a function that takes a data set and a linkage method as arguments, does the clustering and computes average silhouette width, and produces the desired plot.)

Question 2

Consider the data set below on milk content of 25 mammals. The variables have been pre-scaled to z-scores, hence no additional standardizing is necessary. (Data source: Everitt et al. Cluster analysis 4ed)

mammals <- read.csv('Data/mammal_milk.csv') %>% 
  column_to_rownames('Mammal')

A)

Perform agglomerative clustering with single, complete, average, and Ward linkages. Which has the best agglomerative coefficient?

cluster_mammals_single <- agnes(mammals, metric='euclidean', method='single')
cluster_mammals_complete <- agnes(mammals, metric='euclidean', method='complete')
cluster_mammals_average <- agnes(mammals, metric='euclidean', method='average')
cluster_mammals_ward <- agnes(mammals, metric='euclidean', method='ward')

cluster_mammals_single$ac
[1] 0.7875718
cluster_mammals_complete$ac
[1] 0.8985539
cluster_mammals_average$ac
[1] 0.8706571
cluster_mammals_ward$ac
[1] 0.9413994

The Ward linkage has the best agglomerative coefficient. The agglomerative coefficient is .94.

B)

Plot a dendrogram of the method with the highest AC. Which mammals cluster together first?

fviz_dend(cluster_mammals_ward, k = 24) + 
  labs(title='Ward linkage')
Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
ℹ Please use tidy evaluation idioms with `aes()`.
ℹ See also `vignette("ggplot2-in-packages")` for more information.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.

The two mammals that cluster together first are the deer and reindeer.

C)

If the tree is cut at a height of 4, how many clusters will form? Which cluster will have the fewest mammals, and which mammals will they be?

When looking at the dendrogram it can be seen that 4 clusters will form if the tree is cut at a height of 4. The cluster that has the fewest amount of mammals has 2 mammals. Those mammals being the dolphin and the seal.

D)

Use WSS and average silhouette method to suggest the optimal number of clusters. Re-create the dendrogram with the cluster memberships indicated.

fviz_nbclust(mammals,
             FUNcluster = hcut,
             k.max = 10,
             method='silhouette',
             hc_method='ward',
             hc_func = 'agnes')

fviz_nbclust(mammals,
             FUNcluster = hcut,
             k.max = 10,
             method='wss',
             hc_method='ward',
             hc_func = 'agnes')

fviz_dend(cluster_mammals_ward, k = 3) + 
  labs(title = 'Ward linkage')

When looking at the two plots the average silhouette width plot shows that the optimal number of clusters should be 3. When looking at the wss plot there is a good argument that 3 clusters should be used.

E)

Use suitable visualizations, including dimension reduction techniques, to explore the different milk characteristics of the assigned clusters. Discuss.

dist_mammals <- dist(mammals)
hc <- hclust(dist_mammals, method = 'ward.D2')
clusters_3 <- cutree(hc, k = 3)
mammals_clust <- mammals %>%
  mutate(Cluster = factor(clusters_3),
         Mammal = rownames(mammals))
pca_mammals <- prcomp(mammals, scale = FALSE)

plot_pca_mammals <- fviz_pca_ind(
  pca_mammals,
  geom = "point",
  habillage = mammals_clust$Cluster,
  title = "PCA of Mammal Milk Characteristics"
)
plot_pca_mammals

mammals_long <- mammals_clust %>%
  pivot_longer(cols = -c(Cluster, Mammal),
               names_to = "Variable",
               values_to = "Value")

boxplot_mammals <- ggplot(mammals_long, aes(x = Cluster, y = Value, fill = Cluster)) +
  geom_boxplot() +
  facet_wrap(~ Variable, scales = "free", ncol = 3) +
  theme_bw() +
  labs(title = "Milk Composition Differences Across Clusters")
boxplot_mammals

When looking at the clusters on the plot it can be seen that cluster 3 is very isolated in the bottom right corner of the plot. This shows that this cluster is very different than the other two clusters.

Cluster 1 is low on ash, fat, and protein. It is high in lactose and water.

Cluster 2 is high in ash and protein. It is moderate in fat, lactose and water.

Cluster 3 is low in ash, lactose, and water. It is high in fat and protein.