Activity 4.1 - Hierarchical clustering

SUBMISSION INSTRUCTIONS

Render to html
Publish your html to RPubs
Submit a link to your published solutions

#Loading packages needed for this assignment
library(tidyverse)
library(cluster)
library(factoextra)
library(patchwork)

Question 1

Consider three data sets below. Each data set contains three “clusters” in two dimensions. In the first, the clusters are three convex spheres. (A convex cluster is one where all points in the cluster can be connected with a straight line that does not leave the cluster.) In the second, one cluster is a sphere; one is a ring; and one is a half-moon. In the third, two clusters are spirals and one is a sphere. Our goal is to compare the performance of various hierarchical methods in clustering different cluster shapes.

three_spheres <- read.csv('Data/cluster_data1.csv')
ring_moon_sphere <- read.csv('Data/cluster_data2.csv')
two_spirals_sphere <- read.csv('Data/cluster_data3.csv')

Perform agglomerative clustering with single, complete, average, and Ward linkages. Cut each tree to produce three clusters. Produce 12 scatterplots, one per data set/linkage combination, showing the 3-cluster solution. Title each graph with the linkage used, as well as the average silhouette width (\(\bar s\)) for that clustering solution. Use patchwork to create a nice 4x3 grid of your plots.

library(cluster)
library(ggplot2)
library(patchwork)

## plot to look at first dataset
head(three_spheres)

          x        y
1 0.8131339 4.084861
2 1.0518924 4.410097
3 0.8745572 3.866620
4 1.1435424 3.742791
5 1.2999460 4.345134
6 1.1079499 3.820448

## plot to look at first dataset
ggplot(three_spheres, aes(x = x, y = y)) +
  geom_point(alpha = 0.6) +
  theme_minimal() +
  labs(
    title = "Raw Data: Three Spheres",
    x = "X coordinate",
    y = "Y coordinate"
  )

## create euclidean distance matrices
dM3Sph <- dist(three_spheres, method='euclidean')
dMMoon <- dist(ring_moon_sphere, method='euclidean')
dMTsP <- dist(two_spirals_sphere, method='euclidean')


## run agglomeration clustering with the 4 different linkages

# Three Sphere
(ThrSph_single <- agnes(dM3Sph, method='single'))$ac

[1] 0.8953964

(ThrSph_complete <- agnes(dM3Sph, method='complete'))$ac

[1] 0.9792809

(ThrSph_average <- agnes(dM3Sph, method='average'))$ac

[1] 0.9667276

(ThrSph_ward <- agnes(dM3Sph, method='ward'))$ac

[1] 0.9967758

# Ring Moon
(Moon_single <- agnes(dMMoon, method='single'))$ac

[1] 0.9304074

(Moon_complete <- agnes(dMMoon, method='complete'))$ac

[1] 0.9911578

(Moon_average <- agnes(dMMoon, method='average'))$ac

[1] 0.9852207

(Moon_ward <- agnes(dMMoon, method='ward'))$ac

[1] 0.9991226

# Two Spirals
(TsP_single <- agnes(dMTsP, method='single'))$ac

[1] 0.8646828

(TsP_complete <- agnes(dMTsP, method='complete'))$ac

[1] 0.9912231

(TsP_average <- agnes(dMTsP, method='average'))$ac

[1] 0.9810023

(TsP_ward <- agnes(dMTsP, method='ward'))$ac

[1] 0.998266

## cut each linkage into 3 clusters

# Three Sphere
ThrSph_single_clust   <- cutree(as.hclust(ThrSph_single),   k = 3)
ThrSph_complete_clust <- cutree(as.hclust(ThrSph_complete), k = 3)
ThrSph_average_clust  <- cutree(as.hclust(ThrSph_average),  k = 3)
ThrSph_ward_clust     <- cutree(as.hclust(ThrSph_ward),     k = 3)
# Ring Moon
Moon_single_clust   <- cutree(as.hclust(Moon_single),   k = 3)
Moon_complete_clust <- cutree(as.hclust(Moon_complete), k = 3)
Moon_average_clust  <- cutree(as.hclust(Moon_average),  k = 3)
Moon_ward_clust     <- cutree(as.hclust(Moon_ward),     k = 3)
# Two Spirals
TsP_single_clust   <- cutree(as.hclust(TsP_single),   k = 3)
TsP_complete_clust <- cutree(as.hclust(TsP_complete), k = 3)
TsP_average_clust  <- cutree(as.hclust(TsP_average),  k = 3)
TsP_ward_clust     <- cutree(as.hclust(TsP_ward),     k = 3)

## create labels for each linkage

# Three Sphere
three_spheres$single   <- factor(ThrSph_single_clust)
three_spheres$complete <- factor(ThrSph_complete_clust)
three_spheres$average  <- factor(ThrSph_average_clust)
three_spheres$ward     <- factor(ThrSph_ward_clust)
# Ring Moon
ring_moon_sphere$single   <- factor(Moon_single_clust)
ring_moon_sphere$complete <- factor(Moon_complete_clust)
ring_moon_sphere$average  <- factor(Moon_average_clust)
ring_moon_sphere$ward     <- factor(Moon_ward_clust)
# Two Spiral
two_spirals_sphere$single   <- factor(TsP_single_clust)
two_spirals_sphere$complete <- factor(TsP_complete_clust)
two_spirals_sphere$average  <- factor(TsP_average_clust)
two_spirals_sphere$ward     <- factor(TsP_ward_clust)

## compute average silhouette lengths

# Three Sphere
ThrSph_sil_single <- mean(silhouette(ThrSph_single_clust, dM3Sph)[, 3])
ThrSph_sil_complete <- mean(silhouette(ThrSph_complete_clust, dM3Sph)[, 3])
ThrSph_sil_average <- mean(silhouette(ThrSph_average_clust, dM3Sph)[, 3])
ThrSph_sil_ward <- mean(silhouette(ThrSph_ward_clust, dM3Sph)[, 3])
# Ring Moon
Moon_sil_single <- mean(silhouette(Moon_single_clust, dMMoon)[, 3])
Moon_sil_complete <- mean(silhouette(Moon_complete_clust, dMMoon)[, 3])
Moon_sil_average <- mean(silhouette(Moon_average_clust, dMMoon)[, 3])
Moon_sil_ward <- mean(silhouette(Moon_ward_clust, dMMoon)[, 3])
# Two SPiral
TsP_sil_single <- mean(silhouette(TsP_single_clust, dMTsP)[, 3])
TsP_sil_complete <- mean(silhouette(TsP_complete_clust, dMTsP)[, 3])
TsP_sil_average <- mean(silhouette(TsP_average_clust, dMTsP)[, 3])
TsP_sil_ward <- mean(silhouette(TsP_ward_clust, dMTsP)[, 3])

## make 4 plots for each dataset

# Three Sphere
ThrSph_p_single <- ggplot(three_spheres, aes(x, y, color = single)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ", round(ThrSph_sil_single, 3))) +
  theme(legend.position = "none")

ThrSph_p_complete <- ggplot(three_spheres, aes(x, y, color = complete)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ", round(ThrSph_sil_complete, 3))) +
  theme(legend.position = "none")


ThrSph_p_average <- ggplot(three_spheres, aes(x, y, color = average)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
       round(ThrSph_sil_average, 3))) +
      (theme(legend.position = "none"))

ThrSph_p_ward <- ggplot(three_spheres, aes(x, y, color = ward)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
       round(ThrSph_sil_ward, 3))) + 
      (theme(legend.position = "none"))

# Ring Moon
Moon_p_single <- ggplot(ring_moon_sphere, aes(x, y, color = single)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
      round(Moon_sil_single, 3))) +
      (theme(legend.position = "none"))

Moon_p_complete <- ggplot(ring_moon_sphere, aes(x, y, color = complete)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
      round(Moon_sil_complete, 3))) +
      (theme(legend.position = "none"))

Moon_p_average <- ggplot(ring_moon_sphere, aes(x, y, color = average)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
      round(Moon_sil_average, 3))) +
      (theme(legend.position = "none"))

Moon_p_ward <- ggplot(ring_moon_sphere, aes(x, y, color = ward)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
      round(Moon_sil_ward, 3))) +
      (theme(legend.position = "none"))

# Two Spiral
TsP_p_single <- ggplot(two_spirals_sphere, aes(x, y, color = single)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
      round(TsP_sil_single, 3))) + 
      (theme(legend.position = "none"))

TsP_p_complete <- ggplot(two_spirals_sphere, aes(x, y, color = complete)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
      round(TsP_sil_complete, 3))) + 
      (theme(legend.position = "none"))

TsP_p_average <- ggplot(two_spirals_sphere, aes(x, y, color = average)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",  
      round(TsP_sil_average, 3))) +
      (theme(legend.position = "none"))

TsP_p_ward <- ggplot(two_spirals_sphere, aes(x, y, color = ward)) +
  geom_point() +
  theme_minimal() +
  labs(title = paste0("Avg Sil = ",
      round(TsP_sil_ward, 3))) + 
      (theme(legend.position = "none"))

## create 4 x 3 grid with rows representing a data set and columns representing linkage type

# row labels
row_single   <- ggplot() + 
  annotate("text", x = 0, y = 0, label = "Single", size = 4) + 
  theme_void()
row_complete <- ggplot() + 
  annotate("text", x = 0, y = 0, label = "Complete", size = 4) + 
  theme_void()
row_average  <- ggplot() + 
  annotate("text", x = 0, y = 0, label = "Average", size = 4) + 
  theme_void()
row_ward     <- ggplot() + 
  annotate("text", x = 0, y = 0, label = "Ward", size = 4) + 
  theme_void()
# Column labels
col_thrSph <- ggplot() + 
  annotate("text", x=0, y=0, label="Three Spheres", size=4) + 
  theme_void()
col_moon <- ggplot() + 
  annotate("text", x=0, y=0, label="Ring-Moon", size=4) + 
  theme_void()
col_ts <- ggplot() + 
  annotate("text", x=0, y=0, label="Two Spirals", size=4) + 
  theme_void()
# create grid
labeled_grid_4x3 <- 
  (wrap_elements() | col_thrSph | col_moon | col_ts) /
  (row_single   | ThrSph_p_single   | Moon_p_single   | TsP_p_single) /
  (row_complete | ThrSph_p_complete | Moon_p_complete | TsP_p_complete) /
  (row_average  | ThrSph_p_average  | Moon_p_average  | TsP_p_average) /
  (row_ward     | ThrSph_p_ward     | Moon_p_ward     | TsP_p_ward)

labeled_grid_4x3

Discuss the following:

Which linkage works best for which scenario?

For the Three Spheres scenario it could be said that the complete, average and ward linkages are all equally the best. This is because they all have the same average silhouette length. The Average and Ward linkages have the exact same clusters and complete has 1 different point that must be perfectly interchangeable between the 2 clusters. The clusters created seem to accurately describe the naturally occurring clusters in the data.

For the Ring-Moon scenario based on the average silhouette length the average linkage creates the best clusters. However, looking at the clusters on a plot, the single linkage seems to align with the naturally occurring clusters created by the data set. In this case, I think the “best” linkage depends on the context of what it is being used for. If our goal is to create clusters that are as mathematically perfect as possible than silhouette length is all that matters and average would be the best linkage. But if we are looking for clusters that find relationships between points that has less to do with being close to the center and more with close to each other than single would be better. Considering how dramatic the shapes created in the data set I would say that the single linkage is the best.

For the Two Spirals scenario the answer is basically the same as Ring-Moon where the single linkage follows the natural spirals made in the data set and the average linkage has a better average silhouette length this time by a larger margin (over 2x). Again I think it depends but still think the single linkage shows the natural relationships created in the data set.

Does the average silhouette width always do a good job of measuring the quality of the clustering solution?

I feel like I already accidentally answered this in Q1, but no the average silhouette length does not always do a good job measuring the quality of a clustering solution. All it does is tell you how far on average each point is from its respective center, this is only useful when trying to have mathematically perfect and tight clusters that are ideally circular shapes to reduce silhouette length. When dealing with abstract shapes that have bends, curves and holes in them silhouette length becomes somewhat negligible since it doesn’t really capture the quality of the clusters.

(Hint: you have a lot of repetitive code to write. You may find it helpful to write a function that takes a data set and a linkage method as arguments, does the clustering and computes average silhouette width, and produces the desired plot.)

Question 2

Consider the data set below on milk content of 25 mammals. The variables have been pre-scaled to z-scores, hence no additional standardizing is necessary. (Data source: Everitt et al. Cluster analysis 4ed)

A)

Perform agglomerative clustering with single, complete, average, and Ward linkages. Which has the best agglomerative coefficient?

mammals <- read.csv('Data/mammal_milk.csv') %>% 
  column_to_rownames('Mammal')

head(mammals)

         Water Protein    Fat Lactose    Ash
Bison    0.681  -0.387 -0.818   0.856  0.073
Buffalo  0.307  -0.085 -0.229   0.310 -0.165
Camel    0.743  -0.742 -0.657   0.365 -0.303
Cat      0.268   1.064 -0.381   0.146 -0.224
Deer    -0.955   1.147  0.893  -0.836  1.063
Dog     -0.145   0.845 -0.077  -0.618  0.667

## create a distance matrix
DMmammals <- dist(mammals, method='euclidean')

## perform agglomerative clustering with each of the 4 linkages
(DMmammals_single <- agnes(mammals, method='single'))$ac

[1] 0.7875718

(DMmammals_complete <- agnes(mammals, method='complete'))$ac

[1] 0.8985539

(DMmammals_average <- agnes(mammals, method='average'))$ac

[1] 0.8706571

(DMmammals_ward <- agnes(mammals, method='ward'))$ac

[1] 0.9413994

The ward linkage has the best agglomerative coefficient at 0.941

B)

Plot a dendrogram of the method with the highest AC. Which mammals cluster together first?

p_ward <- fviz_dend(DMmammals_ward) + ggtitle('Ward linkage')

p_ward

Bison and llama, Camel and Zebra, Donkey and Mule, Monkey and Orangutan, Fox and Sheep, Deer and Reindeer are all at about 0 height. Next Guinea Pig and Pig, Dog and Rat, Dolphin and Seal are all at about 0.8 - 1.0.

C)

If the tree is cut at a height of 4, how many clusters will form? Which cluster will have the fewest mammals, and which mammals will they be?

If the tree was cut at height 4 there would be clusters. I am counting them from left to right as c1, c2, c3, c4. Cluster c4 will have the fewest mammals with only 2, dolphin and seal.

D)

Use WSS and average silhouette method to suggest the optimal number of clusters. Re-create the dendrogram with the cluster memberships indicated.

#| attr-source: "style='font-size: 0.5em'"
fviz_nbclust(mammals, 
             FUNcluster = hcut,
             method='wss',
             diss = DMmammals,
             hc_method='ward',
             hc_func = 'agnes') + 
  labs(title = 'Plot of WSS vs k')

#| attr-source: "style='font-size: 0.5em'"
fviz_nbclust(mammals, 
             FUNcluster = hcut,
             method='silhouette',
             diss = DMmammals,
             hc_method='ward',
             hc_func = 'agnes') + 
  labs(title = 'Plot of mean silhouette width vs k')

Based on the WSS plot suggests 3-4 with 3 probably being the better option. The Silhouette method suggests that 3 is the best with 2 being the second best. With this I will choose 3 as the best option since they both can agree on 3 as a viable pick.

p_ward_colored <- fviz_dend(DMmammals_ward, 
                    k = 3,            # cut tree into 3 clusters
                    color_labels = TRUE,
                    k_colors = c("red", "blue", "forestgreen")) + 
                    ggtitle('Ward Linkage with Colored Clusters') 

p_ward_colored

E)

Use suitable visualizations, including dimension reduction techniques, to explore the different milk characteristics of the assigned clusters. Discuss.

library(dplyr)

ward_tree    <- as.hclust(DMmammals_ward)
ward_cluster <- cutree(ward_tree, k = 3)      

## perform PCA on variables
milk_pca <- prcomp(mammals, scale. = TRUE)  

## % variance explained for axis label
pca_var <- summary(milk_pca)$importance[2, 1:2] * 100   # PC1, PC2 %


# scores 
scores <- as.data.frame(milk_pca$x[, 1:2])
colnames(scores) <- c("PC1", "PC2")
scores$Mammal  <- rownames(mammals)
scores$cluster <- factor(ward_cluster)

# loadings
loadings <- as.data.frame(milk_pca$rotation[, 1:2])
colnames(loadings) <- c("PC1", "PC2")
loadings$var <- rownames(loadings)

# scale arrows
arrow_mult <- 2.5


library(grid) 

ggplot(scores, aes(PC1, PC2, color = cluster)) +
  geom_point(size = 3) +
  geom_segment(data = loadings,
               aes(x = 0, y = 0,
                   xend = PC1 * arrow_mult,
                   yend = PC2 * arrow_mult),
               arrow = arrow(length = unit(0.2, "cm")),
               color = "black") +
  geom_text(data = loadings,
            aes(x = PC1 * arrow_mult * 1.1,
                y = PC2 * arrow_mult * 1.1,
                label = var),
            color = "black", size = 4) +
  scale_color_manual(values = c("red", "blue", "forestgreen"),
                     name = "Ward cluster") +
  theme_minimal() +
  labs(
    title = "PCA biplot",
    x = paste0("PC1 (", round(pca_var[1], 1), "%)"),
    y = paste0("PC2 (", round(pca_var[2], 1), "%)")
  )

As shown in the graph Cluster 3 have very high levels of fat and very low levels of water in their milk. This makes sense for the dolphins and seals to have very different fat levels than the rest since they survive in colder waters. Cluster 2 has somewhat high levels of protein and very high levels Ash while lacking in Lactose and water. Cluster 1 have very high levels of water and lactose and lack heavily in protein and fat. Cluster 1 is also the cluster that has has the most variability compared to the other 2 having quite a few points lying around the center of the PCA plot.