Activity 4.1 - Hierarchical clustering

SUBMISSION INSTRUCTIONS

  1. Render to html
  2. Publish your html to RPubs
  3. Submit a link to your published solutions
#Loading packages needed for this assignment
library(tidyverse)
library(cluster)
library(factoextra)
library(patchwork)

Question 1

Consider three data sets below. Each data set contains three “clusters” in two dimensions. In the first, the clusters are three convex spheres. (A convex cluster is one where all points in the cluster can be connected with a straight line that does not leave the cluster.) In the second, one cluster is a sphere; one is a ring; and one is a half-moon. In the third, two clusters are spirals and one is a sphere. Our goal is to compare the performance of various hierarchical methods in clustering different cluster shapes.

three_spheres <- read.csv('Data/cluster_data1.csv')
three_scaled <- scale(three_spheres)
ring_moon_sphere <- read.csv('Data/cluster_data2.csv')
ring_scaled <- scale(ring_moon_sphere)
two_spirals_sphere <- read.csv('Data/cluster_data3.csv')
two_spirals_scaled <- scale(two_spirals_sphere)

Perform agglomerative clustering with single, complete, average, and Ward linkages. Cut each tree to produce three clusters. Produce 12 scatterplots, one per data set/linkage combination, showing the 3-cluster solution. Title each graph with the linkage used, as well as the average silhouette width (\(\bar s\)) for that clustering solution. Use patchwork to create a nice 4x3 grid of your plots.

library(factoextra)
library(cluster)
library(dplyr)
library(patchwork)
cluster_three_single <- agnes(three_scaled, metric='euclidean', method='single')
cluster_three_complete <- agnes(three_scaled, metric='euclidean', method='complete')
cluster_three_average <- agnes(three_scaled, metric='euclidean', method='average')
cluster_three_ward <- agnes(three_scaled, metric='euclidean', method='ward')

single_clusters <- cutree(cluster_three_single, k = 3)
complete_clusters <- cutree(cluster_three_complete, k = 3)
average_clusters <- cutree(cluster_three_average, k = 3)
ward_clusters <- cutree(cluster_three_ward, k = 3)

three_spheres_clusters <- (bind_cols(three_scaled, 
          sing_clust=factor(single_clusters),
          comp_clust=factor(complete_clusters),
          avg_clust=factor(average_clusters),
          ward_clust=factor(ward_clusters)
) %>% as_tibble())
p1 <- ggplot(data = three_spheres_clusters, aes(x = x, y = y, col = sing_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2) +
  labs(title='3 Spheres - Single')

p2 <- ggplot(data = three_spheres_clusters, aes(x = x, y = y, col = comp_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='3 spheres - complete')

p3 <- ggplot(data = three_spheres_clusters, aes(x = x, y = y, col = avg_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='3 spheres - average')
p4 <- ggplot(data = three_spheres_clusters, aes(x = x, y = y, col = ward_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='3 spheres - ward')
cluster_ring_single <- agnes(ring_scaled, metric='euclidean', method='single')
cluster_ring_complete <- agnes(ring_scaled, metric='euclidean', method='complete')
cluster_ring_average <- agnes(ring_scaled, metric='euclidean', method='average')
cluster_ring_ward <- agnes(ring_scaled, metric='euclidean', method='ward')

single_clusters <- cutree(cluster_ring_single, k = 3)
complete_clusters <- cutree(cluster_ring_complete, k = 3)
average_clusters <- cutree(cluster_ring_average, k = 3)
ward_clusters <- cutree(cluster_ring_ward, k = 3)

ring_moon_clusters <- (bind_cols(ring_scaled, 
          sing_clust=factor(single_clusters),
          comp_clust=factor(complete_clusters),
          avg_clust=factor(average_clusters),
          ward_clust=factor(ward_clusters)
) %>% as_tibble())

p5 <- ggplot(data = ring_moon_clusters, aes(x = x, y = y, col = sing_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2) +
  labs(title='Ring Moon - Single')

p6 <- ggplot(data = ring_moon_clusters, aes(x = x, y = y, col = comp_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='Ring Moon - Complete')

p7 <- ggplot(data = ring_moon_clusters, aes(x = x, y = y, col = avg_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='Ring Moon - Average')

p8 <- ggplot(data = ring_moon_clusters, aes(x = x, y = y, col = ward_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='Ring Moon - Ward')
cluster_two_spirals_single <- agnes(two_spirals_scaled, metric='euclidean', method='single')
cluster_two_spirals_complete <- agnes(two_spirals_scaled, metric='euclidean', method='complete')
cluster_two_spirals_average <- agnes(two_spirals_scaled, metric='euclidean', method='average')
cluster_two_spirals_ward <- agnes(two_spirals_scaled, metric='euclidean', method='ward')

single_clusters <- cutree(cluster_two_spirals_single, k = 3)
complete_clusters <- cutree(cluster_two_spirals_complete, k = 3)
average_clusters <- cutree(cluster_two_spirals_average, k = 3)
ward_clusters <- cutree(cluster_two_spirals_ward, k = 3)

two_spirals_clusters <- (bind_cols(two_spirals_scaled, 
          sing_clust=factor(single_clusters),
          comp_clust=factor(complete_clusters),
          avg_clust=factor(average_clusters),
          ward_clust=factor(ward_clusters)
) %>% as_tibble())

p9 <- ggplot(data = two_spirals_clusters, aes(x = x, y = y, col = sing_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2) +
  labs(title='Two Spirals - Single')

p10 <- ggplot(data = two_spirals_clusters, aes(x = x, y = y, col = comp_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='Two Spirals - Complete')

p11 <- ggplot(data = two_spirals_clusters, aes(x = x, y = y, col = avg_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='Two Spirals - Average')

p12 <- ggplot(data = two_spirals_clusters, aes(x = x, y = y, col = ward_clust)) +
  geom_point() + 
  guides(color='none') + 
  theme_classic(base_size = 2)+
  labs(title='Two Spirals - Ward')
print(
  (p1 + p2 + p3 + p4 + 
   p5 + p6 + p7 + p8 +
   p9 + p10 + p11 + p12) +
    plot_layout(nrow = 3, ncol = 4)
)

Discuss the following:

  1. Which linkage works best for which scenario? For the first df, they all do a pretty good job but I think single and ward linkage does the best
  • The second df none of them are perfect, but I think ward does the best job

  • The third df single linkage does the best at highlighting the spiral shape pretty easily

  1. Does the average silhouette width always do a good job of measuring the quality of the clustering solution?

(Hint: you have a lot of repetitive code to write. You may find it helpful to write a function that takes a data set and a linkage method as arguments, does the clustering and computes average silhouette width, and produces the desired plot.)

Question 2

Consider the data set below on milk content of 25 mammals. The variables have been pre-scaled to z-scores, hence no additional standardizing is necessary. (Data source: Everitt et al. Cluster analysis 4ed)

library(tibble)
mammals <- read.csv('Data/mammal_milk.csv') %>% 
  column_to_rownames('Mammal')
head(mammals)
         Water Protein    Fat Lactose    Ash
Bison    0.681  -0.387 -0.818   0.856  0.073
Buffalo  0.307  -0.085 -0.229   0.310 -0.165
Camel    0.743  -0.742 -0.657   0.365 -0.303
Cat      0.268   1.064 -0.381   0.146 -0.224
Deer    -0.955   1.147  0.893  -0.836  1.063
Dog     -0.145   0.845 -0.077  -0.618  0.667
mammal_scaled <- scale(mammals)

A)

Perform agglomerative clustering with single, complete, average, and Ward linkages. Which has the best agglomerative coefficient? ward, with a coefficient of 0.9414

cluster_mammal_single <- agnes(mammal_scaled, metric='euclidean', method='single')
cluster_mammal_complete <- agnes(mammal_scaled, metric='euclidean', method='complete')
cluster_mammal_average <- agnes(mammal_scaled, metric='euclidean', method='average')
cluster_mammal_ward <- agnes(mammal_scaled, metric='euclidean', method='ward')
print(cluster_mammal_single$ac)
[1] 0.7876335
print(cluster_mammal_complete$ac)
[1] 0.8985795
print(cluster_mammal_average$ac)
[1] 0.8706989
print(cluster_mammal_ward$ac)
[1] 0.941414

B)

Plot a dendrogram of the method with the highest AC. Which mammals cluster together first?

Deer and Reindeer (those two look to have a shorter stem than any other at the bottom I think?)

fviz_dend(cluster_mammal_ward) + 
  labs(title='ward linkage')
Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
ℹ The deprecated feature was likely used in the factoextra package.
  Please report the issue at <https://github.com/kassambara/factoextra/issues>.

C)

If the tree is cut at a height of 4, how many clusters will form? Which cluster will have the fewest mammals, and which mammals will they be? 4 clusters, with the smallest being only dolphin and seal

D)

Use WSS and average silhouette method to suggest the optimal number of clusters. Re-create the dendrogram with the cluster memberships indicated. WSS suggests either 3 or 4, silhouette suggests 3, so will choose 3

fviz_nbclust(mammal_scaled,
             FUNcluster = hcut,
             k.max = 10,
             method='wss',
             hc_method='ward',
             hc_func = 'agnes')

fviz_nbclust(mammal_scaled,
             FUNcluster = hcut,
             k.max = 10,
             method='silhouette',
             hc_method='ward',
             hc_func = 'agnes')

fviz_dend(cluster_mammal_ward,k=3) + 
  labs(title='ward linkage')

E)

Use suitable visualizations, including dimension reduction techniques, to explore the different milk characteristics of the assigned clusters. Discuss. The animals in the red would expect to produce more lactose, which does seem to be fitting (however there are some entries that are a bit out of place). The blue cluster being on the opposite end makes sense, as it is only aquatic animals.

(ward_clusters <- cutree(cluster_mammal_ward, k = 3) 
    %>% data.frame()
    %>% setNames('k3')
) %>% head
  k3
1  1
2  1
3  1
4  1
5  2
6  2
mammal_pca <- prcomp(mammal_scaled, center=TRUE, scale. = TRUE)

k3_biplot <- fviz_pca(mammal_pca, 
         habillage = factor(ward_clusters$k3),
         repel = TRUE) + 
        ggtitle('3-cluster solution') + 
        guides(color='none',shape='none')
k3_biplot