# Load (and install if missing) the required packages
pacman::p_load(tidyverse, factoextra, cluster, dendextend, gridExtra, circlize, broom)

theme_set(theme_bw(base_size = 12))

Introduction

This lesson demonstrates hierarchical clustering and k-means clustering using the built-in iris dataset in R. Cluster analysis is an unsupervised learning technique for grouping similar observations together.

Whereas PCA groups variables, cluster analysis groups observations. The goal is to identify natural groupings in the data without prior knowledge of the categories.

We start with hierarchical clustering (which does not require specifying the number of clusters in advance) to visualize the data structure, then use k-means clustering (which does require a fixed number of clusters) to partition the data. We often get the best results by combining the two approaches.

We also need to decide how many clusters are optimal: the clusters should be chosen so that observations in the same cluster take similar values on the segmentation variables.

Data Preparation

Load and Examine Data

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

Data Summary

summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 

Scale the Data

# Standardize the four measurements to mean 0 and standard deviation 1,
# and label each row with a species abbreviation plus its row number
iris_scaled = scale(iris[,1:4])
rownames(iris_scaled) = paste(substr(iris$Species, 1, 2), 1:nrow(iris), sep="_")

head(iris_scaled)
##      Sepal.Length Sepal.Width Petal.Length Petal.Width
## se_1   -0.8976739  1.01560199    -1.335752   -1.311052
## se_2   -1.1392005 -0.13153881    -1.335752   -1.311052
## se_3   -1.3807271  0.32731751    -1.392399   -1.311052
## se_4   -1.5014904  0.09788935    -1.279104   -1.311052
## se_5   -1.0184372  1.24503015    -1.335752   -1.311052
## se_6   -0.5353840  1.93331463    -1.165809   -1.048667
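
As a quick sanity check (not part of the original output), we can confirm that the scaled variables now have mean 0 and standard deviation 1:

# Verify the standardization: column means ~ 0, standard deviations = 1
round(colMeans(iris_scaled), 10)
apply(iris_scaled, 2, sd)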

Hierarchical Clustering

Distance Matrix Calculation

dist_matrix = dist(iris_scaled, method = "euclidean")
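
To see what the distance matrix holds, we can peek at a small corner of it; each entry is the Euclidean distance between two observations in the scaled four-variable space (a quick look, not shown in the original output):

# Pairwise distances among the first four observations
round(as.matrix(dist_matrix)[1:4, 1:4], 3)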

Agglomeration Schedule and Linkage Methods

# Agglomerative schedule: how are clusters formed?
agnes_fit = agnes(iris_scaled, method = 'ward')
agg_schedule = data.frame(
  Step = seq_along(agnes_fit$height),
  Cluster_1 = agnes_fit$merge[, 1],
  Cluster_2 = agnes_fit$merge[, 2],
  Distance = agnes_fit$height
)
agg_schedule
##     Step Cluster_1 Cluster_2   Distance
## 1      1      -102      -143  0.1311927
## 2      2        -8       -40  0.1716415
## 3      3       -11       -49  0.2427378
## 4      4       -10       -35  0.4473402
## 5      5        -1       -18  0.1207633
## 6      6      -129      -133  0.2191573
## 7      7      -128      -139  0.8297548
## 8      8        -3       -48  0.2960431
## 9      9       -81       -82  0.4641953
## 10    10       -20       -47  0.9254562
## 11    11        -2       -26  0.2858005
## 12    12      -121      -144  0.3886956
## 13    13       -12       -25  2.0066086
## 14    14         5       -41  0.1783123
## 15    15         4       -31  0.5589245
## 16    16         8       -30  1.1877552
## 17    17        -5       -38  0.1207633
## 18    18       -89       -96  0.6380471
## 19    19      -137      -149  0.1429002
## 20    20       -64       -79  0.3438179
## 21    21       -14       -39  0.3224602
## 22    22       -66       -87  4.0477797
## 23    23         2       -29  0.2265906
## 24    24       -56      -100  0.5340894
## 25    25        -6       -17  0.6882410
## 26    26       -83       -93  1.4538136
## 27    27       -67       -85  0.9062807
## 28    28       -75       -98  0.4520086
## 29    29        14       -28  6.5857494
## 30    30       -36       -50  0.1655887
## 31    31      -117      -138  0.2684118
## 32    32       -58       -94  0.2623854
## 33    33       -13       -46  0.4416051
## 34    34      -124      -127  0.1311927
## 35    35      -113      -140  0.1716415
## 36    36         7      -150  1.3252277
## 37    37        11        33  0.3128832
## 38    38        24       -97  0.2112608
## 39    39        20       -92  1.7471497
## 40    40       -21       -32  0.1333894
## 41    41       -59       -76  0.1777711
## 42    42       -70       -90  0.3013598
## 43    43       -24       -27  0.4033035
## 44    44        16        -4  1.0523692
## 45    45       -52       -57  0.3913333
## 46    46       -91       -95  0.1699430
## 47    47       -51       -53  0.7894933
## 48    48      -108      -131  0.2558771
## 49    49        -9        21 27.1589265
## 50    50       -22       -45  2.0083698
## 51    51      -142      -146  0.2592702
## 52    52        10        50  0.5129516
## 53    53       -68        26  1.1443270
## 54    54       -55      -134  4.2293163
## 55    55       -69      -120  0.5355756
## 56    56      -141        51  0.2943447
## 57    57        40       -37  0.4644243
## 58    58        31      -148  0.1429002
## 59    59        -7        13  1.3640485
## 60    60      -118      -132  0.6130577
## 61    61        44       -43  0.3118788
## 62    62      -106      -136  0.9583056
## 63    63      -112        34  0.3449165
## 64    64       -72       -74  0.2363181
## 65    65      -111      -116  0.4724409
## 66    66       -62        39  1.5314283
## 67    67         1      -122  2.6863691
## 68    68        37        15  0.8645241
## 69    69        29        23  0.3722352
## 70    70       -33       -34  0.5405283
## 71    71        12      -125  3.4304199
## 72    72       -65        18  0.2265906
## 73    73        43       -44  0.2728766
## 74    74        42         9  0.6265431
## 75    75       -84      -135  0.4584860
## 76    76        53       -80  0.1870942
## 77    77      -103        35  0.8144325
## 78    78      -104        58  0.2415266
## 79    79        64        28 12.5946503
## 80    80      -101        19  0.3118788
## 81    81        71      -145  0.4987262
## 82    82        47        22  0.2146908
## 83    83        32       -99  2.1345735
## 84    84       -73      -147  0.3722352
## 85    85        41       -77  0.7048376
## 86    86        25       -19  0.2875493
## 87    87       -54        74  0.5290904
## 88    88        55       -88  1.4308379
## 89    89      -126      -130  0.4293560
## 90    90        17       -23  0.2112608
## 91    91       -71       -86  0.2842208
## 92    92      -105         6  0.7995944
## 93    93       -78        78  0.4176726
## 94    94       -60        46  0.4818247
## 95    95      -119      -123  0.2415266
## 96    96        38        72  2.8593049
## 97    97         3        52  0.3118788
## 98    98        67      -114  0.7563437
## 99    99        86       -15  0.5682073
## 100  100        54        85  1.3276410
## 101  101        77        56  0.1333894
## 102  102        45        91  0.2655045
## 103  103        59        30  2.2956986
## 104  104        66        79  0.5851456
## 105  105        84      -109  0.4815183
## 106  106        96        27  0.2592702
## 107  107        69        73  0.3907803
## 108  108        98      -115  1.1187762
## 109  109       -63        88  0.5683562
## 110  110        65        81  0.1311927
## 111  111       -16        70  3.9195379
## 112  112       107        57  0.5278193
## 113  113        48        89  0.8019967
## 114  114        75        63  1.0511733
## 115  115        94        76  0.4650710
## 116  116        62        95  0.9272554
## 117  117       105       114  0.4133855
## 118  118        61       103  0.2653865
## 119  119        93        92  2.0110753
## 120  120        83       -61  0.0000000
## 121  121       101       110  0.4336200
## 122  122        90        97  0.6665665
## 123  123      -110        60  0.8397975
## 124  124        68        49  7.9780202
## 125  125       102        36  0.4833674
## 126  126        87       115  0.2112608
## 127  127       100       104  1.7998309
## 128  128        99       111  0.4750990
## 129  129       126      -107  0.2653865
## 130  130       116       113  0.7119273
## 131  131       124       118  0.3828594
## 132  132        80       121  0.3379073
## 133  133       112       122  1.1803836
## 134  134       -42       120  0.4269933
## 135  135       117       108  0.8988842
## 136  136        82       127  0.1655887
## 137  137       125       119  0.4582660
## 138  138       129       109  0.4926419
## 139  139       136       137  3.9375786
## 140  140       138       106  1.1977371
## 141  141       139       135  0.3950466
## 142  142       132       123  4.2129195
## 143  143       133       128  0.4045414
## 144  144       142       130  1.0456249
## 145  145       134       140  0.6150517
## 146  146       143       131  1.6485355
## 147  147       141       144  0.3118788
## 148  148       145       147  0.9267179
## 149  149       146       148  0.5405840
# A negative number identifies an original observation (its row number);
# a positive number identifies the cluster formed at that earlier step.

# Fit hierarchical clustering with three different linkage methods
hc_complete = hclust(dist_matrix, method = "complete")
hc_average = hclust(dist_matrix, method = "average")
hc_ward = hclust(dist_matrix, method = "ward.D2")
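
Once a tree has been fitted, cutree() extracts cluster memberships for any chosen number of clusters. As a quick illustration (not part of the original output), cutting Ward's tree at k = 3 and cross-tabulating against the known species provides an early quality check:

# Cut Ward's tree into 3 clusters and compare with the known species
hc_clusters = cutree(hc_ward, k = 3)
table(Species = iris$Species, Ward_Cluster = hc_clusters)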

Dendrogram Visualization

par(mfrow = c(3,1))

plot(hc_complete, main = "Complete Linkage", xlab = "", sub = "")
rect.hclust(hc_complete, k = 3, border = 2:4)

plot(hc_average, main = "Average Linkage", xlab = "", sub = "")
rect.hclust(hc_average, k = 3, border = 2:4)

plot(hc_ward, main = "Ward's Method", xlab = "", sub = "")
rect.hclust(hc_ward, k = 3, border = 2:4)

par(mfrow = c(1,1))

Circular Dendrogram

# Create and color the dendrogram
dend <- as.dendrogram(hc_ward) %>%
  set("branches_k_color", k = 3) %>%  # Color branches by 3 clusters
  set("labels_colors", k = 3)         # Color labels by 3 clusters

# Circular plot
circlize_dendrogram(dend, labels_track_height = 0.1, dend_track_height = 0.6)

Interpretation:

- Ward’s method shows the clearest separation into three clusters

- The complete linkage method also shows reasonable separation

- Average linkage produces less distinct clusters

Optimal Number of Clusters

# wss = total within sum of squares

fviz_nbclust(iris_scaled, FUNcluster = hcut, method = "wss") +
  geom_vline(xintercept = 3, linetype = 2) +
  labs(title = "Elbow Method for Optimal Clusters")

Interpretation:

- The “elbow” occurs at k=3, suggesting 3 clusters are optimal

- This matches our knowledge of three iris species in the dataset

K-means Clustering

Determine Optimal k

# B = number of bootstrap samples (Monte Carlo)

set.seed(123)
# nstart = 25 stabilizes each k-means fit inside the gap computation
gap_stat = clusGap(iris_scaled, FUNcluster = kmeans,
                   K.max = 10, B = 50, nstart = 25)

fviz_gap_stat(gap_stat) + 
  labs(title = "Gap Statistic for K-means Clustering")

Perform K-means Clustering (k=3)

# nstart = number of random initializations of the cluster centers;
# the run with the lowest total within-cluster SS is kept
set.seed(123)
km_res = kmeans(iris_scaled, centers = 3, nstart = 25)
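
Before plotting, it is worth glancing at the fitted object; km_res stores, among other things, the cluster sizes and the between-cluster share of the total sum of squares (output not reproduced here):

# Cluster sizes and proportion of total variance captured between clusters
km_res$size
km_res$betweenss / km_res$totss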

Visualize K-means Results

p1 = fviz_cluster(km_res, data = iris_scaled,
                  ellipse.type = "convex",
                  palette = "Dark2",
                  ggtheme = theme_bw(),
                  main = "a. K-means cluster")

# Compare with actual species
iris$Cluster = as.factor(km_res$cluster)
p2 = ggplot(iris, aes(x = Petal.Length, y = Petal.Width, 
                      color = Cluster, shape = Species)) +
  geom_point(size = 3) +
  scale_color_manual(values = c("#E69F00", "#56B4E9", "#009E73")) +
  theme_bw() +
  labs(title = "b. Comparison with actual species")

grid.arrange(p1, p2, ncol = 2)

Interpretation:

- K-means successfully separates the data into 3 distinct clusters

- Cluster 1 (yellow) corresponds exactly to setosa

- Clusters 2 (blue) and 3 (green) mostly separate versicolor from virginica

- Some overlap between versicolor and virginica is expected as these species are more similar

Cluster Profiles

# Calculate cluster means
cluster_means <- iris %>%
  mutate(Cluster = km_res$cluster) %>%
  group_by(Cluster) %>% 
  summarise(across(where(is.numeric), mean))

# Display cluster characteristics
cluster_means
## # A tibble: 3 × 5
##   Cluster Sepal.Length Sepal.Width Petal.Length Petal.Width
##     <int>        <dbl>       <dbl>        <dbl>       <dbl>
## 1       1         5.01        3.43         1.46       0.246
## 2       2         5.80        2.67         4.37       1.41 
## 3       3         6.78        3.10         5.51       1.97

Interpretation:

- Cluster 1 (setosa): smallest flowers (lowest values on all four measurements)

- Cluster 2 (mostly versicolor): medium-sized flowers

- Cluster 3 (mostly virginica): largest flowers
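
A dodged bar chart makes these profiles easier to compare across clusters (a sketch built from the cluster_means tibble computed above):

# Visualize the cluster profiles in the original measurement units
cluster_means %>%
  pivot_longer(-Cluster, names_to = "Variable", values_to = "Mean") %>%
  ggplot(aes(x = Variable, y = Mean, fill = factor(Cluster))) +
  geom_col(position = "dodge") +
  labs(fill = "Cluster", title = "Cluster profiles")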

Evaluating Cluster Quality

Silhouette Analysis

sil = silhouette(km_res$cluster, dist_matrix)

fviz_silhouette(sil) +
  labs(title = "Silhouette Plot for K-means Clustering")
##   cluster size ave.sil.width
## 1       1   50          0.64
## 2       2   53          0.39
## 3       3   47          0.35
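
The overall average silhouette width cited below can be computed directly from the silhouette object (a quick check; the sil_width column holds the per-observation widths):

# Overall average silhouette width across all 150 observations
mean(sil[, "sil_width"])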

Interpretation:

- Most observations have positive silhouette widths

- Average silhouette width is 0.46, indicating reasonable structure

- Cluster 1 (setosa) has the highest cohesion and separation

- Some observations in cluster 3 have negative silhouette widths, indicating they may be assigned to the wrong cluster

Comparing with Actual Species

Confusion Matrix

table(Actual = iris$Species, Predicted = km_res$cluster)
##             Predicted
## Actual        1  2  3
##   setosa     50  0  0
##   versicolor  0 39 11
##   virginica   0 14 36
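
The overall agreement quoted below can be computed from this table. Note that summing the diagonal works only because the k-means labels happen to line up with the species order in this run (seed 123):

# Overall agreement between species and k-means clusters
cm = table(iris$Species, km_res$cluster)
sum(diag(cm)) / sum(cm)  # (50 + 39 + 36) / 150 = 0.833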

Interpretation:

- Perfect classification for setosa (Cluster 1)

- 11 versicolor misclassified as virginica (Cluster 3)

- 14 virginica misclassified as versicolor (Cluster 2)

- Overall accuracy: 83.3%

Comments

Key findings from the cluster analysis:

  1. Both hierarchical and k-means clustering identified three natural clusters in the data

  2. The clusters correspond well to the known iris species:

    • Cluster 1: Setosa (perfect separation)
    • Cluster 2: Mostly versicolor
    • Cluster 3: Mostly virginica
  3. The overlap between versicolor and virginica clusters reflects biological similarity between these species

  4. Cluster validation metrics (elbow method, gap statistic, silhouette analysis) all support the choice of k=3

The results largely match the known species classification, with some expected confusion between the more similar versicolor and virginica species.
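
As a final cross-check of finding 1, the two methods can be compared directly; cross-tabulating Ward's 3-cluster solution against the k-means labels shows how closely the two partitions agree (cluster numbers are arbitrary, so agreement appears as one dominant cell per row):

# Compare the hierarchical (Ward) and k-means 3-cluster solutions
table(Ward = cutree(hc_ward, k = 3), KMeans = km_res$cluster)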