1 Setup

1.1 Set Seed

set.seed(1618)

1.2 Import Data

library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'dplyr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df <- read_csv("C:/Users/Muhammad Hafiz F/Downloads/wine-clustering.csv")
## Rows: 178 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): Alcohol, Malic_Acid, Ash, Ash_Alcanity, Magnesium, Total_Phenols, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df)
## # A tibble: 6 × 13
##   Alcohol Malic_Acid   Ash Ash_Alcanity Magnesium Total_Phenols Flavanoids
##     <dbl>      <dbl> <dbl>        <dbl>     <dbl>         <dbl>      <dbl>
## 1    14.2       1.71  2.43         15.6       127          2.8        3.06
## 2    13.2       1.78  2.14         11.2       100          2.65       2.76
## 3    13.2       2.36  2.67         18.6       101          2.8        3.24
## 4    14.4       1.95  2.5          16.8       113          3.85       3.49
## 5    13.2       2.59  2.87         21         118          2.8        2.69
## 6    14.2       1.76  2.45         15.2       112          3.27       3.39
## # ℹ 6 more variables: Nonflavanoid_Phenols <dbl>, Proanthocyanins <dbl>,
## #   Color_Intensity <dbl>, Hue <dbl>, OD280 <dbl>, Proline <dbl>

1.3 Data Exploration

library(corrplot)
## corrplot 0.95 loaded
cor_mat <- cor(df, use = "pairwise.complete.obs", method = "pearson")

corrplot(cor_mat, method = "color", type = "upper", tl.col = "black", addCoef.col = "black", number.cex = 0.6)

1.4 Drop Unnecessary Columns

library(dplyr)

keep <- c(
  "Alcohol","Malic_Acid","Ash_Alcanity","Magnesium",
  "Flavanoids","Nonflavanoid_Phenols","Proanthocyanins",
  "Color_Intensity","Proline"
)

df_clean <- df %>% select(all_of(intersect(keep, names(df))))
head(df_clean)
## # A tibble: 6 × 9
##   Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids Nonflavanoid_Phenols
##     <dbl>      <dbl>        <dbl>     <dbl>      <dbl>                <dbl>
## 1    14.2       1.71         15.6       127       3.06                 0.28
## 2    13.2       1.78         11.2       100       2.76                 0.26
## 3    13.2       2.36         18.6       101       3.24                 0.3 
## 4    14.4       1.95         16.8       113       3.49                 0.24
## 5    13.2       2.59         21         118       2.69                 0.39
## 6    14.2       1.76         15.2       112       3.39                 0.34
## # ℹ 3 more variables: Proanthocyanins <dbl>, Color_Intensity <dbl>,
## #   Proline <dbl>

1.5 Scale Data

df_scaled <- scale(df_clean)
df_scaled <- as.data.frame(df_scaled)
head(df_scaled)
##     Alcohol  Malic_Acid Ash_Alcanity  Magnesium Flavanoids Nonflavanoid_Phenols
## 1 1.5143408 -0.56066822   -1.1663032 1.90852151  1.0319081           -0.6577078
## 2 0.2455968 -0.49800856   -2.4838405 0.01809398  0.7315653           -0.8184106
## 3 0.1963252  0.02117152   -0.2679823 0.08810981  1.2121137           -0.4970050
## 4 1.6867914 -0.34583508   -0.8069748 0.92829983  1.4623994           -0.9791134
## 5 0.2948684  0.22705328    0.4506745 1.27837900  0.6614853            0.2261576
## 6 1.4773871 -0.51591132   -1.2860793 0.85828399  1.3622851           -0.1755994
##   Proanthocyanins Color_Intensity     Proline
## 1       1.2214385       0.2510088  1.01015939
## 2      -0.5431887      -0.2924962  0.96252635
## 3       2.1299594       0.2682629  1.39122370
## 4       1.0292513       1.1827317  2.32800680
## 5       0.4002753      -0.3183774 -0.03776747
## 6       0.6623487       0.7298108  2.23274072

2 Hierarchical Clustering

2.1 Compute Distance Matrix

dist_matrix <- dist(df_scaled, method = "euclidean")
head(dist_matrix)
## [1] 3.232780 2.688085 2.023915 2.852088 1.870516 2.451591

2.2 Perform Hierarchical Clustering

hc <- hclust(dist_matrix, method = "ward.D2")
hc
## 
## Call:
## hclust(d = dist_matrix, method = "ward.D2")
## 
## Cluster method   : ward.D2 
## Distance         : euclidean 
## Number of objects: 178

2.3 Plot Dendrogram

plot(hc, labels = FALSE, hang = -1, main = "Dendrogram of Hierarchical Clustering")

2.4 Determine Optimal k from Potential ks from Dendrogram

library(cluster)
## Warning: package 'cluster' was built under R version 4.4.3
library(clusterCrit)
## Warning: package 'clusterCrit' was built under R version 4.4.3
ks <- 2:5
sizes_list <- list()
results <- data.frame(
  k = integer(),
  Silhouette = numeric(),
  Calinski_Harabasz = numeric(),
  Dunn = numeric(),
  stringsAsFactors = FALSE
)

for (k in ks) {
  cl <- cutree(hc, k = k)
  sizes_list[[as.character(k)]] <- table(cl)

  sil <- silhouette(cl, dist_matrix)
  sil_mean <- mean(sil[, "sil_width"])

  crit <- intCriteria(as.matrix(df_scaled), as.integer(cl),
                      c("Calinski_Harabasz", "Dunn"))

  results <- rbind(results, data.frame(
    k = k,
    Silhouette = sil_mean,
    Calinski_Harabasz = crit$calinski_harabasz,
    Dunn = crit$dunn
  ))
}

for (k in ks) {
  cat("\n--- k =", k, "cluster sizes ---\n")
  print(sizes_list[[as.character(k)]])
}
## 
## --- k = 2 cluster sizes ---
## cl
##   1   2 
##  64 114 
## 
## --- k = 3 cluster sizes ---
## cl
##  1  2  3 
## 64 67 47 
## 
## --- k = 4 cluster sizes ---
## cl
##  1  2  3  4 
## 57  7 67 47 
## 
## --- k = 5 cluster sizes ---
## cl
##  1  2  3  4  5 
## 57  7 48 19 47
results[order(results$k), ]
##   k Silhouette Calinski_Harabasz      Dunn
## 1 2  0.2620687          65.06705 0.1608956
## 2 3  0.2805325          65.58287 0.1752991
## 3 4  0.2878328          52.42604 0.1864274
## 4 5  0.2334873          45.41841 0.1755590

2.5 Evaluate Final Clustering with Optimal k

optimal_k <- 3

final_cl <- cutree(hc, k = optimal_k)
final_sil <- silhouette(final_cl, dist_matrix)
final_sil_mean <- mean(final_sil[, "sil_width"])
cat("Final Silhouette Score for k =", optimal_k, "is", final_sil_mean)
## Final Silhouette Score for k = 3 is 0.2805325
plot(final_sil, main = paste("Silhouette Plot for k =", optimal_k))

2.6 Cluster Visualization

library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
fviz_cluster(list(data = df_scaled, cluster = final_cl),
             geom = "point",
             ellipse.type = "convex",
             palette = "jco",
             ggtheme = theme_minimal(),
             main = "Cluster Visualization for Hierarchical Clustering (k=3)")

2.7 Cluster Profiling

df_final <- df_clean %>%
  mutate(Cluster = as.factor(final_cl))
df_final
## # A tibble: 178 × 10
##    Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids Nonflavanoid_Phenols
##      <dbl>      <dbl>        <dbl>     <dbl>      <dbl>                <dbl>
##  1    14.2       1.71         15.6       127       3.06                 0.28
##  2    13.2       1.78         11.2       100       2.76                 0.26
##  3    13.2       2.36         18.6       101       3.24                 0.3 
##  4    14.4       1.95         16.8       113       3.49                 0.24
##  5    13.2       2.59         21         118       2.69                 0.39
##  6    14.2       1.76         15.2       112       3.39                 0.34
##  7    14.4       1.87         14.6        96       2.52                 0.3 
##  8    14.1       2.15         17.6       121       2.51                 0.31
##  9    14.8       1.64         14          97       2.98                 0.29
## 10    13.9       1.35         16          98       3.15                 0.22
## # ℹ 168 more rows
## # ℹ 4 more variables: Proanthocyanins <dbl>, Color_Intensity <dbl>,
## #   Proline <dbl>, Cluster <fct>
cluster_profile <- df_final %>%
  group_by(Cluster) %>%
  summarise(across(everything(), list(mean = mean, sd = sd), .names = "{col}_{fn}"))
cluster_profile
## # A tibble: 3 × 19
##   Cluster Alcohol_mean Alcohol_sd Malic_Acid_mean Malic_Acid_sd
##   <fct>          <dbl>      <dbl>           <dbl>         <dbl>
## 1 1               13.6      0.603            1.97         0.684
## 2 2               12.3      0.535            1.86         0.954
## 3 3               13.2      0.541            3.52         0.956
## # ℹ 14 more variables: Ash_Alcanity_mean <dbl>, Ash_Alcanity_sd <dbl>,
## #   Magnesium_mean <dbl>, Magnesium_sd <dbl>, Flavanoids_mean <dbl>,
## #   Flavanoids_sd <dbl>, Nonflavanoid_Phenols_mean <dbl>,
## #   Nonflavanoid_Phenols_sd <dbl>, Proanthocyanins_mean <dbl>,
## #   Proanthocyanins_sd <dbl>, Color_Intensity_mean <dbl>,
## #   Color_Intensity_sd <dbl>, Proline_mean <dbl>, Proline_sd <dbl>

2.7.1 Interpretation of the Hierarchical Cluster Profiles:

(Courtesy of GPT since I don’t understand wines)

  • Cluster 1 — Rich / phenolic
    • fuller, riper profile; stronger flavonoid-driven structure (tannins), likely rounder palate and higher body.
  • Cluster 2 — Light / low-color
    • lighter style; thinner color and phenolic load; milder structure, easier drinking.
  • Cluster 3 — Acidic / high-color, nonflavanoid-leaning
    • sharper acidity, deeper color, phenolic balance shifts away from flavanoids; punchy color with leaner tannin feel.

3 K-Means Clustering

3.1 Determine Optimal k Using Elbow Method and Silhouette Score

wss <- numeric()
ks_kmeans <- 1:10

for (k in ks_kmeans) {
  kmeans_model <- kmeans(df_scaled, centers = k, nstart = 25)
  wss[k] <- kmeans_model$tot.withinss
}

plot(ks_kmeans, wss, type = "b", pch = 19,
     xlab = "Number of clusters k",
     ylab = "Total within-clusters sum of squares",
     main = "Elbow Method for Determining Optimal k")

sil_scores <- numeric()

for (k in ks_kmeans[-1]) {
  kmeans_model <- kmeans(df_scaled, centers = k, nstart = 25)
  sil <- silhouette(kmeans_model$cluster, dist_matrix)
  sil_scores[k] <- mean(sil[, "sil_width"])
}

plot(ks_kmeans[-1], sil_scores[-1], type = "b", pch = 19,
     xlab = "Number of clusters k",
     ylab = "Average Silhouette Score",
     main = "Silhouette Scores for Determining Optimal k")

3.2 Perform K-Means Clustering with Optimal k and Evaluate

optimal_k_kmeans <- 3

kmeans_final <- kmeans(df_scaled, centers = optimal_k_kmeans, nstart = 25)
kmeans_final
## K-means clustering with 3 clusters of sizes 69, 62, 47
## 
## Cluster means:
##      Alcohol Malic_Acid Ash_Alcanity   Magnesium  Flavanoids
## 1 -0.8905065 -0.3707433    0.2132921 -0.51869408  0.03410259
## 2  0.8314919 -0.3306755   -0.7142449  0.62565074  0.90030626
## 3  0.2104777  0.9804929    0.6290645 -0.06383944 -1.23770355
##   Nonflavanoid_Phenols Proanthocyanins Color_Intensity    Proline
## 1           0.05846771     -0.05145304      -0.8360012 -0.7568653
## 2          -0.61882809      0.62007876       0.1474144  1.1308810
## 3           0.73049084     -0.74243880       1.0328594 -0.3806578
## 
## Clustering vector:
##   [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2
##  [38] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 2 1 1 1 2
##  [75] 1 1 1 1 2 1 1 1 1 3 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [112] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 384.2362 280.0128 224.5865
##  (between_SS / total_SS =  44.2 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
final_sil_kmeans <- silhouette(kmeans_final$cluster, dist_matrix)
final_sil_mean_kmeans <- mean(final_sil_kmeans[, "sil_width"])

cat("Final Silhouette Score for K-Means k =", optimal_k_kmeans, "is", final_sil_mean_kmeans)
## Final Silhouette Score for K-Means k = 3 is 0.292841
plot(final_sil_kmeans, main = paste("Silhouette Plot for K-Means k =", optimal_k_kmeans))

3.3 Cluster Visualization

fviz_cluster(kmeans_final, data = df_scaled,
             geom = "point",
             ellipse.type = "convex",
             palette = "jco",
             ggtheme = theme_minimal(),
             main = "Cluster Visualization for K-Means Clustering (k=3)")

3.4 Cluster Profiling

df_kmeans_final <- df_clean %>%
  mutate(Cluster = as.factor(kmeans_final$cluster))
df_kmeans_final
## # A tibble: 178 × 10
##    Alcohol Malic_Acid Ash_Alcanity Magnesium Flavanoids Nonflavanoid_Phenols
##      <dbl>      <dbl>        <dbl>     <dbl>      <dbl>                <dbl>
##  1    14.2       1.71         15.6       127       3.06                 0.28
##  2    13.2       1.78         11.2       100       2.76                 0.26
##  3    13.2       2.36         18.6       101       3.24                 0.3 
##  4    14.4       1.95         16.8       113       3.49                 0.24
##  5    13.2       2.59         21         118       2.69                 0.39
##  6    14.2       1.76         15.2       112       3.39                 0.34
##  7    14.4       1.87         14.6        96       2.52                 0.3 
##  8    14.1       2.15         17.6       121       2.51                 0.31
##  9    14.8       1.64         14          97       2.98                 0.29
## 10    13.9       1.35         16          98       3.15                 0.22
## # ℹ 168 more rows
## # ℹ 4 more variables: Proanthocyanins <dbl>, Color_Intensity <dbl>,
## #   Proline <dbl>, Cluster <fct>
kmeans_cluster_profile <- df_kmeans_final %>%
  group_by(Cluster) %>%
  summarise(across(everything(), list(mean = mean, sd = sd), .names = "{col}_{fn}"))
kmeans_cluster_profile
## # A tibble: 3 × 19
##   Cluster Alcohol_mean Alcohol_sd Malic_Acid_mean Malic_Acid_sd
##   <fct>          <dbl>      <dbl>           <dbl>         <dbl>
## 1 1               12.3      0.544            1.92         0.997
## 2 2               13.7      0.546            1.97         0.695
## 3 3               13.2      0.526            3.43         1.01 
## # ℹ 14 more variables: Ash_Alcanity_mean <dbl>, Ash_Alcanity_sd <dbl>,
## #   Magnesium_mean <dbl>, Magnesium_sd <dbl>, Flavanoids_mean <dbl>,
## #   Flavanoids_sd <dbl>, Nonflavanoid_Phenols_mean <dbl>,
## #   Nonflavanoid_Phenols_sd <dbl>, Proanthocyanins_mean <dbl>,
## #   Proanthocyanins_sd <dbl>, Color_Intensity_mean <dbl>,
## #   Color_Intensity_sd <dbl>, Proline_mean <dbl>, Proline_sd <dbl>

3.4.1 Interpretation of the K-Means Cluster Profiles:

(Once again, courtesy of GPT since I don’t understand wines)

  • Cluster 1 — Rich / flavonoid-heavy
    • fuller body, riper signature; strong tannin/phenolic backbone driven by flavonoids; lower acid markers.
  • Cluster 2 — Acidic / high-color, nonflavonoid-leaning
    • sharper acidity with deep color; phenolic balance shifts away from flavonoids (leaner tannins), more bite/linearity.
  • Cluster 3 — Light / low-color
    • leanest style overall—lower alcohol, color, and phenolic load; softer structure.