The necessary libraries and the wine dataset are loaded.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Location of the raw wine data set (CSV file hosted on GitHub).
data_url <- "https://raw.githubusercontent.com/koimabrian/Datasets/refs/heads/main/Wine.csv"
# Read the CSV into a data frame. The argument is spelled out in full as
# `comment.char` instead of relying on partial matching of `comment`;
# text following a "#" is treated as a comment and ignored.
Wine <- read.csv(data_url, comment.char = "#")
# Preview the first rows to confirm the import worked.
head(Wine)
The observation number and country columns are dropped, and the remaining continuous variables are standardized (centered to mean 0 and scaled to unit variance).
# Remove the Obs and Country columns, then standardize what is left.
# scale() with its defaults centers each column and divides it by its
# standard deviation, returning a numeric matrix.
data <- scale(select(Wine, -Obs, -Country))
The k-means algorithm is applied to the standardized data, setting the number of clusters to 3.
# k-means picks its starting centers at random, so fix the RNG seed to make
# the fit (and the arbitrary 1/2/3 numbering of the clusters) reproducible.
set.seed(123)
# Fit 3 clusters on the standardized data; nstart = 100 keeps the best of
# 100 random initializations, iter.max = 100 caps the iterations per start.
kmeans_result <- kmeans(data, centers = 3, iter.max = 100, nstart = 100)
# Print cluster sizes, centers, assignments and sums of squares.
kmeans_result
## K-means clustering with 3 clusters of sizes 25, 43, 32
##
## Cluster means:
## Rating Price Alcohol Residual_Sugar Sulphates pH
## 1 0.0005649203 -0.09531108 0.7695537 0.73705964 0.7738343 -0.71133001
## 2 -0.7611830170 -0.74488647 -0.3311789 -0.48277078 -0.1169446 0.37747968
## 3 1.0223983350 1.07540297 -0.1561923 0.07289539 -0.4474138 0.04848825
##
## Clustering vector:
## [1] 3 2 2 1 2 2 1 1 3 3 2 2 2 2 2 3 3 2 2 2 1 1 3 3 2 2 3 1 2 3 2 3 3 1 3 2 3
## [38] 1 3 2 2 2 3 2 1 3 2 2 3 3 1 1 2 3 2 2 3 1 2 2 1 2 3 1 3 3 3 1 2 2 2 2 2 1
## [75] 3 2 2 1 1 2 2 3 1 1 2 3 2 1 1 1 3 2 1 3 3 3 3 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 114.3882 156.1271 119.1354
## (between_SS / total_SS = 34.4 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
To identify the optimal number of clusters, three diagnostics are plotted against the number of clusters: the total within-cluster sum of squares (WSS, the "elbow" method), the average silhouette width, and the gap statistic.
library(factoextra)
# The diagnostics below refit k-means for every candidate k (and the gap
# statistic additionally bootstraps reference data), so seed the RNG to make
# the curves reproducible.
set.seed(123)
# Extra arguments are forwarded to kmeans(); nstart = 25 avoids judging a
# given k by a single, possibly poor, random initialization.
# WSS plot
fviz_nbclust(data, kmeans, method = "wss", nstart = 25)
# Silhouette plot
fviz_nbclust(data, kmeans, method = "silhouette", nstart = 25)
# Gap statistic
fviz_nbclust(data, kmeans, method = "gap_stat", nstart = 25)
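The clusters are first visualized on the first two principal components of the standardized data (the default `fviz_cluster()` view); a plot on the original variables (Rating vs Price) follows in the next step.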
# Cluster plot on the first two principal components.
# Reuse the model fitted above instead of refitting: a fresh kmeans() run
# uses new random starts, so its cluster numbers could be permuted relative
# to the printed `kmeans_result`.
fviz_cluster(kmeans_result, data = data)
Cluster labels are added to the original dataset, and clusters are visualized using the Rating and Price variables.
# Reuse the k-means fit from above rather than running a third, independent
# fit: separate runs can number the clusters differently, which would make
# the labels here disagree with the printed summary and the cluster plot.
# `clusters` is kept as a name for the fitted model.
clusters <- kmeans_result
# Attach each observation's cluster label to the original (unscaled) data.
Wine <- Wine |>
  mutate(cluster = clusters$cluster)
# Plot clusters using Rating and Price
Wine |>
  ggplot(aes(x = Rating, y = Price, color = as.factor(cluster))) +
  geom_point() +
  labs(title = "Wine Clusters by Rating and Price", color = "Cluster")