# Load libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
# Read the customer segmentation CSV (path is relative to the working directory)
data <- read_csv(file = "customer_segmentation.csv")
## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove the ID column so it is not scaled and clustered with the other
# columns, then preview the first rows
data <- select(data, -ID)
head(data, n = 6)
## # A tibble: 6 × 14
## CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 2 2 2 2 2
## 2 1 2 1 1 1 1
## 3 2 1 1 1 1 2
## 4 3 3 2 4 1 2
## 5 2 1 3 5 2 1
## 6 1 1 3 2 1 1
## # ℹ 8 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## # Find_items <dbl>, other_shops <dbl>, Gender <dbl>, Age <dbl>,
## # Education <dbl>
# Scale the data
# scale() centres each column and divides by its standard deviation, so every
# variable contributes on the same footing to the distances k-means minimises.
data_scaled <- scale(data)
# Elbow method to choose optimal k
# fviz_nbclust() refits kmeans() for each candidate k from random starting
# centres. Seed the RNG so the curve is reproducible, and forward nstart = 25
# so each k reports the best of 25 starts (the same number of starts used for
# the final fit) rather than a single, possibly poor, local optimum.
set.seed(42)
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal k")

# Fit the final k-means model with 3 clusters.
# Seeding first makes the 25 random initialisations repeatable.
set.seed(42)
kmeans_result <- kmeans(x = data_scaled, centers = 3, nstart = 25)
# Store each row's cluster assignment on the data as a factor column
data[["Cluster"]] <- as.factor(kmeans_result[["cluster"]])
head(data, n = 6)
## # A tibble: 6 × 15
## CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 2 2 2 2 2
## 2 1 2 1 1 1 1
## 3 2 1 1 1 1 2
## 4 3 3 2 4 1 2
## 5 2 1 3 5 2 1
## 6 1 1 3 2 1 1
## # ℹ 9 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## # Find_items <dbl>, other_shops <dbl>, Gender <dbl>, Age <dbl>,
## # Education <dbl>, Cluster <fct>
# Cluster profiles: the mean of every variable within each cluster,
# with result columns named avg_<variable>
summarise(
  group_by(data, Cluster),
  across(.cols = everything(), .fns = mean, .names = "avg_{.col}")
)
## # A tibble: 3 × 15
## Cluster avg_CS_helpful avg_Recommend avg_Come_again avg_All_Products
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 1.5 2.17
## 2 2 2.5 2 2.5 3.25
## 3 3 1.58 1.25 1.08 1.67
## # ℹ 10 more variables: avg_Profesionalism <dbl>, avg_Limitation <dbl>,
## # avg_Online_grocery <dbl>, avg_delivery <dbl>, avg_Pick_up <dbl>,
## # avg_Find_items <dbl>, avg_other_shops <dbl>, avg_Gender <dbl>,
## # avg_Age <dbl>, avg_Education <dbl>
# Plot the fitted clusters: observations drawn as points, one "norm"-type
# ellipse per cluster, "jco" colour palette, minimal ggplot theme
fviz_cluster(
  object = kmeans_result,
  data = data_scaled,
  geom = "point",
  ellipse.type = "norm",
  palette = "jco",
  ggtheme = theme_minimal()
)
