# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(cluster)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(readr)
# Import the customer segmentation data set (path is relative to the working directory)
data <- read_csv(file = "customer_segmentation.csv")
## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows to confirm the columns were parsed as expected
data %>%
  head()
## # A tibble: 6 × 15
## ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 2 2 2 2 2
## 2 2 1 2 1 1 1 1
## 3 3 2 1 1 1 1 2
## 4 4 3 3 2 4 1 2
## 5 5 2 1 3 5 2 1
## 6 6 1 1 3 2 1 1
## # ℹ 8 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## # Find_items <dbl>, other_shops <dbl>, Gender <dbl>, Age <dbl>,
## # Education <dbl>
# Count missing cells across the whole table
data %>%
  is.na() %>%
  sum()
## [1] 0
# Keep only the numeric variables that describe a customer.
# ID is a row identifier stored as a double, so where(is.numeric) alone would
# let it into the distance computation; it is excluded explicitly.
# any_of() keeps this from failing if the file has no ID column.
data_numeric <- data %>%
  select(where(is.numeric), -any_of("ID"))
# Standardise every column (centre and divide by its standard deviation) so
# that no variable dominates the distances used by k-means
data_scaled <- scale(data_numeric)
# Elbow method: plot the total within-cluster sum of squares against k.
# The call is seeded and uses the same nstart as the final model, so the curve
# is reproducible and not distorted by a single unlucky random start
# (extra arguments to fviz_nbclust() are forwarded to kmeans()).
set.seed(123)
fviz_nbclust(data_scaled, kmeans, method = "wss", nstart = 25) +
  # Dashed line marks the k used for the final model; adjust it from the plot
  geom_vline(xintercept = 3, linetype = 2) +
  labs(title = "Elbow Method for Optimal Clusters")
# Fit the final k-means model
k <- 3 # number of clusters; revise after inspecting the elbow plot
set.seed(123) # fix the random starts so the partition is reproducible
km_result <- kmeans(x = data_scaled, centers = k, nstart = 25)
# Attach each row's cluster label (as a factor) to the unscaled data
data_clustered <- mutate(data, Cluster = as.factor(km_result[["cluster"]]))
# Plot the clusters in the plane of the first two principal components
fviz_cluster(
  object = km_result,
  data = data_scaled,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  ggtheme = theme_minimal()
)
# Profile each cluster by the mean of every numeric variable.
# The function is given to across() explicitly because forwarding na.rm
# through `...` is deprecated as of dplyr 1.1.0. ID is left out: the mean of a
# row identifier says nothing about the segment.
data_clustered %>%
  group_by(Cluster) %>%
  summarise(
    across(
      where(is.numeric) & !any_of("ID"),
      function(x) mean(x, na.rm = TRUE)
    )
  )
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## ℹ In group 1: `Cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 3 × 16
## Cluster ID CS_helpful Recommend Come_again All_Products Profesionalism
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 11.3 1.58 1.25 1.08 1.67 1.42
## 2 2 11.5 1 1 1.5 2.17 1
## 3 3 12 2.5 2 2.5 3.25 2
## # ℹ 9 more variables: Limitation <dbl>, Online_grocery <dbl>, delivery <dbl>,
## # Pick_up <dbl>, Find_items <dbl>, other_shops <dbl>, Gender <dbl>,
## # Age <dbl>, Education <dbl>
Interpretation: Cluster 1 (blue circles): This is the largest cluster and has the widest spread. It contains the majority of “average” customers, i.e. those whose answers are mid-range on most variables (e.g. CS_helpful, Recommend, Profesionalism).
Cluster 2 (yellow triangles): Small and tightly grouped. These customers are very similar to each other and probably distinct in behavior from the rest (e.g. high spenders or young, tech-savvy shoppers); this is a hypothesis to verify against the cluster means.
Cluster 3 (gray squares): Also small and distinct. Customers in this group are very different from those in Clusters 1 and 2, so they may be outliers or a niche segment (e.g. high income but low spending); this is a hypothesis to verify against the cluster means.