knitr::opts_chunk$set(echo = TRUE, fig.width = 8, fig.height = 5)
required_packages <- c("tidyverse", "factoextra", "cluster", "janitor", "scales")
missing_packages <- required_packages[!sapply(required_packages, requireNamespace, quietly = TRUE)]
if (length(missing_packages) > 0) install.packages(missing_packages)
invisible(lapply(required_packages, library, character.only = TRUE))
This report performs a cluster analysis on the customer segmentation dataset. The workflow:
Update the path below only if your CSV is stored somewhere else.
data_path <- "customer_segmentation.csv"
raw_data <- read.csv(data_path, stringsAsFactors = FALSE) |> janitor::clean_names()
glimpse(raw_data)
## Rows: 22
## Columns: 15
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ cs_helpful <int> 2, 1, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3…
## $ recommend <int> 2, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ come_again <int> 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2…
## $ all_products <int> 2, 1, 1, 4, 5, 2, 2, 2, 2, 1, 2, 2, 1, 2, 4, 2, 2, 1, 3…
## $ profesionalism <int> 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2…
## $ limitation <int> 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4…
## $ online_grocery <int> 2, 2, 3, 3, 2, 1, 2, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 1…
## $ delivery <int> 3, 3, 3, 3, 3, 2, 2, 1, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 3…
## $ pick_up <int> 4, 3, 2, 2, 1, 1, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 5, 3, 1…
## $ find_items <int> 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 3…
## $ other_shops <int> 2, 2, 3, 2, 3, 4, 1, 4, 1, 1, 3, 3, 1, 1, 5, 5, 5, 2, 2…
## $ gender <int> 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2…
## $ age <int> 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 4, 3, 4, 3, 2, 3, 2, 2, 2…
## $ education <int> 2, 2, 2, 5, 2, 5, 3, 2, 1, 2, 5, 1, 5, 5, 5, 5, 1, 5, 2…
summary(raw_data)
## id cs_helpful recommend come_again
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 6.25 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :11.50 Median :1.000 Median :1.000 Median :1.000
## Mean :11.50 Mean :1.591 Mean :1.318 Mean :1.455
## 3rd Qu.:16.75 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :22.00 Max. :3.000 Max. :3.000 Max. :3.000
## all_products profesionalism limitation online_grocery delivery
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.250 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :1.0 Median :2.000 Median :3.000
## Mean :2.091 Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000
## pick_up find_items other_shops gender
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.250 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :2.000 Median :1.000
## Mean :2.455 Mean :1.455 Mean :2.591 Mean :1.273
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.750 3rd Qu.:1.750
## Max. :5.000 Max. :3.000 Max. :5.000 Max. :2.000
## age education
## Min. :2.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.500
## Mean :2.455 Mean :3.182
## 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :4.000 Max. :5.000
id is used only as an identifier and is excluded from
clustering.
cluster_data <- raw_data |> select(-id)
nrow(cluster_data)
## [1] 22
ncol(cluster_data)
## [1] 14
colSums(is.na(cluster_data))
## cs_helpful recommend come_again all_products profesionalism
## 0 0 0 0 0
## limitation online_grocery delivery pick_up find_items
## 0 0 0 0 0
## other_shops gender age education
## 0 0 0 0
cor_matrix <- cor(cluster_data)
round(cor_matrix, 2)
## cs_helpful recommend come_again all_products profesionalism
## cs_helpful 1.00 0.49 0.27 0.29 0.51
## recommend 0.49 1.00 0.38 0.03 0.39
## come_again 0.27 0.38 1.00 0.37 0.43
## all_products 0.29 0.03 0.37 1.00 0.09
## profesionalism 0.51 0.39 0.43 0.09 1.00
## limitation 0.61 0.05 0.00 0.06 0.05
## online_grocery 0.21 0.30 -0.15 -0.15 0.06
## delivery 0.59 0.42 0.17 0.07 0.25
## pick_up -0.18 -0.08 -0.52 -0.25 -0.16
## find_items 0.30 -0.02 0.04 0.54 -0.01
## other_shops -0.31 -0.06 0.33 0.22 -0.19
## gender 0.06 0.01 0.32 0.14 0.45
## age -0.17 -0.12 0.13 0.31 -0.23
## education 0.07 0.12 0.09 0.07 -0.28
## limitation online_grocery delivery pick_up find_items
## cs_helpful 0.61 0.21 0.59 -0.18 0.30
## recommend 0.05 0.30 0.42 -0.08 -0.02
## come_again 0.00 -0.15 0.17 -0.52 0.04
## all_products 0.06 -0.15 0.07 -0.25 0.54
## profesionalism 0.05 0.06 0.25 -0.16 -0.01
## limitation 1.00 -0.15 0.36 0.00 0.44
## online_grocery -0.15 1.00 0.30 0.31 -0.16
## delivery 0.36 0.30 1.00 0.12 0.28
## pick_up 0.00 0.31 0.12 1.00 -0.10
## find_items 0.44 -0.16 0.28 -0.10 1.00
## other_shops -0.06 -0.11 -0.20 -0.03 0.00
## gender 0.00 -0.09 -0.06 -0.47 0.04
## age -0.32 -0.06 -0.10 -0.22 0.04
## education -0.07 0.07 -0.03 -0.24 0.10
## other_shops gender age education
## cs_helpful -0.31 0.06 -0.17 0.07
## recommend -0.06 0.01 -0.12 0.12
## come_again 0.33 0.32 0.13 0.09
## all_products 0.22 0.14 0.31 0.07
## profesionalism -0.19 0.45 -0.23 -0.28
## limitation -0.06 0.00 -0.32 -0.07
## online_grocery -0.11 -0.09 -0.06 0.07
## delivery -0.20 -0.06 -0.10 -0.03
## pick_up -0.03 -0.47 -0.22 -0.24
## find_items 0.00 0.04 0.04 0.10
## other_shops 1.00 -0.12 -0.04 0.01
## gender -0.12 1.00 0.18 -0.26
## age -0.04 0.18 1.00 0.33
## education 0.01 -0.26 0.33 1.00
K-means is distance-based, so variables should be standardized before clustering.
scaled_data <- scale(cluster_data)
head(scaled_data)
## cs_helpful recommend come_again all_products profesionalism limitation
## [1,] 0.5572385 1.0548991 0.7385489 -0.08536162 1.0009877 0.6236096
## [2,] -0.8049001 1.0548991 -0.6154575 -1.02433946 -0.6929915 -0.6236096
## [3,] 0.5572385 -0.4922862 -0.6154575 -1.02433946 -0.6929915 0.6236096
## [4,] 1.9193772 2.6020844 0.7385489 1.79259406 -0.6929915 0.6236096
## [5,] 0.5572385 -0.4922862 2.0925553 2.73157191 1.0009877 -0.6236096
## [6,] -0.8049001 -0.4922862 2.0925553 -0.08536162 -0.6929915 -0.6236096
## online_grocery delivery pick_up find_items other_shops gender
## [1,] -0.3554390 0.8049001 1.4623535 -0.6774335 -0.4212692 -0.598293
## [2,] -0.3554390 0.8049001 0.5161248 -0.6774335 -0.4212692 -0.598293
## [3,] 0.9478374 0.8049001 -0.4301040 -0.6774335 0.2916479 -0.598293
## [4,] 0.9478374 0.8049001 -0.4301040 0.8129201 -0.4212692 -0.598293
## [5,] -0.3554390 0.8049001 -1.3763327 0.8129201 0.2916479 1.595448
## [6,] -1.6587154 -0.5572385 -1.3763327 -0.6774335 1.0045650 -0.598293
## age education
## [1,] -0.6154575 -0.7284586
## [2,] -0.6154575 -0.7284586
## [3,] -0.6154575 -0.7284586
## [4,] 0.7385489 1.1207055
## [5,] 2.0925553 -0.7284586
## [6,] -0.6154575 1.1207055
Because the dataset is small (22 observations), a practical search range is 2 to 6 clusters.
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss", k.max = 6) +
labs(title = "Elbow Method for Choosing k")
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "silhouette", k.max = 6) +
labs(title = "Silhouette Method for Choosing k")
set.seed(123)
gap_stat <- clusGap(scaled_data, FUN = kmeans, nstart = 25, K.max = 6, B = 100)
fviz_gap_stat(gap_stat) + labs(title = "Gap Statistic")
Set final_k after reviewing the plots above. A starting
value of 3 is common for interpretation and can be changed easily.
final_k <- 3
set.seed(123)
km_model <- kmeans(scaled_data, centers = final_k, nstart = 25)
km_model
## K-means clustering with 3 clusters of sizes 6, 4, 12
##
## Cluster means:
## cs_helpful recommend come_again all_products profesionalism limitation
## 1 -0.80490011 -0.4922862 0.06154575 0.07113469 -0.69299145 -4.157397e-01
## 2 1.23830786 1.0548991 1.41555215 1.08836068 1.00098765 6.236096e-01
## 3 -0.01031923 -0.1054899 -0.50262359 -0.39835424 0.01283318 1.850372e-17
## online_grocery delivery pick_up find_items other_shops gender
## 1 -0.78986449 -1.0112848 -0.4301040 -0.1806489 0.7669260 -0.2326695
## 2 -0.02961992 0.8049001 -1.1397755 0.8129201 0.1134186 1.0470128
## 3 0.40480555 0.2373423 0.5949772 -0.1806489 -0.4212692 -0.2326695
## age education
## 1 0.5128812 0.8125115
## 2 0.4000473 -0.1120706
## 3 -0.3897897 -0.3688989
##
## Clustering vector:
## [1] 3 3 3 2 2 1 3 1 3 3 1 3 1 3 1 1 3 3 2 2 3 3
##
## Within cluster sum of squares by cluster:
## [1] 46.34352 58.16415 96.54795
## (between_SS / total_SS = 31.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
clustered_data <- raw_data |> mutate(cluster = factor(km_model$cluster))
head(clustered_data)
## id cs_helpful recommend come_again all_products profesionalism limitation
## 1 1 2 2 2 2 2 2
## 2 2 1 2 1 1 1 1
## 3 3 2 1 1 1 1 2
## 4 4 3 3 2 4 1 2
## 5 5 2 1 3 5 2 1
## 6 6 1 1 3 2 1 1
## online_grocery delivery pick_up find_items other_shops gender age education
## 1 2 3 4 1 2 1 2 2
## 2 2 3 3 1 2 1 2 2
## 3 3 3 2 1 3 1 2 2
## 4 3 3 2 2 2 1 3 5
## 5 2 3 1 2 3 2 4 2
## 6 1 2 1 1 4 1 2 5
## cluster
## 1 3
## 2 3
## 3 3
## 4 2
## 5 2
## 6 1
table(clustered_data$cluster)
##
## 1 2 3
## 6 4 12
cluster_profile <- clustered_data |>
group_by(cluster) |>
summarise(across(-id, mean), .groups = "drop")
cluster_profile
## # A tibble: 3 × 15
## cluster cs_helpful recommend come_again all_products profesionalism limitation
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 1 1.5 2.17 1 1.17
## 2 2 2.5 2 2.5 3.25 2 2
## 3 3 1.58 1.25 1.08 1.67 1.42 1.5
## # ℹ 8 more variables: online_grocery <dbl>, delivery <dbl>, pick_up <dbl>,
## # find_items <dbl>, other_shops <dbl>, gender <dbl>, age <dbl>,
## # education <dbl>
cluster_sizes <- clustered_data |>
count(cluster) |>
mutate(percent = percent(n / sum(n)))
cluster_sizes
## cluster n percent
## 1 1 6 27.3%
## 2 2 4 18.2%
## 3 3 12 54.5%
cluster_centers <- as.data.frame(km_model$centers)
cluster_centers$cluster <- factor(seq_len(nrow(cluster_centers)))
cluster_centers
## cs_helpful recommend come_again all_products profesionalism limitation
## 1 -0.80490011 -0.4922862 0.06154575 0.07113469 -0.69299145 -4.157397e-01
## 2 1.23830786 1.0548991 1.41555215 1.08836068 1.00098765 6.236096e-01
## 3 -0.01031923 -0.1054899 -0.50262359 -0.39835424 0.01283318 1.850372e-17
## online_grocery delivery pick_up find_items other_shops gender
## 1 -0.78986449 -1.0112848 -0.4301040 -0.1806489 0.7669260 -0.2326695
## 2 -0.02961992 0.8049001 -1.1397755 0.8129201 0.1134186 1.0470128
## 3 0.40480555 0.2373423 0.5949772 -0.1806489 -0.4212692 -0.2326695
## age education cluster
## 1 0.5128812 0.8125115 1
## 2 0.4000473 -0.1120706 2
## 3 -0.3897897 -0.3688989 3
fviz_cluster(km_model, data = scaled_data, ellipse.type = "convex") +
labs(title = "Customer Segments Visualized with PCA")
cluster_profile_long <- cluster_profile |>
pivot_longer(-cluster, names_to = "variable", values_to = "mean_value")
ggplot(cluster_profile_long, aes(x = variable, y = cluster, fill = mean_value)) +
geom_tile() +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Cluster Mean Profiles", x = "Variable", y = "Cluster") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Use the cluster profile table and heatmap to label the segments. For example, you might describe clusters as:
Base the final labels on which variables are highest or lowest within each cluster.
write.csv(clustered_data, "customer_segmentation_with_clusters.csv", row.names = FALSE)
If you want to compare k-means with hierarchical clustering, run the chunk below.
dist_matrix <- dist(scaled_data)
hc_model <- hclust(dist_matrix, method = "ward.D2")
plot(hc_model, main = "Hierarchical Clustering Dendrogram")
rect.hclust(hc_model, k = final_k, border = 2:5)
sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so; LAPACK version 3.12.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 janitor_2.2.1 cluster_2.1.8.2 factoextra_2.0.0
## [5] lubridate_1.9.5 forcats_1.0.1 stringr_1.6.0 dplyr_1.2.1
## [9] purrr_1.2.2 readr_2.2.0 tidyr_1.3.2 tibble_3.3.1
## [13] ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] utf8_1.2.6 sass_0.4.10 generics_0.1.4 rstatix_0.7.3
## [5] stringi_1.8.7 hms_1.1.4 digest_0.6.39 magrittr_2.0.5
## [9] evaluate_1.0.5 grid_4.5.3 timechange_0.4.0 RColorBrewer_1.1-3
## [13] fastmap_1.2.0 jsonlite_2.0.0 ggrepel_0.9.8 backports_1.5.1
## [17] Formula_1.2-5 jquerylib_0.1.4 abind_1.4-8 cli_3.6.6
## [21] rlang_1.2.0 withr_3.0.2 cachem_1.1.0 yaml_2.3.12
## [25] otel_0.2.0 tools_4.5.3 tzdb_0.5.0 ggsignif_0.6.4
## [29] ggpubr_0.6.3 broom_1.0.12 vctrs_0.7.3 R6_2.6.1
## [33] lifecycle_1.0.5 snakecase_0.11.1 car_3.1-5 pkgconfig_2.0.3
## [37] pillar_1.11.1 bslib_0.10.0 gtable_0.3.6 glue_1.8.1
## [41] Rcpp_1.1.1-1 xfun_0.57 tidyselect_1.2.1 rstudioapi_0.18.0
## [45] knitr_1.51 farver_2.1.2 htmltools_0.5.9 labeling_0.4.3
## [49] carData_3.0-6 rmarkdown_2.31 compiler_4.5.3 S7_0.2.1-1
install.packages(c( “readr”, “dplyr”, “ggplot2”, “cluster”, “factoextra”, “janitor” ))