knitr::opts_chunk$set(echo = TRUE, fig.width = 8, fig.height = 5)
required_packages <- c("tidyverse", "factoextra", "cluster", "janitor", "scales")
missing_packages <- required_packages[!sapply(required_packages, requireNamespace, quietly = TRUE)]
if (length(missing_packages) > 0) install.packages(missing_packages)
invisible(lapply(required_packages, library, character.only = TRUE))

1 1. Purpose

This report performs a cluster analysis on the customer segmentation dataset. The workflow:

2 2. Load Data

Update the path below only if your CSV is stored somewhere else.

data_path <- "customer_segmentation.csv"
raw_data <- read.csv(data_path, stringsAsFactors = FALSE) |> janitor::clean_names()

glimpse(raw_data)
## Rows: 22
## Columns: 15
## $ id             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ cs_helpful     <int> 2, 1, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3…
## $ recommend      <int> 2, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ come_again     <int> 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2…
## $ all_products   <int> 2, 1, 1, 4, 5, 2, 2, 2, 2, 1, 2, 2, 1, 2, 4, 2, 2, 1, 3…
## $ profesionalism <int> 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2…
## $ limitation     <int> 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4…
## $ online_grocery <int> 2, 2, 3, 3, 2, 1, 2, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 1…
## $ delivery       <int> 3, 3, 3, 3, 3, 2, 2, 1, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 3…
## $ pick_up        <int> 4, 3, 2, 2, 1, 1, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 5, 3, 1…
## $ find_items     <int> 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 3…
## $ other_shops    <int> 2, 2, 3, 2, 3, 4, 1, 4, 1, 1, 3, 3, 1, 1, 5, 5, 5, 2, 2…
## $ gender         <int> 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2…
## $ age            <int> 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 4, 3, 4, 3, 2, 3, 2, 2, 2…
## $ education      <int> 2, 2, 2, 5, 2, 5, 3, 2, 1, 2, 5, 1, 5, 5, 5, 5, 1, 5, 2…
summary(raw_data)
##        id          cs_helpful      recommend       come_again   
##  Min.   : 1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 6.25   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :11.50   Median :1.000   Median :1.000   Median :1.000  
##  Mean   :11.50   Mean   :1.591   Mean   :1.318   Mean   :1.455  
##  3rd Qu.:16.75   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :22.00   Max.   :3.000   Max.   :3.000   Max.   :3.000  
##   all_products   profesionalism    limitation  online_grocery     delivery    
##  Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.250   1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :1.000   Median :1.0   Median :2.000   Median :3.000  
##  Mean   :2.091   Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000  
##     pick_up        find_items     other_shops        gender     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.250   1st Qu.:1.000  
##  Median :2.000   Median :1.000   Median :2.000   Median :1.000  
##  Mean   :2.455   Mean   :1.455   Mean   :2.591   Mean   :1.273  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:3.750   3rd Qu.:1.750  
##  Max.   :5.000   Max.   :3.000   Max.   :5.000   Max.   :2.000  
##       age          education    
##  Min.   :2.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :2.500  
##  Mean   :2.455   Mean   :3.182  
##  3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :4.000   Max.   :5.000

3 3. Data Preparation

id is used only as an identifier and is excluded from clustering.

cluster_data <- raw_data |> select(-id)

nrow(cluster_data)
## [1] 22
ncol(cluster_data)
## [1] 14
colSums(is.na(cluster_data))
##     cs_helpful      recommend     come_again   all_products profesionalism 
##              0              0              0              0              0 
##     limitation online_grocery       delivery        pick_up     find_items 
##              0              0              0              0              0 
##    other_shops         gender            age      education 
##              0              0              0              0

3.1 Optional: inspect correlations

cor_matrix <- cor(cluster_data)
round(cor_matrix, 2)
##                cs_helpful recommend come_again all_products profesionalism
## cs_helpful           1.00      0.49       0.27         0.29           0.51
## recommend            0.49      1.00       0.38         0.03           0.39
## come_again           0.27      0.38       1.00         0.37           0.43
## all_products         0.29      0.03       0.37         1.00           0.09
## profesionalism       0.51      0.39       0.43         0.09           1.00
## limitation           0.61      0.05       0.00         0.06           0.05
## online_grocery       0.21      0.30      -0.15        -0.15           0.06
## delivery             0.59      0.42       0.17         0.07           0.25
## pick_up             -0.18     -0.08      -0.52        -0.25          -0.16
## find_items           0.30     -0.02       0.04         0.54          -0.01
## other_shops         -0.31     -0.06       0.33         0.22          -0.19
## gender               0.06      0.01       0.32         0.14           0.45
## age                 -0.17     -0.12       0.13         0.31          -0.23
## education            0.07      0.12       0.09         0.07          -0.28
##                limitation online_grocery delivery pick_up find_items
## cs_helpful           0.61           0.21     0.59   -0.18       0.30
## recommend            0.05           0.30     0.42   -0.08      -0.02
## come_again           0.00          -0.15     0.17   -0.52       0.04
## all_products         0.06          -0.15     0.07   -0.25       0.54
## profesionalism       0.05           0.06     0.25   -0.16      -0.01
## limitation           1.00          -0.15     0.36    0.00       0.44
## online_grocery      -0.15           1.00     0.30    0.31      -0.16
## delivery             0.36           0.30     1.00    0.12       0.28
## pick_up              0.00           0.31     0.12    1.00      -0.10
## find_items           0.44          -0.16     0.28   -0.10       1.00
## other_shops         -0.06          -0.11    -0.20   -0.03       0.00
## gender               0.00          -0.09    -0.06   -0.47       0.04
## age                 -0.32          -0.06    -0.10   -0.22       0.04
## education           -0.07           0.07    -0.03   -0.24       0.10
##                other_shops gender   age education
## cs_helpful           -0.31   0.06 -0.17      0.07
## recommend            -0.06   0.01 -0.12      0.12
## come_again            0.33   0.32  0.13      0.09
## all_products          0.22   0.14  0.31      0.07
## profesionalism       -0.19   0.45 -0.23     -0.28
## limitation           -0.06   0.00 -0.32     -0.07
## online_grocery       -0.11  -0.09 -0.06      0.07
## delivery             -0.20  -0.06 -0.10     -0.03
## pick_up              -0.03  -0.47 -0.22     -0.24
## find_items            0.00   0.04  0.04      0.10
## other_shops           1.00  -0.12 -0.04      0.01
## gender               -0.12   1.00  0.18     -0.26
## age                  -0.04   0.18  1.00      0.33
## education             0.01  -0.26  0.33      1.00

4 4. Standardize Variables

K-means is distance-based, so variables should be standardized before clustering.

scaled_data <- scale(cluster_data)
head(scaled_data)
##      cs_helpful  recommend come_again all_products profesionalism limitation
## [1,]  0.5572385  1.0548991  0.7385489  -0.08536162      1.0009877  0.6236096
## [2,] -0.8049001  1.0548991 -0.6154575  -1.02433946     -0.6929915 -0.6236096
## [3,]  0.5572385 -0.4922862 -0.6154575  -1.02433946     -0.6929915  0.6236096
## [4,]  1.9193772  2.6020844  0.7385489   1.79259406     -0.6929915  0.6236096
## [5,]  0.5572385 -0.4922862  2.0925553   2.73157191      1.0009877 -0.6236096
## [6,] -0.8049001 -0.4922862  2.0925553  -0.08536162     -0.6929915 -0.6236096
##      online_grocery   delivery    pick_up find_items other_shops    gender
## [1,]     -0.3554390  0.8049001  1.4623535 -0.6774335  -0.4212692 -0.598293
## [2,]     -0.3554390  0.8049001  0.5161248 -0.6774335  -0.4212692 -0.598293
## [3,]      0.9478374  0.8049001 -0.4301040 -0.6774335   0.2916479 -0.598293
## [4,]      0.9478374  0.8049001 -0.4301040  0.8129201  -0.4212692 -0.598293
## [5,]     -0.3554390  0.8049001 -1.3763327  0.8129201   0.2916479  1.595448
## [6,]     -1.6587154 -0.5572385 -1.3763327 -0.6774335   1.0045650 -0.598293
##             age  education
## [1,] -0.6154575 -0.7284586
## [2,] -0.6154575 -0.7284586
## [3,] -0.6154575 -0.7284586
## [4,]  0.7385489  1.1207055
## [5,]  2.0925553 -0.7284586
## [6,] -0.6154575  1.1207055

5 5. Choose the Number of Clusters

Because the dataset is small (22 observations), a practical search range is 2 to 6 clusters.

5.1 Elbow method

set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss", k.max = 6) +
  labs(title = "Elbow Method for Choosing k")

5.2 Average silhouette width

set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "silhouette", k.max = 6) +
  labs(title = "Silhouette Method for Choosing k")

5.3 Gap statistic (optional but useful)

set.seed(123)
gap_stat <- clusGap(scaled_data, FUN = kmeans, nstart = 25, K.max = 6, B = 100)
fviz_gap_stat(gap_stat) + labs(title = "Gap Statistic")

6 6. Fit Final K-means Model

Set final_k after reviewing the plots above. A starting value of 3 is common for interpretation and can be changed easily.

final_k <- 3
set.seed(123)
km_model <- kmeans(scaled_data, centers = final_k, nstart = 25)
km_model
## K-means clustering with 3 clusters of sizes 6, 4, 12
## 
## Cluster means:
##    cs_helpful  recommend  come_again all_products profesionalism    limitation
## 1 -0.80490011 -0.4922862  0.06154575   0.07113469    -0.69299145 -4.157397e-01
## 2  1.23830786  1.0548991  1.41555215   1.08836068     1.00098765  6.236096e-01
## 3 -0.01031923 -0.1054899 -0.50262359  -0.39835424     0.01283318  1.850372e-17
##   online_grocery   delivery    pick_up find_items other_shops     gender
## 1    -0.78986449 -1.0112848 -0.4301040 -0.1806489   0.7669260 -0.2326695
## 2    -0.02961992  0.8049001 -1.1397755  0.8129201   0.1134186  1.0470128
## 3     0.40480555  0.2373423  0.5949772 -0.1806489  -0.4212692 -0.2326695
##          age  education
## 1  0.5128812  0.8125115
## 2  0.4000473 -0.1120706
## 3 -0.3897897 -0.3688989
## 
## Clustering vector:
##  [1] 3 3 3 2 2 1 3 1 3 3 1 3 1 3 1 1 3 3 2 2 3 3
## 
## Within cluster sum of squares by cluster:
## [1] 46.34352 58.16415 96.54795
##  (between_SS / total_SS =  31.6 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"

7 7. Attach Cluster Labels

clustered_data <- raw_data |> mutate(cluster = factor(km_model$cluster))
head(clustered_data)
##   id cs_helpful recommend come_again all_products profesionalism limitation
## 1  1          2         2          2            2              2          2
## 2  2          1         2          1            1              1          1
## 3  3          2         1          1            1              1          2
## 4  4          3         3          2            4              1          2
## 5  5          2         1          3            5              2          1
## 6  6          1         1          3            2              1          1
##   online_grocery delivery pick_up find_items other_shops gender age education
## 1              2        3       4          1           2      1   2         2
## 2              2        3       3          1           2      1   2         2
## 3              3        3       2          1           3      1   2         2
## 4              3        3       2          2           2      1   3         5
## 5              2        3       1          2           3      2   4         2
## 6              1        2       1          1           4      1   2         5
##   cluster
## 1       3
## 2       3
## 3       3
## 4       2
## 5       2
## 6       1
table(clustered_data$cluster)
## 
##  1  2  3 
##  6  4 12

8 8. Cluster Profiles

8.1 Mean profile by cluster

cluster_profile <- clustered_data |>
  group_by(cluster) |>
  summarise(across(-id, mean), .groups = "drop")

cluster_profile
## # A tibble: 3 × 15
##   cluster cs_helpful recommend come_again all_products profesionalism limitation
##   <fct>        <dbl>     <dbl>      <dbl>        <dbl>          <dbl>      <dbl>
## 1 1             1         1          1.5          2.17           1          1.17
## 2 2             2.5       2          2.5          3.25           2          2   
## 3 3             1.58      1.25       1.08         1.67           1.42       1.5 
## # ℹ 8 more variables: online_grocery <dbl>, delivery <dbl>, pick_up <dbl>,
## #   find_items <dbl>, other_shops <dbl>, gender <dbl>, age <dbl>,
## #   education <dbl>

8.2 Cluster sizes and percentages

cluster_sizes <- clustered_data |>
  count(cluster) |>
  mutate(percent = percent(n / sum(n)))

cluster_sizes
##   cluster  n percent
## 1       1  6   27.3%
## 2       2  4   18.2%
## 3       3 12   54.5%

8.3 Standardized center values from k-means

cluster_centers <- as.data.frame(km_model$centers)
cluster_centers$cluster <- factor(seq_len(nrow(cluster_centers)))
cluster_centers
##    cs_helpful  recommend  come_again all_products profesionalism    limitation
## 1 -0.80490011 -0.4922862  0.06154575   0.07113469    -0.69299145 -4.157397e-01
## 2  1.23830786  1.0548991  1.41555215   1.08836068     1.00098765  6.236096e-01
## 3 -0.01031923 -0.1054899 -0.50262359  -0.39835424     0.01283318  1.850372e-17
##   online_grocery   delivery    pick_up find_items other_shops     gender
## 1    -0.78986449 -1.0112848 -0.4301040 -0.1806489   0.7669260 -0.2326695
## 2    -0.02961992  0.8049001 -1.1397755  0.8129201   0.1134186  1.0470128
## 3     0.40480555  0.2373423  0.5949772 -0.1806489  -0.4212692 -0.2326695
##          age  education cluster
## 1  0.5128812  0.8125115       1
## 2  0.4000473 -0.1120706       2
## 3 -0.3897897 -0.3688989       3

9 9. Visualize the Segments

9.1 PCA map of clusters

fviz_cluster(km_model, data = scaled_data, ellipse.type = "convex") +
  labs(title = "Customer Segments Visualized with PCA")

9.2 Heatmap of cluster means

cluster_profile_long <- cluster_profile |>
  pivot_longer(-cluster, names_to = "variable", values_to = "mean_value")

ggplot(cluster_profile_long, aes(x = variable, y = cluster, fill = mean_value)) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Cluster Mean Profiles", x = "Variable", y = "Cluster") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

10 10. Interpretation Notes

Use the cluster profile table and heatmap to label the segments. For example, you might describe clusters as:

Base the final labels on which variables are highest or lowest within each cluster.

11 11. Save Output Dataset

write.csv(clustered_data, "customer_segmentation_with_clusters.csv", row.names = FALSE)

12 12. Optional Alternative: Hierarchical Clustering

If you want to compare k-means with hierarchical clustering, run the chunk below.

dist_matrix <- dist(scaled_data)
hc_model <- hclust(dist_matrix, method = "ward.D2")
plot(hc_model, main = "Hierarchical Clustering Dendrogram")
rect.hclust(hc_model, k = final_k, border = 2:5)

13 13. Session Info

sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0     janitor_2.2.1    cluster_2.1.8.2  factoextra_2.0.0
##  [5] lubridate_1.9.5  forcats_1.0.1    stringr_1.6.0    dplyr_1.2.1     
##  [9] purrr_1.2.2      readr_2.2.0      tidyr_1.3.2      tibble_3.3.1    
## [13] ggplot2_4.0.2    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.6         sass_0.4.10        generics_0.1.4     rstatix_0.7.3     
##  [5] stringi_1.8.7      hms_1.1.4          digest_0.6.39      magrittr_2.0.5    
##  [9] evaluate_1.0.5     grid_4.5.3         timechange_0.4.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      jsonlite_2.0.0     ggrepel_0.9.8      backports_1.5.1   
## [17] Formula_1.2-5      jquerylib_0.1.4    abind_1.4-8        cli_3.6.6         
## [21] rlang_1.2.0        withr_3.0.2        cachem_1.1.0       yaml_2.3.12       
## [25] otel_0.2.0         tools_4.5.3        tzdb_0.5.0         ggsignif_0.6.4    
## [29] ggpubr_0.6.3       broom_1.0.12       vctrs_0.7.3        R6_2.6.1          
## [33] lifecycle_1.0.5    snakecase_0.11.1   car_3.1-5          pkgconfig_2.0.3   
## [37] pillar_1.11.1      bslib_0.10.0       gtable_0.3.6       glue_1.8.1        
## [41] Rcpp_1.1.1-1       xfun_0.57          tidyselect_1.2.1   rstudioapi_0.18.0 
## [45] knitr_1.51         farver_2.1.2       htmltools_0.5.9    labeling_0.4.3    
## [49] carData_3.0-6      rmarkdown_2.31     compiler_4.5.3     S7_0.2.1-1

install.packages(c( “readr”, “dplyr”, “ggplot2”, “cluster”, “factoextra”, “janitor” ))