1 Introduction

This report analyzes a grocery shopping customer survey collected by a student research group. The dataset contains 22 respondents who rated various aspects of their grocery shopping experience on Likert-style scales, along with basic demographic information.

Survey Variables:

Variable Description Scale
CS_helpful Customer service helpfulness 1–3
Recommend Likelihood to recommend the store 1–3
Come_again Likelihood to return 1–3
All_Products Satisfaction with product availability 1–5
Profesionalism Staff professionalism 1–3
Limitation Perceived product limitations 1–4
Online_grocery Use/satisfaction with online grocery 1–3
delivery Delivery service rating 1–3
Pick_up In-store pickup rating 1–5
Find_items Ease of finding items 1–3
other_shops Frequency of shopping at other stores 1–5
Gender 1 = Male, 2 = Female
Age 2 = 18–25, 3 = 26–35, 4 = 36–45
Education 1 = Some HS, 2 = HS Diploma, 3 = Some College, 5 = Bachelor’s+

2 Setup

# Install any missing packages before running:
# install.packages(c("tidyverse", "knitr", "kableExtra",
#                    "psych", "corrplot", "ggcorrplot",
#                    "scales", "patchwork", "cluster",
#                    "factoextra", "likert"))

library(tidyverse)
library(knitr)
library(kableExtra)
library(psych)
library(corrplot)
library(ggcorrplot)
library(scales)
library(patchwork)
library(cluster)
library(factoextra)
df_raw <- read_csv("customer_segmentation.csv", show_col_types = FALSE)

# Clean column name (trailing space on Profesionalism)
df_raw <- df_raw %>% rename_with(str_trim)

# Recode demographic variables as labeled factors
df <- df_raw %>%
  mutate(
    Gender_f    = factor(Gender,    levels = 1:2,
                         labels = c("Male", "Female")),
    Age_f       = factor(Age,       levels = 2:4,
                         labels = c("18–25", "26–35", "36–45")),
    Education_f = factor(Education, levels = c(1, 2, 3, 5),
                         labels = c("Some HS", "HS Diploma",
                                    "Some College", "Bachelor's+"))
  )

# Satisfaction items only (exclude ID & demographics)
satisfaction_vars <- c("CS_helpful", "Recommend", "Come_again",
                       "All_Products", "Profesionalism", "Limitation",
                       "Online_grocery", "delivery", "Pick_up",
                       "Find_items", "other_shops")

df_sat <- df %>% select(all_of(satisfaction_vars))

3 Data Overview

3.1 Raw Data

df_raw %>%
  kable(caption = "Raw Survey Data") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  scroll_box(height = "350px")
Raw Survey Data
ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation Online_grocery delivery Pick_up Find_items other_shops Gender Age Education
1 2 2 2 2 2 2 2 3 4 1 2 1 2 2
2 1 2 1 1 1 1 2 3 3 1 2 1 2 2
3 2 1 1 1 1 2 3 3 2 1 3 1 2 2
4 3 3 2 4 1 2 3 3 2 2 2 1 3 5
5 2 1 3 5 2 1 2 3 1 2 3 2 4 2
6 1 1 3 2 1 1 1 2 1 1 4 1 2 5
7 2 1 1 2 2 1 2 2 2 1 1 1 2 3
8 1 1 1 2 1 2 1 1 2 2 4 1 2 2
9 1 1 1 2 2 1 2 1 3 1 1 2 2 1
10 1 1 1 1 1 1 3 2 2 1 1 2 2 2
11 1 1 1 2 1 1 2 2 2 1 3 2 4 5
12 1 1 1 2 1 1 3 2 3 1 3 1 3 1
13 1 1 1 1 1 1 1 2 2 1 1 1 4 5
14 1 1 1 2 1 1 3 3 3 3 1 1 3 5
15 1 1 1 4 1 1 2 2 2 2 5 1 2 5
16 1 1 2 2 1 1 3 1 3 1 5 1 3 5
17 1 1 1 2 1 2 2 3 5 2 5 1 2 1
18 2 1 1 1 1 3 3 3 3 1 2 1 2 5
19 3 1 2 3 2 4 1 3 1 3 2 2 2 2
20 2 3 3 1 3 1 3 3 1 1 4 2 2 3
21 2 1 1 2 2 1 3 3 4 1 2 1 2 2
22 3 2 1 2 2 2 3 3 3 2 1 1 2 5

3.2 Summary Statistics

describe(df_sat) %>%
  select(n, mean, sd, median, min, max, skew, kurtosis) %>%
  round(3) %>%
  kable(caption = "Descriptive Statistics — Satisfaction Items") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)
Descriptive Statistics — Satisfaction Items
n mean sd median min max skew kurtosis
CS_helpful 22 1.591 0.734 1 1 3 0.735 -0.890
Recommend 22 1.318 0.646 1 1 3 1.670 1.382
Come_again 22 1.455 0.739 1 1 3 1.164 -0.233
All_Products 22 2.091 1.065 2 1 5 1.185 0.794
Profesionalism 22 1.409 0.590 1 1 3 0.997 -0.136
Limitation 22 1.500 0.802 1 1 4 1.587 1.991
Online_grocery 22 2.273 0.767 2 1 3 -0.459 -1.251
delivery 22 2.409 0.734 3 1 3 -0.735 -0.890
Pick_up 22 2.455 1.057 2 1 5 0.460 -0.367
Find_items 22 1.455 0.671 1 1 3 1.059 -0.188
other_shops 22 2.591 1.403 2 1 5 0.415 -1.213

4 Demographic Profile

p_gender <- df %>%
  count(Gender_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Gender_f, y = pct, fill = Gender_f)) +
  geom_col(show.legend = FALSE, width = 0.6) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
  labs(title = "Gender", x = NULL, y = "Proportion") +
  theme_minimal(base_size = 12)

p_age <- df %>%
  count(Age_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Age_f, y = pct, fill = Age_f)) +
  geom_col(show.legend = FALSE, width = 0.6) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Age Group", x = NULL, y = NULL) +
  theme_minimal(base_size = 12)

p_edu <- df %>%
  count(Education_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Education_f, y = pct, fill = Education_f)) +
  geom_col(show.legend = FALSE, width = 0.6) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Education Level", x = NULL, y = NULL) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

p_gender | p_age | p_edu

Key observations: - The sample is predominantly female and concentrated in the 18–35 age range. - Most respondents hold a high school diploma or some college education.


5 Satisfaction Item Analysis

5.1 Mean Ratings per Item

df_sat %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(everything(), names_to = "Item", values_to = "Mean") %>%
  arrange(desc(Mean)) %>%
  ggplot(aes(x = reorder(Item, Mean), y = Mean, fill = Mean)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(Mean, 2)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "#AED6F1", high = "#1A5276") +
  labs(title = "Average Rating per Survey Item",
       subtitle = "Higher values indicate higher ratings on each scale",
       x = NULL, y = "Mean Score") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none") +
  ylim(0, max(colMeans(df_sat)) * 1.2)

5.2 Response Distribution (Stacked Bar)

df_sat %>%
  pivot_longer(everything(), names_to = "Item", values_to = "Rating") %>%
  mutate(Rating = factor(Rating)) %>%
  count(Item, Rating) %>%
  group_by(Item) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Item, y = pct, fill = Rating)) +
  geom_col(position = "fill", width = 0.7) +
  coord_flip() +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Blues", direction = 1) +
  labs(title = "Response Distribution per Item",
       x = NULL, y = "Proportion", fill = "Rating") +
  theme_minimal(base_size = 12)


6 Correlation Analysis

cor_matrix <- cor(df_sat, use = "complete.obs", method = "spearman")

ggcorrplot(cor_matrix,
           method = "circle",
           type = "lower",
           lab = TRUE,
           lab_size = 3,
           colors = c("#D32F2F", "white", "#1565C0"),
           title = "Spearman Correlations Between Satisfaction Items",
           ggtheme = theme_minimal(base_size = 11))

Spearman rank correlation is used here because all items are measured on ordinal (Likert) scales. Strong positive correlations suggest items that tend to move together across respondents.


7 Demographic Group Comparisons

7.1 Satisfaction by Gender

df %>%
  select(Gender_f, all_of(satisfaction_vars)) %>%
  pivot_longer(-Gender_f, names_to = "Item", values_to = "Score") %>%
  group_by(Gender_f, Item) %>%
  summarise(Mean = mean(Score), .groups = "drop") %>%
  ggplot(aes(x = Item, y = Mean, fill = Gender_f)) +
  geom_col(position = "dodge", width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
  labs(title = "Mean Satisfaction Scores by Gender",
       x = NULL, y = "Mean Score", fill = "Gender") +
  theme_minimal(base_size = 12)

7.2 Satisfaction by Age Group

df %>%
  select(Age_f, all_of(satisfaction_vars)) %>%
  pivot_longer(-Age_f, names_to = "Item", values_to = "Score") %>%
  group_by(Age_f, Item) %>%
  summarise(Mean = mean(Score), .groups = "drop") %>%
  ggplot(aes(x = Item, y = Mean, fill = Age_f)) +
  geom_col(position = "dodge", width = 0.7) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Mean Satisfaction Scores by Age Group",
       x = NULL, y = "Mean Score", fill = "Age Group") +
  theme_minimal(base_size = 12)


8 Customer Segmentation (K-Means Clustering)

We apply k-means clustering on the satisfaction items to identify natural customer segments. All variables are scaled before clustering.

8.1 Choosing the Number of Clusters

set.seed(42)
df_scaled <- scale(df_sat)

fviz_nbclust(df_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method — Optimal Number of Clusters",
       subtitle = "Look for the 'elbow' where WSS levels off") +
  theme_minimal(base_size = 12)

8.2 Fit K-Means (k = 3)

set.seed(42)
k <- 3   # adjust based on the elbow plot above
km_fit <- kmeans(df_scaled, centers = k, nstart = 25)

df <- df %>% mutate(Cluster = factor(km_fit$cluster))

cat("Cluster sizes:\n")
## Cluster sizes:
print(table(df$Cluster))
## 
##  1  2  3 
## 10  4  8

8.3 Cluster Visualization (PCA Biplot)

fviz_cluster(km_fit,
             data = df_scaled,
             geom = "point",
             ellipse.type = "convex",
             palette = c("#E74C3C", "#2ECC71", "#3498DB"),
             ggtheme = theme_minimal(base_size = 12)) +
  labs(title = "K-Means Customer Segments (PCA Projection)")

8.4 Cluster Profiles

df %>%
  group_by(Cluster) %>%
  summarise(across(all_of(satisfaction_vars), mean), n = n()) %>%
  kable(digits = 2,
        caption = "Mean Satisfaction Scores by Customer Segment") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)
Mean Satisfaction Scores by Customer Segment
Cluster CS_helpful Recommend Come_again All_Products Profesionalism Limitation Online_grocery delivery Pick_up Find_items other_shops n
1 1.10 1.00 1.30 2.00 1.20 1.10 2.00 1.7 2.20 1.2 2.80 10
2 2.50 2.00 2.50 3.25 2.00 2.00 2.25 3.0 1.25 2.0 2.75 4
3 1.75 1.38 1.12 1.62 1.38 1.75 2.62 3.0 3.38 1.5 2.25 8
# Radar-style faceted bar chart for cluster profiles
df %>%
  group_by(Cluster) %>%
  summarise(across(all_of(satisfaction_vars), mean)) %>%
  pivot_longer(-Cluster, names_to = "Item", values_to = "Mean") %>%
  ggplot(aes(x = Item, y = Mean, fill = Cluster)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~paste("Cluster", Cluster)) +
  scale_fill_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
  labs(title = "Satisfaction Profile by Customer Segment",
       x = NULL, y = "Mean Score") +
  theme_minimal(base_size = 11)

8.5 Cluster Demographics

p_c_gender <- df %>%
  count(Cluster, Gender_f) %>%
  ggplot(aes(x = Cluster, y = n, fill = Gender_f)) +
  geom_col(position = "fill", width = 0.6) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
  labs(title = "Gender", x = "Cluster", y = "Proportion", fill = NULL) +
  theme_minimal(base_size = 11)

p_c_age <- df %>%
  count(Cluster, Age_f) %>%
  ggplot(aes(x = Cluster, y = n, fill = Age_f)) +
  geom_col(position = "fill", width = 0.6) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Age Group", x = "Cluster", y = NULL, fill = NULL) +
  theme_minimal(base_size = 11)

p_c_gender | p_c_age


9 Key Findings & Limitations

9.1 Summary of Findings

  • Customer service and professionalism received the highest average ratings, suggesting staff interactions are a strength.
  • Product availability (All_Products) and pickup experience (Pick_up) showed more variability, indicating areas for potential improvement.
  • Correlation analysis revealed that CS_helpful, Recommend, and Come_again are highly interrelated — a satisfied customer tends to score all three highly.
  • K-means clustering identified three distinct customer segments with differing satisfaction profiles and demographic compositions.

9.2 Limitations

⚠️ This dataset contains only 22 observations, which severely limits the statistical reliability of all analyses — especially clustering and group comparisons. The following caveats apply:

  • Group differences (gender, age) should not be interpreted as statistically significant without formal testing.
  • Cluster solutions are exploratory and highly sensitive to random initialization.
  • Results should not be generalized beyond the sample until replicated with a larger dataset.

10 Session Info

sessionInfo()
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] factoextra_2.0.0   cluster_2.1.8.2    patchwork_1.3.2    scales_1.4.0      
##  [5] ggcorrplot_0.1.4.1 corrplot_0.95      psych_2.6.3        kableExtra_1.4.0  
##  [9] knitr_1.51         lubridate_1.9.5    forcats_1.0.1      stringr_1.6.0     
## [13] dplyr_1.2.0        purrr_1.2.1        readr_2.2.0        tidyr_1.3.2       
## [17] tibble_3.3.1       ggplot2_4.0.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.57          bslib_0.10.0       rstatix_0.7.3     
##  [5] ggrepel_0.9.8      lattice_0.22-9     tzdb_0.5.0         vctrs_0.7.2       
##  [9] tools_4.5.3        generics_0.1.4     parallel_4.5.3     pkgconfig_2.0.3   
## [13] RColorBrewer_1.1-3 S7_0.2.1           lifecycle_1.0.5    compiler_4.5.3    
## [17] farver_2.1.2       textshaping_1.0.5  mnormt_2.1.2       carData_3.0-6     
## [21] htmltools_0.5.9    sass_0.4.10        yaml_2.3.12        Formula_1.2-5     
## [25] car_3.1-5          ggpubr_0.6.3       pillar_1.11.1      crayon_1.5.3      
## [29] jquerylib_0.1.4    cachem_1.1.0       abind_1.4-8        nlme_3.1-168      
## [33] tidyselect_1.2.1   digest_0.6.39      stringi_1.8.7      reshape2_1.4.5    
## [37] labeling_0.4.3     fastmap_1.2.0      grid_4.5.3         cli_3.6.5         
## [41] magrittr_2.0.4     broom_1.0.12       withr_3.0.2        backports_1.5.0   
## [45] bit64_4.6.0-1      timechange_0.4.0   rmarkdown_2.30     bit_4.6.0         
## [49] otel_0.2.0         ggsignif_0.6.4     hms_1.1.4          evaluate_1.0.5    
## [53] viridisLite_0.4.3  rlang_1.1.7        Rcpp_1.1.1         glue_1.8.0        
## [57] xml2_1.5.2         svglite_2.2.2      rstudioapi_0.18.0  vroom_1.7.0       
## [61] jsonlite_2.0.0     plyr_1.8.9         R6_2.6.1           systemfonts_1.3.2