1 Introduction

This report analyzes a grocery shopping customer survey collected by a student research group. The dataset contains 22 respondents who rated various aspects of their grocery shopping experience on Likert-style scales, along with basic demographic information.

Survey Variables:

Variable	Description	Scale
`CS_helpful`	Customer service helpfulness	1–3
`Recommend`	Likelihood to recommend the store	1–3
`Come_again`	Likelihood to return	1–3
`All_Products`	Satisfaction with product availability	1–5
`Profesionalism`	Staff professionalism	1–3
`Limitation`	Perceived product limitations	1–4
`Online_grocery`	Use/satisfaction with online grocery	1–3
`delivery`	Delivery service rating	1–3
`Pick_up`	In-store pickup rating	1–5
`Find_items`	Ease of finding items	1–3
`other_shops`	Frequency of shopping at other stores	1–5
`Gender`	1 = Male, 2 = Female	—
`Age`	2 = 18–25, 3 = 26–35, 4 = 36–45	—
`Education`	1 = Some HS, 2 = HS Diploma, 3 = Some College, 5 = Bachelor’s+	—

2 Setup

# Install any missing packages before running:
# install.packages(c("tidyverse", "knitr", "kableExtra",
#                    "psych", "corrplot", "ggcorrplot",
#                    "scales", "patchwork", "cluster",
#                    "factoextra", "likert"))

library(tidyverse)
library(knitr)
library(kableExtra)
library(psych)
library(corrplot)
library(ggcorrplot)
library(scales)
library(patchwork)
library(cluster)
library(factoextra)

df_raw <- read_csv("customer_segmentation.csv", show_col_types = FALSE)

# Clean column name (trailing space on Profesionalism)
df_raw <- df_raw %>% rename_with(str_trim)

# Recode demographic variables as labeled factors
df <- df_raw %>%
  mutate(
    Gender_f    = factor(Gender,    levels = 1:2,
                         labels = c("Male", "Female")),
    Age_f       = factor(Age,       levels = 2:4,
                         labels = c("18–25", "26–35", "36–45")),
    Education_f = factor(Education, levels = c(1, 2, 3, 5),
                         labels = c("Some HS", "HS Diploma",
                                    "Some College", "Bachelor's+"))
  )

# Satisfaction items only (exclude ID & demographics)
satisfaction_vars <- c("CS_helpful", "Recommend", "Come_again",
                       "All_Products", "Profesionalism", "Limitation",
                       "Online_grocery", "delivery", "Pick_up",
                       "Find_items", "other_shops")

df_sat <- df %>% select(all_of(satisfaction_vars))

3 Data Overview

3.1 Raw Data

df_raw %>%
  kable(caption = "Raw Survey Data") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE) %>%
  scroll_box(height = "350px")

Raw Survey Data
ID	CS_helpful	Recommend	Come_again	All_Products	Profesionalism	Limitation	Online_grocery	delivery	Pick_up	Find_items	other_shops	Gender	Age	Education
1	2	2	2	2	2	2	2	3	4	1	2	1	2	2
2	1	2	1	1	1	1	2	3	3	1	2	1	2	2
3	2	1	1	1	1	2	3	3	2	1	3	1	2	2
4	3	3	2	4	1	2	3	3	2	2	2	1	3	5
5	2	1	3	5	2	1	2	3	1	2	3	2	4	2
6	1	1	3	2	1	1	1	2	1	1	4	1	2	5
7	2	1	1	2	2	1	2	2	2	1	1	1	2	3
8	1	1	1	2	1	2	1	1	2	2	4	1	2	2
9	1	1	1	2	2	1	2	1	3	1	1	2	2	1
10	1	1	1	1	1	1	3	2	2	1	1	2	2	2
11	1	1	1	2	1	1	2	2	2	1	3	2	4	5
12	1	1	1	2	1	1	3	2	3	1	3	1	3	1
13	1	1	1	1	1	1	1	2	2	1	1	1	4	5
14	1	1	1	2	1	1	3	3	3	3	1	1	3	5
15	1	1	1	4	1	1	2	2	2	2	5	1	2	5
16	1	1	2	2	1	1	3	1	3	1	5	1	3	5
17	1	1	1	2	1	2	2	3	5	2	5	1	2	1
18	2	1	1	1	1	3	3	3	3	1	2	1	2	5
19	3	1	2	3	2	4	1	3	1	3	2	2	2	2
20	2	3	3	1	3	1	3	3	1	1	4	2	2	3
21	2	1	1	2	2	1	3	3	4	1	2	1	2	2
22	3	2	1	2	2	2	3	3	3	2	1	1	2	5

3.2 Summary Statistics

describe(df_sat) %>%
  select(n, mean, sd, median, min, max, skew, kurtosis) %>%
  round(3) %>%
  kable(caption = "Descriptive Statistics — Satisfaction Items") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                full_width = FALSE)

Descriptive Statistics — Satisfaction Items
	n	mean	sd	median	min	max	skew	kurtosis
CS_helpful	22	1.591	0.734	1	1	3	0.735	-0.890
Recommend	22	1.318	0.646	1	1	3	1.670	1.382
Come_again	22	1.455	0.739	1	1	3	1.164	-0.233
All_Products	22	2.091	1.065	2	1	5	1.185	0.794
Profesionalism	22	1.409	0.590	1	1	3	0.997	-0.136
Limitation	22	1.500	0.802	1	1	4	1.587	1.991
Online_grocery	22	2.273	0.767	2	1	3	-0.459	-1.251
delivery	22	2.409	0.734	3	1	3	-0.735	-0.890
Pick_up	22	2.455	1.057	2	1	5	0.460	-0.367
Find_items	22	1.455	0.671	1	1	3	1.059	-0.188
other_shops	22	2.591	1.403	2	1	5	0.415	-1.213

4 Demographic Profile

p_gender <- df %>%
  count(Gender_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Gender_f, y = pct, fill = Gender_f)) +
  geom_col(show.legend = FALSE, width = 0.6) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
  labs(title = "Gender", x = NULL, y = "Proportion") +
  theme_minimal(base_size = 12)

p_age <- df %>%
  count(Age_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Age_f, y = pct, fill = Age_f)) +
  geom_col(show.legend = FALSE, width = 0.6) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Age Group", x = NULL, y = NULL) +
  theme_minimal(base_size = 12)

p_edu <- df %>%
  count(Education_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Education_f, y = pct, fill = Education_f)) +
  geom_col(show.legend = FALSE, width = 0.6) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Education Level", x = NULL, y = NULL) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

p_gender | p_age | p_edu

Key observations: - The sample is predominantly female and concentrated in the 18–35 age range. - Most respondents hold a high school diploma or some college education.

5 Satisfaction Item Analysis

5.1 Mean Ratings per Item

df_sat %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(everything(), names_to = "Item", values_to = "Mean") %>%
  arrange(desc(Mean)) %>%
  ggplot(aes(x = reorder(Item, Mean), y = Mean, fill = Mean)) +
  geom_col(width = 0.7) +
  geom_text(aes(label = round(Mean, 2)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "#AED6F1", high = "#1A5276") +
  labs(title = "Average Rating per Survey Item",
       subtitle = "Higher values indicate higher ratings on each scale",
       x = NULL, y = "Mean Score") +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none") +
  ylim(0, max(colMeans(df_sat)) * 1.2)

5.2 Response Distribution (Stacked Bar)

df_sat %>%
  pivot_longer(everything(), names_to = "Item", values_to = "Rating") %>%
  mutate(Rating = factor(Rating)) %>%
  count(Item, Rating) %>%
  group_by(Item) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Item, y = pct, fill = Rating)) +
  geom_col(position = "fill", width = 0.7) +
  coord_flip() +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Blues", direction = 1) +
  labs(title = "Response Distribution per Item",
       x = NULL, y = "Proportion", fill = "Rating") +
  theme_minimal(base_size = 12)

6 Correlation Analysis

cor_matrix <- cor(df_sat, use = "complete.obs", method = "spearman")

ggcorrplot(cor_matrix,
           method = "circle",
           type = "lower",
           lab = TRUE,
           lab_size = 3,
           colors = c("#D32F2F", "white", "#1565C0"),
           title = "Spearman Correlations Between Satisfaction Items",
           ggtheme = theme_minimal(base_size = 11))

Spearman rank correlation is used here because all items are measured on ordinal (Likert) scales. Strong positive correlations suggest items that tend to move together across respondents.

7 Demographic Group Comparisons

7.1 Satisfaction by Gender

df %>%
  select(Gender_f, all_of(satisfaction_vars)) %>%
  pivot_longer(-Gender_f, names_to = "Item", values_to = "Score") %>%
  group_by(Gender_f, Item) %>%
  summarise(Mean = mean(Score), .groups = "drop") %>%
  ggplot(aes(x = Item, y = Mean, fill = Gender_f)) +
  geom_col(position = "dodge", width = 0.7) +
  coord_flip() +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
  labs(title = "Mean Satisfaction Scores by Gender",
       x = NULL, y = "Mean Score", fill = "Gender") +
  theme_minimal(base_size = 12)

7.2 Satisfaction by Age Group

df %>%
  select(Age_f, all_of(satisfaction_vars)) %>%
  pivot_longer(-Age_f, names_to = "Item", values_to = "Score") %>%
  group_by(Age_f, Item) %>%
  summarise(Mean = mean(Score), .groups = "drop") %>%
  ggplot(aes(x = Item, y = Mean, fill = Age_f)) +
  geom_col(position = "dodge", width = 0.7) +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Mean Satisfaction Scores by Age Group",
       x = NULL, y = "Mean Score", fill = "Age Group") +
  theme_minimal(base_size = 12)

8 Customer Segmentation (K-Means Clustering)

We apply k-means clustering on the satisfaction items to identify natural customer segments. All variables are scaled before clustering.

8.1 Choosing the Number of Clusters

set.seed(42)
df_scaled <- scale(df_sat)

fviz_nbclust(df_scaled, kmeans, method = "wss") +
  labs(title = "Elbow Method — Optimal Number of Clusters",
       subtitle = "Look for the 'elbow' where WSS levels off") +
  theme_minimal(base_size = 12)

8.2 Fit K-Means (k = 3)

set.seed(42)
k <- 3   # adjust based on the elbow plot above
km_fit <- kmeans(df_scaled, centers = k, nstart = 25)

df <- df %>% mutate(Cluster = factor(km_fit$cluster))

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(df$Cluster))

## 
##  1  2  3 
## 10  4  8

8.3 Cluster Visualization (PCA Biplot)

fviz_cluster(km_fit,
             data = df_scaled,
             geom = "point",
             ellipse.type = "convex",
             palette = c("#E74C3C", "#2ECC71", "#3498DB"),
             ggtheme = theme_minimal(base_size = 12)) +
  labs(title = "K-Means Customer Segments (PCA Projection)")

8.4 Cluster Profiles

df %>%
  group_by(Cluster) %>%
  summarise(across(all_of(satisfaction_vars), mean), n = n()) %>%
  kable(digits = 2,
        caption = "Mean Satisfaction Scores by Customer Segment") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = FALSE)

Mean Satisfaction Scores by Customer Segment
Cluster	CS_helpful	Recommend	Come_again	All_Products	Profesionalism	Limitation	Online_grocery	delivery	Pick_up	Find_items	other_shops	n
1	1.10	1.00	1.30	2.00	1.20	1.10	2.00	1.7	2.20	1.2	2.80	10
2	2.50	2.00	2.50	3.25	2.00	2.00	2.25	3.0	1.25	2.0	2.75	4
3	1.75	1.38	1.12	1.62	1.38	1.75	2.62	3.0	3.38	1.5	2.25	8

# Radar-style faceted bar chart for cluster profiles
df %>%
  group_by(Cluster) %>%
  summarise(across(all_of(satisfaction_vars), mean)) %>%
  pivot_longer(-Cluster, names_to = "Item", values_to = "Mean") %>%
  ggplot(aes(x = Item, y = Mean, fill = Cluster)) +
  geom_col(width = 0.7, show.legend = FALSE) +
  coord_flip() +
  facet_wrap(~paste("Cluster", Cluster)) +
  scale_fill_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
  labs(title = "Satisfaction Profile by Customer Segment",
       x = NULL, y = "Mean Score") +
  theme_minimal(base_size = 11)

8.5 Cluster Demographics

p_c_gender <- df %>%
  count(Cluster, Gender_f) %>%
  ggplot(aes(x = Cluster, y = n, fill = Gender_f)) +
  geom_col(position = "fill", width = 0.6) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
  labs(title = "Gender", x = "Cluster", y = "Proportion", fill = NULL) +
  theme_minimal(base_size = 11)

p_c_age <- df %>%
  count(Cluster, Age_f) %>%
  ggplot(aes(x = Cluster, y = n, fill = Age_f)) +
  geom_col(position = "fill", width = 0.6) +
  scale_y_continuous(labels = percent_format()) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Age Group", x = "Cluster", y = NULL, fill = NULL) +
  theme_minimal(base_size = 11)

p_c_gender | p_c_age

9 Key Findings & Limitations

9.1 Summary of Findings

Customer service and professionalism received the highest average ratings, suggesting staff interactions are a strength.
Product availability (All_Products) and pickup experience (Pick_up) showed more variability, indicating areas for potential improvement.
Correlation analysis revealed that CS_helpful, Recommend, and Come_again are highly interrelated — a satisfied customer tends to score all three highly.
K-means clustering identified three distinct customer segments with differing satisfaction profiles and demographic compositions.

9.2 Limitations

⚠️ This dataset contains only 22 observations, which severely limits the statistical reliability of all analyses — especially clustering and group comparisons. The following caveats apply:

Group differences (gender, age) should not be interpreted as statistically significant without formal testing.

Cluster solutions are exploratory and highly sensitive to random initialization.

Results should not be generalized beyond the sample until replicated with a larger dataset.

10 Session Info

sessionInfo()

## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] factoextra_2.0.0   cluster_2.1.8.2    patchwork_1.3.2    scales_1.4.0      
##  [5] ggcorrplot_0.1.4.1 corrplot_0.95      psych_2.6.3        kableExtra_1.4.0  
##  [9] knitr_1.51         lubridate_1.9.5    forcats_1.0.1      stringr_1.6.0     
## [13] dplyr_1.2.0        purrr_1.2.1        readr_2.2.0        tidyr_1.3.2       
## [17] tibble_3.3.1       ggplot2_4.0.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.57          bslib_0.10.0       rstatix_0.7.3     
##  [5] ggrepel_0.9.8      lattice_0.22-9     tzdb_0.5.0         vctrs_0.7.2       
##  [9] tools_4.5.3        generics_0.1.4     parallel_4.5.3     pkgconfig_2.0.3   
## [13] RColorBrewer_1.1-3 S7_0.2.1           lifecycle_1.0.5    compiler_4.5.3    
## [17] farver_2.1.2       textshaping_1.0.5  mnormt_2.1.2       carData_3.0-6     
## [21] htmltools_0.5.9    sass_0.4.10        yaml_2.3.12        Formula_1.2-5     
## [25] car_3.1-5          ggpubr_0.6.3       pillar_1.11.1      crayon_1.5.3      
## [29] jquerylib_0.1.4    cachem_1.1.0       abind_1.4-8        nlme_3.1-168      
## [33] tidyselect_1.2.1   digest_0.6.39      stringi_1.8.7      reshape2_1.4.5    
## [37] labeling_0.4.3     fastmap_1.2.0      grid_4.5.3         cli_3.6.5         
## [41] magrittr_2.0.4     broom_1.0.12       withr_3.0.2        backports_1.5.0   
## [45] bit64_4.6.0-1      timechange_0.4.0   rmarkdown_2.30     bit_4.6.0         
## [49] otel_0.2.0         ggsignif_0.6.4     hms_1.1.4          evaluate_1.0.5    
## [53] viridisLite_0.4.3  rlang_1.1.7        Rcpp_1.1.1         glue_1.8.0        
## [57] xml2_1.5.2         svglite_2.2.2      rstudioapi_0.18.0  vroom_1.7.0       
## [61] jsonlite_2.0.0     plyr_1.8.9         R6_2.6.1           systemfonts_1.3.2

Customer Segmentation Analysis: Grocery Shopping Survey

Your Name

2026-03-25