This report analyzes a grocery shopping customer survey collected by a student research group. The dataset contains 22 respondents who rated various aspects of their grocery shopping experience on Likert-style scales, along with basic demographic information.
Survey Variables:
| Variable | Description | Scale |
|---|---|---|
CS_helpful |
Customer service helpfulness | 1–3 |
Recommend |
Likelihood to recommend the store | 1–3 |
Come_again |
Likelihood to return | 1–3 |
All_Products |
Satisfaction with product availability | 1–5 |
Profesionalism |
Staff professionalism | 1–3 |
Limitation |
Perceived product limitations | 1–4 |
Online_grocery |
Use/satisfaction with online grocery | 1–3 |
delivery |
Delivery service rating | 1–3 |
Pick_up |
In-store pickup rating | 1–5 |
Find_items |
Ease of finding items | 1–3 |
other_shops |
Frequency of shopping at other stores | 1–5 |
Gender |
1 = Male, 2 = Female | — |
Age |
2 = 18–25, 3 = 26–35, 4 = 36–45 | — |
Education |
1 = Some HS, 2 = HS Diploma, 3 = Some College, 5 = Bachelor’s+ | — |
# Install any missing packages before running:
# install.packages(c("tidyverse", "knitr", "kableExtra",
# "psych", "corrplot", "ggcorrplot",
# "scales", "patchwork", "cluster",
# "factoextra", "likert"))
library(tidyverse)
library(knitr)
library(kableExtra)
library(psych)
library(corrplot)
library(ggcorrplot)
library(scales)
library(patchwork)
library(cluster)
library(factoextra)df_raw <- read_csv("customer_segmentation.csv", show_col_types = FALSE)
# Clean column name (trailing space on Profesionalism)
df_raw <- df_raw %>% rename_with(str_trim)
# Recode demographic variables as labeled factors
df <- df_raw %>%
mutate(
Gender_f = factor(Gender, levels = 1:2,
labels = c("Male", "Female")),
Age_f = factor(Age, levels = 2:4,
labels = c("18–25", "26–35", "36–45")),
Education_f = factor(Education, levels = c(1, 2, 3, 5),
labels = c("Some HS", "HS Diploma",
"Some College", "Bachelor's+"))
)
# Satisfaction items only (exclude ID & demographics)
satisfaction_vars <- c("CS_helpful", "Recommend", "Come_again",
"All_Products", "Profesionalism", "Limitation",
"Online_grocery", "delivery", "Pick_up",
"Find_items", "other_shops")
df_sat <- df %>% select(all_of(satisfaction_vars))df_raw %>%
kable(caption = "Raw Survey Data") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
scroll_box(height = "350px")| ID | CS_helpful | Recommend | Come_again | All_Products | Profesionalism | Limitation | Online_grocery | delivery | Pick_up | Find_items | other_shops | Gender | Age | Education |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 2 | 2 | 2 | 2 | 2 | 2 | 2 | 3 | 4 | 1 | 2 | 1 | 2 | 2 |
| 2 | 1 | 2 | 1 | 1 | 1 | 1 | 2 | 3 | 3 | 1 | 2 | 1 | 2 | 2 |
| 3 | 2 | 1 | 1 | 1 | 1 | 2 | 3 | 3 | 2 | 1 | 3 | 1 | 2 | 2 |
| 4 | 3 | 3 | 2 | 4 | 1 | 2 | 3 | 3 | 2 | 2 | 2 | 1 | 3 | 5 |
| 5 | 2 | 1 | 3 | 5 | 2 | 1 | 2 | 3 | 1 | 2 | 3 | 2 | 4 | 2 |
| 6 | 1 | 1 | 3 | 2 | 1 | 1 | 1 | 2 | 1 | 1 | 4 | 1 | 2 | 5 |
| 7 | 2 | 1 | 1 | 2 | 2 | 1 | 2 | 2 | 2 | 1 | 1 | 1 | 2 | 3 |
| 8 | 1 | 1 | 1 | 2 | 1 | 2 | 1 | 1 | 2 | 2 | 4 | 1 | 2 | 2 |
| 9 | 1 | 1 | 1 | 2 | 2 | 1 | 2 | 1 | 3 | 1 | 1 | 2 | 2 | 1 |
| 10 | 1 | 1 | 1 | 1 | 1 | 1 | 3 | 2 | 2 | 1 | 1 | 2 | 2 | 2 |
| 11 | 1 | 1 | 1 | 2 | 1 | 1 | 2 | 2 | 2 | 1 | 3 | 2 | 4 | 5 |
| 12 | 1 | 1 | 1 | 2 | 1 | 1 | 3 | 2 | 3 | 1 | 3 | 1 | 3 | 1 |
| 13 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 2 | 2 | 1 | 1 | 1 | 4 | 5 |
| 14 | 1 | 1 | 1 | 2 | 1 | 1 | 3 | 3 | 3 | 3 | 1 | 1 | 3 | 5 |
| 15 | 1 | 1 | 1 | 4 | 1 | 1 | 2 | 2 | 2 | 2 | 5 | 1 | 2 | 5 |
| 16 | 1 | 1 | 2 | 2 | 1 | 1 | 3 | 1 | 3 | 1 | 5 | 1 | 3 | 5 |
| 17 | 1 | 1 | 1 | 2 | 1 | 2 | 2 | 3 | 5 | 2 | 5 | 1 | 2 | 1 |
| 18 | 2 | 1 | 1 | 1 | 1 | 3 | 3 | 3 | 3 | 1 | 2 | 1 | 2 | 5 |
| 19 | 3 | 1 | 2 | 3 | 2 | 4 | 1 | 3 | 1 | 3 | 2 | 2 | 2 | 2 |
| 20 | 2 | 3 | 3 | 1 | 3 | 1 | 3 | 3 | 1 | 1 | 4 | 2 | 2 | 3 |
| 21 | 2 | 1 | 1 | 2 | 2 | 1 | 3 | 3 | 4 | 1 | 2 | 1 | 2 | 2 |
| 22 | 3 | 2 | 1 | 2 | 2 | 2 | 3 | 3 | 3 | 2 | 1 | 1 | 2 | 5 |
describe(df_sat) %>%
select(n, mean, sd, median, min, max, skew, kurtosis) %>%
round(3) %>%
kable(caption = "Descriptive Statistics — Satisfaction Items") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE)| n | mean | sd | median | min | max | skew | kurtosis | |
|---|---|---|---|---|---|---|---|---|
| CS_helpful | 22 | 1.591 | 0.734 | 1 | 1 | 3 | 0.735 | -0.890 |
| Recommend | 22 | 1.318 | 0.646 | 1 | 1 | 3 | 1.670 | 1.382 |
| Come_again | 22 | 1.455 | 0.739 | 1 | 1 | 3 | 1.164 | -0.233 |
| All_Products | 22 | 2.091 | 1.065 | 2 | 1 | 5 | 1.185 | 0.794 |
| Profesionalism | 22 | 1.409 | 0.590 | 1 | 1 | 3 | 0.997 | -0.136 |
| Limitation | 22 | 1.500 | 0.802 | 1 | 1 | 4 | 1.587 | 1.991 |
| Online_grocery | 22 | 2.273 | 0.767 | 2 | 1 | 3 | -0.459 | -1.251 |
| delivery | 22 | 2.409 | 0.734 | 3 | 1 | 3 | -0.735 | -0.890 |
| Pick_up | 22 | 2.455 | 1.057 | 2 | 1 | 5 | 0.460 | -0.367 |
| Find_items | 22 | 1.455 | 0.671 | 1 | 1 | 3 | 1.059 | -0.188 |
| other_shops | 22 | 2.591 | 1.403 | 2 | 1 | 5 | 0.415 | -1.213 |
p_gender <- df %>%
count(Gender_f) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = Gender_f, y = pct, fill = Gender_f)) +
geom_col(show.legend = FALSE, width = 0.6) +
geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
labs(title = "Gender", x = NULL, y = "Proportion") +
theme_minimal(base_size = 12)
p_age <- df %>%
count(Age_f) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = Age_f, y = pct, fill = Age_f)) +
geom_col(show.legend = FALSE, width = 0.6) +
geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Age Group", x = NULL, y = NULL) +
theme_minimal(base_size = 12)
p_edu <- df %>%
count(Education_f) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = Education_f, y = pct, fill = Education_f)) +
geom_col(show.legend = FALSE, width = 0.6) +
geom_text(aes(label = percent(pct, 1)), vjust = -0.5, size = 4) +
scale_y_continuous(labels = percent_format(), limits = c(0, 0.9)) +
scale_fill_brewer(palette = "Pastel1") +
labs(title = "Education Level", x = NULL, y = NULL) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 20, hjust = 1))
p_gender | p_age | p_eduKey observations: - The sample is predominantly female and concentrated in the 18–35 age range. - Most respondents hold a high school diploma or some college education.
df_sat %>%
summarise(across(everything(), mean)) %>%
pivot_longer(everything(), names_to = "Item", values_to = "Mean") %>%
arrange(desc(Mean)) %>%
ggplot(aes(x = reorder(Item, Mean), y = Mean, fill = Mean)) +
geom_col(width = 0.7) +
geom_text(aes(label = round(Mean, 2)), hjust = -0.2, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "#AED6F1", high = "#1A5276") +
labs(title = "Average Rating per Survey Item",
subtitle = "Higher values indicate higher ratings on each scale",
x = NULL, y = "Mean Score") +
theme_minimal(base_size = 12) +
theme(legend.position = "none") +
ylim(0, max(colMeans(df_sat)) * 1.2)df_sat %>%
pivot_longer(everything(), names_to = "Item", values_to = "Rating") %>%
mutate(Rating = factor(Rating)) %>%
count(Item, Rating) %>%
group_by(Item) %>%
mutate(pct = n / sum(n)) %>%
ggplot(aes(x = Item, y = pct, fill = Rating)) +
geom_col(position = "fill", width = 0.7) +
coord_flip() +
scale_y_continuous(labels = percent_format()) +
scale_fill_brewer(palette = "Blues", direction = 1) +
labs(title = "Response Distribution per Item",
x = NULL, y = "Proportion", fill = "Rating") +
theme_minimal(base_size = 12)cor_matrix <- cor(df_sat, use = "complete.obs", method = "spearman")
ggcorrplot(cor_matrix,
method = "circle",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("#D32F2F", "white", "#1565C0"),
title = "Spearman Correlations Between Satisfaction Items",
ggtheme = theme_minimal(base_size = 11))Spearman rank correlation is used here because all items are measured on ordinal (Likert) scales. Strong positive correlations suggest items that tend to move together across respondents.
df %>%
select(Gender_f, all_of(satisfaction_vars)) %>%
pivot_longer(-Gender_f, names_to = "Item", values_to = "Score") %>%
group_by(Gender_f, Item) %>%
summarise(Mean = mean(Score), .groups = "drop") %>%
ggplot(aes(x = Item, y = Mean, fill = Gender_f)) +
geom_col(position = "dodge", width = 0.7) +
coord_flip() +
scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
labs(title = "Mean Satisfaction Scores by Gender",
x = NULL, y = "Mean Score", fill = "Gender") +
theme_minimal(base_size = 12)df %>%
select(Age_f, all_of(satisfaction_vars)) %>%
pivot_longer(-Age_f, names_to = "Item", values_to = "Score") %>%
group_by(Age_f, Item) %>%
summarise(Mean = mean(Score), .groups = "drop") %>%
ggplot(aes(x = Item, y = Mean, fill = Age_f)) +
geom_col(position = "dodge", width = 0.7) +
coord_flip() +
scale_fill_brewer(palette = "Set2") +
labs(title = "Mean Satisfaction Scores by Age Group",
x = NULL, y = "Mean Score", fill = "Age Group") +
theme_minimal(base_size = 12)We apply k-means clustering on the satisfaction items to identify natural customer segments. All variables are scaled before clustering.
set.seed(42)
df_scaled <- scale(df_sat)
fviz_nbclust(df_scaled, kmeans, method = "wss") +
labs(title = "Elbow Method — Optimal Number of Clusters",
subtitle = "Look for the 'elbow' where WSS levels off") +
theme_minimal(base_size = 12)set.seed(42)
k <- 3 # adjust based on the elbow plot above
km_fit <- kmeans(df_scaled, centers = k, nstart = 25)
df <- df %>% mutate(Cluster = factor(km_fit$cluster))
cat("Cluster sizes:\n")## Cluster sizes:
##
## 1 2 3
## 10 4 8
fviz_cluster(km_fit,
data = df_scaled,
geom = "point",
ellipse.type = "convex",
palette = c("#E74C3C", "#2ECC71", "#3498DB"),
ggtheme = theme_minimal(base_size = 12)) +
labs(title = "K-Means Customer Segments (PCA Projection)")df %>%
group_by(Cluster) %>%
summarise(across(all_of(satisfaction_vars), mean), n = n()) %>%
kable(digits = 2,
caption = "Mean Satisfaction Scores by Customer Segment") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = FALSE)| Cluster | CS_helpful | Recommend | Come_again | All_Products | Profesionalism | Limitation | Online_grocery | delivery | Pick_up | Find_items | other_shops | n |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1.10 | 1.00 | 1.30 | 2.00 | 1.20 | 1.10 | 2.00 | 1.7 | 2.20 | 1.2 | 2.80 | 10 |
| 2 | 2.50 | 2.00 | 2.50 | 3.25 | 2.00 | 2.00 | 2.25 | 3.0 | 1.25 | 2.0 | 2.75 | 4 |
| 3 | 1.75 | 1.38 | 1.12 | 1.62 | 1.38 | 1.75 | 2.62 | 3.0 | 3.38 | 1.5 | 2.25 | 8 |
# Radar-style faceted bar chart for cluster profiles
df %>%
group_by(Cluster) %>%
summarise(across(all_of(satisfaction_vars), mean)) %>%
pivot_longer(-Cluster, names_to = "Item", values_to = "Mean") %>%
ggplot(aes(x = Item, y = Mean, fill = Cluster)) +
geom_col(width = 0.7, show.legend = FALSE) +
coord_flip() +
facet_wrap(~paste("Cluster", Cluster)) +
scale_fill_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
labs(title = "Satisfaction Profile by Customer Segment",
x = NULL, y = "Mean Score") +
theme_minimal(base_size = 11)p_c_gender <- df %>%
count(Cluster, Gender_f) %>%
ggplot(aes(x = Cluster, y = n, fill = Gender_f)) +
geom_col(position = "fill", width = 0.6) +
scale_y_continuous(labels = percent_format()) +
scale_fill_manual(values = c("#4E79A7", "#F28E2B")) +
labs(title = "Gender", x = "Cluster", y = "Proportion", fill = NULL) +
theme_minimal(base_size = 11)
p_c_age <- df %>%
count(Cluster, Age_f) %>%
ggplot(aes(x = Cluster, y = n, fill = Age_f)) +
geom_col(position = "fill", width = 0.6) +
scale_y_continuous(labels = percent_format()) +
scale_fill_brewer(palette = "Set2") +
labs(title = "Age Group", x = "Cluster", y = NULL, fill = NULL) +
theme_minimal(base_size = 11)
p_c_gender | p_c_ageAll_Products) and pickup
experience (Pick_up) showed more variability,
indicating areas for potential improvement.CS_helpful,
Recommend, and Come_again are highly
interrelated — a satisfied customer tends to score all three
highly.⚠️ This dataset contains only 22 observations, which severely limits the statistical reliability of all analyses — especially clustering and group comparisons. The following caveats apply:
- Group differences (gender, age) should not be interpreted as statistically significant without formal testing.
- Cluster solutions are exploratory and highly sensitive to random initialization.
- Results should not be generalized beyond the sample until replicated with a larger dataset.
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] factoextra_2.0.0 cluster_2.1.8.2 patchwork_1.3.2 scales_1.4.0
## [5] ggcorrplot_0.1.4.1 corrplot_0.95 psych_2.6.3 kableExtra_1.4.0
## [9] knitr_1.51 lubridate_1.9.5 forcats_1.0.1 stringr_1.6.0
## [13] dplyr_1.2.0 purrr_1.2.1 readr_2.2.0 tidyr_1.3.2
## [17] tibble_3.3.1 ggplot2_4.0.2 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.57 bslib_0.10.0 rstatix_0.7.3
## [5] ggrepel_0.9.8 lattice_0.22-9 tzdb_0.5.0 vctrs_0.7.2
## [9] tools_4.5.3 generics_0.1.4 parallel_4.5.3 pkgconfig_2.0.3
## [13] RColorBrewer_1.1-3 S7_0.2.1 lifecycle_1.0.5 compiler_4.5.3
## [17] farver_2.1.2 textshaping_1.0.5 mnormt_2.1.2 carData_3.0-6
## [21] htmltools_0.5.9 sass_0.4.10 yaml_2.3.12 Formula_1.2-5
## [25] car_3.1-5 ggpubr_0.6.3 pillar_1.11.1 crayon_1.5.3
## [29] jquerylib_0.1.4 cachem_1.1.0 abind_1.4-8 nlme_3.1-168
## [33] tidyselect_1.2.1 digest_0.6.39 stringi_1.8.7 reshape2_1.4.5
## [37] labeling_0.4.3 fastmap_1.2.0 grid_4.5.3 cli_3.6.5
## [41] magrittr_2.0.4 broom_1.0.12 withr_3.0.2 backports_1.5.0
## [45] bit64_4.6.0-1 timechange_0.4.0 rmarkdown_2.30 bit_4.6.0
## [49] otel_0.2.0 ggsignif_0.6.4 hms_1.1.4 evaluate_1.0.5
## [53] viridisLite_0.4.3 rlang_1.1.7 Rcpp_1.1.1 glue_1.8.0
## [57] xml2_1.5.2 svglite_2.2.2 rstudioapi_0.18.0 vroom_1.7.0
## [61] jsonlite_2.0.0 plyr_1.8.9 R6_2.6.1 systemfonts_1.3.2