1. Introduction

This report analyzes a grocery store customer satisfaction survey collected by a student research group. The dataset contains 22 customer responses across 14 survey variables capturing satisfaction ratings, shopping behaviors, and demographic information.

Survey variables include:

Variable Description
CS_helpful Customer service helpfulness rating
Recommend Likelihood to recommend the store
Come_again Likelihood to return
All_Products Satisfaction with product availability
Profesionalism Staff professionalism rating
Limitation Perceived store limitations
Online_grocery Online grocery experience rating
delivery Delivery service satisfaction
Pick_up Pick-up service satisfaction
Find_items Ease of finding items
other_shops Frequency of shopping at other stores
Gender Gender (1 = Male, 2 = Female)
Age Age group (1 = Under 18, 2 = 18–34, 3 = 35–54, 4 = 55+)
Education Education level (1 = No formal, 2 = High school, 3 = Some college, 4 = Bachelor’s, 5 = Graduate)

All satisfaction/rating variables use a 1–5 Likert scale (1 = Strongly Agree / Very Satisfied, 5 = Strongly Disagree / Very Unsatisfied), unless otherwise noted.


2. Setup & Data Loading

# Install any missing packages automatically
required_packages <- c("tidyverse", "psych", "corrplot", "ggplot2",
                        "scales", "knitr", "reshape2", "cluster",
                        "factoextra", "RColorBrewer")

for (pkg in required_packages) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, repos = "https://cloud.r-project.org")
  }
}

library(tidyverse)
library(psych)
library(corrplot)
library(ggplot2)
library(scales)
library(knitr)
library(reshape2)
library(cluster)
library(factoextra)
library(RColorBrewer)
# ── Load dataset ──────────────────────────────────────────────────────────────
# Update this path to wherever you saved the CSV
df_raw <- read.csv("customer_segmentation.csv", stringsAsFactors = FALSE)

# Preview
glimpse(df_raw)
## Rows: 22
## Columns: 15
## $ ID             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ CS_helpful     <int> 2, 1, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3…
## $ Recommend      <int> 2, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Come_again     <int> 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2…
## $ All_Products   <int> 2, 1, 1, 4, 5, 2, 2, 2, 2, 1, 2, 2, 1, 2, 4, 2, 2, 1, 3…
## $ Profesionalism <int> 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2…
## $ Limitation     <int> 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4…
## $ Online_grocery <int> 2, 2, 3, 3, 2, 1, 2, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 1…
## $ delivery       <int> 3, 3, 3, 3, 3, 2, 2, 1, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 3…
## $ Pick_up        <int> 4, 3, 2, 2, 1, 1, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 5, 3, 1…
## $ Find_items     <int> 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 3…
## $ other_shops    <int> 2, 2, 3, 2, 3, 4, 1, 4, 1, 1, 3, 3, 1, 1, 5, 5, 5, 2, 2…
## $ Gender         <int> 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2…
## $ Age            <int> 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 4, 3, 4, 3, 2, 3, 2, 2, 2…
## $ Education      <int> 2, 2, 2, 5, 2, 5, 3, 2, 1, 2, 5, 1, 5, 5, 5, 5, 1, 5, 2…

3. Data Cleaning & Preparation

df <- df_raw %>%
  # Drop the ID column — not analytically useful
  select(-ID) %>%
  # Recode numeric codes into labeled factors for demographics
  mutate(
    Gender_f = factor(Gender,
                      levels = 1:2,
                      labels = c("Male", "Female")),
    Age_f = factor(Age,
                   levels = 1:4,
                   labels = c("Under 18", "18–34", "35–54", "55+")),
    Education_f = factor(Education,
                         levels = 1:5,
                         labels = c("No formal education",
                                    "High school",
                                    "Some college",
                                    "Bachelor's degree",
                                    "Graduate degree"))
  )

# Satisfaction / rating columns (numeric, used for analysis)
rating_vars <- c("CS_helpful", "Recommend", "Come_again",
                 "All_Products", "Profesionalism", "Limitation",
                 "Online_grocery", "delivery", "Pick_up",
                 "Find_items", "other_shops")

df_ratings <- df %>% select(all_of(rating_vars))

cat("Dataset dimensions:", nrow(df), "rows ×", ncol(df), "columns\n")
## Dataset dimensions: 22 rows × 17 columns
cat("Missing values per column:\n")
## Missing values per column:
print(colSums(is.na(df)))
##     CS_helpful      Recommend     Come_again   All_Products Profesionalism 
##              0              0              0              0              0 
##     Limitation Online_grocery       delivery        Pick_up     Find_items 
##              0              0              0              0              0 
##    other_shops         Gender            Age      Education       Gender_f 
##              0              0              0              0              0 
##          Age_f    Education_f 
##              0              0

4. Descriptive Statistics

4.1 Summary Statistics

desc_stats <- describe(df_ratings) %>%
  as.data.frame() %>%
  select(n, mean, sd, median, min, max, skew, kurtosis) %>%
  mutate(across(where(is.numeric), ~ round(.x, 2)))

kable(desc_stats,
      caption = "Descriptive Statistics – Rating Variables",
      col.names = c("N", "Mean", "SD", "Median", "Min", "Max", "Skewness", "Kurtosis"))
Descriptive Statistics – Rating Variables
N Mean SD Median Min Max Skewness Kurtosis
CS_helpful 22 1.59 0.73 1 1 3 0.73 -0.89
Recommend 22 1.32 0.65 1 1 3 1.67 1.38
Come_again 22 1.45 0.74 1 1 3 1.16 -0.23
All_Products 22 2.09 1.06 2 1 5 1.18 0.79
Profesionalism 22 1.41 0.59 1 1 3 1.00 -0.14
Limitation 22 1.50 0.80 1 1 4 1.59 1.99
Online_grocery 22 2.27 0.77 2 1 3 -0.46 -1.25
delivery 22 2.41 0.73 3 1 3 -0.73 -0.89
Pick_up 22 2.45 1.06 2 1 5 0.46 -0.37
Find_items 22 1.45 0.67 1 1 3 1.06 -0.19
other_shops 22 2.59 1.40 2 1 5 0.42 -1.21

4.2 Demographic Profile

# Gender distribution
p1 <- df %>%
  count(Gender_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Gender_f, y = pct, fill = Gender_f)) +
  geom_col(width = 0.5, show.legend = FALSE) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.4, size = 4.5) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Gender Distribution", x = NULL, y = "Proportion") +
  theme_minimal(base_size = 13)

# Age distribution
p2 <- df %>%
  count(Age_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = Age_f, y = pct, fill = Age_f)) +
  geom_col(width = 0.6, show.legend = FALSE) +
  geom_text(aes(label = percent(pct, 1)), vjust = -0.4, size = 4) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 1)) +
  scale_fill_brewer(palette = "Set3") +
  labs(title = "Age Group Distribution", x = NULL, y = "Proportion") +
  theme_minimal(base_size = 13)

# Education distribution
p3 <- df %>%
  count(Education_f) %>%
  mutate(pct = n / sum(n)) %>%
  ggplot(aes(x = reorder(Education_f, pct), y = pct, fill = Education_f)) +
  geom_col(show.legend = FALSE) +
  geom_text(aes(label = percent(pct, 1)), hjust = -0.15, size = 3.8) +
  scale_y_continuous(labels = percent_format(), limits = c(0, 0.8)) +
  scale_fill_brewer(palette = "Pastel1") +
  coord_flip() +
  labs(title = "Education Level Distribution", x = NULL, y = "Proportion") +
  theme_minimal(base_size = 12)

print(p1)

print(p2)

print(p3)


5. Satisfaction Ratings Analysis

5.1 Average Ratings per Category

avg_ratings <- df_ratings %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Mean") %>%
  arrange(Mean)

ggplot(avg_ratings, aes(x = reorder(Variable, Mean), y = Mean, fill = Mean)) +
  geom_col() +
  geom_text(aes(label = round(Mean, 2)), hjust = -0.15, size = 4) +
  scale_fill_gradient(low = "#2ecc71", high = "#e74c3c",
                      limits = c(1, 5), name = "Score") +
  scale_y_continuous(limits = c(0, 5.5)) +
  coord_flip() +
  labs(title = "Average Satisfaction Score by Category",
       subtitle = "Scale: 1 (Strongly Agree / Very Satisfied) → 5 (Strongly Disagree)",
       x = NULL, y = "Mean Score") +
  theme_minimal(base_size = 13)

5.2 Rating Distributions (Boxplots)

df_ratings_long <- df_ratings %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Score")

ggplot(df_ratings_long, aes(x = reorder(Variable, Score, FUN = median),
                             y = Score, fill = Variable)) +
  geom_boxplot(show.legend = FALSE, alpha = 0.8, outlier.shape = 21) +
  stat_summary(fun = mean, geom = "point", shape = 23,
               size = 3, fill = "white", color = "black") +
  scale_fill_manual(values = colorRampPalette(brewer.pal(9, "Set1"))(11)) +
  scale_y_continuous(breaks = 1:5) +
  coord_flip() +
  labs(title = "Distribution of Satisfaction Scores",
       subtitle = "Diamond = mean; box = IQR; whiskers = full range",
       x = NULL, y = "Score (1–5)") +
  theme_minimal(base_size = 13)

5.3 Frequency Heatmap

freq_table <- df_ratings_long %>%
  count(Variable, Score) %>%
  group_by(Variable) %>%
  mutate(Proportion = n / sum(n)) %>%
  ungroup()

ggplot(freq_table, aes(x = factor(Score), y = Variable, fill = Proportion)) +
  geom_tile(color = "white", linewidth = 0.5) +
  geom_text(aes(label = percent(Proportion, 1)), size = 3.5) +
  scale_fill_gradient(low = "#ffffff", high = "#2c7bb6",
                      labels = percent_format(), name = "Proportion") +
  labs(title = "Response Frequency Heatmap",
       subtitle = "Proportion of respondents choosing each score level",
       x = "Score", y = NULL) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(size = 11))


6. Correlation Analysis

6.1 Correlation Matrix

cor_matrix <- cor(df_ratings, use = "complete.obs")

corrplot(cor_matrix,
         method   = "color",
         type     = "upper",
         order    = "hclust",
         addCoef.col = "black",
         number.cex  = 0.75,
         tl.cex      = 0.85,
         tl.col      = "black",
         col         = colorRampPalette(c("#d73027","#f7f7f7","#1a9850"))(200),
         title       = "Pearson Correlation Matrix – Rating Variables",
         mar         = c(0, 0, 2, 0))

6.2 Top Correlating Pairs

cor_df <- as.data.frame(as.table(cor_matrix)) %>%
  rename(Var1 = Var1, Var2 = Var2, Correlation = Freq) %>%
  filter(Var1 != Var2) %>%
  mutate(pair = pmap_chr(list(Var1, Var2), ~ paste(sort(c(..1, ..2)), collapse = " – "))) %>%
  distinct(pair, .keep_all = TRUE) %>%
  arrange(desc(abs(Correlation))) %>%
  slice_head(n = 10) %>%
  select(pair, Correlation) %>%
  mutate(Correlation = round(Correlation, 3))

kable(cor_df, caption = "Top 10 Strongest Correlations (by |r|)",
      col.names = c("Variable Pair", "r"))
Top 10 Strongest Correlations (by |r|)
Variable Pair r
CS_helpful – Limitation 0.607
CS_helpful – delivery 0.590
All_Products – Find_items 0.539
Come_again – Pick_up -0.521
CS_helpful – Profesionalism 0.514
CS_helpful – Recommend 0.488
Limitation – Find_items 0.443
Come_again – Profesionalism 0.427
Recommend – delivery 0.415
Recommend – Profesionalism 0.391

7. Demographic Subgroup Comparisons

7.1 Mean Scores by Gender

df_with_demos <- bind_cols(df_ratings, df %>% select(Gender_f, Age_f, Education_f))

gender_means <- df_with_demos %>%
  group_by(Gender_f) %>%
  summarise(across(all_of(rating_vars), mean, .names = "{.col}")) %>%
  pivot_longer(-Gender_f, names_to = "Variable", values_to = "Mean")

ggplot(gender_means, aes(x = Variable, y = Mean, fill = Gender_f)) +
  geom_col(position = "dodge", alpha = 0.85) +
  scale_fill_brewer(palette = "Set2", name = "Gender") +
  scale_y_continuous(limits = c(0, 5)) +
  coord_flip() +
  labs(title = "Average Satisfaction Scores by Gender",
       x = NULL, y = "Mean Score (1–5)") +
  theme_minimal(base_size = 12)

7.2 Mean Scores by Age Group

age_means <- df_with_demos %>%
  group_by(Age_f) %>%
  summarise(across(all_of(rating_vars), mean, .names = "{.col}")) %>%
  pivot_longer(-Age_f, names_to = "Variable", values_to = "Mean")

ggplot(age_means, aes(x = Variable, y = Mean, color = Age_f, group = Age_f)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2.5) +
  scale_color_brewer(palette = "Dark2", name = "Age Group") +
  scale_y_continuous(limits = c(0, 5)) +
  coord_flip() +
  labs(title = "Average Satisfaction Scores by Age Group",
       x = NULL, y = "Mean Score (1–5)") +
  theme_minimal(base_size = 12)


8. Customer Segmentation (K-Means Clustering)

set.seed(42)
df_scaled <- scale(df_ratings)

# Elbow method to choose optimal k
fviz_nbclust(df_scaled, kmeans, method = "wss",
             k.max = 8,
             linecolor = "#e74c3c") +
  labs(title = "Elbow Method – Optimal Number of Clusters",
       subtitle = "Look for the 'elbow' where WSS improvement flattens",
       x = "Number of Clusters (k)", y = "Total Within-Cluster SS") +
  theme_minimal(base_size = 13)

# Fit k-means with k = 3 (adjust based on elbow plot)
k <- 3
km <- kmeans(df_scaled, centers = k, nstart = 25, iter.max = 100)

df$Cluster <- factor(km$cluster, labels = paste0("Segment ", 1:k))

cat("Cluster sizes:\n")
## Cluster sizes:
print(table(df$Cluster))
## 
## Segment 1 Segment 2 Segment 3 
##         4        10         8
fviz_cluster(km, data = df_scaled,
             geom       = "point",
             ellipse    = TRUE,
             ellipse.type = "convex",
             palette    = "jco",
             ggtheme    = theme_minimal(base_size = 13)) +
  labs(title = "Customer Segments – PCA Projection",
       subtitle = "Each point = one customer; ellipses define segment boundaries")

8.1 Segment Profiles

segment_profiles <- df %>%
  group_by(Cluster) %>%
  summarise(
    n           = n(),
    CS_helpful   = round(mean(CS_helpful), 2),
    Recommend    = round(mean(Recommend), 2),
    Come_again   = round(mean(Come_again), 2),
    All_Products = round(mean(All_Products), 2),
    Profesionalism = round(mean(Profesionalism), 2),
    Online_grocery = round(mean(Online_grocery), 2),
    delivery     = round(mean(delivery), 2),
    Pick_up      = round(mean(Pick_up), 2),
    Find_items   = round(mean(Find_items), 2),
    .groups = "drop"
  )

kable(segment_profiles, caption = "Mean Scores per Customer Segment")
Mean Scores per Customer Segment
Cluster n CS_helpful Recommend Come_again All_Products Profesionalism Online_grocery delivery Pick_up Find_items
Segment 1 4 2.50 2.00 2.50 3.25 2.00 2.25 3.0 1.25 2.0
Segment 2 10 1.10 1.00 1.30 2.00 1.20 2.00 1.7 2.20 1.2
Segment 3 8 1.75 1.38 1.12 1.62 1.38 2.62 3.0 3.38 1.5
# Radar-style comparison using a grouped bar chart
segment_long <- segment_profiles %>%
  select(-n) %>%
  pivot_longer(-Cluster, names_to = "Variable", values_to = "Mean")

ggplot(segment_long, aes(x = Variable, y = Mean, fill = Cluster)) +
  geom_col(position = "dodge", alpha = 0.85) +
  scale_fill_brewer(palette = "Set1", name = "Segment") +
  scale_y_continuous(limits = c(0, 5)) +
  coord_flip() +
  labs(title = "Customer Segment Profiles",
       subtitle = "Mean satisfaction scores per variable, grouped by segment",
       x = NULL, y = "Mean Score (1–5)") +
  theme_minimal(base_size = 12)


9. Key Findings & Recommendations

1. Overall Satisfaction

Most customers gave ratings between 1 and 2, suggesting generally high satisfaction (on a scale where 1 = best). The mean scores across all rating dimensions were low, indicating the store performs well overall.

2. Highest Satisfaction Area

The item with the lowest (best) mean score is Recommend (M = 1.32), suggesting this is the area customers are most pleased with.

3. Area for Improvement

The item with the highest (worst) mean score is other_shops (M = 2.59). This is the primary opportunity for targeted improvement.

4. Segmentation

K-means clustering identified 3 distinct customer segments. Each segment shows a distinct satisfaction profile. Stores can tailor communication, loyalty programs, and service improvements based on these profiles.

5. Demographic Insights

The majority of respondents fall in the 18–34 age group, with high-school and graduate-level education dominating. Gender representation is uneven; findings should be interpreted with this in mind given the small sample size (n = 22).


10. Limitations

  • Small sample size (n = 22) limits statistical power and generalizability.
  • Survey scale direction (whether 1 = agree/satisfied or disagree/unsatisfied) should be confirmed with the original questionnaire to ensure correct interpretation.
  • K-means assumes spherical clusters and is sensitive to scale; results should be validated with silhouette analysis or hierarchical clustering on larger samples.

Session Info

sessionInfo()
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3 factoextra_2.0.0   cluster_2.1.8.2    reshape2_1.4.5    
##  [5] knitr_1.51         scales_1.4.0       corrplot_0.95      psych_2.6.1       
##  [9] lubridate_1.9.5    forcats_1.0.1      stringr_1.6.0      dplyr_1.2.0       
## [13] purrr_1.2.1        readr_2.2.0        tidyr_1.3.2        tibble_3.3.1      
## [17] ggplot2_4.0.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10       generics_0.1.4    rstatix_0.7.3     stringi_1.8.7    
##  [5] lattice_0.22-9    hms_1.1.4         digest_0.6.39     magrittr_2.0.4   
##  [9] timechange_0.4.0  evaluate_1.0.5    grid_4.5.3        fastmap_1.2.0    
## [13] plyr_1.8.9        jsonlite_2.0.0    ggrepel_0.9.8     backports_1.5.0  
## [17] Formula_1.2-5     jquerylib_0.1.4   abind_1.4-8       mnormt_2.1.2     
## [21] cli_3.6.5         rlang_1.1.7       withr_3.0.2       cachem_1.1.0     
## [25] yaml_2.3.12       otel_0.2.0        tools_4.5.3       parallel_4.5.3   
## [29] tzdb_0.5.0        ggsignif_0.6.4    ggpubr_0.6.3      broom_1.0.12     
## [33] vctrs_0.7.2       R6_2.6.1          lifecycle_1.0.5   car_3.1-5        
## [37] pkgconfig_2.0.3   pillar_1.11.1     bslib_0.10.0      gtable_0.3.6     
## [41] glue_1.8.0        Rcpp_1.1.1        xfun_0.57         tidyselect_1.2.1 
## [45] rstudioapi_0.18.0 farver_2.1.2      htmltools_0.5.9   nlme_3.1-168     
## [49] ggsci_4.2.0       carData_3.0-6     labeling_0.4.3    rmarkdown_2.30   
## [53] compiler_4.5.3    S7_0.2.1