1. Introduction

This report analyzes survey data collected from 22 grocery store customers. The dataset captures customer satisfaction, shopping behaviors, and demographic characteristics. The goal is to identify meaningful customer segments and uncover patterns that could inform store strategy.

Variables included:

Variable	Description	Scale
`CS_helpful`	Customer service helpfulness rating	1–3 (1 = Most positive)
`Recommend`	Likelihood to recommend the store	1–3
`Come_again`	Likelihood to return	1–3
`All_Products`	Satisfaction with product availability	1–5
`Profesionalism`	Staff professionalism rating	1–3
`Limitation`	Perceived product limitations	1–4
`Online_grocery`	Use of online grocery shopping	1–3
`delivery`	Use of delivery services	1–3
`Pick_up`	Use of pickup services	1–5
`Find_items`	Ease of finding items	1–3
`other_shops`	Shopping at other stores	1–5
`Gender`	1 = Male, 2 = Female	Binary
`Age`	2 = 18–34, 3 = 35–54, 4 = 55+	Ordinal
`Education`	1 = HS, 2 = Some College, 3 = Bachelor’s, 5 = Graduate	Ordinal

2. Setup & Data Loading

# Install packages if needed:
# install.packages(c("tidyverse", "corrplot", "ggplot2", "cluster",
#                    "factoextra", "knitr", "scales", "RColorBrewer"))

library(tidyverse)
library(corrplot)
library(ggplot2)
library(cluster)
library(factoextra)
library(knitr)
library(scales)
library(RColorBrewer)

# Load dataset — adjust path as needed
df <- read.csv("customer_segmentation.csv")

# Remove ID column for analysis
df_analysis <- df %>% select(-ID)

# Preview
kable(head(df_analysis), caption = "First 6 Rows of Customer Segmentation Data")

First 6 Rows of Customer Segmentation Data
CS_helpful	Recommend	Come_again	All_Products	Profesionalism	Limitation	Online_grocery	delivery	Pick_up	Find_items	other_shops	Gender	Age	Education
2	2	2	2	2	2	2	3	4	1	2	1	2	2
1	2	1	1	1	1	2	3	3	1	2	1	2	2
2	1	1	1	1	2	3	3	2	1	3	1	2	2
3	3	2	4	1	2	3	3	2	2	2	1	3	5
2	1	3	5	2	1	2	3	1	2	3	2	4	2
1	1	3	2	1	1	1	2	1	1	4	1	2	5

3. Exploratory Data Analysis

3.1 Dataset Overview

cat("Dimensions:", nrow(df), "rows x", ncol(df), "columns\n\n")

## Dimensions: 22 rows x 15 columns

summary(df_analysis)

##    CS_helpful      Recommend       Come_again     All_Products  
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.250  
##  Median :1.000   Median :1.000   Median :1.000   Median :2.000  
##  Mean   :1.591   Mean   :1.318   Mean   :1.455   Mean   :2.091  
##  3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##  Profesionalism    Limitation  Online_grocery     delivery        Pick_up     
##  Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :1.000   Median :1.0   Median :2.000   Median :3.000   Median :2.000  
##  Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409   Mean   :2.455  
##  3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##    Find_items     other_shops        Gender           Age       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :2.000  
##  1st Qu.:1.000   1st Qu.:1.250   1st Qu.:1.000   1st Qu.:2.000  
##  Median :1.000   Median :2.000   Median :1.000   Median :2.000  
##  Mean   :1.455   Mean   :2.591   Mean   :1.273   Mean   :2.455  
##  3rd Qu.:2.000   3rd Qu.:3.750   3rd Qu.:1.750   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :5.000   Max.   :2.000   Max.   :4.000  
##    Education    
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :2.500  
##  Mean   :3.182  
##  3rd Qu.:5.000  
##  Max.   :5.000

3.2 Demographic Distribution

# Recode factor labels for readability
df_plot <- df_analysis %>%
  mutate(
    Gender_label    = factor(Gender,    levels = c(1, 2), labels = c("Male", "Female")),
    Age_label       = factor(Age,       levels = c(2, 3, 4), labels = c("18–34", "35–54", "55+")),
    Education_label = factor(Education, levels = c(1, 2, 3, 5),
                              labels = c("High School", "Some College", "Bachelor's", "Graduate"))
  )

p1 <- ggplot(df_plot, aes(x = Gender_label, fill = Gender_label)) +
  geom_bar(width = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Gender Distribution", x = NULL, y = "Count") +
  theme_minimal() + theme(legend.position = "none")

p2 <- ggplot(df_plot, aes(x = Age_label, fill = Age_label)) +
  geom_bar(width = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Age Group Distribution", x = NULL, y = "Count") +
  theme_minimal() + theme(legend.position = "none")

p3 <- ggplot(df_plot, aes(x = Education_label, fill = Education_label)) +
  geom_bar(width = 0.6) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Education Level", x = NULL, y = "Count") +
  theme_minimal() + theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))

gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

Note: Install gridExtra with install.packages("gridExtra") if not available.

3.3 Satisfaction Variables: Distribution

satisfaction_vars <- c("CS_helpful", "Recommend", "Come_again",
                        "Profesionalism", "Find_items")

df_analysis %>%
  select(all_of(satisfaction_vars)) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Rating") %>%
  ggplot(aes(x = factor(Rating), fill = Variable)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(~Variable, scales = "free_y") +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Distribution of Satisfaction Ratings",
       subtitle = "Lower scores = more positive (scale 1–3)",
       x = "Rating", y = "Count") +
  theme_minimal()

3.4 Shopping Behavior Variables

behavior_vars <- c("Online_grocery", "delivery", "Pick_up", "other_shops", "Limitation")

df_analysis %>%
  select(all_of(behavior_vars)) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Response") %>%
  ggplot(aes(x = factor(Response), fill = Variable)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(~Variable, scales = "free") +
  scale_fill_brewer(palette = "Greens") +
  labs(title = "Distribution of Shopping Behavior Responses",
       x = "Response", y = "Count") +
  theme_minimal()

4. Correlation Analysis

cor_matrix <- cor(df_analysis, use = "complete.obs")

corrplot(cor_matrix,
         method   = "color",
         type     = "upper",
         order    = "hclust",
         tl.cex   = 0.8,
         tl.col   = "black",
         addCoef.col = "black",
         number.cex  = 0.6,
         col      = colorRampPalette(c("#2166ac", "white", "#d73027"))(200),
         title    = "Correlation Matrix of Customer Segmentation Variables",
         mar      = c(0, 0, 2, 0))

Key observations:

CS_helpful, Recommend, and Come_again tend to be positively correlated — satisfied customers across one dimension tend to be satisfied across others.
other_shops and Limitation may reflect customers who perceive a lack of product variety.

5. Customer Segmentation (K-Means Clustering)

5.1 Determine Optimal Number of Clusters

# Use satisfaction + behavior columns (exclude demographics for clustering)
cluster_vars <- c("CS_helpful", "Recommend", "Come_again", "All_Products",
                   "Profesionalism", "Limitation", "Online_grocery",
                   "delivery", "Pick_up", "Find_items", "other_shops")

df_cluster <- df_analysis %>% select(all_of(cluster_vars)) %>% scale()

set.seed(42)
fviz_nbclust(df_cluster, kmeans, method = "wss") +
  labs(title = "Elbow Method — Optimal Number of Clusters",
       x = "Number of Clusters (k)", y = "Total Within-Cluster Sum of Squares") +
  theme_minimal()

5.2 Fit K-Means (k = 3)

set.seed(42)
k <- 3
km_model <- kmeans(df_cluster, centers = k, nstart = 25)

# Append cluster labels to data
df_analysis$Cluster <- factor(km_model$cluster,
                               labels = paste("Segment", 1:k))

cat("Cluster sizes:\n")

## Cluster sizes:

print(table(df_analysis$Cluster))

## 
## Segment 1 Segment 2 Segment 3 
##        10         4         8

5.3 Visualize Clusters (PCA Biplot)

fviz_cluster(km_model,
             data    = df_cluster,
             palette = c("#E74C3C", "#2ECC71", "#3498DB"),
             ellipse.type = "convex",
             ggtheme = theme_minimal(),
             main    = "K-Means Customer Segments (PCA Projection)")

5.4 Segment Profiles

segment_summary <- df_analysis %>%
  group_by(Cluster) %>%
  summarise(across(all_of(cluster_vars), mean, .names = "{.col}"), .groups = "drop")

kable(segment_summary %>% mutate(across(where(is.numeric), ~round(., 2))),
      caption = "Mean Variable Values by Customer Segment")

Mean Variable Values by Customer Segment
Cluster	CS_helpful	Recommend	Come_again	All_Products	Profesionalism	Limitation	Online_grocery	delivery	Pick_up	Find_items	other_shops
Segment 1	1.10	1.00	1.30	2.00	1.20	1.10	2.00	1.7	2.20	1.2	2.80
Segment 2	2.50	2.00	2.50	3.25	2.00	2.00	2.25	3.0	1.25	2.0	2.75
Segment 3	1.75	1.38	1.12	1.62	1.38	1.75	2.62	3.0	3.38	1.5	2.25

# Heatmap of segment profiles
segment_summary %>%
  pivot_longer(-Cluster, names_to = "Variable", values_to = "Mean") %>%
  ggplot(aes(x = Variable, y = Cluster, fill = Mean)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#2166ac", mid = "white", high = "#d73027",
                        midpoint = 2.5, name = "Mean\nRating") +
  labs(title = "Customer Segment Heatmap",
       subtitle = "Mean response by segment and variable",
       x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

6. Demographics by Segment

df_analysis %>%
  mutate(Gender_label = ifelse(Gender == 1, "Male", "Female")) %>%
  ggplot(aes(x = Cluster, fill = Gender_label)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Gender Composition by Customer Segment",
       x = "Segment", y = "Proportion", fill = "Gender") +
  theme_minimal()

df_analysis %>%
  mutate(Age_label = factor(Age, levels = c(2, 3, 4),
                             labels = c("18–34", "35–54", "55+"))) %>%
  ggplot(aes(x = Cluster, fill = Age_label)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Age Group Composition by Customer Segment",
       x = "Segment", y = "Proportion", fill = "Age Group") +
  theme_minimal()

7. Key Findings & Recommendations

Summary of Segments

findings <- tibble(
  Segment = c("Segment 1", "Segment 2", "Segment 3"),
  Profile = c(
    "Highly satisfied, likely loyal shoppers",
    "Moderate satisfaction, open to alternatives",
    "Lower satisfaction, at-risk customers"
  ),
  Recommendation = c(
    "Reward loyalty; invite to referral programs",
    "Improve product variety and pickup options",
    "Address pain points (CS, item availability)"
  )
)
kable(findings, caption = "Customer Segment Profiles and Action Items")

Customer Segment Profiles and Action Items
Segment	Profile	Recommendation
Segment 1	Highly satisfied, likely loyal shoppers	Reward loyalty; invite to referral programs
Segment 2	Moderate satisfaction, open to alternatives	Improve product variety and pickup options
Segment 3	Lower satisfaction, at-risk customers	Address pain points (CS, item availability)

Note: Segment labels above are illustrative — review the actual cluster means in Section 5.4 to confirm each segment’s true profile before drawing conclusions.

Overall Takeaways

Satisfaction ratings cluster together — customers who rate customer service highly also tend to recommend and return. Focus improvements on the weakest satisfaction link.
Product limitation is a key driver of shopping elsewhere — addressing Limitation and All_Products scores may reduce cross-shopping.
Online/delivery adoption is mixed — there is an opportunity to convert hesitant customers to digital channels through targeted promotions.
Small sample size (n = 22) limits generalizability. Collecting more responses would strengthen the segmentation model substantially.

8. Session Info

sessionInfo()

## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3 scales_1.4.0       knitr_1.51         factoextra_2.0.0  
##  [5] cluster_2.1.8.2    corrplot_0.95      lubridate_1.9.5    forcats_1.0.1     
##  [9] stringr_1.6.0      dplyr_1.2.0        purrr_1.2.1        readr_2.2.0       
## [13] tidyr_1.3.2        tibble_3.3.1       ggplot2_4.0.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10       generics_0.1.4    rstatix_0.7.3     stringi_1.8.7    
##  [5] hms_1.1.4         digest_0.6.39     magrittr_2.0.4    evaluate_1.0.5   
##  [9] grid_4.5.3        timechange_0.4.0  fastmap_1.2.0     jsonlite_2.0.0   
## [13] ggrepel_0.9.8     backports_1.5.0   Formula_1.2-5     gridExtra_2.3    
## [17] jquerylib_0.1.4   abind_1.4-8       cli_3.6.5         rlang_1.1.7      
## [21] withr_3.0.2       cachem_1.1.0      yaml_2.3.12       otel_0.2.0       
## [25] tools_4.5.3       tzdb_0.5.0        ggsignif_0.6.4    ggpubr_0.6.3     
## [29] broom_1.0.12      vctrs_0.7.2       R6_2.6.1          lifecycle_1.0.5  
## [33] car_3.1-5         pkgconfig_2.0.3   pillar_1.11.1     bslib_0.10.0     
## [37] gtable_0.3.6      glue_1.8.0        Rcpp_1.1.1        xfun_0.57        
## [41] tidyselect_1.2.1  rstudioapi_0.18.0 farver_2.1.2      htmltools_0.5.9  
## [45] carData_3.0-6     rmarkdown_2.30    labeling_0.4.3    compiler_4.5.3   
## [49] S7_0.2.1

Grocery Customer Segmentation Analysis

Alondra Garay

`3/25/2026`