1. Introduction

This report analyzes survey data collected from 22 grocery store customers. The dataset captures customer satisfaction, shopping behaviors, and demographic characteristics. The goal is to identify meaningful customer segments and uncover patterns that could inform store strategy.

Variables included:

Variable Description Scale
CS_helpful Customer service helpfulness rating 1–3 (1 = Most positive)
Recommend Likelihood to recommend the store 1–3
Come_again Likelihood to return 1–3
All_Products Satisfaction with product availability 1–5
Profesionalism Staff professionalism rating 1–3
Limitation Perceived product limitations 1–4
Online_grocery Use of online grocery shopping 1–3
delivery Use of delivery services 1–3
Pick_up Use of pickup services 1–5
Find_items Ease of finding items 1–3
other_shops Shopping at other stores 1–5
Gender 1 = Male, 2 = Female Binary
Age 2 = 18–34, 3 = 35–54, 4 = 55+ Ordinal
Education 1 = HS, 2 = Some College, 3 = Bachelor’s, 5 = Graduate Ordinal

2. Setup & Data Loading

# Install packages if needed:
# install.packages(c("tidyverse", "corrplot", "ggplot2", "cluster",
#                    "factoextra", "knitr", "scales", "RColorBrewer"))

library(tidyverse)
library(corrplot)
library(ggplot2)
library(cluster)
library(factoextra)
library(knitr)
library(scales)
library(RColorBrewer)
# Load dataset — adjust path as needed
df <- read.csv("customer_segmentation.csv")

# Remove ID column for analysis
df_analysis <- df %>% select(-ID)

# Preview
kable(head(df_analysis), caption = "First 6 Rows of Customer Segmentation Data")
First 6 Rows of Customer Segmentation Data
CS_helpful Recommend Come_again All_Products Profesionalism Limitation Online_grocery delivery Pick_up Find_items other_shops Gender Age Education
2 2 2 2 2 2 2 3 4 1 2 1 2 2
1 2 1 1 1 1 2 3 3 1 2 1 2 2
2 1 1 1 1 2 3 3 2 1 3 1 2 2
3 3 2 4 1 2 3 3 2 2 2 1 3 5
2 1 3 5 2 1 2 3 1 2 3 2 4 2
1 1 3 2 1 1 1 2 1 1 4 1 2 5

3. Exploratory Data Analysis

3.1 Dataset Overview

cat("Dimensions:", nrow(df), "rows x", ncol(df), "columns\n\n")
## Dimensions: 22 rows x 15 columns
summary(df_analysis)
##    CS_helpful      Recommend       Come_again     All_Products  
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.250  
##  Median :1.000   Median :1.000   Median :1.000   Median :2.000  
##  Mean   :1.591   Mean   :1.318   Mean   :1.455   Mean   :2.091  
##  3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##  Profesionalism    Limitation  Online_grocery     delivery        Pick_up     
##  Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :1.000   Median :1.0   Median :2.000   Median :3.000   Median :2.000  
##  Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409   Mean   :2.455  
##  3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##    Find_items     other_shops        Gender           Age       
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :2.000  
##  1st Qu.:1.000   1st Qu.:1.250   1st Qu.:1.000   1st Qu.:2.000  
##  Median :1.000   Median :2.000   Median :1.000   Median :2.000  
##  Mean   :1.455   Mean   :2.591   Mean   :1.273   Mean   :2.455  
##  3rd Qu.:2.000   3rd Qu.:3.750   3rd Qu.:1.750   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :5.000   Max.   :2.000   Max.   :4.000  
##    Education    
##  Min.   :1.000  
##  1st Qu.:2.000  
##  Median :2.500  
##  Mean   :3.182  
##  3rd Qu.:5.000  
##  Max.   :5.000

3.2 Demographic Distribution

# Recode factor labels for readability
df_plot <- df_analysis %>%
  mutate(
    Gender_label    = factor(Gender,    levels = c(1, 2), labels = c("Male", "Female")),
    Age_label       = factor(Age,       levels = c(2, 3, 4), labels = c("18–34", "35–54", "55+")),
    Education_label = factor(Education, levels = c(1, 2, 3, 5),
                              labels = c("High School", "Some College", "Bachelor's", "Graduate"))
  )

p1 <- ggplot(df_plot, aes(x = Gender_label, fill = Gender_label)) +
  geom_bar(width = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Gender Distribution", x = NULL, y = "Count") +
  theme_minimal() + theme(legend.position = "none")

p2 <- ggplot(df_plot, aes(x = Age_label, fill = Age_label)) +
  geom_bar(width = 0.5) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Age Group Distribution", x = NULL, y = "Count") +
  theme_minimal() + theme(legend.position = "none")

p3 <- ggplot(df_plot, aes(x = Education_label, fill = Education_label)) +
  geom_bar(width = 0.6) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Education Level", x = NULL, y = "Count") +
  theme_minimal() + theme(legend.position = "none") +
  theme(axis.text.x = element_text(angle = 15, hjust = 1))

gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

Note: Install gridExtra with install.packages("gridExtra") if not available.

3.3 Satisfaction Variables: Distribution

satisfaction_vars <- c("CS_helpful", "Recommend", "Come_again",
                        "Profesionalism", "Find_items")

df_analysis %>%
  select(all_of(satisfaction_vars)) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Rating") %>%
  ggplot(aes(x = factor(Rating), fill = Variable)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(~Variable, scales = "free_y") +
  scale_fill_brewer(palette = "Blues") +
  labs(title = "Distribution of Satisfaction Ratings",
       subtitle = "Lower scores = more positive (scale 1–3)",
       x = "Rating", y = "Count") +
  theme_minimal()

3.4 Shopping Behavior Variables

behavior_vars <- c("Online_grocery", "delivery", "Pick_up", "other_shops", "Limitation")

df_analysis %>%
  select(all_of(behavior_vars)) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "Response") %>%
  ggplot(aes(x = factor(Response), fill = Variable)) +
  geom_bar(show.legend = FALSE) +
  facet_wrap(~Variable, scales = "free") +
  scale_fill_brewer(palette = "Greens") +
  labs(title = "Distribution of Shopping Behavior Responses",
       x = "Response", y = "Count") +
  theme_minimal()


4. Correlation Analysis

cor_matrix <- cor(df_analysis, use = "complete.obs")

corrplot(cor_matrix,
         method   = "color",
         type     = "upper",
         order    = "hclust",
         tl.cex   = 0.8,
         tl.col   = "black",
         addCoef.col = "black",
         number.cex  = 0.6,
         col      = colorRampPalette(c("#2166ac", "white", "#d73027"))(200),
         title    = "Correlation Matrix of Customer Segmentation Variables",
         mar      = c(0, 0, 2, 0))

Key observations:

  • CS_helpful, Recommend, and Come_again tend to be positively correlated — satisfied customers across one dimension tend to be satisfied across others.
  • other_shops and Limitation may reflect customers who perceive a lack of product variety.

5. Customer Segmentation (K-Means Clustering)

5.1 Determine Optimal Number of Clusters

# Use satisfaction + behavior columns (exclude demographics for clustering)
cluster_vars <- c("CS_helpful", "Recommend", "Come_again", "All_Products",
                   "Profesionalism", "Limitation", "Online_grocery",
                   "delivery", "Pick_up", "Find_items", "other_shops")

df_cluster <- df_analysis %>% select(all_of(cluster_vars)) %>% scale()

set.seed(42)
fviz_nbclust(df_cluster, kmeans, method = "wss") +
  labs(title = "Elbow Method — Optimal Number of Clusters",
       x = "Number of Clusters (k)", y = "Total Within-Cluster Sum of Squares") +
  theme_minimal()

5.2 Fit K-Means (k = 3)

set.seed(42)
k <- 3
km_model <- kmeans(df_cluster, centers = k, nstart = 25)

# Append cluster labels to data
df_analysis$Cluster <- factor(km_model$cluster,
                               labels = paste("Segment", 1:k))

cat("Cluster sizes:\n")
## Cluster sizes:
print(table(df_analysis$Cluster))
## 
## Segment 1 Segment 2 Segment 3 
##        10         4         8

5.3 Visualize Clusters (PCA Biplot)

fviz_cluster(km_model,
             data    = df_cluster,
             palette = c("#E74C3C", "#2ECC71", "#3498DB"),
             ellipse.type = "convex",
             ggtheme = theme_minimal(),
             main    = "K-Means Customer Segments (PCA Projection)")

5.4 Segment Profiles

segment_summary <- df_analysis %>%
  group_by(Cluster) %>%
  summarise(across(all_of(cluster_vars), mean, .names = "{.col}"), .groups = "drop")

kable(segment_summary %>% mutate(across(where(is.numeric), ~round(., 2))),
      caption = "Mean Variable Values by Customer Segment")
Mean Variable Values by Customer Segment
Cluster CS_helpful Recommend Come_again All_Products Profesionalism Limitation Online_grocery delivery Pick_up Find_items other_shops
Segment 1 1.10 1.00 1.30 2.00 1.20 1.10 2.00 1.7 2.20 1.2 2.80
Segment 2 2.50 2.00 2.50 3.25 2.00 2.00 2.25 3.0 1.25 2.0 2.75
Segment 3 1.75 1.38 1.12 1.62 1.38 1.75 2.62 3.0 3.38 1.5 2.25
# Heatmap of segment profiles
segment_summary %>%
  pivot_longer(-Cluster, names_to = "Variable", values_to = "Mean") %>%
  ggplot(aes(x = Variable, y = Cluster, fill = Mean)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#2166ac", mid = "white", high = "#d73027",
                        midpoint = 2.5, name = "Mean\nRating") +
  labs(title = "Customer Segment Heatmap",
       subtitle = "Mean response by segment and variable",
       x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))


6. Demographics by Segment

df_analysis %>%
  mutate(Gender_label = ifelse(Gender == 1, "Male", "Female")) %>%
  ggplot(aes(x = Cluster, fill = Gender_label)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Gender Composition by Customer Segment",
       x = "Segment", y = "Proportion", fill = "Gender") +
  theme_minimal()

df_analysis %>%
  mutate(Age_label = factor(Age, levels = c(2, 3, 4),
                             labels = c("18–34", "35–54", "55+"))) %>%
  ggplot(aes(x = Cluster, fill = Age_label)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = percent) +
  scale_fill_brewer(palette = "Pastel1") +
  labs(title = "Age Group Composition by Customer Segment",
       x = "Segment", y = "Proportion", fill = "Age Group") +
  theme_minimal()


7. Key Findings & Recommendations

Summary of Segments

findings <- tibble(
  Segment = c("Segment 1", "Segment 2", "Segment 3"),
  Profile = c(
    "Highly satisfied, likely loyal shoppers",
    "Moderate satisfaction, open to alternatives",
    "Lower satisfaction, at-risk customers"
  ),
  Recommendation = c(
    "Reward loyalty; invite to referral programs",
    "Improve product variety and pickup options",
    "Address pain points (CS, item availability)"
  )
)
kable(findings, caption = "Customer Segment Profiles and Action Items")
Customer Segment Profiles and Action Items
Segment Profile Recommendation
Segment 1 Highly satisfied, likely loyal shoppers Reward loyalty; invite to referral programs
Segment 2 Moderate satisfaction, open to alternatives Improve product variety and pickup options
Segment 3 Lower satisfaction, at-risk customers Address pain points (CS, item availability)

Note: Segment labels above are illustrative — review the actual cluster means in Section 5.4 to confirm each segment’s true profile before drawing conclusions.

Overall Takeaways

  1. Satisfaction ratings cluster together — customers who rate customer service highly also tend to recommend and return. Focus improvements on the weakest satisfaction link.
  2. Product limitation is a key driver of shopping elsewhere — addressing Limitation and All_Products scores may reduce cross-shopping.
  3. Online/delivery adoption is mixed — there is an opportunity to convert hesitant customers to digital channels through targeted promotions.
  4. Small sample size (n = 22) limits generalizability. Collecting more responses would strengthen the segmentation model substantially.

8. Session Info

sessionInfo()
## R version 4.5.3 (2026-03-11 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 22631)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] RColorBrewer_1.1-3 scales_1.4.0       knitr_1.51         factoextra_2.0.0  
##  [5] cluster_2.1.8.2    corrplot_0.95      lubridate_1.9.5    forcats_1.0.1     
##  [9] stringr_1.6.0      dplyr_1.2.0        purrr_1.2.1        readr_2.2.0       
## [13] tidyr_1.3.2        tibble_3.3.1       ggplot2_4.0.2      tidyverse_2.0.0   
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10       generics_0.1.4    rstatix_0.7.3     stringi_1.8.7    
##  [5] hms_1.1.4         digest_0.6.39     magrittr_2.0.4    evaluate_1.0.5   
##  [9] grid_4.5.3        timechange_0.4.0  fastmap_1.2.0     jsonlite_2.0.0   
## [13] ggrepel_0.9.8     backports_1.5.0   Formula_1.2-5     gridExtra_2.3    
## [17] jquerylib_0.1.4   abind_1.4-8       cli_3.6.5         rlang_1.1.7      
## [21] withr_3.0.2       cachem_1.1.0      yaml_2.3.12       otel_0.2.0       
## [25] tools_4.5.3       tzdb_0.5.0        ggsignif_0.6.4    ggpubr_0.6.3     
## [29] broom_1.0.12      vctrs_0.7.2       R6_2.6.1          lifecycle_1.0.5  
## [33] car_3.1-5         pkgconfig_2.0.3   pillar_1.11.1     bslib_0.10.0     
## [37] gtable_0.3.6      glue_1.8.0        Rcpp_1.1.1        xfun_0.57        
## [41] tidyselect_1.2.1  rstudioapi_0.18.0 farver_2.1.2      htmltools_0.5.9  
## [45] carData_3.0-6     rmarkdown_2.30    labeling_0.4.3    compiler_4.5.3   
## [49] S7_0.2.1