1 Overview

This report analyzes a grocery store customer segmentation survey with 22 respondents across 15 variables. The survey captures customer satisfaction ratings (on a 1–5 scale), shopping preferences, and demographic attributes. The goal is to understand customer behavior patterns and identify potential segments.


2 Setup: Load Libraries & Data

# Auto-install any missing packages
required_packages <- c("tidyverse", "ggplot2", "reshape2", "corrplot", "scales", "cluster")

installed <- rownames(installed.packages())
to_install <- required_packages[!required_packages %in% installed]
if (length(to_install) > 0) {
  install.packages(to_install, repos = "https://cloud.r-project.org")
}

library(tidyverse)
library(ggplot2)
library(reshape2)
library(corrplot)
library(scales)
library(cluster)

# Load the dataset
df <- read.csv("customer_segmentation.csv")

# Preview the data
head(df)
##   ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## 1  1          2         2          2            2              2          2
## 2  2          1         2          1            1              1          1
## 3  3          2         1          1            1              1          2
## 4  4          3         3          2            4              1          2
## 5  5          2         1          3            5              2          1
## 6  6          1         1          3            2              1          1
##   Online_grocery delivery Pick_up Find_items other_shops Gender Age Education
## 1              2        3       4          1           2      1   2         2
## 2              2        3       3          1           2      1   2         2
## 3              3        3       2          1           3      1   2         2
## 4              3        3       2          2           2      1   3         5
## 5              2        3       1          2           3      2   4         2
## 6              1        2       1          1           4      1   2         5
# Check structure and dimensions
glimpse(df)
## Rows: 22
## Columns: 15
## $ ID             <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ CS_helpful     <int> 2, 1, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3…
## $ Recommend      <int> 2, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Come_again     <int> 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2…
## $ All_Products   <int> 2, 1, 1, 4, 5, 2, 2, 2, 2, 1, 2, 2, 1, 2, 4, 2, 2, 1, 3…
## $ Profesionalism <int> 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2…
## $ Limitation     <int> 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4…
## $ Online_grocery <int> 2, 2, 3, 3, 2, 1, 2, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 1…
## $ delivery       <int> 3, 3, 3, 3, 3, 2, 2, 1, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 3…
## $ Pick_up        <int> 4, 3, 2, 2, 1, 1, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 5, 3, 1…
## $ Find_items     <int> 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 3…
## $ other_shops    <int> 2, 2, 3, 2, 3, 4, 1, 4, 1, 1, 3, 3, 1, 1, 5, 5, 5, 2, 2…
## $ Gender         <int> 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2…
## $ Age            <int> 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 4, 3, 4, 3, 2, 3, 2, 2, 2…
## $ Education      <int> 2, 2, 2, 5, 2, 5, 3, 2, 1, 2, 5, 1, 5, 5, 5, 5, 1, 5, 2…
cat("\nDimensions:", nrow(df), "rows x", ncol(df), "columns\n")
## 
## Dimensions: 22 rows x 15 columns

3 Data Preparation

The categorical variables use numeric codes. We decode them here for readability.

# Decode Gender: 1 = Male, 2 = Female
df$Gender_Label <- ifelse(df$Gender == 1, "Male", "Female")

# Decode Age groups (assumed ordinal bands)
df$Age_Label <- factor(df$Age,
  levels = 1:5,
  labels = c("Under 18", "18–30", "31–45", "46–60", "60+")
)

# Decode Education levels
df$Education_Label <- factor(df$Education,
  levels = 1:5,
  labels = c("No formal education", "High School", "Some College",
             "Bachelor's Degree", "Graduate Degree")
)

# Shopping channel preference columns
shopping_cols <- c("CS_helpful", "Recommend", "Come_again",
                   "All_Products", "Profesionalism", "Limitation",
                   "Online_grocery", "delivery", "Pick_up",
                   "Find_items", "other_shops")

4 Descriptive Statistics

df %>%
  select(all_of(shopping_cols)) %>%
  summary()
##    CS_helpful      Recommend       Come_again     All_Products  
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.250  
##  Median :1.000   Median :1.000   Median :1.000   Median :2.000  
##  Mean   :1.591   Mean   :1.318   Mean   :1.455   Mean   :2.091  
##  3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##  Profesionalism    Limitation  Online_grocery     delivery        Pick_up     
##  Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :1.000   Median :1.0   Median :2.000   Median :3.000   Median :2.000  
##  Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409   Mean   :2.455  
##  3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##    Find_items     other_shops   
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.250  
##  Median :1.000   Median :2.000  
##  Mean   :1.455   Mean   :2.591  
##  3rd Qu.:2.000   3rd Qu.:3.750  
##  Max.   :3.000   Max.   :5.000
# Mean satisfaction scores per question
mean_scores <- df %>%
  select(all_of(shopping_cols)) %>%
  summarise(across(everything(), mean)) %>%
  pivot_longer(everything(), names_to = "Question", values_to = "Mean_Score") %>%
  arrange(desc(Mean_Score))

mean_scores
## # A tibble: 11 × 2
##    Question       Mean_Score
##    <chr>               <dbl>
##  1 other_shops          2.59
##  2 Pick_up              2.45
##  3 delivery             2.41
##  4 Online_grocery       2.27
##  5 All_Products         2.09
##  6 CS_helpful           1.59
##  7 Limitation           1.5 
##  8 Come_again           1.45
##  9 Find_items           1.45
## 10 Profesionalism       1.41
## 11 Recommend            1.32

5 Demographics

5.1 Gender Distribution

df %>%
  count(Gender_Label) %>%
  ggplot(aes(x = Gender_Label, y = n, fill = Gender_Label)) +
  geom_col(width = 0.5) +
  geom_text(aes(label = n), vjust = -0.5, fontface = "bold") +
  scale_fill_manual(values = c("Male" = "#4C72B0", "Female" = "#DD8452")) +
  labs(
    title = "Gender Distribution of Respondents",
    x = "Gender", y = "Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

5.2 Age Distribution

df %>%
  count(Age_Label) %>%
  ggplot(aes(x = Age_Label, y = n, fill = Age_Label)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = n), vjust = -0.5, fontface = "bold") +
  scale_fill_brewer(palette = "Blues") +
  labs(
    title = "Age Group Distribution",
    x = "Age Group", y = "Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

5.3 Education Distribution

df %>%
  count(Education_Label) %>%
  ggplot(aes(x = reorder(Education_Label, n), y = n, fill = Education_Label)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = n), hjust = -0.2, fontface = "bold") +
  coord_flip() +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Education Level of Respondents",
    x = NULL, y = "Count"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")


6 Satisfaction & Preference Ratings

6.1 Average Rating per Survey Item

mean_scores %>%
  ggplot(aes(x = reorder(Question, Mean_Score), y = Mean_Score, fill = Mean_Score)) +
  geom_col() +
  geom_text(aes(label = round(Mean_Score, 2)), hjust = -0.2, size = 3.5) +
  coord_flip() +
  scale_fill_gradient(low = "#f7cac9", high = "#355c7d") +
  scale_y_continuous(limits = c(0, 4)) +
  labs(
    title = "Mean Satisfaction Score per Survey Item",
    x = NULL, y = "Mean Score (1–5 scale)"
  ) +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

6.2 Distribution of Key Satisfaction Metrics

df %>%
  select(CS_helpful, Recommend, Come_again, Profesionalism) %>%
  pivot_longer(everything(), names_to = "Metric", values_to = "Score") %>%
  ggplot(aes(x = factor(Score), fill = Metric)) +
  geom_bar(position = "dodge") +
  facet_wrap(~Metric, scales = "free_y") +
  scale_fill_brewer(palette = "Pastel1") +
  labs(
    title = "Score Distribution: Core Satisfaction Metrics",
    x = "Score", y = "Count"
  ) +
  theme_minimal(base_size = 12) +
  theme(legend.position = "none")

6.3 Shopping Channel Preferences

df %>%
  select(Online_grocery, delivery, Pick_up) %>%
  pivot_longer(everything(), names_to = "Channel", values_to = "Score") %>%
  group_by(Channel, Score) %>%
  summarise(n = n(), .groups = "drop") %>%
  ggplot(aes(x = factor(Score), y = n, fill = Channel)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#2ecc71", "#3498db", "#9b59b6")) +
  labs(
    title = "Shopping Channel Preference Scores",
    subtitle = "Online Grocery vs. Delivery vs. Pick-Up",
    x = "Score (1 = Low, 5 = High)", y = "Count"
  ) +
  theme_minimal(base_size = 13)


7 Correlation Analysis

cor_matrix <- df %>%
  select(all_of(shopping_cols)) %>%
  cor()

corrplot(cor_matrix,
  method = "color",
  type = "upper",
  order = "hclust",
  tl.cex = 0.85,
  addCoef.col = "black",
  number.cex = 0.65,
  col = colorRampPalette(c("#d73027", "white", "#4575b4"))(200),
  title = "Correlation Matrix of Survey Items",
  mar = c(0, 0, 2, 0)
)


8 Demographic Comparisons

8.1 Satisfaction by Gender

df %>%
  group_by(Gender_Label) %>%
  summarise(
    CS_helpful   = mean(CS_helpful),
    Recommend    = mean(Recommend),
    Come_again   = mean(Come_again),
    Profesionalism = mean(Profesionalism),
    .groups = "drop"
  ) %>%
  pivot_longer(-Gender_Label, names_to = "Metric", values_to = "Avg_Score") %>%
  ggplot(aes(x = Metric, y = Avg_Score, fill = Gender_Label)) +
  geom_col(position = "dodge", width = 0.6) +
  scale_fill_manual(values = c("Male" = "#4C72B0", "Female" = "#DD8452")) +
  labs(
    title = "Average Satisfaction Scores by Gender",
    x = NULL, y = "Average Score", fill = "Gender"
  ) +
  theme_minimal(base_size = 13)

8.2 Channel Preference by Age Group

df %>%
  group_by(Age_Label) %>%
  summarise(
    Online = mean(Online_grocery),
    Delivery = mean(delivery),
    PickUp = mean(Pick_up),
    .groups = "drop"
  ) %>%
  pivot_longer(-Age_Label, names_to = "Channel", values_to = "Avg_Score") %>%
  ggplot(aes(x = Age_Label, y = Avg_Score, color = Channel, group = Channel)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  scale_color_manual(values = c("#2ecc71", "#3498db", "#9b59b6")) +
  labs(
    title = "Shopping Channel Preference by Age Group",
    x = "Age Group", y = "Average Score", color = "Channel"
  ) +
  theme_minimal(base_size = 13)


9 Customer Segmentation (K-Means Clustering)

We cluster customers based on their shopping preference scores to identify distinct behavioral segments.

9.1 Determine Optimal Number of Clusters

set.seed(42)
survey_data <- df %>% select(all_of(shopping_cols)) %>% scale()

# Compute total within-cluster sum of squares for k = 1 to 8
wss <- sapply(1:8, function(k) {
  kmeans(survey_data, centers = k, nstart = 25)$tot.withinss
})

elbow_df <- data.frame(k = 1:8, wss = wss)

ggplot(elbow_df, aes(x = k, y = wss)) +
  geom_line(color = "#3498DB", linewidth = 1.2) +
  geom_point(size = 3, color = "#E74C3C") +
  scale_x_continuous(breaks = 1:8) +
  labs(
    title = "Elbow Method: Optimal Number of Clusters",
    x = "Number of Clusters (k)",
    y = "Total Within-Cluster Sum of Squares"
  ) +
  theme_minimal(base_size = 13)

9.2 Apply K-Means (k = 3)

set.seed(42)
km <- kmeans(survey_data, centers = 3, nstart = 25)

df$Cluster <- factor(km$cluster, labels = c("Segment A", "Segment B", "Segment C"))

table(df$Cluster)
## 
## Segment A Segment B Segment C 
##        10         4         8

9.3 Cluster Visualization (PCA Projection)

# Project to 2D using PCA for visualization
pca <- prcomp(survey_data)
pca_df <- data.frame(
  PC1     = pca$x[, 1],
  PC2     = pca$x[, 2],
  Cluster = df$Cluster
)

ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
  geom_point(size = 3.5, alpha = 0.85) +
  stat_ellipse(geom = "polygon", alpha = 0.12, level = 0.8) +
  scale_color_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
  scale_fill_manual(values  = c("#E74C3C", "#2ECC71", "#3498DB")) +
  labs(
    title = "Customer Segments — PCA Projection (K-Means, k = 3)",
    x = "Principal Component 1",
    y = "Principal Component 2"
  ) +
  theme_minimal(base_size = 13)

9.4 Segment Profiles

df %>%
  group_by(Cluster) %>%
  summarise(across(all_of(shopping_cols), mean), .groups = "drop") %>%
  pivot_longer(-Cluster, names_to = "Feature", values_to = "Mean_Score") %>%
  ggplot(aes(x = Feature, y = Mean_Score, fill = Cluster)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
  coord_flip() +
  labs(
    title = "Average Score per Feature by Customer Segment",
    x = NULL, y = "Mean Score", fill = "Segment"
  ) +
  theme_minimal(base_size = 12)

9.5 Demographics by Segment

df %>%
  group_by(Cluster) %>%
  summarise(
    N = n(),
    Pct_Female = round(mean(Gender == 2) * 100, 1),
    Avg_Age_Group = round(mean(Age), 2),
    Avg_Education = round(mean(Education), 2),
    .groups = "drop"
  )
## # A tibble: 3 × 5
##   Cluster       N Pct_Female Avg_Age_Group Avg_Education
##   <fct>     <int>      <dbl>         <dbl>         <dbl>
## 1 Segment A    10         30          2.6            3.4
## 2 Segment B     4         75          2.75           3  
## 3 Segment C     8          0          2.12           3

10 Key Takeaways

  • Most respondents are aged 18–30 and hold either a High School diploma or Graduate Degree.
  • Customer Service helpfulness and Professionalism received among the highest average scores, indicating general satisfaction with staff.
  • Pick-up is a more consistently preferred channel compared to online grocery and delivery.
  • The correlation matrix reveals that CS_helpful, Recommend, and Come_again are closely interrelated — satisfied customers are more likely to recommend and return.
  • K-means clustering identifies three customer segments with distinct preference profiles, which can inform targeted marketing and service improvements.

Analysis generated with R Markdown. Dataset: customer_segmentation.csv (22 respondents, 15 variables).