This report analyzes a grocery store customer segmentation survey with 22 respondents across 15 variables. The survey captures customer satisfaction ratings (on a 1–5 scale), shopping preferences, and demographic attributes. The goal is to understand customer behavior patterns and identify potential segments.
# Auto-install any missing packages
required_packages <- c("tidyverse", "ggplot2", "reshape2", "corrplot", "scales", "cluster")
installed <- rownames(installed.packages())
to_install <- required_packages[!required_packages %in% installed]
if (length(to_install) > 0) {
install.packages(to_install, repos = "https://cloud.r-project.org")
}
library(tidyverse)
library(ggplot2)
library(reshape2)
library(corrplot)
library(scales)
library(cluster)
# Load the dataset
df <- read.csv("customer_segmentation.csv")
# Preview the data
head(df)## ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## 1 1 2 2 2 2 2 2
## 2 2 1 2 1 1 1 1
## 3 3 2 1 1 1 1 2
## 4 4 3 3 2 4 1 2
## 5 5 2 1 3 5 2 1
## 6 6 1 1 3 2 1 1
## Online_grocery delivery Pick_up Find_items other_shops Gender Age Education
## 1 2 3 4 1 2 1 2 2
## 2 2 3 3 1 2 1 2 2
## 3 3 3 2 1 3 1 2 2
## 4 3 3 2 2 2 1 3 5
## 5 2 3 1 2 3 2 4 2
## 6 1 2 1 1 4 1 2 5
## Rows: 22
## Columns: 15
## $ ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, …
## $ CS_helpful <int> 2, 1, 2, 3, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3…
## $ Recommend <int> 2, 2, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Come_again <int> 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2…
## $ All_Products <int> 2, 1, 1, 4, 5, 2, 2, 2, 2, 1, 2, 2, 1, 2, 4, 2, 2, 1, 3…
## $ Profesionalism <int> 2, 1, 1, 1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2…
## $ Limitation <int> 2, 1, 2, 2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, 4…
## $ Online_grocery <int> 2, 2, 3, 3, 2, 1, 2, 1, 2, 3, 2, 3, 1, 3, 2, 3, 2, 3, 1…
## $ delivery <int> 3, 3, 3, 3, 3, 2, 2, 1, 1, 2, 2, 2, 2, 3, 2, 1, 3, 3, 3…
## $ Pick_up <int> 4, 3, 2, 2, 1, 1, 2, 2, 3, 2, 2, 3, 2, 3, 2, 3, 5, 3, 1…
## $ Find_items <int> 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, 1, 1, 1, 3, 2, 1, 2, 1, 3…
## $ other_shops <int> 2, 2, 3, 2, 3, 4, 1, 4, 1, 1, 3, 3, 1, 1, 5, 5, 5, 2, 2…
## $ Gender <int> 1, 1, 1, 1, 2, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2…
## $ Age <int> 2, 2, 2, 3, 4, 2, 2, 2, 2, 2, 4, 3, 4, 3, 2, 3, 2, 2, 2…
## $ Education <int> 2, 2, 2, 5, 2, 5, 3, 2, 1, 2, 5, 1, 5, 5, 5, 5, 1, 5, 2…
##
## Dimensions: 22 rows x 15 columns
The categorical variables use numeric codes. We decode them here for readability.
# Decode Gender: 1 = Male, 2 = Female
df$Gender_Label <- ifelse(df$Gender == 1, "Male", "Female")
# Decode Age groups (assumed ordinal bands)
df$Age_Label <- factor(df$Age,
levels = 1:5,
labels = c("Under 18", "18–30", "31–45", "46–60", "60+")
)
# Decode Education levels
df$Education_Label <- factor(df$Education,
levels = 1:5,
labels = c("No formal education", "High School", "Some College",
"Bachelor's Degree", "Graduate Degree")
)
# Shopping channel preference columns
shopping_cols <- c("CS_helpful", "Recommend", "Come_again",
"All_Products", "Profesionalism", "Limitation",
"Online_grocery", "delivery", "Pick_up",
"Find_items", "other_shops")## CS_helpful Recommend Come_again All_Products
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.250
## Median :1.000 Median :1.000 Median :1.000 Median :2.000
## Mean :1.591 Mean :1.318 Mean :1.455 Mean :2.091
## 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :3.000 Max. :3.000 Max. :3.000 Max. :5.000
## Profesionalism Limitation Online_grocery delivery Pick_up
## Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :1.000 Median :1.0 Median :2.000 Median :3.000 Median :2.000
## Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409 Mean :2.455
## 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000 Max. :5.000
## Find_items other_shops
## Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.250
## Median :1.000 Median :2.000
## Mean :1.455 Mean :2.591
## 3rd Qu.:2.000 3rd Qu.:3.750
## Max. :3.000 Max. :5.000
# Mean satisfaction scores per question
mean_scores <- df %>%
select(all_of(shopping_cols)) %>%
summarise(across(everything(), mean)) %>%
pivot_longer(everything(), names_to = "Question", values_to = "Mean_Score") %>%
arrange(desc(Mean_Score))
mean_scores## # A tibble: 11 × 2
## Question Mean_Score
## <chr> <dbl>
## 1 other_shops 2.59
## 2 Pick_up 2.45
## 3 delivery 2.41
## 4 Online_grocery 2.27
## 5 All_Products 2.09
## 6 CS_helpful 1.59
## 7 Limitation 1.5
## 8 Come_again 1.45
## 9 Find_items 1.45
## 10 Profesionalism 1.41
## 11 Recommend 1.32
df %>%
count(Gender_Label) %>%
ggplot(aes(x = Gender_Label, y = n, fill = Gender_Label)) +
geom_col(width = 0.5) +
geom_text(aes(label = n), vjust = -0.5, fontface = "bold") +
scale_fill_manual(values = c("Male" = "#4C72B0", "Female" = "#DD8452")) +
labs(
title = "Gender Distribution of Respondents",
x = "Gender", y = "Count"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")df %>%
count(Age_Label) %>%
ggplot(aes(x = Age_Label, y = n, fill = Age_Label)) +
geom_col(width = 0.6) +
geom_text(aes(label = n), vjust = -0.5, fontface = "bold") +
scale_fill_brewer(palette = "Blues") +
labs(
title = "Age Group Distribution",
x = "Age Group", y = "Count"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")df %>%
count(Education_Label) %>%
ggplot(aes(x = reorder(Education_Label, n), y = n, fill = Education_Label)) +
geom_col(width = 0.6) +
geom_text(aes(label = n), hjust = -0.2, fontface = "bold") +
coord_flip() +
scale_fill_brewer(palette = "Set2") +
labs(
title = "Education Level of Respondents",
x = NULL, y = "Count"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")mean_scores %>%
ggplot(aes(x = reorder(Question, Mean_Score), y = Mean_Score, fill = Mean_Score)) +
geom_col() +
geom_text(aes(label = round(Mean_Score, 2)), hjust = -0.2, size = 3.5) +
coord_flip() +
scale_fill_gradient(low = "#f7cac9", high = "#355c7d") +
scale_y_continuous(limits = c(0, 4)) +
labs(
title = "Mean Satisfaction Score per Survey Item",
x = NULL, y = "Mean Score (1–5 scale)"
) +
theme_minimal(base_size = 13) +
theme(legend.position = "none")df %>%
select(CS_helpful, Recommend, Come_again, Profesionalism) %>%
pivot_longer(everything(), names_to = "Metric", values_to = "Score") %>%
ggplot(aes(x = factor(Score), fill = Metric)) +
geom_bar(position = "dodge") +
facet_wrap(~Metric, scales = "free_y") +
scale_fill_brewer(palette = "Pastel1") +
labs(
title = "Score Distribution: Core Satisfaction Metrics",
x = "Score", y = "Count"
) +
theme_minimal(base_size = 12) +
theme(legend.position = "none")df %>%
select(Online_grocery, delivery, Pick_up) %>%
pivot_longer(everything(), names_to = "Channel", values_to = "Score") %>%
group_by(Channel, Score) %>%
summarise(n = n(), .groups = "drop") %>%
ggplot(aes(x = factor(Score), y = n, fill = Channel)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("#2ecc71", "#3498db", "#9b59b6")) +
labs(
title = "Shopping Channel Preference Scores",
subtitle = "Online Grocery vs. Delivery vs. Pick-Up",
x = "Score (1 = Low, 5 = High)", y = "Count"
) +
theme_minimal(base_size = 13)cor_matrix <- df %>%
select(all_of(shopping_cols)) %>%
cor()
corrplot(cor_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.cex = 0.85,
addCoef.col = "black",
number.cex = 0.65,
col = colorRampPalette(c("#d73027", "white", "#4575b4"))(200),
title = "Correlation Matrix of Survey Items",
mar = c(0, 0, 2, 0)
)df %>%
group_by(Gender_Label) %>%
summarise(
CS_helpful = mean(CS_helpful),
Recommend = mean(Recommend),
Come_again = mean(Come_again),
Profesionalism = mean(Profesionalism),
.groups = "drop"
) %>%
pivot_longer(-Gender_Label, names_to = "Metric", values_to = "Avg_Score") %>%
ggplot(aes(x = Metric, y = Avg_Score, fill = Gender_Label)) +
geom_col(position = "dodge", width = 0.6) +
scale_fill_manual(values = c("Male" = "#4C72B0", "Female" = "#DD8452")) +
labs(
title = "Average Satisfaction Scores by Gender",
x = NULL, y = "Average Score", fill = "Gender"
) +
theme_minimal(base_size = 13)df %>%
group_by(Age_Label) %>%
summarise(
Online = mean(Online_grocery),
Delivery = mean(delivery),
PickUp = mean(Pick_up),
.groups = "drop"
) %>%
pivot_longer(-Age_Label, names_to = "Channel", values_to = "Avg_Score") %>%
ggplot(aes(x = Age_Label, y = Avg_Score, color = Channel, group = Channel)) +
geom_line(linewidth = 1.2) +
geom_point(size = 3) +
scale_color_manual(values = c("#2ecc71", "#3498db", "#9b59b6")) +
labs(
title = "Shopping Channel Preference by Age Group",
x = "Age Group", y = "Average Score", color = "Channel"
) +
theme_minimal(base_size = 13)We cluster customers based on their shopping preference scores to identify distinct behavioral segments.
set.seed(42)
survey_data <- df %>% select(all_of(shopping_cols)) %>% scale()
# Compute total within-cluster sum of squares for k = 1 to 8
wss <- sapply(1:8, function(k) {
kmeans(survey_data, centers = k, nstart = 25)$tot.withinss
})
elbow_df <- data.frame(k = 1:8, wss = wss)
ggplot(elbow_df, aes(x = k, y = wss)) +
geom_line(color = "#3498DB", linewidth = 1.2) +
geom_point(size = 3, color = "#E74C3C") +
scale_x_continuous(breaks = 1:8) +
labs(
title = "Elbow Method: Optimal Number of Clusters",
x = "Number of Clusters (k)",
y = "Total Within-Cluster Sum of Squares"
) +
theme_minimal(base_size = 13)set.seed(42)
km <- kmeans(survey_data, centers = 3, nstart = 25)
df$Cluster <- factor(km$cluster, labels = c("Segment A", "Segment B", "Segment C"))
table(df$Cluster)##
## Segment A Segment B Segment C
## 10 4 8
# Project to 2D using PCA for visualization
pca <- prcomp(survey_data)
pca_df <- data.frame(
PC1 = pca$x[, 1],
PC2 = pca$x[, 2],
Cluster = df$Cluster
)
ggplot(pca_df, aes(x = PC1, y = PC2, color = Cluster, fill = Cluster)) +
geom_point(size = 3.5, alpha = 0.85) +
stat_ellipse(geom = "polygon", alpha = 0.12, level = 0.8) +
scale_color_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
scale_fill_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
labs(
title = "Customer Segments — PCA Projection (K-Means, k = 3)",
x = "Principal Component 1",
y = "Principal Component 2"
) +
theme_minimal(base_size = 13)df %>%
group_by(Cluster) %>%
summarise(across(all_of(shopping_cols), mean), .groups = "drop") %>%
pivot_longer(-Cluster, names_to = "Feature", values_to = "Mean_Score") %>%
ggplot(aes(x = Feature, y = Mean_Score, fill = Cluster)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("#E74C3C", "#2ECC71", "#3498DB")) +
coord_flip() +
labs(
title = "Average Score per Feature by Customer Segment",
x = NULL, y = "Mean Score", fill = "Segment"
) +
theme_minimal(base_size = 12)df %>%
group_by(Cluster) %>%
summarise(
N = n(),
Pct_Female = round(mean(Gender == 2) * 100, 1),
Avg_Age_Group = round(mean(Age), 2),
Avg_Education = round(mean(Education), 2),
.groups = "drop"
)## # A tibble: 3 × 5
## Cluster N Pct_Female Avg_Age_Group Avg_Education
## <fct> <int> <dbl> <dbl> <dbl>
## 1 Segment A 10 30 2.6 3.4
## 2 Segment B 4 75 2.75 3
## 3 Segment C 8 0 2.12 3
CS_helpful, Recommend, and
Come_again are closely interrelated — satisfied customers
are more likely to recommend and return.Analysis generated with R Markdown. Dataset:
customer_segmentation.csv (22 respondents, 15
variables).