Introduction

This report analyzes customer feedback and demographic data from customer_segmentation.csv. The goal is to understand response patterns, identify the factors that drive satisfaction, and segment customers based on their responses.


Setup

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.0     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(cluster)
library(factoextra)
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(GGally)
# Read the survey export into a data frame (the first row holds the column
# names, which is the read.csv default) and inspect the column types.
data <- read.csv(file = "customer_segmentation.csv")
str(data)
## 'data.frame':    22 obs. of  15 variables:
##  $ ID            : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ CS_helpful    : int  2 1 2 3 2 1 2 1 1 1 ...
##  $ Recommend     : int  2 2 1 3 1 1 1 1 1 1 ...
##  $ Come_again    : int  2 1 1 2 3 3 1 1 1 1 ...
##  $ All_Products  : int  2 1 1 4 5 2 2 2 2 1 ...
##  $ Profesionalism: int  2 1 1 1 2 1 2 1 2 1 ...
##  $ Limitation    : int  2 1 2 2 1 1 1 2 1 1 ...
##  $ Online_grocery: int  2 2 3 3 2 1 2 1 2 3 ...
##  $ delivery      : int  3 3 3 3 3 2 2 1 1 2 ...
##  $ Pick_up       : int  4 3 2 2 1 1 2 2 3 2 ...
##  $ Find_items    : int  1 1 1 2 2 1 1 2 1 1 ...
##  $ other_shops   : int  2 2 3 2 3 4 1 4 1 1 ...
##  $ Gender        : int  1 1 1 1 2 1 1 1 2 2 ...
##  $ Age           : int  2 2 2 3 4 2 2 2 2 2 ...
##  $ Education     : int  2 2 2 5 2 5 3 2 1 2 ...
# Per-column descriptive statistics (min, quartiles, mean, max) for the raw
# data, including the ID column.
summary(data)
##        ID          CS_helpful      Recommend       Come_again   
##  Min.   : 1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 6.25   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :11.50   Median :1.000   Median :1.000   Median :1.000  
##  Mean   :11.50   Mean   :1.591   Mean   :1.318   Mean   :1.455  
##  3rd Qu.:16.75   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :22.00   Max.   :3.000   Max.   :3.000   Max.   :3.000  
##   All_Products   Profesionalism    Limitation  Online_grocery     delivery    
##  Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.250   1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :1.000   Median :1.0   Median :2.000   Median :3.000  
##  Mean   :2.091   Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000  
##     Pick_up        Find_items     other_shops        Gender     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.250   1st Qu.:1.000  
##  Median :2.000   Median :1.000   Median :2.000   Median :1.000  
##  Mean   :2.455   Mean   :1.455   Mean   :2.591   Mean   :1.273  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:3.750   3rd Qu.:1.750  
##  Max.   :5.000   Max.   :3.000   Max.   :5.000   Max.   :2.000  
##       Age          Education    
##  Min.   :2.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :2.500  
##  Mean   :2.455   Mean   :3.182  
##  3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :4.000   Max.   :5.000
# Drop the ID column so it is not treated as a feature downstream.
data_clean <- select(data, -ID)

# Count the missing values in each remaining column.
data_clean %>%
  is.na() %>%
  colSums()
##     CS_helpful      Recommend     Come_again   All_Products Profesionalism 
##              0              0              0              0              0 
##     Limitation Online_grocery       delivery        Pick_up     Find_items 
##              0              0              0              0              0 
##    other_shops         Gender            Age      Education 
##              0              0              0              0
# Store the integer-coded demographic columns as factors so that later
# `where(is.numeric)` selections pick up only the rating columns.
data_clean <- data_clean %>%
  mutate(across(c(Gender, Education, Age), as.factor))

# Descriptive statistics for the satisfaction features only.
data_clean %>%
  select(CS_helpful:other_shops) %>%
  summary()
##    CS_helpful      Recommend       Come_again     All_Products  
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.250  
##  Median :1.000   Median :1.000   Median :1.000   Median :2.000  
##  Mean   :1.591   Mean   :1.318   Mean   :1.455   Mean   :2.091  
##  3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:2.000  
##  Max.   :3.000   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##  Profesionalism    Limitation  Online_grocery     delivery        Pick_up     
##  Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000   1st Qu.:2.000  
##  Median :1.000   Median :1.0   Median :2.000   Median :3.000   Median :2.000  
##  Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409   Mean   :2.455  
##  3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000   Max.   :5.000  
##    Find_items     other_shops   
##  Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.000   1st Qu.:1.250  
##  Median :1.000   Median :2.000  
##  Mean   :1.455   Mean   :2.591  
##  3rd Qu.:2.000   3rd Qu.:3.750  
##  Max.   :3.000   Max.   :5.000
# Mean rating for every satisfaction feature, returned as a one-row data frame
# with one column per feature.
avg_scores <- summarise(
  data_clean,
  across(CS_helpful:other_shops, mean)
)
print(avg_scores)
##   CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## 1   1.590909  1.318182   1.454545     2.090909       1.409091        1.5
##   Online_grocery delivery  Pick_up Find_items other_shops
## 1       2.272727 2.409091 2.454545   1.454545    2.590909
# Reshape the one-row table of means into long form: one row per feature.
avg_scores_long <- pivot_longer(
  avg_scores,
  cols = everything(),
  names_to = "Feature",
  values_to = "Average_Score"
)

# Horizontal bar chart of the mean ratings, with bars ordered by mean score.
avg_scores_long %>%
  ggplot(aes(x = reorder(Feature, Average_Score), y = Average_Score)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Average Customer Ratings by Feature",
    x = "Feature",
    y = "Average Score (1–5)"
  )

# Keep only the numeric columns and plot their pairwise correlations, printing
# each coefficient rounded to two decimals.
numeric_vars <- select(data_clean, where(is.numeric))
ggcorr(numeric_vars, label = TRUE, label_round = 2)

# Standardise the numeric columns (centre to mean 0, scale to sd 1) so that no
# single column dominates the distances k-means works with.
scaled_data <- scale(select(data_clean, where(is.numeric)))

# Elbow method to find optimal number of clusters.
# k-means starts from randomly chosen centres, so fix the seed and use several
# restarts per k; without this the within-cluster sum-of-squares curve (and
# the apparent elbow) can change from one render to the next. `nstart` is
# forwarded to kmeans() and matches the value used for the final fit.
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal Clusters")

# Fit k-means with 3 clusters (an illustrative choice). The seed is fixed and
# 25 random starts are tried so the fit is reproducible.
set.seed(123)
kmeans_res <- kmeans(x = scaled_data, centers = 3, nstart = 25)

# Attach each row's cluster assignment to the cleaned data as a factor.
data_with_clusters <- mutate(
  data_clean,
  Cluster = as.factor(kmeans_res$cluster)
)

# Mean of every numeric column within each cluster.
summarise(
  group_by(data_with_clusters, Cluster),
  across(where(is.numeric), mean)
)
## # A tibble: 3 × 12
##   Cluster CS_helpful Recommend Come_again All_Products Profesionalism Limitation
##   <fct>        <dbl>     <dbl>      <dbl>        <dbl>          <dbl>      <dbl>
## 1 1             2.5       2          2.5          3.25           2          2   
## 2 2             1.1       1          1.3          2              1.2        1.1 
## 3 3             1.75      1.38       1.12         1.62           1.38       1.75
## # ℹ 5 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## #   Find_items <dbl>, other_shops <dbl>
# Visualise the fitted clusters: observations drawn as points, each cluster
# outlined by its convex hull.
fviz_cluster(
  object = kmeans_res,
  data = scaled_data,
  geom = "point",
  ellipse.type = "convex",
  palette = "jco",
  repel = TRUE
)

# Count of observations per education level, with one bar per cluster.
data_with_clusters %>%
  ggplot(aes(x = Education, fill = Cluster)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Education Level by Cluster",
    x = "Education",
    y = "Count"
  )

# Count of observations per age group, with one bar per cluster.
data_with_clusters %>%
  ggplot(aes(x = Age, fill = Cluster)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Age Group by Cluster",
    x = "Age Group",
    y = "Count"
  )