This report analyzes customer feedback and demographic data from
customer_segmentation.csv. The goal is to understand response
patterns, identify satisfaction factors, and segment customers based on
their responses.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(cluster)
library(factoextra)
## Welcome to factoextra!
## Want to learn more? See two factoextra-related books at https://www.datanovia.com/en/product/practical-guide-to-principal-component-methods-in-r/
library(GGally)
# Load the survey responses; the first row of the CSV holds the column names.
data <- read.csv(file = "customer_segmentation.csv", header = TRUE)
# Show each column's type together with its first few values.
str(object = data)
## 'data.frame': 22 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CS_helpful : int 2 1 2 3 2 1 2 1 1 1 ...
## $ Recommend : int 2 2 1 3 1 1 1 1 1 1 ...
## $ Come_again : int 2 1 1 2 3 3 1 1 1 1 ...
## $ All_Products : int 2 1 1 4 5 2 2 2 2 1 ...
## $ Profesionalism: int 2 1 1 1 2 1 2 1 2 1 ...
## $ Limitation : int 2 1 2 2 1 1 1 2 1 1 ...
## $ Online_grocery: int 2 2 3 3 2 1 2 1 2 3 ...
## $ delivery : int 3 3 3 3 3 2 2 1 1 2 ...
## $ Pick_up : int 4 3 2 2 1 1 2 2 3 2 ...
## $ Find_items : int 1 1 1 2 2 1 1 2 1 1 ...
## $ other_shops : int 2 2 3 2 3 4 1 4 1 1 ...
## $ Gender : int 1 1 1 1 2 1 1 1 2 2 ...
## $ Age : int 2 2 2 3 4 2 2 2 2 2 ...
## $ Education : int 2 2 2 5 2 5 3 2 1 2 ...
# Per-column descriptive statistics (min, quartiles, median, mean, max).
summary(object = data)
## ID CS_helpful Recommend Come_again
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 6.25 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :11.50 Median :1.000 Median :1.000 Median :1.000
## Mean :11.50 Mean :1.591 Mean :1.318 Mean :1.455
## 3rd Qu.:16.75 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :22.00 Max. :3.000 Max. :3.000 Max. :3.000
## All_Products Profesionalism Limitation Online_grocery delivery
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.250 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :1.0 Median :2.000 Median :3.000
## Mean :2.091 Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000
## Pick_up Find_items other_shops Gender
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.250 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :2.000 Median :1.000
## Mean :2.455 Mean :1.455 Mean :2.591 Mean :1.273
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.750 3rd Qu.:1.750
## Max. :5.000 Max. :3.000 Max. :5.000 Max. :2.000
## Age Education
## Min. :2.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.500
## Mean :2.455 Mean :3.182
## 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :4.000 Max. :5.000
# Drop the ID column so only the survey and demographic variables remain.
data_clean <- select(data, -ID)
# Count the missing values in each remaining column.
data_clean %>%
  is.na() %>%
  colSums()
## CS_helpful Recommend Come_again All_Products Profesionalism
## 0 0 0 0 0
## Limitation Online_grocery delivery Pick_up Find_items
## 0 0 0 0 0
## other_shops Gender Age Education
## 0 0 0 0
# Gender, Education and Age are read in as integer codes; treat them as
# categorical factors rather than numbers.
data_clean <- data_clean %>%
  mutate(across(c(Gender, Education, Age), as.factor))
# Descriptive statistics for the survey items only
# (columns CS_helpful through other_shops).
data_clean %>%
  select(CS_helpful:other_shops) %>%
  summary()
## CS_helpful Recommend Come_again All_Products
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.250
## Median :1.000 Median :1.000 Median :1.000 Median :2.000
## Mean :1.591 Mean :1.318 Mean :1.455 Mean :2.091
## 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:2.000
## Max. :3.000 Max. :3.000 Max. :3.000 Max. :5.000
## Profesionalism Limitation Online_grocery delivery Pick_up
## Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000 1st Qu.:2.000
## Median :1.000 Median :1.0 Median :2.000 Median :3.000 Median :2.000
## Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409 Mean :2.455
## 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000 Max. :5.000
## Find_items other_shops
## Min. :1.000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:1.250
## Median :1.000 Median :2.000
## Mean :1.455 Mean :2.591
## 3rd Qu.:2.000 3rd Qu.:3.750
## Max. :3.000 Max. :5.000
# Mean rating of every survey item, returned as a single-row data frame.
avg_scores <- summarise(
  data_clean,
  across(CS_helpful:other_shops, mean)
)
print(avg_scores)
## CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## 1 1.590909 1.318182 1.454545 2.090909 1.409091 1.5
## Online_grocery delivery Pick_up Find_items other_shops
## 1 2.272727 2.409091 2.454545 1.454545 2.590909
# Reshape the one-row table of means into one row per survey item so that
# it can be plotted.
avg_scores_long <- pivot_longer(
  avg_scores,
  cols = everything(),
  names_to = "Feature",
  values_to = "Average_Score"
)
# Horizontal bar chart of the mean ratings, with bars ordered by score.
avg_scores_long %>%
  ggplot(aes(x = reorder(Feature, Average_Score), y = Average_Score)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Average Customer Ratings by Feature",
    x = "Feature",
    y = "Average Score (1–5)"
  )
# Keep only the numeric columns for the correlation analysis.
numeric_vars <- select(data_clean, where(is.numeric))
# Pairwise correlation plot with coefficients printed to two decimals.
numeric_vars %>%
  GGally::ggcorr(label = TRUE, label_round = 2)
# Scale numeric data: centre each numeric column to mean 0 and rescale it to
# standard deviation 1 so that all survey items contribute on the same scale.
scaled_data <- scale(select(data_clean, where(is.numeric)))
# Elbow method to find optimal number of clusters.
# k-means starts from randomly chosen centres, so fix the seed and try several
# random starts for every candidate k (nstart is forwarded to kmeans());
# otherwise the within-cluster sum-of-squares curve changes from run to run
# and a single unlucky start can distort the elbow.
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss", nstart = 25) +
  labs(title = "Elbow Method for Optimal Clusters")
# Fit k-means with 3 clusters (chosen as an example). The seed makes the
# random initialisation reproducible; 25 random starts are tried.
set.seed(123)
kmeans_res <- kmeans(x = scaled_data, centers = 3, nstart = 25)
# Attach each respondent's cluster assignment as a factor column.
data_with_clusters <- mutate(
  data_clean,
  Cluster = as.factor(kmeans_res$cluster)
)
# Mean of every numeric survey item within each k-means cluster.
summarise(
  group_by(data_with_clusters, Cluster),
  across(where(is.numeric), mean)
)
## # A tibble: 3 × 12
## Cluster CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2.5 2 2.5 3.25 2 2
## 2 2 1.1 1 1.3 2 1.2 1.1
## 3 3 1.75 1.38 1.12 1.62 1.38 1.75
## # ℹ 5 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## # Find_items <dbl>, other_shops <dbl>
# Plot the cluster assignments as points, with a convex hull drawn around
# each cluster.
fviz_cluster(
  object = kmeans_res,
  data = scaled_data,
  ellipse.type = "convex",
  geom = "point",
  palette = "jco",
  repel = TRUE
)
# Respondent counts per education level, with one bar per cluster side by side.
data_with_clusters %>%
  ggplot(aes(x = Education, fill = Cluster)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Education Level by Cluster",
    x = "Education",
    y = "Count"
  )
# Respondent counts per age group, with one bar per cluster side by side.
data_with_clusters %>%
  ggplot(aes(x = Age, fill = Cluster)) +
  geom_bar(position = "dodge") +
  labs(
    title = "Age Group by Cluster",
    x = "Age Group",
    y = "Count"
  )