Load Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.0 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(summarytools)
## Warning in fun(libname, pkgname): couldn't connect to display ":0"
## system might not have X11 capabilities; in case of errors when using dfSummary(), set st_options(use.x11 = FALSE)
##
## Attaching package: 'summarytools'
##
## The following object is masked from 'package:tibble':
##
## view
library(ggplot2)
Load Data
data <- read.csv("customer_segmentation.csv")
# Clean column names
colnames(data) <- trimws(colnames(data))
# View structure
str(data)
## 'data.frame': 22 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CS_helpful : int 2 1 2 3 2 1 2 1 1 1 ...
## $ Recommend : int 2 2 1 3 1 1 1 1 1 1 ...
## $ Come_again : int 2 1 1 2 3 3 1 1 1 1 ...
## $ All_Products : int 2 1 1 4 5 2 2 2 2 1 ...
## $ Profesionalism: int 2 1 1 1 2 1 2 1 2 1 ...
## $ Limitation : int 2 1 2 2 1 1 1 2 1 1 ...
## $ Online_grocery: int 2 2 3 3 2 1 2 1 2 3 ...
## $ delivery : int 3 3 3 3 3 2 2 1 1 2 ...
## $ Pick_up : int 4 3 2 2 1 1 2 2 3 2 ...
## $ Find_items : int 1 1 1 2 2 1 1 2 1 1 ...
## $ other_shops : int 2 2 3 2 3 4 1 4 1 1 ...
## $ Gender : int 1 1 1 1 2 1 1 1 2 2 ...
## $ Age : int 2 2 2 3 4 2 2 2 2 2 ...
## $ Education : int 2 2 2 5 2 5 3 2 1 2 ...
Data Overview
summary(data)
## ID CS_helpful Recommend Come_again
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 6.25 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :11.50 Median :1.000 Median :1.000 Median :1.000
## Mean :11.50 Mean :1.591 Mean :1.318 Mean :1.455
## 3rd Qu.:16.75 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :22.00 Max. :3.000 Max. :3.000 Max. :3.000
## All_Products Profesionalism Limitation Online_grocery delivery
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.250 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :1.0 Median :2.000 Median :3.000
## Mean :2.091 Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000
## Pick_up Find_items other_shops Gender
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.250 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :2.000 Median :1.000
## Mean :2.455 Mean :1.455 Mean :2.591 Mean :1.273
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.750 3rd Qu.:1.750
## Max. :5.000 Max. :3.000 Max. :5.000 Max. :2.000
## Age Education
## Min. :2.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.500
## Mean :2.455 Mean :3.182
## 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :4.000 Max. :5.000
dfSummary(data)
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Warning in png(png_loc <- tempfile(fileext = ".png"), width = 150 *
## graph.magnif, : unable to open connection to X11 display ''
## Data Frame Summary
## data
## Dimensions: 22 x 15
## Duplicates: 0
##
## ------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ---------------- ------------------------ -------------------- ------------------- ---------- ---------
## 1 ID Mean (sd) : 11.5 (6.5) 22 distinct values : : : : 22 0
## [integer] min < med < max: (Integer sequence) : : : : (100.0%) (0.0%)
## 1 < 11.5 < 22 : : : :
## IQR (CV) : 10.5 (0.6) : : : : :
## : : : : :
##
## 2 CS_helpful Mean (sd) : 1.6 (0.7) 1 : 12 (54.5%) IIIIIIIIII 22 0
## [integer] min < med < max: 2 : 7 (31.8%) IIIIII (100.0%) (0.0%)
## 1 < 1 < 3 3 : 3 (13.6%) II
## IQR (CV) : 1 (0.5)
##
## 3 Recommend Mean (sd) : 1.3 (0.6) 1 : 17 (77.3%) IIIIIIIIIIIIIII 22 0
## [integer] min < med < max: 2 : 3 (13.6%) II (100.0%) (0.0%)
## 1 < 1 < 3 3 : 2 ( 9.1%) I
## IQR (CV) : 0 (0.5)
##
## 4 Come_again Mean (sd) : 1.5 (0.7) 1 : 15 (68.2%) IIIIIIIIIIIII 22 0
## [integer] min < med < max: 2 : 4 (18.2%) III (100.0%) (0.0%)
## 1 < 1 < 3 3 : 3 (13.6%) II
## IQR (CV) : 1 (0.5)
##
## 5 All_Products Mean (sd) : 2.1 (1.1) 1 : 6 (27.3%) IIIII 22 0
## [integer] min < med < max: 2 : 12 (54.5%) IIIIIIIIII (100.0%) (0.0%)
## 1 < 2 < 5 3 : 1 ( 4.5%)
## IQR (CV) : 0.8 (0.5) 4 : 2 ( 9.1%) I
## 5 : 1 ( 4.5%)
##
## 6 Profesionalism Mean (sd) : 1.4 (0.6) 1 : 14 (63.6%) IIIIIIIIIIII 22 0
## [integer] min < med < max: 2 : 7 (31.8%) IIIIII (100.0%) (0.0%)
## 1 < 1 < 3 3 : 1 ( 4.5%)
## IQR (CV) : 1 (0.4)
##
## 7 Limitation Mean (sd) : 1.5 (0.8) 1 : 14 (63.6%) IIIIIIIIIIII 22 0
## [integer] min < med < max: 2 : 6 (27.3%) IIIII (100.0%) (0.0%)
## 1 < 1 < 4 3 : 1 ( 4.5%)
## IQR (CV) : 1 (0.5) 4 : 1 ( 4.5%)
##
## 8 Online_grocery Mean (sd) : 2.3 (0.8) 1 : 4 (18.2%) III 22 0
## [integer] min < med < max: 2 : 8 (36.4%) IIIIIII (100.0%) (0.0%)
## 1 < 2 < 3 3 : 10 (45.5%) IIIIIIIII
## IQR (CV) : 1 (0.3)
##
## 9 delivery Mean (sd) : 2.4 (0.7) 1 : 3 (13.6%) II 22 0
## [integer] min < med < max: 2 : 7 (31.8%) IIIIII (100.0%) (0.0%)
## 1 < 3 < 3 3 : 12 (54.5%) IIIIIIIIII
## IQR (CV) : 1 (0.3)
##
## 10 Pick_up Mean (sd) : 2.5 (1.1) 1 : 4 (18.2%) III 22 0
## [integer] min < med < max: 2 : 8 (36.4%) IIIIIII (100.0%) (0.0%)
## 1 < 2 < 5 3 : 7 (31.8%) IIIIII
## IQR (CV) : 1 (0.4) 4 : 2 ( 9.1%) I
## 5 : 1 ( 4.5%)
##
## 11 Find_items Mean (sd) : 1.5 (0.7) 1 : 14 (63.6%) IIIIIIIIIIII 22 0
## [integer] min < med < max: 2 : 6 (27.3%) IIIII (100.0%) (0.0%)
## 1 < 1 < 3 3 : 2 ( 9.1%) I
## IQR (CV) : 1 (0.5)
##
## 12 other_shops Mean (sd) : 2.6 (1.4) 1 : 6 (27.3%) IIIII 22 0
## [integer] min < med < max: 2 : 6 (27.3%) IIIII (100.0%) (0.0%)
## 1 < 2 < 5 3 : 4 (18.2%) III
## IQR (CV) : 2.5 (0.5) 4 : 3 (13.6%) II
## 5 : 3 (13.6%) II
##
## 13 Gender Min : 1 1 : 16 (72.7%) IIIIIIIIIIIIII 22 0
## [integer] Mean : 1.3 2 : 6 (27.3%) IIIII (100.0%) (0.0%)
## Max : 2
##
## 14 Age Mean (sd) : 2.5 (0.7) 2 : 15 (68.2%) IIIIIIIIIIIII 22 0
## [integer] min < med < max: 3 : 4 (18.2%) III (100.0%) (0.0%)
## 2 < 2 < 4 4 : 3 (13.6%) II
## IQR (CV) : 1 (0.3)
##
## 15 Education Mean (sd) : 3.2 (1.6) 1 : 3 (13.6%) II 22 0
## [integer] min < med < max: 2 : 8 (36.4%) IIIIIII (100.0%) (0.0%)
## 1 < 2.5 < 5 3 : 2 ( 9.1%) I
## IQR (CV) : 3 (0.5) 5 : 9 (40.9%) IIIIIIII
## ------------------------------------------------------------------------------------------------------------
Demographics Analysis
# Gender distribution
ggplot(data, aes(x = factor(Gender))) +
geom_bar() +
labs(title = "Gender Distribution", x = "Gender", y = "Count")

# Age distribution
ggplot(data, aes(x = factor(Age))) +
geom_bar() +
labs(title = "Age Distribution", x = "Age Group", y = "Count")

# Education distribution
ggplot(data, aes(x = factor(Education))) +
geom_bar() +
labs(title = "Education Levels", x = "Education", y = "Count")

Customer Behavior Analysis
behavior_vars <- data %>%
select(CS_helpful, Recommend, Come_again, All_Products)
# Correlation matrix
cor(behavior_vars, use = "complete.obs")
## CS_helpful Recommend Come_again All_Products
## CS_helpful 1.0000000 0.48809623 0.2714620 0.29345435
## Recommend 0.4880962 1.00000000 0.3808907 0.02515624
## Come_again 0.2714620 0.38089069 1.0000000 0.36875582
## All_Products 0.2934543 0.02515624 0.3687558 1.00000000
# Scatterplot matrix
pairs(behavior_vars, main = "Customer Behavior Relationships")

Service Usage Analysis
services <- data %>%
select(Online_grocery, delivery, Pick_up, Find_items)
# Average usage
colMeans(services, na.rm = TRUE)
## Online_grocery delivery Pick_up Find_items
## 2.272727 2.409091 2.454545 1.454545
# Boxplot
services_long <- pivot_longer(services, cols = everything())
ggplot(services_long, aes(x = name, y = value)) +
geom_boxplot() +
labs(title = "Service Usage Distribution", x = "Service", y = "Score")

Customer Satisfaction Insights
satisfaction <- data %>%
select(CS_helpful, Profesionalism, Limitation)
# Means
colMeans(satisfaction, na.rm = TRUE)
## CS_helpful Profesionalism Limitation
## 1.590909 1.409091 1.500000
# Boxplot
satisfaction_long <- pivot_longer(satisfaction, cols = everything())
ggplot(satisfaction_long, aes(x = name, y = value)) +
geom_boxplot() +
labs(title = "Customer Satisfaction Metrics", x = "Metric", y = "Score")

Satisfaction vs Loyalty
ggplot(data, aes(x = CS_helpful, y = Recommend)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Helpfulness vs Recommendation")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(data, aes(x = CS_helpful, y = Come_again)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Helpfulness vs Return Intent")
## `geom_smooth()` using formula = 'y ~ x'

Customer Segmentation (K-Means)
seg_data <- data %>%
select(CS_helpful, Recommend, Come_again, All_Products, delivery, Pick_up)
# Scale data
seg_scaled <- scale(seg_data)
# K-means clustering
set.seed(123)
clusters <- kmeans(seg_scaled, centers = 3)
# Add cluster to dataset
data$Cluster <- as.factor(clusters$cluster)
# Visualize clusters
ggplot(data, aes(x = CS_helpful, y = Recommend, color = Cluster)) +
geom_point(size = 3) +
labs(title = "Customer Segments")

Cluster Insights
aggregate(seg_data, by = list(Cluster = data$Cluster), mean)
## Cluster CS_helpful Recommend Come_again All_Products delivery Pick_up
## 1 1 1.10 1.000 1.300 2.000 1.7 2.200
## 2 2 2.50 2.000 2.500 3.250 3.0 1.250
## 3 3 1.75 1.375 1.125 1.625 3.0 3.375