library(tidyverse)
library(psych)
library(corrplot)
Load Data
# Read the survey export from the working directory (first row is the
# header) and print the structure of the resulting data frame.
data <- read.csv(file = "customer_segmentation.csv", header = TRUE)
str(object = data)
## 'data.frame': 22 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CS_helpful : int 2 1 2 3 2 1 2 1 1 1 ...
## $ Recommend : int 2 2 1 3 1 1 1 1 1 1 ...
## $ Come_again : int 2 1 1 2 3 3 1 1 1 1 ...
## $ All_Products : int 2 1 1 4 5 2 2 2 2 1 ...
## $ Profesionalism: int 2 1 1 1 2 1 2 1 2 1 ...
## $ Limitation : int 2 1 2 2 1 1 1 2 1 1 ...
## $ Online_grocery: int 2 2 3 3 2 1 2 1 2 3 ...
## $ delivery : int 3 3 3 3 3 2 2 1 1 2 ...
## $ Pick_up : int 4 3 2 2 1 1 2 2 3 2 ...
## $ Find_items : int 1 1 1 2 2 1 1 2 1 1 ...
## $ other_shops : int 2 2 3 2 3 4 1 4 1 1 ...
## $ Gender : int 1 1 1 1 2 1 1 1 2 2 ...
## $ Age : int 2 2 2 3 4 2 2 2 2 2 ...
## $ Education : int 2 2 2 5 2 5 3 2 1 2 ...
# Per-column summary statistics of the data as read from the CSV.
summary(data)
## ID CS_helpful Recommend Come_again
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 6.25 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :11.50 Median :1.000 Median :1.000 Median :1.000
## Mean :11.50 Mean :1.591 Mean :1.318 Mean :1.455
## 3rd Qu.:16.75 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :22.00 Max. :3.000 Max. :3.000 Max. :3.000
## All_Products Profesionalism Limitation Online_grocery delivery
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.250 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :1.0 Median :2.000 Median :3.000
## Mean :2.091 Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000
## Pick_up Find_items other_shops Gender
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.250 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :2.000 Median :1.000
## Mean :2.455 Mean :1.455 Mean :2.591 Mean :1.273
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.750 3rd Qu.:1.750
## Max. :5.000 Max. :3.000 Max. :5.000 Max. :2.000
## Age Education
## Min. :2.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.500
## Mean :2.455 Mean :3.182
## 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :4.000 Max. :5.000
# Preview the first rows of the data as read from the CSV.
head(data)
## ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## 1 1 2 2 2 2 2 2
## 2 2 1 2 1 1 1 1
## 3 3 2 1 1 1 1 2
## 4 4 3 3 2 4 1 2
## 5 5 2 1 3 5 2 1
## 6 6 1 1 3 2 1 1
## Online_grocery delivery Pick_up Find_items other_shops Gender Age Education
## 1 2 3 4 1 2 1 2 2
## 2 2 3 3 1 2 1 2 2
## 3 3 3 2 1 3 1 2 2
## 4 3 3 2 2 2 1 3 5
## 5 2 3 1 2 3 2 4 2
## 6 1 2 1 1 4 1 2 5
Clean Column Names
# Remove literal space characters from the column names, then print them.
# NOTE(review): read.csv() is called with its default check.names = TRUE,
# which already rewrites spaces in headers, so this step is probably a
# no-op for this file - confirm before relying on it for other inputs.
cleaned_names <- gsub(" ", "", names(data), fixed = TRUE)
names(data) <- cleaned_names
names(data)
## [1] "ID" "CS_helpful" "Recommend" "Come_again"
## [5] "All_Products" "Profesionalism" "Limitation" "Online_grocery"
## [9] "delivery" "Pick_up" "Find_items" "other_shops"
## [13] "Gender" "Age" "Education"
Convert Variables
# Recode the three demographic columns from integer codes to factors in a
# single pass, then print the updated structure.
#   Gender:    1 -> "Male", 2 -> "Female"
#   Education: codes 1-5 map, in order, to the five labels listed below
#   Age:       ordered factor whose levels are the codes present in the data
# NOTE(review): Age is the only one of the three without explicit levels or
# labels, so its levels depend on which codes happen to occur - confirm the
# codebook and spell the levels out if a fixed set is expected.
data <- data %>%
  mutate(
    Gender = factor(Gender, levels = 1:2, labels = c("Male", "Female")),
    Education = factor(
      Education,
      levels = 1:5,
      labels = c("Primary", "Secondary", "HighSchool", "Bachelor", "Postgrad")
    ),
    Age = as.ordered(Age)
  )
str(data)
## 'data.frame': 22 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ CS_helpful : int 2 1 2 3 2 1 2 1 1 1 ...
## $ Recommend : int 2 2 1 3 1 1 1 1 1 1 ...
## $ Come_again : int 2 1 1 2 3 3 1 1 1 1 ...
## $ All_Products : int 2 1 1 4 5 2 2 2 2 1 ...
## $ Profesionalism: int 2 1 1 1 2 1 2 1 2 1 ...
## $ Limitation : int 2 1 2 2 1 1 1 2 1 1 ...
## $ Online_grocery: int 2 2 3 3 2 1 2 1 2 3 ...
## $ delivery : int 3 3 3 3 3 2 2 1 1 2 ...
## $ Pick_up : int 4 3 2 2 1 1 2 2 3 2 ...
## $ Find_items : int 1 1 1 2 2 1 1 2 1 1 ...
## $ other_shops : int 2 2 3 2 3 4 1 4 1 1 ...
## $ Gender : Factor w/ 2 levels "Male","Female": 1 1 1 1 2 1 1 1 2 2 ...
## $ Age : Ord.factor w/ 3 levels "2"<"3"<"4": 1 1 1 2 3 1 1 1 1 1 ...
## $ Education : Factor w/ 5 levels "Primary","Secondary",..: 2 2 2 5 2 5 3 2 1 2 ...
Descriptive Statistics
# Names of the eleven survey-item columns, in the order they are reported.
items <- c(
  "CS_helpful",
  "Recommend",
  "Come_again",
  "All_Products",
  "Profesionalism",
  "Limitation",
  "Online_grocery",
  "delivery",
  "Pick_up",
  "Find_items",
  "other_shops"
)
# One row of descriptive statistics per item (see the printed table).
descr <- psych::describe(data[items])
descr
## vars n mean sd median trimmed mad min max range skew
## CS_helpful 1 22 1.59 0.73 1 1.50 0.00 1 3 2 0.73
## Recommend 2 22 1.32 0.65 1 1.17 0.00 1 3 2 1.67
## Come_again 3 22 1.45 0.74 1 1.33 0.00 1 3 2 1.16
## All_Products 4 22 2.09 1.06 2 1.94 0.00 1 5 4 1.18
## Profesionalism 5 22 1.41 0.59 1 1.33 0.00 1 3 2 1.00
## Limitation 6 22 1.50 0.80 1 1.33 0.00 1 4 3 1.59
## Online_grocery 7 22 2.27 0.77 2 2.33 1.48 1 3 2 -0.46
## delivery 8 22 2.41 0.73 3 2.50 0.00 1 3 2 -0.73
## Pick_up 9 22 2.45 1.06 2 2.39 1.48 1 5 4 0.46
## Find_items 10 22 1.45 0.67 1 1.33 0.00 1 3 2 1.06
## other_shops 11 22 2.59 1.40 2 2.50 1.48 1 5 4 0.42
## kurtosis se
## CS_helpful -0.89 0.16
## Recommend 1.38 0.14
## Come_again -0.23 0.16
## All_Products 0.79 0.23
## Profesionalism -0.14 0.13
## Limitation 1.99 0.17
## Online_grocery -1.25 0.16
## delivery -0.89 0.16
## Pick_up -0.37 0.23
## Find_items -0.19 0.14
## other_shops -1.21 0.30
Frequency Tables
# Counts and proportions for each demographic factor. Every frequency table
# is built once and reused to derive its proportions.
gender_counts <- table(data$Gender)
gender_counts
##
## Male Female
## 16 6
prop.table(gender_counts)
##
## Male Female
## 0.7272727 0.2727273
education_counts <- table(data$Education)
education_counts
##
## Primary Secondary HighSchool Bachelor Postgrad
## 3 8 2 0 9
prop.table(education_counts)
##
## Primary Secondary HighSchool Bachelor Postgrad
## 0.13636364 0.36363636 0.09090909 0.00000000 0.40909091
age_counts <- table(data$Age)
age_counts
##
## 2 3 4
## 15 4 3
prop.table(age_counts)
##
## 2 3 4
## 0.6818182 0.1818182 0.1363636
Graph 1: Customer Service Helpful
# Bar chart of how many respondents gave each CS_helpful response.
# CS_helpful is stored as integer response codes. Mapping it as a factor
# puts it on a discrete axis, so each bar is labelled with its own code and
# no fractional tick marks (e.g. 1.5) can appear between responses.
# Note: a discrete axis shows only the codes that occur in the data; all of
# 1, 2 and 3 occur here, so no bar position is lost.
ggplot(data, aes(x = factor(CS_helpful))) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Distribution of Customer Service Helpful",
    x = "Response",
    y = "Count"
  )

Graph 2: Gender Distribution
# Bar chart of the number of respondents in each Gender level.
gender_plot <- ggplot(data = data, mapping = aes(x = Gender)) +
  geom_bar(fill = "darkgreen") +
  ggtitle("Gender Distribution") +
  xlab("Gender") +
  ylab("Count")
gender_plot

Correlation Matrix
# Correlation matrix of the survey items, computed from every pair of
# observations available for each item pair and printed to two decimals.
# NOTE(review): cor() is left at its default method (Pearson) although the
# items are small-integer response codes - confirm a rank-based method is
# not wanted.
item_scores <- data[items]
cor_mat <- cor(item_scores, use = "pairwise.complete.obs")
round(cor_mat, digits = 2)
## CS_helpful Recommend Come_again All_Products Profesionalism
## CS_helpful 1.00 0.49 0.27 0.29 0.51
## Recommend 0.49 1.00 0.38 0.03 0.39
## Come_again 0.27 0.38 1.00 0.37 0.43
## All_Products 0.29 0.03 0.37 1.00 0.09
## Profesionalism 0.51 0.39 0.43 0.09 1.00
## Limitation 0.61 0.05 0.00 0.06 0.05
## Online_grocery 0.21 0.30 -0.15 -0.15 0.06
## delivery 0.59 0.42 0.17 0.07 0.25
## Pick_up -0.18 -0.08 -0.52 -0.25 -0.16
## Find_items 0.30 -0.02 0.04 0.54 -0.01
## other_shops -0.31 -0.06 0.33 0.22 -0.19
## Limitation Online_grocery delivery Pick_up Find_items
## CS_helpful 0.61 0.21 0.59 -0.18 0.30
## Recommend 0.05 0.30 0.42 -0.08 -0.02
## Come_again 0.00 -0.15 0.17 -0.52 0.04
## All_Products 0.06 -0.15 0.07 -0.25 0.54
## Profesionalism 0.05 0.06 0.25 -0.16 -0.01
## Limitation 1.00 -0.15 0.36 0.00 0.44
## Online_grocery -0.15 1.00 0.30 0.31 -0.16
## delivery 0.36 0.30 1.00 0.12 0.28
## Pick_up 0.00 0.31 0.12 1.00 -0.10
## Find_items 0.44 -0.16 0.28 -0.10 1.00
## other_shops -0.06 -0.11 -0.20 -0.03 0.00
## other_shops
## CS_helpful -0.31
## Recommend -0.06
## Come_again 0.33
## All_Products 0.22
## Profesionalism -0.19
## Limitation -0.06
## Online_grocery -0.11
## delivery -0.20
## Pick_up -0.03
## Find_items 0.00
## other_shops 1.00
Correlation Plot
# Heat-map of cor_mat: upper triangle only, coloured cells, with the
# coefficients printed in small black text and black axis labels.
corrplot(
  cor_mat,
  type = "upper",
  method = "color",
  addCoef.col = "black",
  number.cex = 0.6,
  tl.col = "black"
)

Reliability Analysis
# Reliability (coefficient alpha) analysis over the survey items.
# The scale is made of exactly the columns already listed in `items` for
# the descriptive statistics, so reuse that vector rather than keeping a
# second hand-typed copy that could drift out of sync.
scale_items <- items
# NOTE(review): the call below warns that Pick_up and other_shops correlate
# negatively with the first principal component and suggests
# check.keys = TRUE. The items are left un-reversed here, so alpha (and any
# score built from `scale_items`) mixes both keying directions - confirm
# whether these items should be reverse-scored or dropped.
alpha_result <- psych::alpha(data[, scale_items])
## Some items ( Pick_up other_shops ) were negatively correlated with the first principal component and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
alpha_result
##
## Reliability analysis
## Call: psych::alpha(x = data[, scale_items])
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.47 0.59 0.82 0.12 1.5 0.17 1.9 0.35 0.057
##
## 95% confidence boundaries
## lower alpha upper
## Feldt 0.07 0.47 0.75
## Duhachek 0.14 0.47 0.81
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## CS_helpful 0.35 0.47 0.67 0.081 0.88 0.21 0.053 0.046
## Recommend 0.41 0.53 0.79 0.101 1.12 0.19 0.068 0.057
## Come_again 0.42 0.56 0.77 0.113 1.28 0.19 0.062 0.056
## All_Products 0.41 0.56 0.76 0.114 1.29 0.19 0.068 0.050
## Profesionalism 0.44 0.56 0.79 0.111 1.25 0.18 0.067 0.056
## Limitation 0.43 0.56 0.74 0.113 1.27 0.19 0.067 0.090
## Online_grocery 0.48 0.60 0.81 0.133 1.53 0.17 0.069 0.072
## delivery 0.37 0.50 0.77 0.090 0.99 0.20 0.066 0.044
## Pick_up 0.58 0.66 0.83 0.163 1.94 0.14 0.058 0.168
## Find_items 0.42 0.56 0.78 0.113 1.28 0.19 0.068 0.072
## other_shops 0.58 0.64 0.83 0.152 1.79 0.14 0.065 0.117
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## CS_helpful 22 0.662 0.775 0.832 0.531 1.6 0.73
## Recommend 22 0.506 0.590 0.538 0.363 1.3 0.65
## Come_again 22 0.466 0.475 0.448 0.294 1.5 0.74
## All_Products 22 0.527 0.464 0.442 0.279 2.1 1.06
## Profesionalism 22 0.383 0.496 0.447 0.239 1.4 0.59
## Limitation 22 0.452 0.481 0.485 0.261 1.5 0.80
## Online_grocery 22 0.272 0.297 0.209 0.074 2.3 0.77
## delivery 22 0.614 0.689 0.654 0.471 2.4 0.73
## Pick_up 22 0.089 0.021 -0.092 -0.185 2.5 1.06
## Find_items 22 0.475 0.474 0.432 0.322 1.5 0.67
## other_shops 22 0.311 0.120 0.014 -0.060 2.6 1.40
##
## Non missing response frequency for each item
## 1 2 3 4 5 miss
## CS_helpful 0.55 0.32 0.14 0.00 0.00 0
## Recommend 0.77 0.14 0.09 0.00 0.00 0
## Come_again 0.68 0.18 0.14 0.00 0.00 0
## All_Products 0.27 0.55 0.05 0.09 0.05 0
## Profesionalism 0.64 0.32 0.05 0.00 0.00 0
## Limitation 0.64 0.27 0.05 0.05 0.00 0
## Online_grocery 0.18 0.36 0.45 0.00 0.00 0
## delivery 0.14 0.32 0.55 0.00 0.00 0
## Pick_up 0.18 0.36 0.32 0.09 0.05 0
## Find_items 0.64 0.27 0.09 0.00 0.00 0
## other_shops 0.27 0.27 0.18 0.14 0.14 0
Satisfaction Score
# Per-respondent satisfaction score: the mean of that respondent's answers
# across `scale_items`, skipping missing answers. Despite the `_Total`
# suffix the value is a mean, not a sum.
scale_scores <- data[scale_items]
data$Satisfaction_Total <- rowMeans(scale_scores, na.rm = TRUE)
summary(data$Satisfaction_Total)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.182 1.636 1.864 1.868 2.182 2.455
Satisfaction by Gender
# Respondent count, mean and standard deviation of Satisfaction_Total for
# each Gender level.
by_gender <- group_by(data, Gender)
summarise(
  by_gender,
  n = n(),
  mean_satisfaction = mean(Satisfaction_Total, na.rm = TRUE),
  sd_satisfaction = sd(Satisfaction_Total, na.rm = TRUE)
)
## # A tibble: 2 × 4
## Gender n mean_satisfaction sd_satisfaction
## <fct> <int> <dbl> <dbl>
## 1 Male 16 1.87 0.317
## 2 Female 6 1.86 0.452
Satisfaction by Education
# Respondent count, mean and standard deviation of Satisfaction_Total for
# each Education level that has at least one respondent (the "Bachelor"
# level has none, so it produces no row).
by_education <- group_by(data, Education)
summarise(
  by_education,
  n = n(),
  mean_satisfaction = mean(Satisfaction_Total, na.rm = TRUE),
  sd_satisfaction = sd(Satisfaction_Total, na.rm = TRUE)
)
## # A tibble: 4 × 4
## Education n mean_satisfaction sd_satisfaction
## <fct> <int> <dbl> <dbl>
## 1 Primary 3 1.82 0.417
## 2 Secondary 8 1.90 0.338
## 3 HighSchool 2 1.91 0.514
## 4 Postgrad 9 1.85 0.369
Graph 3: Satisfaction by Gender
# Box plots of Satisfaction_Total per Gender level. The fill colour repeats
# the x axis, so the legend is hidden.
satisfaction_by_gender <- ggplot(
  data = data,
  mapping = aes(x = Gender, y = Satisfaction_Total, fill = Gender)
) +
  geom_boxplot() +
  labs(
    title = "Satisfaction by Gender",
    x = "Gender",
    y = "Satisfaction (mean score)"
  ) +
  theme(legend.position = "none")
satisfaction_by_gender

Save Processed File
# Write the processed data frame (recoded factors plus Satisfaction_Total)
# to a CSV in the working directory, without a row-name column.
write.csv(
  x = data,
  file = "customer_segmentation_processed.csv",
  row.names = FALSE
)