# Set global options
knitr::opts_chunk$set(echo = TRUE, warning = TRUE, message = TRUE, error = TRUE)
# Install required packages if not already installed
if(!require(readr)) install.packages("readr")
## Loading required package: readr
if(!require(cluster)) install.packages("cluster")
## Loading required package: cluster
if(!require(factoextra)) install.packages("factoextra")
## Loading required package: factoextra
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
if(!require(dplyr)) install.packages("dplyr")
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
if(!require(ggplot2)) install.packages("ggplot2")
# Load the libraries
library(readr)
library(cluster)
library(factoextra)
library(dplyr)
library(ggplot2)
# Set working directory to the location of your CSV file if needed
# setwd("path/to/your/directory")
# Check if file exists before reading
if(file.exists("customer_segmentation.csv")) {
data <- read_csv("customer_segmentation.csv")
print("File successfully loaded")
} else {
# Create sample data if file doesn't exist (for demonstration)
set.seed(123)
data <- data.frame(
CustomerID = 1:22,
Age = sample(18:70, 22, replace = TRUE),
Income = sample(30000:150000, 22, replace = TRUE),
SpendingScore = sample(1:100, 22, replace = TRUE),
Frequency = sample(1:30, 22, replace = TRUE),
Recency = sample(1:365, 22, replace = TRUE)
)
print("Using sample data as file was not found")
}
## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] "File successfully loaded"
# View the first few rows
head(data)
## # A tibble: 6 × 15
## ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 2 2 2 2 2
## 2 2 1 2 1 1 1 1
## 3 3 2 1 1 1 1 2
## 4 4 3 3 2 4 1 2
## 5 5 2 1 3 5 2 1
## 6 6 1 1 3 2 1 1
## # ℹ 8 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## # Find_items <dbl>, other_shops <dbl>, Gender <dbl>, Age <dbl>,
## # Education <dbl>
# Summary of the data
summary(data)
## ID CS_helpful Recommend Come_again
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 6.25 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :11.50 Median :1.000 Median :1.000 Median :1.000
## Mean :11.50 Mean :1.591 Mean :1.318 Mean :1.455
## 3rd Qu.:16.75 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :22.00 Max. :3.000 Max. :3.000 Max. :3.000
## All_Products Profesionalism Limitation Online_grocery delivery
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.250 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :1.0 Median :2.000 Median :3.000
## Mean :2.091 Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000
## Pick_up Find_items other_shops Gender
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.250 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :2.000 Median :1.000
## Mean :2.455 Mean :1.455 Mean :2.591 Mean :1.273
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.750 3rd Qu.:1.750
## Max. :5.000 Max. :3.000 Max. :5.000 Max. :2.000
## Age Education
## Min. :2.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.500
## Mean :2.455 Mean :3.182
## 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :4.000 Max. :5.000
# Keep only numeric columns for clustering
data_clean <- data[, sapply(data, is.numeric)]
# Check for missing values
missing_values <- colSums(is.na(data_clean))
print(paste("Missing values in the dataset:", sum(missing_values)))
## [1] "Missing values in the dataset: 0"
# Remove columns with missing values if needed
if(sum(missing_values) > 0) {
data_clean <- data_clean[, missing_values == 0]
}
# Scale the numeric data
data_scaled <- scale(data_clean)
head(data_scaled)
## ID CS_helpful Recommend Come_again All_Products Profesionalism
## [1,] -1.6169801 0.5572385 1.0548991 0.7385489 -0.08536162 1.0009877
## [2,] -1.4629820 -0.8049001 1.0548991 -0.6154575 -1.02433946 -0.6929915
## [3,] -1.3089839 0.5572385 -0.4922862 -0.6154575 -1.02433946 -0.6929915
## [4,] -1.1549858 1.9193772 2.6020844 0.7385489 1.79259406 -0.6929915
## [5,] -1.0009877 0.5572385 -0.4922862 2.0925553 2.73157191 1.0009877
## [6,] -0.8469896 -0.8049001 -0.4922862 2.0925553 -0.08536162 -0.6929915
## Limitation Online_grocery delivery Pick_up Find_items other_shops
## [1,] 0.6236096 -0.3554390 0.8049001 1.4623535 -0.6774335 -0.4212692
## [2,] -0.6236096 -0.3554390 0.8049001 0.5161248 -0.6774335 -0.4212692
## [3,] 0.6236096 0.9478374 0.8049001 -0.4301040 -0.6774335 0.2916479
## [4,] 0.6236096 0.9478374 0.8049001 -0.4301040 0.8129201 -0.4212692
## [5,] -0.6236096 -0.3554390 0.8049001 -1.3763327 0.8129201 0.2916479
## [6,] -0.6236096 -1.6587154 -0.5572385 -1.3763327 -0.6774335 1.0045650
## Gender Age Education
## [1,] -0.598293 -0.6154575 -0.7284586
## [2,] -0.598293 -0.6154575 -0.7284586
## [3,] -0.598293 -0.6154575 -0.7284586
## [4,] -0.598293 0.7385489 1.1207055
## [5,] 1.595448 2.0925553 -0.7284586
## [6,] -0.598293 -0.6154575 1.1207055
# Compute distance matrix
dist_matrix <- dist(data_scaled, method = "euclidean")
# Perform hierarchical clustering using different methods
hc_complete <- hclust(dist_matrix, method = "complete")
hc_average <- hclust(dist_matrix, method = "average")
hc_ward <- hclust(dist_matrix, method = "ward.D2")
# Plot dendrograms
par(mfrow = c(1, 3))
plot(hc_complete, main = "Complete Linkage", xlab = "", sub = "", cex = 0.6)
plot(hc_average, main = "Average Linkage", xlab = "", sub = "", cex = 0.6)
plot(hc_ward, main = "Ward's Method", xlab = "", sub = "", cex = 0.6)
# Display elbow method plot
fviz_nbclust(data_scaled, FUN = hcut, method = "wss") +
labs(title = "Elbow Method for Optimal Clusters")
# Display silhouette method plot
fviz_nbclust(data_scaled, FUN = hcut, method = "silhouette") +
labs(title = "Silhouette Method for Optimal Clusters")
# Cut tree to create 3 clusters
clusters <- cutree(hc_ward, k = 3)
# Add cluster information to original data
data$cluster <- as.factor(clusters)
# Visualize clusters using fviz_cluster
fviz_cluster(list(data = data_scaled, cluster = clusters),
palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
ellipse.type = "convex",
repel = TRUE,
ggtheme = theme_minimal())
# Calculate mean values for each cluster
cluster_means <- data %>%
select(where(is.numeric), cluster) %>%
group_by(cluster) %>%
summarise_all(mean)
# Display cluster means
print(cluster_means)
## # A tibble: 3 × 16
## cluster ID CS_helpful Recommend Come_again All_Products Profesionalism
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 12.2 1.67 1.33 1.11 1.67 1.33
## 2 2 12 2.5 2 2.5 3.25 2
## 3 3 10.6 1.11 1 1.33 2 1.22
## # ℹ 9 more variables: Limitation <dbl>, Online_grocery <dbl>, delivery <dbl>,
## # Pick_up <dbl>, Find_items <dbl>, other_shops <dbl>, Gender <dbl>,
## # Age <dbl>, Education <dbl>
# Visualize key variables by cluster
if("Age" %in% colnames(data) && "Income" %in% colnames(data)) {
ggplot(data, aes(x = Age, y = Income, color = cluster)) +
geom_point(size = 3) +
labs(title = "Customer Segments by Age and Income",
x = "Age", y = "Income") +
theme_minimal()
}
The hierarchical clustering analysis has identified 3 distinct customer segments. The characteristics of each segment can be used to develop targeted marketing strategies and improve customer engagement.