Customer Segmentation using Hierarchical Clustering

# Set global options
knitr::opts_chunk$set(echo = TRUE, warning = TRUE, message = TRUE, error = TRUE)

# Install required packages if not already installed
if(!require(readr)) install.packages("readr")

## Loading required package: readr

if(!require(cluster)) install.packages("cluster")

## Loading required package: cluster

if(!require(factoextra)) install.packages("factoextra")

## Loading required package: factoextra

## Loading required package: ggplot2

## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

if(!require(dplyr)) install.packages("dplyr")

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

if(!require(ggplot2)) install.packages("ggplot2")

# Load the libraries
library(readr)
library(cluster)
library(factoextra)
library(dplyr)
library(ggplot2)

Loading and Examining the Data

# Set working directory to the location of your CSV file if needed
# setwd("path/to/your/directory")

# Check if file exists before reading
if(file.exists("customer_segmentation.csv")) {
  data <- read_csv("customer_segmentation.csv")
  print("File successfully loaded")
} else {
  # Create sample data if file doesn't exist (for demonstration)
  set.seed(123)
  data <- data.frame(
    CustomerID = 1:22,
    Age = sample(18:70, 22, replace = TRUE),
    Income = sample(30000:150000, 22, replace = TRUE),
    SpendingScore = sample(1:100, 22, replace = TRUE),
    Frequency = sample(1:30, 22, replace = TRUE),
    Recency = sample(1:365, 22, replace = TRUE)
  )
  print("Using sample data as file was not found")
}

## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## [1] "File successfully loaded"

# View the first few rows
head(data)

## # A tibble: 6 × 15
##      ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
##   <dbl>      <dbl>     <dbl>      <dbl>        <dbl>          <dbl>      <dbl>
## 1     1          2         2          2            2              2          2
## 2     2          1         2          1            1              1          1
## 3     3          2         1          1            1              1          2
## 4     4          3         3          2            4              1          2
## 5     5          2         1          3            5              2          1
## 6     6          1         1          3            2              1          1
## # ℹ 8 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## #   Find_items <dbl>, other_shops <dbl>, Gender <dbl>, Age <dbl>,
## #   Education <dbl>

# Summary of the data
summary(data)

##        ID          CS_helpful      Recommend       Come_again   
##  Min.   : 1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 6.25   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :11.50   Median :1.000   Median :1.000   Median :1.000  
##  Mean   :11.50   Mean   :1.591   Mean   :1.318   Mean   :1.455  
##  3rd Qu.:16.75   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :22.00   Max.   :3.000   Max.   :3.000   Max.   :3.000  
##   All_Products   Profesionalism    Limitation  Online_grocery     delivery    
##  Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.250   1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :1.000   Median :1.0   Median :2.000   Median :3.000  
##  Mean   :2.091   Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000  
##     Pick_up        Find_items     other_shops        Gender     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.250   1st Qu.:1.000  
##  Median :2.000   Median :1.000   Median :2.000   Median :1.000  
##  Mean   :2.455   Mean   :1.455   Mean   :2.591   Mean   :1.273  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:3.750   3rd Qu.:1.750  
##  Max.   :5.000   Max.   :3.000   Max.   :5.000   Max.   :2.000  
##       Age          Education    
##  Min.   :2.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :2.500  
##  Mean   :2.455   Mean   :3.182  
##  3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :4.000   Max.   :5.000

Data Preparation

# Keep only numeric columns for clustering
data_clean <- data[, sapply(data, is.numeric)]

# Check for missing values
missing_values <- colSums(is.na(data_clean))
print(paste("Missing values in the dataset:", sum(missing_values)))

## [1] "Missing values in the dataset: 0"

# Remove columns with missing values if needed
if(sum(missing_values) > 0) {
  data_clean <- data_clean[, missing_values == 0]
}

# Scale the numeric data
data_scaled <- scale(data_clean)
head(data_scaled)

##              ID CS_helpful  Recommend Come_again All_Products Profesionalism
## [1,] -1.6169801  0.5572385  1.0548991  0.7385489  -0.08536162      1.0009877
## [2,] -1.4629820 -0.8049001  1.0548991 -0.6154575  -1.02433946     -0.6929915
## [3,] -1.3089839  0.5572385 -0.4922862 -0.6154575  -1.02433946     -0.6929915
## [4,] -1.1549858  1.9193772  2.6020844  0.7385489   1.79259406     -0.6929915
## [5,] -1.0009877  0.5572385 -0.4922862  2.0925553   2.73157191      1.0009877
## [6,] -0.8469896 -0.8049001 -0.4922862  2.0925553  -0.08536162     -0.6929915
##      Limitation Online_grocery   delivery    Pick_up Find_items other_shops
## [1,]  0.6236096     -0.3554390  0.8049001  1.4623535 -0.6774335  -0.4212692
## [2,] -0.6236096     -0.3554390  0.8049001  0.5161248 -0.6774335  -0.4212692
## [3,]  0.6236096      0.9478374  0.8049001 -0.4301040 -0.6774335   0.2916479
## [4,]  0.6236096      0.9478374  0.8049001 -0.4301040  0.8129201  -0.4212692
## [5,] -0.6236096     -0.3554390  0.8049001 -1.3763327  0.8129201   0.2916479
## [6,] -0.6236096     -1.6587154 -0.5572385 -1.3763327 -0.6774335   1.0045650
##         Gender        Age  Education
## [1,] -0.598293 -0.6154575 -0.7284586
## [2,] -0.598293 -0.6154575 -0.7284586
## [3,] -0.598293 -0.6154575 -0.7284586
## [4,] -0.598293  0.7385489  1.1207055
## [5,]  1.595448  2.0925553 -0.7284586
## [6,] -0.598293 -0.6154575  1.1207055

Hierarchical Clustering Analysis

# Compute distance matrix
dist_matrix <- dist(data_scaled, method = "euclidean")

# Perform hierarchical clustering using different methods
hc_complete <- hclust(dist_matrix, method = "complete")
hc_average <- hclust(dist_matrix, method = "average")
hc_ward <- hclust(dist_matrix, method = "ward.D2")

# Plot dendrograms
par(mfrow = c(1, 3))
plot(hc_complete, main = "Complete Linkage", xlab = "", sub = "", cex = 0.6)
plot(hc_average, main = "Average Linkage", xlab = "", sub = "", cex = 0.6)
plot(hc_ward, main = "Ward's Method", xlab = "", sub = "", cex = 0.6)

Determining Optimal Number of Clusters

# Display elbow method plot
fviz_nbclust(data_scaled, FUN = hcut, method = "wss") +
  labs(title = "Elbow Method for Optimal Clusters")

# Display silhouette method plot
fviz_nbclust(data_scaled, FUN = hcut, method = "silhouette") +
  labs(title = "Silhouette Method for Optimal Clusters")

Creating and Visualizing the Clusters

# Cut tree to create 3 clusters
clusters <- cutree(hc_ward, k = 3)

# Add cluster information to original data
data$cluster <- as.factor(clusters)

# Visualize clusters using fviz_cluster
fviz_cluster(list(data = data_scaled, cluster = clusters),
             palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
             ellipse.type = "convex",
             repel = TRUE,
             ggtheme = theme_minimal())

Analyzing Cluster Characteristics

# Calculate mean values for each cluster
cluster_means <- data %>%
  select(where(is.numeric), cluster) %>%
  group_by(cluster) %>%
  summarise_all(mean)

# Display cluster means
print(cluster_means)

## # A tibble: 3 × 16
##   cluster    ID CS_helpful Recommend Come_again All_Products Profesionalism
##   <fct>   <dbl>      <dbl>     <dbl>      <dbl>        <dbl>          <dbl>
## 1 1        12.2       1.67      1.33       1.11         1.67           1.33
## 2 2        12         2.5       2          2.5          3.25           2   
## 3 3        10.6       1.11      1          1.33         2              1.22
## # ℹ 9 more variables: Limitation <dbl>, Online_grocery <dbl>, delivery <dbl>,
## #   Pick_up <dbl>, Find_items <dbl>, other_shops <dbl>, Gender <dbl>,
## #   Age <dbl>, Education <dbl>

# Visualize key variables by cluster
if("Age" %in% colnames(data) && "Income" %in% colnames(data)) {
  ggplot(data, aes(x = Age, y = Income, color = cluster)) +
    geom_point(size = 3) +
    labs(title = "Customer Segments by Age and Income",
         x = "Age", y = "Income") +
    theme_minimal()
}

Conclusion

The hierarchical clustering analysis has identified 3 distinct customer segments. The characteristics of each segment can be used to develop targeted marketing strategies and improve customer engagement.