Market Segmentation using Hierarchical Clustering

Project Overview

This document applies hierarchical clustering to customer data for market segmentation. The analysis identifies distinct customer groups that share common characteristics, which can inform targeted marketing strategies.

Data Import and Preparation

# Load required packages
library(readr)
library(ggplot2)

# Read the customer segmentation data from CSV into a tibble
mydata <- read_csv(file = "customer_segmentation.csv")

## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Preview the first six rows of the dataset
head(mydata, n = 6L)

## # A tibble: 6 × 15
##      ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
##   <dbl>      <dbl>     <dbl>      <dbl>        <dbl>          <dbl>      <dbl>
## 1     1          2         2          2            2              2          2
## 2     2          1         2          1            1              1          1
## 3     3          2         1          1            1              1          2
## 4     4          3         3          2            4              1          2
## 5     5          2         1          3            5              2          1
## 6     6          1         1          3            2              1          1
## # ℹ 8 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## #   Find_items <dbl>, other_shops <dbl>, Gender <dbl>, Age <dbl>,
## #   Education <dbl>

# Per-column summary (min, quartiles, median, mean, max)
summary(object = mydata)

##        ID          CS_helpful      Recommend       Come_again   
##  Min.   : 1.00   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 6.25   1st Qu.:1.000   1st Qu.:1.000   1st Qu.:1.000  
##  Median :11.50   Median :1.000   Median :1.000   Median :1.000  
##  Mean   :11.50   Mean   :1.591   Mean   :1.318   Mean   :1.455  
##  3rd Qu.:16.75   3rd Qu.:2.000   3rd Qu.:1.000   3rd Qu.:2.000  
##  Max.   :22.00   Max.   :3.000   Max.   :3.000   Max.   :3.000  
##   All_Products   Profesionalism    Limitation  Online_grocery     delivery    
##  Min.   :1.000   Min.   :1.000   Min.   :1.0   Min.   :1.000   Min.   :1.000  
##  1st Qu.:1.250   1st Qu.:1.000   1st Qu.:1.0   1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :1.000   Median :1.0   Median :2.000   Median :3.000  
##  Mean   :2.091   Mean   :1.409   Mean   :1.5   Mean   :2.273   Mean   :2.409  
##  3rd Qu.:2.000   3rd Qu.:2.000   3rd Qu.:2.0   3rd Qu.:3.000   3rd Qu.:3.000  
##  Max.   :5.000   Max.   :3.000   Max.   :4.0   Max.   :3.000   Max.   :3.000  
##     Pick_up        Find_items     other_shops        Gender     
##  Min.   :1.000   Min.   :1.000   Min.   :1.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:1.000   1st Qu.:1.250   1st Qu.:1.000  
##  Median :2.000   Median :1.000   Median :2.000   Median :1.000  
##  Mean   :2.455   Mean   :1.455   Mean   :2.591   Mean   :1.273  
##  3rd Qu.:3.000   3rd Qu.:2.000   3rd Qu.:3.750   3rd Qu.:1.750  
##  Max.   :5.000   Max.   :3.000   Max.   :5.000   Max.   :2.000  
##       Age          Education    
##  Min.   :2.000   Min.   :1.000  
##  1st Qu.:2.000   1st Qu.:2.000  
##  Median :2.000   Median :2.500  
##  Mean   :2.455   Mean   :3.182  
##  3rd Qu.:3.000   3rd Qu.:5.000  
##  Max.   :4.000   Max.   :5.000

Data Standardization

# Standardize the clustering variables (mean 0, standard deviation 1).
# The ID column is excluded by name rather than by position, so a dataset
# that has no ID column, or has ID somewhere other than first, keeps all of
# its variables. Edit `feature_cols` if other columns should be left out.
feature_cols <- setdiff(names(mydata), "ID")
use <- scale(mydata[, feature_cols], center = TRUE, scale = TRUE)

# Check the result of standardization
head(use)

##      CS_helpful  Recommend Come_again All_Products Profesionalism Limitation
## [1,]  0.5572385  1.0548991  0.7385489  -0.08536162      1.0009877  0.6236096
## [2,] -0.8049001  1.0548991 -0.6154575  -1.02433946     -0.6929915 -0.6236096
## [3,]  0.5572385 -0.4922862 -0.6154575  -1.02433946     -0.6929915  0.6236096
## [4,]  1.9193772  2.6020844  0.7385489   1.79259406     -0.6929915  0.6236096
## [5,]  0.5572385 -0.4922862  2.0925553   2.73157191      1.0009877 -0.6236096
## [6,] -0.8049001 -0.4922862  2.0925553  -0.08536162     -0.6929915 -0.6236096
##      Online_grocery   delivery    Pick_up Find_items other_shops    Gender
## [1,]     -0.3554390  0.8049001  1.4623535 -0.6774335  -0.4212692 -0.598293
## [2,]     -0.3554390  0.8049001  0.5161248 -0.6774335  -0.4212692 -0.598293
## [3,]      0.9478374  0.8049001 -0.4301040 -0.6774335   0.2916479 -0.598293
## [4,]      0.9478374  0.8049001 -0.4301040  0.8129201  -0.4212692 -0.598293
## [5,]     -0.3554390  0.8049001 -1.3763327  0.8129201   0.2916479  1.595448
## [6,]     -1.6587154 -0.5572385 -1.3763327 -0.6774335   1.0045650 -0.598293
##             Age  Education
## [1,] -0.6154575 -0.7284586
## [2,] -0.6154575 -0.7284586
## [3,] -0.6154575 -0.7284586
## [4,]  0.7385489  1.1207055
## [5,]  2.0925553 -0.7284586
## [6,] -0.6154575  1.1207055

Hierarchical Clustering Analysis

# Pairwise Euclidean distances between the rows of the standardized matrix
dist_matrix <- dist(use, method = "euclidean")

# Agglomerative clustering with complete linkage (the hclust default)
seg.hclust <- hclust(dist_matrix, method = "complete")

# Draw the dendrogram
plot(
  seg.hclust,
  cex = 0.6,
  sub = "",
  xlab = "Observations",
  main = "Hierarchical Clustering Dendrogram"
)

# Outline the three-cluster solution on the dendrogram
rect.hclust(seg.hclust, border = "red", k = 3)

Identifying Cluster Memberships

# Assign every observation to one of 3 clusters by cutting the dendrogram
# Change `k` to obtain a different number of clusters
groups.3 <- cutree(seg.hclust, k = 3)

# Number of observations per cluster
table(groups.3)

## groups.3
##  1  2  3 
## 17  3  2

# IDs of the observations assigned to each cluster
cluster1_members <- mydata[["ID"]][groups.3 == 1]
cluster2_members <- mydata[["ID"]][groups.3 == 2]
cluster3_members <- mydata[["ID"]][groups.3 == 3]

# Preview up to six member IDs per cluster
head(cluster1_members)

## [1] 1 2 3 6 7 8

head(cluster2_members)

## [1]  4 20 22

head(cluster3_members)

## [1]  5 19

Analyzing Cluster Characteristics

# Per-cluster mean of every non-ID variable
cluster_means <- aggregate(
  x = mydata[, -1],
  by = list(Cluster = groups.3),
  FUN = mean
)
print(cluster_means)

##   Cluster CS_helpful Recommend Come_again All_Products Profesionalism
## 1       1   1.294118  1.117647   1.235294     1.823529       1.235294
## 2       2   2.666667  2.666667   2.000000     2.333333       2.000000
## 3       3   2.500000  1.000000   2.500000     4.000000       2.000000
##   Limitation Online_grocery delivery  Pick_up Find_items other_shops   Gender
## 1   1.352941       2.235294 2.235294 2.705882   1.294118    2.647059 1.176471
## 2   1.666667       3.000000 3.000000 2.000000   1.666667    2.333333 1.333333
## 3   2.500000       1.500000 3.000000 1.000000   2.500000    2.500000 2.000000
##        Age Education
## 1 2.411765  3.117647
## 2 2.333333  4.333333
## 3 3.000000  2.000000

# Per-cluster median of every non-ID variable
cluster_medians <- aggregate(
  x = mydata[, -1],
  by = list(Cluster = groups.3),
  FUN = median
)
print(cluster_medians)

##   Cluster CS_helpful Recommend Come_again All_Products Profesionalism
## 1       1        1.0         1        1.0            2              1
## 2       2        3.0         3        2.0            2              2
## 3       3        2.5         1        2.5            4              2
##   Limitation Online_grocery delivery Pick_up Find_items other_shops Gender Age
## 1        1.0            2.0        2       3        1.0         2.0      1   2
## 2        2.0            3.0        3       2        2.0         2.0      1   2
## 3        2.5            1.5        3       1        2.5         2.5      2   3
##   Education
## 1         2
## 2         5
## 3         2

# Variables to compare across clusters in the boxplots:
# the first three columns after ID (change the index range to pick others)
key_vars <- colnames(mydata)[-1][1:3]

# Draw a boxplot of one variable split by cluster membership.
#
# Arguments:
#   var_name  Name of the column to plot (passed to `[[`).
#   data      Data frame holding the column; defaults to the global `mydata`.
#   groups    Cluster label for each row of `data`; defaults to the global
#             `groups.3`.
#
# Returns (invisibly) the value of boxplot().
# Stops with an error when the column does not exist in `data` or when
# `groups` does not supply exactly one label per observation.
plot_cluster_diff <- function(var_name, data = mydata, groups = groups.3) {
  values <- data[[var_name]]
  if (is.null(values)) {
    stop("Column '", var_name, "' not found in `data`.", call. = FALSE)
  }
  if (length(groups) != length(values)) {
    stop("`groups` must have one label per row of `data`.", call. = FALSE)
  }
  # The formula is evaluated in this function's environment, so `values`
  # and `groups` resolve to the local objects above.
  boxplot(values ~ groups,
          main = paste("Distribution of", var_name, "by Cluster"),
          xlab = "Cluster", ylab = var_name)
}

# Draw one boxplot per key variable
# (the loop variable is not called `var`, which is also a stats function name)
for (key_var in key_vars) {
  plot_cluster_diff(key_var)
}

Principal Component Analysis (Optional)

# Perform PCA on the standardized matrix.
# `scale.` is written out in full: prcomp() has no argument named `scale`,
# so the shorter spelling only worked through partial argument matching.
pca_result <- prcomp(use, scale. = TRUE)

# Summary of PCA
summary(pca_result)

# Plot the first two principal components with cluster colors
pca_data <- as.data.frame(pca_result$x[, 1:2])
pca_data$cluster <- as.factor(groups.3)

ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "PCA Plot with Cluster Assignment",
       x = "Principal Component 1",
       y = "Principal Component 2")

Export Results

# Write the ID-to-cluster lookup table
cluster_assignments <- data.frame(ID = mydata$ID, Cluster = groups.3)
write.csv(cluster_assignments, file = "cluster_assignments.csv",
          row.names = FALSE)

# Write the per-cluster mean and median tables
write.csv(cluster_means, file = "cluster_means.csv", row.names = FALSE)
write.csv(cluster_medians, file = "cluster_medians.csv", row.names = FALSE)