This document performs hierarchical clustering analysis for market segmentation. The analysis helps identify distinct customer groups based on common characteristics, which can inform targeted marketing strategies.
# Load required packages
library(readr)
library(ggplot2)
# Read the customer segmentation CSV (path relative to the working directory)
mydata <- read_csv(file = "customer_segmentation.csv")
## Rows: 22 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (15): ID, CS_helpful, Recommend, Come_again, All_Products, Profesionalis...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the leading rows to sanity-check the import
head(mydata, n = 6L)
## # A tibble: 6 × 15
## ID CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 2 2 2 2 2 2
## 2 2 1 2 1 1 1 1
## 3 3 2 1 1 1 1 2
## 4 4 3 3 2 4 1 2
## 5 5 2 1 3 5 2 1
## 6 6 1 1 3 2 1 1
## # ℹ 8 more variables: Online_grocery <dbl>, delivery <dbl>, Pick_up <dbl>,
## # Find_items <dbl>, other_shops <dbl>, Gender <dbl>, Age <dbl>,
## # Education <dbl>
# Per-column quartiles, extremes and means
summary(object = mydata)
## ID CS_helpful Recommend Come_again
## Min. : 1.00 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.: 6.25 1st Qu.:1.000 1st Qu.:1.000 1st Qu.:1.000
## Median :11.50 Median :1.000 Median :1.000 Median :1.000
## Mean :11.50 Mean :1.591 Mean :1.318 Mean :1.455
## 3rd Qu.:16.75 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:2.000
## Max. :22.00 Max. :3.000 Max. :3.000 Max. :3.000
## All_Products Profesionalism Limitation Online_grocery delivery
## Min. :1.000 Min. :1.000 Min. :1.0 Min. :1.000 Min. :1.000
## 1st Qu.:1.250 1st Qu.:1.000 1st Qu.:1.0 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :1.000 Median :1.0 Median :2.000 Median :3.000
## Mean :2.091 Mean :1.409 Mean :1.5 Mean :2.273 Mean :2.409
## 3rd Qu.:2.000 3rd Qu.:2.000 3rd Qu.:2.0 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :5.000 Max. :3.000 Max. :4.0 Max. :3.000 Max. :3.000
## Pick_up Find_items other_shops Gender
## Min. :1.000 Min. :1.000 Min. :1.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:1.250 1st Qu.:1.000
## Median :2.000 Median :1.000 Median :2.000 Median :1.000
## Mean :2.455 Mean :1.455 Mean :2.591 Mean :1.273
## 3rd Qu.:3.000 3rd Qu.:2.000 3rd Qu.:3.750 3rd Qu.:1.750
## Max. :5.000 Max. :3.000 Max. :5.000 Max. :2.000
## Age Education
## Min. :2.000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:2.000
## Median :2.000 Median :2.500
## Mean :2.455 Mean :3.182
## 3rd Qu.:3.000 3rd Qu.:5.000
## Max. :4.000 Max. :5.000
# Standardize the clustering variables (mean 0, standard deviation 1) so that
# variables measured on wider scales do not dominate distance calculations.
# The ID column is excluded by name, and only if it is present, instead of by
# position: this keeps the code correct when the column order changes or the
# file carries no ID column at all.
feature_cols <- setdiff(names(mydata), "ID")
use <- scale(mydata[, feature_cols], center = TRUE, scale = TRUE)
# Check the result of standardization
head(use)
## CS_helpful Recommend Come_again All_Products Profesionalism Limitation
## [1,] 0.5572385 1.0548991 0.7385489 -0.08536162 1.0009877 0.6236096
## [2,] -0.8049001 1.0548991 -0.6154575 -1.02433946 -0.6929915 -0.6236096
## [3,] 0.5572385 -0.4922862 -0.6154575 -1.02433946 -0.6929915 0.6236096
## [4,] 1.9193772 2.6020844 0.7385489 1.79259406 -0.6929915 0.6236096
## [5,] 0.5572385 -0.4922862 2.0925553 2.73157191 1.0009877 -0.6236096
## [6,] -0.8049001 -0.4922862 2.0925553 -0.08536162 -0.6929915 -0.6236096
## Online_grocery delivery Pick_up Find_items other_shops Gender
## [1,] -0.3554390 0.8049001 1.4623535 -0.6774335 -0.4212692 -0.598293
## [2,] -0.3554390 0.8049001 0.5161248 -0.6774335 -0.4212692 -0.598293
## [3,] 0.9478374 0.8049001 -0.4301040 -0.6774335 0.2916479 -0.598293
## [4,] 0.9478374 0.8049001 -0.4301040 0.8129201 -0.4212692 -0.598293
## [5,] -0.3554390 0.8049001 -1.3763327 0.8129201 0.2916479 1.595448
## [6,] -1.6587154 -0.5572385 -1.3763327 -0.6774335 1.0045650 -0.598293
## Age Education
## [1,] -0.6154575 -0.7284586
## [2,] -0.6154575 -0.7284586
## [3,] -0.6154575 -0.7284586
## [4,] 0.7385489 1.1207055
## [5,] 2.0925553 -0.7284586
## [6,] -0.6154575 1.1207055
# Number of clusters to extract. Change it here only: both the dendrogram
# highlighting and the cluster assignment below read this value, so they
# can no longer get out of sync.
n_clusters <- 3
# Pairwise distances between the rows of `use`
# (dist() uses Euclidean distance unless told otherwise)
dist_matrix <- dist(use)
# Agglomerative hierarchical clustering
# (hclust() uses complete linkage unless told otherwise)
seg.hclust <- hclust(dist_matrix)
# Plot dendrogram
plot(seg.hclust, main = "Hierarchical Clustering Dendrogram",
     xlab = "Observations", sub = "", cex = 0.6)
# Outline the clusters on the dendrogram
rect.hclust(seg.hclust, k = n_clusters, border = "red")
# Cut the tree into the same number of clusters that was outlined above.
# The name `groups.3` is kept because later code refers to it.
groups.3 <- cutree(seg.hclust, k = n_clusters)
# Count observations in each cluster
table(groups.3)
## groups.3
## 1 2 3
## 17 3 2
# Collect the IDs that were assigned to each of the three clusters
cluster1_members <- mydata$ID[which(groups.3 == 1)]
cluster2_members <- mydata$ID[which(groups.3 == 2)]
cluster3_members <- mydata$ID[which(groups.3 == 3)]
# Peek at the leading IDs of every cluster
head(cluster1_members)
## [1] 1 2 3 6 7 8
head(cluster2_members)
## [1] 4 20 22
head(cluster3_members)
## [1] 5 19
# Per-cluster average of every variable (first column, the ID, left out)
cluster_means <- aggregate(
  x = mydata[, -1],
  by = list(Cluster = groups.3),
  FUN = mean
)
print(cluster_means)
## Cluster CS_helpful Recommend Come_again All_Products Profesionalism
## 1 1 1.294118 1.117647 1.235294 1.823529 1.235294
## 2 2 2.666667 2.666667 2.000000 2.333333 2.000000
## 3 3 2.500000 1.000000 2.500000 4.000000 2.000000
## Limitation Online_grocery delivery Pick_up Find_items other_shops Gender
## 1 1.352941 2.235294 2.235294 2.705882 1.294118 2.647059 1.176471
## 2 1.666667 3.000000 3.000000 2.000000 1.666667 2.333333 1.333333
## 3 2.500000 1.500000 3.000000 1.000000 2.500000 2.500000 2.000000
## Age Education
## 1 2.411765 3.117647
## 2 2.333333 4.333333
## 3 3.000000 2.000000
# Per-cluster median of every variable (first column, the ID, left out)
cluster_medians <- aggregate(
  x = mydata[, -1],
  by = list(Cluster = groups.3),
  FUN = median
)
print(cluster_medians)
## Cluster CS_helpful Recommend Come_again All_Products Profesionalism
## 1 1 1.0 1 1.0 2 1
## 2 2 3.0 3 2.0 2 2
## 3 3 2.5 1 2.5 4 2
## Limitation Online_grocery delivery Pick_up Find_items other_shops Gender Age
## 1 1.0 2.0 2 3 1.0 2.0 1 2
## 2 2.0 3.0 3 2 2.0 2.0 1 2
## 3 2.5 1.5 3 1 2.5 2.5 2 3
## Education
## 1 2
## 2 5
## 3 2
# Select the variables whose per-cluster distributions will be plotted:
# the first three columns that follow the ID column (adjust the range as needed)
key_vars <- names(mydata)[-1][seq_len(3)]
# Draw a boxplot of one variable split by cluster membership.
#
# Arguments:
#   var_name - a single column selector for `data` (normally the column name).
#   data     - data frame holding the variable; defaults to the global `mydata`.
#   groups   - cluster label for each row of `data`; defaults to the global
#              `groups.3`.
#
# Returns, invisibly, the value of boxplot(). Stops with an explicit message
# when `var_name` does not identify a column of `data`, or when `groups` does
# not supply exactly one label per row.
plot_cluster_diff <- function(var_name, data = mydata, groups = groups.3) {
  if (length(var_name) != 1L) {
    stop("`var_name` must identify exactly one column", call. = FALSE)
  }
  values <- data[[var_name]]
  # `[[` yields NULL for an unknown column name; fail here with a clear message
  # instead of letting boxplot() raise an obscure formula error.
  if (is.null(values)) {
    stop("Column '", var_name, "' not found in `data`", call. = FALSE)
  }
  if (length(groups) != length(values)) {
    stop("`groups` must contain one cluster label per row of `data`",
         call. = FALSE)
  }
  boxplot(values ~ groups,
          main = paste("Distribution of", var_name, "by Cluster"),
          xlab = "Cluster", ylab = var_name)
}
# Draw one boxplot per selected variable
invisible(lapply(key_vars, plot_cluster_diff))
# Perform PCA on `use`.
# The argument is spelled `scale.` in full: prcomp() has no argument named
# `scale`, so the shorter spelling only works through partial argument
# matching.
pca_result <- prcomp(use, scale. = TRUE)
# Summary of PCA
summary(pca_result)
# Scores on the first two principal components, tagged with cluster membership
pca_data <- as.data.frame(pca_result$x[, 1:2])
pca_data$cluster <- as.factor(groups.3)
# Scatter plot of the observations in PC1/PC2 space, colored by cluster
ggplot(pca_data, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(alpha = 0.7) +
  theme_minimal() +
  labs(title = "PCA Plot with Cluster Assignment",
       x = "Principal Component 1",
       y = "Principal Component 2")
# Save the cluster label of every ID
cluster_assignments <- data.frame(ID = mydata$ID, Cluster = groups.3)
write.csv(cluster_assignments, file = "cluster_assignments.csv",
          row.names = FALSE)
# Save the per-cluster summary tables
write.csv(cluster_means, file = "cluster_means.csv", row.names = FALSE)
write.csv(cluster_medians, file = "cluster_medians.csv", row.names = FALSE)