UCI Machine Learning Repository: https://archive.ics.uci.edu/ml/datasets/Wholesale+customers
A food wholesaler wanted to understand the types of customer that used the store based on their purchasing history. This insight would allow the company to segment its target market and allocate its marketing resources to the group(s) most likely to generate increased sales. To achieve this I used cluster analysis on the purchasing history of 440 clients to segment the customer base into distinct groups.
The data showed spending on six product categories expressed as annualised spending in $USD:
library(readxl)
data <- read_xlsx("C:\\Users\\Dan\\Desktop\\R\\Markdowns\\Wholesale_data.xlsx")
head(data)
## # A tibble: 6 x 6
## Fresh Milk Grocery Frozen DP Deli
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 12669 9656 7561 214 2674 1338
## 2 7057 9810 9568 1762 3293 1776
## 3 6353 8808 7684 2405 3516 7844
## 4 13265 1196 4221 6404 507 1788
## 5 22615 5410 7198 3915 1777 5185
## 6 9413 8259 5126 666 1795 1451
apply(data, 2, function (x) sum(is.na(x)))
## Fresh Milk Grocery Frozen DP Deli
## 0 0 0 0 0 0
There were no missing values and all units are the same so no scaling required
summary(data)
## Fresh Milk Grocery Frozen
## Min. : 3 Min. : 55 Min. : 3 Min. : 25.0
## 1st Qu.: 3128 1st Qu.: 1533 1st Qu.: 2153 1st Qu.: 742.2
## Median : 8504 Median : 3627 Median : 4756 Median : 1526.0
## Mean : 12000 Mean : 5796 Mean : 7951 Mean : 3071.9
## 3rd Qu.: 16934 3rd Qu.: 7190 3rd Qu.:10656 3rd Qu.: 3554.2
## Max. :112151 Max. :73498 Max. :92780 Max. :60869.0
## DP Deli
## Min. : 3.0 Min. : 3.0
## 1st Qu.: 256.8 1st Qu.: 408.2
## Median : 816.5 Median : 965.5
## Mean : 2881.5 Mean : 1524.9
## 3rd Qu.: 3922.0 3rd Qu.: 1820.2
## Max. :40827.0 Max. :47943.0
library(tidyr)
library(ggplot2)
boxplot_df <- gather(data, key = "Product", value = "Spend")
ggplot(boxplot_df, aes(x = Product, y = Spend, fill = Product)) +
geom_boxplot() + coord_flip() +
ggtitle("Distribution of Customer Spend by Product")
The top spenders on each product made the distribution heavily right skewed. K-means and Hierarchical clustering methods are not very robust when dealing with outliers so I used two approaches:
By convention the ‘minPts’ parameter was set to (dimensions) + 1 = 6 and neighbourhood radius ‘eps’ was chosen at the “elbow” of kNN distance plot which was at a height of 7,000.
library(dbscan)
kNNdistplot(data, k = 6)
abline(h = 7000, col = "red", lty = 2)
results <- dbscan(data, eps = 7000, minPts = 6)
results
## DBSCAN clustering for 440 objects.
## Parameters: eps = 7000, minPts = 6
## The clustering contains 1 cluster(s) and 49 noise points.
##
## 0 1
## 49 391
##
## Available fields: cluster, eps, minPts
hullplot(data, results)
DBSCAN was only able to identify one cluster and 49 noise points, so wasn’t helpful to our analysis.
The top 6 spenders in each category were identified and removed.
library(dplyr)
IDddata <- mutate(data, ID = 1:440)
IDs <- apply(data, 2, function (x) sort(x, index.return = TRUE, decreasing = TRUE))
top6spenderIDs <- unique(c(IDs$Fresh$ix[1:6],
IDs$Milk$ix[1:6],
IDs$Grocery$ix[1:6],
IDs$Frozen$ix[1:6],
IDs$DP$ix[1:6],
IDs$Deli$ix[1:6]))
data_processed <- data[-top6spenderIDs,]
Euclidian distance was calculated and the algorithm was run to produce the uncut dendrogram.
dist_customers <- dist(data_processed, method = "euclidean")
hc_customers <- hclust(dist_customers, method = "complete")
library(factoextra)
fviz_dend(hc_customers, show_labels = FALSE, main = "Uncut Dendrogram")
I chose a cut height of 40,000 since any lower would have risked the robustness of the model and left any insights vulnerable to small changes in the source data.
cluster_assign <- cutree(hc_customers, h = 40000)
table(cluster_assign)
## cluster_assign
## 1 2 3
## 332 49 37
fviz_dend(hc_customers,
k = 3,
show_labels = FALSE,
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
rect = TRUE,
rect_border = c("#2E9FDF", "#00AFBB", "#E7B800"),
rect_fill = TRUE,
main = "Three Cluster Dendrogram")
data_segmented <- mutate(data_processed, Cluster = cluster_assign)
data_segmented %>%
group_by(Cluster) %>%
summarise_all(funs(round(mean(.)))) %>%
mutate(TotalSpend = rowSums(.[2:7]), ClusterSize = as.integer(c(332,49,37)))
## # A tibble: 3 x 9
## Cluster Fresh Milk Grocery Frozen DP Deli TotalSpend ClusterSize
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1 8369 4055 5401 2431 1821 1111 23188 332
## 2 2 31865 4852 5591 4249 928 1751 49236 49
## 3 3 4421 14221 23849 1452 10771 1703 56417 37
The Hierarchical clustering method identified three distinct customer profiles. These were compared to results of k-means clustering to converge on a conclusion.
I used two methods for selecting a value for k, the number of clusters to create.
library(purrr)
tot_withinss <- map_dbl(2:10, function(k){
model <- kmeans(data_processed, centers = k, nstart = 20)
model$tot.withinss
})
screeplot_df <- data.frame(k = 2:10,
tot_withinss = tot_withinss)
ggplot(screeplot_df, aes(x = k, y = tot_withinss)) +
geom_line() +
scale_x_continuous(breaks = 1:10) +
labs(x = "k", y = "Within Cluster Sum of Squares") +
ggtitle("Total Within Cluster Sum of Squares by # of Clusters (k)") +
geom_point(data = screeplot_df[2,], aes(x = k, y = tot_withinss),
col = "red2", pch = 4, size = 7)
The “elbow” is at k=3
library(cluster)
avg_silwidth <- map_dbl(2:10, function(k){
model <- pam(data_processed, k = k)
model$silinfo$avg.width
})
sil_width_df <- data.frame(k = 2:10,
avg_silwidth = avg_silwidth)
ggplot(sil_width_df, aes(x = k, y = avg_silwidth)) + geom_line() +
labs(x = "k", y = "Average Silhouette Width") +
ggtitle("Average Silhouette Width by Number of Clusters (k)") +
scale_x_continuous(breaks = 1:10) +
geom_point(data = sil_width_df[2,], aes(x = k, y = avg_silwidth),
col="red2", pch = 4, size = 6)
The maximum average silhouette width was achieved using k = 3 clusters so I used a value of 3 for k-means clustering
km.out <- kmeans(data_processed, centers = 3, nstart = 50)
hclust_assign <- km.out$cluster
km_segment_customers <- mutate(data_processed, Cluster = hclust_assign)
table(km_segment_customers$Cluster)
##
## 1 2 3
## 238 100 80
km_segment_customers %>%
group_by(Cluster) %>%
summarise_all(funs(round(mean(.)))) %>%
mutate(TotalSpend = rowSums(.[2:7]), ClusterSize = as.integer(c(238,80,100)))
## # A tibble: 3 x 9
## Cluster Fresh Milk Grocery Frozen DP Deli TotalSpend ClusterSize
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1 6686 3203 4015 2390 1178 967 18439 238
## 2 2 25342 4010 5171 3764 1126 1566 40979 80
## 3 3 4724 11837 18462 1547 8196 1639 46405 100
K-means clustering also showed three distinct customer profiles but with a more even distribution of customers between the clusters.
## # A tibble: 3 x 9
## Cluster Fresh Milk Grocery Frozen DP Deli TotalSpend ClusterSize
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1 8369 4055 5401 2431 1821 1111 23188 332
## 2 2 31865 4852 5591 4249 928 1751 49236 49
## 3 3 4421 14221 23849 1452 10771 1703 56417 37
## # A tibble: 3 x 9
## Cluster Fresh Milk Grocery Frozen DP Deli TotalSpend ClusterSize
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1 6686 3203 4015 2390 1178 967 18439 238
## 2 2 25342 4010 5171 3764 1126 1566 40979 80
## 3 3 4724 11837 18462 1547 8196 1639 46405 100
The models agreed that the customer base of the wholesaler could be broken down into three profiles.
Small spenders
Customers who had a small total spend, below $25,000 per year on average.
Big spenders on Fresh products
Customers who spent over $30,000 per year with over half of their total spend on fresh products.
Big spenders on Grocery, Milk, Detergents and Paper
Customers who spent over $30,000 per year with milk, grocery, detergent and paper products accounting for over two thirds of their total spend.