Note: The objective is not to predict a given outcome but to discover useful patterns in the data.

1 Clear Workspace

rm(list = ls())

2 Load packages

library(ggplot2)
library(dplyr)
library(pastecs)
library(fpc)
library(FactoMineR)
library(readxl)

3 Load data

raw.data <- read_excel("segmentationdata.xlsx")
data <- raw.data

4 Data wrangling

4.1 Calculate Total spending

data$Total <- data$Quantity * data$UnitPrice

4.2 Select column

cust_data <- select(data, c(InvoiceDate, CustomerID, Total))

4.3 RFM indicators

# rename column 
cust_data <- rename(cust_data,
                    customer_id = CustomerID, 
                    purchase_amount = Total, 
                    date_of_purchase= InvoiceDate)

# Convert purchase date to R date object
cust_data$date_of_purchase <- as.Date(cust_data$date_of_purchase, "%Y-%m-%d")

## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%Y-%m-%d'

# recency
cust_data$days_since <- as.numeric(difftime(time1 = "2012-01-01",
                                            time2 = cust_data$date_of_purchase,
                                            units = "days"))

# frequency & avg_amount spend
customers <- cust_data %>% 
  group_by(customer_id) %>% 
  summarise(recency = min(days_since),
            frequency = n(),
            avg_amount = mean(purchase_amount))

customers <- customers %>% filter(!is.na(customer_id))
customers <- customers %>% filter(!avg_amount <= 0)

5 Data exploration

5.1 Descriptive statistics

options(digits = 2, scipen = 99)
stat.desc(select(customers, -customer_id))

##                recency frequency avg_amount
## nbr.val        4317.00    4317.0    4317.00
## nbr.null          0.00       0.0       0.00
## nbr.na            0.00       0.0       0.00
## min              22.71       1.0       0.73
## max             395.71    7983.0    3861.00
## range           373.00    7982.0    3860.28
## sum          486143.88  406236.0  135070.90
## median           71.71      43.0      17.02
## mean            112.61      94.1      31.29
## SE.mean           1.51       3.6       1.60
## CI.mean.0.95      2.96       7.0       3.13
## var            9833.05   54634.7   11017.51
## std.dev          99.16     233.7     104.96
## coef.var          0.88       2.5       3.35

# data types
str(customers)

## tibble [4,317 × 4] (S3: tbl_df/tbl/data.frame)
##  $ customer_id: num [1:4317] 12347 12348 12349 12350 12352 ...
##  $ recency    : num [1:4317] 24.7 97.7 40.7 332.7 58.7 ...
##  $ frequency  : int [1:4317] 182 31 73 17 95 4 58 13 59 131 ...
##  $ avg_amount : num [1:4317] 23.7 58 24.1 19.7 16.3 ...

5.2 Distributions

5.2.1 Frequency

ggplot(customers, aes(x = frequency)) +
   geom_histogram(bins = 20, fill = "steelblue") + #bin: width of bar 
   theme_classic() +
   labs(x = "", title = "Purchase frequency")

  # + scale_x_continuous(breaks = c(0, 5, 10, 15, 20, 25, 30, 35, 45, 50, 60, 70, 100, 200, 300, 400, 500, 8000))

Note: The distribution is severely right skewed. Most customers have made between 1 and 500 purchases.

dim(filter(customers, frequency == 4)) [1]/dim(customers)[1]

## [1] 0.012

Note: Almost 1.2% of customers have only ever made 4 purchase

5.2.2 Recency

ggplot(customers, aes(x = recency)) +
   geom_histogram(fill = "darkred") +
   theme_classic() +
   labs(x = "", y = "", title = "Recency in Days") +
  scale_x_continuous(breaks = c(0,20,40,50,70,80,100,120,200,250,300,350,400))

dim(filter(customers, recency < 365))[1]/dim(customers)[1]

## [1] 0.97

Note: 97% of customers have been active in the last year.

5.2.3 Amount

summary(customers$avg_amount)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1      11      17      31      24    3861

ggplot(customers, aes(x = avg_amount)) +
   geom_histogram(bins = 20, fill = "darkred") +
   theme_classic() +
   labs(x = "average amount", title = "Average purchase amount")

ggplot(customers, aes(x = avg_amount)) +
   geom_histogram(bins = 20, fill = "darkred") +
   theme_classic() +
   labs(x = "amount (log10)", 
        title = "Average purchase amount\n (Data are log transformed to ease visualization)") +
   scale_x_log10()

6 Statistical Segmentation

6.1 Data preparation for modelling

# Remove customer_id column and set as row names
row.names(customers) <- customers$customer_id

## Warning: Setting row names on a tibble is deprecated.

customers <- select(customers, -customer_id)

Scale data to balance the influence of variables measured on different scales and/or with vastly different variances

clust_data <- scale(customers)

6.2 Clustering Method

set.seed(11)
hc.complete =hclust (dist(clust_data), method = "complete")
hc.average =hclust (dist(clust_data), method = "average")
hc.ward =hclust (dist(clust_data), method = "ward.D2")

6.3 Dendrograms

plot(hc.complete, main = "Complete Linkage", xlab = "", sub = "", cex = .9)

plot(hc.average, main = "Average Linkage", xlab = "", sub = "", cex = .9)

plot(hc.ward, main = "Cluster Tree", xlab = "", sub = "", cex = .9)
rect.hclust(hc.ward, k = 5, border = "darkred")

Note: The Ward.D2 linkage provides the most balanced dendrogram and will be used to obtain the clusters.

Next, Cut the dendrogram to produce 5 clusters. This choice is partly based on cluster validity evaluation by R packages {NbClust} and {clValid}. The tests are unfortunately not reproducible here due to memory constraints.

plot(hc.ward, main = "Ward.D2 agglomeration method", xlab = "", sub = "", cex = .9)
abline(h = 30, col = "red")

clusters <- cutree(hc.ward, 5)
table(clusters)

## clusters
##    1    2    3    4    5 
## 3060 1131  116    4    6

customers$cluster <- clusters

6.4 Segment Centroids

We calculate the average recency, frequency and amount spent for each segment:

profiles <- customers %>% 
  group_by(cluster) %>%
  summarise_each(funs(mean))

## Warning: `summarise_each_()` was deprecated in dplyr 0.7.0.
## Please use `across()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

cluster_counts <- customers %>% 
  group_by(cluster) %>% 
  summarise(counts = n())

profiles <- mutate(profiles, counts = cluster_counts$counts) 
profiles

## # A tibble: 5 × 5
##   cluster recency frequency avg_amount counts
##     <int>   <dbl>     <dbl>      <dbl>  <int>
## 1       1    62.5     86.6        30.7   3060
## 2       2   257.      30.1        22.3   1131
## 3       3    31.4    719.         19.5    116
## 4       4    24.2   5914          11.2      4
## 5       5   112.       5.17     2248.       6

7 Cluster Visulisation

7.1 Pairwise

customers$cluster <- as.factor(customers$cluster)

ggplot(data = customers,
       aes(x = frequency, y = avg_amount, colour = cluster, shape = cluster)) +
  geom_point() +
  theme_dark() +
  labs(title = "average amount spent vs. frequency", shape = "cluster", colour = "cluster")

ggplot(data = customers,
       aes(x = frequency, y = avg_amount, colour = cluster, shape = cluster)) +
  geom_point() +
  theme_grey() +
  labs(title = "", 
       x = "purchase frequency", y = "$", shape = "customer\nclusters", colour = "customer\nclusters")

# avg_amount against recency with segment as the grouping variable 
ggplot(data = customers,
       aes(x = recency, y = avg_amount, colour = cluster, shape = cluster)) +
  geom_point() +
  theme_dark() +
  labs(title = "average amount spent vs recency",  shape = "cluster", colour = "cluster")

7.2 First 2 Principal Components

# Project clusters onto the first 2 principal components
prin_comp <- princomp(clust_data) #clust_data <- du lieu da duoc scale 
nComp <- 2
project <- predict(prin_comp, newdata=clust_data)[,1:nComp]

# Create dataframe with transformed data
project_data <- cbind(as.data.frame(project),
                      cluster = as.factor(clusters))

ggplot(project_data, aes(x=Comp.1, y=Comp.2)) +
   geom_point(aes(shape=cluster, colour = cluster)) +
   theme_dark() +
   labs(x = "Principal Component 1", y = "Principal Component 2")

Segment 4,5 clearly separates itself out and shows significant variability.

8 Cluster evaluation

The clusterboot algorithm attempts to answer this question by evaluating the robustness of each cluster to perturbations (nhiễu loạn) in the data. The cluster’s stability is evaluated by measuring the similarity between sets of a 100 bootstrap samples.

clust_no <- 5

set.seed(12)
cboot_hclust <- clusterboot(clust_data, 
                            clustermethod = hclustCBI,
                            method = "ward.D2", 
                            k = clust_no)

## boot 1 
## boot 2 
## boot 3 
## boot 4 
## boot 5 
## boot 6 
## boot 7 
## boot 8 
## boot 9 
## boot 10 
## boot 11 
## boot 12 
## boot 13 
## boot 14 
## boot 15 
## boot 16 
## boot 17 
## boot 18 
## boot 19 
## boot 20 
## boot 21 
## boot 22 
## boot 23 
## boot 24 
## boot 25 
## boot 26 
## boot 27 
## boot 28 
## boot 29 
## boot 30 
## boot 31 
## boot 32 
## boot 33 
## boot 34 
## boot 35 
## boot 36 
## boot 37 
## boot 38 
## boot 39 
## boot 40 
## boot 41 
## boot 42 
## boot 43 
## boot 44 
## boot 45 
## boot 46 
## boot 47 
## boot 48 
## boot 49 
## boot 50 
## boot 51 
## boot 52 
## boot 53 
## boot 54 
## boot 55 
## boot 56 
## boot 57 
## boot 58 
## boot 59 
## boot 60 
## boot 61 
## boot 62 
## boot 63 
## boot 64 
## boot 65 
## boot 66 
## boot 67 
## boot 68 
## boot 69 
## boot 70 
## boot 71 
## boot 72 
## boot 73 
## boot 74 
## boot 75 
## boot 76 
## boot 77 
## boot 78 
## boot 79 
## boot 80 
## boot 81 
## boot 82 
## boot 83 
## boot 84 
## boot 85 
## boot 86 
## boot 87 
## boot 88 
## boot 89 
## boot 90 
## boot 91 
## boot 92 
## boot 93 
## boot 94 
## boot 95 
## boot 96 
## boot 97 
## boot 98 
## boot 99 
## boot 100

bootMean_data <- data.frame(cluster = 1:5, bootMeans = cboot_hclust$bootmean)

ggplot(data = bootMean_data, aes(x = cluster, y = bootMeans)) +
  geom_point(aes(colour = "darkred", size = 1)) +
  geom_hline(yintercept = c(0.6, 0.8)) +
  labs(y = "stability", title = "Stability evaluation") +
  theme(legend.position="none")

Note: A rule of thumb for interpreting stability values suggest that clusters scored below 0.6 should be considered unstable. Those between 0.6 and 0.75 are measuring patterns to a reasonable degree. Scores above 0.8 indicate highly stable clusters.

Unsupervised learning task: Clustering method

Apr 28, 2022