Library Loading
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
library(fpc)
library(FactoMineR)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
data <- read_excel("mysample.xlsx")
df <- data
df <- df %>% filter(!NET_VAL_S_100 <= 0 & !QUANT_B <= 0) # Successfyl Transaction thanh cong
Data loading
cust_data <- df[c(4,6,16)]
Data Preparation
Name original columns
cust_data <- rename(cust_data,
customer_id = SOLD_TO, purchase_amount = NET_VAL_S_100, date_of_purchase = BILL_DATE)
Convert purchase date to R date object
cust_data$date_of_purchase <- as.Date(cust_data$date_of_purchase, "%Y-%m-%d")
Create recency variable
cust_data$days_since <- as.numeric(difftime(time1 = "2020-04-15",
time2 = cust_data$date_of_purchase,
units = "days"))
Create analysis dataset including 'frequency' variable
customers <- cust_data %>%
group_by(customer_id) %>%
summarise(recency = min(days_since),
frequency = n(),
avg_amount = mean(purchase_amount))
Data Exploration
Descriptive statistics
options(digits = 2, scipen = 99)
stat.desc(select(customers, -customer_id))
## recency frequency avg_amount
## nbr.val 6467.00 6467.00 6467.0
## nbr.null 0.00 0.00 0.0
## nbr.na 0.00 0.00 0.0
## min 14.71 1.00 15000.0
## max 378.71 902.00 3584518111.0
## range 364.00 901.00 3584503111.0
## sum 1075483.79 10000.00 44735388707.9
## median 146.71 1.00 2687700.0
## mean 166.30 1.55 6917487.0
## SE.mean 1.26 0.14 615234.2
## CI.mean.0.95 2.47 0.28 1206062.6
## var 10280.24 131.66 2447844013985931.0
## std.dev 101.39 11.47 49475691.1
## coef.var 0.61 7.42 7.2
Distribution
# frequency
ggplot(customers, aes(x = frequency)) +
geom_histogram(bins = 20, fill = "steelblue") +
theme_classic() +
labs(x = "", title = "Purchase frequency") +
scale_x_continuous(breaks = c(0, 5, 10, 15, 25, 35, 45))

dim(filter(customers, frequency == 1))[1]/dim(customers)[1]
## [1] 0.91
# recency
ggplot(customers, aes(x = recency)) +
geom_histogram(fill = "steelblue") +
theme_classic() +
labs(x = "", y = "", title = "Recency in Days") +
scale_x_continuous(breaks = c(0, 250, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dim(filter(customers, recency < 365))[1]/dim(customers)[1]
## [1] 0.97
# amount
summary(customers$avg_amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15000 537000 2690000 6920000 5090000 3580000000
ggplot(customers, aes(x = avg_amount)) +
geom_histogram(bins = 20, fill = "steelblue") +
theme_classic() +
labs(x = "average amount", title = "Average purchase amount")

ggplot(customers, aes(x = avg_amount)) +
geom_histogram(bins = 20, fill = "steelblue") +
theme_classic() +
labs(x = "amount (log10)",
title = "Average purchase amount\n (Data are log transformed to ease visualization)") +
scale_x_log10()

Statistical Segmentation
Data preparation for modelling
# Remove customer_id column and set as row names
row.names(customers) <- customers$customer_id
## Warning: Setting row names on a tibble is deprecated.
customers <- select(customers, -customer_id)
clust_data <- scale(customers)
Clustering method
set.seed(11)
hc.complete =hclust (dist(clust_data), method = "complete")
hc.average =hclust (dist(clust_data), method = "average")
hc.ward =hclust (dist(clust_data), method = "ward.D2")
plot(hc.complete, main = "Complete Linkage", xlab = "", sub = "", cex = .9)

plot(hc.average, main = "Average Linkage", xlab = "", sub = "", cex = .9)

plot(hc.ward, main = "Cluster Tree", xlab = "", sub = "", cex = .9)
rect.hclust(hc.ward, k = 5, border = "darkred")

plot(hc.ward, main = "Cluster Tree cut at 5 clusters", xlab = "", sub = "", cex = .9)
abline(h = 45, col = "red")

clusters <- cutree(hc.ward, 5)
table(clusters)
## clusters
## 1 2 3 4 5
## 1 1796 2897 1772 1
#Assign customers to segments
customers$cluster <- clusters
Segment Centroids
profiles <- customers %>%
group_by(cluster) %>%
summarise_each(funs(mean))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
cluster_counts <- customers %>%
group_by(cluster) %>%
summarise(counts = n())
profiles <- mutate(profiles, counts = cluster_counts$counts)
profiles
## # A tibble: 5 x 5
## cluster recency frequency avg_amount counts
## <int> <dbl> <dbl> <dbl> <int>
## 1 1 14.7 902 13077229. 1
## 2 2 176. 1.10 8664311. 1796
## 3 3 74.9 1.82 5231619. 2897
## 4 4 306. 1.05 5880751. 1772
## 5 5 198. 1 3584518111 1
Cluster Visualizations
Pairwise
# frequency against avg_amount with segment as the grouping variable.
customers$cluster <- as.factor(customers$cluster)
ggplot(data = customers,
aes(x = frequency, y = avg_amount, colour = cluster, shape = cluster)) +
geom_point() +
theme_dark() +
labs(title = "average amount spent vs. frequency",
shape = "customer\nclusters", colour = "customer\nclusters")

# avg_amount against recency with segment as the grouping variable.
ggplot(data = customers,
aes(x = recency, y = avg_amount, colour = cluster, shape = cluster)) +
geom_point() +
theme_dark() +
labs(title = "average amount spent vs recency", shape = "cluster", colour = "cluster")

First 2 Principal Components
# Project clusters onto the first 2 principal components
prin_comp <- princomp(clust_data)
nComp <- 2 # This is a more faithful 2 dimensional visualization of the clusters
project <- predict(prin_comp, newdata=clust_data)[,1:nComp]
# Create dataframe with transformed data
project_data <- cbind(as.data.frame(project),
cluster = as.factor(clusters))
ggplot(project_data, aes(x=Comp.1, y=Comp.2)) +
geom_point(aes(shape=cluster, colour = cluster)) +
theme_dark() +
labs(x = "Principal Component 1", y = "Principal Component 2")

Cluster Evaluation
clust_no <- 5
set.seed(12)
cboot_hclust <- clusterboot(clust_data,
clustermethod = hclustCBI,
method = "ward.D2",
k = clust_no)
## boot 1
## boot 2
## boot 3
## boot 4
## boot 5
## boot 6
## boot 7
## boot 8
## boot 9
## boot 10
## boot 11
## boot 12
## boot 13
## boot 14
## boot 15
## boot 16
## boot 17
## boot 18
## boot 19
## boot 20
## boot 21
## boot 22
## boot 23
## boot 24
## boot 25
## boot 26
## boot 27
## boot 28
## boot 29
## boot 30
## boot 31
## boot 32
## boot 33
## boot 34
## boot 35
## boot 36
## boot 37
## boot 38
## boot 39
## boot 40
## boot 41
## boot 42
## boot 43
## boot 44
## boot 45
## boot 46
## boot 47
## boot 48
## boot 49
## boot 50
## boot 51
## boot 52
## boot 53
## boot 54
## boot 55
## boot 56
## boot 57
## boot 58
## boot 59
## boot 60
## boot 61
## boot 62
## boot 63
## boot 64
## boot 65
## boot 66
## boot 67
## boot 68
## boot 69
## boot 70
## boot 71
## boot 72
## boot 73
## boot 74
## boot 75
## boot 76
## boot 77
## boot 78
## boot 79
## boot 80
## boot 81
## boot 82
## boot 83
## boot 84
## boot 85
## boot 86
## boot 87
## boot 88
## boot 89
## boot 90
## boot 91
## boot 92
## boot 93
## boot 94
## boot 95
## boot 96
## boot 97
## boot 98
## boot 99
## boot 100
bootMean_data <- data.frame(cluster = 1:5, bootMeans = cboot_hclust$bootmean)
p <- ggplot(data = bootMean_data, aes(x = cluster, y = bootMeans)) +
geom_point(aes(colour = "darkred", size = 1)) +
geom_hline(yintercept = c(0.6, 0.8)) +
labs(y = "stability", title = "Cluster Stability Evaluation") +
theme(legend.position="none")
ggplotly(p)