1 Library Loading

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pastecs)
## 
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
library(fpc)
library(FactoMineR)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(readxl)
data <- read_excel("mysample.xlsx")
df <- data
df <- df %>% filter(!NET_VAL_S_100 <= 0 & !QUANT_B <= 0) # Successfyl Transaction thanh cong 

2 Data loading

cust_data <- df[c(4,6,16)]

3 Data Preparation

3.1 Name original columns

cust_data <- rename(cust_data, 
                     customer_id = SOLD_TO, purchase_amount = NET_VAL_S_100, date_of_purchase = BILL_DATE)

3.2 Convert purchase date to R date object

cust_data$date_of_purchase <- as.Date(cust_data$date_of_purchase, "%Y-%m-%d")

3.3 Create recency variable

cust_data$days_since <- as.numeric(difftime(time1 = "2020-04-15",
                                            time2 = cust_data$date_of_purchase,
                                            units = "days"))  

3.4 Create analysis dataset including 'frequency' variable

customers <- cust_data %>%
                group_by(customer_id) %>%
                summarise(recency = min(days_since), 
                          frequency = n(), 
                          avg_amount = mean(purchase_amount))

4 Data Exploration

4.1 Descriptive statistics

options(digits = 2, scipen = 99)
stat.desc(select(customers, -customer_id))
##                 recency frequency         avg_amount
## nbr.val         6467.00   6467.00             6467.0
## nbr.null           0.00      0.00                0.0
## nbr.na             0.00      0.00                0.0
## min               14.71      1.00            15000.0
## max              378.71    902.00       3584518111.0
## range            364.00    901.00       3584503111.0
## sum          1075483.79  10000.00      44735388707.9
## median           146.71      1.00          2687700.0
## mean             166.30      1.55          6917487.0
## SE.mean            1.26      0.14           615234.2
## CI.mean.0.95       2.47      0.28          1206062.6
## var            10280.24    131.66 2447844013985931.0
## std.dev          101.39     11.47         49475691.1
## coef.var           0.61      7.42                7.2

4.2 Distribution

# frequency
ggplot(customers, aes(x = frequency)) +
   geom_histogram(bins = 20, fill = "steelblue") +
   theme_classic() +
   labs(x = "", title = "Purchase frequency") +
   scale_x_continuous(breaks = c(0, 5, 10, 15, 25, 35, 45))

dim(filter(customers, frequency == 1))[1]/dim(customers)[1]
## [1] 0.91
# recency
ggplot(customers, aes(x = recency)) +
   geom_histogram(fill = "steelblue") +
   theme_classic() +
   labs(x = "", y = "", title = "Recency in Days") +
  scale_x_continuous(breaks = c(0, 250, 500, 1000, 1500, 2000, 2500, 3000, 3500, 4000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

dim(filter(customers, recency < 365))[1]/dim(customers)[1]
## [1] 0.97
# amount
summary(customers$avg_amount)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
##      15000     537000    2690000    6920000    5090000 3580000000
ggplot(customers, aes(x = avg_amount)) +
   geom_histogram(bins = 20, fill = "steelblue") +
   theme_classic() +
   labs(x = "average amount", title = "Average purchase amount")

ggplot(customers, aes(x = avg_amount)) +
   geom_histogram(bins = 20, fill = "steelblue") +
   theme_classic() +
   labs(x = "amount (log10)", 
        title = "Average purchase amount\n (Data are log transformed to ease visualization)") +
   scale_x_log10()

5 Statistical Segmentation

5.1 Data preparation for modelling

# Remove customer_id column and set as row names
row.names(customers) <- customers$customer_id
## Warning: Setting row names on a tibble is deprecated.
customers <- select(customers, -customer_id)

clust_data <- scale(customers)

5.2 Clustering method

set.seed(11)
hc.complete =hclust (dist(clust_data), method = "complete")
hc.average =hclust (dist(clust_data), method = "average")
hc.ward =hclust (dist(clust_data), method = "ward.D2")

plot(hc.complete, main = "Complete Linkage", xlab = "", sub = "", cex = .9)

plot(hc.average, main = "Average Linkage", xlab = "", sub = "", cex = .9)

plot(hc.ward, main = "Cluster Tree", xlab = "", sub = "", cex = .9)

rect.hclust(hc.ward, k = 5, border = "darkred")

plot(hc.ward, main = "Cluster Tree cut at 5 clusters", xlab = "", sub = "", cex = .9)
abline(h = 45, col = "red")

clusters <- cutree(hc.ward, 5)
table(clusters)
## clusters
##    1    2    3    4    5 
##    1 1796 2897 1772    1
#Assign customers to segments
customers$cluster <- clusters

5.3 Segment Centroids

profiles <- customers %>% 
  group_by(cluster) %>%
  summarise_each(funs(mean))
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
cluster_counts <- customers %>% 
  group_by(cluster) %>% 
  summarise(counts = n())

profiles <- mutate(profiles, counts = cluster_counts$counts) 
profiles
## # A tibble: 5 x 5
##   cluster recency frequency  avg_amount counts
##     <int>   <dbl>     <dbl>       <dbl>  <int>
## 1       1    14.7    902      13077229.      1
## 2       2   176.       1.10    8664311.   1796
## 3       3    74.9      1.82    5231619.   2897
## 4       4   306.       1.05    5880751.   1772
## 5       5   198.       1    3584518111       1

6 Cluster Visualizations

6.1 Pairwise

# frequency against avg_amount with segment as the grouping variable.
customers$cluster <- as.factor(customers$cluster)
ggplot(data = customers,
       aes(x = frequency, y = avg_amount, colour = cluster, shape = cluster)) +
  geom_point() +
  theme_dark() +
  labs(title = "average amount spent vs. frequency", 
       shape = "customer\nclusters", colour = "customer\nclusters")

# avg_amount against recency with segment as the grouping variable.
ggplot(data = customers,
       aes(x = recency, y = avg_amount, colour = cluster, shape = cluster)) +
  geom_point() +
  theme_dark() +
  labs(title = "average amount spent vs recency",  shape = "cluster", colour = "cluster")

6.2 First 2 Principal Components

# Project clusters onto the first 2 principal components
prin_comp <- princomp(clust_data)
nComp <- 2 # This is a more faithful 2 dimensional visualization of the clusters 
project <- predict(prin_comp, newdata=clust_data)[,1:nComp]

# Create dataframe with transformed data
project_data <- cbind(as.data.frame(project),
                      cluster = as.factor(clusters))
ggplot(project_data, aes(x=Comp.1, y=Comp.2)) +
   geom_point(aes(shape=cluster, colour = cluster)) +
   theme_dark() +
   labs(x = "Principal Component 1", y = "Principal Component 2")

7 Cluster Evaluation

clust_no <- 5

set.seed(12)
cboot_hclust <- clusterboot(clust_data, 
                            clustermethod = hclustCBI,
                            method = "ward.D2", 
                            k = clust_no)
## boot 1 
## boot 2 
## boot 3 
## boot 4 
## boot 5 
## boot 6 
## boot 7 
## boot 8 
## boot 9 
## boot 10 
## boot 11 
## boot 12 
## boot 13 
## boot 14 
## boot 15 
## boot 16 
## boot 17 
## boot 18 
## boot 19 
## boot 20 
## boot 21 
## boot 22 
## boot 23 
## boot 24 
## boot 25 
## boot 26 
## boot 27 
## boot 28 
## boot 29 
## boot 30 
## boot 31 
## boot 32 
## boot 33 
## boot 34 
## boot 35 
## boot 36 
## boot 37 
## boot 38 
## boot 39 
## boot 40 
## boot 41 
## boot 42 
## boot 43 
## boot 44 
## boot 45 
## boot 46 
## boot 47 
## boot 48 
## boot 49 
## boot 50 
## boot 51 
## boot 52 
## boot 53 
## boot 54 
## boot 55 
## boot 56 
## boot 57 
## boot 58 
## boot 59 
## boot 60 
## boot 61 
## boot 62 
## boot 63 
## boot 64 
## boot 65 
## boot 66 
## boot 67 
## boot 68 
## boot 69 
## boot 70 
## boot 71 
## boot 72 
## boot 73 
## boot 74 
## boot 75 
## boot 76 
## boot 77 
## boot 78 
## boot 79 
## boot 80 
## boot 81 
## boot 82 
## boot 83 
## boot 84 
## boot 85 
## boot 86 
## boot 87 
## boot 88 
## boot 89 
## boot 90 
## boot 91 
## boot 92 
## boot 93 
## boot 94 
## boot 95 
## boot 96 
## boot 97 
## boot 98 
## boot 99 
## boot 100
bootMean_data <- data.frame(cluster = 1:5, bootMeans = cboot_hclust$bootmean) 

p <- ggplot(data = bootMean_data, aes(x = cluster, y = bootMeans)) +
     geom_point(aes(colour = "darkred", size = 1)) +
     geom_hline(yintercept = c(0.6, 0.8)) +
     labs(y = "stability", title = "Cluster Stability Evaluation") +
     theme(legend.position="none")

ggplotly(p)