1 Library Loading

library(tidyverse)

## -- Attaching packages -------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.4
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readxl)
library(knitr)
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## The following object is masked from 'package:purrr':
## 
##     transpose

library(dplyr)
library(ggplot2)
library(tidyr)
library(writexl)
library(pastecs)

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:data.table':
## 
##     first, last

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## The following object is masked from 'package:tidyr':
## 
##     extract

library(fpc)
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:pastecs':
## 
##     extract

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

2 Data Cleaning

df$DATE <- as.Date(df$DATE, format= "%Y-%m-%d")

df$COMPANY <- as.character(df$COMPANY)

TradeMarketing <- df %>% 
  filter(str_detect(COMPANY, "^4"))

df$DATE <- as.Date(df$DATE, format= "%Y-%m-%d")
table(df$ACCNT_GRP)  

df$COMPANY <- as.character(df$COMPANY)
TradeMarketing <- df %>% 
  filter(str_detect(COMPANY, "^4"))

3 Analysis

3.1 RFM & Ticket size

dsRFM <- TradeMarketing %>% group_by(SOLD) %>% summarise(recency = as.numeric(as.Date("2020-04-01") - max(as.Date(DATE))), frequency = n(), monetary = sum(as.numeric(VALUE)))

dsRFM <- dsRFM %>% mutate(ticketSize = monetary/frequency)

# Remove outlier
dsRFM <- dsRFM %>% filter(!SOLD %in% c("0101680800", "0101563814", "0101572263", "0101214910"))

3.2 Basic Statistics

# Histogram 
par(mfrow=c(2,2))

ggplot(dsRFM, aes(x=recency)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#FF6666")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dsRFM, aes(x=frequency)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#FF6666")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dsRFM, aes(x=monetary)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#FF6666")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dsRFM, aes(x=ticketSize)) + 
 geom_histogram(aes(y=..density..), colour="black", fill="white")+
 geom_density(alpha=.2, fill="#FF6666")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Quantile
options(digits = 2, scipen = 99)
stat.desc(select(dsRFM, -SOLD))

##                 recency frequency          monetary       ticketSize
## nbr.val        10440.00 10440.000           10440.0          10440.0
## nbr.null           0.00     0.000             165.0            165.0
## nbr.na             0.00     0.000               0.0              0.0
## min                1.00     1.000        -3863636.0        -838950.0
## max              365.00   291.000       420231186.0      110364545.0
## range            364.00   290.000       424094822.0      111203495.0
## sum          1675235.00 18467.000     80790888960.0    53524931704.1
## median           153.00     1.000         4793091.0        3627273.0
## mean             160.46     1.769         7738590.9        5126909.2
## SE.mean            0.91     0.040          124933.9          55274.0
## CI.mean.0.95       1.78     0.079          244894.3         108347.7
## var             8619.92    16.997 162952531958794.6 31896473565954.6
## std.dev           92.84     4.123        12765286.2        5647696.3
## coef.var           0.58     2.331               1.6              1.1

quantile(dsRFM$recency)

##   0%  25%  50%  75% 100% 
##    1   83  153  238  365

quants <- c(0,0.05,0.25,0.50,0.75,0.90,0.95,0.99,1)
apply( dsRFM[2:4] , 2 , quantile , probs = quants , na.rm = TRUE )

##      recency frequency  monetary
## 0%         1         1  -3863636
## 5%        25         1   1809091
## 25%       83         1   2945455
## 50%      153         1   4793091
## 75%      238         2   7627273
## 90%      281         3  14990909
## 95%      321         4  22993364
## 99%      350         9  54925814
## 100%     365       291 420231186

3.3 Customer Clustering

# K-mean segmentation 
dsData <- dsRFM

# Data Normalization 
row.names(dsData) <- dsRFM$SOLD

## Warning: Setting row names on a tibble is deprecated.

dsData <- scale(dsData[,2:4])
summary(dsData)

##     recency        frequency     monetary 
##  Min.   :-1.72   Min.   : 0   Min.   :-1  
##  1st Qu.:-0.83   1st Qu.: 0   1st Qu.: 0  
##  Median :-0.08   Median : 0   Median : 0  
##  Mean   : 0.00   Mean   : 0   Mean   : 0  
##  3rd Qu.: 0.84   3rd Qu.: 0   3rd Qu.: 0  
##  Max.   : 2.20   Max.   :70   Max.   :32

# Finding k 
tot_withinss <- c()
for (i in 1:10) {
  set.seed(123)
  u <- kmeans(dsData, i, nstart = 25)
  k <- u$tot.withinss
  tot_withinss <- c(tot_withinss, k)
}

mydf <- data_frame(TWSS = tot_withinss, N = 1:10)

## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

mydf %>%
  ggplot(aes(N, TWSS)) +
  geom_line() +
  geom_point() +
  geom_point(data = mydf %>% filter(N == 4), color = "red", size = 3) +
  scale_x_continuous(breaks = seq(1, 10, by = 1)) +
  labs(title = "Number of Clusters v.s TWSS")

# Cluster with k = 4: 
set.seed(123)
km.res <- kmeans(dsData, 4, nstart = 25)

# Data Labelling 
dsRFM %<>% mutate(cluster = km.res$cluster)

# Centroid
km.res$centers

##   recency frequency monetary
## 1   -0.76    -0.016    -0.11
## 2    1.00    -0.103    -0.18
## 3   -0.14     0.650     2.96
## 4   -0.87    15.861    14.60

km.res$cluster %>% table()

## .
##    1    2    3    4 
## 5657 4388  376   19

dsRFM$Name_Group <- ifelse(dsRFM$cluster == "1", "Loyal",
                     ifelse(dsRFM$cluster == "4", "High Potential",
                            ifelse(dsRFM$cluster == "2", "Potential","New")))

dsRFM$Churn <- ifelse(dsRFM$recency <= 90, "1-3 month",ifelse(dsRFM$recency > 90 & dsRFM$recency <= 180, "3-6 month", "9-12 month"))

# Mean
dsRFM$cluster <- as.character(dsRFM$cluster)
dsRFM %>% summarise_if(is.numeric, mean)

## # A tibble: 1 x 4
##   recency frequency monetary ticketSize
##     <dbl>     <dbl>    <dbl>      <dbl>
## 1    160.      1.77 7738591.   5126909.

# Group (Contigency table)
h <- dsRFM %>% group_by(SOLD, Name_Group, Churn) %>% summarise(frequency = n())
L <- dsRFM %>% group_by(Name_Group, Churn) %>% summarise(sum = sum(as.numeric(frequency)))

# Data Viz 
ggplot(L, aes(x = Name_Group, y = sum, fill = Churn, label = sum)) +
  geom_bar(stat = "identity") +
  geom_text(size = 3, position = position_stack(vjust = 0.5)) + xlab("Cluster") + ylab("Number of customer") + scale_fill_brewer(palette="Paired")+
  theme_minimal()

percentData <- h  %>% group_by(Name_Group) %>% count(Churn) %>%
    mutate(ratio=scales::percent(n/sum(n)))
ggplot(h,aes(x=factor(Name_Group),fill=factor(Churn)))+
    geom_bar(position="fill")+
    geom_text(data=percentData, aes(y=n,label=ratio), 
    position=position_fill(vjust=0.5)) + scale_fill_brewer(palette="Paired")+
  theme_minimal() + xlab("Cluster") + ylab("Number of customer") + scale_fill_brewer(palette="Paired")

## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

3.4 Purchase Behavior

Product <- merge(dsRFM, TradeMarketing, by = "SOLD")
Product <- Product[!(!is.na(Product$CAT) & Product$CAT==""), ] 

WhatBuy <- Product %>% filter(frequency == 3)

WhatBuy_reshape_1 <- aggregate(CAT ~ SOLD, FUN = paste, collapse = "|", data = WhatBuy)

WhatBuy_reshape_2 <- WhatBuy_reshape_1 %>%
  group_by(CAT) %>%
  summarise(count = n())

WhatBuy_reshape_3 <- merge(x = WhatBuy_reshape_1, y = dsRFM[ , c("SOLD", "Churn", "Name_Group")], by = "SOLD", all.x=TRUE)

WhatBuy_reshape_4 <- WhatBuy_reshape_3 %>%
  group_by(CAT) %>%
  summarise(count = n())

WhatBuy_reshape_5 <- WhatBuy_reshape_3 %>%
  group_by(Churn, CAT) %>%
  summarise(count = n())

WhatBuy_reshape_6 <- WhatBuy_reshape_3 %>%
  group_by(Name_Group, Churn, CAT) %>%
  summarise(count = n())

a <- strsplit(WhatBuy_reshape_6$CAT, "\\|") 
n.obs <- sapply(a, length)
seq.max <- seq_len(max(n.obs))
a_mat <- t(sapply(a, "[", i = seq.max))
class(a_mat)

## [1] "matrix"

colnames(a_mat) <- c("L1","L2","L3") #name the column 
a3 <- a_mat %>% as_data_frame(a_mat) #Convert matrix to data frame

## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.

## Warning: The `.name_repair` argument to `as_tibble()` takes precedence over the
## deprecated `validate` argument.

a3$Name_Group <- WhatBuy_reshape_6$Name_Group
a5 <- merge(WhatBuy_reshape_6, a3, by="row.names")
a5$Row.names <- NULL
a5$Name_Group.x <- NULL
a5$Name_Group.y <- NULL
a5$CAT <- NULL

require(alluvial)

## Loading required package: alluvial

library(ggalluvial)
titanic_wide <- a5[1:20,] #Lay mau de ve cho de nhin 
str(titanic_wide)

## 'data.frame':    20 obs. of  5 variables:
##  $ Churn: chr  "1-3 month" "1-3 month" "1-3 month" "3-6 month" ...
##  $ count: int  1 1 1 1 1 1 4 1 3 1 ...
##  $ L1   : chr  "Andr\303\251 Mouche" "SKU_16" "Titoni" "Andr\303\251 Mouche" ...
##  $ L2   : chr  "Andr\303\251 Mouche" "SKU_16" "SKU_5" "Silvana" ...
##  $ L3   : chr  "Andr\303\251 Mouche" "SKU_16" NA NA ...

titanic_wide$L1 <- as.factor(titanic_wide$L1)
titanic_wide$L2 <- as.factor(titanic_wide$L2)
titanic_wide$L3 <- as.factor(titanic_wide$L3)


ggplot(data = titanic_wide,
       aes(axis1 = L1, axis2 = L2, axis3 = L3, y = count)) + scale_x_discrete(limits = c("L1", "L2", "L3"), expand = c(.1, .05)) + geom_alluvium(aes(fill = Churn))+ geom_stratum() + geom_text(stat = "stratum", label.strata = TRUE) + theme_minimal()+ theme(legend.position = 'bottom')

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

# Revenue 
WhatBuy <- Product %>% filter(frequency == 3)

WhatBuy$Revenue_group <- ifelse(WhatBuy$VALUE <= 500000,"Below 500k", 
                                ifelse(WhatBuy$VALUE >500000 & WhatBuy$VALUE <= 2000000,"500k - 2mil",
                                       ifelse(WhatBuy$VALUE > 2000000 & WhatBuy$NET_VAL_S_10<= 6000000, "2mil - 6mil", "Above 6mil")))


WhatBuy_revenue_1 <- aggregate(Revenue_group ~ SOLD, FUN = paste, collapse = "|", data = WhatBuy)

WhatBuy_revenue_2 <- WhatBuy_revenue_1 %>%
  group_by(Revenue_group) %>%
  summarise(count = n())


WhatBuy_revenue_3 <- merge(x = WhatBuy_revenue_1, y = dsRFM[ , c("SOLD", "Churn", "Name_Group")], by = "SOLD", all.x=TRUE)

WhatBuy_revenue_4 <- WhatBuy_revenue_3 %>%
  group_by(Revenue_group) %>%
  summarise(count = n())

WhatBuy_revenue_5 <- WhatBuy_revenue_3 %>%
  group_by(Churn, Revenue_group) %>%
  summarise(count = n())

WhatBuy_revenue_6 <- WhatBuy_revenue_3 %>%
  group_by(Name_Group, Churn, Revenue_group) %>%
  summarise(count = n())

aa <- strsplit(WhatBuy_revenue_6$Revenue_group, "\\|")  
n.obs <- sapply(aa, length)
seq.max <- seq_len(max(n.obs))
a_mat_revenue <- t(sapply(aa, "[", i = seq.max))
class(a_mat_revenue)

## [1] "matrix"

colnames(a_mat_revenue) <- c("L1","L2","L3") #name the column 
a33 <- a_mat_revenue %>% as_data_frame(a_mat_revenue) #Convert matrix to data frame

## Warning: The `.name_repair` argument to `as_tibble()` takes precedence over the
## deprecated `validate` argument.

a33$Name_Group <- WhatBuy_revenue_6$Name_Group
a55 <- merge(WhatBuy_revenue_6, a33, by="row.names")
a55$Row.names <- NULL
a55$Name_Group.x <- NULL
a55$Name_Group.y <- NULL
a55$CAT <- NULL

require(alluvial)
library(ggalluvial)
revenue_titanic_wide <- a55[1:30,] #Lay mau de ve cho de nhin 
str(revenue_titanic_wide)

## 'data.frame':    30 obs. of  6 variables:
##  $ Churn        : chr  "1-3 month" "3-6 month" "3-6 month" "3-6 month" ...
##  $ Revenue_group: chr  "500k - 2mil" "500k - 2mil|Below 500k" "500k - 2mil|Below 500k|500k - 2mil" "Below 500k" ...
##  $ count        : int  7 2 1 150 2 4 1 4 1 3 ...
##  $ L1           : chr  "500k - 2mil" "500k - 2mil" "500k - 2mil" "Below 500k" ...
##  $ L2           : chr  NA "Below 500k" "Below 500k" NA ...
##  $ L3           : chr  NA NA "500k - 2mil" NA ...

revenue_titanic_wide$L1 <- as.factor(revenue_titanic_wide$L1)
revenue_titanic_wide$L2 <- as.factor(revenue_titanic_wide$L2)
revenue_titanic_wide$L3 <- as.factor(revenue_titanic_wide$L3)


ggplot(data = revenue_titanic_wide,
       aes(axis1 = L1, axis2 = L2, axis3 = L3, y = count)) + scale_x_discrete(limits = c("L1", "L2", "L3"), expand = c(.1, .05)) + geom_alluvium(aes(fill = Churn))+ geom_stratum() + geom_text(stat = "stratum", label.strata = TRUE) + theme_minimal()+ theme(legend.position = 'bottom')

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Customer Purchase Behavior Analysis

Phuong Linh

Dec 7, 2019

1 Library Loading

2 Data Cleaning

3 Analysis

3.1 RFM & Ticket size

3.2 Basic Statistics

3.3 Customer Clustering

3.4 Purchase Behavior