Basic Statistics
# Histogram
par(mfrow=c(2,2))
ggplot(dsRFM, aes(x=recency)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dsRFM, aes(x=frequency)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dsRFM, aes(x=monetary)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(dsRFM, aes(x=ticketSize)) +
geom_histogram(aes(y=..density..), colour="black", fill="white")+
geom_density(alpha=.2, fill="#FF6666")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Quantile
options(digits = 2, scipen = 99)
stat.desc(select(dsRFM, -SOLD))
## recency frequency monetary ticketSize
## nbr.val 10440.00 10440.000 10440.0 10440.0
## nbr.null 0.00 0.000 165.0 165.0
## nbr.na 0.00 0.000 0.0 0.0
## min 1.00 1.000 -3863636.0 -838950.0
## max 365.00 291.000 420231186.0 110364545.0
## range 364.00 290.000 424094822.0 111203495.0
## sum 1675235.00 18467.000 80790888960.0 53524931704.1
## median 153.00 1.000 4793091.0 3627273.0
## mean 160.46 1.769 7738590.9 5126909.2
## SE.mean 0.91 0.040 124933.9 55274.0
## CI.mean.0.95 1.78 0.079 244894.3 108347.7
## var 8619.92 16.997 162952531958794.6 31896473565954.6
## std.dev 92.84 4.123 12765286.2 5647696.3
## coef.var 0.58 2.331 1.6 1.1
## 0% 25% 50% 75% 100%
## 1 83 153 238 365
quants <- c(0,0.05,0.25,0.50,0.75,0.90,0.95,0.99,1)
apply( dsRFM[2:4] , 2 , quantile , probs = quants , na.rm = TRUE )
## recency frequency monetary
## 0% 1 1 -3863636
## 5% 25 1 1809091
## 25% 83 1 2945455
## 50% 153 1 4793091
## 75% 238 2 7627273
## 90% 281 3 14990909
## 95% 321 4 22993364
## 99% 350 9 54925814
## 100% 365 291 420231186
Customer Clustering
# K-mean segmentation
dsData <- dsRFM
# Data Normalization
row.names(dsData) <- dsRFM$SOLD
## Warning: Setting row names on a tibble is deprecated.
dsData <- scale(dsData[,2:4])
summary(dsData)
## recency frequency monetary
## Min. :-1.72 Min. : 0 Min. :-1
## 1st Qu.:-0.83 1st Qu.: 0 1st Qu.: 0
## Median :-0.08 Median : 0 Median : 0
## Mean : 0.00 Mean : 0 Mean : 0
## 3rd Qu.: 0.84 3rd Qu.: 0 3rd Qu.: 0
## Max. : 2.20 Max. :70 Max. :32
# Finding k
tot_withinss <- c()
for (i in 1:10) {
set.seed(123)
u <- kmeans(dsData, i, nstart = 25)
k <- u$tot.withinss
tot_withinss <- c(tot_withinss, k)
}
mydf <- data_frame(TWSS = tot_withinss, N = 1:10)
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
mydf %>%
ggplot(aes(N, TWSS)) +
geom_line() +
geom_point() +
geom_point(data = mydf %>% filter(N == 4), color = "red", size = 3) +
scale_x_continuous(breaks = seq(1, 10, by = 1)) +
labs(title = "Number of Clusters v.s TWSS")

# Cluster with k = 4:
set.seed(123)
km.res <- kmeans(dsData, 4, nstart = 25)
# Data Labelling
dsRFM %<>% mutate(cluster = km.res$cluster)
# Centroid
km.res$centers
## recency frequency monetary
## 1 -0.76 -0.016 -0.11
## 2 1.00 -0.103 -0.18
## 3 -0.14 0.650 2.96
## 4 -0.87 15.861 14.60
km.res$cluster %>% table()
## .
## 1 2 3 4
## 5657 4388 376 19
dsRFM$Name_Group <- ifelse(dsRFM$cluster == "1", "Loyal",
ifelse(dsRFM$cluster == "4", "High Potential",
ifelse(dsRFM$cluster == "2", "Potential","New")))
dsRFM$Churn <- ifelse(dsRFM$recency <= 90, "1-3 month",ifelse(dsRFM$recency > 90 & dsRFM$recency <= 180, "3-6 month", "9-12 month"))
# Mean
dsRFM$cluster <- as.character(dsRFM$cluster)
dsRFM %>% summarise_if(is.numeric, mean)
## # A tibble: 1 x 4
## recency frequency monetary ticketSize
## <dbl> <dbl> <dbl> <dbl>
## 1 160. 1.77 7738591. 5126909.
# Group (Contigency table)
h <- dsRFM %>% group_by(SOLD, Name_Group, Churn) %>% summarise(frequency = n())
L <- dsRFM %>% group_by(Name_Group, Churn) %>% summarise(sum = sum(as.numeric(frequency)))
# Data Viz
ggplot(L, aes(x = Name_Group, y = sum, fill = Churn, label = sum)) +
geom_bar(stat = "identity") +
geom_text(size = 3, position = position_stack(vjust = 0.5)) + xlab("Cluster") + ylab("Number of customer") + scale_fill_brewer(palette="Paired")+
theme_minimal()

percentData <- h %>% group_by(Name_Group) %>% count(Churn) %>%
mutate(ratio=scales::percent(n/sum(n)))
ggplot(h,aes(x=factor(Name_Group),fill=factor(Churn)))+
geom_bar(position="fill")+
geom_text(data=percentData, aes(y=n,label=ratio),
position=position_fill(vjust=0.5)) + scale_fill_brewer(palette="Paired")+
theme_minimal() + xlab("Cluster") + ylab("Number of customer") + scale_fill_brewer(palette="Paired")
## Scale for 'fill' is already present. Adding another scale for 'fill', which
## will replace the existing scale.

Purchase Behavior
Product <- merge(dsRFM, TradeMarketing, by = "SOLD")
Product <- Product[!(!is.na(Product$CAT) & Product$CAT==""), ]
WhatBuy <- Product %>% filter(frequency == 3)
WhatBuy_reshape_1 <- aggregate(CAT ~ SOLD, FUN = paste, collapse = "|", data = WhatBuy)
WhatBuy_reshape_2 <- WhatBuy_reshape_1 %>%
group_by(CAT) %>%
summarise(count = n())
WhatBuy_reshape_3 <- merge(x = WhatBuy_reshape_1, y = dsRFM[ , c("SOLD", "Churn", "Name_Group")], by = "SOLD", all.x=TRUE)
WhatBuy_reshape_4 <- WhatBuy_reshape_3 %>%
group_by(CAT) %>%
summarise(count = n())
WhatBuy_reshape_5 <- WhatBuy_reshape_3 %>%
group_by(Churn, CAT) %>%
summarise(count = n())
WhatBuy_reshape_6 <- WhatBuy_reshape_3 %>%
group_by(Name_Group, Churn, CAT) %>%
summarise(count = n())
a <- strsplit(WhatBuy_reshape_6$CAT, "\\|")
n.obs <- sapply(a, length)
seq.max <- seq_len(max(n.obs))
a_mat <- t(sapply(a, "[", i = seq.max))
class(a_mat)
## [1] "matrix"
colnames(a_mat) <- c("L1","L2","L3") #name the column
a3 <- a_mat %>% as_data_frame(a_mat) #Convert matrix to data frame
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
## Warning: The `.name_repair` argument to `as_tibble()` takes precedence over the
## deprecated `validate` argument.
a3$Name_Group <- WhatBuy_reshape_6$Name_Group
a5 <- merge(WhatBuy_reshape_6, a3, by="row.names")
a5$Row.names <- NULL
a5$Name_Group.x <- NULL
a5$Name_Group.y <- NULL
a5$CAT <- NULL
require(alluvial)
## Loading required package: alluvial
library(ggalluvial)
titanic_wide <- a5[1:20,] #Lay mau de ve cho de nhin
str(titanic_wide)
## 'data.frame': 20 obs. of 5 variables:
## $ Churn: chr "1-3 month" "1-3 month" "1-3 month" "3-6 month" ...
## $ count: int 1 1 1 1 1 1 4 1 3 1 ...
## $ L1 : chr "Andr\303\251 Mouche" "SKU_16" "Titoni" "Andr\303\251 Mouche" ...
## $ L2 : chr "Andr\303\251 Mouche" "SKU_16" "SKU_5" "Silvana" ...
## $ L3 : chr "Andr\303\251 Mouche" "SKU_16" NA NA ...
titanic_wide$L1 <- as.factor(titanic_wide$L1)
titanic_wide$L2 <- as.factor(titanic_wide$L2)
titanic_wide$L3 <- as.factor(titanic_wide$L3)
ggplot(data = titanic_wide,
aes(axis1 = L1, axis2 = L2, axis3 = L3, y = count)) + scale_x_discrete(limits = c("L1", "L2", "L3"), expand = c(.1, .05)) + geom_alluvium(aes(fill = Churn))+ geom_stratum() + geom_text(stat = "stratum", label.strata = TRUE) + theme_minimal()+ theme(legend.position = 'bottom')
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

# Revenue
WhatBuy <- Product %>% filter(frequency == 3)
WhatBuy$Revenue_group <- ifelse(WhatBuy$VALUE <= 500000,"Below 500k",
ifelse(WhatBuy$VALUE >500000 & WhatBuy$VALUE <= 2000000,"500k - 2mil",
ifelse(WhatBuy$VALUE > 2000000 & WhatBuy$NET_VAL_S_10<= 6000000, "2mil - 6mil", "Above 6mil")))
WhatBuy_revenue_1 <- aggregate(Revenue_group ~ SOLD, FUN = paste, collapse = "|", data = WhatBuy)
WhatBuy_revenue_2 <- WhatBuy_revenue_1 %>%
group_by(Revenue_group) %>%
summarise(count = n())
WhatBuy_revenue_3 <- merge(x = WhatBuy_revenue_1, y = dsRFM[ , c("SOLD", "Churn", "Name_Group")], by = "SOLD", all.x=TRUE)
WhatBuy_revenue_4 <- WhatBuy_revenue_3 %>%
group_by(Revenue_group) %>%
summarise(count = n())
WhatBuy_revenue_5 <- WhatBuy_revenue_3 %>%
group_by(Churn, Revenue_group) %>%
summarise(count = n())
WhatBuy_revenue_6 <- WhatBuy_revenue_3 %>%
group_by(Name_Group, Churn, Revenue_group) %>%
summarise(count = n())
aa <- strsplit(WhatBuy_revenue_6$Revenue_group, "\\|")
n.obs <- sapply(aa, length)
seq.max <- seq_len(max(n.obs))
a_mat_revenue <- t(sapply(aa, "[", i = seq.max))
class(a_mat_revenue)
## [1] "matrix"
colnames(a_mat_revenue) <- c("L1","L2","L3") #name the column
a33 <- a_mat_revenue %>% as_data_frame(a_mat_revenue) #Convert matrix to data frame
## Warning: The `.name_repair` argument to `as_tibble()` takes precedence over the
## deprecated `validate` argument.
a33$Name_Group <- WhatBuy_revenue_6$Name_Group
a55 <- merge(WhatBuy_revenue_6, a33, by="row.names")
a55$Row.names <- NULL
a55$Name_Group.x <- NULL
a55$Name_Group.y <- NULL
a55$CAT <- NULL
require(alluvial)
library(ggalluvial)
revenue_titanic_wide <- a55[1:30,] #Lay mau de ve cho de nhin
str(revenue_titanic_wide)
## 'data.frame': 30 obs. of 6 variables:
## $ Churn : chr "1-3 month" "3-6 month" "3-6 month" "3-6 month" ...
## $ Revenue_group: chr "500k - 2mil" "500k - 2mil|Below 500k" "500k - 2mil|Below 500k|500k - 2mil" "Below 500k" ...
## $ count : int 7 2 1 150 2 4 1 4 1 3 ...
## $ L1 : chr "500k - 2mil" "500k - 2mil" "500k - 2mil" "Below 500k" ...
## $ L2 : chr NA "Below 500k" "Below 500k" NA ...
## $ L3 : chr NA NA "500k - 2mil" NA ...
revenue_titanic_wide$L1 <- as.factor(revenue_titanic_wide$L1)
revenue_titanic_wide$L2 <- as.factor(revenue_titanic_wide$L2)
revenue_titanic_wide$L3 <- as.factor(revenue_titanic_wide$L3)
ggplot(data = revenue_titanic_wide,
aes(axis1 = L1, axis2 = L2, axis3 = L3, y = count)) + scale_x_discrete(limits = c("L1", "L2", "L3"), expand = c(.1, .05)) + geom_alluvium(aes(fill = Churn))+ geom_stratum() + geom_text(stat = "stratum", label.strata = TRUE) + theme_minimal()+ theme(legend.position = 'bottom')
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
