1 Objective

  1. Determine the point when each customer will begin to act “anomalously” (Churn)
  2. Application of Empirical Cumulative Distribution Function (ECDF) to define the point

2 Load library

library(tidyverse)
library(lubridate)
library(XLConnect)
library(dplyr)
library(ggplot2)
library(readxl)
library(writexl)

theme_set(theme_minimal())

3 Data input

raw.data <- read_excel("/Users/admin/Desktop/Linh Data Studio/Business Intelligent Anlytics/Online Retail.xlsx")

data <- raw.data

4 Data wrangling

Create a “Total” column to show how much each customer spent on each purchase:

data$Total <- data$Quantity * data$UnitPrice

txns <- data %>% 
  mutate(CustomerID = as.factor(CustomerID),
         InvoiceDate = InvoiceDate) %>% 
  group_by(CustomerID, InvoiceNo, InvoiceDate) %>% 
  summarise(Spend = sum(Total)) %>% 
  ungroup() %>% 
  filter(Spend > 0)
## `summarise()` has grouped output by 'CustomerID', 'InvoiceNo'. You can override
## using the `.groups` argument.

Calculate time between purchases for each customer:

time_between <- txns %>% 
  arrange(CustomerID, InvoiceDate) %>% 
  group_by(CustomerID) %>% 
  mutate(dt = as.numeric(InvoiceDate - lag(InvoiceDate), unit = 'days')) %>% 
  ungroup() %>% 
  na.omit()

Select customer who have made at least 20 purchases:

Ntrans = txns %>% 
  group_by(CustomerID) %>% 
  summarise(N = n()) %>% 
  filter(N > 20)

5 Data Viz

5.1 Calculate ECDF

ecdf_df <- time_between %>% 
  group_by(CustomerID) %>% 
  arrange(dt) %>% 
  mutate(e_cdf = 1:length(dt)/length(dt))
write_xlsx(time_between,"/Users/admin/Desktop/Linh Data Studio/Business Intelligent Anlytics/ecdf.xlsx")
# check ecdf in excel 

5.2 Select randomly 20 sample

## basic function 
sample_n_groups = function(tbl, size, replace = FALSE, weight = NULL) {
  grps = tbl %>% groups %>% lapply(as.character) %>% unlist
  keep = tbl %>% summarise() %>% ungroup() %>% sample_n(size, replace, weight)
  tbl %>% right_join(keep, by=grps) %>% group_by_(.dots = grps)
}

## randomly select 
sample_users <- ecdf_df %>% inner_join(Ntrans) %>% sample_n_groups(20)
## Joining, by = "CustomerID"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

5.3 Plot

5.3.1 Time between

ggplot(data = time_between %>% inner_join(Ntrans) %>% filter(CustomerID %in% sample_users$CustomerID), aes(dt)) + 
  geom_histogram(aes(y = ..count../sum(..count..)), bins = 15) + 
  facet_wrap(~CustomerID) +
  labs(x = 'Time Since Last Purchase (Days)',y = 'Frequency')
## Joining, by = "CustomerID"

5.3.2 ECDF

ggplot(data = ecdf_df %>% inner_join(Ntrans) %>% filter(CustomerID %in% sample_users$CustomerID), aes(dt,e_cdf) ) + 
  geom_point(size =0.5) +
  geom_line() + 
  geom_hline(yintercept = 0.9, color = 'red') + 
  facet_wrap(~CustomerID) +
  labs(x = 'Time Since Last Purchase (Days)')
## Joining, by = "CustomerID"

Note: The red line represents approximate 90 percentile. So if the ECDF crosses the red line at 20 days, this means 9 times out of 10 that customer will make another purchase within 20 days

5.4 Calculate 90th percentile

getq <- function(x,a = 0.9){
  if(a>1|a<0){
    print('Check your quantile')
  }
  X <- sort(x)
  e_cdf <- 1:length(X) / length(X)
  aprx = approx(e_cdf, X, xout = c(0.9))
  return(aprx$y)
}

percentiles <-  time_between %>% 
  inner_join(Ntrans) %>% 
  filter(N>5) %>% 
  group_by(CustomerID) %>% 
  summarise(percentile.90= getq(dt)) %>% 
  arrange(percentile.90)
## Joining, by = "CustomerID"

5.5 Case study

5.5.1 90th percentile

percentiles[which(percentiles$CustomerID == 12748), ]
## # A tibble: 1 × 2
##   CustomerID percentile.90
##   <fct>              <dbl>
## 1 12748               4.74

Note: The model tells us: 9 times out of 10, CustomerID 12748 will make another purchase within 4.74 days, If CustomerID 12748 does not make another purchase within 4.74 days, we know that there is only a 1 in 10 chance of this happening, and that this behaviour is anomalous. At this point, we know that CustomerID 12748 begins to act “anomalously”.

5.5.2 Purchase history

txns[which(txns$CustomerID == 12748), ]
## # A tibble: 210 × 4
##    CustomerID InvoiceNo InvoiceDate          Spend
##    <fct>      <chr>     <dttm>               <dbl>
##  1 12748      536521    2010-12-01 12:48:00   4.95
##  2 12748      536641    2010-12-02 11:46:00   4.25
##  3 12748      537136    2010-12-05 12:42:00 274.  
##  4 12748      537140    2010-12-05 12:53:00  70.4 
##  5 12748      537142    2010-12-05 12:57:00  41.4 
##  6 12748      537155    2010-12-05 13:05:00  75.6 
##  7 12748      537213    2010-12-05 15:26:00   8.65
##  8 12748      537225    2010-12-05 16:41:00 496.  
##  9 12748      537354    2010-12-06 12:23:00 133.  
## 10 12748      537429    2010-12-06 15:54:00  82.1 
## # … with 200 more rows

Note: Most of CustomerID 12748’s purchases happened in 1 to 4 days. It makes sense that we should be concerned if he (or she) does not make another purchase in 4.74 days.