library(tidyverse)
library(lubridate)
library(XLConnect)
library(dplyr)
library(ggplot2)
library(readxl)
library(writexl)
theme_set(theme_minimal())
<- read_excel("/Users/admin/Desktop/Linh Data Studio/Business Intelligent Anlytics/Online Retail.xlsx")
raw.data
<- raw.data data
Create a “Total” column to show how much each customer spent on each purchase:
$Total <- data$Quantity * data$UnitPrice
data
<- data %>%
txns mutate(CustomerID = as.factor(CustomerID),
InvoiceDate = InvoiceDate) %>%
group_by(CustomerID, InvoiceNo, InvoiceDate) %>%
summarise(Spend = sum(Total)) %>%
ungroup() %>%
filter(Spend > 0)
## `summarise()` has grouped output by 'CustomerID', 'InvoiceNo'. You can override
## using the `.groups` argument.
Calculate time between purchases for each customer:
<- txns %>%
time_between arrange(CustomerID, InvoiceDate) %>%
group_by(CustomerID) %>%
mutate(dt = as.numeric(InvoiceDate - lag(InvoiceDate), unit = 'days')) %>%
ungroup() %>%
na.omit()
Select customer who have made at least 20 purchases:
= txns %>%
Ntrans group_by(CustomerID) %>%
summarise(N = n()) %>%
filter(N > 20)
<- time_between %>%
ecdf_df group_by(CustomerID) %>%
arrange(dt) %>%
mutate(e_cdf = 1:length(dt)/length(dt))
write_xlsx(time_between,"/Users/admin/Desktop/Linh Data Studio/Business Intelligent Anlytics/ecdf.xlsx")
# check ecdf in excel
## basic function
= function(tbl, size, replace = FALSE, weight = NULL) {
sample_n_groups = tbl %>% groups %>% lapply(as.character) %>% unlist
grps = tbl %>% summarise() %>% ungroup() %>% sample_n(size, replace, weight)
keep %>% right_join(keep, by=grps) %>% group_by_(.dots = grps)
tbl
}
## randomly select
<- ecdf_df %>% inner_join(Ntrans) %>% sample_n_groups(20) sample_users
## Joining, by = "CustomerID"
## Warning: `group_by_()` was deprecated in dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
ggplot(data = time_between %>% inner_join(Ntrans) %>% filter(CustomerID %in% sample_users$CustomerID), aes(dt)) +
geom_histogram(aes(y = ..count../sum(..count..)), bins = 15) +
facet_wrap(~CustomerID) +
labs(x = 'Time Since Last Purchase (Days)',y = 'Frequency')
## Joining, by = "CustomerID"
ggplot(data = ecdf_df %>% inner_join(Ntrans) %>% filter(CustomerID %in% sample_users$CustomerID), aes(dt,e_cdf) ) +
geom_point(size =0.5) +
geom_line() +
geom_hline(yintercept = 0.9, color = 'red') +
facet_wrap(~CustomerID) +
labs(x = 'Time Since Last Purchase (Days)')
## Joining, by = "CustomerID"
Note: The red line represents approximate 90 percentile. So if the ECDF crosses the red line at 20 days, this means 9 times out of 10 that customer will make another purchase within 20 days
<- function(x,a = 0.9){
getq if(a>1|a<0){
print('Check your quantile')
}<- sort(x)
X <- 1:length(X) / length(X)
e_cdf = approx(e_cdf, X, xout = c(0.9))
aprx return(aprx$y)
}
<- time_between %>%
percentiles inner_join(Ntrans) %>%
filter(N>5) %>%
group_by(CustomerID) %>%
summarise(percentile.90= getq(dt)) %>%
arrange(percentile.90)
## Joining, by = "CustomerID"
which(percentiles$CustomerID == 12748), ] percentiles[
## # A tibble: 1 × 2
## CustomerID percentile.90
## <fct> <dbl>
## 1 12748 4.74
Note: The model tells us: 9 times out of 10, CustomerID 12748 will make another purchase within 4.74 days, If CustomerID 12748 does not make another purchase within 4.74 days, we know that there is only a 1 in 10 chance of this happening, and that this behaviour is anomalous. At this point, we know that CustomerID 12748 begins to act “anomalously”.
which(txns$CustomerID == 12748), ] txns[
## # A tibble: 210 × 4
## CustomerID InvoiceNo InvoiceDate Spend
## <fct> <chr> <dttm> <dbl>
## 1 12748 536521 2010-12-01 12:48:00 4.95
## 2 12748 536641 2010-12-02 11:46:00 4.25
## 3 12748 537136 2010-12-05 12:42:00 274.
## 4 12748 537140 2010-12-05 12:53:00 70.4
## 5 12748 537142 2010-12-05 12:57:00 41.4
## 6 12748 537155 2010-12-05 13:05:00 75.6
## 7 12748 537213 2010-12-05 15:26:00 8.65
## 8 12748 537225 2010-12-05 16:41:00 496.
## 9 12748 537354 2010-12-06 12:23:00 133.
## 10 12748 537429 2010-12-06 15:54:00 82.1
## # … with 200 more rows
Note: Most of CustomerID 12748’s purchases happened in 1 to 4 days. It makes sense that we should be concerned if he (or she) does not make another purchase in 4.74 days.