Fig-1: Data Preparation
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
rm(list=ls(all=TRUE))
load("data/tf0.rdata")
Remove data after the demarcation date
feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01) # 618212
X = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 88387
summary(X)
tid date cust age area
Min. : 1 Min. :2000-11-01 Min. : 1069 D :17541 E :37496
1st Qu.:22098 1st Qu.:2000-11-23 1st Qu.: 923910 C :14624 F :25412
Median :44194 Median :2000-12-12 Median : 1607000 E :14578 G : 6787
Mean :44194 Mean :2000-12-15 Mean : 1395768 F :10354 C : 6329
3rd Qu.:66290 3rd Qu.:2001-01-12 3rd Qu.: 1888874 B : 7817 H : 5524
Max. :88387 Max. :2001-01-31 Max. :20002000 G : 6308 D : 3655
(Other):17165 (Other): 3184
items pieces total gross
Min. : 1.00 Min. : 1.00 Min. : 5 Min. :-1645
1st Qu.: 2.00 1st Qu.: 3.00 1st Qu.: 230 1st Qu.: 23
Median : 5.00 Median : 6.00 Median : 522 Median : 72
Mean : 6.99 Mean : 9.45 Mean : 889 Mean : 138
3rd Qu.: 9.00 3rd Qu.: 12.00 3rd Qu.: 1120 3rd Qu.: 174
Max. :112.00 Max. :339.00 Max. :30171 Max. : 8069
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
items pieces total gross
99.9% 56.000 84.00 9378.7 1883.2
99.95% 64.000 98.00 11261.8 2317.1
99.99% 85.646 137.65 17699.3 3389.6
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area), # area code
) %>% data.frame # 28584
feb = filter(X0, date>= feb01) %>% group_by(cust) %>%
summarise(amount = sum(total)) # 16899
A$amountSimply a Left Joint
A = merge(A, feb, by="cust", all.x=T)
A$buyA$buy = !is.na(A$amount)
summary(A)
cust r s f m rev
Min. : 1069 Min. : 1.0 Min. : 1.0 Min. : 1.00 Min. : 8 Min. : 8
1st Qu.: 1060898 1st Qu.:11.0 1st Qu.:47.0 1st Qu.: 1.00 1st Qu.: 359 1st Qu.: 638
Median : 1654100 Median :21.0 Median :68.0 Median : 2.00 Median : 710 Median : 1566
Mean : 1461070 Mean :32.1 Mean :61.3 Mean : 3.09 Mean : 1012 Mean : 2711
3rd Qu.: 1945003 3rd Qu.:53.0 3rd Qu.:83.0 3rd Qu.: 4.00 3rd Qu.: 1315 3rd Qu.: 3426
Max. :20002000 Max. :92.0 Max. :92.0 Max. :60.00 Max. :10634 Max. :99597
raw age area amount buy
Min. : -742 D :5832 E :9907 Min. : 8 Mode :logical
1st Qu.: 70 C :5238 F :7798 1st Qu.: 454 FALSE:15342
Median : 218 E :4514 C :3169 Median : 993 TRUE :13242
Mean : 421 F :3308 G :3052 Mean : 1498
3rd Qu.: 535 B :2802 D :1778 3rd Qu.: 1955
Max. :15565 G :1940 H :1295 Max. :28089
(Other):4950 (Other):1585 NA's :15342
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')
tapply(A$buy, A$area, mean) %>% barplot
abline(h = mean(A$buy), col='red')
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
[1] 28584 20008 8576
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
[1] 13242 9269 3973
cbind(A2, spl2) %>%
ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)
save(Z, X, A, spl, spl2, file="data/tf2.rdata")