Fig-1: Pata Preparation
Zrm(list=ls(all=T))
Sys.setlocale("LC_ALL","C")
library(dplyr)
library(ggplot2)
library(caTools)
do.call-rbind-lapply ComboZ = do.call(rbind, lapply(
dir('data/TaFengDataSet','.*csv$',full.names=T),
read.csv, header=F)
) %>%
setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))
nrow(Z)
[1] 817741
Z$date = as.Date(as.character(Z$date))
summary(Z)
date cust age area cat
Min. :2000-11-01 Min. : 1069 D :181213 E :312501 Min. :100101
1st Qu.:2000-11-28 1st Qu.: 969222 E :151023 F :245213 1st Qu.:110106
Median :2001-01-01 Median : 1587722 C :140805 G : 72092 Median :130106
Mean :2000-12-30 Mean : 1406620 F : 99719 C : 71640 Mean :284950
3rd Qu.:2001-01-30 3rd Qu.: 1854930 B : 66432 H : 40666 3rd Qu.:520314
Max. :2001-02-28 Max. :20002000 G : 53719 D : 38674 Max. :780510
(Other):124830 (Other): 36955
prod qty cost price
Min. : 20008819 Min. : 1.00 Min. : 0 Min. : 1
1st Qu.:4710085127020 1st Qu.: 1.00 1st Qu.: 35 1st Qu.: 42
Median :4710421090060 Median : 1.00 Median : 62 Median : 76
Mean :4461639280530 Mean : 1.38 Mean : 112 Mean : 132
3rd Qu.:4712500125130 3rd Qu.: 1.00 3rd Qu.: 112 3rd Qu.: 132
Max. :9789579967620 Max. :1200.00 Max. :432000 Max. :444000
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
qty cost price
99% 6 858.0 1014.0
99.9% 14 2722.0 3135.8
99.95% 24 3799.3 3999.0
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)
nrow(Z)
[1] 817182
Z$tid = group_indices(Z, date, cust)
sapply(Z[,c("cust","cat","prod","tid")], n_distinct)
cust cat prod tid
32256 2007 23789 119422
summary(Z)
date cust age area cat
Min. :2000-11-01 Min. : 1069 D :181089 E :312358 Min. :100101
1st Qu.:2000-11-28 1st Qu.: 968775 E :150947 F :245079 1st Qu.:110106
Median :2001-01-01 Median : 1587685 C :140721 G : 71905 Median :130106
Mean :2000-12-30 Mean : 1406500 F : 99641 C : 71600 Mean :284784
3rd Qu.:2001-01-30 3rd Qu.: 1854701 B : 66353 H : 40647 3rd Qu.:520311
Max. :2001-02-28 Max. :20002000 G : 53689 D : 38654 Max. :780510
(Other):124742 (Other): 36939
prod qty cost price tid
Min. : 20008819 Min. : 1.00 Min. : 0 Min. : 1 Min. : 1
1st Qu.:4710085127020 1st Qu.: 1.00 1st Qu.: 35 1st Qu.: 42 1st Qu.: 28783
Median :4710421090060 Median : 1.00 Median : 62 Median : 76 Median : 59391
Mean :4461978778400 Mean : 1.36 Mean : 106 Mean : 126 Mean : 58845
3rd Qu.:4712500125130 3rd Qu.: 1.00 3rd Qu.: 112 3rd Qu.: 132 3rd Qu.: 87391
Max. :9789579967620 Max. :24.00 Max. :3798 Max. :4000 Max. :119422
XX = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 119422
summary(X)
tid date cust age area
Min. : 1 Min. :2000-11-01 Min. : 1069 D :23775 E :50532
1st Qu.: 29856 1st Qu.:2000-11-29 1st Qu.: 927093 C :19661 F :33826
Median : 59712 Median :2001-01-01 Median : 1615661 E :19596 G : 9498
Mean : 59712 Mean :2000-12-31 Mean : 1402548 F :13992 C : 8527
3rd Qu.: 89567 3rd Qu.:2001-02-02 3rd Qu.: 1894493 B :10515 H : 7502
Max. :119422 Max. :2001-02-28 Max. :20002000 G : 8493 D : 5108
(Other):23390 (Other): 4429
items pieces total gross
Min. : 1.00 Min. : 1.00 Min. : 5 Min. :-1645
1st Qu.: 2.00 1st Qu.: 3.00 1st Qu.: 227 1st Qu.: 21
Median : 5.00 Median : 6.00 Median : 510 Median : 68
Mean : 6.84 Mean : 9.29 Mean : 859 Mean : 132
3rd Qu.: 9.00 3rd Qu.: 12.00 3rd Qu.: 1082 3rd Qu.: 169
Max. :112.00 Max. :339.00 Max. :30171 Max. : 8069
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
items pieces total gross
99.9% 54 81.00 9009.6 1824.7
99.95% 62 94.29 10611.6 2179.8
99.99% 82 133.00 16044.4 3226.5
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
par(cex=0.8)
hist(X$date, "weeks", freq=T, border='lightgray', col='darkcyan',
las=2, main="No. Transaction per Week")
Ad0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area), # area code
) %>% data.frame # 33241
summary(A)
cust r s f m
Min. : 1069 Min. : 1.0 Min. : 1.0 Min. : 1.0 Min. : 8
1st Qu.: 1088519 1st Qu.: 9.0 1st Qu.: 56.0 1st Qu.: 1.0 1st Qu.: 365
Median : 1663402 Median : 26.0 Median : 92.0 Median : 2.0 Median : 706
Mean : 1473585 Mean : 37.5 Mean : 80.8 Mean : 3.7 Mean : 993
3rd Qu.: 1958089 3rd Qu.: 60.0 3rd Qu.:110.0 3rd Qu.: 4.0 3rd Qu.: 1291
Max. :20002000 Max. :120.0 Max. :120.0 Max. :85.0 Max. :12636
rev raw age area
Min. : 8 Min. : -784 D :6580 E :10800
1st Qu.: 707 1st Qu.: 75 C :5915 F : 8539
Median : 1750 Median : 241 E :5080 G : 3695
Mean : 3152 Mean : 485 F :3719 C : 3683
3rd Qu.: 3968 3rd Qu.: 612 B :3193 D : 2166
Max. :127686 Max. :20273 G :2183 H : 1453
(Other):5571 (Other): 1905
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")
range(X$date)
[1] "2000-11-01" "2001-02-28"
使用一月底(含2001-01-31)以前的資料,建立模型來預測每一位顧客:
【Q】 What are the Unit of Analysis?
【Q】 What are the Target of Analysis? Should we model for every customers in the dataset? Why not?
【Q】 How to make the Training/Testing Data Split?
【Q】 What are the Predicting and Targeted Variables?
Screen out the new customers (who arrive after 2001-02-01)
A = filter(A0, s > 28) # 28584
mean(A$r <= 28)
[1] 0.4633
library(caTools)
set.seed(1234); spl = sample.split(A$r <= 28, SplitRatio=0.75)
cid1 = subset(A, spl)$cust # 21438
cid2 = subset(A, !spl)$cust # 7146
cid1/cid2 are the customers ids in the training/testing data. But, …
【Q】 What are the Predicting (X) and Targeted Variables (Y)?