Fig-1: Pata Preparation
Zrm(list=ls(all=T))
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
do.call-rbind-lapply ComboZ = do.call(rbind, lapply(
dir('data/TaFengDataSet','.*csv$',full.names=T),
read.csv, header=F)
) %>%
setNames(c("date","cust","age","area","cat","prod","qty","cost","price"))
Z
Z$date = as.Date(as.character(Z$date))
summary(Z)
date cust age
Min. :2000-11-01 Min. : 1069 D :181213
1st Qu.:2000-11-28 1st Qu.: 969222 E :151023
Median :2001-01-01 Median : 1587722 C :140805
Mean :2000-12-30 Mean : 1406620 F : 99719
3rd Qu.:2001-01-30 3rd Qu.: 1854930 B : 66432
Max. :2001-02-28 Max. :20002000 G : 53719
(Other):124830
area cat prod
E :312501 Min. :100101 Min. : 20008819
F :245213 1st Qu.:110106 1st Qu.:4710085127020
G : 72092 Median :130106 Median :4710421090060
C : 71640 Mean :284950 Mean :4461639280530
H : 40666 3rd Qu.:520314 3rd Qu.:4712500125130
D : 38674 Max. :780510 Max. :9789579967620
(Other): 36955
qty cost price
Min. : 1.0 Min. : 0 Min. : 1
1st Qu.: 1.0 1st Qu.: 35 1st Qu.: 42
Median : 1.0 Median : 62 Median : 76
Mean : 1.4 Mean : 112 Mean : 132
3rd Qu.: 1.0 3rd Qu.: 112 3rd Qu.: 132
Max. :1200.0 Max. :432000 Max. :444000
sapply(Z[,7:9], quantile, prob=c(.99, .999, .9995))
qty cost price
99% 6 858 1014
99.9% 14 2722 3136
99.95% 24 3799 3999
Z = subset(Z, qty<=24 & cost<=3800 & price<=4000)
nrow(Z)
[1] 817182
#為一筆交易產生id
Z$tid = group_indices(Z, date, cust)
Z
sapply(Z[,c("cust","cat","prod","tid")], n_distinct)
cust cat prod tid
32256 2007 23789 119422
summary(Z)
date cust age
Min. :2000-11-01 Min. : 1069 D :181089
1st Qu.:2000-11-28 1st Qu.: 968775 E :150947
Median :2001-01-01 Median : 1587685 C :140721
Mean :2000-12-30 Mean : 1406500 F : 99641
3rd Qu.:2001-01-30 3rd Qu.: 1854701 B : 66353
Max. :2001-02-28 Max. :20002000 G : 53689
(Other):124742
area cat prod
E :312358 Min. :100101 Min. : 20008819
F :245079 1st Qu.:110106 1st Qu.:4710085127020
G : 71905 Median :130106 Median :4710421090060
C : 71600 Mean :284784 Mean :4461978778400
H : 40647 3rd Qu.:520311 3rd Qu.:4712500125130
D : 38654 Max. :780510 Max. :9789579967620
(Other): 36939
qty cost price tid
Min. : 1.00 Min. : 0 Min. : 1 Min. : 1
1st Qu.: 1.00 1st Qu.: 35 1st Qu.: 42 1st Qu.: 28783
Median : 1.00 Median : 62 Median : 76 Median : 59391
Mean : 1.36 Mean : 106 Mean : 126 Mean : 58845
3rd Qu.: 1.00 3rd Qu.: 112 3rd Qu.: 132 3rd Qu.: 87391
Max. :24.00 Max. :3798 Max. :4000 Max. :119422
XX = group_by(Z, tid) %>% summarise(
date = first(date), # 交易日期
cust = first(cust), # 顧客 ID
age = first(age), # 顧客 年齡級別
area = first(area), # 顧客 居住區別
items = n(), # 交易項目(總)數
pieces = sum(qty), # 產品(總)件數
total = sum(price), # 交易(總)金額
gross = sum(price - cost) # 毛利
) %>% data.frame # 119422
X
summary(X)
tid date cust
Min. : 1 Min. :2000-11-01 Min. : 1069
1st Qu.: 29856 1st Qu.:2000-11-29 1st Qu.: 927093
Median : 59712 Median :2001-01-01 Median : 1615661
Mean : 59712 Mean :2000-12-31 Mean : 1402548
3rd Qu.: 89567 3rd Qu.:2001-02-02 3rd Qu.: 1894493
Max. :119422 Max. :2001-02-28 Max. :20002000
age area items pieces
D :23775 E :50532 Min. : 1.00 Min. : 1.0
C :19661 F :33826 1st Qu.: 2.00 1st Qu.: 3.0
E :19596 G : 9498 Median : 5.00 Median : 6.0
F :13992 C : 8527 Mean : 6.84 Mean : 9.3
B :10515 H : 7502 3rd Qu.: 9.00 3rd Qu.: 12.0
G : 8493 D : 5108 Max. :112.00 Max. :339.0
(Other):23390 (Other): 4429
total gross
Min. : 5 Min. :-1645
1st Qu.: 227 1st Qu.: 21
Median : 510 Median : 68
Mean : 859 Mean : 132
3rd Qu.: 1082 3rd Qu.: 169
Max. :30171 Max. : 8069
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
items pieces total gross
99.9% 54 81.00 9010 1825
99.95% 62 94.29 10612 2180
99.99% 82 133.00 16044 3227
X = subset(X, items<=62 & pieces<95 & total<16000) # 119328
par(cex=0.8)
hist(X$date, "weeks", freq=T, border='lightgray', col='darkcyan',
las=2, main="No. Transaction per Week")
A#r:顧客最後一次消費距離今日
#S:顧客最近一次消費距離今日
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
f = n(), # frquency
m = mean(total), # monetary
rev = sum(total), # total revenue contribution
raw = sum(gross), # total gross profit contribution
age = first(age), # age group
area = first(area) # area code
) %>% data.frame # 33241
A
summary(A)
cust r s f
Min. : 1069 Min. : 1.0 Min. : 1.0 Min. : 1.0
1st Qu.: 1088519 1st Qu.: 9.0 1st Qu.: 56.0 1st Qu.: 1.0
Median : 1663402 Median : 26.0 Median : 92.0 Median : 2.0
Mean : 1473585 Mean : 37.5 Mean : 80.8 Mean : 3.7
3rd Qu.: 1958089 3rd Qu.: 60.0 3rd Qu.:110.0 3rd Qu.: 4.0
Max. :20002000 Max. :120.0 Max. :120.0 Max. :85.0
m rev raw age
Min. : 8 Min. : 8 Min. : -784 D :6580
1st Qu.: 365 1st Qu.: 707 1st Qu.: 75 C :5915
Median : 706 Median : 1750 Median : 241 E :5080
Mean : 993 Mean : 3152 Mean : 485 F :3719
3rd Qu.: 1291 3rd Qu.: 3968 3rd Qu.: 612 B :3193
Max. :12636 Max. :127686 Max. :20273 G :2183
(Other):5571
area
E :10800
F : 8539
G : 3695
C : 3683
D : 2166
H : 1453
(Other): 1905
par(mfrow=c(3,2), mar=c(3,3,4,2))
for(x in c('r','s','f','m'))
hist(A[,x],freq=T,main=x,xlab="",ylab="",cex.main=2)
hist(pmin(A$f,10),0:10,freq=T,xlab="",ylab="",cex.main=2)
hist(log(A$m,10),freq=T,xlab="",ylab="",cex.main=2)
A0 = A; X0 = X; Z0 = Z
save(Z0, X0, A0, file="data/tf0.rdata")
range(X$date)
[1] "2000-11-01" "2001-02-28"
使用一月底(含2001-01-31)以前的資料,建立模型來預測每一位顧客:
【Q】 What are the Unit of Analysis?
【Q】 What are the Target of Analysis? Should we model for every customers in the dataset? Why not?
【Q】 How to make the Training/Testing Data Split?
【Q】 What are the Predicting and Targeted Variables?
Screen out the new customers (who arrive after 2001-02-01)
A = filter(A0, s > 28) # 28584
mean(A$r <= 28)
[1] 0.4633
library(caTools)
set.seed(1234); spl = sample.split(A$r <= 28, SplitRatio=0.75)
cid1 = subset(A, spl)$cust # 21438
cid2 = subset(A, !spl)$cust # 7146
cid1/cid2 are the customers ids in the training/testing data. But, …
【Q】 What are the Predicting (X) and Targeted Variables (Y)?