資料準備流程

Fig-1: Data Preparation

Fig-1: Data Preparation


Preparing The Predictors (X)

Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
rm(list=ls(all=TRUE))
load("data/tf0.rdata")
The Demarcation Date

Remove data after the demarcation date

feb01 = as.Date("2001-02-01")
Z = subset(Z0, date < feb01)    # 618212
Aggregate for the Transaction Records
X = group_by(Z, tid) %>% summarise(
  date = first(date),  # 交易日期
  cust = first(cust),  # 顧客 ID
  age = first(age),    # 顧客 年齡級別
  area = first(area),  # 顧客 居住區別
  items = n(),                # 交易項目(總)數
  pieces = sum(qty),          # 產品(總)件數
  total = sum(price),         # 交易(總)金額
  gross = sum(price - cost)   # 毛利
  ) %>% data.frame  # 88387
summary(X)
      tid             date                 cust               age             area      
 Min.   :    1   Min.   :2000-11-01   Min.   :    1069   D      :17541   E      :37496  
 1st Qu.:22098   1st Qu.:2000-11-23   1st Qu.:  923910   C      :14624   F      :25412  
 Median :44194   Median :2000-12-12   Median : 1607000   E      :14578   G      : 6787  
 Mean   :44194   Mean   :2000-12-15   Mean   : 1395768   F      :10354   C      : 6329  
 3rd Qu.:66290   3rd Qu.:2001-01-12   3rd Qu.: 1888874   B      : 7817   H      : 5524  
 Max.   :88387   Max.   :2001-01-31   Max.   :20002000   G      : 6308   D      : 3655  
                                                         (Other):17165   (Other): 3184  
     items            pieces           total           gross      
 Min.   :  1.00   Min.   :  1.00   Min.   :    5   Min.   :-1645  
 1st Qu.:  2.00   1st Qu.:  3.00   1st Qu.:  230   1st Qu.:   23  
 Median :  5.00   Median :  6.00   Median :  522   Median :   72  
 Mean   :  6.99   Mean   :  9.45   Mean   :  889   Mean   :  138  
 3rd Qu.:  9.00   3rd Qu.: 12.00   3rd Qu.: 1120   3rd Qu.:  174  
 Max.   :112.00   Max.   :339.00   Max.   :30171   Max.   : 8069  
                                                                  
Check Quantile and Remove Outlier
sapply(X[,6:9], quantile, prob=c(.999, .9995, .9999))
        items pieces   total  gross
99.9%  56.000  84.00  9378.7 1883.2
99.95% 64.000  98.00 11261.8 2317.1
99.99% 85.646 137.65 17699.3 3389.6
X = subset(X, items<=64 & pieces<=98 & total<=11260) # 88387 -> 88295
Aggregate for Customer Records
d0 = max(X$date)
A = group_by(X, cust) %>% summarise(
  r = 1 + as.integer(difftime(d0, max(date), units="days")), # recency
  s = 1 + as.integer(difftime(d0, min(date), units="days")), # seniority
  f = n(),            # frquency
  m = mean(total),    # monetary
  rev = sum(total),   # total revenue contribution
  raw = sum(gross),   # total gross profit contribution
  age = first(age),   # age group
  area = first(area), # area code
  ) %>% data.frame    # 28584



Preparing the Target Variables (Y)

Aggregate Feb’s Transaction by Customer
feb = filter(X0, date>= feb01) %>% group_by(cust) %>% 
  summarise(amount = sum(total))  # 16899
The Target for Regression - A$amount

Simply a Left Joint

A = merge(A, feb, by="cust", all.x=T)
The Target for Classification - A$buy
A$buy = !is.na(A$amount)
Summary of the Dataset
summary(A)
      cust                r              s              f               m              rev       
 Min.   :    1069   Min.   : 1.0   Min.   : 1.0   Min.   : 1.00   Min.   :    8   Min.   :    8  
 1st Qu.: 1060898   1st Qu.:11.0   1st Qu.:47.0   1st Qu.: 1.00   1st Qu.:  359   1st Qu.:  638  
 Median : 1654100   Median :21.0   Median :68.0   Median : 2.00   Median :  710   Median : 1566  
 Mean   : 1461070   Mean   :32.1   Mean   :61.3   Mean   : 3.09   Mean   : 1012   Mean   : 2711  
 3rd Qu.: 1945003   3rd Qu.:53.0   3rd Qu.:83.0   3rd Qu.: 4.00   3rd Qu.: 1315   3rd Qu.: 3426  
 Max.   :20002000   Max.   :92.0   Max.   :92.0   Max.   :60.00   Max.   :10634   Max.   :99597  
                                                                                                 
      raw             age            area          amount         buy         
 Min.   : -742   D      :5832   E      :9907   Min.   :    8   Mode :logical  
 1st Qu.:   70   C      :5238   F      :7798   1st Qu.:  454   FALSE:15342    
 Median :  218   E      :4514   C      :3169   Median :  993   TRUE :13242    
 Mean   :  421   F      :3308   G      :3052   Mean   : 1498                  
 3rd Qu.:  535   B      :2802   D      :1778   3rd Qu.: 1955                  
 Max.   :15565   G      :1940   H      :1295   Max.   :28089                  
                 (Other):4950   (Other):1585   NA's   :15342                  
The Association of Categorial Predictors
tapply(A$buy, A$age, mean) %>% barplot
abline(h = mean(A$buy), col='red')

tapply(A$buy, A$area, mean) %>% barplot
abline(h = mean(A$buy), col='red')

Contest Dataset
X = subset(X, cust %in% A$cust & date < as.Date("2001-02-01"))
Z = subset(Z, cust %in% A$cust & date < as.Date("2001-02-01"))
set.seed(2018); spl = sample.split(A$buy, SplitRatio=0.7)
c(nrow(A), sum(spl), sum(!spl))
[1] 28584 20008  8576
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
n = nrow(A2)
set.seed(2018); spl2 = 1:n %in% sample(1:n, round(0.7*n))
c(nrow(A2), sum(spl2), sum(!spl2))
[1] 13242  9269  3973
cbind(A2, spl2) %>% 
  ggplot(aes(x=amount)) + geom_density(aes(fill=spl2), alpha=0.5)

save(Z, X, A, spl, spl2, file="data/tf2.rdata")











LS0tDQp0aXRsZTogIkRhdGEgUHJlcGFyYXRpb24sIFRhLUZlbmciDQphdXRob3I6ICLljZPpm43nhLYsIOS4reWxseWkp+WtuCDnrqHnkIblrbjooZPnoJTnqbbkuK3lv4MiDQpkYXRlOiAiYHIgU3lzLnRpbWUoKWAiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQo8YnI+DQoNCiMjIyDos4fmlpnmupblgpnmtYHnqIsNCg0KPGNlbnRlcj4NCg0KIVtGaWctMTogRGF0YSBQcmVwYXJhdGlvbl0oZmlnL3ByZXBhcmF0aW9uLmpwZykNCg0KPC9jZW50ZXI+DQoNCjxocj4NCg0KIyMjIFByZXBhcmluZyBUaGUgUHJlZGljdG9ycyAoWCkNCmBgYHtyIGVjaG89VCwgbWVzc2FnZT1GLCBjYWNoZT1GLCB3YXJuaW5nPUZ9DQpTeXMuc2V0bG9jYWxlKCJMQ19BTEwiLCJDIikNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KGNhVG9vbHMpDQpgYGANCg0KYGBge3J9DQpybShsaXN0PWxzKGFsbD1UUlVFKSkNCmxvYWQoImRhdGEvdGYwLnJkYXRhIikNCmBgYA0KDQojIyMjIyBUaGUgRGVtYXJjYXRpb24gRGF0ZQ0KUmVtb3ZlIGRhdGEgYWZ0ZXIgdGhlIGRlbWFyY2F0aW9uIGRhdGUNCmBgYHtyfQ0KZmViMDEgPSBhcy5EYXRlKCIyMDAxLTAyLTAxIikNClogPSBzdWJzZXQoWjAsIGRhdGUgPCBmZWIwMSkgICAgIyA2MTgyMTINCmBgYA0KDQojIyMjIyBBZ2dyZWdhdGUgZm9yIHRoZSBUcmFuc2FjdGlvbiBSZWNvcmRzDQpgYGB7cn0NClggPSBncm91cF9ieShaLCB0aWQpICU+JSBzdW1tYXJpc2UoDQogIGRhdGUgPSBmaXJzdChkYXRlKSwgICMg5Lqk5piT5pel5pyfDQogIGN1c3QgPSBmaXJzdChjdXN0KSwgICMg6aGn5a6iIElEDQogIGFnZSA9IGZpcnN0KGFnZSksICAgICMg6aGn5a6iIOW5tOm9oee0muWIpQ0KICBhcmVhID0gZmlyc3QoYXJlYSksICAjIOmhp+WuoiDlsYXkvY/ljYDliKUNCiAgaXRlbXMgPSBuKCksICAgICAgICAgICAgICAgICMg5Lqk5piT6aCF55uuKOe4vSnmlbgNCiAgcGllY2VzID0gc3VtKHF0eSksICAgICAgICAgICMg55Si5ZOBKOe4vSnku7bmlbgNCiAgdG90YWwgPSBzdW0ocHJpY2UpLCAgICAgICAgICMg5Lqk5piTKOe4vSnph5HpoY0NCiAgZ3Jvc3MgPSBzdW0ocHJpY2UgLSBjb3N0KSAgICMg5q+b5YipDQogICkgJT4lIGRhdGEuZnJhbWUgICMgODgzODcNCmBgYA0KDQpgYGB7cn0NCnN1bW1hcnkoWCkNCmBgYA0KDQojIyMjIyBDaGVjayBRdWFudGlsZSBhbmQgUmVtb3ZlIE91dGxpZXIgDQpgYGB7cn0NCnNhcHBseShYWyw2OjldLCBxdWFudGlsZSwgcHJvYj1jKC45OTksIC45OTk1LCAuOTk5OSkpDQpgYGANCg0KYGBge3J9DQpYID0gc3Vic2V0KFgsIGl0ZW1zPD02NCAmIHBpZWNlczw9OTggJiB0b3RhbDw9MTEyNjApICMgODgzODcgLT4gODgyOTUNCmBgYA0KDQojIyMjIyBBZ2dyZWdhdGUgZm9yIEN1c3RvbWVyIFJlY29yZHMNCmBgYHtyfQ0KZDAgPSBtYXgoWCRkYXRlKQ0KQSA9IGdyb3VwX2J5KFgsIGN1c3QpICU+JSBzdW1tYXJpc2UoDQogIHIgPSAxICsgYXMuaW50ZWdlcihkaWZmdGltZShkMCwgbWF4KGRhdGUpLCB1bml0cz0iZGF5cyIpKSwgIyByZWNlbmN5DQogIHMgPSAxICsgYXMuaW50ZWdlcihkaWZmdGltZShkMCwgbWluKGRhdGUpLCB1bml0cz0iZGF5cyIpKSwgIyBzZW5pb3JpdHkNCiAgZiA9IG4oKSwgICAgICAgICAgICAjIGZycXVlbmN5DQogIG0gPSBtZWFuKHRvdGFsKSwgICAgIyBtb25ldGFyeQ0KICByZXYgPSBzdW0odG90YWwpLCAgICMgdG90YWwgcmV2ZW51ZSBjb250cmlidXRpb24NCiAgcmF3ID0gc3VtKGdyb3NzKSwgICAjIHRvdGFsIGdyb3NzIHByb2ZpdCBjb250cmlidXRpb24NCiAgYWdlID0gZmlyc3QoYWdlKSwgICAjIGFnZSBncm91cA0KICBhcmVhID0gZmlyc3QoYXJlYSksICMgYXJlYSBjb2RlDQogICkgJT4lIGRhdGEuZnJhbWUgICAgIyAyODU4NA0KYGBgDQo8YnI+PGJyPjxocj4NCg0KIyMjIFByZXBhcmluZyB0aGUgVGFyZ2V0IFZhcmlhYmxlcyAoWSkNCg0KIyMjIyMgQWdncmVnYXRlIEZlYidzIFRyYW5zYWN0aW9uIGJ5IEN1c3RvbWVyDQpgYGB7cn0NCmZlYiA9IGZpbHRlcihYMCwgZGF0ZT49IGZlYjAxKSAlPiUgZ3JvdXBfYnkoY3VzdCkgJT4lIA0KICBzdW1tYXJpc2UoYW1vdW50ID0gc3VtKHRvdGFsKSkgICMgMTY4OTkNCmBgYA0KDQojIyMjIyBUaGUgVGFyZ2V0IGZvciBSZWdyZXNzaW9uIC0gYEEkYW1vdW50YA0KU2ltcGx5IGEgTGVmdCBKb2ludA0KYGBge3J9DQpBID0gbWVyZ2UoQSwgZmViLCBieT0iY3VzdCIsIGFsbC54PVQpDQpgYGANCg0KIyMjIyMgVGhlIFRhcmdldCBmb3IgQ2xhc3NpZmljYXRpb24gLSBgQSRidXlgDQpgYGB7cn0NCkEkYnV5ID0gIWlzLm5hKEEkYW1vdW50KQ0KYGBgDQoNCiMjIyMjIFN1bW1hcnkgb2YgdGhlIERhdGFzZXQNCmBgYHtyfQ0Kc3VtbWFyeShBKQ0KYGBgDQoNCiMjIyMjIFRoZSBBc3NvY2lhdGlvbiBvZiBDYXRlZ29yaWFsIFByZWRpY3RvcnMNCmBgYHtyIGZpZy5oZWlnaHQ9MywgZmlnLndpZHRoPTcuMn0NCnRhcHBseShBJGJ1eSwgQSRhZ2UsIG1lYW4pICU+JSBiYXJwbG90DQphYmxpbmUoaCA9IG1lYW4oQSRidXkpLCBjb2w9J3JlZCcpDQpgYGANCg0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9Ny4yfQ0KdGFwcGx5KEEkYnV5LCBBJGFyZWEsIG1lYW4pICU+JSBiYXJwbG90DQphYmxpbmUoaCA9IG1lYW4oQSRidXkpLCBjb2w9J3JlZCcpDQpgYGANCg0KIyMjIyMgQ29udGVzdCBEYXRhc2V0DQpgYGB7cn0NClggPSBzdWJzZXQoWCwgY3VzdCAlaW4lIEEkY3VzdCAmIGRhdGUgPCBhcy5EYXRlKCIyMDAxLTAyLTAxIikpDQpaID0gc3Vic2V0KFosIGN1c3QgJWluJSBBJGN1c3QgJiBkYXRlIDwgYXMuRGF0ZSgiMjAwMS0wMi0wMSIpKQ0Kc2V0LnNlZWQoMjAxOCk7IHNwbCA9IHNhbXBsZS5zcGxpdChBJGJ1eSwgU3BsaXRSYXRpbz0wLjcpDQpjKG5yb3coQSksIHN1bShzcGwpLCBzdW0oIXNwbCkpDQoNCkEyID0gc3Vic2V0KEEsIGJ1eSkgJT4lIG11dGF0ZV9hdChjKCJtIiwicmV2IiwiYW1vdW50IiksIGxvZzEwKQ0KbiA9IG5yb3coQTIpDQpzZXQuc2VlZCgyMDE4KTsgc3BsMiA9IDE6biAlaW4lIHNhbXBsZSgxOm4sIHJvdW5kKDAuNypuKSkNCmMobnJvdyhBMiksIHN1bShzcGwyKSwgc3VtKCFzcGwyKSkNCmBgYA0KDQpgYGB7ciBmaWcuaGVpZ2h0PTMsIGZpZy53aWR0aD03fQ0KY2JpbmQoQTIsIHNwbDIpICU+JSANCiAgZ2dwbG90KGFlcyh4PWFtb3VudCkpICsgZ2VvbV9kZW5zaXR5KGFlcyhmaWxsPXNwbDIpLCBhbHBoYT0wLjUpDQpgYGANCg0KYGBge3J9DQpzYXZlKFosIFgsIEEsIHNwbCwgc3BsMiwgZmlsZT0iZGF0YS90ZjIucmRhdGEiKQ0KYGBgDQoNCjxicj48YnI+PGhyPg0KDQo8YnI+PGJyPjxicj48YnI+PGhyPjxicj48YnI+PGJyPg0KDQo8c3R5bGU+DQoNCi5jYXB0aW9uIHsNCiAgY29sb3I6ICM3Nzc7DQogIG1hcmdpbi10b3A6IDEwcHg7DQp9DQpwIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnByZSB7DQogIHdvcmQtYnJlYWs6IG5vcm1hbDsNCiAgd29yZC13cmFwOiBub3JtYWw7DQogIGxpbmUtaGVpZ2h0OiAxOw0KfQ0KcHJlIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnAsbGkgew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KLnJ7DQogIGxpbmUtaGVpZ2h0OiAxLjI7DQp9DQoNCi5xaXogew0KICBsaW5lLWhlaWdodDogMS43NTsNCiAgYmFja2dyb3VuZDogI2YwZjBmMDsNCiAgYm9yZGVyLWxlZnQ6IDEycHggc29saWQgI2NjZmZjYzsNCiAgcGFkZGluZzogNHB4Ow0KICBwYWRkaW5nLWxlZnQ6IDEwcHg7DQogIGNvbG9yOiAjMDA5OTAwOw0KfQ0KDQp0aXRsZXsNCiAgY29sb3I6ICNjYzAwMDA7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpib2R5ew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KaDEsaDIsaDMsaDQsaDV7DQogIGNvbG9yOiAjMDA2NmZmOw0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KDQpoM3sNCiAgY29sb3I6ICMwMDg4MDA7DQogIGJhY2tncm91bmQ6ICNlNmZmZTY7DQogIGxpbmUtaGVpZ2h0OiAyOw0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KaDV7DQogIGNvbG9yOiAjMDA2MDAwOw0KICBiYWNrZ3JvdW5kOiAjZjhmOGY4Ow0KICBsaW5lLWhlaWdodDogMS41Ow0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KDQo8L3N0eWxlPg0KDQo=