變數製作與資料分割流程

Fig-1: Feature Engineering

Fig-1: Feature Engineering

Fig-2: Feature Engr. & Data Spliting Process

Fig-2: Feature Engr. & Data Spliting Process



Loading & Preparing Data

Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
[1] 20008  9269


Weekday Percentage: W1 ~ W7

X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)

    0     1     2     3     4     5     6 
18011 12615 11288  9898 11245 10651 14587 
mx = xtabs(~ cust + wday, X)
dim(mx)
[1] 28584     7
mx[1:5,]
      wday
cust   0 1 2 3 4 5 6
  1069 1 1 0 0 0 0 0
  1113 2 1 0 0 0 0 1
  1359 0 1 0 0 0 0 0
  1823 0 1 0 1 1 0 0
  2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
      wday
cust         0       1       2       3       4       5       6
  1069 0.50000 0.50000 0.00000 0.00000 0.00000 0.00000 0.00000
  1113 0.50000 0.25000 0.00000 0.00000 0.00000 0.00000 0.25000
  1359 0.00000 1.00000 0.00000 0.00000 0.00000 0.00000 0.00000
  1823 0.00000 0.33333 0.00000 0.33333 0.33333 0.00000 0.00000
  2189 0.00000 0.00000 0.00000 0.50000 0.00000 0.00000 0.50000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>% 
  setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>% 
  right_join(A, by='cust')
head(A)
  cust  W1      W2  W3      W4      W5  W6   W7  r  s f      m   rev  raw age area amount   buy
1 1069 0.5 0.50000 0.0 0.00000 0.00000 0.0 0.00 11 80 2  579.0  1158  129   K    E    786  TRUE
2 1113 0.5 0.25000 0.0 0.00000 0.00000 0.0 0.25 26 81 4  557.5  2230  241   K    F     NA FALSE
3 1359 0.0 1.00000 0.0 0.00000 0.00000 0.0 0.00 59 59 1  364.0   364  104   K    G     NA FALSE
4 1823 0.0 0.33333 0.0 0.33333 0.33333 0.0 0.00  8 91 3  869.0  2607  498   K    D     NA FALSE
5 2189 0.0 0.00000 0.0 0.50000 0.00000 0.0 0.50 29 61 2 7028.0 14056 3299   K    B     NA FALSE
6 3667 0.0 0.00000 0.5 0.00000 0.00000 0.5 0.00 37 55 2 2379.5  4759  351   K    G   1570  TRUE


Classification (Buy) Model

TR = subset(A, spl)
TS = subset(A, !spl)
library(rpart)
library(rpart.plot)
rpart1 = rpart(buy ~ ., TR[,c(2:16,18)], method="class")
pred =  predict(rpart1, TS)[,2]  # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3730  873
  TRUE   1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   # 0.70662          
[1] 0.70662
colAUC(pred, TS$buy)                            # 0.6984
                 [,1]
FALSE vs. TRUE 0.6984
rpart.plot(rpart1,cex=0.6)

rpart2 = rpart(buy ~ ., TR[,c(2:16,18)], method="class",cp=0.001)
pred =  predict(rpart2, TS)[,2]  # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3878  725
  TRUE   1812 2161
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   # 0.70417          
[1] 0.70417
colAUC(pred, TS$buy)                            # 0.7169         
                 [,1]
FALSE vs. TRUE 0.7169
rpart.plot(rpart2,cex=0.6)

Regression (Amount) Model

A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
rpart3 = rpart(amount ~ ., TR2[,c(2:17)], cp=0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart3, TS2) -  TS2$amount)^2)
1 - (SSE/SST)
[1] 0.24606







LS0tDQp0aXRsZTogIkZlYXR1cmUgRW5naW5lZXJpbmcsIFRhLUZlbmciDQphdXRob3I6ICLljZPpm43nhLYsIOS4reWxseWkp+WtuCDnrqHnkIblrbjooZPnoJTnqbbkuK3lv4MiDQpkYXRlOiAiYHIgU3lzLnRpbWUoKWAiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQo8YnI+DQoNCiMjIyDorormlbjoo73kvZzoiIfos4fmlpnliIblibLmtYHnqIsNCg0KPGNlbnRlcj4NCg0KIVtGaWctMTogRmVhdHVyZSBFbmdpbmVlcmluZ10oZmlnL2ZlYXR1cmluZy5qcGcpDQoNCiFbRmlnLTI6IEZlYXR1cmUgRW5nci4gJiBEYXRhIFNwbGl0aW5nIFByb2Nlc3NdKGZpZy9mZWF0dXJlX2VuZ3IuanBnKQ0KDQoNCjwvY2VudGVyPg0KDQo8YnI+PGhyPg0KDQojIyMgTG9hZGluZyAmIFByZXBhcmluZyBEYXRhDQpgYGB7ciBlY2hvPVQsIG1lc3NhZ2U9RiwgY2FjaGU9Riwgd2FybmluZz1GfQ0KU3lzLnNldGxvY2FsZSgiTENfQUxMIiwiQyIpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShjYVRvb2xzKQ0KbGlicmFyeShNYXRyaXgpDQpsaWJyYXJ5KHNsYW0pDQpsaWJyYXJ5KHJwYXJ0KQ0KbGlicmFyeShycGFydC5wbG90KQ0KYGBgDQoNCmBgYHtyfQ0Kcm0obGlzdD1scyhhbGw9VFJVRSkpDQpsb2FkKCJkYXRhL3RmMi5yZGF0YSIpDQpBMiA9IHN1YnNldChBLCBidXkpDQpjKHN1bShzcGwpLCBzdW0oc3BsMikpDQpgYGANCjxicj48aHI+DQoNCiMjIyBXZWVrZGF5IFBlcmNlbnRhZ2U6IFcxIH4gVzcNCmBgYHtyfQ0KWCA9IFggJT4lIG11dGF0ZSh3ZGF5ID0gZm9ybWF0KGRhdGUsICIldyIpKQ0KdGFibGUoWCR3ZGF5KQ0KYGBgDQoNCmBgYHtyfQ0KbXggPSB4dGFicyh+IGN1c3QgKyB3ZGF5LCBYKQ0KZGltKG14KQ0KYGBgDQoNCmBgYHtyfQ0KbXhbMTo1LF0NCmBgYA0KDQpgYGB7cn0NCm14ID0gbXggLyByb3dTdW1zKG14KQ0KbXhbMTo1LF0NCmBgYA0KDQpgYGB7cn0NCkEgPSBkYXRhLmZyYW1lKGFzLmludGVnZXIocm93bmFtZXMobXgpKSwgYXMubWF0cml4LmRhdGEuZnJhbWUobXgpKSAlPiUgDQogIHNldE5hbWVzKGMoImN1c3QiLCJXMSIsIlcyIiwiVzMiLCJXNCIsIlc1IiwiVzYiLCJXNyIpKSAlPiUgDQogIHJpZ2h0X2pvaW4oQSwgYnk9J2N1c3QnKQ0KaGVhZChBKQ0KYGBgDQo8YnI+PGhyPg0KDQojIyMgQ2xhc3NpZmljYXRpb24gKEJ1eSkgTW9kZWwNCmBgYHtyfQ0KVFIgPSBzdWJzZXQoQSwgc3BsKQ0KVFMgPSBzdWJzZXQoQSwgIXNwbCkNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkocnBhcnQpDQpsaWJyYXJ5KHJwYXJ0LnBsb3QpDQpycGFydDEgPSBycGFydChidXkgfiAuLCBUUlssYygyOjE2LDE4KV0sIG1ldGhvZD0iY2xhc3MiKQ0KcHJlZCA9ICBwcmVkaWN0KHJwYXJ0MSwgVFMpWywyXSAgIyBwcmVkaWN0IHByb2INCmNtID0gdGFibGUoYWN0dWFsID0gVFMkYnV5LCBwcmVkaWN0ID0gcHJlZCA+IDAuNSk7IGNtDQphY2MudHMgPSBjbSAlPiUge3N1bShkaWFnKC4pKS9zdW0oLil9OyBhY2MudHMgICAjIDAuNzA2NjIgICAgICAgICAgDQpjb2xBVUMocHJlZCwgVFMkYnV5KSAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIDAuNjk4NA0KYGBgDQoNCg0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9Ny4yfQ0KcnBhcnQucGxvdChycGFydDEsY2V4PTAuNikNCmBgYA0KDQpgYGB7cn0NCnJwYXJ0MiA9IHJwYXJ0KGJ1eSB+IC4sIFRSWyxjKDI6MTYsMTgpXSwgbWV0aG9kPSJjbGFzcyIsY3A9MC4wMDEpDQpwcmVkID0gIHByZWRpY3QocnBhcnQyLCBUUylbLDJdICAjIHByZWRpY3QgcHJvYg0KY20gPSB0YWJsZShhY3R1YWwgPSBUUyRidXksIHByZWRpY3QgPSBwcmVkID4gMC41KTsgY20NCmFjYy50cyA9IGNtICU+JSB7c3VtKGRpYWcoLikpL3N1bSguKX07IGFjYy50cyAgICMgMC43MDQxNyAgICAgICAgICANCmNvbEFVQyhwcmVkLCBUUyRidXkpICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgMC43MTY5ICAgICAgICAgDQpgYGANCg0KYGBge3J9DQpycGFydC5wbG90KHJwYXJ0MixjZXg9MC42KQ0KYGBgDQoNCiMjIyBSZWdyZXNzaW9uIChBbW91bnQpIE1vZGVsDQpgYGB7cn0NCkEyID0gc3Vic2V0KEEsIGJ1eSkgJT4lIG11dGF0ZV9hdChjKCJtIiwicmV2IiwiYW1vdW50IiksIGxvZzEwKQ0KVFIyID0gc3Vic2V0KEEyLCBzcGwyKQ0KVFMyID0gc3Vic2V0KEEyLCAhc3BsMikNCmBgYA0KDQpgYGB7cn0NCnJwYXJ0MyA9IHJwYXJ0KGFtb3VudCB+IC4sIFRSMlssYygyOjE3KV0sIGNwPTAuMDAyKQ0KU1NUID0gc3VtKChUUzIkYW1vdW50IC0gbWVhbihUUjIkYW1vdW50KSleIDIpDQpTU0UgPSBzdW0oKHByZWRpY3QocnBhcnQzLCBUUzIpIC0gIFRTMiRhbW91bnQpXjIpDQoxIC0gKFNTRS9TU1QpDQpgYGANCg0KDQoNCg0KDQoNCjxicj48YnI+PGhyPjxicj48YnI+PGJyPjxicj4NCjxzdHlsZT4NCg0KLmNhcHRpb24gew0KICBjb2xvcjogIzc3NzsNCiAgbWFyZ2luLXRvcDogMTBweDsNCn0NCnAgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcHJlIHsNCiAgd29yZC1icmVhazogbm9ybWFsOw0KICB3b3JkLXdyYXA6IG5vcm1hbDsNCiAgbGluZS1oZWlnaHQ6IDE7DQp9DQpwcmUgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcCxsaSB7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoucnsNCiAgbGluZS1oZWlnaHQ6IDEuMjsNCn0NCg0KLnFpeiB7DQogIGxpbmUtaGVpZ2h0OiAxLjc1Ow0KICBiYWNrZ3JvdW5kOiAjZjBmMGYwOw0KICBib3JkZXItbGVmdDogMTJweCBzb2xpZCAjY2NmZmNjOw0KICBwYWRkaW5nOiA0cHg7DQogIHBhZGRpbmctbGVmdDogMTBweDsNCiAgY29sb3I6ICMwMDk5MDA7DQp9DQoNCnRpdGxlew0KICBjb2xvcjogI2NjMDAwMDsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmJvZHl7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpoMSxoMixoMyxoNCxoNXsNCiAgY29sb3I6ICMwMDY2ZmY7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoNCmgzew0KICBjb2xvcjogIzAwODgwMDsNCiAgYmFja2dyb3VuZDogI2U2ZmZlNjsNCiAgbGluZS1oZWlnaHQ6IDI7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNXsNCiAgY29sb3I6ICMwMDYwMDA7DQogIGJhY2tncm91bmQ6ICNmOGY4Zjg7DQogIGxpbmUtaGVpZ2h0OiAxLjU7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQo8L3N0eWxlPg0K