變數製作與資料分割流程

Fig-1: Feature Engineering

Fig-1: Feature Engineering

Fig-2: Feature Engr. & Data Spliting Process

Fig-2: Feature Engr. & Data Spliting Process



Loading & Preparing Data

Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
[1] 20008  9269


Weekday Percentage: W1 ~ W7

X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)

    0     1     2     3     4     5     6 
18011 12615 11288  9898 11245 10651 14587 
mx = xtabs(~ cust + wday, X)
dim(mx)
[1] 28584     7
mx[1:5,]
      wday
cust   0 1 2 3 4 5 6
  1069 1 1 0 0 0 0 0
  1113 2 1 0 0 0 0 1
  1359 0 1 0 0 0 0 0
  1823 0 1 0 1 1 0 0
  2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
      wday
cust           0         1         2         3         4         5
  1069 0.5000000 0.5000000 0.0000000 0.0000000 0.0000000 0.0000000
  1113 0.5000000 0.2500000 0.0000000 0.0000000 0.0000000 0.0000000
  1359 0.0000000 1.0000000 0.0000000 0.0000000 0.0000000 0.0000000
  1823 0.0000000 0.3333333 0.0000000 0.3333333 0.3333333 0.0000000
  2189 0.0000000 0.0000000 0.0000000 0.5000000 0.0000000 0.0000000
      wday
cust           6
  1069 0.0000000
  1113 0.2500000
  1359 0.0000000
  1823 0.0000000
  2189 0.5000000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>% 
  setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>% 
  right_join(A, by='cust')
head(A)


Classification (Buy) Model

TR = subset(A, spl)
TS = subset(A, !spl)
library(rpart)
library(rpart.plot)
rpart1 = rpart(buy ~ ., TR[,c(2:16,18)], method="class")
pred =  predict(rpart1, TS)[,2]  # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
       predict
actual  FALSE TRUE
  FALSE  3730  873
  TRUE   1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts   # 0.70662          
[1] 0.7066231
colAUC(pred, TS$buy)                            # 0.6984
                    [,1]
FALSE vs. TRUE 0.6983998

rr rpart.plot(rpart1,cex=0.6)

[1] 0.70417

rr colAUC(pred, TS$buy) # 0.7169

                 [,1]
FALSE vs. TRUE 0.7169

rr rpart.plot(rpart2,cex=0.6)

Regression (Amount) Model

rr rpart3 = rpart(amount ~ ., TR2[,c(2:17)], cp=0.002) SST = sum((TS2\(amount - mean(TR2\)amount))^ 2) SSE = sum((predict(rpart3, TS2) - TS2$amount)^2) 1 - (SSE/SST)

[1] 0.24606







LS0tDQp0aXRsZTogIkZlYXR1cmUgRW5naW5lZXJpbmcsIFRhLUZlbmciDQphdXRob3I6ICLljZPpm43nhLYsIOS4reWxseWkp+WtuCDnrqHnkIblrbjooZPnoJTnqbbkuK3lv4MiDQpkYXRlOiAiYHIgU3lzLnRpbWUoKWAiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQo8YnI+DQoNCiMjIyDorormlbjoo73kvZzoiIfos4fmlpnliIblibLmtYHnqIsNCg0KPGNlbnRlcj4NCg0KIVtGaWctMTogRmVhdHVyZSBFbmdpbmVlcmluZ10oZmlnL2ZlYXR1cmluZy5qcGcpDQoNCiFbRmlnLTI6IEZlYXR1cmUgRW5nci4gJiBEYXRhIFNwbGl0aW5nIFByb2Nlc3NdKGZpZy9mZWF0dXJlX2VuZ3IuanBnKQ0KDQoNCjwvY2VudGVyPg0KDQo8YnI+PGhyPg0KDQojIyMgTG9hZGluZyAmIFByZXBhcmluZyBEYXRhDQpgYGB7ciBlY2hvPVQsIG1lc3NhZ2U9RiwgY2FjaGU9Riwgd2FybmluZz1GfQ0KU3lzLnNldGxvY2FsZSgiTENfQUxMIiwiQyIpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShjYVRvb2xzKQ0KbGlicmFyeShNYXRyaXgpDQpsaWJyYXJ5KHNsYW0pDQpsaWJyYXJ5KHJwYXJ0KQ0KbGlicmFyeShycGFydC5wbG90KQ0KYGBgDQoNCmBgYHtyfQ0Kcm0obGlzdD1scyhhbGw9VFJVRSkpDQpsb2FkKCJkYXRhL3RmMi5yZGF0YSIpDQpBMiA9IHN1YnNldChBLCBidXkpDQpjKHN1bShzcGwpLCBzdW0oc3BsMikpDQpgYGANCjxicj48aHI+DQoNCiMjIyBXZWVrZGF5IFBlcmNlbnRhZ2U6IFcxIH4gVzcNCmBgYHtyfQ0KWCA9IFggJT4lIG11dGF0ZSh3ZGF5ID0gZm9ybWF0KGRhdGUsICIldyIpKQ0KdGFibGUoWCR3ZGF5KQ0KYGBgDQoNCg0KYGBge3J9DQpteCA9IHh0YWJzKH4gY3VzdCArIHdkYXksIFgpDQpkaW0obXgpDQpgYGANCg0KYGBge3J9DQpteFsxOjUsXQ0KYGBgDQoNCmBgYHtyfQ0KbXggPSBteCAvIHJvd1N1bXMobXgpDQpteFsxOjUsXQ0KYGBgDQoNCmBgYHtyfQ0KQSA9IGRhdGEuZnJhbWUoYXMuaW50ZWdlcihyb3duYW1lcyhteCkpLCBhcy5tYXRyaXguZGF0YS5mcmFtZShteCkpICU+JSANCiAgc2V0TmFtZXMoYygiY3VzdCIsIlcxIiwiVzIiLCJXMyIsIlc0IiwiVzUiLCJXNiIsIlc3IikpICU+JSANCiAgcmlnaHRfam9pbihBLCBieT0nY3VzdCcpDQpoZWFkKEEpDQpgYGANCjxicj48aHI+DQoNCiMjIyBDbGFzc2lmaWNhdGlvbiAoQnV5KSBNb2RlbA0KYGBge3J9DQpUUiA9IHN1YnNldChBLCBzcGwpDQpUUyA9IHN1YnNldChBLCAhc3BsKQ0KYGBgDQoNCmBgYHtyfQ0KbGlicmFyeShycGFydCkNCmxpYnJhcnkocnBhcnQucGxvdCkNCnJwYXJ0MSA9IHJwYXJ0KGJ1eSB+IC4sIFRSWyxjKDI6MTYsMTgpXSwgbWV0aG9kPSJjbGFzcyIpDQpwcmVkID0gIHByZWRpY3QocnBhcnQxLCBUUylbLDJdICAjIHByZWRpY3QgcHJvYg0KY20gPSB0YWJsZShhY3R1YWwgPSBUUyRidXksIHByZWRpY3QgPSBwcmVkID4gMC41KTsgY20NCmFjYy50cyA9IGNtICU+JSB7c3VtKGRpYWcoLikpL3N1bSguKX07IGFjYy50cyAgICMgMC43MDY2MiAgICAgICAgICANCmNvbEFVQyhwcmVkLCBUUyRidXkpICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgMC42OTg0DQpgYGANCisg5Yip55SoQ0FSVCDigJMgQ2xhc3NpZmljYXRpb24gJiBSZWdyZXNzaW9uIFRyZWXlu7rnq4vpoJDmuKzmqKHlnosNCisg5L2/55SoQ0FSVCDpoJDmuKwg6aGe5YilDQorIOaqouimlua4rOippuizh+aWmeeahOa6lueiuuW6pg0KKyDmqqLoppZBVUMNCg0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9Ny4yfQ0KcnBhcnQucGxvdChycGFydDEsY2V4PTAuNikNCmBgYA0KDQpgYGB7cn0NCnJwYXJ0MiA9IHJwYXJ0KGJ1eSB+IC4sIFRSWyxjKDI6MTYsMTgpXSwgbWV0aG9kPSJjbGFzcyIsY3A9MC4wMDEpDQpwcmVkID0gIHByZWRpY3QocnBhcnQyLCBUUylbLDJdICAjIHByZWRpY3QgcHJvYg0KY20gPSB0YWJsZShhY3R1YWwgPSBUUyRidXksIHByZWRpY3QgPSBwcmVkID4gMC41KTsgY20NCmFjYy50cyA9IGNtICU+JSB7c3VtKGRpYWcoLikpL3N1bSguKX07IGFjYy50cyAgICMgMC43MDQxNyAgICAgICAgICANCmNvbEFVQyhwcmVkLCBUUyRidXkpICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgMC43MTY5ICAgICAgICAgDQpgYGANCg0KYGBge3J9DQpycGFydC5wbG90KHJwYXJ0MixjZXg9MC42KQ0KYGBgDQoNCiMjIyBSZWdyZXNzaW9uIChBbW91bnQpIE1vZGVsDQpgYGB7cn0NCkEyID0gc3Vic2V0KEEsIGJ1eSkgJT4lIG11dGF0ZV9hdChjKCJtIiwicmV2IiwiYW1vdW50IiksIGxvZzEwKQ0KVFIyID0gc3Vic2V0KEEyLCBzcGwyKQ0KVFMyID0gc3Vic2V0KEEyLCAhc3BsMikNCmBgYA0KKyDnlLHmlrzmmK/poJDmuKzmlbjph4/vvIzlsIfos4fmlpnlj5ZMb2cxMOWPr+S7pemBv+WFjeWWruS9jeS4jeWQjOaJgOmAoOaIkOeahOaVuOWtl+W3rueVsA0KYGBge3J9DQpycGFydDMgPSBycGFydChhbW91bnQgfiAuLCBUUjJbLGMoMjoxNyldLCBjcD0wLjAwMikNClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KHJwYXJ0MywgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KMSAtIChTU0UvU1NUKQ0KYGBgDQoNCisg5Y2z57i96K6K55WwKFNTVCk95bey6Kej6YeL6K6K55WwKFNTUikrIOacquino+mHi+iuiueVsChTU0UpDQoNCg0KDQoNCg0KPGJyPjxicj48aHI+PGJyPjxicj48YnI+PGJyPg0KPHN0eWxlPg0KDQouY2FwdGlvbiB7DQogIGNvbG9yOiAjNzc3Ow0KICBtYXJnaW4tdG9wOiAxMHB4Ow0KfQ0KcCBjb2RlIHsNCiAgd2hpdGUtc3BhY2U6IGluaGVyaXQ7DQp9DQpwcmUgew0KICB3b3JkLWJyZWFrOiBub3JtYWw7DQogIHdvcmQtd3JhcDogbm9ybWFsOw0KICBsaW5lLWhlaWdodDogMTsNCn0NCnByZSBjb2RlIHsNCiAgd2hpdGUtc3BhY2U6IGluaGVyaXQ7DQp9DQpwLGxpIHsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCi5yew0KICBsaW5lLWhlaWdodDogMS4yOw0KfQ0KDQoucWl6IHsNCiAgbGluZS1oZWlnaHQ6IDEuNzU7DQogIGJhY2tncm91bmQ6ICNmMGYwZjA7DQogIGJvcmRlci1sZWZ0OiAxMnB4IHNvbGlkICNjY2ZmY2M7DQogIHBhZGRpbmc6IDRweDsNCiAgcGFkZGluZy1sZWZ0OiAxMHB4Ow0KICBjb2xvcjogIzAwOTkwMDsNCn0NCg0KdGl0bGV7DQogIGNvbG9yOiAjY2MwMDAwOw0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KYm9keXsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmgxLGgyLGgzLGg0LGg1ew0KICBjb2xvcjogIzAwNjZmZjsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCg0KaDN7DQogIGNvbG9yOiAjMDA4ODAwOw0KICBiYWNrZ3JvdW5kOiAjZTZmZmU2Ow0KICBsaW5lLWhlaWdodDogMjsNCiAgZm9udC13ZWlnaHQ6IGJvbGQ7DQp9DQoNCmg1ew0KICBjb2xvcjogIzAwNjAwMDsNCiAgYmFja2dyb3VuZDogI2Y4ZjhmODsNCiAgbGluZS1oZWlnaHQ6IDEuNTsNCiAgZm9udC13ZWlnaHQ6IGJvbGQ7DQp9DQoNCjwvc3R5bGU+DQo=