Loading & Preparing Data
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(slam)
library(rpart)
library(rpart.plot)
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A2 = subset(A, buy)
c(sum(spl), sum(spl2))
[1] 20008 9269
Weekday Percentage: W1 ~ W7
X = X %>% mutate(wday = format(date, "%w"))
table(X$wday)
0 1 2 3 4 5 6
18011 12615 11288 9898 11245 10651 14587
mx = xtabs(~ cust + wday, X)
dim(mx)
[1] 28584 7
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 1 1 0 0 0 0 0
1113 2 1 0 0 0 0 1
1359 0 1 0 0 0 0 0
1823 0 1 0 1 1 0 0
2189 0 0 0 1 0 0 1
mx = mx / rowSums(mx)
mx[1:5,]
wday
cust 0 1 2 3 4 5 6
1069 0.50000 0.50000 0.00000 0.00000 0.00000 0.00000 0.00000
1113 0.50000 0.25000 0.00000 0.00000 0.00000 0.00000 0.25000
1359 0.00000 1.00000 0.00000 0.00000 0.00000 0.00000 0.00000
1823 0.00000 0.33333 0.00000 0.33333 0.33333 0.00000 0.00000
2189 0.00000 0.00000 0.00000 0.50000 0.00000 0.00000 0.50000
A = data.frame(as.integer(rownames(mx)), as.matrix.data.frame(mx)) %>%
setNames(c("cust","W1","W2","W3","W4","W5","W6","W7")) %>%
right_join(A, by='cust')
head(A)
cust W1 W2 W3 W4 W5 W6 W7 r s f m rev raw age area amount buy
1 1069 0.5 0.50000 0.0 0.00000 0.00000 0.0 0.00 11 80 2 579.0 1158 129 K E 786 TRUE
2 1113 0.5 0.25000 0.0 0.00000 0.00000 0.0 0.25 26 81 4 557.5 2230 241 K F NA FALSE
3 1359 0.0 1.00000 0.0 0.00000 0.00000 0.0 0.00 59 59 1 364.0 364 104 K G NA FALSE
4 1823 0.0 0.33333 0.0 0.33333 0.33333 0.0 0.00 8 91 3 869.0 2607 498 K D NA FALSE
5 2189 0.0 0.00000 0.0 0.50000 0.00000 0.0 0.50 29 61 2 7028.0 14056 3299 K B NA FALSE
6 3667 0.0 0.00000 0.5 0.00000 0.00000 0.5 0.00 37 55 2 2379.5 4759 351 K G 1570 TRUE
Classification (Buy) Model
TR = subset(A, spl)
TS = subset(A, !spl)
library(rpart)
library(rpart.plot)
rpart1 = rpart(buy ~ ., TR[,c(2:16,18)], method="class")
pred = predict(rpart1, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3730 873
TRUE 1643 2330
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.70662
[1] 0.70662
colAUC(pred, TS$buy) # 0.6984
[,1]
FALSE vs. TRUE 0.6984
rpart.plot(rpart1,cex=0.6)

rpart2 = rpart(buy ~ ., TR[,c(2:16,18)], method="class",cp=0.001)
pred = predict(rpart2, TS)[,2] # predict prob
cm = table(actual = TS$buy, predict = pred > 0.5); cm
predict
actual FALSE TRUE
FALSE 3878 725
TRUE 1812 2161
acc.ts = cm %>% {sum(diag(.))/sum(.)}; acc.ts # 0.70417
[1] 0.70417
colAUC(pred, TS$buy) # 0.7169
[,1]
FALSE vs. TRUE 0.7169
rpart.plot(rpart2,cex=0.6)

Regression (Amount) Model
A2 = subset(A, buy) %>% mutate_at(c("m","rev","amount"), log10)
TR2 = subset(A2, spl2)
TS2 = subset(A2, !spl2)
rpart3 = rpart(amount ~ ., TR2[,c(2:17)], cp=0.002)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart3, TS2) - TS2$amount)^2)
1 - (SSE/SST)
[1] 0.24606
LS0tDQp0aXRsZTogIkZlYXR1cmUgRW5naW5lZXJpbmcsIFRhLUZlbmciDQphdXRob3I6ICLljZPpm43nhLYsIOS4reWxseWkp+WtuCDnrqHnkIblrbjooZPnoJTnqbbkuK3lv4MiDQpkYXRlOiAiYHIgU3lzLnRpbWUoKWAiDQpvdXRwdXQ6IGh0bWxfbm90ZWJvb2sNCi0tLQ0KDQo8YnI+DQoNCiMjIyDorormlbjoo73kvZzoiIfos4fmlpnliIblibLmtYHnqIsNCg0KPGNlbnRlcj4NCg0KIVtGaWctMTogRmVhdHVyZSBFbmdpbmVlcmluZ10oZmlnL2ZlYXR1cmluZy5qcGcpDQoNCiFbRmlnLTI6IEZlYXR1cmUgRW5nci4gJiBEYXRhIFNwbGl0aW5nIFByb2Nlc3NdKGZpZy9mZWF0dXJlX2VuZ3IuanBnKQ0KDQoNCjwvY2VudGVyPg0KDQo8YnI+PGhyPg0KDQojIyMgTG9hZGluZyAmIFByZXBhcmluZyBEYXRhDQpgYGB7ciBlY2hvPVQsIG1lc3NhZ2U9RiwgY2FjaGU9Riwgd2FybmluZz1GfQ0KU3lzLnNldGxvY2FsZSgiTENfQUxMIiwiQyIpDQpsaWJyYXJ5KGRwbHlyKQ0KbGlicmFyeShnZ3Bsb3QyKQ0KbGlicmFyeShjYVRvb2xzKQ0KbGlicmFyeShNYXRyaXgpDQpsaWJyYXJ5KHNsYW0pDQpsaWJyYXJ5KHJwYXJ0KQ0KbGlicmFyeShycGFydC5wbG90KQ0KYGBgDQoNCmBgYHtyfQ0Kcm0obGlzdD1scyhhbGw9VFJVRSkpDQpsb2FkKCJkYXRhL3RmMi5yZGF0YSIpDQpBMiA9IHN1YnNldChBLCBidXkpDQpjKHN1bShzcGwpLCBzdW0oc3BsMikpDQpgYGANCjxicj48aHI+DQoNCiMjIyBXZWVrZGF5IFBlcmNlbnRhZ2U6IFcxIH4gVzcNCmBgYHtyfQ0KWCA9IFggJT4lIG11dGF0ZSh3ZGF5ID0gZm9ybWF0KGRhdGUsICIldyIpKQ0KdGFibGUoWCR3ZGF5KQ0KYGBgDQoNCmBgYHtyfQ0KbXggPSB4dGFicyh+IGN1c3QgKyB3ZGF5LCBYKQ0KZGltKG14KQ0KYGBgDQoNCmBgYHtyfQ0KbXhbMTo1LF0NCmBgYA0KDQpgYGB7cn0NCm14ID0gbXggLyByb3dTdW1zKG14KQ0KbXhbMTo1LF0NCmBgYA0KDQpgYGB7cn0NCkEgPSBkYXRhLmZyYW1lKGFzLmludGVnZXIocm93bmFtZXMobXgpKSwgYXMubWF0cml4LmRhdGEuZnJhbWUobXgpKSAlPiUgDQogIHNldE5hbWVzKGMoImN1c3QiLCJXMSIsIlcyIiwiVzMiLCJXNCIsIlc1IiwiVzYiLCJXNyIpKSAlPiUgDQogIHJpZ2h0X2pvaW4oQSwgYnk9J2N1c3QnKQ0KaGVhZChBKQ0KYGBgDQo8YnI+PGhyPg0KDQojIyMgQ2xhc3NpZmljYXRpb24gKEJ1eSkgTW9kZWwNCmBgYHtyfQ0KVFIgPSBzdWJzZXQoQSwgc3BsKQ0KVFMgPSBzdWJzZXQoQSwgIXNwbCkNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkocnBhcnQpDQpsaWJyYXJ5KHJwYXJ0LnBsb3QpDQpycGFydDEgPSBycGFydChidXkgfiAuLCBUUlssYygyOjE2LDE4KV0sIG1ldGhvZD0iY2xhc3MiKQ0KcHJlZCA9ICBwcmVkaWN0KHJwYXJ0MSwgVFMpWywyXSAgIyBwcmVkaWN0IHByb2INCmNtID0gdGFibGUoYWN0dWFsID0gVFMkYnV5LCBwcmVkaWN0ID0gcHJlZCA+IDAuNSk7IGNtDQphY2MudHMgPSBjbSAlPiUge3N1bShkaWFnKC4pKS9zdW0oLil9OyBhY2MudHMgICAjIDAuNzA2NjIgICAgICAgICAgDQpjb2xBVUMocHJlZCwgVFMkYnV5KSAgICAgICAgICAgICAgICAgICAgICAgICAgICAjIDAuNjk4NA0KYGBgDQoNCg0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9Ny4yfQ0KcnBhcnQucGxvdChycGFydDEsY2V4PTAuNikNCmBgYA0KDQpgYGB7cn0NCnJwYXJ0MiA9IHJwYXJ0KGJ1eSB+IC4sIFRSWyxjKDI6MTYsMTgpXSwgbWV0aG9kPSJjbGFzcyIsY3A9MC4wMDEpDQpwcmVkID0gIHByZWRpY3QocnBhcnQyLCBUUylbLDJdICAjIHByZWRpY3QgcHJvYg0KY20gPSB0YWJsZShhY3R1YWwgPSBUUyRidXksIHByZWRpY3QgPSBwcmVkID4gMC41KTsgY20NCmFjYy50cyA9IGNtICU+JSB7c3VtKGRpYWcoLikpL3N1bSguKX07IGFjYy50cyAgICMgMC43MDQxNyAgICAgICAgICANCmNvbEFVQyhwcmVkLCBUUyRidXkpICAgICAgICAgICAgICAgICAgICAgICAgICAgICMgMC43MTY5ICAgICAgICAgDQpgYGANCg0KYGBge3J9DQpycGFydC5wbG90KHJwYXJ0MixjZXg9MC42KQ0KYGBgDQoNCiMjIyBSZWdyZXNzaW9uIChBbW91bnQpIE1vZGVsDQpgYGB7cn0NCkEyID0gc3Vic2V0KEEsIGJ1eSkgJT4lIG11dGF0ZV9hdChjKCJtIiwicmV2IiwiYW1vdW50IiksIGxvZzEwKQ0KVFIyID0gc3Vic2V0KEEyLCBzcGwyKQ0KVFMyID0gc3Vic2V0KEEyLCAhc3BsMikNCmBgYA0KDQpgYGB7cn0NCnJwYXJ0MyA9IHJwYXJ0KGFtb3VudCB+IC4sIFRSMlssYygyOjE3KV0sIGNwPTAuMDAyKQ0KU1NUID0gc3VtKChUUzIkYW1vdW50IC0gbWVhbihUUjIkYW1vdW50KSleIDIpDQpTU0UgPSBzdW0oKHByZWRpY3QocnBhcnQzLCBUUzIpIC0gIFRTMiRhbW91bnQpXjIpDQoxIC0gKFNTRS9TU1QpDQpgYGANCg0KDQoNCg0KDQoNCjxicj48YnI+PGhyPjxicj48YnI+PGJyPjxicj4NCjxzdHlsZT4NCg0KLmNhcHRpb24gew0KICBjb2xvcjogIzc3NzsNCiAgbWFyZ2luLXRvcDogMTBweDsNCn0NCnAgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcHJlIHsNCiAgd29yZC1icmVhazogbm9ybWFsOw0KICB3b3JkLXdyYXA6IG5vcm1hbDsNCiAgbGluZS1oZWlnaHQ6IDE7DQp9DQpwcmUgY29kZSB7DQogIHdoaXRlLXNwYWNlOiBpbmhlcml0Ow0KfQ0KcCxsaSB7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoucnsNCiAgbGluZS1oZWlnaHQ6IDEuMjsNCn0NCg0KLnFpeiB7DQogIGxpbmUtaGVpZ2h0OiAxLjc1Ow0KICBiYWNrZ3JvdW5kOiAjZjBmMGYwOw0KICBib3JkZXItbGVmdDogMTJweCBzb2xpZCAjY2NmZmNjOw0KICBwYWRkaW5nOiA0cHg7DQogIHBhZGRpbmctbGVmdDogMTBweDsNCiAgY29sb3I6ICMwMDk5MDA7DQp9DQoNCnRpdGxlew0KICBjb2xvcjogI2NjMDAwMDsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmJvZHl7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpoMSxoMixoMyxoNCxoNXsNCiAgY29sb3I6ICMwMDY2ZmY7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQoNCmgzew0KICBjb2xvcjogIzAwODgwMDsNCiAgYmFja2dyb3VuZDogI2U2ZmZlNjsNCiAgbGluZS1oZWlnaHQ6IDI7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQpoNXsNCiAgY29sb3I6ICMwMDYwMDA7DQogIGJhY2tncm91bmQ6ICNmOGY4Zjg7DQogIGxpbmUtaGVpZ2h0OiAxLjU7DQogIGZvbnQtd2VpZ2h0OiBib2xkOw0KfQ0KDQo8L3N0eWxlPg0K