Preparing Data
Libraries
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
Loading and Spliting
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A$buy = factor(ifelse(A$buy, "yes", "no")) # comply to the rule of caret
TR = A[spl, c(2:9,11)]
TS = A[!spl, c(2:9,11)]
Turn on Parallel Processing
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4
CV for Classification Model
CV Control for Classification
ctrl = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final", classProbs=TRUE,
summaryFunction=twoClassSummary)
CV: rpart(), Classification Tree
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
buy ~ ., data=TR, method="rpart",
trControl=ctrl, metric="ROC",
tuneGrid = expand.grid(cp = seq(0.0002,0.001,0.0001) ) )
Sys.time() - t0
Time difference of 19.511 secs
plot(cv.rpart)

cv.rpart$results
cp ROC Sens Spec ROCSD SensSD SpecSD
1 0.0002 0.70841 0.75985 0.57913 0.010364 0.014478 0.019012
2 0.0003 0.71555 0.78010 0.57423 0.010408 0.029139 0.025506
3 0.0004 0.71786 0.79710 0.56716 0.015507 0.028938 0.026220
4 0.0005 0.71821 0.80497 0.56219 0.017172 0.027989 0.024565
5 0.0006 0.71697 0.81041 0.56058 0.016999 0.021776 0.021361
6 0.0007 0.71382 0.81311 0.56112 0.017613 0.016638 0.016526
7 0.0008 0.70874 0.81628 0.55945 0.013033 0.013177 0.013865
8 0.0009 0.70777 0.81777 0.55923 0.010477 0.014680 0.014994
9 0.0010 0.70758 0.81819 0.55918 0.010578 0.014432 0.015508
Classification Tree, Final Model
rpart1 = rpart(buy ~ ., TR, method="class", cp=0.0005)
predict(rpart1, TS, type="prob")[,2] %>%
colAUC(TS$buy)
[,1]
no vs. yes 0.74018
CV: glm(), General Linear Model
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
buy ~ ., data=TR, method="glm",
trControl=ctrl, metric="ROC")
Sys.time() - t0
Time difference of 4.4909 secs
cv.glm$results
parameter ROC Sens Spec ROCSD SensSD SpecSD
1 none 0.74879 0.80576 0.56856 0.010178 0.013218 0.014611
glm(), Final Model
glm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)
[,1]
no vs. yes 0.7556
CV for Regression Model
Spliting Data
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
TR2 = A2[ spl2, c(2:10)]
TS2 = A2[!spl2, c(2:10)]
CV Control for Regression
ctrl2 = trainControl(
method="repeatedcv", number=10, # 10-fold, Repeated CV
savePredictions = "final")
CV: rpart() Regression Tree
ctrl$repeats = 2
set.seed(2)
cv.rpart2 = train(
amount ~ ., data=TR2, method="rpart",
trControl=ctrl2, metric="Rsquared",
tuneGrid = expand.grid(cp = seq(0.0008,0.0024,0.0001) ) )
plot(cv.rpart2)

cv.rpart2$results
cp RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 0.0008 0.42675 0.27564 0.32766 0.0090329 0.023591 0.0061173
2 0.0009 0.42477 0.28117 0.32635 0.0103407 0.025773 0.0066172
3 0.0010 0.42359 0.28403 0.32583 0.0102501 0.025828 0.0064855
4 0.0011 0.42309 0.28549 0.32526 0.0100893 0.025807 0.0065420
5 0.0012 0.42297 0.28581 0.32530 0.0096765 0.023913 0.0058852
6 0.0013 0.42313 0.28532 0.32534 0.0097162 0.023931 0.0058034
7 0.0014 0.42320 0.28514 0.32556 0.0100614 0.026334 0.0061210
8 0.0015 0.42279 0.28628 0.32541 0.0104651 0.026525 0.0062114
9 0.0016 0.42252 0.28711 0.32525 0.0105918 0.025878 0.0061396
10 0.0017 0.42256 0.28688 0.32533 0.0107874 0.026540 0.0063368
11 0.0018 0.42275 0.28627 0.32552 0.0111151 0.028421 0.0066105
12 0.0019 0.42335 0.28421 0.32591 0.0110973 0.027216 0.0066057
13 0.0020 0.42369 0.28307 0.32599 0.0110733 0.027393 0.0067058
14 0.0021 0.42339 0.28411 0.32581 0.0110896 0.028081 0.0067905
15 0.0022 0.42349 0.28378 0.32590 0.0113953 0.029014 0.0069534
16 0.0023 0.42335 0.28421 0.32575 0.0111905 0.028332 0.0066832
17 0.0024 0.42342 0.28393 0.32573 0.0108693 0.027563 0.0064283
rpart(), Regression Tree Final Model
rpart2 = rpart(amount ~ ., data=TR2, cp=0.0016)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) - TS2$amount)^2)
(r2.ts.rpart2 = 1 - (SSE/SST))
[1] 0.24672
CV: lm(), Linear Model
ctrl$repeats = 2
set.seed(2)
cv.lm2 = train(
amount ~ ., data=TR2, method="lm",
trControl=ctrl2, metric="Rsquared",
tuneGrid = expand.grid( intercept = seq(0,5,0.5) )
)
plot(cv.lm2)

cv.lm2$results
intercept RMSE Rsquared MAE RMSESD RsquaredSD MAESD
1 0.0 0.43331 0.27303 0.33095 0.01424 0.033815 0.0103186
2 0.5 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
3 1.0 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
4 1.5 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
5 2.0 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
6 2.5 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
7 3.0 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
8 3.5 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
9 4.0 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
10 4.5 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
11 5.0 0.42046 0.29444 0.32349 0.01090 0.028294 0.0066456
lm() Final Model
lm2 = lm(amount ~ ., TR2)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm2, TS2) - TS2$amount)^2)
(r2.ts.lm2 = 1 - (SSE/SST))
[1] 0.26226
Stop Parallel Processing
stopCluster(clust)
LS0tDQp0aXRsZTogIkNyb3NzIFZhbGlkLiAmIE1vZGVsIFNlbGVjdGlvbiwgVGEtRmVuZyINCmF1dGhvcjogIuWNk+mbjeeEtiwg5Lit5bGx5aSn5a24IOeuoeeQhuWtuOihk+eglOeptuS4reW/gyINCmRhdGU6ICJgciBTeXMudGltZSgpYCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCjxicj4NCg0KIyMjIOS6pOWPiempl+itieiIh+WPg+aVuOiqv+agoea1geeoiw0KDQo8Y2VudGVyPg0KDQohW0ZpZy0xOiBTdXBlcnZpc2VkIExlYXJuaW5nIFByb2Nlc3NdKGZpZy9zdXBlcnZpc2VkLmpwZykNCg0KIVtGaWctMjogQ1YsIE1vZGVsIFNlbC4gJiBQYXJhbWV0ZXIgVHVuaW5nXShmaWcvY3YuanBnKQ0KDQo8L2NlbnRlcj4NCg0KPGJyPjxocj4NCg0KIyMjIFByZXBhcmluZyBEYXRhDQoNCiMjIyMjIExpYnJhcmllcw0KYGBge3IgZWNobz1ULCBtZXNzYWdlPUYsIGNhY2hlPUYsIHdhcm5pbmc9Rn0NClN5cy5zZXRsb2NhbGUoIkxDX0FMTCIsIkMiKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoY2FUb29scykNCmxpYnJhcnkoTWF0cml4KQ0KbGlicmFyeShycGFydCkNCmxpYnJhcnkocnBhcnQucGxvdCkNCmxpYnJhcnkoY2FyZXQpDQpsaWJyYXJ5KGRvUGFyYWxsZWwpDQpgYGANCg0KIyMjIyMgTG9hZGluZyBhbmQgU3BsaXRpbmcNCmBgYHtyfQ0Kcm0obGlzdD1scyhhbGw9VFJVRSkpDQpsb2FkKCJkYXRhL3RmMi5yZGF0YSIpDQpBJGJ1eSA9IGZhY3RvcihpZmVsc2UoQSRidXksICJ5ZXMiLCAibm8iKSkgICMgY29tcGx5IHRvIHRoZSBydWxlIG9mIGNhcmV0DQpUUiA9IEFbc3BsLCBjKDI6OSwxMSldDQpUUyA9IEFbIXNwbCwgYygyOjksMTEpXQ0KYGBgDQoNCiMjIyMjIFR1cm4gb24gUGFyYWxsZWwgUHJvY2Vzc2luZw0KYGBge3J9DQpjbHVzdCA9IG1ha2VDbHVzdGVyKGRldGVjdENvcmVzKCkpDQpyZWdpc3RlckRvUGFyYWxsZWwoY2x1c3QpOyBnZXREb1BhcldvcmtlcnMoKQ0KYGBgDQoNCiMjIyBDViBmb3IgQ2xhc3NpZmljYXRpb24gTW9kZWwgDQoNCiMjIyMjIENWIENvbnRyb2wgZm9yIENsYXNzaWZpY2F0aW9uDQpgYGB7cn0NCmN0cmwgPSB0cmFpbkNvbnRyb2woDQogIG1ldGhvZD0icmVwZWF0ZWRjdiIsIG51bWJlcj0xMCwgICAgIyAxMC1mb2xkLCBSZXBlYXRlZCBDVg0KICBzYXZlUHJlZGljdGlvbnMgPSAiZmluYWwiLCBjbGFzc1Byb2JzPVRSVUUsDQogIHN1bW1hcnlGdW5jdGlvbj10d29DbGFzc1N1bW1hcnkpDQpgYGANCg0KIyMjIyMgQ1Y6IGBycGFydCgpYCwgQ2xhc3NpZmljYXRpb24gVHJlZSANCmBgYHtyfQ0KY3RybCRyZXBlYXRzID0gMg0KdDAgPSBTeXMudGltZSgpOyBzZXQuc2VlZCgyKQ0KY3YucnBhcnQgPSB0cmFpbigNCiAgYnV5IH4gLiwgZGF0YT1UUiwgbWV0aG9kPSJycGFydCIsIA0KICB0ckNvbnRyb2w9Y3RybCwgbWV0cmljPSJST0MiLA0KICB0dW5lR3JpZCA9IGV4cGFuZC5ncmlkKGNwID0gc2VxKDAuMDAwMiwwLjAwMSwwLjAwMDEpICkgKQ0KU3lzLnRpbWUoKSAtIHQwDQpgYGANCg0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9N30NCnBsb3QoY3YucnBhcnQpDQpgYGANCg0KYGBge3J9DQpjdi5ycGFydCRyZXN1bHRzIA0KYGBgDQoNCiMjIyMjIENsYXNzaWZpY2F0aW9uIFRyZWUsIEZpbmFsIE1vZGVsDQpgYGB7cn0NCnJwYXJ0MSA9IHJwYXJ0KGJ1eSB+IC4sIFRSLCBtZXRob2Q9ImNsYXNzIiwgY3A9MC4wMDA1KQ0KcHJlZGljdChycGFydDEsIFRTLCB0eXBlPSJwcm9iIilbLDJdICU+JSANCiAgY29sQVVDKFRTJGJ1eSkNCmBgYA0KPGJyPjxocj4NCg0KIyMjIyMgQ1Y6IGBnbG0oKWAsIEdlbmVyYWwgTGluZWFyIE1vZGVsDQpgYGB7cn0NCmN0cmwkcmVwZWF0cyA9IDINCnQwID0gU3lzLnRpbWUoKTsgc2V0LnNlZWQoMikNCmN2LmdsbSA9IHRyYWluKA0KICBidXkgfiAuLCBkYXRhPVRSLCBtZXRob2Q9ImdsbSIsIA0KICB0ckNvbnRyb2w9Y3RybCwgbWV0cmljPSJST0MiKQ0KU3lzLnRpbWUoKSAtIHQwDQpgYGANCg0KYGBge3J9DQpjdi5nbG0kcmVzdWx0cw0KYGBgDQoNCiMjIyMjIGBnbG0oKWAsIEZpbmFsIE1vZGVsDQpgYGB7cn0NCmdsbTEgPSBiPWdsbShidXkgfiAuLCBUUiwgZmFtaWx5PWJpbm9taWFsKQ0KcHJlZGljdChnbG0xLCBUUywgdHlwZT0icmVzcG9uc2UiKSAlPiUgY29sQVVDKFRTJGJ1eSkNCmBgYA0KPGJyPjxocj4NCg0KDQojIyMgQ1YgZm9yIFJlZ3Jlc3Npb24gTW9kZWwNCg0KIyMjIyMgU3BsaXRpbmcgRGF0YQ0KYGBge3J9DQpBMiA9IHN1YnNldChBLCBBJGJ1eSA9PSAieWVzIikgJT4lIG11dGF0ZV9hdChjKCJtIiwicmV2IiwiYW1vdW50IiksIGxvZzEwKQ0KVFIyID0gQTJbIHNwbDIsIGMoMjoxMCldDQpUUzIgPSBBMlshc3BsMiwgYygyOjEwKV0NCmBgYA0KDQojIyMjIyBDViBDb250cm9sIGZvciBSZWdyZXNzaW9uDQpgYGB7cn0NCmN0cmwyID0gdHJhaW5Db250cm9sKA0KICBtZXRob2Q9InJlcGVhdGVkY3YiLCBudW1iZXI9MTAsICAgICMgMTAtZm9sZCwgUmVwZWF0ZWQgQ1YNCiAgc2F2ZVByZWRpY3Rpb25zID0gImZpbmFsIikNCmBgYA0KDQojIyMjIyBDVjogYHJwYXJ0KClgIFJlZ3Jlc3Npb24gVHJlZQ0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9N30NCmN0cmwkcmVwZWF0cyA9IDINCnNldC5zZWVkKDIpDQpjdi5ycGFydDIgPSB0cmFpbigNCiAgYW1vdW50IH4gLiwgZGF0YT1UUjIsIG1ldGhvZD0icnBhcnQiLCANCiAgdHJDb250cm9sPWN0cmwyLCBtZXRyaWM9IlJzcXVhcmVkIiwNCiAgdHVuZUdyaWQgPSBleHBhbmQuZ3JpZChjcCA9IHNlcSgwLjAwMDgsMC4wMDI0LDAuMDAwMSkgKSApDQpwbG90KGN2LnJwYXJ0MikNCmBgYA0KDQpgYGB7cn0NCmN2LnJwYXJ0MiRyZXN1bHRzDQpgYGANCg0KIyMjIyMgYHJwYXJ0KClgLCBSZWdyZXNzaW9uIFRyZWUgRmluYWwgTW9kZWwNCmBgYHtyfQ0KcnBhcnQyID0gcnBhcnQoYW1vdW50IH4gLiwgZGF0YT1UUjIsIGNwPTAuMDAxNikNClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KHJwYXJ0MiwgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KKHIyLnRzLnJwYXJ0MiA9IDEgLSAoU1NFL1NTVCkpDQpgYGANCg0KIyMjIyMgQ1Y6IGBsbSgpYCwgTGluZWFyIE1vZGVsDQpgYGB7ciBmaWcuaGVpZ2h0PTMsIGZpZy53aWR0aD03fQ0KY3RybCRyZXBlYXRzID0gMg0Kc2V0LnNlZWQoMikNCmN2LmxtMiA9IHRyYWluKA0KICBhbW91bnQgfiAuLCBkYXRhPVRSMiwgbWV0aG9kPSJsbSIsIA0KICB0ckNvbnRyb2w9Y3RybDIsIG1ldHJpYz0iUnNxdWFyZWQiLA0KICAgIHR1bmVHcmlkID0gZXhwYW5kLmdyaWQoIGludGVyY2VwdCA9IHNlcSgwLDUsMC41KSApIA0KICApDQpwbG90KGN2LmxtMikNCmBgYA0KDQpgYGB7cn0NCmN2LmxtMiRyZXN1bHRzDQpgYGANCg0KIyMjIyMgYGxtKClgIEZpbmFsIE1vZGVsDQpgYGB7cn0NCmxtMiA9IGxtKGFtb3VudCB+IC4sIFRSMikNClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KGxtMiwgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KKHIyLnRzLmxtMiA9IDEgLSAoU1NFL1NTVCkpDQpgYGANCg0KIyMjIyMgU3RvcCBQYXJhbGxlbCBQcm9jZXNzaW5nDQpgYGB7cn0NCnN0b3BDbHVzdGVyKGNsdXN0KQ0KYGBgDQo8YnI+PGJyPjxocj48YnI+PGJyPjxicj48YnI+DQo8c3R5bGU+DQoNCi5jYXB0aW9uIHsNCiAgY29sb3I6ICM3Nzc7DQogIG1hcmdpbi10b3A6IDEwcHg7DQp9DQpwIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnByZSB7DQogIHdvcmQtYnJlYWs6IG5vcm1hbDsNCiAgd29yZC13cmFwOiBub3JtYWw7DQogIGxpbmUtaGVpZ2h0OiAxOw0KfQ0KcHJlIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnAsbGkgew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KLnJ7DQogIGxpbmUtaGVpZ2h0OiAxLjI7DQp9DQoNCi5xaXogew0KICBsaW5lLWhlaWdodDogMS43NTsNCiAgYmFja2dyb3VuZDogI2YwZjBmMDsNCiAgYm9yZGVyLWxlZnQ6IDEycHggc29saWQgI2NjZmZjYzsNCiAgcGFkZGluZzogNHB4Ow0KICBwYWRkaW5nLWxlZnQ6IDEwcHg7DQogIGNvbG9yOiAjMDA5OTAwOw0KfQ0KDQp0aXRsZXsNCiAgY29sb3I6ICNjYzAwMDA7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpib2R5ew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KaDEsaDIsaDMsaDQsaDV7DQogIGNvbG9yOiAjMDA2NmZmOw0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KDQpoM3sNCiAgY29sb3I6ICMwMDg4MDA7DQogIGJhY2tncm91bmQ6ICNlNmZmZTY7DQogIGxpbmUtaGVpZ2h0OiAyOw0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KaDV7DQogIGNvbG9yOiAjMDA2MDAwOw0KICBiYWNrZ3JvdW5kOiAjZjhmOGY4Ow0KICBsaW5lLWhlaWdodDogMS41Ow0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KPC9zdHlsZT4NCg==