交叉驗證與參數調校流程

Fig-1: Supervised Learning Process

Fig-1: Supervised Learning Process

Fig-2: CV, Model Sel. & Parameter Tuning

Fig-2: CV, Model Sel. & Parameter Tuning



Preparing Data

Libraries
Sys.setlocale("LC_ALL","C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
Loading and Spliting
rm(list=ls(all=TRUE))
load("data/tf2.rdata")
A$buy = factor(ifelse(A$buy, "yes", "no"))  # comply to the rule of caret
TR = A[spl, c(2:9,11)]
TS = A[!spl, c(2:9,11)]
Turn on Parallel Processing
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4

CV for Classification Model

CV Control for Classification
ctrl = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final", classProbs=TRUE,
  summaryFunction=twoClassSummary)
CV: rpart(), Classification Tree
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.rpart = train(
  buy ~ ., data=TR, method="rpart", 
  trControl=ctrl, metric="ROC",
  tuneGrid = expand.grid(cp = seq(0.0002,0.001,0.0001) ) )
Sys.time() - t0
Time difference of 19.511 secs
plot(cv.rpart)

cv.rpart$results 
      cp     ROC    Sens    Spec    ROCSD   SensSD   SpecSD
1 0.0002 0.70841 0.75985 0.57913 0.010364 0.014478 0.019012
2 0.0003 0.71555 0.78010 0.57423 0.010408 0.029139 0.025506
3 0.0004 0.71786 0.79710 0.56716 0.015507 0.028938 0.026220
4 0.0005 0.71821 0.80497 0.56219 0.017172 0.027989 0.024565
5 0.0006 0.71697 0.81041 0.56058 0.016999 0.021776 0.021361
6 0.0007 0.71382 0.81311 0.56112 0.017613 0.016638 0.016526
7 0.0008 0.70874 0.81628 0.55945 0.013033 0.013177 0.013865
8 0.0009 0.70777 0.81777 0.55923 0.010477 0.014680 0.014994
9 0.0010 0.70758 0.81819 0.55918 0.010578 0.014432 0.015508
Classification Tree, Final Model
rpart1 = rpart(buy ~ ., TR, method="class", cp=0.0005)
predict(rpart1, TS, type="prob")[,2] %>% 
  colAUC(TS$buy)
              [,1]
no vs. yes 0.74018


CV: glm(), General Linear Model
ctrl$repeats = 2
t0 = Sys.time(); set.seed(2)
cv.glm = train(
  buy ~ ., data=TR, method="glm", 
  trControl=ctrl, metric="ROC")
Sys.time() - t0
Time difference of 4.4909 secs
cv.glm$results
  parameter     ROC    Sens    Spec    ROCSD   SensSD   SpecSD
1      none 0.74879 0.80576 0.56856 0.010178 0.013218 0.014611
glm(), Final Model
glm1 = b=glm(buy ~ ., TR, family=binomial)
predict(glm1, TS, type="response") %>% colAUC(TS$buy)
             [,1]
no vs. yes 0.7556


CV for Regression Model

Spliting Data
A2 = subset(A, A$buy == "yes") %>% mutate_at(c("m","rev","amount"), log10)
TR2 = A2[ spl2, c(2:10)]
TS2 = A2[!spl2, c(2:10)]
CV Control for Regression
ctrl2 = trainControl(
  method="repeatedcv", number=10,    # 10-fold, Repeated CV
  savePredictions = "final")
CV: rpart() Regression Tree
ctrl$repeats = 2
set.seed(2)
cv.rpart2 = train(
  amount ~ ., data=TR2, method="rpart", 
  trControl=ctrl2, metric="Rsquared",
  tuneGrid = expand.grid(cp = seq(0.0008,0.0024,0.0001) ) )
plot(cv.rpart2)

cv.rpart2$results
       cp    RMSE Rsquared     MAE    RMSESD RsquaredSD     MAESD
1  0.0008 0.42675  0.27564 0.32766 0.0090329   0.023591 0.0061173
2  0.0009 0.42477  0.28117 0.32635 0.0103407   0.025773 0.0066172
3  0.0010 0.42359  0.28403 0.32583 0.0102501   0.025828 0.0064855
4  0.0011 0.42309  0.28549 0.32526 0.0100893   0.025807 0.0065420
5  0.0012 0.42297  0.28581 0.32530 0.0096765   0.023913 0.0058852
6  0.0013 0.42313  0.28532 0.32534 0.0097162   0.023931 0.0058034
7  0.0014 0.42320  0.28514 0.32556 0.0100614   0.026334 0.0061210
8  0.0015 0.42279  0.28628 0.32541 0.0104651   0.026525 0.0062114
9  0.0016 0.42252  0.28711 0.32525 0.0105918   0.025878 0.0061396
10 0.0017 0.42256  0.28688 0.32533 0.0107874   0.026540 0.0063368
11 0.0018 0.42275  0.28627 0.32552 0.0111151   0.028421 0.0066105
12 0.0019 0.42335  0.28421 0.32591 0.0110973   0.027216 0.0066057
13 0.0020 0.42369  0.28307 0.32599 0.0110733   0.027393 0.0067058
14 0.0021 0.42339  0.28411 0.32581 0.0110896   0.028081 0.0067905
15 0.0022 0.42349  0.28378 0.32590 0.0113953   0.029014 0.0069534
16 0.0023 0.42335  0.28421 0.32575 0.0111905   0.028332 0.0066832
17 0.0024 0.42342  0.28393 0.32573 0.0108693   0.027563 0.0064283
rpart(), Regression Tree Final Model
rpart2 = rpart(amount ~ ., data=TR2, cp=0.0016)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(rpart2, TS2) -  TS2$amount)^2)
(r2.ts.rpart2 = 1 - (SSE/SST))
[1] 0.24672
CV: lm(), Linear Model
ctrl$repeats = 2
set.seed(2)
cv.lm2 = train(
  amount ~ ., data=TR2, method="lm", 
  trControl=ctrl2, metric="Rsquared",
    tuneGrid = expand.grid( intercept = seq(0,5,0.5) ) 
  )
plot(cv.lm2)

cv.lm2$results
   intercept    RMSE Rsquared     MAE  RMSESD RsquaredSD     MAESD
1        0.0 0.43331  0.27303 0.33095 0.01424   0.033815 0.0103186
2        0.5 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
3        1.0 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
4        1.5 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
5        2.0 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
6        2.5 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
7        3.0 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
8        3.5 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
9        4.0 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
10       4.5 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
11       5.0 0.42046  0.29444 0.32349 0.01090   0.028294 0.0066456
lm() Final Model
lm2 = lm(amount ~ ., TR2)
SST = sum((TS2$amount - mean(TR2$amount))^ 2)
SSE = sum((predict(lm2, TS2) -  TS2$amount)^2)
(r2.ts.lm2 = 1 - (SSE/SST))
[1] 0.26226
Stop Parallel Processing
stopCluster(clust)







LS0tDQp0aXRsZTogIkNyb3NzIFZhbGlkLiAmIE1vZGVsIFNlbGVjdGlvbiwgVGEtRmVuZyINCmF1dGhvcjogIuWNk+mbjeeEtiwg5Lit5bGx5aSn5a24IOeuoeeQhuWtuOihk+eglOeptuS4reW/gyINCmRhdGU6ICJgciBTeXMudGltZSgpYCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCjxicj4NCg0KIyMjIOS6pOWPiempl+itieiIh+WPg+aVuOiqv+agoea1geeoiw0KDQo8Y2VudGVyPg0KDQohW0ZpZy0xOiBTdXBlcnZpc2VkIExlYXJuaW5nIFByb2Nlc3NdKGZpZy9zdXBlcnZpc2VkLmpwZykNCg0KIVtGaWctMjogQ1YsIE1vZGVsIFNlbC4gJiBQYXJhbWV0ZXIgVHVuaW5nXShmaWcvY3YuanBnKQ0KDQo8L2NlbnRlcj4NCg0KPGJyPjxocj4NCg0KIyMjIFByZXBhcmluZyBEYXRhDQoNCiMjIyMjIExpYnJhcmllcw0KYGBge3IgZWNobz1ULCBtZXNzYWdlPUYsIGNhY2hlPUYsIHdhcm5pbmc9Rn0NClN5cy5zZXRsb2NhbGUoIkxDX0FMTCIsIkMiKQ0KbGlicmFyeShkcGx5cikNCmxpYnJhcnkoZ2dwbG90MikNCmxpYnJhcnkoY2FUb29scykNCmxpYnJhcnkoTWF0cml4KQ0KbGlicmFyeShycGFydCkNCmxpYnJhcnkocnBhcnQucGxvdCkNCmxpYnJhcnkoY2FyZXQpDQpsaWJyYXJ5KGRvUGFyYWxsZWwpDQpgYGANCg0KIyMjIyMgTG9hZGluZyBhbmQgU3BsaXRpbmcNCmBgYHtyfQ0Kcm0obGlzdD1scyhhbGw9VFJVRSkpDQpsb2FkKCJkYXRhL3RmMi5yZGF0YSIpDQpBJGJ1eSA9IGZhY3RvcihpZmVsc2UoQSRidXksICJ5ZXMiLCAibm8iKSkgICMgY29tcGx5IHRvIHRoZSBydWxlIG9mIGNhcmV0DQpUUiA9IEFbc3BsLCBjKDI6OSwxMSldDQpUUyA9IEFbIXNwbCwgYygyOjksMTEpXQ0KYGBgDQoNCiMjIyMjIFR1cm4gb24gUGFyYWxsZWwgUHJvY2Vzc2luZw0KYGBge3J9DQpjbHVzdCA9IG1ha2VDbHVzdGVyKGRldGVjdENvcmVzKCkpDQpyZWdpc3RlckRvUGFyYWxsZWwoY2x1c3QpOyBnZXREb1BhcldvcmtlcnMoKQ0KYGBgDQoNCiMjIyBDViBmb3IgQ2xhc3NpZmljYXRpb24gTW9kZWwgDQoNCiMjIyMjIENWIENvbnRyb2wgZm9yIENsYXNzaWZpY2F0aW9uDQpgYGB7cn0NCmN0cmwgPSB0cmFpbkNvbnRyb2woDQogIG1ldGhvZD0icmVwZWF0ZWRjdiIsIG51bWJlcj0xMCwgICAgIyAxMC1mb2xkLCBSZXBlYXRlZCBDVg0KICBzYXZlUHJlZGljdGlvbnMgPSAiZmluYWwiLCBjbGFzc1Byb2JzPVRSVUUsDQogIHN1bW1hcnlGdW5jdGlvbj10d29DbGFzc1N1bW1hcnkpDQpgYGANCg0KIyMjIyMgQ1Y6IGBycGFydCgpYCwgQ2xhc3NpZmljYXRpb24gVHJlZSANCmBgYHtyfQ0KY3RybCRyZXBlYXRzID0gMg0KdDAgPSBTeXMudGltZSgpOyBzZXQuc2VlZCgyKQ0KY3YucnBhcnQgPSB0cmFpbigNCiAgYnV5IH4gLiwgZGF0YT1UUiwgbWV0aG9kPSJycGFydCIsIA0KICB0ckNvbnRyb2w9Y3RybCwgbWV0cmljPSJST0MiLA0KICB0dW5lR3JpZCA9IGV4cGFuZC5ncmlkKGNwID0gc2VxKDAuMDAwMiwwLjAwMSwwLjAwMDEpICkgKQ0KU3lzLnRpbWUoKSAtIHQwDQpgYGANCg0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9N30NCnBsb3QoY3YucnBhcnQpDQpgYGANCg0KYGBge3J9DQpjdi5ycGFydCRyZXN1bHRzIA0KYGBgDQoNCiMjIyMjIENsYXNzaWZpY2F0aW9uIFRyZWUsIEZpbmFsIE1vZGVsDQpgYGB7cn0NCnJwYXJ0MSA9IHJwYXJ0KGJ1eSB+IC4sIFRSLCBtZXRob2Q9ImNsYXNzIiwgY3A9MC4wMDA1KQ0KcHJlZGljdChycGFydDEsIFRTLCB0eXBlPSJwcm9iIilbLDJdICU+JSANCiAgY29sQVVDKFRTJGJ1eSkNCmBgYA0KPGJyPjxocj4NCg0KIyMjIyMgQ1Y6IGBnbG0oKWAsIEdlbmVyYWwgTGluZWFyIE1vZGVsDQpgYGB7cn0NCmN0cmwkcmVwZWF0cyA9IDINCnQwID0gU3lzLnRpbWUoKTsgc2V0LnNlZWQoMikNCmN2LmdsbSA9IHRyYWluKA0KICBidXkgfiAuLCBkYXRhPVRSLCBtZXRob2Q9ImdsbSIsIA0KICB0ckNvbnRyb2w9Y3RybCwgbWV0cmljPSJST0MiKQ0KU3lzLnRpbWUoKSAtIHQwDQpgYGANCg0KYGBge3J9DQpjdi5nbG0kcmVzdWx0cw0KYGBgDQoNCiMjIyMjIGBnbG0oKWAsIEZpbmFsIE1vZGVsDQpgYGB7cn0NCmdsbTEgPSBiPWdsbShidXkgfiAuLCBUUiwgZmFtaWx5PWJpbm9taWFsKQ0KcHJlZGljdChnbG0xLCBUUywgdHlwZT0icmVzcG9uc2UiKSAlPiUgY29sQVVDKFRTJGJ1eSkNCmBgYA0KPGJyPjxocj4NCg0KDQojIyMgQ1YgZm9yIFJlZ3Jlc3Npb24gTW9kZWwNCg0KIyMjIyMgU3BsaXRpbmcgRGF0YQ0KYGBge3J9DQpBMiA9IHN1YnNldChBLCBBJGJ1eSA9PSAieWVzIikgJT4lIG11dGF0ZV9hdChjKCJtIiwicmV2IiwiYW1vdW50IiksIGxvZzEwKQ0KVFIyID0gQTJbIHNwbDIsIGMoMjoxMCldDQpUUzIgPSBBMlshc3BsMiwgYygyOjEwKV0NCmBgYA0KDQojIyMjIyBDViBDb250cm9sIGZvciBSZWdyZXNzaW9uDQpgYGB7cn0NCmN0cmwyID0gdHJhaW5Db250cm9sKA0KICBtZXRob2Q9InJlcGVhdGVkY3YiLCBudW1iZXI9MTAsICAgICMgMTAtZm9sZCwgUmVwZWF0ZWQgQ1YNCiAgc2F2ZVByZWRpY3Rpb25zID0gImZpbmFsIikNCmBgYA0KDQojIyMjIyBDVjogYHJwYXJ0KClgIFJlZ3Jlc3Npb24gVHJlZQ0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9N30NCmN0cmwkcmVwZWF0cyA9IDINCnNldC5zZWVkKDIpDQpjdi5ycGFydDIgPSB0cmFpbigNCiAgYW1vdW50IH4gLiwgZGF0YT1UUjIsIG1ldGhvZD0icnBhcnQiLCANCiAgdHJDb250cm9sPWN0cmwyLCBtZXRyaWM9IlJzcXVhcmVkIiwNCiAgdHVuZUdyaWQgPSBleHBhbmQuZ3JpZChjcCA9IHNlcSgwLjAwMDgsMC4wMDI0LDAuMDAwMSkgKSApDQpwbG90KGN2LnJwYXJ0MikNCmBgYA0KDQpgYGB7cn0NCmN2LnJwYXJ0MiRyZXN1bHRzDQpgYGANCg0KIyMjIyMgYHJwYXJ0KClgLCBSZWdyZXNzaW9uIFRyZWUgRmluYWwgTW9kZWwNCmBgYHtyfQ0KcnBhcnQyID0gcnBhcnQoYW1vdW50IH4gLiwgZGF0YT1UUjIsIGNwPTAuMDAxNikNClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KHJwYXJ0MiwgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KKHIyLnRzLnJwYXJ0MiA9IDEgLSAoU1NFL1NTVCkpDQpgYGANCg0KIyMjIyMgQ1Y6IGBsbSgpYCwgTGluZWFyIE1vZGVsDQpgYGB7ciBmaWcuaGVpZ2h0PTMsIGZpZy53aWR0aD03fQ0KY3RybCRyZXBlYXRzID0gMg0Kc2V0LnNlZWQoMikNCmN2LmxtMiA9IHRyYWluKA0KICBhbW91bnQgfiAuLCBkYXRhPVRSMiwgbWV0aG9kPSJsbSIsIA0KICB0ckNvbnRyb2w9Y3RybDIsIG1ldHJpYz0iUnNxdWFyZWQiLA0KICAgIHR1bmVHcmlkID0gZXhwYW5kLmdyaWQoIGludGVyY2VwdCA9IHNlcSgwLDUsMC41KSApIA0KICApDQpwbG90KGN2LmxtMikNCmBgYA0KDQpgYGB7cn0NCmN2LmxtMiRyZXN1bHRzDQpgYGANCg0KIyMjIyMgYGxtKClgIEZpbmFsIE1vZGVsDQpgYGB7cn0NCmxtMiA9IGxtKGFtb3VudCB+IC4sIFRSMikNClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KGxtMiwgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KKHIyLnRzLmxtMiA9IDEgLSAoU1NFL1NTVCkpDQpgYGANCg0KIyMjIyMgU3RvcCBQYXJhbGxlbCBQcm9jZXNzaW5nDQpgYGB7cn0NCnN0b3BDbHVzdGVyKGNsdXN0KQ0KYGBgDQo8YnI+PGJyPjxocj48YnI+PGJyPjxicj48YnI+DQo8c3R5bGU+DQoNCi5jYXB0aW9uIHsNCiAgY29sb3I6ICM3Nzc7DQogIG1hcmdpbi10b3A6IDEwcHg7DQp9DQpwIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnByZSB7DQogIHdvcmQtYnJlYWs6IG5vcm1hbDsNCiAgd29yZC13cmFwOiBub3JtYWw7DQogIGxpbmUtaGVpZ2h0OiAxOw0KfQ0KcHJlIGNvZGUgew0KICB3aGl0ZS1zcGFjZTogaW5oZXJpdDsNCn0NCnAsbGkgew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KLnJ7DQogIGxpbmUtaGVpZ2h0OiAxLjI7DQp9DQoNCi5xaXogew0KICBsaW5lLWhlaWdodDogMS43NTsNCiAgYmFja2dyb3VuZDogI2YwZjBmMDsNCiAgYm9yZGVyLWxlZnQ6IDEycHggc29saWQgI2NjZmZjYzsNCiAgcGFkZGluZzogNHB4Ow0KICBwYWRkaW5nLWxlZnQ6IDEwcHg7DQogIGNvbG9yOiAjMDA5OTAwOw0KfQ0KDQp0aXRsZXsNCiAgY29sb3I6ICNjYzAwMDA7DQogIGZvbnQtZmFtaWx5OiAiVHJlYnVjaGV0IE1TIiwgIuW+rui7n+ato+m7kemrlCIsICJNaWNyb3NvZnQgSmhlbmdIZWkiOw0KfQ0KDQpib2R5ew0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KaDEsaDIsaDMsaDQsaDV7DQogIGNvbG9yOiAjMDA2NmZmOw0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KDQpoM3sNCiAgY29sb3I6ICMwMDg4MDA7DQogIGJhY2tncm91bmQ6ICNlNmZmZTY7DQogIGxpbmUtaGVpZ2h0OiAyOw0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KaDV7DQogIGNvbG9yOiAjMDA2MDAwOw0KICBiYWNrZ3JvdW5kOiAjZjhmOGY4Ow0KICBsaW5lLWhlaWdodDogMS41Ow0KICBmb250LXdlaWdodDogYm9sZDsNCn0NCg0KPC9zdHlsZT4NCg==