交叉驗證與參數調校流程

Fig-1: Supervised Learning Process

Fig-1: Supervised Learning Process

Fig-2: CV, Model Sel. & Parameter Tuning

Fig-2: CV, Model Sel. & Parameter Tuning



Preparing Data

Libraries
# Switch every locale category to the portable "C" locale.
Sys.setlocale(category = "LC_ALL", locale = "C")
[1] "C"
library(dplyr)
library(ggplot2)
library(caTools)
library(Matrix)
library(rpart)
library(rpart.plot)
library(caret)
library(doParallel)
Loading and Splitting
# Clear the workspace (hidden objects included) before restoring saved data.
rm(list = ls(all.names = TRUE))
# Presumably supplies `A` and the split vector `spl` used below -- confirm.
load(file = "data/tf2.rdata")
# Recode the outcome as a "yes"/"no" factor, the form caret expects for
# a classification target.
A$buy <- factor(ifelse(A$buy, "yes", "no"))
# Columns 2-9 plus 11; rows selected by `spl` go to TR, the rest to TS.
TR <- A[spl, c(2:9, 11)]
TS <- A[!spl, c(2:9, 11)]
Turn on Parallel Processing
# Start one worker per detected core and register the cluster as the
# parallel backend.
clust <- makeCluster(detectCores())
registerDoParallel(clust)
# Report how many workers are registered.
getDoParWorkers()
[1] 4
  • 開啟平行運算,將電腦的每一個CPU都叫出來工作,以免執行交叉驗證的等待時間過長
  • 可以看到自己的電腦有幾顆CPU

決策樹之交叉驗證

CV Control for Classification
# Resampling settings used by the classification models below.
ctrl <- trainControl(
  method = "repeatedcv",             # repeated k-fold cross-validation
  number = 10,                       # 10 folds
  classProbs = TRUE,                 # keep class probabilities
  summaryFunction = twoClassSummary,
  savePredictions = "final"
)
  • 設定交叉驗證要將原本的資料切成幾塊(執行幾次)
CV: rpart(), Classification Tree
# Tune rpart's complexity parameter `cp` by repeated CV (2 repeats),
# selecting on the ROC metric; the run is timed.
ctrl$repeats <- 2
t0 <- Sys.time()
set.seed(2)                          # fixed seed before resampling
cv.rpart <- train(
  buy ~ .,
  data = TR,
  method = "rpart",
  metric = "ROC",
  trControl = ctrl,
  tuneGrid = expand.grid(cp = seq(from = 0.0002, to = 0.001, by = 0.0001))
)
# Elapsed wall-clock time of the tuning run.
Sys.time() - t0
Time difference of 1.117289 mins
# Plot the cross-validation results of the tuned classification tree.
plot(cv.rpart)

# Print the results table stored on the train object.
cv.rpart$results 
  • 如同影片所說,複雜度越高不一定越“準”,因此透過參數調校,找出最適合的複雜度和參數組合
Classification Tree, Final Model
# Refit the classification tree on the training set with a fixed cp
# (hard-coded; presumably chosen from the CV results above -- confirm).
rpart1 <- rpart(buy ~ ., data = TR, method = "class", cp = 0.0005)
# Test-set AUC computed from the second probability column.
colAUC(predict(rpart1, TS, type = "prob")[, 2], TS$buy)
                [,1]
no vs. yes 0.7401771


CV: glm(), Generalized Linear Model(邏輯式回歸)
# Cross-validate a glm classifier with the same control object
# (2 repeats), scored on ROC; the run is timed.
ctrl$repeats <- 2
t0 <- Sys.time()
set.seed(2)                          # fixed seed before resampling
cv.glm <- train(
  buy ~ .,
  data = TR,
  method = "glm",
  metric = "ROC",
  trControl = ctrl
)
# Elapsed wall-clock time of the CV run.
Sys.time() - t0
Time difference of 22.44225 secs
# Print the results table stored on the train object.
cv.glm$results
glm(), Final Model
# Fit the final logistic regression (binomial glm) on the training set.
glm1 <- glm(buy ~ ., data = TR, family = binomial)
# Test-set AUC computed from the predicted response-scale probabilities.
predict(glm1, TS, type = "response") %>% colAUC(TS$buy)
                [,1]
no vs. yes 0.7556038
  • 邏輯式回歸最終模型在測試資料上的AUC為0.7556038,高於決策樹的0.7401771


線性迴歸之交叉驗證

Splitting Data
# Keep only the rows where buy is "yes", then log10-transform the
# m, rev and amount columns.
A2 <- A %>%
  subset(buy == "yes") %>%
  mutate_at(c("m", "rev", "amount"), log10)
# `spl2` is not defined in the visible code; presumably it is loaded
# together with the data -- confirm.
TR2 <- A2[spl2, 2:10]
TS2 <- A2[!spl2, 2:10]
CV Control for Regression
# Resampling settings for the regression models: repeated 10-fold CV.
ctrl2 <- trainControl(
  method = "repeatedcv",
  number = 10,
  savePredictions = "final"
)
  • 同樣將資料切為10等分
CV: rpart() Regression Tree
# Tune cp for the regression tree by repeated CV, selecting on R-squared.
# The repeat count must be set on `ctrl2`, the control object that is
# actually passed to train() below; setting it on `ctrl` has no effect here.
ctrl2$repeats <- 2
set.seed(2)                          # fixed seed before resampling
cv.rpart2 <- train(
  amount ~ ., data = TR2, method = "rpart",
  trControl = ctrl2, metric = "Rsquared",
  tuneGrid = expand.grid(cp = seq(0.0008, 0.0024, 0.0001))
)
# Plot the cross-validation results across the cp grid.
plot(cv.rpart2)

  • 透過參數調校找出最佳複雜度
# Print the results table stored on the train object.
cv.rpart2$results
rpart(), Regression Tree Final Model
# Final regression tree at a fixed cp, scored by R-squared on the test set.
rpart2 <- rpart(amount ~ ., data = TR2, cp = 0.0016)
# Baseline error: test outcomes measured against the training-set mean.
SST <- sum((TS2$amount - mean(TR2$amount))^2)
# Model error: squared differences between test outcomes and predictions.
SSE <- sum((TS2$amount - predict(rpart2, TS2))^2)
r2.ts.rpart2 <- 1 - SSE / SST
r2.ts.rpart2
[1] 0.2174555
CV: lm(), Linear Model
# Cross-validate the linear model by repeated CV, selecting on R-squared.
# The repeat count must be set on `ctrl2`, the control object that is
# actually passed to train() below; setting it on `ctrl` has no effect here.
ctrl2$repeats <- 2
set.seed(2)                          # fixed seed before resampling
cv.lm2 <- train(
  amount ~ ., data = TR2, method = "lm",
  trControl = ctrl2, metric = "Rsquared",
  # NOTE(review): caret's "lm" model appears to treat `intercept` as a
  # logical switch, so this numeric grid likely reduces to "no intercept"
  # (0) versus "intercept" (any non-zero value) -- confirm, and consider
  # c(TRUE, FALSE) instead.
  tuneGrid = expand.grid(intercept = seq(0, 5, 0.5))
)
# Plot the cross-validation results across the intercept grid.
plot(cv.lm2)

# Print the results table stored on the train object.
cv.lm2$results
lm() Final Model
# Final linear model on the training set, scored by R-squared on the test set.
lm2 <- lm(amount ~ ., data = TR2)
# Baseline error: test outcomes measured against the training-set mean.
SST <- sum((TS2$amount - mean(TR2$amount))^2)
# Model error: squared differences between test outcomes and predictions.
SSE <- sum((TS2$amount - predict(lm2, TS2))^2)
r2.ts.lm2 <- 1 - SSE / SST
r2.ts.lm2
[1] 0.2381007
  • 線性迴歸最終模型在測試資料(TS2)上的R^2為0.2381007,高於迴歸樹的0.2174555
要記得關閉平行運算功能喔!
# Shut down the worker cluster created by makeCluster() earlier.
stopCluster(clust)







LS0tDQp0aXRsZTogIkNyb3NzIFZhbGlkLiAmIE1vZGVsIFNlbGVjdGlvbiwgVGEtRmVuZyINCmF1dGhvcjogIuWNk+mbjeeEtiwg5Lit5bGx5aSn5a24IOeuoeeQhuWtuOihk+eglOeptuS4reW/gyINCmRhdGU6ICJgciBTeXMudGltZSgpYCINCm91dHB1dDogaHRtbF9ub3RlYm9vaw0KLS0tDQoNCjxicj4NCg0KKyDmnKznq6Dph43pu546IOS7i+e0ueS9leisguS6pOWPiempl+itiShjcm9zcyB2YWxpZGF0aW9uKQ0KKyDlsIfkuqTlj4npqZforYnkuYvntZDmnpzmh4nnlKjlnKjmiJHlgJHnmoTpoJDmuKzmgKfmqKHlnovkuIoNCg0KIyMjIOS6pOWPiempl+itieiIh+WPg+aVuOiqv+agoea1geeoiw0KDQo8Y2VudGVyPg0KDQohW0ZpZy0xOiBTdXBlcnZpc2VkIExlYXJuaW5nIFByb2Nlc3NdKHN1cGVydmlzZWQuanBnKQ0KDQohW0ZpZy0yOiBDViwgTW9kZWwgU2VsLiAmIFBhcmFtZXRlciBUdW5pbmddKGN2LmpwZykNCg0KPC9jZW50ZXI+DQoNCjxicj48aHI+DQoNCiMjIyBQcmVwYXJpbmcgRGF0YQ0KDQojIyMjIyBMaWJyYXJpZXMNCmBgYHtyIGVjaG89VCwgbWVzc2FnZT1GLCBjYWNoZT1GLCB3YXJuaW5nPUZ9DQpTeXMuc2V0bG9jYWxlKCJMQ19BTEwiLCJDIikNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KGdncGxvdDIpDQpsaWJyYXJ5KGNhVG9vbHMpDQpsaWJyYXJ5KE1hdHJpeCkNCmxpYnJhcnkocnBhcnQpDQpsaWJyYXJ5KHJwYXJ0LnBsb3QpDQpsaWJyYXJ5KGNhcmV0KQ0KbGlicmFyeShkb1BhcmFsbGVsKQ0KYGBgDQoNCiMjIyMjIExvYWRpbmcgYW5kIFNwbGl0aW5nDQpgYGB7cn0NCnJtKGxpc3Q9bHMoYWxsPVRSVUUpKQ0KbG9hZCgiZGF0YS90ZjIucmRhdGEiKQ0KQSRidXkgPSBmYWN0b3IoaWZlbHNlKEEkYnV5LCAieWVzIiwgIm5vIikpICAjIGNvbXBseSB0byB0aGUgcnVsZSBvZiBjYXJldA0KVFIgPSBBW3NwbCwgYygyOjksMTEpXQ0KVFMgPSBBWyFzcGwsIGMoMjo5LDExKV0NCmBgYA0KDQojIyMjIyBUdXJuIG9uIFBhcmFsbGVsIFByb2Nlc3NpbmcNCmBgYHtyfQ0KY2x1c3QgPSBtYWtlQ2x1c3RlcihkZXRlY3RDb3JlcygpKQ0KcmVnaXN0ZXJEb1BhcmFsbGVsKGNsdXN0KTsgZ2V0RG9QYXJXb3JrZXJzKCkNCmBgYA0KDQorIOmWi+WVn+W5s+ihjOmBi+eul++8jOWwh+mbu+iFpueahOavj+S4gOWAi0NQVemDveWPq+WHuuS+huW3peS9nO+8jOS7peWFjeWft+ihjOS6pOWPiempl+itieeahOetieW+heaZgumWk+mBjumVtw0KKyDlj6/ku6XnnIvliLDoh6rlt7HnmoTpm7vohabmnInlub7poYZDUFUNCg0KIyMjIOaxuuetluaoueS5i+S6pOWPiempl+itiSANCg0KIyMjIyMgQ1YgQ29udHJvbCBmb3IgQ2xhc3NpZmljYXRpb24NCmBgYHtyfQ0KY3RybCA9IHRyYWluQ29udHJvbCgNCiAgbWV0aG9kPSJyZXBlYXRlZGN2IiwgbnVtYmVyPTEwLCAgICAjIDEwLWZvbGQsIFJlcGVhdGVkIENWDQogIHNhdmVQcmVkaWN0aW9ucyA9ICJmaW5hbCIsIGNsYXNzUHJvYnM9VFJVRSwNCiAgc3VtbWFyeUZ1bmN0aW9uPXR3b0NsYXNzU3Vt
bWFyeSkNCmBgYA0KDQorIOioreWumuS6pOWPiempl+itieimgeWwh+WOn+acrOeahOizh+aWmeWIh+aIkOW5vuWhiijln7fooYzlub7mrKEpDQoNCiMjIyMjIENWOiBgcnBhcnQoKWAsIENsYXNzaWZpY2F0aW9uIFRyZWUgDQpgYGB7cn0NCmN0cmwkcmVwZWF0cyA9IDINCnQwID0gU3lzLnRpbWUoKTsgc2V0LnNlZWQoMikNCmN2LnJwYXJ0ID0gdHJhaW4oDQogIGJ1eSB+IC4sIGRhdGE9VFIsIG1ldGhvZD0icnBhcnQiLCANCiAgdHJDb250cm9sPWN0cmwsIG1ldHJpYz0iUk9DIiwNCiAgdHVuZUdyaWQgPSBleHBhbmQuZ3JpZChjcCA9IHNlcSgwLjAwMDIsMC4wMDEsMC4wMDAxKSApICkNClN5cy50aW1lKCkgLSB0MA0KYGBgDQoNCmBgYHtyIGZpZy5oZWlnaHQ9MywgZmlnLndpZHRoPTd9DQpwbG90KGN2LnJwYXJ0KQ0KYGBgDQoNCmBgYHtyfQ0KY3YucnBhcnQkcmVzdWx0cyANCmBgYA0KDQorIOWmguWQjOW9seeJh+aJgOiqqu+8jOikh+mbnOW6pui2iumrmOS4jeS4gOWumui2iiLmupYi77yM5Zug5q2k6YCP6YGO5Y+D5pW46Kq/5qCh77yM5om+5Ye65pyA6YGp5ZCI55qE6KSH6Zuc5bqm5ZKM5Y+D5pW457WE5ZCIDQoNCiMjIyMjIENsYXNzaWZpY2F0aW9uIFRyZWUsIEZpbmFsIE1vZGVsDQpgYGB7cn0NCnJwYXJ0MSA9IHJwYXJ0KGJ1eSB+IC4sIFRSLCBtZXRob2Q9ImNsYXNzIiwgY3A9MC4wMDA1KQ0KcHJlZGljdChycGFydDEsIFRTLCB0eXBlPSJwcm9iIilbLDJdICU+JSANCiAgY29sQVVDKFRTJGJ1eSkNCmBgYA0KPGJyPjxocj4NCg0KIyMjIyMgQ1Y6IGBnbG0oKWAsIEdlbmVyYWwgTGluZWFyIE1vZGVsKOmCj+i8r+W8j+WbnuatuCkNCmBgYHtyfQ0KY3RybCRyZXBlYXRzID0gMg0KdDAgPSBTeXMudGltZSgpOyBzZXQuc2VlZCgyKQ0KY3YuZ2xtID0gdHJhaW4oDQogIGJ1eSB+IC4sIGRhdGE9VFIsIG1ldGhvZD0iZ2xtIiwgDQogIHRyQ29udHJvbD1jdHJsLCBtZXRyaWM9IlJPQyIpDQpTeXMudGltZSgpIC0gdDANCmBgYA0KDQpgYGB7cn0NCmN2LmdsbSRyZXN1bHRzDQpgYGANCg0KIyMjIyMgYGdsbSgpYCwgRmluYWwgTW9kZWwNCmBgYHtyfQ0KZ2xtMSA9IGI9Z2xtKGJ1eSB+IC4sIFRSLCBmYW1pbHk9Ymlub21pYWwpDQpwcmVkaWN0KGdsbTEsIFRTLCB0eXBlPSJyZXNwb25zZSIpICU+JSBjb2xBVUMoVFMkYnV5KQ0KYGBgDQoNCisg5Z+36KGM5a6MQ1blvozvvIzmsbrnrZbmqLnkuYtBVUPkuIrljYfoh7MwLjc1NTYwMzgNCg0KPGJyPjxocj4NCg0KDQojIyMg57ea5oCn6L+05q245LmL5Lqk5Y+J6amX6K2JDQoNCiMjIyMjIFNwbGl0aW5nIERhdGENCmBgYHtyfQ0KQTIgPSBzdWJzZXQoQSwgQSRidXkgPT0gInllcyIpICU+JSBtdXRhdGVfYXQoYygibSIsInJldiIsImFtb3VudCIpLCBsb2cxMCkNClRSMiA9IEEyWyBzcGwyLCBjKDI6MTApXQ0KVFMyID0gQTJbIXNwbDIsIGMoMjoxMCldDQpgYGANCg0KIyMjIyMgQ1YgQ29udHJvbCBmb3IgUmVncmVzc2lvbg0KYGBge3J9DQpjdHJsMiA9IHRyYWluQ29udHJvbCgNCiAgbWV0aG9k
PSJyZXBlYXRlZGN2IiwgbnVtYmVyPTEwLCAgICAjIDEwLWZvbGQsIFJlcGVhdGVkIENWDQogIHNhdmVQcmVkaWN0aW9ucyA9ICJmaW5hbCIpDQpgYGANCg0KKyDlkIzmqKPlsIfos4fmlpnliIfngroxMOetieWIhg0KDQojIyMjIyBDVjogYHJwYXJ0KClgIFJlZ3Jlc3Npb24gVHJlZQ0KYGBge3IgZmlnLmhlaWdodD0zLCBmaWcud2lkdGg9N30NCmN0cmwkcmVwZWF0cyA9IDINCnNldC5zZWVkKDIpDQpjdi5ycGFydDIgPSB0cmFpbigNCiAgYW1vdW50IH4gLiwgZGF0YT1UUjIsIG1ldGhvZD0icnBhcnQiLCANCiAgdHJDb250cm9sPWN0cmwyLCBtZXRyaWM9IlJzcXVhcmVkIiwNCiAgdHVuZUdyaWQgPSBleHBhbmQuZ3JpZChjcCA9IHNlcSgwLjAwMDgsMC4wMDI0LDAuMDAwMSkgKSApDQpwbG90KGN2LnJwYXJ0MikNCmBgYA0KDQorIOmAj+mBjuWPg+aVuOiqv+agoeaJvuWHuuacgOS9s+ikh+mbnOW6pg0KDQpgYGB7cn0NCmN2LnJwYXJ0MiRyZXN1bHRzDQpgYGANCg0KIyMjIyMgYHJwYXJ0KClgLCBSZWdyZXNzaW9uIFRyZWUgRmluYWwgTW9kZWwNCmBgYHtyfQ0KcnBhcnQyID0gcnBhcnQoYW1vdW50IH4gLiwgZGF0YT1UUjIsIGNwPTAuMDAxNikNClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KHJwYXJ0MiwgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KKHIyLnRzLnJwYXJ0MiA9IDEgLSAoU1NFL1NTVCkpDQpgYGANCg0KKyANCg0KIyMjIyMgQ1Y6IGBsbSgpYCwgTGluZWFyIE1vZGVsDQpgYGB7ciBmaWcuaGVpZ2h0PTMsIGZpZy53aWR0aD03fQ0KY3RybCRyZXBlYXRzID0gMg0Kc2V0LnNlZWQoMikNCmN2LmxtMiA9IHRyYWluKA0KICBhbW91bnQgfiAuLCBkYXRhPVRSMiwgbWV0aG9kPSJsbSIsIA0KICB0ckNvbnRyb2w9Y3RybDIsIG1ldHJpYz0iUnNxdWFyZWQiLA0KICAgIHR1bmVHcmlkID0gZXhwYW5kLmdyaWQoIGludGVyY2VwdCA9IHNlcSgwLDUsMC41KSApIA0KICApDQpwbG90KGN2LmxtMikNCmBgYA0KDQpgYGB7cn0NCmN2LmxtMiRyZXN1bHRzDQpgYGANCg0KIyMjIyMgYGxtKClgIEZpbmFsIE1vZGVsDQpgYGB7cn0NCmxtMiA9IGxtKGFtb3VudCB+IC4sIFRSMikNClNTVCA9IHN1bSgoVFMyJGFtb3VudCAtIG1lYW4oVFIyJGFtb3VudCkpXiAyKQ0KU1NFID0gc3VtKChwcmVkaWN0KGxtMiwgVFMyKSAtICBUUzIkYW1vdW50KV4yKQ0KKHIyLnRzLmxtMiA9IDEgLSAoU1NFL1NTVCkpDQpgYGANCg0KKyDnt5rmgKfov7TmrbjlgZrlrozkuqTlj4npqZforYnlvozkuYtSXjLngrowLjIzODEwMDcNCg0KIyMjIyMg6KaB6KiY5b6X6Zec6ZaJ5bmz6KGM6YGL566X5Yqf6IO95ZaUIQ0KYGBge3J9DQpzdG9wQ2x1c3RlcihjbHVzdCkNCmBgYA0KPGJyPjxicj48aHI+PGJyPjxicj48YnI+PGJyPg0KPHN0eWxlPg0KDQouY2FwdGlvbiB7DQogIGNvbG9yOiAjNzc3Ow0KICBtYXJnaW4tdG9wOiAxMHB4Ow0KfQ0KcCBjb2RlIHsNCiAgd2hpdGUtc3BhY2U6IGluaGVyaXQ7DQp9
DQpwcmUgew0KICB3b3JkLWJyZWFrOiBub3JtYWw7DQogIHdvcmQtd3JhcDogbm9ybWFsOw0KICBsaW5lLWhlaWdodDogMTsNCn0NCnByZSBjb2RlIHsNCiAgd2hpdGUtc3BhY2U6IGluaGVyaXQ7DQp9DQpwLGxpIHsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCi5yew0KICBsaW5lLWhlaWdodDogMS4yOw0KfQ0KDQoucWl6IHsNCiAgbGluZS1oZWlnaHQ6IDEuNzU7DQogIGJhY2tncm91bmQ6ICNmMGYwZjA7DQogIGJvcmRlci1sZWZ0OiAxMnB4IHNvbGlkICNjY2ZmY2M7DQogIHBhZGRpbmc6IDRweDsNCiAgcGFkZGluZy1sZWZ0OiAxMHB4Ow0KICBjb2xvcjogIzAwOTkwMDsNCn0NCg0KdGl0bGV7DQogIGNvbG9yOiAjY2MwMDAwOw0KICBmb250LWZhbWlseTogIlRyZWJ1Y2hldCBNUyIsICLlvq7ou5/mraPpu5Hpq5QiLCAiTWljcm9zb2Z0IEpoZW5nSGVpIjsNCn0NCg0KYm9keXsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCmgxLGgyLGgzLGg0LGg1ew0KICBjb2xvcjogIzAwNjZmZjsNCiAgZm9udC1mYW1pbHk6ICJUcmVidWNoZXQgTVMiLCAi5b6u6Luf5q2j6buR6auUIiwgIk1pY3Jvc29mdCBKaGVuZ0hlaSI7DQp9DQoNCg0KaDN7DQogIGNvbG9yOiAjMDA4ODAwOw0KICBiYWNrZ3JvdW5kOiAjZTZmZmU2Ow0KICBsaW5lLWhlaWdodDogMjsNCiAgZm9udC13ZWlnaHQ6IGJvbGQ7DQp9DQoNCmg1ew0KICBjb2xvcjogIzAwNjAwMDsNCiAgYmFja2dyb3VuZDogI2Y4ZjhmODsNCiAgbGluZS1oZWlnaHQ6IDEuNTsNCiAgZm9udC13ZWlnaHQ6IGJvbGQ7DQp9DQoNCjwvc3R5bGU+DQo=