We use the Unit4:Census assignment to demonstrate how an ordinary multi-core laptop can handle:

+ Parallel Processing
+ Cross Validation
+ Model Selection / Parameter Tuning
+ Strategy Optimization

Load the Libraries and Helper Function

library(magrittr)
library(rpart)
library(rpart.plot)
library(caret)
library(caTools)
source('DPP.R')    # helper: plots the Distribution of Predicted Probability


(A) Read and Split Data

D = read.csv('census.csv')
set.seed(2000)
spl = sample.split(D$over50k, SplitRatio = 0.6)
train = subset(D, spl==TRUE)
test = subset(D, spl==FALSE)
sapply(list(D=D, train=train, test=test), dim) %>% 'rownames<-'(c('nrow','ncol')) 
         D train  test
nrow 31978 19187 12791
ncol    13    13    13
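
sample.split() stratifies on the outcome, so the proportion of high earners should be (nearly) identical across the three sets. A quick check (a sketch; it assumes the label is coded ' >50K' with a leading space, as the caret printout below shows):

sapply(list(D=D, train=train, test=test),
       function(d) mean(d$over50k == " >50K"))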


(B) Turn on Parallel Processing

library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust)
getDoParWorkers()
[1] 4

The output above shows the number of registered parallel workers, i.e. how many cores detectCores() found on your machine.
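
Grabbing every core can make the laptop unresponsive while the grid search runs. A common alternative (a sketch, not part of the original assignment) leaves one core for the operating system:

library(doParallel)
clust = makeCluster(detectCores() - 1)   # reserve one core for the system
registerDoParallel(clust)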


(C) Cross Validation, Pass 1

numFolds = trainControl( method = "cv", number = 10)
cpGrid = expand.grid( .cp = seq(0.002,0.1,0.002))   # best: cp = 0.002, accuracy = 0.85157
set.seed(2)
cv1 = train(over50k ~ ., data = train, 
            method = "rpart", 
            trControl = numFolds, 
            tuneGrid = cpGrid )
cv1
CART 

19187 samples
   12 predictor
    2 classes: ' <=50K', ' >50K' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 17268, 17269, 17268, 17268, 17269, 17269, ... 
Resampling results across tuning parameters:

  cp     Accuracy  Kappa  
  0.002  0.85157   0.55759
  0.004  0.84766   0.55353
  0.006  0.84443   0.53388
  0.008  0.84416   0.53440
  0.010  0.84432   0.53575
  0.012  0.84432   0.53575
  0.014  0.84432   0.53575
  0.016  0.84344   0.53117
  0.018  0.84052   0.51398
  0.020  0.83958   0.50605
  0.022  0.83911   0.50180
  0.024  0.83911   0.50180
  0.026  0.83911   0.50180
  0.028  0.83911   0.50180
  0.030  0.83911   0.50180
  0.032  0.83911   0.50180
  0.034  0.83702   0.48877
  0.036  0.83218   0.46363
  0.038  0.82655   0.43857
  0.040  0.82462   0.43016
  0.042  0.82462   0.43016
  0.044  0.82462   0.43016
  0.046  0.82201   0.41297
  0.048  0.82201   0.41297
  0.050  0.82201   0.41297
  0.052  0.81540   0.35631
  0.054  0.81284   0.32563
  0.056  0.81190   0.30789
  0.058  0.81190   0.30789
  0.060  0.81190   0.30789
  0.062  0.81190   0.30789
  0.064  0.80956   0.29541
  0.066  0.80956   0.29541
  0.068  0.80096   0.24623
  0.070  0.79585   0.21467
  0.072  0.79585   0.21467
  0.074  0.79585   0.21467
  0.076  0.77256   0.07856
  0.078  0.75937   0.00000
  0.080  0.75937   0.00000
  0.082  0.75937   0.00000
  0.084  0.75937   0.00000
  0.086  0.75937   0.00000
  0.088  0.75937   0.00000
  0.090  0.75937   0.00000
  0.092  0.75937   0.00000
  0.094  0.75937   0.00000
  0.096  0.75937   0.00000
  0.098  0.75937   0.00000
  0.100  0.75937   0.00000

Accuracy was used to select the optimal model using  the largest value.
The final value used for the model was cp = 0.002.
plot(cv1)
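
Rather than reading the winner off the printout, you can pull it straight from the fitted object; caret stores it in bestTune (assuming cv1 from above):

cv1$bestTune                # cp = 0.002
max(cv1$results$Accuracy)   # 0.85157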

Decision Tree

cart1 = rpart(over50k ~ ., train, method='class', cp=0.002)
prp(cart1)

Prediction & Confusion Matrix (cutoff = 0.5)

pred = predict(cart1, test)[,2]   # column 2 = predicted probability of ' >50K'
table(test$over50k, pred > 0.5)
        
         FALSE TRUE
   <=50K  9178  535
   >50K   1240 1838
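
From this table, sensitivity is 1838/(1240+1838) ≈ 0.597 and specificity is 9178/(9178+535) ≈ 0.945: the tree clears low earners far more reliably than it catches high earners, which is why the choice of cutoff will matter in section (E).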

Accuracy

table(test$over50k, pred > 0.5) %>% {sum(diag(.))/sum(.)} 
[1] 0.86123
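
For context, the majority-class baseline (always predicting ' <=50K') can be computed directly; it essentially matches the 0.75937 plateau in the CV table above, where heavy pruning collapses the tree to the root node:

max(table(test$over50k)) / nrow(test)   # (9178+535)/12791 = 0.75936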




(D) Cross Validation, Pass 2

numFolds = trainControl( method = "cv", number = 10)
cpGrid = expand.grid( .cp = seq(0,0.002,0.00005))  # best: cp = 0.00085, accuracy = 0.85370
set.seed(2)
cv2 = train(over50k ~ ., data = train, 
            method = "rpart", 
            trControl = numFolds, 
            tuneGrid = cpGrid )
cv2
CART 

19187 samples
   12 predictor
    2 classes: ' <=50K', ' >50K' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 17268, 17269, 17268, 17268, 17269, 17269, ... 
Resampling results across tuning parameters:

  cp       Accuracy  Kappa  
  0.00000  0.84250   0.55083
  0.00005  0.84302   0.55162
  0.00010  0.84557   0.55622
  0.00015  0.84745   0.56032
  0.00020  0.84964   0.56499
  0.00025  0.84959   0.56212
  0.00030  0.85037   0.56174
  0.00035  0.85089   0.56261
  0.00040  0.85203   0.56618
  0.00045  0.85271   0.56698
  0.00050  0.85193   0.56329
  0.00055  0.85183   0.56157
  0.00060  0.85157   0.56064
  0.00065  0.85214   0.56299
  0.00070  0.85224   0.56361
  0.00075  0.85339   0.56801
  0.00080  0.85344   0.56825
  0.00085  0.85370   0.56898
  0.00090  0.85313   0.56683
  0.00095  0.85287   0.56454
  0.00100  0.85230   0.56472
  0.00105  0.85209   0.56432
  0.00110  0.85198   0.56393
  0.00115  0.85188   0.56385
  0.00120  0.85188   0.56385
  0.00125  0.85183   0.56164
  0.00130  0.85172   0.56059
  0.00135  0.85224   0.56229
  0.00140  0.85214   0.56178
  0.00145  0.85198   0.56025
  0.00150  0.85198   0.56025
  0.00155  0.85214   0.56039
  0.00160  0.85214   0.56047
  0.00165  0.85193   0.55947
  0.00170  0.85131   0.55691
  0.00175  0.85131   0.55691
  0.00180  0.85131   0.55691
  0.00185  0.85120   0.55675
  0.00190  0.85120   0.55675
  0.00195  0.85157   0.55759
  0.00200  0.85157   0.55759

Accuracy was used to select the optimal model using  the largest value.
The final value used for the model was cp = 0.00085.
plot(cv2)
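
Because both passes were run right after set.seed(2), they use the same folds (compare the "Summary of sample sizes" lines), so caret can compare them resample by resample; a quick sketch, assuming cv1 is still in memory:

summary(resamples(list(pass1 = cv1, pass2 = cv2)))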

Remember to turn off Parallel Processing

stopCluster(clust)
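
stopCluster() alone leaves %dopar% registered to a dead backend, so a later foreach call could fail; registering the sequential backend restores the default (registerDoSEQ() comes from foreach, which doParallel loads as a dependency):

registerDoSEQ()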

Decision Tree

cart2 = rpart(over50k ~ ., train, method='class', cp=0.00085) # test accuracy = 0.86326
prp(cart2)

Prediction & Confusion Matrix (cutoff = 0.5)

pred = predict(cart2, test)[,2]
table(test$over50k, pred > 0.5)
        
         FALSE TRUE
   <=50K  9158  555
   >50K   1194 1884

Accuracy

table(test$over50k, pred > 0.5) %>% {sum(diag(.))/sum(.)} 
[1] 0.86326

cart2 improves test accuracy by about 0.2 percentage points over cart1 (0.86326 vs. 0.86123).


(E) Strategy Optimization


ROC and AUC

library(ROCR)
par(cex=0.75)
ROCRpred = prediction(pred, test$over50k)
ROCRperf = performance(ROCRpred, "tpr", "fpr")
plot(ROCRperf, colorize=TRUE, 
     print.cutoffs.at=seq(0,1,0.2), text.adj=c(-0.2,1.7))
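
The heading promises AUC, but the chunk above only draws the ROC curve; ROCR can compute the area from the same prediction object:

performance(ROCRpred, "auc")@y.values[[1]]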

Distribution of Predicted Probability

DPP(pred, test$over50k, " >50K")
[1] 0.89266
abline(v = c(0.2,0.5,0.8), lty=3, col='blue')

The Confusion Matrix (cutoff = 0.5)

(confuse = table(test$over50k, pred > 0.5))
        
         FALSE TRUE
   <=50K  9158  555
   >50K   1194 1884

The Penalty Matrix

Its layout mirrors the confusion matrix: rows are the actual classes ( <=50K, >50K) and columns the predictions (FALSE, TRUE), so a missed high earner (false negative) costs 100, a false alarm costs 10, and even acting on a true positive costs 40.

(penalty = matrix(c(0, -100, -10, -40),2,2))
     [,1] [,2]
[1,]    0  -10
[2,] -100  -40

The Expected Payoff

(exp.payoff = sum(penalty * confuse))
[1] -200310
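
Element by element: 0*9158 + (-100)*1194 + (-10)*555 + (-40)*1884 = -119400 - 5550 - 75360 = -200310.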


Find the Optimal Cutoff by Simulation

# scan over cutoffs from 0.01 to 0.99
x = seq(0.01,0.99,0.01)
# calculate the expected payoff for each cutoff
y = sapply(x, function(cut) sum(penalty * table(test$over50k, pred > cut)))
# make a plot
plot(x, y, type='l')
# add grid lines
abline(h = seq(-300000,-100000,20000),
       v = c(seq(0, 1, 0.2)), lty=3, col='lightgray' )
# identify the optimal cutoff and maximum expected payoff
(best.payoff = max(y))
[1] -169240
(best.cutoff = x[which.max(y)])
[1] 0.1
# mark it on the plot
abline(v = best.cutoff, col='red', lty=3)
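
Moving the cutoff from 0.5 to 0.1 lifts the expected payoff from -200310 to -169240, a gain of 31070. One caveat on the simulation: table() silently drops a column when every prediction falls on one side of the cutoff, which would misalign the element-wise product with penalty. A defensive variant (a sketch; payoff_at is a hypothetical helper name) pins both columns:

payoff_at = function(cut) {
  tab = table(test$over50k,
              factor(pred > cut, levels = c(FALSE, TRUE)))
  sum(penalty * tab)
}
y = sapply(x, payoff_at)   # same values on this grid, but safe at extreme cutoffs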
