We use the Unit 4: Census assignment to demonstrate how to do this on an ordinary multi-core laptop:
library(magrittr)
library(rpart)
library(rpart.plot)
library(caret)
library(caTools)
source('DPP.R')   # helper script; provides the DPP() function used below
D = read.csv('census.csv')
set.seed(2000)
# stratified 60/40 train/test split on the outcome
spl = sample.split(D$over50k, SplitRatio = 0.6)
train = subset(D, spl==TRUE)
test = subset(D, spl==FALSE)
# dimensions of the full data set and the two splits
sapply(list(D=D, train=train, test=test), dim) %>% 'rownames<-'(c('nrow','ncol'))
         D train  test
nrow 31978 19187 12791
ncol    13    13    13
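Since sample.split does a stratified split on the outcome, the proportion of high earners should be (almost) identical across all three data sets. A quick optional check, assuming the factor level is the string ' >50K' exactly as caret prints it below:
sapply(list(D=D, train=train, test=test), function(d) mean(d$over50k == ' >50K'))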
library(doParallel)
clust = makeCluster(detectCores())   # one worker per detected core
registerDoParallel(clust)            # let caret/foreach dispatch work to the cluster
getDoParWorkers()
[1] 4
The number shown is the number of parallel workers, i.e. the cores detected on your machine (four in this case).
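A common variation, if you want to keep one core free for the rest of the system while the cross-validation runs (an alternative to the makeCluster() call above, not something the assignment requires):
clust = makeCluster(detectCores() - 1)   # leave one core for the OS/UI
registerDoParallel(clust)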
numFolds = trainControl( method = "cv", number = 10)
cpGrid = expand.grid( .cp = seq(0.002,0.1,0.002)) # best: cp = 0.002, CV accuracy = 0.85157
set.seed(2)
cv1 = train(over50k ~ ., data = train,
method = "rpart",
trControl = numFolds,
tuneGrid = cpGrid )
cv1
CART
19187 samples
12 predictor
2 classes: ' <=50K', ' >50K'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 17268, 17269, 17268, 17268, 17269, 17269, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.002 0.85157 0.55759
0.004 0.84766 0.55353
0.006 0.84443 0.53388
0.008 0.84416 0.53440
0.010 0.84432 0.53575
0.012 0.84432 0.53575
0.014 0.84432 0.53575
0.016 0.84344 0.53117
0.018 0.84052 0.51398
0.020 0.83958 0.50605
0.022 0.83911 0.50180
0.024 0.83911 0.50180
0.026 0.83911 0.50180
0.028 0.83911 0.50180
0.030 0.83911 0.50180
0.032 0.83911 0.50180
0.034 0.83702 0.48877
0.036 0.83218 0.46363
0.038 0.82655 0.43857
0.040 0.82462 0.43016
0.042 0.82462 0.43016
0.044 0.82462 0.43016
0.046 0.82201 0.41297
0.048 0.82201 0.41297
0.050 0.82201 0.41297
0.052 0.81540 0.35631
0.054 0.81284 0.32563
0.056 0.81190 0.30789
0.058 0.81190 0.30789
0.060 0.81190 0.30789
0.062 0.81190 0.30789
0.064 0.80956 0.29541
0.066 0.80956 0.29541
0.068 0.80096 0.24623
0.070 0.79585 0.21467
0.072 0.79585 0.21467
0.074 0.79585 0.21467
0.076 0.77256 0.07856
0.078 0.75937 0.00000
0.080 0.75937 0.00000
0.082 0.75937 0.00000
0.084 0.75937 0.00000
0.086 0.75937 0.00000
0.088 0.75937 0.00000
0.090 0.75937 0.00000
0.092 0.75937 0.00000
0.094 0.75937 0.00000
0.096 0.75937 0.00000
0.098 0.75937 0.00000
0.100 0.75937 0.00000
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.002.
plot(cv1)
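Instead of reading the winning cp off the printout, you can extract it from the train object; bestTune and results are standard components of a caret train() result:
cv1$bestTune                # data frame holding the winning cp
max(cv1$results$Accuracy)   # the corresponding CV accuracy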
cart1 = rpart(over50k ~ ., train, method='class', cp=0.002)
prp(cart1)
pred = predict(cart1, test)[,2]   # column 2 = predicted probability of ' >50K'
table(test$over50k, pred > 0.5)
         FALSE TRUE
  <=50K   9178  535
  >50K    1240 1838
table(test$over50k, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.86123
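For context, a majority-class baseline that always predicts ' <=50K' scores about 0.759 (the plateau you see for cp >= 0.078 in the CV table above), so cart1's 0.86123 is a substantial gain:
mean(test$over50k == ' <=50K')   # baseline accuracy on the test set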
numFolds = trainControl( method = "cv", number = 10)
cpGrid = expand.grid( .cp = seq(0,0.002,0.00005)) # best: cp = 0.00085, CV accuracy = 0.85370
set.seed(2)
cv2 = train(over50k ~ ., data = train,
method = "rpart",
trControl = numFolds,
tuneGrid = cpGrid )
cv2
CART
19187 samples
12 predictor
2 classes: ' <=50K', ' >50K'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 17268, 17269, 17268, 17268, 17269, 17269, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.00000 0.84250 0.55083
0.00005 0.84302 0.55162
0.00010 0.84557 0.55622
0.00015 0.84745 0.56032
0.00020 0.84964 0.56499
0.00025 0.84959 0.56212
0.00030 0.85037 0.56174
0.00035 0.85089 0.56261
0.00040 0.85203 0.56618
0.00045 0.85271 0.56698
0.00050 0.85193 0.56329
0.00055 0.85183 0.56157
0.00060 0.85157 0.56064
0.00065 0.85214 0.56299
0.00070 0.85224 0.56361
0.00075 0.85339 0.56801
0.00080 0.85344 0.56825
0.00085 0.85370 0.56898
0.00090 0.85313 0.56683
0.00095 0.85287 0.56454
0.00100 0.85230 0.56472
0.00105 0.85209 0.56432
0.00110 0.85198 0.56393
0.00115 0.85188 0.56385
0.00120 0.85188 0.56385
0.00125 0.85183 0.56164
0.00130 0.85172 0.56059
0.00135 0.85224 0.56229
0.00140 0.85214 0.56178
0.00145 0.85198 0.56025
0.00150 0.85198 0.56025
0.00155 0.85214 0.56039
0.00160 0.85214 0.56047
0.00165 0.85193 0.55947
0.00170 0.85131 0.55691
0.00175 0.85131 0.55691
0.00180 0.85131 0.55691
0.00185 0.85120 0.55675
0.00190 0.85120 0.55675
0.00195 0.85157 0.55759
0.00200 0.85157 0.55759
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.00085.
plot(cv2)
Remember to shut down the parallel cluster when you are done:
stopCluster(clust)
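Optionally, re-register the sequential backend so that any later %dopar% calls fall back to serial execution instead of failing on the stopped cluster; registerDoSEQ() comes with the foreach package that doParallel loads:
registerDoSEQ()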
cart2 = rpart(over50k ~ ., train, method='class', cp=0.00085) # test accuracy = 0.86326
prp(cart2)
pred = predict(cart2, test)[,2]
table(test$over50k, pred > 0.5)
         FALSE TRUE
  <=50K   9158  555
  >50K    1194 1884
table(test$over50k, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.86326
Compared with cart1, cart2 improves test accuracy by about 0.2 percentage points (0.86326 vs. 0.86123).
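The extra accuracy comes from a considerably larger tree. Since the frame component of an rpart object holds one row per node, you can compare the sizes directly:
nrow(cart1$frame)   # number of nodes in the coarse tree
nrow(cart2$frame)   # number of nodes in the fine-tuned tree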
library(ROCR)
par(cex=0.75)
ROCRpred = prediction(pred, test$over50k)
ROCRperf = performance(ROCRpred, "tpr", "fpr")
plot(ROCRperf, colorize=TRUE,
print.cutoffs.at=seq(0,1,0.2), text.adj=c(-0.2,1.7))
DPP(pred, test$over50k, " >50K")
[1] 0.89266
abline(v = c(0.2,0.5,0.8), lty=3, col='blue')
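Assuming the DPP() helper above reports the area under the ROC curve, ROCR can compute the same quantity directly:
performance(ROCRpred, 'auc')@y.values[[1]]   # should match the 0.89266 from DPP()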
The Confusion Matrix (cutoff = 0.5)
(confuse = table(test$over50k, pred > 0.5))
         FALSE TRUE
  <=50K   9158  555
  >50K    1194 1884
The Penalty Matrix (rows = actual class, columns = predicted class)
(penalty = matrix(c(0, -100, -10, -40),2,2))
      [,1] [,2]
[1,]     0  -10
[2,]  -100  -40
The Expected Payoff
(exp.payoff = sum(penalty * confuse))
[1] -200310
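Element-wise, this is just the penalty-weighted sum of the confusion matrix:
0*9158 + (-10)*555 + (-100)*1194 + (-40)*1884   # = -200310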
# scan over the cutoffs from 0.01 to 0.99
x = seq(0.01,0.99,0.01)
# calculate the expected payoff for each cutoff; fixing the factor levels
# keeps the table 2x2 even if no prediction falls on one side of a cutoff
y = sapply(x, function(cut)
  sum(penalty * table(test$over50k, factor(pred > cut, levels=c(FALSE,TRUE)))))
# make a plot
plot(x, y, type='l')
# add grid lines
abline(h = seq(-300000,-100000,20000),
v = c(seq(0, 1, 0.2)), lty=3, col='lightgray' )
# identify the optimal cutoff and maximum expected payoff
(best.payoff = max(y))
[1] -169240
(best.cutoff = x[which.max(y)])
[1] 0.1
# mark it on the plot
abline(v = best.cutoff, col='red', lty=3)
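Finally, recomputing the confusion matrix and payoff at the chosen cutoff reproduces best.payoff:
table(test$over50k, pred > best.cutoff)
sum(penalty * table(test$over50k, pred > best.cutoff))   # = -169240, same as best.payoff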