library(tidyverse)
library(caret)
library(rattle)
data("iris")
table(iris$Species)
    setosa versicolor  virginica 
        50         50         50 
inTrain <- createDataPartition(iris$Species, p = 0.7, list = FALSE)
training <- iris[inTrain, ]
testing <- iris[-inTrain, ]
qplot(Petal.Width, Sepal.Width, col = Species, data = training)

modFit <- train(Species ~., method = "rpart", data = training)
modFit$finalModel
n= 105 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 105 70 setosa (0.33333333 0.33333333 0.33333333)  
  2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
  3) Petal.Length>=2.45 70 35 versicolor (0.00000000 0.50000000 0.50000000)  
    6) Petal.Width< 1.75 36  2 versicolor (0.00000000 0.94444444 0.05555556) *
    7) Petal.Width>=1.75 34  1 virginica (0.00000000 0.02941176 0.97058824) *
plot(modFit$finalModel, uniform = TRUE, main = "Classification Tree")
text(modFit$finalModel, use.n = TRUE, all = TRUE, cex = 0.8 )

fancyRpartPlot(modFit$finalModel, main = "Classification Tree")

predict(modFit, newdata = testing)
[1] setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa setosa
[12] setosa setosa setosa setosa versicolor versicolor versicolor versicolor versicolor versicolor versicolor
[23] versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor versicolor virginica virginica
[34] virginica virginica virginica virginica virginica virginica versicolor virginica virginica versicolor virginica
[45] virginica
Levels: setosa versicolor virginica
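
To quantify how the classification tree generalizes, caret's confusionMatrix() compares these predictions with the held-out labels; the call below is an added sketch and does not appear in the original notebook.
# hold-out accuracy, kappa and per-class statistics for the rpart tree
confusionMatrix(predict(modFit, newdata = testing), testing$Species)
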
Bootstrap aggregating (bagging)
data(ozone, package = "ElemStatLearn")
ozone <- ozone[order(ozone$ozone), ]
head(ozone)
## Bagged loess
ll <- matrix(NA, nrow = 10, ncol = 155)
for (i in 1:10) {
  # resample the rows of ozone with replacement
  ss <- sample(1:dim(ozone)[1], replace = TRUE)
  ozone0 <- ozone[ss, ]
  ozone0 <- ozone0[order(ozone0$ozone), ]
  # fit a loess smoother on the resampled data
  loess0 <- loess(temperature ~ ozone, data = ozone0, span = 0.2)
  # predict temperature over a common grid of ozone values
  ll[i, ] <- predict(loess0, newdata = data.frame(ozone = 1:155))
}
plot(ozone$ozone, ozone$temperature, pch = 19, cex = 0.5)
# individual bootstrap loess fits in grey
for (i in 1:10) {
  lines(1:155, ll[i, ], col = "grey", lwd = 2)
}
# bagged (averaged) loess fit in red
lines(1:155, apply(ll, 2, mean), col = "red", lwd = 2)

library(party)
predictors <- data.frame(ozone = ozone$ozone)
temperature <- ozone$temperature
# bag conditional inference trees with caret's bag() and the ctreeBag helpers
treebag <- bag(predictors, temperature, B = 10,
               bagControl = bagControl(fit = ctreeBag$fit,
                                       predict = ctreeBag$pred,
                                       aggregate = ctreeBag$aggregate))
plot(ozone$ozone, temperature, col = "lightgrey", pch = 19)
# fit from a single bagged tree (red) vs. the aggregated bagged prediction (blue)
points(ozone$ozone, predict(treebag$fits[[1]]$fit, predictors), pch = 19, col = "red")
points(ozone$ozone, predict(treebag, predictors), pch = 19, col = "blue")
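
As a rough check on the bagged fit, caret's RMSE() helper can compare the aggregated predictions with the observed temperatures; the call below is an added illustration and is not part of the original notebook.
# RMSE of the aggregated bagged-tree predictions on the training data (illustrative)
RMSE(predict(treebag, predictors), temperature)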

Random Forests
modFit <- train(Species ~., data = training, method = "rf", prox = TRUE)
modFit
Random Forest
105 samples
4 predictor
3 classes: 'setosa', 'versicolor', 'virginica'
No pre-processing
Resampling: Bootstrapped (25 reps)
Summary of sample sizes: 105, 105, 105, 105, 105, 105, ...
Resampling results across tuning parameters:
  mtry  Accuracy   Kappa    
  2     0.9440576  0.9152679
  3     0.9430061  0.9137014
  4     0.9407163  0.9102969
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was mtry = 2.
library(randomForest)
getTree(modFit$finalModel, k = 2)
   left daughter right daughter split var split point status prediction
1              2              3         3        2.60      1          0
2              0              0         0        0.00     -1          1
3              4              5         1        6.15      1          0
4              6              7         3        4.75      1          0
5              8              9         3        5.05      1          0
6              0              0         0        0.00     -1          2
7              0              0         0        0.00     -1          3
8             10             11         1        6.40      1          0
9              0              0         0        0.00     -1          3
10            12             13         4        1.65      1          0
11             0              0         0        0.00     -1          2
12             0              0         0        0.00     -1          2
13             0              0         0        0.00     -1          3
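
Variable importance for the random forest can be inspected with caret's varImp(); this call is an added sketch for illustration and does not appear in the original notebook.
# relative importance of the four predictors in the final random forest
varImp(modFit)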
irisP <- classCenter(training[ ,c(3,4)], training$Species, modFit$finalModel$proximity)
irisP <- as.data.frame(irisP)
irisP$Species <- rownames(irisP)
p <- qplot(Petal.Width, Petal.Length, col = Species, data = training)
p + geom_point(aes(x = Petal.Width, y = Petal.Length, col = Species), size = 5, shape = 14, data = irisP)

pred <- predict(modFit, testing)
testing$predRight <- pred == testing$Species
table(pred, testing$Species)
pred         setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         15         3
  virginica       0          0        12
qplot(Petal.Width, Petal.Length, col = predRight, data = testing, main = "newdata Predictions")
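
caret's confusionMatrix() condenses the table above into overall accuracy, kappa and per-class statistics; this call is an added sketch and is not part of the original notebook.
# hold-out performance summary for the random forest
confusionMatrix(pred, testing$Species)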

Boosting
library(ISLR)
library(gbm)
data("Wage")
inTrain2 <- createDataPartition(y = Wage$wage,
                                p = 0.7, list = FALSE)
training2 <- Wage[inTrain2, ]
testing2 <- Wage[-inTrain2, ]
modFit2 <- train(wage ~., method = "gbm", data = training2, verbose = FALSE)
modFit2
## plot the result
qplot(predict(modFit2, testing2), wage, data = testing2)
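
For a numeric outcome such as wage, postResample() from caret reports hold-out RMSE, R-squared and MAE; the call below is an added illustration, not part of the original notebook.
# hold-out error metrics for the boosted regression model
postResample(predict(modFit2, testing2), testing2$wage)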

Model Based Prediction
library(klaR)
modlda <- train(Species ~., method = "lda", data = training)
modnb <- train(Species ~., method = "nb", data = training)
plda <- predict(modlda, testing)
pnb <- predict(modnb, testing)
table(plda, pnb)
            pnb
plda         setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         14         1
  virginica       0          2        13
equalPredictions <- (plda == pnb)
qplot(Petal.Width, Sepal.Width, col = equalPredictions, data = testing, main = "Comparison of Result")
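
To see how each model-based classifier performs against the true species labels rather than just against each other, confusionMatrix() can be applied to both sets of predictions; these calls are an added sketch, not part of the original notebook.
# hold-out accuracy of linear discriminant analysis and naive Bayes
confusionMatrix(plda, testing$Species)
confusionMatrix(pnb, testing$Species)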
