Libraries
library(tidyverse)
library(caret)
library(rpart)
library(rattle)
library(randomForest)
library(gbm)
Load data
data <- iris
head(data)
str(data)
'data.frame': 150 obs. of 5 variables:
$ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
$ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
$ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
$ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
$ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
Data Exploration and Visualization
summary(data)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100 setosa :50
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
Median :5.800 Median :3.000 Median :4.350 Median :1.300 virginica :50
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
qplot(Petal.Length, Petal.Width, colour=Species, data=data)

pairs(data[c("Sepal.Length","Sepal.Width","Petal.Length","Petal.Width")],
main="Matrix plot", pch=22,
bg=c("red", "yellow", "blue")[unclass(data$Species)])

Data Preparation
set.seed(123)
train.flag <- createDataPartition(y=data$Species, p=0.5, list=FALSE)
training <- data[ train.flag, ]
Validation <- data[-train.flag, ]
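
Because createDataPartition() samples within each level of Species, the 50/50/50 class balance should carry over to both splits. A quick sanity check on the objects created above (an optional addition, not part of the original notebook):
# each set should contain 25 observations per species
table(training$Species)
table(Validation$Species)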
Decision Tree Modelling
Model Fitting
modfit.CART <- train(Species~., method="rpart", data=training)
fancyRpartPlot(modfit.CART$finalModel)
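
By default, train() with method="rpart" tunes the complexity parameter cp via bootstrap resampling and keeps the best value for the final tree. A quick, optional check (not part of the original notebook):
print(modfit.CART)      # resampling accuracy across the candidate cp values
modfit.CART$bestTune    # the cp value used for the plotted final model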

Prediction and Evaluation
Predict train set
cm.train <- list()   # collect per-model confusion-matrix results on the training set
cm.test  <- list()   # collect per-model confusion-matrix results on the validation set
train.cart <- predict(modfit.CART, newdata=training)
mtab.cart.train <- table(train.cart, training$Species)
cm.train[[1]] <- c("Classification and Regression Trees","CART",confusionMatrix(mtab.cart.train))
cm.train[[1]]$table
train.cart setosa versicolor virginica
setosa 25 0 0
versicolor 0 22 0
virginica 0 3 25
cm.train[[1]]$overall[1]
Accuracy
0.96
cm.train[[1]]$byClass
Sensitivity Specificity Pos Pred Value Neg Pred Value Precision Recall F1 Prevalence Detection Rate
Class: setosa 1.00 1.00 1.0000000 1.0000000 1.0000000 1.00 1.0000000 0.3333333 0.3333333
Class: versicolor 0.88 1.00 1.0000000 0.9433962 1.0000000 0.88 0.9361702 0.3333333 0.2933333
Class: virginica 1.00 0.94 0.8928571 1.0000000 0.8928571 1.00 0.9433962 0.3333333 0.3333333
Detection Prevalence Balanced Accuracy
Class: setosa 0.3333333 1.00
Class: versicolor 0.2933333 0.94
Class: virginica 0.3733333 0.97
Predict test set
pred.cart <- predict(modfit.CART, newdata=Validation)
Evaluation
mtab.cart.test <- table(pred.cart, Validation$Species)
cm.test[[1]] <- c("Classification and Regression Trees","CART",confusionMatrix(mtab.cart.test))
cm.test[[1]]$table
pred.cart setosa versicolor virginica
setosa 25 0 0
versicolor 0 22 1
virginica 0 3 24
cm.test[[1]]$overall[1]
Accuracy
0.9466667
cm.test[[1]]$byClass
Sensitivity Specificity Pos Pred Value Neg Pred Value Precision Recall F1 Prevalence Detection Rate
Class: setosa 1.00 1.00 1.0000000 1.0000000 1.0000000 1.00 1.0000000 0.3333333 0.3333333
Class: versicolor 0.88 0.98 0.9565217 0.9423077 0.9565217 0.88 0.9166667 0.3333333 0.2933333
Class: virginica 0.96 0.94 0.8888889 0.9791667 0.8888889 0.96 0.9230769 0.3333333 0.3200000
Detection Prevalence Balanced Accuracy
Class: setosa 0.3333333 1.00
Class: versicolor 0.3066667 0.93
Class: virginica 0.3600000 0.95
# visualize the cases for which the prediction went wrong
correct.CART <- pred.cart == Validation$Species
qplot(Petal.Length, Petal.Width, colour=correct.CART, data=Validation)

Random Forest Modelling
Model Fitting
modfit.RF <- train(Species~ ., method="rf", data=training)
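
Beyond accuracy, the fitted forest can report which predictors matter most. A minimal sketch using caret's varImp() on the fitted model (an optional addition, not in the original notebook):
# relative variable importance (mean decrease in Gini for a default classification forest)
varImp(modfit.RF)
plot(varImp(modfit.RF))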
Prediction and Evaluation
Predict train set
train.RF <- predict(modfit.RF, training)
mtab.RF.train <- table(train.RF, training$Species)
cm.train[[2]] <- c("Random Forest","RF",confusionMatrix(mtab.RF.train))
cm.train[[2]]$table
train.RF setosa versicolor virginica
setosa 25 0 0
versicolor 0 25 0
virginica 0 0 25
cm.train[[2]]$overall[1]
Accuracy
1
cm.train[[2]]$byClass
Sensitivity Specificity Pos Pred Value Neg Pred Value Precision Recall F1 Prevalence Detection Rate Detection Prevalence
Class: setosa 1 1 1 1 1 1 1 0.3333333 0.3333333 0.3333333
Class: versicolor 1 1 1 1 1 1 1 0.3333333 0.3333333 0.3333333
Class: virginica 1 1 1 1 1 1 1 0.3333333 0.3333333 0.3333333
Balanced Accuracy
Class: setosa 1
Class: versicolor 1
Class: virginica 1
Predict test set
pred.RF <- predict(modfit.RF, newdata=Validation)
Evaluation
mtab.RF.test <- table(pred.RF, Validation$Species)
cm.test[[2]] <- c("Random Forest","RF",confusionMatrix(mtab.RF.test))
cm.test[[2]]$table
pred.RF setosa versicolor virginica
setosa 25 0 0
versicolor 0 23 2
virginica 0 2 23
cm.test[[2]]$overall[1]
Accuracy
0.9466667
cm.test[[2]]$byClass
Sensitivity Specificity Pos Pred Value Neg Pred Value Precision Recall F1 Prevalence Detection Rate Detection Prevalence
Class: setosa 1.00 1.00 1.00 1.00 1.00 1.00 1.00 0.3333333 0.3333333 0.3333333
Class: versicolor 0.92 0.96 0.92 0.96 0.92 0.92 0.92 0.3333333 0.3066667 0.3333333
Class: virginica 0.92 0.96 0.92 0.96 0.92 0.92 0.92 0.3333333 0.3066667 0.3333333
Balanced Accuracy
Class: setosa 1.00
Class: versicolor 0.94
Class: virginica 0.94
Gradient Boosting Modelling
Model Fitting
fitControl <- trainControl(method = "cv", number = 10)          # 10-fold cross-validation
tune_Grid <- expand.grid(interaction.depth = 2, n.trees = 500,
                         shrinkage = 0.1, n.minobsinnode = 10)  # one fixed hyperparameter combination
set.seed(123)
modfit.gbm <- train(Species~ ., method="gbm", data=training,
trControl = fitControl, verbose = FALSE, tuneGrid=tune_Grid)
Prediction and Evaluation
Predict train set
train.gbm <- predict(modfit.gbm, training)
mtab.gbm.train <- table(train.gbm, training$Species)
cm.train[[3]] <- c("Gradient Boosted Machine","GBM",confusionMatrix(mtab.gbm.train))
cm.train[[3]]$table
train.gbm setosa versicolor virginica
setosa 25 0 0
versicolor 0 25 0
virginica 0 0 25
cm.train[[3]]$overall[1]
Accuracy
1
cm.train[[3]]$byClass
Sensitivity Specificity Pos Pred Value Neg Pred Value Precision Recall F1 Prevalence Detection Rate Detection Prevalence
Class: setosa 1 1 1 1 1 1 1 0.3333333 0.3333333 0.3333333
Class: versicolor 1 1 1 1 1 1 1 0.3333333 0.3333333 0.3333333
Class: virginica 1 1 1 1 1 1 1 0.3333333 0.3333333 0.3333333
Balanced Accuracy
Class: setosa 1
Class: versicolor 1
Class: virginica 1
Predict test set
pred.gbm <- predict(modfit.gbm, newdata=Validation)
Evaluation
mtab.gbm.test <- table(pred.gbm, Validation$Species)
cm.test[[3]] <- c("Gradient Boosted Machine","GBM",confusionMatrix(mtab.gbm.test))
cm.test[[3]]$table
pred.gbm setosa versicolor virginica
setosa 25 0 0
versicolor 0 23 3
virginica 0 2 22
cm.test[[3]]$overall[1]
Accuracy
0.9333333
cm.test[[3]]$byClass
Sensitivity Specificity Pos Pred Value Neg Pred Value Precision Recall F1 Prevalence Detection Rate
Class: setosa 1.00 1.00 1.0000000 1.0000000 1.0000000 1.00 1.0000000 0.3333333 0.3333333
Class: versicolor 0.92 0.94 0.8846154 0.9591837 0.8846154 0.92 0.9019608 0.3333333 0.3066667
Class: virginica 0.88 0.96 0.9166667 0.9411765 0.9166667 0.88 0.8979592 0.3333333 0.2933333
Detection Prevalence Balanced Accuracy
Class: setosa 0.3333333 1.00
Class: versicolor 0.3466667 0.93
Class: virginica 0.3200000 0.92
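
Each element of cm.test holds the model's full name, its abbreviation, and the confusionMatrix() results, so the validation accuracies can be pulled out for a side-by-side comparison. A minimal sketch using only the objects built above (not part of the original notebook):
# compare validation-set accuracy across CART, RF and GBM
accuracy.compare <- data.frame(
  Model    = sapply(cm.test, `[[`, 2),
  Accuracy = sapply(cm.test, function(x) unname(x$overall["Accuracy"]))
)
accuracy.compare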