These notes follow the models presented in the Datacamp course on tree-based models, with the exception of the chapter on regression. Four models are used to solve a binary classification problem, and the caret package is used consistently throughout.
For each model, we consult the list of models available in caret to determine the specifics of the implementation: the required packages, the method name to pass to train(), and the hyperparameters to be tuned. The list is at https://topepo.github.io/caret/available-models.html; the modelLookup() sketch after the package loading below shows the same information from within R.
library(tidyverse)
## ── Attaching packages ───────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rpart)
library(ipred)
library(e1071)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(gbm)
## Loaded gbm 2.1.5
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
library(ranger)
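As a quick cross-check of the online list from within R, caret's modelLookup() reports the tunable hyperparameters for a given method string. A minimal sketch for the four methods used below (the comments summarize what caret reports):
# Tunable hyperparameters for each method used in these notes
modelLookup("rpart")    # cp (complexity parameter)
modelLookup("treebag")  # no tunable parameters
modelLookup("ranger")   # mtry, splitrule, min.node.size
modelLookup("gbm")      # n.trees, interaction.depth, shrinkage, n.minobsinnode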
We will use the German credit data, which can be found in many places on the web; the URL below works. I renamed the target variable to “OK” to reduce typing.
url="http://freakonometrics.free.fr/german_credit.csv"
credit=read.csv(url, header = TRUE, sep = ",")
credit %>% rename(OK = Creditability) %>%
mutate( OK = factor(OK) ) -> credit
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ OK : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
This is the standard caret splitting process.
set.seed(123)
ind = createDataPartition(credit$OK,
p = .8,
list = FALSE)
traind = credit[ind,]
testd = credit[-ind,]
table(traind$OK)
##
## 0 1
## 240 560
table(testd$OK)
##
## 0 1
## 60 140
myTc = trainControl(method = "cv",number = 10)
mod_rpart <- train(OK ~ .,
method = "rpart",
data = traind,
tuneLength = 10,
metric = "Accuracy",
trControl = myTc)
mod_rpart$bestTune
## cp
## 5 0.03055556
pred_rpart = predict(mod_rpart,newdata = testd)
confusionMatrix(pred_rpart,testd$OK)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 25 23
## 1 35 117
##
## Accuracy : 0.71
## 95% CI : (0.6418, 0.7718)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.4123
##
## Kappa : 0.2677
##
## Mcnemar's Test P-Value : 0.1486
##
## Sensitivity : 0.4167
## Specificity : 0.8357
## Pos Pred Value : 0.5208
## Neg Pred Value : 0.7697
## Prevalence : 0.3000
## Detection Rate : 0.1250
## Detection Prevalence : 0.2400
## Balanced Accuracy : 0.6262
##
## 'Positive' Class : 0
##
mod_bag <- train(OK ~ .,
method = "treebag",
data = traind,
trControl = myTc,
metric = "Accuracy")
pred_bag = predict(mod_bag,newdata = testd)
confusionMatrix(pred_bag,testd$OK)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 31 31
## 1 29 109
##
## Accuracy : 0.7
## 95% CI : (0.6314, 0.7626)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.5348
##
## Kappa : 0.2925
##
## Mcnemar's Test P-Value : 0.8973
##
## Sensitivity : 0.5167
## Specificity : 0.7786
## Pos Pred Value : 0.5000
## Neg Pred Value : 0.7899
## Prevalence : 0.3000
## Detection Rate : 0.1550
## Detection Prevalence : 0.3100
## Balanced Accuracy : 0.6476
##
## 'Positive' Class : 0
##
mod_bag
## Bagged CART
##
## 800 samples
## 20 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ...
## Resampling results:
##
## Accuracy Kappa
## 0.76 0.3947859
mod_ranger <- train(OK ~ .,
method = "ranger",
data = traind,
tuneLength = 10,
metric = "Accuracy",
trControl = myTc)
mod_ranger$bestTune
## mtry splitrule min.node.size
## 16 16 extratrees 1
pred_ranger = predict(mod_ranger,newdata = testd)
confusionMatrix(pred_ranger,testd$OK)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 32 24
## 1 28 116
##
## Accuracy : 0.74
## 95% CI : (0.6734, 0.7993)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.1228
##
## Kappa : 0.3689
##
## Mcnemar's Test P-Value : 0.6774
##
## Sensitivity : 0.5333
## Specificity : 0.8286
## Pos Pred Value : 0.5714
## Neg Pred Value : 0.8056
## Prevalence : 0.3000
## Detection Rate : 0.1600
## Detection Prevalence : 0.2800
## Balanced Accuracy : 0.6810
##
## 'Positive' Class : 0
##
Next, gradient boosting via gbm.
mod_gbm <- train(OK ~ .,
method = "gbm",
data = traind,
tuneLength = 10,
metric = "Accuracy",
trControl = myTc,
verbose = FALSE)
mod_gbm$bestTune
## n.trees interaction.depth shrinkage n.minobsinnode
## 20 500 2 0.1 10
pred_gbm = predict(mod_gbm,newdata = testd)
confusionMatrix(pred_gbm,testd$OK)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 37 26
## 1 23 114
##
## Accuracy : 0.755
## 95% CI : (0.6894, 0.8129)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.05059
##
## Kappa : 0.4249
##
## Mcnemar's Test P-Value : 0.77510
##
## Sensitivity : 0.6167
## Specificity : 0.8143
## Pos Pred Value : 0.5873
## Neg Pred Value : 0.8321
## Prevalence : 0.3000
## Detection Rate : 0.1850
## Detection Prevalence : 0.3150
## Balanced Accuracy : 0.7155
##
## 'Positive' Class : 0
##
caret makes it easy to compare a collection of models fit to the same target: collect the fitted models with resamples() and summarize the cross-validated results.
results <- resamples(list(rpart=mod_rpart, bag=mod_bag, ranger=mod_ranger, gbm = mod_gbm))
# summarize the distributions
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: rpart, bag, ranger, gbm
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 0.7000 0.737500 0.76875 0.77125 0.800000 0.8625 0
## bag 0.6875 0.728125 0.76250 0.76000 0.784375 0.8625 0
## ranger 0.7375 0.753125 0.77500 0.77750 0.806250 0.8250 0
## gbm 0.7125 0.753125 0.78125 0.78500 0.828125 0.8500 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## rpart 0.2105263 0.3181818 0.3961538 0.4067088 0.4871795 0.6518987 0
## bag 0.2283951 0.3149805 0.3795996 0.3947859 0.4509710 0.6686747 0
## ranger 0.3000000 0.3789975 0.4230769 0.4241405 0.4730594 0.5394737 0
## gbm 0.2532468 0.3499493 0.4574452 0.4542740 0.5664557 0.6428571 0
# boxplots of results
bwplot(results)
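The resamples object also supports formal pairwise comparisons. A minimal sketch using caret's diff() method on the resamples object (output not shown here):
# Pairwise differences in resampled Accuracy and Kappa between models,
# summarized with paired tests and a multiple-comparison adjustment
diffs <- diff(results)
summary(diffs)
dotplot(diffs)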