Load the necessary packages

library(rpart)         # decision tree model
library(rpart.plot)    # decision tree diagram
library(wrapr)         # formula
library(randomForest)  # random forest model
library(GA)            # genetic algorithm for hyperparameter tuning
library(rsample)       # stratified sampling
library(xgboost)       # gradient boosting
library(ggplot2)       # plotting
library(sigr)          # AUC

Model performance function

performance=function(y,pred){
  # confusion matrix at a 0.5 probability cut-off
  confmat_test=table(truth=y,predict=pred>0.5)
  # accuracy: share of correctly classified observations
  acc=sum(diag(confmat_test))/sum(confmat_test)
  # precision: share of predicted positives that are truly positive
  precision=confmat_test[2,2]/sum(confmat_test[,2])
  # recall: share of actual positives that are predicted positive
  recall=confmat_test[2,2]/sum(confmat_test[2,])
  # area under the ROC curve (calcAUC from sigr)
  auc=calcAUC(pred,y)
  c(acc,precision,recall,auc)
}
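
A quick illustration of how the function behaves (the toy vectors below are made up for this example): the 0.5 cut-off turns predicted probabilities into class labels before the confusion matrix is built, and calcAUC() from sigr computes the AUC from the raw probabilities.

y_toy=c(0,0,1,1,1)              # hypothetical true labels
pred_toy=c(0.2,0.6,0.8,0.4,0.9) # hypothetical predicted probabilities
performance(y_toy,pred_toy)     # accuracy, precision, recall, AUC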

Load the data and split it into training and test data.

data=read.csv('diabetes.csv')
# predictor columns: everything except the target "Outcome"
vars=setdiff(colnames(data),"Outcome")

set.seed(123)
# random ~80/20 split into training and test data
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]
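
With a purely random split like the one above, the share of positive cases can drift between training and test data. A stratified split with rsample (loaded above) is one alternative; the sketch below uses new *_strat names so the original split stays untouched.

split_strat=initial_split(data,prop = 0.8,strata = "Outcome")
train_strat=training(split_strat)
test_strat=testing(split_strat)
prop.table(table(train_strat$Outcome))  # class balance close to the full dataset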

Decision tree

# formula
data_formula=mk_formula("Outcome",vars)
# decision tree model
treemodel=rpart(data_formula,train,method = "class")

# Plot decision tree diagram
rpart.plot(treemodel,type=5,extra=2,cex=0.65)

Evaluate the decision tree model

pred=predict(treemodel,newdata = train)[,2]
perf_train=performance(train$Outcome,pred)
pred=predict(treemodel,newdata=test)[,2]
perf_test=performance(test$Outcome,pred)
perf=rbind(perf_train,perf_test)
colnames(perf)=c("accuracy","precision","recall","AUC")
perf
##             accuracy precision    recall       AUC
## perf_train 0.8450245 0.8146067 0.7004831 0.8655196
## perf_test  0.6838710 0.6250000 0.4918033 0.6915766
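
The gap between training and test performance suggests the tree overfits. One standard follow-up, sketched below, is to inspect rpart's cross-validated complexity table and prune at the cp value with the lowest cross-validated error.

printcp(treemodel)   # cross-validated error for each complexity value
best_cp=treemodel$cptable[which.min(treemodel$cptable[,"xerror"]),"CP"]
pruned=prune(treemodel,cp=best_cp)
rpart.plot(pruned,type=5,extra=2,cex=0.65)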

Random forest

# Fitness function for random forest hyperparameter optimization
# x[1] refers to maxnodes, x[2] refers to nodesize
rf_param_opt=function(x) {
  # further split the training dataset
  split=initial_split(train,prop = 0.8,strata = "Outcome")
  train_split=training(split)
  test_split=testing(split)
  # Train on train_split with the candidate values x[1] and x[2], then evaluate the model
  # on test_split. ntree is set to a large number; maxnodes and nodesize must be integers,
  # hence the floor() calls.
  model_rf=randomForest(train_split[,vars],y=as.factor(train_split$Outcome),ntree=1000,
                        maxnodes = floor(x[1]),nodesize=floor(x[2]))
  pred=predict(model_rf,test_split[,vars])
  # accuracy
  mean(test_split$Outcome==pred)
}
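
Before handing the function to the GA, it can be called directly with a candidate pair (30 and 5 below are arbitrary example values). This also shows why floor() is applied inside the function: the GA proposes real-valued candidates.

rf_param_opt(c(30,5))   # accuracy on the held-out split for maxnodes=30, nodesize=5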

# GA hyperparameter optimization: maxnodes searched in [2,50], nodesize in [1,10]
GA=ga(type="real-valued",fitness = rf_param_opt, lower = c(2,1),upper = c(50,10),
      popSize = 10, maxiter = 100,run=20,seed=1)
summary(GA)
## -- Genetic Algorithm ------------------- 
## 
## GA settings: 
## Type                  =  real-valued 
## Population size       =  10 
## Number of generations =  100 
## Elitism               =  1 
## Crossover probability =  0.8 
## Mutation probability  =  0.1 
## Search domain = 
##       x1 x2
## lower  2  1
## upper 50 10
## 
## GA results: 
## Iterations             = 42 
## Fitness function value = 0.8709677 
## Solution = 
##           x1      x2
## [1,] 26.3418 5.03413
plot(GA)
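
Because the GA searches over real-valued parameters, the best solution has to be truncated to integers before it is passed to randomForest; the training chunk below applies the same floor() call. A small extraction sketch:

# GA@solution is a matrix with one row per best solution found
(best_maxnodes=floor(GA@solution[1]))
(best_nodesize=floor(GA@solution[2]))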

Training and evaluation

model_rf=randomForest(train[,vars],y=as.factor(train$Outcome),ntree=1000,importance = TRUE,
                      maxnodes = floor(GA@solution[1]),nodesize = floor(GA@solution[2]))
pred=predict(model_rf,train[,vars],type="prob")[,2]
perf_train=performance(train$Outcome,pred)
pred=predict(model_rf,test[,vars],type="prob")[,2]
perf_test=performance(test$Outcome,pred)
perf_rf=rbind(perf_train,perf_test)
colnames(perf_rf)=c("accuracy","precision","recall","AUC")
perf_rf
##             accuracy precision    recall       AUC
## perf_train 0.8548124 0.8554217 0.6859903 0.9395183
## perf_test  0.7161290 0.7297297 0.4426230 0.8020579

Variable importance plot

varImpPlot(model_rf,type=1)
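
The plot is backed by a matrix that can also be read directly; a minimal sketch using importance() from randomForest (type=1 is the permutation-based mean decrease in accuracy, available because the model was fitted with importance=TRUE).

imp=importance(model_rf,type=1)                  # mean decrease in accuracy per variable
imp[order(imp[,1],decreasing=TRUE),,drop=FALSE]  # most important variables first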

Gradient boosting

input=as.matrix(train[,vars])
cv=xgb.cv(input,label=train$Outcome,params = list(objective="binary:logistic",eta=0.005,max_depth=3),
          nfold = 5,nrounds = 1000,print_every_n = 50,metrics = "logloss")
## [1]  train-logloss:0.690970+0.000055 test-logloss:0.691324+0.000186 
## [51] train-logloss:0.603988+0.002307 test-logloss:0.620406+0.008091 
## [101]    train-logloss:0.546129+0.003548 test-logloss:0.575625+0.013352 
## [151]    train-logloss:0.504917+0.004672 test-logloss:0.545006+0.015776 
## [201]    train-logloss:0.473902+0.005201 test-logloss:0.523205+0.017620 
## [251]    train-logloss:0.449737+0.005506 test-logloss:0.508528+0.019729 
## [301]    train-logloss:0.430507+0.006177 test-logloss:0.497874+0.022640 
## [351]    train-logloss:0.414860+0.006658 test-logloss:0.490847+0.025479 
## [401]    train-logloss:0.401713+0.007343 test-logloss:0.485834+0.027032 
## [451]    train-logloss:0.390165+0.007872 test-logloss:0.482588+0.028371 
## [501]    train-logloss:0.379946+0.007947 test-logloss:0.480210+0.029218 
## [551]    train-logloss:0.371043+0.008078 test-logloss:0.478908+0.030270 
## [601]    train-logloss:0.363336+0.008128 test-logloss:0.477902+0.031145 
## [651]    train-logloss:0.356254+0.007993 test-logloss:0.476618+0.031664 
## [701]    train-logloss:0.349894+0.007933 test-logloss:0.475636+0.032399 
## [751]    train-logloss:0.344017+0.007896 test-logloss:0.475432+0.033030 
## [801]    train-logloss:0.338446+0.007641 test-logloss:0.475873+0.033181 
## [851]    train-logloss:0.333145+0.007603 test-logloss:0.476466+0.033592 
## [901]    train-logloss:0.328216+0.007582 test-logloss:0.476788+0.033913 
## [951]    train-logloss:0.323446+0.007448 test-logloss:0.476962+0.033349 
## [1000]   train-logloss:0.318955+0.007346 test-logloss:0.477156+0.032818
evalframe=as.data.frame(cv$evaluation_log)
(ntree=which.min(evalframe$test_logloss_mean))
## [1] 736

Plot the cross-validated log loss and mark the optimal number of iterations.

ggplot(evalframe,aes(x=iter,y=test_logloss_mean)) +
  geom_line() +
  geom_vline(xintercept = ntree, linetype=2, color="red")
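
As an alternative to scanning the full evaluation log, xgb.cv can stop on its own once the test log loss has stopped improving; a sketch, where early_stopping_rounds=50 is an assumed value rather than something used above.

cv_es=xgb.cv(input,label=train$Outcome,params = list(objective="binary:logistic",eta=0.005,max_depth=3),
             nfold = 5,nrounds = 1000,metrics = "logloss",early_stopping_rounds = 50,verbose = FALSE)
cv_es$best_iteration   # comparable to the ntree found above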

Use the optimal parameter settings to train the model and evaluate its performance

# Use the optimal parameter settings
model=xgboost(data=input,label=train$Outcome,params = list(objective="binary:logistic",eta=0.005,max_depth=3),
              nrounds = ntree,verbose = FALSE)
pred=predict(model,input)
perf_train=performance(train$Outcome,pred)
test_input=as.matrix(test[,vars])
pred=predict(model,test_input)
perf_test=performance(test$Outcome,pred)
perf_gb=rbind(perf_train,perf_test)
colnames(perf_gb)=c("accuracy","precision","recall","AUC")
perf_gb
##             accuracy precision    recall       AUC
## perf_train 0.8482871 0.8238636 0.7004831 0.9213607
## perf_test  0.7225806 0.6875000 0.5409836 0.7847925
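
To compare the three models side by side, the test-set rows of the performance tables built above (perf, perf_rf, perf_gb) can be stacked into one table; a short wrap-up sketch.

perf_compare=rbind(decision_tree=perf["perf_test",],
                   random_forest=perf_rf["perf_test",],
                   gradient_boosting=perf_gb["perf_test",])
perf_compare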

Variable importance matrix and plot

importance_matrix=xgb.importance(vars,model=model)
xgb.plot.importance(importance_matrix ,rel_to_first = TRUE)

Effect of several important variables on the predicted onset of diabetes (SHAP plots)

xgb.plot.shap(test_input, model = model, features = c("Glucose","BMI","Age"))