Load the necessary packages.
library(rpart) # decision tree model
library(rpart.plot) # decision tree diagram
library(wrapr) # formula
library(randomForest) # random forest model
library(GA) # genetic algorithm for hyperparameter tuning
library(rsample) # stratified sampling
library(xgboost) # gradient boosting
library(ggplot2) # beautiful plot
library(sigr) # AUC
Define a model performance function that returns the accuracy, precision, recall, and AUC.
performance=function(y,pred) {
  # confusion matrix at a 0.5 probability threshold
  confmat_test=table(truth=y,predict=pred>0.5)
  acc=sum(diag(confmat_test))/sum(confmat_test)
  precision=confmat_test[2,2]/sum(confmat_test[,2])
  recall=confmat_test[2,2]/sum(confmat_test[2,])
  auc=calcAUC(pred,y)
  c(acc,precision,recall,auc)
}
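As a quick sanity check, the function can be run on small synthetic data. This is an illustrative sketch only; y_demo and pred_demo are made-up objects and the numbers themselves are not meaningful.
# sanity check on synthetic labels and scores (illustrative only)
set.seed(42)
y_demo=rbinom(200,1,0.4)
pred_demo=ifelse(y_demo==1,0.7,0.3)+rnorm(200,0,0.2)
performance(y_demo,pred_demo) # accuracy, precision, recall, AUC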
Load the data and split it into training (about 80%) and test (about 20%) sets.
data=read.csv('diabetes.csv')
vars=setdiff(colnames(data),"Outcome")
set.seed(123)
randn=runif(nrow(data))
train_idx=randn<=0.8
train=data[train_idx,]
test=data[!train_idx,]
# formula
data_formula=mk_formula("Outcome",vars)
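# Note: mk_formula("Outcome",vars) from wrapr just builds the two-sided formula
# Outcome ~ Glucose + BMI + ... over every column in vars; a base-R equivalent
# (an illustrative sketch, not used below) would be:
# data_formula=as.formula(paste("Outcome ~",paste(vars,collapse=" + ")))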
# decision tree model
treemodel=rpart(data_formula,train,method = "class")
# Plot decision tree diagram
rpart.plot(treemodel,type=5,extra=2,cex=0.65)
Evaluate the decision tree model on the training and test data.
# predicted probability of class 1 (diabetes) on the training and test data
pred=predict(treemodel,newdata=train)[,2]
perf_train=performance(train$Outcome,pred)
pred=predict(treemodel,newdata=test)[,2]
perf_test=performance(test$Outcome,pred)
perf=rbind(perf_train,perf_test)
colnames(perf)=c("accuracy","precision","recall","AUC")
perf
## accuracy precision recall AUC
## perf_train 0.8450245 0.8146067 0.7004831 0.8655196
## perf_test 0.6838710 0.6250000 0.4918033 0.6915766
# fitness function for random forest hyperparameter optimization
# x[1] is maxnodes, x[2] is nodesize
rf_param_opt=function(x) {
  # further split the training data, stratified on the outcome
  split=initial_split(train,prop = 0.8,strata = "Outcome")
  train_split=training(split)
  test_split=testing(split)
  # train on train_split with the candidate maxnodes/nodesize and evaluate on
  # test_split; ntree is fixed at a large value, and both parameters must be integers
  model_rf=randomForest(train_split[,vars],y=as.factor(train_split$Outcome),ntree=1000,
                        maxnodes = floor(x[1]),nodesize=floor(x[2]))
  pred=predict(model_rf,test_split[,vars])
  # return the accuracy as the fitness value
  mean(test_split$Outcome==pred)
}
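Note that rf_param_opt draws a fresh random split on every call, so the fitness the GA sees is noisy. One optional way to stabilize it, sketched below but not used for the results that follow, is to average the accuracy over several resamples (nrep = 3 is an arbitrary choice).
# optional, not used below: a less noisy fitness that averages accuracy
# over several stratified splits
rf_param_opt_avg=function(x,nrep=3) {
  accs=replicate(nrep,{
    split=initial_split(train,prop = 0.8,strata = "Outcome")
    train_split=training(split)
    test_split=testing(split)
    model_rf=randomForest(train_split[,vars],y=as.factor(train_split$Outcome),ntree=1000,
                          maxnodes = floor(x[1]),nodesize=floor(x[2]))
    mean(test_split$Outcome==predict(model_rf,test_split[,vars]))
  })
  mean(accs)
}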
# GA parameter optimization
GA=ga(type="real-valued",fitness = rf_param_opt, lower = c(2,1),upper = c(50,10),
popSize = 10, maxiter = 100,run=20,seed=1)
summary(GA)
## -- Genetic Algorithm -------------------
##
## GA settings:
## Type = real-valued
## Population size = 10
## Number of generations = 100
## Elitism = 1
## Crossover probability = 0.8
## Mutation probability = 0.1
## Search domain =
## x1 x2
## lower 2 1
## upper 50 10
##
## GA results:
## Iterations = 42
## Fitness function value = 0.8709677
## Solution =
## x1 x2
## [1,] 26.3418 5.03413
plot(GA)
Train the random forest on the full training set with the optimized hyperparameters and evaluate it.
model_rf=randomForest(train[,vars],y=as.factor(train$Outcome),ntree=1000,importance = TRUE,
maxnodes = floor(GA@solution[1]),nodesize = floor(GA@solution[2]))
pred=predict(model_rf,train[,vars],type="prob")[,2]
perf_train=performance(train$Outcome,pred)
pred=predict(model_rf,test[,vars],type="prob")[,2]
perf_test=performance(test$Outcome,pred)
perf_rf=rbind(perf_train,perf_test)
colnames(perf_rf)=c("accuracy","precision","recall","AUC")
perf_rf
## accuracy precision recall AUC
## perf_train 0.8548124 0.8554217 0.6859903 0.9395183
## perf_test 0.7161290 0.7297297 0.4426230 0.8020579
Variable importance plot for the random forest (type = 1: mean decrease in accuracy).
varImpPlot(model_rf,type=1)
# xgboost requires a numeric matrix of predictors
input=as.matrix(train[,vars])
# 5-fold cross-validation to choose the number of boosting rounds
cv=xgb.cv(data=input,label=train$Outcome,params = list(objective="binary:logistic",eta=0.005,max_depth=3),
          nfold = 5,nrounds = 1000,print_every_n = 50,metrics = "logloss")
## [1] train-logloss:0.690970+0.000055 test-logloss:0.691324+0.000186
## [51] train-logloss:0.603988+0.002307 test-logloss:0.620406+0.008091
## [101] train-logloss:0.546129+0.003548 test-logloss:0.575625+0.013352
## [151] train-logloss:0.504917+0.004672 test-logloss:0.545006+0.015776
## [201] train-logloss:0.473902+0.005201 test-logloss:0.523205+0.017620
## [251] train-logloss:0.449737+0.005506 test-logloss:0.508528+0.019729
## [301] train-logloss:0.430507+0.006177 test-logloss:0.497874+0.022640
## [351] train-logloss:0.414860+0.006658 test-logloss:0.490847+0.025479
## [401] train-logloss:0.401713+0.007343 test-logloss:0.485834+0.027032
## [451] train-logloss:0.390165+0.007872 test-logloss:0.482588+0.028371
## [501] train-logloss:0.379946+0.007947 test-logloss:0.480210+0.029218
## [551] train-logloss:0.371043+0.008078 test-logloss:0.478908+0.030270
## [601] train-logloss:0.363336+0.008128 test-logloss:0.477902+0.031145
## [651] train-logloss:0.356254+0.007993 test-logloss:0.476618+0.031664
## [701] train-logloss:0.349894+0.007933 test-logloss:0.475636+0.032399
## [751] train-logloss:0.344017+0.007896 test-logloss:0.475432+0.033030
## [801] train-logloss:0.338446+0.007641 test-logloss:0.475873+0.033181
## [851] train-logloss:0.333145+0.007603 test-logloss:0.476466+0.033592
## [901] train-logloss:0.328216+0.007582 test-logloss:0.476788+0.033913
## [951] train-logloss:0.323446+0.007448 test-logloss:0.476962+0.033349
## [1000] train-logloss:0.318955+0.007346 test-logloss:0.477156+0.032818
evalframe=as.data.frame(cv$evaluation_log)
(ntree=which.min(evalframe$test_logloss_mean))
## [1] 736
Plot the cross-validated log loss and mark the optimal number of boosting rounds with a vertical line.
ggplot(evalframe,aes(x=iter,y=test_logloss_mean)) +
geom_line() +
geom_vline(xintercept = ntree, linetype=2, color="red")
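An alternative to reading the optimum off the evaluation log is xgboost's built-in early stopping. The sketch below is not what was run above, and early_stopping_rounds = 50 is an arbitrary choice.
# alternative (not run here): stop once the test log loss has not improved for
# 50 consecutive rounds, then read off the best iteration
cv_es=xgb.cv(data=input,label=train$Outcome,
             params = list(objective="binary:logistic",eta=0.005,max_depth=3),
             nfold = 5,nrounds = 1000,metrics = "logloss",
             early_stopping_rounds = 50,verbose = FALSE)
cv_es$best_iteration # comparable to ntree above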
Use the optimal number of boosting rounds to train the final model and evaluate its performance.
# Use the optimal parameter settings
model=xgboost(data=input,label=train$Outcome,params = list(objective="binary:logistic",eta=0.005,max_depth=3),
nrounds = ntree,verbose = FALSE)
pred=predict(model,input)
perf_train=performance(train$Outcome,pred)
test_input=as.matrix(test[,vars])
pred=predict(model,test_input)
perf_test=performance(test$Outcome,pred)
perf_gb=rbind(perf_train,perf_test)
colnames(perf_gb)=c("accuracy","precision","recall","AUC")
perf_gb
## accuracy precision recall AUC
## perf_train 0.8482871 0.8238636 0.7004831 0.9213607
## perf_test 0.7225806 0.6875000 0.5409836 0.7847925
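To compare the three models directly, stack the test-set rows of the performance tables computed above into one table.
# side-by-side test-set comparison of the three models
perf_compare=rbind(decision_tree=perf["perf_test",],
                   random_forest=perf_rf["perf_test",],
                   gradient_boosting=perf_gb["perf_test",])
perf_compare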
Compute and plot the variable importance for the gradient boosting model.
importance_matrix=xgb.importance(vars,model=model)
xgb.plot.importance(importance_matrix, rel_to_first = TRUE)
Effects of several important variables (Glucose, BMI, Age) on the predicted onset of diabetes, shown as SHAP contributions.
xgb.plot.shap(test_input, model = model, features = c("Glucose","BMI","Age"))