library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(df$default, p = 0.80, list = FALSE)
# 80% training data
train.df <- df[trainIndex, ]
# 20% testing data
test.df <- df[-trainIndex, ]
dim(train.df)
## [1] 8001 4
dim(test.df)
## [1] 1999 4
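Since createDataPartition() samples within each class, the Yes/No ratio of default should be roughly the same in both splits. A quick check (a sketch, assuming default is a factor with levels 'No' and 'Yes', as the model summary below confirms):
# class proportions should be nearly identical across the two splits
prop.table(table(train.df$default))
prop.table(table(test.df$default))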
set.seed(123)
# 10-fold cross-validation setup
trctrl <- trainControl(method = "cv", number = 10)
# fitting boosting classification model
ModelXGboost <- train(default ~ .,
                      data = train.df,
                      method = "xgbTree",
                      trControl = trctrl)
# model summary
ModelXGboost
## eXtreme Gradient Boosting
##
## 8001 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 7200, 7200, 7201, 7202, 7201, 7202, ...
## Resampling results across tuning parameters:
##
## eta max_depth colsample_bytree subsample nrounds Accuracy Kappa
## 0.3 1 0.6 0.50 50 0.9713793 0.3913068
## 0.3 1 0.6 0.50 100 0.9720040 0.4006042
## 0.3 1 0.6 0.50 150 0.9723789 0.4099108
## 0.3 1 0.6 0.75 50 0.9716292 0.3799901
## 0.3 1 0.6 0.75 100 0.9717542 0.3997378
## 0.3 1 0.6 0.75 150 0.9718792 0.4046255
## 0.3 1 0.6 1.00 50 0.9706295 0.3649273
## 0.3 1 0.6 1.00 100 0.9716292 0.3959953
## 0.3 1 0.6 1.00 150 0.9712540 0.3877077
## 0.3 1 0.8 0.50 50 0.9730036 0.4199934
## 0.3 1 0.8 0.50 100 0.9718790 0.3952624
## 0.3 1 0.8 0.50 150 0.9718789 0.3909090
## 0.3 1 0.8 0.75 50 0.9706301 0.3858335
## 0.3 1 0.8 0.75 100 0.9713798 0.3931568
## 0.3 1 0.8 0.75 150 0.9717542 0.4010233
## 0.3 1 0.8 1.00 50 0.9715047 0.3899435
## 0.3 1 0.8 1.00 100 0.9715047 0.3957805
## 0.3 1 0.8 1.00 150 0.9715047 0.4009329
## 0.3 2 0.6 0.50 50 0.9721276 0.4004037
## 0.3 2 0.6 0.50 100 0.9715031 0.4083184
## 0.3 2 0.6 0.50 150 0.9716281 0.4086541
## 0.3 2 0.6 0.75 50 0.9717542 0.3873674
## 0.3 2 0.6 0.75 100 0.9722548 0.4178143
## 0.3 2 0.6 0.75 150 0.9710039 0.3875485
## 0.3 2 0.6 1.00 50 0.9717539 0.3828775
## 0.3 2 0.6 1.00 100 0.9712543 0.3897696
## 0.3 2 0.6 1.00 150 0.9708792 0.3925408
## 0.3 2 0.8 0.50 50 0.9715033 0.4004922
## 0.3 2 0.8 0.50 100 0.9707540 0.3936825
## 0.3 2 0.8 0.50 150 0.9710034 0.4032012
## 0.3 2 0.8 0.75 50 0.9710043 0.3873647
## 0.3 2 0.8 0.75 100 0.9708792 0.3928655
## 0.3 2 0.8 0.75 150 0.9702550 0.3798871
## 0.3 2 0.8 1.00 50 0.9716293 0.3961700
## 0.3 2 0.8 1.00 100 0.9710037 0.3857656
## 0.3 2 0.8 1.00 150 0.9707551 0.3977728
## 0.3 3 0.6 0.50 50 0.9708790 0.3822028
## 0.3 3 0.6 0.50 100 0.9702537 0.3818082
## 0.3 3 0.6 0.50 150 0.9696300 0.3881593
## 0.3 3 0.6 0.75 50 0.9718789 0.3949405
## 0.3 3 0.6 0.75 100 0.9705050 0.3804730
## 0.3 3 0.6 0.75 150 0.9697553 0.3861759
## 0.3 3 0.6 1.00 50 0.9715042 0.3776866
## 0.3 3 0.6 1.00 100 0.9697547 0.3776029
## 0.3 3 0.6 1.00 150 0.9696298 0.3835070
## 0.3 3 0.8 0.50 50 0.9710043 0.4112140
## 0.3 3 0.8 0.50 100 0.9688801 0.3702965
## 0.3 3 0.8 0.50 150 0.9702548 0.4017938
## 0.3 3 0.8 0.75 50 0.9706292 0.4016517
## 0.3 3 0.8 0.75 100 0.9702545 0.3924848
## 0.3 3 0.8 0.75 150 0.9692554 0.3890719
## 0.3 3 0.8 1.00 50 0.9708790 0.3791814
## 0.3 3 0.8 1.00 100 0.9703801 0.3849929
## 0.3 3 0.8 1.00 150 0.9701303 0.3949087
## 0.4 1 0.6 0.50 50 0.9702532 0.3676569
## 0.4 1 0.6 0.50 100 0.9720034 0.4070468
## 0.4 1 0.6 0.50 150 0.9716287 0.4048304
## 0.4 1 0.6 0.75 50 0.9721295 0.4240445
## 0.4 1 0.6 0.75 100 0.9720040 0.4044536
## 0.4 1 0.6 0.75 150 0.9717543 0.3995174
## 0.4 1 0.6 1.00 50 0.9718795 0.4012599
## 0.4 1 0.6 1.00 100 0.9718793 0.3962749
## 0.4 1 0.6 1.00 150 0.9718790 0.4028057
## 0.4 1 0.8 0.50 50 0.9716292 0.4018029
## 0.4 1 0.8 0.50 100 0.9721289 0.4130694
## 0.4 1 0.8 0.50 150 0.9716287 0.4040398
## 0.4 1 0.8 0.75 50 0.9713792 0.4078365
## 0.4 1 0.8 0.75 100 0.9717543 0.4053278
## 0.4 1 0.8 0.75 150 0.9708790 0.3920528
## 0.4 1 0.8 1.00 50 0.9720043 0.4005942
## 0.4 1 0.8 1.00 100 0.9716298 0.3986964
## 0.4 1 0.8 1.00 150 0.9713800 0.3939931
## 0.4 2 0.6 0.50 50 0.9708793 0.3829116
## 0.4 2 0.6 0.50 100 0.9698795 0.3746342
## 0.4 2 0.6 0.50 150 0.9698792 0.3828474
## 0.4 2 0.6 0.75 50 0.9716289 0.3966735
## 0.4 2 0.6 0.75 100 0.9712539 0.3965996
## 0.4 2 0.6 0.75 150 0.9698798 0.3784100
## 0.4 2 0.6 1.00 50 0.9715036 0.3914698
## 0.4 2 0.6 1.00 100 0.9712540 0.3930020
## 0.4 2 0.6 1.00 150 0.9701292 0.3722026
## 0.4 2 0.8 0.50 50 0.9715043 0.3943484
## 0.4 2 0.8 0.50 100 0.9703789 0.3852922
## 0.4 2 0.8 0.50 150 0.9701287 0.3859388
## 0.4 2 0.8 0.75 50 0.9717531 0.4160240
## 0.4 2 0.8 0.75 100 0.9700037 0.3844063
## 0.4 2 0.8 0.75 150 0.9706292 0.4108813
## 0.4 2 0.8 1.00 50 0.9710039 0.3841293
## 0.4 2 0.8 1.00 100 0.9706298 0.3913856
## 0.4 2 0.8 1.00 150 0.9695057 0.3726841
## 0.4 3 0.6 0.50 50 0.9705042 0.3840673
## 0.4 3 0.6 0.50 100 0.9711306 0.4101082
## 0.4 3 0.6 0.50 150 0.9697542 0.3876548
## 0.4 3 0.6 0.75 50 0.9707542 0.3911122
## 0.4 3 0.6 0.75 100 0.9701297 0.3912948
## 0.4 3 0.6 0.75 150 0.9693790 0.3812046
## 0.4 3 0.6 1.00 50 0.9707550 0.3823064
## 0.4 3 0.6 1.00 100 0.9706300 0.3997119
## 0.4 3 0.6 1.00 150 0.9698807 0.3948422
## 0.4 3 0.8 0.50 50 0.9700059 0.3893254
## 0.4 3 0.8 0.50 100 0.9688807 0.3805709
## 0.4 3 0.8 0.50 150 0.9677554 0.3705645
## 0.4 3 0.8 0.75 50 0.9698803 0.3799118
## 0.4 3 0.8 0.75 100 0.9705061 0.3998332
## 0.4 3 0.8 0.75 150 0.9692554 0.3803766
## 0.4 3 0.8 1.00 50 0.9705050 0.3947427
## 0.4 3 0.8 1.00 100 0.9692561 0.3767798
## 0.4 3 0.8 1.00 150 0.9691307 0.3784295
##
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 1, eta
## = 0.3, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and subsample
## = 0.5.
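The grid above is the default one caret builds for method = "xgbTree". To search a custom grid instead, a tuneGrid can be passed to train(); the sketch below simply reproduces the default combinations explicitly, so any of the seven parameters can be widened from here (all seven must be present in the grid; xgbGrid and ModelXGboost2 are illustrative names):
# explicit tuning grid covering all seven xgbTree parameters
xgbGrid <- expand.grid(nrounds = c(50, 100, 150),
                       max_depth = 1:3,
                       eta = c(0.3, 0.4),
                       gamma = 0,
                       colsample_bytree = c(0.6, 0.8),
                       min_child_weight = 1,
                       subsample = c(0.5, 0.75, 1))
# refit with the explicit grid (same cross-validation control as before)
ModelXGboost2 <- train(default ~ ., data = train.df,
                       method = "xgbTree",
                       trControl = trctrl,
                       tuneGrid = xgbGrid)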
# predicting class probabilities on the test data
PredXGBoostModel <- predict(ModelXGboost, newdata = test.df, type = "prob")
# plot of predicted probabilities
plot(PredXGBoostModel$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# taking the cut-off probability as 50%
PredXGBoostModel <- ifelse(PredXGBoostModel$Yes > 0.50, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(PredXGBoostModel)
# ordering the factor levels
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(test.df$default, levels = c("Yes", "No"))
# making confusion matrix
cm <- confusionMatrix(data = Predicted, reference = Actual, positive = "Yes")
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 28 9
## No 38 1924
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.007885
##
## Kappa : 0.5326
##
## Mcnemar's Test P-Value : 4.423e-05
##
## Sensitivity : 0.42424
## Specificity : 0.99534
## Pos Pred Value : 0.75676
## Neg Pred Value : 0.98063
## Prevalence : 0.03302
## Detection Rate : 0.01401
## Detection Prevalence : 0.01851
## Balanced Accuracy : 0.70979
##
## 'Positive' Class : Yes
##
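Sensitivity is only about 0.42: with roughly 3% prevalence, a 50% cut-off heavily favours the 'No' class. If catching defaulters matters more than false alarms, the cut-off can be lowered; the sketch below uses an illustrative 0.20 threshold, not a tuned value (PredProb and PredLow are illustrative names):
# re-score the test set with a lower, illustrative cut-off
PredProb <- predict(ModelXGboost, newdata = test.df, type = "prob")
PredLow <- ordered(ifelse(PredProb$Yes > 0.20, "Yes", "No"),
                   levels = c("Yes", "No"))
confusionMatrix(data = PredLow, reference = Actual, positive = "Yes")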
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.4
BoostPrediction <- predict(ModelXGboost, test.df, type = "prob")
BoostPrediction <- prediction(BoostPrediction[2], test.df$default)
Boostperformance <- performance(BoostPrediction, "tpr", "fpr")
# plotting ROC curve
plot(Boostperformance, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under curve
BoostBag <- performance(BoostPrediction, measure = "auc")
BoostBag <- BoostBag@y.values[[1]]
BoostBag
## [1] 0.9608436
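An AUC of about 0.96 says the model ranks defaulters well even though the 50% cut-off missed many of them. The performance object built above already stores the TPR, FPR, and cut-off value at every threshold, so a data-driven cut-off (for example, one maximising Youden's J = TPR - FPR) can be read off directly; a minimal sketch:
# cut-off maximising Youden's J statistic (TPR - FPR)
tpr <- Boostperformance@y.values[[1]]
fpr <- Boostperformance@x.values[[1]]
cutoffs <- Boostperformance@alpha.values[[1]]
cutoffs[which.max(tpr - fpr)]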