library(caret)
# data partition
set.seed(2341)
trainIndex <- createDataPartition(df$default, p = 0.80, list = FALSE)
# 80% training data
train.df <- df[trainIndex, ]
# 20% testing data
test.df <- df[-trainIndex, ]
dim(train.df)
## [1] 8001 4
dim(test.df)
## [1] 1999 4
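Since createDataPartition() samples within each class, the Yes/No ratio of default should be roughly the same in both splits. A quick check (a sketch, assuming default is a factor with levels 'No' and 'Yes', as the model summary below confirms):
# class proportions should be nearly identical across the two splits
prop.table(table(train.df$default))
prop.table(table(test.df$default))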
set.seed(123)
# 10-fold cross-validation setup
trctrl <- trainControl(method = "cv", number = 10)
# fitting boosting classification model
ModelXGboost <- train(default ~ .,
                      data = train.df,
                      method = "xgbTree",
                      trControl = trctrl)
# model summary
ModelXGboost
## eXtreme Gradient Boosting
##
## 8001 samples
## 3 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 7200, 7200, 7201, 7202, 7201, 7202, ...
## Resampling results across tuning parameters:
##
## eta max_depth colsample_bytree subsample nrounds Accuracy Kappa
## 0.3 1 0.6 0.50 50 0.9713793 0.3913068
## 0.3 1 0.6 0.50 100 0.9720040 0.4006042
## 0.3 1 0.6 0.50 150 0.9723789 0.4099108
## 0.3 1 0.6 0.75 50 0.9716292 0.3799901
## 0.3 1 0.6 0.75 100 0.9717542 0.3997378
## 0.3 1 0.6 0.75 150 0.9718792 0.4046255
## 0.3 1 0.6 1.00 50 0.9706295 0.3649273
## 0.3 1 0.6 1.00 100 0.9716292 0.3959953
## 0.3 1 0.6 1.00 150 0.9712540 0.3877077
## 0.3 1 0.8 0.50 50 0.9730036 0.4199934
## 0.3 1 0.8 0.50 100 0.9718790 0.3952624
## 0.3 1 0.8 0.50 150 0.9718789 0.3909090
## 0.3 1 0.8 0.75 50 0.9706301 0.3858335
## 0.3 1 0.8 0.75 100 0.9713798 0.3931568
## 0.3 1 0.8 0.75 150 0.9717542 0.4010233
## 0.3 1 0.8 1.00 50 0.9715047 0.3899435
## 0.3 1 0.8 1.00 100 0.9715047 0.3957805
## 0.3 1 0.8 1.00 150 0.9715047 0.4009329
## 0.3 2 0.6 0.50 50 0.9721276 0.4004037
## 0.3 2 0.6 0.50 100 0.9715031 0.4083184
## 0.3 2 0.6 0.50 150 0.9716281 0.4086541
## 0.3 2 0.6 0.75 50 0.9717542 0.3873674
## 0.3 2 0.6 0.75 100 0.9722548 0.4178143
## 0.3 2 0.6 0.75 150 0.9710039 0.3875485
## 0.3 2 0.6 1.00 50 0.9717539 0.3828775
## 0.3 2 0.6 1.00 100 0.9712543 0.3897696
## 0.3 2 0.6 1.00 150 0.9708792 0.3925408
## 0.3 2 0.8 0.50 50 0.9715033 0.4004922
## 0.3 2 0.8 0.50 100 0.9707540 0.3936825
## 0.3 2 0.8 0.50 150 0.9710034 0.4032012
## 0.3 2 0.8 0.75 50 0.9710043 0.3873647
## 0.3 2 0.8 0.75 100 0.9708792 0.3928655
## 0.3 2 0.8 0.75 150 0.9702550 0.3798871
## 0.3 2 0.8 1.00 50 0.9716293 0.3961700
## 0.3 2 0.8 1.00 100 0.9710037 0.3857656
## 0.3 2 0.8 1.00 150 0.9707551 0.3977728
## 0.3 3 0.6 0.50 50 0.9708790 0.3822028
## 0.3 3 0.6 0.50 100 0.9702537 0.3818082
## 0.3 3 0.6 0.50 150 0.9696300 0.3881593
## 0.3 3 0.6 0.75 50 0.9718789 0.3949405
## 0.3 3 0.6 0.75 100 0.9705050 0.3804730
## 0.3 3 0.6 0.75 150 0.9697553 0.3861759
## 0.3 3 0.6 1.00 50 0.9715042 0.3776866
## 0.3 3 0.6 1.00 100 0.9697547 0.3776029
## 0.3 3 0.6 1.00 150 0.9696298 0.3835070
## 0.3 3 0.8 0.50 50 0.9710043 0.4112140
## 0.3 3 0.8 0.50 100 0.9688801 0.3702965
## 0.3 3 0.8 0.50 150 0.9702548 0.4017938
## 0.3 3 0.8 0.75 50 0.9706292 0.4016517
## 0.3 3 0.8 0.75 100 0.9702545 0.3924848
## 0.3 3 0.8 0.75 150 0.9692554 0.3890719
## 0.3 3 0.8 1.00 50 0.9708790 0.3791814
## 0.3 3 0.8 1.00 100 0.9703801 0.3849929
## 0.3 3 0.8 1.00 150 0.9701303 0.3949087
## 0.4 1 0.6 0.50 50 0.9702532 0.3676569
## 0.4 1 0.6 0.50 100 0.9720034 0.4070468
## 0.4 1 0.6 0.50 150 0.9716287 0.4048304
## 0.4 1 0.6 0.75 50 0.9721295 0.4240445
## 0.4 1 0.6 0.75 100 0.9720040 0.4044536
## 0.4 1 0.6 0.75 150 0.9717543 0.3995174
## 0.4 1 0.6 1.00 50 0.9718795 0.4012599
## 0.4 1 0.6 1.00 100 0.9718793 0.3962749
## 0.4 1 0.6 1.00 150 0.9718790 0.4028057
## 0.4 1 0.8 0.50 50 0.9716292 0.4018029
## 0.4 1 0.8 0.50 100 0.9721289 0.4130694
## 0.4 1 0.8 0.50 150 0.9716287 0.4040398
## 0.4 1 0.8 0.75 50 0.9713792 0.4078365
## 0.4 1 0.8 0.75 100 0.9717543 0.4053278
## 0.4 1 0.8 0.75 150 0.9708790 0.3920528
## 0.4 1 0.8 1.00 50 0.9720043 0.4005942
## 0.4 1 0.8 1.00 100 0.9716298 0.3986964
## 0.4 1 0.8 1.00 150 0.9713800 0.3939931
## 0.4 2 0.6 0.50 50 0.9708793 0.3829116
## 0.4 2 0.6 0.50 100 0.9698795 0.3746342
## 0.4 2 0.6 0.50 150 0.9698792 0.3828474
## 0.4 2 0.6 0.75 50 0.9716289 0.3966735
## 0.4 2 0.6 0.75 100 0.9712539 0.3965996
## 0.4 2 0.6 0.75 150 0.9698798 0.3784100
## 0.4 2 0.6 1.00 50 0.9715036 0.3914698
## 0.4 2 0.6 1.00 100 0.9712540 0.3930020
## 0.4 2 0.6 1.00 150 0.9701292 0.3722026
## 0.4 2 0.8 0.50 50 0.9715043 0.3943484
## 0.4 2 0.8 0.50 100 0.9703789 0.3852922
## 0.4 2 0.8 0.50 150 0.9701287 0.3859388
## 0.4 2 0.8 0.75 50 0.9717531 0.4160240
## 0.4 2 0.8 0.75 100 0.9700037 0.3844063
## 0.4 2 0.8 0.75 150 0.9706292 0.4108813
## 0.4 2 0.8 1.00 50 0.9710039 0.3841293
## 0.4 2 0.8 1.00 100 0.9706298 0.3913856
## 0.4 2 0.8 1.00 150 0.9695057 0.3726841
## 0.4 3 0.6 0.50 50 0.9705042 0.3840673
## 0.4 3 0.6 0.50 100 0.9711306 0.4101082
## 0.4 3 0.6 0.50 150 0.9697542 0.3876548
## 0.4 3 0.6 0.75 50 0.9707542 0.3911122
## 0.4 3 0.6 0.75 100 0.9701297 0.3912948
## 0.4 3 0.6 0.75 150 0.9693790 0.3812046
## 0.4 3 0.6 1.00 50 0.9707550 0.3823064
## 0.4 3 0.6 1.00 100 0.9706300 0.3997119
## 0.4 3 0.6 1.00 150 0.9698807 0.3948422
## 0.4 3 0.8 0.50 50 0.9700059 0.3893254
## 0.4 3 0.8 0.50 100 0.9688807 0.3805709
## 0.4 3 0.8 0.50 150 0.9677554 0.3705645
## 0.4 3 0.8 0.75 50 0.9698803 0.3799118
## 0.4 3 0.8 0.75 100 0.9705061 0.3998332
## 0.4 3 0.8 0.75 150 0.9692554 0.3803766
## 0.4 3 0.8 1.00 50 0.9705050 0.3947427
## 0.4 3 0.8 1.00 100 0.9692561 0.3767798
## 0.4 3 0.8 1.00 150 0.9691307 0.3784295
##
## Tuning parameter 'gamma' was held constant at a value of 0
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 50, max_depth = 1, eta
## = 0.3, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1 and subsample
## = 0.5.
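The grid above is the default one caret builds for method = "xgbTree". To search a custom grid instead, a tuneGrid can be passed to train(); the sketch below simply reproduces the default combinations explicitly, so any of the seven parameters can be widened from here (all seven must be present in the grid; xgbGrid and ModelXGboost2 are illustrative names):
# explicit tuning grid covering all seven xgbTree parameters
xgbGrid <- expand.grid(nrounds = c(50, 100, 150),
                       max_depth = 1:3,
                       eta = c(0.3, 0.4),
                       gamma = 0,
                       colsample_bytree = c(0.6, 0.8),
                       min_child_weight = 1,
                       subsample = c(0.5, 0.75, 1))
# refit with the explicit grid (same cross-validation control as before)
ModelXGboost2 <- train(default ~ ., data = train.df,
                       method = "xgbTree",
                       trControl = trctrl,
                       tuneGrid = xgbGrid)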
# predicting class probabilities on the test data
PredXGBoostModel <- predict(ModelXGboost, newdata = test.df, type = "prob")
# plot of predicted probabilities
plot(PredXGBoostModel$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# taking the cut-off probability as 50%
PredXGBoostModel <- ifelse(PredXGBoostModel$Yes > 0.50, "Yes", "No")
# saving predicted vector as factor
Pred <- as.factor(PredXGBoostModel)
# ordering the factor levels
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(test.df$default, levels = c("Yes", "No"))
# making confusion matrix
cm <- confusionMatrix(data = Predicted, reference = Actual, positive = "Yes")
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Yes No
## Yes 28 9
## No 38 1924
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.007885
##
## Kappa : 0.5326
##
## Mcnemar's Test P-Value : 4.423e-05
##
## Sensitivity : 0.42424
## Specificity : 0.99534
## Pos Pred Value : 0.75676
## Neg Pred Value : 0.98063
## Prevalence : 0.03302
## Detection Rate : 0.01401
## Detection Prevalence : 0.01851
## Balanced Accuracy : 0.70979
##
## 'Positive' Class : Yes
##
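Sensitivity is only about 0.42: with roughly 3% prevalence, a 50% cut-off heavily favours the 'No' class. If catching defaulters matters more than false alarms, the cut-off can be lowered; the sketch below uses an illustrative 0.20 threshold, not a tuned value (PredProb and PredLow are illustrative names):
# re-score the test set with a lower, illustrative cut-off
PredProb <- predict(ModelXGboost, newdata = test.df, type = "prob")
PredLow <- ordered(ifelse(PredProb$Yes > 0.20, "Yes", "No"),
                   levels = c("Yes", "No"))
confusionMatrix(data = PredLow, reference = Actual, positive = "Yes")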
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.0.4
BoostPrediction <- predict(ModelXGboost, test.df, type = "prob")
BoostPrediction <- prediction(BoostPrediction[2], test.df$default)
Boostperformance <- performance(BoostPrediction, "tpr", "fpr")
# plotting ROC curve
plot(Boostperformance, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under curve
BoostBag <- performance(BoostPrediction, measure = "auc")
BoostBag <- BoostBag@y.values[[1]]
BoostBag
## [1] 0.9608436
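An AUC of about 0.96 says the model ranks defaulters well even though the 50% cut-off missed many of them. The performance object built above already stores the TPR, FPR, and cut-off value at every threshold, so a data-driven cut-off (for example, one maximising Youden's J = TPR - FPR) can be read off directly; a minimal sketch:
# cut-off maximising Youden's J statistic (TPR - FPR)
tpr <- Boostperformance@y.values[[1]]
fpr <- Boostperformance@x.values[[1]]
cutoffs <- Boostperformance@alpha.values[[1]]
cutoffs[which.max(tpr - fpr)]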