set.seed(123)
# fitting a boosting classification model
ModelXGboost <- train(Default ~ CreditLimit
                      + Gender
                      + Education
                      + MaritalStatus
                      + Age
                      + BillOutstanding
                      + LastPayment,
                      data = trainData.dt,
                      method = "xgbTree",
                      trControl = trctrl)
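Here `trainData.dt` and the `trControl` object `trctrl` come from earlier in the chapter. Given the 10-fold cross-validation reported in the summary below, `trctrl` was presumably defined along these lines (an assumption, not the chapter's verbatim code):
# assumed definition: 10-fold cross-validation resampling (matches the output below)
trctrl <- trainControl(method = "cv", number = 10)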
# model summary
ModelXGboost
## eXtreme Gradient Boosting
##
## 23681 samples
## 7 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 21313, 21314, 21314, 21313, 21312, 21312, ...
## Resampling results across tuning parameters:
##
##   eta  max_depth  colsample_bytree  subsample  nrounds  Accuracy   Kappa
##   0.3  1          0.6               0.50            50  0.7779654  0.05215425
##   0.3  1          0.6               0.50           100  0.7778390  0.06383431
##   0.3  1          0.6               0.50           150  0.7782612  0.06823095
##   0.3  1          0.6               0.75            50  0.7774588  0.04133710
##   0.3  1          0.6               0.75           100  0.7786833  0.07017065
##   0.3  1          0.6               0.75           150  0.7780078  0.06825885
##   0.3  1          0.6               1.00            50  0.7778811  0.02171847
##   0.3  1          0.6               1.00           100  0.7785146  0.06503546
##   0.3  1          0.6               1.00           150  0.7782191  0.06556696
##   0.3  1          0.8               0.50            50  0.7778391  0.05511830
##   0.3  1          0.8               0.50           100  0.7780503  0.06320222
##   0.3  1          0.8               0.50           150  0.7786834  0.07088069
##   0.3  1          0.8               0.75            50  0.7775012  0.04289436
##   0.3  1          0.8               0.75           100  0.7777967  0.06615456
##   0.3  1          0.8               0.75           150  0.7778811  0.06653417
##   0.3  1          0.8               1.00            50  0.7777122  0.02260535
##   0.3  1          0.8               1.00           100  0.7780502  0.06187452
##   0.3  1          0.8               1.00           150  0.7783879  0.06626162
##   0.3  2          0.6               0.50            50  0.7783877  0.06702150
##   0.3  2          0.6               0.50           100  0.7785566  0.07608836
##   0.3  2          0.6               0.50           150  0.7796546  0.08797840
##   0.3  2          0.6               0.75            50  0.7788945  0.06753951
##   0.3  2          0.6               0.75           100  0.7789790  0.07592400
##   0.3  2          0.6               0.75           150  0.7794012  0.08236265
##   0.3  2          0.6               1.00            50  0.7789790  0.06514723
##   0.3  2          0.6               1.00           100  0.7794435  0.07794471
##   0.3  2          0.6               1.00           150  0.7797389  0.08342203
##   0.3  2          0.8               0.50            50  0.7799079  0.07597178
##   0.3  2          0.8               0.50           100  0.7799925  0.08082709
##   0.3  2          0.8               0.50           150  0.7791060  0.08440120
##   0.3  2          0.8               0.75            50  0.7794436  0.07269380
##   0.3  2          0.8               0.75           100  0.7796125  0.08095632
##   0.3  2          0.8               0.75           150  0.7804992  0.08938600
##   0.3  2          0.8               1.00            50  0.7795702  0.06688971
##   0.3  2          0.8               1.00           100  0.7807103  0.08231574
##   0.3  2          0.8               1.00           150  0.7811748  0.09060890
##   0.3  3          0.6               0.50            50  0.7797392  0.08312853
##   0.3  3          0.6               0.50           100  0.7785991  0.08807302
##   0.3  3          0.6               0.50           150  0.7774589  0.09372481
##   0.3  3          0.6               0.75            50  0.7807102  0.08750676
##   0.3  3          0.6               0.75           100  0.7801616  0.09548614
##   0.3  3          0.6               0.75           150  0.7802457  0.10123761
##   0.3  3          0.6               1.00            50  0.7801193  0.08467924
##   0.3  3          0.6               1.00           100  0.7796125  0.09145568
##   0.3  3          0.6               1.00           150  0.7792750  0.09093150
##   0.3  3          0.8               0.50            50  0.7789790  0.08481403
##   0.3  3          0.8               0.50           100  0.7786834  0.09358372
##   0.3  3          0.8               0.50           150  0.7781768  0.10131228
##   0.3  3          0.8               0.75            50  0.7801191  0.08641733
##   0.3  3          0.8               0.75           100  0.7797393  0.09409560
##   0.3  3          0.8               0.75           150  0.7788101  0.09811641
##   0.3  3          0.8               1.00            50  0.7807106  0.08789884
##   0.3  3          0.8               1.00           100  0.7799925  0.09326249
##   0.3  3          0.8               1.00           150  0.7793589  0.09439176
##   0.4  1          0.6               0.50            50  0.7780923  0.05504424
##   0.4  1          0.6               0.50           100  0.7771212  0.06279036
##   0.4  1          0.6               0.50           150  0.7770366  0.06844911
##   0.4  1          0.6               0.75            50  0.7779656  0.05608456
##   0.4  1          0.6               0.75           100  0.7781767  0.06711174
##   0.4  1          0.6               0.75           150  0.7782611  0.06893537
##   0.4  1          0.6               1.00            50  0.7779234  0.04756445
##   0.4  1          0.6               1.00           100  0.7782189  0.06552158
##   0.4  1          0.6               1.00           150  0.7783033  0.06850041
##   0.4  1          0.8               0.50            50  0.7785568  0.06386828
##   0.4  1          0.8               0.50           100  0.7773320  0.06360168
##   0.4  1          0.8               0.50           150  0.7773320  0.06428661
##   0.4  1          0.8               0.75            50  0.7776278  0.05726900
##   0.4  1          0.8               0.75           100  0.7779235  0.06677700
##   0.4  1          0.8               0.75           150  0.7773745  0.06627247
##   0.4  1          0.8               1.00            50  0.7776279  0.04957321
##   0.4  1          0.8               1.00           100  0.7783034  0.06588960
##   0.4  1          0.8               1.00           150  0.7781344  0.06684371
##   0.4  2          0.6               0.50            50  0.7793164  0.06921649
##   0.4  2          0.6               0.50           100  0.7784722  0.07656174
##   0.4  2          0.6               0.50           150  0.7797391  0.09109936
##   0.4  2          0.6               0.75            50  0.7798660  0.08367121
##   0.4  2          0.6               0.75           100  0.7792749  0.08404956
##   0.4  2          0.6               0.75           150  0.7794857  0.08887245
##   0.4  2          0.6               1.00            50  0.7799924  0.07977519
##   0.4  2          0.6               1.00           100  0.7803726  0.08651224
##   0.4  2          0.6               1.00           150  0.7798660  0.08915767
##   0.4  2          0.8               0.50            50  0.7777968  0.06914630
##   0.4  2          0.8               0.50           100  0.7789367  0.08456212
##   0.4  2          0.8               0.50           150  0.7791479  0.09514198
##   0.4  2          0.8               0.75            50  0.7797814  0.08081818
##   0.4  2          0.8               0.75           100  0.7794857  0.08464022
##   0.4  2          0.8               0.75           150  0.7798659  0.09210846
##   0.4  2          0.8               1.00            50  0.7802880  0.07731593
##   0.4  2          0.8               1.00           100  0.7796549  0.08607558
##   0.4  2          0.8               1.00           150  0.7806257  0.09543951
##   0.4  3          0.6               0.50            50  0.7793590  0.09205747
##   0.4  3          0.6               0.50           100  0.7771634  0.09561430
##   0.4  3          0.6               0.50           150  0.7751366  0.10153557
##   0.4  3          0.6               0.75            50  0.7793592  0.08614713
##   0.4  3          0.6               0.75           100  0.7788101  0.09325227
##   0.4  3          0.6               0.75           150  0.7764878  0.09420404
##   0.4  3          0.6               1.00            50  0.7800350  0.08444604
##   0.4  3          0.6               1.00           100  0.7788528  0.08937415
##   0.4  3          0.6               1.00           150  0.7784726  0.09450598
##   0.4  3          0.8               0.50            50  0.7789367  0.09372866
##   0.4  3          0.8               0.50           100  0.7765298  0.09863778
##   0.4  3          0.8               0.50           150  0.7760226  0.10782700
##   0.4  3          0.8               0.75            50  0.7792323  0.08619952
##   0.4  3          0.8               0.75           100  0.7774164  0.09320778
##   0.4  3          0.8               0.75           150  0.7767831  0.09638328
##   0.4  3          0.8               1.00            50  0.7796549  0.08926219
##   0.4  3          0.8               1.00           100  0.7793166  0.09660444
##   0.4  3          0.8               1.00           150  0.7783035  0.09794811
##
## Tuning parameter 'gamma' was held constant at a value of 0
##
## Tuning parameter 'min_child_weight' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 150, max_depth = 2,
## eta = 0.3, gamma = 0, colsample_bytree = 0.8, min_child_weight = 1
## and subsample = 1.
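Since the printed tuning grid above is long, the chosen hyperparameters can also be read straight off the fitted `train` object; this minimal sketch uses caret's stored `bestTune` slot and the default `plot` method for `train` objects:
# best hyperparameter combination selected by 10-fold cross-validation
ModelXGboost$bestTune
# accuracy profile across the tuning grid
plot(ModelXGboost)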
# predicting probabilities on the test data
PredBoosting <- predict(ModelXGboost, testData.dt, type = "prob")
# plot of probabilities
plot(PredBoosting$Yes,
     main = "Scatterplot of Probabilities of default (test data)",
     xlab = "Customer ID",
     ylab = "Predicted Probability of default")
# fixing the cut-off probability
pred.Boosting <- ifelse(PredBoosting$Yes > 0.50, "Yes", "No")
# actual and predicted data columns
Pred <- as.factor(pred.Boosting)
# ordering the levels
Predicted <- ordered(Pred, levels = c("Yes", "No"))
Actual <- ordered(testData.dt$Default,levels = c("Yes", "No"))
# making confusion matrix
cm <- confusionMatrix(table(Predicted, Actual))
cm
## Confusion Matrix and Statistics
##
##          Actual
## Predicted  Yes   No
##       Yes   92   85
##       No  1229 4514
##
## Accuracy : 0.778
## 95% CI : (0.7672, 0.7886)
## No Information Rate : 0.7769
## P-Value [Acc > NIR] : 0.4207
##
## Kappa : 0.074
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.06964
## Specificity : 0.98152
## Pos Pred Value : 0.51977
## Neg Pred Value : 0.78600
## Prevalence : 0.22314
## Detection Rate : 0.01554
## Detection Prevalence : 0.02990
## Balanced Accuracy : 0.52558
##
## 'Positive' Class : Yes
##
library(magrittr)
table(Predicted, Actual) %>%
  divide_by(length(Actual)) %>% multiply_by(100) %>% round(1)
##          Actual
## Predicted  Yes   No
##       Yes  1.6  1.4
##       No  20.8 76.2
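The same percentage table can be produced without magrittr; a minimal base-R equivalent uses `prop.table`, which divides each cell by the grand total:
# base-R equivalent of the pipeline above
round(prop.table(table(Predicted, Actual)) * 100, 1)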
library(dplyr)
# function to print confusion matrices for different cut-off levels of probability
CmFn <- function(cutoff) {
  # predicting the test set results
  PredBoosting <- predict(ModelXGboost, testData.dt, type = "prob")
  C1 <- ifelse(PredBoosting$Yes > cutoff, "Yes", "No")
  C2 <- testData.dt$Default
  predY <- as.factor(C1)
  actualY <- as.factor(C2)
  Predicted <- ordered(predY, levels = c("Yes", "No"))
  Actual <- ordered(actualY, levels = c("Yes", "No"))
  # use confusionMatrix from the caret package
  cm1 <- confusionMatrix(data = Predicted, reference = Actual, positive = "Yes")
  # extracting accuracy
  Accuracy <- cm1$overall[1]
  # extracting sensitivity
  Sensitivity <- cm1$byClass[1]
  # extracting specificity
  Specificity <- cm1$byClass[2]
  # extracting the value of kappa
  Kappa <- cm1$overall[2]
  # combined table
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
# making a sequence of cut-off probabilities
cutoff1 <- seq(0.1, 0.9, by = 0.05)
# applying CmFn to each cut-off with "lapply"
tab2 <- lapply(cutoff1, CmFn)
# binding the 17 one-row results into a single table
tab3 <- do.call(rbind, tab2)
# converting the results to a data frame with named columns
tab4 <- as.data.frame(tab3)
tab5 <- cbind(cutoff1, tab4$Accuracy, tab4$Sensitivity, tab4$Specificity, tab4$Kappa)
tab6 <- as.data.frame(tab5)
tab7 <- rename(tab6, cutoff = cutoff1, Accuracy = V2,
               Sensitivity = V3, Specificity = V4, Kappa = V5)
# printing the table
tab7
##    cutoff  Accuracy  Sensitivity Specificity       Kappa
## 1 0.10 0.3476351 0.9591218774 0.1719939 0.065142252
## 2 0.15 0.4543919 0.8796366389 0.3322461 0.118661636
## 3 0.20 0.5451014 0.7645722937 0.4820613 0.158228479
## 4 0.25 0.6547297 0.5715367146 0.6786258 0.200765861
## 5 0.30 0.7376689 0.3686601060 0.8436617 0.219119949
## 6 0.35 0.7673986 0.2278576836 0.9223744 0.182956736
## 7 0.40 0.7765203 0.1566994701 0.9545553 0.147192150
## 8 0.45 0.7800676 0.1127933384 0.9717330 0.117579379
## 9 0.50 0.7780405 0.0696442089 0.9815177 0.074000842
## 10 0.55 0.7788851 0.0257380772 0.9952164 0.031810540
## 11 0.60 0.7770270 0.0060560182 0.9984779 0.007000454
## 12 0.65 0.7773649 0.0022710068 1.0000000 0.003524069
## 13 0.70 0.7770270 0.0007570023 1.0000000 0.001175674
## 14 0.75 0.7768581 0.0000000000 1.0000000 0.000000000
## 15 0.80 0.7768581 0.0000000000 1.0000000 0.000000000
## 16 0.85 0.7768581 0.0000000000 1.0000000 0.000000000
## 17 0.90 0.7768581 0.0000000000 1.0000000 0.000000000
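The grid shows that accuracy flattens out near the majority-class rate (0.7769, the No Information Rate above) as the cut-off rises, while Kappa peaks around a cut-off of 0.30. A one-liner to pull out the Kappa-maximising row, using the `tab7` data frame built above:
# row of tab7 with the highest Kappa
tab7[which.max(tab7$Kappa), ]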
# loading the package
library(ROCR)
PredBoosting <- predict(ModelXGboost, testData.dt, type = "prob")
# ROCR prediction and performance objects (renamed so the functions are not masked)
predObj <- prediction(PredBoosting$Yes, testData.dt$Default)
perfObj <- performance(predObj, "tpr", "fpr")
# plotting ROC curve
plot(perfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under curve
aucXGBoost <- performance(predObj, measure = "auc")
aucXGBoost <- aucXGBoost@y.values[[1]]
aucXGBoost
## [1] 0.6831132
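Optionally, while the ROC plot is still the active graphics device, the AUC value can be stamped onto the curve with base graphics:
# annotating the ROC plot with the AUC value
legend("bottomright", legend = paste0("AUC = ", round(aucXGBoost, 4)), bty = "n")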