LOADING DATA INTO THE R ENVIRONMENT
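The predictors that appear in the model summary below (student, balance, income) correspond to the Default data set from the ISLR package, so a minimal loading step, assuming that source, might look like this:
library(ISLR)
# the Default data set: 10,000 customers with default status,
# student status, credit card balance, and income
data("Default")
str(Default)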
Splitting the Data into Training and Test Sets
## Warning: package 'caTools' was built under R version 4.0.4
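The warning above comes from loading caTools for the split. The 7,999 residual degrees of freedom reported further below imply 8,000 training rows out of 10,000, i.e. an 80/20 split; a sketch with an assumed seed (the name testSet matches the later code, trainingSet is assumed) might be:
library(caTools)
set.seed(123)  # the seed value is an assumption, not taken from the original
# stratified 80/20 split on the outcome variable
split <- sample.split(Default$default, SplitRatio = 0.80)
trainingSet <- subset(Default, split == TRUE)
testSet <- subset(Default, split == FALSE)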
TRAINING THE LOGISTIC REGRESSION MODEL USING THE caret PACKAGE
Setting Control parameters
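A plain cross-validation setup is one common choice at this step; the object name fitControl, the resampling method, and the fold count below are illustrative assumptions:
# 10-fold cross-validation; these settings are assumptions
fitControl <- trainControl(method = "cv", number = 10)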
MODEL BUILDING
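The model is fit with caret's train() wrapper around glm(); the call below is a sketch consistent with the summary output that follows (fitControl comes from the sketch above, and caret reports the Call as NULL, as seen in the output):
caretLogitModel <- train(default ~ student + balance + income,
                         data = trainingSet,
                         method = "glm",
                         family = "binomial",
                         trControl = fitControl)
# summary of the underlying glm fit
summary(caretLogitModel)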
##
## Call:
## NULL
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.4563  -0.1434  -0.0560  -0.0203   3.6795
##
## Coefficients:
##               Estimate  Std. Error  z value  Pr(>|z|)
## (Intercept) -1.088e+01   5.560e-01  -19.576    <2e-16 ***
## studentYes  -5.865e-01   2.638e-01   -2.223    0.0262 *
## balance      5.708e-03   2.583e-04   22.095    <2e-16 ***
## income       4.017e-06   9.319e-06    0.431    0.6664
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2333.8 on 7999 degrees of freedom
## Residual deviance: 1263.7 on 7996 degrees of freedom
## AIC: 1271.7
##
## Number of Fisher Scoring iterations: 8
TESTING THE LOGISTIC REGRESSION MODEL
Predicting Test Set Results
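With the model trained, class probabilities for the held-out observations come from predict() with type = "prob" (the object name logitModelPred matches the cut-off function further below):
# probability of each class for every test-set row
logitModelPred <- predict(caretLogitModel, testSet, type = "prob")
head(logitModelPred)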
Plotting the Predicted Probabilities
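The figure from the original document is not reproduced here; a histogram of the predicted default probabilities could be drawn as follows (column 2 is taken to be P(default = Yes), as in the cut-off code below):
# illustrative plot of the predicted probabilities
hist(logitModelPred[, 2],
     main = "Predicted Probabilities of Default",
     xlab = "P(default = Yes)")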

Confusion Matrix at 50% Cut-Off Probability
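Classifying each test-set row as "Yes" when its predicted probability exceeds 0.5 reproduces the table below; this sketch mirrors the cut-off function defined later in this section:
# classify at the 0.5 cut-off and cross-tabulate against the actual labels
predDefault <- ordered(ifelse(logitModelPred[, 2] > 0.5, "Yes", "No"),
                       levels = c("Yes", "No"))
actualDefault <- ordered(testSet$default, levels = c("Yes", "No"))
table(Predicted = predDefault, Actual = actualDefault)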
##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929
Machine Learning Metrics Using the caret Package
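caret's confusionMatrix() computes the full set of metrics from the same table; because "Yes" is the first factor level, it is treated as the positive class:
# detailed metrics from the caret package, reusing the table from above
confusionMatrix(table(Predicted = predDefault, Actual = actualDefault))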
## Confusion Matrix and Statistics
##
##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.9665
## P-Value [Acc > NIR] : 0.005658
##
## Kappa : 0.4953
##
## Mcnemar's Test P-Value : 2.976e-08
##
## Sensitivity : 0.3582
## Specificity : 0.9979
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.9782
## Prevalence : 0.0335
## Detection Rate : 0.0120
## Detection Prevalence : 0.0140
## Balanced Accuracy : 0.6781
##
## 'Positive' Class : Yes
##
Measuring Machine Learning Metrics at Different Cut-off Probabilities
library(caret)
library(dplyr)
# function to compute confusion-matrix metrics for a given cut-off probability
CmFn <- function(cutoff) {
  # predict class probabilities for the test set
  logitModelPred <- predict(caretLogitModel, testSet, type = "prob")
  # classify at the given cut-off and align the factor levels
  predY <- ordered(ifelse(logitModelPred[, 2] > cutoff, "Yes", "No"),
                   levels = c("Yes", "No"))
  actualY <- ordered(testSet$default, levels = c("Yes", "No"))
  # use confusionMatrix() from the caret package
  cm1 <- confusionMatrix(table(predY, actualY))
  # extract accuracy, sensitivity, specificity, and kappa
  Accuracy <- cm1$overall[1]
  Sensitivity <- cm1$byClass[1]
  Specificity <- cm1$byClass[2]
  Kappa <- cm1$overall[2]
  # combine into a one-row table
  cbind(Accuracy, Sensitivity, Specificity, Kappa)
}
# sequence of cut-off probabilities
cutoff1 <- seq(0.1, 0.9, by = 0.05)
# apply the function to every cut-off
tab2 <- lapply(cutoff1, CmFn)
# stack the 17 one-row tables into a single table
tab3 <- do.call(rbind, tab2)
# print the table with the cut-offs as the first column
tab4 <- as.data.frame(tab3)
tab5 <- cbind(cutoff1, tab4$Accuracy, tab4$Sensitivity, tab4$Specificity, tab4$Kappa)
tab6 <- as.data.frame(tab5)
tab7 <- rename(tab6, cutoff = cutoff1, Accuracy = V2,
               Sensitivity = V3, Specificity = V4, Kappa = V5)
tab7
## cutoff Accuracy Sensitivity Specificity Kappa
## 1 0.10 0.9340 0.73134328 0.9410243 0.39747762
## 2 0.15 0.9510 0.64179104 0.9617175 0.44369071
## 3 0.20 0.9605 0.58208955 0.9736161 0.47671723
## 4 0.25 0.9685 0.53731343 0.9834454 0.51703413
## 5 0.30 0.9705 0.52238806 0.9860321 0.52741778
## 6 0.35 0.9745 0.50746269 0.9906880 0.55850272
## 7 0.40 0.9755 0.46268657 0.9932747 0.54651464
## 8 0.45 0.9765 0.38805970 0.9968960 0.51474354
## 9 0.50 0.9765 0.35820896 0.9979307 0.49529659
## 10 0.55 0.9770 0.34328358 0.9989653 0.49072793
## 11 0.60 0.9755 0.29850746 0.9989653 0.44016635
## 12 0.65 0.9735 0.23880597 0.9989653 0.36749648
## 13 0.70 0.9725 0.19402985 0.9994827 0.31303240
## 14 0.75 0.9700 0.11940299 0.9994827 0.20421237
## 15 0.80 0.9695 0.10447761 0.9994827 0.18081220
## 16 0.85 0.9680 0.05970149 0.9994827 0.10695598
## 17 0.90 0.9680 0.04477612 1.0000000 0.08308142
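Specificity climbs toward 1 as the cut-off increases while sensitivity collapses; Kappa peaks around a cut-off of 0.35, suggesting that a threshold below the conventional 0.5 balances the two better for this imbalanced outcome.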
ROC Curve
## Warning: package 'ROCR' was built under R version 4.0.4
library(ROCR)
# predicted class probabilities on the test set
PredLR <- predict(caretLogitModel, testSet, type = "prob")
# ROCR prediction object; the original scores with 1 - PredLR[, 2],
# which presumes the level ordering of testSet$default set up earlier
lgPredObj <- prediction((1 - PredLR[, 2]), testSet$default)
# true positive rate vs. false positive rate
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve with the diagonal reference line
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")

AUC (Area Under the Curve)
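The area under the ROC curve is extracted from the same ROCR objects built above:
# compute the AUC from the prediction object
aucObj <- performance(lgPredObj, measure = "auc")
aucObj@y.values[[1]]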
## [1] 0.9485604
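An AUC of about 0.95 means the model assigns a higher default score to a randomly chosen defaulter than to a randomly chosen non-defaulter roughly 95% of the time.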