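For context, this section assumes a train/test split carried out earlier: the degrees of freedom in the model summary imply 8,000 training rows, and the confusion matrix implies 2,000 test rows, consistent with an 80/20 split of the Default data from the ISLR package. A minimal sketch of such a split with caTools::sample.split (the seed and object names are assumptions, not shown in the original):
# assumed setup: stratified 80/20 split of ISLR's Default data
library(ISLR)
library(caTools)
set.seed(123)   # arbitrary seed, for reproducibility
split <- sample.split(Default$default, SplitRatio = 0.8)
trainingSet <- subset(Default, split == TRUE)
testSet     <- subset(Default, split == FALSE)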
# recode the target variable in the training set: 1 = default, 0 = no default
trainingSet$default <- ifelse(trainingSet$default == "Yes", 1, 0)
# fit logistic regression model
logitModel <- glm(default ~ balance + income + student,
                  data = trainingSet,
                  family = binomial())
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = default ~ balance + income + student, family = binomial(),
## data = trainingSet)
##
## Deviance Residuals:
##     Min       1Q   Median       3Q      Max
## -2.4563  -0.1434  -0.0560  -0.0203   3.6795
##
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.088e+01  5.560e-01 -19.576   <2e-16 ***
## balance      5.708e-03  2.583e-04  22.095   <2e-16 ***
## income       4.017e-06  9.319e-06   0.431   0.6664
## studentYes  -5.865e-01  2.638e-01  -2.223   0.0262 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2333.8 on 7999 degrees of freedom
## Residual deviance: 1263.7 on 7996 degrees of freedom
## AIC: 1271.7
##
## Number of Fisher Scoring iterations: 8
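Because these coefficients are on the log-odds scale, exponentiating them turns them into odds ratios, which are easier to interpret. A quick sketch using the fitted model above:
# odds ratios: multiplicative change in the odds of default per unit increase
exp(coef(logitModel))
# e.g. exp(5.708e-03) is roughly 1.0057, so each extra dollar of balance
# multiplies the odds of default by about 1.0057, holding the other predictors fixed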
# predict probabilities of default on the test set
logitModelPred <- predict(logitModel, testSet, type = "response")
# plot of probabilities
plot(logitModelPred,
     main = "Scatterplot of Probabilities of Default (test data)",
     xlab = "Customer ID", ylab = "Predicted Probability of Default")
# setting the cut-off probability
classify50 <- ifelse(logitModelPred > 0.5, "Yes", "No")
# order the levels so that "Yes" (default) is treated as the positive class
classify50 <- ordered(classify50, levels = c("Yes", "No"))
testSet$default <- ordered(testSet$default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify50, Actual = testSet$default)
cm
##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929
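As a sanity check, the headline statistics reported by caret below can be recomputed by hand from this table:
# recompute the headline metrics directly from the confusion matrix
accuracy    <- (cm["Yes", "Yes"] + cm["No", "No"]) / sum(cm)  # (24 + 1929) / 2000 = 0.9765
sensitivity <- cm["Yes", "Yes"] / sum(cm[, "Yes"])            # 24 / 67 = 0.3582
specificity <- cm["No", "No"] / sum(cm[, "No"])               # 1929 / 1933 = 0.9979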
# detailed statistics with confusionMatrix() from the caret package
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##          Actual
## Predicted  Yes   No
##       Yes   24    4
##       No    43 1929
##
## Accuracy : 0.9765
## 95% CI : (0.9689, 0.9827)
## No Information Rate : 0.9665
## P-Value [Acc > NIR] : 0.005658
##
## Kappa : 0.4953
##
## Mcnemar's Test P-Value : 2.976e-08
##
## Sensitivity : 0.3582
## Specificity : 0.9979
## Pos Pred Value : 0.8571
## Neg Pred Value : 0.9782
## Prevalence : 0.0335
## Detection Rate : 0.0120
## Detection Prevalence : 0.0140
## Balanced Accuracy : 0.6781
##
## 'Positive' Class : Yes
##
# load dplyr for the rename() used below
library(dplyr)
# function returning confusion-matrix statistics for a given cut-off probability
CmFn <- function(cutoff) {
  # predict probabilities of default on the test set
  logitModelPred <- predict(logitModel, testSet, type = "response")
  C1 <- ifelse(logitModelPred > cutoff, "Yes", "No")
  C2 <- testSet$default
  predY   <- ordered(as.factor(C1), levels = c("Yes", "No"))
  actualY <- ordered(as.factor(C2), levels = c("Yes", "No"))
  # use confusionMatrix() from the caret package
  cm1 <- confusionMatrix(table(predY, actualY))
  # extract accuracy, sensitivity, specificity, and kappa
  Accuracy    <- cm1$overall[1]
  Sensitivity <- cm1$byClass[1]
  Specificity <- cm1$byClass[2]
  Kappa       <- cm1$overall[2]
  # combine into a one-row table
  tab <- cbind(Accuracy, Sensitivity, Specificity, Kappa)
  return(tab)
}
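As a quick check, calling the function at the 0.5 cut-off should reproduce the statistics from the confusion matrix above (Accuracy 0.9765, Sensitivity 0.3582, Specificity 0.9979):
CmFn(0.5)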
# sequence of cut-off probabilities
cutoff1 <- seq(0.1, 0.9, by = 0.05)
# apply the function to each cut-off with lapply
tab2 <- lapply(cutoff1, CmFn)
# combine the list of one-row tables into a single matrix
tab3 <- do.call(rbind, tab2)
# assemble the final table and rename the columns
tab4 <- as.data.frame(tab3)
tab5 <- cbind(cutoff1, tab4$Accuracy, tab4$Sensitivity, tab4$Specificity, tab4$Kappa)
tab6 <- as.data.frame(tab5)
tab7 <- rename(tab6, cutoff = cutoff1, Accuracy = V2,
               Sensitivity = V3, Specificity = V4, kappa = V5)
tab7
## cutoff Accuracy Sensitivity Specificity kappa
## 1 0.10 0.9340 0.73134328 0.9410243 0.39747762
## 2 0.15 0.9510 0.64179104 0.9617175 0.44369071
## 3 0.20 0.9605 0.58208955 0.9736161 0.47671723
## 4 0.25 0.9685 0.53731343 0.9834454 0.51703413
## 5 0.30 0.9705 0.52238806 0.9860321 0.52741778
## 6 0.35 0.9745 0.50746269 0.9906880 0.55850272
## 7 0.40 0.9755 0.46268657 0.9932747 0.54651464
## 8 0.45 0.9765 0.38805970 0.9968960 0.51474354
## 9 0.50 0.9765 0.35820896 0.9979307 0.49529659
## 10 0.55 0.9770 0.34328358 0.9989653 0.49072793
## 11 0.60 0.9755 0.29850746 0.9989653 0.44016635
## 12 0.65 0.9735 0.23880597 0.9989653 0.36749648
## 13 0.70 0.9725 0.19402985 0.9994827 0.31303240
## 14 0.75 0.9700 0.11940299 0.9994827 0.20421237
## 15 0.80 0.9695 0.10447761 0.9994827 0.18081220
## 16 0.85 0.9680 0.05970149 0.9994827 0.10695598
## 17 0.90 0.9680 0.04477612 1.0000000 0.08308142
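The trade-off is easier to see in a picture than in the raw table. A minimal base-graphics sketch, using the column names of tab7 above:
# plot accuracy, sensitivity, and specificity against the cut-off
matplot(tab7$cutoff, tab7[, c("Accuracy", "Sensitivity", "Specificity")],
        type = "l", lty = 1, lwd = 2, col = c("black", "red", "blue"),
        xlab = "Cut-off probability", ylab = "Metric value")
legend("right", legend = c("Accuracy", "Sensitivity", "Specificity"),
       lty = 1, lwd = 2, col = c("black", "red", "blue"))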
# load ROCR for the ROC curve and AUC
library(ROCR)
PredLR <- predict(logitModel, testSet, type = "response")
# testSet$default is ordered with levels c("Yes", "No"), so ROCR treats the
# higher level ("No") as positive; passing 1 - PredLR (the probability of "No")
# keeps the curve oriented correctly
lgPredObj <- prediction((1 - PredLR), testSet$default)
lgPerfObj <- performance(lgPredObj, "tpr", "fpr")
# plotting ROC curve with the 45-degree reference line
plot(lgPerfObj, main = "ROC Curve", col = 2, lwd = 2)
abline(a = 0, b = 1, lwd = 2, lty = 3, col = "black")
# area under the curve
aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.9485604
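A common follow-up is to read a cut-off straight off the ROC curve. A sketch using Youden's J statistic (TPR - FPR), not part of the original analysis:
# find the cut-off maximizing Youden's J along the ROC curve
tpr  <- lgPerfObj@y.values[[1]]
fpr  <- lgPerfObj@x.values[[1]]
cuts <- lgPerfObj@alpha.values[[1]]
best <- which.max(tpr - fpr)
# cuts is on the (1 - PredLR) scale, so convert back to P(default = "Yes")
1 - cuts[best]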