Load Libraries

library(readr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caTools)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess

Prepare data

data <- read_csv("../S4/DefaultData.csv")
## Parsed with column specification:
## cols(
##   default = col_character(),
##   student = col_character(),
##   balance = col_double(),
##   income = col_double()
## )
dim(data)
## [1] 10000     4
# column names
colnames(data)
## [1] "default" "student" "balance" "income"
# structure of dataframe
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of  4 variables:
##  $ default: chr  "No" "No" "No" "No" ...
##  $ student: chr  "No" "Yes" "No" "No" ...
##  $ balance: num  730 817 1074 529 786 ...
##  $ income : num  44362 12106 31767 35704 38463 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   default = col_character(),
##   ..   student = col_character(),
##   ..   balance = col_double(),
##   ..   income = col_double()
##   .. )
data$default <- as.factor(data$default)
data$student <- as.factor(data$student)

data$default <- ordered(data$default, levels = c("Yes", "No"))
levels(data$default)
## [1] "Yes" "No"
set.seed(2341)
trainIndex <- createDataPartition(data$default, p=0.80, list = FALSE)
train_data <- data[trainIndex,]
test_data <- data[-trainIndex,]

dim(train_data)
## [1] 8001    4
dim(test_data)
## [1] 1999    4
round(prop.table(table(train_data$default))*100,2)
## 
##   Yes    No 
##  3.34 96.66
round(prop.table(table(test_data$default))*100,2)
## 
##  Yes   No 
##  3.3 96.7
trctrl <- trainControl(method = "repeatedcv",
                       number = 10,
                       repeats = 3)
set.seed(3333)
knn_fit  <- train(default ~ ., 
                         data = train_data,
                         method = "knn",
                         trControl = trctrl,
                         preProcess = c("center", "scale"),
                         tuneLength = 10)
knn_fit
## k-Nearest Neighbors 
## 
## 8001 samples
##    3 predictor
##    2 classes: 'Yes', 'No' 
## 
## Pre-processing: centered (3), scaled (3) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 7200, 7201, 7200, 7201, 7201, 7201, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9702956  0.4104444
##    7  0.9713784  0.4200407
##    9  0.9719200  0.4246494
##   11  0.9725033  0.4287720
##   13  0.9727534  0.4313951
##   15  0.9729203  0.4270575
##   17  0.9735034  0.4358933
##   19  0.9735034  0.4366315
##   21  0.9731285  0.4201842
##   23  0.9731284  0.4160789
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.
knn_pred <- predict(knn_fit, test_data, type="raw")

cm <- table(Predicted=knn_pred, Actual=test_data$default)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes   19   10
##       No    47 1923
##                                           
##                Accuracy : 0.9715          
##                  95% CI : (0.9632, 0.9783)
##     No Information Rate : 0.967           
##     P-Value [Acc > NIR] : 0.1429          
##                                           
##                   Kappa : 0.3877          
##                                           
##  Mcnemar's Test P-Value : 1.858e-06       
##                                           
##             Sensitivity : 0.287879        
##             Specificity : 0.994827        
##          Pos Pred Value : 0.655172        
##          Neg Pred Value : 0.976142        
##              Prevalence : 0.033017        
##          Detection Rate : 0.009505        
##    Detection Prevalence : 0.014507        
##       Balanced Accuracy : 0.641353        
##                                           
##        'Positive' Class : Yes             
## 
PredLR <- predict(knn_fit, test_data,type = "prob")
levels(test_data$default)
## [1] "Yes" "No"
test_data$default <- ordered(test_data$default, levels = c("Yes", "No"))
lgPredObj <- prediction(PredLR[1],test_data$default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)

aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.1004562

Logistic

# fit logistic regression model 
 logitModel <- glm(default ~ 
                          balance 
                        + income 
                        + student, 
                        data = train_data, 
                        family = binomial())
# summary of the logistic regression model 
summary(logitModel)
## 
## Call:
## glm(formula = default ~ balance + income + student, family = binomial(), 
##     data = train_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.7170   0.0205   0.0573   0.1431   2.1358  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.104e+01  5.565e-01  19.846   <2e-16 ***
## balance     -5.719e-03  2.582e-04 -22.151   <2e-16 ***
## income      -8.800e-06  9.283e-06  -0.948   0.3431    
## studentYes   5.524e-01  2.684e-01   2.058   0.0395 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2340.6  on 8000  degrees of freedom
## Residual deviance: 1262.4  on 7997  degrees of freedom
## AIC: 1270.4
## 
## Number of Fisher Scoring iterations: 8
logitModelPred <- predict(logitModel, test_data, type = "response")
# plot of probabilities
plot(logitModelPred, 
     main = "Scatterplot of Probabilities of Default (test data)", 
     xlab = "Customer ID", ylab = "Predicted Probability of Default")

# setting the cut-off probablity
classify20 <- ifelse(logitModelPred > 0.2,"Yes","No")

# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
test_data$default <- ordered(test_data$default, levels = c("Yes", "No"))

# confusion matrix
cm <- table(Predicted = classify20, Actual = test_data$default)
cm
##          Actual
## Predicted  Yes   No
##       Yes   61 1932
##       No     5    1
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##          Actual
## Predicted  Yes   No
##       Yes   61 1932
##       No     5    1
##                                           
##                Accuracy : 0.031           
##                  95% CI : (0.0239, 0.0396)
##     No Information Rate : 0.967           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.005          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9242424       
##             Specificity : 0.0005173       
##          Pos Pred Value : 0.0306071       
##          Neg Pred Value : 0.1666667       
##              Prevalence : 0.0330165       
##          Detection Rate : 0.0305153       
##    Detection Prevalence : 0.9969985       
##       Balanced Accuracy : 0.4623799       
##                                           
##        'Positive' Class : Yes             
## 
PredLR <- predict(logitModel, test_data,type = "response")
lgPredObj <- prediction(PredLR,test_data$default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)

aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.95384