Load Libraries
library(readr)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caTools)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
Prepare data
data <- read_csv("../S4/DefaultData.csv")
## Parsed with column specification:
## cols(
## default = col_character(),
## student = col_character(),
## balance = col_double(),
## income = col_double()
## )
dim(data)
## [1] 10000 4
# column names
colnames(data)
## [1] "default" "student" "balance" "income"
# structure of dataframe
str(data)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 10000 obs. of 4 variables:
## $ default: chr "No" "No" "No" "No" ...
## $ student: chr "No" "Yes" "No" "No" ...
## $ balance: num 730 817 1074 529 786 ...
## $ income : num 44362 12106 31767 35704 38463 ...
## - attr(*, "spec")=
## .. cols(
## .. default = col_character(),
## .. student = col_character(),
## .. balance = col_double(),
## .. income = col_double()
## .. )
data$default <- as.factor(data$default)
data$student <- as.factor(data$student)
data$default <- ordered(data$default, levels = c("Yes", "No"))
levels(data$default)
## [1] "Yes" "No"
set.seed(2341)
trainIndex <- createDataPartition(data$default, p=0.80, list = FALSE)
train_data <- data[trainIndex,]
test_data <- data[-trainIndex,]
dim(train_data)
## [1] 8001 4
dim(test_data)
## [1] 1999 4
round(prop.table(table(train_data$default))*100,2)
##
## Yes No
## 3.34 96.66
round(prop.table(table(test_data$default))*100,2)
##
## Yes No
## 3.3 96.7
trctrl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 3)
set.seed(3333)
knn_fit <- train(default ~ .,
data = train_data,
method = "knn",
trControl = trctrl,
preProcess = c("center", "scale"),
tuneLength = 10)
knn_fit
## k-Nearest Neighbors
##
## 8001 samples
## 3 predictor
## 2 classes: 'Yes', 'No'
##
## Pre-processing: centered (3), scaled (3)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 7200, 7201, 7200, 7201, 7201, 7201, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9702956 0.4104444
## 7 0.9713784 0.4200407
## 9 0.9719200 0.4246494
## 11 0.9725033 0.4287720
## 13 0.9727534 0.4313951
## 15 0.9729203 0.4270575
## 17 0.9735034 0.4358933
## 19 0.9735034 0.4366315
## 21 0.9731285 0.4201842
## 23 0.9731284 0.4160789
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 19.
knn_pred <- predict(knn_fit, test_data, type="raw")
cm <- table(Predicted=knn_pred, Actual=test_data$default)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 19 10
## No 47 1923
##
## Accuracy : 0.9715
## 95% CI : (0.9632, 0.9783)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 0.1429
##
## Kappa : 0.3877
##
## Mcnemar's Test P-Value : 1.858e-06
##
## Sensitivity : 0.287879
## Specificity : 0.994827
## Pos Pred Value : 0.655172
## Neg Pred Value : 0.976142
## Prevalence : 0.033017
## Detection Rate : 0.009505
## Detection Prevalence : 0.014507
## Balanced Accuracy : 0.641353
##
## 'Positive' Class : Yes
##
PredLR <- predict(knn_fit, test_data,type = "prob")
levels(test_data$default)
## [1] "Yes" "No"
test_data$default <- ordered(test_data$default, levels = c("Yes", "No"))
lgPredObj <- prediction(PredLR[1],test_data$default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)

aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.1004562
Logistic
# fit logistic regression model
logitModel <- glm(default ~
balance
+ income
+ student,
data = train_data,
family = binomial())
# summary of the logistic regression model
summary(logitModel)
##
## Call:
## glm(formula = default ~ balance + income + student, family = binomial(),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7170 0.0205 0.0573 0.1431 2.1358
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.104e+01 5.565e-01 19.846 <2e-16 ***
## balance -5.719e-03 2.582e-04 -22.151 <2e-16 ***
## income -8.800e-06 9.283e-06 -0.948 0.3431
## studentYes 5.524e-01 2.684e-01 2.058 0.0395 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2340.6 on 8000 degrees of freedom
## Residual deviance: 1262.4 on 7997 degrees of freedom
## AIC: 1270.4
##
## Number of Fisher Scoring iterations: 8
logitModelPred <- predict(logitModel, test_data, type = "response")
# plot of probabilities
plot(logitModelPred,
main = "Scatterplot of Probabilities of Default (test data)",
xlab = "Customer ID", ylab = "Predicted Probability of Default")

# setting the cut-off probablity
classify20 <- ifelse(logitModelPred > 0.2,"Yes","No")
# ordering the levels
classify20 <- ordered(classify20, levels = c("Yes", "No"))
test_data$default <- ordered(test_data$default, levels = c("Yes", "No"))
# confusion matrix
cm <- table(Predicted = classify20, Actual = test_data$default)
cm
## Actual
## Predicted Yes No
## Yes 61 1932
## No 5 1
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## Actual
## Predicted Yes No
## Yes 61 1932
## No 5 1
##
## Accuracy : 0.031
## 95% CI : (0.0239, 0.0396)
## No Information Rate : 0.967
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.005
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9242424
## Specificity : 0.0005173
## Pos Pred Value : 0.0306071
## Neg Pred Value : 0.1666667
## Prevalence : 0.0330165
## Detection Rate : 0.0305153
## Detection Prevalence : 0.9969985
## Balanced Accuracy : 0.4623799
##
## 'Positive' Class : Yes
##
PredLR <- predict(logitModel, test_data,type = "response")
lgPredObj <- prediction(PredLR,test_data$default)
lgPerfObj <- performance(lgPredObj, "tpr","fpr")
# plotting ROC curve
plot(lgPerfObj,main = "ROC Curve",col = 2,lwd = 2)

aucLR <- performance(lgPredObj, measure = "auc")
aucLR <- aucLR@y.values[[1]]
aucLR
## [1] 0.95384