library(beepr)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(InformationValue)
library(formula.tools)
library(knitr)
setwd("C:/Users/bstok/Desktop/Park DA 6213 Data Drive Decision Making and Design/Case Study 2")
#knitr::opts_chunk$set(echo = TRUE)
load("GermanData.RData")
credit$V21[credit$V21==1]=0;
credit$V21[credit$V21==2]=1;
credit$V21 = as.factor(credit$V21)
credit$V1 = as.factor(credit$V1) #status of checking account
credit$V3 = as.factor(credit$V3) #credit history
credit$V4 = as.factor(credit$V4) #purpose
credit$V6 = as.factor(credit$V6) #savings accounts/bonds
credit$V7 = as.factor(credit$V7) #present employment
credit$V9 = as.factor(credit$V9) #personal status and sex
credit$V10 = as.factor(credit$V10) #other debtors/guarantors: 1 none, 2 co, 3 guarantor
credit$V12 = as.factor(credit$V12) #property: 1 re, 2 life insurance, 3 car, 4 unknown
credit$V14 = as.factor(credit$V14) #other installment plans: 1 bank, 2 stores, 3 none
credit$V15 = as.factor(credit$V15) #housing: 1 rent, 2 own , 3 for free
credit$V17 = as.factor(credit$V17) #job: 1 unemployed, 2 unskilled, 3 skilled, 4 management
credit$V19 = as.factor(credit$V19) #telephone: 1 none, 2 under name
credit$V20 = as.factor(credit$V20) #foreign worker: 1 yes, 2 no
train <- credit[1:800,]
test <- credit[801:1000,]
set.seed(1)
index = sample(1:nrow(train), 0.8*nrow(train))
pseudotrain = train[index,]
pseudotest = train[-index,]
Model 1 (All predictors)
#base model
model = glm(formula = V21 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 +
V10 + V11 + V12 + V13 + V14 + V15 + V16 + V17 + V18 + V19 + V20, family = "binomial", data = pseudotrain)
#influential points
inf <- 4/(nrow(pseudotrain))
out <- which(cooks.distance(model)>inf)
#model formula
f = as.character(formula(model))
#remove influential points
model = glm(formula = f, family = "binomial", data = pseudotrain[-out,])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#test on pseudotest
preds = predict(model, newdata = pseudotest, type="response")
#roc curve
roc = roc(pseudotest$V21 ~ preds, plot = T, print.auc = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#find optimal cutoff
optCutOff <- optimalCutoff(pseudotest$V21, preds, optimiseFor = "Both")
#confusion matrix
test.preds = predict(model, test, type="response")
test.preds.class = ifelse(test.preds > optCutOff, 1, 0)
mean(test$V21 != test.preds.class, na.rm=TRUE) #final error rate for case study
## [1] 0.325
mean(test$V21[test$V21 == 0 ] != test.preds.class[test$V21 == 0 ], na.rm=TRUE) # false positive
## [1] 0.3381295
mean(test$V21[test$V21 == 1 ] != test.preds.class[test$V21 == 1 ], na.rm=TRUE) # false negative
## [1] 0.295082
caret::confusionMatrix(as.factor(test.preds.class), as.factor(test$V21), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 92 18
## 1 47 43
##
## Accuracy : 0.675
## 95% CI : (0.6053, 0.7394)
## No Information Rate : 0.695
## P-Value [Acc > NIR] : 0.7568554
##
## Kappa : 0.3236
##
## Mcnemar's Test P-Value : 0.0005147
##
## Sensitivity : 0.7049
## Specificity : 0.6619
## Pos Pred Value : 0.4778
## Neg Pred Value : 0.8364
## Prevalence : 0.3050
## Detection Rate : 0.2150
## Detection Prevalence : 0.4500
## Balanced Accuracy : 0.6834
##
## 'Positive' Class : 1
##
Model 2 (Stepwise Selection with AIC criteria)
model = glm(formula = V21 ~ V1 + V2 + V3 + V5 + V7 + V8 + V9 +
V10 + V13 + V16 + V20, family = "binomial", data = pseudotrain)
#influential points
inf <- 4/(nrow(pseudotrain))
out <- which(cooks.distance(model)>inf)
#model formula
f = as.character(formula(model))
#remove influential points
model = glm(formula = f, family = "binomial", data = pseudotrain[-out,])
#test on pseudotest
preds = predict(model, newdata = pseudotest, type="response")
#roc curve
roc = roc(pseudotest$V21 ~ preds, plot = T, print.auc = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#find optimal cutoff
optCutOff <- optimalCutoff(pseudotest$V21, preds, optimiseFor = "Both")
#confusion matrix
test.preds = predict(model, test, type="response")
test.preds.class = ifelse(test.preds > optCutOff, 1, 0)
mean(test$V21 != test.preds.class, na.rm=TRUE) #final error rate for case study
## [1] 0.33
mean(test$V21[test$V21 == 0 ] != test.preds.class[test$V21 == 0 ], na.rm=TRUE) # false positive
## [1] 0.3669065
mean(test$V21[test$V21 == 1 ] != test.preds.class[test$V21 == 1 ], na.rm=TRUE) # false negative
## [1] 0.2459016
caret::confusionMatrix(as.factor(test.preds.class), as.factor(test$V21), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 88 15
## 1 51 46
##
## Accuracy : 0.67
## 95% CI : (0.6002, 0.7347)
## No Information Rate : 0.695
## P-Value [Acc > NIR] : 0.8017
##
## Kappa : 0.3322
##
## Mcnemar's Test P-Value : 1.646e-05
##
## Sensitivity : 0.7541
## Specificity : 0.6331
## Pos Pred Value : 0.4742
## Neg Pred Value : 0.8544
## Prevalence : 0.3050
## Detection Rate : 0.2300
## Detection Prevalence : 0.4850
## Balanced Accuracy : 0.6936
##
## 'Positive' Class : 1
##
Model 3 (Stepwise Selection with AIC criteria and interaction terms)
model = glm(formula = V21 ~ V1 + V2 + V3 + V5 + V7 + V8 + V9 +
V10 + V13 + V16 + V20 + V3:V5 + V7:V8, family = "binomial", data = pseudotrain)
#influential points
inf <- 4/(nrow(pseudotrain))
out <- which(cooks.distance(model)>inf)
#model formula
f = as.character(formula(model))
#remove influential points
model = glm(formula = f, family = "binomial", data = pseudotrain[-out,])
#test on pseudotest
preds = predict(model, newdata = pseudotest, type="response")
#roc curve
roc = roc(pseudotest$V21 ~ preds, plot = T, print.auc = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#find optimal cutoff
optCutOff <- optimalCutoff(pseudotest$V21, preds, optimiseFor = "Both")
#confusion matrix
test.preds = predict(model, test, type="response")
test.preds.class = ifelse(test.preds > optCutOff, 1, 0)
mean(test$V21 != test.preds.class, na.rm=TRUE) #final error rate for case study
## [1] 0.3
mean(test$V21[test$V21 == 0 ] != test.preds.class[test$V21 == 0 ], na.rm=TRUE) # false positive
## [1] 0.3021583
mean(test$V21[test$V21 == 1 ] != test.preds.class[test$V21 == 1 ], na.rm=TRUE) # false negative
## [1] 0.295082
caret::confusionMatrix(as.factor(test.preds.class), as.factor(test$V21), positive = '1')
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 97 18
## 1 42 43
##
## Accuracy : 0.7
## 95% CI : (0.6314, 0.7626)
## No Information Rate : 0.695
## P-Value [Acc > NIR] : 0.473370
##
## Kappa : 0.3627
##
## Mcnemar's Test P-Value : 0.002985
##
## Sensitivity : 0.7049
## Specificity : 0.6978
## Pos Pred Value : 0.5059
## Neg Pred Value : 0.8435
## Prevalence : 0.3050
## Detection Rate : 0.2150
## Detection Prevalence : 0.4250
## Balanced Accuracy : 0.7014
##
## 'Positive' Class : 1
##