library(beepr)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(InformationValue)
library(formula.tools)
library(knitr)

setwd("C:/Users/bstok/Desktop/Park DA 6213 Data Drive Decision Making and Design/Case Study 2")
#knitr::opts_chunk$set(echo = TRUE)
load("GermanData.RData")
credit$V21[credit$V21==1]=0; 
credit$V21[credit$V21==2]=1; 
credit$V21 = as.factor(credit$V21)

credit$V1 = as.factor(credit$V1) #status of checking account 
credit$V3 = as.factor(credit$V3) #credit history 
credit$V4 = as.factor(credit$V4) #purpose 
credit$V6 = as.factor(credit$V6) #savings accounts/bonds 
credit$V7 = as.factor(credit$V7) #present employment 
credit$V9 = as.factor(credit$V9) #personal status and sex 
credit$V10 = as.factor(credit$V10) #other debtors/guarantors: 1 none, 2 co, 3 guarantor
credit$V12 = as.factor(credit$V12) #property: 1 re, 2 life insurance, 3 car, 4 unknown 
credit$V14 = as.factor(credit$V14) #other installment plans: 1 bank, 2 stores, 3 none 
credit$V15 = as.factor(credit$V15) #housing: 1 rent, 2 own , 3 for free 
credit$V17 = as.factor(credit$V17) #job: 1 unemployed, 2 unskilled, 3 skilled, 4 management 
credit$V19 = as.factor(credit$V19) #telephone: 1 none, 2 under name 
credit$V20 = as.factor(credit$V20) #foreign worker: 1 yes, 2 no 

train <- credit[1:800,]
test <- credit[801:1000,]

set.seed(1)
index = sample(1:nrow(train), 0.8*nrow(train))
pseudotrain = train[index,]
pseudotest = train[-index,]

Model 1 (All predictors)

#base model
model = glm(formula = V21 ~ V1 + V2 + V3 + V4 + V5 + V6 + V7 + V8 + V9 + 
    V10 + V11 + V12 + V13 + V14 + V15 + V16 + V17 + V18 + V19 + V20, family = "binomial", data = pseudotrain)

#influential points
inf <- 4/(nrow(pseudotrain)) 
out <- which(cooks.distance(model)>inf)

#model formula
f = as.character(formula(model))

#remove influential points
model = glm(formula = f, family = "binomial", data = pseudotrain[-out,])
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
#test on pseudotest
preds = predict(model, newdata = pseudotest, type="response")

#roc curve
roc = roc(pseudotest$V21 ~ preds, plot = T, print.auc = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#find optimal cutoff
optCutOff <- optimalCutoff(pseudotest$V21, preds, optimiseFor = "Both")

#confusion matrix
test.preds = predict(model, test, type="response")
test.preds.class = ifelse(test.preds > optCutOff, 1, 0)

mean(test$V21 != test.preds.class, na.rm=TRUE) #final error rate for case study
## [1] 0.325
mean(test$V21[test$V21 == 0 ] != test.preds.class[test$V21 == 0 ], na.rm=TRUE)  # false positive
## [1] 0.3381295
mean(test$V21[test$V21 == 1 ] != test.preds.class[test$V21 == 1 ], na.rm=TRUE)  # false negative
## [1] 0.295082
caret::confusionMatrix(as.factor(test.preds.class), as.factor(test$V21), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 92 18
##          1 47 43
##                                           
##                Accuracy : 0.675           
##                  95% CI : (0.6053, 0.7394)
##     No Information Rate : 0.695           
##     P-Value [Acc > NIR] : 0.7568554       
##                                           
##                   Kappa : 0.3236          
##                                           
##  Mcnemar's Test P-Value : 0.0005147       
##                                           
##             Sensitivity : 0.7049          
##             Specificity : 0.6619          
##          Pos Pred Value : 0.4778          
##          Neg Pred Value : 0.8364          
##              Prevalence : 0.3050          
##          Detection Rate : 0.2150          
##    Detection Prevalence : 0.4500          
##       Balanced Accuracy : 0.6834          
##                                           
##        'Positive' Class : 1               
## 

Model 2 (Stepwise Selection with AIC criteria)

model = glm(formula = V21 ~ V1 + V2 + V3  + V5  + V7 + V8 + V9 + 
    V10  + V13 + V16 + V20, family = "binomial", data = pseudotrain)

#influential points
inf <- 4/(nrow(pseudotrain)) 
out <- which(cooks.distance(model)>inf)

#model formula
f = as.character(formula(model))

#remove influential points
model = glm(formula = f, family = "binomial", data = pseudotrain[-out,])

#test on pseudotest
preds = predict(model, newdata = pseudotest, type="response")

#roc curve
roc = roc(pseudotest$V21 ~ preds, plot = T, print.auc = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#find optimal cutoff
optCutOff <- optimalCutoff(pseudotest$V21, preds, optimiseFor = "Both")

#confusion matrix
test.preds = predict(model, test, type="response")
test.preds.class = ifelse(test.preds > optCutOff, 1, 0)

mean(test$V21 != test.preds.class, na.rm=TRUE) #final error rate for case study
## [1] 0.33
mean(test$V21[test$V21 == 0 ] != test.preds.class[test$V21 == 0 ], na.rm=TRUE)  # false positive
## [1] 0.3669065
mean(test$V21[test$V21 == 1 ] != test.preds.class[test$V21 == 1 ], na.rm=TRUE)  # false negative
## [1] 0.2459016
caret::confusionMatrix(as.factor(test.preds.class), as.factor(test$V21), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 88 15
##          1 51 46
##                                           
##                Accuracy : 0.67            
##                  95% CI : (0.6002, 0.7347)
##     No Information Rate : 0.695           
##     P-Value [Acc > NIR] : 0.8017          
##                                           
##                   Kappa : 0.3322          
##                                           
##  Mcnemar's Test P-Value : 1.646e-05       
##                                           
##             Sensitivity : 0.7541          
##             Specificity : 0.6331          
##          Pos Pred Value : 0.4742          
##          Neg Pred Value : 0.8544          
##              Prevalence : 0.3050          
##          Detection Rate : 0.2300          
##    Detection Prevalence : 0.4850          
##       Balanced Accuracy : 0.6936          
##                                           
##        'Positive' Class : 1               
## 

Model 3 (Stepwise Selection with AIC criteria and interaction terms)

model = glm(formula = V21 ~ V1 + V2 + V3  + V5  + V7 + V8 + V9 + 
    V10  + V13 + V16 + V20 + V3:V5 + V7:V8, family = "binomial", data = pseudotrain)

#influential points
inf <- 4/(nrow(pseudotrain)) 
out <- which(cooks.distance(model)>inf)

#model formula
f = as.character(formula(model))

#remove influential points
model = glm(formula = f, family = "binomial", data = pseudotrain[-out,])

#test on pseudotest
preds = predict(model, newdata = pseudotest, type="response")

#roc curve
roc = roc(pseudotest$V21 ~ preds, plot = T, print.auc = T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#find optimal cutoff
optCutOff <- optimalCutoff(pseudotest$V21, preds, optimiseFor = "Both")

#confusion matrix
test.preds = predict(model, test, type="response")
test.preds.class = ifelse(test.preds > optCutOff, 1, 0)

mean(test$V21 != test.preds.class, na.rm=TRUE) #final error rate for case study
## [1] 0.3
mean(test$V21[test$V21 == 0 ] != test.preds.class[test$V21 == 0 ], na.rm=TRUE)  # false positive
## [1] 0.3021583
mean(test$V21[test$V21 == 1 ] != test.preds.class[test$V21 == 1 ], na.rm=TRUE)  # false negative
## [1] 0.295082
caret::confusionMatrix(as.factor(test.preds.class), as.factor(test$V21), positive = '1')
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 97 18
##          1 42 43
##                                           
##                Accuracy : 0.7             
##                  95% CI : (0.6314, 0.7626)
##     No Information Rate : 0.695           
##     P-Value [Acc > NIR] : 0.473370        
##                                           
##                   Kappa : 0.3627          
##                                           
##  Mcnemar's Test P-Value : 0.002985        
##                                           
##             Sensitivity : 0.7049          
##             Specificity : 0.6978          
##          Pos Pred Value : 0.5059          
##          Neg Pred Value : 0.8435          
##              Prevalence : 0.3050          
##          Detection Rate : 0.2150          
##    Detection Prevalence : 0.4250          
##       Balanced Accuracy : 0.7014          
##                                           
##        'Positive' Class : 1               
##