Loading Libraries

library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(Hmisc)
library(corrplot)
library(leaps)
library(glmnet)
library(ROCR)
library(verification)

Loading the Dataset and Data Filtering

cc_data <- read_csv("German_Credit_Card_Data.csv")

# Column indices of the categorical and numeric variables
factor_var <- c(1,3,4,6,7,9,10,12,14,15,17,19,20,21)
num_var <- c(2,5,8,11,13,16,18)

# Convert categorical columns to factors and numeric columns to numeric
cc_data[, factor_var] <- lapply(cc_data[, factor_var], factor)
cc_data[num_var] <- sapply(cc_data[num_var], as.numeric)

summary(cc_data)

Basic Plots

hist.data.frame(cc_data[num_var])

# Bar plot of level counts for each categorical variable
for (c in cc_data[factor_var])
{
  plot(c)
}

cc_data[num_var] %>%
  cor() %>%
  corrplot(type = "upper", method = "number")

# 80/20 train-test split
set.seed(12363925)
index <- sample(1:nrow(cc_data), 0.8*nrow(cc_data))
train <- cc_data[index,]
test <- cc_data[-index,]
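
Before modeling, a quick sanity check (a minimal sketch, not part of the original output) confirms that the 80/20 split keeps the default rate roughly comparable across the two sets:

# Compare the proportion of defaults in the full data, training set, and test set
prop.table(table(cc_data$Default))
prop.table(table(train$Default))
prop.table(table(test$Default))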

Building Logistic Regression Model

# Logistic regression with all predictors
log.fit <- glm(Default ~ ., data = train, family = "binomial")
AIC(log.fit)
## [1] 791.1765
log.fit2 <- glm(Default ~ Existing_checking_account + Duration + Credit_history +
                  Purpose + `Credit Amount` + `Savings account` + Installmant_rate +
                  `Other debtors / guarantors` + `Other installment plans` +
                  `No of credits` + Telephone + `Foreign Worker`, data = train,
                family = "binomial")
AIC(log.fit2)
## [1] 783.5598
anova(log.fit, log.fit2)
## Analysis of Deviance Table
## 
## Model 1: Default ~ Existing_checking_account + Duration + Credit_history + 
##     Purpose + `Credit Amount` + `Savings account` + `Present employment since` + 
##     Installmant_rate + `Personal status and Tax` + `Other debtors / guarantors` + 
##     Present_residence + Property + Age + `Other installment plans` + 
##     Housing + `No of credits` + Job + `Liable people` + Telephone + 
##     `Foreign Worker`
## Model 2: Default ~ Existing_checking_account + Duration + Credit_history + 
##     Purpose + `Credit Amount` + `Savings account` + Installmant_rate + 
##     `Other debtors / guarantors` + `Other installment plans` + 
##     `No of credits` + Telephone + `Foreign Worker`
##   Resid. Df Resid. Dev  Df Deviance
## 1       751     693.18             
## 2       769     721.56 -18  -28.383
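
The deviance table above reports the drop in deviance but no test statistic. As a small follow-up (not part of the original output), the same nested comparison can be run as a likelihood-ratio chi-square test:

# Likelihood-ratio test: does the full model improve significantly on the reduced one?
anova(log.fit2, log.fit, test = "Chisq")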

Stepwise Model Selection

# Intercept-only model (lower bound of the search)
null.model <- glm(Default ~ 1, data = train, family = "binomial")
# Reduced model from above (upper bound of the search)
full.model <- glm(Default ~ Existing_checking_account + Duration +
                    Credit_history + Purpose + `Credit Amount` + `Savings account` 
                  + Installmant_rate + `Other debtors / guarantors` + 
                    `Other installment plans` +`No of credits` + Telephone + 
                    `Foreign Worker`, data = train, family = "binomial")

forward <- step(null.model, scope = list(lower=null.model, upper=full.model), 
               k = 2, direction="forward")
#forward$anova

backward <- step(full.model, scope = list(lower=null.model, upper=full.model), 
                 k = 2, direction="backward")
#backward$anova

both <- step(null.model, scope = list(lower=null.model, upper=full.model), 
                 k = 2, direction="both")
#both$anova
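
As a quick side-by-side check (a sketch, assuming all three searches finish), the models chosen by the three search directions can be compared on AIC:

# Compare the forward, backward, and stepwise selections; lower AIC is better
AIC(forward, backward, both)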

Model Accuracy

# AIC and BIC of the selected model
AIC(both)
## [1] 783.5598
BIC(both)
## [1] 928.7827
# Prediction on the training data
train_prob <- predict(both, newdata = train, type = "response")
train$pred <- ifelse(train_prob > 0.5, 2, 1)
mean(train$pred == train$Default)
## [1] 0.78875
#ROC Curve
roc.plot(train$Default == 2, train_prob)
roc.plot(train$Default == 2, train_prob)$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.8316896 5.738626e-51          NA
# Prediction on the test data
test_prob <- predict(both, newdata = test, type = "response")
test$pred <- ifelse(test_prob > 0.5, 2 ,1)
mean(test$pred == test$Default)
## [1] 0.72
# ROC curve on the test data
roc.plot(test$Default == 2, test_prob)

roc.plot(test$Default == 2, test_prob)$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.7543574 1.438592e-08          NA
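
For symmetry with the LASSO results below, the stepwise model's test-set confusion matrix at the 0.5 cutoff can be tabulated the same way (a sketch, output not shown):

# Confusion matrix of the stepwise logistic model on the test data
table(pred = test$pred, true = test$Default)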

LASSO on Logistic Regression

# Scale the numeric variables and code the categorical variables as integer level codes
cc_data_scale <- cc_data
cc_data_scale[num_var] <- scale(cc_data_scale[num_var])
cc_data_scale[factor_var] <- sapply(cc_data[factor_var], as.numeric)

# Predictor matrix (first 20 columns) and Default response for glmnet
X.train <- as.matrix(cc_data_scale)[index, 1:20]
X.test  <- as.matrix(cc_data_scale)[-index, 1:20]
Y.train <- cc_data_scale$Default[index]
Y.test  <- cc_data_scale$Default[-index]


# Fit the LASSO path (alpha = 1) for the binomial model
lasso_fit <- glmnet(x = X.train, y = Y.train, alpha = 1, family = "binomial")
plot(lasso_fit)

plot(lasso_fit, xvar = "lambda", label=TRUE)

Cross-Validation on LASSO Regression

# 10-fold cross-validation (the glmnet default) over the lambda path
cv.out <- cv.glmnet(x = X.train, y = Y.train, alpha = 1, 
                    family = "binomial", type.measure = "mse")

plot(cv.out)

# lambda that minimizes the cross-validated error
lambda_min <- cv.out$lambda.min
# largest lambda within one standard error of the minimum (a more parsimonious fit)
lambda_1se <- cv.out$lambda.1se
# regression coefficients at lambda.1se
coef(cv.out, s = lambda_1se)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                                       1
## (Intercept)                 2.158415846
## Existing_checking_account  -0.492390009
## Duration                    0.286797087
## Credit_history             -0.284699490
## Purpose                     .          
## Credit Amount               0.083046795
## Savings account            -0.142847400
## Present employment since   -0.109053159
## Installmant_rate            0.086089401
## Personal status and Tax    -0.020346626
## Other debtors / guarantors -0.076803448
## Present_residence           .          
## Property                    0.115861917
## Age                        -0.003675815
## Other installment plans    -0.060858514
## Housing                     .          
## No of credits               .          
## Job                         .          
## Liable people               .          
## Telephone                  -0.101978380
## Foreign Worker             -0.017000915
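
As a convenience, the predictors LASSO keeps (nonzero coefficients at lambda.1se) can also be extracted programmatically; a minimal sketch:

# Keep only the coefficients that were not shrunk to zero
lasso_coef <- coef(cv.out, s = lambda_1se)
lasso_coef[lasso_coef[, 1] != 0, , drop = FALSE]
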
# Prediction on the training data
lasso_prob1 <- predict(cv.out, newx = X.train, s = lambda_1se, type = "response")
train$pred <- ifelse(lasso_prob1 > 0.5, 2, 1)
table(pred = train$pred, true = Y.train)
##     true
## pred   1   2
##    1 510 144
##    2  45 101
mean(train$pred == Y.train)
## [1] 0.76375
#ROC Curve on LASSO for training Data 
roc.plot(train$Default == 2, lasso_prob1)
roc.plot(train$Default == 2, lasso_prob1)$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.8025666 9.311367e-43          NA
# Prediction on the test data
lasso_prob <- predict(cv.out, newx = X.test, s = lambda_1se, type = "response")
test$pred <- ifelse(lasso_prob > 0.5, 2, 1)
table(pred = test$pred, true = Y.test)
##     true
## pred   1   2
##    1 127  39
##    2  18  16
mean(test$pred == Y.test)
## [1] 0.715
#ROC Curve on LASSO for test Data 
roc.plot(test$Default == 2, lasso_prob)

roc.plot(test$Default == 2, lasso_prob)$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.7458307 4.099407e-08          NA
# Overlay the test-set ROC curves of the stepwise and LASSO models
roc.plot(x = test$Default == "2", pred = cbind(test_prob, lasso_prob), 
         legend = TRUE, leg.text = c("Variable Selection", "LASSO Regression"))$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.7543574 1.438592e-08          NA
## 2 Model  2 0.7458307 4.099407e-08          NA

Calculating the Misclassification Cost on the Training Data

# define the search grid of probability cutoffs from 0.01 to 0.99
searchgrid = seq(0.01, 0.99, 0.01)
result = cbind(searchgrid, NA)

# in the cost function, both r and pi are vectors: r = truth, pi = predicted probability
cost1 <- function(r, pi) 
{
  weight1 = 5   # cost of misclassifying a default (2) as non-default
  weight0 = 1   # cost of misclassifying a non-default (1) as default
  c1 = (r == 2) & (pi < pcut)  # logical vector - true if actual 2 but predicted 1
  c0 = (r == 1) & (pi > pcut)  # logical vector - true if actual 1 but predicted 2
  return(mean(weight1 * c1 + weight0 * c0))
}

credit.glm1 <- both
for (i in 1:length(searchgrid)) 
{
  pcut <- result[i, 1]   # cost1 reads this cutoff from the global environment
  # assign the cost to the 2nd column
  result[i, 2] <- cost1(train$Default, predict(credit.glm1, type = "response"))
}
plot(result, ylab = "Cost in Training Set")

result[which(result[,2] == min(result[,2])),]
## searchgrid            
##    0.14000    0.46875
# we get the minimum cost at a probability cutoff of 0.14
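
As a hedged follow-up (output not shown here), the 0.14 cutoff can be applied back to the training predictions to inspect the resulting confusion matrix:

# Re-classify the training data at the cost-minimizing cutoff of 0.14
train_pred_cost <- ifelse(predict(credit.glm1, type = "response") > 0.14, 2, 1)
table(pred = train_pred_cost, true = train$Default)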

Calculating the Misclassification Cost on the Test Data

# define the search grid of probability cutoffs from 0.01 to 0.99
searchgrid = seq(0.01, 0.99, 0.01)
result = cbind(searchgrid, NA)

# same cost function as above: r = truth, pi = predicted probability
cost1 <- function(r, pi) 
{
  weight1 = 5   # cost of misclassifying a default (2) as non-default
  weight0 = 1   # cost of misclassifying a non-default (1) as default
  c1 = (r == 2) & (pi < pcut)  # logical vector - true if actual 2 but predicted 1
  c0 = (r == 1) & (pi > pcut)  # logical vector - true if actual 1 but predicted 2
  return(mean(weight1 * c1 + weight0 * c0))
}

credit.glm1 <- both
for (i in 1:length(searchgrid)) 
{
  pcut <- result[i, 1]   # cost1 reads this cutoff from the global environment
  # assign the cost to the 2nd column
  result[i, 2] <- cost1(test$Default, predict(credit.glm1, newdata = test, 
                                              type = "response"))
}
plot(result, ylab = "Cost in Test Set")

result[which(result[,2] == min(result[,2])),]
## searchgrid            
##      0.170      0.455
# we get the minimum cost at a probability cutoff of 0.17
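
The same check can be sketched for the test set with the 0.17 cutoff found above (output not shown):

# Re-classify the test data at the cost-minimizing cutoff of 0.17
test_pred_cost <- ifelse(predict(credit.glm1, newdata = test, type = "response") > 0.17, 2, 1)
table(pred = test_pred_cost, true = test$Default)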