Loading Libraries

library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(Hmisc)
library(corrplot)
library(leaps)
library(glmnet)
library(ROCR)
library(verification)

Loading the Dataset and Data Filtering

cc_data <- read_csv("German_Credit_Card_Data.csv")

# Column indices of the categorical and numeric variables
factor_var <- c(1,3,4,6,7,9,10,12,14,15,17,19,20,21)
num_var <- c(2,5,8,11,13,16,18)

# Convert categorical columns to factors and numeric columns to numeric
cc_data[, factor_var] <- lapply(cc_data[, factor_var], factor)
cc_data[num_var] <- sapply(cc_data[num_var], as.numeric)

summary(cc_data)

Basic Plots

hist.data.frame(cc_data[num_var])

# Bar plot of level counts for each categorical variable
for (c in cc_data[factor_var])
{
  plot(c)
}

cc_data[num_var] %>%
  cor() %>%
  corrplot(type = "upper", method = "number")

# 80/20 train-test split
set.seed(12363925)
index <- sample(1:nrow(cc_data), 0.8*nrow(cc_data))
train <- cc_data[index,]
test <- cc_data[-index,]
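
Before modeling, a quick sanity check (a minimal sketch, not part of the original output) confirms that the 80/20 split keeps the default rate roughly comparable across the two sets:

# Compare the proportion of defaults in the full data, training set, and test set
prop.table(table(cc_data$Default))
prop.table(table(train$Default))
prop.table(table(test$Default))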

Building Logistic Regression Model

# Logistic regression with all predictors
log.fit <- glm(Default ~ ., data = train, family = "binomial")
AIC(log.fit)
## [1] 791.1765
log.fit2 <- glm(Default ~ Existing_checking_account + Duration + Credit_history +
                  Purpose + `Credit Amount` + `Savings account` + Installmant_rate +
                  `Other debtors / guarantors` + `Other installment plans` +
                  `No of credits` + Telephone + `Foreign Worker`, data = train,
                family = "binomial")
AIC(log.fit2)
## [1] 783.5598
anova(log.fit, log.fit2)
## Analysis of Deviance Table
## 
## Model 1: Default ~ Existing_checking_account + Duration + Credit_history + 
##     Purpose + `Credit Amount` + `Savings account` + `Present employment since` + 
##     Installmant_rate + `Personal status and Tax` + `Other debtors / guarantors` + 
##     Present_residence + Property + Age + `Other installment plans` + 
##     Housing + `No of credits` + Job + `Liable people` + Telephone + 
##     `Foreign Worker`
## Model 2: Default ~ Existing_checking_account + Duration + Credit_history + 
##     Purpose + `Credit Amount` + `Savings account` + Installmant_rate + 
##     `Other debtors / guarantors` + `Other installment plans` + 
##     `No of credits` + Telephone + `Foreign Worker`
##   Resid. Df Resid. Dev  Df Deviance
## 1       751     693.18             
## 2       769     721.56 -18  -28.383
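
The deviance table above reports the drop in deviance but no test statistic. As a small follow-up (not part of the original output), the same nested comparison can be run as a likelihood-ratio chi-square test:

# Likelihood-ratio test: does the full model improve significantly on the reduced one?
anova(log.fit2, log.fit, test = "Chisq")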

Stepwise Model Selection

# Intercept-only model (lower bound of the search)
null.model <- glm(Default ~ 1, data = train, family = "binomial")
# Reduced model from above (upper bound of the search)
full.model <- glm(Default ~ Existing_checking_account + Duration +
                    Credit_history + Purpose + `Credit Amount` + `Savings account` 
                  + Installmant_rate + `Other debtors / guarantors` + 
                    `Other installment plans` +`No of credits` + Telephone + 
                    `Foreign Worker`, data = train, family = "binomial")

forward <- step(null.model, scope = list(lower=null.model, upper=full.model), 
               k = 2, direction="forward")
#forward$anova

backward <- step(full.model, scope = list(lower=null.model, upper=full.model), 
                 k = 2, direction="backward")
#backward$anova

both <- step(null.model, scope = list(lower=null.model, upper=full.model), 
                 k = 2, direction="both")
#both$anova
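
As a quick side-by-side check (a sketch, assuming all three searches finish), the models chosen by the three search directions can be compared on AIC:

# Compare the forward, backward, and stepwise selections; lower AIC is better
AIC(forward, backward, both)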

Model Accuracy

# AIC and BIC of the selected model
AIC(both)
## [1] 783.5598
BIC(both)
## [1] 928.7827
# Prediction on the training data
train_prob <- predict(both, newdata = train, type = "response")
train$pred <- ifelse(train_prob > 0.5, 2, 1)
mean(train$pred == train$Default)
## [1] 0.78875
#ROC Curve
roc.plot(train$Default == 2, train_prob)
roc.plot(train$Default == 2, train_prob)$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.8316896 5.738626e-51          NA
# Prediction on the test data
test_prob <- predict(both, newdata = test, type = "response")
test$pred <- ifelse(test_prob > 0.5, 2 ,1)
mean(test$pred == test$Default)
## [1] 0.72
# ROC curve on the test data
roc.plot(test$Default == 2, test_prob)

roc.plot(test$Default == 2, test_prob)$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.7543574 1.438592e-08          NA
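
For symmetry with the LASSO results below, the stepwise model's test-set confusion matrix at the 0.5 cutoff can be tabulated the same way (a sketch, output not shown):

# Confusion matrix of the stepwise logistic model on the test data
table(pred = test$pred, true = test$Default)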

LASSO on Logistic Regression

# Scale the numeric variables and code the categorical variables as integer level codes
cc_data_scale <- cc_data
cc_data_scale[num_var] <- scale(cc_data_scale[num_var])
cc_data_scale[factor_var] <- sapply(cc_data[factor_var], as.numeric)

# Predictor matrix (first 20 columns) and Default response for glmnet
X.train <- as.matrix(cc_data_scale)[index, 1:20]
X.test  <- as.matrix(cc_data_scale)[-index, 1:20]
Y.train <- cc_data_scale$Default[index]
Y.test  <- cc_data_scale$Default[-index]


# Fit the LASSO path (alpha = 1) for the binomial model
lasso_fit <- glmnet(x = X.train, y = Y.train, alpha = 1, family = "binomial")
plot(lasso_fit)

plot(lasso_fit, xvar = "lambda", label=TRUE)

Cross-Validation on LASSO Regression

# 10-fold cross-validation (the glmnet default) over the lambda path
cv.out <- cv.glmnet(x = X.train, y = Y.train, alpha = 1, 
                    family = "binomial", type.measure = "mse")

plot(cv.out)

# lambda that minimizes the cross-validated error
lambda_min <- cv.out$lambda.min
# largest lambda within one standard error of the minimum (a more parsimonious fit)
lambda_1se <- cv.out$lambda.1se
# regression coefficients at lambda.1se
coef(cv.out, s = lambda_1se)
## 21 x 1 sparse Matrix of class "dgCMatrix"
##                                       1
## (Intercept)                 2.158415846
## Existing_checking_account  -0.492390009
## Duration                    0.286797087
## Credit_history             -0.284699490
## Purpose                     .          
## Credit Amount               0.083046795
## Savings account            -0.142847400
## Present employment since   -0.109053159
## Installmant_rate            0.086089401
## Personal status and Tax    -0.020346626
## Other debtors / guarantors -0.076803448
## Present_residence           .          
## Property                    0.115861917
## Age                        -0.003675815
## Other installment plans    -0.060858514
## Housing                     .          
## No of credits               .          
## Job                         .          
## Liable people               .          
## Telephone                  -0.101978380
## Foreign Worker             -0.017000915
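
As a convenience, the predictors LASSO keeps (nonzero coefficients at lambda.1se) can also be extracted programmatically; a minimal sketch:

# Keep only the coefficients that were not shrunk to zero
lasso_coef <- coef(cv.out, s = lambda_1se)
lasso_coef[lasso_coef[, 1] != 0, , drop = FALSE]
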
# Prediction on the training data
lasso_prob1 <- predict(cv.out, newx = X.train, s = lambda_1se, type = "response")
train$pred <- ifelse(lasso_prob1 > 0.5, 2, 1)
table(pred = train$pred, true = Y.train)
##     true
## pred   1   2
##    1 510 144
##    2  45 101
mean(train$pred == Y.train)
## [1] 0.76375
#ROC Curve on LASSO for training Data 
roc.plot(train$Default == 2, lasso_prob1)
roc.plot(train$Default == 2, lasso_prob1)$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.8025666 9.311367e-43          NA
# Prediction on the test data
lasso_prob <- predict(cv.out, newx = X.test, s = lambda_1se, type = "response")
test$pred <- ifelse(lasso_prob > 0.5, 2, 1)
table(pred = test$pred, true = Y.test)
##     true
## pred   1   2
##    1 127  39
##    2  18  16
mean(test$pred == Y.test)
## [1] 0.715
#ROC Curve on LASSO for test Data 
roc.plot(test$Default == 2, lasso_prob)

roc.plot(test$Default == 2, lasso_prob)$roc.vol
##      Model      Area      p.value binorm.area
## 1 Model  1 0.7458307 4.099407e-08          NA
# Overlay the test-set ROC curves of the stepwise and LASSO models
roc.plot(x = test$Default == "2", pred = cbind(test_prob, lasso_prob), 
         legend = TRUE, leg.text = c("Variable Selection", "LASSO Regression"))$roc.vol

##      Model      Area      p.value binorm.area
## 1 Model  1 0.7543574 1.438592e-08          NA
## 2 Model  2 0.7458307 4.099407e-08          NA

Calculating the Misclassification Cost on the Training Data

# define the search grid of probability cutoffs from 0.01 to 0.99
searchgrid = seq(0.01, 0.99, 0.01)
result = cbind(searchgrid, NA)

# in the cost function, both r and pi are vectors: r = truth, pi = predicted probability
cost1 <- function(r, pi) 
{
  weight1 = 5   # cost of misclassifying a default (2) as non-default
  weight0 = 1   # cost of misclassifying a non-default (1) as default
  c1 = (r == 2) & (pi < pcut)  # logical vector - true if actual 2 but predicted 1
  c0 = (r == 1) & (pi > pcut)  # logical vector - true if actual 1 but predicted 2
  return(mean(weight1 * c1 + weight0 * c0))
}

credit.glm1 <- both
for (i in 1:length(searchgrid)) 
{
  pcut <- result[i, 1]   # cost1 reads this cutoff from the global environment
  # assign the cost to the 2nd column
  result[i, 2] <- cost1(train$Default, predict(credit.glm1, type = "response"))
}
plot(result, ylab = "Cost in Training Set")

result[which(result[,2] == min(result[,2])),]
## searchgrid            
##    0.14000    0.46875
# we get the minimum cost at a probability cutoff of 0.14
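
As a hedged follow-up (output not shown here), the 0.14 cutoff can be applied back to the training predictions to inspect the resulting confusion matrix:

# Re-classify the training data at the cost-minimizing cutoff of 0.14
train_pred_cost <- ifelse(predict(credit.glm1, type = "response") > 0.14, 2, 1)
table(pred = train_pred_cost, true = train$Default)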

Calculating the Misclassification Cost on the Test Data

# define the search grid of probability cutoffs from 0.01 to 0.99
searchgrid = seq(0.01, 0.99, 0.01)
result = cbind(searchgrid, NA)

# same cost function as above: r = truth, pi = predicted probability
cost1 <- function(r, pi) 
{
  weight1 = 5   # cost of misclassifying a default (2) as non-default
  weight0 = 1   # cost of misclassifying a non-default (1) as default
  c1 = (r == 2) & (pi < pcut)  # logical vector - true if actual 2 but predicted 1
  c0 = (r == 1) & (pi > pcut)  # logical vector - true if actual 1 but predicted 2
  return(mean(weight1 * c1 + weight0 * c0))
}

credit.glm1 <- both
for (i in 1:length(searchgrid)) 
{
  pcut <- result[i, 1]   # cost1 reads this cutoff from the global environment
  # assign the cost to the 2nd column
  result[i, 2] <- cost1(test$Default, predict(credit.glm1, newdata = test, 
                                              type = "response"))
}
plot(result, ylab = "Cost in Test Set")

result[which(result[,2] == min(result[,2])),]
## searchgrid            
##      0.170      0.455
# we get the minimum cost at a probability cutoff of 0.17
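
The same check can be sketched for the test set with the 0.17 cutoff found above (output not shown):

# Re-classify the test data at the cost-minimizing cutoff of 0.17
test_pred_cost <- ifelse(predict(credit.glm1, newdata = test, type = "response") > 0.17, 2, 1)
table(pred = test_pred_cost, true = test$Default)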