Loading Libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(readr)
library(Hmisc)
library(corrplot)
library(leaps)
library(glmnet)
library(ROCR)
library(verification)
Loading Dataset and Data filtering
cc_data <- read_csv("German_Credit_Card_Data.csv")
factor_var <- c(1,3,4,6,7,9,10,12,14,15,17,19,20,21)
num_var <- c(2,5,8,11,13,16,18)
cc_data[,factor_var] <- lapply(cc_data[,factor_var] , factor)
cc_data[num_var] <- sapply(cc_data[num_var] , as.numeric)
summary(cc_data)
Building Logistic Regression Model
log.fit <- glm(Default ~ ., data = train, family = "binomial")
AIC(log.fit)
## [1] 791.1765
log.fit2 <- glm(Default ~ Existing_checking_account + Duration + Credit_history +
Purpose + `Credit Amount` + `Savings account` + Installmant_rate +
`Other debtors / guarantors` + `Other installment plans` +
`No of credits` + Telephone + `Foreign Worker`, data = train,
family = "binomial")
AIC(log.fit2)
## [1] 783.5598
anova(log.fit, log.fit2)
## Analysis of Deviance Table
##
## Model 1: Default ~ Existing_checking_account + Duration + Credit_history +
## Purpose + `Credit Amount` + `Savings account` + `Present employment since` +
## Installmant_rate + `Personal status and Tax` + `Other debtors / guarantors` +
## Present_residence + Property + Age + `Other installment plans` +
## Housing + `No of credits` + Job + `Liable people` + Telephone +
## `Foreign Worker`
## Model 2: Default ~ Existing_checking_account + Duration + Credit_history +
## Purpose + `Credit Amount` + `Savings account` + Installmant_rate +
## `Other debtors / guarantors` + `Other installment plans` +
## `No of credits` + Telephone + `Foreign Worker`
## Resid. Df Resid. Dev Df Deviance
## 1 751 693.18
## 2 769 721.56 -18 -28.383
Stepwise Model Selection
null.model <- glm(Default ~ 1, data = train, family = "binomial")
full.model <- glm(Default ~ Existing_checking_account + Duration +
Credit_history + Purpose + `Credit Amount` + `Savings account`
+ Installmant_rate + `Other debtors / guarantors` +
`Other installment plans` +`No of credits` + Telephone +
`Foreign Worker`, data = train, family = "binomial")
foward <- step(null.model, scope = list(lower=null.model, upper=full.model),
k = 2,direction="forward")
#foward$anova
backward <- step(full.model, scope = list(lower=null.model, upper=full.model),
k = 2, direction="backward")
#backward$anova
both <- step(null.model, scope = list(lower=null.model, upper=full.model),
k = 2, direction="both")
#both$anova
Model Accuray
##AIC and BIC
AIC(both)
## [1] 783.5598
BIC(both)
## [1] 928.7827
#prediction on train data
train_prob <- predict(both, data = train, type = "response")
train$pred <- ifelse(train_prob > 0.5, 2 ,1)
mean(train$pred == train$Default)
## [1] 0.78875
#ROC Curve
roc.plot(train$Default == 2, train_prob)
roc.plot(train$Default == 2, train_prob)$roc.vol

## Model Area p.value binorm.area
## 1 Model 1 0.8316896 5.738626e-51 NA
##Prediction of test data
test_prob <- predict(both, newdata = test, type = "response")
test$pred <- ifelse(test_prob > 0.5, 2 ,1)
mean(test$pred == test$Default)
## [1] 0.72
##ROC Curve
roc.plot(test$Default == 2, test_prob)

roc.plot(test$Default == 2, test_prob)$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.7543574 1.438592e-08 NA
LASSO on logistic Regression
cc_data_scale <- cc_data
cc_data_scale[num_var] <- scale(cc_data_scale[num_var])
cc_data_scale[factor_var] <- sapply(cc_data[factor_var] , as.numeric)
X.train<- as.matrix(cc_data_scale)[index,1:20]
X.test<- as.matrix(cc_data_scale)[-index,1:20]
Y.train <- cc_data_scale$Default[index]
Y.test <- cc_data_scale$Default[-index]
lasso_fit <- glmnet(x = X.train ,y = Y.train ,alpha = 1 , family = "binomial")
plot(lasso_fit)

plot(lasso_fit, xvar = "lambda", label=TRUE)

Cross Validation on LASSO Regression
cv.out <- cv.glmnet(x = X.train ,y = Y.train ,alpha = 1 ,
family = "binomial" ,type.measure = "mse")
plot(cv.out, xvar = "lambda", label=TRUE)

#min value of lambda
lambda_min <- cv.out$lambda.min
#best value of lambda
lambda_1se <- cv.out$lambda.1se
#regression coefficients
coef(cv.out,s=lambda_1se)
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) 2.158415846
## Existing_checking_account -0.492390009
## Duration 0.286797087
## Credit_history -0.284699490
## Purpose .
## Credit Amount 0.083046795
## Savings account -0.142847400
## Present employment since -0.109053159
## Installmant_rate 0.086089401
## Personal status and Tax -0.020346626
## Other debtors / guarantors -0.076803448
## Present_residence .
## Property 0.115861917
## Age -0.003675815
## Other installment plans -0.060858514
## Housing .
## No of credits .
## Job .
## Liable people .
## Telephone -0.101978380
## Foreign Worker -0.017000915
##Prediction on Train
lasso_prob1 <- predict(cv.out, newx = X.train, s = lambda_1se, type = "response")
train$pred <- ifelse(lasso_prob1 >0.5 , 2, 1 )
table(pred = train$pred , true = Y.train)
## true
## pred 1 2
## 1 510 144
## 2 45 101
mean(train$pred == Y.train)
## [1] 0.76375
#ROC Curve on LASSO for training Data
roc.plot(train$Default == 2, lasso_prob1)
roc.plot(train$Default == 2, lasso_prob1)$roc.vol

## Model Area p.value binorm.area
## 1 Model 1 0.8025666 9.311367e-43 NA
##Prediction on Test
lasso_prob <- predict(cv.out, newx = X.test, s = lambda_1se, type = "response")
test$pred <- ifelse(lasso_prob >0.5 , 2, 1 )
table(pred = test$pred , true = Y.test)
## true
## pred 1 2
## 1 127 39
## 2 18 16
mean(test$pred == Y.test)
## [1] 0.715
#ROC Curve on LASSO for test Data
roc.plot(test$Default == 2, lasso_prob)

roc.plot(test$Default == 2, lasso_prob)$roc.vol
## Model Area p.value binorm.area
## 1 Model 1 0.7458307 4.099407e-08 NA
roc.plot(x = test$Default == "2", pred = cbind(test_prob, lasso_prob), legend = TRUE, leg.text = c("Variable Selection", "LASSO Regression"))$roc.vol

## Model Area p.value binorm.area
## 1 Model 1 0.7543574 1.438592e-08 NA
## 2 Model 2 0.7458307 4.099407e-08 NA
Calculating Misclassification rate on training Data
# define the searc grid from 0.01 to 0.99
searchgrid = seq(0.01, 0.99, 0.01)
result = cbind(searchgrid, NA)
# in the cost function, both r and pi are vectors, r=truth, pi=predicted probability
cost1 <- function(r, pi)
{
weight1 = 5
weight0 = 1
c1 = (r == 2) & (pi < pcut) #logical vector - true if actual 1 but predict 0
c0 = (r == 1) & (pi > pcut) #logical vecotr - true if actual 0 but predict 1
return(mean(weight1 * c1 + weight0 * c0))
}
credit.glm1 <- both
for (i in 1:length(searchgrid))
{
pcut <- result[i, 1]
# assign the cost to the 2nd col
result[i, 2] <- cost1(train$Default, predict(credit.glm1, type = "response"))
}
plot(result, ylab = "Cost in Training Set")

result[which(result[,2] == min(result[,2])),]
## searchgrid
## 0.14000 0.46875
#we get min cost when probablity cutoff = 0.14
Calculating Misclassification rate on test Data
# define the searc grid from 0.01 to 0.99
searchgrid = seq(0.01, 0.99, 0.01)
result = cbind(searchgrid, NA)
# in the cost function, both r and pi are vectors, r=truth, pi=predicted probability
cost1 <- function(r, pi)
{
weight1 = 5
weight0 = 1
c1 = (r == 2) & (pi < pcut) #logical vector - true if actual 1 but predict 0
c0 = (r == 1) & (pi > pcut) #logical vecotr - true if actual 0 but predict 1
return(mean(weight1 * c1 + weight0 * c0))
}
credit.glm1 <- both
for (i in 1:length(searchgrid))
{
pcut <- result[i, 1]
# assign the cost to the 2nd col
result[i, 2] <- cost1(test$Default, predict(credit.glm1, newdata = test,
type = "response"))
}
plot(result, ylab = "Cost in Test Set")

result[which(result[,2] == min(result[,2])),]
## searchgrid
## 0.170 0.455
#we get min cost when probablity cutoff = 0.17