Logistic Regression and CART on German Credit Data

In this project, I have ventured into comparison between classification tree and logistic regression, we use performance metrics such as AUC score and Asymmeteric misclassification rate to compare two models. I have also compared various model selection procedures using aic score, bic score and lasso variable selection.

Libraries Used

library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(dplyr)
library(glmnet)
library(boot)
library(verification)
library(rpart)
library(rpart.plot)

Importing Data

german_credit = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")

colnames(german_credit) = c("chk_acct", "duration", "credit_his", "purpose", 
                            "amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor", 
                            "present_resid", "property", "age", "other_install", "housing", "n_credits", 
                            "job", "n_people", "telephone", "foreign", "response")

german_credit$response = german_credit$response - 1
table(german_credit$response)

Train/Test split

set.seed(12420424)
in.train <- createDataPartition(as.factor(german_credit$response), p=0.8, list=FALSE)
german_credit.train <- german_credit[in.train,]
german_credit.test <- german_credit[-in.train,]
credit.glm0 <- glm(response ~ ., family = binomial, german_credit.train)

Stepwise variable selection using AIC

credit.glm.step <- step(credit.glm0, direction = "backward")

Chi-square test for significance of variables

summary(credit.glm.step)
drop1(credit.glm0, test ="Chi")

stepwise variable selection using BIC

credit.glm.step.bic <- step(credit.glm0, k = log(nrow(german_credit.train)))
summary(credit.glm.step.bic)

Lasso variable selection

german_credit.train$response <- as.numeric(german_credit.train$response)
summary(credit.glm.step.bic)
factor_var <- c(1,3,4,6,7,9,10,12,14,15,17,19,20,21)
num_var <- c(2,5,8,11,13,16,18)
train2 <- german_credit.train
train2[num_var] <- scale(train2[num_var])
train2[factor_var] <- sapply(train2[factor_var] , as.numeric)
summary(train2)
X.train <- as.matrix(train2[,1:20])
Y.train <- as.matrix(train2[,21])


lasso.fit<- glmnet(x=X.train, y=Y.train, family = "gaussian", alpha = 1)
plot(lasso.fit)

cv.lasso<- cv.glmnet(x=X.train, y=Y.train, family = "gaussian", alpha = 1, nfolds = 10)
plot(cv.lasso)

cv.lasso$lambda.1se
coef(lasso.fit, s=cv.lasso$lambda.1se)

Final model for GLM

credit.glm.final <- glm(response ~ chk_acct + duration +
                          credit_his + amount +
                          saving_acct  +
                          other_install + installment_rate,
                        family = binomial, german_credit.train)

In-sample misclassification rate

prob.glm1.insample <- predict(credit.glm.final, type = "response")
predicted.glm1.insample <- prob.glm1.insample > 0.1667
predicted.glm1.insample <- as.numeric(predicted.glm1.insample)
mean(ifelse(german_credit.train$response != predicted.glm1.insample, 1, 0))

In-sample AUC score

table(german_credit.train$response, predicted.glm1.insample, dnn = c("Truth", "Predicted"))
roc.plot(german_credit.train$response == "1", prob.glm1.insample)
roc.plot(german_credit.train$response == "1", prob.glm1.insample)$roc.vol$Area

Out of sample misclassification rate and AUC score

prob.glm1.outsample <- predict(credit.glm.final, german_credit.test, type = "response")
predicted.glm1.outsample <- prob.glm1.outsample > 0.1667
predicted.glm1.outsample <- as.numeric(predicted.glm1.outsample)
table(german_credit.test$response, predicted.glm1.outsample, dnn = c("Truth", "Predicted"))
mean(ifelse(german_credit.test$response != predicted.glm1.outsample, 1, 0))
roc.plot(german_credit.test$response == "1", prob.glm1.outsample)
roc.plot(german_credit.test$response == "1", prob.glm1.outsample)$roc.vol$Area

Asymmetric misclassification rate giving more penalty for false positives

cost1 <- function(r, pi) {
  weight1 = 5
  weight0 = 1
  c1 = (r == 1) & (pi < 0.17)  #logical vector - true if actual 1 but predict 0
  c0 = (r == 0) & (pi > 0.17)  #logical vecotr - true if actual 0 but predict 1
  return(mean(weight1 * c1 + weight0 * c0))
}

cost1(german_credit.test$response,predicted.glm1.outsample)

Default tree model with cp = 0.01 and asymetric misclassification rate

credit.rpart <- rpart(formula = response ~ ., data = german_credit.train, method = "class", 
                      parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
rpart.plot(credit.rpart, type = 1, fallen.leaves = FALSE, extra = 4)

Pruning tree by training a large tree and choosing an apt cp, using 10 fold CV error

tree.large <- rpart(formula = response ~ ., data = german_credit.train, method = "class", 
                    cp=0.001)
rpart.plot(tree.large, type = 1, fallen.leaves = FALSE, extra = 4)

plotcp(tree.large)

printcp(tree.large)

Final tree model after pruning

credit.rpart.final <- rpart(formula = response ~ ., cp=0.0166667,
                            data = german_credit.train, 
                            method = "class", 
                            parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
rpart.plot(credit.rpart.final, type = 1, fallen.leaves = FALSE, extra = 4)

Tree - insample performance

credit.test.pred.tree1 = predict(credit.rpart.final, type = "prob")
roc.plot(german_credit.train$response == "1", credit.test.pred.tree1[,2])
roc.plot(german_credit.train$response == "1", credit.test.pred.tree1[,2])$roc.vol

Tree out of sample performance

credit.test.prob.rpart2 = predict(credit.rpart.final, german_credit.test, type = "prob")
roc.plot(german_credit.test$response == "1", credit.test.prob.rpart2[,2])
roc.plot(german_credit.test$response == "1", credit.test.prob.rpart2[,2])$roc.vol

Hence we compare and conclude logistic model is better in terms of AUC score and asymetric mislassification

mean(ifelse(german_credit.train$response != credit.test.pred.tree1, 1, 0))
b <- predict(credit.rpart.final, german_credit.test, type = "class")
cost1(german_credit.test$response,as.numeric(b))
cost1(german_credit.test$response,predicted.glm1.outsample)

Logistic Regression and CART on German Credit Data

Akul Mahajan

3/3/2018