In this project, I have ventured into comparison between classification tree and logistic regression, we use performance metrics such as AUC score and Asymmeteric misclassification rate to compare two models. I have also compared various model selection procedures using aic score, bic score and lasso variable selection.
Libraries Used
library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(caret)
library(dplyr)
library(glmnet)
library(boot)
library(verification)
library(rpart)
library(rpart.plot)
Importing Data
german_credit = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) = c("chk_acct", "duration", "credit_his", "purpose",
"amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
"present_resid", "property", "age", "other_install", "housing", "n_credits",
"job", "n_people", "telephone", "foreign", "response")
german_credit$response = german_credit$response - 1
table(german_credit$response)
Train/Test split
set.seed(12420424)
in.train <- createDataPartition(as.factor(german_credit$response), p=0.8, list=FALSE)
german_credit.train <- german_credit[in.train,]
german_credit.test <- german_credit[-in.train,]
credit.glm0 <- glm(response ~ ., family = binomial, german_credit.train)
Stepwise variable selection using AIC
credit.glm.step <- step(credit.glm0, direction = "backward")
Chi-square test for significance of variables
summary(credit.glm.step)
drop1(credit.glm0, test ="Chi")
stepwise variable selection using BIC
credit.glm.step.bic <- step(credit.glm0, k = log(nrow(german_credit.train)))
summary(credit.glm.step.bic)
Lasso variable selection
german_credit.train$response <- as.numeric(german_credit.train$response)
summary(credit.glm.step.bic)
factor_var <- c(1,3,4,6,7,9,10,12,14,15,17,19,20,21)
num_var <- c(2,5,8,11,13,16,18)
train2 <- german_credit.train
train2[num_var] <- scale(train2[num_var])
train2[factor_var] <- sapply(train2[factor_var] , as.numeric)
summary(train2)
X.train <- as.matrix(train2[,1:20])
Y.train <- as.matrix(train2[,21])
lasso.fit<- glmnet(x=X.train, y=Y.train, family = "gaussian", alpha = 1)
plot(lasso.fit)
cv.lasso<- cv.glmnet(x=X.train, y=Y.train, family = "gaussian", alpha = 1, nfolds = 10)
plot(cv.lasso)
cv.lasso$lambda.1se
coef(lasso.fit, s=cv.lasso$lambda.1se)
Final model for GLM
credit.glm.final <- glm(response ~ chk_acct + duration +
credit_his + amount +
saving_acct +
other_install + installment_rate,
family = binomial, german_credit.train)
In-sample misclassification rate
prob.glm1.insample <- predict(credit.glm.final, type = "response")
predicted.glm1.insample <- prob.glm1.insample > 0.1667
predicted.glm1.insample <- as.numeric(predicted.glm1.insample)
mean(ifelse(german_credit.train$response != predicted.glm1.insample, 1, 0))
In-sample AUC score
table(german_credit.train$response, predicted.glm1.insample, dnn = c("Truth", "Predicted"))
roc.plot(german_credit.train$response == "1", prob.glm1.insample)
roc.plot(german_credit.train$response == "1", prob.glm1.insample)$roc.vol$Area
Out of sample misclassification rate and AUC score
prob.glm1.outsample <- predict(credit.glm.final, german_credit.test, type = "response")
predicted.glm1.outsample <- prob.glm1.outsample > 0.1667
predicted.glm1.outsample <- as.numeric(predicted.glm1.outsample)
table(german_credit.test$response, predicted.glm1.outsample, dnn = c("Truth", "Predicted"))
mean(ifelse(german_credit.test$response != predicted.glm1.outsample, 1, 0))
roc.plot(german_credit.test$response == "1", prob.glm1.outsample)
roc.plot(german_credit.test$response == "1", prob.glm1.outsample)$roc.vol$Area
Asymmetric misclassification rate giving more penalty for false positives
cost1 <- function(r, pi) {
weight1 = 5
weight0 = 1
c1 = (r == 1) & (pi < 0.17) #logical vector - true if actual 1 but predict 0
c0 = (r == 0) & (pi > 0.17) #logical vecotr - true if actual 0 but predict 1
return(mean(weight1 * c1 + weight0 * c0))
}
cost1(german_credit.test$response,predicted.glm1.outsample)
Default tree model with cp = 0.01 and asymetric misclassification rate
credit.rpart <- rpart(formula = response ~ ., data = german_credit.train, method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
rpart.plot(credit.rpart, type = 1, fallen.leaves = FALSE, extra = 4)
Pruning tree by training a large tree and choosing an apt cp, using 10 fold CV error
tree.large <- rpart(formula = response ~ ., data = german_credit.train, method = "class",
cp=0.001)
rpart.plot(tree.large, type = 1, fallen.leaves = FALSE, extra = 4)
plotcp(tree.large)
printcp(tree.large)
Final tree model after pruning
credit.rpart.final <- rpart(formula = response ~ ., cp=0.0166667,
data = german_credit.train,
method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
rpart.plot(credit.rpart.final, type = 1, fallen.leaves = FALSE, extra = 4)
Tree - insample performance
credit.test.pred.tree1 = predict(credit.rpart.final, type = "prob")
roc.plot(german_credit.train$response == "1", credit.test.pred.tree1[,2])
roc.plot(german_credit.train$response == "1", credit.test.pred.tree1[,2])$roc.vol
Tree out of sample performance
credit.test.prob.rpart2 = predict(credit.rpart.final, german_credit.test, type = "prob")
roc.plot(german_credit.test$response == "1", credit.test.prob.rpart2[,2])
roc.plot(german_credit.test$response == "1", credit.test.prob.rpart2[,2])$roc.vol
Hence we compare and conclude logistic model is better in terms of AUC score and asymetric mislassification
mean(ifelse(german_credit.train$response != credit.test.pred.tree1, 1, 0))
b <- predict(credit.rpart.final, german_credit.test, type = "class")
cost1(german_credit.test$response,as.numeric(b))
cost1(german_credit.test$response,predicted.glm1.outsample)