Libraries Used
library(caret)
library(gbm)
library(verification)
Importing Data
german_credit = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) = c("chk_acct", "duration", "credit_his", "purpose",
"amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
"present_resid", "property", "age", "other_install", "housing", "n_credits",
"job", "n_people", "telephone", "foreign", "response")
german_credit$response = german_credit$response - 1
Train/Test Split
set.seed(12420424)
in.train <- createDataPartition(as.factor(german_credit$response), p=0.8, list=FALSE)
german_credit.train <- german_credit[in.train,]
german_credit.test <- german_credit[-in.train,]
german_credit$response <- as.factor(german_credit$response)
credit.boost <- gbm(response ~ . , data = german_credit.train, interaction.depth = 20, n.trees = 50, shrinkage = 0.01)
pred.credit.boost <- predict(credit.boost, newdata = german_credit.test, type = "response",n.trees=50)
roc.plot(german_credit.test$response == "1", pred.credit.boost)
roc.plot(german_credit.test$response == "1", pred.credit.boost)$roc.vol$Area
Hyperparameter Tuning for Gradient Boosting
#df <- data.frame(matrix(0L, nrow = 200000, ncol = 4))
#names(df) <- c("ntrees","intdep","shirnk","error")
#ntrees <- 1:100
#intdep <- 1:20
#shrink <- seq(0.01,1,0.01)
#x <- 1
#for(i in 1:length(ntrees)){
#for(j in 1:length(intdep)){
#for(k in 1:length(shrink)){
#df[x,1] <- ntrees[i]
#df[x,2] <- intdep[j]
#df[x,3] <- shrink[k]
#boost.tree <- gbm(response ~ . , data = german_credit.train, distribution = "bernoulli", interaction.depth = intdep[j], n.trees = ntrees[i], shrinkage = shrink[k])
#pred <- predict(boost.tree, newdata = german_credit.test, type = "response",ntrees[i])
#pred.tree <- as.numeric(pred > 0.17)
#df[x,4] <- mean(ifelse(german_credit.test!=pred.tree,1,0))
#x <- x + 1
#}
#}
#}
Running the above lines of code gives us the parameters with least misclassification error, we then compare them with area under ROC curve and choose the one with maximizes the area.
#df[df$error == min(df$error),]
boost.tree <- gbm(response ~ . , data = german_credit.train, distribution = "bernoulli", interaction.depth = 4, n.trees = 52, shrinkage = 0.02)
pred.train <- predict(boost.tree, newdata = german_credit.test, type = "response",n.trees = 52)
roc.plot(german_credit.test$response == "1", pred.train)$roc.vol$Area
boost.tree <- gbm(response ~ . , data = german_credit.train, distribution = "bernoulli", interaction.depth = 6, n.trees = 3, shrinkage = 0.32)
pred.train <- predict(boost.tree, newdata = german_credit.test, type = "response",n.trees = 3)
roc.plot(german_credit.test$response == "1", pred.train)$roc.vol$Area
boost.tree <- gbm(response ~ . , data = german_credit.train, distribution = "bernoulli", interaction.depth = 12, n.trees = 4, shrinkage = 0.18)
pred.train <- predict(boost.tree, newdata = german_credit.test, type = "response",n.trees = 4)
roc.plot(german_credit.test$response == "1", pred.train)$roc.vol$Area
boost.tree <- gbm(response ~ . , data = german_credit.train, distribution = "bernoulli", interaction.depth = 18, n.trees = 6, shrinkage = 0.11)
pred.train <- predict(boost.tree, newdata = german_credit.test, type = "response",n.trees = 6)
roc.plot(german_credit.test$response == "1", pred.train)$roc.vol$Area
boost.tree <- gbm(response ~ . , data = german_credit.train, distribution = "bernoulli", interaction.depth = 20, n.trees = 16, shrinkage = 0.04)
pred.train <- predict(boost.tree, newdata = german_credit.test, type = "response",n.trees = 16)
roc.plot(german_credit.test$response == "1", pred.train)$roc.vol$Area