Libraries Used
library(ggplot2)
library(caret)
library(rpart.plot)
library(rpart)
library(ipred)
library(verification)
library(randomForest)
library(reshape2)
Importing data
german_credit = read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german_credit) = c("chk_acct", "duration", "credit_his", "purpose",
"amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
"present_resid", "property", "age", "other_install", "housing", "n_credits",
"job", "n_people", "telephone", "foreign", "response")
german_credit$response = german_credit$response - 1
Train-Test Split
set.seed(12420424)
in.train <- createDataPartition(as.factor(german_credit$response), p=0.8, list=FALSE)
german_credit.train <- german_credit[in.train,]
german_credit.test <- german_credit[-in.train,]
Pruned - Tree model
credit.rpart.final <- rpart(formula = response ~ ., cp=0.0138889,
data = german_credit.train,
method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
rpart.plot(credit.rpart.final, fallen.leaves = FALSE)
Tree with bagging
credit.bag<- bagging(as.factor(response)~., data = german_credit.train, nbagg=100)
credit.bag.pred.test<- predict(credit.bag, newdata = german_credit.test, type="prob")[,2]
cost1 <- function(r, pi) {
weight1 = 1
weight0 = 1
c1 = (r == 1) & (pi < 0.17) #logical vector - true if actual 1 but predict 0
c0 = (r == 0) & (pi > 0.17) #logical vecotr - true if actual 0 but predict 1
return(mean(weight1 * c1 + weight0 * c0))
}
ntree<- c(1, 3, 5, seq(10, 200, 10))
MSR.train<- rep(0, length(ntree))
for(i in 1:length(ntree)){
boston.bag1<- bagging(response~., data = german_credit.train, nbagg=ntree[i])
boston.bag.pred1<- predict(boston.bag1, newdata = german_credit.train, type ="prob")
boston.bag.class <- as.numeric(boston.bag.pred1 > 0.5)
MSR.train[i]<- mean(ifelse(german_credit.train$response != boston.bag.class,1,0))
}
df <- data.frame(cbind(ntree,MSR.train))
ggplot(df,aes(x=ntree,y=MSR.train)) + geom_line(col = "blue")
Final Tree’s performance with tree parameter = 140
bag.model.final <- bagging(as.factor(response)~., data = german_credit.train, nbagg=140)
credit.test.pred.tree1<- predict(bag.model.final, german_credit.test, type="class")
credit.test.pred.tree1.prob <- predict(bag.model.final, german_credit.test, type="prob")
table(german_credit.test$response, credit.test.pred.tree1, dnn=c("Truth","Predicted"))
roc.plot(german_credit.test$response == "1", credit.test.pred.tree1.prob[,2])
roc.plot(german_credit.test$response == "1", credit.test.pred.tree1.prob[,2])$roc.vol$Area
Random Forest Model
credit.rf<- randomForest(as.factor(response)~., data = german_credit.train)
credit.rf$err.rate[500]
err.rate <- data.frame(credit.rf$err.rate)
seq <- data.frame(seq(1:nrow(err.rate)))
err.rate <- cbind(err.rate,seq)
names(err.rate) <- c("OOB","FPR","FNR","index")
d <- melt(err.rate, id.vars = "index")
ggplot(d,aes(index,value,col=variable)) + geom_line()
Choosing number of parameters for Random Forest
oob.err<- rep(0, 20)
test.err<- rep(0, 20)
for(i in 1:20){
fit<- randomForest(as.factor(response)~., data = german_credit.train, mtry = i)
oob.err[i]<- credit.rf$err.rate[500]
rf.pred.test <- predict(fit,german_credit.test,type="class")
test.err[i]<- mean(ifelse(german_credit.test$response != rf.pred.test, 1, 0))
}
index <- data.frame(1:20)
df <- data.frame(cbind(index,oob.err,test.err))
d <- melt(df, id.vars = "X1.20")
ggplot(d,aes(X1.20,value,col=variable)) + geom_line()
Choosing cut-off threshold as FNR is very high
rf.final <- randomForest(as.factor(response)~., data = german_credit.train, mtry = 6)
rf.final.pred<- predict(rf.final, type = "prob")[,2]
costfunc = function(obs, pred.p, pcut){
weight1 = 5
weight0 = 1
c1 = (obs==1)&(pred.p<pcut)
c0 = (obs==0)&(pred.p>=pcut)
cost = mean(weight1*c1 + weight0*c0)
return(cost)
}
p.seq = seq(0.01, 0.5, 0.01)
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = german_credit.train$response, pred.p = rf.final.pred, pcut = p.seq[i])
}
plot(p.seq,cost)
df <- data.frame(cbind(p.seq,cost))
df[df$cost == min(df$cost),]
Random Forest Performance on training set
rf.final.train.class <- as.numeric(rf.final.pred > 0.17)
table(german_credit.train$response, rf.final.train.class, dnn=c("Truth","Predicted"))
roc.plot(german_credit.train$response == "1", rf.final.pred)
roc.plot(german_credit.train$response == "1", rf.final.pred)$roc.vol$Area
Random Forest Performance on Test set
rf.pred.final.test <- predict(rf.final, german_credit.test,type = "prob")[,2]
rf.final.test.class <- as.numeric(rf.pred.final.test > 0.17)
table(german_credit.test$response, rf.final.test.class, dnn=c("Truth","Predicted"))
roc.plot(german_credit.test$response == "1", rf.pred.final.test)
roc.plot(german_credit.test$response == "1", rf.pred.final.test)$roc.vol$Area