# Setting Working Directory
setwd("G:\\IIMK DABS\\Session Files\\21-08-22 (Decision Tree- Classification)")
getwd()
## [1] "G:/IIMK DABS/Session Files/21-08-22 (Decision Tree- Classification)"
# Reading data and selecting relevant variables for evaluation
data <- read.csv("bank-full.csv", sep = ";", stringsAsFactors = T, header = T)
data1 <- subset(data, select = -c(job, month, day))
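The dependent variable y in this data is heavily imbalanced, which matters later in the analysis. A quick check of the class distribution (a minimal sketch using base R only):

# Inspect the class balance of the dependent variable
table(data1$y)               # raw counts of 'no' vs 'yes'
prop.table(table(data1$y))   # proportions: 'yes' is a small minority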
# Loading required libraries
library(caret)
library(pROC)
library(rpart.plot)
library(ROSE)
# Using caret package to split the data into training and test sets
set.seed(1234)
index <- createDataPartition(data1$y, p = 0.80, list = FALSE)
train <- data1[index, ]
test <- data1[-index, ]
# Using caret's repeated cross-validation for more stable performance estimates
set.seed(1234)
cv <- trainControl(method = "repeatedcv", number = 10, repeats = 5, allowParallel = TRUE)

# Create training model
model1 <- train(y ~ ., data = train, method = "rpart", trControl = cv, tuneLength = 10)
model1
## CART 
## 
## 36170 samples
##    13 predictor
##     2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 32553, 32554, 32553, 32553, 32553, 32553, ... 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.001299622  0.9015760  0.4247119
##   0.001339004  0.9016976  0.4232326
##   0.001417769  0.9017695  0.4215040
##   0.001606805  0.9017087  0.4155153
##   0.001654064  0.9016644  0.4139209
##   0.002126654  0.9016534  0.3937838
##   0.005080340  0.9007299  0.3934127
##   0.018667297  0.8991982  0.3888559
##   0.025992439  0.8963450  0.3778361
##   0.036940769  0.8887255  0.2014977
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.001417769.
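To see how the resampled accuracy varied across the candidate cp values, the tuning profile of the caret train object can be plotted directly (a minimal sketch; plot() on a train object is standard caret behaviour):

# Plot resampled accuracy against the complexity parameter cp
plot(model1)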
# Using library rpart.plot for visualizing the decision tree

rpart.plot(model1$finalModel, cex = 0.7)
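It can also be useful to see which predictors drive the splits; caret's varImp() works directly on the fitted train object (a sketch, not part of the original workflow):

# Rank predictors by their importance in the final tree
varImp(model1)
plot(varImp(model1))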

# Predict on the test data and create a confusion matrix to check accuracy and sensitivity

predicted1 <- predict(model1, newdata = test, type = "raw")
confusionMatrix(predicted1, test$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7761  666
##        yes  223  391
##                                           
##                Accuracy : 0.9017          
##                  95% CI : (0.8953, 0.9077)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 9.924e-09       
##                                           
##                   Kappa : 0.418           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.36991         
##             Specificity : 0.97207         
##          Pos Pred Value : 0.63681         
##          Neg Pred Value : 0.92097         
##              Prevalence : 0.11691         
##          Detection Rate : 0.04325         
##    Detection Prevalence : 0.06791         
##       Balanced Accuracy : 0.67099         
##                                           
##        'Positive' Class : yes             
## 
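For reference, the sensitivity and specificity above can be recovered by hand from the confusion-matrix counts (TP = 391, FN = 666, TN = 7761, FP = 223):

# Sensitivity = TP / (TP + FN), specificity = TN / (TN + FP)
391 / (391 + 666)    # 0.36991, matches the reported sensitivity
7761 / (7761 + 223)  # 0.97207, matches the reported specificity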
# Using the pROC package to create the ROC curve and check the AUC; predicted probabilities are needed for this
predicted11 <- predict(model1, newdata = test, type = "prob")

roc(test$y, predicted11$yes, plot=TRUE, 
    legacy.axes=TRUE, percent=T, xlab="False Positive Percentage", ylab="True Positive Percentage", 
    col="black", lwd=1, print.auc = TRUE, print.auc.x=50, print.auc.y= 70)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = test$y, predictor = predicted11$yes, percent = T,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Positive Percentage", col = "black", lwd = 1,     print.auc = TRUE, print.auc.x = 50, print.auc.y = 70)
## 
## Data: predicted11$yes in 7984 controls (test$y no) < 1057 cases (test$y yes).
## Area under the curve: 84.95%
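Storing the roc object also allows querying the curve; for example, pROC's coords() can report the threshold that maximises Youden's J (a sketch; "best" and best.method = "youden" are standard pROC arguments, and roc1 is just an illustrative name):

# Store the ROC object and extract the Youden-optimal operating point
roc1 <- roc(test$y, predicted11$yes)
coords(roc1, x = "best", best.method = "youden",
       ret = c("threshold", "sensitivity", "specificity"))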

The sensitivity value is very low. Sensitivity is the true positive rate, and it is low here because the data is imbalanced: the proportions of the event class ('yes') and the non-event class ('no') in the dependent variable are far from equal. To correct this, the training data needs to be balanced, which can be done with the ROSE package by creating over-sampled, under-sampled, and 'both'-sampled versions of the data.

# Using ROSE package to create over-sampled, under-sampled and 'both' data
set.seed(1234)
# Over-sampling: N = 63876 is twice the majority-class ('no') count in train
over <- ovun.sample(y ~ ., data = train, method = "over", N = 63876)$data

set.seed(1234)
# Under-sampling: N = 8464 is twice the minority-class ('yes') count in train
under <- ovun.sample(y ~ ., data = train, method = "under", N = 8464)$data

set.seed(1234)
# 'Both': resample to the original training size (36170) with a 50/50 target balance
both <- ovun.sample(y ~ ., data = train, method = "both", N = 36170, p = 0.5)$data
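A quick sanity check that each balanced data set has the intended class mix (over and under should be exactly balanced; 'both' is only approximately 50/50):

# Verify class counts after each balancing method
table(over$y)    # expected: 31938 'no' and 31938 'yes'
table(under$y)   # expected: 4232 'no' and 4232 'yes'
table(both$y)    # roughly even split of 36170 rows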
# Over-sampled data: prediction and ROC curve

model2 <- train(y ~ ., data = over, method = "rpart", trControl = cv, tuneLength = 10)
predicted2 <- predict(model2, newdata = test, type = "raw")
confusionMatrix(predicted2, test$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6447  190
##        yes 1537  867
##                                          
##                Accuracy : 0.809          
##                  95% CI : (0.8007, 0.817)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.4043         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8202         
##             Specificity : 0.8075         
##          Pos Pred Value : 0.3606         
##          Neg Pred Value : 0.9714         
##              Prevalence : 0.1169         
##          Detection Rate : 0.0959         
##    Detection Prevalence : 0.2659         
##       Balanced Accuracy : 0.8139         
##                                          
##        'Positive' Class : yes            
## 
# Sensitivity : 0.8202 
predicted22 <- predict(model2, newdata = test, type = "prob")
par(pty="s")
roc(test$y, predicted22$yes, plot=TRUE, 
    legacy.axes=TRUE, percent=T, xlab="False Positive Percentage", ylab="True Positive Percentage", 
    col="red", lwd=1, print.auc = TRUE, print.auc.x=50, print.auc.y= 70)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = test$y, predictor = predicted22$yes, percent = T,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Positive Percentage", col = "red", lwd = 1, print.auc = TRUE,     print.auc.x = 50, print.auc.y = 70)
## 
## Data: predicted22$yes in 7984 controls (test$y no) < 1057 cases (test$y yes).
## Area under the curve: 87.21%
# Area under the curve: 87.21%
# Under-sampled data: prediction and ROC curve

model3 <- train(y ~ ., data = under, method = "rpart", trControl = cv, tuneLength = 10)
predicted3 <- predict(model3, newdata = test, type = "raw")
confusionMatrix(predicted3, test$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6326  168
##        yes 1658  889
##                                           
##                Accuracy : 0.798           
##                  95% CI : (0.7896, 0.8063)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.393           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.84106         
##             Specificity : 0.79233         
##          Pos Pred Value : 0.34904         
##          Neg Pred Value : 0.97413         
##              Prevalence : 0.11691         
##          Detection Rate : 0.09833         
##    Detection Prevalence : 0.28172         
##       Balanced Accuracy : 0.81670         
##                                           
##        'Positive' Class : yes             
## 
# Sensitivity : 0.84106
predicted33 <- predict(model3, newdata = test, type = "prob")
roc(test$y, predicted33$yes, plot=TRUE, 
    legacy.axes=TRUE, percent=T, xlab="False Positive Percentage", ylab="True Positive Percentage", 
    col="red", lwd=1, print.auc = TRUE, print.auc.x=50, print.auc.y= 70)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = test$y, predictor = predicted33$yes, percent = T,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Positive Percentage", col = "red", lwd = 1, print.auc = TRUE,     print.auc.x = 50, print.auc.y = 70)
## 
## Data: predicted33$yes in 7984 controls (test$y no) < 1057 cases (test$y yes).
## Area under the curve: 87.12%
# Area under the curve: 87.12%
# 'Both'-sampled data: prediction and ROC curve

model4 <- train(y ~ ., data = both, method = "rpart", trControl = cv, tuneLength = 10)
predicted4 <- predict(model4, newdata = test, type = "raw")
confusionMatrix(predicted4, test$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  6362  166
##        yes 1622  891
##                                           
##                Accuracy : 0.8022          
##                  95% CI : (0.7939, 0.8104)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4005          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.84295         
##             Specificity : 0.79684         
##          Pos Pred Value : 0.35456         
##          Neg Pred Value : 0.97457         
##              Prevalence : 0.11691         
##          Detection Rate : 0.09855         
##    Detection Prevalence : 0.27796         
##       Balanced Accuracy : 0.81990         
##                                           
##        'Positive' Class : yes             
## 
# Sensitivity : 0.84295
predicted44 <- predict(model4, newdata = test, type = "prob")
roc(test$y, predicted44$yes, plot=TRUE, 
    legacy.axes=TRUE, percent=T, xlab="False Positive Percentage", ylab="True Positive Percentage", 
    col="red", lwd=1, print.auc = TRUE, print.auc.x=50, print.auc.y= 70)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases

## 
## Call:
## roc.default(response = test$y, predictor = predicted44$yes, percent = T,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Positive Percentage", col = "red", lwd = 1, print.auc = TRUE,     print.auc.x = 50, print.auc.y = 70)
## 
## Data: predicted44$yes in 7984 controls (test$y no) < 1057 cases (test$y yes).
## Area under the curve: 86.76%
# Area under the curve: 86.76%
# Combined ROC of all 3 models

roc(test$y, predicted22$yes, plot=TRUE, 
    legacy.axes=TRUE, percent=T, xlab="False Positive Percentage", ylab="True Positive Percentage", 
    col="red", lwd=1, print.auc = TRUE, print.auc.x=50, print.auc.y= 70)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = test$y, predictor = predicted22$yes, percent = T,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Positive Percentage", col = "red", lwd = 1, print.auc = TRUE,     print.auc.x = 50, print.auc.y = 70)
## 
## Data: predicted22$yes in 7984 controls (test$y no) < 1057 cases (test$y yes).
## Area under the curve: 87.21%
plot.roc(test$y, predicted33$yes, legacy.axes=TRUE, 
         percent=T, col="blue", lwd=1, print.auc = TRUE, print.auc.x=50, print.auc.y= 60, add=TRUE)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot.roc(test$y, predicted44$yes, legacy.axes=TRUE, percent=T,
         col="green", lwd=1, print.auc = TRUE, print.auc.x=50, print.auc.y= 50, add=TRUE)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
legend ("bottomright", bg ="transparent", legend=c("Over", "Under", "Both"), 
        col=c("red", "blue","green"), cex = 0.55, lwd=2)