Assignment 2

Author

Cameron Gray

library(tidyverse)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)
library(xgboost)
library(corrplot)

Data Import

The data imported here is the dataset that was preprocessed in Assignment 1.

# reads in the data
data <- read.csv("preprocessed_bank.csv")
print(head(data))
  age          job marital education default balance housing loan duration
1  55   management married  tertiary      no    2000     yes   no      260
2  40   technician  single secondary      no       0     yes   no      150
3  30 entrepreneur married secondary      no       0     yes  yes       70
4  45  blue-collar married   unknown      no    1000     yes   no       90
5  30      unknown  single   unknown      no       0      no   no      190
6  35   management married  tertiary      no       0     yes   no      130
  campaign pdays previous  y days_since
1        1    -1        0 no        240
2        1    -1        0 no        240
3        1    -1        0 no        240
4        1    -1        0 no        240
5        1    -1        0 no        240
6        1    -1        0 no        240
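
Note that y appears to be read in as a character column, which is why the confusion matrix calls later in this assignment wrap it in as.factor(). A minimal alternative, not applied in the experiments below, is to convert it once up front:

#optional: convert the target to a factor once, making later as.factor() calls unnecessary
data$y <- as.factor(data$y)
str(data$y)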

Training Data

With a seed set, the training and test data are sampled at random and reused for the duration of the assignment, providing a fair testing environment for each model.

#setting the seed for all experiments
set.seed(314)

#creating a random sample subset of indices
samp <- sample(nrow(data), round(nrow(data)*0.8), replace = FALSE)

#setting the training and testing data from the sample indices
data_train <- data[samp,]
data_test <- data[-samp,]
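
As an aside, caret also offers a stratified split through createDataPartition, which preserves the no/yes class ratio in both subsets. A minimal sketch (the plain random split above is what the experiments actually use):

#sketch: stratified 80/20 split that keeps the class balance of y in both subsets
idx <- createDataPartition(data$y, p = 0.8, list = FALSE)
strat_train <- data[idx,]
strat_test <- data[-idx,]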

Decision Trees

Experiment 1

The first decision tree experiment uses the default settings of the rpart function; although not set explicitly, this corresponds to cp = 0.01. It produced a relatively simple two-split tree on a single variable, with an accuracy of 0.8918.

#creating the decision tree model
dt_model <- rpart(y~., method="class", data = data_train)
#graphing the final model
rpart.plot(dt_model)
title("Default", line = 2.5)

#predicting the results of the test data with the model
pred <- predict(dt_model, data_test, type="class")
#printing the metrics for the model's predictions
confusionMatrix(pred, as.factor(data_test$y), mode = "everything")
Confusion Matrix and Statistics

          Reference
Prediction   no  yes
       no  7847  802
       yes  176  217
                                          
               Accuracy : 0.8918          
                 95% CI : (0.8853, 0.8982)
    No Information Rate : 0.8873          
    P-Value [Acc > NIR] : 0.08843         
                                          
                  Kappa : 0.261           
                                          
 Mcnemar's Test P-Value : < 2e-16         
                                          
            Sensitivity : 0.9781          
            Specificity : 0.2130          
         Pos Pred Value : 0.9073          
         Neg Pred Value : 0.5522          
              Precision : 0.9073          
                 Recall : 0.9781          
                     F1 : 0.9413          
             Prevalence : 0.8873          
         Detection Rate : 0.8678          
   Detection Prevalence : 0.9565          
      Balanced Accuracy : 0.5955          
                                          
       'Positive' Class : no              
                                          

Experiment 2

To see whether the decision tree could be built on other features, I remove the duration feature and look at the tree that results (a fresh 80/20 split is drawn for this reduced dataset). The model produced a decision tree with no splits that simply classifies everything as “no”. It still achieved an accuracy of 0.8808 because most of the actual test data is labelled “no”.

#creating a new base dataset removing the duration feature
data2 <- data |> dplyr::select(-duration)

#creating new test and training data with new data set
samp <- sample(nrow(data2), round(nrow(data2)*0.8), replace = FALSE)
data_train2 <- data2[samp,]
data_test2 <- data2[-samp,]

#creating a new decision tree model and displaying the output
dt_model2 <- rpart(y~., method="class", data = data_train2)
rpart.plot(dt_model2)
title("No duration")

#predicting and printing metrics for the model
pred <- predict(dt_model2, data_test2, type="class")
confusionMatrix(pred, as.factor(data_test2$y), mode = "everything")
Confusion Matrix and Statistics

          Reference
Prediction   no  yes
       no  7964 1078
       yes    0    0
                                          
               Accuracy : 0.8808          
                 95% CI : (0.8739, 0.8874)
    No Information Rate : 0.8808          
    P-Value [Acc > NIR] : 0.5081          
                                          
                  Kappa : 0               
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 1.0000          
            Specificity : 0.0000          
         Pos Pred Value : 0.8808          
         Neg Pred Value :    NaN          
              Precision : 0.8808          
                 Recall : 1.0000          
                     F1 : 0.9366          
             Prevalence : 0.8808          
         Detection Rate : 0.8808          
   Detection Prevalence : 1.0000          
      Balanced Accuracy : 0.5000          
                                          
       'Positive' Class : no              
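
One alternative remedy for this all-“no” tree, not pursued in these experiments, is to give rpart a loss matrix that penalises missing a “yes” more heavily than a false alarm, which forces splits even on an imbalanced target. A minimal sketch (the 5x penalty is an arbitrary illustration):

#sketch: make misclassifying a true "yes" as "no" five times as costly as the reverse
#loss[i, j] = cost of predicting class j for a true class i (classes ordered "no", "yes")
loss <- matrix(c(0, 1,
                 5, 0), nrow = 2, byrow = TRUE)
dt_weighted <- rpart(y~., method = "class", data = data_train2,
                     parms = list(loss = loss))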
                                          

Experiment 3

To combat the issue from the last experiment, I vary the complexity parameter cp over a number of values and record the metrics for each to determine which is optimal. The optimal value was cp = 0.001, with an accuracy of 0.8990, incrementally better than the default model; however, it produced an extremely complex tree with more than 50 splits. If we wanted a decision tree that could be easily explained in a business setting, cp = 0.008 would be the best choice: it is not overly complex, its accuracy (0.8929) is marginally better than the default, and it is only marginally worse than cp = 0.001.

#creating a list of complexity parameters to try
cp <- c(0.001, 0.004, 0.006, 0.008, 0.01, 0.02, 0.03)

#creating a df to store the results in (cp holds fractional values, so double)
df <- data.frame(cp = double(),
                Accuracy = double(),
                Precision = double(),
                Recall = double())

#looping through the different cp values and storing choice metrics in df
for (x in cp){
  #creation of decision tree model
  dt_model <- rpart(
    y~., 
    method="class", 
    data = data_train, 
    #setting cp for the current loop
    control = rpart.control(cp=x)
  )
  #plotting model
  rpart.plot(dt_model)
  title(paste("cp=", as.character(x)), line = 2.5)
  
  #predicting based on model and getting metrics
  pred <- predict(dt_model, data_test, type="class")
  cM <- confusionMatrix(pred, as.factor(data_test$y), mode = "everything")
  
  #temp storage of choice metrics
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  
  #storing metrics for the run
  df[nrow(df) + 1,] = c(x,acc, prec, recall)
}

#printing metrics for all runs
print(df |> arrange(desc(Accuracy)))
     cp  Accuracy Precision    Recall
1 0.001 0.8990268 0.9276949 0.9611118
2 0.004 0.8971466 0.9242732 0.9629814
3 0.006 0.8968149 0.9228292 0.9643525
4 0.008 0.8929440 0.9106041 0.9750717
5 0.010 0.8918381 0.9072725 0.9780631
6 0.020 0.8918381 0.9072725 0.9780631
7 0.030 0.8873037 0.8873037 1.0000000
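
rpart also keeps its own cross-validated error estimate for every candidate cp, which offers a second opinion on this choice without touching the test set. A short sketch of inspecting that table on the most complex model and pruning back to the business-friendly value:

#sketch: inspect rpart's internal cross-validated cp table and prune
dt_full <- rpart(y~., method = "class", data = data_train,
                 control = rpart.control(cp = 0.001))
printcp(dt_full)   # cp table including cross-validated error (xerror)
plotcp(dt_full)    # visual aid for choosing a cp to prune at
dt_pruned <- prune(dt_full, cp = 0.008)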

Random Forest

Experiment 4

For the first random forest experiment I run with the defaults for the rf method in the train function from the caret package. I set the .mtry parameter to the commonly recommended default of the square root of the number of features; with 13 features this rounds to 4 (\(\sqrt{13} \approx 3.606\)). This gives an accuracy of 0.902, the best produced so far.
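
A quick check of that heuristic (13 predictors once the target y is excluded):

#the sqrt(p) heuristic for .mtry
p <- ncol(data_train) - 1  # 13 predictors after excluding y
sqrt(p)                    # ~3.606
round(sqrt(p))             # 4, the value used below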

#training default model
rf_model <- train(
  y~., 
  data = data_train, 
  metric = "Accuracy", 
  method = "rf",
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(.mtry = 4),
  na.action=na.omit
)
#predict based on test data and printing metrics for model
rf_pred <- predict(rf_model, data_test)
confusionMatrix(rf_pred, as.factor(data_test$y), mode = "everything")
Confusion Matrix and Statistics

          Reference
Prediction   no  yes
       no  7836  699
       yes  187  320
                                          
               Accuracy : 0.902           
                 95% CI : (0.8957, 0.9081)
    No Information Rate : 0.8873          
    P-Value [Acc > NIR] : 3.552e-06       
                                          
                  Kappa : 0.3724          
                                          
 Mcnemar's Test P-Value : < 2.2e-16       
                                          
            Sensitivity : 0.9767          
            Specificity : 0.3140          
         Pos Pred Value : 0.9181          
         Neg Pred Value : 0.6312          
              Precision : 0.9181          
                 Recall : 0.9767          
                     F1 : 0.9465          
             Prevalence : 0.8873          
         Detection Rate : 0.8666          
   Detection Prevalence : 0.9439          
      Balanced Accuracy : 0.6454          
                                          
       'Positive' Class : no              
                                          

Experiment 5

To experiment further with the random forest method, I test .mtry values ranging from 1 to 10. The best model proved to be the one with .mtry = 8: it has the highest accuracy, 0.9044, and one of the highest precision values, which is preferable in this business case. This model is the most successful of the experiments so far.

#setting the values to test for the .mtry parameter
#(named mtry_values rather than try, which would shadow base R's try())
mtry_values <- 1:10

#creating df for storage of metrics
df <- data.frame(mtry = integer(),
                Accuracy = double(),
                Precision = double(),
                Recall = double())

for (x in mtry_values){
  # defining model for the run
  rf_model <- train(
    y~., 
    data = data_train, 
    metric = "Accuracy", 
    method = "rf",
    trControl = trainControl(method = "none"),
    #setting .mtry for the run
    tuneGrid = expand.grid(.mtry = x),
    na.action=na.omit
  )
  #predicting and getting metrics
  rf_pred <- predict(rf_model, data_test)
  cM <- confusionMatrix(rf_pred, as.factor(data_test$y), mode = "everything")
  
  #storing choice metrics for run
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]
  
  #storing metrics for run
  df[nrow(df) + 1,] = c(x,acc, prec, recall)
}
print(df |> arrange(desc(Accuracy), desc(Precision)))
   mtry  Accuracy Precision    Recall
1     8 0.9044459 0.9319416 0.9626075
2     7 0.9041141 0.9303584 0.9641032
3     6 0.9041141 0.9271729 0.9679671
4    10 0.9035612 0.9329217 0.9603640
5     9 0.9034506 0.9325992 0.9606132
6     5 0.9025658 0.9238073 0.9702106
7     4 0.9014599 0.9175644 0.9766920
8     3 0.8972572 0.9056496 0.9870373
9     2 0.8891838 0.8895794 0.9991275
10    1 0.8873037 0.8873037 1.0000000
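
As a sanity check on what drives the winning forest, caret's varImp can rank the features. A sketch refitting at the best setting (shown separately because the loop above discards each fitted model):

#sketch: feature importance for the best setting (.mtry = 8)
rf_best <- train(
  y~., 
  data = data_train, 
  metric = "Accuracy", 
  method = "rf",
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(.mtry = 8),
  na.action=na.omit
)
varImp(rf_best)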

AdaBoost

Experiment 6

Now for AdaBoost (actually xgbTree, which is a form of gradient boosting rather than adaptive boosting). This method does not appear to come with defaults when tuning is disabled, so I start from the defaults used in an example in the textbook Practical Machine Learning in R. This experiment provides the highest accuracy so far, 0.9049.

#creating default model for xgbTree
xgb_model <- train(
  y~., 
  data = data_train, 
  metric = "Accuracy", 
  method = "xgbTree", 
  na.action=na.omit,
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(
    nrounds = 100,
    max_depth = 6,
    eta = 0.3,
    gamma = 0.01,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 1
  )
)
#predicting and printing metrics
xgb_pred <- predict(xgb_model, data_test)
confusionMatrix(xgb_pred, as.factor(data_test$y), mode= "everything")
Confusion Matrix and Statistics

          Reference
Prediction   no  yes
       no  7684  521
       yes  339  498
                                          
               Accuracy : 0.9049          
                 95% CI : (0.8987, 0.9109)
    No Information Rate : 0.8873          
    P-Value [Acc > NIR] : 3.429e-08       
                                          
                  Kappa : 0.4842          
                                          
 Mcnemar's Test P-Value : 6.741e-10       
                                          
            Sensitivity : 0.9577          
            Specificity : 0.4887          
         Pos Pred Value : 0.9365          
         Neg Pred Value : 0.5950          
              Precision : 0.9365          
                 Recall : 0.9577          
                     F1 : 0.9470          
             Prevalence : 0.8873          
         Detection Rate : 0.8498          
   Detection Prevalence : 0.9074          
      Balanced Accuracy : 0.7232          
                                          
       'Positive' Class : no              
                                          

Experiment 7

Given how that last experiment and the decision tree and random forest experiments went, I try a mix of values for the gradient boosting. I vary the number of rounds, the depth of the boost, and eta, which controls the speed at which the model learns (I am not sure how much this impacts the final output, but that is the point of experimentation). The best results came from nrounds = 100, max_depth = 8, and eta = 0.1, with an accuracy of 0.9062; a few settings tied on accuracy, but this one had the highest precision, which again yields the fewest false positives. This is the model with the highest accuracy of all the experiments.

#setting values for experiment to be iterated through
rounds <- c(25, 50, 75, 100, 125)
depths <- c(4, 5, 6, 7, 8)
etas <- c(0.1, 0.2, 0.3, 0.4, 0.5)

#creating df for storage of metrics
df <- data.frame(nrounds = integer(),
                depth = integer(),
                eta = double(),
                Accuracy = double(),
                Precision = double(),
                Recall = double())

#triple nested for loops to set all the selected parameter combinations
for (x in rounds) {
  for (y in depths) {
    for (z in etas) {
      xgb_model <- train(
        y~.,
        data = data_train,
        metric = "Accuracy",
        method = "xgbTree",
        na.action=na.omit,
        trControl = trainControl(method = "none"),
        tuneGrid = expand.grid(
          #setting parameters for run
          nrounds = x,
          max_depth = y,
          eta = z,
          gamma = 0.01,
          colsample_bytree = 1,
          min_child_weight = 1,
          subsample = 1
        )
      )
      #predicting and getting metrics
      xgb_pred <- predict(xgb_model, data_test)
      cM <- confusionMatrix(xgb_pred, as.factor(data_test$y), mode = "everything")
      
      #gathering choice metrics
      acc <- cM$overall[[1]]
      prec <- cM$byClass[[5]]
      recall <- cM$byClass[[6]]
      
      #storing metrics for run
      df[nrow(df) + 1,] = c(x,y,z,acc, prec, recall)
    }
  }
}
#printing metrics for all runs
print(df |> arrange(desc(Accuracy), desc(Precision)) |> head())
  nrounds depth eta  Accuracy Precision    Recall
1     100     8 0.1 0.9062154 0.9359582 0.9599900
2      75     7 0.1 0.9062154 0.9337444 0.9626075
3      75     5 0.2 0.9062154 0.9330115 0.9634800
4      50     5 0.2 0.9059942 0.9320564 0.9643525
5      50     4 0.3 0.9057731 0.9308977 0.9654743
6     100     5 0.1 0.9057731 0.9303805 0.9660975
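
The same sweep could also be delegated to caret's resampling machinery, which tunes on held-out folds of the training data rather than on the test set. A sketch assuming 3-fold cross-validation over a reduced grid (slower, so not executed here):

#sketch: let caret tune xgbTree with 3-fold cross-validation
grid <- expand.grid(
  nrounds = c(50, 100),
  max_depth = c(5, 8),
  eta = c(0.1, 0.3),
  gamma = 0.01,
  colsample_bytree = 1,
  min_child_weight = 1,
  subsample = 1
)
xgb_cv <- train(
  y~., 
  data = data_train, 
  metric = "Accuracy", 
  method = "xgbTree", 
  na.action=na.omit,
  trControl = trainControl(method = "cv", number = 3),
  tuneGrid = grid
)
xgb_cv$bestTune  # best parameter combination by cross-validated accuracy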