library(tidyverse)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)
library(xgboost)
library(corrplot)
Assignment 2
Data Import
The data imported here is the dataset that was preprocessed in Assignment 1.
# reads in the data
data <- read.csv("preprocessed_bank.csv")
print(head(data))
age job marital education default balance housing loan duration
1 55 management married tertiary no 2000 yes no 260
2 40 technician single secondary no 0 yes no 150
3 30 entrepreneur married secondary no 0 yes yes 70
4 45 blue-collar married unknown no 1000 yes no 90
5 30 unknown single unknown no 0 no no 190
6 35 management married tertiary no 0 yes no 130
campaign pdays previous y days_since
1 1 -1 0 no 240
2 1 -1 0 no 240
3 1 -1 0 no 240
4 1 -1 0 no 240
5 1 -1 0 no 240
6 1 -1 0 no 240
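Before any modelling, it is worth confirming the class balance of the target variable, since the heavy skew toward "no" shapes every accuracy figure that follows. A minimal check (an addition for illustration, not part of the original assignment code):
#checking the class balance of the target variable
prop.table(table(data$y))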
Training Data
With a seed set, the training and test data are sampled at random once and reused for the duration of the assignment, providing a fair testing environment for each model.
#setting the seed for all experiments
set.seed(314)

#creating a random sample subset of indices
samp <- sample(nrow(data), round(nrow(data)*0.8), replace = FALSE)

#setting the training and testing data from the sample indices
data_train <- data[samp,]
data_test <- data[-samp,]
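As a quick sanity check on the 80/20 split (again an addition, not in the original), the partition sizes and class proportions can be compared:
#confirming the split sizes and that both partitions keep a similar class mix
nrow(data_train); nrow(data_test)
prop.table(table(data_train$y))
prop.table(table(data_test$y))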
Decision Trees
Experiment 1
The first run of the decision tree experiments uses the default settings of the rpart function. Although not specified explicitly, that means cp = 0.01. This produced a relatively simple tree with two splits on a single variable and gave an accuracy of 0.8918.
#creating the decision tree model
dt_model <- rpart(y~., method="class", data = data_train)

#graphing the final model
rpart.plot(dt_model)
title("Default", line = 2.5)

#predicting the results of the test data with the model
pred <- predict(dt_model, data_test, type="class")

#printing the metrics for the model's predictions
confusionMatrix(pred, as.factor(data_test$y), mode = "everything")
Confusion Matrix and Statistics
Reference
Prediction no yes
no 7847 802
yes 176 217
Accuracy : 0.8918
95% CI : (0.8853, 0.8982)
No Information Rate : 0.8873
P-Value [Acc > NIR] : 0.08843
Kappa : 0.261
Mcnemar's Test P-Value : < 2e-16
Sensitivity : 0.9781
Specificity : 0.2130
Pos Pred Value : 0.9073
Neg Pred Value : 0.5522
Precision : 0.9073
Recall : 0.9781
F1 : 0.9413
Prevalence : 0.8873
Detection Rate : 0.8678
Detection Prevalence : 0.9565
Balanced Accuracy : 0.5955
'Positive' Class : no
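To verify which variable the default tree actually splits on and how tree size relates to cp, rpart's complexity table can be printed. This is a supplementary check rather than part of the original experiment:
#inspecting the splitting variables and the cp table of the fitted tree
printcp(dt_model)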
Experiment 2
To see whether the decision tree could be built on other features, I remove the duration feature and observe what tree is created. The model produced a decision tree with no splits that simply classifies everything as "no". This gave an accuracy of 0.8808 only because most of the actual test data is labelled "no".
#creating a new base dataset removing the duration feature
data2 <- data |> dplyr::select(-duration)

#creating new test and training data with the new dataset
samp <- sample(nrow(data2), round(nrow(data2)*0.8), replace = FALSE)
data_train2 <- data2[samp,]
data_test2 <- data2[-samp,]

#creating a new decision tree model and displaying the output
dt_model2 <- rpart(y~., method="class", data = data_train2)
rpart.plot(dt_model2)
title("No duration")

#predicting and printing metrics for the model
pred <- predict(dt_model2, data_test2, type="class")
confusionMatrix(pred, as.factor(data_test2$y), mode = "everything")
Confusion Matrix and Statistics
Reference
Prediction no yes
no 7964 1078
yes 0 0
Accuracy : 0.8808
95% CI : (0.8739, 0.8874)
No Information Rate : 0.8808
P-Value [Acc > NIR] : 0.5081
Kappa : 0
Mcnemar's Test P-Value : <2e-16
Sensitivity : 1.0000
Specificity : 0.0000
Pos Pred Value : 0.8808
Neg Pred Value : NaN
Precision : 0.8808
Recall : 1.0000
F1 : 0.9366
Prevalence : 0.8808
Detection Rate : 0.8808
Detection Prevalence : 1.0000
Balanced Accuracy : 0.5000
'Positive' Class : no
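The 0.8808 accuracy of this no-split tree is simply the prevalence of "no" in the test set, which the following one-liner (added for illustration) confirms:
#accuracy of an all-"no" classifier equals the test-set prevalence of "no"
mean(data_test2$y == "no")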
Experiment 3
To combat the issue seen in the last experiment, I vary the complexity parameter cp over a number of values and record the metrics from each run to determine which value is optimal. The optimal value was cp = 0.001 with an accuracy of 0.8990, incrementally better than the default model, but it produced an extremely complex decision tree with more than 50 splits. If we wanted a decision tree that could be easily explained in a business setting, cp = 0.008 would be the best choice: it is not that complex to explain, has marginally better accuracy than the default (0.8929), and is only marginally worse than cp = 0.001.
#creating a list of complexity parameters to try
cp = c(0.001,0.004,0.006,0.008,0.01,0.02,0.03)

#creating a df to store the results in
df <- data.frame(cp = integer(),
                 Accuracy = double(),
                 Precision = double(),
                 Recall = double())

#looping through the different cp values and storing choice metrics in df
for (x in cp){
  #creation of decision tree model
  dt_model <- rpart(
    y~.,
    method="class",
    data = data_train,
    #setting cp for the current loop
    control = rpart.control(cp=x)
  )
  #plotting model
  rpart.plot(dt_model)
  title(paste("cp=", as.character(x)), line = 2.5)

  #predicting based on model and getting metrics
  pred <- predict(dt_model, data_test, type="class")
  cM <- confusionMatrix(pred, as.factor(data_test$y), mode = "everything")

  #temp storage of choice metrics
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]

  #storing metrics for the run
  df[nrow(df) + 1,] = c(x, acc, prec, recall)
}
#printing metrics for all runs
print(df |> arrange(desc(Accuracy)))
cp Accuracy Precision Recall
1 0.001 0.8990268 0.9276949 0.9611118
2 0.004 0.8971466 0.9242732 0.9629814
3 0.006 0.8968149 0.9228292 0.9643525
4 0.008 0.8929440 0.9106041 0.9750717
5 0.010 0.8918381 0.9072725 0.9780631
6 0.020 0.8918381 0.9072725 0.9780631
7 0.030 0.8873037 0.8873037 1.0000000
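rpart also performs internal cross-validation over its pruning sequence, so an alternative to the manual sweep above would be to grow one deep tree and read the cross-validated error off its cp table. A sketch of that approach (an assumption of mine, not the original method):
#growing a single deep tree and using rpart's built-in cross-validation
dt_deep <- rpart(y~., method="class", data = data_train,
                 control = rpart.control(cp = 0.001))

#the xerror column gives cross-validated error for each candidate cp
printcp(dt_deep)
plotcp(dt_deep)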
Random Forest
Experiment 4
For the first random forest experiment I run with the defaults for the rf method in the train function from the caret package. I set the .mtry parameter to the recommended default of the square root of the number of features; with 13 features that rounds to 4 (\(\sqrt{13} \approx 3.606\)). This yields an accuracy of 0.902, the best produced so far.
#training default model
rf_model <- train(
  y~.,
  data = data_train,
  metric = "Accuracy",
  method = "rf",
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(.mtry = 4),
  na.action=na.omit
)
#predict based on test data and printing metrics for model
rf_pred <- predict(rf_model, data_test)
confusionMatrix(rf_pred, as.factor(data_test$y), mode = "everything")
Confusion Matrix and Statistics
Reference
Prediction no yes
no 7836 699
yes 187 320
Accuracy : 0.902
95% CI : (0.8957, 0.9081)
No Information Rate : 0.8873
P-Value [Acc > NIR] : 3.552e-06
Kappa : 0.3724
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9767
Specificity : 0.3140
Pos Pred Value : 0.9181
Neg Pred Value : 0.6312
Precision : 0.9181
Recall : 0.9767
F1 : 0.9465
Prevalence : 0.8873
Detection Rate : 0.8666
Detection Prevalence : 0.9439
Balanced Accuracy : 0.6454
'Positive' Class : no
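Given how dominant duration proved for the decision tree in Experiment 2, a natural supplementary check (not in the original analysis) is which features the random forest leans on, using caret's varImp:
#ranking feature importance for the fitted random forest
varImp(rf_model)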
Experiment 5
To experiment further with the random forest method I test .mtry values ranging from 1 to 10. The best model proved to be the one with .mtry = 8: it had the highest accuracy, 0.9044, and one of the highest precision values, which is preferable in this business case. This model is the most successful of the experiments so far.
#setting values to test in .mtry parameter
try = c(1:10)

#creating df for storage of metrics
df <- data.frame(mtry = integer(),
                 Accuracy = double(),
                 Precision = double(),
                 Recall = double())

for (x in try){
  # defining model for the run
  rf_model <- train(
    y~.,
    data = data_train,
    metric = "Accuracy",
    method = "rf",
    trControl = trainControl(method = "none"),
    #setting .mtry for the run
    tuneGrid = expand.grid(.mtry = x),
    na.action=na.omit
  )
  #predicting and getting metrics
  rf_pred <- predict(rf_model, data_test)
  cM <- confusionMatrix(rf_pred, as.factor(data_test$y), mode = "everything")

  #storing choice metrics for run
  acc <- cM$overall[[1]]
  prec <- cM$byClass[[5]]
  recall <- cM$byClass[[6]]

  #storing metrics for run
  df[nrow(df) + 1,] = c(x, acc, prec, recall)
}
print(df |> arrange(desc(Precision)) |> arrange(desc(Accuracy)))
mtry Accuracy Precision Recall
1 8 0.9044459 0.9319416 0.9626075
2 7 0.9041141 0.9303584 0.9641032
3 6 0.9041141 0.9271729 0.9679671
4 10 0.9035612 0.9329217 0.9603640
5 9 0.9034506 0.9325992 0.9606132
6 5 0.9025658 0.9238073 0.9702106
7 4 0.9014599 0.9175644 0.9766920
8 3 0.8972572 0.9056496 0.9870373
9 2 0.8891838 0.8895794 0.9991275
10 1 0.8873037 0.8873037 1.0000000
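The loop above evaluates each .mtry against the single held-out test set. caret can run the same sweep with cross-validation on the training data instead, which avoids tuning against the test set; a sketch of that alternative (the 5-fold choice is arbitrary):
#letting caret tune .mtry by 5-fold cross-validation on the training data
rf_cv <- train(
  y~.,
  data = data_train,
  metric = "Accuracy",
  method = "rf",
  trControl = trainControl(method = "cv", number = 5),
  tuneGrid = expand.grid(.mtry = 1:10),
  na.action = na.omit
)
print(rf_cv$bestTune)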
Adaboost
Experiment 6
Now for AdaBoost (actually xgbTree, which is a variant of gradient boosting rather than adaptive boosting). I start with defaults; since this method does not appear to come with defaults of its own, I use the values from an example in the textbook Practical Machine Learning in R. This experiment provides the highest accuracy so far, 0.9049.
#creating default model for xgbTree
xgb_model <- train(
  y~.,
  data = data_train,
  metric = "Accuracy",
  method = "xgbTree",
  na.action=na.omit,
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(
    nrounds = 100,
    max_depth = 6,
    eta = 0.3,
    gamma = 0.01,
    colsample_bytree = 1,
    min_child_weight = 1,
    subsample = 1
  )
)
#predicting and printing metrics
xgb_pred <- predict(xgb_model, data_test)
confusionMatrix(xgb_pred, as.factor(data_test$y), mode = "everything")
Confusion Matrix and Statistics
Reference
Prediction no yes
no 7684 521
yes 339 498
Accuracy : 0.9049
95% CI : (0.8987, 0.9109)
No Information Rate : 0.8873
P-Value [Acc > NIR] : 3.429e-08
Kappa : 0.4842
Mcnemar's Test P-Value : 6.741e-10
Sensitivity : 0.9577
Specificity : 0.4887
Pos Pred Value : 0.9365
Neg Pred Value : 0.5950
Precision : 0.9365
Recall : 0.9577
F1 : 0.9470
Prevalence : 0.8873
Detection Rate : 0.8498
Detection Prevalence : 0.9074
Balanced Accuracy : 0.7232
'Positive' Class : no
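Because the business case favours precision, it can also help to look past the default 0.5 cutoff. caret can return class probabilities for xgbTree, and predicting "yes" only at a stricter threshold trades some recall on "yes" for fewer false positives. A sketch (the 0.7 threshold is an arbitrary example, not a tuned value):
#getting class probabilities instead of hard labels
xgb_prob <- predict(xgb_model, data_test, type = "prob")

#only predicting "yes" when the model is at least 70% confident
pred_strict <- factor(ifelse(xgb_prob$yes >= 0.7, "yes", "no"),
                      levels = c("no", "yes"))
confusionMatrix(pred_strict, as.factor(data_test$y), mode = "everything")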
Experiment 7
After the last experiment, and given how the experiments went for decision trees and random forests, I try a mix of values for the gradient booster. I vary the number of rounds, the depth of each boosted tree, and eta, which controls the speed at which the model learns (I am not sure how much this impacts the final output, but that is the point of experimentation). The parameters nrounds = 100, max_depth = 8, and eta = 0.1 gave the best results with accuracy = 0.9062; while a few settings tied on accuracy, this one had the highest precision, which again means the fewest false positives. This is the model with the highest accuracy of all the experiments.
#setting values for experiment to be iterated through
rounds = c(25,50,75,100,125)
depths = c(4,5,6,7,8)
etas = c(0.1,0.2,0.3,0.4,0.5)

#creating df for storage of metrics
df <- data.frame(nrounds = integer(),
                 depth = integer(),
                 eta = double(),
                 Accuracy = double(),
                 Precision = double(),
                 Recall = double())

#triple nested for loops to set all the selected parameter combinations
for (x in rounds) {
  for (y in depths) {
    for (z in etas) {
      xgb_model <- train(
        y~.,
        data = data_train,
        metric = "Accuracy",
        method = "xgbTree",
        na.action=na.omit,
        trControl = trainControl(method = "none"),
        tuneGrid = expand.grid(
          #setting parameters for run
          nrounds = x,
          max_depth = y,
          eta = z,
          gamma = 0.01,
          colsample_bytree = 1,
          min_child_weight = 1,
          subsample = 1
        )
      )
      #predicting and getting metrics
      xgb_pred <- predict(xgb_model, data_test)
      cM <- confusionMatrix(xgb_pred, as.factor(data_test$y), mode = "everything")

      #gathering choice metrics
      acc <- cM$overall[[1]]
      prec <- cM$byClass[[5]]
      recall <- cM$byClass[[6]]

      #storing metrics for run
      df[nrow(df) + 1,] = c(x, y, z, acc, prec, recall)
    }
  }
}
#printing metrics for all runs
print(df |> arrange(desc(Precision)) |> arrange(desc(Accuracy)) |> head())
nrounds depth eta Accuracy Precision Recall
1 100 8 0.1 0.9062154 0.9359582 0.9599900
2 75 7 0.1 0.9062154 0.9337444 0.9626075
3 75 5 0.2 0.9062154 0.9330115 0.9634800
4 50 5 0.2 0.9059942 0.9320564 0.9643525
5 50 4 0.3 0.9057731 0.9308977 0.9654743
6 100 5 0.1 0.9057731 0.9303805 0.9660975
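To close out, the winning configuration can be refit once so the final model exists as a single object. This refit is an addition using the best parameters found above; with subsample = 1 and colsample_bytree = 1 the fit is deterministic, so it should reproduce the 0.9062 accuracy:
#refitting the best configuration from Experiment 7 as the final model
xgb_final <- train(
  y~.,
  data = data_train,
  metric = "Accuracy",
  method = "xgbTree",
  na.action = na.omit,
  trControl = trainControl(method = "none"),
  tuneGrid = expand.grid(nrounds = 100, max_depth = 8, eta = 0.1,
                         gamma = 0.01, colsample_bytree = 1,
                         min_child_weight = 1, subsample = 1)
)
confusionMatrix(predict(xgb_final, data_test), as.factor(data_test$y),
                mode = "everything")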