#load required libraries
suppressWarnings(suppressMessages({library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(ggmosaic)
library(caret)
library(e1071)
library(DMwR2)
library(rpart)
library(rpart.plot)
library(doParallel)
library(foreach)
library(xgboost)}))
# Load the bank marketing data set into a data frame.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where this exact file exists. read.csv() also assumes a comma-delimited
# file -- confirm the delimiter of this local copy.
bank_addtl_full <- read.csv("/Users/mollysiebecker/DATA 622/bank-additional-full.csv")

Relevant Data Pre-Processing Steps from Assignment 1

# Pre-processing carried over from Assignment 1, expressed as one pipeline:
#   1. drop 'pdays' and 'default' (identified as near-zero-variance
#      predictors), which also cuts the training cost;
#   2. convert every remaining character column to a factor so the models
#      treat it as categorical.
bank_addtl_full <- bank_addtl_full %>%
  select(-c(pdays, default)) %>%
  mutate(across(where(is.character), as.factor))

Assignment 2 Code

Decision Trees

Control Model

# Hold out 20% of the rows for testing. The seed makes the partition (and the
# cross-validation folds drawn afterwards) repeatable from run to run.
set.seed(1989)
trainIndex_80 <- createDataPartition(
  bank_addtl_full$y,
  p = 0.8,
  list = FALSE
)
trainData_80 <- bank_addtl_full[trainIndex_80, ]
testData_20 <- bank_addtl_full[-trainIndex_80, ]

# Baseline CART tree (rpart) on all predictors. No tuneGrid is supplied, so
# caret picks the candidate cp values itself and scores each one with
# 5-fold cross-validation.
tree_model_control <- train(
  y ~ .,
  data = trainData_80,
  trControl = trainControl(number = 5, method = "cv"),
  method = "rpart"
)

# Show the resampling results and the cp value that was selected.
print(tree_model_control)
## CART 
## 
## 32951 samples
##    18 predictor
##     2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 26361, 26361, 26360, 26361, 26361 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.02007004  0.9066795  0.4426613
##   0.02128233  0.9061029  0.4509412
##   0.06950431  0.8949958  0.2495608
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02007004.
# Draw the tree that caret kept as the final model.
rpart.plot(tree_model_control[["finalModel"]])

# Chart the (up to) 20 most important predictors.
plot(varImp(tree_model_control), top = 20)

# Score the held-out 20% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_tree_control <- predict(tree_model_control, newdata = testData_20)
confusionMatrix(
  data = predictions_tree_control,
  reference = testData_20[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7153  599
##        yes  156  329
##                                           
##                Accuracy : 0.9083          
##                  95% CI : (0.9019, 0.9145)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 3.126e-10       
##                                           
##                   Kappa : 0.4209          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9787          
##             Specificity : 0.3545          
##          Pos Pred Value : 0.9227          
##          Neg Pred Value : 0.6784          
##              Prevalence : 0.8873          
##          Detection Rate : 0.8684          
##    Detection Prevalence : 0.9411          
##       Balanced Accuracy : 0.6666          
##                                           
##        'Positive' Class : no              
## 

Experiment 1: Use hyperparameter tuning to improve performance

# Experiment 1 tree: same data and 5-fold cross-validation as the control,
# but the complexity parameter (cp, which governs pruning in rpart) is chosen
# from an explicit grid instead of caret's automatic candidates.
tree_model_exp_1 <- train(
  y ~ .,
  data = trainData_80,
  tuneGrid = expand.grid(cp = c(0.01, 0.05, 0.1)),
  trControl = trainControl(number = 5, method = "cv"),
  method = "rpart"
)

# Show the resampling results for each cp and the value that was selected.
print(tree_model_exp_1)
## CART 
## 
## 32951 samples
##    18 predictor
##     2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 26361, 26361, 26362, 26360, 26360 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa    
##   0.01  0.9107462  0.5178619
##   0.05  0.9028861  0.4370908
##   0.10  0.8873479  0.0000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01.
# Draw the tree that caret kept as the final model.
rpart.plot(tree_model_exp_1[["finalModel"]])

# Chart the (up to) 20 most important predictors.
plot(varImp(tree_model_exp_1), top = 20)

# Score the held-out 20% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_tree_1 <- predict(tree_model_exp_1, newdata = testData_20)
confusionMatrix(
  data = predictions_tree_1,
  reference = testData_20[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7052  435
##        yes  257  493
##                                           
##                Accuracy : 0.916           
##                  95% CI : (0.9098, 0.9219)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5414          
##                                           
##  Mcnemar's Test P-Value : 1.714e-11       
##                                           
##             Sensitivity : 0.9648          
##             Specificity : 0.5312          
##          Pos Pred Value : 0.9419          
##          Neg Pred Value : 0.6573          
##              Prevalence : 0.8873          
##          Detection Rate : 0.8561          
##    Detection Prevalence : 0.9089          
##       Balanced Accuracy : 0.7480          
##                                           
##        'Positive' Class : no              
## 

Experiment 2: Increase train/test split to prevent overfitting

# Experiment 2 split: 85% of the rows for training, 15% held out for testing.
# The same seed as the 80/20 split is reused before partitioning.
set.seed(1989)
trainIndex_85 <- createDataPartition(
  bank_addtl_full$y,
  p = 0.85,
  list = FALSE
)
trainData_85 <- bank_addtl_full[trainIndex_85, ]
testData_15 <- bank_addtl_full[-trainIndex_85, ]

# Same CART setup and cp grid as Experiment 1; only the training set is
# larger.
tree_model_exp_2 <- train(
  y ~ .,
  data = trainData_85,
  tuneGrid = expand.grid(cp = c(0.01, 0.05, 0.1)),
  trControl = trainControl(number = 5, method = "cv"),
  method = "rpart"
)

# Show the resampling results for each cp and the value that was selected.
print(tree_model_exp_2)
## CART 
## 
## 35010 samples
##    18 predictor
##     2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 28008, 28007, 28009, 28008, 28008 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa    
##   0.01  0.9115396  0.5154555
##   0.05  0.9028276  0.4336195
##   0.10  0.8873465  0.0000000
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01.
# Draw the tree that caret kept as the final model.
rpart.plot(tree_model_exp_2[["finalModel"]])

# Chart the (up to) 20 most important predictors.
plot(varImp(tree_model_exp_2), top = 20)

# Score the held-out 15% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_tree_2 <- predict(tree_model_exp_2, newdata = testData_15)
confusionMatrix(
  data = predictions_tree_2,
  reference = testData_15[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5307  345
##        yes  175  351
##                                           
##                Accuracy : 0.9158          
##                  95% CI : (0.9086, 0.9226)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 9.996e-14       
##                                           
##                   Kappa : 0.5288          
##                                           
##  Mcnemar's Test P-Value : 1.252e-13       
##                                           
##             Sensitivity : 0.9681          
##             Specificity : 0.5043          
##          Pos Pred Value : 0.9390          
##          Neg Pred Value : 0.6673          
##              Prevalence : 0.8873          
##          Detection Rate : 0.8590          
##    Detection Prevalence : 0.9149          
##       Balanced Accuracy : 0.7362          
##                                           
##        'Positive' Class : no              
## 

Random Forest

Control Model

# Start a local worker cluster so caret can fit the resamples in parallel.
# One core is left free, but at least one worker is always requested:
# detectCores() can return 1 (or NA when the count is unknown), and
# makeCluster() rejects a worker count below 1.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))

# Baseline random forest on the 80% training set with 5-fold
# cross-validation; no tuneGrid is supplied, so caret chooses the mtry
# candidates itself.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even if registration or train() fails.
# NOTE(review): trainControl()'s `seeds` argument is not supplied, so the
# random numbers drawn inside the workers are presumably not tied to
# set.seed() -- confirm whether exact reproducibility is required.
rf_model_control <- tryCatch(
  {
    registerDoParallel(cl)
    train(
      y ~ .,
      data = trainData_80,
      method = "rf",   # Random Forest model
      trControl = trainControl(method = "cv", number = 5) # 5-fold cross validation
    )
  },
  finally = {
    stopCluster(cl)
    registerDoSEQ()  # Reset back to sequential processing
  }
)

# Print model summary
print(rf_model_control)
## Random Forest 
## 
## 32951 samples
##    18 predictor
##     2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 26361, 26361, 26361, 26360, 26361 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.9008831  0.2499619
##   26    0.9129312  0.5356371
##   50    0.9112317  0.5273636
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
# Chart the (up to) 20 most important predictors.
plot(varImp(rf_model_control), top = 20)

# Score the held-out 20% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_rf_control <- predict(rf_model_control, newdata = testData_20)
confusionMatrix(
  data = predictions_rf_control,
  reference = testData_20[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7011  410
##        yes  298  518
##                                         
##                Accuracy : 0.914         
##                  95% CI : (0.9078, 0.92)
##     No Information Rate : 0.8873        
##     P-Value [Acc > NIR] : 1.018e-15     
##                                         
##                   Kappa : 0.5462        
##                                         
##  Mcnemar's Test P-Value : 3.024e-05     
##                                         
##             Sensitivity : 0.9592        
##             Specificity : 0.5582        
##          Pos Pred Value : 0.9448        
##          Neg Pred Value : 0.6348        
##              Prevalence : 0.8873        
##          Detection Rate : 0.8512        
##    Detection Prevalence : 0.9009        
##       Balanced Accuracy : 0.7587        
##                                         
##        'Positive' Class : no            
## 

Experiment 1: Use hyperparameter tuning to improve performance

# Start a local worker cluster so caret can fit the resamples in parallel.
# One core is left free, but at least one worker is always requested:
# detectCores() can return 1 (or NA when the count is unknown), and
# makeCluster() rejects a worker count below 1.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))

# Experiment 1 random forest: same data and 5-fold cross-validation as the
# control, with mtry chosen from an explicit grid.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even if registration or train() fails.
# NOTE(review): the control run's recorded CV accuracy peaked at mtry = 26,
# so this grid only covers the low end -- consider adding larger values.
# NOTE(review): trainControl()'s `seeds` argument is not supplied, so the
# random numbers drawn inside the workers are presumably not tied to
# set.seed() -- confirm whether exact reproducibility is required.
rf_model_exp_1 <- tryCatch(
  {
    registerDoParallel(cl)
    train(
      y ~ .,
      data = trainData_80,
      method = "rf",   # Random Forest model
      trControl = trainControl(method = "cv", number = 5), # 5-fold cross validation
      tuneGrid = expand.grid(mtry = c(1, 2, 3))  # Mtry parameter
    )
  },
  finally = {
    stopCluster(cl)
    registerDoSEQ()  # Reset back to sequential processing
  }
)

# Print model summary
print(rf_model_exp_1)
## Random Forest 
## 
## 32951 samples
##    18 predictor
##     2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 26361, 26360, 26361, 26360, 26362 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   1     0.8873479  0.0000000
##   2     0.9006709  0.2480457
##   3     0.9063460  0.3537073
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Chart the (up to) 20 most important predictors.
plot(varImp(rf_model_exp_1), top = 20)

# Score the held-out 20% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_rf_1 <- predict(rf_model_exp_1, newdata = testData_20)
confusionMatrix(
  data = predictions_rf_1,
  reference = testData_20[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7222  664
##        yes   87  264
##                                          
##                Accuracy : 0.9088         
##                  95% CI : (0.9024, 0.915)
##     No Information Rate : 0.8873         
##     P-Value [Acc > NIR] : 1.21e-10       
##                                          
##                   Kappa : 0.3741         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.9881         
##             Specificity : 0.2845         
##          Pos Pred Value : 0.9158         
##          Neg Pred Value : 0.7521         
##              Prevalence : 0.8873         
##          Detection Rate : 0.8768         
##    Detection Prevalence : 0.9574         
##       Balanced Accuracy : 0.6363         
##                                          
##        'Positive' Class : no             
## 

Experiment 2: Increase train/test split to prevent overfitting

# Start a local worker cluster so caret can fit the resamples in parallel.
# One core is left free, but at least one worker is always requested:
# detectCores() can return 1 (or NA when the count is unknown), and
# makeCluster() rejects a worker count below 1.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))

# Experiment 2 random forest: same mtry grid and 5-fold cross-validation as
# Experiment 1, trained on the larger 85% training set.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even if registration or train() fails.
# NOTE(review): trainControl()'s `seeds` argument is not supplied, so the
# random numbers drawn inside the workers are presumably not tied to
# set.seed() -- confirm whether exact reproducibility is required.
rf_model_exp_2 <- tryCatch(
  {
    registerDoParallel(cl)
    train(
      y ~ .,
      data = trainData_85,
      method = "rf",   # Random Forest model
      trControl = trainControl(method = "cv", number = 5), # 5-fold cross validation
      tuneGrid = expand.grid(mtry = c(1, 2, 3))  # Mtry parameter
    )
  },
  finally = {
    stopCluster(cl)
    registerDoSEQ()  # Reset back to sequential processing
  }
)

# Print model summary
print(rf_model_exp_2)
## Random Forest 
## 
## 35010 samples
##    18 predictor
##     2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 28008, 28008, 28008, 28009, 28007 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   1     0.8873465  0.0000000
##   2     0.9016566  0.2574433
##   3     0.9065695  0.3539608
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Chart the (up to) 20 most important predictors.
plot(varImp(rf_model_exp_2), top = 20)

# Score the held-out 15% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_rf_2 <- predict(rf_model_exp_2, newdata = testData_15)
confusionMatrix(
  data = predictions_rf_2,
  reference = testData_15[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5422  511
##        yes   60  185
##                                           
##                Accuracy : 0.9076          
##                  95% CI : (0.9001, 0.9147)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 1.33e-07        
##                                           
##                   Kappa : 0.3554          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9891          
##             Specificity : 0.2658          
##          Pos Pred Value : 0.9139          
##          Neg Pred Value : 0.7551          
##              Prevalence : 0.8873          
##          Detection Rate : 0.8776          
##    Detection Prevalence : 0.9603          
##       Balanced Accuracy : 0.6274          
##                                           
##        'Positive' Class : no              
## 

XGBoost

*I attempted to use AdaBoost; however, the “adaboost” method in the caret package requires the fastAdaboost package, which is not maintained for the current version of R. As an alternative, I am using XGBoost.

Control Model

# Start a local worker cluster so caret can fit the resamples in parallel.
# One core is left free, but at least one worker is always requested:
# detectCores() can return 1 (or NA when the count is unknown), and
# makeCluster() rejects a worker count below 1.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))

# Baseline XGBoost model on the 80% training set with 3-fold
# cross-validation; no tuneGrid is supplied, so caret chooses the candidate
# parameter combinations itself.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even if registration or train() fails.
# NOTE(review): trainControl()'s `seeds` argument is not supplied, so the
# random numbers drawn inside the workers are presumably not tied to
# set.seed() -- confirm whether exact reproducibility is required.
xgb_model_control <- tryCatch(
  {
    registerDoParallel(cl)
    train(
      y ~ .,
      data = trainData_80,
      method = "xgbTree",   # XGBoost model
      trControl = trainControl(method = "cv", number = 3) # 3-fold cross-validation
    )
  },
  finally = {
    stopCluster(cl)
    registerDoSEQ()  # Reset back to sequential processing
  }
)

# Print concise model summary: only the row of the resampling results that
# matches the selected (best) parameter combination.
merge(xgb_model_control$results, xgb_model_control$bestTune)
##   eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.4         3     0              0.6                1      0.75      50
##   Accuracy     Kappa  AccuracySD    KappaSD
## 1   0.9163 0.5408723 0.002577816 0.01250585
# Chart the (up to) 20 most important predictors.
plot(varImp(xgb_model_control), top = 20)

# Score the held-out 20% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_xgb_control <- predict(xgb_model_control, newdata = testData_20)
confusionMatrix(
  data = predictions_xgb_control,
  reference = testData_20[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7064  446
##        yes  245  482
##                                          
##                Accuracy : 0.9161         
##                  95% CI : (0.9099, 0.922)
##     No Information Rate : 0.8873         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5366         
##                                          
##  Mcnemar's Test P-Value : 2.776e-14      
##                                          
##             Sensitivity : 0.9665         
##             Specificity : 0.5194         
##          Pos Pred Value : 0.9406         
##          Neg Pred Value : 0.6630         
##              Prevalence : 0.8873         
##          Detection Rate : 0.8576         
##    Detection Prevalence : 0.9117         
##       Balanced Accuracy : 0.7429         
##                                          
##        'Positive' Class : no             
## 

Experiment 1: Use hyperparameter tuning to improve performance

# Start a local worker cluster so caret can fit the resamples in parallel.
# One core is left free, but at least one worker is always requested:
# detectCores() can return 1 (or NA when the count is unknown), and
# makeCluster() rejects a worker count below 1.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))

# Experiment 1 XGBoost model: same data and 3-fold cross-validation as the
# control, tuned over an explicit grid of 972 parameter combinations
# (3 * 2 * 3 * 3 * 3 * 3 * 2).
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even if registration or train() fails.
# NOTE(review): trainControl()'s `seeds` argument is not supplied, so the
# random numbers drawn inside the workers are presumably not tied to
# set.seed() -- confirm whether exact reproducibility is required.
xgb_model_exp_1 <- tryCatch(
  {
    registerDoParallel(cl)
    train(
      y ~ .,
      data = trainData_80,
      method = "xgbTree",   # XGBoost model
      trControl = trainControl(method = "cv", number = 3), # 3-fold cross validation
      tuneGrid = expand.grid(
        nrounds = c(50, 100, 150),          # Number of trees
        max_depth = c(3, 6),                # Tree depth
        eta = c(0.1, 0.2, 0.3),             # Learning rate
        gamma = c(0, 1, 5),                 # Minimum loss reduction to split
        colsample_bytree = c(0.6, 0.8, 1),  # Feature sampling per tree
        min_child_weight = c(1, 3, 5),      # Minimum sum of instance weight
        subsample = c(0.6, 0.8)             # Fraction of data used per boosting round
      )
    )
  },
  finally = {
    stopCluster(cl)
    registerDoSEQ()  # Reset back to sequential processing
  }
)

# Print concise model summary: only the row of the resampling results that
# matches the selected (best) parameter combination.
merge(xgb_model_exp_1$results, xgb_model_exp_1$bestTune)
##   eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.1         6     5                1                5       0.8      50
##    Accuracy     Kappa  AccuracySD    KappaSD
## 1 0.9172105 0.5498451 0.001738418 0.01546551
# Chart the (up to) 20 most important predictors.
plot(varImp(xgb_model_exp_1), top = 20)

# Score the held-out 20% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_xgb_1 <- predict(xgb_model_exp_1, newdata = testData_20)
confusionMatrix(
  data = predictions_xgb_1,
  reference = testData_20[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7054  421
##        yes  255  507
##                                           
##                Accuracy : 0.9179          
##                  95% CI : (0.9118, 0.9238)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5548          
##                                           
##  Mcnemar's Test P-Value : 2.208e-10       
##                                           
##             Sensitivity : 0.9651          
##             Specificity : 0.5463          
##          Pos Pred Value : 0.9437          
##          Neg Pred Value : 0.6654          
##              Prevalence : 0.8873          
##          Detection Rate : 0.8564          
##    Detection Prevalence : 0.9075          
##       Balanced Accuracy : 0.7557          
##                                           
##        'Positive' Class : no              
## 

Experiment 2: Increase train/test split to prevent overfitting

# Start a local worker cluster so caret can fit the resamples in parallel.
# One core is left free, but at least one worker is always requested:
# detectCores() can return 1 (or NA when the count is unknown), and
# makeCluster() rejects a worker count below 1.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))

# Experiment 2 XGBoost model: same 972-combination grid and 3-fold
# cross-validation as Experiment 1, trained on the larger 85% training set.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even if registration or train() fails.
# NOTE(review): trainControl()'s `seeds` argument is not supplied, so the
# random numbers drawn inside the workers are presumably not tied to
# set.seed() -- confirm whether exact reproducibility is required.
xgb_model_exp_2 <- tryCatch(
  {
    registerDoParallel(cl)
    train(
      y ~ .,
      data = trainData_85,
      method = "xgbTree",   # XGBoost model
      trControl = trainControl(method = "cv", number = 3), # 3-fold cross validation
      tuneGrid = expand.grid(
        nrounds = c(50, 100, 150),          # Number of trees
        max_depth = c(3, 6),                # Tree depth
        eta = c(0.1, 0.2, 0.3),             # Learning rate
        gamma = c(0, 1, 5),                 # Minimum loss reduction to split
        colsample_bytree = c(0.6, 0.8, 1),  # Feature sampling per tree
        min_child_weight = c(1, 3, 5),      # Minimum sum of instance weight
        subsample = c(0.6, 0.8)             # Fraction of data used per boosting round
      )
    )
  },
  finally = {
    stopCluster(cl)
    registerDoSEQ()  # Reset back to sequential processing
  }
)

# Print concise model summary: only the row of the resampling results that
# matches the selected (best) parameter combination.
merge(xgb_model_exp_2$results, xgb_model_exp_2$bestTune)
##   eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.3         3     0              0.8                3       0.8      50
##    Accuracy     Kappa  AccuracySD     KappaSD
## 1 0.9167666 0.5390362 0.002402882 0.008881014
# Chart the (up to) 20 most important predictors.
plot(varImp(xgb_model_exp_2), top = 20)

# Score the held-out 15% and cross-tabulate predicted vs. actual labels.
# NOTE(review): no `positive` class is passed, and the recorded output reports
# 'no' as the positive class, so Sensitivity is the recall on 'no' and
# Specificity is the recall on 'yes'.
predictions_xgb_2 <- predict(xgb_model_exp_2, newdata = testData_15)
confusionMatrix(
  data = predictions_xgb_2,
  reference = testData_15[["y"]]
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5304  330
##        yes  178  366
##                                           
##                Accuracy : 0.9178          
##                  95% CI : (0.9106, 0.9245)
##     No Information Rate : 0.8873          
##     P-Value [Acc > NIR] : 1.688e-15       
##                                           
##                   Kappa : 0.5454          
##                                           
##  Mcnemar's Test P-Value : 2.091e-11       
##                                           
##             Sensitivity : 0.9675          
##             Specificity : 0.5259          
##          Pos Pred Value : 0.9414          
##          Neg Pred Value : 0.6728          
##              Prevalence : 0.8873          
##          Detection Rate : 0.8585          
##    Detection Prevalence : 0.9119          
##       Balanced Accuracy : 0.7467          
##                                           
##        'Positive' Class : no              
##