#load required libraries
suppressWarnings(suppressMessages({library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(ggmosaic)
library(caret)
library(e1071)
library(DMwR2)
library(rpart)
library(rpart.plot)
library(doParallel)
library(foreach)
library(xgboost)}))
# Load data ----
# The CSV location can be overridden with the BANK_DATA_PATH environment
# variable; when that variable is unset or empty, the original hard-coded
# location is used, so existing runs behave exactly as before.
bank_data_path <- Sys.getenv("BANK_DATA_PATH", unset = "")
if (!nzchar(bank_data_path)) {
  bank_data_path <- "/Users/mollysiebecker/DATA 622/bank-additional-full.csv"
}
# Fail early with a clear message instead of a generic connection error
if (!file.exists(bank_data_path)) {
  stop("Data file not found: ", bank_data_path, call. = FALSE)
}
bank_addtl_full <- read.csv(bank_data_path)

# Handle categorical data: convert every character column to a factor
bank_addtl_full <- bank_addtl_full %>%
  mutate(across(where(is.character), as.factor))

# Feature selection to improve computational efficiency:
# drop 'pdays' and 'default' (near zero variance)
bank_addtl_full <- bank_addtl_full %>%
  select(-pdays, -default)
# 80/20 train/test split on the outcome column `y`.
# The seed is fixed so the same rows are selected on every run.
set.seed(1989)
trainIndex_80 <- createDataPartition(
  y = bank_addtl_full$y,
  p = 0.8,
  list = FALSE
)
# Rows picked by the partition form the training set; the rest are held out
trainData_80 <- bank_addtl_full[trainIndex_80, ]
testData_20 <- bank_addtl_full[-trainIndex_80, ]
# Baseline decision tree (CART via rpart).
# No tuneGrid is supplied, so caret chooses the cp candidates itself.
tree_control_cv <- trainControl(method = "cv", number = 5) # 5-fold CV
tree_model_control <- train(
  y ~ .,
  data = trainData_80,
  method = "rpart",
  trControl = tree_control_cv
)
# Show the resampling results for each candidate cp
print(tree_model_control)
## CART
##
## 32951 samples
## 18 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 26361, 26361, 26360, 26361, 26361
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.02007004 0.9066795 0.4426613
## 0.02128233 0.9061029 0.4509412
## 0.06950431 0.8949958 0.2495608
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.02007004.
# Draw the final fitted tree
rpart.plot(tree_model_control[["finalModel"]])
# Plot variable importance (top 20 entries)
plot(varImp(tree_model_control), top = 20)
# Predict on the held-out 20% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_tree_control <- predict(
  tree_model_control,
  newdata = testData_20
)
confusionMatrix(
  data = predictions_tree_control,
  reference = testData_20$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7153 599
## yes 156 329
##
## Accuracy : 0.9083
## 95% CI : (0.9019, 0.9145)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 3.126e-10
##
## Kappa : 0.4209
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9787
## Specificity : 0.3545
## Pos Pred Value : 0.9227
## Neg Pred Value : 0.6784
## Prevalence : 0.8873
## Detection Rate : 0.8684
## Detection Prevalence : 0.9411
## Balanced Accuracy : 0.6666
##
## 'Positive' Class : no
##
# Experiment 1 (decision tree): same CART setup as the baseline, but with an
# explicit grid of complexity-parameter (cp) values to evaluate
tree_exp_1_cv <- trainControl(method = "cv", number = 5) # 5-fold CV
tree_exp_1_grid <- expand.grid(cp = c(0.01, 0.05, 0.1)) # rpart pruning cp
tree_model_exp_1 <- train(
  y ~ .,
  data = trainData_80,
  method = "rpart",
  trControl = tree_exp_1_cv,
  tuneGrid = tree_exp_1_grid
)
# Show the resampling results for each cp value
print(tree_model_exp_1)
## CART
##
## 32951 samples
## 18 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 26361, 26361, 26362, 26360, 26360
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01 0.9107462 0.5178619
## 0.05 0.9028861 0.4370908
## 0.10 0.8873479 0.0000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01.
# Draw the final fitted tree
rpart.plot(tree_model_exp_1[["finalModel"]])
# Plot variable importance (top 20 entries)
plot(varImp(tree_model_exp_1), top = 20)
# Predict on the held-out 20% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_tree_1 <- predict(
  tree_model_exp_1,
  newdata = testData_20
)
confusionMatrix(
  data = predictions_tree_1,
  reference = testData_20$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7052 435
## yes 257 493
##
## Accuracy : 0.916
## 95% CI : (0.9098, 0.9219)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5414
##
## Mcnemar's Test P-Value : 1.714e-11
##
## Sensitivity : 0.9648
## Specificity : 0.5312
## Pos Pred Value : 0.9419
## Neg Pred Value : 0.6573
## Prevalence : 0.8873
## Detection Rate : 0.8561
## Detection Prevalence : 0.9089
## Balanced Accuracy : 0.7480
##
## 'Positive' Class : no
##
# Experiment 2 data: larger training share (85% train / 15% test).
# The same seed as the 80/20 split is reused before partitioning.
set.seed(1989)
trainIndex_85 <- createDataPartition(
  y = bank_addtl_full$y,
  p = 0.85,
  list = FALSE
)
# Rows picked by the partition form the training set; the rest are held out
trainData_85 <- bank_addtl_full[trainIndex_85, ]
testData_15 <- bank_addtl_full[-trainIndex_85, ]
# Experiment 2 (decision tree): same cp grid as experiment 1, fitted on the
# larger (85%) training set
tree_exp_2_cv <- trainControl(method = "cv", number = 5) # 5-fold CV
tree_exp_2_grid <- expand.grid(cp = c(0.01, 0.05, 0.1)) # rpart pruning cp
tree_model_exp_2 <- train(
  y ~ .,
  data = trainData_85, # higher train/test split
  method = "rpart",
  trControl = tree_exp_2_cv,
  tuneGrid = tree_exp_2_grid
)
# Show the resampling results for each cp value
print(tree_model_exp_2)
## CART
##
## 35010 samples
## 18 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 28008, 28007, 28009, 28008, 28008
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.01 0.9115396 0.5154555
## 0.05 0.9028276 0.4336195
## 0.10 0.8873465 0.0000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.01.
# Draw the final fitted tree
rpart.plot(tree_model_exp_2[["finalModel"]])
# Plot variable importance (top 20 entries)
plot(varImp(tree_model_exp_2), top = 20)
# Predict on the held-out 15% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_tree_2 <- predict(
  tree_model_exp_2,
  newdata = testData_15
)
confusionMatrix(
  data = predictions_tree_2,
  reference = testData_15$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5307 345
## yes 175 351
##
## Accuracy : 0.9158
## 95% CI : (0.9086, 0.9226)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 9.996e-14
##
## Kappa : 0.5288
##
## Mcnemar's Test P-Value : 1.252e-13
##
## Sensitivity : 0.9681
## Specificity : 0.5043
## Pos Pred Value : 0.9390
## Neg Pred Value : 0.6673
## Prevalence : 0.8873
## Detection Rate : 0.8590
## Detection Prevalence : 0.9149
## Balanced Accuracy : 0.7362
##
## 'Positive' Class : no
##
# Set up a parallel backend to improve computational efficiency.
# One core is left free, but at least one worker is always requested:
# detectCores() may return NA or 1, and a cluster of size 0/NA cannot be made.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)
# Baseline Random Forest; no tuneGrid is supplied, so caret chooses the mtry
# candidates itself.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even when train() fails, so no worker
# processes are leaked.
rf_model_control <- tryCatch(
  train(
    y ~ .,
    data = trainData_80,
    method = "rf", # Random Forest model
    trControl = trainControl(method = "cv", number = 5) # 5-fold cross validation
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ() # Reset back to sequential processing
  }
)
# Print model summary
print(rf_model_control)
## Random Forest
##
## 32951 samples
## 18 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 26361, 26361, 26361, 26360, 26361
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9008831 0.2499619
## 26 0.9129312 0.5356371
## 50 0.9112317 0.5273636
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 26.
# Plot variable importance (top 20 entries)
plot(varImp(rf_model_control), top = 20)
# Predict on the held-out 20% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_rf_control <- predict(
  rf_model_control,
  newdata = testData_20
)
confusionMatrix(
  data = predictions_rf_control,
  reference = testData_20$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7011 410
## yes 298 518
##
## Accuracy : 0.914
## 95% CI : (0.9078, 0.92)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1.018e-15
##
## Kappa : 0.5462
##
## Mcnemar's Test P-Value : 3.024e-05
##
## Sensitivity : 0.9592
## Specificity : 0.5582
## Pos Pred Value : 0.9448
## Neg Pred Value : 0.6348
## Prevalence : 0.8873
## Detection Rate : 0.8512
## Detection Prevalence : 0.9009
## Balanced Accuracy : 0.7587
##
## 'Positive' Class : no
##
# Set up a parallel backend to improve computational efficiency.
# One core is left free, but at least one worker is always requested:
# detectCores() may return NA or 1, and a cluster of size 0/NA cannot be made.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)
# Experiment 1 (Random Forest): explicit grid over mtry.
# NOTE(review): the recorded baseline run selected mtry = 26, which lies
# outside this grid (1-3); consider widening the grid if the goal is to beat
# the baseline.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even when train() fails.
rf_model_exp_1 <- tryCatch(
  train(
    y ~ .,
    data = trainData_80,
    method = "rf", # Random Forest model
    trControl = trainControl(method = "cv", number = 5), # 5-fold cross validation
    tuneGrid = expand.grid(mtry = c(1, 2, 3)) # Mtry parameter
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ() # Reset back to sequential processing
  }
)
# Print model summary
print(rf_model_exp_1)
## Random Forest
##
## 32951 samples
## 18 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 26361, 26360, 26361, 26360, 26362
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.8873479 0.0000000
## 2 0.9006709 0.2480457
## 3 0.9063460 0.3537073
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Plot variable importance (top 20 entries)
plot(varImp(rf_model_exp_1), top = 20)
# Predict on the held-out 20% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_rf_1 <- predict(
  rf_model_exp_1,
  newdata = testData_20
)
confusionMatrix(
  data = predictions_rf_1,
  reference = testData_20$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7222 664
## yes 87 264
##
## Accuracy : 0.9088
## 95% CI : (0.9024, 0.915)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1.21e-10
##
## Kappa : 0.3741
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9881
## Specificity : 0.2845
## Pos Pred Value : 0.9158
## Neg Pred Value : 0.7521
## Prevalence : 0.8873
## Detection Rate : 0.8768
## Detection Prevalence : 0.9574
## Balanced Accuracy : 0.6363
##
## 'Positive' Class : no
##
# Set up a parallel backend to improve computational efficiency.
# One core is left free, but at least one worker is always requested:
# detectCores() may return NA or 1, and a cluster of size 0/NA cannot be made.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)
# Experiment 2 (Random Forest): same mtry grid as experiment 1, fitted on the
# larger (85%) training set.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even when train() fails.
rf_model_exp_2 <- tryCatch(
  train(
    y ~ .,
    data = trainData_85,
    method = "rf", # Random Forest model
    trControl = trainControl(method = "cv", number = 5), # 5-fold cross validation
    tuneGrid = expand.grid(mtry = c(1, 2, 3)) # Mtry parameter
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ() # Reset back to sequential processing
  }
)
# Print model summary
print(rf_model_exp_2)
## Random Forest
##
## 35010 samples
## 18 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 28008, 28008, 28008, 28009, 28007
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.8873465 0.0000000
## 2 0.9016566 0.2574433
## 3 0.9065695 0.3539608
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 3.
# Plot variable importance (top 20 entries)
plot(varImp(rf_model_exp_2), top = 20)
# Predict on the held-out 15% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_rf_2 <- predict(
  rf_model_exp_2,
  newdata = testData_15
)
confusionMatrix(
  data = predictions_rf_2,
  reference = testData_15$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5422 511
## yes 60 185
##
## Accuracy : 0.9076
## 95% CI : (0.9001, 0.9147)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1.33e-07
##
## Kappa : 0.3554
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9891
## Specificity : 0.2658
## Pos Pred Value : 0.9139
## Neg Pred Value : 0.7551
## Prevalence : 0.8873
## Detection Rate : 0.8776
## Detection Prevalence : 0.9603
## Balanced Accuracy : 0.6274
##
## 'Positive' Class : no
##
# Note: I attempted to use AdaBoost; however, the "adaboost" method in the caret package requires the fastAdaboost package, which is not maintained for the current version of R. As an alternative, I am using XGBoost.
# Set up a parallel backend to improve computational efficiency.
# One core is left free, but at least one worker is always requested:
# detectCores() may return NA or 1, and a cluster of size 0/NA cannot be made.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)
# Baseline XGBoost model; no tuneGrid is supplied, so caret chooses the
# candidate hyperparameters itself.
# NOTE(review): xgboost may itself run multi-threaded inside each worker;
# consider passing nthread = 1 to avoid oversubscribing cores - confirm.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even when train() fails.
xgb_model_control <- tryCatch(
  train(
    y ~ .,
    data = trainData_80,
    method = "xgbTree", # XGBoost model
    trControl = trainControl(method = "cv", number = 3) # 3-fold cross-validation
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ() # Reset back to sequential processing
  }
)
# Print concise model summary: only the results row matching the best tune
merge(xgb_model_control$results, xgb_model_control$bestTune)
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.4 3 0 0.6 1 0.75 50
## Accuracy Kappa AccuracySD KappaSD
## 1 0.9163 0.5408723 0.002577816 0.01250585
# Plot variable importance (top 20 entries)
plot(varImp(xgb_model_control), top = 20)
# Predict on the held-out 20% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_xgb_control <- predict(
  xgb_model_control,
  newdata = testData_20
)
confusionMatrix(
  data = predictions_xgb_control,
  reference = testData_20$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7064 446
## yes 245 482
##
## Accuracy : 0.9161
## 95% CI : (0.9099, 0.922)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5366
##
## Mcnemar's Test P-Value : 2.776e-14
##
## Sensitivity : 0.9665
## Specificity : 0.5194
## Pos Pred Value : 0.9406
## Neg Pred Value : 0.6630
## Prevalence : 0.8873
## Detection Rate : 0.8576
## Detection Prevalence : 0.9117
## Balanced Accuracy : 0.7429
##
## 'Positive' Class : no
##
# Set up a parallel backend to improve computational efficiency.
# One core is left free, but at least one worker is always requested:
# detectCores() may return NA or 1, and a cluster of size 0/NA cannot be made.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)
# Experiment 1 (XGBoost): explicit grid over seven hyperparameters
# (3 * 2 * 3 * 3 * 3 * 3 * 2 = 972 combinations), evaluated with 3-fold CV.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even when train() fails - important here
# because this search is long-running.
xgb_model_exp_1 <- tryCatch(
  train(
    y ~ .,
    data = trainData_80,
    method = "xgbTree", # XGBoost model
    trControl = trainControl(method = "cv", number = 3), # 3-fold cross validation
    tuneGrid = expand.grid(
      nrounds = c(50, 100, 150), # Number of trees
      max_depth = c(3, 6), # Tree depth
      eta = c(0.1, 0.2, 0.3), # Learning rate
      gamma = c(0, 1, 5), # Minimum loss reduction to split
      colsample_bytree = c(0.6, 0.8, 1), # Feature sampling per tree
      min_child_weight = c(1, 3, 5), # Minimum sum of instance weight
      subsample = c(0.6, 0.8) # Fraction of data used per boosting round
    )
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ() # Reset back to sequential processing
  }
)
# Print concise model summary: only the results row matching the best tune
merge(xgb_model_exp_1$results, xgb_model_exp_1$bestTune)
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.1 6 5 1 5 0.8 50
## Accuracy Kappa AccuracySD KappaSD
## 1 0.9172105 0.5498451 0.001738418 0.01546551
# Plot variable importance (top 20 entries)
plot(varImp(xgb_model_exp_1), top = 20)
# Predict on the held-out 20% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_xgb_1 <- predict(
  xgb_model_exp_1,
  newdata = testData_20
)
confusionMatrix(
  data = predictions_xgb_1,
  reference = testData_20$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7054 421
## yes 255 507
##
## Accuracy : 0.9179
## 95% CI : (0.9118, 0.9238)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5548
##
## Mcnemar's Test P-Value : 2.208e-10
##
## Sensitivity : 0.9651
## Specificity : 0.5463
## Pos Pred Value : 0.9437
## Neg Pred Value : 0.6654
## Prevalence : 0.8873
## Detection Rate : 0.8564
## Detection Prevalence : 0.9075
## Balanced Accuracy : 0.7557
##
## 'Positive' Class : no
##
# Set up a parallel backend to improve computational efficiency.
# One core is left free, but at least one worker is always requested:
# detectCores() may return NA or 1, and a cluster of size 0/NA cannot be made.
num_cores <- detectCores()
cl <- makeCluster(max(1L, num_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)
# Experiment 2 (XGBoost): same 972-combination grid as experiment 1, fitted on
# the larger (85%) training set with 3-fold CV.
# tryCatch(finally = ...) guarantees the workers are shut down and the
# sequential backend is restored even when train() fails - important here
# because this search is long-running.
xgb_model_exp_2 <- tryCatch(
  train(
    y ~ .,
    data = trainData_85,
    method = "xgbTree", # XGBoost model
    trControl = trainControl(method = "cv", number = 3), # 3-fold cross validation
    tuneGrid = expand.grid(
      nrounds = c(50, 100, 150), # Number of trees
      max_depth = c(3, 6), # Tree depth
      eta = c(0.1, 0.2, 0.3), # Learning rate
      gamma = c(0, 1, 5), # Minimum loss reduction to split
      colsample_bytree = c(0.6, 0.8, 1), # Feature sampling per tree
      min_child_weight = c(1, 3, 5), # Minimum sum of instance weight
      subsample = c(0.6, 0.8) # Fraction of data used per boosting round
    )
  ),
  finally = {
    stopCluster(cl)
    registerDoSEQ() # Reset back to sequential processing
  }
)
# Print concise model summary: only the results row matching the best tune
merge(xgb_model_exp_2$results, xgb_model_exp_2$bestTune)
## eta max_depth gamma colsample_bytree min_child_weight subsample nrounds
## 1 0.3 3 0 0.8 3 0.8 50
## Accuracy Kappa AccuracySD KappaSD
## 1 0.9167666 0.5390362 0.002402882 0.008881014
# Plot variable importance (top 20 entries)
plot(varImp(xgb_model_exp_2), top = 20)
# Predict on the held-out 15% and compare against the true labels.
# NOTE: no `positive` class is passed, so the report below treats 'no' as the
# positive class (sensitivity refers to 'no', specificity to 'yes').
predictions_xgb_2 <- predict(
  xgb_model_exp_2,
  newdata = testData_15
)
confusionMatrix(
  data = predictions_xgb_2,
  reference = testData_15$y
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5304 330
## yes 178 366
##
## Accuracy : 0.9178
## 95% CI : (0.9106, 0.9245)
## No Information Rate : 0.8873
## P-Value [Acc > NIR] : 1.688e-15
##
## Kappa : 0.5454
##
## Mcnemar's Test P-Value : 2.091e-11
##
## Sensitivity : 0.9675
## Specificity : 0.5259
## Pos Pred Value : 0.9414
## Neg Pred Value : 0.6728
## Prevalence : 0.8873
## Detection Rate : 0.8585
## Detection Prevalence : 0.9119
## Balanced Accuracy : 0.7467
##
## 'Positive' Class : no
##