#loading libraries
library(tidyverse)    #data wrangling and piping
library(ggplot2)      #plotting (also attached by tidyverse)
library(caret)        #createDataPartition, dummyVars, confusionMatrix
library(reshape2)     #data reshaping (melt/dcast)
library(rpart)        #decision trees
library(rpart.plot)   #decision tree plots
library(randomForest) #random forests
library(ada)          #AdaBoost
library(e1071)        #model utilities required by caret
library(fastDummies)  #dummy-variable helpers

Assignment 2: Experimentation & Model Training

Imported data from assignment 1.

#CSV file imported from GitHub; read.csv2 is used because the file is semicolon-delimited
Bank_data<-read.csv2("https://raw.githubusercontent.com/Andreina-A/Data-622/refs/heads/main/bank-full.csv", stringsAsFactors = TRUE) #imported characters as factors for further analysis

#preprocessing
#removed the "default" variable because it contains many unknown values and has very low correlation with the subscription outcome
Bank_data<-Bank_data|>
  select(-default)

head(Bank_data)
##   age          job marital education balance housing loan contact day month
## 1  58   management married  tertiary    2143     yes   no unknown   5   may
## 2  44   technician  single secondary      29     yes   no unknown   5   may
## 3  33 entrepreneur married secondary       2     yes  yes unknown   5   may
## 4  47  blue-collar married   unknown    1506     yes   no unknown   5   may
## 5  33      unknown  single   unknown       1      no   no unknown   5   may
## 6  35   management married  tertiary     231     yes   no unknown   5   may
##   duration campaign pdays previous poutcome  y
## 1      261        1    -1        0  unknown no
## 2      151        1    -1        0  unknown no
## 3       76        1    -1        0  unknown no
## 4       92        1    -1        0  unknown no
## 5      198        1    -1        0  unknown no
## 6      139        1    -1        0  unknown no
#preprocessing: create dummy variables
#note: the formula "~." dummy-codes every factor, including the outcome y,
#so y.no and y.yes end up in the feature matrix as predictors
dmy<-dummyVars("~.", data = Bank_data)
Bank_1data<-data.frame(predict(dmy, newdata=Bank_data))
Bank_1data$y<-Bank_data$y #reattach y as a factor (the dummy y columns remain among the predictors)

This assignment consists of conducting at least two (2) experiments for each of three algorithms: Decision Trees, Random Forest, and AdaBoost. That is, at least six (6) experiments in total (3 algorithms x 2 experiments each). For each experiment you define what you are trying to achieve (before each run), conduct the experiment, and at the end review how the experiment went. These experiments allow you to compare the algorithms and choose the optimal model.

set.seed(123)
#Use 80 percent of data for training 20 percent for test
trainIndex1<-createDataPartition(Bank_1data$y, p=0.8, list=FALSE)
trainData1<-Bank_1data[trainIndex1,]
testData1<-Bank_1data[-trainIndex1,]

#recode for positive = yes
trainData1$y<-relevel(trainData1$y, ref = "yes")
testData1$y<-relevel(testData1$y, ref= "yes")
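
Since createDataPartition() stratifies on y, both splits should preserve the roughly 88% no / 12% yes class balance; a quick check along these lines (a sketch, not part of the original run):

#optional check: class proportions in the training and test sets
prop.table(table(trainData1$y))
prop.table(table(testData1$y))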

Decision Trees

Decision Tree: Experiment 1 with dummy variables

Objective: establish a baseline performance with dummy variables. Metrics: accuracy, sensitivity, F1 score, and specificity.

DT1_model1<-rpart(y ~ ., data=trainData1, method = "class")
rpart.plot(DT1_model1, main="Decision Tree: Baseline")

DT1_predictions1<-predict(DT1_model1, newdata = testData1, type = "class")
confusionMatrix(DT1_predictions1, testData1$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes 1057    0
##        no     0 7984
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9996, 1)
##     No Information Rate : 0.8831     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.1169     
##          Detection Rate : 0.1169     
##    Detection Prevalence : 0.1169     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : yes        
## 

Creating dummy variables this way did not work: the perfect accuracy is the result of target leakage. Because dummyVars("~.") also one-hot encoded the outcome, the y.no and y.yes columns stayed in the feature matrix and the tree simply split on them. I will go back to using the data as it was originally, without dummy variables.
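
For reference, a leakage-free encoding keeps the outcome on the left-hand side of the formula so only the predictors are dummy-coded. A minimal sketch (dmy_fixed and Bank_2data are hypothetical names, and this was not rerun for the experiments below):

#hypothetical fix: dummy-code only the predictors, so no y.no/y.yes columns leak in
dmy_fixed<-dummyVars(y~., data = Bank_data)
Bank_2data<-data.frame(predict(dmy_fixed, newdata=Bank_data))
Bank_2data$y<-Bank_data$y #reattach the factor outcome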

set.seed(123)
#Use 80 percent of data for training 20 percent for test
trainIndex<-createDataPartition(Bank_data$y, p=0.8, list=FALSE)
trainData<-Bank_data[trainIndex,]
testData<-Bank_data[-trainIndex,]

#recode for positive = yes
trainData$y<-relevel(trainData$y, ref = "yes")
testData$y<-relevel(testData$y, ref= "yes")

Decision Tree: Experiment 1 redone without dummy variables

Objective: establish a baseline performance without dummy variables. Metrics: accuracy, sensitivity, F1 score, and precision.

DT1_model<-rpart(y ~ ., data=trainData, method = "class")
rpart.plot(DT1_model, main="Decision Tree: Baseline")

DT1_predictions<-predict(DT1_model, newdata = testData, type = "class")
confusionMatrix(DT1_predictions, testData$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  350  217
##        no   707 7767
##                                          
##                Accuracy : 0.8978         
##                  95% CI : (0.8914, 0.904)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 5.031e-06      
##                                          
##                   Kappa : 0.3805         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.33113        
##             Specificity : 0.97282        
##          Pos Pred Value : 0.61728        
##          Neg Pred Value : 0.91657        
##              Prevalence : 0.11691        
##          Detection Rate : 0.03871        
##    Detection Prevalence : 0.06271        
##       Balanced Accuracy : 0.65197        
##                                          
##        'Positive' Class : yes            
## 

Decision Tree: Experiment 2

Objective: test performance with pruning. The data set is fairly large, so I pruned the tree to prevent overfitting and to see if accuracy would increase. Using printcp() I obtained the complexity parameter (CP) table; the optimal CP is the one with the lowest cross-validation error (xerror), in this case CP = 0.010 with an xerror of 0.841. Metrics: accuracy, sensitivity, F1 score, and precision.

printcp(DT1_model)
## 
## Classification tree:
## rpart(formula = y ~ ., data = trainData, method = "class")
## 
## Variables actually used in tree construction:
## [1] duration poutcome
## 
## Root node error: 4232/36170 = 0.117
## 
## n= 36170 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.040170      0   1.00000 1.00000 0.014445
## 2 0.027410      3   0.87949 0.88020 0.013659
## 3 0.015595      4   0.85208 0.85681 0.013497
## 4 0.010000      5   0.83648 0.84145 0.013389
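
The optimal CP can also be pulled from the model's CP table programmatically instead of read off by eye; a minimal sketch (cp_table and best_cp are hypothetical names):

#pick the CP with the lowest cross-validation error from the CP table
cp_table<-DT1_model$cptable
best_cp<-cp_table[which.min(cp_table[,"xerror"]), "CP"]
best_cp #with the table above this is 0.010
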
DT2_model<-prune(DT1_model, cp=0.010)
rpart.plot(DT2_model)

DT2_predictions<-predict(DT2_model, newdata=testData, type= "class")
confusionMatrix(DT2_predictions, testData$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  350  217
##        no   707 7767
##                                          
##                Accuracy : 0.8978         
##                  95% CI : (0.8914, 0.904)
##     No Information Rate : 0.8831         
##     P-Value [Acc > NIR] : 5.031e-06      
##                                          
##                   Kappa : 0.3805         
##                                          
##  Mcnemar's Test P-Value : < 2.2e-16      
##                                          
##             Sensitivity : 0.33113        
##             Specificity : 0.97282        
##          Pos Pred Value : 0.61728        
##          Neg Pred Value : 0.91657        
##              Prevalence : 0.11691        
##          Detection Rate : 0.03871        
##    Detection Prevalence : 0.06271        
##       Balanced Accuracy : 0.65197        
##                                          
##        'Positive' Class : yes            
## 

The unpruned and pruned decision trees produced identical results. Part of the explanation is that rpart's default complexity parameter is 0.01, so pruning at cp = 0.010 removes no splits: the baseline tree was never grown past that threshold in the first place, which also suggests its splits captured little noise. Pruning therefore did not improve accuracy, but the tree uses only two features, duration and poutcome, so it is already a simple, interpretable model at no cost to performance.
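
To see pruning actually change the tree, the CP would have to be raised above the default; a minimal sketch using a coarser, hypothetical value of 0.03 (not one of the six experiments):

#pruning above the default cp removes splits; per the CP table above,
#cp=0.03 keeps only the first three splits
DT_coarse<-prune(DT1_model, cp=0.03)
rpart.plot(DT_coarse, main="Decision Tree: Pruned at cp = 0.03")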

Random Forest

Experiment 3

Objective: evaluate baseline random forest performance. Metrics: accuracy, sensitivity, F1 score, and precision.

set.seed(123)
RF1_model<-randomForest(y~., data=trainData, ntree=100)
RF1_predictions<-predict(RF1_model, newdata= testData)
confusionMatrix(RF1_predictions, testData$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  383  210
##        no   674 7774
##                                           
##                Accuracy : 0.9022          
##                  95% CI : (0.8959, 0.9083)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 3.615e-09       
##                                           
##                   Kappa : 0.4151          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.36235         
##             Specificity : 0.97370         
##          Pos Pred Value : 0.64587         
##          Neg Pred Value : 0.92022         
##              Prevalence : 0.11691         
##          Detection Rate : 0.04236         
##    Detection Prevalence : 0.06559         
##       Balanced Accuracy : 0.66802         
##                                           
##        'Positive' Class : yes             
## 

Experiment 4

Objective: test performance when changing mtry, the number of features randomly selected as split candidates at each node of each tree in the random forest. This hyperparameter controls how diverse and decorrelated the trees are. The first random forest used an mtry of 3 (the default, roughly the square root of the number of predictors), which could be too low: very diverse trees may repeatedly miss important features. I used mtry = 5 for the second random forest experiment to see whether it would increase accuracy (a systematic search is sketched after the results). Metrics: accuracy, sensitivity, F1 score, and precision.

mtryused<-RF1_model$mtry
print(mtryused)
## [1] 3
set.seed(123)
RF2_model<-randomForest(y~., data=trainData, ntree=100, mtry=5)
RF2_predictions<-predict(RF2_model, newdata= testData)
confusionMatrix(RF2_predictions, testData$y, positive = "yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  457  254
##        no   600 7730
##                                           
##                Accuracy : 0.9055          
##                  95% CI : (0.8993, 0.9115)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 4.41e-12        
##                                           
##                   Kappa : 0.4668          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.43236         
##             Specificity : 0.96819         
##          Pos Pred Value : 0.64276         
##          Neg Pred Value : 0.92797         
##              Prevalence : 0.11691         
##          Detection Rate : 0.05055         
##    Detection Prevalence : 0.07864         
##       Balanced Accuracy : 0.70027         
##                                           
##        'Positive' Class : yes             
## 
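
Instead of picking mtry by hand, it can also be searched; a minimal sketch using randomForest's tuneRF (predictor_cols is a hypothetical name, and this was not run as one of the six experiments):

#search mtry starting from the default, doubling it while the out-of-bag
#error improves by at least 1%
set.seed(123)
predictor_cols<-setdiff(names(trainData), "y")
tuneRF(x=trainData[, predictor_cols], y=trainData$y,
       ntreeTry=100, stepFactor=2, improve=0.01)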

AdaBoost

Experiment 5

Objective: establish an AdaBoost baseline. Metrics: accuracy, sensitivity, F1 score, and precision.

set.seed(123)
ada1_model<-ada(y~., data=trainData, iter=50)
ada1_predictions<-predict(ada1_model, newdata=testData)
confusionMatrix(ada1_predictions, testData$y)
## Warning in confusionMatrix.default(ada1_predictions, testData$y): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  392  206
##        no   665 7778
##                                           
##                Accuracy : 0.9037          
##                  95% CI : (0.8974, 0.9097)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 2.266e-10       
##                                           
##                   Kappa : 0.4251          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.37086         
##             Specificity : 0.97420         
##          Pos Pred Value : 0.65552         
##          Neg Pred Value : 0.92124         
##              Prevalence : 0.11691         
##          Detection Rate : 0.04336         
##    Detection Prevalence : 0.06614         
##       Balanced Accuracy : 0.67253         
##                                           
##        'Positive' Class : yes             
## 

Experiment 6

Objective: increase the number of boosting iterations to 100, which can improve accuracy but can also cause overfitting; I also set the loss to logistic. Metrics: accuracy, sensitivity, F1 score, and precision.

set.seed(123)
ada2_model<-ada(y~., data=trainData, iter=100, loss="logistic")
ada2_predictions<-predict(ada2_model, newdata=testData) #predict with the second model (the original run reused ada1_model here, which is why the output below duplicates Experiment 5)
confusionMatrix(ada2_predictions, testData$y)
## Warning in confusionMatrix.default(ada2_predictions, testData$y): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  yes   no
##        yes  392  206
##        no   665 7778
##                                           
##                Accuracy : 0.9037          
##                  95% CI : (0.8974, 0.9097)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 2.266e-10       
##                                           
##                   Kappa : 0.4251          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.37086         
##             Specificity : 0.97420         
##          Pos Pred Value : 0.65552         
##          Neg Pred Value : 0.92124         
##              Prevalence : 0.11691         
##          Detection Rate : 0.04336         
##    Detection Prevalence : 0.06614         
##       Balanced Accuracy : 0.67253         
##                                           
##        'Positive' Class : yes             
## 

Comparison Table

To compare the six experiments on a common footing, the helper below extracts accuracy, precision, recall, and F1 score (the harmonic mean of precision and recall) from each model's confusion matrix.

#evaluation function
evaluate_model<-function(model_predictions, true_labels, model_name){
  cm<- confusionMatrix(model_predictions, true_labels, positive="yes")
  
  list(model=model_name,
       accuracy= cm$overall["Accuracy"],
       precision=cm$byClass["Precision"],
       recall=cm$byClass["Recall"],
       f1_score=cm$byClass["F1"]
  )
}
results<-bind_rows(
  evaluate_model(DT1_predictions, testData$y, "Decision Tree 1"),
  evaluate_model(DT2_predictions, testData$y, "Decision Tree 2"),
  evaluate_model(RF1_predictions, testData$y, "Random Forest 1"),
  evaluate_model(RF2_predictions, testData$y, "Random Forest 2"),
  evaluate_model(ada1_predictions, testData$y, "AdaBoost 1"),
  evaluate_model(ada2_predictions, testData$y, "AdaBoost 2")
)
## Warning in confusionMatrix.default(model_predictions, true_labels, positive =
## "yes"): Levels are not in the same order for reference and data. Refactoring
## data to match.
## Warning in confusionMatrix.default(model_predictions, true_labels, positive =
## "yes"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(results)
## # A tibble: 6 × 5
##   model            accuracy precision recall f1_score
##   <chr>               <dbl>     <dbl>  <dbl>    <dbl>
## 1 Decision Tree 1     0.898     0.617  0.331    0.431
## 2 Decision Tree 2     0.898     0.617  0.331    0.431
## 3 Random Forest 1     0.902     0.646  0.362    0.464
## 4 Random Forest 2     0.906     0.643  0.432    0.517
## 5 AdaBoost 1          0.904     0.656  0.371    0.474
## 6 AdaBoost 2          0.904     0.656  0.371    0.474
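
In this run, Random Forest 2 (mtry = 5) comes out on top, with the highest accuracy (0.906) and the best F1 score (0.517), making it the strongest candidate among the six experiments.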