#loading libraries
library(tidyverse)
library(ggplot2)
library(caret)
library(reshape2)
library(rpart)
library(rpart.plot)
library(randomForest)
library(ada)
library(e1071)
library(fastDummies)
The data set is the same one imported in Assignment 1.
#CSV file imported from github
Bank_data<-read.csv2("https://raw.githubusercontent.com/Andreina-A/Data-622/refs/heads/main/bank-full.csv", stringsAsFactors = TRUE) #imported characters as factors for further analysis
#preprocessing
#removed the "default" variable because it has many unknown values and a very low correlation with the subscription outcome
Bank_data<-Bank_data|>
select(-default)
head(Bank_data)
## age job marital education balance housing loan contact day month
## 1 58 management married tertiary 2143 yes no unknown 5 may
## 2 44 technician single secondary 29 yes no unknown 5 may
## 3 33 entrepreneur married secondary 2 yes yes unknown 5 may
## 4 47 blue-collar married unknown 1506 yes no unknown 5 may
## 5 33 unknown single unknown 1 no no unknown 5 may
## 6 35 management married tertiary 231 yes no unknown 5 may
## duration campaign pdays previous poutcome y
## 1 261 1 -1 0 unknown no
## 2 151 1 -1 0 unknown no
## 3 76 1 -1 0 unknown no
## 4 92 1 -1 0 unknown no
## 5 198 1 -1 0 unknown no
## 6 139 1 -1 0 unknown no
#preprocessing create dummy variables
dmy<-dummyVars("~.", data = Bank_data)
Bank_1data<-data.frame(predict(dmy, newdata=Bank_data))
Bank_1data$y<-Bank_data$y
This assignment consists of conducting at least two (2) experiments for each of three algorithms: Decision Trees, Random Forest, and AdaBoost, that is, at least six (6) experiments in total (3 algorithms x 2 experiments each). For each experiment you define what you are trying to achieve (before each run), conduct the experiment, and review how it went at the end. These experiments allow you to compare the algorithms and choose the optimal model.
set.seed(123)
#Use 80 percent of data for training 20 percent for test
trainIndex1<-createDataPartition(Bank_1data$y, p=0.8, list=FALSE)
trainData1<-Bank_1data[trainIndex1,]
testData1<-Bank_1data[-trainIndex1,]
#recode for positive = yes
trainData1$y<-relevel(trainData1$y, ref = "yes")
testData1$y<-relevel(testData1$y, ref= "yes")
Objective: Establish baseline performance with dummy variables. Metrics: accuracy, sensitivity, F1 score, and specificity.
DT1_model1<-rpart(y ~ ., data=trainData1, method = "class")
rpart.plot(DT1_model1, main="Decision Tree: Baseline")
DT1_predictions1<-predict(DT1_model1, newdata = testData1, type = "class")
confusionMatrix(DT1_predictions1, testData1$y, positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 1057 0
## no 0 7984
##
## Accuracy : 1
## 95% CI : (0.9996, 1)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.1169
## Detection Rate : 0.1169
## Detection Prevalence : 0.1169
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : yes
##
Creating dummy variables didn’t work as intended; the perfect accuracy indicates a mistake. Because dummyVars("~.") also one-hot encoded the outcome, the columns derived from y (y.no and y.yes) remained among the predictors, so the tree could split directly on the answer (data leakage). I will go back to using the data as it was originally, without dummy variables.
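If dummy variables were still wanted, the leak could be avoided by keeping the outcome out of the encoding. A minimal sketch, assuming dummyVars excludes a variable placed on the left-hand side of its formula (dmy_fix and Bank_2data are illustrative names):
#sketch: encode only the predictors; the outcome sits on the LHS so it is not one-hot encoded
dmy_fix<-dummyVars(y~., data=Bank_data)
Bank_2data<-data.frame(predict(dmy_fix, newdata=Bank_data))
Bank_2data$y<-Bank_data$y #add the factor outcome back as the only y column
grep("^y\\.", names(Bank_2data), value=TRUE) #sanity check: no leaked y.* columns should remain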
set.seed(123)
#Use 80 percent of data for training 20 percent for test
trainIndex<-createDataPartition(Bank_data$y, p=0.8, list=FALSE)
trainData<-Bank_data[trainIndex,]
testData<-Bank_data[-trainIndex,]
#recode for positive = yes
trainData$y<-relevel(trainData$y, ref = "yes")
testData$y<-relevel(testData$y, ref= "yes")
Objective: Establish baseline performance without dummy variables. Metrics: accuracy, sensitivity, F1 score, and precision.
DT1_model<-rpart(y ~ ., data=trainData, method = "class")
rpart.plot(DT1_model, main="Decision Tree: Baseline")
DT1_predictions<-predict(DT1_model, newdata = testData, type = "class")
confusionMatrix(DT1_predictions, testData$y, positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 350 217
## no 707 7767
##
## Accuracy : 0.8978
## 95% CI : (0.8914, 0.904)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 5.031e-06
##
## Kappa : 0.3805
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.33113
## Specificity : 0.97282
## Pos Pred Value : 0.61728
## Neg Pred Value : 0.91657
## Prevalence : 0.11691
## Detection Rate : 0.03871
## Detection Prevalence : 0.06271
## Balanced Accuracy : 0.65197
##
## 'Positive' Class : yes
##
Objective: Test performance with pruning. The data set is fairly large, so I pruned the tree to guard against overfitting and to see whether accuracy improves. Using printcp() I obtained the complexity parameter (cp) table; the optimal cp is the one with the lowest cross-validation error (xerror), here cp = 0.010 with an xerror of about 0.841 (a programmatic way to select it is sketched after the table below). Metrics: accuracy, sensitivity, F1 score, and precision.
printcp(DT1_model)
##
## Classification tree:
## rpart(formula = y ~ ., data = trainData, method = "class")
##
## Variables actually used in tree construction:
## [1] duration poutcome
##
## Root node error: 4232/36170 = 0.117
##
## n= 36170
##
## CP nsplit rel error xerror xstd
## 1 0.040170 0 1.00000 1.00000 0.014445
## 2 0.027410 3 0.87949 0.88020 0.013659
## 3 0.015595 4 0.85208 0.85681 0.013497
## 4 0.010000 5 0.83648 0.84145 0.013389
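The optimal cp can also be pulled from the cp table programmatically rather than read off by eye; a minimal sketch (best_cp is an illustrative name):
#sketch: pick the cp row with the lowest cross-validated error (xerror)
best_cp<-DT1_model$cptable[which.min(DT1_model$cptable[, "xerror"]), "CP"]
best_cp #matches the cp = 0.010 row used for pruning below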
DT2_model<-prune(DT1_model, cp=0.010)
rpart.plot(DT2_model)
DT2_predictions<-predict(DT2_model, newdata=testData, type= "class")
confusionMatrix(DT2_predictions, testData$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 350 217
## no 707 7767
##
## Accuracy : 0.8978
## 95% CI : (0.8914, 0.904)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 5.031e-06
##
## Kappa : 0.3805
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.33113
## Specificity : 0.97282
## Pos Pred Value : 0.61728
## Neg Pred Value : 0.91657
## Prevalence : 0.11691
## Detection Rate : 0.03871
## Detection Prevalence : 0.06271
## Balanced Accuracy : 0.65197
##
## 'Positive' Class : yes
##
The unpruned and pruned decision trees produced identical results. This suggests there was not much overfitting and that the first decision tree was already adequate; the tree is shallow, so its splits did not capture much noise to begin with. Overall, pruning did not improve accuracy, likely because the pruned tree uses only two features, duration and poutcome, giving a simpler, more interpretable model at no cost to performance.
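The observation that the tree relies only on duration and poutcome can be checked against the importance scores rpart stores on the fitted object; a short sketch:
#sketch: rpart keeps an importance score for every variable it used, including surrogate splits
round(DT2_model$variable.importance, 1)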
Objective: Evaluate baseline Random Forest performance. Metrics: accuracy, sensitivity, F1 score, and precision.
set.seed(123)
RF1_model<-randomForest(y~., data=trainData, ntree=100)
RF1_predictions<-predict(RF1_model, newdata= testData)
confusionMatrix(RF1_predictions, testData$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 383 210
## no 674 7774
##
## Accuracy : 0.9022
## 95% CI : (0.8959, 0.9083)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 3.615e-09
##
## Kappa : 0.4151
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.36235
## Specificity : 0.97370
## Pos Pred Value : 0.64587
## Neg Pred Value : 0.92022
## Prevalence : 0.11691
## Detection Rate : 0.04236
## Detection Prevalence : 0.06559
## Balanced Accuracy : 0.66802
##
## 'Positive' Class : yes
##
Objective: Test performance when changing the number of features (mtry) randomly selected at each split within the random forest. This hyperparameter controls how diverse and independent the trees are. The first random forest used mtry = 3, which may be too low: very diverse trees can miss important features. I used mtry = 5 in the second random forest experiment to see whether it would increase accuracy. Metrics: accuracy, sensitivity, F1 score, and precision.
mtryused<-RF1_model$mtry
print(mtryused)
## [1] 3
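Instead of picking mtry = 5 by hand, the out-of-bag (OOB) error can guide the choice. A minimal sketch using randomForest's tuneRF helper (the step factor and improvement threshold here are illustrative):
#sketch: search mtry by OOB error, starting from the package default
set.seed(123)
mtry_search<-tuneRF(x=trainData[, setdiff(names(trainData), "y")], y=trainData$y,
ntreeTry=100, stepFactor=1.5, improve=0.01, plot=FALSE) #ntreeTry matched to these experiments
mtry_search #matrix of mtry values tried and their OOB error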
set.seed(123)
RF2_model<-randomForest(y~., data=trainData, ntree=100, mtry=5)
RF2_predictions<-predict(RF2_model, newdata= testData)
confusionMatrix(RF2_predictions, testData$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 457 254
## no 600 7730
##
## Accuracy : 0.9055
## 95% CI : (0.8993, 0.9115)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 4.41e-12
##
## Kappa : 0.4668
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.43236
## Specificity : 0.96819
## Pos Pred Value : 0.64276
## Neg Pred Value : 0.92797
## Prevalence : 0.11691
## Detection Rate : 0.05055
## Detection Prevalence : 0.07864
## Balanced Accuracy : 0.70027
##
## 'Positive' Class : yes
##
Objective: Establish an AdaBoost baseline. Metrics: accuracy, sensitivity, F1 score, and precision.
set.seed(123)
ada1_model<-ada(y~., data=trainData, iter=50)
ada1_predictions<-predict(ada1_model, newdata=testData)
confusionMatrix(ada1_predictions, testData$y)
## Warning in confusionMatrix.default(ada1_predictions, testData$y): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 392 206
## no 665 7778
##
## Accuracy : 0.9037
## 95% CI : (0.8974, 0.9097)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 2.266e-10
##
## Kappa : 0.4251
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.37086
## Specificity : 0.97420
## Pos Pred Value : 0.65552
## Neg Pred Value : 0.92124
## Prevalence : 0.11691
## Detection Rate : 0.04336
## Detection Prevalence : 0.06614
## Balanced Accuracy : 0.67253
##
## 'Positive' Class : yes
##
Objective: Increase the number of iterations to 100, which can improve accuracy but may also cause overfitting; I also set the loss to logistic. Metrics: accuracy, sensitivity, F1 score, and precision.
set.seed(123)
ada2_model<-ada(y~., data=trainData, iter=100, loss="logistic")
ada2_predictions<-predict(ada2_model, newdata=testData) #predict with the 100-iteration model fitted above
confusionMatrix(ada2_predictions, testData$y)
## Warning in confusionMatrix.default(ada2_predictions, testData$y): Levels are
## not in the same order for reference and data. Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction yes no
## yes 392 206
## no 665 7778
##
## Accuracy : 0.9037
## 95% CI : (0.8974, 0.9097)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 2.266e-10
##
## Kappa : 0.4251
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.37086
## Specificity : 0.97420
## Pos Pred Value : 0.65552
## Neg Pred Value : 0.92124
## Prevalence : 0.11691
## Detection Rate : 0.04336
## Detection Prevalence : 0.06614
## Balanced Accuracy : 0.67253
##
## 'Positive' Class : yes
##
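One way to check whether the extra iterations overfit is to track test error across boosting rounds. A sketch, assuming the ada package's addtest and plot helpers behave as documented (ada2_checked is an illustrative name):
#sketch: attach the held-out set to the boosted model and plot error by iteration
ada2_checked<-addtest(ada2_model, test.x=testData[, setdiff(names(testData), "y")], test.y=testData$y)
plot(ada2_checked, test=TRUE) #training and test error across the 100 iterations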
#helper function to compute evaluation metrics from a confusion matrix
evaluate_model<-function(model_predictions, true_labels, model_name){
cm<- confusionMatrix(model_predictions, true_labels, positive="yes")
#return the headline metrics as a one-row list for bind_rows()
list(model=model_name,
accuracy= cm$overall["Accuracy"],
precision=cm$byClass["Precision"],
recall=cm$byClass["Recall"],
f1_score=cm$byClass["F1"]
)
}
results<-bind_rows(
evaluate_model(DT1_predictions, testData$y, "Decision Tree 1"),
evaluate_model(DT2_predictions, testData$y, "Decision Tree 2"),
evaluate_model(RF1_predictions, testData$y, "Random Forest 1"),
evaluate_model(RF2_predictions, testData$y, "Random Forest 2"),
evaluate_model(ada1_predictions, testData$y, "AdaBoost 1"),
evaluate_model(ada2_predictions, testData$y, "AdaBoost 2")
)
## Warning in confusionMatrix.default(model_predictions, true_labels, positive =
## "yes"): Levels are not in the same order for reference and data. Refactoring
## data to match.
## Warning in confusionMatrix.default(model_predictions, true_labels, positive =
## "yes"): Levels are not in the same order for reference and data. Refactoring
## data to match.
print(results)
## # A tibble: 6 × 5
## model accuracy precision recall f1_score
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Decision Tree 1 0.898 0.617 0.331 0.431
## 2 Decision Tree 2 0.898 0.617 0.331 0.431
## 3 Random Forest 1 0.902 0.646 0.362 0.464
## 4 Random Forest 2 0.906 0.643 0.432 0.517
## 5 AdaBoost 1 0.904 0.656 0.371 0.474
## 6 AdaBoost 2 0.904 0.656 0.371 0.474
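Since ggplot2 is already loaded, the comparison table can also be shown graphically. A minimal sketch, reshaping with tidyr's pivot_longer (loaded with the tidyverse):
#sketch: long format, one bar per model/metric combination
results|>
pivot_longer(cols=-model, names_to="metric", values_to="value")|>
ggplot(aes(x=model, y=value, fill=metric))+
geom_col(position="dodge")+
coord_flip()+
labs(title="Model comparison", x=NULL, y="Score")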