set.seed(10023)
library(tidyverse)
library(caret)
library(ggplot2)
library(ggcorrplot)
library(dplyr)
library(rpart)
library(randomForest)
library(e1071)
library(ada)
library(pROC)
I performed two experiments with each of the algorithms below, using the data from the EDA assignment.
Each experiment has its own section within each model, and the experiments are self-contained: either one can be performed without the other.
I will be using the full bank-additional-full.csv file from the Bank Marketing Dataset; the other files in the collection are just subsets of it.
# The file is semicolon-delimited, so read_csv2() is the appropriate reader
bank_additional_full_csv <- read_csv2('https://storage.googleapis.com/data_science_masters_files/data_622/bank%2Bmarketing/bank-additional-full.csv', show_col_types = FALSE)
This is an overview from the dataset authors on what each file represents: “The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed.
There are four datasets: 1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010), very close…”
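Before modeling, a quick structural check (a minimal sketch using only the tibble loaded above) confirms the file matches this description: 41188 rows and 20 inputs plus the target y, i.e. 21 columns.
dim(bank_additional_full_csv)           # expect 41188 rows, 21 columns
table(bank_additional_full_csv$y)       # class balance of the target variable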
Hypothesis: A shallow tree with basic preprocessing will be accurate but underfit.
Tree max depth: 3
I held constant the feature set and the 80/20 split and did not perform scaling. The metrics to be assessed are accuracy and F1-score.
dct_ex1_df <- bank_additional_full_csv
dct_ex1_df <- dct_ex1_df %>% mutate_if(is.character, as.factor)
split_one <- createDataPartition(dct_ex1_df$y, p=0.8, list=FALSE)
train_one <- dct_ex1_df[split_one,]
test_one <- dct_ex1_df[-split_one,]
dct1 <- rpart(y ~ ., data=train_one, method="class", control=rpart.control(maxdepth=3))
pred_one <- predict(dct1, test_one, type="class")
conf_m1 <- confusionMatrix(pred_one, test_one$y)
fone_dct1 <- function(pred, true) {
  cm <- table(pred, true)              # rows = predicted class, columns = actual class
  precision <- diag(cm) / rowSums(cm)  # TP / everything predicted as that class
  recall <- diag(cm) / colSums(cm)     # TP / everything actually in that class
  f1 <- 2 * ((precision * recall) / (precision + recall))
  mean(f1, na.rm=TRUE)                 # macro-averaged F1 across the two classes
}
cat("Experiment 1: accuracy with basic shallow tree:\n")
## Experiment 1: accuracy with basic shallow tree:
cat("Accuracy:", round(conf_m1$overall["Accuracy"], 4), "\n")
## Accuracy: 0.9103
cat("F1 Score:", round(fone_dct1(pred_one, test_one$y), 4), "\n")
## F1 Score: 0.7807
Analysis: The results show that accuracy was quite high at roughly 91% and the F1 score was decent at roughly 78%. That is not a bad trade-off for being able to run such a simple model. I was expecting the F1 score to be somewhat lower.
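Where the underfitting would show up can be inspected directly; as a minimal sketch, the base rpart tools already loaded can list and draw the few splits the depth-3 tree was allowed to make:
printcp(dct1)           # complexity table: how few splits survived maxdepth=3
plot(dct1, margin=0.1)  # tree structure
text(dct1, use.n=TRUE)  # label nodes with class counts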
This experiment is performed on the same baseline dataset.
Hypothesis: A random forest (still covered as a separate section below) will outperform a single tree due to ensemble averaging.
The feature set and scaling have been held constant. The metrics are accuracy and F1-score.
dct_ex2_df <- bank_additional_full_csv
dct_ex2_df <- dct_ex2_df %>% mutate_if(is.character, as.factor)
sp2 <- createDataPartition(dct_ex2_df$y, p=0.8, list=FALSE)
train_dos <- dct_ex2_df[sp2,]
test_dos <- dct_ex2_df[-sp2,]
d_grid <- expand.grid(mtry = c(2, 4, 6))
controller <- trainControl(method="cv", number=5)
rf_model2 <- train(y ~ ., data=train_dos, method="rf", trControl=controller, tuneGrid=d_grid, ntree=5)
ptwo <- predict(rf_model2, test_dos)
confm2 <- confusionMatrix(ptwo, test_dos$y)
ftwo_rf <- function(pred, true) {
  cm <- table(pred, true)              # rows = predicted, columns = actual
  precision <- diag(cm) / rowSums(cm)
  recall <- diag(cm) / colSums(cm)
  f1 <- 2 * ((precision * recall) / (precision + recall))
  mean(f1, na.rm=TRUE)                 # macro-averaged F1
}
cat("Experiment 2: single tree versus random forest \n")
## Experiment 2: single tree versus random forest
cat("Best mtry:", rf_model2$bestTune$mtry, "\n")
## Best mtry: 6
cat("Accuracy:", round(confm2$overall["Accuracy"], 4), "\n")
## Accuracy: 0.8968
cat("F1 Score:", round(ftwo_rf(ptwo, test_dos$y), 4), "\n")
## F1 Score: 0.5901
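Contrary to the hypothesis, the forest's F1 (0.5901) came in well below the single tree's (0.7807); with ntree=5 the ensemble does very little averaging. One way to probe the fitted forest is to check which predictors it leaned on; as a minimal sketch, caret's varImp works directly on the rf_model2 object:
varImp(rf_model2)        # Gini-based variable importance as reported by caret
plot(varImp(rf_model2))  # same information as a dot plot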
Hypothesis: Scaling only the numeric features improves model discrimination when using a minimal feature set.
Features: age, duration, campaign, job, marital, y
Metric: Area Under the ROC Curve (AUC)
rf_scaledf <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
rf_scaledf <- rf_scaledf %>% select(age, duration, campaign, job, marital, y)
numerics <- c("age", "duration", "campaign")
rf_scaledf[numerics] <- scale(rf_scaledf[numerics])  # center to mean 0, scale to sd 1
wesplit <- createDataPartition(rf_scaledf$y, p=0.8, list=FALSE)
wetrain <- rf_scaledf[wesplit,]
wetest <- rf_scaledf[-wesplit,]
forester <- randomForest(y ~ ., data=wetrain, ntree=5)
scalep <- predict(forester, wetest, type="prob")[, "yes"]
roc1 <- roc(wetest$y, scalep)
cat("Experiment 1: numeric scaling of a random forest \n")
## Experiment 1: numeric scaling of a random forest
cat("AUC:", round(auc(roc1), 4), "\n")
## AUC: 0.7395
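For a visual read on this AUC, pROC can draw the curve straight from the roc1 object computed above (a minimal sketch; print.auc and legacy.axes are standard pROC plotting arguments):
plot(roc1, print.auc=TRUE, legacy.axes=TRUE)  # ROC curve with the AUC annotated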
Hypothesis: Normalizing all numeric features in a wider feature set helps the model focus on relative patterns, improving precision.
Selected Features: age, duration, campaign, pdays, previous, job, education, contact
Preprocessing: Min-max normalization of numeric features to [0, 1]
Metric: Precision (Positive Predictive Value). Measures how many predicted “yes” values were actually correct
normrf <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
normrf <- normrf %>% select(age, duration, campaign, pdays, previous, job, education, contact, y)
more_metrics <- c("age", "duration", "campaign", "pdays", "previous")
# min-max normalize each numeric column to [0, 1]
normrf[more_metrics] <- as.data.frame(lapply(normrf[more_metrics], function(x) (x - min(x)) / (max(x) - min(x))))
splitting <- createDataPartition(normrf$y, p=0.8, list=FALSE)
training <- normrf[splitting,]
testing <- normrf[-splitting,]
foresting <- randomForest(y ~ ., data=training, ntree=100)
predicting <- predict(foresting, testing)
confusing <- confusionMatrix(predicting, testing$y, positive="yes")
cat("Experiment 2: better normalization \n")
## Experiment 2: better normalization
cat("Precision:", round(confusing$byClass["Precision"], 4), "\n")
## Precision: 0.6849
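As a sanity check, this precision can be recomputed by hand from the raw confusion-matrix counts, since precision for the "yes" class is just TP / (TP + FP). A minimal sketch using the confusing object from above:
tab <- confusing$table   # rows = predicted, columns = actual
tp <- tab["yes", "yes"]  # correctly predicted subscribers
fp <- tab["yes", "no"]   # non-subscribers predicted as "yes"
tp / (tp + fp)           # should match the Precision value reported above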
Please note: I’m on R version 4.4.1. The standard adabag package only works on R versions up to 3.5, so I use the ada package instead.
Hypothesis: A lightweight AdaBoost model using only a few scaled numeric features and one categorical variable will provide a strong F1 score.
Features: age, duration, campaign, job, y
Preprocessing: scale() on numeric features
Metric: F1 Score
boosted <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
boosted <- boosted %>% select(age, duration, campaign, job, y)
boost_nums <- c("age", "duration", "campaign")
boosted[boost_nums] <- scale(boosted[boost_nums])
split_boost <- createDataPartition(boosted$y, p=0.8, list=FALSE)
train_boost <- boosted[split_boost,]
test_boost <- boosted[-split_boost,]
boosting <- ada(y ~ ., data=train_boost, iter=50, nu=0.1, control=rpart.control(maxdepth=1))  # 50 rounds of depth-1 stumps with learning rate 0.1
predict_boost <- predict(boosting, test_boost)
f1_boost <- function(pred, true) {
  cm <- table(pred, true)              # rows = predicted, columns = actual
  precision <- diag(cm) / rowSums(cm)
  recall <- diag(cm) / colSums(cm)
  f1 <- 2 * ((precision * recall) / (precision + recall))
  mean(f1, na.rm=TRUE)                 # macro-averaged F1
}
cat("Experiment 1: scaled numeric features \n")
## Experiment 1: scaled numeric features
cat("F1 Score:", round(f1_boost(pred_b, truth_b), 4), "\n")
## F1 Score: 0.9403
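Whether 50 iterations were enough can be checked with ada's built-in plot method, which traces the training error by boosting iteration (a minimal sketch on the fitted boosting object; a flat tail would suggest the model has converged):
plot(boosting)  # training error versus boosting iteration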
Features: age, duration, campaign, pdays, previous, job, education, contact, y
Preprocessing: Min-max normalization on numeric features
Metric: Precision
boosted2 <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
boosted2 <- boosted2 %>% select(age, duration, campaign, pdays, previous, job, education, contact, y)
boost_nums2 <- c("age", "duration", "campaign", "pdays", "previous")
boosted2[boost_nums2] <- as.data.frame(lapply(boosted2[boost_nums2], function(x) (x - min(x)) / (max(x) - min(x))))
split_boost2 <- createDataPartition(boosted2$y, p=0.8, list=FALSE)
train_boost2 <- boosted2[split_boost2,]
test_boost2 <- boosted2[-split_boost2,]
boosting2 <- ada(y ~ ., data=train_boost2, iter=50, nu=0.1, control=rpart.control(maxdepth=1))
predict_boost2 <- predict(boosting2, test_boost2)
conf_boost2 <- confusionMatrix(predict_boost2, test_boost2$y, positive="yes")
cat("Experiment 2: AdaBoost with normalization and more categories:\n")
## Experiment 2: AdaBoost with normalization and more categories:
cat("Precision:", round(conf_boost2$byClass["Precision"], 4), "\n")
## Precision: 0.8837
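To see which of the added features the boosted stumps actually used, the ada package ships a varplot function that scores and plots variable importance (a minimal sketch on the fitted boosting2 object):
varplot(boosting2)  # variable importance for the AdaBoost model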