set.seed(10023)
library(tidyverse)
library(caret)
library(ggplot2)
library(ggcorrplot)
library(dplyr)
library(rpart)
library(randomForest)
library(e1071)
library(ada)
library(pROC)
I performed two experiments with each of the algorithms below, using the data from the EDA assignment.
Each experiment has its own section within each model, and the experiments are self-contained: either one can be performed without the other.
I will be using the full bank-additional-full.csv file from the Bank Marketing Dataset; the other files in the collection are just subsets of it.
# The file is semicolon-delimited, so read_csv2() is the appropriate reader
bank_additional_full_csv <- read_csv2('https://storage.googleapis.com/data_science_masters_files/data_622/bank%2Bmarketing/bank-additional-full.csv', show_col_types = FALSE)
This is an overview from the dataset authors on what each file represents: “The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed.
There are four datasets: 1) bank-additional-full.csv with all examples (41188) and 20 inputs, ordered by date (from May 2008 to November 2010), very close…”
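Before modeling, a quick structural check (a minimal sketch using only the tibble loaded above) confirms the file matches this description: 41188 rows and 20 inputs plus the target y, i.e. 21 columns.
dim(bank_additional_full_csv)           # expect 41188 rows, 21 columns
table(bank_additional_full_csv$y)       # class balance of the target variable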
Hypothesis: A shallow tree with basic preprocessing will be accurate but underfit.
Tree max depth: 3
I held constant the feature set and the 80/20 split and did not perform scaling. The metrics to be assessed are accuracy and F1-score.
dct_ex1_df <- bank_additional_full_csv
dct_ex1_df <- dct_ex1_df %>% mutate_if(is.character, as.factor)
split_one <- createDataPartition(dct_ex1_df$y, p=0.8, list=FALSE)
train_one <- dct_ex1_df[split_one,]
test_one <- dct_ex1_df[-split_one,]
dct1 <- rpart(y ~ ., data=train_one, method="class", control=rpart.control(maxdepth=3))
pred_one <- predict(dct1, test_one, type="class")
conf_m1 <- confusionMatrix(pred_one, test_one$y)
fone_dct1 <- function(pred, true) {
  cm <- table(pred, true)              # rows = predicted class, columns = actual class
  precision <- diag(cm) / rowSums(cm)  # TP / everything predicted as that class
  recall <- diag(cm) / colSums(cm)     # TP / everything actually in that class
  f1 <- 2 * ((precision * recall) / (precision + recall))
  mean(f1, na.rm=TRUE)                 # macro-averaged F1 across the two classes
}
cat("Experiment 1: accuracy with basic shallow tree:\n")
## Experiment 1: accuracy with basic shallow tree:
cat("Accuracy:", round(conf_m1$overall["Accuracy"], 4), "\n")
## Accuracy: 0.9103
cat("F1 Score:", round(fone_dct1(pred_one, test_one$y), 4), "\n")
## F1 Score: 0.7807
Analysis: The results show that accuracy was quite high at roughly 91% and the F1 score was decent at roughly 78%. That is not a bad trade-off for being able to run such a simple model. I was expecting the F1 score to be somewhat lower.
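Where the underfitting would show up can be inspected directly; as a minimal sketch, the base rpart tools already loaded can list and draw the few splits the depth-3 tree was allowed to make:
printcp(dct1)           # complexity table: how few splits survived maxdepth=3
plot(dct1, margin=0.1)  # tree structure
text(dct1, use.n=TRUE)  # label nodes with class counts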
This experiment is performed on the same baseline dataset.
Hypothesis: A random forest (still covered as a separate section below) will outperform a single tree due to ensemble averaging.
The feature set and scaling have been held constant. The metrics are accuracy and F1-score.
dct_ex2_df <- bank_additional_full_csv
dct_ex2_df <- dct_ex2_df %>% mutate_if(is.character, as.factor)
sp2 <- createDataPartition(dct_ex2_df$y, p=0.8, list=FALSE)
train_dos <- dct_ex2_df[sp2,]
test_dos <- dct_ex2_df[-sp2,]
d_grid <- expand.grid(mtry = c(2, 4, 6))
controller <- trainControl(method="cv", number=5)
rf_model2 <- train(y ~ ., data=train_dos, method="rf", trControl=controller, tuneGrid=d_grid, ntree=5)
ptwo <- predict(rf_model2, test_dos)
confm2 <- confusionMatrix(ptwo, test_dos$y)
ftwo_rf <- function(pred, true) {
  cm <- table(pred, true)              # rows = predicted, columns = actual
  precision <- diag(cm) / rowSums(cm)
  recall <- diag(cm) / colSums(cm)
  f1 <- 2 * ((precision * recall) / (precision + recall))
  mean(f1, na.rm=TRUE)                 # macro-averaged F1
}
cat("Experiment 2: single tree versus random forest \n")
## Experiment 2: single tree versus random forest
cat("Best mtry:", rf_model2$bestTune$mtry, "\n")
## Best mtry: 6
cat("Accuracy:", round(confm2$overall["Accuracy"], 4), "\n")
## Accuracy: 0.8968
cat("F1 Score:", round(ftwo_rf(ptwo, test_dos$y), 4), "\n")
## F1 Score: 0.5901
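Contrary to the hypothesis, the forest's F1 (0.5901) came in well below the single tree's (0.7807); with ntree=5 the ensemble does very little averaging. One way to probe the fitted forest is to check which predictors it leaned on; as a minimal sketch, caret's varImp works directly on the rf_model2 object:
varImp(rf_model2)        # Gini-based variable importance as reported by caret
plot(varImp(rf_model2))  # same information as a dot plot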
Hypothesis: Scaling only the numeric features improves model discrimination when using a minimal feature set.
Features: age, duration, campaign, job, marital, y
Metric: Area Under the ROC Curve (AUC)
rf_scaledf <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
rf_scaledf <- rf_scaledf %>% select(age, duration, campaign, job, marital, y)
numerics <- c("age", "duration", "campaign")
rf_scaledf[numerics] <- scale(rf_scaledf[numerics])  # center to mean 0, scale to sd 1
wesplit <- createDataPartition(rf_scaledf$y, p=0.8, list=FALSE)
wetrain <- rf_scaledf[wesplit,]
wetest <- rf_scaledf[-wesplit,]
forester <- randomForest(y ~ ., data=wetrain, ntree=5)
scalep <- predict(forester, wetest, type="prob")[, "yes"]
roc1 <- roc(wetest$y, scalep)
cat("Experiment 1: numeric scaling of a random forest \n")
## Experiment 1: numeric scaling of a random forest
cat("AUC:", round(auc(roc1), 4), "\n")
## AUC: 0.7395
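For a visual read on this AUC, pROC can draw the curve straight from the roc1 object computed above (a minimal sketch; print.auc and legacy.axes are standard pROC plotting arguments):
plot(roc1, print.auc=TRUE, legacy.axes=TRUE)  # ROC curve with the AUC annotated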
Hypothesis: Normalizing all numeric features in a wider feature set helps the model focus on relative patterns, improving precision.
Selected Features: age, duration, campaign, pdays, previous, job, education, contact
Preprocessing: Min-max normalization of numeric features to [0, 1]
Metric: Precision (Positive Predictive Value). Measures how many predicted “yes” values were actually correct
normrf <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
normrf <- normrf %>% select(age, duration, campaign, pdays, previous, job, education, contact, y)
more_metrics <- c("age", "duration", "campaign", "pdays", "previous")
# min-max normalize each numeric column to [0, 1]
normrf[more_metrics] <- as.data.frame(lapply(normrf[more_metrics], function(x) (x - min(x)) / (max(x) - min(x))))
splitting <- createDataPartition(normrf$y, p=0.8, list=FALSE)
training <- normrf[splitting,]
testing <- normrf[-splitting,]
foresting <- randomForest(y ~ ., data=training, ntree=100)
predicting <- predict(foresting, testing)
confusing <- confusionMatrix(predicting, testing$y, positive="yes")
cat("Experiment 2: better normalization \n")
## Experiment 2: better normalization
cat("Precision:", round(confusing$byClass["Precision"], 4), "\n")
## Precision: 0.6849
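As a sanity check, this precision can be recomputed by hand from the raw confusion-matrix counts, since precision for the "yes" class is just TP / (TP + FP). A minimal sketch using the confusing object from above:
tab <- confusing$table   # rows = predicted, columns = actual
tp <- tab["yes", "yes"]  # correctly predicted subscribers
fp <- tab["yes", "no"]   # non-subscribers predicted as "yes"
tp / (tp + fp)           # should match the Precision value reported above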
Please note: I’m on R version 4.4.1. The standard adabag package only works on R versions up to 3.5, so I use the ada package instead.
Hypothesis: A lightweight AdaBoost model using only a few scaled numeric features and one categorical variable will provide a strong F1 score.
Features: age, duration, campaign, job, y
Preprocessing: scale() on numeric features
Metric: F1 Score
boosted <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
boosted <- boosted %>% select(age, duration, campaign, job, y)
boost_nums <- c("age", "duration", "campaign")
boosted[boost_nums] <- scale(boosted[boost_nums])
split_boost <- createDataPartition(boosted$y, p=0.8, list=FALSE)
train_boost <- boosted[split_boost,]
test_boost <- boosted[-split_boost,]
boosting <- ada(y ~ ., data=train_boost, iter=50, nu=0.1, control=rpart.control(maxdepth=1))  # 50 rounds of depth-1 stumps with learning rate 0.1
predict_boost <- predict(boosting, test_boost)
f1_boost <- function(pred, true) {
  cm <- table(pred, true)              # rows = predicted, columns = actual
  precision <- diag(cm) / rowSums(cm)
  recall <- diag(cm) / colSums(cm)
  f1 <- 2 * ((precision * recall) / (precision + recall))
  mean(f1, na.rm=TRUE)                 # macro-averaged F1
}
cat("Experiment 1: scaled numeric features \n")
## Experiment 1: scaled numeric features
cat("F1 Score:", round(f1_boost(pred_b, truth_b), 4), "\n")
## F1 Score: 0.9403
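Whether 50 iterations were enough can be checked with ada's built-in plot method, which traces the training error by boosting iteration (a minimal sketch on the fitted boosting object; a flat tail would suggest the model has converged):
plot(boosting)  # training error versus boosting iteration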
Features: age, duration, campaign, pdays, previous, job, education, contact, y
Preprocessing: Min-max normalization on numeric features
Metric: Precision
boosted2 <- bank_additional_full_csv %>% mutate_if(is.character, as.factor)
boosted2 <- boosted2 %>% select(age, duration, campaign, pdays, previous, job, education, contact, y)
boost_nums2 <- c("age", "duration", "campaign", "pdays", "previous")
boosted2[boost_nums2] <- as.data.frame(lapply(boosted2[boost_nums2], function(x) (x - min(x)) / (max(x) - min(x))))
split_boost2 <- createDataPartition(boosted2$y, p=0.8, list=FALSE)
train_boost2 <- boosted2[split_boost2,]
test_boost2 <- boosted2[-split_boost2,]
boosting2 <- ada(y ~ ., data=train_boost2, iter=50, nu=0.1, control=rpart.control(maxdepth=1))
predict_boost2 <- predict(boosting2, test_boost2)
conf_boost2 <- confusionMatrix(predict_boost2, test_boost2$y, positive="yes")
cat("Experiment 2: AdaBoost with normalization and more categories:\n")
## Experiment 2: AdaBoost with normalization and more categories:
cat("Precision:", round(conf_boost2$byClass["Precision"], 4), "\n")
## Precision: 0.8837
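To see which of the added features the boosted stumps actually used, the ada package ships a varplot function that scores and plots variable importance (a minimal sketch on the fitted boosting2 object):
varplot(boosting2)  # variable importance for the AdaBoost model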