Assignment 2: Experiments and Model Building

# Packages and data loading
rq_packages <- c("GGally", "naniar", "gridExtra", "scales", "ggplot2",
                 "dplyr", "tidyr", "corrplot", "ggcorrplot", "caret",
                 "naivebayes", "pROC", "car", "knitr", "rpart",
                 "rpart.plot", "randomForest", "ROSE", "adabag",
                 "reshape2", "ada", "smotefamily")
for (pkg in rq_packages){
  if (!require(pkg, character.only = TRUE)) {
    install.packages(pkg)
    library(pkg, character.only = TRUE)
  }
}
df <- read.csv("https://raw.githubusercontent.com/hamza9713/assignment_data_repo/refs/heads/main/bank-additional-full.csv", sep=";")

Preprocessing

The following preprocessing steps were applied:

* Column renaming: columns were renamed for clarity and easier referencing, e.g., marital became marital_status and y became subscription_status.
* Handling missing data: instances with "unknown" values were replaced with NA, and rows containing NA values were removed using na.omit().
* Standardizing numeric variables: numeric variables were normalized using z-score normalization (scale()), ensuring all numeric features had a mean of 0 and a standard deviation of 1 (illustrated below). This is especially useful for algorithms sensitive to feature scaling, such as logistic regression or SVM.
* Converting categorical variables to factors: variables such as job, education, and marital_status were converted to factors, making them suitable for categorical encoding in machine learning algorithms.
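As a quick illustration of what scale() does (a toy vector, not the bank data; z = (x - mean) / sd):

x <- c(10, 20, 30, 40, 50)
z_manual <- (x - mean(x)) / sd(x)   # z-score computed by hand
z_scale <- as.numeric(scale(x))     # scale() returns a matrix; coerce back to a vector
all.equal(z_manual, z_scale)        # TRUE: mean 0, sd 1 either way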

One-Hot Encoding: After balancing, one-hot encoding is applied to the categorical variables job, education, and marital_status using dummyVars from the caret package. This converts each category into a binary column, which is a format suitable for many machine learning algorithms.
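For intuition, a toy example (hypothetical data, not the bank dataset) shows what dummyVars produces:

toy <- data.frame(marital_status = factor(c("married", "single", "divorced")))
dv <- dummyVars(~ marital_status, data = toy)
predict(dv, newdata = toy)
# Each level becomes its own 0/1 column (marital_status.divorced, .married, .single);
# with fullRank = TRUE, one level is dropped as the reference category.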

In some experiments, only a subset of predictors, such as job, education, and marital_status, were used to build simplified models focused on specific aspects of customer behavior.

Feature Selection for Random Forest: Experiment A used all available features to train the Random Forest model. Experiment B used the top 5 features ranked by their importance in terms of Mean Decrease in Accuracy from the previous Random Forest model. This reduced dimensionality and improved computational efficiency.
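Mean Decrease in Accuracy is a permutation measure: shuffle one predictor and record how much accuracy falls. A minimal sketch of the idea (assuming rf_model_all and test_data from Experiment 2 below; the shuffled column is an arbitrary illustrative choice):

set.seed(1)
baseline <- mean(predict(rf_model_all, test_data) == test_data$subscription_status)
shuffled <- test_data
shuffled$euribor_rate <- sample(shuffled$euribor_rate)  # break this feature's signal
permuted <- mean(predict(rf_model_all, shuffled) == shuffled$subscription_status)
baseline - permuted  # the bigger the drop, the more important the feature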

##########################################################################
### Data Cleaning & Preprocessing
##########################################################################

# Rename columns for clarity
df <- df %>%
  rename(
    marital_status = marital,
    credit_default = default,
    mortgage = housing,
    personal_loan = loan,
    contact_method = contact,
    contact_month = month,
    contact_day = day_of_week,
    contact_duration = duration,
    campaign_contacts = campaign,
    days_since_last_contact = pdays,
    previous_contacts = previous,
    previous_outcome = poutcome,
    employment_rate = emp.var.rate,
    consumer_price_index = cons.price.idx,
    consumer_confidence_index = cons.conf.idx,
    euribor_rate = euribor3m,
    employees_count = nr.employed,
    subscription_status = y
  )

# Convert "unknown" to NA and omit NAs
df[df == "unknown"] <- NA 
df <- na.omit(df)

# Standardize numeric variables (z-score normalization)
numeric_vars <- df %>%
  select_if(is.numeric) %>%
  colnames()

df[numeric_vars] <- scale(df[numeric_vars])

# List of categorical variables
categorical_vars <- c(
  "job", "marital_status", "education", "credit_default",
  "mortgage", "personal_loan", "contact_method", "contact_month",
  "contact_day", "previous_outcome", "subscription_status"
)

# Convert categorical variables to factors
df[categorical_vars] <- lapply(df[categorical_vars], factor)

# Create a train-test split (70% train, 30% test)
set.seed(555) 
train_index <- createDataPartition(df$subscription_status, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Inspect the training data
str(train_data)
## 'data.frame':    21343 obs. of  21 variables:
##  $ age                      : num  1.6422 -0.1964 0.0939 1.6422 1.9325 ...
##  $ job                      : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 1 8 1 10 8 8 2 2 ...
##  $ marital_status           : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 3 3 3 2 ...
##  $ education                : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 2 4 6 6 4 4 4 2 ...
##  $ credit_default           : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ mortgage                 : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 2 2 1 2 ...
##  $ personal_loan            : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 2 1 ...
##  $ contact_method           : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ contact_month            : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ contact_day              : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ contact_duration         : num  0.00579 -0.12794 -0.41451 0.18156 -0.46037 ...
##  $ campaign_contacts        : num  -0.559 -0.559 -0.559 -0.559 -0.559 ...
##  $ days_since_last_contact  : num  0.212 0.212 0.212 0.212 0.212 ...
##  $ previous_contacts        : num  -0.372 -0.372 -0.372 -0.372 -0.372 ...
##  $ previous_outcome         : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ employment_rate          : num  0.727 0.727 0.727 0.727 0.727 ...
##  $ consumer_price_index     : num  0.804 0.804 0.804 0.804 0.804 ...
##  $ consumer_confidence_index: num  0.877 0.877 0.877 0.877 0.877 ...
##  $ euribor_rate             : num  0.786 0.786 0.786 0.786 0.786 ...
##  $ employees_count          : num  0.402 0.402 0.402 0.402 0.402 ...
##  $ subscription_status      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:10700] 2 6 8 11 16 18 20 22 27 28 ...
##   ..- attr(*, "names")= chr [1:10700] "2" "6" "8" "11" ...
summary(train_data)
##       age                     job        marital_status 
##  Min.   :-2.131896   admin.     :6104   divorced: 2488  
##  1st Qu.:-0.777083   blue-collar:3946   married :12257  
##  Median :-0.196449   technician :3813   single  : 6598  
##  Mean   :-0.000288   services   :2016                   
##  3rd Qu.: 0.577730   management :1648                   
##  Max.   : 5.319575   retired    : 865                   
##                      (Other)    :2951                   
##                education    credit_default mortgage    personal_loan
##  basic.4y           :1692   no :21343      no : 9770   no :17955    
##  basic.6y           : 976   yes:    0      yes:11573   yes: 3388    
##  basic.9y           :2978                                           
##  high.school        :5389                                           
##  illiterate         :   5                                           
##  professional.course:3013                                           
##  university.degree  :7290                                           
##    contact_method  contact_month  contact_day contact_duration   
##  cellular :14176   may    :6816   fri:4050    Min.   :-0.991479  
##  telephone: 7167   jul    :3583   mon:4422    1st Qu.:-0.597920  
##                    aug    :3245   thu:4391    Median :-0.303706  
##                    jun    :2578   tue:4236    Mean   :-0.000945  
##                    nov    :2420   wed:4244    3rd Qu.: 0.235050  
##                    apr    :1457               Max.   :17.800008  
##                    (Other):1244                                  
##  campaign_contacts   days_since_last_contact previous_contacts  
##  Min.   :-0.559326   Min.   :-4.749052       Min.   :-0.371610  
##  1st Qu.:-0.559326   1st Qu.: 0.211884       1st Qu.:-0.371610  
##  Median :-0.191699   Median : 0.211884       Median :-0.371610  
##  Mean   : 0.004852   Mean   :-0.003912       Mean   : 0.005075  
##  3rd Qu.: 0.175927   3rd Qu.: 0.211884       3rd Qu.:-0.371610  
##  Max.   :14.880996   Max.   : 0.211884       Max.   :13.018138  
##                                                                 
##     previous_outcome employment_rate     consumer_price_index
##  failure    : 2420   Min.   :-2.066872   Min.   :-2.25892    
##  nonexistent:18071   1st Qu.:-1.073330   1st Qu.:-0.76585    
##  success    :  852   Median : 0.727465   Median :-0.13549    
##                      Mean   : 0.004394   Mean   : 0.00917    
##                      3rd Qu.: 0.913755   3rd Qu.: 0.80408    
##                      Max.   : 0.913755   Max.   : 2.12460    
##                                                              
##  consumer_confidence_index  euribor_rate      employees_count    
##  Min.   :-2.129297         Min.   :-1.59008   Min.   :-2.623982  
##  1st Qu.:-0.438009         1st Qu.:-1.20802   1st Qu.:-0.821115  
##  Median :-0.250089         Median : 0.78553   Median : 0.401641  
##  Mean   : 0.004968         Mean   : 0.00472   Mean   : 0.002237  
##  3rd Qu.: 0.877437         3rd Qu.: 0.84461   3rd Qu.: 0.895268  
##  Max.   : 2.861046         Max.   : 0.89187   Max.   : 0.895268  
##                                                                  
##  subscription_status
##  no :18641          
##  yes: 2702          
##                     
##                     
##                     
##                     
## 

Experiment 1: Decision Trees

Both experiments examined the role of demographic factors (job, education, marital status) in predicting subscription_status and compared two data balancing techniques (oversampling vs. undersampling) using Decision Trees. The goal was to improve model performance and understand how data sampling affects results. The oversampling variant used maxdepth = 6 and minbucket = 10; the undersampling variant used maxdepth = 8 and minbucket = 15.

Hypothesis 1: demographic factors (job, education, and marital status) have a significant influence on subscription status prediction.

Hypothesis 2: oversampling and undersampling techniques will affect Decision Tree model performance in predicting subscription status.

Both experiments were evaluated using multiple metrics (see the helper sketch below):

* Accuracy.
* Sensitivity, specificity, precision, recall, and F1-score.
* AUC-ROC, to measure the ability to distinguish between classes.
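These metrics all derive from the confusion-matrix counts; a small helper (a sketch of the standard definitions, matching what caret reports for the positive class) makes them explicit:

metrics_from_counts <- function(tp, fp, tn, fn) {
  precision <- tp / (tp + fp)  # positive predictive value
  recall <- tp / (tp + fn)     # sensitivity
  c(
    accuracy = (tp + tn) / (tp + fp + tn + fn),
    precision = precision,
    recall = recall,
    specificity = tn / (tn + fp),
    f1 = 2 * precision * recall / (precision + recall)
  )
}
# Example with Experiment 1's oversampling confusion matrix:
# metrics_from_counts(tp = 4165, fp = 2983, tn = 5005, fn = 3823)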

# Balance the dataset using oversampling
set.seed(123)
df_balanced <- ovun.sample(subscription_status ~ job + education + marital_status, 
                           data = df, 
                           method = "over", 
                           N = max(table(df$subscription_status)) * 2)$data

# Encode categorical variables (One-hot encoding)
dummy_model <- dummyVars(~ job + education + marital_status - 1, data = df_balanced, fullRank = TRUE)
df_encoded <- predict(dummy_model, newdata = df_balanced) %>%
  as.data.frame() %>%
  bind_cols(subscription_status = df_balanced$subscription_status)

# Train-test split (70/30)
set.seed(123)
train_index <- createDataPartition(df_encoded$subscription_status, p = 0.7, list = FALSE)
train_data <- df_encoded[train_index, ]
test_data <- df_encoded[-train_index, ]

# Train a Decision Tree model with optimized parameters
tree_model <- rpart(
  subscription_status ~ .,
  data = train_data,
  method = "class",
  control = rpart.control(maxdepth = 6, minbucket = 10, cp = 0.01)
)

# Visualize the decision tree
rpart.plot(
  tree_model, type = 5, extra = 104, box.palette = "GnBu", tweak = 1.2,
  main = "Decision Tree - Job, Education, and Marital Status"
)

# Predict on the test set
pred_probs <- predict(tree_model, test_data, type = "prob")[, "yes"]
predictions <- ifelse(pred_probs > 0.5, "yes", "no")

# Confusion Matrix
conf_matrix <- confusionMatrix(
  factor(predictions, levels = c("no", "yes")),
  factor(test_data$subscription_status, levels = c("no", "yes")),
  positive = "yes"
)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5005 3823
##        yes 2983 4165
##                                           
##                Accuracy : 0.574           
##                  95% CI : (0.5663, 0.5817)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.148           
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.5214          
##             Specificity : 0.6266          
##          Pos Pred Value : 0.5827          
##          Neg Pred Value : 0.5669          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2607          
##    Detection Prevalence : 0.4474          
##       Balanced Accuracy : 0.5740          
##                                           
##        'Positive' Class : yes             
## 
# ROC Curve and AUC
roc_obj <- roc(test_data$subscription_status, pred_probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj, main = "ROC Curve - Decision Tree", col = "blue", lwd = 2)

auc_value <- auc(roc_obj)
cat("AUC Value:", auc_value, "\n")
## AUC Value: 0.5883991
# Visualization: Precision, Recall, F1-Score
results <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value = c(conf_matrix$overall["Accuracy"], 
            conf_matrix$byClass["Precision"], 
            conf_matrix$byClass["Recall"], 
            conf_matrix$byClass["F1"])
)

ggplot(results, aes(x = Metric, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", width = 0.6) +
  scale_fill_brewer(palette = "Set2") +
  labs(title = "Model Performance Metrics", y = "Value", x = "Metric") +
  theme_minimal()

# Decision Trees, Experiment 2: balance the dataset using undersampling

set.seed(456) # Different seed for reproducibility
df_balanced_undersample <- ovun.sample(subscription_status ~ job + education + marital_status, 
                                       data = df, 
                                       method = "under", 
                                       N = min(table(df$subscription_status)) * 2)$data

# Encode categorical variables (One-hot encoding)
dummy_model_2 <- dummyVars(~ job + education + marital_status - 1, data = df_balanced_undersample, fullRank = TRUE)
df_encoded_undersample <- predict(dummy_model_2, newdata = df_balanced_undersample) %>%
  as.data.frame() %>%
  bind_cols(subscription_status = df_balanced_undersample$subscription_status)

# Train-test split (70/30)
set.seed(456) # Different seed for split
train_index_2 <- createDataPartition(df_encoded_undersample$subscription_status, p = 0.7, list = FALSE)
train_data_2 <- df_encoded_undersample[train_index_2, ]
test_data_2 <- df_encoded_undersample[-train_index_2, ]

# Train a Decision Tree model with different hyperparameters
tree_model_2 <- rpart(
  subscription_status ~ .,
  data = train_data_2,
  method = "class",
  control = rpart.control(maxdepth = 8, minbucket = 15, cp = 0.01)
)

# Visualize the decision tree
rpart.plot(
  tree_model_2, type = 5, extra = 104, box.palette = "PuBu", tweak = 1.2,
  main = "Decision Tree - Experiment 2 (Undersampling)"
)

# Predict on the test set
pred_probs_2 <- predict(tree_model_2, test_data_2, type = "prob")[, "yes"]
predictions_2 <- ifelse(pred_probs_2 > 0.5, "yes", "no")

# Confusion Matrix
conf_matrix_2 <- confusionMatrix(
  factor(predictions_2, levels = c("no", "yes")),
  factor(test_data_2$subscription_status, levels = c("no", "yes")),
  positive = "yes"
)
print(conf_matrix_2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  721 560
##        yes 436 597
##                                           
##                Accuracy : 0.5696          
##                  95% CI : (0.5491, 0.5899)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 1.166e-11       
##                                           
##                   Kappa : 0.1392          
##                                           
##  Mcnemar's Test P-Value : 9.723e-05       
##                                           
##             Sensitivity : 0.5160          
##             Specificity : 0.6232          
##          Pos Pred Value : 0.5779          
##          Neg Pred Value : 0.5628          
##              Prevalence : 0.5000          
##          Detection Rate : 0.2580          
##    Detection Prevalence : 0.4464          
##       Balanced Accuracy : 0.5696          
##                                           
##        'Positive' Class : yes             
## 
# ROC Curve and AUC
roc_obj_2 <- roc(test_data_2$subscription_status, pred_probs_2)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj_2, main = "ROC Curve - Decision Tree (Undersampling)", col = "darkred", lwd = 2)

auc_value_2 <- auc(roc_obj_2)
cat("AUC Value (Experiment 2):", auc_value_2, "\n")
## AUC Value (Experiment 2): 0.5837355
# Visualization: Precision, Recall, F1-Score
results_2 <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1-Score"),
  Value = c(conf_matrix_2$overall["Accuracy"], 
            conf_matrix_2$byClass["Precision"], 
            conf_matrix_2$byClass["Recall"], 
            conf_matrix_2$byClass["F1"])
)

ggplot(results_2, aes(x = Metric, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", width = 0.6) +
  scale_fill_brewer(palette = "Set1") +
  labs(title = "Model Performance Metrics - Experiment 2", y = "Value", x = "Metric") +
  theme_minimal()

##########################################################################
### Combine results from both experiments into a data frame
##########################################################################
comparison_results <- data.frame(
  Experiment = rep(c("Oversampling", "Undersampling"), each = 4),
  Metric = rep(c("Accuracy", "Precision", "Recall", "F1-Score"), times = 2),
  Value = c(
    conf_matrix$overall["Accuracy"], conf_matrix$byClass["Precision"], 
    conf_matrix$byClass["Recall"], conf_matrix$byClass["F1"],
    conf_matrix_2$overall["Accuracy"], conf_matrix_2$byClass["Precision"], 
    conf_matrix_2$byClass["Recall"], conf_matrix_2$byClass["F1"]
  )
)

# Add AUC values to the comparison
comparison_results <- rbind(
  comparison_results,
  data.frame(
    Experiment = c("Oversampling", "Undersampling"),
    Metric = "AUC",
    Value = c(auc_value, auc_value_2)
  )
)

# Create the visualization
ggplot(comparison_results, aes(x = Metric, y = Value, fill = Experiment)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.6) +
  scale_fill_brewer(palette = "Paired") +
  labs(
    title = "Comparison of Decision Tree Experiments",
    subtitle = "Oversampling vs. Undersampling",
    x = "Metric",
    y = "Value"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(size = 16, face = "bold"),
    plot.subtitle = element_text(size = 12)
  )

Result: Oversampling: Accuracy: 57.4%, AUC: 0.588. The model performed moderately well, with slightly higher AUC and balanced accuracy than Experiment 2.

Undersampling: Accuracy: 56.96%, AUC: 0.584. The model performed similarly but slightly worse overall. Undersampling reduced training time but discarded information from the majority class, which may explain the marginally lower AUC and accuracy.

Oversampling provided marginally better results, but both approaches struggled to achieve high predictive performance, reflecting the limited predictive power of the selected demographic features.
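A quick sanity check (a sketch; run after the two ovun.sample calls above) confirms what each scheme does to the class counts:

table(df$subscription_status)                      # original, imbalanced
table(df_balanced$subscription_status)             # oversampled: minority duplicated up to the majority count
table(df_balanced_undersample$subscription_status) # undersampled: majority reduced to the minority count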

Experiment 2: Random Forest (All Features vs. Top 5 Features)

The objective was to evaluate and compare the performance of Random Forest models when using all features (Experiment A) versus only the top 5 most important features (Experiment B). The aim was to determine whether reducing the number of features could maintain performance while improving model simplicity and computational efficiency.

Hypothesis: reducing model complexity by removing less important features could improve generalization and accuracy.

set.seed(555)
train_index <- createDataPartition(df$subscription_status, p = 0.7, list = FALSE)
train_data <- df[train_index, ]
test_data <- df[-train_index, ]

# Experiment A: Random Forest with all features
set.seed(123)
rf_model_all <- randomForest(
  subscription_status ~ .,
  data = train_data,
  ntree = 500,
  mtry = floor(sqrt(ncol(train_data) - 1)), # default mtry for classification: floor(sqrt(p))
  importance = TRUE
)

# Variable importance plot
importance <- importance(rf_model_all)
varImpPlot(rf_model_all, main = "Variable Importance (Experiment A)")

# Predictions and performance
pred_probs_all <- predict(rf_model_all, test_data, type = "prob")[, "yes"]
pred_all <- ifelse(pred_probs_all > 0.5, "yes", "no")

conf_matrix_all <- confusionMatrix(
  factor(pred_all, levels = c("no", "yes")),
  factor(test_data$subscription_status, levels = c("no", "yes")),
  positive = "yes"
)

roc_obj_all <- roc(test_data$subscription_status, pred_probs_all)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc_all <- auc(roc_obj_all)


# Experiment B: Random Forest with top 5 features
top_features <- names(sort(importance[, "MeanDecreaseAccuracy"], decreasing = TRUE))[1:5]
formula_top <- as.formula(paste("subscription_status ~", paste(top_features, collapse = " + ")))

set.seed(123)
rf_model_top <- randomForest(
  formula_top,
  data = train_data,
  ntree = 500,
  mtry = floor(sqrt(5)), # mtry adjusted for 5 features
  importance = TRUE
)

# Predictions and performance
pred_probs_top <- predict(rf_model_top, test_data, type = "prob")[, "yes"]
pred_top <- ifelse(pred_probs_top > 0.5, "yes", "no")

conf_matrix_top <- confusionMatrix(
  factor(pred_top, levels = c("no", "yes")),
  factor(test_data$subscription_status, levels = c("no", "yes")),
  positive = "yes"
)

roc_obj_top <- roc(test_data$subscription_status, pred_probs_top)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc_top <- auc(roc_obj_top)


# Compare results
comparison <- data.frame(
  Metric = c("Accuracy", "Precision", "Recall", "F1", "AUC"),
  Experiment_A = c(
    conf_matrix_all$overall["Accuracy"],
    conf_matrix_all$byClass["Precision"],
    conf_matrix_all$byClass["Recall"],
    conf_matrix_all$byClass["F1"],
    auc_all
  ),
  Experiment_B = c(
    conf_matrix_top$overall["Accuracy"],
    conf_matrix_top$byClass["Precision"],
    conf_matrix_top$byClass["Recall"],
    conf_matrix_top$byClass["F1"],
    auc_top
  )
)

print(comparison)
##              Metric Experiment_A Experiment_B
## Accuracy   Accuracy    0.9023510    0.8986331
## Precision Precision    0.6434783    0.6220807
## Recall       Recall    0.5116681    0.5064823
## F1               F1    0.5700530    0.5583611
##                 AUC    0.9408488    0.9318257
# Visualization
comparison_long <- comparison %>%
  tidyr::pivot_longer(-Metric, names_to = "Experiment", values_to = "Value")

ggplot(comparison_long, aes(x = Metric, y = Value, fill = Experiment)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.6) +
  scale_fill_brewer(palette = "Set1") +
  labs(
    title = "Performance Comparison: Experiment A (All Features) vs. Experiment B (Top 5 Features)",
    y = "Value",
    x = "Metric"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Result

Experiment A achieved slightly better performance across all metrics (Accuracy: 90.24%, Precision: 64.35%, Recall: 51.17%, F1-Score: 57.01%, AUC: 0.941) compared to Experiment B (Accuracy: 89.86%, Precision: 62.21%, Recall: 50.65%, F1-Score: 55.84%, AUC: 0.932). While the reduction in performance was minimal, Experiment A maintained a slight edge in predictive performance. Fewer features (Experiment B) simplified the model but led to a minor decrease in performance. This trade-off may be acceptable in scenarios where computational efficiency and model simplicity are prioritized.
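To put a number on the efficiency claim, a simple timing comparison could be run (a sketch; wall-clock times vary by machine):

t_all <- system.time(randomForest(subscription_status ~ ., data = train_data, ntree = 500))
t_top <- system.time(randomForest(formula_top, data = train_data, ntree = 500))
c(all_features = unname(t_all["elapsed"]), top5 = unname(t_top["elapsed"]))  # expect the 5-feature fit to be faster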

Experiment 3: AdaBoost

The aim of the experiments was to apply AdaBoost with two sampling techniques (oversampling and undersampling) to balance the dataset and evaluate model performance using accuracy, confusion matrices, ROC curves, and AUC values.

Hypothesis: If oversampling balances the classes, the AdaBoost algorithm will achieve higher accuracy and AUC. If the dataset is undersampled to balance the classes and the model is trained primarily on demographic features, the AdaBoost algorithm will show lower accuracy and AUC.

# Balance the dataset using oversampling
set.seed(123)
df_balanced <- ovun.sample(subscription_status ~ ., 
                           data = df, 
                           method = "over", 
                           N = max(table(df$subscription_status)) * 2)$data

# Train-test split
set.seed(123)
train_index <- createDataPartition(df_balanced$subscription_status, p = 0.7, list = FALSE)
train_data <- df_balanced[train_index, ]
test_data <- df_balanced[-train_index, ]

# AdaBoost model with limited iterations (nIter)
adaboost_model_oversampling <- ada(
  subscription_status ~ ., 
  data = train_data, 
  iter = 50,          # Reduced number of iterations
  control = rpart.control(maxdepth = 2, cp = 0.01)  # Shallow trees
)

# Predictions
pred_probs_oversampling <- predict(adaboost_model_oversampling, test_data, type = "probs")[, 2]
predictions_oversampling <- ifelse(pred_probs_oversampling > 0.5, "yes", "no")

# Confusion Matrix
conf_matrix_oversampling <- confusionMatrix(
  factor(predictions_oversampling, levels = c("no", "yes")),
  factor(test_data$subscription_status, levels = c("no", "yes")),
  positive = "yes"
)

# AUC
roc_obj_oversampling <- roc(test_data$subscription_status, pred_probs_oversampling)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc_value_oversampling <- auc(roc_obj_oversampling)

cat("Oversampling Experiment - Accuracy:", conf_matrix_oversampling$overall["Accuracy"], "\n")
## Oversampling Experiment - Accuracy: 0.8646094
cat("Oversampling Experiment - AUC:", auc_value_oversampling, "\n")
## Oversampling Experiment - AUC: 0.9302706
# Visualization for Experiment 1: AdaBoost with Oversampling
roc_df_oversampling <- data.frame(
  FPR = 1 - roc_obj_oversampling$specificities,
  TPR = roc_obj_oversampling$sensitivities
)

ggplot(roc_df_oversampling, aes(x = FPR, y = TPR)) +
  geom_line(color = "blue", size = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - AdaBoost with Oversampling",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal()

# AdaBoost Experiment 2: undersampling with demographic features

# Balance the dataset using undersampling
set.seed(456)
df_balanced_undersample <- ovun.sample(subscription_status ~ ., 
                                       data = df, 
                                       method = "under", 
                                       N = min(table(df$subscription_status)) * 2)$data

# Train-test split
set.seed(456)
train_index_undersample <- createDataPartition(df_balanced_undersample$subscription_status, p = 0.7, list = FALSE)
train_data_undersample <- df_balanced_undersample[train_index_undersample, ]
test_data_undersample <- df_balanced_undersample[-train_index_undersample, ]

# Select demographic-related features
demographic_features <- c("age", "job", "marital_status", "education", "mortgage", 
                          "personal_loan", "subscription_status")


# Train AdaBoost model on demographic features
adaboost_model_demographics <- ada(
  subscription_status ~ ., 
  data = train_data_undersample[, demographic_features], 
  iter = 50,          # Reduced number of iterations
  control = rpart.control(maxdepth = 2, cp = 0.01)  # Shallow trees
)

# Predictions
pred_probs_demographics <- predict(adaboost_model_demographics, test_data_undersample[, demographic_features], type = "probs")[, 2]
predictions_demographics <- ifelse(pred_probs_demographics > 0.5, "yes", "no")

# Confusion Matrix
conf_matrix_demographics <- confusionMatrix(
  factor(predictions_demographics, levels = c("no", "yes")),
  factor(test_data_undersample$subscription_status, levels = c("no", "yes")),
  positive = "yes"
)

# AUC
roc_obj_demographics <- roc(test_data_undersample$subscription_status, pred_probs_demographics)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc_value_demographics <- auc(roc_obj_demographics)

cat("Demographics Experiment - Accuracy:", conf_matrix_demographics$overall["Accuracy"], "\n")
## Demographics Experiment - Accuracy: 0.5920484
cat("Demographics Experiment - AUC:", auc_value_demographics, "\n")
## Demographics Experiment - AUC: 0.62968
# Visualization for Experiment 2: AdaBoost with Undersampling (Demographics)
roc_df_demographics <- data.frame(
  FPR = 1 - roc_obj_demographics$specificities,
  TPR = roc_obj_demographics$sensitivities
)

ggplot(roc_df_demographics, aes(x = FPR, y = TPR)) +
  geom_line(color = "red", size = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "ROC Curve - AdaBoost with Demographics",
    x = "False Positive Rate (1 - Specificity)",
    y = "True Positive Rate (Sensitivity)"
  ) +
  theme_minimal()

# Bar Plot: Comparison of Accuracy and AUC
metrics <- data.frame(
  Experiment = c("Oversampling", "Demographics"),
  Accuracy = c(conf_matrix_oversampling$overall["Accuracy"], conf_matrix_demographics$overall["Accuracy"]),
  AUC = c(auc_value_oversampling, auc_value_demographics)
)

# Accuracy bar plot
ggplot(metrics, aes(x = Experiment, y = Accuracy, fill = Experiment)) +
  geom_bar(stat = "identity", color = "black") +
  labs(
    title = "Comparison of Accuracy",
    x = "Experiment",
    y = "Accuracy"
  ) +
  theme_minimal()

# AUC bar plot
ggplot(metrics, aes(x = Experiment, y = AUC, fill = Experiment)) +
  geom_bar(stat = "identity", color = "black") +
  labs(
    title = "Comparison of AUC",
    x = "Experiment",
    y = "AUC"
  ) +
  theme_minimal()

Result

The results align with the hypotheses for both experiments. Oversampling led to much better performance than undersampling, particularly in terms of AUC, which is critical for evaluating imbalanced datasets. The AdaBoost experiments with oversampling and with demographic features revealed clear differences in bias and variance. The oversampling experiment achieved an accuracy of 0.865 and an AUC of 0.930, indicating low bias and variance and suggesting that the model balanced the dataset effectively and generalized well to unseen data. (One caveat: because oversampling was applied before the train-test split, duplicated minority rows can appear in both sets, which may inflate these estimates.)
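One way to probe the variance claim directly is to compare training and test accuracy; a large gap would indicate overfitting. A sketch using the oversampling model (train_data and test_data as defined in the oversampling experiment above):

train_acc <- mean(predict(adaboost_model_oversampling, train_data) == train_data$subscription_status)
test_acc <- mean(predict(adaboost_model_oversampling, test_data) == test_data$subscription_status)
c(train = train_acc, test = test_acc)  # similar values suggest limited overfitting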
