# install.packages(c(
#   "GGally", 
#   "skimr", 
#   "recipes", 
#   "janitor", 
#   "DataExplorer", 
#   "caret", 
#   "rsample", 
#   "themis", 
#   "car"
# ))
#### 1) Libraries
library(MASS)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(readxl)
library(readr)
library(tidyverse)
library(recipes)
## Warning: package 'recipes' was built under R version 4.4.3
## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.3
library(dplyr)
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(themis)
## Warning: package 'themis' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
# getwd()
# list.files()

2) Load data

data_path <- "C:/Users/sgarc/OneDrive/Documents/R_Studio_Folders/DA6813/Case_Study_1/bank-additional-full.csv"   # <-- change me
outcome_var <- "y"               # e.g., "label" if you plan to model
delim = ";"

load_data <- function(path) {
  ext <- tolower(tools::file_ext(path))

  # Helper to auto-detect delimiter for CSV-like files
  detect_and_read <- function(file) {
    # Peek at the first line
    first_line <- readLines(file, n = 1)

    if (grepl(";", first_line)) {
      message("Detected semicolon delimiter.")
      readr::read_delim(file, delim = ";", show_col_types = FALSE)
    } else if (grepl("\t", first_line)) {
      message("Detected tab delimiter.")
      readr::read_tsv(file, show_col_types = FALSE)
    } else {
      message("Detected comma delimiter.")
      readr::read_csv(file, show_col_types = FALSE)
    }
  }

  df <- switch(ext,
               "csv"  = detect_and_read(path),
               "tsv"  = readr::read_tsv(path, show_col_types = FALSE),
               "xlsx" = readxl::read_excel(path),
               "xls"  = readxl::read_excel(path),
               # For parquet, uncomment below and add "arrow" to pkgs
               # "parquet" = arrow::read_parquet(path),
               stop("Unsupported file type: ", ext)
  )

  df |> janitor::clean_names()
}

df <- load_data(data_path)
## Detected semicolon delimiter.
# print(df)

3) Inspect structure

message("Structure:")
## Structure:
str(df)
## spc_tbl_ [41,188 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ age           : num [1:41188] 56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr [1:41188] "housemaid" "services" "services" "admin." ...
##  $ marital       : chr [1:41188] "married" "married" "married" "married" ...
##  $ education     : chr [1:41188] "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr [1:41188] "no" "unknown" "no" "no" ...
##  $ housing       : chr [1:41188] "no" "no" "yes" "no" ...
##  $ loan          : chr [1:41188] "no" "no" "no" "no" ...
##  $ contact       : chr [1:41188] "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr [1:41188] "may" "may" "may" "may" ...
##  $ day_of_week   : chr [1:41188] "mon" "mon" "mon" "mon" ...
##  $ duration      : num [1:41188] 261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : num [1:41188] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : num [1:41188] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : num [1:41188] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr [1:41188] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp_var_rate  : num [1:41188] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons_price_idx: num [1:41188] 94 94 94 94 94 ...
##  $ cons_conf_idx : num [1:41188] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num [1:41188] 4.86 4.86 4.86 4.86 4.86 ...
##  $ nr_employed   : num [1:41188] 5191 5191 5191 5191 5191 ...
##  $ y             : chr [1:41188] "no" "no" "no" "no" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   age = col_double(),
##   ..   job = col_character(),
##   ..   marital = col_character(),
##   ..   education = col_character(),
##   ..   default = col_character(),
##   ..   housing = col_character(),
##   ..   loan = col_character(),
##   ..   contact = col_character(),
##   ..   month = col_character(),
##   ..   day_of_week = col_character(),
##   ..   duration = col_double(),
##   ..   campaign = col_double(),
##   ..   pdays = col_double(),
##   ..   previous = col_double(),
##   ..   poutcome = col_character(),
##   ..   emp.var.rate = col_double(),
##   ..   cons.price.idx = col_double(),
##   ..   cons.conf.idx = col_double(),
##   ..   euribor3m = col_double(),
##   ..   nr.employed = col_double(),
##   ..   y = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
message("\nFirst 10 rows:")
## 
## First 10 rows:
print(head(df, 10))
## # A tibble: 10 × 21
##      age job   marital education default housing loan  contact month day_of_week
##    <dbl> <chr> <chr>   <chr>     <chr>   <chr>   <chr> <chr>   <chr> <chr>      
##  1    56 hous… married basic.4y  no      no      no    teleph… may   mon        
##  2    57 serv… married high.sch… unknown no      no    teleph… may   mon        
##  3    37 serv… married high.sch… no      yes     no    teleph… may   mon        
##  4    40 admi… married basic.6y  no      no      no    teleph… may   mon        
##  5    56 serv… married high.sch… no      no      yes   teleph… may   mon        
##  6    45 serv… married basic.9y  unknown no      no    teleph… may   mon        
##  7    59 admi… married professi… no      no      no    teleph… may   mon        
##  8    41 blue… married unknown   unknown no      no    teleph… may   mon        
##  9    24 tech… single  professi… no      yes     no    teleph… may   mon        
## 10    25 serv… single  high.sch… no      yes     no    teleph… may   mon        
## # ℹ 11 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
## #   previous <dbl>, poutcome <chr>, emp_var_rate <dbl>, cons_price_idx <dbl>,
## #   cons_conf_idx <dbl>, euribor3m <dbl>, nr_employed <dbl>, y <chr>
message("\nskimr summary:")
## 
## skimr summary:
print(skimr::skim(df))
## ── Data Summary ────────────────────────
##                            Values
## Name                       df    
## Number of rows             41188 
## Number of columns          21    
## _______________________          
## Column type frequency:           
##   character                11    
##   numeric                  10    
## ________________________         
## Group variables            None  
## 
## ── Variable type: character ────────────────────────────────────────────────────
##    skim_variable n_missing complete_rate min max empty n_unique whitespace
##  1 job                   0             1   6  13     0       12          0
##  2 marital               0             1   6   8     0        4          0
##  3 education             0             1   7  19     0        8          0
##  4 default               0             1   2   7     0        3          0
##  5 housing               0             1   2   7     0        3          0
##  6 loan                  0             1   2   7     0        3          0
##  7 contact               0             1   8   9     0        2          0
##  8 month                 0             1   3   3     0       10          0
##  9 day_of_week           0             1   3   3     0        5          0
## 10 poutcome              0             1   7  11     0        3          0
## 11 y                     0             1   2   3     0        2          0
## 
## ── Variable type: numeric ──────────────────────────────────────────────────────
##    skim_variable  n_missing complete_rate      mean      sd       p0     p25
##  1 age                    0             1   40.0     10.4     17       32   
##  2 duration               0             1  258.     259.       0      102   
##  3 campaign               0             1    2.57     2.77     1        1   
##  4 pdays                  0             1  962.     187.       0      999   
##  5 previous               0             1    0.173    0.495    0        0   
##  6 emp_var_rate           0             1    0.0819   1.57    -3.4     -1.8 
##  7 cons_price_idx         0             1   93.6      0.579   92.2     93.1 
##  8 cons_conf_idx          0             1  -40.5      4.63   -50.8    -42.7 
##  9 euribor3m              0             1    3.62     1.73     0.634    1.34
## 10 nr_employed            0             1 5167.      72.3   4964.    5099.  
##        p50     p75    p100 hist 
##  1   38      47      98    ▅▇▃▁▁
##  2  180     319    4918    ▇▁▁▁▁
##  3    2       3      56    ▇▁▁▁▁
##  4  999     999     999    ▁▁▁▁▇
##  5    0       0       7    ▇▁▁▁▁
##  6    1.1     1.4     1.4  ▁▃▁▁▇
##  7   93.7    94.0    94.8  ▁▆▃▇▂
##  8  -41.8   -36.4   -26.9  ▅▇▁▇▁
##  9    4.86    4.96    5.04 ▅▁▁▁▇
## 10 5191    5228.   5228.   ▁▁▃▁▇
message("\ndplyr summary:")
## 
## dplyr summary:
dplyr::glimpse(df)
## Rows: 41,188
## Columns: 21
## $ age            <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job            <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital        <chr> "married", "married", "married", "married", "married", …
## $ education      <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default        <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing        <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan           <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact        <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month          <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week    <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration       <dbl> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign       <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays          <dbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome       <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp_var_rate   <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons_price_idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons_conf_idx  <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m      <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr_employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y              <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…

For report -

The dataset contains 21 columns: 10 are numeric and 11 are character, the latter including the target label y. Since the character fields represent categorical information, we will convert them into factors to take advantage of R’s handling of levels.

We plan to bin both age and pdays. The binning strategy for pdays requires particular care because it contains a sentinel value (999) indicating “never previously contacted”; a quick tabulation of the raw values follows below.

In addition, based on a review of the data catalog, we will remove the duration feature. Although highly predictive, call duration is only known after the call has ended (and duration = 0 implies y = “no”), so retaining it would leak outcome information into the model.
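
As a quick sanity check on the pdays plan (a hedged sketch run on the raw df loaded above), we can tabulate the share of the 999 sentinel and how the remaining elapsed-day values fall across the candidate bin edges:

# Share of rows never previously contacted (pdays == 999)
mean(df$pdays == 999)

# Spread of the real elapsed-day values across the candidate bin edges
table(cut(df$pdays[df$pdays != 999], breaks = c(-1, 30, 180, Inf),
          labels = c("Recent", "Intermediate", "LongAgo")))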

4) Check for nulls / missing

message("\nMissing or 'unknown' values per column:")
## 
## Missing or 'unknown' values per column:
na_summary <- df |>
  summarise(across(everything(),
                   ~ sum(is.na(.) | (is.character(.) | is.factor(.)) & tolower(as.character(.)) == "unknown"),
                   .names = "{.col}_na_unknown")) |>
  pivot_longer(everything(),
               names_to = "column",
               values_to = "missing_or_unknown") |>
  arrange(desc(missing_or_unknown))

print(na_summary, n = Inf, width = Inf)
## # A tibble: 21 × 2
##    column                    missing_or_unknown
##    <chr>                                  <int>
##  1 default_na_unknown                      8597
##  2 education_na_unknown                    1731
##  3 housing_na_unknown                       990
##  4 loan_na_unknown                          990
##  5 job_na_unknown                           330
##  6 marital_na_unknown                        80
##  7 age_na_unknown                             0
##  8 contact_na_unknown                         0
##  9 month_na_unknown                           0
## 10 day_of_week_na_unknown                     0
## 11 duration_na_unknown                        0
## 12 campaign_na_unknown                        0
## 13 pdays_na_unknown                           0
## 14 previous_na_unknown                        0
## 15 poutcome_na_unknown                        0
## 16 emp_var_rate_na_unknown                    0
## 17 cons_price_idx_na_unknown                  0
## 18 cons_conf_idx_na_unknown                   0
## 19 euribor3m_na_unknown                       0
## 20 nr_employed_na_unknown                     0
## 21 y_na_unknown                               0
# Each week we may need to enhance this check based on the case study.
# For example, in Lab 1 "unknown" is used extensively as a missing-value code.
# We also want to show all of the values in the tibble.



# These columns contain "unknown" values; rows with them will be removed below.
cols_to_drop_unknown <- c("default", "education", "housing", "loan", "job", "marital")

message("\nCleaning the unknown and missing values:")
## 
## Cleaning the unknown and missing values:
df_clean <- df %>%
  filter(
    if_all(all_of(cols_to_drop_unknown),
           ~ !is.na(.) & str_to_lower(as.character(.)) != "unknown")
  ) %>%
  # reset factor levels if any of these are factors
  mutate(across(all_of(cols_to_drop_unknown), ~ droplevels(factor(.))))

str(df_clean)
## tibble [30,488 × 21] (S3: tbl_df/tbl/data.frame)
##  $ age           : num [1:30488] 56 37 40 56 59 24 25 25 29 57 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 1 8 1 10 8 8 2 4 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 3 3 3 1 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 2 4 6 6 4 4 4 1 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 2 2 1 2 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 2 1 ...
##  $ contact       : chr [1:30488] "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr [1:30488] "may" "may" "may" "may" ...
##  $ day_of_week   : chr [1:30488] "mon" "mon" "mon" "mon" ...
##  $ duration      : num [1:30488] 261 226 151 307 139 380 50 222 137 293 ...
##  $ campaign      : num [1:30488] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : num [1:30488] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : num [1:30488] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr [1:30488] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp_var_rate  : num [1:30488] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons_price_idx: num [1:30488] 94 94 94 94 94 ...
##  $ cons_conf_idx : num [1:30488] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num [1:30488] 4.86 4.86 4.86 4.86 4.86 ...
##  $ nr_employed   : num [1:30488] 5191 5191 5191 5191 5191 ...
##  $ y             : chr [1:30488] "no" "no" "no" "no" ...

For report -

The dataset contains rows with null or “unknown” values, particularly in the default and education features. Given the large sample size, we chose to remove records with missing or unknown values rather than impute them at this stage. If model performance proves unsatisfactory, we may revisit this decision and explore alternative treatments such as imputation or separate category encoding.
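
For reference, a minimal sketch of one alternative treatment mentioned above (not applied here): recoding “unknown” to NA so that an imputation step, such as the one in the next section, could fill these values instead of dropping the rows. df_alt is a hypothetical name.

# Alternative (not applied): treat "unknown" as NA rather than removing rows
df_alt <- df %>%
  mutate(across(all_of(cols_to_drop_unknown),
                ~ na_if(str_to_lower(as.character(.)), "unknown")))

# How many values each column would then need imputed
colSums(is.na(df_alt[cols_to_drop_unknown]))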

5) Ensure categorical features are factors and apply binning

# Heuristic: treat character columns as factors.
# If you have integer-coded categories (e.g., 0/1/2) list them explicitly.
char_cols <- names(df_clean)[vapply(df_clean, is.character, logical(1))]
if (length(char_cols)) {
  df_clean <- df_clean |>
    mutate(across(all_of(char_cols), ~as.factor(.)))
}

# Example: force specific integer columns to factor
# df_clean <- df_clean |>
#   mutate(across(c("zip_group","homeowner","gender"), ~as.factor(.)))

message("\nEnsuring factors are used where appropriate:")
## 
## Ensuring factors are used where appropriate:
str(df_clean)
## tibble [30,488 × 21] (S3: tbl_df/tbl/data.frame)
##  $ age           : num [1:30488] 56 37 40 56 59 24 25 25 29 57 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 1 8 1 10 8 8 2 4 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 3 3 3 1 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 2 4 6 6 4 4 4 1 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 2 2 1 2 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 2 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration      : num [1:30488] 261 226 151 307 139 380 50 222 137 293 ...
##  $ campaign      : num [1:30488] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : num [1:30488] 999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : num [1:30488] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp_var_rate  : num [1:30488] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons_price_idx: num [1:30488] 94 94 94 94 94 ...
##  $ cons_conf_idx : num [1:30488] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num [1:30488] 4.86 4.86 4.86 4.86 4.86 ...
##  $ nr_employed   : num [1:30488] 5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
df_clean <- df_clean %>%
  # Replace age with bucketed ordered factor
  mutate(age = cut(age,
                   breaks = c(0, 25, 35, 45, 55, 65, Inf),
                   labels = c("<=25", "26-35", "36-45", "46-55", "56-65", "65+"),
                   right = TRUE, include.lowest = TRUE,
                   ordered_result = TRUE),

         # Replace pdays with 4-category unordered factor
         pdays = case_when(
           pdays == 999 ~ "Never",
           pdays <= 30 ~ "Recent",         # includes pdays = 0 (contacted earlier the same day)
           pdays <= 180 ~ "Intermediate",
           pdays > 180 ~ "LongAgo"
         ),
         pdays = factor(pdays,
                        levels = c("Never", "Recent", "Intermediate", "LongAgo"),
                        ordered = FALSE)) %>%

  # Drop duration
  select(-duration)

message("\nColumn removal and binning:")
## 
## Column removal and binning:
str(df_clean)
## tibble [30,488 × 20] (S3: tbl_df/tbl/data.frame)
##  $ age           : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 5 3 3 5 5 1 1 1 2 5 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 1 8 1 10 8 8 2 4 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 3 3 3 1 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 2 4 6 6 4 4 4 1 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 2 2 1 2 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 2 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ campaign      : num [1:30488] 1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ previous      : num [1:30488] 0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ emp_var_rate  : num [1:30488] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons_price_idx: num [1:30488] 94 94 94 94 94 ...
##  $ cons_conf_idx : num [1:30488] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num [1:30488] 4.86 4.86 4.86 4.86 4.86 ...
##  $ nr_employed   : num [1:30488] 5191 5191 5191 5191 5191 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

For report -

We identified two numeric features for binning and conversion to categorical variables. The pdays feature was transformed into a standard (unordered) factor, since the “Never” category represents a qualitatively different condition than elapsed days. In contrast, age was converted into an ordered factor, preserving the natural progression across age groups and retaining some ordinal information in the data.
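
As a brief illustration (hedged; it just inspects R’s defaults), the two factor types are encoded differently in model matrices, which is why the logistic regression summary later shows polynomial terms such as age.L and age.Q for the ordered age bins but plain dummy terms for pdays:

# Ordered factors use polynomial contrasts (.L, .Q, ...) by default;
# unordered factors use treatment (dummy) contrasts
contrasts(df_clean$age)
contrasts(df_clean$pdays)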

6) Standardize and Impute missing values (recipes)

# - Numeric: median impute
# - Categorical (factor/character): most-frequent (mode) impute
# NOTE: This builds a preprocessing recipe; you can bake() it to get the imputed data.

rec <- recipe(~ ., data = df_clean) |>
  # ensure character -> factor before mode impute
  step_string2factor(all_nominal()) |>
  step_impute_median(all_numeric_predictors()) |>
  step_impute_mode(all_nominal_predictors()) |>
  step_normalize(all_numeric_predictors())

prep_rec <- prep(rec, training = df_clean)
df_imputed <- bake(prep_rec, new_data = NULL)

For report -

We used the recipes package to streamline preprocessing. All nominal features were converted to factors, numeric predictors were set to be imputed with the median (though no missing values remained), and nominal predictors were set to be imputed with the mode (again, no missing values were present). Finally, all numeric features were standardized using z-scores to ensure comparability across predictors.
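
One practical note (a hedged sketch, not required above): because the recipe was prepped on df_clean, the estimated centering and scaling parameters can be reapplied to any later data with the same columns via bake(), rather than being re-estimated.

# Reapply the trained preprocessing to new rows; new_batch is a hypothetical
# data frame with the same columns as df_clean
# scored_new <- bake(prep_rec, new_data = new_batch)

# As a quick demonstration, baking the original data reproduces df_imputed
head(bake(prep_rec, new_data = df_clean))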

7) Distributions & Boxplots

# Numeric distributions (histograms)
DataExplorer::plot_histogram(df_imputed, nrow = 3L, ncol = 3L, title = "Numeric Distributions")

# Categorical counts (bar plots)
DataExplorer::plot_bar(df_imputed, title = "Categorical Counts")

# Boxplots of numerics by a categorical (choose one factor)
# Pick a reasonable categorical feature:
candidate_factors <- names(df_imputed)[vapply(df_imputed, is.factor, logical(1))]
candidate_nums    <- names(df_imputed)[vapply(df_imputed, is.numeric, logical(1))]
if (length(candidate_factors) >= 1 && length(candidate_nums) >= 1) {
  cat_feat <- candidate_factors[1]
  num_feat <- candidate_nums[1]
  message(paste0("\nBoxplot example: ", num_feat, " by ", cat_feat))
  print(
    ggplot(df_imputed, aes(x = .data[[cat_feat]], y = .data[[num_feat]])) +
      geom_boxplot() +
      labs(x = cat_feat, y = num_feat, title = paste(num_feat, "by", cat_feat)) +
      theme_minimal()
  )
}
## 
## Boxplot example: campaign by age

# Optional: Pairwise plots for a small numeric subset
if (length(candidate_nums) >= 3) {
  GGally::ggpairs(df_imputed[, candidate_nums[1:min(25, length(candidate_nums))]])
}

8) Multicollinearity checks

# 1) Select clean numeric predictors
candidate_nums <- names(df_imputed)[vapply(df_imputed, function(x) is.numeric(x) && all(is.finite(x) | is.na(x)), logical(1))]

# Remove zero/near-zero variance columns (helps cor & VIF)
if (length(candidate_nums)) {
  nzv_idx <- caret::nearZeroVar(df_imputed[, candidate_nums, drop = FALSE], saveMetrics = TRUE)
  keep_numeric <- rownames(nzv_idx)[!nzv_idx$nzv]
  candidate_nums <- intersect(candidate_nums, keep_numeric)
}

# ---------- (A) Correlation filter ----------
if (length(candidate_nums) >= 2) {
  # Compute correlation with pairwise handling; coerce to matrix
  cor_mat <- suppressWarnings(cor(as.matrix(df_imputed[, candidate_nums, drop = FALSE]),
                                  use = "pairwise.complete.obs"))

  # Find columns to consider dropping
  high_corr_idx <- caret::findCorrelation(cor_mat, cutoff = 0.80, verbose = TRUE)

  if (length(high_corr_idx)) {
    message("\nHighly correlated (>|0.80|) columns to consider removing:")
    print(colnames(cor_mat)[high_corr_idx])

    # (Optional) also print the actual pairs over the cutoff
    get_pairs <- function(mat, cutoff = 0.80) {
      out <- which(abs(mat) > cutoff & upper.tri(mat), arr.ind = TRUE)
      if (nrow(out)) {
        data.frame(
          var1 = rownames(mat)[out[,1]],
          var2 = colnames(mat)[out[,2]],
          corr = mat[out],
          row.names = NULL
        ) %>% arrange(desc(abs(corr)))
      } else {
        data.frame(var1 = character(), var2 = character(), corr = numeric())
      }
    }
    pairs_over <- get_pairs(cor_mat, 0.80)
    if (nrow(pairs_over)) {
      message("\nTop correlated pairs (>|0.80|):")
      print(utils::head(pairs_over, 20))
    }
  } else {
    message("\nNo pairs exceeded |0.80| correlation.")
  }
} else {
  message("\nNot enough numeric predictors for correlation analysis.")
}
## Compare row 6  and column  3 with corr  0.969 
##   Means:  0.567 vs 0.32 so flagging column 6 
## Compare row 3  and column  7 with corr  0.9 
##   Means:  0.477 vs 0.239 so flagging column 3 
## All correlations <= 0.8
## 
## Highly correlated (>|0.80|) columns to consider removing:
## [1] "euribor3m"    "emp_var_rate"
## 
## Top correlated pairs (>|0.80|):
##           var1        var2      corr
## 1 emp_var_rate   euribor3m 0.9694121
## 2    euribor3m nr_employed 0.9448713
## 3 emp_var_rate nr_employed 0.9003902
# ---------- (B) VIF ----------
if (!is.null(outcome_var) && outcome_var %in% names(df_imputed)) {
  # Choose numeric predictors that aren't the outcome
  num_preds <- setdiff(candidate_nums, outcome_var)

  if (length(num_preds) >= 2) {
    # numeric outcome -> lm; binary (0/1 or 2-level factor) -> glm binomial
    y <- df_imputed[[outcome_var]]
    lm_formula <- as.formula(paste(outcome_var, "~", paste(num_preds, collapse = " + ")))

    fit <- try({
      if (is.numeric(y) && length(unique(na.omit(y))) > 2) {
        lm(lm_formula, data = df_imputed)
      } else {
        # treat 0/1 or 2-level as classification
        glm(lm_formula, data = df_imputed, family = binomial())
      }
    }, silent = TRUE)

    if (inherits(fit, "try-error")) {
      message("\nVIF model could not be fit (possibly due to perfect separation or collinearity).")
    } else {
      message("\nVIF:")
      print(car::vif(fit))
    }
  } else {
    message("\nNot enough numeric predictors (excluding outcome) to compute VIF.")
  }
} else {
  message("\nNo valid outcome_var found in df_imputed; skipping VIF.")
}
## 
## VIF:
##       campaign       previous   emp_var_rate cons_price_idx  cons_conf_idx 
##       1.028057       1.255393      28.827269      10.882013       2.980781 
##      euribor3m    nr_employed 
##      53.004293      35.016171
# Removing the highly correlated features one by one to ensure
# we don't have multicollinearity.
df_imputed <- df_imputed %>%
  select(-euribor3m)

df_imputed <- df_imputed %>%
  select(-emp_var_rate)

For Report -

Several features exhibited high multicollinearity, as indicated by elevated Variance Inflation Factor (VIF) values. To address this, we removed correlated features sequentially and reassessed the remaining variables after each step. We first dropped euribor3m (VIF = 53.00), followed by emp_var_rate (VIF = 23.96). After these removals, all remaining predictors had VIF values below 1.3, indicating that multicollinearity was no longer a concern.
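
To make that reassessment explicit, here is a hedged sketch that refits the VIF model on the numeric predictors remaining in df_imputed after the two removals (same glm approach as above; num_preds_reduced and fit_reduced are hypothetical names):

# Recompute VIF on the reduced numeric predictor set
num_preds_reduced <- setdiff(
  names(df_imputed)[vapply(df_imputed, is.numeric, logical(1))],
  outcome_var
)
fit_reduced <- glm(reformulate(num_preds_reduced, response = outcome_var),
                   data = df_imputed, family = binomial())
car::vif(fit_reduced)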

print(candidate_nums)  # note: this vector was built before euribor3m and emp_var_rate were dropped
## [1] "campaign"       "previous"       "emp_var_rate"   "cons_price_idx"
## [5] "cons_conf_idx"  "euribor3m"      "nr_employed"

9) Class Balance

# Ensure binary 0/1 label becomes a factor; leave multiclass factors as-is
coerce_binary_label <- function(data, outcome_var) {
  stopifnot(!is.null(outcome_var), outcome_var %in% names(data))
  y <- data[[outcome_var]]
  if (is.numeric(y) && all(na.omit(y) %in% c(0, 1))) {
    data[[outcome_var]] <- factor(y, levels = c(0, 1))
  }
  data
}

# Build balance table (counts + proportions) for the specified label
balance_table <- function(data, outcome_var) {
  stopifnot(!is.null(outcome_var), outcome_var %in% names(data))
  if (!is.factor(data[[outcome_var]])) {
    # If it's not a factor, coerce to factor so we can tabulate levels cleanly
    data[[outcome_var]] <- factor(data[[outcome_var]])
  }
  dplyr::count(data, .data[[outcome_var]], name = "n") |>
    dplyr::mutate(prop = n / sum(n)) |>
    dplyr::arrange(dplyr::desc(n))
}

# Print table + plot; optionally set which level is "positive"
report_class_balance <- function(data, outcome_var, positive = NULL, title_suffix = NULL) {
  stopifnot(!is.null(outcome_var), outcome_var %in% names(data))
  if (!is.factor(data[[outcome_var]])) {
    data[[outcome_var]] <- factor(data[[outcome_var]])
  }
  tab <- balance_table(data, outcome_var)

  message(sprintf("\nClass balance for '%s'%s:",
                  outcome_var,
                  ifelse(is.null(title_suffix), "", paste0(" — ", title_suffix))))
  print(tab)

  # Plot
  p_title <- paste("Class Proportions —", outcome_var,
                   ifelse(is.null(title_suffix), "", paste0(" (", title_suffix, ")")))
  print(
    ggplot(tab, aes(x = .data[[outcome_var]], y = prop)) +
      geom_col() +
      geom_text(aes(label = scales::percent(prop, accuracy = 0.1)),
                vjust = -0.25, size = 3) +
      scale_y_continuous(labels = scales::percent_format()) +
      labs(x = outcome_var, y = "Proportion", title = p_title) +
      theme_minimal()
  )

  # Binary prevalence
  lvls <- levels(data[[outcome_var]])
  if (length(lvls) == 2) {
    pos_level <- if (is.null(positive)) lvls[length(lvls)] else positive
    if (!(pos_level %in% lvls)) {
      warning(sprintf("Requested positive level '%s' not found in levels: %s",
                      pos_level, paste(lvls, collapse = ", ")))
    } else {
      prev <- mean(data[[outcome_var]] == pos_level, na.rm = TRUE)
      message(sprintf("Positive class: '%s' | Prevalence: %0.2f%%", pos_level, 100 * prev))
    }
  }
}

# After imputation and BEFORE splitting
if (!is.null(outcome_var) && outcome_var %in% names(df_imputed)) {
  df_imputed <- coerce_binary_label(df_imputed, outcome_var)
  report_class_balance(df_imputed, outcome_var, title_suffix = "Full dataset")
}
## 
## Class balance for 'y' — Full dataset:
## # A tibble: 2 × 3
##   y         n  prop
##   <fct> <int> <dbl>
## 1 no    26629 0.873
## 2 yes    3859 0.127

## Positive class: 'yes' | Prevalence: 12.66%

For Report -

After removing rows with missing or “unknown” values, the dataset was reduced to 30,488 observations. The target variable exhibits a pronounced class imbalance, with the positive class representing only 12.66% of the total. To address this, we plan to apply techniques that adjust class balance within the training set, ensuring the model is not biased toward the majority class.

Completed Tasks

# All of these actions were taken above, with notes on where and what we did.
# Feature removal:
#   - Initial: features we determined do not help the model (e.g., duration)
#   - Collinearity: features we determined are collinear with other features
#   - Perfect separation: features that would cause perfect separation in the model
# Feature engineering:
#   - Binned features (age, pdays)

10) Train/Test Split

# Tidy split with rsample; stratify if you set outcome_var and it's categorical.
set.seed(123)

if (!is.null(outcome_var) && outcome_var %in% names(df_imputed)) {
  # If binary numeric (0/1), make it a factor for stratified classification
  if (is.numeric(df_imputed[[outcome_var]]) && all(na.omit(df_imputed[[outcome_var]]) %in% c(0,1))) {
    df_imputed[[outcome_var]] <- factor(df_imputed[[outcome_var]], levels = c(0,1))
  }

  if (is.factor(df_imputed[[outcome_var]])) {
    split <- rsample::initial_split(df_imputed, prop = 0.8, strata = outcome_var)  # outcome_var is a string: OK
  } else {
    split <- rsample::initial_split(df_imputed, prop = 0.8)
  }
} else {
  split <- rsample::initial_split(df_imputed, prop = 0.8)
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(outcome_var)
## 
##   # Now:
##   data %>% select(all_of(outcome_var))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
train_df <- rsample::training(split)
test_df  <- rsample::testing(split)


# Recompute split to use the coerced factor, then extract
set.seed(123)
if (!is.null(outcome_var) && outcome_var %in% names(df_imputed) && is.factor(df_imputed[[outcome_var]])) {
  split <- rsample::initial_split(df_imputed, prop = 0.8, strata = outcome_var)
} else {
  split <- rsample::initial_split(df_imputed, prop = 0.8)
}
train_df <- rsample::training(split)
test_df  <- rsample::testing(split)

str(train_df)
## tibble [24,390 × 18] (S3: tbl_df/tbl/data.frame)
##  $ age           : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 3 3 5 5 1 2 5 2 4 2 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 8 1 8 1 8 2 4 2 2 11 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 3 3 1 2 2 2 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 4 2 4 6 4 4 1 2 3 4 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 2 2 2 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 2 1 1 2 1 1 2 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ campaign      : num [1:24390] -0.559 -0.559 -0.559 -0.559 -0.559 ...
##  $ pdays         : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ previous      : num [1:24390] -0.372 -0.372 -0.372 -0.372 -0.372 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ cons_price_idx: num [1:24390] 0.804 0.804 0.804 0.804 0.804 ...
##  $ cons_conf_idx : num [1:24390] 0.877 0.877 0.877 0.877 0.877 ...
##  $ nr_employed   : num [1:24390] 0.402 0.402 0.402 0.402 0.402 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
str(test_df)
## tibble [6,098 × 18] (S3: tbl_df/tbl/data.frame)
##  $ age           : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 5 1 1 2 2 4 1 2 2 2 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 4 10 8 2 1 1 10 1 4 8 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 2 3 3 2 2 3 3 2 2 1 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 6 4 2 7 6 7 4 2 4 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ campaign      : num [1:6098] -0.559 -0.559 -0.559 -0.559 -0.559 ...
##  $ pdays         : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ previous      : num [1:6098] -0.372 -0.372 -0.372 -0.372 -0.372 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ cons_price_idx: num [1:6098] 0.804 0.804 0.804 0.804 0.804 ...
##  $ cons_conf_idx : num [1:6098] 0.877 0.877 0.877 0.877 0.877 ...
##  $ nr_employed   : num [1:6098] 0.402 0.402 0.402 0.402 0.402 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

11) Adjust Class Balance

# ---- CHOOSE ONE of the balancing methods below for train_df ----
balance_upsample <- function(train_df, outcome_var) {
  # y must be a factor
  y <- factor(train_df[[outcome_var]])
  # all predictors except the outcome, using base indexing
  x <- train_df[, setdiff(names(train_df), outcome_var), drop = FALSE]
  caret::upSample(x = x, y = y, yname = outcome_var)
}

balance_downsample <- function(train_df, outcome_var) {
  y <- factor(train_df[[outcome_var]])
  x <- train_df[, setdiff(names(train_df), outcome_var), drop = FALSE]
  caret::downSample(x = x, y = y, yname = outcome_var)
}


# Method C: SMOTE (themis + recipes; needs numeric predictors)
balance_smote <- function(train_df, outcome_var, over_ratio = 1) {
  # Build a minimal recipe: convert strings, dummy encode, SMOTE, then return original columns + oversampled rows
  rec_bal <- recipe(reformulate(termlabels = setdiff(names(train_df), outcome_var), response = outcome_var),
                    data = train_df) |>
    step_string2factor(all_nominal_predictors()) |>
    step_dummy(all_nominal_predictors()) |>
    step_smote(all_outcomes(), neighbors = 5, over_ratio = over_ratio)

  prep_bal <- prep(rec_bal, training = train_df)
  bake(prep_bal, new_data = NULL)
}

# ---- Pick your method ----
# Option 1 (common): upsample to balance
# train_df_bal <- balance_upsample(train_df, outcome_var)

# Option 2: downsample (keeps smaller dataset, faster)
train_df_bal <- balance_downsample(train_df, outcome_var)

# Option 3: SMOTE (synthesizes minority examples)
# train_df_bal <- balance_smote(train_df, outcome_var, over_ratio = 1)

# ---- Report balances ----
message("\n— Original train balance —")
## 
## — Original train balance —
report_class_balance(train_df, outcome_var, title_suffix = "Train (original)")
## 
## Class balance for 'y' — Train (original):
## # A tibble: 2 × 3
##   y         n  prop
##   <fct> <int> <dbl>
## 1 no    21303 0.873
## 2 yes    3087 0.127

## Positive class: 'yes' | Prevalence: 12.66%
message("\n— Balanced train balance —")
## 
## — Balanced train balance —
report_class_balance(train_df_bal, outcome_var, title_suffix = "Train (balanced)")
## 
## Class balance for 'y' — Train (balanced):
##     y    n prop
## 1  no 3087  0.5
## 2 yes 3087  0.5

## Positive class: 'yes' | Prevalence: 50.00%
message("\n— Test set (left untouched) —")
## 
## — Test set (left untouched) —
report_class_balance(test_df, outcome_var, title_suffix = "Test (holdout)")
## 
## Class balance for 'y' — Test (holdout):
## # A tibble: 2 × 3
##   y         n  prop
##   <fct> <int> <dbl>
## 1 no     5326 0.873
## 2 yes     772 0.127

## Positive class: 'yes' | Prevalence: 12.66%
# Optional: explicitly choose the positive class (for binary labels)
# report_class_balance(train_df, outcome_var, positive = "yes", title_suffix = "Train (yes=positive)")
str(train_df_bal)
## 'data.frame':    6174 obs. of  18 variables:
##  $ age           : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 2 2 3 4 1 3 3 3 2 2 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 1 1 2 2 9 10 1 1 10 1 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 3 2 3 2 3 3 2 3 2 3 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 4 4 6 4 4 7 7 7 6 7 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 2 2 2 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 1 1 1 1 1 1 1 1 1 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 4 4 1 7 1 4 2 7 7 2 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 4 4 1 1 5 5 4 4 5 5 ...
##  $ campaign      : num  -0.559 0.176 -0.559 -0.559 -0.559 ...
##  $ pdays         : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ previous      : num  -0.372 -0.372 -0.372 1.541 -0.372 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 1 2 2 2 2 2 2 ...
##  $ cons_price_idx: num  0.674 0.674 -0.766 -1.077 -0.766 ...
##  $ cons_conf_idx : num  -0.438 -0.438 -1.357 -1.169 -1.357 ...
##  $ nr_employed   : num  0.895 0.895 -0.821 -0.821 -0.821 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

For Report -

To address class imbalance, we applied downsampling to the majority class (no), reducing its size to match the number of minority class (yes) observations. This produced a balanced training set with 6,174 total observations (≈3,087 in each class). While more data would be preferable, the resulting dataset is both clean and balanced, providing a fair baseline for model training. If model performance proves inadequate, we may revisit this approach and explore alternatives such as upsampling, SMOTE, or class weighting to improve predictive power.
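
For completeness, a hedged sketch of the class-weighting alternative mentioned above (not used in this analysis): instead of resampling, each minority-class row receives a larger prior weight so the untouched training split can be used directly. w and glm_weighted are hypothetical names.

# Inverse-frequency case weights: "yes" rows are upweighted to offset the imbalance
w <- ifelse(train_df[[outcome_var]] == "yes",
            sum(train_df[[outcome_var]] == "no") / sum(train_df[[outcome_var]] == "yes"),
            1)

# glm accepts prior weights; R will warn about non-integer #successes,
# which is expected when case weights are fractional
glm_weighted <- glm(reformulate(setdiff(names(train_df), outcome_var), response = outcome_var),
                    data = train_df, family = binomial(), weights = w)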

12) Logistic Regression

# Make sure outcome_var is a factor with 2 levels (e.g., "yes"/"no")
if (!outcome_var %in% names(train_df_bal)) stop("Outcome not found.")
if (!is.factor(train_df_bal[[outcome_var]])) train_df_bal[[outcome_var]] <- factor(train_df_bal[[outcome_var]])

# Build formula
predictors <- setdiff(names(train_df_bal), outcome_var)
formula_glm <- as.formula(paste(outcome_var, "~", paste(predictors, collapse = " + ")))

# Fit logistic regression model
glm_model <- glm(formula_glm, data = train_df_bal, family = binomial())

# Summarize
summary(glm_model)
## 
## Call:
## glm(formula = formula_glm, family = binomial(), data = train_df_bal)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -0.669597   0.270632  -2.474 0.013353 *  
## age.L                          0.189212   0.214390   0.883 0.377476    
## age.Q                          0.322481   0.171097   1.885 0.059460 .  
## age.C                          0.010911   0.120579   0.090 0.927899    
## age^4                         -0.159969   0.092777  -1.724 0.084666 .  
## age^5                         -0.006456   0.073919  -0.087 0.930404    
## jobblue-collar                -0.045374   0.118261  -0.384 0.701216    
## jobentrepreneur               -0.109502   0.170898  -0.641 0.521690    
## jobhousemaid                   0.025936   0.230156   0.113 0.910277    
## jobmanagement                  0.139425   0.127541   1.093 0.274317    
## jobretired                     0.061537   0.204045   0.302 0.762966    
## jobself-employed               0.122627   0.175032   0.701 0.483555    
## jobservices                   -0.073262   0.123358  -0.594 0.552579    
## jobstudent                     0.188173   0.208311   0.903 0.366351    
## jobtechnician                 -0.022397   0.102394  -0.219 0.826861    
## jobunemployed                  0.009511   0.200614   0.047 0.962186    
## maritalmarried                -0.127429   0.101630  -1.254 0.209896    
## maritalsingle                 -0.096832   0.114547  -0.845 0.397916    
## educationbasic.6y              0.254799   0.190671   1.336 0.181443    
## educationbasic.9y              0.150109   0.150237   0.999 0.317721    
## educationhigh.school           0.266017   0.148276   1.794 0.072802 .  
## educationilliterate            0.007739   1.433299   0.005 0.995692    
## educationprofessional.course   0.225854   0.159455   1.416 0.156655    
## educationuniversity.degree     0.275689   0.148814   1.853 0.063943 .  
## defaultyes                   -10.381708 196.967841  -0.053 0.957965    
## housingyes                     0.013519   0.061434   0.220 0.825829    
## loanyes                       -0.020679   0.083627  -0.247 0.804693    
## contacttelephone              -0.471672   0.108402  -4.351 1.35e-05 ***
## monthaug                      -0.180896   0.169821  -1.065 0.286778    
## monthdec                       0.004059   0.398480   0.010 0.991872    
## monthjul                       0.297239   0.147225   2.019 0.043493 *  
## monthjun                       0.527266   0.147750   3.569 0.000359 ***
## monthmar                       1.147212   0.245847   4.666 3.07e-06 ***
## monthmay                      -0.553476   0.114239  -4.845 1.27e-06 ***
## monthnov                      -0.314618   0.146436  -2.149 0.031674 *  
## monthoct                       0.304656   0.226791   1.343 0.179163    
## monthsep                      -0.287104   0.262336  -1.094 0.273773    
## day_of_weekmon                -0.202924   0.099174  -2.046 0.040743 *  
## day_of_weekthu                -0.014691   0.097650  -0.150 0.880413    
## day_of_weektue                 0.006360   0.099305   0.064 0.948931    
## day_of_weekwed                 0.097268   0.098336   0.989 0.322591    
## campaign                      -0.137861   0.038152  -3.613 0.000302 ***
## pdaysRecent                    0.668007   0.390761   1.710 0.087358 .  
## previous                      -0.022018   0.071972  -0.306 0.759664    
## poutcomenonexistent            0.493956   0.183826   2.687 0.007208 ** 
## poutcomesuccess                1.447830   0.393405   3.680 0.000233 ***
## cons_price_idx                -0.097269   0.044371  -2.192 0.028368 *  
## cons_conf_idx                  0.094453   0.043628   2.165 0.030392 *  
## nr_employed                   -0.783153   0.045599 -17.175  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 8559.0  on 6173  degrees of freedom
## Residual deviance: 6529.6  on 6125  degrees of freedom
## AIC: 6627.6
## 
## Number of Fisher Scoring iterations: 10
# Predict probabilities on the test set
pred_probs <- predict(glm_model, newdata = test_df, type = "response")

# Predict classes (threshold = 0.5)
pred_class <- ifelse(pred_probs >= 0.5, "yes", "no")
pred_class <- factor(pred_class, levels = levels(test_df[[outcome_var]]))

# Confusion matrix
caret::confusionMatrix(pred_class, test_df[[outcome_var]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  4385  241
##        yes  941  531
##                                         
##                Accuracy : 0.8062        
##                  95% CI : (0.796, 0.816)
##     No Information Rate : 0.8734        
##     P-Value [Acc > NIR] : 1             
##                                         
##                   Kappa : 0.3684        
##                                         
##  Mcnemar's Test P-Value : <2e-16        
##                                         
##             Sensitivity : 0.8233        
##             Specificity : 0.6878        
##          Pos Pred Value : 0.9479        
##          Neg Pred Value : 0.3607        
##              Prevalence : 0.8734        
##          Detection Rate : 0.7191        
##    Detection Prevalence : 0.7586        
##       Balanced Accuracy : 0.7556        
##                                         
##        'Positive' Class : no            
## 

Key metrics are reported below, after tuning the decision threshold.

Tune threshold for best balanced accuracy

library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Predict probabilities on test set
pred_probs <- predict(glm_model, newdata = test_df, type = "response")

# Try thresholds from 0 to 1 by 0.01
thresholds <- seq(0, 1, by = 0.01)
balanced_accuracies <- sapply(thresholds, function(t) {
  preds <- factor(ifelse(pred_probs >= t, "yes", "no"), levels = levels(test_df[[outcome_var]]))
  cm <- confusionMatrix(preds, test_df[[outcome_var]])
  cm$byClass["Balanced Accuracy"]
})

best_thresh <- thresholds[which.max(balanced_accuracies)]
cat("Best threshold for balanced accuracy:", best_thresh, "\n")
## Best threshold for balanced accuracy: 0.53

The logistic model is more balanced (in terms of sensitivity and specificity) when it predicts “yes” for probabilities ≥ 0.53 and “no” otherwise. We recalculate the predictions and confusion matrix using the 0.53 threshold.

# Predict classes using the optimized threshold (0.53)
pred_class_053 <- ifelse(pred_probs >= 0.53, "yes", "no")
pred_class_053 <- factor(pred_class_053, levels = levels(test_df[[outcome_var]]))

# Confusion matrix
confusionMatrix(pred_class_053, test_df[[outcome_var]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  4520  257
##        yes  806  515
##                                           
##                Accuracy : 0.8257          
##                  95% CI : (0.8159, 0.8351)
##     No Information Rate : 0.8734          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3955          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8487          
##             Specificity : 0.6671          
##          Pos Pred Value : 0.9462          
##          Neg Pred Value : 0.3899          
##              Prevalence : 0.8734          
##          Detection Rate : 0.7412          
##    Detection Prevalence : 0.7834          
##       Balanced Accuracy : 0.7579          
##                                           
##        'Positive' Class : no              
## 

Model Summary – Logistic Regression

We trained a logistic regression model to predict whether a client would subscribe to a term deposit (target: yes or no), using a balanced training set to address class imbalance. After tuning, a threshold of 0.53 was found to optimize balanced accuracy.

Key Performance Metrics (on Test Set, threshold = 0.53):

Accuracy: 82.6%

Balanced Accuracy: 75.8%

AUC (ROC): 0.8075

Precision (for “no”): 94.6%

Recall (for “no”): 84.9%

Recall (for “yes”): 66.7%

Kappa: 0.40 (moderate agreement)

Threshold Selection:

The default threshold (0.5) over-predicted “yes” (941 false positives) because the model was trained on a balanced, downsampled set, which held overall accuracy down to 80.6%.

Raising the threshold to 0.53 improved accuracy (80.6% → 82.6%), balanced accuracy (75.6% → 75.8%), and recall for “no” (82.3% → 84.9%), at the cost of a small drop in recall for “yes” (68.8% → 66.7%).

Interpretation:

The model performs well overall, especially at identifying clients unlikely to subscribe (“no”). However, recall for subscribers (“yes”) remains more challenging, which is common in imbalanced datasets.

  1. ROC Curve
#install.packages("PRROC")
library(PRROC)     # For PR curve
## Warning: package 'PRROC' was built under R version 4.4.3
## Loading required package: rlang
## 
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
## 
##     %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
##     flatten_raw, invoke, splice
# ROC curve using predicted probabilities
roc_obj <- roc(test_df[[outcome_var]], pred_probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC
plot(roc_obj, col = "#2C3E50", main = "ROC Curve")
abline(a = 0, b = 1, lty = 2, col = "gray")

# AUC
auc(roc_obj)
## Area under the curve: 0.8075
  2. Precision-Recall (PR) Curve
# Orient the labels to match the scores: pred_probs estimate P(y = "yes"),
# so "yes" is treated as the positive class (1 = "yes", 0 = "no")
labels_binary <- ifelse(test_df[[outcome_var]] == "yes", 1, 0)

# PR curve object
pr_obj <- pr.curve(scores.class0 = pred_probs, weights.class0 = labels_binary, curve = TRUE)

# Plot PR curve
plot(pr_obj, main = "Precision-Recall Curve", col = "#2980B9")

  3. See Most Influential Variables
# Extract coefficients
coef_df <- summary(glm_model)$coefficients
coef_df <- as.data.frame(coef_df)
coef_df$Variable <- rownames(coef_df)

# Remove intercept
coef_df <- coef_df[coef_df$Variable != "(Intercept)", ]

# Calculate absolute value for sorting
coef_df$Importance <- abs(coef_df$Estimate)

# Sort by importance
coef_df_sorted <- coef_df[order(-coef_df$Importance), ]
  4. Visualize Top N Variables
library(ggplot2)

# Top 15 most important variables
top_n <- 15
top_vars <- head(coef_df_sorted, top_n)

ggplot(top_vars, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(title = "Top Variables by Importance in Logistic Regression",
       x = "Variable",
       y = "Absolute Coefficient (Importance)") +
  theme_minimal()

  5. Odds Ratio Interpretation
coef_df$OddsRatio <- exp(coef_df$Estimate)
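
To make the odds ratios easier to read, a small follow-up (hedged; the column selection is just for display) that lists the strongest effects in each direction:

# Odds ratios > 1 increase the odds of "yes"; < 1 decrease them
head(coef_df[order(-coef_df$OddsRatio), c("Variable", "Estimate", "OddsRatio")], 10)
head(coef_df[order(coef_df$OddsRatio), c("Variable", "Estimate", "OddsRatio")], 5)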

Key Takeaways

  1. The model is fairly accurate

It correctly predicts customer decisions about 82% of the time.

It balances well between identifying those who will say yes and those who won’t.

This means you can trust it to make smarter marketing decisions than random guessing or mass emailing.

  2. It’s especially good at identifying “no” customers

The model is very precise when it says a customer will not subscribe.

This allows the marketing team to avoid spending time and money on people unlikely to respond.

  3. It still finds “yes” customers well

The model captures about 67% of actual “yes” responders, which is solid.

You won’t catch every positive responder, but you’ll get many — and save resources in the process.

  4. You can use the model to segment and prioritize

Score your entire customer list using the model.

Focus high-effort campaigns (calls, meetings) on high-score leads.

Use low-cost methods (emails, ads) for the rest — or not at all.

  5. Campaign timing matters

Based on your model’s coefficients, people contacted in March, June, and July are significantly more likely to say “yes” (October’s coefficient is also positive, but not statistically significant).

Conversely, May and November are weaker months.

This insight can shape when to run your campaigns for better results.

  6. Certain customer types are more likely to respond

For example:

Students tend to respond somewhat better.

Blue-collar workers tend to respond somewhat less.

(The individual job coefficients are not statistically significant on their own, so treat these as directional signals rather than firm differences.)

Use this to refine your target audience and messaging.

13) LDA
# 13) Linear Discriminant Analysis (LDA) ------------------------------------

# Ensure outcome_var is a factor with 2 levels (e.g., "yes"/"no")
if (!outcome_var %in% names(train_df_bal)) stop("Outcome not found.")
if (!is.factor(train_df_bal[[outcome_var]])) train_df_bal[[outcome_var]] <- factor(train_df_bal[[outcome_var]])

# Create a recipe for LDA
lda_recipe <- recipe(as.formula(paste(outcome_var, "~ .")), data = train_df_bal) |>
  step_string2factor(all_nominal_predictors()) |> # Ensure nominal predictors are factors
  step_dummy(all_nominal_predictors(), one_hot = TRUE) |> # Create dummy variables
  step_zv(all_predictors()) |> # Remove zero-variance predictors
  step_lincomb(all_predictors()) # Remove linear combinations of predictors

# Prepare the recipe
prepared_lda_recipe <- prep(lda_recipe, training = train_df_bal)

# Bake the recipe to get processed training data
train_df_processed <- bake(prepared_lda_recipe, new_data = train_df_bal)

# Fit LDA model using the processed training data
# Now that we have handled collinearity and zero-variance in the recipe,
# the model formula can simply use all columns in the processed data, excluding the outcome.
processed_predictors <- setdiff(names(train_df_processed), outcome_var)
formula_lda_processed <- as.formula(paste(outcome_var, "~", paste(processed_predictors, collapse = " + ")))

lda_model <- MASS::lda(formula_lda_processed, data = train_df_processed)
## Warning in lda.default(x, grouping, ...): variables are collinear
# Summarize model (optional, but good practice)
print(lda_model)
## Call:
## lda(formula_lda_processed, data = train_df_processed)
## 
## Prior probabilities of groups:
##  no yes 
## 0.5 0.5 
## 
## Group means:
##        campaign    previous cons_price_idx cons_conf_idx nr_employed      age_1
## no   0.06577036 -0.09525145     0.08216115   -0.01239235   0.1600260 0.03854875
## yes -0.17788506  0.58820879    -0.34477278    0.16553153  -0.9473195 0.07353418
##         age_2     age_3     age_4      age_5      age_6 job_admin.
## no  0.4020084 0.3190800 0.1736314 0.05636540 0.01036605  0.2883058
## yes 0.4023324 0.2261095 0.1464205 0.09135083 0.06025267  0.3135730
##     job_blue.collar job_entrepreneur job_housemaid job_management job_retired
## no        0.1872368       0.04308390    0.02202786     0.07126660   0.0356333
## yes       0.1149984       0.02753482    0.02332362     0.07515387   0.0936184
##     job_self.employed job_services job_student job_technician marital_divorced
## no         0.03271785   0.09588597  0.01684483      0.1839974        0.1072238
## yes        0.03304179   0.07126660  0.05344995      0.1606738        0.1068999
##     marital_married education_basic.4y education_basic.6y education_basic.9y
## no        0.5983155         0.07450599         0.04794299         0.14318108
## yes       0.5325559         0.08487204         0.03466148         0.09912536
##     education_high.school education_illiterate education_professional.course
## no              0.2513767         0.0003239391                     0.1460965
## yes             0.2458698         0.0003239391                     0.1373502
##     default_no housing_no   loan_no contact_cellular  month_apr month_aug
## no   0.9996761  0.4680920 0.8441853        0.6365403 0.06835115 0.1587302
## yes  1.0000000  0.4431487 0.8471007        0.8474247 0.11985747 0.1347587
##       month_dec month_jul month_jun   month_mar month_may  month_nov  month_oct
## no  0.002915452 0.1775186 0.1124069 0.007450599 0.3388403 0.11013929 0.01490120
## yes 0.017168772 0.1347587 0.1169420 0.060576612 0.1836735 0.09750567 0.07547781
##     day_of_week_fri day_of_week_mon day_of_week_thu day_of_week_tue pdays_Never
## no        0.1823777       0.2115322       0.2082928       0.1979268   0.9844509
## yes       0.1746032       0.1823777       0.2299968       0.2079689   0.7862002
##     poutcome_failure poutcome_nonexistent
## no         0.1104632            0.8769031
## yes        0.1273081            0.6702300
## 
## Coefficients of linear discriminants:
##                                        LD1
## campaign                      -0.104198813
## previous                      -0.007406518
## cons_price_idx                -0.145075705
## cons_conf_idx                  0.038655046
## nr_employed                   -0.746021192
## age_1                          0.068154319
## age_2                          0.049992629
## age_3                         -0.120858845
## age_4                         -0.081347220
## age_5                          0.157315450
## age_6                          0.248382349
## job_admin.                    -0.010557504
## job_blue.collar               -0.074725773
## job_entrepreneur              -0.115812809
## job_housemaid                 -0.018850220
## job_management                 0.093823304
## job_retired                    0.018309026
## job_self.employed              0.081087221
## job_services                  -0.080150442
## job_student                    0.120153724
## job_technician                -0.034519160
## marital_divorced               0.074527677
## marital_married               -0.019510822
## education_basic.4y            -0.212497385
## education_basic.6y            -0.027882082
## education_basic.9y            -0.097681982
## education_high.school         -0.015698170
## education_illiterate          -0.371525252
## education_professional.course -0.040757811
## default_no                     1.053101096
## housing_no                    -0.010252221
## loan_no                        0.015871650
## contact_cellular               0.301013852
## month_apr                      0.295969606
## month_aug                      0.163234401
## month_dec                      0.247678965
## month_jul                      0.530769219
## month_jun                      0.697915853
## month_mar                      0.923044626
## month_may                     -0.250012984
## month_nov                     -0.033159217
## month_oct                      0.340268503
## day_of_week_fri               -0.083486164
## day_of_week_mon               -0.236194121
## day_of_week_thu               -0.082416006
## day_of_week_tue               -0.079508135
## pdays_Never                   -0.535514319
## poutcome_failure              -0.647090026
## poutcome_nonexistent          -0.164912163
# Bake the recipe to get processed test data
test_df_processed <- bake(prepared_lda_recipe, new_data = test_df)

# Predict on the test set using the processed test data
lda_pred <- predict(lda_model, newdata = test_df_processed)

# The predicted class is in lda_pred$class
lda_pred_class <- lda_pred$class

# Evaluate on the test set using confusion matrix
library(caret) # Make sure caret is loaded

confusionMatrix(lda_pred_class, test_df[[outcome_var]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  4306  232
##        yes 1020  540
##                                           
##                Accuracy : 0.7947          
##                  95% CI : (0.7843, 0.8048)
##     No Information Rate : 0.8734          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3536          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8085          
##             Specificity : 0.6995          
##          Pos Pred Value : 0.9489          
##          Neg Pred Value : 0.3462          
##              Prevalence : 0.8734          
##          Detection Rate : 0.7061          
##    Detection Prevalence : 0.7442          
##       Balanced Accuracy : 0.7540          
##                                           
##        'Positive' Class : no              
## 

LDA Model Summary:

Prior probabilities of groups: This shows that the model was trained on a balanced dataset where both classes (‘no’ and ‘yes’) have equal prior probabilities (0.5 each).

Group means: This table shows the mean value of each predictor for each of the two classes (‘no’ and ‘yes’). These means are used to calculate the linear discriminant function. You can see which variables have different means between the two groups.

For example, nr_employed has a much lower mean for the ‘yes’ group (-0.947) compared to the ‘no’ group (0.160), suggesting it’s an important differentiator.

Coefficients of linear discriminants: These are the weights assigned to each predictor in the linear discriminant function (LD1). The sign and magnitude of each coefficient indicate the direction and strength of that predictor’s contribution to the function separating the two classes; a larger absolute value means a stronger contribution. For instance, nr_employed has a large negative coefficient (-0.746), indicating that lower values of nr_employed are associated with the ‘yes’ class.
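
To make the role of these weights concrete: an observation’s LD1 score is its predictor vector, centered at the prior-weighted group means, multiplied by lda_model$scaling. A minimal check against predict(), assuming train_df_processed from above:

# Reconstruct LD1 by hand and compare with predict()'s discriminant scores.
X <- as.matrix(train_df_processed[, colnames(lda_model$means)])
grand_mean <- colSums(lda_model$prior * lda_model$means)   # prior-weighted center
ld1_manual <- scale(X, center = grand_mean, scale = FALSE) %*% lda_model$scaling

ld1_predict <- predict(lda_model, newdata = train_df_processed)$x[, 1]
head(cbind(manual = ld1_manual[, 1], predicted = ld1_predict))   # should agree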

Confusion Matrix and Statistics:

Accuracy (0.7947): The model correctly predicts the class about 79.47% of the time.

No Information Rate (0.8734): The accuracy you would get by always predicting the majority class in the original test set (which is imbalanced).

The LDA model’s accuracy is lower than this, similar to the logistic regression model.

Kappa (0.3536): This indicates fair agreement beyond chance. It’s slightly lower than the logistic regression model’s Kappa.

Sensitivity (0.8085): For the ‘no’ class (the positive class in this output), the model correctly identifies about 80.85% of actual ‘no’ cases.

Specificity (0.6995): The model correctly identifies about 69.95% of actual ‘yes’ cases (with ‘no’ as the positive class, specificity is the true positive rate for ‘yes’).

This is slightly better at identifying ‘yes’ cases compared to the default threshold logistic regression, but similar to the tuned logistic regression.

Pos Pred Value (0.9489): When the model predicts ‘no’, it is correct about 94.89% of the time.

Neg Pred Value (0.3462): When the model predicts ‘yes’, it is correct only about 34.62% of the time. This is low, similar to the logistic regression model with a default threshold, indicating a notable number of false positives for the ‘yes’ class.

Balanced Accuracy (0.7540): The average of sensitivity and specificity, providing a better measure of performance for imbalanced datasets.

This is comparable to the balanced accuracy of the logistic regression model with the optimized threshold.
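
For reference, balanced accuracy is just the mean of the two rates above:

# Balanced accuracy = (sensitivity + specificity) / 2
(0.8085 + 0.6995) / 2   # 0.754, matching the confusionMatrix() output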

In summary:

The LDA model’s performance metrics are quite similar to those of the logistic regression model, particularly the balanced accuracy.

It is also better at identifying ‘no’ cases than ‘yes’ cases, which is expected given the class imbalance in the original dataset, despite being trained on a balanced set.

The ‘variables are collinear’ warning from lda() suggests some residual collinearity among the dummy-coded predictors, even after the recipe’s step_zv() and step_lincomb() handling.

This could potentially impact the interpretation of individual coefficients but often has less effect on predictive performance.
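
To see which of the baked predictors are behind that warning, one quick check is caret’s findLinearCombos() and findCorrelation(); a sketch, assuming train_df_processed and processed_predictors from above:

# Exact linear dependencies among the processed predictors
X_chk <- as.matrix(train_df_processed[, processed_predictors])
combos <- caret::findLinearCombos(X_chk)
processed_predictors[combos$remove]   # columns caret suggests dropping (may be none)

# Near-collinearity: columns involved in very high pairwise correlations
high_cor <- caret::findCorrelation(cor(X_chk), cutoff = 0.9)
processed_predictors[high_cor]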

#install.packages("PRROC")
library(PRROC)     # PR-curve functions; roc()/auc() below come from pROC
# ROC curve built from the predicted class labels (a probability-based version is sketched after the AUC output)
roc_lda <- roc(test_df[[outcome_var]], as.numeric(lda_pred_class))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC
plot(roc_lda, col = "#2C3E50", main = "ROC Curve (LDA)")
abline(a = 0, b = 1, lty = 2, col = "gray")

# AUC
auc(roc_lda)
## Area under the curve: 0.754
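
Because the ROC above is built from hard class labels, its AUC simply equals the balanced accuracy. A probability-based curve, using the posteriors that predict() already returned, is usually more informative; a minimal sketch:

# ROC/AUC from the LDA posterior probability of "yes" rather than the 0/1 labels
lda_prob_yes <- lda_pred$posterior[, "yes"]
roc_lda_prob <- roc(test_df[[outcome_var]], lda_prob_yes)
plot(roc_lda_prob, col = "#2C3E50", main = "ROC Curve (LDA, posterior probabilities)")
auc(roc_lda_prob)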

See Most Influential Variables (LDA)

# Extract coefficients from LDA model
# LDA coefficients are in lda_model$scaling
# The names of the variables are the row names
coef_df_lda <- as.data.frame(lda_model$scaling)
coef_df_lda$Variable <- rownames(coef_df_lda)

# LDA only produces one linear discriminant (LD1) for binary classification
# Use the absolute value of the LD1 coefficient as importance
coef_df_lda$Importance <- abs(coef_df_lda$LD1)

# Sort by importance
coef_df_lda_sorted <- coef_df_lda[order(-coef_df_lda$Importance), ]

# Print sorted coefficients (optional)
# print(coef_df_lda_sorted)

Visualize Top N Variables (LDA)

library(ggplot2)

# Top 15 most important variables
top_n <- 15
top_vars_lda <- head(coef_df_lda_sorted, top_n)

ggplot(top_vars_lda, aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "darkred") +
  coord_flip() +
  labs(title = "Top Variables by Importance in LDA Model",
       x = "Variable",
       y = "Absolute Coefficient (Importance)") +
  theme_minimal()

14) Random Forest

#install.packages(c("randomForest", "caret"))
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(caret)

# Ensure outcome_var is a factor with 2 levels
if (!outcome_var %in% names(train_df_bal)) stop("Outcome not found.")
if (!is.factor(train_df_bal[[outcome_var]])) train_df_bal[[outcome_var]] <- factor(train_df_bal[[outcome_var]])

# Build formula
predictors_rf <- setdiff(names(train_df_bal), outcome_var)
formula_rf <- as.formula(paste(outcome_var, "~", paste(predictors_rf, collapse = " + ")))

# Train Random Forest model
# importance = TRUE stores variable importance measures
# proximity = FALSE keeps memory use down; set it to TRUE if you need the proximity matrix later
set.seed(123) # for reproducibility
rf_model <- randomForest(formula_rf, data = train_df_bal, importance = TRUE, proximity = FALSE)

# Print the model summary
print(rf_model)
## 
## Call:
##  randomForest(formula = formula_rf, data = train_df_bal, importance = TRUE,      proximity = FALSE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 25.72%
## Confusion matrix:
##       no  yes class.error
## no  2515  572   0.1852932
## yes 1016 2071   0.3291221
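
mtry defaulted to 4 here (roughly the square root of the number of predictors). To check whether a different value lowers the OOB error, randomForest::tuneRF offers a quick search; a sketch (not run above), assuming train_df_bal and predictors_rf from this chunk:

# Search over mtry values around the default, using OOB error as the criterion
set.seed(123)
tune_res <- randomForest::tuneRF(
  x = train_df_bal[, predictors_rf],
  y = train_df_bal[[outcome_var]],
  ntreeTry   = 200,    # fewer trees per candidate keeps the search fast
  stepFactor = 1.5,    # multiply/divide mtry by this factor at each step
  improve    = 0.01,   # keep stepping while OOB error improves by at least 1%
  trace = TRUE, plot = FALSE
)
tune_res   # matrix of mtry values and their OOB error estimates
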
# Predict on the test set
# type="prob" gives class probabilities
rf_pred_probs <- predict(rf_model, newdata = test_df, type = "prob")

# Predict class labels (majority vote across trees, i.e., the default 0.5 cutoff for two classes)
rf_pred_class <- predict(rf_model, newdata = test_df, type = "class")

# Evaluate on the test set using confusion matrix
confusionMatrix(rf_pred_class, test_df[[outcome_var]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  4327  235
##        yes  999  537
##                                           
##                Accuracy : 0.7976          
##                  95% CI : (0.7873, 0.8077)
##     No Information Rate : 0.8734          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.357           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8124          
##             Specificity : 0.6956          
##          Pos Pred Value : 0.9485          
##          Neg Pred Value : 0.3496          
##              Prevalence : 0.8734          
##          Detection Rate : 0.7096          
##    Detection Prevalence : 0.7481          
##       Balanced Accuracy : 0.7540          
##                                           
##        'Positive' Class : no              
## 

For Report - Random Forest Model Summary

# Variable Importance Plot (MeanDecreaseGini)
varImpPlot(rf_model, main = "Random Forest Variable Importance")

# Get variable importance table
importance_rf <- as.data.frame(importance(rf_model))
importance_rf$Variable <- rownames(importance_rf)

# Sort by MeanDecreaseGini (for classification)
importance_rf_sorted <- importance_rf[order(-importance_rf$`MeanDecreaseGini`), ]

message("\nTop 15 most important variables (Mean Decrease Gini):")
## 
## Top 15 most important variables (Mean Decrease Gini):
print(head(importance_rf_sorted[, c("Variable", "MeanDecreaseGini")], 15))
##                      Variable MeanDecreaseGini
## nr_employed       nr_employed        382.57248
## job                       job        259.98424
## day_of_week       day_of_week        212.31636
## campaign             campaign        190.15857
## education           education        176.97971
## month                   month        160.42302
## age                       age        153.23079
## cons_conf_idx   cons_conf_idx        146.89840
## cons_price_idx cons_price_idx        133.43458
## poutcome             poutcome        106.47407
## marital               marital         95.06371
## pdays                   pdays         90.05811
## housing               housing         69.05608
## contact               contact         60.68856
## loan                     loan         54.47768
#install.packages("PRROC")
library(PRROC)     # PR-curve functions; roc()/auc() below come from pROC
# ROC curve built from the predicted class labels (a probability-based version using rf_pred_probs is sketched below)
roc_rf <- roc(test_df[[outcome_var]], as.numeric(rf_pred_class))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC
plot(roc_rf, col = "#2C3E50", main = "ROC Curve (RF)")
abline(a = 0, b = 1, lty = 2, col = "gray")

# AUC
auc(roc_rf)
## Area under the curve: 0.754
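
As with LDA, the ROC above uses hard class labels. The class probabilities already stored in rf_pred_probs give a smoother, more informative curve; a minimal sketch:

# ROC/AUC from the random forest's predicted probability of "yes"
roc_rf_prob <- roc(test_df[[outcome_var]], rf_pred_probs[, "yes"])
plot(roc_rf_prob, col = "#2C3E50", main = "ROC Curve (RF, predicted probabilities)")
auc(roc_rf_prob)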