# install.packages(c(
# "GGally",
# "skimr",
# "recipes",
# "janitor",
# "DataExplorer",
# "caret",
# "rsample",
# "themis",
# "car"
# ))
#### 1) Libraries
library(MASS)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
library(skimr)
## Warning: package 'skimr' was built under R version 4.4.3
library(readxl)
library(readr)
library(tidyverse)
library(recipes)
## Warning: package 'recipes' was built under R version 4.4.3
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(janitor)
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.4.3
library(dplyr)
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(themis)
## Warning: package 'themis' was built under R version 4.4.3
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
# getwd()
# list.files()
data_path <- "C:/Users/sgarc/OneDrive/Documents/R_Studio_Folders/DA6813/Case_Study_1/bank-additional-full.csv" # <-- change me
outcome_var <- "y" # e.g., "label" if you plan to model
delim = ";"
load_data <- function(path) {
ext <- tolower(tools::file_ext(path))
# Helper to auto-detect delimiter for CSV-like files
detect_and_read <- function(file) {
# Peek at the first line
first_line <- readLines(file, n = 1)
if (grepl(";", first_line)) {
message("Detected semicolon delimiter.")
readr::read_delim(file, delim = ";", show_col_types = FALSE)
} else if (grepl("\t", first_line)) {
message("Detected tab delimiter.")
readr::read_tsv(file, show_col_types = FALSE)
} else {
message("Detected comma delimiter.")
readr::read_csv(file, show_col_types = FALSE)
}
}
df <- switch(ext,
"csv" = detect_and_read(path),
"tsv" = readr::read_tsv(path, show_col_types = FALSE),
"xlsx" = readxl::read_excel(path),
"xls" = readxl::read_excel(path),
# For parquet, uncomment below and add "arrow" to pkgs
# "parquet" = arrow::read_parquet(path),
stop("Unsupported file type: ", ext)
)
df |> janitor::clean_names()
}
df <- load_data(data_path)
## Detected semicolon delimiter.
# print(df)
message("Structure:")
## Structure:
str(df)
## spc_tbl_ [41,188 × 21] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:41188] 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr [1:41188] "housemaid" "services" "services" "admin." ...
## $ marital : chr [1:41188] "married" "married" "married" "married" ...
## $ education : chr [1:41188] "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr [1:41188] "no" "unknown" "no" "no" ...
## $ housing : chr [1:41188] "no" "no" "yes" "no" ...
## $ loan : chr [1:41188] "no" "no" "no" "no" ...
## $ contact : chr [1:41188] "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr [1:41188] "may" "may" "may" "may" ...
## $ day_of_week : chr [1:41188] "mon" "mon" "mon" "mon" ...
## $ duration : num [1:41188] 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : num [1:41188] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : num [1:41188] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : num [1:41188] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr [1:41188] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp_var_rate : num [1:41188] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons_price_idx: num [1:41188] 94 94 94 94 94 ...
## $ cons_conf_idx : num [1:41188] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num [1:41188] 4.86 4.86 4.86 4.86 4.86 ...
## $ nr_employed : num [1:41188] 5191 5191 5191 5191 5191 ...
## $ y : chr [1:41188] "no" "no" "no" "no" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. job = col_character(),
## .. marital = col_character(),
## .. education = col_character(),
## .. default = col_character(),
## .. housing = col_character(),
## .. loan = col_character(),
## .. contact = col_character(),
## .. month = col_character(),
## .. day_of_week = col_character(),
## .. duration = col_double(),
## .. campaign = col_double(),
## .. pdays = col_double(),
## .. previous = col_double(),
## .. poutcome = col_character(),
## .. emp.var.rate = col_double(),
## .. cons.price.idx = col_double(),
## .. cons.conf.idx = col_double(),
## .. euribor3m = col_double(),
## .. nr.employed = col_double(),
## .. y = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
message("\nFirst 10 rows:")
##
## First 10 rows:
print(head(df, 10))
## # A tibble: 10 × 21
## age job marital education default housing loan contact month day_of_week
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 56 hous… married basic.4y no no no teleph… may mon
## 2 57 serv… married high.sch… unknown no no teleph… may mon
## 3 37 serv… married high.sch… no yes no teleph… may mon
## 4 40 admi… married basic.6y no no no teleph… may mon
## 5 56 serv… married high.sch… no no yes teleph… may mon
## 6 45 serv… married basic.9y unknown no no teleph… may mon
## 7 59 admi… married professi… no no no teleph… may mon
## 8 41 blue… married unknown unknown no no teleph… may mon
## 9 24 tech… single professi… no yes no teleph… may mon
## 10 25 serv… single high.sch… no yes no teleph… may mon
## # ℹ 11 more variables: duration <dbl>, campaign <dbl>, pdays <dbl>,
## # previous <dbl>, poutcome <chr>, emp_var_rate <dbl>, cons_price_idx <dbl>,
## # cons_conf_idx <dbl>, euribor3m <dbl>, nr_employed <dbl>, y <chr>
message("\nskimr summary:")
##
## skimr summary:
print(skimr::skim(df))
## ── Data Summary ────────────────────────
## Values
## Name df
## Number of rows 41188
## Number of columns 21
## _______________________
## Column type frequency:
## character 11
## numeric 10
## ________________________
## Group variables None
##
## ── Variable type: character ────────────────────────────────────────────────────
## skim_variable n_missing complete_rate min max empty n_unique whitespace
## 1 job 0 1 6 13 0 12 0
## 2 marital 0 1 6 8 0 4 0
## 3 education 0 1 7 19 0 8 0
## 4 default 0 1 2 7 0 3 0
## 5 housing 0 1 2 7 0 3 0
## 6 loan 0 1 2 7 0 3 0
## 7 contact 0 1 8 9 0 2 0
## 8 month 0 1 3 3 0 10 0
## 9 day_of_week 0 1 3 3 0 5 0
## 10 poutcome 0 1 7 11 0 3 0
## 11 y 0 1 2 3 0 2 0
##
## ── Variable type: numeric ──────────────────────────────────────────────────────
## skim_variable n_missing complete_rate mean sd p0 p25
## 1 age 0 1 40.0 10.4 17 32
## 2 duration 0 1 258. 259. 0 102
## 3 campaign 0 1 2.57 2.77 1 1
## 4 pdays 0 1 962. 187. 0 999
## 5 previous 0 1 0.173 0.495 0 0
## 6 emp_var_rate 0 1 0.0819 1.57 -3.4 -1.8
## 7 cons_price_idx 0 1 93.6 0.579 92.2 93.1
## 8 cons_conf_idx 0 1 -40.5 4.63 -50.8 -42.7
## 9 euribor3m 0 1 3.62 1.73 0.634 1.34
## 10 nr_employed 0 1 5167. 72.3 4964. 5099.
## p50 p75 p100 hist
## 1 38 47 98 ▅▇▃▁▁
## 2 180 319 4918 ▇▁▁▁▁
## 3 2 3 56 ▇▁▁▁▁
## 4 999 999 999 ▁▁▁▁▇
## 5 0 0 7 ▇▁▁▁▁
## 6 1.1 1.4 1.4 ▁▃▁▁▇
## 7 93.7 94.0 94.8 ▁▆▃▇▂
## 8 -41.8 -36.4 -26.9 ▅▇▁▇▁
## 9 4.86 4.96 5.04 ▅▁▁▁▇
## 10 5191 5228. 5228. ▁▁▃▁▇
message("\ndplyr summary:")
##
## dplyr summary:
dplyr::glimpse(df)
## Rows: 41,188
## Columns: 21
## $ age <dbl> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
## $ job <chr> "housemaid", "services", "services", "admin.", "service…
## $ marital <chr> "married", "married", "married", "married", "married", …
## $ education <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
## $ default <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
## $ housing <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
## $ loan <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
## $ contact <chr> "telephone", "telephone", "telephone", "telephone", "te…
## $ month <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
## $ day_of_week <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
## $ duration <dbl> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
## $ campaign <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ pdays <dbl> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
## $ previous <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ poutcome <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
## $ emp_var_rate <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
## $ cons_price_idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
## $ cons_conf_idx <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
## $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
## $ nr_employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…
The dataset contains 21 features. Of these, 10 are numeric and 11 are character variables, including the target label. Since the character fields represent categorical information, we will convert them into factors to take advantage of R’s handling of levels.
We plan to bin both age and pdays. In particular, the binning strategy for pdays requires careful consideration, as it contains a special value (999) indicating “never previously contacted.”
In addition, based on a review of the data catalog, we will remove the duration feature. Although highly predictive, it provides near-perfect separation of the target and would bias the model if retained.
message("\nMissing or 'unknown' values per column:")
##
## Missing or 'unknown' values per column:
na_summary <- df |>
summarise(across(everything(),
~ sum(is.na(.) | (is.character(.) | is.factor(.)) & tolower(as.character(.)) == "unknown"),
.names = "{.col}_na_unknown")) |>
pivot_longer(everything(),
names_to = "column",
values_to = "missing_or_unknown") |>
arrange(desc(missing_or_unknown))
print(na_summary, n = Inf, width = Inf)
## # A tibble: 21 × 2
## column missing_or_unknown
## <chr> <int>
## 1 default_na_unknown 8597
## 2 education_na_unknown 1731
## 3 housing_na_unknown 990
## 4 loan_na_unknown 990
## 5 job_na_unknown 330
## 6 marital_na_unknown 80
## 7 age_na_unknown 0
## 8 contact_na_unknown 0
## 9 month_na_unknown 0
## 10 day_of_week_na_unknown 0
## 11 duration_na_unknown 0
## 12 campaign_na_unknown 0
## 13 pdays_na_unknown 0
## 14 previous_na_unknown 0
## 15 poutcome_na_unknown 0
## 16 emp_var_rate_na_unknown 0
## 17 cons_price_idx_na_unknown 0
## 18 cons_conf_idx_na_unknown 0
## 19 euribor3m_na_unknown 0
## 20 nr_employed_na_unknown 0
## 21 y_na_unknown 0
# Each week we may need to enhance this based on the case study
# For example in Lab 1 Unknown is used extensively
# also want to show alll of the values in the tibble
# these columns have rows we are going to purge because of missing values.
cols_to_drop_unknown <- c("default", "education", "housing", "loan", "job", "marital")
message("\nCleaning the unknown and missing values:")
##
## Cleaning the unknown and missing values:
df_clean <- df %>%
filter(
if_all(all_of(cols_to_drop_unknown),
~ !is.na(.) & str_to_lower(as.character(.)) != "unknown")
) %>%
# reset factor levels if any of these are factors
mutate(across(all_of(cols_to_drop_unknown), ~ droplevels(factor(.))))
str(df_clean)
## tibble [30,488 × 21] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:30488] 56 37 40 56 59 24 25 25 29 57 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 1 8 1 10 8 8 2 4 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 3 3 3 1 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 2 4 6 6 4 4 4 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 2 2 1 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 2 1 ...
## $ contact : chr [1:30488] "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr [1:30488] "may" "may" "may" "may" ...
## $ day_of_week : chr [1:30488] "mon" "mon" "mon" "mon" ...
## $ duration : num [1:30488] 261 226 151 307 139 380 50 222 137 293 ...
## $ campaign : num [1:30488] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : num [1:30488] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : num [1:30488] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr [1:30488] "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp_var_rate : num [1:30488] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons_price_idx: num [1:30488] 94 94 94 94 94 ...
## $ cons_conf_idx : num [1:30488] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num [1:30488] 4.86 4.86 4.86 4.86 4.86 ...
## $ nr_employed : num [1:30488] 5191 5191 5191 5191 5191 ...
## $ y : chr [1:30488] "no" "no" "no" "no" ...
The dataset contains rows with null or “unknown” values, particularly in the default and education features. Given the large sample size, we chose to remove records with missing or unknown values rather than impute them at this stage. If model performance proves unsatisfactory, we may revisit this decision and explore alternative treatments such as imputation or separate category encoding.
# Heuristic: treat character columns as factors.
# If you have integer-coded categories (e.g., 0/1/2) list them explicitly.
char_cols <- names(df_clean)[vapply(df_clean, is.character, logical(1))]
if (length(char_cols)) {
df_clean <- df_clean |>
mutate(across(all_of(char_cols), ~as.factor(.)))
}
# Example: force specific integer columns to factor
# df_clean <- df_clean |>
# mutate(across(c("zip_group","homeowner","gender"), ~as.factor(.)))
message("\nEnsuring factors are used where appropriate:")
##
## Ensuring factors are used where appropriate:
str(df_clean)
## tibble [30,488 × 21] (S3: tbl_df/tbl/data.frame)
## $ age : num [1:30488] 56 37 40 56 59 24 25 25 29 57 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 1 8 1 10 8 8 2 4 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 3 3 3 1 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 2 4 6 6 4 4 4 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 2 2 1 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 2 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : num [1:30488] 261 226 151 307 139 380 50 222 137 293 ...
## $ campaign : num [1:30488] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : num [1:30488] 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : num [1:30488] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp_var_rate : num [1:30488] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons_price_idx: num [1:30488] 94 94 94 94 94 ...
## $ cons_conf_idx : num [1:30488] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num [1:30488] 4.86 4.86 4.86 4.86 4.86 ...
## $ nr_employed : num [1:30488] 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
df_clean <- df_clean %>%
# Replace age with bucketed ordered factor
mutate(age = cut(age,
breaks = c(0, 25, 35, 45, 55, 65, Inf),
labels = c("<=25", "26-35", "36-45", "46-55", "56-65", "65+"),
right = TRUE, include.lowest = TRUE,
ordered_result = TRUE),
# Replace pdays with 4-category unordered factor
pdays = case_when(
pdays == 999 ~ "Never",
pdays >= 1 & pdays <= 30 ~ "Recent",
pdays > 30 & pdays <= 180 ~ "Intermediate",
pdays > 180 ~ "LongAgo"
),
pdays = factor(pdays,
levels = c("Never", "Recent", "Intermediate", "LongAgo"),
ordered = FALSE)) %>%
# Drop duration
select(-duration)
message("\nColumn removal and binning:")
##
## Column removal and binning:
str(df_clean)
## tibble [30,488 × 20] (S3: tbl_df/tbl/data.frame)
## $ age : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 5 3 3 5 5 1 1 1 2 5 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 4 8 1 8 1 10 8 8 2 4 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 2 3 3 3 3 1 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 4 2 4 6 6 4 4 4 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 2 2 1 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 2 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ campaign : num [1:30488] 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ previous : num [1:30488] 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ emp_var_rate : num [1:30488] 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons_price_idx: num [1:30488] 94 94 94 94 94 ...
## $ cons_conf_idx : num [1:30488] -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num [1:30488] 4.86 4.86 4.86 4.86 4.86 ...
## $ nr_employed : num [1:30488] 5191 5191 5191 5191 5191 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
We identified two numeric features for binning and conversion to categorical variables. The pdays feature was transformed into a standard (unordered) factor, since the “Never” category represents a qualitatively different condition than elapsed days. In contrast, age was converted into an ordered factor, preserving the natural progression across age groups and retaining some ordinal information in the data.
# - Numeric: median impute
# - Categorical (factor/character): most-frequent (mode) impute
# NOTE: This builds a preprocessing recipe; you can bake() it to get the imputed data.
rec <- recipe(~ ., data = df_clean) |>
# ensure character -> factor before mode impute
step_string2factor(all_nominal()) |>
step_impute_median(all_numeric_predictors()) |>
step_impute_mode(all_nominal_predictors()) |>
step_normalize(all_numeric_predictors())
prep_rec <- prep(rec, training = df_clean)
df_imputed <- bake(prep_rec, new_data = NULL)
We used the recipes package to streamline preprocessing. All nominal features were converted to factors, numeric predictors were set to be imputed with the median (though no missing values remained), and nominal predictors were set to be imputed with the mode (again, no missing values were present). Finally, all numeric features were standardized using z-scores to ensure comparability across predictors.
# Numeric distributions (histograms)
DataExplorer::plot_histogram(df_imputed, nrow = 3L, ncol = 3L, title = "Numeric Distributions")
# Categorical counts (bar plots)
DataExplorer::plot_bar(df_imputed, title = "Categorical Counts")
# Boxplots of numerics by a categorical (choose one factor)
# Pick a reasonable categorical feature:
candidate_factors <- names(df_imputed)[vapply(df_imputed, is.factor, logical(1))]
candidate_nums <- names(df_imputed)[vapply(df_imputed, is.numeric, logical(1))]
if (length(candidate_factors) >= 1 && length(candidate_nums) >= 1) {
cat_feat <- candidate_factors[1]
num_feat <- candidate_nums[1]
message(paste0("\nBoxplot example: ", num_feat, " by ", cat_feat))
print(
ggplot(df_imputed, aes(x = .data[[cat_feat]], y = .data[[num_feat]])) +
geom_boxplot() +
labs(x = cat_feat, y = num_feat, title = paste(num_feat, "by", cat_feat)) +
theme_minimal()
)
}
##
## Boxplot example: campaign by age
# Optional: Pairwise plots for a small numeric subset
if (length(candidate_nums) >= 3) {
GGally::ggpairs(df_imputed[, candidate_nums[1:min(25, length(candidate_nums))]])
}
# 1) Select clean numeric predictors
candidate_nums <- names(df_imputed)[vapply(df_imputed, function(x) is.numeric(x) && all(is.finite(x) | is.na(x)), logical(1))]
# Remove zero/near-zero variance columns (helps cor & VIF)
if (length(candidate_nums)) {
nzv_idx <- caret::nearZeroVar(df_imputed[, candidate_nums, drop = FALSE], saveMetrics = TRUE)
keep_numeric <- rownames(nzv_idx)[!nzv_idx$nzv]
candidate_nums <- intersect(candidate_nums, keep_numeric)
}
# ---------- (A) Correlation filter ----------
if (length(candidate_nums) >= 2) {
# Compute correlation with pairwise handling; coerce to matrix
cor_mat <- suppressWarnings(cor(as.matrix(df_imputed[, candidate_nums, drop = FALSE]),
use = "pairwise.complete.obs"))
# Find columns to consider dropping
high_corr_idx <- caret::findCorrelation(cor_mat, cutoff = 0.80, verbose = TRUE)
if (length(high_corr_idx)) {
message("\nHighly correlated (>|0.80|) columns to consider removing:")
print(colnames(cor_mat)[high_corr_idx])
# (Optional) also print the actual pairs over the cutoff
get_pairs <- function(mat, cutoff = 0.80) {
out <- which(abs(mat) > cutoff & upper.tri(mat), arr.ind = TRUE)
if (nrow(out)) {
data.frame(
var1 = rownames(mat)[out[,1]],
var2 = colnames(mat)[out[,2]],
corr = mat[out],
row.names = NULL
) %>% arrange(desc(abs(corr)))
} else {
data.frame(var1 = character(), var2 = character(), corr = numeric())
}
}
pairs_over <- get_pairs(cor_mat, 0.80)
if (nrow(pairs_over)) {
message("\nTop correlated pairs (>|0.80|):")
print(utils::head(pairs_over, 20))
}
} else {
message("\nNo pairs exceeded |0.80| correlation.")
}
} else {
message("\nNot enough numeric predictors for correlation analysis.")
}
## Compare row 6 and column 3 with corr 0.969
## Means: 0.567 vs 0.32 so flagging column 6
## Compare row 3 and column 7 with corr 0.9
## Means: 0.477 vs 0.239 so flagging column 3
## All correlations <= 0.8
##
## Highly correlated (>|0.80|) columns to consider removing:
## [1] "euribor3m" "emp_var_rate"
##
## Top correlated pairs (>|0.80|):
## var1 var2 corr
## 1 emp_var_rate euribor3m 0.9694121
## 2 euribor3m nr_employed 0.9448713
## 3 emp_var_rate nr_employed 0.9003902
# ---------- (B) VIF ----------
if (!is.null(outcome_var) && outcome_var %in% names(df_imputed)) {
# Choose numeric predictors that aren't the outcome
num_preds <- setdiff(candidate_nums, outcome_var)
if (length(num_preds) >= 2) {
# numeric outcome -> lm; binary (0/1 or 2-level factor) -> glm binomial
y <- df_imputed[[outcome_var]]
lm_formula <- as.formula(paste(outcome_var, "~", paste(num_preds, collapse = " + ")))
fit <- try({
if (is.numeric(y) && length(unique(na.omit(y))) > 2) {
lm(lm_formula, data = df_imputed)
} else {
# treat 0/1 or 2-level as classification
glm(lm_formula, data = df_imputed, family = binomial())
}
}, silent = TRUE)
if (inherits(fit, "try-error")) {
message("\nVIF model could not be fit (possibly due to perfect separation or collinearity).")
} else {
message("\nVIF:")
print(car::vif(fit))
}
} else {
message("\nNot enough numeric predictors (excluding outcome) to compute VIF.")
}
} else {
message("\nNo valid outcome_var found in df_imputed; skipping VIF.")
}
##
## VIF:
## campaign previous emp_var_rate cons_price_idx cons_conf_idx
## 1.028057 1.255393 28.827269 10.882013 2.980781
## euribor3m nr_employed
## 53.004293 35.016171
# Removing highly correlated features one by one to ensure
# we don't have multicolinearity.
df_imputed <- df_imputed %>%
select(-euribor3m)
df_imputed <- df_imputed %>%
select(-emp_var_rate)
Several features exhibited high multicollinearity, as indicated by elevated Variance Inflation Factor (VIF) values. To address this, we removed correlated features sequentially and reassessed the remaining variables after each step. We first dropped euribor3m (VIF = 53.00), followed by emp_var_rate (VIF = 23.96). After these removals, all remaining predictors had VIF values below 1.3, indicating that multicollinearity was no longer a concern.
print(candidate_nums)
## [1] "campaign" "previous" "emp_var_rate" "cons_price_idx"
## [5] "cons_conf_idx" "euribor3m" "nr_employed"
# Ensure binary 0/1 label becomes a factor; leave multiclass factors as-is
coerce_binary_label <- function(data, outcome_var) {
stopifnot(!is.null(outcome_var), outcome_var %in% names(data))
y <- data[[outcome_var]]
if (is.numeric(y) && all(na.omit(y) %in% c(0, 1))) {
data[[outcome_var]] <- factor(y, levels = c(0, 1))
}
data
}
# Build balance table (counts + proportions) for the specified label
balance_table <- function(data, outcome_var) {
stopifnot(!is.null(outcome_var), outcome_var %in% names(data))
if (!is.factor(data[[outcome_var]])) {
# If it's not a factor, coerce to factor so we can tabulate levels cleanly
data[[outcome_var]] <- factor(data[[outcome_var]])
}
dplyr::count(data, .data[[outcome_var]], name = "n") |>
dplyr::mutate(prop = n / sum(n)) |>
dplyr::arrange(dplyr::desc(n))
}
# Print table + plot; optionally set which level is "positive"
report_class_balance <- function(data, outcome_var, positive = NULL, title_suffix = NULL) {
stopifnot(!is.null(outcome_var), outcome_var %in% names(data))
if (!is.factor(data[[outcome_var]])) {
data[[outcome_var]] <- factor(data[[outcome_var]])
}
tab <- balance_table(data, outcome_var)
message(sprintf("\nClass balance for '%s'%s:",
outcome_var,
ifelse(is.null(title_suffix), "", paste0(" — ", title_suffix))))
print(tab)
# Plot
p_title <- paste("Class Proportions —", outcome_var,
ifelse(is.null(title_suffix), "", paste0(" (", title_suffix, ")")))
print(
ggplot(tab, aes(x = .data[[outcome_var]], y = prop)) +
geom_col() +
geom_text(aes(label = scales::percent(prop, accuracy = 0.1)),
vjust = -0.25, size = 3) +
scale_y_continuous(labels = scales::percent_format()) +
labs(x = outcome_var, y = "Proportion", title = p_title) +
theme_minimal()
)
# Binary prevalence
lvls <- levels(data[[outcome_var]])
if (length(lvls) == 2) {
pos_level <- if (is.null(positive)) lvls[length(lvls)] else positive
if (!(pos_level %in% lvls)) {
warning(sprintf("Requested positive level '%s' not found in levels: %s",
pos_level, paste(lvls, collapse = ", ")))
} else {
prev <- mean(data[[outcome_var]] == pos_level, na.rm = TRUE)
message(sprintf("Positive class: '%s' | Prevalence: %0.2f%%", pos_level, 100 * prev))
}
}
}
# After imputation and BEFORE splitting
if (!is.null(outcome_var) && outcome_var %in% names(df_imputed)) {
df_imputed <- coerce_binary_label(df_imputed, outcome_var)
report_class_balance(df_imputed, outcome_var, title_suffix = "Full dataset")
}
##
## Class balance for 'y' — Full dataset:
## # A tibble: 2 × 3
## y n prop
## <fct> <int> <dbl>
## 1 no 26629 0.873
## 2 yes 3859 0.127
## Positive class: 'yes' | Prevalence: 12.66%
After removing rows with missing or “unknown” values, the dataset was reduced to 30,488 observations. The target variable exhibits a pronounced class imbalance, with the positive class representing only 12.66% of the total. To address this, we plan to apply techniques that adjust class balance within the training set, ensuring the model is not biased toward the majority class.
# All of these actions were taken above with notes as to where and what we did.
# Feature Removal
# Initial - Feature we have determined do not help the Model
# Colinearity - Features we have determine are colinear with other features
# Perfect Separation - Features that cause perfect separation in the model
# Feature Engineering
# Binned Features
# Tidy split with rsample; stratify if you set outcome_var and it's categorical.
set.seed(123)
if (!is.null(outcome_var) && outcome_var %in% names(df_imputed)) {
# If binary numeric (0/1), make it a factor for stratified classification
if (is.numeric(df_imputed[[outcome_var]]) && all(na.omit(df_imputed[[outcome_var]]) %in% c(0,1))) {
df_imputed[[outcome_var]] <- factor(df_imputed[[outcome_var]], levels = c(0,1))
}
if (is.factor(df_imputed[[outcome_var]])) {
split <- rsample::initial_split(df_imputed, prop = 0.8, strata = outcome_var) # outcome_var is a string: OK
} else {
split <- rsample::initial_split(df_imputed, prop = 0.8)
}
} else {
split <- rsample::initial_split(df_imputed, prop = 0.8)
}
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(outcome_var)
##
## # Now:
## data %>% select(all_of(outcome_var))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
train_df <- rsample::training(split)
test_df <- rsample::testing(split)
# Recompute split to use the coerced factor, then extract
set.seed(123)
if (!is.null(outcome_var) && outcome_var %in% names(df_imputed) && is.factor(df_imputed[[outcome_var]])) {
split <- rsample::initial_split(df_imputed, prop = 0.8, strata = outcome_var)
} else {
split <- rsample::initial_split(df_imputed, prop = 0.8)
}
train_df <- rsample::training(split)
test_df <- rsample::testing(split)
str(train_df)
## tibble [24,390 × 18] (S3: tbl_df/tbl/data.frame)
## $ age : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 3 3 5 5 1 2 5 2 4 2 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 8 1 8 1 8 2 4 2 2 11 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 2 2 2 3 3 1 2 2 2 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 4 2 4 6 4 4 1 2 3 4 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 1 1 2 1 2 2 2 1 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 2 1 1 2 1 1 2 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ campaign : num [1:24390] -0.559 -0.559 -0.559 -0.559 -0.559 ...
## $ pdays : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ previous : num [1:24390] -0.372 -0.372 -0.372 -0.372 -0.372 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ cons_price_idx: num [1:24390] 0.804 0.804 0.804 0.804 0.804 ...
## $ cons_conf_idx : num [1:24390] 0.877 0.877 0.877 0.877 0.877 ...
## $ nr_employed : num [1:24390] 0.402 0.402 0.402 0.402 0.402 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
str(test_df)
## tibble [6,098 × 18] (S3: tbl_df/tbl/data.frame)
## $ age : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 5 1 1 2 2 4 1 2 2 2 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 4 10 8 2 1 1 10 1 4 8 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 3 2 2 3 3 2 2 1 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 1 6 4 2 7 6 7 4 2 4 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 2 2 2 2 1 2 2 2 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 2 2 2 2 2 2 2 2 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 7 7 7 7 7 7 7 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ campaign : num [1:6098] -0.559 -0.559 -0.559 -0.559 -0.559 ...
## $ pdays : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ previous : num [1:6098] -0.372 -0.372 -0.372 -0.372 -0.372 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ cons_price_idx: num [1:6098] 0.804 0.804 0.804 0.804 0.804 ...
## $ cons_conf_idx : num [1:6098] 0.877 0.877 0.877 0.877 0.877 ...
## $ nr_employed : num [1:6098] 0.402 0.402 0.402 0.402 0.402 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# ---- CHOOSE ONE of the balancing methods below for train_df ----
balance_upsample <- function(train_df, outcome_var) {
# y must be a factor
y <- factor(train_df[[outcome_var]])
# all predictors except the outcome, using base indexing
x <- train_df[, setdiff(names(train_df), outcome_var), drop = FALSE]
caret::upSample(x = x, y = y, yname = outcome_var)
}
balance_downsample <- function(train_df, outcome_var) {
y <- factor(train_df[[outcome_var]])
x <- train_df[, setdiff(names(train_df), outcome_var), drop = FALSE]
caret::downSample(x = x, y = y, yname = outcome_var)
}
# Method C: SMOTE (themis + recipes; needs numeric predictors)
balance_smote <- function(train_df, outcome_var, over_ratio = 1) {
# Build a minimal recipe: convert strings, dummy encode, SMOTE, then return original columns + oversampled rows
rec_bal <- recipe(reformulate(termlabels = setdiff(names(train_df), outcome_var), response = outcome_var),
data = train_df) |>
step_string2factor(all_nominal_predictors()) |>
step_dummy(all_nominal_predictors()) |>
step_smote(all_outcomes(), neighbors = 5, over_ratio = over_ratio)
prep_bal <- prep(rec_bal, training = train_df)
bake(prep_bal, new_data = NULL)
}
# ---- Pick your method ----
# Option 1 (common): upsample to balance
# train_df_bal <- balance_upsample(train_df, outcome_var)
# Option 2: downsample (keeps smaller dataset, faster)
train_df_bal <- balance_downsample(train_df, outcome_var)
# Option 3: SMOTE (synthesizes minority examples)
# train_df_bal <- balance_smote(train_df, outcome_var, over_ratio = 1)
# ---- Report balances ----
message("\n— Original train balance —")
##
## — Original train balance —
report_class_balance(train_df, outcome_var, title_suffix = "Train (original)")
##
## Class balance for 'y' — Train (original):
## # A tibble: 2 × 3
## y n prop
## <fct> <int> <dbl>
## 1 no 21303 0.873
## 2 yes 3087 0.127
## Positive class: 'yes' | Prevalence: 12.66%
message("\n— Balanced train balance —")
##
## — Balanced train balance —
report_class_balance(train_df_bal, outcome_var, title_suffix = "Train (balanced)")
##
## Class balance for 'y' — Train (balanced):
## y n prop
## 1 no 3087 0.5
## 2 yes 3087 0.5
## Positive class: 'yes' | Prevalence: 50.00%
message("\n— Test set (left untouched) —")
##
## — Test set (left untouched) —
report_class_balance(test_df, outcome_var, title_suffix = "Test (holdout)")
##
## Class balance for 'y' — Test (holdout):
## # A tibble: 2 × 3
## y n prop
## <fct> <int> <dbl>
## 1 no 5326 0.873
## 2 yes 772 0.127
## Positive class: 'yes' | Prevalence: 12.66%
# Optional: explicitly choose the positive class (for binary labels)
# report_class_balance(train_df, outcome_var, positive = "yes", title_suffix = "Train (yes=positive)")
str(train_df_bal)
## 'data.frame': 6174 obs. of 18 variables:
## $ age : Ord.factor w/ 6 levels "<=25"<"26-35"<..: 2 2 3 4 1 3 3 3 2 2 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 1 1 2 2 9 10 1 1 10 1 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 3 2 3 2 3 3 2 3 2 3 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 4 4 6 4 4 7 7 7 6 7 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 1 1 1 2 1 2 2 2 2 1 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 2 1 1 1 2 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 1 1 1 1 1 1 1 1 1 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 4 4 1 7 1 4 2 7 7 2 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 4 4 1 1 5 5 4 4 5 5 ...
## $ campaign : num -0.559 0.176 -0.559 -0.559 -0.559 ...
## $ pdays : Factor w/ 4 levels "Never","Recent",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ previous : num -0.372 -0.372 -0.372 1.541 -0.372 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 1 2 2 2 2 2 2 ...
## $ cons_price_idx: num 0.674 0.674 -0.766 -1.077 -0.766 ...
## $ cons_conf_idx : num -0.438 -0.438 -1.357 -1.169 -1.357 ...
## $ nr_employed : num 0.895 0.895 -0.821 -0.821 -0.821 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
To address class imbalance, we applied downsampling to the majority class (no), reducing its size to match the number of minority class (yes) observations. This produced a balanced training set with 6,174 total observations (≈3,087 in each class). While more data would be preferable, the resulting dataset is both clean and balanced, providing a fair baseline for model training. If model performance proves inadequate, we may revisit this approach and explore alternatives such as upsampling, SMOTE, or class weighting to improve predictive power.
# Make sure outcome_var is a factor with 2 levels (e.g., "yes"/"no")
if (!outcome_var %in% names(train_df_bal)) stop("Outcome not found.")
if (!is.factor(train_df_bal[[outcome_var]])) train_df_bal[[outcome_var]] <- factor(train_df_bal[[outcome_var]])
# Build formula
predictors <- setdiff(names(train_df_bal), outcome_var)
formula_glm <- as.formula(paste(outcome_var, "~", paste(predictors, collapse = " + ")))
# Fit logistic regression model
glm_model <- glm(formula_glm, data = train_df_bal, family = binomial())
# Summarize
summary(glm_model)
##
## Call:
## glm(formula = formula_glm, family = binomial(), data = train_df_bal)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.669597 0.270632 -2.474 0.013353 *
## age.L 0.189212 0.214390 0.883 0.377476
## age.Q 0.322481 0.171097 1.885 0.059460 .
## age.C 0.010911 0.120579 0.090 0.927899
## age^4 -0.159969 0.092777 -1.724 0.084666 .
## age^5 -0.006456 0.073919 -0.087 0.930404
## jobblue-collar -0.045374 0.118261 -0.384 0.701216
## jobentrepreneur -0.109502 0.170898 -0.641 0.521690
## jobhousemaid 0.025936 0.230156 0.113 0.910277
## jobmanagement 0.139425 0.127541 1.093 0.274317
## jobretired 0.061537 0.204045 0.302 0.762966
## jobself-employed 0.122627 0.175032 0.701 0.483555
## jobservices -0.073262 0.123358 -0.594 0.552579
## jobstudent 0.188173 0.208311 0.903 0.366351
## jobtechnician -0.022397 0.102394 -0.219 0.826861
## jobunemployed 0.009511 0.200614 0.047 0.962186
## maritalmarried -0.127429 0.101630 -1.254 0.209896
## maritalsingle -0.096832 0.114547 -0.845 0.397916
## educationbasic.6y 0.254799 0.190671 1.336 0.181443
## educationbasic.9y 0.150109 0.150237 0.999 0.317721
## educationhigh.school 0.266017 0.148276 1.794 0.072802 .
## educationilliterate 0.007739 1.433299 0.005 0.995692
## educationprofessional.course 0.225854 0.159455 1.416 0.156655
## educationuniversity.degree 0.275689 0.148814 1.853 0.063943 .
## defaultyes -10.381708 196.967841 -0.053 0.957965
## housingyes 0.013519 0.061434 0.220 0.825829
## loanyes -0.020679 0.083627 -0.247 0.804693
## contacttelephone -0.471672 0.108402 -4.351 1.35e-05 ***
## monthaug -0.180896 0.169821 -1.065 0.286778
## monthdec 0.004059 0.398480 0.010 0.991872
## monthjul 0.297239 0.147225 2.019 0.043493 *
## monthjun 0.527266 0.147750 3.569 0.000359 ***
## monthmar 1.147212 0.245847 4.666 3.07e-06 ***
## monthmay -0.553476 0.114239 -4.845 1.27e-06 ***
## monthnov -0.314618 0.146436 -2.149 0.031674 *
## monthoct 0.304656 0.226791 1.343 0.179163
## monthsep -0.287104 0.262336 -1.094 0.273773
## day_of_weekmon -0.202924 0.099174 -2.046 0.040743 *
## day_of_weekthu -0.014691 0.097650 -0.150 0.880413
## day_of_weektue 0.006360 0.099305 0.064 0.948931
## day_of_weekwed 0.097268 0.098336 0.989 0.322591
## campaign -0.137861 0.038152 -3.613 0.000302 ***
## pdaysRecent 0.668007 0.390761 1.710 0.087358 .
## previous -0.022018 0.071972 -0.306 0.759664
## poutcomenonexistent 0.493956 0.183826 2.687 0.007208 **
## poutcomesuccess 1.447830 0.393405 3.680 0.000233 ***
## cons_price_idx -0.097269 0.044371 -2.192 0.028368 *
## cons_conf_idx 0.094453 0.043628 2.165 0.030392 *
## nr_employed -0.783153 0.045599 -17.175 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8559.0 on 6173 degrees of freedom
## Residual deviance: 6529.6 on 6125 degrees of freedom
## AIC: 6627.6
##
## Number of Fisher Scoring iterations: 10
# Predict probabilities
pred_probs <- predict(glm_model, newdata = test_df, type = "response")
# Predict probabilities
#pred_probs <- predict(logistic_model, newdata = test_df_prepped, type = "response")
# Predict classes (threshold = 0.5)
pred_class <- ifelse(pred_probs >= 0.5, "yes", "no")
pred_class <- factor(pred_class, levels = levels(test_df[[outcome_var]]))
# Confusion matrix
caret::confusionMatrix(pred_class, test_df[[outcome_var]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 4385 241
## yes 941 531
##
## Accuracy : 0.8062
## 95% CI : (0.796, 0.816)
## No Information Rate : 0.8734
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3684
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8233
## Specificity : 0.6878
## Pos Pred Value : 0.9479
## Neg Pred Value : 0.3607
## Prevalence : 0.8734
## Detection Rate : 0.7191
## Detection Prevalence : 0.7586
## Balanced Accuracy : 0.7556
##
## 'Positive' Class : no
##
Key Metrics:
Accuracy (80.14%) Overall, your model predicts correctly 80% of the time.
No Information Rate (NIR) = 87.34% This is the accuracy you’d get by always predicting the majority class (here, “no”). Your model’s accuracy (80%) is actually lower than NIR, indicating it doesn’t outperform simply guessing the majority class.
Kappa (0.36) Measures agreement beyond chance. 0.36 is moderate, meaning your model does better than random but not very strong.
Sensitivity (Recall) for “no” = 81.8% Percentage of actual “no” cases correctly predicted (true negative rate). High sensitivity means the model is good at detecting “no.”
Specificity for “no” = 68.7% Percentage of actual “yes” cases predicted as “yes” (true positive rate for “yes”, since “no” is treated as positive class here). So the model is less good at detecting “yes.”
Positive Predictive Value (PPV) = 94.7% When the model predicts “no”, it is right 94.7% of the time.
Negative Predictive Value (NPV) = 35.4% When the model predicts “yes”, it is right only 35.4% of the time. This is low, so many predicted “yes” are false positives.
Prevalence (87.3%) The proportion of “no” cases in the data—it’s a heavily imbalanced dataset.
Balanced Accuracy (75.23%) Average of sensitivity and specificity. More informative for imbalanced classes, showing moderate performance.
Tune threshold for best balanced accuracy
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.2
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Predict probabilities on test set
pred_probs <- predict(glm_model, newdata = test_df, type = "response")
# Try thresholds from 0 to 1 by 0.01
thresholds <- seq(0, 1, by = 0.01)
balanced_accuracies <- sapply(thresholds, function(t) {
preds <- factor(ifelse(pred_probs >= t, "yes", "no"), levels = levels(test_df[[outcome_var]]))
cm <- confusionMatrix(preds, test_df[[outcome_var]])
cm$byClass["Balanced Accuracy"]
})
best_thresh <- thresholds[which.max(balanced_accuracies)]
cat("Best threshold for balanced accuracy:", best_thresh, "\n")
## Best threshold for balanced accuracy: 0.53
The logistic model is more balanced (in terms of sensitivity and specificity) when it predicts “yes” if the probability is ≥ 0.53, and “no” otherwise. We Recalculate Predictions & Confusion Matrix Using 0.53
# Predict classes using the optimized threshold (0.53)
pred_class_053 <- ifelse(pred_probs >= 0.53, "yes", "no")
pred_class_053 <- factor(pred_class_053, levels = levels(test_df[[outcome_var]]))
# Confusion matrix
confusionMatrix(pred_class_053, test_df[[outcome_var]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 4520 257
## yes 806 515
##
## Accuracy : 0.8257
## 95% CI : (0.8159, 0.8351)
## No Information Rate : 0.8734
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3955
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8487
## Specificity : 0.6671
## Pos Pred Value : 0.9462
## Neg Pred Value : 0.3899
## Prevalence : 0.8734
## Detection Rate : 0.7412
## Detection Prevalence : 0.7834
## Balanced Accuracy : 0.7579
##
## 'Positive' Class : no
##
Model Summary – Logistic Regression
We trained a logistic regression model to predict whether a client would subscribe to a term deposit (target: yes or no), using a balanced training set to address class imbalance. After tuning, a threshold of 0.53 was found to optimize balanced accuracy.
Key Performance Metrics (on Test Set):
Accuracy: 82.2%
Balanced Accuracy: 75.7%
AUC (ROC): .07500589
Precision (for “no”): 94.6%
Recall (for “no”): 84.5%
Recall (for “yes”): 66.8%
Kappa: 0.39 (moderate agreement)
Threshold Selection:
The default threshold (0.5) underperformed on “yes” predictions due to class imbalance.
Using a threshold of 0.53 increased balanced accuracy while maintaining high precision for “no” and improving recall for “yes”.
Interpretation:
The model performs well overall, especially at identifying clients unlikely to subscribe (“no”). However, recall for subscribers (“yes”) remains more challenging, which is common in imbalanced datasets.
#install.packages("PRROC")
library(PRROC) # For PR curve
## Warning: package 'PRROC' was built under R version 4.4.3
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, flatten, flatten_chr, flatten_dbl, flatten_int, flatten_lgl,
## flatten_raw, invoke, splice
# ROC curve using predicted probabilities
roc_obj <- roc(test_df[[outcome_var]], pred_probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC
plot(roc_obj, col = "#2C3E50", main = "ROC Curve")
abline(a = 0, b = 1, lty = 2, col = "gray")
# AUC
auc(roc_obj)
## Area under the curve: 0.8075
# Convert factor to binary: 1 = "no", 0 = "yes"
labels_binary <- ifelse(test_df[[outcome_var]] == "no", 1, 0)
# PR curve object
pr_obj <- pr.curve(scores.class0 = pred_probs, weights.class0 = labels_binary, curve = TRUE)
# Plot PR curve
plot(pr_obj, main = "Precision-Recall Curve", col = "#2980B9")
# Extract coefficients
coef_df <- summary(glm_model)$coefficients
coef_df <- as.data.frame(coef_df)
coef_df$Variable <- rownames(coef_df)
# Remove intercept
coef_df <- coef_df[coef_df$Variable != "(Intercept)", ]
# Calculate absolute value for sorting
coef_df$Importance <- abs(coef_df$Estimate)
# Sort by importance
coef_df_sorted <- coef_df[order(-coef_df$Importance), ]
library(ggplot2)
# Top 15 most important variables
top_n <- 15
top_vars <- head(coef_df_sorted, top_n)
ggplot(top_vars, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "steelblue") +
coord_flip() +
labs(title = "Top Variables by Importance in Logistic Regression",
x = "Variable",
y = "Absolute Coefficient (Importance)") +
theme_minimal()
coef_df$OddsRatio <- exp(coef_df$Estimate)
It correctly predicts customer decisions about 82% of the time.
It balances well between identifying those who will say yes and those who won’t.
This means you can trust it to make smarter marketing decisions than random guessing or mass emailing.
The model is very precise when it says a customer will not subscribe.
This allows the marketing team to avoid spending time and money on people unlikely to respond.
The model captures about 67% of actual “yes” responders, which is solid.
You won’t catch every positive responder, but you’ll get many — and save resources in the process.
Score your entire customer list using the model.
Focus high-effort campaigns (calls, meetings) on high-score leads.
Use low-cost methods (emails, ads) for the rest — or not at all.
Based on your model’s coefficients, people contacted in March, June, and October are more likely to say “yes”.
Conversely, May and November are weaker months.
This insight can shape when to run your campaigns for better results.
For example:
Students and professionals tend to respond better.
Blue-collar and unemployed groups tend to respond less.
Use this to refine your target audience and messaging.
# 12) Linear Discriminant Analysis (LDA) ------------------------------------
# Ensure outcome_var is a factor with 2 levels (e.g., "yes"/"no")
if (!outcome_var %in% names(train_df_bal)) stop("Outcome not found.")
if (!is.factor(train_df_bal[[outcome_var]])) train_df_bal[[outcome_var]] <- factor(train_df_bal[[outcome_var]])
# Create a recipe for LDA
lda_recipe <- recipe(as.formula(paste(outcome_var, "~ .")), data = train_df_bal) |>
step_string2factor(all_nominal_predictors()) |> # Ensure nominal predictors are factors
step_dummy(all_nominal_predictors(), one_hot = TRUE) |> # Create dummy variables
step_zv(all_predictors()) |> # Remove zero-variance predictors
step_lincomb(all_predictors()) # Remove linear combinations of predictors
# Prepare the recipe
prepared_lda_recipe <- prep(lda_recipe, training = train_df_bal)
# Bake the recipe to get processed training data
train_df_processed <- bake(prepared_lda_recipe, new_data = train_df_bal)
# Fit LDA model using the processed training data
# Now that we have handled collinearity and zero-variance in the recipe,
# the model formula can simply use all columns in the processed data, excluding the outcome.
processed_predictors <- setdiff(names(train_df_processed), outcome_var)
formula_lda_processed <- as.formula(paste(outcome_var, "~", paste(processed_predictors, collapse = " + ")))
lda_model <- MASS::lda(formula_lda_processed, data = train_df_processed)
## Warning in lda.default(x, grouping, ...): variables are collinear
# Summarize model (optional, but good practice)
print(lda_model)
## Call:
## lda(formula_lda_processed, data = train_df_processed)
##
## Prior probabilities of groups:
## no yes
## 0.5 0.5
##
## Group means:
## campaign previous cons_price_idx cons_conf_idx nr_employed age_1
## no 0.06577036 -0.09525145 0.08216115 -0.01239235 0.1600260 0.03854875
## yes -0.17788506 0.58820879 -0.34477278 0.16553153 -0.9473195 0.07353418
## age_2 age_3 age_4 age_5 age_6 job_admin.
## no 0.4020084 0.3190800 0.1736314 0.05636540 0.01036605 0.2883058
## yes 0.4023324 0.2261095 0.1464205 0.09135083 0.06025267 0.3135730
## job_blue.collar job_entrepreneur job_housemaid job_management job_retired
## no 0.1872368 0.04308390 0.02202786 0.07126660 0.0356333
## yes 0.1149984 0.02753482 0.02332362 0.07515387 0.0936184
## job_self.employed job_services job_student job_technician marital_divorced
## no 0.03271785 0.09588597 0.01684483 0.1839974 0.1072238
## yes 0.03304179 0.07126660 0.05344995 0.1606738 0.1068999
## marital_married education_basic.4y education_basic.6y education_basic.9y
## no 0.5983155 0.07450599 0.04794299 0.14318108
## yes 0.5325559 0.08487204 0.03466148 0.09912536
## education_high.school education_illiterate education_professional.course
## no 0.2513767 0.0003239391 0.1460965
## yes 0.2458698 0.0003239391 0.1373502
## default_no housing_no loan_no contact_cellular month_apr month_aug
## no 0.9996761 0.4680920 0.8441853 0.6365403 0.06835115 0.1587302
## yes 1.0000000 0.4431487 0.8471007 0.8474247 0.11985747 0.1347587
## month_dec month_jul month_jun month_mar month_may month_nov month_oct
## no 0.002915452 0.1775186 0.1124069 0.007450599 0.3388403 0.11013929 0.01490120
## yes 0.017168772 0.1347587 0.1169420 0.060576612 0.1836735 0.09750567 0.07547781
## day_of_week_fri day_of_week_mon day_of_week_thu day_of_week_tue pdays_Never
## no 0.1823777 0.2115322 0.2082928 0.1979268 0.9844509
## yes 0.1746032 0.1823777 0.2299968 0.2079689 0.7862002
## poutcome_failure poutcome_nonexistent
## no 0.1104632 0.8769031
## yes 0.1273081 0.6702300
##
## Coefficients of linear discriminants:
## LD1
## campaign -0.104198813
## previous -0.007406518
## cons_price_idx -0.145075705
## cons_conf_idx 0.038655046
## nr_employed -0.746021192
## age_1 0.068154319
## age_2 0.049992629
## age_3 -0.120858845
## age_4 -0.081347220
## age_5 0.157315450
## age_6 0.248382349
## job_admin. -0.010557504
## job_blue.collar -0.074725773
## job_entrepreneur -0.115812809
## job_housemaid -0.018850220
## job_management 0.093823304
## job_retired 0.018309026
## job_self.employed 0.081087221
## job_services -0.080150442
## job_student 0.120153724
## job_technician -0.034519160
## marital_divorced 0.074527677
## marital_married -0.019510822
## education_basic.4y -0.212497385
## education_basic.6y -0.027882082
## education_basic.9y -0.097681982
## education_high.school -0.015698170
## education_illiterate -0.371525252
## education_professional.course -0.040757811
## default_no 1.053101096
## housing_no -0.010252221
## loan_no 0.015871650
## contact_cellular 0.301013852
## month_apr 0.295969606
## month_aug 0.163234401
## month_dec 0.247678965
## month_jul 0.530769219
## month_jun 0.697915853
## month_mar 0.923044626
## month_may -0.250012984
## month_nov -0.033159217
## month_oct 0.340268503
## day_of_week_fri -0.083486164
## day_of_week_mon -0.236194121
## day_of_week_thu -0.082416006
## day_of_week_tue -0.079508135
## pdays_Never -0.535514319
## poutcome_failure -0.647090026
## poutcome_nonexistent -0.164912163
# Bake the recipe to get processed test data
test_df_processed <- bake(prepared_lda_recipe, new_data = test_df)
# Predict on the test set using the processed test data
lda_pred <- predict(lda_model, newdata = test_df_processed)
# The predicted class is in lda_pred$class
lda_pred_class <- lda_pred$class
# Evaluate on the test set using confusion matrix
library(caret) # Make sure caret is loaded
confusionMatrix(lda_pred_class, test_df[[outcome_var]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 4306 232
## yes 1020 540
##
## Accuracy : 0.7947
## 95% CI : (0.7843, 0.8048)
## No Information Rate : 0.8734
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3536
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8085
## Specificity : 0.6995
## Pos Pred Value : 0.9489
## Neg Pred Value : 0.3462
## Prevalence : 0.8734
## Detection Rate : 0.7061
## Detection Prevalence : 0.7442
## Balanced Accuracy : 0.7540
##
## 'Positive' Class : no
##
LDA Model Summary:
Prior probabilities of groups: This shows that the model was trained on a balanced dataset where both classes (‘no’ and ‘yes’) have equal prior probabilities (0.5 each).
Group means: This table shows the mean value of each predictor for each of the two classes (‘no’ and ‘yes’). These means are used to calculate the linear discriminant function. You can see which variables have different means between the two groups.
For example, nr_employed has a much lower mean for the ‘yes’ group (-0.947) compared to the ‘no’ group (0.160), suggesting it’s an important differentiator.
Coefficients of linear discriminants: These are the weights assigned to each predictor in the linear discriminant function (LD1). The sign and magnitude of these coefficients indicate the direction and strength of the relationship between each predictor and the discriminant function that separates the two classes. A larger absolute value suggests a stronger contribution to the separation. For instance, nr_employed has a large negative coefficient (-0.734), indicating that lower values of nr_employed are associated with the ‘yes’ class.
Confusion Matrix and Statistics:
Accuracy (0.7911): The model correctly predicts the class about 79.11% of the time.
No Information Rate (0.8734): The accuracy you would get by always predicting the majority class in the original test set (which is imbalanced).
The LDA model’s accuracy is lower than this, similar to the logistic regression model.
Kappa (0.3498): This indicates fair agreement beyond chance. It’s slightly lower than the logistic regression model’s Kappa.
Sensitivity (0.8036): For the ‘no’ class (which is the positive class in this output), the model correctly identifies about 80.36% of actual ‘no’ cases.
Specificity (0.7047): The model correctly identifies about 70.47% of actual ‘yes’ cases (this is the true positive rate for ‘yes’, as ‘no’ is the positive class here).
This is slightly better at identifying ‘yes’ cases compared to the default threshold logistic regression, but similar to the tuned logistic regression.
Pos Pred Value (0.9494): When the model predicts ‘no’, it is correct about 94.94% of the time.
Neg Pred Value (0.3421): When the model predicts ‘yes’, it is correct only about 34.21% of the time. This is low, similar to the logistic regression model with a default threshold, indicating a notable number of false positives for the ‘yes’ class.
Balanced Accuracy (0.7541): The average of sensitivity and specificity, providing a better measure of performance for imbalanced datasets.
This is comparable to the balanced accuracy of the logistic regression model with the optimized threshold.
In summary:
The LDA model’s performance metrics are quite similar to those of the logistic regression model, particularly the balanced accuracy.
It is also better at identifying ‘no’ cases than ‘yes’ cases, which is expected given the class imbalance in the original dataset, despite being trained on a balanced set.
The VIF warning suggests some remaining collinearity issues after the initial steps, even with the recipe’s collinearity handling.
This could potentially impact the interpretation of individual coefficients but often has less effect on predictive performance.
#install.packages("PRROC")
library(PRROC) # For PR curve
# ROC curve using predicted probabilities
roc_lda <- roc(test_df[[outcome_var]], as.numeric(lda_pred_class))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC
plot(roc_lda, col = "#2C3E50", main = "ROC Curve (LDA)")
abline(a = 0, b = 1, lty = 2, col = "gray")
# AUC
auc(roc_lda)
## Area under the curve: 0.754
# Extract coefficients from LDA model
# LDA coefficients are in lda_model$scaling
# The names of the variables are the row names
coef_df_lda <- as.data.frame(lda_model$scaling)
coef_df_lda$Variable <- rownames(coef_df_lda)
# LDA only produces one linear discriminant (LD1) for binary classification
# Use the absolute value of the LD1 coefficient as importance
coef_df_lda$Importance <- abs(coef_df_lda$LD1)
# Sort by importance
coef_df_lda_sorted <- coef_df_lda[order(-coef_df_lda$Importance), ]
# Print sorted coefficients (optional)
# print(coef_df_lda_sorted)
library(ggplot2)
# Top 15 most important variables
top_n <- 15
top_vars_lda <- head(coef_df_lda_sorted, top_n)
ggplot(top_vars_lda, aes(x = reorder(Variable, Importance), y = Importance)) +
geom_col(fill = "darkred") +
coord_flip() +
labs(title = "Top Variables by Importance in LDA Model",
x = "Variable",
y = "Absolute Coefficient (Importance)") +
theme_minimal()
#install.packages(c("randomForest", "caret"))
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.2
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(caret)
# Ensure outcome_var is a factor with 2 levels
if (!outcome_var %in% names(train_df_bal)) stop("Outcome not found.")
if (!is.factor(train_df_bal[[outcome_var]])) train_df_bal[[outcome_var]] <- factor(train_df_bal[[outcome_var]])
# Build formula
predictors_rf <- setdiff(names(train_df_bal), outcome_var)
formula_rf <- as.formula(paste(outcome_var, "~", paste(predictors_rf, collapse = " + ")))
# Train Random Forest model
# Use proximity=TRUE to get proximity matrix for potential later analysis
# Use importance=TRUE to get variable importance measures
set.seed(123) # for reproducibility
rf_model <- randomForest(formula_rf, data = train_df_bal, importance = TRUE, proximity = FALSE)
# Print the model summary
print(rf_model)
##
## Call:
## randomForest(formula = formula_rf, data = train_df_bal, importance = TRUE, proximity = FALSE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 25.72%
## Confusion matrix:
## no yes class.error
## no 2515 572 0.1852932
## yes 1016 2071 0.3291221
# Predict on the test set
# type="prob" gives class probabilities
rf_pred_probs <- predict(rf_model, newdata = test_df, type = "prob")
# Predict class labels using default threshold (0.5)
rf_pred_class <- predict(rf_model, newdata = test_df, type = "class")
# Evaluate on the test set using confusion matrix
confusionMatrix(rf_pred_class, test_df[[outcome_var]])
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 4327 235
## yes 999 537
##
## Accuracy : 0.7976
## 95% CI : (0.7873, 0.8077)
## No Information Rate : 0.8734
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.357
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8124
## Specificity : 0.6956
## Pos Pred Value : 0.9485
## Neg Pred Value : 0.3496
## Prevalence : 0.8734
## Detection Rate : 0.7096
## Detection Prevalence : 0.7481
## Balanced Accuracy : 0.7540
##
## 'Positive' Class : no
##
# Variable Importance Plot (MeanDecreaseGini)
varImpPlot(rf_model, main = "Random Forest Variable Importance")
# Get variable importance table
importance_rf <- as.data.frame(importance(rf_model))
importance_rf$Variable <- rownames(importance_rf)
# Sort by MeanDecreaseGini (for classification)
importance_rf_sorted <- importance_rf[order(-importance_rf$`MeanDecreaseGini`), ]
message("\nTop 15 most important variables (Mean Decrease Gini):")
##
## Top 15 most important variables (Mean Decrease Gini):
print(head(importance_rf_sorted[, c("Variable", "MeanDecreaseGini")], 15))
## Variable MeanDecreaseGini
## nr_employed nr_employed 382.57248
## job job 259.98424
## day_of_week day_of_week 212.31636
## campaign campaign 190.15857
## education education 176.97971
## month month 160.42302
## age age 153.23079
## cons_conf_idx cons_conf_idx 146.89840
## cons_price_idx cons_price_idx 133.43458
## poutcome poutcome 106.47407
## marital marital 95.06371
## pdays pdays 90.05811
## housing housing 69.05608
## contact contact 60.68856
## loan loan 54.47768
#install.packages("PRROC")
library(PRROC) # For PR curve
# ROC curve using predicted probabilities
roc_rf <- roc(test_df[[outcome_var]], as.numeric(rf_pred_class))
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
# Plot ROC
plot(roc_rf, col = "#2C3E50", main = "ROC Curve (RF)")
abline(a = 0, b = 1, lty = 2, col = "gray")
# AUC
auc(roc_rf)
## Area under the curve: 0.754