The dataset’s main purpose is to predict whether a bank client will subscribe to a term deposit (y = yes/no) after a marketing campaign. Each variable provides information that may help explain or predict that outcome.
# Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 4.0.0 ✔ stringr 1.5.2
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(tidyr)
library(corrplot)
## corrplot 0.95 loaded
library(ggpubr)
library(naniar) # for missing value visualization
library(DataExplorer) # optional: automated EDA
library(forcats)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(recipes)
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(themis)
library(smotefamily)
# Load Dataset
url <- "https://raw.githubusercontent.com/uzmabb182/Data_622/refs/heads/main/Assignment_1_EDA/bank-additional-full.csv"
bank_additional_df <- read.csv2(url, stringsAsFactors = FALSE) # ';'-separated file; read.csv2 assumes ',' decimals, so dot-decimal columns load as character
head(bank_additional_df)
## age job marital education default housing loan contact month
## 1 56 housemaid married basic.4y no no no telephone may
## 2 57 services married high.school unknown no no telephone may
## 3 37 services married high.school no yes no telephone may
## 4 40 admin. married basic.6y no no no telephone may
## 5 56 services married high.school no no yes telephone may
## 6 45 services married basic.9y unknown no no telephone may
## day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 mon 261 1 999 0 nonexistent 1.1
## 2 mon 149 1 999 0 nonexistent 1.1
## 3 mon 226 1 999 0 nonexistent 1.1
## 4 mon 151 1 999 0 nonexistent 1.1
## 5 mon 307 1 999 0 nonexistent 1.1
## 6 mon 198 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
# Basic structure
str(bank_additional_df)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr "housemaid" "services" "services" "admin." ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education : chr "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr "no" "unknown" "no" "no" ...
## $ housing : chr "no" "no" "yes" "no" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : chr "1.1" "1.1" "1.1" "1.1" ...
## $ cons.price.idx: chr "93.994" "93.994" "93.994" "93.994" ...
## $ cons.conf.idx : chr "-36.4" "-36.4" "-36.4" "-36.4" ...
## $ euribor3m : chr "4.857" "4.857" "4.857" "4.857" ...
## $ nr.employed : chr "5191" "5191" "5191" "5191" ...
## $ y : chr "no" "no" "no" "no" ...
# Dimensions
dim(bank_additional_df) # rows, columns
## [1] 41188 21
nrow(bank_additional_df) # number of rows
## [1] 41188
ncol(bank_additional_df) # number of columns
## [1] 21
# Column names
names(bank_additional_df)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
# Summary statistics for all variables
summary(bank_additional_df)
## age job marital education
## Min. :17.00 Length:41188 Length:41188 Length:41188
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.02
## 3rd Qu.:47.00
## Max. :98.00
## default housing loan contact
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:41188 Length:41188 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 102.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 180.0 Median : 2.000
## Mean : 258.3 Mean : 2.568
## 3rd Qu.: 319.0 3rd Qu.: 3.000
## Max. :4918.0 Max. :56.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.000 Length:41188 Length:41188
## 1st Qu.:999.0 1st Qu.:0.000 Class :character Class :character
## Median :999.0 Median :0.000 Mode :character Mode :character
## Mean :962.5 Mean :0.173
## 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :999.0 Max. :7.000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Length:41188 Length:41188 Length:41188 Length:41188
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## y
## Length:41188
## Class :character
## Mode :character
##
##
##
Are there any missing values and how significant are they?
# First and last few records
head(bank_additional_df, 10)
## age job marital education default housing loan contact
## 1 56 housemaid married basic.4y no no no telephone
## 2 57 services married high.school unknown no no telephone
## 3 37 services married high.school no yes no telephone
## 4 40 admin. married basic.6y no no no telephone
## 5 56 services married high.school no no yes telephone
## 6 45 services married basic.9y unknown no no telephone
## 7 59 admin. married professional.course no no no telephone
## 8 41 blue-collar married unknown unknown no no telephone
## 9 24 technician single professional.course no yes no telephone
## 10 25 services single high.school no yes no telephone
## month day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 may mon 261 1 999 0 nonexistent 1.1
## 2 may mon 149 1 999 0 nonexistent 1.1
## 3 may mon 226 1 999 0 nonexistent 1.1
## 4 may mon 151 1 999 0 nonexistent 1.1
## 5 may mon 307 1 999 0 nonexistent 1.1
## 6 may mon 198 1 999 0 nonexistent 1.1
## 7 may mon 139 1 999 0 nonexistent 1.1
## 8 may mon 217 1 999 0 nonexistent 1.1
## 9 may mon 380 1 999 0 nonexistent 1.1
## 10 may mon 50 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
## 7 93.994 -36.4 4.857 5191 no
## 8 93.994 -36.4 4.857 5191 no
## 9 93.994 -36.4 4.857 5191 no
## 10 93.994 -36.4 4.857 5191 no
tail(bank_additional_df, 10)
## age job marital education default housing loan
## 41179 62 retired married university.degree no no no
## 41180 64 retired divorced professional.course no yes no
## 41181 36 admin. married university.degree no no no
## 41182 37 admin. married university.degree no yes no
## 41183 29 unemployed single basic.4y no yes no
## 41184 73 retired married professional.course no yes no
## 41185 46 blue-collar married professional.course no no no
## 41186 56 retired married university.degree no yes no
## 41187 44 technician married professional.course no no no
## 41188 74 retired married professional.course no yes no
## contact month day_of_week duration campaign pdays previous poutcome
## 41179 cellular nov thu 483 2 6 3 success
## 41180 cellular nov fri 151 3 999 0 nonexistent
## 41181 cellular nov fri 254 2 999 0 nonexistent
## 41182 cellular nov fri 281 1 999 0 nonexistent
## 41183 cellular nov fri 112 1 9 1 success
## 41184 cellular nov fri 334 1 999 0 nonexistent
## 41185 cellular nov fri 383 1 999 0 nonexistent
## 41186 cellular nov fri 189 2 999 0 nonexistent
## 41187 cellular nov fri 442 1 999 0 nonexistent
## 41188 cellular nov fri 239 3 999 1 failure
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
## 41179 -1.1 94.767 -50.8 1.031 4963.6 yes
## 41180 -1.1 94.767 -50.8 1.028 4963.6 no
## 41181 -1.1 94.767 -50.8 1.028 4963.6 no
## 41182 -1.1 94.767 -50.8 1.028 4963.6 yes
## 41183 -1.1 94.767 -50.8 1.028 4963.6 no
## 41184 -1.1 94.767 -50.8 1.028 4963.6 yes
## 41185 -1.1 94.767 -50.8 1.028 4963.6 no
## 41186 -1.1 94.767 -50.8 1.028 4963.6 no
## 41187 -1.1 94.767 -50.8 1.028 4963.6 yes
## 41188 -1.1 94.767 -50.8 1.028 4963.6 no
# Missing values per column
missing_summary <- bank_additional_df %>%
summarise(across(everything(), ~ sum(is.na(.)))) %>%
pivot_longer(cols = everything(), names_to = "Variable", values_to = "Missing_Count") %>%
mutate(Missing_Percent = round(Missing_Count / nrow(bank_additional_df) * 100, 2)) %>%
arrange(desc(Missing_Count))
missing_summary
## # A tibble: 21 × 3
## Variable Missing_Count Missing_Percent
## <chr> <int> <dbl>
## 1 age 0 0
## 2 job 0 0
## 3 marital 0 0
## 4 education 0 0
## 5 default 0 0
## 6 housing 0 0
## 7 loan 0 0
## 8 contact 0 0
## 9 month 0 0
## 10 day_of_week 0 0
## # ℹ 11 more rows
The dataset does not contain raw NA values; instead it uses special codes or labels to represent “missing” or “not applicable.” These are structural missing values and must be considered carefully during analysis.
- default: "unknown" is very common (many clients do not disclose credit default history).
- education: has an "unknown" category (~5% of records).
- job, marital, housing, loan: also contain "unknown" entries.
- pdays: the value 999 means “not previously contacted.”
So yes, there are missing values, but they appear as coded placeholders rather than raw NA:
- "unknown" in categorical variables (default, education, job).
- pdays = 999, meaning “not previously contacted.”
These placeholders are highly significant because they cover a large portion of the dataset (especially pdays and default). Instead of dropping them, they should be treated as meaningful categories or carefully recoded for modeling.
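To size these placeholders directly, a quick tally (a sketch; it reuses bank_additional_df from above):
# Count "unknown" placeholders in each categorical column
sapply(bank_additional_df[sapply(bank_additional_df, is.character)],
       function(x) sum(x == "unknown"))
# Share of pdays values coded as 999
mean(bank_additional_df$pdays == 999)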
# Unique values in categorical variables (factor/character columns)
lapply(bank_additional_df[sapply(bank_additional_df, is.character)], unique)
## $job
## [1] "housemaid" "services" "admin." "blue-collar"
## [5] "technician" "retired" "management" "unemployed"
## [9] "self-employed" "unknown" "entrepreneur" "student"
##
## $marital
## [1] "married" "single" "divorced" "unknown"
##
## $education
## [1] "basic.4y" "high.school" "basic.6y"
## [4] "basic.9y" "professional.course" "unknown"
## [7] "university.degree" "illiterate"
##
## $default
## [1] "no" "unknown" "yes"
##
## $housing
## [1] "no" "yes" "unknown"
##
## $loan
## [1] "no" "yes" "unknown"
##
## $contact
## [1] "telephone" "cellular"
##
## $month
## [1] "may" "jun" "jul" "aug" "oct" "nov" "dec" "mar" "apr" "sep"
##
## $day_of_week
## [1] "mon" "tue" "wed" "thu" "fri"
##
## $poutcome
## [1] "nonexistent" "failure" "success"
##
## $emp.var.rate
## [1] "1.1" "1.4" "-0.1" "-0.2" "-1.8" "-2.9" "-3.4" "-3" "-1.7" "-1.1"
##
## $cons.price.idx
## [1] "93.994" "94.465" "93.918" "93.444" "93.798" "93.2" "92.756" "92.843"
## [9] "93.075" "92.893" "92.963" "92.469" "92.201" "92.379" "92.431" "92.649"
## [17] "92.713" "93.369" "93.749" "93.876" "94.055" "94.215" "94.027" "94.199"
## [25] "94.601" "94.767"
##
## $cons.conf.idx
## [1] "-36.4" "-41.8" "-42.7" "-36.1" "-40.4" "-42" "-45.9" "-50" "-47.1"
## [10] "-46.2" "-40.8" "-33.6" "-31.4" "-29.8" "-26.9" "-30.1" "-33" "-34.8"
## [19] "-34.6" "-40" "-39.8" "-40.3" "-38.3" "-37.5" "-49.5" "-50.8"
##
## $euribor3m
## [1] "4.857" "4.856" "4.855" "4.859" "4.86" "4.858" "4.864" "4.865" "4.866"
## [10] "4.967" "4.961" "4.959" "4.958" "4.96" "4.962" "4.955" "4.947" "4.956"
## [19] "4.966" "4.963" "4.957" "4.968" "4.97" "4.965" "4.964" "5.045" "5"
## [28] "4.936" "4.921" "4.918" "4.912" "4.827" "4.794" "4.76" "4.733" "4.7"
## [37] "4.663" "4.592" "4.474" "4.406" "4.343" "4.286" "4.245" "4.223" "4.191"
## [46] "4.153" "4.12" "4.076" "4.021" "3.901" "3.879" "3.853" "3.816" "3.743"
## [55] "3.669" "3.563" "3.488" "3.428" "3.329" "3.282" "3.053" "1.811" "1.799"
## [64] "1.778" "1.757" "1.726" "1.703" "1.687" "1.663" "1.65" "1.64" "1.629"
## [73] "1.614" "1.602" "1.584" "1.574" "1.56" "1.556" "1.548" "1.538" "1.531"
## [82] "1.52" "1.51" "1.498" "1.483" "1.479" "1.466" "1.453" "1.445" "1.435"
## [91] "1.423" "1.415" "1.41" "1.405" "1.406" "1.4" "1.392" "1.384" "1.372"
## [100] "1.365" "1.354" "1.344" "1.334" "1.327" "1.313" "1.299" "1.291" "1.281"
## [109] "1.266" "1.25" "1.244" "1.259" "1.264" "1.27" "1.262" "1.26" "1.268"
## [118] "1.286" "1.252" "1.235" "1.224" "1.215" "1.206" "1.099" "1.085" "1.072"
## [127] "1.059" "1.048" "1.044" "1.029" "1.018" "1.007" "0.996" "0.979" "0.969"
## [136] "0.944" "0.937" "0.933" "0.927" "0.921" "0.914" "0.908" "0.903" "0.899"
## [145] "0.884" "0.883" "0.881" "0.879" "0.873" "0.869" "0.861" "0.859" "0.854"
## [154] "0.851" "0.849" "0.843" "0.838" "0.834" "0.829" "0.825" "0.821" "0.819"
## [163] "0.813" "0.809" "0.803" "0.797" "0.788" "0.781" "0.778" "0.773" "0.771"
## [172] "0.77" "0.768" "0.766" "0.762" "0.755" "0.749" "0.743" "0.741" "0.739"
## [181] "0.75" "0.753" "0.754" "0.752" "0.744" "0.74" "0.742" "0.737" "0.735"
## [190] "0.733" "0.73" "0.731" "0.728" "0.724" "0.722" "0.72" "0.719" "0.716"
## [199] "0.715" "0.714" "0.718" "0.721" "0.717" "0.712" "0.71" "0.709" "0.708"
## [208] "0.706" "0.707" "0.7" "0.655" "0.654" "0.653" "0.652" "0.651" "0.65"
## [217] "0.649" "0.646" "0.644" "0.643" "0.639" "0.637" "0.635" "0.636" "0.634"
## [226] "0.638" "0.64" "0.642" "0.645" "0.659" "0.663" "0.668" "0.672" "0.677"
## [235] "0.682" "0.683" "0.684" "0.685" "0.688" "0.69" "0.692" "0.695" "0.697"
## [244] "0.699" "0.701" "0.702" "0.704" "0.711" "0.713" "0.723" "0.727" "0.729"
## [253] "0.732" "0.748" "0.761" "0.767" "0.782" "0.79" "0.793" "0.802" "0.81"
## [262] "0.822" "0.827" "0.835" "0.84" "0.846" "0.87" "0.876" "0.885" "0.889"
## [271] "0.893" "0.896" "0.898" "0.9" "0.904" "0.905" "0.895" "0.894" "0.891"
## [280] "0.89" "0.888" "0.886" "0.882" "0.88" "0.878" "0.877" "0.942" "0.953"
## [289] "0.956" "0.959" "0.965" "0.972" "0.977" "0.982" "0.985" "0.987" "0.993"
## [298] "1" "1.008" "1.016" "1.025" "1.032" "1.037" "1.043" "1.045" "1.047"
## [307] "1.05" "1.049" "1.046" "1.041" "1.04" "1.039" "1.035" "1.03" "1.031"
## [316] "1.028"
##
## $nr.employed
## [1] "5191" "5228.1" "5195.8" "5176.3" "5099.1" "5076.2" "5017.5" "5023.5"
## [9] "5008.7" "4991.6" "4963.6"
##
## $y
## [1] "no" "yes"
library(dplyr)
library(tidyr)
# Select only character (categorical) columns
categorical_df <- bank_additional_df %>% select(where(is.character))
# Using describe() for summary stats
describe(categorical_df)
## vars n mean sd median trimmed mad min max range
## job* 1 41188 4.72 3.59 3 4.48 2.97 1 12 11
## marital* 2 41188 2.17 0.61 2 2.21 0.00 1 4 3
## education* 3 41188 4.75 2.14 4 4.88 2.97 1 8 7
## default* 4 41188 1.21 0.41 1 1.14 0.00 1 3 2
## housing* 5 41188 2.07 0.99 3 2.09 0.00 1 3 2
## loan* 6 41188 1.33 0.72 1 1.16 0.00 1 3 2
## contact* 7 41188 1.37 0.48 1 1.33 0.00 1 2 1
## month* 8 41188 5.23 2.32 5 5.31 2.97 1 10 9
## day_of_week* 9 41188 3.00 1.40 3 3.01 1.48 1 5 4
## poutcome* 10 41188 1.93 0.36 2 2.00 0.00 1 3 2
## emp.var.rate* 11 41188 7.44 2.95 9 7.90 1.48 1 10 9
## cons.price.idx* 12 41188 15.20 5.56 15 15.28 5.93 1 26 25
## cons.conf.idx* 13 41188 15.66 5.98 18 15.98 5.93 1 26 25
## euribor3m* 14 41188 256.63 68.67 288 270.84 29.65 1 316 315
## nr.employed* 15 41188 8.85 2.45 9 9.25 2.97 1 11 10
## y* 16 41188 1.11 0.32 1 1.02 0.00 1 2 1
## skew kurtosis se
## job* 0.45 -1.39 0.02
## marital* -0.06 -0.34 0.00
## education* -0.24 -1.21 0.01
## default* 1.44 0.07 0.00
## housing* -0.14 -1.95 0.00
## loan* 1.82 1.38 0.00
## contact* 0.56 -1.69 0.00
## month* -0.31 -1.03 0.01
## day_of_week* 0.01 -1.27 0.01
## poutcome* -0.88 3.98 0.00
## emp.var.rate* -0.86 -0.50 0.01
## cons.price.idx* -0.29 -0.40 0.03
## cons.conf.idx* -0.45 -1.07 0.03
## euribor3m* -1.72 2.56 0.34
## nr.employed* -1.21 1.05 0.01
## y* 2.45 4.00 0.00
What is the overall distribution of each variable?
library(ggplot2)
library(dplyr)
library(forcats)
library(viridis)
## Warning: package 'viridis' was built under R version 4.3.3
## Loading required package: viridisLite
# Ensure correct types
bank_additional_df <- bank_additional_df %>%
mutate(
euribor3m = as.numeric(euribor3m), # force to numeric
nr.employed = as.numeric(nr.employed),
emp.var.rate = as.numeric(emp.var.rate),
cons.price.idx = as.numeric(cons.price.idx),
cons.conf.idx = as.numeric(cons.conf.idx)
)
# Separate categorical and numeric columns
categorical_df <- bank_additional_df %>% select(where(is.character))
numeric_df <- bank_additional_df %>% select(where(is.numeric))
# --- Plot categorical variables (one by one) ---
for (col in names(categorical_df)) {
p <- categorical_df %>%
ggplot(aes(x = fct_infreq(.data[[col]]))) +
geom_bar(fill = viridis(1, begin = 0.3, end = 0.8), alpha = 0.8) +
coord_flip() + # flip for readability
theme_minimal() +
theme(
axis.text.y = element_text(size = 9),
axis.title.y = element_blank(),
axis.title.x = element_text(size = 11),
plot.title = element_text(size = 14, face = "bold")
) +
ylab("Frequency") +
ggtitle(paste("Distribution of", col))
print(p)
}
# --- Plot numeric variables (histogram + density) ---
for (col in names(numeric_df)) {
p <- numeric_df %>%
ggplot(aes(x = .data[[col]])) +
geom_histogram(aes(y = after_stat(density)), bins = 40,
fill = viridis(1, begin = 0.3, end = 0.8), color = "white") +
geom_density(color = "red", linewidth = 0.8) +
theme_minimal() +
ggtitle(paste("Distribution of", col)) +
ylab("Density") +
xlab(col)
print(p)
}
# Correlation matrix for numeric columns
numeric_vars <- bank_additional_df[sapply(bank_additional_df, is.numeric)]
cor(numeric_vars, use = "complete.obs")
## age duration campaign pdays previous
## age 1.0000000000 -0.000865705 0.00459358 -0.03436895 0.02436474
## duration -0.0008657050 1.000000000 -0.07169923 -0.04757702 0.02064035
## campaign 0.0045935805 -0.071699226 1.00000000 0.05258357 -0.07914147
## pdays -0.0343689512 -0.047577015 0.05258357 1.00000000 -0.58751386
## previous 0.0243647409 0.020640351 -0.07914147 -0.58751386 1.00000000
## emp.var.rate -0.0003706855 -0.027967884 0.15075381 0.27100417 -0.42048911
## cons.price.idx 0.0008567150 0.005312268 0.12783591 0.07888911 -0.20312997
## cons.conf.idx 0.1293716142 -0.008172873 -0.01373310 -0.09134235 -0.05093635
## euribor3m 0.0107674295 -0.032896656 0.13513251 0.29689911 -0.45449365
## nr.employed -0.0177251319 -0.044703223 0.14409489 0.37260474 -0.50133293
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## age -0.0003706855 0.000856715 0.129371614 0.01076743
## duration -0.0279678845 0.005312268 -0.008172873 -0.03289666
## campaign 0.1507538056 0.127835912 -0.013733099 0.13513251
## pdays 0.2710041743 0.078889109 -0.091342354 0.29689911
## previous -0.4204891094 -0.203129967 -0.050936351 -0.45449365
## emp.var.rate 1.0000000000 0.775334171 0.196041268 0.97224467
## cons.price.idx 0.7753341708 1.000000000 0.058986182 0.68823011
## cons.conf.idx 0.1960412681 0.058986182 1.000000000 0.27768622
## euribor3m 0.9722446712 0.688230107 0.277686220 1.00000000
## nr.employed 0.9069701013 0.522033977 0.100513432 0.94515443
## nr.employed
## age -0.01772513
## duration -0.04470322
## campaign 0.14409489
## pdays 0.37260474
## previous -0.50133293
## emp.var.rate 0.90697010
## cons.price.idx 0.52203398
## cons.conf.idx 0.10051343
## euribor3m 0.94515443
## nr.employed 1.00000000
Are the features (columns) of the dataset correlated?
- age & duration (-0.00087): Essentially zero; the client’s age has no linear relationship with call duration.
- age & campaign (0.0046): Nearly zero; older clients are not contacted more or less often.
- age & pdays (-0.034): Very weak negative correlation; older clients are slightly more likely to have been contacted recently in a previous campaign, but the effect is negligible.
- age & previous (0.024): Essentially no correlation; age does not relate to prior contacts.
- duration & campaign (-0.072): Very weak negative correlation; longer calls are slightly associated with fewer contacts in this campaign.
- duration & pdays (-0.048): Very weak negative correlation; call duration is not meaningfully related to days since last contact.
- duration & previous (0.021): Almost zero; prior contacts do not affect call length.
- campaign & pdays (0.053): Very weak positive correlation; clients contacted longer ago may have slightly more contacts in this campaign.
- campaign & previous (-0.079): Very weak negative correlation; more prior contacts are slightly associated with fewer contacts in this campaign.
- pdays & previous (-0.588): Moderate to strong negative correlation. Note that this is largely structural: pdays = 999 (never contacted) coincides with previous = 0, so the placeholder drives much of the coefficient (checked in the sketch below).
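As a quick check of that last point, the pdays-previous coefficient can be recomputed on only the previously-contacted clients; a sketch (the object name contacted is illustrative):
# Recompute the pdays-previous correlation without the 999 placeholder
contacted <- subset(bank_additional_df, pdays != 999)
cor(contacted$pdays, contacted$previous)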
library(tidyverse)
# bank <- bank_additional_df # your data
bank <- bank_additional_df %>%
mutate(
y = factor(y, levels = c("no","yes")),
y_num = as.numeric(y) - 1
)
numeric_candidates <- c(
"age","duration","campaign","pdays","previous",
"emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed","y_num"
)
bank_num <- bank %>%
mutate(across(all_of(intersect(names(bank), numeric_candidates)),
~ suppressWarnings(as.numeric(.)))) %>%
select(any_of(numeric_candidates)) %>%
select(where(~ !all(is.na(.))))
cmat <- cor(bank_num, use = "complete.obs")
# Long format for ggplot
corr_long <- as.data.frame(as.table(cmat)) %>%
setNames(c("Var1","Var2","value"))
ggplot(corr_long, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2166AC", mid = "white", high = "#B2182B",
midpoint = 0, limits = c(-1, 1), name = "corr") +
geom_text(aes(label = sprintf("%.2f", value)), size = 3) +
labs(title = "Correlation Heatmap (numeric features + y_num)", x = NULL, y = NULL) +
theme_minimal(base_size = 12) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Correlation Heatmap Interpretation
The heatmap displays Pearson correlation coefficients between numeric features (ranging from –1 to +1).
Overall, the heatmap highlights clusters of correlated economic indicators and a few notable structural relationships, but confirms that most client-level features are weakly correlated. This suggests feature engineering (e.g., creating flags or buckets) may be more useful than relying on raw numeric values.
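A minimal sketch of the flag/bucket idea (the new column names are illustrative, not part of the preprocessing pipeline below):
# Derive a structural flag and a bucketed count from raw numeric values
bank_flags <- bank_additional_df %>%
  mutate(
    contacted_before = pdays != 999,   # flag instead of the raw 999 code
    campaign_bucket  = cut(campaign, breaks = c(0, 1, 3, Inf),
                           labels = c("1", "2-3", "4+"))
  )
table(bank_flags$campaign_bucket, bank_flags$y)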
# Install if needed
# install.packages("naniar")
# install.packages("ggplot2")
library(dplyr)
library(ggplot2)
library(naniar)
# Copy dataset
bank_missing <- bank_additional_df
# Recode special placeholders as NA
bank_missing <- bank_missing %>%
mutate(across(where(is.character), ~ na_if(., "unknown"))) %>%
mutate(pdays = ifelse(pdays == 999, NA, pdays))
# Single barplot: percentage of missing values per variable
gg_miss_var(bank_missing, show_pct = TRUE) +
ggtitle("Percentage of Missing Values per Variable") +
ylab("Percentage Missing") +
theme_minimal()
# Outlier Detection
# Load libraries
library(dplyr)
library(tidyr)
library(ggplot2)
library(viridis)
# --- 1) Fix numeric-like character columns safely ---
bank_fixed <- bank_additional_df %>%
mutate(
across(c(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed),
~ as.numeric(.x)) # convert if not already numeric
)
# --- 2) Select numeric columns ---
numeric_df <- bank_fixed %>% select(where(is.numeric))
# --- 3) Reshape to long format ---
numeric_long <- numeric_df %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
# --- 4) Faceted boxplots ---
ggplot(numeric_long, aes(x = variable, y = value, fill = variable)) +
geom_boxplot(outlier.color = "red", outlier.shape = 16, alpha = 0.6) +
facet_wrap(~ variable, scales = "free", ncol = 3) + # separate panels
scale_fill_viridis(discrete = TRUE, guide = "none") +
theme_minimal(base_size = 13) +
labs(title = "Outlier Detection Across Numeric Variables",
x = "", y = "Value") +
theme(
strip.text = element_text(face = "bold", size = 11),
axis.text.x = element_blank(),
axis.ticks.x = element_blank()
)
# Quick Refresher: Boxplot Guide
Note: This figure uses free scales per panel, so don’t compare absolute heights across panels.
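To quantify the red outlier points, a sketch that counts 1.5×IQR-rule outliers per variable, reusing the numeric_df built for the boxplots:
# Count observations outside the 1.5*IQR fences for each numeric column
sapply(numeric_df, function(x) {
  q <- quantile(x, c(0.25, 0.75), na.rm = TRUE)
  fence <- 1.5 * (q[2] - q[1])
  sum(x < q[1] - fence | x > q[2] + fence, na.rm = TRUE)
})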
Even if variables are not correlated, they can be combined into meaningful features, for example:
- An age * job interaction.
- any_loan, combining the housing and personal loan indicators.
- Collapsing emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, and nr.employed into a single economic index (see the sketch below).
Answer in short: yes, even if variables are not correlated, they can be combined into new, meaningful features. Example: height + weight → BMI. In this dataset: housing loan + personal loan → overall debt indicator. Domain knowledge guides feature engineering.
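A sketch of the economic-index idea using the first principal component (prcomp is base R; econ_index is an illustrative name, not part of the pipeline below):
# Collapse the five macro indicators into one standardized index
macro <- bank_additional_df %>%
  select(emp.var.rate, cons.price.idx, cons.conf.idx, euribor3m, nr.employed)
econ_index <- prcomp(macro, center = TRUE, scale. = TRUE)$x[, 1]
summary(econ_index)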
Summary: Campaigns are seasonal, call length matters, and both demographics and macroeconomics influence outcomes.
Summary: duration, campaign, pdays, and previous are skewed; the economic indicators are stable.
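That skewness claim can be verified with psych::skew (psych was loaded earlier); a sketch over the numeric columns:
# Skewness of the count-style variables vs. the macro indicators
sapply(numeric_df[c("duration", "campaign", "pdays", "previous",
                    "emp.var.rate", "euribor3m", "nr.employed")], psych::skew)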
No raw NAs, but several variables use special placeholders.
Summary:
The dataset contains structural missing values rather than random NAs. These should be treated as valid categories or transformed into features (e.g., pdays = 999 → “not previously contacted”), instead of dropping rows, to avoid losing important information.
Planned handling:
- Replace "unknown" with the explicit category missing.
- Treat pdays = 999 as “not previously contacted.”
- Create was_contacted_before (binary) plus pdays_num (numeric when pdays is not 999).
- Combine housing and personal loans into any_loan (the missing level counts as "no").
- Log-transform the skewed variables (duration, campaign, previous).
Final Preprocessing Summary
1. Handled structural missing values.
2. Retained but transformed meaningful outliers.
3. Reduced redundancy in economic indicators.
4. Engineered features (e.g., any_loan, was_contacted_before).
5. Prepared dataset for balanced, interpretable modeling.
library(dplyr)
library(forcats)
library(caret)
library(smotefamily)
# --- 1. Load Data ---
url <- "https://raw.githubusercontent.com/uzmabb182/Data_622/main/Assignment_1_EDA/bank-additional-full.csv"
bank <- read.csv2(url, stringsAsFactors = FALSE)
# --- 2. Clean Data ---
# For the columns listed, turn them into numeric columns
# For all character columns (except y), replace the word 'unknown' with 'missing' and then make them categorical factors
# Turn 999 in the pdays column into proper missing values
# Make sure no duplicate records remain
# The target variable (y) is categorical with two values (no/yes); 'no' should be the first level
num_char_cols <- c("emp.var.rate","cons.price.idx","cons.conf.idx","euribor3m","nr.employed")
df <- bank %>%
mutate(across(all_of(num_char_cols), ~ suppressWarnings(as.numeric(.)))) %>%
mutate(across(where(is.character) & !matches("^y$"),
~ factor(ifelse(. == "unknown", "missing", .)))) %>%
mutate(pdays = ifelse(pdays == 999, NA_integer_, pdays)) %>%
distinct()
df$y <- factor(df$y, levels = c("no","yes"))
# --- 3. Feature Engineering ---
# Checks if the column pdays is missing (NA).
# If missing → assign 0 (never contacted before).
# If not missing → assign 1 (contacted before).
# Stored as integer (0L or 1L).
df <- df %>%
mutate(
was_contacted_before = ifelse(is.na(pdays), 0L, 1L),
pdays_num = ifelse(is.na(pdays), NA_integer_, pdays),
any_loan = ifelse(loan == "yes" | housing == "yes", "yes", "no"),
age_group = case_when(
age < 30 ~ "youth",
age <= 50 ~ "middle",
TRUE ~ "senior"
),
duration_contact = duration * ifelse(contact == "cellular", 1, 0),
duration_log = log1p(duration),
campaign_log = log1p(campaign),
previous_log = log1p(previous)
)
# --- 4. Split target and predictors ---
y <- df$y
predictors <- df %>% select(-y)
# --- 5. One-hot encode + impute + scale (recipe-style with caret) ---
dmy <- dummyVars(" ~ .", data = predictors)
X <- data.frame(predict(dmy, newdata = predictors))
# Impute + scale
pre <- preProcess(X, method = c("medianImpute","center","scale"))
X_imp <- predict(pre, X)
# Check consistency
stopifnot(nrow(X_imp) == length(y))
cat("Rows in X_imp:", nrow(X_imp), " | Rows in y:", length(y), "\n")
## Rows in X_imp: 41176 | Rows in y: 41176
# --- 6. Handle Imbalance ---
## Option A: SMOTE
set.seed(123)
sm <- SMOTE(X_imp, y, K = 5)
df_smote <- sm$data
df_smote$y <- factor(df_smote$class, levels = c("no","yes"))
df_smote$class <- NULL
table(df_smote$y)
##
## no yes
## 36537 32473
## Option B: Upsampling
set.seed(123)
up <- upSample(x = X_imp, y = y, yname = "y")
table(up$y)
##
## no yes
## 36537 36537
The following steps were applied to prepare the dataset for modeling:
- Replaced "unknown" text with "missing" in the categorical columns.
- Recoded pdays = 999 into proper missing values (NA).
- Made y a categorical factor with levels "no" and "yes".
- Used caret::dummyVars() for one-hot encoding; this approach is safer and preserves all rows.
- Applied preProcess() after dummy encoding to handle imputation (e.g., pdays_num) before scaling and centering.
- Confirmed consistency: nrow(X_imp) = length(y) = 41,176 rows.
# visual checks to confirm class balance before and after SMOTE / Upsampling.
library(ggplot2)
# --- Original balance ---
orig_tbl <- as.data.frame(table(y))
colnames(orig_tbl) <- c("Class","Count")
ggplot(orig_tbl, aes(x = Class, y = Count, fill = Class)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = Count), vjust = -0.5) +
labs(title = "Class Balance - Original Data", y = "Count") +
theme_minimal()
# --- After SMOTE ---
smote_tbl <- as.data.frame(table(df_smote$y))
colnames(smote_tbl) <- c("Class","Count")
ggplot(smote_tbl, aes(x = Class, y = Count, fill = Class)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = Count), vjust = -0.5) +
labs(title = "Class Balance - After SMOTE", y = "Count") +
theme_minimal()
# --- After Upsampling ---
up_tbl <- as.data.frame(table(up$y))
colnames(up_tbl) <- c("Class","Count")
ggplot(up_tbl, aes(x = Class, y = Count, fill = Class)) +
geom_col(show.legend = FALSE) +
geom_text(aes(label = Count), vjust = -0.5) +
labs(title = "Class Balance - After Upsampling", y = "Count") +
theme_minimal()
# Split into Train/Test
set.seed(123)
trainIndex <- createDataPartition(y, p = 0.7, list = FALSE)
# Original
X_train <- X_imp[trainIndex, ]
X_test <- X_imp[-trainIndex, ]
y_train <- y[trainIndex]
y_test <- y[-trainIndex]
# Train Logistic Regression (Original)
model_orig <- glm(y_train ~ ., data = data.frame(y_train, X_train), family = binomial)
pred_orig <- predict(model_orig, newdata = X_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
pred_class_orig <- ifelse(pred_orig > 0.5, "yes", "no")
confusionMatrix(factor(pred_class_orig, levels=c("no","yes")), y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 10652 775
## yes 309 616
##
## Accuracy : 0.9122
## 95% CI : (0.9071, 0.9172)
## No Information Rate : 0.8874
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4857
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9718
## Specificity : 0.4428
## Pos Pred Value : 0.9322
## Neg Pred Value : 0.6659
## Prevalence : 0.8874
## Detection Rate : 0.8624
## Detection Prevalence : 0.9251
## Balanced Accuracy : 0.7073
##
## 'Positive' Class : no
##
# Train Logistic Regression (SMOTE)
# Split SMOTE data
set.seed(123)
trainIndex_smote <- createDataPartition(df_smote$y, p = 0.7, list = FALSE)
train_smote <- df_smote[trainIndex_smote, ]
test_smote <- df_smote[-trainIndex_smote, ]
model_smote <- glm(y ~ ., data = train_smote, family = binomial)
pred_smote <- predict(model_smote, newdata = test_smote, type = "response")
pred_class_smote <- ifelse(pred_smote > 0.5, "yes", "no")
confusionMatrix(factor(pred_class_smote, levels=c("no","yes")), test_smote$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 9497 962
## yes 1464 8779
##
## Accuracy : 0.8828
## 95% CI : (0.8784, 0.8872)
## No Information Rate : 0.5295
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7655
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8664
## Specificity : 0.9012
## Pos Pred Value : 0.9080
## Neg Pred Value : 0.8571
## Prevalence : 0.5295
## Detection Rate : 0.4587
## Detection Prevalence : 0.5052
## Balanced Accuracy : 0.8838
##
## 'Positive' Class : no
##
# Train Logistic Regression (Upsampled)
# Split Upsampled data
set.seed(123)
trainIndex_up <- createDataPartition(up$y, p = 0.7, list = FALSE)
train_up <- up[trainIndex_up, ]
test_up <- up[-trainIndex_up, ]
model_up <- glm(y ~ ., data = train_up, family = binomial)
pred_up <- predict(model_up, newdata = test_up, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
pred_class_up <- ifelse(pred_up > 0.5, "yes", "no")
confusionMatrix(factor(pred_class_up, levels=c("no","yes")), test_up$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 9355 1131
## yes 1606 9830
##
## Accuracy : 0.8751
## 95% CI : (0.8707, 0.8795)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7503
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8535
## Specificity : 0.8968
## Pos Pred Value : 0.8921
## Neg Pred Value : 0.8596
## Prevalence : 0.5000
## Detection Rate : 0.4267
## Detection Prevalence : 0.4783
## Balanced Accuracy : 0.8751
##
## 'Positive' Class : no
##
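Before interpreting, a side-by-side of the "yes" recall from the three confusion matrices above; a sketch that reuses the objects already created, setting positive = "yes" so Sensitivity refers to the subscriber class:
# Compare "yes" recall across original, SMOTE, and upsampled models
cm_orig  <- confusionMatrix(factor(pred_class_orig,  levels = c("no","yes")),
                            y_test, positive = "yes")
cm_smote <- confusionMatrix(factor(pred_class_smote, levels = c("no","yes")),
                            test_smote$y, positive = "yes")
cm_up    <- confusionMatrix(factor(pred_class_up,    levels = c("no","yes")),
                            test_up$y, positive = "yes")
c(original = cm_orig$byClass[["Sensitivity"]],
  smote    = cm_smote$byClass[["Sensitivity"]],
  upsample = cm_up$byClass[["Sensitivity"]])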
"yes"
is poor because the model
predicts "no"
most of the time due to class imbalance."yes"
improves since synthetic examples
balance the dataset.1. Logistic Regression
- Pros: interpretable coefficients, probability outputs,
efficient.
- Cons: assumes linear log-odds, requires preprocessing, may miss
nonlinearities.
2. Decision Tree
- Pros: handles numeric/categorical, captures interactions, intuitive
rules.
- Cons: prone to overfitting, less stable, lower accuracy than
ensembles.
3. Naïve Bayes (secondary)
- Pros: fast, works with categorical data, probabilistic.
- Cons: assumes independence, less accurate on structured business
data.
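For reference, a sketch of how the secondary Naïve Bayes model could be fit on the same matrices (e1071::naiveBayes is one common implementation; it was not run in this analysis):
# Fit and evaluate Naive Bayes on the original train/test split
library(e1071)
nb_model <- naiveBayes(x = X_train, y = y_train)
pred_nb  <- predict(nb_model, newdata = X_test)
confusionMatrix(pred_nb, y_test)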
Are there labels?
Yes → y (yes/no). It’s supervised classification.
How does algorithm choice relate to data?
Large, imbalanced dataset with mixed variables → logistic regression +
decision tree fit well.
What if dataset < 1,000 records?
Prefer simpler models (logistic regression, LDA, Naïve Bayes). Decision
trees may be unstable.
I recommend Logistic Regression as the primary model for interpretability and alignment with business context, with Decision Trees as a complementary method to capture nonlinear patterns.
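A sketch of the complementary decision tree on the same split (rpart is one standard choice; the defaults here are untuned and illustrative):
# Fit a classification tree and evaluate on the held-out test set
library(rpart)
tree_model <- rpart(y ~ ., data = data.frame(y = y_train, X_train),
                    method = "class")
pred_tree <- predict(tree_model, newdata = X_test, type = "class")
confusionMatrix(pred_tree, y_test)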