#load relevant libraries
suppressWarnings(suppressMessages({library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(ggmosaic)
library(caret)
library(e1071)
library(DMwR2)}))
# Load data ----
# Both CSV files are read from the same directory; build each path from a
# single location so it only has to be changed in one place.
data_dir <- "/Users/mollysiebecker/DATA 622"
bank_addtl_full <- read.csv(file.path(data_dir, "bank-additional-full.csv"))
bank_full <- read.csv(file.path(data_dir, "bank-full.csv"))
Exploratory Data Analysis
# Inspect the structure of the raw data: column names, types, first values.
str(object = bank_addtl_full)
## 'data.frame': 41188 obs. of 21 variables:
## $ age : int 56 57 37 40 56 45 59 41 24 25 ...
## $ job : chr "housemaid" "services" "services" "admin." ...
## $ marital : chr "married" "married" "married" "married" ...
## $ education : chr "basic.4y" "high.school" "high.school" "basic.6y" ...
## $ default : chr "no" "unknown" "no" "no" ...
## $ housing : chr "no" "no" "yes" "no" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "telephone" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "may" "may" ...
## $ day_of_week : chr "mon" "mon" "mon" "mon" ...
## $ duration : int 261 149 226 151 307 198 139 217 380 50 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
## $ cons.price.idx: num 94 94 94 94 94 ...
## $ cons.conf.idx : num -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
## $ euribor3m : num 4.86 4.86 4.86 4.86 4.86 ...
## $ nr.employed : num 5191 5191 5191 5191 5191 ...
## $ y : chr "no" "no" "no" "no" ...
# Convert every character column to a factor so the categorical columns are
# summarised by level.
bank_addtl_full <- bank_addtl_full %>%
  mutate(across(where(is.character), as.factor))

# Split the predictors by column type rather than by hard-coded column
# position, so the split stays correct if the column order ever changes.
# Quantitative predictors: every integer/double column.
bank_addtl_quant_full <- bank_addtl_full %>%
  select(where(is.numeric))
# Categorical predictors: every factor column except `y`, which the original
# positional selection also left out (presumably the response -- confirm).
bank_addtl_cat_full <- bank_addtl_full %>%
  select(where(is.factor), -any_of("y"))
Quantitative Predictors
# Min, quartiles, mean and max of each quantitative predictor.
bank_addtl_quant_full %>%
  summary()
## age duration campaign pdays
## Min. :17.00 Min. : 0.0 Min. : 1.000 Min. : 0.0
## 1st Qu.:32.00 1st Qu.: 102.0 1st Qu.: 1.000 1st Qu.:999.0
## Median :38.00 Median : 180.0 Median : 2.000 Median :999.0
## Mean :40.02 Mean : 258.3 Mean : 2.568 Mean :962.5
## 3rd Qu.:47.00 3rd Qu.: 319.0 3rd Qu.: 3.000 3rd Qu.:999.0
## Max. :98.00 Max. :4918.0 Max. :56.000 Max. :999.0
## previous emp.var.rate cons.price.idx cons.conf.idx
## Min. :0.000 Min. :-3.40000 Min. :92.20 Min. :-50.8
## 1st Qu.:0.000 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7
## Median :0.000 Median : 1.10000 Median :93.75 Median :-41.8
## Mean :0.173 Mean : 0.08189 Mean :93.58 Mean :-40.5
## 3rd Qu.:0.000 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4
## Max. :7.000 Max. : 1.40000 Max. :94.77 Max. :-26.9
## euribor3m nr.employed
## Min. :0.634 Min. :4964
## 1st Qu.:1.344 1st Qu.:5099
## Median :4.857 Median :5191
## Mean :3.621 Mean :5167
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
# Reshape the quantitative predictors to long format: one row per
# (predictor, value) pair, so all of them can be faceted in a single plot.
bank_addtl_quant_full_long <- pivot_longer(
  bank_addtl_quant_full,
  cols = 1:10,
  names_to = "predictor",
  values_to = "value"
)

# One horizontal box plot per predictor; each panel gets its own x scale
# because the predictors are measured on very different ranges.
ggplot(data = bank_addtl_quant_full_long, mapping = aes(x = value)) +
  facet_wrap(~ predictor, scales = "free_x") +
  geom_boxplot(fill = "cornflowerblue", color = "black") +
  theme_minimal()

# Skewness of each quantitative predictor, as a named numeric vector.
# vapply() iterates over the columns directly and guarantees exactly one
# numeric value per column; apply() would first coerce the whole data frame
# to a matrix.
skewValues <- vapply(bank_addtl_quant_full, skewness, numeric(1))
skewValues
## age duration campaign pdays previous
## 0.7846397 3.2629036 4.7621598 -4.9218314 3.8317631
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## -0.7240428 -0.2308708 0.3031578 -0.7091363 -1.0441863
# Pairwise plot matrix (GGally::ggpairs) of the ten quantitative predictors;
# the console progress bar is switched off.
ggpairs(
  data = bank_addtl_quant_full,
  columns = 1:10,
  progress = FALSE
)

# Column indices of quantitative predictors that caret flags as having
# near-zero variance.
nearZeroVar(x = bank_addtl_quant_full)
## [1] 4
Categorical Predictors
# Level counts for each categorical predictor.
bank_addtl_cat_full %>%
  summary()
## job marital education
## admin. :10422 divorced: 4612 university.degree :12168
## blue-collar: 9254 married :24928 high.school : 9515
## technician : 6743 single :11568 basic.9y : 6045
## services : 3969 unknown : 80 professional.course: 5243
## management : 2924 basic.4y : 4176
## retired : 1720 basic.6y : 2292
## (Other) : 6156 (Other) : 1749
## default housing loan contact
## no :32588 no :18622 no :33950 cellular :26144
## unknown: 8597 unknown: 990 unknown: 990 telephone:15044
## yes : 3 yes :21576 yes : 6248
##
##
##
##
## month day_of_week poutcome
## may :13769 fri:7827 failure : 4252
## jul : 7174 mon:8514 nonexistent:35563
## aug : 6178 thu:8623 success : 1373
## jun : 5318 tue:8090
## nov : 4101 wed:8134
## apr : 2632
## (Other): 2016
# Percentage of observations recorded as "unknown" in each categorical
# predictor; result columns are named pct_unknown_<predictor>.
summarize(
  bank_addtl_cat_full,
  across(
    .cols = everything(),
    .fns = function(column) (sum(column == "unknown") / n()) * 100,
    .names = "pct_unknown_{.col}"
  )
)
## pct_unknown_job pct_unknown_marital pct_unknown_education pct_unknown_default
## 1 0.8012042 0.1942313 4.20268 20.87258
## pct_unknown_housing pct_unknown_loan pct_unknown_contact pct_unknown_month
## 1 2.403613 2.403613 0 0
## pct_unknown_day_of_week pct_unknown_poutcome
## 1 0 0
# Column indices of categorical predictors that caret flags as having
# near-zero variance.
nearZeroVar(x = bank_addtl_cat_full)
## integer(0)
Data Pre-processing
# Drop pdays (flagged by nearZeroVar() on the quantitative predictors) and
# default (about 21% "unknown", with only 3 "yes" observations).
bank_addtl_full <- select(bank_addtl_full, -c(pdays, default))
# Reduce skewness in the most skewed quantitative predictors.
bank_addtl_full <- bank_addtl_full %>%
  mutate(
    # Right-skewed counts/durations: log transform. duration and previous
    # contain zeros, so 1 is added first; campaign starts at 1 and needs no
    # offset.
    duration = log(duration + 1),
    campaign = log(campaign),
    previous = log(previous + 1),
    # Left-skewed: reflect around the maximum, then take the square root.
    nr.employed = sqrt(max(nr.employed) - nr.employed)
  )
# Recode the "unknown" placeholder as a proper missing value in every column.
bank_addtl_full[bank_addtl_full == "unknown"] <- NA

# Most frequent non-missing value of `x`, returned as a character string.
# table() ignores NA by default; ties resolve to the first level in table
# order.
most_frequent <- function(x) {
  names(which.max(table(x)))
}

# Mode of each categorical column that contained "unknown" values. These are
# kept as individual variables in case later code refers to them.
mode_job <- most_frequent(bank_addtl_full$job)
mode_marital <- most_frequent(bank_addtl_full$marital)
mode_education <- most_frequent(bank_addtl_full$education)
mode_housing <- most_frequent(bank_addtl_full$housing)
mode_loan <- most_frequent(bank_addtl_full$loan)

# Mode imputation: replace each column's missing values with its mode.
impute_values <- c(
  job = mode_job,
  marital = mode_marital,
  education = mode_education,
  housing = mode_housing,
  loan = mode_loan
)
for (col in names(impute_values)) {
  is_missing <- is.na(bank_addtl_full[[col]])
  bank_addtl_full[[col]][is_missing] <- impute_values[[col]]
}

# Assigning NA does not remove the "unknown" factor level, which would stay
# behind with zero observations. Drop unused levels so it is not carried
# along as an empty category.
bank_addtl_full <- droplevels(bank_addtl_full)