#load relevant libraries
suppressWarnings(suppressMessages({library(dplyr)
library(tidyr)
library(ggplot2)
library(GGally)
library(ggmosaic)
library(caret)
library(e1071)
library(DMwR2)}))
#load data
#both CSV files sit in the same folder, so the directory is spelled out once
data_dir <- "/Users/mollysiebecker/DATA 622"
bank_addtl_full <- read.csv(file.path(data_dir, "bank-additional-full.csv"))
bank_full <- read.csv(file.path(data_dir, "bank-full.csv"))

Exploratory Data Analysis

#inspect each column's name, type and first few values
bank_addtl_full %>% str()
## 'data.frame':    41188 obs. of  21 variables:
##  $ age           : int  56 57 37 40 56 45 59 41 24 25 ...
##  $ job           : chr  "housemaid" "services" "services" "admin." ...
##  $ marital       : chr  "married" "married" "married" "married" ...
##  $ education     : chr  "basic.4y" "high.school" "high.school" "basic.6y" ...
##  $ default       : chr  "no" "unknown" "no" "no" ...
##  $ housing       : chr  "no" "no" "yes" "no" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "telephone" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "may" "may" ...
##  $ day_of_week   : chr  "mon" "mon" "mon" "mon" ...
##  $ duration      : int  261 149 226 151 307 198 139 217 380 50 ...
##  $ campaign      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 1.1 ...
##  $ cons.price.idx: num  94 94 94 94 94 ...
##  $ cons.conf.idx : num  -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 -36.4 ...
##  $ euribor3m     : num  4.86 4.86 4.86 4.86 4.86 ...
##  $ nr.employed   : num  5191 5191 5191 5191 5191 ...
##  $ y             : chr  "no" "no" "no" "no" ...
#convert character vectors to factors
bank_addtl_full <- bank_addtl_full %>%
  mutate(across(where(is.character), as.factor))
#create data frames of quantitative and categorical predictors
#columns are picked by type instead of by position, so the split does not
#depend on the exact column order of the CSV
bank_addtl_quant_full <- bank_addtl_full %>%
  select(where(is.numeric))
#the response y is a factor as well, so it is excluded from the predictors
bank_addtl_cat_full <- bank_addtl_full %>%
  select(where(is.factor), -y)

Quantitative Predictors

#min, quartiles, mean and max of every quantitative predictor
bank_addtl_quant_full %>% summary()
##       age           duration         campaign          pdays      
##  Min.   :17.00   Min.   :   0.0   Min.   : 1.000   Min.   :  0.0  
##  1st Qu.:32.00   1st Qu.: 102.0   1st Qu.: 1.000   1st Qu.:999.0  
##  Median :38.00   Median : 180.0   Median : 2.000   Median :999.0  
##  Mean   :40.02   Mean   : 258.3   Mean   : 2.568   Mean   :962.5  
##  3rd Qu.:47.00   3rd Qu.: 319.0   3rd Qu.: 3.000   3rd Qu.:999.0  
##  Max.   :98.00   Max.   :4918.0   Max.   :56.000   Max.   :999.0  
##     previous      emp.var.rate      cons.price.idx  cons.conf.idx  
##  Min.   :0.000   Min.   :-3.40000   Min.   :92.20   Min.   :-50.8  
##  1st Qu.:0.000   1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7  
##  Median :0.000   Median : 1.10000   Median :93.75   Median :-41.8  
##  Mean   :0.173   Mean   : 0.08189   Mean   :93.58   Mean   :-40.5  
##  3rd Qu.:0.000   3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4  
##  Max.   :7.000   Max.   : 1.40000   Max.   :94.77   Max.   :-26.9  
##    euribor3m      nr.employed  
##  Min.   :0.634   Min.   :4964  
##  1st Qu.:1.344   1st Qu.:5099  
##  Median :4.857   Median :5191  
##  Mean   :3.621   Mean   :5167  
##  3rd Qu.:4.961   3rd Qu.:5228  
##  Max.   :5.045   Max.   :5228
#pivot quantitative predictors to long format
#everything() stacks whichever columns the quantitative frame holds, so the
#call does not rely on there being exactly ten of them
bank_addtl_quant_full_long <- bank_addtl_quant_full %>%
  pivot_longer(
    cols = everything(),
    names_to = "predictor",
    values_to = "value"
  )
#create box plots
#one panel per predictor; scales = "free_x" gives each panel its own value axis
ggplot(bank_addtl_quant_full_long, aes(x = value)) +
  geom_boxplot(fill = "cornflowerblue", color = "black") +
  facet_wrap(~ predictor, scales = "free_x") +
  theme_minimal()

#calculate skewness
#vapply() iterates over the columns directly and requires one numeric value
#per column; apply() would first coerce the whole data frame to a matrix
skewValues <- vapply(bank_addtl_quant_full, skewness, numeric(1))
skewValues
##            age       duration       campaign          pdays       previous 
##      0.7846397      3.2629036      4.7621598     -4.9218314      3.8317631 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##     -0.7240428     -0.2308708      0.3031578     -0.7091363     -1.0441863
#pairs plot of the ten quantitative predictors (progress bar switched off)
bank_addtl_quant_full %>%
  ggpairs(columns = 1:10, progress = FALSE)

#flag quantitative predictors with near zero variance
#(the result is a vector of column positions within this data frame)
bank_addtl_quant_full %>% nearZeroVar()
## [1] 4

Categorical Variables

#level counts for every categorical predictor
bank_addtl_cat_full %>% summary()
##           job            marital                    education    
##  admin.     :10422   divorced: 4612   university.degree  :12168  
##  blue-collar: 9254   married :24928   high.school        : 9515  
##  technician : 6743   single  :11568   basic.9y           : 6045  
##  services   : 3969   unknown :   80   professional.course: 5243  
##  management : 2924                    basic.4y           : 4176  
##  retired    : 1720                    basic.6y           : 2292  
##  (Other)    : 6156                    (Other)            : 1749  
##     default         housing           loan            contact     
##  no     :32588   no     :18622   no     :33950   cellular :26144  
##  unknown: 8597   unknown:  990   unknown:  990   telephone:15044  
##  yes    :    3   yes    :21576   yes    : 6248                    
##                                                                   
##                                                                   
##                                                                   
##                                                                   
##      month       day_of_week        poutcome    
##  may    :13769   fri:7827    failure    : 4252  
##  jul    : 7174   mon:8514    nonexistent:35563  
##  aug    : 6178   thu:8623    success    : 1373  
##  jun    : 5318   tue:8090                       
##  nov    : 4101   wed:8134                       
##  apr    : 2632                                  
##  (Other): 2016
#percentage of "unknown" entries in each categorical predictor
bank_addtl_cat_full %>%
  summarize(
    across(
      everything(),
      function(col) (sum(col == "unknown") / length(col)) * 100,
      .names = "pct_unknown_{.col}"
    )
  )
##   pct_unknown_job pct_unknown_marital pct_unknown_education pct_unknown_default
## 1       0.8012042           0.1942313               4.20268            20.87258
##   pct_unknown_housing pct_unknown_loan pct_unknown_contact pct_unknown_month
## 1            2.403613         2.403613                   0                 0
##   pct_unknown_day_of_week pct_unknown_poutcome
## 1                       0                    0
#flag categorical predictors with near zero variance
#(the result is a vector of column positions within this data frame)
bank_addtl_cat_full %>% nearZeroVar()
## integer(0)

Data Pre-processing

#drop pdays and default, then transform the skewed predictors in one pipeline
#  duration, previous: log(x + 1), since both columns contain zeros
#  campaign: plain log, its smallest value is 1
#  nr.employed: reflect about the maximum, then take the square root
bank_addtl_full <- bank_addtl_full %>%
  select(-c(pdays, default)) %>%
  mutate(
    duration = log(duration + 1),
    campaign = log(campaign),
    previous = log(previous + 1),
    nr.employed = sqrt(max(nr.employed) - nr.employed)
  )
#convert 'unknown' to NA
bank_addtl_full[bank_addtl_full == "unknown"] <- NA
#mode imputation to fill in missing values

#most frequent observed level of a factor (the first one wins on ties)
#  x: a factor that may contain NA; table() leaves NA out of its counts
#  returns: a length-1 character, or NA_character_ when x has no observed value
mode_level <- function(x) {
  level_counts <- table(x)
  if (sum(level_counts) == 0) {
    return(NA_character_)
  }
  names(level_counts)[which.max(level_counts)]
}

#the categorical predictors that had 'unknown' entries recoded to NA
impute_cols <- c("job", "marital", "education", "housing", "loan")
mode_levels <- vapply(bank_addtl_full[impute_cols], mode_level, character(1))

for (col in impute_cols) {
  filled <- bank_addtl_full[[col]]
  filled[is.na(filled)] <- mode_levels[[col]]
  #"unknown" is still registered as a level with zero rows after the recode;
  #drop it so it cannot show up later as an empty category
  bank_addtl_full[[col]] <- droplevels(filled)
}

#per-column mode variables kept in case code outside this section uses them
mode_job <- mode_levels[["job"]]
mode_marital <- mode_levels[["marital"]]
mode_education <- mode_levels[["education"]]
mode_housing <- mode_levels[["housing"]]
mode_loan <- mode_levels[["loan"]]