Bank Marketing Prediction Project

library(tidyverse)  
library(caret)      
library(ggplot2)    
library(dplyr)
library(car)
library(DMwR2)
library(ROSE)
library(corrplot)
library(reshape2)
library(vcd)
library(pROC)
library(knitr)

1. Load and Inspect Data

bank <- read.csv("C:/Users/yuan1/Downloads/bank-additional.csv", 
                 sep = ";", header = TRUE, stringsAsFactors = TRUE)

Check the structure of the dataset

str(bank)

## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

table(bank$default)

## 
##      no unknown     yes 
##    3315     803       1

# Replace "yes" with "unknown" in the entire dataset (bank)
bank$default[bank$default == "yes"] <- "unknown"

# Ensure factor levels remain consistent
bank$default <- factor(bank$default, levels = c("no", "unknown"))

# Check if the transformation was successful
table(bank$default)

## 
##      no unknown 
##    3315     804

table(bank$housing)

## 
##      no unknown     yes 
##    1839     105    2175

# Replace "unknown" with "no" in the housing column
bank$housing[bank$housing == "unknown"] <- "no"

# Ensure housing remains a factor with correct levels
bank$housing <- factor(bank$housing, levels = c("no", "yes"))

# Check the transformation
table(bank$housing)

## 
##   no  yes 
## 1944 2175

table(bank$loan)

## 
##      no unknown     yes 
##    3349     105     665

# Replace "unknown" with "yes" in the loan column
bank$loan[bank$loan == "unknown"] <- "yes"

# Ensure loan remains a factor with correct levels
bank$loan <- factor(bank$loan, levels = c("no", "yes"))

# Check the transformation
table(bank$loan)

## 
##   no  yes 
## 3349  770

table(bank$previous)

## 
##    0    1    2    3    4    5    6 
## 3523  475   78   25   14    2    2

# Convert previous to a binary factor: "no" (0) and "yes" (>0)
bank$previous <- ifelse(bank$previous == 0, "no", "yes")

# Convert to a factor with correct levels
bank$previous <- factor(bank$previous, levels = c("no", "yes"))

# Check the transformation
table(bank$previous)

## 
##   no  yes 
## 3523  596

table(bank$y)  # Check target variable distribution

## 
##   no  yes 
## 3668  451

str(bank)

## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 1 1 2 1 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

 # Convert pdays to character before applying ifelse()
bank$pdays <- as.character(bank$pdays)

# Recode pdays into two levels (1 = contacted, 0 = not contacted)
bank$pdays <- ifelse(bank$pdays == "999", "Not Contacted", "Contacted")

# Convert back to a factor with correct levels
bank$pdays <- factor(bank$pdays, levels = c("Not Contacted", "Contacted"))

# Check the transformation
table(bank$pdays)

## 
## Not Contacted     Contacted 
##          3959           160

2. Data Cleaning

bank <- bank%>% dplyr::select(-duration)  # Remove 'duration' column (leakage issue)
sum(is.na(bank))  # Check for missing values

## [1] 0

sum(duplicated(bank))  # Check for duplicate entries

## [1] 20

duplicates <- bank[duplicated(bank), ]  # Identify duplicate rows
bank <- bank[!duplicated(bank), ]  # Remove duplicate rows
bank <- bank %>% mutate_if(is.character, as.factor)  # Convert character columns to factors

Convert ‘pdays’ where 999 means ‘Not Contacted’

bank$pdays <- ifelse(bank$pdays == 999, "Not Contacted", as.character(bank$pdays))
bank$pdays <- factor(bank$pdays)  # Convert to categorical variable

3. Exploratory Data Analysis

# Target variable distribution

ggplot(bank, aes(x = factor(y))) +
  geom_bar(fill = c("blue", "orange")) +
  labs(title = "Target Variable Distribution (y)", x = "Subscription to Term Deposit", y = "Count")

Visualizing numerical variable distributions

numeric_vars <- bank[, sapply(bank, is.numeric)]  # Select numeric variables
numeric_melted <- melt(numeric_vars)

ggplot(numeric_melted, aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  facet_wrap(~variable, scales = "free") +  # Separate subplots for each numeric variable
  labs(title = "Distribution of Numeric Variables", x = "Value", y = "Count") +
  theme_minimal()

table(bank$nr.employed)

## 
## 4963.6 4991.6 5008.7 5017.5 5023.5 5076.2 5099.1 5176.3   5191 5195.8 5228.1 
##     83     86     60    104     21    163    817      1    755    391   1618

table(bank$previous)

## 
##   no  yes 
## 3505  594

bank$campaign = log(bank$campaign)
bank$nr.employed = log(bank$nr.employed)

numeric<- bank %>% select(where(is.numeric)) 

par(mfrow = c(3, 3))  
for (var in names(numeric)) {
  qqnorm(numeric[[var]], main = paste("Q-Q Plot:", var))
  qqline(numeric[[var]], col = "red", lwd = 2)
}
par(mfrow = c(1, 1))

Correlation matrix analysis

cor_matrix <- cor(numeric, use = "complete.obs")
print(cor_matrix)

##                         age     campaign emp.var.rate cons.price.idx
## age             1.000000000 -0.013966224  -0.01938708   -0.001169426
## campaign       -0.013966224  1.000000000   0.17904011    0.146428660
## emp.var.rate   -0.019387084  0.179040112   1.00000000    0.755579401
## cons.price.idx -0.001169426  0.146428660   0.75557940    1.000000000
## cons.conf.idx   0.097893854  0.002892008   0.19240814    0.043520420
## euribor3m      -0.015570727  0.159699049   0.97025583    0.657353850
## nr.employed    -0.042945740  0.160648015   0.89491092    0.469223893
##                cons.conf.idx   euribor3m nr.employed
## age              0.097893854 -0.01557073 -0.04294574
## campaign         0.002892008  0.15969905  0.16064801
## emp.var.rate     0.192408143  0.97025583  0.89491092
## cons.price.idx   0.043520420  0.65735385  0.46922389
## cons.conf.idx    1.000000000  0.27434355  0.10314358
## euribor3m        0.274343550  1.00000000  0.94113103
## nr.employed      0.103143579  0.94113103  1.00000000

kable(cor_matrix, digits = 2, caption = "Numerical Variable Correlation Matrix")

Numerical Variable Correlation Matrix
	age	campaign	emp.var.rate	cons.price.idx	cons.conf.idx	euribor3m	nr.employed
age	1.00	-0.01	-0.02	0.00	0.10	-0.02	-0.04
campaign	-0.01	1.00	0.18	0.15	0.00	0.16	0.16
emp.var.rate	-0.02	0.18	1.00	0.76	0.19	0.97	0.89
cons.price.idx	0.00	0.15	0.76	1.00	0.04	0.66	0.47
cons.conf.idx	0.10	0.00	0.19	0.04	1.00	0.27	0.10
euribor3m	-0.02	0.16	0.97	0.66	0.27	1.00	0.94
nr.employed	-0.04	0.16	0.89	0.47	0.10	0.94	1.00

Correlation heatmap

corrplot(cor_matrix, method = "color", type = "upper", 
         tl.cex = 0.8, cl.cex = 0.8, addCoef.col = "black")

Check correlations with target variable

table(bank$pdays)  # Distribution of 'pdays'

## 
##     Contacted Not Contacted 
##           159          3940

cor(bank$nr.employed, as.numeric(bank$y))  # Correlation with employment rate

## [1] -0.3509202

cor(bank$euribor3m, as.numeric(bank$y))  # Correlation with euribor3m

## [1] -0.2990494

Remove highly correlated variable

bank <- bank%>% dplyr::select(-euribor3m)
bank <- bank%>% dplyr::select(-emp.var.rate)
str(bank)

## 'data.frame':    4099 obs. of  18 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ campaign      : num  0.693 1.386 0 1.099 0 ...
##  $ pdays         : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
##  $ previous      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 1 1 2 1 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ nr.employed   : num  8.54 8.55 8.56 8.56 8.56 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

4. Feature Engineering (Handling Categorical Variables)

Selecting categorical variables for correlation analysis

categorical_vars <- c("job", "marital", "education", "default", "housing", 
                      "loan", "contact", "month", "day_of_week", "poutcome","previous")

# Compute Cramér's V correlation matrix
cramer_matrix <- matrix(NA, nrow = length(categorical_vars), ncol = length(categorical_vars))
rownames(cramer_matrix) <- categorical_vars
colnames(cramer_matrix) <- categorical_vars

for (i in 1:length(categorical_vars)) {
  for (j in 1:length(categorical_vars)) {
    if (i != j) {
      test_result <- assocstats(table(bank[[categorical_vars[i]]], bank[[categorical_vars[j]]]))
      cramer_matrix[i, j] <- test_result$cramer
    } else {
      cramer_matrix[i, j] <- 1  # Self-correlation is 1
    }
  }
}

print(cramer_matrix)  # Display Cramér's V correlation matrix

##                    job    marital  education     default     housing
## job         1.00000000 0.18346913 0.35478377 0.213070513 0.065990080
## marital     0.18346913 1.00000000 0.10711052 0.133882772 0.012968908
## education   0.35478377 0.10711052 1.00000000 0.242634924 0.053500343
## default     0.21307051 0.13388277 0.24263492 1.000000000 0.012871484
## housing     0.06599008 0.01296891 0.05350034 0.012871484 1.000000000
## loan        0.04251641 0.01624679 0.03803769 0.009700914 0.005030888
## contact     0.13066296 0.08594081 0.14956618 0.149818184 0.074343811
## month       0.11730417 0.06072848 0.09803143 0.160989212 0.090030158
## day_of_week 0.05609785 0.03293065 0.05310208 0.040349083 0.015879748
## poutcome    0.10430784 0.03602368 0.05179062 0.104420912 0.024694557
## previous    0.11136748 0.04556905 0.04381594 0.091649482 0.018613013
##                    loan    contact      month day_of_week    poutcome
## job         0.042516407 0.13066296 0.11730417  0.05609785 0.104307840
## marital     0.016246794 0.08594081 0.06072848  0.03293065 0.036023683
## education   0.038037688 0.14956618 0.09803143  0.05310208 0.051790623
## default     0.009700914 0.14981818 0.16098921  0.04034908 0.104420912
## housing     0.005030888 0.07434381 0.09003016  0.01587975 0.024694557
## loan        1.000000000 0.00696911 0.03635633  0.02864837 0.009674543
## contact     0.006969110 1.00000000 0.60390024  0.06594183 0.251967096
## month       0.036356333 0.60390024 1.00000000  0.08660851 0.251417412
## day_of_week 0.028648374 0.06594183 0.08660851  1.00000000 0.026299807
## poutcome    0.009674543 0.25196710 0.25141741  0.02629981 1.000000000
## previous    0.009654427 0.25189023 0.31656704  0.01502844 1.000000000
##                previous
## job         0.111367478
## marital     0.045569047
## education   0.043815945
## default     0.091649482
## housing     0.018613013
## loan        0.009654427
## contact     0.251890226
## month       0.316567036
## day_of_week 0.015028444
## poutcome    1.000000000
## previous    1.000000000

Ensure categorical variables are correctly encoded

print(levels(bank$housing))

## [1] "no"  "yes"

print(levels(bank$loan))

## [1] "no"  "yes"

print(levels(bank$previous))

## [1] "no"  "yes"

Convert ‘unknown’ values to ‘no’ in housing and loan categories

bank$housing <- as.character(bank$housing)
bank$loan <- as.character(bank$loan)
bank$previous <- as.character(bank$previous)

bank$housing[bank$housing == "unknown"] <- "no"
bank$loan[bank$loan == "unknown"] <- "no"

bank$housing <- factor(bank$housing, levels = c("no", "yes"))
bank$loan <- factor(bank$loan, levels = c("no", "yes"))

table(bank$loan)

## 
##   no  yes 
## 3330  769

table(bank$housing)

## 
##   no  yes 
## 1935 2164

table(bank$previous)

## 
##   no  yes 
## 3505  594

5. Model Training and Evaluation

Train logistic regression model

str(bank)

## 'data.frame':    4099 obs. of  18 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ campaign      : num  0.693 1.386 0 1.099 0 ...
##  $ pdays         : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
##  $ previous      : chr  "no" "no" "no" "no" ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ nr.employed   : num  8.54 8.55 8.56 8.56 8.56 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

prev_cramer <- assocstats(table(bank$previous, bank$y))$cramer
print(prev_cramer)

## [1] 0.2079214

pout_cramer <- assocstats(table(bank$poutcome, bank$y))$cramer
print(pout_cramer)

## [1] 0.3340245

bank <- bank%>% dplyr::select(-previous)


model <- glm(y ~ ., data = bank, family = binomial(link="logit"))
vif(model)  # Check for multicollinearity

##                    GVIF Df GVIF^(1/(2*Df))
## age            2.060339  1        1.435388
## job            5.685170 11        1.082197
## marital        1.495544  3        1.069383
## education      3.332629  7        1.089788
## default        1.155510  1        1.074946
## housing        1.029070  1        1.014431
## loan           1.017915  1        1.008918
## contact        2.392374  1        1.546730
## month          6.638016  9        1.110884
## day_of_week    1.118374  4        1.014083
## campaign       1.061717  1        1.030396
## pdays          7.896885  1        2.810140
## poutcome       9.407044  2        1.751311
## cons.price.idx 2.087104  1        1.444681
## cons.conf.idx  2.413512  1        1.553548
## nr.employed    2.252955  1        1.500985

  # Remove highly correlated variable
summary(model)  # View model summary

## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = bank)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0835  -0.3966  -0.3218  -0.2549   2.8704  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  434.193393  43.606103   9.957  < 2e-16 ***
## age                            0.017015   0.006834   2.490  0.01278 *  
## jobblue-collar                -0.319127   0.225008  -1.418  0.15611    
## jobentrepreneur               -0.522427   0.397370  -1.315  0.18861    
## jobhousemaid                  -0.132193   0.393883  -0.336  0.73716    
## jobmanagement                 -0.446153   0.247003  -1.806  0.07088 .  
## jobretired                    -0.288981   0.303659  -0.952  0.34127    
## jobself-employed              -0.610755   0.347980  -1.755  0.07924 .  
## jobservices                   -0.195153   0.238708  -0.818  0.41362    
## jobstudent                    -0.045555   0.354599  -0.128  0.89778    
## jobtechnician                 -0.053453   0.191817  -0.279  0.78050    
## jobunemployed                  0.086801   0.332187   0.261  0.79386    
## jobunknown                    -0.485744   0.637515  -0.762  0.44610    
## maritalmarried                 0.160872   0.201097   0.800  0.42373    
## maritalsingle                  0.290952   0.230125   1.264  0.20612    
## maritalunknown                 0.090024   1.169224   0.077  0.93863    
## educationbasic.6y              0.248739   0.339613   0.732  0.46391    
## educationbasic.9y              0.124738   0.272876   0.457  0.64758    
## educationhigh.school           0.108351   0.261443   0.414  0.67856    
## educationilliterate          -10.895400 324.744015  -0.034  0.97324    
## educationprofessional.course   0.204361   0.285021   0.717  0.47337    
## educationuniversity.degree     0.197260   0.262371   0.752  0.45215    
## educationunknown               0.233198   0.342803   0.680  0.49633    
## defaultunknown                -0.039833   0.177832  -0.224  0.82276    
## housingyes                    -0.091361   0.115550  -0.791  0.42914    
## loanyes                       -0.108607   0.150960  -0.719  0.47187    
## contacttelephone              -0.858310   0.214970  -3.993 6.53e-05 ***
## monthaug                      -0.478165   0.308803  -1.548  0.12151    
## monthdec                       0.540134   0.545760   0.990  0.32233    
## monthjul                       0.002272   0.295253   0.008  0.99386    
## monthjun                       0.623026   0.273656   2.277  0.02281 *  
## monthmar                       1.272880   0.391048   3.255  0.00113 ** 
## monthmay                      -0.496764   0.234422  -2.119  0.03408 *  
## monthnov                      -0.538660   0.289626  -1.860  0.06291 .  
## monthoct                      -0.475359   0.379285  -1.253  0.21010    
## monthsep                      -0.764996   0.395975  -1.932  0.05337 .  
## day_of_weekmon                -0.035505   0.181945  -0.195  0.84528    
## day_of_weekthu                 0.032518   0.182566   0.178  0.85863    
## day_of_weektue                -0.033945   0.187269  -0.181  0.85616    
## day_of_weekwed                 0.150846   0.187855   0.803  0.42198    
## campaign                      -0.155289   0.094264  -1.647  0.09948 .  
## pdaysNot Contacted            -0.637117   0.536926  -1.187  0.23538    
## poutcomenonexistent            0.453375   0.181869   2.493  0.01267 *  
## poutcomesuccess                1.209529   0.549903   2.200  0.02784 *  
## cons.price.idx                 0.133557   0.123535   1.081  0.27964    
## cons.conf.idx                  0.042326   0.015455   2.739  0.00617 ** 
## nr.employed                  -52.336116   4.916288 -10.645  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2837.0  on 4098  degrees of freedom
## Residual deviance: 2208.5  on 4052  degrees of freedom
## AIC: 2302.5
## 
## Number of Fisher Scoring iterations: 11

Split dataset into training (80%) and testing (20%) sets

#bank <- bank%>% dplyr::select(-pdays)#

set.seed(4321)
train_index <- createDataPartition(bank$y, p = 0.8, list = FALSE)
train_data <- bank[train_index, ]
test_data <- bank[-train_index, ]

Define important variables for modeling

# Hybrid sampling (both over- and under-sampling to balance classes)
train_data_hybrid <- ovun.sample(y ~ ., 
                                 data = train_data,  # Use selected features
                                 method = "both",   # Perform both over- and under-sampling
                                 N = nrow(train_data),  # Keep dataset size unchanged
                                 p = 0.6)$data  # Balance 'yes' and 'no'

# Train logistic regression model on resampled data
trainmodel <- glm(y ~ ., data = train_data_hybrid, family = binomial(link="logit"))
summary(trainmodel)  # View model summary

## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train_data_hybrid)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1896  -0.9122   0.2187   0.8315   1.8442  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   4.562e+02  2.342e+02   1.948 0.051408 .  
## age                           3.444e-02  5.356e-03   6.430 1.28e-10 ***
## jobblue-collar                5.022e-02  1.587e-01   0.316 0.751699    
## jobentrepreneur              -6.226e-01  2.516e-01  -2.474 0.013365 *  
## jobhousemaid                  7.452e-01  3.247e-01   2.295 0.021710 *  
## jobmanagement                -5.992e-01  1.940e-01  -3.090 0.002004 ** 
## jobretired                   -3.707e-01  2.434e-01  -1.523 0.127799    
## jobself-employed             -9.098e-01  2.595e-01  -3.506 0.000455 ***
## jobservices                   1.828e-01  1.835e-01   0.997 0.318969    
## jobstudent                    4.814e-02  3.171e-01   0.152 0.879350    
## jobtechnician                 1.358e-01  1.486e-01   0.914 0.360790    
## jobunemployed                 8.508e-02  2.745e-01   0.310 0.756630    
## jobunknown                    4.259e-01  4.403e-01   0.967 0.333483    
## maritalmarried                1.120e-01  1.465e-01   0.764 0.444646    
## maritalsingle                 4.074e-01  1.685e-01   2.418 0.015600 *  
## maritalunknown               -1.546e+01  7.613e+02  -0.020 0.983794    
## educationbasic.6y             3.579e-01  2.564e-01   1.396 0.162768    
## educationbasic.9y             5.713e-01  1.896e-01   3.012 0.002591 ** 
## educationhigh.school          1.158e-01  1.967e-01   0.589 0.556158    
## educationilliterate          -1.561e+01  1.455e+03  -0.011 0.991444    
## educationprofessional.course  7.515e-01  2.127e-01   3.532 0.000412 ***
## educationuniversity.degree    6.565e-01  1.956e-01   3.356 0.000790 ***
## educationunknown              6.683e-01  2.780e-01   2.404 0.016220 *  
## defaultunknown                6.846e-02  1.181e-01   0.580 0.562197    
## housingyes                   -1.164e-01  8.697e-02  -1.338 0.180781    
## loanyes                      -1.562e-01  1.120e-01  -1.395 0.163131    
## contacttelephone             -2.221e-01  1.759e-01  -1.263 0.206602    
## monthaug                     -1.885e-01  2.775e-01  -0.679 0.496964    
## monthdec                      5.790e-01  5.744e-01   1.008 0.313513    
## monthjul                      4.702e-01  2.445e-01   1.923 0.054468 .  
## monthjun                      6.919e-01  2.256e-01   3.067 0.002161 ** 
## monthmar                      1.494e+00  4.438e-01   3.366 0.000763 ***
## monthmay                     -4.765e-01  1.919e-01  -2.483 0.013011 *  
## monthnov                     -3.882e-01  2.374e-01  -1.636 0.101926    
## monthoct                      1.145e+00  4.031e-01   2.841 0.004492 ** 
## monthsep                     -2.125e-01  4.613e-01  -0.461 0.645049    
## day_of_weekmon               -1.297e-01  1.372e-01  -0.946 0.344244    
## day_of_weekthu               -8.213e-02  1.360e-01  -0.604 0.545855    
## day_of_weektue                5.692e-02  1.384e-01   0.411 0.680846    
## day_of_weekwed               -2.012e-02  1.427e-01  -0.141 0.887896    
## campaign                     -2.485e-01  7.123e-02  -3.489 0.000485 ***
## pdaysNot Contacted           -1.427e+01  2.312e+02  -0.062 0.950786    
## poutcomenonexistent           2.469e-01  1.456e-01   1.696 0.089931 .  
## poutcomesuccess              -1.200e+01  2.312e+02  -0.052 0.958600    
## cons.price.idx               -2.086e-01  1.086e-01  -1.922 0.054628 .  
## cons.conf.idx                 1.643e-02  1.460e-02   1.125 0.260564    
## nr.employed                  -4.956e+01  4.434e+00 -11.178  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4457.7  on 3279  degrees of freedom
## Residual deviance: 3338.2  on 3233  degrees of freedom
## AIC: 3432.2
## 
## Number of Fisher Scoring iterations: 14

Predict on training data

str(bank)

## 'data.frame':    4099 obs. of  17 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ campaign      : num  0.693 1.386 0 1.099 0 ...
##  $ pdays         : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ nr.employed   : num  8.54 8.55 8.56 8.56 8.56 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

train_probs <- predict(trainmodel, newdata = train_data_hybrid, type = "response")
train_preds <- ifelse(train_probs > 0.7, "yes", "no")
train_preds <- as.factor(train_preds)
confusionMatrix(train_preds, train_data_hybrid$y)  # Evaluate model performance

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1257  799
##        yes  113 1111
##                                           
##                Accuracy : 0.722           
##                  95% CI : (0.7063, 0.7372)
##     No Information Rate : 0.5823          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4662          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9175          
##             Specificity : 0.5817          
##          Pos Pred Value : 0.6114          
##          Neg Pred Value : 0.9077          
##              Prevalence : 0.4177          
##          Detection Rate : 0.3832          
##    Detection Prevalence : 0.6268          
##       Balanced Accuracy : 0.7496          
##                                           
##        'Positive' Class : no              
##

# Check unique values in train and test
unique(train_data$default)

## [1] no      unknown
## Levels: no unknown

unique(test_data$default)

## [1] unknown no     
## Levels: no unknown

Predict on test data

# Predict probabilities
test_probs <- predict(trainmodel, newdata = test_data, type = "response")

# Convert probabilities to binary classification
test_preds <- ifelse(test_probs > 0.7, "yes", "no")
test_preds <- factor(test_preds, levels = c("no", "yes"))

# Evaluate test performance
confusionMatrix(test_preds, test_data$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  670  42
##        yes  59  48
##                                           
##                Accuracy : 0.8767          
##                  95% CI : (0.8522, 0.8984)
##     No Information Rate : 0.8901          
##     P-Value [Acc > NIR] : 0.8991          
##                                           
##                   Kappa : 0.4178          
##                                           
##  Mcnemar's Test P-Value : 0.1114          
##                                           
##             Sensitivity : 0.9191          
##             Specificity : 0.5333          
##          Pos Pred Value : 0.9410          
##          Neg Pred Value : 0.4486          
##              Prevalence : 0.8901          
##          Detection Rate : 0.8181          
##    Detection Prevalence : 0.8694          
##       Balanced Accuracy : 0.7262          
##                                           
##        'Positive' Class : no              
##

roc_curve <- roc(test_data$y, test_probs)
best_threshold <- coords(roc_curve, "best", ret = "threshold")
print(best_threshold)

##   threshold
## 1  0.675151

Compute ROC curve and AUC

roc_curve <- roc(test_data$y, test_probs)
plot(roc_curve, col = "blue", main = "ROC Curve for Test Data")

auc_value <- auc(roc_curve)
print(paste("AUC:", round(auc_value, 4)))  # Print AUC score

## [1] "AUC: 0.7494"

Bank Marketing Prediction Project

Chris,Jared,Rushabh,Yuan

2025-02-14

1. Load and Inspect Data

Check the structure of the dataset

2. Data Cleaning

Convert ‘pdays’ where 999 means ‘Not Contacted’

3. Exploratory Data Analysis

Visualizing numerical variable distributions

Correlation matrix analysis

Correlation heatmap

Check correlations with target variable

Remove highly correlated variable

4. Feature Engineering (Handling Categorical Variables)

Selecting categorical variables for correlation analysis

Ensure categorical variables are correctly encoded

Convert ‘unknown’ values to ‘no’ in housing and loan categories

5. Model Training and Evaluation

Train logistic regression model

Split dataset into training (80%) and testing (20%) sets

Define important variables for modeling

Predict on training data

Predict on test data

Compute ROC curve and AUC