library(tidyverse)  
library(caret)      
library(ggplot2)    
library(dplyr)
library(car)
library(DMwR2)
library(ROSE)
library(corrplot)
library(reshape2)
library(vcd)
library(pROC)
library(knitr)

1. Load and Inspect Data

bank <- read.csv("C:/Users/yuan1/Downloads/bank-additional.csv", 
                 sep = ";", header = TRUE, stringsAsFactors = TRUE)

Check the structure of the dataset

str(bank)  
## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
##  $ loan          : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
table(bank$default)
## 
##      no unknown     yes 
##    3315     803       1
# Replace "yes" with "unknown" in the entire dataset (bank)
bank$default[bank$default == "yes"] <- "unknown"

# Ensure factor levels remain consistent
bank$default <- factor(bank$default, levels = c("no", "unknown"))

# Check if the transformation was successful
table(bank$default)
## 
##      no unknown 
##    3315     804
table(bank$housing)
## 
##      no unknown     yes 
##    1839     105    2175
# Replace "unknown" with "no" in the housing column
bank$housing[bank$housing == "unknown"] <- "no"

# Ensure housing remains a factor with correct levels
bank$housing <- factor(bank$housing, levels = c("no", "yes"))

# Check the transformation
table(bank$housing)
## 
##   no  yes 
## 1944 2175
table(bank$loan)
## 
##      no unknown     yes 
##    3349     105     665
# Replace "unknown" with "yes" in the loan column
bank$loan[bank$loan == "unknown"] <- "yes"

# Ensure loan remains a factor with correct levels
bank$loan <- factor(bank$loan, levels = c("no", "yes"))

# Check the transformation
table(bank$loan)
## 
##   no  yes 
## 3349  770
table(bank$previous)
## 
##    0    1    2    3    4    5    6 
## 3523  475   78   25   14    2    2
# Convert previous to a binary factor: "no" (0) and "yes" (>0)
bank$previous <- ifelse(bank$previous == 0, "no", "yes")

# Convert to a factor with correct levels
bank$previous <- factor(bank$previous, levels = c("no", "yes"))

# Check the transformation
table(bank$previous)
## 
##   no  yes 
## 3523  596
table(bank$y)  # Check target variable distribution
## 
##   no  yes 
## 3668  451
str(bank)
## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 1 1 2 1 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
 # Convert pdays to character before applying ifelse()
bank$pdays <- as.character(bank$pdays)

# Recode pdays into two levels (1 = contacted, 0 = not contacted)
bank$pdays <- ifelse(bank$pdays == "999", "Not Contacted", "Contacted")

# Convert back to a factor with correct levels
bank$pdays <- factor(bank$pdays, levels = c("Not Contacted", "Contacted"))

# Check the transformation
table(bank$pdays)
## 
## Not Contacted     Contacted 
##          3959           160

2. Data Cleaning

bank <- bank%>% dplyr::select(-duration)  # Remove 'duration' column (leakage issue)
sum(is.na(bank))  # Check for missing values
## [1] 0
sum(duplicated(bank))  # Check for duplicate entries
## [1] 20
duplicates <- bank[duplicated(bank), ]  # Identify duplicate rows
bank <- bank[!duplicated(bank), ]  # Remove duplicate rows
bank <- bank %>% mutate_if(is.character, as.factor)  # Convert character columns to factors

Convert ‘pdays’ where 999 means ‘Not Contacted’

bank$pdays <- ifelse(bank$pdays == 999, "Not Contacted", as.character(bank$pdays))
bank$pdays <- factor(bank$pdays)  # Convert to categorical variable

3. Exploratory Data Analysis

# Target variable distribution

ggplot(bank, aes(x = factor(y))) +
  geom_bar(fill = c("blue", "orange")) +
  labs(title = "Target Variable Distribution (y)", x = "Subscription to Term Deposit", y = "Count")

Visualizing numerical variable distributions

numeric_vars <- bank[, sapply(bank, is.numeric)]  # Select numeric variables
numeric_melted <- melt(numeric_vars)

ggplot(numeric_melted, aes(x = value)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  facet_wrap(~variable, scales = "free") +  # Separate subplots for each numeric variable
  labs(title = "Distribution of Numeric Variables", x = "Value", y = "Count") +
  theme_minimal()

table(bank$nr.employed)
## 
## 4963.6 4991.6 5008.7 5017.5 5023.5 5076.2 5099.1 5176.3   5191 5195.8 5228.1 
##     83     86     60    104     21    163    817      1    755    391   1618
table(bank$previous)
## 
##   no  yes 
## 3505  594
bank$campaign = log(bank$campaign)
bank$nr.employed = log(bank$nr.employed)

numeric<- bank %>% select(where(is.numeric)) 

par(mfrow = c(3, 3))  
for (var in names(numeric)) {
  qqnorm(numeric[[var]], main = paste("Q-Q Plot:", var))
  qqline(numeric[[var]], col = "red", lwd = 2)
}
par(mfrow = c(1, 1))  

Correlation matrix analysis

cor_matrix <- cor(numeric, use = "complete.obs")
print(cor_matrix)
##                         age     campaign emp.var.rate cons.price.idx
## age             1.000000000 -0.013966224  -0.01938708   -0.001169426
## campaign       -0.013966224  1.000000000   0.17904011    0.146428660
## emp.var.rate   -0.019387084  0.179040112   1.00000000    0.755579401
## cons.price.idx -0.001169426  0.146428660   0.75557940    1.000000000
## cons.conf.idx   0.097893854  0.002892008   0.19240814    0.043520420
## euribor3m      -0.015570727  0.159699049   0.97025583    0.657353850
## nr.employed    -0.042945740  0.160648015   0.89491092    0.469223893
##                cons.conf.idx   euribor3m nr.employed
## age              0.097893854 -0.01557073 -0.04294574
## campaign         0.002892008  0.15969905  0.16064801
## emp.var.rate     0.192408143  0.97025583  0.89491092
## cons.price.idx   0.043520420  0.65735385  0.46922389
## cons.conf.idx    1.000000000  0.27434355  0.10314358
## euribor3m        0.274343550  1.00000000  0.94113103
## nr.employed      0.103143579  0.94113103  1.00000000
kable(cor_matrix, digits = 2, caption = "Numerical Variable Correlation Matrix")
Numerical Variable Correlation Matrix
age campaign emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
age 1.00 -0.01 -0.02 0.00 0.10 -0.02 -0.04
campaign -0.01 1.00 0.18 0.15 0.00 0.16 0.16
emp.var.rate -0.02 0.18 1.00 0.76 0.19 0.97 0.89
cons.price.idx 0.00 0.15 0.76 1.00 0.04 0.66 0.47
cons.conf.idx 0.10 0.00 0.19 0.04 1.00 0.27 0.10
euribor3m -0.02 0.16 0.97 0.66 0.27 1.00 0.94
nr.employed -0.04 0.16 0.89 0.47 0.10 0.94 1.00

Correlation heatmap

corrplot(cor_matrix, method = "color", type = "upper", 
         tl.cex = 0.8, cl.cex = 0.8, addCoef.col = "black")

Check correlations with target variable

table(bank$pdays)  # Distribution of 'pdays'
## 
##     Contacted Not Contacted 
##           159          3940
cor(bank$nr.employed, as.numeric(bank$y))  # Correlation with employment rate
## [1] -0.3509202
cor(bank$euribor3m, as.numeric(bank$y))  # Correlation with euribor3m
## [1] -0.2990494

Remove highly correlated variable

bank <- bank%>% dplyr::select(-euribor3m)
bank <- bank%>% dplyr::select(-emp.var.rate)
str(bank)
## 'data.frame':    4099 obs. of  18 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ campaign      : num  0.693 1.386 0 1.099 0 ...
##  $ pdays         : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
##  $ previous      : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 1 1 2 1 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ nr.employed   : num  8.54 8.55 8.56 8.56 8.56 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

4. Feature Engineering (Handling Categorical Variables)

Selecting categorical variables for correlation analysis

categorical_vars <- c("job", "marital", "education", "default", "housing", 
                      "loan", "contact", "month", "day_of_week", "poutcome","previous")

# Compute Cramér's V correlation matrix
cramer_matrix <- matrix(NA, nrow = length(categorical_vars), ncol = length(categorical_vars))
rownames(cramer_matrix) <- categorical_vars
colnames(cramer_matrix) <- categorical_vars

for (i in 1:length(categorical_vars)) {
  for (j in 1:length(categorical_vars)) {
    if (i != j) {
      test_result <- assocstats(table(bank[[categorical_vars[i]]], bank[[categorical_vars[j]]]))
      cramer_matrix[i, j] <- test_result$cramer
    } else {
      cramer_matrix[i, j] <- 1  # Self-correlation is 1
    }
  }
}

print(cramer_matrix)  # Display Cramér's V correlation matrix
##                    job    marital  education     default     housing
## job         1.00000000 0.18346913 0.35478377 0.213070513 0.065990080
## marital     0.18346913 1.00000000 0.10711052 0.133882772 0.012968908
## education   0.35478377 0.10711052 1.00000000 0.242634924 0.053500343
## default     0.21307051 0.13388277 0.24263492 1.000000000 0.012871484
## housing     0.06599008 0.01296891 0.05350034 0.012871484 1.000000000
## loan        0.04251641 0.01624679 0.03803769 0.009700914 0.005030888
## contact     0.13066296 0.08594081 0.14956618 0.149818184 0.074343811
## month       0.11730417 0.06072848 0.09803143 0.160989212 0.090030158
## day_of_week 0.05609785 0.03293065 0.05310208 0.040349083 0.015879748
## poutcome    0.10430784 0.03602368 0.05179062 0.104420912 0.024694557
## previous    0.11136748 0.04556905 0.04381594 0.091649482 0.018613013
##                    loan    contact      month day_of_week    poutcome
## job         0.042516407 0.13066296 0.11730417  0.05609785 0.104307840
## marital     0.016246794 0.08594081 0.06072848  0.03293065 0.036023683
## education   0.038037688 0.14956618 0.09803143  0.05310208 0.051790623
## default     0.009700914 0.14981818 0.16098921  0.04034908 0.104420912
## housing     0.005030888 0.07434381 0.09003016  0.01587975 0.024694557
## loan        1.000000000 0.00696911 0.03635633  0.02864837 0.009674543
## contact     0.006969110 1.00000000 0.60390024  0.06594183 0.251967096
## month       0.036356333 0.60390024 1.00000000  0.08660851 0.251417412
## day_of_week 0.028648374 0.06594183 0.08660851  1.00000000 0.026299807
## poutcome    0.009674543 0.25196710 0.25141741  0.02629981 1.000000000
## previous    0.009654427 0.25189023 0.31656704  0.01502844 1.000000000
##                previous
## job         0.111367478
## marital     0.045569047
## education   0.043815945
## default     0.091649482
## housing     0.018613013
## loan        0.009654427
## contact     0.251890226
## month       0.316567036
## day_of_week 0.015028444
## poutcome    1.000000000
## previous    1.000000000

Ensure categorical variables are correctly encoded

print(levels(bank$housing))
## [1] "no"  "yes"
print(levels(bank$loan))
## [1] "no"  "yes"
print(levels(bank$previous))
## [1] "no"  "yes"

Convert ‘unknown’ values to ‘no’ in housing and loan categories

bank$housing <- as.character(bank$housing)
bank$loan <- as.character(bank$loan)
bank$previous <- as.character(bank$previous)

bank$housing[bank$housing == "unknown"] <- "no"
bank$loan[bank$loan == "unknown"] <- "no"

bank$housing <- factor(bank$housing, levels = c("no", "yes"))
bank$loan <- factor(bank$loan, levels = c("no", "yes"))

table(bank$loan)
## 
##   no  yes 
## 3330  769
table(bank$housing)
## 
##   no  yes 
## 1935 2164
table(bank$previous)
## 
##   no  yes 
## 3505  594

5. Model Training and Evaluation

Train logistic regression model

str(bank)
## 'data.frame':    4099 obs. of  18 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ campaign      : num  0.693 1.386 0 1.099 0 ...
##  $ pdays         : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
##  $ previous      : chr  "no" "no" "no" "no" ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ nr.employed   : num  8.54 8.55 8.56 8.56 8.56 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
prev_cramer <- assocstats(table(bank$previous, bank$y))$cramer
print(prev_cramer)
## [1] 0.2079214
pout_cramer <- assocstats(table(bank$poutcome, bank$y))$cramer
print(pout_cramer)
## [1] 0.3340245
bank <- bank%>% dplyr::select(-previous)


model <- glm(y ~ ., data = bank, family = binomial(link="logit"))
vif(model)  # Check for multicollinearity
##                    GVIF Df GVIF^(1/(2*Df))
## age            2.060339  1        1.435388
## job            5.685170 11        1.082197
## marital        1.495544  3        1.069383
## education      3.332629  7        1.089788
## default        1.155510  1        1.074946
## housing        1.029070  1        1.014431
## loan           1.017915  1        1.008918
## contact        2.392374  1        1.546730
## month          6.638016  9        1.110884
## day_of_week    1.118374  4        1.014083
## campaign       1.061717  1        1.030396
## pdays          7.896885  1        2.810140
## poutcome       9.407044  2        1.751311
## cons.price.idx 2.087104  1        1.444681
## cons.conf.idx  2.413512  1        1.553548
## nr.employed    2.252955  1        1.500985
  # Remove highly correlated variable
summary(model)  # View model summary
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = bank)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0835  -0.3966  -0.3218  -0.2549   2.8704  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  434.193393  43.606103   9.957  < 2e-16 ***
## age                            0.017015   0.006834   2.490  0.01278 *  
## jobblue-collar                -0.319127   0.225008  -1.418  0.15611    
## jobentrepreneur               -0.522427   0.397370  -1.315  0.18861    
## jobhousemaid                  -0.132193   0.393883  -0.336  0.73716    
## jobmanagement                 -0.446153   0.247003  -1.806  0.07088 .  
## jobretired                    -0.288981   0.303659  -0.952  0.34127    
## jobself-employed              -0.610755   0.347980  -1.755  0.07924 .  
## jobservices                   -0.195153   0.238708  -0.818  0.41362    
## jobstudent                    -0.045555   0.354599  -0.128  0.89778    
## jobtechnician                 -0.053453   0.191817  -0.279  0.78050    
## jobunemployed                  0.086801   0.332187   0.261  0.79386    
## jobunknown                    -0.485744   0.637515  -0.762  0.44610    
## maritalmarried                 0.160872   0.201097   0.800  0.42373    
## maritalsingle                  0.290952   0.230125   1.264  0.20612    
## maritalunknown                 0.090024   1.169224   0.077  0.93863    
## educationbasic.6y              0.248739   0.339613   0.732  0.46391    
## educationbasic.9y              0.124738   0.272876   0.457  0.64758    
## educationhigh.school           0.108351   0.261443   0.414  0.67856    
## educationilliterate          -10.895400 324.744015  -0.034  0.97324    
## educationprofessional.course   0.204361   0.285021   0.717  0.47337    
## educationuniversity.degree     0.197260   0.262371   0.752  0.45215    
## educationunknown               0.233198   0.342803   0.680  0.49633    
## defaultunknown                -0.039833   0.177832  -0.224  0.82276    
## housingyes                    -0.091361   0.115550  -0.791  0.42914    
## loanyes                       -0.108607   0.150960  -0.719  0.47187    
## contacttelephone              -0.858310   0.214970  -3.993 6.53e-05 ***
## monthaug                      -0.478165   0.308803  -1.548  0.12151    
## monthdec                       0.540134   0.545760   0.990  0.32233    
## monthjul                       0.002272   0.295253   0.008  0.99386    
## monthjun                       0.623026   0.273656   2.277  0.02281 *  
## monthmar                       1.272880   0.391048   3.255  0.00113 ** 
## monthmay                      -0.496764   0.234422  -2.119  0.03408 *  
## monthnov                      -0.538660   0.289626  -1.860  0.06291 .  
## monthoct                      -0.475359   0.379285  -1.253  0.21010    
## monthsep                      -0.764996   0.395975  -1.932  0.05337 .  
## day_of_weekmon                -0.035505   0.181945  -0.195  0.84528    
## day_of_weekthu                 0.032518   0.182566   0.178  0.85863    
## day_of_weektue                -0.033945   0.187269  -0.181  0.85616    
## day_of_weekwed                 0.150846   0.187855   0.803  0.42198    
## campaign                      -0.155289   0.094264  -1.647  0.09948 .  
## pdaysNot Contacted            -0.637117   0.536926  -1.187  0.23538    
## poutcomenonexistent            0.453375   0.181869   2.493  0.01267 *  
## poutcomesuccess                1.209529   0.549903   2.200  0.02784 *  
## cons.price.idx                 0.133557   0.123535   1.081  0.27964    
## cons.conf.idx                  0.042326   0.015455   2.739  0.00617 ** 
## nr.employed                  -52.336116   4.916288 -10.645  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2837.0  on 4098  degrees of freedom
## Residual deviance: 2208.5  on 4052  degrees of freedom
## AIC: 2302.5
## 
## Number of Fisher Scoring iterations: 11

Split dataset into training (80%) and testing (20%) sets

#bank <- bank%>% dplyr::select(-pdays)#

set.seed(4321)
train_index <- createDataPartition(bank$y, p = 0.8, list = FALSE)
train_data <- bank[train_index, ]
test_data <- bank[-train_index, ]

Define important variables for modeling

# Hybrid sampling (both over- and under-sampling to balance classes)
train_data_hybrid <- ovun.sample(y ~ ., 
                                 data = train_data,  # Use selected features
                                 method = "both",   # Perform both over- and under-sampling
                                 N = nrow(train_data),  # Keep dataset size unchanged
                                 p = 0.6)$data  # Balance 'yes' and 'no'

# Train logistic regression model on resampled data
trainmodel <- glm(y ~ ., data = train_data_hybrid, family = binomial(link="logit"))
summary(trainmodel)  # View model summary
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train_data_hybrid)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1896  -0.9122   0.2187   0.8315   1.8442  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   4.562e+02  2.342e+02   1.948 0.051408 .  
## age                           3.444e-02  5.356e-03   6.430 1.28e-10 ***
## jobblue-collar                5.022e-02  1.587e-01   0.316 0.751699    
## jobentrepreneur              -6.226e-01  2.516e-01  -2.474 0.013365 *  
## jobhousemaid                  7.452e-01  3.247e-01   2.295 0.021710 *  
## jobmanagement                -5.992e-01  1.940e-01  -3.090 0.002004 ** 
## jobretired                   -3.707e-01  2.434e-01  -1.523 0.127799    
## jobself-employed             -9.098e-01  2.595e-01  -3.506 0.000455 ***
## jobservices                   1.828e-01  1.835e-01   0.997 0.318969    
## jobstudent                    4.814e-02  3.171e-01   0.152 0.879350    
## jobtechnician                 1.358e-01  1.486e-01   0.914 0.360790    
## jobunemployed                 8.508e-02  2.745e-01   0.310 0.756630    
## jobunknown                    4.259e-01  4.403e-01   0.967 0.333483    
## maritalmarried                1.120e-01  1.465e-01   0.764 0.444646    
## maritalsingle                 4.074e-01  1.685e-01   2.418 0.015600 *  
## maritalunknown               -1.546e+01  7.613e+02  -0.020 0.983794    
## educationbasic.6y             3.579e-01  2.564e-01   1.396 0.162768    
## educationbasic.9y             5.713e-01  1.896e-01   3.012 0.002591 ** 
## educationhigh.school          1.158e-01  1.967e-01   0.589 0.556158    
## educationilliterate          -1.561e+01  1.455e+03  -0.011 0.991444    
## educationprofessional.course  7.515e-01  2.127e-01   3.532 0.000412 ***
## educationuniversity.degree    6.565e-01  1.956e-01   3.356 0.000790 ***
## educationunknown              6.683e-01  2.780e-01   2.404 0.016220 *  
## defaultunknown                6.846e-02  1.181e-01   0.580 0.562197    
## housingyes                   -1.164e-01  8.697e-02  -1.338 0.180781    
## loanyes                      -1.562e-01  1.120e-01  -1.395 0.163131    
## contacttelephone             -2.221e-01  1.759e-01  -1.263 0.206602    
## monthaug                     -1.885e-01  2.775e-01  -0.679 0.496964    
## monthdec                      5.790e-01  5.744e-01   1.008 0.313513    
## monthjul                      4.702e-01  2.445e-01   1.923 0.054468 .  
## monthjun                      6.919e-01  2.256e-01   3.067 0.002161 ** 
## monthmar                      1.494e+00  4.438e-01   3.366 0.000763 ***
## monthmay                     -4.765e-01  1.919e-01  -2.483 0.013011 *  
## monthnov                     -3.882e-01  2.374e-01  -1.636 0.101926    
## monthoct                      1.145e+00  4.031e-01   2.841 0.004492 ** 
## monthsep                     -2.125e-01  4.613e-01  -0.461 0.645049    
## day_of_weekmon               -1.297e-01  1.372e-01  -0.946 0.344244    
## day_of_weekthu               -8.213e-02  1.360e-01  -0.604 0.545855    
## day_of_weektue                5.692e-02  1.384e-01   0.411 0.680846    
## day_of_weekwed               -2.012e-02  1.427e-01  -0.141 0.887896    
## campaign                     -2.485e-01  7.123e-02  -3.489 0.000485 ***
## pdaysNot Contacted           -1.427e+01  2.312e+02  -0.062 0.950786    
## poutcomenonexistent           2.469e-01  1.456e-01   1.696 0.089931 .  
## poutcomesuccess              -1.200e+01  2.312e+02  -0.052 0.958600    
## cons.price.idx               -2.086e-01  1.086e-01  -1.922 0.054628 .  
## cons.conf.idx                 1.643e-02  1.460e-02   1.125 0.260564    
## nr.employed                  -4.956e+01  4.434e+00 -11.178  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4457.7  on 3279  degrees of freedom
## Residual deviance: 3338.2  on 3233  degrees of freedom
## AIC: 3432.2
## 
## Number of Fisher Scoring iterations: 14

Predict on training data

str(bank)
## 'data.frame':    4099 obs. of  17 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ campaign      : num  0.693 1.386 0 1.099 0 ...
##  $ pdays         : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ nr.employed   : num  8.54 8.55 8.56 8.56 8.56 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
train_probs <- predict(trainmodel, newdata = train_data_hybrid, type = "response")
train_preds <- ifelse(train_probs > 0.7, "yes", "no")
train_preds <- as.factor(train_preds)
confusionMatrix(train_preds, train_data_hybrid$y)  # Evaluate model performance
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1257  799
##        yes  113 1111
##                                           
##                Accuracy : 0.722           
##                  95% CI : (0.7063, 0.7372)
##     No Information Rate : 0.5823          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.4662          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9175          
##             Specificity : 0.5817          
##          Pos Pred Value : 0.6114          
##          Neg Pred Value : 0.9077          
##              Prevalence : 0.4177          
##          Detection Rate : 0.3832          
##    Detection Prevalence : 0.6268          
##       Balanced Accuracy : 0.7496          
##                                           
##        'Positive' Class : no              
## 
# Check unique values in train and test
unique(train_data$default)
## [1] no      unknown
## Levels: no unknown
unique(test_data$default)
## [1] unknown no     
## Levels: no unknown

Predict on test data

# Predict probabilities
test_probs <- predict(trainmodel, newdata = test_data, type = "response")

# Convert probabilities to binary classification
test_preds <- ifelse(test_probs > 0.7, "yes", "no")
test_preds <- factor(test_preds, levels = c("no", "yes"))

# Evaluate test performance
confusionMatrix(test_preds, test_data$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  670  42
##        yes  59  48
##                                           
##                Accuracy : 0.8767          
##                  95% CI : (0.8522, 0.8984)
##     No Information Rate : 0.8901          
##     P-Value [Acc > NIR] : 0.8991          
##                                           
##                   Kappa : 0.4178          
##                                           
##  Mcnemar's Test P-Value : 0.1114          
##                                           
##             Sensitivity : 0.9191          
##             Specificity : 0.5333          
##          Pos Pred Value : 0.9410          
##          Neg Pred Value : 0.4486          
##              Prevalence : 0.8901          
##          Detection Rate : 0.8181          
##    Detection Prevalence : 0.8694          
##       Balanced Accuracy : 0.7262          
##                                           
##        'Positive' Class : no              
## 
roc_curve <- roc(test_data$y, test_probs)
best_threshold <- coords(roc_curve, "best", ret = "threshold")
print(best_threshold)
##   threshold
## 1  0.675151

Compute ROC curve and AUC

roc_curve <- roc(test_data$y, test_probs)
plot(roc_curve, col = "blue", main = "ROC Curve for Test Data")

auc_value <- auc(roc_curve)
print(paste("AUC:", round(auc_value, 4)))  # Print AUC score
## [1] "AUC: 0.7494"