library(tidyverse)
library(caret)
library(ggplot2)
library(dplyr)
library(car)
library(DMwR2)
library(ROSE)
library(corrplot)
library(reshape2)
library(vcd)
library(pROC)
library(knitr)
1. Load and Inspect Data
bank <- read.csv("C:/Users/yuan1/Downloads/bank-additional.csv",
sep = ";", header = TRUE, stringsAsFactors = TRUE)
Check the structure of the dataset
str(bank)
## 'data.frame': 4119 obs. of 21 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
## $ default : Factor w/ 3 levels "no","unknown",..: 1 1 1 1 1 1 1 2 1 2 ...
## $ housing : Factor w/ 3 levels "no","unknown",..: 3 1 3 2 3 1 3 3 1 1 ...
## $ loan : Factor w/ 3 levels "no","unknown",..: 1 1 1 2 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
table(bank$default)
##
## no unknown yes
## 3315 803 1
# Replace "yes" with "unknown" in the entire dataset (bank)
bank$default[bank$default == "yes"] <- "unknown"
# Ensure factor levels remain consistent
bank$default <- factor(bank$default, levels = c("no", "unknown"))
# Check if the transformation was successful
table(bank$default)
##
## no unknown
## 3315 804
table(bank$housing)
##
## no unknown yes
## 1839 105 2175
# Replace "unknown" with "no" in the housing column
bank$housing[bank$housing == "unknown"] <- "no"
# Ensure housing remains a factor with correct levels
bank$housing <- factor(bank$housing, levels = c("no", "yes"))
# Check the transformation
table(bank$housing)
##
## no yes
## 1944 2175
table(bank$loan)
##
## no unknown yes
## 3349 105 665
# Replace "unknown" with "yes" in the loan column
bank$loan[bank$loan == "unknown"] <- "yes"
# Ensure loan remains a factor with correct levels
bank$loan <- factor(bank$loan, levels = c("no", "yes"))
# Check the transformation
table(bank$loan)
##
## no yes
## 3349 770
table(bank$previous)
##
## 0 1 2 3 4 5 6
## 3523 475 78 25 14 2 2
# Convert previous to a binary factor: "no" (0) and "yes" (>0)
bank$previous <- ifelse(bank$previous == 0, "no", "yes")
# Convert to a factor with correct levels
bank$previous <- factor(bank$previous, levels = c("no", "yes"))
# Check the transformation
table(bank$previous)
##
## no yes
## 3523 596
table(bank$y) # Check target variable distribution
##
## no yes
## 3668 451
str(bank)
## 'data.frame': 4119 obs. of 21 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
## $ default : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 1 1 2 1 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
# Convert pdays to character before applying ifelse()
bank$pdays <- as.character(bank$pdays)
# Recode pdays into two levels (1 = contacted, 0 = not contacted)
bank$pdays <- ifelse(bank$pdays == "999", "Not Contacted", "Contacted")
# Convert back to a factor with correct levels
bank$pdays <- factor(bank$pdays, levels = c("Not Contacted", "Contacted"))
# Check the transformation
table(bank$pdays)
##
## Not Contacted Contacted
## 3959 160
2. Data Cleaning
bank <- bank%>% dplyr::select(-duration) # Remove 'duration' column (leakage issue)
sum(is.na(bank)) # Check for missing values
## [1] 0
sum(duplicated(bank)) # Check for duplicate entries
## [1] 20
duplicates <- bank[duplicated(bank), ] # Identify duplicate rows
bank <- bank[!duplicated(bank), ] # Remove duplicate rows
bank <- bank %>% mutate_if(is.character, as.factor) # Convert character columns to factors
3. Exploratory Data Analysis
# Target variable distribution
ggplot(bank, aes(x = factor(y))) +
geom_bar(fill = c("blue", "orange")) +
labs(title = "Target Variable Distribution (y)", x = "Subscription to Term Deposit", y = "Count")

Visualizing numerical variable distributions
numeric_vars <- bank[, sapply(bank, is.numeric)] # Select numeric variables
numeric_melted <- melt(numeric_vars)
ggplot(numeric_melted, aes(x = value)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
facet_wrap(~variable, scales = "free") + # Separate subplots for each numeric variable
labs(title = "Distribution of Numeric Variables", x = "Value", y = "Count") +
theme_minimal()

table(bank$nr.employed)
##
## 4963.6 4991.6 5008.7 5017.5 5023.5 5076.2 5099.1 5176.3 5191 5195.8 5228.1
## 83 86 60 104 21 163 817 1 755 391 1618
table(bank$previous)
##
## no yes
## 3505 594
bank$campaign = log(bank$campaign)
bank$nr.employed = log(bank$nr.employed)
numeric<- bank %>% select(where(is.numeric))
par(mfrow = c(3, 3))
for (var in names(numeric)) {
qqnorm(numeric[[var]], main = paste("Q-Q Plot:", var))
qqline(numeric[[var]], col = "red", lwd = 2)
}
par(mfrow = c(1, 1))

Correlation matrix analysis
cor_matrix <- cor(numeric, use = "complete.obs")
print(cor_matrix)
## age campaign emp.var.rate cons.price.idx
## age 1.000000000 -0.013966224 -0.01938708 -0.001169426
## campaign -0.013966224 1.000000000 0.17904011 0.146428660
## emp.var.rate -0.019387084 0.179040112 1.00000000 0.755579401
## cons.price.idx -0.001169426 0.146428660 0.75557940 1.000000000
## cons.conf.idx 0.097893854 0.002892008 0.19240814 0.043520420
## euribor3m -0.015570727 0.159699049 0.97025583 0.657353850
## nr.employed -0.042945740 0.160648015 0.89491092 0.469223893
## cons.conf.idx euribor3m nr.employed
## age 0.097893854 -0.01557073 -0.04294574
## campaign 0.002892008 0.15969905 0.16064801
## emp.var.rate 0.192408143 0.97025583 0.89491092
## cons.price.idx 0.043520420 0.65735385 0.46922389
## cons.conf.idx 1.000000000 0.27434355 0.10314358
## euribor3m 0.274343550 1.00000000 0.94113103
## nr.employed 0.103143579 0.94113103 1.00000000
kable(cor_matrix, digits = 2, caption = "Numerical Variable Correlation Matrix")
Numerical Variable Correlation Matrix
age |
1.00 |
-0.01 |
-0.02 |
0.00 |
0.10 |
-0.02 |
-0.04 |
campaign |
-0.01 |
1.00 |
0.18 |
0.15 |
0.00 |
0.16 |
0.16 |
emp.var.rate |
-0.02 |
0.18 |
1.00 |
0.76 |
0.19 |
0.97 |
0.89 |
cons.price.idx |
0.00 |
0.15 |
0.76 |
1.00 |
0.04 |
0.66 |
0.47 |
cons.conf.idx |
0.10 |
0.00 |
0.19 |
0.04 |
1.00 |
0.27 |
0.10 |
euribor3m |
-0.02 |
0.16 |
0.97 |
0.66 |
0.27 |
1.00 |
0.94 |
nr.employed |
-0.04 |
0.16 |
0.89 |
0.47 |
0.10 |
0.94 |
1.00 |
Correlation heatmap
corrplot(cor_matrix, method = "color", type = "upper",
tl.cex = 0.8, cl.cex = 0.8, addCoef.col = "black")

Check correlations with target variable
table(bank$pdays) # Distribution of 'pdays'
##
## Contacted Not Contacted
## 159 3940
cor(bank$nr.employed, as.numeric(bank$y)) # Correlation with employment rate
## [1] -0.3509202
cor(bank$euribor3m, as.numeric(bank$y)) # Correlation with euribor3m
## [1] -0.2990494
4. Feature Engineering (Handling Categorical
Variables)
Selecting categorical variables for correlation analysis
categorical_vars <- c("job", "marital", "education", "default", "housing",
"loan", "contact", "month", "day_of_week", "poutcome","previous")
# Compute Cramér's V correlation matrix
cramer_matrix <- matrix(NA, nrow = length(categorical_vars), ncol = length(categorical_vars))
rownames(cramer_matrix) <- categorical_vars
colnames(cramer_matrix) <- categorical_vars
for (i in 1:length(categorical_vars)) {
for (j in 1:length(categorical_vars)) {
if (i != j) {
test_result <- assocstats(table(bank[[categorical_vars[i]]], bank[[categorical_vars[j]]]))
cramer_matrix[i, j] <- test_result$cramer
} else {
cramer_matrix[i, j] <- 1 # Self-correlation is 1
}
}
}
print(cramer_matrix) # Display Cramér's V correlation matrix
## job marital education default housing
## job 1.00000000 0.18346913 0.35478377 0.213070513 0.065990080
## marital 0.18346913 1.00000000 0.10711052 0.133882772 0.012968908
## education 0.35478377 0.10711052 1.00000000 0.242634924 0.053500343
## default 0.21307051 0.13388277 0.24263492 1.000000000 0.012871484
## housing 0.06599008 0.01296891 0.05350034 0.012871484 1.000000000
## loan 0.04251641 0.01624679 0.03803769 0.009700914 0.005030888
## contact 0.13066296 0.08594081 0.14956618 0.149818184 0.074343811
## month 0.11730417 0.06072848 0.09803143 0.160989212 0.090030158
## day_of_week 0.05609785 0.03293065 0.05310208 0.040349083 0.015879748
## poutcome 0.10430784 0.03602368 0.05179062 0.104420912 0.024694557
## previous 0.11136748 0.04556905 0.04381594 0.091649482 0.018613013
## loan contact month day_of_week poutcome
## job 0.042516407 0.13066296 0.11730417 0.05609785 0.104307840
## marital 0.016246794 0.08594081 0.06072848 0.03293065 0.036023683
## education 0.038037688 0.14956618 0.09803143 0.05310208 0.051790623
## default 0.009700914 0.14981818 0.16098921 0.04034908 0.104420912
## housing 0.005030888 0.07434381 0.09003016 0.01587975 0.024694557
## loan 1.000000000 0.00696911 0.03635633 0.02864837 0.009674543
## contact 0.006969110 1.00000000 0.60390024 0.06594183 0.251967096
## month 0.036356333 0.60390024 1.00000000 0.08660851 0.251417412
## day_of_week 0.028648374 0.06594183 0.08660851 1.00000000 0.026299807
## poutcome 0.009674543 0.25196710 0.25141741 0.02629981 1.000000000
## previous 0.009654427 0.25189023 0.31656704 0.01502844 1.000000000
## previous
## job 0.111367478
## marital 0.045569047
## education 0.043815945
## default 0.091649482
## housing 0.018613013
## loan 0.009654427
## contact 0.251890226
## month 0.316567036
## day_of_week 0.015028444
## poutcome 1.000000000
## previous 1.000000000
Ensure categorical variables are correctly encoded
print(levels(bank$housing))
## [1] "no" "yes"
print(levels(bank$loan))
## [1] "no" "yes"
print(levels(bank$previous))
## [1] "no" "yes"
Convert ‘unknown’ values to ‘no’ in housing and loan categories
bank$housing <- as.character(bank$housing)
bank$loan <- as.character(bank$loan)
bank$previous <- as.character(bank$previous)
bank$housing[bank$housing == "unknown"] <- "no"
bank$loan[bank$loan == "unknown"] <- "no"
bank$housing <- factor(bank$housing, levels = c("no", "yes"))
bank$loan <- factor(bank$loan, levels = c("no", "yes"))
table(bank$loan)
##
## no yes
## 3330 769
table(bank$housing)
##
## no yes
## 1935 2164
table(bank$previous)
##
## no yes
## 3505 594
5. Model Training and Evaluation
Train logistic regression model
str(bank)
## 'data.frame': 4099 obs. of 18 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
## $ default : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
## $ campaign : num 0.693 1.386 0 1.099 0 ...
## $ pdays : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
## $ previous : chr "no" "no" "no" "no" ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ nr.employed : num 8.54 8.55 8.56 8.56 8.56 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
prev_cramer <- assocstats(table(bank$previous, bank$y))$cramer
print(prev_cramer)
## [1] 0.2079214
pout_cramer <- assocstats(table(bank$poutcome, bank$y))$cramer
print(pout_cramer)
## [1] 0.3340245
bank <- bank%>% dplyr::select(-previous)
model <- glm(y ~ ., data = bank, family = binomial(link="logit"))
vif(model) # Check for multicollinearity
## GVIF Df GVIF^(1/(2*Df))
## age 2.060339 1 1.435388
## job 5.685170 11 1.082197
## marital 1.495544 3 1.069383
## education 3.332629 7 1.089788
## default 1.155510 1 1.074946
## housing 1.029070 1 1.014431
## loan 1.017915 1 1.008918
## contact 2.392374 1 1.546730
## month 6.638016 9 1.110884
## day_of_week 1.118374 4 1.014083
## campaign 1.061717 1 1.030396
## pdays 7.896885 1 2.810140
## poutcome 9.407044 2 1.751311
## cons.price.idx 2.087104 1 1.444681
## cons.conf.idx 2.413512 1 1.553548
## nr.employed 2.252955 1 1.500985
# Remove highly correlated variable
summary(model) # View model summary
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = bank)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0835 -0.3966 -0.3218 -0.2549 2.8704
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 434.193393 43.606103 9.957 < 2e-16 ***
## age 0.017015 0.006834 2.490 0.01278 *
## jobblue-collar -0.319127 0.225008 -1.418 0.15611
## jobentrepreneur -0.522427 0.397370 -1.315 0.18861
## jobhousemaid -0.132193 0.393883 -0.336 0.73716
## jobmanagement -0.446153 0.247003 -1.806 0.07088 .
## jobretired -0.288981 0.303659 -0.952 0.34127
## jobself-employed -0.610755 0.347980 -1.755 0.07924 .
## jobservices -0.195153 0.238708 -0.818 0.41362
## jobstudent -0.045555 0.354599 -0.128 0.89778
## jobtechnician -0.053453 0.191817 -0.279 0.78050
## jobunemployed 0.086801 0.332187 0.261 0.79386
## jobunknown -0.485744 0.637515 -0.762 0.44610
## maritalmarried 0.160872 0.201097 0.800 0.42373
## maritalsingle 0.290952 0.230125 1.264 0.20612
## maritalunknown 0.090024 1.169224 0.077 0.93863
## educationbasic.6y 0.248739 0.339613 0.732 0.46391
## educationbasic.9y 0.124738 0.272876 0.457 0.64758
## educationhigh.school 0.108351 0.261443 0.414 0.67856
## educationilliterate -10.895400 324.744015 -0.034 0.97324
## educationprofessional.course 0.204361 0.285021 0.717 0.47337
## educationuniversity.degree 0.197260 0.262371 0.752 0.45215
## educationunknown 0.233198 0.342803 0.680 0.49633
## defaultunknown -0.039833 0.177832 -0.224 0.82276
## housingyes -0.091361 0.115550 -0.791 0.42914
## loanyes -0.108607 0.150960 -0.719 0.47187
## contacttelephone -0.858310 0.214970 -3.993 6.53e-05 ***
## monthaug -0.478165 0.308803 -1.548 0.12151
## monthdec 0.540134 0.545760 0.990 0.32233
## monthjul 0.002272 0.295253 0.008 0.99386
## monthjun 0.623026 0.273656 2.277 0.02281 *
## monthmar 1.272880 0.391048 3.255 0.00113 **
## monthmay -0.496764 0.234422 -2.119 0.03408 *
## monthnov -0.538660 0.289626 -1.860 0.06291 .
## monthoct -0.475359 0.379285 -1.253 0.21010
## monthsep -0.764996 0.395975 -1.932 0.05337 .
## day_of_weekmon -0.035505 0.181945 -0.195 0.84528
## day_of_weekthu 0.032518 0.182566 0.178 0.85863
## day_of_weektue -0.033945 0.187269 -0.181 0.85616
## day_of_weekwed 0.150846 0.187855 0.803 0.42198
## campaign -0.155289 0.094264 -1.647 0.09948 .
## pdaysNot Contacted -0.637117 0.536926 -1.187 0.23538
## poutcomenonexistent 0.453375 0.181869 2.493 0.01267 *
## poutcomesuccess 1.209529 0.549903 2.200 0.02784 *
## cons.price.idx 0.133557 0.123535 1.081 0.27964
## cons.conf.idx 0.042326 0.015455 2.739 0.00617 **
## nr.employed -52.336116 4.916288 -10.645 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2837.0 on 4098 degrees of freedom
## Residual deviance: 2208.5 on 4052 degrees of freedom
## AIC: 2302.5
##
## Number of Fisher Scoring iterations: 11
Split dataset into training (80%) and testing (20%) sets
#bank <- bank%>% dplyr::select(-pdays)#
set.seed(4321)
train_index <- createDataPartition(bank$y, p = 0.8, list = FALSE)
train_data <- bank[train_index, ]
test_data <- bank[-train_index, ]
Define important variables for modeling
# Hybrid sampling (both over- and under-sampling to balance classes)
train_data_hybrid <- ovun.sample(y ~ .,
data = train_data, # Use selected features
method = "both", # Perform both over- and under-sampling
N = nrow(train_data), # Keep dataset size unchanged
p = 0.6)$data # Balance 'yes' and 'no'
# Train logistic regression model on resampled data
trainmodel <- glm(y ~ ., data = train_data_hybrid, family = binomial(link="logit"))
summary(trainmodel) # View model summary
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train_data_hybrid)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1896 -0.9122 0.2187 0.8315 1.8442
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.562e+02 2.342e+02 1.948 0.051408 .
## age 3.444e-02 5.356e-03 6.430 1.28e-10 ***
## jobblue-collar 5.022e-02 1.587e-01 0.316 0.751699
## jobentrepreneur -6.226e-01 2.516e-01 -2.474 0.013365 *
## jobhousemaid 7.452e-01 3.247e-01 2.295 0.021710 *
## jobmanagement -5.992e-01 1.940e-01 -3.090 0.002004 **
## jobretired -3.707e-01 2.434e-01 -1.523 0.127799
## jobself-employed -9.098e-01 2.595e-01 -3.506 0.000455 ***
## jobservices 1.828e-01 1.835e-01 0.997 0.318969
## jobstudent 4.814e-02 3.171e-01 0.152 0.879350
## jobtechnician 1.358e-01 1.486e-01 0.914 0.360790
## jobunemployed 8.508e-02 2.745e-01 0.310 0.756630
## jobunknown 4.259e-01 4.403e-01 0.967 0.333483
## maritalmarried 1.120e-01 1.465e-01 0.764 0.444646
## maritalsingle 4.074e-01 1.685e-01 2.418 0.015600 *
## maritalunknown -1.546e+01 7.613e+02 -0.020 0.983794
## educationbasic.6y 3.579e-01 2.564e-01 1.396 0.162768
## educationbasic.9y 5.713e-01 1.896e-01 3.012 0.002591 **
## educationhigh.school 1.158e-01 1.967e-01 0.589 0.556158
## educationilliterate -1.561e+01 1.455e+03 -0.011 0.991444
## educationprofessional.course 7.515e-01 2.127e-01 3.532 0.000412 ***
## educationuniversity.degree 6.565e-01 1.956e-01 3.356 0.000790 ***
## educationunknown 6.683e-01 2.780e-01 2.404 0.016220 *
## defaultunknown 6.846e-02 1.181e-01 0.580 0.562197
## housingyes -1.164e-01 8.697e-02 -1.338 0.180781
## loanyes -1.562e-01 1.120e-01 -1.395 0.163131
## contacttelephone -2.221e-01 1.759e-01 -1.263 0.206602
## monthaug -1.885e-01 2.775e-01 -0.679 0.496964
## monthdec 5.790e-01 5.744e-01 1.008 0.313513
## monthjul 4.702e-01 2.445e-01 1.923 0.054468 .
## monthjun 6.919e-01 2.256e-01 3.067 0.002161 **
## monthmar 1.494e+00 4.438e-01 3.366 0.000763 ***
## monthmay -4.765e-01 1.919e-01 -2.483 0.013011 *
## monthnov -3.882e-01 2.374e-01 -1.636 0.101926
## monthoct 1.145e+00 4.031e-01 2.841 0.004492 **
## monthsep -2.125e-01 4.613e-01 -0.461 0.645049
## day_of_weekmon -1.297e-01 1.372e-01 -0.946 0.344244
## day_of_weekthu -8.213e-02 1.360e-01 -0.604 0.545855
## day_of_weektue 5.692e-02 1.384e-01 0.411 0.680846
## day_of_weekwed -2.012e-02 1.427e-01 -0.141 0.887896
## campaign -2.485e-01 7.123e-02 -3.489 0.000485 ***
## pdaysNot Contacted -1.427e+01 2.312e+02 -0.062 0.950786
## poutcomenonexistent 2.469e-01 1.456e-01 1.696 0.089931 .
## poutcomesuccess -1.200e+01 2.312e+02 -0.052 0.958600
## cons.price.idx -2.086e-01 1.086e-01 -1.922 0.054628 .
## cons.conf.idx 1.643e-02 1.460e-02 1.125 0.260564
## nr.employed -4.956e+01 4.434e+00 -11.178 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4457.7 on 3279 degrees of freedom
## Residual deviance: 3338.2 on 3233 degrees of freedom
## AIC: 3432.2
##
## Number of Fisher Scoring iterations: 14
Predict on training data
str(bank)
## 'data.frame': 4099 obs. of 17 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : Factor w/ 12 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
## $ marital : Factor w/ 4 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
## $ education : Factor w/ 8 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
## $ default : Factor w/ 2 levels "no","unknown": 1 1 1 1 1 1 1 2 1 2 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 2 1 2 1 2 2 1 1 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 2 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
## $ campaign : num 0.693 1.386 0 1.099 0 ...
## $ pdays : Factor w/ 2 levels "Contacted","Not Contacted": 2 2 2 2 2 2 2 2 2 2 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ nr.employed : num 8.54 8.55 8.56 8.56 8.56 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
train_probs <- predict(trainmodel, newdata = train_data_hybrid, type = "response")
train_preds <- ifelse(train_probs > 0.7, "yes", "no")
train_preds <- as.factor(train_preds)
confusionMatrix(train_preds, train_data_hybrid$y) # Evaluate model performance
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1257 799
## yes 113 1111
##
## Accuracy : 0.722
## 95% CI : (0.7063, 0.7372)
## No Information Rate : 0.5823
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4662
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9175
## Specificity : 0.5817
## Pos Pred Value : 0.6114
## Neg Pred Value : 0.9077
## Prevalence : 0.4177
## Detection Rate : 0.3832
## Detection Prevalence : 0.6268
## Balanced Accuracy : 0.7496
##
## 'Positive' Class : no
##
# Check unique values in train and test
unique(train_data$default)
## [1] no unknown
## Levels: no unknown
unique(test_data$default)
## [1] unknown no
## Levels: no unknown
Predict on test data
# Predict probabilities
test_probs <- predict(trainmodel, newdata = test_data, type = "response")
# Convert probabilities to binary classification
test_preds <- ifelse(test_probs > 0.7, "yes", "no")
test_preds <- factor(test_preds, levels = c("no", "yes"))
# Evaluate test performance
confusionMatrix(test_preds, test_data$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 670 42
## yes 59 48
##
## Accuracy : 0.8767
## 95% CI : (0.8522, 0.8984)
## No Information Rate : 0.8901
## P-Value [Acc > NIR] : 0.8991
##
## Kappa : 0.4178
##
## Mcnemar's Test P-Value : 0.1114
##
## Sensitivity : 0.9191
## Specificity : 0.5333
## Pos Pred Value : 0.9410
## Neg Pred Value : 0.4486
## Prevalence : 0.8901
## Detection Rate : 0.8181
## Detection Prevalence : 0.8694
## Balanced Accuracy : 0.7262
##
## 'Positive' Class : no
##
roc_curve <- roc(test_data$y, test_probs)
best_threshold <- coords(roc_curve, "best", ret = "threshold")
print(best_threshold)
## threshold
## 1 0.675151
Compute ROC curve and AUC
roc_curve <- roc(test_data$y, test_probs)
plot(roc_curve, col = "blue", main = "ROC Curve for Test Data")

auc_value <- auc(roc_curve)
print(paste("AUC:", round(auc_value, 4))) # Print AUC score
## [1] "AUC: 0.7494"