Predicting Term Deposit Subscriptions

Author

Felix Liu, Brian Chen

Published

May 4, 2025

1 Data Preprocessing and Exploratory Data Analysis

1.1 Data Preprocessing

Code
df <- read.csv("data/bank.csv")
df %>%
  head() %>%
  kbl()
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
30 unemployed married primary no 1787 no no cellular 19 oct 79 1 -1 0 unknown no
33 services married secondary no 4789 yes yes cellular 11 may 220 1 339 4 failure no
35 management single tertiary no 1350 yes no cellular 16 apr 185 1 330 1 failure no
30 management married tertiary no 1476 yes yes unknown 3 jun 199 4 -1 0 unknown no
59 blue-collar married secondary no 0 yes no unknown 5 may 226 1 -1 0 unknown no
35 management single tertiary no 747 no no cellular 23 feb 141 2 176 3 failure no
Code
df %>%
  str() %>%
  kbl()
'data.frame':   4521 obs. of  17 variables:
 $ age      : int  30 33 35 30 59 35 36 39 41 43 ...
 $ job      : chr  "unemployed" "services" "management" "management" ...
 $ marital  : chr  "married" "married" "single" "married" ...
 $ education: chr  "primary" "secondary" "tertiary" "tertiary" ...
 $ default  : chr  "no" "no" "no" "no" ...
 $ balance  : int  1787 4789 1350 1476 0 747 307 147 221 -88 ...
 $ housing  : chr  "no" "yes" "yes" "yes" ...
 $ loan     : chr  "no" "yes" "no" "yes" ...
 $ contact  : chr  "cellular" "cellular" "cellular" "unknown" ...
 $ day      : int  19 11 16 3 5 23 14 6 14 17 ...
 $ month    : chr  "oct" "may" "apr" "jun" ...
 $ duration : int  79 220 185 199 226 141 341 151 57 313 ...
 $ campaign : int  1 1 1 4 1 2 1 2 2 1 ...
 $ pdays    : int  -1 339 330 -1 -1 176 330 -1 -1 147 ...
 $ previous : int  0 4 1 0 0 3 2 0 0 2 ...
 $ poutcome : chr  "unknown" "failure" "failure" "unknown" ...
 $ y        : chr  "no" "no" "no" "no" ...
Code
# missing values and special cases  
df$pdays_recode <- ifelse(df$pdays == -1, 0, df$pdays)  

# 2. Convert categorical variables to factors  
categorical_vars <- c("job", "marital", "education", "default", "housing",   
                     "loan", "contact", "month", "poutcome", "y")  
df[categorical_vars] <- lapply(df[categorical_vars], as.factor)  

# Check for outliers  
# Boxplots for numeric variables  
par(mfrow=c(2,3))  
boxplot(df$age, main = "Age")  
boxplot(df$balance, main = "Balance")  
boxplot(df$duration, main = "Call Duration")  
boxplot(df$campaign, main = "Campaign")  
boxplot(df$pdays_recode, main = "Days Since Last Contact")  
boxplot(df$previous, main = "Previous Contacts")  

Code
par(mfrow=c(1,1)) 

1.2 Summary Statistics

Code
# Summary for numeric variables  
summary(df[, sapply(df, is.numeric)])  
      age           balance           day           duration   
 Min.   :19.00   Min.   :-3313   Min.   : 1.00   Min.   :   4  
 1st Qu.:33.00   1st Qu.:   69   1st Qu.: 9.00   1st Qu.: 104  
 Median :39.00   Median :  444   Median :16.00   Median : 185  
 Mean   :41.17   Mean   : 1423   Mean   :15.92   Mean   : 264  
 3rd Qu.:49.00   3rd Qu.: 1480   3rd Qu.:21.00   3rd Qu.: 329  
 Max.   :87.00   Max.   :71188   Max.   :31.00   Max.   :3025  
    campaign          pdays           previous        pdays_recode   
 Min.   : 1.000   Min.   : -1.00   Min.   : 0.0000   Min.   :  0.00  
 1st Qu.: 1.000   1st Qu.: -1.00   1st Qu.: 0.0000   1st Qu.:  0.00  
 Median : 2.000   Median : -1.00   Median : 0.0000   Median :  0.00  
 Mean   : 2.794   Mean   : 39.77   Mean   : 0.5426   Mean   : 40.59  
 3rd Qu.: 3.000   3rd Qu.: -1.00   3rd Qu.: 0.0000   3rd Qu.:  0.00  
 Max.   :50.000   Max.   :871.00   Max.   :25.0000   Max.   :871.00  
Code
# For categorical variables  
for (cat_var in categorical_vars) {  
  cat("\nFrequency table for", cat_var, ":\n")  
  print(table(df[[cat_var]]))  
  cat("\nProportion table for", cat_var, ":\n")  
  print(prop.table(table(df[[cat_var]])))  
}  

Frequency table for job :

       admin.   blue-collar  entrepreneur     housemaid    management 
          478           946           168           112           969 
      retired self-employed      services       student    technician 
          230           183           417            84           768 
   unemployed       unknown 
          128            38 

Proportion table for job :

       admin.   blue-collar  entrepreneur     housemaid    management 
   0.10572882    0.20924574    0.03715992    0.02477328    0.21433311 
      retired self-employed      services       student    technician 
   0.05087370    0.04047777    0.09223623    0.01857996    0.16987392 
   unemployed       unknown 
   0.02831232    0.00840522 

Frequency table for marital :

divorced  married   single 
     528     2797     1196 

Proportion table for marital :

 divorced   married    single 
0.1167883 0.6186684 0.2645432 

Frequency table for education :

  primary secondary  tertiary   unknown 
      678      2306      1350       187 

Proportion table for education :

   primary  secondary   tertiary    unknown 
0.14996682 0.51006415 0.29860650 0.04136253 

Frequency table for default :

  no  yes 
4445   76 

Proportion table for default :

        no        yes 
0.98318956 0.01681044 

Frequency table for housing :

  no  yes 
1962 2559 

Proportion table for housing :

       no       yes 
0.4339748 0.5660252 

Frequency table for loan :

  no  yes 
3830  691 

Proportion table for loan :

       no       yes 
0.8471577 0.1528423 

Frequency table for contact :

 cellular telephone   unknown 
     2896       301      1324 

Proportion table for contact :

  cellular  telephone    unknown 
0.64056625 0.06657819 0.29285556 

Frequency table for month :

 apr  aug  dec  feb  jan  jul  jun  mar  may  nov  oct  sep 
 293  633   20  222  148  706  531   49 1398  389   80   52 

Proportion table for month :

       apr        aug        dec        feb        jan        jul        jun 
0.06480867 0.14001327 0.00442380 0.04910418 0.03273612 0.15616014 0.11745189 
       mar        may        nov        oct        sep 
0.01083831 0.30922362 0.08604291 0.01769520 0.01150188 

Frequency table for poutcome :

failure   other success unknown 
    490     197     129    3705 

Proportion table for poutcome :

   failure      other    success    unknown 
0.10838310 0.04357443 0.02853351 0.81950896 

Frequency table for y :

  no  yes 
4000  521 

Proportion table for y :

     no     yes 
0.88476 0.11524 
Code
# Check for suspicious values  
unique(df$pdays_recode)  
  [1]   0 339 330 176 147 241 152 105 342 101   5  92  56 170 182 297 196 460
 [19] 137 367 145 169 207 266 288 168 345 436  90 183 146 335 347 119   7 271
 [37] 181  88 141 126  61 373 351 242  62  91 308 250 172 265  78  28  79   1
 [55] 188 167  89 164 462 209 321 254  94 364  96 356 149 363 275 325 341 260
 [73] 358  87 303  98 327 337 322 102  99 370  84 212  63  81 191 360 332  80
 [91]  85 247 150 175 382 261 336  58 206 112 199 133 208 253 135 278 140 298
[109] 273 124 281 162 323 349 117   2 256 333 116 268 136 198 357 259 353 174
[127] 371 205 246  69 315 110 461 184 270 127 187  64 130 346 100 352 808 113
[145] 378 292 287 107 293 139 138 193 274  97 103 359 185 674 211 300 334 280
[163] 479  95 262 362 225   3 366  60 190 368 122 343 131 365 299 115 316 180
[181] 154 313 264 350  73 232 204 143 375 186 344 210 248 177 221 189 104 258
[199] 305 171 120 317 178 386 118 404 374 282 179 284 227 291 173 871 238 294
[217] 222 435 340 426 239  83 111 415 255 235 244  38 683 329  59 151 192 158
[235] 338 388 165 348 197 295 109 484 326 369 397 414 319 474  93 249 272 355
[253] 195  82 541 231 153 201 761 114 385 267 161 467  75 106 223 312 148 309
[271] 283  86 166 160 450 500 311 123 159 687 224 361  74  76 286  77  57 219
[289] 331 804 144 234
Code
summary(df$duration)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
      4     104     185     264     329    3025 

1.3 Visualizations

Code
# Histograms for numeric variables  
par(mfrow=c(2,3))  
hist(df$age, col = "lightblue", main = "Age Distribution")  
hist(df$balance, col = "lightblue", main = "Balance Distribution")  
hist(df$duration, col = "lightblue", main = "Call Duration Distribution")  
hist(df$campaign, col = "lightblue", main = "Campaign Contacts Distribution") 
hist(df$pdays_recode, col = "lightblue", main = "Days Since Last Contact Distribution")  
hist(df$previous, col = "lightblue", main = "Previous Contacts Distribution")  

Code
par(mfrow=c(1,1))  


# Bar charts for categorical variables  
barplot(table(df$job), las = 2, main = "Job Distribution", cex.names = 0.7)  

Code
barplot(table(df$education), las = 2, main = "Education Distribution")  

Code
# Correlation heatmap for numeric variables  
numeric_vars <- df[, c("age", "balance", "duration", "campaign", "pdays_recode", "previous")]  
cor_matrix <- cor(numeric_vars)  

# Display the correlation matrix with values  
corrplot(cor_matrix, method = "color", type = "upper",  
         tl.col = "black", tl.srt = 45,  
         title = "Correlation Matrix of Numeric Variables",  
         addCoef.col = "black",   
         number.cex = 0.7,         
         diag = FALSE)             

1.4 Relationship Exploration

Insight:

Previous Campaign Outcomes vs Subscription Rate

Data analysis reveals a significant relationship between the poutcome variable (representing previous marketing campaign results) and subscription rates. Clients with successful (success) marketing history show a remarkably high re-subscription probability of 64.34%, substantially higher than those with failed attempts (failure) at 12.86% and those with no previous records (unknown) at 9.10%.

This indicates that a customer’s historical marketing response is a critical predictor of future subscription behavior, suggesting that banks should prioritize contacting clients with previous successful experiences to maximize conversion rates.

Education Level vs Subscription Rate

Tertiary-educated clients display the highest subscription rate at 14.30%, notably higher than clients with secondary education (10.62%) and primary education (9.44%). This educational gradient suggests that higher levels of education correspond to increased financial literacy and investment awareness, making tertiary-educated clients a prime target segment.

Banking institutions should consider education-based segmentation in their marketing strategies, prioritizing outreach to higher-educated clients while developing tailored approaches for clients with lower educational backgrounds to improve overall conversion efficiency.

Code
# Grouped bar chart for education vs. subscription  
table_edu_y <- table(df$education, df$y)  
table_edu_y_prop <- prop.table(table_edu_y, margin = 1)  
table_edu_y_prop_t <- t(table_edu_y_prop)   
barplot_data <- barplot(table_edu_y_prop_t, beside = TRUE,  
                        col = c("orange", "skyblue"),  
                        legend.text = FALSE,  
                        main = "Subscription Rate by Education Level",  
                        xlab = "Education Level",  
                        ylab = "Proportion",  
                        ylim = c(0, max(table_edu_y_prop_t) + 0.1)) 
text(x = barplot_data,   
     y = as.vector(table_edu_y_prop_t) + 0.03,   
     labels = sprintf("%.1f%%", as.vector(table_edu_y_prop_t) * 100),  
     cex = 0.8)   
legend("topright",                      
       legend = rownames(table_edu_y_prop_t),    
       fill = c("orange", "skyblue"),   
       cex = 0.7,                       
       bty = "n")                       

Code
# poutcome and subscription rate  
poutcome_vs_y <- prop.table(table(df$poutcome, df$y), 1)  
poutcome_vs_y_t <- t(poutcome_vs_y)  
barplot_data <- barplot(poutcome_vs_y_t, beside = TRUE,  
                        col = c("orange", "skyblue"),   
                        legend.text = FALSE,             
                        main = "Subscription Rate by Previous Outcome",  
                        xlab = "Previous Campaign Outcome",  
                        ylab = "Proportion",  
                        ylim = c(0, max(poutcome_vs_y_t) + 0.1))    
text(x = barplot_data,   
     y = as.vector(poutcome_vs_y_t) + 0.03,   
     labels = sprintf("%.1f%%", as.vector(poutcome_vs_y_t) * 100),  
     cex = 0.8)   
legend("topright",                      
       legend = rownames(poutcome_vs_y_t),  
       fill = c("orange", "skyblue"),   
       cex = 0.7,                       
       bty = "n") 

Code
# job vs. subscription  
table_job_y <- table(df$job, df$y)  
table_job_y_prop <- prop.table(table_job_y, margin = 1)  
table_job_y_prop_t <- t(table_job_y_prop)   
barplot_data <- barplot(table_job_y_prop_t, beside = TRUE,  
                        col = c("orange", "skyblue"),  
                        legend.text = FALSE,  
                        main = "Subscription Rate by Job Type",  
                        xlab = "Job Type",  
                        ylab = "Proportion",  
                        las = 2,               
                        cex.names = 0.7,      
                        ylim = c(0, max(table_job_y_prop_t) + 0.1))  
text(x = barplot_data,   
     y = as.vector(table_job_y_prop_t) + 0.03,   
     labels = sprintf("%.1f%%", as.vector(table_job_y_prop_t) * 100),  
     cex = 0.8)   
legend("topright",                      
       legend = rownames(table_job_y_prop_t),    
       fill = c("orange", "skyblue"),   
       cex = 0.7,                       
       bty = "n")   

Code
# Boxplots to compare by subscription status  
library(ggplot2)
library(gridExtra)
library(dplyr)
get_box_stats <- function(x) {
  return(data.frame(
    y = max(x) * 1.1,  
    label = paste("Mean:", round(mean(x), 1),
                 "
Median:", round(median(x), 1))
  ))
}
p1 <- ggplot(df, aes(x = y, y = pdays_recode)) +
  geom_boxplot(fill = "lightblue", alpha = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "blue") +
  stat_summary(
    fun.data = get_box_stats,
    geom = "text",
    hjust = 0,
    vjust = 1.0,
    size = 3
  ) +
  labs(title = "Days Since Last Contact by Subscription",
       x = "Subscription", y = "Days") +
  theme_bw() +
  theme(plot.title = element_text(size = 10))

p2 <- ggplot(df, aes(x = y, y = balance)) +
  geom_boxplot(fill = "lightgreen", alpha = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "blue") +
  stat_summary(
    fun.data = get_box_stats,
    geom = "text",
    hjust = 0,
    vjust = 1.0,
    size = 3
  ) +
  labs(title = "Balance by Subscription",
       x = "Subscription", y = "Balance") +
  theme_bw() +
  theme(plot.title = element_text(size = 10))

p3 <- ggplot(df, aes(x = y, y = duration)) +
  geom_boxplot(fill = "lightpink", alpha = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "blue") +
  stat_summary(
    fun.data = get_box_stats,
    geom = "text",
    hjust = 0,
    vjust = 1.0,
    size = 3
  ) +
  labs(title = "Duration by Subscription",
       x = "Subscription", y = "Duration") +
  theme_bw() +
  theme(plot.title = element_text(size = 10))

p4 <- ggplot(df, aes(x = y, y = campaign)) +
  geom_boxplot(fill = "lightyellow", alpha = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 3, color = "blue") +
  stat_summary(
    fun.data = get_box_stats,
    geom = "text",
    hjust = 0,
    vjust = 1.0,
    size = 3
  ) +
  labs(title = "Campaign Contacts by Subscription",
       x = "Subscription", y = "Campaign Contacts") +
  theme_bw() +
  theme(plot.title = element_text(size = 10))
grid.arrange(p1, p2, p3, p4, ncol = 2)

Code
# Scatterplot for duration vs. campaign  
plot(df$duration, df$campaign, main = "Duration vs. Campaign",   
     xlab = "Duration (seconds)", ylab = "Campaign Contacts",  
     col = "black", pch = 16, cex = 0.7)  

Code
# Correlation between key variables  
key_vars_cor <- cor(df[, c("duration", "campaign", "pdays_recode")])  
print(key_vars_cor) 
               duration    campaign pdays_recode
duration      1.0000000 -0.06838200   0.01035620
campaign     -0.0683820  1.00000000  -0.09299573
pdays_recode  0.0103562 -0.09299573   1.00000000
Code
# Duration vs subscription (mean duration by subscription status)  
tapply(df$duration, df$y, mean) 
      no      yes 
226.3475 552.7428 

2 Multiple Linear Regression

2.1 Split the Data

Code
# Set seed for reproducibility  
set.seed(123)  

# Split data into training (70%) and testing (30%) sets  
train_idx <- sample(1:nrow(df), size = 0.7 * nrow(df))  
train <- df[train_idx, ]  
test <- df[-train_idx, ] 

2.2 Model 1: Predicting campaign (number of contacts)

Code
campaign_vars <- setdiff(names(df), c("y", "campaign", "pdays", "pdays_recode", "duration"))  
campaign_formula <- as.formula(paste("campaign ~", paste(campaign_vars, collapse = " + ")))  

# Fit full model  
model_full_campaign <- lm(campaign_formula, data = train)  
summary(model_full_campaign)  

Call:
lm(formula = campaign_formula, data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-4.365 -1.432 -0.640  0.558 44.920 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -3.199e-01  4.988e-01  -0.641 0.521396    
age                -2.518e-03  6.223e-03  -0.405 0.685720    
jobblue-collar      2.563e-01  2.001e-01   1.281 0.200364    
jobentrepreneur     3.243e-01  3.050e-01   1.063 0.287716    
jobhousemaid       -4.676e-01  3.569e-01  -1.310 0.190273    
jobmanagement       2.327e-01  2.168e-01   1.074 0.283037    
jobretired         -1.775e-01  3.012e-01  -0.589 0.555638    
jobself-employed    5.800e-01  2.971e-01   1.952 0.051020 .  
jobservices         1.680e-01  2.289e-01   0.734 0.463077    
jobstudent          2.325e-01  4.311e-01   0.539 0.589684    
jobtechnician      -1.229e-01  2.003e-01  -0.614 0.539576    
jobunemployed       9.629e-02  3.456e-01   0.279 0.780553    
jobunknown          1.725e-01  6.139e-01   0.281 0.778736    
maritalmarried      1.822e-01  1.616e-01   1.127 0.259762    
maritalsingle       1.359e-01  1.883e-01   0.722 0.470371    
educationsecondary -1.805e-02  1.646e-01  -0.110 0.912667    
educationtertiary   4.157e-02  2.005e-01   0.207 0.835770    
educationunknown   -3.487e-01  2.928e-01  -1.191 0.233803    
defaultyes         -1.658e-01  4.007e-01  -0.414 0.679155    
balance             5.617e-06  1.609e-05   0.349 0.727070    
housingyes          4.499e-01  1.205e-01   3.735 0.000191 ***
loanyes             2.113e-01  1.433e-01   1.475 0.140381    
contacttelephone    3.889e-01  2.118e-01   1.836 0.066401 .  
contactunknown     -1.439e-01  1.699e-01  -0.847 0.397007    
day                 7.922e-02  6.941e-03  11.414  < 2e-16 ***
monthaug            2.441e+00  2.537e-01   9.620  < 2e-16 ***
monthdec            1.184e+00  8.855e-01   1.337 0.181407    
monthfeb            1.358e+00  3.150e-01   4.311 1.68e-05 ***
monthjan           -6.499e-01  3.540e-01  -1.836 0.066440 .  
monthjul            1.616e+00  2.428e-01   6.657 3.28e-11 ***
monthjun            1.852e+00  2.909e-01   6.364 2.25e-10 ***
monthmar            1.218e+00  5.216e-01   2.336 0.019567 *  
monthmay            6.654e-01  2.374e-01   2.803 0.005098 ** 
monthnov           -5.382e-02  2.632e-01  -0.204 0.838009    
monthoct           -2.719e-01  4.285e-01  -0.635 0.525753    
monthsep            4.920e-01  5.102e-01   0.964 0.334988    
previous            3.670e-02  4.144e-02   0.886 0.375869    
poutcomeother       2.650e-01  2.881e-01   0.920 0.357720    
poutcomesuccess    -2.665e-01  3.565e-01  -0.748 0.454745    
poutcomeunknown     3.328e-01  2.112e-01   1.576 0.115228    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.806 on 3124 degrees of freedom
Multiple R-squared:  0.1141,    Adjusted R-squared:  0.103 
F-statistic: 10.32 on 39 and 3124 DF,  p-value: < 2.2e-16
Code
# Apply stepwise selection  
step_model_campaign <- step(model_full_campaign, direction = "both")  
Start:  AIC=6569.44
campaign ~ age + job + marital + education + default + balance + 
    housing + loan + contact + day + month + previous + poutcome

            Df Sum of Sq   RSS    AIC
- job       11    122.81 24726 6563.2
- education  3     15.31 24618 6565.4
- marital    2     10.19 24613 6566.8
- balance    1      0.96 24604 6567.6
- age        1      1.29 24604 6567.6
- default    1      1.35 24604 6567.6
- poutcome   3     35.57 24638 6568.0
- previous   1      6.18 24609 6568.2
<none>                   24603 6569.4
- loan       1     17.13 24620 6569.6
- contact    2     35.85 24639 6570.1
- housing    1    109.86 24713 6581.5
- day        1   1026.01 25629 6696.7
- month     11   1884.75 26488 6781.0

Step:  AIC=6563.2
campaign ~ age + marital + education + default + balance + housing + 
    loan + contact + day + month + previous + poutcome

            Df Sum of Sq   RSS    AIC
- education  3     26.26 24752 6560.6
- balance    1      0.34 24726 6561.2
- default    1      0.86 24726 6561.3
- marital    2     16.69 24742 6561.3
- previous   1      6.42 24732 6562.0
- poutcome   3     40.52 24766 6562.4
- age        1      9.76 24735 6562.4
<none>                   24726 6563.2
- loan       1     18.62 24744 6563.6
- contact    2     34.44 24760 6563.6
+ job       11    122.81 24603 6569.4
- housing    1    117.06 24843 6576.1
- day        1   1023.00 25749 6689.5
- month     11   1870.62 26596 6771.9

Step:  AIC=6560.56
campaign ~ age + marital + default + balance + housing + loan + 
    contact + day + month + previous + poutcome

            Df Sum of Sq   RSS    AIC
- marital    2     16.19 24768 6558.6
- balance    1      0.56 24752 6558.6
- default    1      0.93 24753 6558.7
- previous   1      7.00 24759 6559.5
- poutcome   3     41.45 24793 6559.9
- age        1     11.60 24764 6560.0
<none>                   24752 6560.6
- contact    2     33.83 24786 6560.9
- loan       1     19.16 24771 6561.0
+ education  3     26.26 24726 6563.2
+ job       11    133.76 24618 6565.4
- housing    1    116.36 24868 6573.4
- day        1   1024.85 25777 6686.9
- month     11   1895.43 26647 6772.0

Step:  AIC=6558.63
campaign ~ age + default + balance + housing + loan + contact + 
    day + month + previous + poutcome

            Df Sum of Sq   RSS    AIC
- balance    1      0.80 24769 6556.7
- default    1      1.41 24770 6556.8
- previous   1      6.95 24775 6557.5
- poutcome   3     40.64 24809 6557.8
- age        1     11.47 24780 6558.1
<none>                   24768 6558.6
- contact    2     34.74 24803 6559.1
- loan       1     19.34 24788 6559.1
+ marital    2     16.19 24752 6560.6
+ education  3     25.76 24742 6561.3
+ job       11    139.61 24628 6562.7
- housing    1    123.73 24892 6572.4
- day        1   1026.44 25795 6685.1
- month     11   1911.11 26679 6771.8

Step:  AIC=6556.73
campaign ~ age + default + housing + loan + contact + day + month + 
    previous + poutcome

            Df Sum of Sq   RSS    AIC
- default    1      1.55 24770 6554.9
- previous   1      7.00 24776 6555.6
- poutcome   3     40.68 24810 6555.9
- age        1     11.13 24780 6556.1
<none>                   24769 6556.7
- loan       1     18.97 24788 6557.2
- contact    2     35.04 24804 6557.2
+ balance    1      0.80 24768 6558.6
+ marital    2     16.43 24752 6558.6
+ education  3     26.01 24743 6559.4
+ job       11    139.20 24630 6560.9
- housing    1    123.21 24892 6570.4
- day        1   1026.07 25795 6683.2
- month     11   1913.13 26682 6770.1

Step:  AIC=6554.93
campaign ~ age + housing + loan + contact + day + month + previous + 
    poutcome

            Df Sum of Sq   RSS    AIC
- previous   1      7.00 24778 6553.8
- poutcome   3     40.21 24811 6554.1
- age        1     11.14 24782 6554.3
<none>                   24770 6554.9
- loan       1     18.44 24789 6555.3
- contact    2     35.41 24806 6555.4
+ default    1      1.55 24769 6556.7
+ marital    2     16.95 24754 6556.8
+ balance    1      0.94 24770 6556.8
+ education  3     26.12 24744 6557.6
+ job       11    138.61 24632 6559.2
- housing    1    123.62 24894 6568.7
- day        1   1029.42 25800 6681.8
- month     11   1915.54 26686 6768.6

Step:  AIC=6553.82
campaign ~ age + housing + loan + contact + day + month + poutcome

            Df Sum of Sq   RSS    AIC
- poutcome   3     33.39 24811 6552.1
- age        1     11.52 24789 6553.3
<none>                   24778 6553.8
- loan       1     18.79 24796 6554.2
- contact    2     36.21 24814 6554.4
+ previous   1      7.00 24770 6554.9
+ default    1      1.55 24776 6555.6
+ marital    2     16.92 24760 6555.7
+ balance    1      0.99 24776 6555.7
+ education  3     26.73 24751 6556.4
+ job       11    138.93 24638 6558.0
- housing    1    123.15 24901 6567.5
- day        1   1027.58 25805 6680.4
- month     11   1916.71 26694 6767.6

Step:  AIC=6552.08
campaign ~ age + housing + loan + contact + day + month

            Df Sum of Sq   RSS    AIC
- age        1     13.27 24824 6551.8
- contact    2     31.14 24842 6552.0
<none>                   24811 6552.1
- loan       1     19.91 24831 6552.6
+ poutcome   3     33.39 24778 6553.8
+ default    1      1.06 24810 6553.9
+ balance    1      0.95 24810 6554.0
+ marital    2     16.04 24795 6554.0
+ previous   1      0.18 24811 6554.1
+ education  3     27.10 24784 6554.6
+ job       11    143.42 24667 6555.7
- housing    1    124.91 24936 6566.0
- day        1   1054.47 25865 6681.8
- month     11   2166.45 26977 6795.0

Step:  AIC=6551.77
campaign ~ housing + loan + contact + day + month

            Df Sum of Sq   RSS    AIC
- contact    2     26.06 24850 6551.1
<none>                   24824 6551.8
+ age        1     13.27 24811 6552.1
- loan       1     19.61 24844 6552.3
+ poutcome   3     35.13 24789 6553.3
+ default    1      1.06 24823 6553.6
+ balance    1      0.54 24824 6553.7
+ marital    2     16.15 24808 6553.7
+ previous   1      0.19 24824 6553.7
+ education  3     29.18 24795 6554.1
+ job       11    153.36 24671 6554.2
- housing    1    139.26 24963 6567.5
- day        1   1063.60 25888 6682.5
- month     11   2164.10 26988 6794.2

Step:  AIC=6551.09
campaign ~ housing + loan + day + month

            Df Sum of Sq   RSS    AIC
<none>                   24850 6551.1
- loan       1     17.50 24868 6551.3
+ contact    2     26.06 24824 6551.8
+ age        1      8.18 24842 6552.0
+ marital    2     17.43 24833 6552.9
+ default    1      1.39 24849 6552.9
+ balance    1      0.84 24849 6553.0
+ previous   1      0.03 24850 6553.1
+ poutcome   3     29.28 24821 6553.4
+ education  3     27.79 24822 6553.6
+ job       11    147.44 24703 6554.3
- housing    1    130.48 24981 6565.7
- day        1   1075.41 25926 6683.1
- month     11   2145.33 26996 6791.1
Code
summary(step_model_campaign)  

Call:
lm(formula = campaign ~ housing + loan + day + month, data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-4.494 -1.402 -0.644  0.537 45.050 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept)  0.117140   0.246907   0.474   0.6352    
housingyes   0.468940   0.115325   4.066 4.89e-05 ***
loanyes      0.210676   0.141483   1.489   0.1366    
day          0.079599   0.006819  11.674  < 2e-16 ***
monthaug     2.440119   0.246225   9.910  < 2e-16 ***
monthdec     0.964882   0.871438   1.107   0.2683    
monthfeb     1.357394   0.312565   4.343 1.45e-05 ***
monthjan    -0.657263   0.350846  -1.873   0.0611 .  
monthjul     1.685563   0.236311   7.133 1.21e-12 ***
monthjun     1.785776   0.251025   7.114 1.39e-12 ***
monthmar     1.198133   0.518030   2.313   0.0208 *  
monthmay     0.617325   0.217684   2.836   0.0046 ** 
monthnov    -0.003869   0.260097  -0.015   0.9881    
monthoct    -0.383539   0.425545  -0.901   0.3675    
monthsep     0.387111   0.500248   0.774   0.4391    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2.809 on 3149 degrees of freedom
Multiple R-squared:  0.1052,    Adjusted R-squared:  0.1012 
F-statistic: 26.44 on 14 and 3149 DF,  p-value: < 2.2e-16
Code
# Cross-validation with 10 folds  
set.seed(100)  
model_full_CV_campaign <- train(  
  campaign_formula,   
  data = train,   
  method = "lm",   
  trControl = trainControl(method = "cv", number = 10)  
)  

# the formula from stepwise selection  
step_formula_campaign <- formula(step_model_campaign)  

# Running cross-validation on stepwise model  
model_step_CV_campaign <- train(  
  step_formula_campaign,   
  data = train,   
  method = "lm",   
  trControl = trainControl(method = "cv", number = 10)  
)  

## Compare RMSE  
print(model_full_CV_campaign)  
Linear Regression 

3164 samples
  13 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 2848, 2848, 2846, 2848, 2847, 2848, ... 
Resampling results:

  RMSE      Rsquared    MAE     
  2.800534  0.09911313  1.683191

Tuning parameter 'intercept' was held constant at a value of TRUE
Code
print(model_step_CV_campaign)  
Linear Regression 

3164 samples
   4 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 2847, 2847, 2849, 2847, 2848, 2848, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  2.775264  0.1005481  1.670661

Tuning parameter 'intercept' was held constant at a value of TRUE
Code
# the model with better performance (lower RMSE)  
if(model_step_CV_campaign$results$RMSE < model_full_CV_campaign$results$RMSE) {  
  final_model_campaign <- step_model_campaign  
  cat("Selected model: Stepwise\n")  
} else {  
  final_model_campaign <- model_full_campaign  
  cat("Selected model: Full model\n")  
}  
Selected model: Stepwise
Code
## Model Diagnostics
# Plot residuals  
plot(final_model_campaign$fitted.values, resid(final_model_campaign),  
     main = "Residual Plot - Campaign Model",  
     xlab = "Fitted Values", ylab = "Residuals")  
abline(h = 0, col = "red", lty = 2)  

Code
# Predict on test set  
pred_campaign <- predict(final_model_campaign, newdata = test)  
rmse_campaign <- sqrt(mean((pred_campaign - test$campaign)^2))  
cat("Test RMSE for campaign model:", rmse_campaign, "\n")  
Test RMSE for campaign model: 3.271208 
Code
# Model Equation  
{
  cat("\nCampaign Model Equation:\n")  
  cat("\ncampaign =", round(coef(final_model_campaign)[1], 4))  
  for (i in 2:length(coef(final_model_campaign))) {  
    coef_name <- names(coef(final_model_campaign))[i]  
    coef_value <- coef(final_model_campaign)[i]  
    if (coef_value >= 0) {  
      cat(" +", round(coef_value, 4), "*", coef_name)  
    } else {  
      cat(" -", abs(round(coef_value, 4)), "*", coef_name)  
    }  
  } 
} 

Campaign Model Equation:

campaign = 0.1171 + 0.4689 * housingyes + 0.2107 * loanyes + 0.0796 * day + 2.4401 * monthaug + 0.9649 * monthdec + 1.3574 * monthfeb - 0.6573 * monthjan + 1.6856 * monthjul + 1.7858 * monthjun + 1.1981 * monthmar + 0.6173 * monthmay - 0.0039 * monthnov - 0.3835 * monthoct + 0.3871 * monthsep

2.3 Model 2: Predicting pdays (days since last contact)

Code
pdays_vars <- setdiff(names(df), c("y", "pdays", "pdays_recode", "campaign", "duration"))  
pdays_formula <- as.formula(paste("pdays_recode ~", paste(pdays_vars, collapse = " + ")))  

# Fit full model  
model_full_pdays <- lm(pdays_formula, data = train)  
summary(model_full_pdays)  

Call:
lm(formula = pdays_formula, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-255.20   -7.31   -0.63    7.88  645.97 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         2.604e+02  8.510e+00  30.600  < 2e-16 ***
age                -9.974e-02  1.062e-01  -0.940 0.347536    
jobblue-collar      3.373e+00  3.414e+00   0.988 0.323289    
jobentrepreneur     7.978e+00  5.203e+00   1.533 0.125318    
jobhousemaid       -5.241e+00  6.089e+00  -0.861 0.389520    
jobmanagement       4.161e+00  3.698e+00   1.125 0.260537    
jobretired         -6.920e+00  5.139e+00  -1.346 0.178242    
jobself-employed    1.429e+00  5.069e+00   0.282 0.778102    
jobservices         3.668e+00  3.905e+00   0.939 0.347583    
jobstudent         -8.094e+00  7.355e+00  -1.100 0.271211    
jobtechnician       3.223e+00  3.417e+00   0.943 0.345692    
jobunemployed       6.614e+00  5.896e+00   1.122 0.262031    
jobunknown         -2.952e+00  1.047e+01  -0.282 0.778065    
maritalmarried     -4.718e+00  2.758e+00  -1.711 0.087196 .  
maritalsingle      -6.160e+00  3.212e+00  -1.918 0.055262 .  
educationsecondary -4.626e+00  2.808e+00  -1.648 0.099496 .  
educationtertiary  -7.841e+00  3.421e+00  -2.292 0.021966 *  
educationunknown   -4.024e+00  4.996e+00  -0.805 0.420666    
defaultyes          4.893e+00  6.836e+00   0.716 0.474213    
balance             8.014e-05  2.745e-04   0.292 0.770383    
housingyes          3.104e+00  2.055e+00   1.511 0.130987    
loanyes            -1.104e+00  2.444e+00  -0.452 0.651429    
contacttelephone   -9.911e-01  3.613e+00  -0.274 0.783875    
contactunknown     -1.669e+01  2.898e+00  -5.760 9.24e-09 ***
day                -2.742e-01  1.184e-01  -2.315 0.020668 *  
monthaug           -4.053e+00  4.329e+00  -0.936 0.349208    
monthdec           -1.283e+01  1.511e+01  -0.849 0.395709    
monthfeb           -1.526e+01  5.374e+00  -2.840 0.004537 ** 
monthjan           -1.726e+01  6.039e+00  -2.859 0.004283 ** 
monthjul           -3.616e+00  4.142e+00  -0.873 0.382661    
monthjun            6.164e+00  4.964e+00   1.242 0.214416    
monthmar           -1.968e+01  8.898e+00  -2.212 0.027057 *  
monthmay            1.452e+01  4.050e+00   3.584 0.000343 ***
monthnov           -2.309e+01  4.491e+00  -5.141 2.90e-07 ***
monthoct            5.879e+00  7.310e+00   0.804 0.421329    
monthsep           -5.047e+00  8.704e+00  -0.580 0.562041    
previous           -2.029e+00  7.069e-01  -2.871 0.004125 ** 
poutcomeother      -1.927e+01  4.916e+00  -3.920 9.05e-05 ***
poutcomesuccess    -7.139e+01  6.081e+00 -11.739  < 2e-16 ***
poutcomeunknown    -2.418e+02  3.603e+00 -67.094  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 47.88 on 3124 degrees of freedom
Multiple R-squared:  0.7724,    Adjusted R-squared:  0.7695 
F-statistic: 271.8 on 39 and 3124 DF,  p-value: < 2.2e-16
Code
# Apply stepwise selection  
step_model_pdays <- step(model_full_pdays, direction = "both")  
Start:  AIC=24520.53
pdays_recode ~ age + job + marital + education + default + balance + 
    housing + loan + contact + day + month + previous + poutcome

            Df Sum of Sq      RSS   AIC
- job       11     31432  7192431 24512
- balance    1       195  7161195 24519
- loan       1       468  7161468 24519
- default    1      1174  7162174 24519
- age        1      2023  7163023 24519
- education  3     12102  7173102 24520
- marital    2      8892  7169891 24520
<none>                    7161000 24520
- housing    1      5231  7166230 24521
- day        1     12287  7173286 24524
- previous   1     18888  7179888 24527
- contact    2     76518  7237518 24550
- month     11    300163  7461162 24628
- poutcome   3  11326700 18487699 27515

Step:  AIC=24512.39
pdays_recode ~ age + marital + education + default + balance + 
    housing + loan + contact + day + month + previous + poutcome

            Df Sum of Sq      RSS   AIC
- balance    1        69  7192500 24510
- loan       1       284  7192716 24510
- default    1      1455  7193886 24511
- education  3     12024  7204456 24512
<none>                    7192431 24512
- marital    2     11062  7203494 24513
- age        1      8111  7200543 24514
- housing    1      8565  7200996 24514
- day        1     11942  7204374 24516
- previous   1     19199  7211630 24519
+ job       11     31432  7161000 24520
- contact    2     75917  7268348 24542
- month     11    301366  7493798 24620
- poutcome   3  11326855 18519286 27499

Step:  AIC=24510.42
pdays_recode ~ age + marital + education + default + housing + 
    loan + contact + day + month + previous + poutcome

            Df Sum of Sq      RSS   AIC
- loan       1       299  7192799 24509
- default    1      1423  7193923 24509
- education  3     11973  7204474 24510
<none>                    7192500 24510
- marital    2     10998  7203498 24511
- age        1      8044  7200545 24512
- housing    1      8529  7201029 24512
+ balance    1        69  7192431 24512
- day        1     11961  7204462 24514
- previous   1     19180  7211681 24517
+ job       11     31305  7161195 24519
- contact    2     75907  7268407 24540
- month     11    301926  7494426 24618
- poutcome   3  11328811 18521311 27497

Step:  AIC=24508.55
pdays_recode ~ age + marital + education + default + housing + 
    contact + day + month + previous + poutcome

            Df Sum of Sq      RSS   AIC
- default    1      1358  7194157 24507
- education  3     11974  7204774 24508
<none>                    7192799 24509
- marital    2     10929  7203728 24509
- age        1      8061  7200860 24510
- housing    1      8546  7201345 24510
+ loan       1       299  7192500 24510
+ balance    1        84  7192716 24510
- day        1     11827  7204626 24512
- previous   1     19260  7212060 24515
+ job       11     31105  7161694 24517
- contact    2     76690  7269489 24538
- month     11    303915  7496715 24618
- poutcome   3  11329878 18522677 27495

Step:  AIC=24507.15
pdays_recode ~ age + marital + education + housing + contact + 
    day + month + previous + poutcome

            Df Sum of Sq      RSS   AIC
- education  3     11949  7206106 24506
<none>                    7194157 24507
- marital    2     11311  7205468 24508
+ default    1      1358  7192799 24509
- age        1      8063  7202221 24509
- housing    1      8465  7202622 24509
+ loan       1       234  7193923 24509
+ balance    1        47  7194110 24509
- day        1     12097  7206254 24510
- previous   1     19245  7213402 24514
+ job       11     31414  7162743 24515
- contact    2     76594  7270751 24537
- month     11    303460  7497617 24616
- poutcome   3  11330477 18524634 27494

Step:  AIC=24506.4
pdays_recode ~ age + marital + housing + contact + day + month + 
    previous + poutcome

            Df Sum of Sq      RSS   AIC
<none>                    7206106 24506
- age        1      5310  7211416 24507
+ education  3     11949  7194157 24507
- marital    2     11541  7217647 24508
+ default    1      1332  7204774 24508
- housing    1      8732  7214838 24508
+ loan       1       235  7205871 24508
+ balance    1         8  7206098 24508
- day        1     12502  7218607 24510
- previous   1     19258  7225364 24513
+ job       11     31156  7174950 24515
- contact    2     73717  7279823 24535
- month     11    311270  7517376 24618
- poutcome   3  11327391 18533497 27489
Code
summary(step_model_pdays)  

Call:
lm(formula = pdays_recode ~ age + marital + housing + contact + 
    day + month + previous + poutcome, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-254.72   -6.23   -0.40    6.70  648.25 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       258.4423     7.2818  35.492  < 2e-16 ***
age                -0.1415     0.0930  -1.521 0.128259    
maritalmarried     -4.3430     2.7335  -1.589 0.112205    
maritalsingle      -7.1292     3.1817  -2.241 0.025119 *  
housingyes          3.9386     2.0188   1.951 0.051153 .  
contacttelephone   -1.0544     3.5913  -0.294 0.769093    
contactunknown    -16.3159     2.8856  -5.654 1.70e-08 ***
day                -0.2756     0.1181  -2.334 0.019640 *  
monthaug           -4.0578     4.2598  -0.953 0.340865    
monthdec          -13.1136    15.0580  -0.871 0.383891    
monthfeb          -14.4074     5.3437  -2.696 0.007052 ** 
monthjan          -17.3423     6.0010  -2.890 0.003880 ** 
monthjul           -3.5069     4.1001  -0.855 0.392438    
monthjun            6.6268     4.9373   1.342 0.179631    
monthmar          -21.1928     8.8602  -2.392 0.016820 *  
monthmay           15.2232     4.0372   3.771 0.000166 ***
monthnov          -22.4642     4.4414  -5.058 4.48e-07 ***
monthoct            5.2799     7.2943   0.724 0.469216    
monthsep           -6.9992     8.6088  -0.813 0.416262    
previous           -2.0466     0.7064  -2.897 0.003791 ** 
poutcomeother     -19.0519     4.9073  -3.882 0.000106 ***
poutcomesuccess   -71.4666     6.0672 -11.779  < 2e-16 ***
poutcomeunknown  -241.3302     3.5987 -67.060  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 47.9 on 3141 degrees of freedom
Multiple R-squared:  0.7709,    Adjusted R-squared:  0.7693 
F-statistic: 480.5 on 22 and 3141 DF,  p-value: < 2.2e-16
Code
# Cross-validation with 10 folds  
set.seed(100)  
model_full_CV_pdays <- train(  
  pdays_formula,   
  data = train,   
  method = "lm",   
  trControl = trainControl(method = "cv", number = 10)  
)  

# the formula from stepwise selection  
step_formula_pdays <- formula(step_model_pdays)  

# Running cross-validation on stepwise model  
model_step_CV_pdays <- train(  
  step_formula_pdays,   
  data = train,   
  method = "lm",   
  trControl = trainControl(method = "cv", number = 10)  
)  

## Compare RMSE  
print(model_full_CV_pdays)  
Linear Regression 

3164 samples
  13 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 2847, 2848, 2847, 2848, 2848, 2848, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  48.19108  0.7690839  22.72181

Tuning parameter 'intercept' was held constant at a value of TRUE
Code
print(model_step_CV_pdays)  
Linear Regression 

3164 samples
   8 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 2847, 2847, 2847, 2848, 2848, 2848, ... 
Resampling results:

  RMSE      Rsquared   MAE     
  48.06464  0.7688941  22.14233

Tuning parameter 'intercept' was held constant at a value of TRUE
Code
# Select the model with better performance (lower RMSE)  
if(model_step_CV_pdays$results$RMSE < model_full_CV_pdays$results$RMSE) {  
  final_model_pdays <- step_model_pdays  
  cat("Selected model: Stepwise\n")  
} else {  
  final_model_pdays <- model_full_pdays  
  cat("Selected model: Full model\n")  
}  
Selected model: Stepwise
Code
## Model Diagnostics
# Plot residuals  
plot(final_model_pdays$fitted.values, resid(final_model_pdays),  
     main = "Residual Plot - pdays Model",  
     xlab = "Fitted Values", ylab = "Residuals")  
abline(h = 0, col = "red", lty = 2)  

Code
# Predict on test set  
pred_pdays <- predict(final_model_pdays, newdata = test)  
rmse_pdays <- sqrt(mean((pred_pdays - test$pdays_recode)^2))  
cat("Test RMSE for pdays model:", rmse_pdays, "\n")  
Test RMSE for pdays model: 44.86248 
Code
# Model Equation  
{
  cat("\npdays Model Equation:\n")  
  cat("\npdays_recode =", round(coef(final_model_pdays)[1], 4))  
  for (i in 2:length(coef(final_model_pdays))) {  
    coef_name <- names(coef(final_model_pdays))[i]  
    coef_value <- coef(final_model_pdays)[i]  
    if (coef_value >= 0) {  
      cat(" +", round(coef_value, 4), "*", coef_name)  
    } else {  
      cat(" -", abs(round(coef_value, 4)), "*", coef_name)  
    }  
  }  
}

pdays Model Equation:

pdays_recode = 258.4423 - 0.1415 * age - 4.343 * maritalmarried - 7.1292 * maritalsingle + 3.9386 * housingyes - 1.0544 * contacttelephone - 16.3159 * contactunknown - 0.2756 * day - 4.0578 * monthaug - 13.1136 * monthdec - 14.4074 * monthfeb - 17.3423 * monthjan - 3.5069 * monthjul + 6.6268 * monthjun - 21.1928 * monthmar + 15.2232 * monthmay - 22.4642 * monthnov + 5.2799 * monthoct - 6.9992 * monthsep - 2.0466 * previous - 19.0519 * poutcomeother - 71.4666 * poutcomesuccess - 241.3302 * poutcomeunknown

2.4 Model 3: Predicting duration (length of last contact)

Code
duration_vars <- setdiff(names(df), c("y", "duration", "campaign", "pdays", "pdays_recode"))  
duration_formula <- as.formula(paste("duration ~", paste(duration_vars, collapse = " + ")))  

# Fit full model  
model_full_duration <- lm(duration_formula, data = train)  
summary(model_full_duration)  

Call:
lm(formula = duration_formula, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-416.80 -158.41  -71.09   63.82 2188.89 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        240.668431  45.650946   5.272 1.44e-07 ***
age                 -0.192320   0.569472  -0.338 0.735599    
jobblue-collar      63.767588  18.312893   3.482 0.000504 ***
jobentrepreneur     78.903426  27.910192   2.827 0.004728 ** 
jobhousemaid        90.037307  32.664477   2.756 0.005878 ** 
jobmanagement       62.860964  19.836406   3.169 0.001545 ** 
jobretired          88.974794  27.566379   3.228 0.001261 ** 
jobself-employed    63.476648  27.192850   2.334 0.019642 *  
jobservices         30.668005  20.945706   1.464 0.143249    
jobstudent          17.795470  39.455480   0.451 0.652002    
jobtechnician       29.110406  18.331586   1.588 0.112389    
jobunemployed       88.413497  31.628315   2.795 0.005215 ** 
jobunknown          42.411919  56.184522   0.755 0.450385    
maritalmarried     -30.493692  14.792334  -2.061 0.039342 *  
maritalsingle       -1.529751  17.231493  -0.089 0.929265    
educationsecondary  27.871562  15.059814   1.851 0.064304 .  
educationtertiary    3.966690  18.350270   0.216 0.828873    
educationunknown   -12.541982  26.800201  -0.468 0.639831    
defaultyes         -47.443923  36.670109  -1.294 0.195829    
balance             -0.002174   0.001473  -1.476 0.140040    
housingyes          -1.166730  11.023754  -0.106 0.915718    
loanyes             -9.629006  13.111306  -0.734 0.462757    
contacttelephone   -39.965234  19.381804  -2.062 0.039290 *  
contactunknown       0.219514  15.547686   0.014 0.988736    
day                 -0.827925   0.635208  -1.303 0.192537    
monthaug           -56.074700  23.221259  -2.415 0.015801 *  
monthdec           250.066073  81.032679   3.086 0.002047 ** 
monthfeb           -29.279243  28.828642  -1.016 0.309884    
monthjan           -32.025903  32.394884  -0.989 0.322931    
monthjul            -9.730399  22.217423  -0.438 0.661444    
monthjun           -61.371174  26.626038  -2.305 0.021236 *  
monthmar           -91.804639  47.731970  -1.923 0.054529 .  
monthmay           -22.996602  21.727086  -1.058 0.289941    
monthnov             1.254662  24.087885   0.052 0.958463    
monthoct           -27.148901  39.213614  -0.692 0.488779    
monthsep           -99.068507  46.689508  -2.122 0.033928 *  
previous             5.726879   3.792016   1.510 0.131082    
poutcomeother       30.558220  26.367607   1.159 0.246573    
poutcomesuccess    100.622882  32.621895   3.085 0.002057 ** 
poutcomeunknown     36.787162  19.328241   1.903 0.057095 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 256.8 on 3124 degrees of freedom
Multiple R-squared:  0.02815,   Adjusted R-squared:  0.01602 
F-statistic: 2.321 on 39 and 3124 DF,  p-value: 7.11e-06
Code
# Apply stepwise selection  
step_model_duration <- step(model_full_duration, direction = "both")  
Start:  AIC=35149.88
duration ~ age + job + marital + education + default + balance + 
    housing + loan + contact + day + month + previous + poutcome

            Df Sum of Sq       RSS   AIC
- housing    1       739 206050151 35148
- age        1      7523 206056935 35148
- loan       1     35574 206084986 35148
- default    1    110407 206159820 35150
- day        1    112050 206161462 35150
<none>                   206049413 35150
- balance    1    143696 206193108 35150
- previous   1    150438 206199850 35150
- contact    2    285442 206334855 35150
- education  3    452770 206502183 35151
- job       11   1693386 207742799 35154
- poutcome   3    696258 206745671 35155
- marital    2    568095 206617507 35155
- month     11   2163526 208212939 35161

Step:  AIC=35147.9
duration ~ age + job + marital + education + default + balance + 
    loan + contact + day + month + previous + poutcome

            Df Sum of Sq       RSS   AIC
- age        1      7097 206057248 35146
- loan       1     35452 206085603 35146
- default    1    110227 206160379 35148
- day        1    111490 206161641 35148
<none>                   206050151 35148
- balance    1    143319 206193471 35148
- previous   1    150503 206200654 35148
- contact    2    284774 206334926 35148
- education  3    452050 206502202 35149
+ housing    1       739 206049413 35150
- job       11   1704634 207754786 35152
- marital    2    575636 206625788 35153
- poutcome   3    708498 206758650 35153
- month     11   2249572 208299724 35160

Step:  AIC=35146
duration ~ job + marital + education + default + balance + loan + 
    contact + day + month + previous + poutcome

            Df Sum of Sq       RSS   AIC
- loan       1     35699 206092947 35145
- default    1    110372 206167620 35146
- day        1    110572 206167820 35146
<none>                   206057248 35146
- balance    1    146828 206204076 35146
- previous   1    151178 206208426 35146
- contact    2    304309 206361557 35147
- education  3    467063 206524311 35147
+ age        1      7097 206050151 35148
+ housing    1       313 206056935 35148
- job       11   1722384 207779632 35150
- poutcome   3    710524 206767772 35151
- marital    2    636342 206693590 35152
- month     11   2269263 208326511 35159

Step:  AIC=35144.55
duration ~ job + marital + education + default + balance + contact + 
    day + month + previous + poutcome

            Df Sum of Sq       RSS   AIC
- day        1    105695 206198641 35144
- default    1    116695 206209642 35144
<none>                   206092947 35145
- balance    1    140263 206233210 35145
- previous   1    148782 206241729 35145
- contact    2    299701 206392648 35145
- education  3    453730 206546677 35146
+ loan       1     35699 206057248 35146
+ age        1      7344 206085603 35146
+ housing    1       231 206092716 35147
- job       11   1728040 207820987 35149
- poutcome   3    717873 206810820 35150
- marital    2    638936 206731882 35150
- month     11   2243348 208336295 35157

Step:  AIC=35144.17
duration ~ job + marital + education + default + balance + contact + 
    month + previous + poutcome

            Df Sum of Sq       RSS   AIC
- default    1    109998 206308639 35144
<none>                   206198641 35144
- balance    1    138547 206337188 35144
- previous   1    151691 206350332 35145
+ day        1    105695 206092947 35145
- contact    2    310909 206509550 35145
- education  3    467581 206666222 35145
+ loan       1     30821 206167820 35146
+ age        1      6412 206192229 35146
+ housing    1         7 206198634 35146
- job       11   1746953 207945595 35149
- poutcome   3    712601 206911243 35149
- marital    2    639159 206837800 35150
- month     11   2154392 208353033 35155

Step:  AIC=35143.86
duration ~ job + marital + education + balance + contact + month + 
    previous + poutcome

            Df Sum of Sq       RSS   AIC
- balance    1    124018 206432657 35144
<none>                   206308639 35144
+ default    1    109998 206198641 35144
- previous   1    151075 206459714 35144
+ day        1     98997 206209642 35144
- contact    2    302559 206611198 35144
- education  3    460866 206769505 35145
+ loan       1     36694 206271945 35145
+ age        1      6598 206302041 35146
+ housing    1         1 206308638 35146
- job       11   1734492 208043131 35148
- poutcome   3    705012 207013651 35149
- marital    2    618852 206927491 35149
- month     11   2127787 208436426 35154

Step:  AIC=35143.76
duration ~ job + marital + education + contact + month + previous + 
    poutcome

            Df Sum of Sq       RSS   AIC
<none>                   206432657 35144
+ balance    1    124018 206308639 35144
- previous   1    149030 206581688 35144
+ day        1     97855 206334803 35144
+ default    1     95469 206337188 35144
- contact    2    317430 206750087 35145
- education  3    480215 206912872 35145
+ loan       1     30059 206402598 35145
+ age        1      9670 206422988 35146
+ housing    1        95 206432562 35146
- job       11   1699726 208132383 35148
- poutcome   3    698925 207131582 35148
- marital    2    633174 207065832 35149
- month     11   2098478 208531135 35154
Code
summary(step_model_duration)  

Call:
lm(formula = duration ~ job + marital + education + contact + 
    month + previous + poutcome, data = train)

Residuals:
    Min      1Q  Median      3Q     Max 
-407.21 -157.46  -72.69   65.58 2199.57 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)        212.86037   33.25465   6.401 1.78e-10 ***
jobblue-collar      64.92411   18.29789   3.548 0.000394 ***
jobentrepreneur     75.99421   27.79683   2.734 0.006294 ** 
jobhousemaid        88.26812   32.49658   2.716 0.006639 ** 
jobmanagement       61.93859   19.78221   3.131 0.001758 ** 
jobretired          84.10879   25.38868   3.313 0.000934 ***
jobself-employed    61.78942   27.11018   2.279 0.022722 *  
jobservices         31.06701   20.93262   1.484 0.137871    
jobstudent          21.46687   38.87163   0.552 0.580816    
jobtechnician       28.54625   18.31788   1.558 0.119244    
jobunemployed       89.59062   31.55454   2.839 0.004551 ** 
jobunknown          44.49102   55.92620   0.796 0.426365    
maritalmarried     -30.25982   14.69972  -2.059 0.039622 *  
maritalsingle       -0.04468   16.41872  -0.003 0.997829    
educationsecondary  28.64329   14.89213   1.923 0.054522 .  
educationtertiary    4.76297   18.11407   0.263 0.792612    
educationunknown   -13.30023   26.76044  -0.497 0.619216    
contacttelephone   -41.94706   19.13128  -2.193 0.028410 *  
contactunknown      -3.02578   15.36363  -0.197 0.843884    
monthaug           -53.66417   22.58941  -2.376 0.017579 *  
monthdec           246.59741   80.82912   3.051 0.002301 ** 
monthfeb           -20.07161   27.74507  -0.723 0.469470    
monthjan           -37.88221   31.50769  -1.202 0.229332    
monthjul            -9.30814   21.97246  -0.424 0.671867    
monthjun           -53.19077   25.69105  -2.070 0.038497 *  
monthmar           -87.74503   47.46413  -1.849 0.064601 .  
monthmay           -18.91454   21.55161  -0.878 0.380207    
monthnov            -2.63170   23.97585  -0.110 0.912603    
monthoct           -31.25260   38.80865  -0.805 0.420708    
monthsep           -93.55866   46.44619  -2.014 0.044059 *  
previous             5.69838    3.79080   1.503 0.132885    
poutcomeother       30.23809   26.34988   1.148 0.251238    
poutcomesuccess    101.27051   32.45532   3.120 0.001823 ** 
poutcomeunknown     34.86610   19.23647   1.812 0.070005 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 256.8 on 3130 degrees of freedom
Multiple R-squared:  0.02635,   Adjusted R-squared:  0.01608 
F-statistic: 2.567 on 33 and 3130 DF,  p-value: 2.504e-06
Code
# Cross-validation with 10 folds  
set.seed(100)  
model_full_CV_duration <- train(  
  duration_formula,   
  data = train,   
  method = "lm",   
  trControl = trainControl(method = "cv", number = 10)  
)  

# the formula from stepwise selection  
step_formula_duration <- formula(step_model_duration)  

# Running cross-validation on stepwise model  
model_step_CV_duration <- train(  
  step_formula_duration,   
  data = train,   
  method = "lm",   
  trControl = trainControl(method = "cv", number = 10)  
)  

## Compare RMSE  
print(model_full_CV_duration)  
Linear Regression 

3164 samples
  13 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 2849, 2847, 2847, 2848, 2847, 2848, ... 
Resampling results:

  RMSE      Rsquared    MAE     
  257.3651  0.01280132  177.0185

Tuning parameter 'intercept' was held constant at a value of TRUE
Code
print(model_step_CV_duration)  
Linear Regression 

3164 samples
   7 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 2847, 2848, 2847, 2848, 2847, 2848, ... 
Resampling results:

  RMSE      Rsquared    MAE     
  257.2609  0.01321806  176.9475

Tuning parameter 'intercept' was held constant at a value of TRUE
Code
# Select the model with better performance (lower RMSE)  
if(model_step_CV_duration$results$RMSE < model_full_CV_duration$results$RMSE) {  
  final_model_duration <- step_model_duration  
  cat("Selected model: Stepwise\n")  
} else {  
  final_model_duration <- model_full_duration  
  cat("Selected model: Full model\n")  
}  
Selected model: Stepwise
Code
## Model Diagnostics
# Plot residuals  
plot(final_model_duration$fitted.values, resid(final_model_duration),  
     main = "Residual Plot - Duration Model",  
     xlab = "Fitted Values", ylab = "Residuals")  
abline(h = 0, col = "red", lty = 2)  

Code
# Predict on test set  
pred_duration <- predict(final_model_duration, newdata = test)  
rmse_duration <- sqrt(mean((pred_duration - test$duration)^2))  
cat("Test RMSE for duration model:", rmse_duration, "\n")  
Test RMSE for duration model: 264.5255 
Code
# Model Equation  
{
  cat("\nDuration Model Equation:\n")  
  cat("\nduration =", round(coef(final_model_duration)[1], 4))  
  for (i in 2:length(coef(final_model_duration))) {  
    coef_name <- names(coef(final_model_duration))[i]  
    coef_value <- coef(final_model_duration)[i]  
    if (coef_value >= 0) {  
      cat(" +", round(coef_value, 4), "*", coef_name)  
    } else {  
      cat(" -", abs(round(coef_value, 4)), "*", coef_name)  
    }  
  }  
}

Duration Model Equation:

duration = 212.8604 + 64.9241 * jobblue-collar + 75.9942 * jobentrepreneur + 88.2681 * jobhousemaid + 61.9386 * jobmanagement + 84.1088 * jobretired + 61.7894 * jobself-employed + 31.067 * jobservices + 21.4669 * jobstudent + 28.5463 * jobtechnician + 89.5906 * jobunemployed + 44.491 * jobunknown - 30.2598 * maritalmarried - 0.0447 * maritalsingle + 28.6433 * educationsecondary + 4.763 * educationtertiary - 13.3002 * educationunknown - 41.9471 * contacttelephone - 3.0258 * contactunknown - 53.6642 * monthaug + 246.5974 * monthdec - 20.0716 * monthfeb - 37.8822 * monthjan - 9.3081 * monthjul - 53.1908 * monthjun - 87.745 * monthmar - 18.9145 * monthmay - 2.6317 * monthnov - 31.2526 * monthoct - 93.5587 * monthsep + 5.6984 * previous + 30.2381 * poutcomeother + 101.2705 * poutcomesuccess + 34.8661 * poutcomeunknown

3 Generalized Linear Models

3.1 Model 1: GLMs for pdays

Model ZIP and Hurdle fail to converge, and have NA in standard errors and p-values, so they are excluded from our consideration.

Code
# 1. Poisson model  
glm_poisson_pdays <- glm(  
  pdays_formula,  # Use the same predictors as in the MLR model
  data = train,   
  family = poisson(link = "log")  
)  
summary(glm_poisson_pdays)  

Call:
glm(formula = pdays_formula, family = poisson(link = "log"), 
    data = train)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         5.761e+00  2.699e-02 213.410  < 2e-16 ***
age                -1.853e-03  3.681e-04  -5.034 4.80e-07 ***
jobblue-collar      8.169e-02  1.077e-02   7.582 3.41e-14 ***
jobentrepreneur     2.135e-01  1.838e-02  11.611  < 2e-16 ***
jobhousemaid       -1.999e-01  2.245e-02  -8.906  < 2e-16 ***
jobmanagement       5.469e-02  1.208e-02   4.529 5.93e-06 ***
jobretired         -1.035e-01  1.859e-02  -5.568 2.58e-08 ***
jobself-employed   -1.087e-02  1.853e-02  -0.587  0.55751    
jobservices         1.259e-01  1.268e-02   9.929  < 2e-16 ***
jobstudent         -5.324e-02  2.350e-02  -2.266  0.02346 *  
jobtechnician       9.099e-02  1.104e-02   8.240  < 2e-16 ***
jobunemployed       1.811e-01  1.874e-02   9.667  < 2e-16 ***
jobunknown         -4.311e-01  6.089e-02  -7.080 1.45e-12 ***
maritalmarried     -1.039e-01  9.309e-03 -11.156  < 2e-16 ***
maritalsingle      -1.285e-01  1.075e-02 -11.959  < 2e-16 ***
educationsecondary -1.282e-01  9.785e-03 -13.105  < 2e-16 ***
educationtertiary  -1.657e-01  1.208e-02 -13.717  < 2e-16 ***
educationunknown   -1.671e-01  1.616e-02 -10.338  < 2e-16 ***
defaultyes          8.185e-02  3.017e-02   2.713  0.00667 ** 
balance            -1.237e-06  1.066e-06  -1.161  0.24567    
housingyes          1.379e-01  7.490e-03  18.417  < 2e-16 ***
loanyes            -1.085e-02  9.233e-03  -1.176  0.23977    
contacttelephone   -1.211e-02  1.097e-02  -1.104  0.26954    
contactunknown      3.811e-01  2.157e-02  17.665  < 2e-16 ***
day                -4.559e-03  4.873e-04  -9.356  < 2e-16 ***
monthaug           -7.786e-02  1.494e-02  -5.210 1.88e-07 ***
monthdec           -6.988e-02  2.695e-02  -2.593  0.00952 ** 
monthfeb           -1.863e-01  1.346e-02 -13.837  < 2e-16 ***
monthjan           -1.585e-01  1.584e-02 -10.004  < 2e-16 ***
monthjul           -1.314e-01  2.255e-02  -5.826 5.67e-09 ***
monthjun           -2.265e-01  1.987e-02 -11.400  < 2e-16 ***
monthmar           -2.626e-01  2.607e-02 -10.070  < 2e-16 ***
monthmay            1.384e-01  9.048e-03  15.300  < 2e-16 ***
monthnov           -4.183e-01  1.220e-02 -34.298  < 2e-16 ***
monthoct            1.022e-01  1.658e-02   6.165 7.04e-10 ***
monthsep           -1.021e-01  1.923e-02  -5.309 1.10e-07 ***
previous           -1.222e-02  1.006e-03 -12.146  < 2e-16 ***
poutcomeother      -9.609e-02  7.065e-03 -13.600  < 2e-16 ***
poutcomesuccess    -2.653e-01  1.033e-02 -25.699  < 2e-16 ***
poutcomeunknown    -2.389e+01  1.087e+02  -0.220  0.82607    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for poisson family taken to be 1)

    Null deviance: 473275  on 3163  degrees of freedom
Residual deviance:  25931  on 3124  degrees of freedom
AIC: 30032

Number of Fisher Scoring iterations: 16
Code
# Check for overdispersion  
dispersion <- sum(residuals(glm_poisson_pdays, type = "pearson")^2) / df.residual(glm_poisson_pdays)  
cat("Dispersion parameter:", dispersion, "\n")  
Dispersion parameter: 8.615815 
Code
# 2. Negative Binomial model (if overdispersion is evident)  
glm_nb_pdays <- glm.nb(pdays_formula, data = train)  
summary(glm_nb_pdays)  

Call:
glm.nb(formula = pdays_formula, data = train, init.theta = 3.802227253, 
    link = log)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)         5.762e+00  2.077e-01  27.743  < 2e-16 ***
age                -1.779e-03  2.795e-03  -0.637 0.524422    
jobblue-collar      9.754e-02  8.498e-02   1.148 0.251007    
jobentrepreneur     2.379e-01  1.466e-01   1.623 0.104625    
jobhousemaid       -2.038e-01  1.585e-01  -1.285 0.198645    
jobmanagement       7.207e-02  9.134e-02   0.789 0.430130    
jobretired         -6.839e-02  1.334e-01  -0.513 0.608149    
jobself-employed    1.176e-03  1.383e-01   0.009 0.993217    
jobservices         1.445e-01  1.005e-01   1.438 0.150465    
jobstudent         -9.482e-02  1.637e-01  -0.579 0.562362    
jobtechnician       1.307e-01  8.422e-02   1.552 0.120691    
jobunemployed       2.858e-01  1.434e-01   1.993 0.046209 *  
jobunknown         -3.975e-01  3.211e-01  -1.238 0.215752    
maritalmarried     -1.415e-01  7.541e-02  -1.877 0.060518 .  
maritalsingle      -1.646e-01  8.574e-02  -1.920 0.054865 .  
educationsecondary -1.240e-01  8.051e-02  -1.540 0.123573    
educationtertiary  -1.738e-01  9.449e-02  -1.840 0.065784 .  
educationunknown   -1.281e-01  1.255e-01  -1.021 0.307235    
defaultyes          7.954e-02  2.661e-01   0.299 0.764980    
balance             1.471e-06  8.035e-06   0.183 0.854744    
housingyes          1.553e-01  5.521e-02   2.813 0.004906 ** 
loanyes            -9.573e-03  7.079e-02  -0.135 0.892421    
contacttelephone   -3.044e-02  8.262e-02  -0.368 0.712513    
contactunknown      3.314e-01  1.806e-01   1.835 0.066433 .  
day                -4.456e-03  3.641e-03  -1.224 0.221034    
monthaug           -1.025e-01  1.107e-01  -0.926 0.354472    
monthdec           -5.048e-02  1.901e-01  -0.266 0.790566    
monthfeb           -1.799e-01  1.022e-01  -1.760 0.078400 .  
monthjan           -1.731e-01  1.144e-01  -1.513 0.130306    
monthjul           -1.314e-01  1.613e-01  -0.815 0.415162    
monthjun           -2.333e-01  1.421e-01  -1.641 0.100813    
monthmar           -2.402e-01  1.684e-01  -1.426 0.153824    
monthmay            1.451e-01  7.345e-02   1.975 0.048218 *  
monthnov           -4.312e-01  8.708e-02  -4.952 7.35e-07 ***
monthoct            7.594e-02  1.288e-01   0.589 0.555556    
monthsep           -7.510e-02  1.399e-01  -0.537 0.591276    
previous           -1.354e-02  7.730e-03  -1.752 0.079752 .  
poutcomeother      -1.102e-01  5.480e-02  -2.011 0.044352 *  
poutcomesuccess    -2.679e-01  7.113e-02  -3.766 0.000166 ***
poutcomeunknown    -3.387e+01  1.613e+04  -0.002 0.998324    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Negative Binomial(3.8022) family taken to be 1)

    Null deviance: 60731.23  on 3163  degrees of freedom
Residual deviance:   614.32  on 3124  degrees of freedom
AIC: 6974.8

Number of Fisher Scoring iterations: 1

              Theta:  3.802 
          Std. Err.:  0.228 

 2 x log-likelihood:  -6892.833 
Code
# 3. Zero-Inflated Poisson model  
# For ZIP models
zero_formula <- ~1  # Intercept only for zero model  
zip_pdays <- zeroinfl(  
  formula = as.formula(paste(  
    "pdays_recode ~", paste(attr(terms(pdays_formula), "term.labels"), collapse = " + "),   # Use the same predictors for the count part
    "| 1"  
  )),  
  data = train,   
  dist = "poisson"  
)  
summary(zip_pdays)  

Call:
zeroinfl(formula = as.formula(paste("pdays_recode ~", paste(attr(terms(pdays_formula), 
    "term.labels"), collapse = " + "), "| 1")), data = train, dist = "poisson")

Pearson residuals:
       Min         1Q     Median         3Q        Max 
-1.740e+01 -1.277e-04 -1.010e-04 -9.023e-05  5.103e+01 

Count model coefficients (poisson with log link):
                     Estimate Std. Error z value Pr(>|z|)
(Intercept)         5.761e+00         NA      NA       NA
age                -1.853e-03         NA      NA       NA
jobblue-collar      8.169e-02         NA      NA       NA
jobentrepreneur     2.135e-01         NA      NA       NA
jobhousemaid       -1.999e-01         NA      NA       NA
jobmanagement       5.469e-02         NA      NA       NA
jobretired         -1.035e-01         NA      NA       NA
jobself-employed   -1.087e-02         NA      NA       NA
jobservices         1.259e-01         NA      NA       NA
jobstudent         -5.324e-02         NA      NA       NA
jobtechnician       9.099e-02         NA      NA       NA
jobunemployed       1.811e-01         NA      NA       NA
jobunknown         -4.310e-01         NA      NA       NA
maritalmarried     -1.039e-01         NA      NA       NA
maritalsingle      -1.286e-01         NA      NA       NA
educationsecondary -1.282e-01         NA      NA       NA
educationtertiary  -1.657e-01         NA      NA       NA
educationunknown   -1.671e-01         NA      NA       NA
defaultyes          8.185e-02         NA      NA       NA
balance            -1.237e-06         NA      NA       NA
housingyes          1.379e-01         NA      NA       NA
loanyes            -1.085e-02         NA      NA       NA
contacttelephone   -1.211e-02         NA      NA       NA
contactunknown      3.811e-01         NA      NA       NA
day                -4.559e-03         NA      NA       NA
monthaug           -7.786e-02         NA      NA       NA
monthdec           -6.988e-02         NA      NA       NA
monthfeb           -1.863e-01         NA      NA       NA
monthjan           -1.585e-01         NA      NA       NA
monthjul           -1.314e-01         NA      NA       NA
monthjun           -2.265e-01         NA      NA       NA
monthmar           -2.626e-01         NA      NA       NA
monthmay            1.384e-01         NA      NA       NA
monthnov           -4.183e-01         NA      NA       NA
monthoct            1.022e-01         NA      NA       NA
monthsep           -1.021e-01         NA      NA       NA
previous           -1.222e-02         NA      NA       NA
poutcomeother      -9.609e-02         NA      NA       NA
poutcomesuccess    -2.653e-01         NA      NA       NA
poutcomeunknown    -2.391e+01         NA      NA       NA

Zero-inflation model coefficients (binomial with logit link):
            Estimate Std. Error z value Pr(>|z|)
(Intercept)    -19.4         NA      NA       NA

Number of iterations in BFGS optimization: 65 
Log-likelihood: -1.498e+04 on 41 Df
Code
# 4. Hurdle model  
hurdle_pdays <- hurdle(  
  formula = as.formula(paste(  
    "pdays_recode ~", paste(attr(terms(pdays_formula), "term.labels"), collapse = " + "),   
    "| 1"  
  )),  
  data = train,   
  dist = "poisson"  
)  
summary(hurdle_pdays)  

Call:
hurdle(formula = as.formula(paste("pdays_recode ~", paste(attr(terms(pdays_formula), 
    "term.labels"), collapse = " + "), "| 1")), data = train, dist = "poisson")

Pearson residuals:
    Min      1Q  Median      3Q     Max 
-0.4688 -0.4688 -0.4688 -0.4688 11.9308 

Count model coefficients (truncated poisson with log link):
                     Estimate Std. Error z value Pr(>|z|)
(Intercept)         5.761e+00         NA      NA       NA
age                -1.853e-03         NA      NA       NA
jobblue-collar      8.169e-02         NA      NA       NA
jobentrepreneur     2.135e-01         NA      NA       NA
jobhousemaid       -1.999e-01         NA      NA       NA
jobmanagement       5.469e-02         NA      NA       NA
jobretired         -1.035e-01         NA      NA       NA
jobself-employed   -1.087e-02         NA      NA       NA
jobservices         1.259e-01         NA      NA       NA
jobstudent         -5.324e-02         NA      NA       NA
jobtechnician       9.099e-02         NA      NA       NA
jobunemployed       1.811e-01         NA      NA       NA
jobunknown         -4.311e-01         NA      NA       NA
maritalmarried     -1.039e-01         NA      NA       NA
maritalsingle      -1.285e-01         NA      NA       NA
educationsecondary -1.282e-01         NA      NA       NA
educationtertiary  -1.657e-01         NA      NA       NA
educationunknown   -1.671e-01         NA      NA       NA
defaultyes          8.185e-02         NA      NA       NA
balance            -1.237e-06         NA      NA       NA
housingyes          1.379e-01         NA      NA       NA
loanyes            -1.085e-02         NA      NA       NA
contacttelephone   -1.211e-02         NA      NA       NA
contactunknown      3.811e-01         NA      NA       NA
day                -4.559e-03         NA      NA       NA
monthaug           -7.786e-02         NA      NA       NA
monthdec           -6.988e-02         NA      NA       NA
monthfeb           -1.863e-01         NA      NA       NA
monthjan           -1.585e-01         NA      NA       NA
monthjul           -1.314e-01         NA      NA       NA
monthjun           -2.265e-01         NA      NA       NA
monthmar           -2.626e-01         NA      NA       NA
monthmay            1.384e-01         NA      NA       NA
monthnov           -4.183e-01         NA      NA       NA
monthoct            1.022e-01         NA      NA       NA
monthsep           -1.021e-01         NA      NA       NA
previous           -1.222e-02         NA      NA       NA
poutcomeother      -9.609e-02         NA      NA       NA
poutcomesuccess    -2.653e-01         NA      NA       NA
poutcomeunknown    -2.389e+01         NA      NA       NA
Zero hurdle model coefficients (binomial with logit link):
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -1.51532    0.04626  -32.76   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Number of iterations in BFGS optimization: 2 
Log-likelihood: -1.647e+04 on 41 Df
Code
## Evaluate Models Using RMSE
# Predict on test set for each model and calculate RMSE  
pred_poisson <- predict(glm_poisson_pdays, newdata = test, type = "response")  
rmse_poisson <- sqrt(mean((pred_poisson - test$pdays_recode)^2))  

pred_nb <- predict(glm_nb_pdays, newdata = test, type = "response")  
rmse_nb <- sqrt(mean((pred_nb - test$pdays_recode)^2))  

# pred_zip <- predict(zip_pdays, newdata = test, type = "response")  
# rmse_zip <- sqrt(mean((pred_zip - test$pdays_recode)^2))  
# 
# pred_hurdle <- predict(hurdle_pdays, newdata = test, type = "response")  
# rmse_hurdle <- sqrt(mean((pred_hurdle - test$pdays_recode)^2))  

# Compare RMSE values  
rmse_comparison <- data.frame(  
  Model = c("Poisson", "Negative Binomial"),  
  test_RMSE = c(rmse_poisson, rmse_nb)  
)  
print(rmse_comparison)  
              Model test_RMSE
1           Poisson  42.28214
2 Negative Binomial  42.57622
Code
# Select the best model based on lowest RMSE  
best_model_idx <- which.min(rmse_comparison$test_RMSE)  
best_model_name <- rmse_comparison$Model[best_model_idx]  
cat("Best model for pdays:", best_model_name, "\n")  
Best model for pdays: Poisson 
Code
# Model Equation
best_model <- if(best_model_name == "Poisson") glm_poisson_pdays else glm_nb_pdays

{
  cat("
Best Model (", best_model_name, ") Equation:
")
  cat("
log(pdays) =", round(coef(best_model)[1], 4))
  for (i in 2:length(coef(best_model))) {
    coef_name <- names(coef(best_model))[i]
    coef_value <- coef(best_model)[i]
    if (coef_value >= 0) {
      cat(" +", round(coef_value, 4), "*", coef_name)
    } else {
      cat(" -", abs(round(coef_value, 4)), "*", coef_name)
    }
  }
}

Best Model ( Poisson ) Equation:

log(pdays) = 5.7607 - 0.0019 * age + 0.0817 * jobblue-collar + 0.2135 * jobentrepreneur - 0.1999 * jobhousemaid + 0.0547 * jobmanagement - 0.1035 * jobretired - 0.0109 * jobself-employed + 0.1259 * jobservices - 0.0532 * jobstudent + 0.091 * jobtechnician + 0.1811 * jobunemployed - 0.4311 * jobunknown - 0.1039 * maritalmarried - 0.1285 * maritalsingle - 0.1282 * educationsecondary - 0.1657 * educationtertiary - 0.1671 * educationunknown + 0.0819 * defaultyes - 0 * balance + 0.1379 * housingyes - 0.0109 * loanyes - 0.0121 * contacttelephone + 0.3811 * contactunknown - 0.0046 * day - 0.0779 * monthaug - 0.0699 * monthdec - 0.1863 * monthfeb - 0.1585 * monthjan - 0.1314 * monthjul - 0.2265 * monthjun - 0.2626 * monthmar + 0.1384 * monthmay - 0.4183 * monthnov + 0.1022 * monthoct - 0.1021 * monthsep - 0.0122 * previous - 0.0961 * poutcomeother - 0.2653 * poutcomesuccess - 23.8852 * poutcomeunknown

3.2 Model 2: GLMs for campaign

Code
# Gamma model with log link  
glm_gamma_campaign <- glm(  
  campaign_formula,   
  data = train,   
  family = Gamma(link = "log")  
)  
summary(glm_gamma_campaign)  

Call:
glm(formula = campaign_formula, family = Gamma(link = "log"), 
    data = train)

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -1.041e-01  1.554e-01  -0.670 0.502751    
age                -8.290e-05  1.938e-03  -0.043 0.965885    
jobblue-collar      8.477e-02  6.233e-02   1.360 0.173922    
jobentrepreneur     1.523e-01  9.499e-02   1.604 0.108858    
jobhousemaid       -7.145e-02  1.112e-01  -0.643 0.520481    
jobmanagement       1.001e-01  6.751e-02   1.483 0.138284    
jobretired         -5.185e-02  9.382e-02  -0.553 0.580524    
jobself-employed    1.741e-01  9.255e-02   1.881 0.060030 .  
jobservices         4.417e-02  7.129e-02   0.620 0.535607    
jobstudent          1.669e-01  1.343e-01   1.243 0.214105    
jobtechnician      -2.749e-03  6.239e-02  -0.044 0.964857    
jobunemployed      -4.483e-03  1.076e-01  -0.042 0.966784    
jobunknown          5.231e-02  1.912e-01   0.274 0.784464    
maritalmarried      6.826e-02  5.035e-02   1.356 0.175254    
maritalsingle       6.069e-02  5.865e-02   1.035 0.300837    
educationsecondary -3.499e-03  5.126e-02  -0.068 0.945578    
educationtertiary  -7.578e-03  6.246e-02  -0.121 0.903433    
educationunknown   -1.225e-01  9.121e-02  -1.343 0.179231    
defaultyes         -3.835e-02  1.248e-01  -0.307 0.758683    
balance             1.811e-06  5.012e-06   0.361 0.717906    
housingyes          1.489e-01  3.752e-02   3.970 7.36e-05 ***
loanyes             5.179e-02  4.462e-02   1.161 0.245871    
contacttelephone    1.081e-01  6.597e-02   1.639 0.101412    
contactunknown     -2.819e-02  5.292e-02  -0.533 0.594218    
day                 2.357e-02  2.162e-03  10.901  < 2e-16 ***
monthaug            7.781e-01  7.903e-02   9.845  < 2e-16 ***
monthdec            4.877e-01  2.758e-01   1.768 0.077083 .  
monthfeb            4.629e-01  9.812e-02   4.717 2.49e-06 ***
monthjan           -1.927e-01  1.103e-01  -1.748 0.080602 .  
monthjul            5.536e-01  7.562e-02   7.322 3.10e-13 ***
monthjun            6.505e-01  9.062e-02   7.179 8.76e-13 ***
monthmar            4.913e-01  1.625e-01   3.024 0.002514 ** 
monthmay            2.629e-01  7.395e-02   3.555 0.000384 ***
monthnov           -2.784e-02  8.198e-02  -0.340 0.734179    
monthoct           -2.300e-01  1.335e-01  -1.723 0.084971 .  
monthsep            1.267e-01  1.589e-01   0.797 0.425426    
previous            1.389e-02  1.291e-02   1.076 0.282046    
poutcomeother       1.401e-01  8.974e-02   1.561 0.118536    
poutcomesuccess    -1.134e-01  1.110e-01  -1.021 0.307339    
poutcomeunknown     1.291e-01  6.578e-02   1.963 0.049779 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Gamma family taken to be 0.7640404)

    Null deviance: 1909.7  on 3163  degrees of freedom
Residual deviance: 1545.2  on 3124  degrees of freedom
AIC: 11525

Number of Fisher Scoring iterations: 7
Code
# Inverse Gaussian model with log link  
glm_invgauss_campaign <- glm(  
  campaign_formula,   
  data = train,   
  family = inverse.gaussian(link = "log")  
)  
summary(glm_invgauss_campaign)  

Call:
glm(formula = campaign_formula, family = inverse.gaussian(link = "log"), 
    data = train)

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)        -6.405e-02  1.435e-01  -0.446 0.655480    
age                 4.491e-04  1.864e-03   0.241 0.809617    
jobblue-collar      7.969e-02  5.830e-02   1.367 0.171791    
jobentrepreneur     1.788e-01  9.256e-02   1.932 0.053508 .  
jobhousemaid        1.417e-02  1.059e-01   0.134 0.893590    
jobmanagement       1.094e-01  6.378e-02   1.715 0.086519 .  
jobretired         -2.506e-02  8.658e-02  -0.289 0.772236    
jobself-employed    1.568e-01  9.273e-02   1.691 0.090859 .  
jobservices         2.979e-02  6.684e-02   0.446 0.655837    
jobstudent          2.198e-01  1.307e-01   1.682 0.092753 .  
jobtechnician       2.957e-02  5.836e-02   0.507 0.612404    
jobunemployed      -2.790e-02  9.748e-02  -0.286 0.774713    
jobunknown          3.940e-02  1.770e-01   0.223 0.823916    
maritalmarried      6.661e-02  4.723e-02   1.410 0.158534    
maritalsingle       6.020e-02  5.534e-02   1.088 0.276737    
educationsecondary  6.506e-03  4.946e-02   0.132 0.895366    
educationtertiary  -1.414e-02  6.012e-02  -0.235 0.814083    
educationunknown   -1.101e-01  8.403e-02  -1.310 0.190305    
defaultyes         -1.255e-02  1.193e-01  -0.105 0.916164    
balance             2.311e-06  4.522e-06   0.511 0.609328    
housingyes          1.402e-01  3.615e-02   3.880 0.000107 ***
loanyes             3.018e-02  4.375e-02   0.690 0.490373    
contacttelephone    8.123e-02  6.428e-02   1.264 0.206452    
contactunknown     -2.580e-02  4.951e-02  -0.521 0.602391    
day                 2.144e-02  2.169e-03   9.887  < 2e-16 ***
monthaug            7.131e-01  7.513e-02   9.492  < 2e-16 ***
monthdec            4.682e-01  2.668e-01   1.755 0.079358 .  
monthfeb            4.230e-01  8.715e-02   4.854 1.27e-06 ***
monthjan           -1.865e-01  9.444e-02  -1.974 0.048423 *  
monthjul            5.229e-01  6.992e-02   7.478 9.75e-14 ***
monthjun            6.153e-01  8.382e-02   7.341 2.69e-13 ***
monthmar            4.783e-01  1.576e-01   3.035 0.002425 ** 
monthmay            2.518e-01  6.412e-02   3.928 8.77e-05 ***
monthnov           -4.463e-02  6.911e-02  -0.646 0.518494    
monthoct           -2.400e-01  1.012e-01  -2.373 0.017721 *  
monthsep            1.185e-01  1.310e-01   0.904 0.365890    
previous            1.091e-02  1.155e-02   0.944 0.345026    
poutcomeother       1.576e-01  8.276e-02   1.905 0.056909 .  
poutcomesuccess    -8.490e-02  9.168e-02  -0.926 0.354470    
poutcomeunknown     1.172e-01  5.731e-02   2.045 0.040925 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for inverse.gaussian family taken to be 0.2864534)

    Null deviance: 759.99  on 3163  degrees of freedom
Residual deviance: 632.45  on 3124  degrees of freedom
AIC: 10682

Number of Fisher Scoring iterations: 10
Code
# Predict on test set for each model and calculate RMSE  
pred_gamma <- predict(glm_gamma_campaign, newdata = test, type = "response")  
rmse_gamma <- sqrt(mean((pred_gamma - test$campaign)^2))  

pred_invgauss <- predict(glm_invgauss_campaign, newdata = test, type = "response")  
rmse_invgauss <- sqrt(mean((pred_invgauss - test$campaign)^2))  

# Compare with linear model RMSE  
rmse_comparison <- data.frame(  
  Model = c("Gamma", "Inverse Gaussian"),  
  test_RMSE = c(rmse_gamma, rmse_invgauss)  
)  
print(rmse_comparison)  
             Model test_RMSE
1            Gamma  3.256990
2 Inverse Gaussian  3.267202
Code
# Select the best model based on lowest RMSE  
best_model_idx <- which.min(rmse_comparison$test_RMSE)  
best_model_name <- rmse_comparison$Model[best_model_idx]  
cat("Best model for campaign:", best_model_name, "\n")  
Best model for campaign: Gamma 
Code
# Model Equation

best_model <- glm_gamma_campaign

{
  cat("
Gamma Model Equation (with log link):
")
  cat("
log(campaign) =", round(coef(best_model)[1], 4))
  for (i in 2:length(coef(best_model))) {
    coef_name <- names(coef(best_model))[i]
    coef_value <- coef(best_model)[i]
    if (coef_value >= 0) {
      cat(" +", round(coef_value, 4), "*", coef_name)
    } else {
      cat(" -", abs(round(coef_value, 4)), "*", coef_name)
    }
  }
  
  cat("

Variance function: V(μ) = μ²")
  cat("
Dispersion parameter =", round(summary(best_model)$dispersion, 4))
}

Gamma Model Equation (with log link):

log(campaign) = -0.1041 - 1e-04 * age + 0.0848 * jobblue-collar + 0.1523 * jobentrepreneur - 0.0714 * jobhousemaid + 0.1001 * jobmanagement - 0.0519 * jobretired + 0.1741 * jobself-employed + 0.0442 * jobservices + 0.1669 * jobstudent - 0.0027 * jobtechnician - 0.0045 * jobunemployed + 0.0523 * jobunknown + 0.0683 * maritalmarried + 0.0607 * maritalsingle - 0.0035 * educationsecondary - 0.0076 * educationtertiary - 0.1225 * educationunknown - 0.0383 * defaultyes + 0 * balance + 0.1489 * housingyes + 0.0518 * loanyes + 0.1081 * contacttelephone - 0.0282 * contactunknown + 0.0236 * day + 0.7781 * monthaug + 0.4877 * monthdec + 0.4629 * monthfeb - 0.1927 * monthjan + 0.5536 * monthjul + 0.6505 * monthjun + 0.4913 * monthmar + 0.2629 * monthmay - 0.0278 * monthnov - 0.23 * monthoct + 0.1267 * monthsep + 0.0139 * previous + 0.1401 * poutcomeother - 0.1134 * poutcomesuccess + 0.1291 * poutcomeunknown

Variance function: V(μ) = μ²
Dispersion parameter = 0.764

3.3 Model 3: GLMs for duration

Code
# Gamma model with log link  
glm_gamma_duration <- glm(  
  duration_formula,   
  data = train,   
  family = Gamma(link = "log")  
)  
summary(glm_gamma_duration)  

Call:
glm(formula = duration_formula, family = Gamma(link = "log"), 
    data = train)

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         5.481e+00  1.716e-01  31.943  < 2e-16 ***
age                -8.612e-04  2.141e-03  -0.402 0.687475    
jobblue-collar      2.484e-01  6.884e-02   3.608 0.000313 ***
jobentrepreneur     2.819e-01  1.049e-01   2.687 0.007240 ** 
jobhousemaid        3.774e-01  1.228e-01   3.073 0.002135 ** 
jobmanagement       2.580e-01  7.456e-02   3.461 0.000546 ***
jobretired          3.637e-01  1.036e-01   3.510 0.000454 ***
jobself-employed    2.469e-01  1.022e-01   2.416 0.015767 *  
jobservices         1.434e-01  7.873e-02   1.821 0.068630 .  
jobstudent          7.618e-02  1.483e-01   0.514 0.607505    
jobtechnician       1.212e-01  6.891e-02   1.759 0.078730 .  
jobunemployed       3.281e-01  1.189e-01   2.760 0.005816 ** 
jobunknown          1.481e-01  2.112e-01   0.701 0.483194    
maritalmarried     -1.117e-01  5.560e-02  -2.009 0.044675 *  
maritalsingle      -1.026e-02  6.477e-02  -0.158 0.874182    
educationsecondary  1.156e-01  5.661e-02   2.042 0.041208 *  
educationtertiary   2.629e-02  6.898e-02   0.381 0.703096    
educationunknown   -1.116e-02  1.007e-01  -0.111 0.911828    
defaultyes         -1.884e-01  1.378e-01  -1.367 0.171789    
balance            -8.964e-06  5.536e-06  -1.619 0.105473    
housingyes         -1.223e-02  4.144e-02  -0.295 0.767842    
loanyes            -3.675e-02  4.928e-02  -0.746 0.455958    
contacttelephone   -1.662e-01  7.285e-02  -2.281 0.022620 *  
contactunknown     -4.346e-03  5.844e-02  -0.074 0.940726    
day                -3.624e-03  2.388e-03  -1.518 0.129149    
monthaug           -2.122e-01  8.729e-02  -2.431 0.015108 *  
monthdec            5.223e-01  3.046e-01   1.715 0.086472 .  
monthfeb           -1.082e-01  1.084e-01  -0.998 0.318140    
monthjan           -1.271e-01  1.218e-01  -1.044 0.296779    
monthjul           -3.531e-02  8.351e-02  -0.423 0.672442    
monthjun           -2.342e-01  1.001e-01  -2.340 0.019341 *  
monthmar           -3.726e-01  1.794e-01  -2.077 0.037914 *  
monthmay           -7.474e-02  8.167e-02  -0.915 0.360190    
monthnov            7.382e-03  9.054e-02   0.082 0.935022    
monthoct           -9.119e-02  1.474e-01  -0.619 0.536196    
monthsep           -3.754e-01  1.755e-01  -2.139 0.032522 *  
previous            2.176e-02  1.425e-02   1.527 0.126969    
poutcomeother       1.103e-01  9.911e-02   1.113 0.265977    
poutcomesuccess     3.300e-01  1.226e-01   2.691 0.007163 ** 
poutcomeunknown     1.287e-01  7.265e-02   1.771 0.076664 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for Gamma family taken to be 0.931904)

    Null deviance: 2542.0  on 3163  degrees of freedom
Residual deviance: 2463.5  on 3124  degrees of freedom
AIC: 41466

Number of Fisher Scoring iterations: 6
Code
# Inverse Gaussian model with log link  
glm_invgauss_duration <- glm(  
  duration_formula,   
  data = train,   
  family = inverse.gaussian(link = "inverse")  
)  
summary(glm_invgauss_duration)  

Call:
glm(formula = duration_formula, family = inverse.gaussian(link = "inverse"), 
    data = train)

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         4.161e-03  6.533e-04   6.369 2.19e-10 ***
age                 3.618e-06  8.206e-06   0.441 0.659326    
jobblue-collar     -9.663e-04  2.723e-04  -3.549 0.000392 ***
jobentrepreneur    -1.094e-03  4.013e-04  -2.728 0.006416 ** 
jobhousemaid       -1.454e-03  4.575e-04  -3.178 0.001495 ** 
jobmanagement      -1.026e-03  2.962e-04  -3.463 0.000542 ***
jobretired         -1.394e-03  3.949e-04  -3.529 0.000423 ***
jobself-employed   -9.861e-04  3.976e-04  -2.480 0.013193 *  
jobservices        -5.797e-04  3.114e-04  -1.861 0.062795 .  
jobstudent         -3.162e-04  5.928e-04  -0.533 0.593834    
jobtechnician      -5.007e-04  2.780e-04  -1.801 0.071809 .  
jobunemployed      -1.231e-03  4.359e-04  -2.824 0.004767 ** 
jobunknown         -5.525e-04  8.837e-04  -0.625 0.531922    
maritalmarried      4.154e-04  2.064e-04   2.013 0.044233 *  
maritalsingle       3.617e-05  2.397e-04   0.151 0.880031    
educationsecondary -4.237e-04  2.171e-04  -1.952 0.051030 .  
educationtertiary  -8.551e-05  2.664e-04  -0.321 0.748226    
educationunknown    5.765e-05  3.996e-04   0.144 0.885303    
defaultyes          7.315e-04  5.575e-04   1.312 0.189529    
balance             3.617e-08  2.283e-08   1.584 0.113282    
housingyes          4.129e-05  1.584e-04   0.261 0.794377    
loanyes             1.423e-04  1.895e-04   0.751 0.452855    
contacttelephone    6.387e-04  2.930e-04   2.179 0.029372 *  
contactunknown      1.448e-05  2.205e-04   0.066 0.947673    
day                 1.430e-05  9.158e-06   1.561 0.118511    
monthaug            8.143e-04  3.307e-04   2.463 0.013840 *  
monthdec           -1.161e-03  8.300e-04  -1.399 0.161902    
monthfeb            4.051e-04  4.054e-04   0.999 0.317777    
monthjan            4.764e-04  4.695e-04   1.015 0.310325    
monthjul            1.333e-04  3.081e-04   0.433 0.665240    
monthjun            9.034e-04  3.800e-04   2.377 0.017505 *  
monthmar            1.515e-03  7.606e-04   1.992 0.046435 *  
monthmay            2.776e-04  3.016e-04   0.920 0.357438    
monthnov           -2.526e-05  3.319e-04  -0.076 0.939351    
monthoct            3.482e-04  5.590e-04   0.623 0.533361    
monthsep            1.497e-03  7.206e-04   2.077 0.037888 *  
previous           -7.309e-05  4.854e-05  -1.506 0.132203    
poutcomeother      -4.120e-04  3.691e-04  -1.116 0.264426    
poutcomesuccess    -1.145e-03  4.254e-04  -2.693 0.007129 ** 
poutcomeunknown    -4.678e-04  2.753e-04  -1.699 0.089358 .  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for inverse.gaussian family taken to be 0.003618591)

    Null deviance: 19.112  on 3163  degrees of freedom
Residual deviance: 18.821  on 3124  degrees of freedom
AIC: 42036

Number of Fisher Scoring iterations: 2
Code
# Predict on test set for each model and calculate RMSE  
pred_gamma <- predict(glm_gamma_duration, newdata = test, type = "response")  
rmse_gamma <- sqrt(mean((pred_gamma - test$duration)^2))  

pred_invgauss <- predict(glm_invgauss_duration, newdata = test, type = "response")  
rmse_invgauss <- sqrt(mean((pred_invgauss - test$duration)^2))  

# Compare with linear model RMSE  
rmse_comparison <- data.frame(  
  Model = c("Gamma", "Inverse Gaussian"),  
  test_RMSE = c(rmse_gamma, rmse_invgauss)  
)  
print(rmse_comparison)  
             Model test_RMSE
1            Gamma  264.5334
2 Inverse Gaussian  265.2772
Code
# Select the best model based on lowest RMSE  
best_model_idx <- which.min(rmse_comparison$test_RMSE)  
best_model_name <- rmse_comparison$Model[best_model_idx]  
cat("Best model for duration:", best_model_name, "\n")  
Best model for duration: Gamma 
Code
# Model Equation

{
  cat("
Gamma Model Equation for Duration (with log link):
")
  cat("
log(duration) =", round(coef(glm_gamma_duration)[1], 4))
  for (i in 2:length(coef(glm_gamma_duration))) {
    coef_name <- names(coef(glm_gamma_duration))[i]
    coef_value <- coef(glm_gamma_duration)[i]
    if (coef_value >= 0) {
      cat(" +", round(coef_value, 4), "*", coef_name)
    } else {
      cat(" -", abs(round(coef_value, 4)), "*", coef_name)
    }
  }
  
  cat("

Variance function: V(μ) = μ²")
  cat("
Dispersion parameter =", round(summary(glm_gamma_duration)$dispersion, 4))
}

Gamma Model Equation for Duration (with log link):

log(duration) = 5.4812 - 9e-04 * age + 0.2484 * jobblue-collar + 0.2819 * jobentrepreneur + 0.3774 * jobhousemaid + 0.258 * jobmanagement + 0.3637 * jobretired + 0.2469 * jobself-employed + 0.1434 * jobservices + 0.0762 * jobstudent + 0.1212 * jobtechnician + 0.3281 * jobunemployed + 0.1481 * jobunknown - 0.1117 * maritalmarried - 0.0103 * maritalsingle + 0.1156 * educationsecondary + 0.0263 * educationtertiary - 0.0112 * educationunknown - 0.1884 * defaultyes - 0 * balance - 0.0122 * housingyes - 0.0367 * loanyes - 0.1662 * contacttelephone - 0.0043 * contactunknown - 0.0036 * day - 0.2122 * monthaug + 0.5223 * monthdec - 0.1082 * monthfeb - 0.1271 * monthjan - 0.0353 * monthjul - 0.2342 * monthjun - 0.3726 * monthmar - 0.0747 * monthmay + 0.0074 * monthnov - 0.0912 * monthoct - 0.3754 * monthsep + 0.0218 * previous + 0.1103 * poutcomeother + 0.33 * poutcomesuccess + 0.1287 * poutcomeunknown

Variance function: V(μ) = μ²
Dispersion parameter = 0.9319

4 Binary Classification

4.1 Logistic Regression

Code
y_predictors <- setdiff(names(df), c("y", "pdays"))  
y_formula <- as.formula(paste("y ~", paste(y_predictors, collapse = " + ")))  

# Fit logistic regression model  
logistic_model <- glm(y_formula, data = train, family = binomial)  
summary(logistic_model)  

Call:
glm(formula = y_formula, family = binomial, data = train)

Coefficients:
                     Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -1.804e+00  7.299e-01  -2.472 0.013428 *  
age                -4.764e-03  8.739e-03  -0.545 0.585606    
jobblue-collar     -6.865e-01  3.018e-01  -2.275 0.022905 *  
jobentrepreneur    -1.821e-01  4.363e-01  -0.417 0.676402    
jobhousemaid       -3.449e-01  4.887e-01  -0.706 0.480380    
jobmanagement      -1.529e-01  2.917e-01  -0.524 0.600268    
jobretired          7.269e-01  3.767e-01   1.930 0.053634 .  
jobself-employed    1.350e-01  4.024e-01   0.336 0.737195    
jobservices        -2.594e-01  3.409e-01  -0.761 0.446753    
jobstudent          3.273e-01  4.890e-01   0.669 0.503291    
jobtechnician      -1.669e-01  2.774e-01  -0.602 0.547388    
jobunemployed      -9.854e-01  5.459e-01  -1.805 0.071070 .  
jobunknown         -5.945e-01  9.158e-01  -0.649 0.516247    
maritalmarried     -4.550e-01  2.108e-01  -2.158 0.030915 *  
maritalsingle      -3.094e-01  2.457e-01  -1.259 0.207995    
educationsecondary -2.362e-01  2.442e-01  -0.967 0.333335    
educationtertiary  -1.340e-02  2.801e-01  -0.048 0.961861    
educationunknown   -1.048e+00  4.901e-01  -2.138 0.032550 *  
defaultyes          5.319e-01  5.239e-01   1.015 0.309983    
balance            -1.701e-05  2.003e-05  -0.849 0.395841    
housingyes         -3.336e-01  1.699e-01  -1.963 0.049623 *  
loanyes            -6.111e-01  2.428e-01  -2.517 0.011852 *  
contacttelephone   -2.511e-01  3.092e-01  -0.812 0.416737    
contactunknown     -1.252e+00  2.726e-01  -4.593 4.36e-06 ***
day                 1.782e-02  9.963e-03   1.788 0.073731 .  
monthaug           -2.340e-01  2.961e-01  -0.790 0.429249    
monthdec           -4.041e-01  8.987e-01  -0.450 0.652937    
monthfeb           -2.046e-02  3.669e-01  -0.056 0.955537    
monthjan           -1.431e+00  5.011e-01  -2.856 0.004291 ** 
monthjul           -1.048e+00  3.090e-01  -3.393 0.000692 ***
monthjun            2.743e-01  3.706e-01   0.740 0.459089    
monthmar            1.703e+00  4.618e-01   3.687 0.000227 ***
monthmay           -6.084e-01  2.850e-01  -2.135 0.032743 *  
monthnov           -1.174e+00  3.365e-01  -3.490 0.000484 ***
monthoct            1.669e+00  3.984e-01   4.188 2.81e-05 ***
monthsep            6.182e-01  4.973e-01   1.243 0.213819    
duration            4.456e-03  2.478e-04  17.980  < 2e-16 ***
campaign           -6.490e-02  3.390e-02  -1.915 0.055549 .  
previous           -3.353e-02  4.882e-02  -0.687 0.492191    
poutcomeother       5.222e-01  3.352e-01   1.558 0.119260    
poutcomesuccess     2.528e+00  3.533e-01   7.156 8.34e-13 ***
poutcomeunknown    -4.110e-01  3.842e-01  -1.070 0.284721    
pdays_recode       -5.803e-04  1.193e-03  -0.486 0.626665    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2274.9  on 3163  degrees of freedom
Residual deviance: 1470.1  on 3121  degrees of freedom
AIC: 1556.1

Number of Fisher Scoring iterations: 6
Code
# Apply stepwise selection to find the best model  
step_logistic <- step(logistic_model, direction = "both", trace = 1)  
Start:  AIC=1556.05
y ~ age + job + marital + education + default + balance + housing + 
    loan + contact + day + month + duration + campaign + previous + 
    poutcome + pdays_recode

               Df Deviance    AIC
- pdays_recode  1   1470.3 1554.3
- age           1   1470.3 1554.3
- previous      1   1470.5 1554.5
- balance       1   1470.8 1554.8
- default       1   1471.0 1555.0
<none>              1470.0 1556.0
- job          11   1492.6 1556.6
- marital       2   1474.7 1556.7
- education     3   1476.8 1556.8
- day           1   1473.2 1557.2
- housing       1   1473.9 1557.9
- campaign      1   1474.1 1558.1
- loan          1   1477.0 1561.0
- contact       2   1492.3 1574.3
- poutcome      3   1546.7 1626.7
- month        11   1569.1 1633.1
- duration      1   1915.9 1999.9

Step:  AIC=1554.29
y ~ age + job + marital + education + default + balance + housing + 
    loan + contact + day + month + duration + campaign + previous + 
    poutcome

               Df Deviance    AIC
- age           1   1470.6 1552.6
- previous      1   1470.7 1552.7
- balance       1   1471.0 1553.0
- default       1   1471.2 1553.2
<none>              1470.3 1554.3
- marital       2   1474.8 1554.8
- job          11   1493.0 1555.0
- education     3   1477.0 1555.0
- day           1   1473.6 1555.6
+ pdays_recode  1   1470.0 1556.0
- housing       1   1474.3 1556.3
- campaign      1   1474.3 1556.3
- loan          1   1477.2 1559.2
- contact       2   1492.4 1572.4
- poutcome      3   1551.0 1629.0
- month        11   1569.1 1631.1
- duration      1   1916.1 1998.1

Step:  AIC=1552.59
y ~ job + marital + education + default + balance + housing + 
    loan + contact + day + month + duration + campaign + previous + 
    poutcome

               Df Deviance    AIC
- previous      1   1471.0 1551.0
- balance       1   1471.4 1551.4
- default       1   1471.5 1551.5
<none>              1470.6 1552.6
- marital       2   1475.2 1553.2
- education     3   1477.6 1553.6
- day           1   1473.9 1553.9
+ age           1   1470.3 1554.3
- job          11   1494.3 1554.3
+ pdays_recode  1   1470.3 1554.3
- housing       1   1474.4 1554.4
- campaign      1   1474.6 1554.6
- loan          1   1477.5 1557.5
- contact       2   1492.9 1570.9
- poutcome      3   1551.4 1627.4
- month        11   1569.2 1629.2
- duration      1   1916.1 1996.1

Step:  AIC=1550.98
y ~ job + marital + education + default + balance + housing + 
    loan + contact + day + month + duration + campaign + poutcome

               Df Deviance    AIC
- balance       1   1471.8 1549.8
- default       1   1471.9 1549.9
<none>              1471.0 1551.0
- marital       2   1475.6 1551.6
- education     3   1477.8 1551.8
- day           1   1474.3 1552.3
+ previous      1   1470.6 1552.6
- job          11   1494.6 1552.6
+ age           1   1470.7 1552.7
+ pdays_recode  1   1470.8 1552.8
- housing       1   1474.8 1552.8
- campaign      1   1475.0 1553.0
- loan          1   1478.0 1556.0
- contact       2   1493.3 1569.3
- month        11   1569.5 1627.5
- poutcome      3   1560.8 1634.8
- duration      1   1916.2 1994.2

Step:  AIC=1549.77
y ~ job + marital + education + default + housing + loan + contact + 
    day + month + duration + campaign + poutcome

               Df Deviance    AIC
- default       1   1472.8 1548.8
<none>              1471.8 1549.8
- marital       2   1476.6 1550.6
- education     3   1478.6 1550.6
- job          11   1494.9 1550.9
+ balance       1   1471.0 1551.0
- day           1   1475.3 1551.3
+ previous      1   1471.4 1551.4
+ age           1   1471.5 1551.5
- housing       1   1475.5 1551.5
+ pdays_recode  1   1471.6 1551.6
- campaign      1   1475.8 1551.8
- loan          1   1478.6 1554.6
- contact       2   1494.0 1568.0
- month        11   1569.5 1625.5
- poutcome      3   1561.4 1633.4
- duration      1   1918.3 1994.3

Step:  AIC=1548.76
y ~ job + marital + education + housing + loan + contact + day + 
    month + duration + campaign + poutcome

               Df Deviance    AIC
<none>              1472.8 1548.8
- education     3   1479.3 1549.3
+ default       1   1471.8 1549.8
- marital       2   1477.8 1549.8
+ balance       1   1471.9 1549.9
- job          11   1496.0 1550.0
- day           1   1476.3 1550.3
+ previous      1   1472.4 1550.4
- housing       1   1476.4 1550.4
+ age           1   1472.5 1550.5
+ pdays_recode  1   1472.6 1550.6
- campaign      1   1476.9 1550.9
- loan          1   1479.5 1553.5
- contact       2   1495.4 1567.4
- month        11   1570.3 1624.3
- poutcome      3   1562.1 1632.1
- duration      1   1918.9 1992.9
Code
summary(step_logistic)  

Call:
glm(formula = y ~ job + marital + education + housing + loan + 
    contact + day + month + duration + campaign + poutcome, family = binomial, 
    data = train)

Coefficients:
                    Estimate Std. Error z value Pr(>|z|)    
(Intercept)        -2.307619   0.493340  -4.678 2.90e-06 ***
jobblue-collar     -0.674396   0.301301  -2.238  0.02520 *  
jobentrepreneur    -0.154748   0.429977  -0.360  0.71892    
jobhousemaid       -0.377238   0.484336  -0.779  0.43605    
jobmanagement      -0.162854   0.291133  -0.559  0.57590    
jobretired          0.623192   0.337352   1.847  0.06470 .  
jobself-employed    0.118553   0.402112   0.295  0.76813    
jobservices        -0.253252   0.340505  -0.744  0.45702    
jobstudent          0.377435   0.482791   0.782  0.43435    
jobtechnician      -0.159352   0.276779  -0.576  0.56479    
jobunemployed      -0.993926   0.548732  -1.811  0.07009 .  
jobunknown         -0.591053   0.905840  -0.652  0.51408    
maritalmarried     -0.466512   0.209373  -2.228  0.02587 *  
maritalsingle      -0.293546   0.232212  -1.264  0.20618    
educationsecondary -0.197325   0.241679  -0.816  0.41423    
educationtertiary   0.029973   0.274886   0.109  0.91317    
educationunknown   -1.001211   0.484783  -2.065  0.03890 *  
housingyes         -0.321157   0.168268  -1.909  0.05631 .  
loanyes            -0.603272   0.242598  -2.487  0.01289 *  
contacttelephone   -0.285934   0.303427  -0.942  0.34601    
contactunknown     -1.255110   0.272454  -4.607 4.09e-06 ***
day                 0.018559   0.009922   1.870  0.06142 .  
monthaug           -0.225513   0.294280  -0.766  0.44348    
monthdec           -0.412010   0.893944  -0.461  0.64488    
monthfeb            0.014870   0.364535   0.041  0.96746    
monthjan           -1.396629   0.498517  -2.802  0.00509 ** 
monthjul           -1.014915   0.307376  -3.302  0.00096 ***
monthjun            0.291647   0.370787   0.787  0.43154    
monthmar            1.710013   0.461317   3.707  0.00021 ***
monthmay           -0.592555   0.284030  -2.086  0.03696 *  
monthnov           -1.169031   0.333733  -3.503  0.00046 ***
monthoct            1.607901   0.393698   4.084 4.42e-05 ***
monthsep            0.630528   0.495886   1.272  0.20354    
duration            0.004446   0.000247  17.998  < 2e-16 ***
campaign           -0.066192   0.033907  -1.952  0.05092 .  
poutcomeother       0.530573   0.333139   1.593  0.11124    
poutcomesuccess     2.549988   0.346040   7.369 1.72e-13 ***
poutcomeunknown    -0.169473   0.225736  -0.751  0.45280    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 2274.9  on 3163  degrees of freedom
Residual deviance: 1472.8  on 3126  degrees of freedom
AIC: 1548.8

Number of Fisher Scoring iterations: 6
Code
# Get predicted probabilities on test set  
logistic_probs <- predict(step_logistic, newdata = test, type = "response") 

4.2 Decision Tree

Code
# Fit decision tree model  
tree_model <- rpart(y_formula, data = train, method = "class",   
                   control = rpart.control(cp = 0.01))  

# Plot the tree  
rpart.plot(tree_model)  

Code
# Print the complexity parameter table  
printcp(tree_model)  

Classification tree:
rpart(formula = y_formula, data = train, method = "class", control = rpart.control(cp = 0.01))

Variables actually used in tree construction:
[1] duration job      month    poutcome previous

Root node error: 368/3164 = 0.11631

n= 3164 

        CP nsplit rel error  xerror     xstd
1 0.036232      0   1.00000 1.00000 0.049003
2 0.029891      3   0.89130 0.98098 0.048596
3 0.019022      4   0.86141 0.96739 0.048301
4 0.013587      5   0.84239 0.96739 0.048301
5 0.010870     10   0.77446 0.97283 0.048419
6 0.010000     12   0.75272 0.93750 0.047642
Code
# Get predicted probabilities on test set  
tree_probs <- predict(tree_model, newdata = test, type = "prob")[, 2] 

4.3 Evaluate at Different Thresholds

Final Model and Threshold:

Based on accuracy:

  • Logistic Regression @ cutoff = 0.5
  • It has the highest accuracy (0.9012528)

Based on recall:

  • Logistic Regression @ cutoff = 0.3
  • It has the highest recall (0.4901961)
Code
# Create a function to evaluate model at different thresholds  
evaluate_cutoff <- function(probs, actual, threshold = 0.5, model_name = "Model") {  
  actual <- as.character(actual)  
  pred <- ifelse(probs > threshold, "yes", "no")  
  confusion <- table(Predicted = pred, Actual = actual)  
  TP <- sum(pred == "yes" & actual == "yes")  
  TN <- sum(pred == "no" & actual == "no")  
  FP <- sum(pred == "yes" & actual == "no")  
  FN <- sum(pred == "no" & actual == "yes")  
  accuracy <- (TP + TN) / (TP + TN + FP + FN)  
  cat(paste0("\n", model_name, " @ cutoff = ", threshold, "\n"))  
  print(confusion)  
  cat(sprintf("Accuracy: %.3f\n", accuracy))  
  return(list(confusion = confusion, accuracy = accuracy))  
}  

# Evaluate logistic regression at different thresholds  
evaluate_cutoff(logistic_probs, test$y, 0.3, "Logistic Regression")  

Logistic Regression @ cutoff = 0.3
         Actual
Predicted   no  yes
      no  1135   78
      yes   69   75
Accuracy: 0.892
$confusion
         Actual
Predicted   no  yes
      no  1135   78
      yes   69   75

$accuracy
[1] 0.8916728
Code
evaluate_cutoff(logistic_probs, test$y, 0.5, "Logistic Regression")  

Logistic Regression @ cutoff = 0.5
         Actual
Predicted   no  yes
      no  1169   99
      yes   35   54
Accuracy: 0.901
$confusion
         Actual
Predicted   no  yes
      no  1169   99
      yes   35   54

$accuracy
[1] 0.9012528
Code
evaluate_cutoff(logistic_probs, test$y, 0.7, "Logistic Regression")  

Logistic Regression @ cutoff = 0.7
         Actual
Predicted   no  yes
      no  1186  122
      yes   18   31
Accuracy: 0.897
$confusion
         Actual
Predicted   no  yes
      no  1186  122
      yes   18   31

$accuracy
[1] 0.8968312
Code
# Evaluate decision tree at different thresholds  
evaluate_cutoff(tree_probs, test$y, 0.3, "Decision Tree")  

Decision Tree @ cutoff = 0.3
         Actual
Predicted   no  yes
      no  1141   96
      yes   63   57
Accuracy: 0.883
$confusion
         Actual
Predicted   no  yes
      no  1141   96
      yes   63   57

$accuracy
[1] 0.8828298
Code
evaluate_cutoff(tree_probs, test$y, 0.5, "Decision Tree")  

Decision Tree @ cutoff = 0.5
         Actual
Predicted   no  yes
      no  1161  113
      yes   43   40
Accuracy: 0.885
$confusion
         Actual
Predicted   no  yes
      no  1161  113
      yes   43   40

$accuracy
[1] 0.8850405
Code
evaluate_cutoff(tree_probs, test$y, 0.7, "Decision Tree")  

Decision Tree @ cutoff = 0.7
         Actual
Predicted   no  yes
      no  1171  122
      yes   33   31
Accuracy: 0.886
$confusion
         Actual
Predicted   no  yes
      no  1171  122
      yes   33   31

$accuracy
[1] 0.8857775
Code
## recall comparison
calculate_metrics <- function(probs, actual, threshold = 0.5) {
  actual <- as.character(actual)
  pred <- ifelse(probs > threshold, "yes", "no")
  
  TP <- sum(pred == "yes" & actual == "yes")
  TN <- sum(pred == "no" & actual == "no")
  FP <- sum(pred == "yes" & actual == "no")
  FN <- sum(pred == "no" & actual == "yes")
  
  recall <- TP / (TP + FN)
  
  return(recall)
}


lr_recall_0.3 <- calculate_metrics(logistic_probs, test$y, 0.3)
lr_recall_0.5 <- calculate_metrics(logistic_probs, test$y, 0.5)
lr_recall_0.7 <- calculate_metrics(logistic_probs, test$y, 0.7)

dt_recall_0.3 <- calculate_metrics(tree_probs, test$y, 0.3)
dt_recall_0.5 <- calculate_metrics(tree_probs, test$y, 0.5)
dt_recall_0.7 <- calculate_metrics(tree_probs, test$y, 0.7)

recall_comparison <- data.frame(
  Threshold = c(0.3, 0.5, 0.7),
  Logistic_Regression = c(lr_recall_0.3, lr_recall_0.5, lr_recall_0.7),
  Decision_Tree = c(dt_recall_0.3, dt_recall_0.5, dt_recall_0.7)
)

{
  cat("\nRecall Comparison Table:\n\n")
  print(recall_comparison)
}

Recall Comparison Table:

  Threshold Logistic_Regression Decision_Tree
1       0.3           0.4901961     0.3725490
2       0.5           0.3529412     0.2614379
3       0.7           0.2026144     0.2026144
Code
# Compute ROC curves for both models  
roc_logistic <- roc(response = test$y, predictor = logistic_probs)  
roc_tree <- roc(response = test$y, predictor = tree_probs)  

# Plot ROC curves  
library(ggplot2)  
library(pROC)  
roc_data <- data.frame(  
  FPR_logistic = 1 - roc_logistic$specificities,  
  TPR_logistic = roc_logistic$sensitivities,  
  FPR_tree = 1 - roc_tree$specificities,  
  TPR_tree = roc_tree$sensitivities  
)  
ggplot() +  
  geom_line(data = roc_data, aes(x = FPR_logistic, y = TPR_logistic, color = "Logistic Regression"), size = 1.2) +  
  geom_line(data = roc_data, aes(x = FPR_tree, y = TPR_tree, color = "Decision Tree"), size = 1.2) +  
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +  
  scale_color_manual(name = "Models",   
                    values = c("Logistic Regression" = "#1E88E5", "Decision Tree" = "#D81B60"),  
                    labels = c(paste0("Logistic Regression (AUC = ", round(auc(roc_logistic), 3), ")"),  
                              paste0("Decision Tree (AUC = ", round(auc(roc_tree), 3), ")"))) +  
  labs(title = "ROC Curves for Bank Term Deposit Subscription Models",  
       x = "False Positive Rate (1 - Specificity)",  
       y = "True Positive Rate (Sensitivity)") +  
  theme_minimal() +  
  coord_fixed(ratio = 1) +
  theme(legend.position = "bottom",  
        plot.title = element_text(face = "bold", hjust = 0.5),  
        legend.title = element_text(face = "bold")) 

5 Principal Component Analysis and PCA Regression

The PCA regression analysis results indicate that this model significantly outperforms the original multiple linear regression model, achieving a 33.4% reduction in prediction error (RMSE decreased from 264.53 to 176.11).

The first three principal components collectively explain 62.4% of the total variance, with PC1 (26.6%) primarily reflecting customer contact history information, PC2 (18.3%) capturing customer demographic and financial status, and PC3 (17.5%) representing the inverse relationship between marketing campaign intensity and call duration.

This dimensionality reduction method not only improves predictive performance but also eliminates multicollinearity issues, providing the bank’s marketing team with clearer insights into the factors influencing customer call duration.

5.1 Select and Standardize Numeric Predictors

Code
# Select the numeric variables specified  
numeric_vars <- c("age", "balance", "duration", "campaign", "pdays_recode", "previous")  
numeric_data <- train[, numeric_vars]  # perform PCA using train data

# Standardize the data  
numeric_scaled <- scale(numeric_data) 

5.2 Perform PCA

Code
# Run PCA on the standardized data  
pca_result <- prcomp(numeric_scaled, center = TRUE, scale. = TRUE)  

# Summary of the PCA results  
summary(pca_result)  
Importance of components:
                         PC1    PC2    PC3    PC4    PC5     PC6
Standard deviation     1.263 1.0481 1.0253 0.9601 0.9531 0.65134
Proportion of Variance 0.266 0.1831 0.1752 0.1536 0.1514 0.07071
Cumulative Proportion  0.266 0.4491 0.6243 0.7779 0.9293 1.00000
Code
# Look at the loadings (contributions of each variable to the components)  
print(pca_result$rotation) 
                     PC1          PC2         PC3         PC4         PC5
age           0.02717945  0.687040768 -0.09553200 -0.08430894 -0.71482233
balance      -0.01944040  0.691586097 -0.02213962 -0.20597102  0.69137833
duration     -0.06312925 -0.199650775 -0.73004588 -0.64937273 -0.01987669
campaign      0.17772050 -0.098774517  0.65623193 -0.72029958 -0.09068348
pdays_recode -0.69493145 -0.005568194  0.11891022 -0.02696177 -0.03898136
previous     -0.69309722  0.005984359  0.11241216 -0.09604471 -0.02978123
                      PC6
age           0.005561717
balance       0.020310821
duration      0.033275462
campaign      0.031043446
pdays_recode  0.707568479
previous     -0.704863415
Code
# correlation plot between the original variables and the principal components
loadings <- pca_result$rotation  
loadings_df <- as.data.frame(loadings[, 1:3])  
loadings_df$Variable <- rownames(loadings_df)  
library(reshape2)  
loadings_long <- melt(loadings_df, id.vars = "Variable")  
colnames(loadings_long) <- c("Variable", "Component", "Loading")  
ggplot(loadings_long, aes(x = Variable, y = Loading, fill = Loading)) +  
  geom_bar(stat = "identity") +  
  facet_wrap(~ Component, ncol = 1) +  
  scale_fill_gradient2(low = "lightgreen", 
                       mid = "white", 
                       high = "palevioletred", midpoint = 0) +  
  theme_minimal() +  
  labs(title = "Principal Component Loadings",  
       x = "",  
       y = "Loading Value") +  
  theme(axis.text.x = element_text(angle = 45, hjust = 1),  
        panel.grid.major.x = element_blank(),  
        panel.grid.minor = element_blank(),  
        legend.position = "none") +  
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray50")  

5.3 Scree Plot and Variance Explained

Code
# Extract the explained variance for each component  
scree_values <- summary(pca_result)$importance[2, ]  
cumulative_var <- summary(pca_result)$importance[3, ]  

# Create scree plot  
plot(scree_values, type = "b", pch = 19,  
     xlab = "Principal Component",   
     ylab = "Proportion of Variance Explained",  
     main = "Scree Plot")  

Code
# Plot cumulative variance  
plot(cumulative_var, type = "b", pch = 19,  
     xlab = "Principal Component",   
     ylab = "Cumulative Proportion of Variance Explained",  
     main = "Cumulative Variance Explained")  
abline(h = 0.9, col = "red", lty = 2)  # 90% threshold line  

Code
# Determine number of components to retain (90% variance)  
num_components <- which(cumulative_var >= 0.9)[1]  
{
  cat("\nCumulative Proportion of Variance Explained:\n\n")
  print(cumulative_var)
  cat("\nNumber of components to retain for 90% variance:", num_components, "\n")  
}

Cumulative Proportion of Variance Explained:

    PC1     PC2     PC3     PC4     PC5     PC6 
0.26596 0.44905 0.62426 0.77790 0.92929 1.00000 

Number of components to retain for 90% variance: 5 

5.4 PCA Regression

Code
# Split the PCA scores into training and testing sets  
pc_train <- as.data.frame(pca_result$x[, 1:3])

# For training set  
# pc_train <- pc_scores[train_idx, 1:3]  # First 3 components  
pc_train$duration <- train$duration  

# Fit linear regression model using the principal components  
lm_pca <- lm(duration ~ ., data = pc_train)  
summary(lm_pca)  

Call:
lm(formula = duration ~ ., data = pc_train)

Residuals:
    Min      1Q  Median      3Q     Max 
-231.50 -106.11  -40.54   57.78 1828.17 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  266.115      2.874  92.590  < 2e-16 ***
PC1          -16.344      2.276  -7.183 8.49e-13 ***
PC2          -51.690      2.743 -18.847  < 2e-16 ***
PC3         -189.011      2.804 -67.418  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 161.7 on 3160 degrees of freedom
Multiple R-squared:  0.6105,    Adjusted R-squared:  0.6101 
F-statistic:  1651 on 3 and 3160 DF,  p-value: < 2.2e-16

5.5 Predict and Evaluate on Test Set

Code
# For testing set  
# Project test data onto the same principal components  
test_scaled <- scale(test[, numeric_vars],   
                    center = attr(numeric_scaled, "scaled:center"),  
                    scale = attr(numeric_scaled, "scaled:scale"))  
pc_test <- as.data.frame(predict(pca_result, newdata = test_scaled)[, 1:3])  

# Predict duration using the PCA regression model  
pred_pca <- predict(lm_pca, newdata = pc_test)  

# Calculate RMSE  
rmse_pca <- sqrt(mean((pred_pca - test$duration)^2))  
cat("RMSE from PCA regression:", rmse_pca, "\n")  
RMSE from PCA regression: 176.1062 
Code
# Compare with the MLR model from Task 2  
cat("RMSE from MLR model (Task 2):", rmse_duration, "\n")  
RMSE from MLR model (Task 2): 264.5255 
Code
# PCA regression equation  
{
  cat("PCA Regression Equation:\n")  
  cat("duration =", round(coef(lm_pca)[1], 4))  
  for (i in 2:length(coef(lm_pca))) {  
    coef_name <- names(coef(lm_pca))[i]  
    coef_value <- coef(lm_pca)[i]  
    if (coef_value >= 0) {  
      cat(" +", round(coef_value, 4), "*", coef_name)  
    } else {  
      cat(" -", abs(round(coef_value, 4)), "*", coef_name)  
    }  
  }  
}
PCA Regression Equation:
duration = 266.1147 - 16.3444 * PC1 - 51.6902 * PC2 - 189.0112 * PC3