# missing values and special cases df$pdays_recode <-ifelse(df$pdays ==-1, 0, df$pdays) # 2. Convert categorical variables to factors categorical_vars <-c("job", "marital", "education", "default", "housing", "loan", "contact", "month", "poutcome", "y") df[categorical_vars] <-lapply(df[categorical_vars], as.factor) # Check for outliers # Boxplots for numeric variables par(mfrow=c(2,3)) boxplot(df$age, main ="Age") boxplot(df$balance, main ="Balance") boxplot(df$duration, main ="Call Duration") boxplot(df$campaign, main ="Campaign") boxplot(df$pdays_recode, main ="Days Since Last Contact") boxplot(df$previous, main ="Previous Contacts")
Code
par(mfrow=c(1,1))
1.2 Summary Statistics
Code
# Summary for numeric variables summary(df[, sapply(df, is.numeric)])
age balance day duration
Min. :19.00 Min. :-3313 Min. : 1.00 Min. : 4
1st Qu.:33.00 1st Qu.: 69 1st Qu.: 9.00 1st Qu.: 104
Median :39.00 Median : 444 Median :16.00 Median : 185
Mean :41.17 Mean : 1423 Mean :15.92 Mean : 264
3rd Qu.:49.00 3rd Qu.: 1480 3rd Qu.:21.00 3rd Qu.: 329
Max. :87.00 Max. :71188 Max. :31.00 Max. :3025
campaign pdays previous pdays_recode
Min. : 1.000 Min. : -1.00 Min. : 0.0000 Min. : 0.00
1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000 1st Qu.: 0.00
Median : 2.000 Median : -1.00 Median : 0.0000 Median : 0.00
Mean : 2.794 Mean : 39.77 Mean : 0.5426 Mean : 40.59
3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000 3rd Qu.: 0.00
Max. :50.000 Max. :871.00 Max. :25.0000 Max. :871.00
Code
# For categorical variables for (cat_var in categorical_vars) { cat("\nFrequency table for", cat_var, ":\n") print(table(df[[cat_var]])) cat("\nProportion table for", cat_var, ":\n") print(prop.table(table(df[[cat_var]]))) }
Frequency table for job :
admin. blue-collar entrepreneur housemaid management
478 946 168 112 969
retired self-employed services student technician
230 183 417 84 768
unemployed unknown
128 38
Proportion table for job :
admin. blue-collar entrepreneur housemaid management
0.10572882 0.20924574 0.03715992 0.02477328 0.21433311
retired self-employed services student technician
0.05087370 0.04047777 0.09223623 0.01857996 0.16987392
unemployed unknown
0.02831232 0.00840522
Frequency table for marital :
divorced married single
528 2797 1196
Proportion table for marital :
divorced married single
0.1167883 0.6186684 0.2645432
Frequency table for education :
primary secondary tertiary unknown
678 2306 1350 187
Proportion table for education :
primary secondary tertiary unknown
0.14996682 0.51006415 0.29860650 0.04136253
Frequency table for default :
no yes
4445 76
Proportion table for default :
no yes
0.98318956 0.01681044
Frequency table for housing :
no yes
1962 2559
Proportion table for housing :
no yes
0.4339748 0.5660252
Frequency table for loan :
no yes
3830 691
Proportion table for loan :
no yes
0.8471577 0.1528423
Frequency table for contact :
cellular telephone unknown
2896 301 1324
Proportion table for contact :
cellular telephone unknown
0.64056625 0.06657819 0.29285556
Frequency table for month :
apr aug dec feb jan jul jun mar may nov oct sep
293 633 20 222 148 706 531 49 1398 389 80 52
Proportion table for month :
apr aug dec feb jan jul jun
0.06480867 0.14001327 0.00442380 0.04910418 0.03273612 0.15616014 0.11745189
mar may nov oct sep
0.01083831 0.30922362 0.08604291 0.01769520 0.01150188
Frequency table for poutcome :
failure other success unknown
490 197 129 3705
Proportion table for poutcome :
failure other success unknown
0.10838310 0.04357443 0.02853351 0.81950896
Frequency table for y :
no yes
4000 521
Proportion table for y :
no yes
0.88476 0.11524
Code
# Check for suspicious values unique(df$pdays_recode)
Min. 1st Qu. Median Mean 3rd Qu. Max.
4 104 185 264 329 3025
1.3 Visualizations
Code
# Histograms for numeric variables par(mfrow=c(2,3)) hist(df$age, col ="lightblue", main ="Age Distribution") hist(df$balance, col ="lightblue", main ="Balance Distribution") hist(df$duration, col ="lightblue", main ="Call Duration Distribution") hist(df$campaign, col ="lightblue", main ="Campaign Contacts Distribution") hist(df$pdays_recode, col ="lightblue", main ="Days Since Last Contact Distribution") hist(df$previous, col ="lightblue", main ="Previous Contacts Distribution")
Code
par(mfrow=c(1,1)) # Bar charts for categorical variables barplot(table(df$job), las =2, main ="Job Distribution", cex.names =0.7)
Code
barplot(table(df$education), las =2, main ="Education Distribution")
Code
# Correlation heatmap for numeric variables numeric_vars <- df[, c("age", "balance", "duration", "campaign", "pdays_recode", "previous")] cor_matrix <-cor(numeric_vars) # Display the correlation matrix with values corrplot(cor_matrix, method ="color", type ="upper", tl.col ="black", tl.srt =45, title ="Correlation Matrix of Numeric Variables", addCoef.col ="black", number.cex =0.7, diag =FALSE)
1.4 Relationship Exploration
Insight:
Previous Campaign Outcomes vs Subscription Rate
Data analysis reveals a significant relationship between the poutcome variable (representing previous marketing campaign results) and subscription rates. Clients with successful (success) marketing history show a remarkably high re-subscription probability of 64.34%, substantially higher than those with failed attempts (failure) at 12.86% and those with no previous records (unknown) at 9.10%.
This indicates that a customer’s historical marketing response is a critical predictor of future subscription behavior, suggesting that banks should prioritize contacting clients with previous successful experiences to maximize conversion rates.
Education Level vs Subscription Rate
Tertiary-educated clients display the highest subscription rate at 14.30%, notably higher than clients with secondary education (10.62%) and primary education (9.44%). This educational gradient suggests that higher levels of education correspond to increased financial literacy and investment awareness, making tertiary-educated clients a prime target segment.
Banking institutions should consider education-based segmentation in their marketing strategies, prioritizing outreach to higher-educated clients while developing tailored approaches for clients with lower educational backgrounds to improve overall conversion efficiency.
Code
# Grouped bar chart for education vs. subscription table_edu_y <-table(df$education, df$y) table_edu_y_prop <-prop.table(table_edu_y, margin =1) table_edu_y_prop_t <-t(table_edu_y_prop) barplot_data <-barplot(table_edu_y_prop_t, beside =TRUE, col =c("orange", "skyblue"), legend.text =FALSE, main ="Subscription Rate by Education Level", xlab ="Education Level", ylab ="Proportion", ylim =c(0, max(table_edu_y_prop_t) +0.1)) text(x = barplot_data, y =as.vector(table_edu_y_prop_t) +0.03, labels =sprintf("%.1f%%", as.vector(table_edu_y_prop_t) *100), cex =0.8) legend("topright", legend =rownames(table_edu_y_prop_t), fill =c("orange", "skyblue"), cex =0.7, bty ="n")
Code
# poutcome and subscription rate poutcome_vs_y <-prop.table(table(df$poutcome, df$y), 1) poutcome_vs_y_t <-t(poutcome_vs_y) barplot_data <-barplot(poutcome_vs_y_t, beside =TRUE, col =c("orange", "skyblue"), legend.text =FALSE, main ="Subscription Rate by Previous Outcome", xlab ="Previous Campaign Outcome", ylab ="Proportion", ylim =c(0, max(poutcome_vs_y_t) +0.1)) text(x = barplot_data, y =as.vector(poutcome_vs_y_t) +0.03, labels =sprintf("%.1f%%", as.vector(poutcome_vs_y_t) *100), cex =0.8) legend("topright", legend =rownames(poutcome_vs_y_t), fill =c("orange", "skyblue"), cex =0.7, bty ="n")
Code
# job vs. subscription table_job_y <-table(df$job, df$y) table_job_y_prop <-prop.table(table_job_y, margin =1) table_job_y_prop_t <-t(table_job_y_prop) barplot_data <-barplot(table_job_y_prop_t, beside =TRUE, col =c("orange", "skyblue"), legend.text =FALSE, main ="Subscription Rate by Job Type", xlab ="Job Type", ylab ="Proportion", las =2, cex.names =0.7, ylim =c(0, max(table_job_y_prop_t) +0.1)) text(x = barplot_data, y =as.vector(table_job_y_prop_t) +0.03, labels =sprintf("%.1f%%", as.vector(table_job_y_prop_t) *100), cex =0.8) legend("topright", legend =rownames(table_job_y_prop_t), fill =c("orange", "skyblue"), cex =0.7, bty ="n")
Code
# Boxplots to compare by subscription status library(ggplot2)library(gridExtra)library(dplyr)get_box_stats <-function(x) {return(data.frame(y =max(x) *1.1, label =paste("Mean:", round(mean(x), 1),"Median:", round(median(x), 1)) ))}p1 <-ggplot(df, aes(x = y, y = pdays_recode)) +geom_boxplot(fill ="lightblue", alpha =0.5) +stat_summary(fun = mean, geom ="point", shape =20, size =3, color ="blue") +stat_summary(fun.data = get_box_stats,geom ="text",hjust =0,vjust =1.0,size =3 ) +labs(title ="Days Since Last Contact by Subscription",x ="Subscription", y ="Days") +theme_bw() +theme(plot.title =element_text(size =10))p2 <-ggplot(df, aes(x = y, y = balance)) +geom_boxplot(fill ="lightgreen", alpha =0.5) +stat_summary(fun = mean, geom ="point", shape =20, size =3, color ="blue") +stat_summary(fun.data = get_box_stats,geom ="text",hjust =0,vjust =1.0,size =3 ) +labs(title ="Balance by Subscription",x ="Subscription", y ="Balance") +theme_bw() +theme(plot.title =element_text(size =10))p3 <-ggplot(df, aes(x = y, y = duration)) +geom_boxplot(fill ="lightpink", alpha =0.5) +stat_summary(fun = mean, geom ="point", shape =20, size =3, color ="blue") +stat_summary(fun.data = get_box_stats,geom ="text",hjust =0,vjust =1.0,size =3 ) +labs(title ="Duration by Subscription",x ="Subscription", y ="Duration") +theme_bw() +theme(plot.title =element_text(size =10))p4 <-ggplot(df, aes(x = y, y = campaign)) +geom_boxplot(fill ="lightyellow", alpha =0.5) +stat_summary(fun = mean, geom ="point", shape =20, size =3, color ="blue") +stat_summary(fun.data = get_box_stats,geom ="text",hjust =0,vjust =1.0,size =3 ) +labs(title ="Campaign Contacts by Subscription",x ="Subscription", y ="Campaign Contacts") +theme_bw() +theme(plot.title =element_text(size =10))grid.arrange(p1, p2, p3, p4, ncol =2)
Code
# Scatterplot for duration vs. campaign plot(df$duration, df$campaign, main ="Duration vs. Campaign", xlab ="Duration (seconds)", ylab ="Campaign Contacts", col ="black", pch =16, cex =0.7)
# Duration vs subscription (mean duration by subscription status) tapply(df$duration, df$y, mean)
no yes
226.3475 552.7428
2 Multiple Linear Regression
2.1 Split the Data
Code
# Set seed for reproducibility set.seed(123) # Split data into training (70%) and testing (30%) sets train_idx <-sample(1:nrow(df), size =0.7*nrow(df)) train <- df[train_idx, ] test <- df[-train_idx, ]
2.2 Model 1: Predicting campaign (number of contacts)
Code
campaign_vars <-setdiff(names(df), c("y", "campaign", "pdays", "pdays_recode", "duration")) campaign_formula <-as.formula(paste("campaign ~", paste(campaign_vars, collapse =" + "))) # Fit full model model_full_campaign <-lm(campaign_formula, data = train) summary(model_full_campaign)
# Cross-validation with 10 folds set.seed(100) model_full_CV_campaign <-train( campaign_formula, data = train, method ="lm", trControl =trainControl(method ="cv", number =10) ) # the formula from stepwise selection step_formula_campaign <-formula(step_model_campaign) # Running cross-validation on stepwise model model_step_CV_campaign <-train( step_formula_campaign, data = train, method ="lm", trControl =trainControl(method ="cv", number =10) ) ## Compare RMSE print(model_full_CV_campaign)
Linear Regression
3164 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 2848, 2848, 2846, 2848, 2847, 2848, ...
Resampling results:
RMSE Rsquared MAE
2.800534 0.09911313 1.683191
Tuning parameter 'intercept' was held constant at a value of TRUE
Code
print(model_step_CV_campaign)
Linear Regression
3164 samples
4 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 2847, 2847, 2849, 2847, 2848, 2848, ...
Resampling results:
RMSE Rsquared MAE
2.775264 0.1005481 1.670661
Tuning parameter 'intercept' was held constant at a value of TRUE
Code
# the model with better performance (lower RMSE) if(model_step_CV_campaign$results$RMSE < model_full_CV_campaign$results$RMSE) { final_model_campaign <- step_model_campaign cat("Selected model: Stepwise\n") } else { final_model_campaign <- model_full_campaign cat("Selected model: Full model\n") }
Selected model: Stepwise
Code
## Model Diagnostics# Plot residuals plot(final_model_campaign$fitted.values, resid(final_model_campaign), main ="Residual Plot - Campaign Model", xlab ="Fitted Values", ylab ="Residuals") abline(h =0, col ="red", lty =2)
Code
# Predict on test set pred_campaign <-predict(final_model_campaign, newdata = test) rmse_campaign <-sqrt(mean((pred_campaign - test$campaign)^2)) cat("Test RMSE for campaign model:", rmse_campaign, "\n")
Test RMSE for campaign model: 3.271208
Code
# Model Equation {cat("\nCampaign Model Equation:\n") cat("\ncampaign =", round(coef(final_model_campaign)[1], 4)) for (i in2:length(coef(final_model_campaign))) { coef_name <-names(coef(final_model_campaign))[i] coef_value <-coef(final_model_campaign)[i] if (coef_value >=0) { cat(" +", round(coef_value, 4), "*", coef_name) } else { cat(" -", abs(round(coef_value, 4)), "*", coef_name) } } }
# Cross-validation with 10 folds set.seed(100) model_full_CV_pdays <-train( pdays_formula, data = train, method ="lm", trControl =trainControl(method ="cv", number =10) ) # the formula from stepwise selection step_formula_pdays <-formula(step_model_pdays) # Running cross-validation on stepwise model model_step_CV_pdays <-train( step_formula_pdays, data = train, method ="lm", trControl =trainControl(method ="cv", number =10) ) ## Compare RMSE print(model_full_CV_pdays)
Linear Regression
3164 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 2847, 2848, 2847, 2848, 2848, 2848, ...
Resampling results:
RMSE Rsquared MAE
48.19108 0.7690839 22.72181
Tuning parameter 'intercept' was held constant at a value of TRUE
Code
print(model_step_CV_pdays)
Linear Regression
3164 samples
8 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 2847, 2847, 2847, 2848, 2848, 2848, ...
Resampling results:
RMSE Rsquared MAE
48.06464 0.7688941 22.14233
Tuning parameter 'intercept' was held constant at a value of TRUE
Code
# Select the model with better performance (lower RMSE) if(model_step_CV_pdays$results$RMSE < model_full_CV_pdays$results$RMSE) { final_model_pdays <- step_model_pdays cat("Selected model: Stepwise\n") } else { final_model_pdays <- model_full_pdays cat("Selected model: Full model\n") }
Selected model: Stepwise
Code
## Model Diagnostics# Plot residuals plot(final_model_pdays$fitted.values, resid(final_model_pdays), main ="Residual Plot - pdays Model", xlab ="Fitted Values", ylab ="Residuals") abline(h =0, col ="red", lty =2)
Code
# Predict on test set pred_pdays <-predict(final_model_pdays, newdata = test) rmse_pdays <-sqrt(mean((pred_pdays - test$pdays_recode)^2)) cat("Test RMSE for pdays model:", rmse_pdays, "\n")
Test RMSE for pdays model: 44.86248
Code
# Model Equation {cat("\npdays Model Equation:\n") cat("\npdays_recode =", round(coef(final_model_pdays)[1], 4)) for (i in2:length(coef(final_model_pdays))) { coef_name <-names(coef(final_model_pdays))[i] coef_value <-coef(final_model_pdays)[i] if (coef_value >=0) { cat(" +", round(coef_value, 4), "*", coef_name) } else { cat(" -", abs(round(coef_value, 4)), "*", coef_name) } } }
# Cross-validation with 10 folds set.seed(100) model_full_CV_duration <-train( duration_formula, data = train, method ="lm", trControl =trainControl(method ="cv", number =10) ) # the formula from stepwise selection step_formula_duration <-formula(step_model_duration) # Running cross-validation on stepwise model model_step_CV_duration <-train( step_formula_duration, data = train, method ="lm", trControl =trainControl(method ="cv", number =10) ) ## Compare RMSE print(model_full_CV_duration)
Linear Regression
3164 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 2849, 2847, 2847, 2848, 2847, 2848, ...
Resampling results:
RMSE Rsquared MAE
257.3651 0.01280132 177.0185
Tuning parameter 'intercept' was held constant at a value of TRUE
Code
print(model_step_CV_duration)
Linear Regression
3164 samples
7 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 2847, 2848, 2847, 2848, 2847, 2848, ...
Resampling results:
RMSE Rsquared MAE
257.2609 0.01321806 176.9475
Tuning parameter 'intercept' was held constant at a value of TRUE
Code
# Select the model with better performance (lower RMSE) if(model_step_CV_duration$results$RMSE < model_full_CV_duration$results$RMSE) { final_model_duration <- step_model_duration cat("Selected model: Stepwise\n") } else { final_model_duration <- model_full_duration cat("Selected model: Full model\n") }
Selected model: Stepwise
Code
## Model Diagnostics# Plot residuals plot(final_model_duration$fitted.values, resid(final_model_duration), main ="Residual Plot - Duration Model", xlab ="Fitted Values", ylab ="Residuals") abline(h =0, col ="red", lty =2)
Code
# Predict on test set pred_duration <-predict(final_model_duration, newdata = test) rmse_duration <-sqrt(mean((pred_duration - test$duration)^2)) cat("Test RMSE for duration model:", rmse_duration, "\n")
Test RMSE for duration model: 264.5255
Code
# Model Equation {cat("\nDuration Model Equation:\n") cat("\nduration =", round(coef(final_model_duration)[1], 4)) for (i in2:length(coef(final_model_duration))) { coef_name <-names(coef(final_model_duration))[i] coef_value <-coef(final_model_duration)[i] if (coef_value >=0) { cat(" +", round(coef_value, 4), "*", coef_name) } else { cat(" -", abs(round(coef_value, 4)), "*", coef_name) } } }
Model ZIP and Hurdle fail to converge, and have NA in standard errors and p-values, so they are excluded from our consideration.
Code
# 1. Poisson model glm_poisson_pdays <-glm( pdays_formula, # Use the same predictors as in the MLR modeldata = train, family =poisson(link ="log") ) summary(glm_poisson_pdays)
# 3. Zero-Inflated Poisson model # For ZIP modelszero_formula <-~1# Intercept only for zero model zip_pdays <-zeroinfl( formula =as.formula(paste( "pdays_recode ~", paste(attr(terms(pdays_formula), "term.labels"), collapse =" + "), # Use the same predictors for the count part"| 1" )), data = train, dist ="poisson") summary(zip_pdays)
Call:
zeroinfl(formula = as.formula(paste("pdays_recode ~", paste(attr(terms(pdays_formula),
"term.labels"), collapse = " + "), "| 1")), data = train, dist = "poisson")
Pearson residuals:
Min 1Q Median 3Q Max
-1.740e+01 -1.277e-04 -1.010e-04 -9.023e-05 5.103e+01
Count model coefficients (poisson with log link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.761e+00 NA NA NA
age -1.853e-03 NA NA NA
jobblue-collar 8.169e-02 NA NA NA
jobentrepreneur 2.135e-01 NA NA NA
jobhousemaid -1.999e-01 NA NA NA
jobmanagement 5.469e-02 NA NA NA
jobretired -1.035e-01 NA NA NA
jobself-employed -1.087e-02 NA NA NA
jobservices 1.259e-01 NA NA NA
jobstudent -5.324e-02 NA NA NA
jobtechnician 9.099e-02 NA NA NA
jobunemployed 1.811e-01 NA NA NA
jobunknown -4.310e-01 NA NA NA
maritalmarried -1.039e-01 NA NA NA
maritalsingle -1.286e-01 NA NA NA
educationsecondary -1.282e-01 NA NA NA
educationtertiary -1.657e-01 NA NA NA
educationunknown -1.671e-01 NA NA NA
defaultyes 8.185e-02 NA NA NA
balance -1.237e-06 NA NA NA
housingyes 1.379e-01 NA NA NA
loanyes -1.085e-02 NA NA NA
contacttelephone -1.211e-02 NA NA NA
contactunknown 3.811e-01 NA NA NA
day -4.559e-03 NA NA NA
monthaug -7.786e-02 NA NA NA
monthdec -6.988e-02 NA NA NA
monthfeb -1.863e-01 NA NA NA
monthjan -1.585e-01 NA NA NA
monthjul -1.314e-01 NA NA NA
monthjun -2.265e-01 NA NA NA
monthmar -2.626e-01 NA NA NA
monthmay 1.384e-01 NA NA NA
monthnov -4.183e-01 NA NA NA
monthoct 1.022e-01 NA NA NA
monthsep -1.021e-01 NA NA NA
previous -1.222e-02 NA NA NA
poutcomeother -9.609e-02 NA NA NA
poutcomesuccess -2.653e-01 NA NA NA
poutcomeunknown -2.391e+01 NA NA NA
Zero-inflation model coefficients (binomial with logit link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) -19.4 NA NA NA
Number of iterations in BFGS optimization: 65
Log-likelihood: -1.498e+04 on 41 Df
Code
# 4. Hurdle model hurdle_pdays <-hurdle( formula =as.formula(paste( "pdays_recode ~", paste(attr(terms(pdays_formula), "term.labels"), collapse =" + "), "| 1" )), data = train, dist ="poisson") summary(hurdle_pdays)
Call:
hurdle(formula = as.formula(paste("pdays_recode ~", paste(attr(terms(pdays_formula),
"term.labels"), collapse = " + "), "| 1")), data = train, dist = "poisson")
Pearson residuals:
Min 1Q Median 3Q Max
-0.4688 -0.4688 -0.4688 -0.4688 11.9308
Count model coefficients (truncated poisson with log link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.761e+00 NA NA NA
age -1.853e-03 NA NA NA
jobblue-collar 8.169e-02 NA NA NA
jobentrepreneur 2.135e-01 NA NA NA
jobhousemaid -1.999e-01 NA NA NA
jobmanagement 5.469e-02 NA NA NA
jobretired -1.035e-01 NA NA NA
jobself-employed -1.087e-02 NA NA NA
jobservices 1.259e-01 NA NA NA
jobstudent -5.324e-02 NA NA NA
jobtechnician 9.099e-02 NA NA NA
jobunemployed 1.811e-01 NA NA NA
jobunknown -4.311e-01 NA NA NA
maritalmarried -1.039e-01 NA NA NA
maritalsingle -1.285e-01 NA NA NA
educationsecondary -1.282e-01 NA NA NA
educationtertiary -1.657e-01 NA NA NA
educationunknown -1.671e-01 NA NA NA
defaultyes 8.185e-02 NA NA NA
balance -1.237e-06 NA NA NA
housingyes 1.379e-01 NA NA NA
loanyes -1.085e-02 NA NA NA
contacttelephone -1.211e-02 NA NA NA
contactunknown 3.811e-01 NA NA NA
day -4.559e-03 NA NA NA
monthaug -7.786e-02 NA NA NA
monthdec -6.988e-02 NA NA NA
monthfeb -1.863e-01 NA NA NA
monthjan -1.585e-01 NA NA NA
monthjul -1.314e-01 NA NA NA
monthjun -2.265e-01 NA NA NA
monthmar -2.626e-01 NA NA NA
monthmay 1.384e-01 NA NA NA
monthnov -4.183e-01 NA NA NA
monthoct 1.022e-01 NA NA NA
monthsep -1.021e-01 NA NA NA
previous -1.222e-02 NA NA NA
poutcomeother -9.609e-02 NA NA NA
poutcomesuccess -2.653e-01 NA NA NA
poutcomeunknown -2.389e+01 NA NA NA
Zero hurdle model coefficients (binomial with logit link):
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.51532 0.04626 -32.76 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Number of iterations in BFGS optimization: 2
Log-likelihood: -1.647e+04 on 41 Df
Code
## Evaluate Models Using RMSE# Predict on test set for each model and calculate RMSE pred_poisson <-predict(glm_poisson_pdays, newdata = test, type ="response") rmse_poisson <-sqrt(mean((pred_poisson - test$pdays_recode)^2)) pred_nb <-predict(glm_nb_pdays, newdata = test, type ="response") rmse_nb <-sqrt(mean((pred_nb - test$pdays_recode)^2)) # pred_zip <- predict(zip_pdays, newdata = test, type = "response") # rmse_zip <- sqrt(mean((pred_zip - test$pdays_recode)^2)) # # pred_hurdle <- predict(hurdle_pdays, newdata = test, type = "response") # rmse_hurdle <- sqrt(mean((pred_hurdle - test$pdays_recode)^2)) # Compare RMSE values rmse_comparison <-data.frame( Model =c("Poisson", "Negative Binomial"), test_RMSE =c(rmse_poisson, rmse_nb) ) print(rmse_comparison)
Model test_RMSE
1 Poisson 42.28214
2 Negative Binomial 42.57622
Code
# Select the best model based on lowest RMSE best_model_idx <-which.min(rmse_comparison$test_RMSE) best_model_name <- rmse_comparison$Model[best_model_idx] cat("Best model for pdays:", best_model_name, "\n")
# Inverse Gaussian model with log link glm_invgauss_campaign <-glm( campaign_formula, data = train, family =inverse.gaussian(link ="log") ) summary(glm_invgauss_campaign)
# Predict on test set for each model and calculate RMSE pred_gamma <-predict(glm_gamma_campaign, newdata = test, type ="response") rmse_gamma <-sqrt(mean((pred_gamma - test$campaign)^2)) pred_invgauss <-predict(glm_invgauss_campaign, newdata = test, type ="response") rmse_invgauss <-sqrt(mean((pred_invgauss - test$campaign)^2)) # Compare with linear model RMSE rmse_comparison <-data.frame( Model =c("Gamma", "Inverse Gaussian"), test_RMSE =c(rmse_gamma, rmse_invgauss) ) print(rmse_comparison)
Model test_RMSE
1 Gamma 3.256990
2 Inverse Gaussian 3.267202
Code
# Select the best model based on lowest RMSE best_model_idx <-which.min(rmse_comparison$test_RMSE) best_model_name <- rmse_comparison$Model[best_model_idx] cat("Best model for campaign:", best_model_name, "\n")
# Inverse Gaussian model with log link glm_invgauss_duration <-glm( duration_formula, data = train, family =inverse.gaussian(link ="inverse") ) summary(glm_invgauss_duration)
# Predict on test set for each model and calculate RMSE pred_gamma <-predict(glm_gamma_duration, newdata = test, type ="response") rmse_gamma <-sqrt(mean((pred_gamma - test$duration)^2)) pred_invgauss <-predict(glm_invgauss_duration, newdata = test, type ="response") rmse_invgauss <-sqrt(mean((pred_invgauss - test$duration)^2)) # Compare with linear model RMSE rmse_comparison <-data.frame( Model =c("Gamma", "Inverse Gaussian"), test_RMSE =c(rmse_gamma, rmse_invgauss) ) print(rmse_comparison)
Model test_RMSE
1 Gamma 264.5334
2 Inverse Gaussian 265.2772
Code
# Select the best model based on lowest RMSE best_model_idx <-which.min(rmse_comparison$test_RMSE) best_model_name <- rmse_comparison$Model[best_model_idx] cat("Best model for duration:", best_model_name, "\n")
# Get predicted probabilities on test set logistic_probs <-predict(step_logistic, newdata = test, type ="response")
4.2 Decision Tree
Code
# Fit decision tree model tree_model <-rpart(y_formula, data = train, method ="class", control =rpart.control(cp =0.01)) # Plot the tree rpart.plot(tree_model)
Code
# Print the complexity parameter table printcp(tree_model)
The PCA regression analysis results indicate that this model significantly outperforms the original multiple linear regression model, achieving a 33.4% reduction in prediction error (RMSE decreased from 264.53 to 176.11).
The first three principal components collectively explain 62.4% of the total variance, with PC1 (26.6%) primarily reflecting customer contact history information, PC2 (18.3%) capturing customer demographic and financial status, and PC3 (17.5%) representing the inverse relationship between marketing campaign intensity and call duration.
This dimensionality reduction method not only improves predictive performance but also eliminates multicollinearity issues, providing the bank’s marketing team with clearer insights into the factors influencing customer call duration.
5.1 Select and Standardize Numeric Predictors
Code
# Select the numeric variables specified numeric_vars <-c("age", "balance", "duration", "campaign", "pdays_recode", "previous") numeric_data <- train[, numeric_vars] # perform PCA using train data# Standardize the data numeric_scaled <-scale(numeric_data)
5.2 Perform PCA
Code
# Run PCA on the standardized data pca_result <-prcomp(numeric_scaled, center =TRUE, scale. =TRUE) # Summary of the PCA results summary(pca_result)
# correlation plot between the original variables and the principal componentsloadings <- pca_result$rotation loadings_df <-as.data.frame(loadings[, 1:3]) loadings_df$Variable <-rownames(loadings_df) library(reshape2) loadings_long <-melt(loadings_df, id.vars ="Variable") colnames(loadings_long) <-c("Variable", "Component", "Loading") ggplot(loadings_long, aes(x = Variable, y = Loading, fill = Loading)) +geom_bar(stat ="identity") +facet_wrap(~ Component, ncol =1) +scale_fill_gradient2(low ="lightgreen", mid ="white", high ="palevioletred", midpoint =0) +theme_minimal() +labs(title ="Principal Component Loadings", x ="", y ="Loading Value") +theme(axis.text.x =element_text(angle =45, hjust =1), panel.grid.major.x =element_blank(), panel.grid.minor =element_blank(), legend.position ="none") +geom_hline(yintercept =0, linetype ="dashed", color ="gray50")
5.3 Scree Plot and Variance Explained
Code
# Extract the explained variance for each component scree_values <-summary(pca_result)$importance[2, ] cumulative_var <-summary(pca_result)$importance[3, ] # Create scree plot plot(scree_values, type ="b", pch =19, xlab ="Principal Component", ylab ="Proportion of Variance Explained", main ="Scree Plot")
Code
# Plot cumulative variance plot(cumulative_var, type ="b", pch =19, xlab ="Principal Component", ylab ="Cumulative Proportion of Variance Explained", main ="Cumulative Variance Explained") abline(h =0.9, col ="red", lty =2) # 90% threshold line
Code
# Determine number of components to retain (90% variance) num_components <-which(cumulative_var >=0.9)[1] {cat("\nCumulative Proportion of Variance Explained:\n\n")print(cumulative_var)cat("\nNumber of components to retain for 90% variance:", num_components, "\n") }
Cumulative Proportion of Variance Explained:
PC1 PC2 PC3 PC4 PC5 PC6
0.26596 0.44905 0.62426 0.77790 0.92929 1.00000
Number of components to retain for 90% variance: 5
5.4 PCA Regression
Code
# Split the PCA scores into training and testing sets pc_train <-as.data.frame(pca_result$x[, 1:3])# For training set # pc_train <- pc_scores[train_idx, 1:3] # First 3 components pc_train$duration <- train$duration # Fit linear regression model using the principal components lm_pca <-lm(duration ~ ., data = pc_train) summary(lm_pca)
Call:
lm(formula = duration ~ ., data = pc_train)
Residuals:
Min 1Q Median 3Q Max
-231.50 -106.11 -40.54 57.78 1828.17
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 266.115 2.874 92.590 < 2e-16 ***
PC1 -16.344 2.276 -7.183 8.49e-13 ***
PC2 -51.690 2.743 -18.847 < 2e-16 ***
PC3 -189.011 2.804 -67.418 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 161.7 on 3160 degrees of freedom
Multiple R-squared: 0.6105, Adjusted R-squared: 0.6101
F-statistic: 1651 on 3 and 3160 DF, p-value: < 2.2e-16
5.5 Predict and Evaluate on Test Set
Code
# For testing set # Project test data onto the same principal components test_scaled <-scale(test[, numeric_vars], center =attr(numeric_scaled, "scaled:center"), scale =attr(numeric_scaled, "scaled:scale")) pc_test <-as.data.frame(predict(pca_result, newdata = test_scaled)[, 1:3]) # Predict duration using the PCA regression model pred_pca <-predict(lm_pca, newdata = pc_test) # Calculate RMSE rmse_pca <-sqrt(mean((pred_pca - test$duration)^2)) cat("RMSE from PCA regression:", rmse_pca, "\n")
RMSE from PCA regression: 176.1062
Code
# Compare with the MLR model from Task 2 cat("RMSE from MLR model (Task 2):", rmse_duration, "\n")