Bank Case Study

Author

Vanessa Makayla Sara

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
library(tidyverse)

Warning: package 'ggplot2' was built under R version 4.5.2

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.2
✔ ggplot2   4.0.1     ✔ tibble    3.3.0
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.1.0     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

library(corrplot)

corrplot 0.95 loaded

library(conflicted)
library(conflicted)
conflicts_prefer(dplyr::filter)

[conflicted] Will prefer dplyr::filter over any other package.

bank_data <-read.csv('~/Downloads/data analytics application/bank-additional-full.csv', sep=";")

glimpse(bank_data)

Rows: 41,188
Columns: 21
$ age            <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
$ job            <chr> "housemaid", "services", "services", "admin.", "service…
$ marital        <chr> "married", "married", "married", "married", "married", …
$ education      <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
$ default        <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
$ housing        <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
$ loan           <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
$ contact        <chr> "telephone", "telephone", "telephone", "telephone", "te…
$ month          <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
$ day_of_week    <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
$ duration       <int> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
$ campaign       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ pdays          <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
$ previous       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ poutcome       <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
$ emp.var.rate   <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
$ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
$ cons.conf.idx  <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
$ euribor3m      <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
$ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
$ y              <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…

coverting target to factors, pdays to categorical factor, binning age, campaign intensity, removing unkowns, we drop duration and nr.employed due to high correlation with other variables.

dt_model<- bank_data|>
  mutate(
    y=factor(y, levels=c("no", "yes"), labels=c("No", "Yes")),
    pdays_bin=factor(ifelse(pdays==999, "No", "Yes")),
    age_group=factor(case_when(
      age<30 ~ "Young",
      age < 45 ~ "Middle-Age",
      age <60 ~ "Mature",
      TRUE ~ "Senior"
    )),
    campaign_intensity = factor(case_when(
      campaign ==1 ~ "First",
      campaign <= 3~ "Light",
      TRUE ~ "Heavy"
    ))
  )|>
  dplyr::filter(education != "unknown")|>
  dplyr::select(-duration, -nr.employed, -age, -education, -default)|>
  na.omit()

check class balance

prop.table(table(dt_model$y))


      No      Yes 
0.888765 0.111235

correlation plot for numerical variables and a visualization for the class imbalance.

numeric_cols<- dt_model |> select(where(is.numeric))

cor_matrix<- cor(numeric_cols, use="complete.obs")

corrplot(cor_matrix,
         method="color",
         type="upper",
         addCoef.col="black",
         tl.col="black",
         number.cex=0.7,
         title="Correlation of Economic and Campaing Variables",
         mar=c(0,0,1,0))

^here we see that euribor3m (interest rates)and emp.var.rate are highly correlated so we will drop emp.var.rate since it is less intuitive, and so we dont confuse feature importance in logistic regression.

dt_model<- dt_model |>
  dplyr::select(-emp.var.rate)|>
  na.omit()

class balance visualization

ggplot(dt_model, aes(x=y, fill=y)) +
  geom_bar()+
  scale_fill_manual(values=c("No"= "salmon", "Yes"= "steelblue"))+
  labs(title="Distribution of Subcription Responses", 
       subtitle=paste0("Total Observations:", nrow(dt_model)),
       x="Subscribed to Term Deposit",
       y="Count")+
  theme_minimal()

Modeling

levels(dt_model$y)<-c("No", "Yes")

identifying variables with nearly no variation

nzv<-nearZeroVar(dt_model)
if(length(nzv)>0) dt_model<-dt_model[, -nzv]

split data

set.seed(123)
trainIndex<-createDataPartition(dt_model$y, p=.8, list=FALSE)
train_data<-dt_model[trainIndex, ]
test_data<-dt_model[-trainIndex, ]

defining training controls, down sampling balances the “no” and “yes’ classes in the training set

ctrl<-trainControl(method="cv",
                   number=5, 
                   classProbs=TRUE, 
                   summaryFunction=twoClassSummary,
                   sampling="down")

logistic regression

set.seed(123)
model_logit<-train(y~.,
                   data=train_data,
                   method="glm",
                   family="binomial",
                   metric="ROC",
                   trControl=ctrl)

LDA

set.seed(123)
model_lda<-train(y ~.,
                 data=train_data,
                 method="lda",
                 preProcess=c("center", "scale"),
                 metric="ROC",
                 trControl=ctrl)

results and comparison

results<-resamples(list(logistic=model_logit, LDA=model_lda))
summary(results)


Call:
summary.resamples(object = results)

Models: logistic, LDA 
Number of resamples: 5 

ROC 
              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
logistic 0.7745407 0.7773954 0.7777738 0.7820091 0.7887304 0.7916053    0
LDA      0.7736058 0.7762520 0.7776508 0.7812768 0.7883945 0.7904809    0

Sens 
              Min.  1st Qu.    Median      Mean   3rd Qu.      Max. NA's
logistic 0.8203529 0.827660 0.8308679 0.8345393 0.8453039 0.8485119    0
LDA      0.8059169 0.818036 0.8196400 0.8212440 0.8285511 0.8340759    0

Spec 
              Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
logistic 0.6139601 0.6230441 0.6367521 0.6318339 0.6401138 0.6452991    0
LDA      0.6267806 0.6344239 0.6424501 0.6429385 0.6514936 0.6595442    0

visual for comparison

dotplot(results)

both models are performing very similarly with a mean ROC of about .78 the logistic regression model has a higher mean in sensitivity, but both models have a low specificity mean (tradeoff using down sampling)

generating predictions using the Logistic Model our primary model

logit_pred<- predict(model_logit, newdata=test_data)

confusion matrix to provide accuracy, sensitivity and specificity

conf_matrix<- confusionMatrix(logit_pred, test_data$y, positive="Yes")
print(conf_matrix)

Confusion Matrix and Statistics

          Reference
Prediction   No  Yes
       No  5751  320
       Yes 1262  557
                                          
               Accuracy : 0.7995          
                 95% CI : (0.7905, 0.8083)
    No Information Rate : 0.8888          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.3097          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.6351          
            Specificity : 0.8200          
         Pos Pred Value : 0.3062          
         Neg Pred Value : 0.9473          
             Prevalence : 0.1112          
         Detection Rate : 0.0706          
   Detection Prevalence : 0.2305          
      Balanced Accuracy : 0.7276          
                                          
       'Positive' Class : Yes

part c insights and feature importance

extract feature importance

importance<-varImp(model_logit, scale=FALSE)
importance_data<-as.data.frame(importance$importance)

plot top 10 predictors

plot(importance, top=10, main="Top 10 Predictors of Subscription (Logistic Regression)")

extract the actual importance values for report

report_importance<-importance_data|>
  rownames_to_column(var="Variable")|>
  rename(Importance_Score=Overall)|>
  head(10)
print(report_importance)

             Variable Importance_Score
1    `jobblue-collar`       2.11092000
2     jobentrepreneur       0.56566971
3        jobhousemaid       1.52923523
4       jobmanagement       0.08387114
5          jobretired       1.19191372
6  `jobself-employed`       0.06659760
7         jobservices       1.62249252
8          jobstudent       0.31569569
9       jobtechnician       0.16147718
10      jobunemployed       0.35492915

D strategic recommendations

generating probabilites for the test set

test_probs<- predict(model_logit, newdata=test_data, type="prob")

combining with original data

strategic_list <-cbind(test_data, Prob_Yes=test_probs$Yes)|>
  arrange(desc(Prob_Yes))

view top 10 leads

head(strategic_list, 10)

           job  marital housing loan  contact month day_of_week campaign
1      student   single      no   no cellular   mar         wed        1
2      student   single     yes   no cellular   mar         tue        1
3      student   single      no   no cellular   mar         fri        1
4  blue-collar   single      no   no cellular   mar         thu        2
5       admin.   single      no   no cellular   mar         wed        2
6       admin.   single     yes   no cellular   mar         thu        2
7       admin.  married     yes   no cellular   mar         wed        3
8      retired   single      no   no cellular   mar         fri        7
9   technician  married     yes   no cellular   mar         tue        2
10  technician divorced      no   no cellular   mar         mon        1
   previous poutcome cons.price.idx cons.conf.idx euribor3m   y  age_group
1         1  success         93.369         -34.8     0.655 Yes      Young
2         2  success         93.369         -34.8     0.655 Yes      Young
3         3  success         93.369         -34.8     0.649 Yes      Young
4         2  success         93.369         -34.8     0.654 Yes      Young
5         2  success         93.369         -34.8     0.655 Yes Middle-Age
6         3  success         93.369         -34.8     0.643 Yes Middle-Age
7         1  success         93.369         -34.8     0.655 Yes Middle-Age
8         2  success         93.369         -34.8     0.653  No     Senior
9         2  success         93.369         -34.8     0.655 Yes Middle-Age
10        1  success         93.369         -34.8     0.639 Yes Middle-Age
   campaign_intensity  Prob_Yes
1               First 0.9918638
2               First 0.9908818
3               First 0.9905623
4               Light 0.9904453
5               Light 0.9903472
6               Light 0.9901607
7               Light 0.9893642
8               Heavy 0.9891152
9               Light 0.9889205
10              First 0.9880806