Bank Case Study MVS

Author

Vanessa Makayla Sara

library(MASS)
library(ggplot2)

Warning: package 'ggplot2' was built under R version 4.5.2

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.2
✔ lubridate 1.9.4     ✔ tibble    3.3.0
✔ purrr     1.1.0     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
✖ dplyr::select() masks MASS::select()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(corrplot)

corrplot 0.95 loaded

library(car)

Loading required package: carData

Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

The following object is masked from 'package:purrr':

    some

library(dplyr)
library(caret)

Loading required package: lattice

Attaching package: 'caret'

The following object is masked from 'package:purrr':

    lift

library(pROC)

Type 'citation("pROC")' for a citation.

Attaching package: 'pROC'

The following objects are masked from 'package:stats':

    cov, smooth, var

data<-read.csv("~/Downloads/data analytics application/bank-additional-full.csv", sep=";")

glimpse(data)

Rows: 41,188
Columns: 21
$ age            <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,…
$ job            <chr> "housemaid", "services", "services", "admin.", "service…
$ marital        <chr> "married", "married", "married", "married", "married", …
$ education      <chr> "basic.4y", "high.school", "high.school", "basic.6y", "…
$ default        <chr> "no", "unknown", "no", "no", "no", "unknown", "no", "un…
$ housing        <chr> "no", "no", "yes", "no", "no", "no", "no", "no", "yes",…
$ loan           <chr> "no", "no", "no", "no", "yes", "no", "no", "no", "no", …
$ contact        <chr> "telephone", "telephone", "telephone", "telephone", "te…
$ month          <chr> "may", "may", "may", "may", "may", "may", "may", "may",…
$ day_of_week    <chr> "mon", "mon", "mon", "mon", "mon", "mon", "mon", "mon",…
$ duration       <int> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22…
$ campaign       <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ pdays          <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, …
$ previous       <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ poutcome       <chr> "nonexistent", "nonexistent", "nonexistent", "nonexiste…
$ emp.var.rate   <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, …
$ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,…
$ cons.conf.idx  <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,…
$ euribor3m      <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,…
$ nr.employed    <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5…
$ y              <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "…

levels(data$Attrition_Flag)

NULL

dt_final<-data|>
  mutate(
    job=as.factor(job),
    marital=as.factor(marital),
    education=as.factor(education),
    default=as.factor(default),
    housing=as.factor(housing),
    loan=as.factor(loan),
    contact=as.factor(contact),
    month=as.factor(month),
    day_of_week=as.factor(day_of_week),
    poutcome=as.factor(poutcome),
    y=factor(y, levels=c("no", "yes"), labels=c("No", "Yes"))
  )|>
  mutate(
    call_frequency_ratio = case_when(
      pdays==999~campaign / 1, 
      pdays==0~(campaign+previous)/1,
      TRUE~(campaign+previous)/(pdays+1)
    ),
    pdays_binary=factor(ifelse(pdays==999, "never_contacted", "contacted_before"))
  )|>
  mutate(
    age_group=case_when(
      age < 30 ~ "Young",
      age < 45 ~ "Middle-Age",
      age < 60 ~ "Mature",
      TRUE ~ "Senior"
    ),
    age_group=as.factor(age_group)
  )|>
  mutate(
    campaign_intensity=case_when(
      campaign==1 ~ "First_Contact",
      campaign <=3~ "Light_Followup",
      campaign <=7 ~ "Medium_Followup",
      TRUE ~ "Heavy_Followup"
    ),
    campaign_intensity = as.factor(campaign_intensity)
  )|>
  mutate(
    education_level = case_when(
      education =="illiterate"~0, 
      education == "basic.4y"~1,
      education=="basic.6y"~2,
      education=="basic.9y"~3,
      education=="high.school"~4,
      education=="professional.course"~5,
      education=="university.degree"~6,
      TRUE~NA_real_
    )
  )|>
  select(-duration, -emp.var.rate, -age, -campaign, -education, -pdays, -loan, -month, -day_of_week, -pdays, -previous, -default )|>
  na.omit()

set.seed(123)
trainIndex<-createDataPartition(dt_final$y, p=.8, list=FALSE)
train_data<- dt_final[trainIndex, ]
test_data<-dt_final[-trainIndex, ]

ctrl<-trainControl(method="cv",
                   number=5,
                   classProbs=TRUE,
                   summaryFunction=twoClassSummary,
                   sampling="down")

model_logit<-train(y~.,
                   data=train_data,
                   method="glm",
                   family="binomial",
                   metric="ROC",
                   trControl=ctrl)

vif(model_logit$finalModel)

                 `jobblue-collar`                   jobentrepreneur 
                         2.087930                          1.111932 
                     jobhousemaid                     jobmanagement 
                         1.159707                          1.206265 
                       jobretired                `jobself-employed` 
                         1.791925                          1.098821 
                      jobservices                        jobstudent 
                         1.326558                          1.208642 
                    jobtechnician                     jobunemployed 
                         1.358449                          1.091912 
                       jobunknown                    maritalmarried 
                         1.023915                          2.724329 
                    maritalsingle                    maritalunknown 
                         3.048856                          1.026481 
                   housingunknown                        housingyes 
                         1.032153                          1.043651 
                 contacttelephone               poutcomenonexistent 
                         1.606436                          1.448257 
                  poutcomesuccess                    cons.price.idx 
                         8.156922                          6.524123 
                    cons.conf.idx                         euribor3m 
                         2.834936                         54.328153 
                      nr.employed              call_frequency_ratio 
                        35.027662                          5.478768 
      pdays_binarynever_contacted             `age_groupMiddle-Age` 
                         8.499524                          1.605473 
                  age_groupSenior                    age_groupYoung 
                         1.626034                          1.832470 
 campaign_intensityHeavy_Followup  campaign_intensityLight_Followup 
                         4.339191                          1.578635 
campaign_intensityMedium_Followup                   education_level 
                         2.979536                          1.750126

model_lda<-train(y~., data=train_data, method="lda",
                 metric="ROC", trControl=ctrl)

test_probs<-predict(model_logit, newdata=test_data, type="prob")

prob_logit<- predict(model_logit, newdata=test_data, type="prob")$Yes
prob_lda<-predict(model_lda, newdata=test_data, type="prob")$Yes

roc_logit<-roc(test_data$y, prob_logit, quiet=TRUE)
roc_lda <- roc(test_data$y, prob_lda, quiet=TRUE)
cat("===AUC Comparison===\n")

===AUC Comparison===

cat("Logistic Regression AUC:", auc(roc_logit), "\n")

Logistic Regression AUC: 0.7652747

cat("LDA AUC:", auc(roc_lda), "\n")

LDA AUC: 0.7646249

pred_logit<-predict(model_logit, newdata=test_data)
pred_lda<-predict(model_lda, newdata=test_data)

cm_logit<-confusionMatrix(pred_logit, test_data$y, positive="Yes")
cm_lda<-confusionMatrix(pred_lda, test_data$y, positive="Yes")

performance_comp<-data.frame(
  Metric=c("AUC", "Sensitivity", "Specificity"),
  Logistic=c(as.numeric(auc(roc_logit)),
             cm_logit$byClass["Sensitivity"],
             cm_logit$byClass["Specificity"]),
  LDA=c(as.numeric(auc(roc_lda)),
        cm_lda$byClass["Sensitivity"],
        cm_lda$byClass["Specificity"])
)
cat("===Full Model Comparison===\n")

===Full Model Comparison===

performance_comp[,-1]<-round(performance_comp[,-1],4)
print(performance_comp)

                 Metric Logistic    LDA
                    AUC   0.7653 0.7646
Sensitivity Sensitivity   0.6556 0.6648
Specificity Specificity   0.7858 0.7756

plot(varImp(model_logit), top=10, main="Top 10 Drivers of Subscription")

strategic_list<-test_data|>
  mutate(Prob_yes=test_probs$Yes)|>
  arrange(desc(Prob_yes))
top_20_cutoff <- floor (0.20*nrow(strategic_list))
top_20_customers<-strategic_list[1:top_20_cutoff, ]
conv_top20<-mean(top_20_customers$y=="Yes")
conv_baseline<-mean(test_data$y=="Yes")
lift_score<-conv_top20/conv_baseline

results_df<-data.frame(
  Strategy=c("Random Calling (Baseline)", "Model_Driven (Top 20%)"),
  Conversion=c(conv_baseline, conv_top20)
)

ggplot(results_df, aes(x=Strategy, y=Conversion, fill=Strategy))+
  geom_bar(stat="identity")+
  scale_y_continuous(labels=scales::percent)+
  theme_minimal()+
  labs(title="Business Value: Model vs. Random Strategy", y="Conversion Rate")

cat("==Final Model Performance===\n")

==Final Model Performance===

cat("Conversion in Top 20% Target Group:", round(conv_top20 *100,2), "%\n")

Conversion in Top 20% Target Group: 32.13 %

cat("Conversion in Random Targeting (Baseline):", round(conv_baseline*100,2), "%\n")

Conversion in Random Targeting (Baseline): 11.12 %

cat("Model Lift", round(lift_score, 2), "x better than random\n")

Model Lift 2.89 x better than random