Data exploration

#Load libraries
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
#Load dataset
bank_df <- read.csv('bank-additional.csv', sep = ';')

#Remove duration from dataset
bank_df <- subset(bank_df, select = -c(duration))

#Optional: drop rows whose categorical values are 'unknown' (left disabled here)
#bank_df <- filter(bank_df, job != 'unknown')
#bank_df <- filter(bank_df, marital != 'unknown')
#bank_df <- filter(bank_df, education != 'unknown')
#bank_df <- filter(bank_df, default != 'unknown')
#bank_df <- filter(bank_df, housing != 'unknown')
#bank_df <- filter(bank_df, loan != 'unknown')
#Convert character variables into factors
cat_vars <- c('job', 'marital', 'education', 'default', 'housing', 'loan',
              'contact', 'month', 'day_of_week', 'poutcome', 'y')
bank_df[cat_vars] <- lapply(bank_df[cat_vars], as.factor)
# Check for missing values (NA)
colSums(is.na(bank_df))
##            age            job        marital      education        default 
##              0              0              0              0              0 
##        housing           loan        contact          month    day_of_week 
##              0              0              0              0              0 
##       campaign          pdays       previous       poutcome   emp.var.rate 
##              0              0              0              0              0 
## cons.price.idx  cons.conf.idx      euribor3m    nr.employed              y 
##              0              0              0              0              0
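
is.na() finds nothing, but in this dataset missing categorical values are coded as the literal level 'unknown' (hence the commented-out filters above). A quick hedged check, counting those per column:

# NA counts are all zero, yet several factors use 'unknown' as a stand-in
# for missing; count those instead
sapply(bank_df, function(x) sum(x == 'unknown'))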

Train-Test Split

# Split data into training and testing sets
set.seed(42)
#createDataPartition stratifies on bank_df$y so the training set keeps a class distribution similar to the full dataset ('stratified sampling')
train_index <- createDataPartition(bank_df$y, p = 0.7, list = FALSE)
train_data <- bank_df[train_index, ] # 70% training
test_data <- bank_df[-train_index, ] # 30% testing
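
A quick sanity check (a sketch) that the stratification preserved the class proportions:

# Class proportions in the training split should closely match the full data
prop.table(table(train_data$y))
prop.table(table(bank_df$y))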

Logistic regression with all variables

full_model <- glm(y ~ ., data = train_data, family = 'binomial')
summary(full_model)
## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_data)
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -7.864e+01  1.287e+02  -0.611 0.541123    
## age                           9.500e-03  8.122e-03   1.170 0.242123    
## jobblue-collar               -2.376e-01  2.692e-01  -0.882 0.377598    
## jobentrepreneur              -8.136e-01  5.443e-01  -1.495 0.134972    
## jobhousemaid                  1.596e-01  4.286e-01   0.372 0.709568    
## jobmanagement                -2.842e-01  2.941e-01  -0.966 0.333845    
## jobretired                    5.102e-02  3.501e-01   0.146 0.884121    
## jobself-employed             -4.166e-01  4.003e-01  -1.041 0.298090    
## jobservices                  -2.054e-02  2.857e-01  -0.072 0.942702    
## jobstudent                    3.026e-02  4.242e-01   0.071 0.943121    
## jobtechnician                 2.814e-02  2.305e-01   0.122 0.902824    
## jobunemployed                 7.681e-02  4.158e-01   0.185 0.853435    
## jobunknown                    3.616e-02  6.423e-01   0.056 0.955108    
## maritalmarried                3.465e-01  2.521e-01   1.375 0.169285    
## maritalsingle                 4.311e-01  2.854e-01   1.511 0.130855    
## maritalunknown                5.329e-01  1.231e+00   0.433 0.665053    
## educationbasic.6y             4.611e-01  3.899e-01   1.183 0.236914    
## educationbasic.9y             2.306e-01  3.226e-01   0.715 0.474615    
## educationhigh.school          1.650e-01  3.101e-01   0.532 0.594730    
## educationprofessional.course  3.072e-01  3.367e-01   0.913 0.361505    
## educationuniversity.degree    2.864e-01  3.099e-01   0.924 0.355403    
## educationunknown              3.378e-01  4.022e-01   0.840 0.401071    
## defaultunknown                1.412e-02  2.102e-01   0.067 0.946461    
## defaultyes                   -9.535e+00  3.247e+02  -0.029 0.976576    
## housingunknown               -8.314e-01  5.778e-01  -1.439 0.150219    
## housingyes                   -1.607e-01  1.382e-01  -1.163 0.244648    
## loanunknown                          NA         NA      NA       NA    
## loanyes                      -1.228e-01  1.920e-01  -0.640 0.522213    
## contacttelephone             -9.202e-01  2.866e-01  -3.211 0.001324 ** 
## monthaug                      2.546e-01  4.445e-01   0.573 0.566832    
## monthdec                      8.561e-01  6.548e-01   1.308 0.191038    
## monthjul                      9.367e-02  3.687e-01   0.254 0.799436    
## monthjun                      3.553e-01  4.625e-01   0.768 0.442257    
## monthmar                      1.915e+00  5.491e-01   3.487 0.000489 ***
## monthmay                     -2.362e-01  3.232e-01  -0.731 0.464855    
## monthnov                     -6.305e-01  4.371e-01  -1.442 0.149204    
## monthoct                     -2.822e-01  5.387e-01  -0.524 0.600411    
## monthsep                      1.696e-01  6.361e-01   0.267 0.789788    
## day_of_weekmon                7.539e-02  2.122e-01   0.355 0.722341    
## day_of_weekthu               -1.823e-01  2.254e-01  -0.809 0.418778    
## day_of_weektue               -4.901e-02  2.247e-01  -0.218 0.827352    
## day_of_weekwed                2.194e-01  2.198e-01   0.998 0.318128    
## campaign                     -9.064e-02  4.263e-02  -2.126 0.033471 *  
## pdays                        -3.682e-04  7.134e-04  -0.516 0.605744    
## previous                      1.737e-02  1.810e-01   0.096 0.923560    
## poutcomenonexistent           1.684e-01  3.060e-01   0.550 0.582212    
## poutcomesuccess               1.144e+00  7.059e-01   1.620 0.105213    
## emp.var.rate                 -8.666e-01  4.949e-01  -1.751 0.079906 .  
## cons.price.idx                1.060e+00  8.575e-01   1.237 0.216175    
## cons.conf.idx                 2.710e-02  2.754e-02   0.984 0.324989    
## euribor3m                     4.407e-01  4.275e-01   1.031 0.302577    
## nr.employed                  -4.558e-03  1.025e-02  -0.445 0.656425    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1993.5  on 2883  degrees of freedom
## Residual deviance: 1576.6  on 2833  degrees of freedom
## AIC: 1678.6
## 
## Number of Fisher Scoring iterations: 11

loan = 'unknown' is the coefficient glm could not estimate (the NA row above, flagged as a singularity). The level is sparse, and the same clients appear to be 'unknown' for housing as well, making the two 'unknown' dummies perfectly collinear:

table(bank_df$loan, bank_df$y)
##          
##             no  yes
##   no      2975  374
##   unknown   96    9
##   yes      597   68
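
One hedged way to handle this before refitting (bank_df_known is an assumed name, mirroring the commented-out filters above):

# Sketch: drop the 105 loan = 'unknown' rows and the now-empty factor level
bank_df_known <- droplevels(filter(bank_df, loan != 'unknown'))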

Stepwise Logistic Regression

# Stepwise variable selection based on AIC
stepwise_model <- step(full_model, direction = 'both', trace = FALSE)
summary(stepwise_model)
## 
## Call:
## glm(formula = y ~ contact + month + campaign + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx, family = "binomial", data = train_data)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -109.93840   17.41806  -6.312 2.76e-10 ***
## contacttelephone      -0.86174    0.25182  -3.422 0.000622 ***
## monthaug               0.39061    0.37951   1.029 0.303362    
## monthdec               1.11904    0.59232   1.889 0.058861 .  
## monthjul               0.24810    0.35508   0.699 0.484737    
## monthjun               0.46880    0.33234   1.411 0.158366    
## monthmar               2.10163    0.45541   4.615 3.94e-06 ***
## monthmay              -0.18438    0.29464  -0.626 0.531468    
## monthnov              -0.34281    0.35996  -0.952 0.340921    
## monthoct               0.09174    0.43825   0.209 0.834191    
## monthsep               0.34474    0.48484   0.711 0.477063    
## campaign              -0.09212    0.04224  -2.181 0.029208 *  
## poutcomenonexistent    0.19032    0.20752   0.917 0.359087    
## poutcomesuccess        1.50335    0.28829   5.215 1.84e-07 ***
## emp.var.rate          -0.65000    0.07538  -8.623  < 2e-16 ***
## cons.price.idx         1.17033    0.18870   6.202 5.57e-10 ***
## cons.conf.idx          0.04440    0.01816   2.446 0.014462 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1993.5  on 2883  degrees of freedom
## Residual deviance: 1597.8  on 2867  degrees of freedom
## AIC: 1631.8
## 
## Number of Fisher Scoring iterations: 6

Final stepwise model includes: contact, month, campaign, poutcome, emp.var.rate, cons.price.idx, and cons.conf.idx. Each estimate is interpreted below.

levels(bank_df$contact)
## [1] "cellular"  "telephone"

'cellular' is the baseline, so the coefficient represents the change in the log odds of subscribing when contact = 'telephone' rather than 'cellular'.

When contact = 'telephone', the log odds of subscribing to a term deposit decrease by approximately 0.86 compared to cellular.

levels(bank_df$month)
##  [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"

'apr' is the baseline, so each month coefficient is the change in log odds relative to April.

For example, when month = 'aug' the log odds of subscribing to a term deposit increase by approximately 0.39 compared to 'apr'.

levels(bank_df$poutcome)
## [1] "failure"     "nonexistent" "success"

'failure' is the baseline and the interpretation is the same as above: for example, poutcome = 'success' raises the log odds by about 1.50 relative to a previous failure.

For the numerical variables, each coefficient is the change in the log odds of subscribing for a one-unit increase, holding all other variables constant:

emp.var.rate: the log odds of subscribing to a term deposit decrease by 0.65.

cons.price.idx: the log odds increase by 1.17.

cons.conf.idx: the log odds increase by 0.044.
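
Log odds are hard to read directly; a quick sketch that converts the coefficients to odds ratios by exponentiating:

# Odds ratios: exp(coef) is the multiplicative change in the odds of
# subscribing per unit increase (or versus the baseline level)
round(exp(coef(stepwise_model)), 3)
# e.g. exp(-0.86174) ~ 0.42: telephone contact cuts the odds to ~42% of cellular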

Stepwise Evaluation

vif(stepwise_model)
##                    GVIF Df GVIF^(1/(2*Df))
## contact        2.419575  1        1.555498
## month          5.794801  9        1.102532
## campaign       1.046557  1        1.023014
## poutcome       1.452056  2        1.097731
## emp.var.rate   3.976174  1        1.994035
## cons.price.idx 3.712973  1        1.926908
## cons.conf.idx  2.427152  1        1.557932
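
All GVIF^(1/(2*Df)) values sit below 2, so multicollinearity is not a concern for this model. As a hedged rule-of-thumb check (the cutoff sqrt(5) ~ 2.24 is an assumption, the Df-adjusted analogue of a VIF-of-5 rule):

# Flag predictors whose adjusted GVIF exceeds the rule-of-thumb cutoff;
# none are flagged for this model
g <- vif(stepwise_model)
g[g[, "GVIF^(1/(2*Df))"] > sqrt(5), , drop = FALSE]
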
# Evaluate the model on test data
pred_probs <- predict(stepwise_model, newdata = test_data, type = 'response') 
#type = 'response' returns predicted probabilities; the default ('link') would return log-odds, since logistic regression is fitted on the log-odds scale

# Map probabilities to class labels, starting from the default threshold of 0.5
pred_class <- ifelse(pred_probs > 0.5, 'yes', 'no')
pred_class <- factor(pred_class, levels = c('no', 'yes'))

Stepwise Confusion Matrix 0.5

# Confusion matrix
conf_matrix <- confusionMatrix(pred_class, test_data$y)
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1091  107
##        yes    9   28
##                                           
##                Accuracy : 0.9061          
##                  95% CI : (0.8884, 0.9218)
##     No Information Rate : 0.8907          
##     P-Value [Acc > NIR] : 0.04356         
##                                           
##                   Kappa : 0.2923          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9918          
##             Specificity : 0.2074          
##          Pos Pred Value : 0.9107          
##          Neg Pred Value : 0.7568          
##              Prevalence : 0.8907          
##          Detection Rate : 0.8834          
##    Detection Prevalence : 0.9700          
##       Balanced Accuracy : 0.5996          
##                                           
##        'Positive' Class : no              
## 

The model is poor at predicting 'yes' (specificity = 0.21, with 'no' treated as the positive class), and 'yes' is the outcome we are most interested in. Lowering the threshold from 0.5 may help.

pROC Threshold for Imbalanced Data

#Build the ROC curve; coords(..., "best") below finds the threshold that balances sensitivity and specificity
roc_obj <- roc(test_data$y, pred_probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj)

pROC::coords(
  roc_obj,
  "best",
  ret = c("threshold", "sensitivity", "specificity")
)
##    threshold sensitivity specificity
## 1 0.09224494   0.7333333        0.82

pROC finds the optimal threshold to be 0.09224494, with sensitivity 0.73 and specificity 0.82 at that cutoff.
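
By default coords(..., "best") maximizes Youden's J (sensitivity + specificity - 1); a sketch that recovers the same cutoff by hand from the roc object:

# Youden's J over all candidate thresholds stored in the roc object
j <- roc_obj$sensitivities + roc_obj$specificities - 1
roc_obj$thresholds[which.max(j)]  # should reproduce ~0.0922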

Stepwise Confusion Matrix pROC

# Same predicted probabilities as before; only the threshold changes
pred_probs2 <- predict(stepwise_model, newdata = test_data, type = 'response')

pred_class2 <- ifelse(pred_probs2 > 0.09224494, 'yes', 'no')
pred_class2 <- factor(pred_class2, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix2 <- confusionMatrix(pred_class2, test_data$y)
print(conf_matrix2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  902  36
##        yes 198  99
##                                          
##                Accuracy : 0.8105         
##                  95% CI : (0.7875, 0.832)
##     No Information Rate : 0.8907         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3625         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8200         
##             Specificity : 0.7333         
##          Pos Pred Value : 0.9616         
##          Neg Pred Value : 0.3333         
##              Prevalence : 0.8907         
##          Detection Rate : 0.7304         
##    Detection Prevalence : 0.7595         
##       Balanced Accuracy : 0.7767         
##                                          
##        'Positive' Class : no             
## 

Best results so far: specificity for catching 'yes' jumps from 0.21 to 0.73, at the cost of overall accuracy falling from 0.91 to 0.81.

Stepwise Confusion Matrix 0.1

# Same probabilities again; try a rounder threshold of 0.1
pred_probs3 <- predict(stepwise_model, newdata = test_data, type = 'response')

pred_class3 <- ifelse(pred_probs3 > 0.1, 'yes', 'no')
pred_class3 <- factor(pred_class3, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix3 <- confusionMatrix(pred_class3, test_data$y)
print(conf_matrix3)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  929  40
##        yes 171  95
##                                          
##                Accuracy : 0.8291         
##                  95% CI : (0.807, 0.8497)
##     No Information Rate : 0.8907         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3846         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8445         
##             Specificity : 0.7037         
##          Pos Pred Value : 0.9587         
##          Neg Pred Value : 0.3571         
##              Prevalence : 0.8907         
##          Detection Rate : 0.7522         
##    Detection Prevalence : 0.7846         
##       Balanced Accuracy : 0.7741         
##                                          
##        'Positive' Class : no             
## 

Compared to the pROC threshold, 0.1 gains a little accuracy (0.829 vs 0.811) and sensitivity (0.845 vs 0.820) but loses some specificity (0.704 vs 0.733). Which threshold to prefer depends on the relative cost of the two errors.

XGBoost Model

library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.3
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
## 
##     slice
# Convert categorical variables to numerical matrix
x_train <- model.matrix(y ~ ., data = train_data)[, -1]
y_train <- as.numeric(train_data$y) - 1  # Convert to 0 (no) and 1 (yes)

x_test <- model.matrix(y ~ ., data = test_data)[, -1]
y_test <- as.numeric(test_data$y) - 1

# Set parameters
params <- list(
  objective = "binary:logistic",  # Binary classification
  eval_metric = "logloss",        # Logarithmic loss metric
  eta = 0.1,                      # Learning rate
  max_depth = 6,                  # Depth of trees
  subsample = 0.8,                 # Subsample ratio
  colsample_bytree = 0.8          # Feature sampling
)

# Train the model
xgb_model <- xgboost(
  data = x_train, label = y_train,
  params = params,
  nrounds = 100,    # Number of boosting rounds
  verbose = 0       # 0 = silent, 1 = shows log-loss for each round
)

# Predict probabilities
pred_probs <- predict(xgb_model, x_test)

# Convert probabilities to binary predictions (threshold = 0.1, matching the
# lowered cutoff used for the logistic model above)
pred_class <- ifelse(pred_probs > 0.1, 1, 0)

# Convert back to factors for confusion matrix
pred_class <- factor(pred_class, levels = c(0, 1), labels = c("no", "yes"))
y_test <- factor(y_test, levels = c(0, 1), labels = c("no", "yes"))

confusionMatrix(pred_class, y_test)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  904  46
##        yes 196  89
##                                           
##                Accuracy : 0.804           
##                  95% CI : (0.7808, 0.8258)
##     No Information Rate : 0.8907          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3234          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8218          
##             Specificity : 0.6593          
##          Pos Pred Value : 0.9516          
##          Neg Pred Value : 0.3123          
##              Prevalence : 0.8907          
##          Detection Rate : 0.7320          
##    Detection Prevalence : 0.7692          
##       Balanced Accuracy : 0.7405          
##                                           
##        'Positive' Class : no              
## 
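
The hyperparameters above are reasonable defaults rather than tuned values. As a hedged sketch, the number of boosting rounds could be chosen by cross-validation with early stopping (fold count and patience are assumptions):

# Sketch: 5-fold CV with early stopping to pick nrounds
set.seed(42)
cv <- xgb.cv(
  params = params,
  data = xgb.DMatrix(x_train, label = y_train),
  nrounds = 200,
  nfold = 5,
  early_stopping_rounds = 10,  # stop once test logloss stalls for 10 rounds
  verbose = 0
)
cv$best_iteration  # CV-suggested number of boosting rounds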

Model on Balanced Data

d1_yes <- bank_df |>
  filter(y == 'yes')
d1_no <- bank_df |>
  filter(y == 'no')

# Undersample the majority class so both classes are the same size
# (note: for reproducibility the seed should be set before this draw too)
sample_no_cust <- sample_n(d1_no, nrow(d1_yes))
df_bal <- rbind(d1_yes, sample_no_cust)

# Split data into training and testing sets
set.seed(42)
#Stratified split on df_bal$y, as before, so both sets keep the 50/50 balance
train_index_balanced_df <- createDataPartition(df_bal$y, p = 0.7, list = FALSE)
train_data_balanced_df <- df_bal[train_index_balanced_df, ] # 70% training
test_data_balanced_df <- df_bal[-train_index_balanced_df, ] # 30% testing
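
A quick sanity check (a sketch) that undersampling produced a 50/50 class split:

# Both classes should now have nrow(d1_yes) observations each
table(df_bal$y)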


full_model_balanced_df <- glm(y ~ ., data = train_data_balanced_df, family = 'binomial')
#summary(full_model_balanced_df)

# Stepwise variable selection based on AIC
stepwise_model_balanced_df <- step(full_model_balanced_df, direction = 'both', trace = FALSE)
summary(stepwise_model_balanced_df)
## 
## Call:
## glm(formula = y ~ age + loan + month + day_of_week + pdays + 
##     emp.var.rate + cons.price.idx, family = "binomial", data = train_data_balanced_df)
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.543e+02  4.009e+01  -3.850 0.000118 ***
## age             1.756e-02  9.260e-03   1.896 0.057939 .  
## loanunknown    -8.073e-01  5.656e-01  -1.427 0.153529    
## loanyes        -7.930e-01  2.647e-01  -2.996 0.002734 ** 
## monthaug        1.166e+00  5.665e-01   2.058 0.039594 *  
## monthdec        1.536e+01  7.033e+02   0.022 0.982578    
## monthjul        5.058e-01  4.953e-01   1.021 0.307210    
## monthjun       -2.441e-01  4.759e-01  -0.513 0.608079    
## monthmar        1.186e+00  6.998e-01   1.695 0.090065 .  
## monthmay       -5.972e-01  4.218e-01  -1.416 0.156761    
## monthnov        1.419e-01  5.268e-01   0.269 0.787649    
## monthoct        5.891e-01  7.755e-01   0.760 0.447470    
## monthsep       -3.515e-01  7.244e-01  -0.485 0.627558    
## day_of_weekmon  5.396e-02  3.102e-01   0.174 0.861903    
## day_of_weekthu -6.368e-01  3.112e-01  -2.047 0.040696 *  
## day_of_weektue -4.396e-01  3.157e-01  -1.393 0.163722    
## day_of_weekwed -4.840e-02  3.153e-01  -0.153 0.878011    
## pdays          -1.154e-03  4.720e-04  -2.444 0.014522 *  
## emp.var.rate   -9.239e-01  1.569e-01  -5.889 3.88e-09 ***
## cons.price.idx  1.654e+00  4.270e-01   3.874 0.000107 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 876.14  on 631  degrees of freedom
## Residual deviance: 677.19  on 612  degrees of freedom
## AIC: 717.19
## 
## Number of Fisher Scoring iterations: 15
vif(stepwise_model_balanced_df)
##                    GVIF Df GVIF^(1/(2*Df))
## age            1.058281  1        1.028728
## loan           1.089451  2        1.021649
## month          5.002665  9        1.093565
## day_of_week    1.184229  4        1.021361
## pdays          1.196894  1        1.094027
## emp.var.rate   7.917902  1        2.813877
## cons.price.idx 7.631056  1        2.762437
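
Unlike in the model on the full data, emp.var.rate and cons.price.idx now show adjusted GVIFs near 2.8 (GVIF ~ 7.9), hinting at collinearity between the two macroeconomic indicators; a quick hedged check of their raw correlation:

# The two macro indicators tend to move together, which inflates their GVIFs
cor(bank_df$emp.var.rate, bank_df$cons.price.idx)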

Stepwise Confusion Matrix Balanced 0.5

pred_probs_balanced_df <- predict(stepwise_model_balanced_df, 
                                  newdata = test_data_balanced_df, type = 'response') 

pred_class_balanced_df <- ifelse(pred_probs_balanced_df > 0.5, 'yes', 'no')
pred_class_balanced_df <- factor(pred_class_balanced_df, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix_balanced_df <- confusionMatrix(pred_class_balanced_df, test_data_balanced_df$y)
print(conf_matrix_balanced_df)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  111  45
##        yes  24  90
##                                           
##                Accuracy : 0.7444          
##                  95% CI : (0.6881, 0.7954)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.4889          
##                                           
##  Mcnemar's Test P-Value : 0.01605         
##                                           
##             Sensitivity : 0.8222          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.7115          
##          Neg Pred Value : 0.7895          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4111          
##    Detection Prevalence : 0.5778          
##       Balanced Accuracy : 0.7444          
##                                           
##        'Positive' Class : no              
## 
roc_obj2 <- roc(test_data_balanced_df$y, pred_probs_balanced_df)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj2)

pROC::coords(roc_obj2, "best", ret = "threshold")
##   threshold
## 1 0.4665221

pROC reports an optimal threshold of 0.4665221 in the output above, while the code below applies 0.4230439, evidently the value from an earlier run (the undersampling draw is not seeded, so the split and threshold shift between runs).

pred_probs_balanced_df2 <- predict(stepwise_model_balanced_df, 
                                  newdata = test_data_balanced_df, type = 'response') 

pred_class_balanced_df2 <- ifelse(pred_probs_balanced_df2 > 0.4230439, 'yes', 'no')
pred_class_balanced_df2 <- factor(pred_class_balanced_df2, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix_balanced_df2 <- confusionMatrix(pred_class_balanced_df2, test_data_balanced_df$y)
print(conf_matrix_balanced_df2)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no   89  34
##        yes  46 101
##                                           
##                Accuracy : 0.7037          
##                  95% CI : (0.6454, 0.7575)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : 8.646e-12       
##                                           
##                   Kappa : 0.4074          
##                                           
##  Mcnemar's Test P-Value : 0.2188          
##                                           
##             Sensitivity : 0.6593          
##             Specificity : 0.7481          
##          Pos Pred Value : 0.7236          
##          Neg Pred Value : 0.6871          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3296          
##    Detection Prevalence : 0.4556          
##       Balanced Accuracy : 0.7037          
##                                           
##        'Positive' Class : no              
## 

LDA Model

lda_model_stepwise_balanced_df <- lda(y ~ age + contact + month + campaign + pdays + 
                                        emp.var.rate + cons.price.idx + cons.conf.idx, 
                                      data = train_data_balanced_df)

lda_model_stepwise_balanced_df
## Call:
## lda(y ~ age + contact + month + campaign + pdays + emp.var.rate + 
##     cons.price.idx + cons.conf.idx, data = train_data_balanced_df)
## 
## Prior probabilities of groups:
##  no yes 
## 0.5 0.5 
## 
## Group means:
##          age contacttelephone  monthaug   monthdec  monthjul  monthjun
## no  39.30380        0.3829114 0.1297468 0.00000000 0.1930380 0.1392405
## yes 41.84177        0.1835443 0.1329114 0.03481013 0.1329114 0.1613924
##       monthmar  monthmay   monthnov    monthoct   monthsep campaign    pdays
## no  0.01265823 0.3765823 0.08544304 0.009493671 0.01265823 2.471519 977.0253
## yes 0.06329114 0.2025316 0.10126582 0.041139241 0.05379747 2.082278 782.0823
##     emp.var.rate cons.price.idx cons.conf.idx
## no     0.2344937       93.61041     -40.80063
## yes   -1.0689873       93.44499     -39.87184
## 
## Coefficients of linear discriminants:
##                            LD1
## age               0.0132671935
## contacttelephone -0.4020287707
## monthaug          0.4010579566
## monthdec          1.1068714359
## monthjul          0.1318796019
## monthjun         -0.0029915901
## monthmar          0.9109459816
## monthmay         -0.5447458193
## monthnov         -0.1069661686
## monthoct          0.2892776418
## monthsep         -0.1134861056
## campaign         -0.0344123678
## pdays            -0.0005878534
## emp.var.rate     -0.6921495522
## cons.price.idx    1.1472420578
## cons.conf.idx     0.0225206926

LDA Predictions and Confusion Matrix

lda_predictions <- predict(lda_model_stepwise_balanced_df, test_data_balanced_df)
caret::confusionMatrix(as.factor(lda_predictions$class), test_data_balanced_df$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  115  45
##        yes  20  90
##                                          
##                Accuracy : 0.7593         
##                  95% CI : (0.7037, 0.809)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5185         
##                                          
##  Mcnemar's Test P-Value : 0.002912       
##                                          
##             Sensitivity : 0.8519         
##             Specificity : 0.6667         
##          Pos Pred Value : 0.7188         
##          Neg Pred Value : 0.8182         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4259         
##    Detection Prevalence : 0.5926         
##       Balanced Accuracy : 0.7593         
##                                          
##        'Positive' Class : no             
## 
  #lda_predictions$class holds the predicted class labels; predict() for LDA
  #also returns $posterior (class probabilities) and $x (discriminant scores)
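
As a hedged wrap-up, the balanced-data logistic and LDA models can be compared on the same test set with AUC, which is threshold-free:

# Threshold-free comparison of the two balanced-data models
auc(roc(test_data_balanced_df$y, pred_probs_balanced_df))
auc(roc(test_data_balanced_df$y, lda_predictions$posterior[, 'yes']))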