Data exploration

#Load libraries
library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

#Load dataset
bank_df <- read.csv('/Users/ponce/Desktop/DA-6813/Case Study 1/bank-additional.csv', 
sep = ';')

#Remove duration from dataset
bank_df <- subset(bank_df, select = -c(duration))

#bank_df <- filter(bank_df, job != 'unknown')
#bank_df <- filter(bank_df, marital != 'unknown')
#bank_df <- filter(bank_df, education != 'unknown')
#bank_df <- filter(bank_df, default != 'unknown')
#bank_df <- filter(bank_df, housing != 'unknown')
#bank_df <- filter(bank_df, loan != 'unknown')

#Convert character variables into categorical
bank_df$job <- as.factor(bank_df$job)
bank_df$marital <- as.factor(bank_df$marital)
bank_df$education <- as.factor(bank_df$education)
bank_df$default <- as.factor(bank_df$default)
bank_df$housing <- as.factor(bank_df$housing)
bank_df$loan <- as.factor(bank_df$loan)
bank_df$contact <- as.factor(bank_df$contact)
bank_df$month <- as.factor(bank_df$month)
bank_df$day_of_week <- as.factor(bank_df$day_of_week)
bank_df$poutcome <- as.factor(bank_df$poutcome)
bank_df$y <- as.factor(bank_df$y)

# Check if missing values
colSums(is.na(bank_df))

##            age            job        marital      education        default 
##              0              0              0              0              0 
##        housing           loan        contact          month    day_of_week 
##              0              0              0              0              0 
##       campaign          pdays       previous       poutcome   emp.var.rate 
##              0              0              0              0              0 
## cons.price.idx  cons.conf.idx      euribor3m    nr.employed              y 
##              0              0              0              0              0

Train-Test Split

# Split data into training and testing sets
set.seed(42)
#train_index splits the data. using bank_df$y we make sure that training set will have similar class distribution as the full dataset. 'Stratified sampling'
train_index <- createDataPartition(bank_df$y, p = 0.7, list = FALSE)
train_data <- bank_df[train_index, ] # 70% training
test_data <- bank_df[-train_index, ] # 30% testing

Logistic regression with all variables

full_model <- glm(y ~ ., data = train_data, family = 'binomial')
summary(full_model)

## 
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_data)
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -7.864e+01  1.287e+02  -0.611 0.541123    
## age                           9.500e-03  8.122e-03   1.170 0.242123    
## jobblue-collar               -2.376e-01  2.692e-01  -0.882 0.377598    
## jobentrepreneur              -8.136e-01  5.443e-01  -1.495 0.134972    
## jobhousemaid                  1.596e-01  4.286e-01   0.372 0.709568    
## jobmanagement                -2.842e-01  2.941e-01  -0.966 0.333845    
## jobretired                    5.102e-02  3.501e-01   0.146 0.884121    
## jobself-employed             -4.166e-01  4.003e-01  -1.041 0.298090    
## jobservices                  -2.054e-02  2.857e-01  -0.072 0.942702    
## jobstudent                    3.026e-02  4.242e-01   0.071 0.943121    
## jobtechnician                 2.814e-02  2.305e-01   0.122 0.902824    
## jobunemployed                 7.681e-02  4.158e-01   0.185 0.853435    
## jobunknown                    3.616e-02  6.423e-01   0.056 0.955108    
## maritalmarried                3.465e-01  2.521e-01   1.375 0.169285    
## maritalsingle                 4.311e-01  2.854e-01   1.511 0.130855    
## maritalunknown                5.329e-01  1.231e+00   0.433 0.665053    
## educationbasic.6y             4.611e-01  3.899e-01   1.183 0.236914    
## educationbasic.9y             2.306e-01  3.226e-01   0.715 0.474615    
## educationhigh.school          1.650e-01  3.101e-01   0.532 0.594730    
## educationprofessional.course  3.072e-01  3.367e-01   0.913 0.361505    
## educationuniversity.degree    2.864e-01  3.099e-01   0.924 0.355403    
## educationunknown              3.378e-01  4.022e-01   0.840 0.401071    
## defaultunknown                1.412e-02  2.102e-01   0.067 0.946461    
## defaultyes                   -9.535e+00  3.247e+02  -0.029 0.976576    
## housingunknown               -8.314e-01  5.778e-01  -1.439 0.150219    
## housingyes                   -1.607e-01  1.382e-01  -1.163 0.244648    
## loanunknown                          NA         NA      NA       NA    
## loanyes                      -1.228e-01  1.920e-01  -0.640 0.522213    
## contacttelephone             -9.202e-01  2.866e-01  -3.211 0.001324 ** 
## monthaug                      2.546e-01  4.445e-01   0.573 0.566832    
## monthdec                      8.561e-01  6.548e-01   1.308 0.191038    
## monthjul                      9.367e-02  3.687e-01   0.254 0.799436    
## monthjun                      3.553e-01  4.625e-01   0.768 0.442257    
## monthmar                      1.915e+00  5.491e-01   3.487 0.000489 ***
## monthmay                     -2.362e-01  3.232e-01  -0.731 0.464855    
## monthnov                     -6.305e-01  4.371e-01  -1.442 0.149204    
## monthoct                     -2.822e-01  5.387e-01  -0.524 0.600411    
## monthsep                      1.696e-01  6.361e-01   0.267 0.789788    
## day_of_weekmon                7.539e-02  2.122e-01   0.355 0.722341    
## day_of_weekthu               -1.823e-01  2.254e-01  -0.809 0.418778    
## day_of_weektue               -4.901e-02  2.247e-01  -0.218 0.827352    
## day_of_weekwed                2.194e-01  2.198e-01   0.998 0.318128    
## campaign                     -9.064e-02  4.263e-02  -2.126 0.033471 *  
## pdays                        -3.682e-04  7.134e-04  -0.516 0.605744    
## previous                      1.737e-02  1.810e-01   0.096 0.923560    
## poutcomenonexistent           1.684e-01  3.060e-01   0.550 0.582212    
## poutcomesuccess               1.144e+00  7.059e-01   1.620 0.105213    
## emp.var.rate                 -8.666e-01  4.949e-01  -1.751 0.079906 .  
## cons.price.idx                1.060e+00  8.575e-01   1.237 0.216175    
## cons.conf.idx                 2.710e-02  2.754e-02   0.984 0.324989    
## euribor3m                     4.407e-01  4.275e-01   1.031 0.302577    
## nr.employed                  -4.558e-03  1.025e-02  -0.445 0.656425    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1993.5  on 2883  degrees of freedom
## Residual deviance: 1576.6  on 2833  degrees of freedom
## AIC: 1678.6
## 
## Number of Fisher Scoring iterations: 11

loan ‘unknown’ lacks observations and our model is not able to estimate the coefficients

table(bank_df$loan, bank_df$y)

##          
##             no  yes
##   no      2975  374
##   unknown   96    9
##   yes      597   68

Stepwise Logistic Regression

# Stepwise variable selection based on AIC
stepwise_model <- step(full_model, direction = 'both', trace = FALSE)
summary(stepwise_model)

## 
## Call:
## glm(formula = y ~ contact + month + campaign + poutcome + emp.var.rate + 
##     cons.price.idx + cons.conf.idx, family = "binomial", data = train_data)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -109.93840   17.41806  -6.312 2.76e-10 ***
## contacttelephone      -0.86174    0.25182  -3.422 0.000622 ***
## monthaug               0.39061    0.37951   1.029 0.303362    
## monthdec               1.11904    0.59232   1.889 0.058861 .  
## monthjul               0.24810    0.35508   0.699 0.484737    
## monthjun               0.46880    0.33234   1.411 0.158366    
## monthmar               2.10163    0.45541   4.615 3.94e-06 ***
## monthmay              -0.18438    0.29464  -0.626 0.531468    
## monthnov              -0.34281    0.35996  -0.952 0.340921    
## monthoct               0.09174    0.43825   0.209 0.834191    
## monthsep               0.34474    0.48484   0.711 0.477063    
## campaign              -0.09212    0.04224  -2.181 0.029208 *  
## poutcomenonexistent    0.19032    0.20752   0.917 0.359087    
## poutcomesuccess        1.50335    0.28829   5.215 1.84e-07 ***
## emp.var.rate          -0.65000    0.07538  -8.623  < 2e-16 ***
## cons.price.idx         1.17033    0.18870   6.202 5.57e-10 ***
## cons.conf.idx          0.04440    0.01816   2.446 0.014462 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1993.5  on 2883  degrees of freedom
## Residual deviance: 1597.8  on 2867  degrees of freedom
## AIC: 1631.8
## 
## Number of Fisher Scoring iterations: 6

Final stepwise model includes:

contact
month
campaign
poutcome
emp.var.rate
cons.price.idx
cons.conf.idx

Variables Estimates Explanation

levels(bank_df$contact)

## [1] "cellular"  "telephone"

Cellular is our baseline. Our result represents the change in log odds of subscribing when contact = telephone, compared to cellular.

For a one unit increase in ‘contacttelephone’(when contact = ‘telephone’) the log odds of subscribing to a term decrease by approximately 0.86174 when compared to cellular.

levels(bank_df$month)

##  [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"

‘apr’ is our baseline so each result represents the change in log odds when compared to apr.

For example, ‘for a one unit increase’, basically when month = ‘aug’ the log odds of subscribing to a term increase by aproximately 0.39 when compared to ‘apr’

levels(bank_df$poutcome)

## [1] "failure"     "nonexistent" "success"

‘failure’ is our baseline so we just do the same as above.

Then for our numerical variables its just:

For a one unit increase in;

emp.var.rate the log odds of subscribing to a term decrease by 0.65 assuming all other variables remain constant.

cons.price.idx the log odds of subscribing to a term increase by 1.17 assuming all other variables remain constant.

cons.conf.idx the log odds of subscribing to a term increase 0.044 assuming all other variables remain constant.

Stepwise Evaluation

vif(stepwise_model)

##                    GVIF Df GVIF^(1/(2*Df))
## contact        2.419575  1        1.555498
## month          5.794801  9        1.102532
## campaign       1.046557  1        1.023014
## poutcome       1.452056  2        1.097731
## emp.var.rate   3.976174  1        1.994035
## cons.price.idx 3.712973  1        1.926908
## cons.conf.idx  2.427152  1        1.557932

# Evaluate the model on test data
pred_probs <- predict(stepwise_model, newdata = test_data, type = 'response') 
#type = 'response' transforms predicted values back to probabilities because logistic regression is fitted using log-odds

# Make probabilities to class labels. Threshold of 0.5 makes the most sense.
pred_class <- ifelse(pred_probs > 0.5, 'yes', 'no')
pred_class <- factor(pred_class, levels = c('no', 'yes'))

Stepwise Confusion Matrix 0.5

# Confusion matrix
conf_matrix <- confusionMatrix(pred_class, test_data$y)
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1091  107
##        yes    9   28
##                                           
##                Accuracy : 0.9061          
##                  95% CI : (0.8884, 0.9218)
##     No Information Rate : 0.8907          
##     P-Value [Acc > NIR] : 0.04356         
##                                           
##                   Kappa : 0.2923          
##                                           
##  Mcnemar's Test P-Value : < 2e-16         
##                                           
##             Sensitivity : 0.9918          
##             Specificity : 0.2074          
##          Pos Pred Value : 0.9107          
##          Neg Pred Value : 0.7568          
##              Prevalence : 0.8907          
##          Detection Rate : 0.8834          
##    Detection Prevalence : 0.9700          
##       Balanced Accuracy : 0.5996          
##                                           
##        'Positive' Class : no              
##

Model sucks at predicting ‘yes’ which is what we are more interested in. Maybe changing threshold from 0.5 to something else might help?

pROC Threshold Inbalanced Data

#Precision-Recall Curve finds threshold that balances sensitivity and specificity
roc_obj <- roc(test_data$y, pred_probs)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

plot(roc_obj)

coords(roc_obj, "best", ret = "threshold")

##    threshold
## 1 0.09224494

pROC finds the optimal threshold to be 0.09224494

Stepwise Confusion Matrix pROC

pred_probs2 <- predict(stepwise_model, newdata = test_data, type = 'response') 

pred_class2 <- ifelse(pred_probs2 > 0.09224494, 'yes', 'no')
pred_class2 <- factor(pred_class2, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix2 <- confusionMatrix(pred_class2, test_data$y)
print(conf_matrix2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  902  36
##        yes 198  99
##                                          
##                Accuracy : 0.8105         
##                  95% CI : (0.7875, 0.832)
##     No Information Rate : 0.8907         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3625         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8200         
##             Specificity : 0.7333         
##          Pos Pred Value : 0.9616         
##          Neg Pred Value : 0.3333         
##              Prevalence : 0.8907         
##          Detection Rate : 0.7304         
##    Detection Prevalence : 0.7595         
##       Balanced Accuracy : 0.7767         
##                                          
##        'Positive' Class : no             
##

Best results.

Stepwise Confusion Matrix 0.1

pred_probs3 <- predict(stepwise_model, newdata = test_data, type = 'response') 

pred_class3 <- ifelse(pred_probs3 > 0.1, 'yes', 'no')
pred_class3 <- factor(pred_class3, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix3 <- confusionMatrix(pred_class3, test_data$y)
print(conf_matrix3)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  929  40
##        yes 171  95
##                                          
##                Accuracy : 0.8291         
##                  95% CI : (0.807, 0.8497)
##     No Information Rate : 0.8907         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3846         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.8445         
##             Specificity : 0.7037         
##          Pos Pred Value : 0.9587         
##          Neg Pred Value : 0.3571         
##              Prevalence : 0.8907         
##          Detection Rate : 0.7522         
##    Detection Prevalence : 0.7846         
##       Balanced Accuracy : 0.7741         
##                                          
##        'Positive' Class : no             
##

Gains a bit of accuracy and sensitivity but loses specificity. Here it is just debatable on which to use.

XGBoost Model

library(xgboost)

## 
## Attaching package: 'xgboost'

## The following object is masked from 'package:dplyr':
## 
##     slice

# Convert categorical variables to numerical matrix
x_train <- model.matrix(y ~ ., data = train_data)[, -1]
y_train <- as.numeric(train_data$y) - 1  # Convert to 0 (no) and 1 (yes)

x_test <- model.matrix(y ~ ., data = test_data)[, -1]
y_test <- as.numeric(test_data$y) - 1

# Set parameters
params <- list(
  objective = "binary:logistic",  # Binary classification
  eval_metric = "logloss",        # Logarithmic loss metric
  eta = 0.1,                      # Learning rate
  max_depth = 6,                  # Depth of trees
  subsample = 0.8,                 # Subsample ratio
  colsample_bytree = 0.8          # Feature sampling
)

# Train the model
xgb_model <- xgboost(
  data = x_train, label = y_train,
  params = params,
  nrounds = 100,    # Number of boosting rounds
  verbose = 0       # 0 = silent, 1 = shows log-loss for each round
)

# Predict probabilities
pred_probs <- predict(xgb_model, x_test)

#pROC threshold for XGBoost
roc_curve <- roc(y_test, pred_probs)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

coords(roc_curve, 'best', ret = 'threshold')

##   threshold
## 1 0.1145365

Best Threshold = 0.1145365

# Convert probabilities to binary predictions (default threshold = 0.5)
pred_class <- ifelse(pred_probs > 0.1145365, 1, 0)

# Convert back to factors for confusion matrix
pred_class <- factor(pred_class, levels = c(0, 1), labels = c("no", "yes"))
y_test <- factor(y_test, levels = c(0, 1), labels = c("no", "yes"))

confusionMatrix(pred_class, y_test)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  942  45
##        yes 158  90
##                                           
##                Accuracy : 0.8356          
##                  95% CI : (0.8138, 0.8559)
##     No Information Rate : 0.8907          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3826          
##                                           
##  Mcnemar's Test P-Value : 3.815e-15       
##                                           
##             Sensitivity : 0.8564          
##             Specificity : 0.6667          
##          Pos Pred Value : 0.9544          
##          Neg Pred Value : 0.3629          
##              Prevalence : 0.8907          
##          Detection Rate : 0.7628          
##    Detection Prevalence : 0.7992          
##       Balanced Accuracy : 0.7615          
##                                           
##        'Positive' Class : no              
##

Model on Balanced Data

d1_yes <- bank_df |>
  filter(y == 'yes')
d1_no <- bank_df |>
  filter(y == 'no')

sample_no_cust = sample_n(d1_no, nrow(d1_yes))
df_bal = rbind(d1_yes, sample_no_cust)

# Split data into training and testing sets
set.seed(42)
#train_index splits the data. using bank_df$y we make sure that training set will have similar class distribution as the full dataset. 'Stratified sampling'
train_index_balanced_df <- createDataPartition(df_bal$y, p = 0.7, list = FALSE)
train_data_balanced_df <- df_bal[train_index_balanced_df, ] # 70% training
test_data_balanced_df <- df_bal[-train_index_balanced_df, ] # 30% testing


full_model_balanced_df <- glm(y ~ ., data = train_data_balanced_df, family = 'binomial')
#summary(full_model_balanced_df)

# Stepwise variable selection based on AIC
stepwise_model_balanced_df <- step(full_model_balanced_df, direction = 'both', trace = FALSE)
summary(stepwise_model_balanced_df)

## 
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + emp.var.rate + 
##     cons.price.idx + cons.conf.idx, family = "binomial", data = train_data_balanced_df)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.388e+02  3.460e+01  -4.011 6.04e-05 ***
## contacttelephone -7.994e-01  3.487e-01  -2.292   0.0219 *  
## monthaug          1.035e-01  6.219e-01   0.166   0.8679    
## monthdec          1.506e+01  7.086e+02   0.021   0.9830    
## monthjul          1.248e-01  5.119e-01   0.244   0.8074    
## monthjun          1.273e-01  4.800e-01   0.265   0.7908    
## monthmar          1.918e+00  8.603e-01   2.230   0.0258 *  
## monthmay         -3.991e-01  4.185e-01  -0.954   0.3403    
## monthnov         -5.629e-01  5.091e-01  -1.106   0.2689    
## monthoct          2.109e-01  8.358e-01   0.252   0.8008    
## monthsep          1.321e-01  9.232e-01   0.143   0.8862    
## campaign         -7.857e-02  4.612e-02  -1.704   0.0884 .  
## pdays            -1.125e-03  4.763e-04  -2.361   0.0182 *  
## emp.var.rate     -7.407e-01  1.347e-01  -5.501 3.79e-08 ***
## cons.price.idx    1.523e+00  3.691e-01   4.126 3.69e-05 ***
## cons.conf.idx     5.869e-02  2.985e-02   1.966   0.0492 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 876.14  on 631  degrees of freedom
## Residual deviance: 679.96  on 616  degrees of freedom
## AIC: 711.96
## 
## Number of Fisher Scoring iterations: 15

vif(stepwise_model_balanced_df)

##                    GVIF Df GVIF^(1/(2*Df))
## contact        2.890229  1        1.700067
## month          8.469356  9        1.126023
## campaign       1.036432  1        1.018053
## pdays          1.164511  1        1.079125
## emp.var.rate   5.661669  1        2.379426
## cons.price.idx 5.377469  1        2.318937
## cons.conf.idx  2.232940  1        1.494303

Stepwise Confusion Matrix Balanced 0.5

pred_probs_balanced_df <- predict(stepwise_model_balanced_df, 
                                  newdata = test_data_balanced_df, type = 'response') 

pred_class_balanced_df <- ifelse(pred_probs_balanced_df > 0.5   , 'yes', 'no')
pred_class_balanced_df <- factor(pred_class_balanced_df, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix_balanced_df <- confusionMatrix(pred_class_balanced_df, test_data_balanced_df$y)
print(conf_matrix_balanced_df)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  116  47
##        yes  19  88
##                                           
##                Accuracy : 0.7556          
##                  95% CI : (0.6998, 0.8056)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5111          
##                                           
##  Mcnemar's Test P-Value : 0.000889        
##                                           
##             Sensitivity : 0.8593          
##             Specificity : 0.6519          
##          Pos Pred Value : 0.7117          
##          Neg Pred Value : 0.8224          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4296          
##    Detection Prevalence : 0.6037          
##       Balanced Accuracy : 0.7556          
##                                           
##        'Positive' Class : no              
##

roc_obj2 <- roc(test_data_balanced_df$y, pred_probs_balanced_df)

## Setting levels: control = no, case = yes

## Setting direction: controls < cases

plot(roc_obj2)

coords(roc_obj2, "best", ret = "threshold")

##   threshold
## 1 0.4230439

Optimal threshold = 0.4230439

pred_probs_balanced_df2 <- predict(stepwise_model_balanced_df, 
                                  newdata = test_data_balanced_df, type = 'response') 

pred_class_balanced_df2 <- ifelse(pred_probs_balanced_df2 > 0.4230439       , 'yes', 'no')
pred_class_balanced_df2 <- factor(pred_class_balanced_df2, levels = c('no', 'yes'))

# Confusion matrix
conf_matrix_balanced_df2 <- confusionMatrix(pred_class_balanced_df2, test_data_balanced_df$y)
print(conf_matrix_balanced_df2)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  104  33
##        yes  31 102
##                                           
##                Accuracy : 0.763           
##                  95% CI : (0.7076, 0.8124)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.5259          
##                                           
##  Mcnemar's Test P-Value : 0.9005          
##                                           
##             Sensitivity : 0.7704          
##             Specificity : 0.7556          
##          Pos Pred Value : 0.7591          
##          Neg Pred Value : 0.7669          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3852          
##    Detection Prevalence : 0.5074          
##       Balanced Accuracy : 0.7630          
##                                           
##        'Positive' Class : no              
##

LDA Model

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

lda_model_stepwise_balanced_df <- lda(y ~ age + contact + month + campaign + pdays + 
                                        emp.var.rate + cons.price.idx + cons.conf.idx, 
                                      data = train_data_balanced_df)

lda_model_stepwise_balanced_df

## Call:
## lda(y ~ age + contact + month + campaign + pdays + emp.var.rate + 
##     cons.price.idx + cons.conf.idx, data = train_data_balanced_df)
## 
## Prior probabilities of groups:
##  no yes 
## 0.5 0.5 
## 
## Group means:
##          age contacttelephone  monthaug   monthdec  monthjul  monthjun
## no  39.56646        0.3544304 0.1550633 0.00000000 0.1962025 0.1044304
## yes 41.84177        0.1835443 0.1329114 0.03481013 0.1329114 0.1613924
##        monthmar  monthmay  monthnov    monthoct    monthsep campaign    pdays
## no  0.006329114 0.3449367 0.1329114 0.009493671 0.006329114 2.731013 976.9399
## yes 0.063291139 0.2025316 0.1012658 0.041139241 0.053797468 2.082278 782.0823
##     emp.var.rate cons.price.idx cons.conf.idx
## no     0.2357595       93.56728     -40.78101
## yes   -1.0689873       93.44499     -39.87184
## 
## Coefficients of linear discriminants:
##                            LD1
## age               0.0074060259
## contacttelephone -0.5428783452
## monthaug          0.1617802933
## monthdec          0.9969463418
## monthjul          0.1882184386
## monthjun          0.2099341992
## monthmar          1.1901913553
## monthmay         -0.3955212307
## monthnov         -0.4681373725
## monthoct          0.2895821931
## monthsep          0.2140202697
## campaign         -0.0589307340
## pdays            -0.0005710463
## emp.var.rate     -0.6626569249
## cons.price.idx    1.1480788225
## cons.conf.idx     0.0369509061

contacttelephone (-0.5429): If a customer was contacted via telephone, they were more likely to say “no.”
monthmar (1.1902): Being contacted in March increases the likelihood of a “yes.”
emp.var.rate (-0.6627): A higher employment variation rate decreases the chance of “yes,” meaning people are more likely to accept the offer when employment conditions are poor.
cons.price.idx (1.1481): A higher consumer price index increases the likelihood of a “yes.”
pdays (-0.00057): Higher values of pdays (longer time since last contact) slightly decrease the probability of a “yes.”

LDA Predictions and Confusion Matrix

lda_predictions <- predict(lda_model_stepwise_balanced_df, test_data_balanced_df)
caret::confusionMatrix(as.factor(lda_predictions$class), test_data_balanced_df$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  116  46
##        yes  19  89
##                                          
##                Accuracy : 0.7593         
##                  95% CI : (0.7037, 0.809)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.5185         
##                                          
##  Mcnemar's Test P-Value : 0.00126        
##                                          
##             Sensitivity : 0.8593         
##             Specificity : 0.6593         
##          Pos Pred Value : 0.7160         
##          Neg Pred Value : 0.8241         
##              Prevalence : 0.5000         
##          Detection Rate : 0.4296         
##    Detection Prevalence : 0.6000         
##       Balanced Accuracy : 0.7593         
##                                          
##        'Positive' Class : no             
##

  #lda_predictions$class is just the name of variable where our predictions are stored

Case Study 1

Mauricio Ponce

2025-02-04