#Load libraries
library(caret)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
#Load dataset
bank_df <- read.csv('bank-additional.csv', sep = ';')
#Remove duration: it is only known after the call has taken place, so keeping it would leak the outcome
bank_df <- subset(bank_df, select = -c(duration))
#bank_df <- filter(bank_df, job != 'unknown')
#bank_df <- filter(bank_df, marital != 'unknown')
#bank_df <- filter(bank_df, education != 'unknown')
#bank_df <- filter(bank_df, default != 'unknown')
#bank_df <- filter(bank_df, housing != 'unknown')
#bank_df <- filter(bank_df, loan != 'unknown')
#Convert character variables into factors (categorical variables)
bank_df$job <- as.factor(bank_df$job)
bank_df$marital <- as.factor(bank_df$marital)
bank_df$education <- as.factor(bank_df$education)
bank_df$default <- as.factor(bank_df$default)
bank_df$housing <- as.factor(bank_df$housing)
bank_df$loan <- as.factor(bank_df$loan)
bank_df$contact <- as.factor(bank_df$contact)
bank_df$month <- as.factor(bank_df$month)
bank_df$day_of_week <- as.factor(bank_df$day_of_week)
bank_df$poutcome <- as.factor(bank_df$poutcome)
bank_df$y <- as.factor(bank_df$y)
# Check for missing values
colSums(is.na(bank_df))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## campaign pdays previous poutcome emp.var.rate
## 0 0 0 0 0
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 0 0 0 0 0
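NA counts are all zero, but several categorical variables in this dataset record missing information as the level 'unknown' rather than NA. A small sketch to count those, reusing the column names above:
# Count 'unknown' entries in the categorical columns that use that level
sapply(bank_df[c('job', 'marital', 'education', 'default', 'housing', 'loan')],
       function(x) sum(x == 'unknown'))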
# Split data into training and testing sets
set.seed(42)
#createDataPartition performs stratified sampling on bank_df$y, so the training set has a class distribution similar to the full dataset
train_index <- createDataPartition(bank_df$y, p = 0.7, list = FALSE)
train_data <- bank_df[train_index, ] # 70% training
test_data <- bank_df[-train_index, ] # 30% testing
full_model <- glm(y ~ ., data = train_data, family = 'binomial')
summary(full_model)
##
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_data)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.864e+01 1.287e+02 -0.611 0.541123
## age 9.500e-03 8.122e-03 1.170 0.242123
## jobblue-collar -2.376e-01 2.692e-01 -0.882 0.377598
## jobentrepreneur -8.136e-01 5.443e-01 -1.495 0.134972
## jobhousemaid 1.596e-01 4.286e-01 0.372 0.709568
## jobmanagement -2.842e-01 2.941e-01 -0.966 0.333845
## jobretired 5.102e-02 3.501e-01 0.146 0.884121
## jobself-employed -4.166e-01 4.003e-01 -1.041 0.298090
## jobservices -2.054e-02 2.857e-01 -0.072 0.942702
## jobstudent 3.026e-02 4.242e-01 0.071 0.943121
## jobtechnician 2.814e-02 2.305e-01 0.122 0.902824
## jobunemployed 7.681e-02 4.158e-01 0.185 0.853435
## jobunknown 3.616e-02 6.423e-01 0.056 0.955108
## maritalmarried 3.465e-01 2.521e-01 1.375 0.169285
## maritalsingle 4.311e-01 2.854e-01 1.511 0.130855
## maritalunknown 5.329e-01 1.231e+00 0.433 0.665053
## educationbasic.6y 4.611e-01 3.899e-01 1.183 0.236914
## educationbasic.9y 2.306e-01 3.226e-01 0.715 0.474615
## educationhigh.school 1.650e-01 3.101e-01 0.532 0.594730
## educationprofessional.course 3.072e-01 3.367e-01 0.913 0.361505
## educationuniversity.degree 2.864e-01 3.099e-01 0.924 0.355403
## educationunknown 3.378e-01 4.022e-01 0.840 0.401071
## defaultunknown 1.412e-02 2.102e-01 0.067 0.946461
## defaultyes -9.535e+00 3.247e+02 -0.029 0.976576
## housingunknown -8.314e-01 5.778e-01 -1.439 0.150219
## housingyes -1.607e-01 1.382e-01 -1.163 0.244648
## loanunknown NA NA NA NA
## loanyes -1.228e-01 1.920e-01 -0.640 0.522213
## contacttelephone -9.202e-01 2.866e-01 -3.211 0.001324 **
## monthaug 2.546e-01 4.445e-01 0.573 0.566832
## monthdec 8.561e-01 6.548e-01 1.308 0.191038
## monthjul 9.367e-02 3.687e-01 0.254 0.799436
## monthjun 3.553e-01 4.625e-01 0.768 0.442257
## monthmar 1.915e+00 5.491e-01 3.487 0.000489 ***
## monthmay -2.362e-01 3.232e-01 -0.731 0.464855
## monthnov -6.305e-01 4.371e-01 -1.442 0.149204
## monthoct -2.822e-01 5.387e-01 -0.524 0.600411
## monthsep 1.696e-01 6.361e-01 0.267 0.789788
## day_of_weekmon 7.539e-02 2.122e-01 0.355 0.722341
## day_of_weekthu -1.823e-01 2.254e-01 -0.809 0.418778
## day_of_weektue -4.901e-02 2.247e-01 -0.218 0.827352
## day_of_weekwed 2.194e-01 2.198e-01 0.998 0.318128
## campaign -9.064e-02 4.263e-02 -2.126 0.033471 *
## pdays -3.682e-04 7.134e-04 -0.516 0.605744
## previous 1.737e-02 1.810e-01 0.096 0.923560
## poutcomenonexistent 1.684e-01 3.060e-01 0.550 0.582212
## poutcomesuccess 1.144e+00 7.059e-01 1.620 0.105213
## emp.var.rate -8.666e-01 4.949e-01 -1.751 0.079906 .
## cons.price.idx 1.060e+00 8.575e-01 1.237 0.216175
## cons.conf.idx 2.710e-02 2.754e-02 0.984 0.324989
## euribor3m 4.407e-01 4.275e-01 1.031 0.302577
## nr.employed -4.558e-03 1.025e-02 -0.445 0.656425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1993.5 on 2883 degrees of freedom
## Residual deviance: 1576.6 on 2833 degrees of freedom
## AIC: 1678.6
##
## Number of Fisher Scoring iterations: 11
The coefficient for loan = ‘unknown’ is reported as NA because of a singularity: the level is perfectly collinear with another predictor (most likely housing = ‘unknown’, since the unknown records appear to coincide), so a separate coefficient cannot be estimated. The level also has relatively few observations:
table(bank_df$loan, bank_df$y)
##
## no yes
## no 2975 374
## unknown 96 9
## yes 597 68
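To check the suspected aliasing directly, we can cross-tabulate housing against loan; the assumption (worth verifying, not shown in the original output) is that the ‘unknown’ rows coincide, which would make loanunknown redundant given housingunknown:
# If all 'unknown' loan rows fall in the 'unknown' housing row, the two levels are perfectly confounded
table(bank_df$housing, bank_df$loan)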
# Stepwise variable selection based on AIC
stepwise_model <- step(full_model, direction = 'both', trace = FALSE)
summary(stepwise_model)
##
## Call:
## glm(formula = y ~ contact + month + campaign + poutcome + emp.var.rate +
## cons.price.idx + cons.conf.idx, family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -109.93840 17.41806 -6.312 2.76e-10 ***
## contacttelephone -0.86174 0.25182 -3.422 0.000622 ***
## monthaug 0.39061 0.37951 1.029 0.303362
## monthdec 1.11904 0.59232 1.889 0.058861 .
## monthjul 0.24810 0.35508 0.699 0.484737
## monthjun 0.46880 0.33234 1.411 0.158366
## monthmar 2.10163 0.45541 4.615 3.94e-06 ***
## monthmay -0.18438 0.29464 -0.626 0.531468
## monthnov -0.34281 0.35996 -0.952 0.340921
## monthoct 0.09174 0.43825 0.209 0.834191
## monthsep 0.34474 0.48484 0.711 0.477063
## campaign -0.09212 0.04224 -2.181 0.029208 *
## poutcomenonexistent 0.19032 0.20752 0.917 0.359087
## poutcomesuccess 1.50335 0.28829 5.215 1.84e-07 ***
## emp.var.rate -0.65000 0.07538 -8.623 < 2e-16 ***
## cons.price.idx 1.17033 0.18870 6.202 5.57e-10 ***
## cons.conf.idx 0.04440 0.01816 2.446 0.014462 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1993.5 on 2883 degrees of freedom
## Residual deviance: 1597.8 on 2867 degrees of freedom
## AIC: 1631.8
##
## Number of Fisher Scoring iterations: 6
The final stepwise model includes: contact, month, campaign, poutcome, emp.var.rate, cons.price.idx, and cons.conf.idx.
levels(bank_df$contact)
## [1] "cellular" "telephone"
‘cellular’ is our baseline, so the coefficient represents the change in the log odds of subscribing when contact = ‘telephone’ compared to ‘cellular’.
When contact = ‘telephone’, the log odds of subscribing to a term deposit decrease by approximately 0.86 compared to ‘cellular’, holding all other variables constant.
levels(bank_df$month)
## [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"
‘apr’ is our baseline, so each month coefficient represents the change in log odds compared to ‘apr’.
For example, when month = ‘aug’ the log odds of subscribing to a term deposit increase by approximately 0.39 compared to ‘apr’, holding all other variables constant.
levels(bank_df$poutcome)
## [1] "failure" "nonexistent" "success"
‘failure’ is our baseline for poutcome, so the coefficients are interpreted the same way as above.
For the numerical variables, for a one-unit increase in:
emp.var.rate, the log odds of subscribing to a term deposit decrease by 0.65, holding all other variables constant.
cons.price.idx, the log odds of subscribing to a term deposit increase by 1.17, holding all other variables constant.
cons.conf.idx, the log odds of subscribing to a term deposit increase by 0.044, holding all other variables constant.
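Because raw log-odds are hard to read, the coefficients can also be exponentiated into odds ratios (a sketch, not part of the original output): for example exp(-0.86) ≈ 0.42, so telephone contact multiplies the odds of subscribing by roughly 0.42 relative to cellular.
# Odds ratios for the stepwise model: values below 1 shrink the odds, values above 1 increase them
round(exp(coef(stepwise_model)), 3)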
vif(stepwise_model)
## GVIF Df GVIF^(1/(2*Df))
## contact 2.419575 1 1.555498
## month 5.794801 9 1.102532
## campaign 1.046557 1 1.023014
## poutcome 1.452056 2 1.097731
## emp.var.rate 3.976174 1 1.994035
## cons.price.idx 3.712973 1 1.926908
## cons.conf.idx 2.427152 1 1.557932
# Evaluate the model on test data
pred_probs <- predict(stepwise_model, newdata = test_data, type = 'response')
#type = 'response' returns predicted probabilities; without it, predict() on a glm gives values on the log-odds (link) scale
# Convert probabilities to class labels, starting with the conventional 0.5 threshold
pred_class <- ifelse(pred_probs > 0.5, 'yes', 'no')
pred_class <- factor(pred_class, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix <- confusionMatrix(pred_class, test_data$y)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1091 107
## yes 9 28
##
## Accuracy : 0.9061
## 95% CI : (0.8884, 0.9218)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 0.04356
##
## Kappa : 0.2923
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9918
## Specificity : 0.2074
## Pos Pred Value : 0.9107
## Neg Pred Value : 0.7568
## Prevalence : 0.8907
## Detection Rate : 0.8834
## Detection Prevalence : 0.9700
## Balanced Accuracy : 0.5996
##
## 'Positive' Class : no
##
The model performs poorly at predicting ‘yes’, which is the class we are most interested in. Lowering the threshold from 0.5 may help.
#ROC curve: used to find a threshold that balances sensitivity and specificity
roc_obj <- roc(test_data$y, pred_probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj)
#coords(roc_obj, "best", ret = "threshold")
pROC::coords(
roc_obj,
"best",
ret = c("threshold", "sensitivity", "specificity")
)
## threshold sensitivity specificity
## 1 0.09224494 0.7333333 0.82
pROC finds the optimal threshold to be approximately 0.092 (using Youden’s J, the default criterion for “best”).
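By default coords(roc_obj, "best") picks the threshold maximising Youden's J (sensitivity + specificity - 1). A sketch making that explicit, plus the alternative closest-to-top-left criterion, in case a different trade-off is preferred:
# Same optimum with the selection criterion spelled out
pROC::coords(roc_obj, "best", best.method = "youden",
             ret = c("threshold", "sensitivity", "specificity"))
# Alternative: the point closest to the top-left corner of the ROC plot
pROC::coords(roc_obj, "best", best.method = "closest.topleft",
             ret = c("threshold", "sensitivity", "specificity"))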
pred_probs2 <- predict(stepwise_model, newdata = test_data, type = 'response')
pred_class2 <- ifelse(pred_probs2 > 0.09224494, 'yes', 'no')
pred_class2 <- factor(pred_class2, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix2 <- confusionMatrix(pred_class2, test_data$y)
print(conf_matrix2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 902 36
## yes 198 99
##
## Accuracy : 0.8105
## 95% CI : (0.7875, 0.832)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3625
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8200
## Specificity : 0.7333
## Pos Pred Value : 0.9616
## Neg Pred Value : 0.3333
## Prevalence : 0.8907
## Detection Rate : 0.7304
## Detection Prevalence : 0.7595
## Balanced Accuracy : 0.7767
##
## 'Positive' Class : no
##
These are the best results so far: at the 0.092 threshold the model now correctly identifies about 73% of the ‘yes’ cases (the specificity, since ‘no’ is the positive class), at the cost of overall accuracy dropping from about 0.91 to 0.81.
pred_probs3 <- predict(stepwise_model, newdata = test_data, type = 'response')
pred_class3 <- ifelse(pred_probs3 > 0.1, 'yes', 'no')
pred_class3 <- factor(pred_class3, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix3 <- confusionMatrix(pred_class3, test_data$y)
print(conf_matrix3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 929 40
## yes 171 95
##
## Accuracy : 0.8291
## 95% CI : (0.807, 0.8497)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3846
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8445
## Specificity : 0.7037
## Pos Pred Value : 0.9587
## Neg Pred Value : 0.3571
## Prevalence : 0.8907
## Detection Rate : 0.7522
## Detection Prevalence : 0.7846
## Balanced Accuracy : 0.7741
##
## 'Positive' Class : no
##
A threshold of 0.1 gains a little accuracy and sensitivity but loses some specificity compared with 0.092; which threshold to use is debatable and depends on how much we value catching ‘yes’ cases.
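Since the choice is debatable, a small sweep makes the comparison explicit (a sketch; it reuses pred_probs from the stepwise logistic model, which still holds those probabilities at this point):
# Balanced accuracy of the logistic model at a few candidate thresholds
thresholds <- c(0.05, 0.092, 0.10, 0.15, 0.20)
sapply(thresholds, function(t) {
  pred <- factor(ifelse(pred_probs > t, 'yes', 'no'), levels = c('no', 'yes'))
  caret::confusionMatrix(pred, test_data$y)$byClass['Balanced Accuracy']
})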
library(xgboost)
## Warning: package 'xgboost' was built under R version 4.4.3
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Convert categorical variables into a numeric model matrix (one-hot encoding); [, -1] drops the intercept column
x_train <- model.matrix(y ~ ., data = train_data)[, -1]
y_train <- as.numeric(train_data$y) - 1 # Convert to 0 (no) and 1 (yes)
x_test <- model.matrix(y ~ ., data = test_data)[, -1]
y_test <- as.numeric(test_data$y) - 1
# Set parameters
params <- list(
objective = "binary:logistic", # Binary classification
eval_metric = "logloss", # Logarithmic loss metric
eta = 0.1, # Learning rate
max_depth = 6, # Depth of trees
subsample = 0.8, # Subsample ratio
colsample_bytree = 0.8 # Feature sampling
)
# Train the model
xgb_model <- xgboost(
data = x_train, label = y_train,
params = params,
nrounds = 100, # Number of boosting rounds
verbose = 0 # 0 = silent, 1 = shows log-loss for each round
)
# Predict probabilities
pred_probs <- predict(xgb_model, x_test)
# Convert probabilities to binary predictions using the lowered 0.1 threshold (as with the tuned logistic model)
pred_class <- ifelse(pred_probs > 0.1, 1, 0)
# Convert back to factors for confusion matrix
pred_class <- factor(pred_class, levels = c(0, 1), labels = c("no", "yes"))
y_test <- factor(y_test, levels = c(0, 1), labels = c("no", "yes"))
confusionMatrix(pred_class, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 904 46
## yes 196 89
##
## Accuracy : 0.804
## 95% CI : (0.7808, 0.8258)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3234
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8218
## Specificity : 0.6593
## Pos Pred Value : 0.9516
## Neg Pred Value : 0.3123
## Prevalence : 0.8907
## Detection Rate : 0.7320
## Detection Prevalence : 0.7692
## Balanced Accuracy : 0.7405
##
## 'Positive' Class : no
##
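To see which features drive the XGBoost predictions, the package's gain-based importance table can be inspected (a sketch using the model fitted above):
# Feature importance by gain; the top rows are the most influential predictors
importance <- xgb.importance(feature_names = colnames(x_train), model = xgb_model)
head(importance, 10)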
# Balance the classes by undersampling the majority class ('no')
d1_yes <- bank_df |>
  filter(y == 'yes')
d1_no <- bank_df |>
  filter(y == 'no')
# Randomly draw as many 'no' rows as there are 'yes' rows (note: this sample_n() call is not seeded)
sample_no_cust <- sample_n(d1_no, nrow(d1_yes))
df_bal <- rbind(d1_yes, sample_no_cust)
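A quick sanity check that the undersampled data frame is balanced, and how many rows remain:
# Expect equal counts of 'no' and 'yes'
table(df_bal$y)
nrow(df_bal)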
# Split data into training and testing sets
set.seed(42)
#createDataPartition performs stratified sampling on df_bal$y, so the training set keeps the balanced class distribution
train_index_balanced_df <- createDataPartition(df_bal$y, p = 0.7, list = FALSE)
train_data_balanced_df <- df_bal[train_index_balanced_df, ] # 70% training
test_data_balanced_df <- df_bal[-train_index_balanced_df, ] # 30% testing
full_model_balanced_df <- glm(y ~ ., data = train_data_balanced_df, family = 'binomial')
#summary(full_model_balanced_df)
# Stepwise variable selection based on AIC
stepwise_model_balanced_df <- step(full_model_balanced_df, direction = 'both', trace = FALSE)
summary(stepwise_model_balanced_df)
##
## Call:
## glm(formula = y ~ age + loan + month + day_of_week + pdays +
## emp.var.rate + cons.price.idx, family = "binomial", data = train_data_balanced_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.543e+02 4.009e+01 -3.850 0.000118 ***
## age 1.756e-02 9.260e-03 1.896 0.057939 .
## loanunknown -8.073e-01 5.656e-01 -1.427 0.153529
## loanyes -7.930e-01 2.647e-01 -2.996 0.002734 **
## monthaug 1.166e+00 5.665e-01 2.058 0.039594 *
## monthdec 1.536e+01 7.033e+02 0.022 0.982578
## monthjul 5.058e-01 4.953e-01 1.021 0.307210
## monthjun -2.441e-01 4.759e-01 -0.513 0.608079
## monthmar 1.186e+00 6.998e-01 1.695 0.090065 .
## monthmay -5.972e-01 4.218e-01 -1.416 0.156761
## monthnov 1.419e-01 5.268e-01 0.269 0.787649
## monthoct 5.891e-01 7.755e-01 0.760 0.447470
## monthsep -3.515e-01 7.244e-01 -0.485 0.627558
## day_of_weekmon 5.396e-02 3.102e-01 0.174 0.861903
## day_of_weekthu -6.368e-01 3.112e-01 -2.047 0.040696 *
## day_of_weektue -4.396e-01 3.157e-01 -1.393 0.163722
## day_of_weekwed -4.840e-02 3.153e-01 -0.153 0.878011
## pdays -1.154e-03 4.720e-04 -2.444 0.014522 *
## emp.var.rate -9.239e-01 1.569e-01 -5.889 3.88e-09 ***
## cons.price.idx 1.654e+00 4.270e-01 3.874 0.000107 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 876.14 on 631 degrees of freedom
## Residual deviance: 677.19 on 612 degrees of freedom
## AIC: 717.19
##
## Number of Fisher Scoring iterations: 15
vif(stepwise_model_balanced_df)
## GVIF Df GVIF^(1/(2*Df))
## age 1.058281 1 1.028728
## loan 1.089451 2 1.021649
## month 5.002665 9 1.093565
## day_of_week 1.184229 4 1.021361
## pdays 1.196894 1 1.094027
## emp.var.rate 7.917902 1 2.813877
## cons.price.idx 7.631056 1 2.762437
pred_probs_balanced_df <- predict(stepwise_model_balanced_df,
newdata = test_data_balanced_df, type = 'response')
pred_class_balanced_df <- ifelse(pred_probs_balanced_df > 0.5 , 'yes', 'no')
pred_class_balanced_df <- factor(pred_class_balanced_df, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix_balanced_df <- confusionMatrix(pred_class_balanced_df, test_data_balanced_df$y)
print(conf_matrix_balanced_df)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 111 45
## yes 24 90
##
## Accuracy : 0.7444
## 95% CI : (0.6881, 0.7954)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.4889
##
## Mcnemar's Test P-Value : 0.01605
##
## Sensitivity : 0.8222
## Specificity : 0.6667
## Pos Pred Value : 0.7115
## Neg Pred Value : 0.7895
## Prevalence : 0.5000
## Detection Rate : 0.4111
## Detection Prevalence : 0.5778
## Balanced Accuracy : 0.7444
##
## 'Positive' Class : no
##
roc_obj2 <- roc(test_data_balanced_df$y, pred_probs_balanced_df)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj2)
pROC::coords(roc_obj2, "best", ret = "threshold")
## threshold
## 1 0.4665221
pROC reports an optimal threshold of about 0.467 here; the classification below uses a slightly lower threshold of 0.4230439.
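As before, the sensitivity and specificity at the reported optimum can be requested in the same call (a sketch mirroring the earlier coords() usage):
pROC::coords(roc_obj2, "best",
             ret = c("threshold", "sensitivity", "specificity"))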
pred_probs_balanced_df2 <- predict(stepwise_model_balanced_df,
newdata = test_data_balanced_df, type = 'response')
pred_class_balanced_df2 <- ifelse(pred_probs_balanced_df2 > 0.4230439 , 'yes', 'no')
pred_class_balanced_df2 <- factor(pred_class_balanced_df2, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix_balanced_df2 <- confusionMatrix(pred_class_balanced_df2, test_data_balanced_df$y)
print(conf_matrix_balanced_df2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 89 34
## yes 46 101
##
## Accuracy : 0.7037
## 95% CI : (0.6454, 0.7575)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : 8.646e-12
##
## Kappa : 0.4074
##
## Mcnemar's Test P-Value : 0.2188
##
## Sensitivity : 0.6593
## Specificity : 0.7481
## Pos Pred Value : 0.7236
## Neg Pred Value : 0.6871
## Prevalence : 0.5000
## Detection Rate : 0.3296
## Detection Prevalence : 0.4556
## Balanced Accuracy : 0.7037
##
## 'Positive' Class : no
##
lda_model_stepwise_balanced_df <- lda(y ~ age + contact + month + campaign + pdays +
emp.var.rate + cons.price.idx + cons.conf.idx,
data = train_data_balanced_df)
lda_model_stepwise_balanced_df
## Call:
## lda(y ~ age + contact + month + campaign + pdays + emp.var.rate +
## cons.price.idx + cons.conf.idx, data = train_data_balanced_df)
##
## Prior probabilities of groups:
## no yes
## 0.5 0.5
##
## Group means:
## age contacttelephone monthaug monthdec monthjul monthjun
## no 39.30380 0.3829114 0.1297468 0.00000000 0.1930380 0.1392405
## yes 41.84177 0.1835443 0.1329114 0.03481013 0.1329114 0.1613924
## monthmar monthmay monthnov monthoct monthsep campaign pdays
## no 0.01265823 0.3765823 0.08544304 0.009493671 0.01265823 2.471519 977.0253
## yes 0.06329114 0.2025316 0.10126582 0.041139241 0.05379747 2.082278 782.0823
## emp.var.rate cons.price.idx cons.conf.idx
## no 0.2344937 93.61041 -40.80063
## yes -1.0689873 93.44499 -39.87184
##
## Coefficients of linear discriminants:
## LD1
## age 0.0132671935
## contacttelephone -0.4020287707
## monthaug 0.4010579566
## monthdec 1.1068714359
## monthjul 0.1318796019
## monthjun -0.0029915901
## monthmar 0.9109459816
## monthmay -0.5447458193
## monthnov -0.1069661686
## monthoct 0.2892776418
## monthsep -0.1134861056
## campaign -0.0344123678
## pdays -0.0005878534
## emp.var.rate -0.6921495522
## cons.price.idx 1.1472420578
## cons.conf.idx 0.0225206926
contacttelephone (-0.402): if a customer was contacted via telephone, they were more likely to say “no.”
monthmar (0.911): being contacted in March increases the likelihood of a “yes.”
emp.var.rate (-0.692): a higher employment variation rate decreases the chance of “yes,” meaning people are more likely to accept the offer when employment conditions are poor.
cons.price.idx (1.147): a higher consumer price index increases the likelihood of a “yes.”
pdays (-0.0006): higher values of pdays (a longer time since the last contact) slightly decrease the probability of a “yes.”
lda_predictions <- predict(lda_model_stepwise_balanced_df, test_data_balanced_df)
caret::confusionMatrix(as.factor(lda_predictions$class), test_data_balanced_df$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 115 45
## yes 20 90
##
## Accuracy : 0.7593
## 95% CI : (0.7037, 0.809)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5185
##
## Mcnemar's Test P-Value : 0.002912
##
## Sensitivity : 0.8519
## Specificity : 0.6667
## Pos Pred Value : 0.7188
## Neg Pred Value : 0.8182
## Prevalence : 0.5000
## Detection Rate : 0.4259
## Detection Prevalence : 0.5926
## Balanced Accuracy : 0.7593
##
## 'Positive' Class : no
##
#lda_predictions$class contains the predicted class labels returned by predict()
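predict() on an lda object also returns posterior class probabilities, so the LDA model can be put on the same ROC/AUC footing as the logistic models (a sketch, assuming the objects above are still in scope):
# ROC curve for the LDA model using the posterior probability of 'yes'
lda_roc <- pROC::roc(test_data_balanced_df$y, lda_predictions$posterior[, 'yes'],
                     levels = c('no', 'yes'), direction = '<')
pROC::auc(lda_roc)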