#Load libraries
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
#Load dataset
bank_df <- read.csv('/Users/ponce/Desktop/DA-6813/Case Study 1/bank-additional.csv',
sep = ';')
#Remove duration from dataset (it is only known after the call ends, so it would leak the outcome)
bank_df <- subset(bank_df, select = -c(duration))
#bank_df <- filter(bank_df, job != 'unknown')
#bank_df <- filter(bank_df, marital != 'unknown')
#bank_df <- filter(bank_df, education != 'unknown')
#bank_df <- filter(bank_df, default != 'unknown')
#bank_df <- filter(bank_df, housing != 'unknown')
#bank_df <- filter(bank_df, loan != 'unknown')
#Convert character variables into categorical
bank_df$job <- as.factor(bank_df$job)
bank_df$marital <- as.factor(bank_df$marital)
bank_df$education <- as.factor(bank_df$education)
bank_df$default <- as.factor(bank_df$default)
bank_df$housing <- as.factor(bank_df$housing)
bank_df$loan <- as.factor(bank_df$loan)
bank_df$contact <- as.factor(bank_df$contact)
bank_df$month <- as.factor(bank_df$month)
bank_df$day_of_week <- as.factor(bank_df$day_of_week)
bank_df$poutcome <- as.factor(bank_df$poutcome)
bank_df$y <- as.factor(bank_df$y)
# Check for missing values
colSums(is.na(bank_df))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## campaign pdays previous poutcome emp.var.rate
## 0 0 0 0 0
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 0 0 0 0 0
# Split data into training and testing sets
set.seed(42)
# createDataPartition on bank_df$y performs stratified sampling, so the training set keeps a class distribution similar to the full dataset
train_index <- createDataPartition(bank_df$y, p = 0.7, list = FALSE)
train_data <- bank_df[train_index, ] # 70% training
test_data <- bank_df[-train_index, ] # 30% testing
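As a quick sanity check (a minimal sketch; exact proportions depend on the split above), the class proportions in the training set should mirror the full dataset:
# Sketch: compare class proportions before and after the stratified split
round(prop.table(table(bank_df$y)), 3)
round(prop.table(table(train_data$y)), 3)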
full_model <- glm(y ~ ., data = train_data, family = 'binomial')
summary(full_model)
##
## Call:
## glm(formula = y ~ ., family = "binomial", data = train_data)
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.864e+01 1.287e+02 -0.611 0.541123
## age 9.500e-03 8.122e-03 1.170 0.242123
## jobblue-collar -2.376e-01 2.692e-01 -0.882 0.377598
## jobentrepreneur -8.136e-01 5.443e-01 -1.495 0.134972
## jobhousemaid 1.596e-01 4.286e-01 0.372 0.709568
## jobmanagement -2.842e-01 2.941e-01 -0.966 0.333845
## jobretired 5.102e-02 3.501e-01 0.146 0.884121
## jobself-employed -4.166e-01 4.003e-01 -1.041 0.298090
## jobservices -2.054e-02 2.857e-01 -0.072 0.942702
## jobstudent 3.026e-02 4.242e-01 0.071 0.943121
## jobtechnician 2.814e-02 2.305e-01 0.122 0.902824
## jobunemployed 7.681e-02 4.158e-01 0.185 0.853435
## jobunknown 3.616e-02 6.423e-01 0.056 0.955108
## maritalmarried 3.465e-01 2.521e-01 1.375 0.169285
## maritalsingle 4.311e-01 2.854e-01 1.511 0.130855
## maritalunknown 5.329e-01 1.231e+00 0.433 0.665053
## educationbasic.6y 4.611e-01 3.899e-01 1.183 0.236914
## educationbasic.9y 2.306e-01 3.226e-01 0.715 0.474615
## educationhigh.school 1.650e-01 3.101e-01 0.532 0.594730
## educationprofessional.course 3.072e-01 3.367e-01 0.913 0.361505
## educationuniversity.degree 2.864e-01 3.099e-01 0.924 0.355403
## educationunknown 3.378e-01 4.022e-01 0.840 0.401071
## defaultunknown 1.412e-02 2.102e-01 0.067 0.946461
## defaultyes -9.535e+00 3.247e+02 -0.029 0.976576
## housingunknown -8.314e-01 5.778e-01 -1.439 0.150219
## housingyes -1.607e-01 1.382e-01 -1.163 0.244648
## loanunknown NA NA NA NA
## loanyes -1.228e-01 1.920e-01 -0.640 0.522213
## contacttelephone -9.202e-01 2.866e-01 -3.211 0.001324 **
## monthaug 2.546e-01 4.445e-01 0.573 0.566832
## monthdec 8.561e-01 6.548e-01 1.308 0.191038
## monthjul 9.367e-02 3.687e-01 0.254 0.799436
## monthjun 3.553e-01 4.625e-01 0.768 0.442257
## monthmar 1.915e+00 5.491e-01 3.487 0.000489 ***
## monthmay -2.362e-01 3.232e-01 -0.731 0.464855
## monthnov -6.305e-01 4.371e-01 -1.442 0.149204
## monthoct -2.822e-01 5.387e-01 -0.524 0.600411
## monthsep 1.696e-01 6.361e-01 0.267 0.789788
## day_of_weekmon 7.539e-02 2.122e-01 0.355 0.722341
## day_of_weekthu -1.823e-01 2.254e-01 -0.809 0.418778
## day_of_weektue -4.901e-02 2.247e-01 -0.218 0.827352
## day_of_weekwed 2.194e-01 2.198e-01 0.998 0.318128
## campaign -9.064e-02 4.263e-02 -2.126 0.033471 *
## pdays -3.682e-04 7.134e-04 -0.516 0.605744
## previous 1.737e-02 1.810e-01 0.096 0.923560
## poutcomenonexistent 1.684e-01 3.060e-01 0.550 0.582212
## poutcomesuccess 1.144e+00 7.059e-01 1.620 0.105213
## emp.var.rate -8.666e-01 4.949e-01 -1.751 0.079906 .
## cons.price.idx 1.060e+00 8.575e-01 1.237 0.216175
## cons.conf.idx 2.710e-02 2.754e-02 0.984 0.324989
## euribor3m 4.407e-01 4.275e-01 1.031 0.302577
## nr.employed -4.558e-03 1.025e-02 -0.445 0.656425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1993.5 on 2883 degrees of freedom
## Residual deviance: 1576.6 on 2833 degrees of freedom
## AIC: 1678.6
##
## Number of Fisher Scoring iterations: 11
The ‘unknown’ level of loan is sparse and perfectly collinear with another predictor (most likely the ‘unknown’ level of housing, since the same clients are typically missing both), so glm drops it and reports NA for its coefficient.
table(bank_df$loan, bank_df$y)
##
## no yes
## no 2975 374
## unknown 96 9
## yes 597 68
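One way to confirm and handle this (a minimal sketch, not applied to the results above) is to check whether the ‘unknown’ rows coincide across loan and housing and, if desired, drop them before refitting:
# Sketch: do the 'unknown' rows for loan and housing coincide?
table(loan_unknown = bank_df$loan == 'unknown', housing_unknown = bank_df$housing == 'unknown')
# One option is to drop those rows and the empty factor level before refitting
# bank_known <- droplevels(subset(bank_df, loan != 'unknown'))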
# Stepwise variable selection based on AIC
stepwise_model <- step(full_model, direction = 'both', trace = FALSE)
summary(stepwise_model)
##
## Call:
## glm(formula = y ~ contact + month + campaign + poutcome + emp.var.rate +
## cons.price.idx + cons.conf.idx, family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -109.93840 17.41806 -6.312 2.76e-10 ***
## contacttelephone -0.86174 0.25182 -3.422 0.000622 ***
## monthaug 0.39061 0.37951 1.029 0.303362
## monthdec 1.11904 0.59232 1.889 0.058861 .
## monthjul 0.24810 0.35508 0.699 0.484737
## monthjun 0.46880 0.33234 1.411 0.158366
## monthmar 2.10163 0.45541 4.615 3.94e-06 ***
## monthmay -0.18438 0.29464 -0.626 0.531468
## monthnov -0.34281 0.35996 -0.952 0.340921
## monthoct 0.09174 0.43825 0.209 0.834191
## monthsep 0.34474 0.48484 0.711 0.477063
## campaign -0.09212 0.04224 -2.181 0.029208 *
## poutcomenonexistent 0.19032 0.20752 0.917 0.359087
## poutcomesuccess 1.50335 0.28829 5.215 1.84e-07 ***
## emp.var.rate -0.65000 0.07538 -8.623 < 2e-16 ***
## cons.price.idx 1.17033 0.18870 6.202 5.57e-10 ***
## cons.conf.idx 0.04440 0.01816 2.446 0.014462 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1993.5 on 2883 degrees of freedom
## Residual deviance: 1597.8 on 2867 degrees of freedom
## AIC: 1631.8
##
## Number of Fisher Scoring iterations: 6
Final stepwise model includes:
contact
month
campaign
poutcome
emp.var.rate
cons.price.idx
cons.conf.idx
levels(bank_df$contact)
## [1] "cellular" "telephone"
‘cellular’ is the baseline level, so the coefficient for contacttelephone represents the change in the log odds of subscribing when contact = ‘telephone’ rather than ‘cellular’.
When contact = ‘telephone’, the log odds of subscribing to a term deposit decrease by approximately 0.862 compared to ‘cellular’, holding the other predictors constant.
levels(bank_df$month)
## [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"
‘apr’ is the baseline level, so each month coefficient represents the change in log odds relative to April.
For example, when month = ‘aug’ the log odds of subscribing to a term deposit increase by approximately 0.39 compared to ‘apr’, holding the other predictors constant.
levels(bank_df$poutcome)
## [1] "failure" "nonexistent" "success"
‘failure’ is the baseline level for poutcome, so its coefficients are interpreted the same way as above.
For the numeric variables, a one-unit increase in:
emp.var.rate decreases the log odds of subscribing to a term deposit by 0.65, all other variables held constant.
cons.price.idx increases the log odds by 1.17, all other variables held constant.
cons.conf.idx increases the log odds by 0.044, all other variables held constant.
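These log-odds effects can be easier to read as odds ratios (a minimal sketch using the fitted stepwise model):
# Exponentiate the coefficients to get odds ratios
exp(coef(stepwise_model))
For example, exp(-0.862) ≈ 0.42, so the odds of subscribing when contacted by telephone are roughly 42% of the odds when contacted by cellular.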
vif(stepwise_model)
## GVIF Df GVIF^(1/(2*Df))
## contact 2.419575 1 1.555498
## month 5.794801 9 1.102532
## campaign 1.046557 1 1.023014
## poutcome 1.452056 2 1.097731
## emp.var.rate 3.976174 1 1.994035
## cons.price.idx 3.712973 1 1.926908
## cons.conf.idx 2.427152 1 1.557932
# Evaluate the model on test data
pred_probs <- predict(stepwise_model, newdata = test_data, type = 'response')
# type = 'response' returns predicted probabilities; the logistic model itself is fitted on the log-odds scale
# Convert probabilities to class labels, starting with the default 0.5 threshold
pred_class <- ifelse(pred_probs > 0.5, 'yes', 'no')
pred_class <- factor(pred_class, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix <- confusionMatrix(pred_class, test_data$y)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1091 107
## yes 9 28
##
## Accuracy : 0.9061
## 95% CI : (0.8884, 0.9218)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 0.04356
##
## Kappa : 0.2923
##
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.9918
## Specificity : 0.2074
## Pos Pred Value : 0.9107
## Neg Pred Value : 0.7568
## Prevalence : 0.8907
## Detection Rate : 0.8834
## Detection Prevalence : 0.9700
## Balanced Accuracy : 0.5996
##
## 'Positive' Class : no
##
The model performs poorly at predicting ‘yes’, which is the class we care most about (specificity is only 0.21 with ‘no’ as the positive class). Lowering the 0.5 threshold may help.
# ROC curve; coords(..., 'best') finds the threshold that balances sensitivity and specificity (Youden's J by default)
roc_obj <- roc(test_data$y, pred_probs)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj)
coords(roc_obj, "best", ret = "threshold")
## threshold
## 1 0.09224494
pROC finds the optimal threshold to be 0.09224494
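Rather than copying that value by hand, the threshold can be captured programmatically (a minimal sketch; recent versions of pROC return coords() as a data frame):
# Sketch: store the Youden-optimal threshold and reuse it instead of hard-coding the number
best_thr <- coords(roc_obj, "best", ret = "threshold")$threshold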
pred_probs2 <- predict(stepwise_model, newdata = test_data, type = 'response')
pred_class2 <- ifelse(pred_probs2 > 0.09224494, 'yes', 'no')
pred_class2 <- factor(pred_class2, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix2 <- confusionMatrix(pred_class2, test_data$y)
print(conf_matrix2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 902 36
## yes 198 99
##
## Accuracy : 0.8105
## 95% CI : (0.7875, 0.832)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3625
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8200
## Specificity : 0.7333
## Pos Pred Value : 0.9616
## Neg Pred Value : 0.3333
## Prevalence : 0.8907
## Detection Rate : 0.7304
## Detection Prevalence : 0.7595
## Balanced Accuracy : 0.7767
##
## 'Positive' Class : no
##
These are the best-balanced results so far: far more ‘yes’ cases are caught (specificity 0.73), at the cost of some overall accuracy.
pred_probs3 <- predict(stepwise_model, newdata = test_data, type = 'response')
pred_class3 <- ifelse(pred_probs3 > 0.1, 'yes', 'no')
pred_class3 <- factor(pred_class3, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix3 <- confusionMatrix(pred_class3, test_data$y)
print(conf_matrix3)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 929 40
## yes 171 95
##
## Accuracy : 0.8291
## 95% CI : (0.807, 0.8497)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3846
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8445
## Specificity : 0.7037
## Pos Pred Value : 0.9587
## Neg Pred Value : 0.3571
## Prevalence : 0.8907
## Detection Rate : 0.7522
## Detection Prevalence : 0.7846
## Balanced Accuracy : 0.7741
##
## 'Positive' Class : no
##
A threshold of 0.1 gains a little accuracy and sensitivity but loses some specificity, so the choice between the two thresholds depends on which error matters more.
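To make that trade-off explicit, one option (a minimal sketch, not part of the results above) is to ask pROC for sensitivity and specificity at a grid of candidate thresholds:
# Sketch: compare candidate thresholds side by side
coords(roc_obj, x = c(0.05, 0.0922, 0.1, 0.15, 0.5), input = "threshold",
       ret = c("threshold", "sensitivity", "specificity"))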
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:dplyr':
##
## slice
# Convert categorical variables to numerical matrix
x_train <- model.matrix(y ~ ., data = train_data)[, -1]
y_train <- as.numeric(train_data$y) - 1 # Convert to 0 (no) and 1 (yes)
x_test <- model.matrix(y ~ ., data = test_data)[, -1]
y_test <- as.numeric(test_data$y) - 1
# Set parameters
params <- list(
objective = "binary:logistic", # Binary classification
eval_metric = "logloss", # Logarithmic loss metric
eta = 0.1, # Learning rate
max_depth = 6, # Depth of trees
subsample = 0.8, # Subsample ratio
colsample_bytree = 0.8 # Feature sampling
)
# Train the model
xgb_model <- xgboost(
data = x_train, label = y_train,
params = params,
nrounds = 100, # Number of boosting rounds
verbose = 0 # 0 = silent, 1 = shows log-loss for each round
)
# Predict probabilities
pred_probs <- predict(xgb_model, x_test)
#pROC threshold for XGBoost
roc_curve <- roc(y_test, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
coords(roc_curve, 'best', ret = 'threshold')
## threshold
## 1 0.1145365
Best Threshold = 0.1145365
# Convert probabilities to binary predictions using the pROC-selected threshold
pred_class <- ifelse(pred_probs > 0.1145365, 1, 0)
# Convert back to factors for confusion matrix
pred_class <- factor(pred_class, levels = c(0, 1), labels = c("no", "yes"))
y_test <- factor(y_test, levels = c(0, 1), labels = c("no", "yes"))
confusionMatrix(pred_class, y_test)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 942 45
## yes 158 90
##
## Accuracy : 0.8356
## 95% CI : (0.8138, 0.8559)
## No Information Rate : 0.8907
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3826
##
## Mcnemar's Test P-Value : 3.815e-15
##
## Sensitivity : 0.8564
## Specificity : 0.6667
## Pos Pred Value : 0.9544
## Neg Pred Value : 0.3629
## Prevalence : 0.8907
## Detection Rate : 0.7628
## Detection Prevalence : 0.7992
## Balanced Accuracy : 0.7615
##
## 'Positive' Class : no
##
# Balance the classes by random undersampling: keep all 'yes' rows and
# sample an equal number of 'no' rows (note: sample_n is random; set a seed beforehand for a reproducible undersample)
d1_yes <- bank_df |>
  filter(y == 'yes')
d1_no <- bank_df |>
  filter(y == 'no')
sample_no_cust <- sample_n(d1_no, nrow(d1_yes))
df_bal <- rbind(d1_yes, sample_no_cust)
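A quick check (a minimal sketch) that the undersampled data frame is balanced:
# Both classes should now have the same number of rows
table(df_bal$y)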
# Split data into training and testing sets
set.seed(42)
# createDataPartition on df_bal$y performs stratified sampling, so the training and test sets keep the balanced class distribution
train_index_balanced_df <- createDataPartition(df_bal$y, p = 0.7, list = FALSE)
train_data_balanced_df <- df_bal[train_index_balanced_df, ] # 70% training
test_data_balanced_df <- df_bal[-train_index_balanced_df, ] # 30% testing
full_model_balanced_df <- glm(y ~ ., data = train_data_balanced_df, family = 'binomial')
#summary(full_model_balanced_df)
# Stepwise variable selection based on AIC
stepwise_model_balanced_df <- step(full_model_balanced_df, direction = 'both', trace = FALSE)
summary(stepwise_model_balanced_df)
##
## Call:
## glm(formula = y ~ contact + month + campaign + pdays + emp.var.rate +
## cons.price.idx + cons.conf.idx, family = "binomial", data = train_data_balanced_df)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.388e+02 3.460e+01 -4.011 6.04e-05 ***
## contacttelephone -7.994e-01 3.487e-01 -2.292 0.0219 *
## monthaug 1.035e-01 6.219e-01 0.166 0.8679
## monthdec 1.506e+01 7.086e+02 0.021 0.9830
## monthjul 1.248e-01 5.119e-01 0.244 0.8074
## monthjun 1.273e-01 4.800e-01 0.265 0.7908
## monthmar 1.918e+00 8.603e-01 2.230 0.0258 *
## monthmay -3.991e-01 4.185e-01 -0.954 0.3403
## monthnov -5.629e-01 5.091e-01 -1.106 0.2689
## monthoct 2.109e-01 8.358e-01 0.252 0.8008
## monthsep 1.321e-01 9.232e-01 0.143 0.8862
## campaign -7.857e-02 4.612e-02 -1.704 0.0884 .
## pdays -1.125e-03 4.763e-04 -2.361 0.0182 *
## emp.var.rate -7.407e-01 1.347e-01 -5.501 3.79e-08 ***
## cons.price.idx 1.523e+00 3.691e-01 4.126 3.69e-05 ***
## cons.conf.idx 5.869e-02 2.985e-02 1.966 0.0492 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 876.14 on 631 degrees of freedom
## Residual deviance: 679.96 on 616 degrees of freedom
## AIC: 711.96
##
## Number of Fisher Scoring iterations: 15
vif(stepwise_model_balanced_df)
## GVIF Df GVIF^(1/(2*Df))
## contact 2.890229 1 1.700067
## month 8.469356 9 1.126023
## campaign 1.036432 1 1.018053
## pdays 1.164511 1 1.079125
## emp.var.rate 5.661669 1 2.379426
## cons.price.idx 5.377469 1 2.318937
## cons.conf.idx 2.232940 1 1.494303
pred_probs_balanced_df <- predict(stepwise_model_balanced_df,
newdata = test_data_balanced_df, type = 'response')
pred_class_balanced_df <- ifelse(pred_probs_balanced_df > 0.5 , 'yes', 'no')
pred_class_balanced_df <- factor(pred_class_balanced_df, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix_balanced_df <- confusionMatrix(pred_class_balanced_df, test_data_balanced_df$y)
print(conf_matrix_balanced_df)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 116 47
## yes 19 88
##
## Accuracy : 0.7556
## 95% CI : (0.6998, 0.8056)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5111
##
## Mcnemar's Test P-Value : 0.000889
##
## Sensitivity : 0.8593
## Specificity : 0.6519
## Pos Pred Value : 0.7117
## Neg Pred Value : 0.8224
## Prevalence : 0.5000
## Detection Rate : 0.4296
## Detection Prevalence : 0.6037
## Balanced Accuracy : 0.7556
##
## 'Positive' Class : no
##
roc_obj2 <- roc(test_data_balanced_df$y, pred_probs_balanced_df)
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
plot(roc_obj2)
coords(roc_obj2, "best", ret = "threshold")
## threshold
## 1 0.4230439
Optimal threshold = 0.4230439
pred_probs_balanced_df2 <- predict(stepwise_model_balanced_df,
newdata = test_data_balanced_df, type = 'response')
pred_class_balanced_df2 <- ifelse(pred_probs_balanced_df2 > 0.4230439 , 'yes', 'no')
pred_class_balanced_df2 <- factor(pred_class_balanced_df2, levels = c('no', 'yes'))
# Confusion matrix
conf_matrix_balanced_df2 <- confusionMatrix(pred_class_balanced_df2, test_data_balanced_df$y)
print(conf_matrix_balanced_df2)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 104 33
## yes 31 102
##
## Accuracy : 0.763
## 95% CI : (0.7076, 0.8124)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5259
##
## Mcnemar's Test P-Value : 0.9005
##
## Sensitivity : 0.7704
## Specificity : 0.7556
## Pos Pred Value : 0.7591
## Neg Pred Value : 0.7669
## Prevalence : 0.5000
## Detection Rate : 0.3852
## Detection Prevalence : 0.5074
## Balanced Accuracy : 0.7630
##
## 'Positive' Class : no
##
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
lda_model_stepwise_balanced_df <- lda(y ~ age + contact + month + campaign + pdays +
emp.var.rate + cons.price.idx + cons.conf.idx,
data = train_data_balanced_df)
lda_model_stepwise_balanced_df
## Call:
## lda(y ~ age + contact + month + campaign + pdays + emp.var.rate +
## cons.price.idx + cons.conf.idx, data = train_data_balanced_df)
##
## Prior probabilities of groups:
## no yes
## 0.5 0.5
##
## Group means:
## age contacttelephone monthaug monthdec monthjul monthjun
## no 39.56646 0.3544304 0.1550633 0.00000000 0.1962025 0.1044304
## yes 41.84177 0.1835443 0.1329114 0.03481013 0.1329114 0.1613924
## monthmar monthmay monthnov monthoct monthsep campaign pdays
## no 0.006329114 0.3449367 0.1329114 0.009493671 0.006329114 2.731013 976.9399
## yes 0.063291139 0.2025316 0.1012658 0.041139241 0.053797468 2.082278 782.0823
## emp.var.rate cons.price.idx cons.conf.idx
## no 0.2357595 93.56728 -40.78101
## yes -1.0689873 93.44499 -39.87184
##
## Coefficients of linear discriminants:
## LD1
## age 0.0074060259
## contacttelephone -0.5428783452
## monthaug 0.1617802933
## monthdec 0.9969463418
## monthjul 0.1882184386
## monthjun 0.2099341992
## monthmar 1.1901913553
## monthmay -0.3955212307
## monthnov -0.4681373725
## monthoct 0.2895821931
## monthsep 0.2140202697
## campaign -0.0589307340
## pdays -0.0005710463
## emp.var.rate -0.6626569249
## cons.price.idx 1.1480788225
## cons.conf.idx 0.0369509061
contacttelephone (-0.5429): If a customer was contacted via telephone, they were more likely to say “no.”
monthmar (1.1902): Being contacted in March increases the likelihood of a “yes.”
emp.var.rate (-0.6627): A higher employment variation rate decreases the chance of “yes,” meaning people are more likely to accept the offer when employment conditions are poor.
cons.price.idx (1.1481): A higher consumer price index increases the likelihood of a “yes.”
pdays (-0.00057): Larger values of pdays slightly decrease the probability of a “yes”; note that pdays = 999 encodes clients who were never previously contacted, so this effect largely reflects the contrast between previously contacted and never-contacted clients.
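To see how these coefficients translate into group separation, the discriminant scores themselves can be inspected (a minimal sketch; predict.lda() returns them in the $x component):
# Sketch: mean LD1 score by class on the test set; the two means should be well separated
lda_test_scores <- predict(lda_model_stepwise_balanced_df, test_data_balanced_df)$x
tapply(lda_test_scores[, 1], test_data_balanced_df$y, mean)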
lda_predictions <- predict(lda_model_stepwise_balanced_df, test_data_balanced_df)
caret::confusionMatrix(as.factor(lda_predictions$class), test_data_balanced_df$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 116 46
## yes 19 89
##
## Accuracy : 0.7593
## 95% CI : (0.7037, 0.809)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5185
##
## Mcnemar's Test P-Value : 0.00126
##
## Sensitivity : 0.8593
## Specificity : 0.6593
## Pos Pred Value : 0.7160
## Neg Pred Value : 0.8241
## Prevalence : 0.5000
## Detection Rate : 0.4296
## Detection Prevalence : 0.6000
## Balanced Accuracy : 0.7593
##
## 'Positive' Class : no
##
# lda_predictions$class holds the predicted class labels returned by predict()
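predict() also returns posterior probabilities, so the same threshold tuning applied to the logistic models could be tried on the LDA model (a minimal sketch; the 0.45 cutoff is just an example, not a tuned value):
# Sketch: classify from the posterior probability of 'yes' with a custom threshold
lda_probs_yes <- lda_predictions$posterior[, 'yes']
lda_class_custom <- factor(ifelse(lda_probs_yes > 0.45, 'yes', 'no'), levels = c('no', 'yes'))
caret::confusionMatrix(lda_class_custom, test_data_balanced_df$y)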