To estimate or predict whether the client will subscribe a bank term deposit
# Setting the working directory and Loading Data and required libraries
setwd("G:\\IIMK DABS\\Session Files\\24-07-22 (Logistic Regression)")
getwd()
data <- read.csv ("bank-full.csv",sep = ";", stringsAsFactors = TRUE)
data <- subset(data, select = -c(job, month))
library(dplyr)
library(fastDummies)
library(pROC)
library(car)
library(caret)
library(ROSE)
# Creating Dummy columns for categorical data and converting dependent variable as factor
data <- dummy_cols(data, remove_first_dummy = TRUE, remove_selected_columns = TRUE)
data$y_yes <- as.factor(data$y_yes)
data$y_yes <- relevel(data$y_yes, ref = "0")
# Creating model using Logistic Regression
model1 <- glm (y_yes~., family = binomial, data = data)
# Getting summary of model
summary(model1)
##
## Call:
## glm(formula = y_yes ~ ., family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.7191 -0.4122 -0.2755 -0.1634 3.4803
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.859e+00 1.556e-01 -18.372 < 2e-16 ***
## age 6.413e-03 1.827e-03 3.509 0.000449 ***
## balance 1.760e-05 4.813e-06 3.657 0.000255 ***
## day -4.558e-03 2.134e-03 -2.136 0.032680 *
## duration 4.025e-03 6.236e-05 64.544 < 2e-16 ***
## campaign -1.125e-01 9.966e-03 -11.288 < 2e-16 ***
## pdays 1.001e-04 2.992e-04 0.335 0.737896
## previous 1.067e-02 6.567e-03 1.624 0.104272
## marital_married -1.848e-01 5.709e-02 -3.237 0.001210 **
## marital_single 2.327e-01 6.455e-02 3.605 0.000312 ***
## education_secondary 2.138e-01 5.895e-02 3.627 0.000287 ***
## education_tertiary 4.122e-01 6.164e-02 6.688 2.26e-11 ***
## education_unknown 3.585e-01 9.694e-02 3.698 0.000217 ***
## default_yes -2.203e-01 1.616e-01 -1.363 0.172909
## housing_yes -8.333e-01 3.860e-02 -21.590 < 2e-16 ***
## loan_yes -5.939e-01 5.796e-02 -10.246 < 2e-16 ***
## contact_telephone -3.510e-02 7.142e-02 -0.492 0.623072
## contact_unknown -1.180e+00 5.757e-02 -20.488 < 2e-16 ***
## poutcome_other 2.686e-01 8.622e-02 3.115 0.001838 **
## poutcome_success 2.306e+00 7.860e-02 29.335 < 2e-16 ***
## poutcome_unknown -2.774e-01 9.011e-02 -3.079 0.002078 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32631 on 45210 degrees of freedom
## Residual deviance: 22772 on 45190 degrees of freedom
## AIC: 22814
##
## Number of Fisher Scoring iterations: 6
# Obtaining the coefficients of the independent variables.
coef(model1)
## (Intercept) age balance day
## -2.8594329823 0.0064128081 0.0000176022 -0.0045580996
## duration campaign pdays previous
## 0.0040251982 -0.1125047516 0.0001001259 0.0106676005
## marital_married marital_single education_secondary education_tertiary
## -0.1847679357 0.2327085638 0.2138082074 0.4122357707
## education_unknown default_yes housing_yes loan_yes
## 0.3584795509 -0.2202930144 -0.8333283801 -0.5938966145
## contact_telephone contact_unknown poutcome_other poutcome_success
## -0.0351029784 -1.1795249357 0.2685837701 2.3057468764
## poutcome_unknown
## -0.2774302888
# Converting the lnodds format of coefficients to odds ratio
exp(coef(model1))
## (Intercept) age balance day
## 0.05730124 1.00643341 1.00001760 0.99545227
## duration campaign pdays previous
## 1.00403331 0.89359310 1.00010013 1.01072470
## marital_married marital_single education_secondary education_tertiary
## 0.83129718 1.26201363 1.23838512 1.51019045
## education_unknown default_yes housing_yes loan_yes
## 1.43115177 0.80228368 0.43460036 0.55217149
## contact_telephone contact_unknown poutcome_other poutcome_success
## 0.96550598 0.30742475 1.30811055 10.03166787
## poutcome_unknown
## 0.75772838
# Checking multicollinearity using car package
vif(model1)
## age balance day duration
## 1.493934 1.029421 1.022904 1.089086
## campaign pdays previous marital_married
## 1.041521 3.677143 1.303081 2.634189
## marital_single education_secondary education_tertiary education_unknown
## 3.072608 2.817591 2.841184 1.349767
## default_yes housing_yes loan_yes contact_telephone
## 1.011078 1.161237 1.024785 1.073805
## contact_unknown poutcome_other poutcome_success poutcome_unknown
## 1.123322 1.385048 1.655188 5.386445
# Stpwise Regression for identifying significant variables
model2 <- step(model1, trace = 0)
summary(model2)
##
## Call:
## glm(formula = y_yes ~ age + balance + day + duration + campaign +
## previous + marital_married + marital_single + education_secondary +
## education_tertiary + education_unknown + housing_yes + loan_yes +
## contact_unknown + poutcome_other + poutcome_success + poutcome_unknown,
## family = binomial, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.7255 -0.4124 -0.2757 -0.1635 3.4842
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.834e+00 1.378e-01 -20.572 < 2e-16 ***
## age 6.252e-03 1.796e-03 3.482 0.000497 ***
## balance 1.783e-05 4.802e-06 3.713 0.000204 ***
## day -4.620e-03 2.133e-03 -2.166 0.030302 *
## duration 4.026e-03 6.234e-05 64.584 < 2e-16 ***
## campaign -1.128e-01 9.946e-03 -11.343 < 2e-16 ***
## previous 1.054e-02 6.525e-03 1.615 0.106409
## marital_married -1.846e-01 5.703e-02 -3.237 0.001208 **
## marital_single 2.315e-01 6.442e-02 3.594 0.000325 ***
## education_secondary 2.146e-01 5.890e-02 3.644 0.000268 ***
## education_tertiary 4.140e-01 6.146e-02 6.737 1.62e-11 ***
## education_unknown 3.583e-01 9.695e-02 3.695 0.000220 ***
## housing_yes -8.306e-01 3.821e-02 -21.739 < 2e-16 ***
## loan_yes -5.997e-01 5.781e-02 -10.373 < 2e-16 ***
## contact_unknown -1.178e+00 5.731e-02 -20.552 < 2e-16 ***
## poutcome_other 2.658e-01 8.609e-02 3.088 0.002014 **
## poutcome_success 2.300e+00 7.605e-02 30.249 < 2e-16 ***
## poutcome_unknown -3.028e-01 5.749e-02 -5.268 1.38e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 32631 on 45210 degrees of freedom
## Residual deviance: 22775 on 45193 degrees of freedom
## AIC: 22811
##
## Number of Fisher Scoring iterations: 6
exp(coef(model2))
## (Intercept) age balance day
## 0.05878022 1.00627196 1.00001783 0.99539101
## duration campaign previous marital_married
## 1.00403444 0.89331216 1.01059089 0.83143926
## marital_single education_secondary education_tertiary education_unknown
## 1.26053916 1.23941825 1.51291505 1.43083661
## housing_yes loan_yes contact_unknown poutcome_other
## 0.43578486 0.54897599 0.30795248 1.30453846
## poutcome_success poutcome_unknown
## 9.97875886 0.73871157
# Splitting data into train and test data using caret package to maintain the proportion
set.seed (1234)
index <- createDataPartition(data$y_yes, p = .80, list = FALSE)
train <- data[index,]
test <- data [-index,]
# Training the data using train data and using stepwise regression to identify significant variables
model3 <- glm (y_yes~., family = binomial, data = train)
summary(model3)
##
## Call:
## glm(formula = y_yes ~ ., family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7760 -0.4122 -0.2739 -0.1622 3.5182
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.912e+00 1.740e-01 -16.739 < 2e-16 ***
## age 7.242e-03 2.044e-03 3.543 0.000395 ***
## balance 1.994e-05 5.458e-06 3.653 0.000259 ***
## day -4.947e-03 2.388e-03 -2.072 0.038296 *
## duration 4.009e-03 6.936e-05 57.794 < 2e-16 ***
## campaign -1.159e-01 1.127e-02 -10.283 < 2e-16 ***
## pdays 3.650e-05 3.350e-04 0.109 0.913229
## previous 6.174e-03 6.227e-03 0.991 0.321447
## marital_married -1.230e-01 6.426e-02 -1.914 0.055641 .
## marital_single 2.961e-01 7.258e-02 4.080 4.50e-05 ***
## education_secondary 2.411e-01 6.623e-02 3.641 0.000271 ***
## education_tertiary 4.375e-01 6.917e-02 6.326 2.52e-10 ***
## education_unknown 4.002e-01 1.083e-01 3.694 0.000221 ***
## default_yes -2.806e-01 1.821e-01 -1.541 0.123326
## housing_yes -8.626e-01 4.327e-02 -19.933 < 2e-16 ***
## loan_yes -5.516e-01 6.423e-02 -8.587 < 2e-16 ***
## contact_telephone -1.235e-01 8.131e-02 -1.519 0.128873
## contact_unknown -1.161e+00 6.438e-02 -18.033 < 2e-16 ***
## poutcome_other 2.994e-01 9.488e-02 3.156 0.001601 **
## poutcome_success 2.305e+00 8.805e-02 26.172 < 2e-16 ***
## poutcome_unknown -3.279e-01 1.001e-01 -3.274 0.001061 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26108 on 36169 degrees of freedom
## Residual deviance: 18181 on 36149 degrees of freedom
## AIC: 18223
##
## Number of Fisher Scoring iterations: 6
model4 <- step(model3, trace = 0)
summary(model4)
##
## Call:
## glm(formula = y_yes ~ age + balance + day + duration + campaign +
## marital_married + marital_single + education_secondary +
## education_tertiary + education_unknown + default_yes + housing_yes +
## loan_yes + contact_telephone + contact_unknown + poutcome_other +
## poutcome_success + poutcome_unknown, family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7755 -0.4122 -0.2740 -0.1622 3.5154
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.886e+00 1.533e-01 -18.823 < 2e-16 ***
## age 7.246e-03 2.043e-03 3.546 0.000391 ***
## balance 1.995e-05 5.457e-06 3.656 0.000256 ***
## day -5.011e-03 2.386e-03 -2.100 0.035701 *
## duration 4.009e-03 6.936e-05 57.794 < 2e-16 ***
## campaign -1.155e-01 1.125e-02 -10.265 < 2e-16 ***
## marital_married -1.229e-01 6.423e-02 -1.913 0.055746 .
## marital_single 2.963e-01 7.257e-02 4.083 4.45e-05 ***
## education_secondary 2.413e-01 6.623e-02 3.643 0.000269 ***
## education_tertiary 4.381e-01 6.912e-02 6.337 2.34e-10 ***
## education_unknown 4.000e-01 1.083e-01 3.692 0.000223 ***
## default_yes -2.801e-01 1.821e-01 -1.538 0.124031
## housing_yes -8.614e-01 4.289e-02 -20.082 < 2e-16 ***
## loan_yes -5.513e-01 6.423e-02 -8.584 < 2e-16 ***
## contact_telephone -1.226e-01 8.130e-02 -1.507 0.131700
## contact_unknown -1.161e+00 6.436e-02 -18.041 < 2e-16 ***
## poutcome_other 3.055e-01 9.445e-02 3.235 0.001217 **
## poutcome_success 2.303e+00 8.513e-02 27.057 < 2e-16 ***
## poutcome_unknown -3.541e-01 6.088e-02 -5.816 6.02e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 26108 on 36169 degrees of freedom
## Residual deviance: 18182 on 36151 degrees of freedom
## AIC: 18220
##
## Number of Fisher Scoring iterations: 6
# Predicting test data using the model created, converting the predicted probabilities to class
predicted <- predict (model4, newdata = test, type = "response")
test$predicted <- predicted
test$class <- ifelse (test$predicted >= .5, 1, 0)
test$class <- factor (test$class)
test$y_yes <- factor (test$y_yes)
# Creating confusion matrix
confusionMatrix(test$class, test$y_yes, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7814 729
## 1 170 328
##
## Accuracy : 0.9006
## 95% CI : (0.8942, 0.9067)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 6.829e-08
##
## Kappa : 0.3751
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.31031
## Specificity : 0.97871
## Pos Pred Value : 0.65863
## Neg Pred Value : 0.91467
## Prevalence : 0.11691
## Detection Rate : 0.03628
## Detection Prevalence : 0.05508
## Balanced Accuracy : 0.64451
##
## 'Positive' Class : 1
##
The sensitivity value is very low. Sensitivity is the True positive rate. The value of sensitivity is low because the data is imbalanced. The proportion of event class and non-event class in the dependent variable is not same. In order to balance that, data balancing needs to be done. This can be done using ROSE package and creating over-sampled, under-sampled and ‘both’ sampled data
# Creating Over sampled, Under sampled and 'both' data
set.seed(1234)
over<- ovun.sample(y_yes~., data = train, method = "over", N = 63836)$data
set.seed(1234)
under<- ovun.sample(y_yes~., data = train, method = "under", N = 8464)$data
both<- ovun.sample(y_yes~., data = train, method = "both", p = .50, seed = 1234, N = 36170)$data
# Creating model using over sampled data, predicting using the model and creating confusion matrix
model5 <- glm (y_yes~., family = binomial, data = over)
predicted1 <- predict (model5, newdata = test, type = "response")
class1 <- ifelse (predicted1 >= .50, 1, 0)
class1 <- factor (class1)
confusionMatrix(class1, test$y_yes, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6687 234
## 1 1297 823
##
## Accuracy : 0.8307
## 95% CI : (0.8228, 0.8383)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.429
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.77862
## Specificity : 0.83755
## Pos Pred Value : 0.38821
## Neg Pred Value : 0.96619
## Prevalence : 0.11691
## Detection Rate : 0.09103
## Detection Prevalence : 0.23449
## Balanced Accuracy : 0.80808
##
## 'Positive' Class : 1
##
# Creating model using under sampled data, predicting using the model and creating confusion matrix
model6 <- glm (y_yes~., family = binomial, data = under)
predicted2 <- predict (model6, newdata = test, type = "response")
class2 <- ifelse (predicted2 >= .50, 1, 0)
class2 <- factor (class2)
confusionMatrix(class2, test$y_yes, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6708 246
## 1 1276 811
##
## Accuracy : 0.8317
## 95% CI : (0.8238, 0.8393)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.427
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7673
## Specificity : 0.8402
## Pos Pred Value : 0.3886
## Neg Pred Value : 0.9646
## Prevalence : 0.1169
## Detection Rate : 0.0897
## Detection Prevalence : 0.2308
## Balanced Accuracy : 0.8037
##
## 'Positive' Class : 1
##
# Creating model using both sampled data, predicting using the model and creating confusion matrix
model7 <- glm (y_yes~., family = binomial, data = both)
predicted3 <- predict (model7, newdata = test, type = "response")
class3 <- ifelse (predicted3 >= .50, 1, 0)
class3 <- factor (class3)
confusionMatrix(class3, test$y_yes, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 6700 243
## 1 1284 814
##
## Accuracy : 0.8311
## 95% CI : (0.8232, 0.8388)
## No Information Rate : 0.8831
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4269
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.77010
## Specificity : 0.83918
## Pos Pred Value : 0.38799
## Neg Pred Value : 0.96500
## Prevalence : 0.11691
## Detection Rate : 0.09003
## Detection Prevalence : 0.23205
## Balanced Accuracy : 0.80464
##
## 'Positive' Class : 1
##
# Using library proc, creating ROC curve and calculatng AUCvalues for all 3 models
roc(test$y_yes, predicted1, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="green4", lwd=2, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = test$y_yes, predictor = predicted1, percent = TRUE, plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage", ylab = "True Postive Percentage", col = "green4", lwd = 2, print.auc = TRUE)
##
## Data: predicted1 in 7984 controls (test$y_yes 0) < 1057 cases (test$y_yes 1).
## Area under the curve: 89.06%
plot.roc(test$y_yes, predicted2, percent=TRUE, col="red", lwd=2, print.auc=TRUE, add=TRUE, print.auc.y=30)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(test$y_yes, predicted3, percent=TRUE, col ="blue", lwd=2,print.auc=TRUE, add=TRUE,print.auc.y=40)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("topright", bg ="transparent", legend=c("Over sampled", "Under sampled", "Both"), col=c("green4", "red","blue"), cex = .5,lwd=2)
