Problem Statement

To estimate or predict whether the client will subscribe a bank term deposit

# Setting the working directory and Loading Data and required libraries
setwd("G:\\IIMK DABS\\Session Files\\24-07-22 (Logistic Regression)")
getwd()
data <- read.csv ("bank-full.csv",sep = ";", stringsAsFactors = TRUE)
data <- subset(data, select = -c(job, month))
library(dplyr)
library(fastDummies)
library(pROC)
library(car)
library(caret)
library(ROSE)
# Creating Dummy columns for categorical data and converting dependent variable as factor
data <- dummy_cols(data, remove_first_dummy = TRUE, remove_selected_columns = TRUE)
data$y_yes <- as.factor(data$y_yes)
data$y_yes <- relevel(data$y_yes, ref = "0")
# Creating model using Logistic Regression
model1 <- glm (y_yes~., family = binomial, data = data)
# Getting summary of model
summary(model1)
## 
## Call:
## glm(formula = y_yes ~ ., family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.7191  -0.4122  -0.2755  -0.1634   3.4803  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.859e+00  1.556e-01 -18.372  < 2e-16 ***
## age                  6.413e-03  1.827e-03   3.509 0.000449 ***
## balance              1.760e-05  4.813e-06   3.657 0.000255 ***
## day                 -4.558e-03  2.134e-03  -2.136 0.032680 *  
## duration             4.025e-03  6.236e-05  64.544  < 2e-16 ***
## campaign            -1.125e-01  9.966e-03 -11.288  < 2e-16 ***
## pdays                1.001e-04  2.992e-04   0.335 0.737896    
## previous             1.067e-02  6.567e-03   1.624 0.104272    
## marital_married     -1.848e-01  5.709e-02  -3.237 0.001210 ** 
## marital_single       2.327e-01  6.455e-02   3.605 0.000312 ***
## education_secondary  2.138e-01  5.895e-02   3.627 0.000287 ***
## education_tertiary   4.122e-01  6.164e-02   6.688 2.26e-11 ***
## education_unknown    3.585e-01  9.694e-02   3.698 0.000217 ***
## default_yes         -2.203e-01  1.616e-01  -1.363 0.172909    
## housing_yes         -8.333e-01  3.860e-02 -21.590  < 2e-16 ***
## loan_yes            -5.939e-01  5.796e-02 -10.246  < 2e-16 ***
## contact_telephone   -3.510e-02  7.142e-02  -0.492 0.623072    
## contact_unknown     -1.180e+00  5.757e-02 -20.488  < 2e-16 ***
## poutcome_other       2.686e-01  8.622e-02   3.115 0.001838 ** 
## poutcome_success     2.306e+00  7.860e-02  29.335  < 2e-16 ***
## poutcome_unknown    -2.774e-01  9.011e-02  -3.079 0.002078 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32631  on 45210  degrees of freedom
## Residual deviance: 22772  on 45190  degrees of freedom
## AIC: 22814
## 
## Number of Fisher Scoring iterations: 6
# Obtaining the coefficients of the independent variables. 
coef(model1)
##         (Intercept)                 age             balance                 day 
##       -2.8594329823        0.0064128081        0.0000176022       -0.0045580996 
##            duration            campaign               pdays            previous 
##        0.0040251982       -0.1125047516        0.0001001259        0.0106676005 
##     marital_married      marital_single education_secondary  education_tertiary 
##       -0.1847679357        0.2327085638        0.2138082074        0.4122357707 
##   education_unknown         default_yes         housing_yes            loan_yes 
##        0.3584795509       -0.2202930144       -0.8333283801       -0.5938966145 
##   contact_telephone     contact_unknown      poutcome_other    poutcome_success 
##       -0.0351029784       -1.1795249357        0.2685837701        2.3057468764 
##    poutcome_unknown 
##       -0.2774302888
# Converting the lnodds format of coefficients to odds ratio
exp(coef(model1))
##         (Intercept)                 age             balance                 day 
##          0.05730124          1.00643341          1.00001760          0.99545227 
##            duration            campaign               pdays            previous 
##          1.00403331          0.89359310          1.00010013          1.01072470 
##     marital_married      marital_single education_secondary  education_tertiary 
##          0.83129718          1.26201363          1.23838512          1.51019045 
##   education_unknown         default_yes         housing_yes            loan_yes 
##          1.43115177          0.80228368          0.43460036          0.55217149 
##   contact_telephone     contact_unknown      poutcome_other    poutcome_success 
##          0.96550598          0.30742475          1.30811055         10.03166787 
##    poutcome_unknown 
##          0.75772838
# Checking multicollinearity using car package
vif(model1)
##                 age             balance                 day            duration 
##            1.493934            1.029421            1.022904            1.089086 
##            campaign               pdays            previous     marital_married 
##            1.041521            3.677143            1.303081            2.634189 
##      marital_single education_secondary  education_tertiary   education_unknown 
##            3.072608            2.817591            2.841184            1.349767 
##         default_yes         housing_yes            loan_yes   contact_telephone 
##            1.011078            1.161237            1.024785            1.073805 
##     contact_unknown      poutcome_other    poutcome_success    poutcome_unknown 
##            1.123322            1.385048            1.655188            5.386445
# Stpwise Regression for identifying significant variables
model2 <- step(model1, trace = 0)
summary(model2)
## 
## Call:
## glm(formula = y_yes ~ age + balance + day + duration + campaign + 
##     previous + marital_married + marital_single + education_secondary + 
##     education_tertiary + education_unknown + housing_yes + loan_yes + 
##     contact_unknown + poutcome_other + poutcome_success + poutcome_unknown, 
##     family = binomial, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.7255  -0.4124  -0.2757  -0.1635   3.4842  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.834e+00  1.378e-01 -20.572  < 2e-16 ***
## age                  6.252e-03  1.796e-03   3.482 0.000497 ***
## balance              1.783e-05  4.802e-06   3.713 0.000204 ***
## day                 -4.620e-03  2.133e-03  -2.166 0.030302 *  
## duration             4.026e-03  6.234e-05  64.584  < 2e-16 ***
## campaign            -1.128e-01  9.946e-03 -11.343  < 2e-16 ***
## previous             1.054e-02  6.525e-03   1.615 0.106409    
## marital_married     -1.846e-01  5.703e-02  -3.237 0.001208 ** 
## marital_single       2.315e-01  6.442e-02   3.594 0.000325 ***
## education_secondary  2.146e-01  5.890e-02   3.644 0.000268 ***
## education_tertiary   4.140e-01  6.146e-02   6.737 1.62e-11 ***
## education_unknown    3.583e-01  9.695e-02   3.695 0.000220 ***
## housing_yes         -8.306e-01  3.821e-02 -21.739  < 2e-16 ***
## loan_yes            -5.997e-01  5.781e-02 -10.373  < 2e-16 ***
## contact_unknown     -1.178e+00  5.731e-02 -20.552  < 2e-16 ***
## poutcome_other       2.658e-01  8.609e-02   3.088 0.002014 ** 
## poutcome_success     2.300e+00  7.605e-02  30.249  < 2e-16 ***
## poutcome_unknown    -3.028e-01  5.749e-02  -5.268 1.38e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 32631  on 45210  degrees of freedom
## Residual deviance: 22775  on 45193  degrees of freedom
## AIC: 22811
## 
## Number of Fisher Scoring iterations: 6
exp(coef(model2))
##         (Intercept)                 age             balance                 day 
##          0.05878022          1.00627196          1.00001783          0.99539101 
##            duration            campaign            previous     marital_married 
##          1.00403444          0.89331216          1.01059089          0.83143926 
##      marital_single education_secondary  education_tertiary   education_unknown 
##          1.26053916          1.23941825          1.51291505          1.43083661 
##         housing_yes            loan_yes     contact_unknown      poutcome_other 
##          0.43578486          0.54897599          0.30795248          1.30453846 
##    poutcome_success    poutcome_unknown 
##          9.97875886          0.73871157
# Splitting data into train and test data using caret package to maintain the proportion
set.seed (1234)
index <- createDataPartition(data$y_yes, p = .80, list = FALSE)
train <- data[index,]
test <- data [-index,]
# Training the data using train data and using stepwise regression to identify significant variables
model3 <- glm (y_yes~., family = binomial, data = train)
summary(model3)
## 
## Call:
## glm(formula = y_yes ~ ., family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7760  -0.4122  -0.2739  -0.1622   3.5182  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.912e+00  1.740e-01 -16.739  < 2e-16 ***
## age                  7.242e-03  2.044e-03   3.543 0.000395 ***
## balance              1.994e-05  5.458e-06   3.653 0.000259 ***
## day                 -4.947e-03  2.388e-03  -2.072 0.038296 *  
## duration             4.009e-03  6.936e-05  57.794  < 2e-16 ***
## campaign            -1.159e-01  1.127e-02 -10.283  < 2e-16 ***
## pdays                3.650e-05  3.350e-04   0.109 0.913229    
## previous             6.174e-03  6.227e-03   0.991 0.321447    
## marital_married     -1.230e-01  6.426e-02  -1.914 0.055641 .  
## marital_single       2.961e-01  7.258e-02   4.080 4.50e-05 ***
## education_secondary  2.411e-01  6.623e-02   3.641 0.000271 ***
## education_tertiary   4.375e-01  6.917e-02   6.326 2.52e-10 ***
## education_unknown    4.002e-01  1.083e-01   3.694 0.000221 ***
## default_yes         -2.806e-01  1.821e-01  -1.541 0.123326    
## housing_yes         -8.626e-01  4.327e-02 -19.933  < 2e-16 ***
## loan_yes            -5.516e-01  6.423e-02  -8.587  < 2e-16 ***
## contact_telephone   -1.235e-01  8.131e-02  -1.519 0.128873    
## contact_unknown     -1.161e+00  6.438e-02 -18.033  < 2e-16 ***
## poutcome_other       2.994e-01  9.488e-02   3.156 0.001601 ** 
## poutcome_success     2.305e+00  8.805e-02  26.172  < 2e-16 ***
## poutcome_unknown    -3.279e-01  1.001e-01  -3.274 0.001061 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 26108  on 36169  degrees of freedom
## Residual deviance: 18181  on 36149  degrees of freedom
## AIC: 18223
## 
## Number of Fisher Scoring iterations: 6
model4 <- step(model3, trace = 0)
summary(model4)
## 
## Call:
## glm(formula = y_yes ~ age + balance + day + duration + campaign + 
##     marital_married + marital_single + education_secondary + 
##     education_tertiary + education_unknown + default_yes + housing_yes + 
##     loan_yes + contact_telephone + contact_unknown + poutcome_other + 
##     poutcome_success + poutcome_unknown, family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7755  -0.4122  -0.2740  -0.1622   3.5154  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -2.886e+00  1.533e-01 -18.823  < 2e-16 ***
## age                  7.246e-03  2.043e-03   3.546 0.000391 ***
## balance              1.995e-05  5.457e-06   3.656 0.000256 ***
## day                 -5.011e-03  2.386e-03  -2.100 0.035701 *  
## duration             4.009e-03  6.936e-05  57.794  < 2e-16 ***
## campaign            -1.155e-01  1.125e-02 -10.265  < 2e-16 ***
## marital_married     -1.229e-01  6.423e-02  -1.913 0.055746 .  
## marital_single       2.963e-01  7.257e-02   4.083 4.45e-05 ***
## education_secondary  2.413e-01  6.623e-02   3.643 0.000269 ***
## education_tertiary   4.381e-01  6.912e-02   6.337 2.34e-10 ***
## education_unknown    4.000e-01  1.083e-01   3.692 0.000223 ***
## default_yes         -2.801e-01  1.821e-01  -1.538 0.124031    
## housing_yes         -8.614e-01  4.289e-02 -20.082  < 2e-16 ***
## loan_yes            -5.513e-01  6.423e-02  -8.584  < 2e-16 ***
## contact_telephone   -1.226e-01  8.130e-02  -1.507 0.131700    
## contact_unknown     -1.161e+00  6.436e-02 -18.041  < 2e-16 ***
## poutcome_other       3.055e-01  9.445e-02   3.235 0.001217 ** 
## poutcome_success     2.303e+00  8.513e-02  27.057  < 2e-16 ***
## poutcome_unknown    -3.541e-01  6.088e-02  -5.816 6.02e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 26108  on 36169  degrees of freedom
## Residual deviance: 18182  on 36151  degrees of freedom
## AIC: 18220
## 
## Number of Fisher Scoring iterations: 6
# Predicting test data using the model created, converting the predicted probabilities to class
predicted <- predict (model4, newdata = test, type = "response")
test$predicted <- predicted
test$class <- ifelse (test$predicted >= .5, 1, 0)
test$class <- factor (test$class)
test$y_yes <- factor (test$y_yes)
# Creating confusion matrix
confusionMatrix(test$class, test$y_yes, positive  = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7814  729
##          1  170  328
##                                           
##                Accuracy : 0.9006          
##                  95% CI : (0.8942, 0.9067)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 6.829e-08       
##                                           
##                   Kappa : 0.3751          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.31031         
##             Specificity : 0.97871         
##          Pos Pred Value : 0.65863         
##          Neg Pred Value : 0.91467         
##              Prevalence : 0.11691         
##          Detection Rate : 0.03628         
##    Detection Prevalence : 0.05508         
##       Balanced Accuracy : 0.64451         
##                                           
##        'Positive' Class : 1               
## 

The sensitivity value is very low. Sensitivity is the True positive rate. The value of sensitivity is low because the data is imbalanced. The proportion of event class and non-event class in the dependent variable is not same. In order to balance that, data balancing needs to be done. This can be done using ROSE package and creating over-sampled, under-sampled and ‘both’ sampled data

# Creating Over sampled, Under sampled and 'both' data
set.seed(1234)
over<- ovun.sample(y_yes~., data = train, method = "over", N = 63836)$data

set.seed(1234)
under<- ovun.sample(y_yes~., data = train, method = "under", N = 8464)$data

both<- ovun.sample(y_yes~., data = train, method = "both", p = .50, seed = 1234, N = 36170)$data
# Creating model using over sampled data, predicting using the model and creating confusion matrix
model5 <- glm (y_yes~., family = binomial, data = over)
predicted1 <- predict (model5, newdata = test, type = "response")
class1 <- ifelse (predicted1 >= .50, 1, 0)
class1 <- factor (class1)
confusionMatrix(class1, test$y_yes, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6687  234
##          1 1297  823
##                                           
##                Accuracy : 0.8307          
##                  95% CI : (0.8228, 0.8383)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.429           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.77862         
##             Specificity : 0.83755         
##          Pos Pred Value : 0.38821         
##          Neg Pred Value : 0.96619         
##              Prevalence : 0.11691         
##          Detection Rate : 0.09103         
##    Detection Prevalence : 0.23449         
##       Balanced Accuracy : 0.80808         
##                                           
##        'Positive' Class : 1               
## 
# Creating model using under sampled data, predicting using the model and creating confusion matrix
model6 <- glm (y_yes~., family = binomial, data = under)
predicted2 <- predict (model6, newdata = test, type = "response")
class2 <- ifelse (predicted2 >= .50, 1, 0)
class2 <- factor (class2)
confusionMatrix(class2, test$y_yes, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6708  246
##          1 1276  811
##                                           
##                Accuracy : 0.8317          
##                  95% CI : (0.8238, 0.8393)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.427           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7673          
##             Specificity : 0.8402          
##          Pos Pred Value : 0.3886          
##          Neg Pred Value : 0.9646          
##              Prevalence : 0.1169          
##          Detection Rate : 0.0897          
##    Detection Prevalence : 0.2308          
##       Balanced Accuracy : 0.8037          
##                                           
##        'Positive' Class : 1               
## 
# Creating model using both sampled data, predicting using the model and creating confusion matrix

model7 <- glm (y_yes~., family = binomial, data = both)
predicted3 <- predict (model7, newdata = test, type = "response")
class3 <- ifelse (predicted3 >= .50, 1, 0)
class3 <- factor (class3)
confusionMatrix(class3, test$y_yes, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 6700  243
##          1 1284  814
##                                           
##                Accuracy : 0.8311          
##                  95% CI : (0.8232, 0.8388)
##     No Information Rate : 0.8831          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4269          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.77010         
##             Specificity : 0.83918         
##          Pos Pred Value : 0.38799         
##          Neg Pred Value : 0.96500         
##              Prevalence : 0.11691         
##          Detection Rate : 0.09003         
##    Detection Prevalence : 0.23205         
##       Balanced Accuracy : 0.80464         
##                                           
##        'Positive' Class : 1               
## 
# Using library proc, creating ROC curve and calculatng AUCvalues for all 3 models
roc(test$y_yes, predicted1, plot=TRUE, legacy.axes=TRUE, percent=TRUE, xlab="False Positive Percentage", ylab="True Postive Percentage", col="green4", lwd=2, print.auc=TRUE)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = test$y_yes, predictor = predicted1, percent = TRUE,     plot = TRUE, legacy.axes = TRUE, xlab = "False Positive Percentage",     ylab = "True Postive Percentage", col = "green4", lwd = 2,     print.auc = TRUE)
## 
## Data: predicted1 in 7984 controls (test$y_yes 0) < 1057 cases (test$y_yes 1).
## Area under the curve: 89.06%
plot.roc(test$y_yes, predicted2, percent=TRUE, col="red", lwd=2, print.auc=TRUE, add=TRUE, print.auc.y=30)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot.roc(test$y_yes, predicted3, percent=TRUE, col ="blue", lwd=2,print.auc=TRUE, add=TRUE,print.auc.y=40)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
legend("topright", bg ="transparent", legend=c("Over sampled", "Under sampled", "Both"), col=c("green4", "red","blue"), cex = .5,lwd=2)