Intro

In this assignment we are performing a logistic regression on the Bank Marketing Data Set to figure out whether a client will subscribe a term deposit based on variable “y”.

Importing Data

Importing the data using the “read.csv” function was not successful and led to some errors in data. After doing some research the data was imported as a table using the “read.table” function and the table headers were assigned based on the first line of the data, separated by “;”.

bank_data <- read.table("bank-additional-full.csv", head = TRUE, sep = ";", stringsAsFactors=T)

#Table headers
names(bank_data)
##  [1] "age"            "job"            "marital"        "education"     
##  [5] "default"        "housing"        "loan"           "contact"       
##  [9] "month"          "day_of_week"    "duration"       "campaign"      
## [13] "pdays"          "previous"       "poutcome"       "emp.var.rate"  
## [17] "cons.price.idx" "cons.conf.idx"  "euribor3m"      "nr.employed"   
## [21] "y"
summary(bank_data)
##       age                 job            marital     
##  Min.   :17.00   admin.     :10422   divorced: 4612  
##  1st Qu.:32.00   blue-collar: 9254   married :24928  
##  Median :38.00   technician : 6743   single  :11568  
##  Mean   :40.02   services   : 3969   unknown :   80  
##  3rd Qu.:47.00   management : 2924                   
##  Max.   :98.00   retired    : 1720                   
##                  (Other)    : 6156                   
##                education        default         housing           loan      
##  university.degree  :12168   no     :32588   no     :18622   no     :33950  
##  high.school        : 9515   unknown: 8597   unknown:  990   unknown:  990  
##  basic.9y           : 6045   yes    :    3   yes    :21576   yes    : 6248  
##  professional.course: 5243                                                  
##  basic.4y           : 4176                                                  
##  basic.6y           : 2292                                                  
##  (Other)            : 1749                                                  
##       contact          month       day_of_week    duration     
##  cellular :26144   may    :13769   fri:7827    Min.   :   0.0  
##  telephone:15044   jul    : 7174   mon:8514    1st Qu.: 102.0  
##                    aug    : 6178   thu:8623    Median : 180.0  
##                    jun    : 5318   tue:8090    Mean   : 258.3  
##                    nov    : 4101   wed:8134    3rd Qu.: 319.0  
##                    apr    : 2632               Max.   :4918.0  
##                    (Other): 2016                               
##     campaign          pdays          previous            poutcome    
##  Min.   : 1.000   Min.   :  0.0   Min.   :0.000   failure    : 4252  
##  1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.000   nonexistent:35563  
##  Median : 2.000   Median :999.0   Median :0.000   success    : 1373  
##  Mean   : 2.568   Mean   :962.5   Mean   :0.173                      
##  3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.000                      
##  Max.   :56.000   Max.   :999.0   Max.   :7.000                      
##                                                                      
##   emp.var.rate      cons.price.idx  cons.conf.idx     euribor3m    
##  Min.   :-3.40000   Min.   :92.20   Min.   :-50.8   Min.   :0.634  
##  1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.344  
##  Median : 1.10000   Median :93.75   Median :-41.8   Median :4.857  
##  Mean   : 0.08189   Mean   :93.58   Mean   :-40.5   Mean   :3.621  
##  3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961  
##  Max.   : 1.40000   Max.   :94.77   Max.   :-26.9   Max.   :5.045  
##                                                                    
##   nr.employed     y        
##  Min.   :4964   no :36548  
##  1st Qu.:5099   yes: 4640  
##  Median :5191              
##  Mean   :5167              
##  3rd Qu.:5228              
##  Max.   :5228              
## 
#Number of observations
nrow(bank_data)
## [1] 41188

Logistic Regression

Since our response variable “y” is binary and is either yes or no, we will implement logistic regression against a set of selected variables. Then we choose the variables with low p-values which make the model more accurate.

glm.fit1 <- glm(y ~ age + job + marital + education + default + housing + loan + month + duration + cons.price.idx + cons.conf.idx + euribor3m, data = bank_data, family = binomial)

summary(glm.fit1)
## 
## Call:
## glm(formula = y ~ age + job + marital + education + default + 
##     housing + loan + month + duration + cons.price.idx + cons.conf.idx + 
##     euribor3m, family = binomial, data = bank_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.0552  -0.3189  -0.1941  -0.1371   3.2645  
## 
## Coefficients: (1 not defined because of singularities)
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -4.670e+01  3.664e+00 -12.746  < 2e-16 ***
## age                           1.365e-03  2.363e-03   0.578 0.563477    
## jobblue-collar               -2.946e-01  7.819e-02  -3.767 0.000165 ***
## jobentrepreneur              -2.263e-01  1.236e-01  -1.830 0.067214 .  
## jobhousemaid                 -3.627e-02  1.421e-01  -0.255 0.798533    
## jobmanagement                -7.199e-02  8.305e-02  -0.867 0.386038    
## jobretired                    2.662e-01  1.038e-01   2.565 0.010317 *  
## jobself-employed             -2.290e-01  1.159e-01  -1.975 0.048229 *  
## jobservices                  -1.892e-01  8.402e-02  -2.252 0.024314 *  
## jobstudent                    2.629e-01  1.071e-01   2.455 0.014098 *  
## jobtechnician                -3.095e-02  6.900e-02  -0.449 0.653737    
## jobunemployed                 5.409e-02  1.224e-01   0.442 0.658686    
## jobunknown                    1.075e-01  2.278e-01   0.472 0.636879    
## maritalmarried                2.387e-02  6.676e-02   0.358 0.720685    
## maritalsingle                 9.119e-02  7.621e-02   1.196 0.231519    
## maritalunknown                1.010e-01  4.038e-01   0.250 0.802465    
## educationbasic.6y             1.330e-01  1.180e-01   1.127 0.259843    
## educationbasic.9y            -1.268e-03  9.326e-02  -0.014 0.989153    
## educationhigh.school          1.717e-02  8.964e-02   0.192 0.848104    
## educationilliterate           1.163e+00  7.215e-01   1.613 0.106824    
## educationprofessional.course  9.849e-02  9.858e-02   0.999 0.317785    
## educationuniversity.degree    1.865e-01  8.969e-02   2.080 0.037539 *  
## educationunknown              1.020e-01  1.166e-01   0.875 0.381714    
## defaultunknown               -3.343e-01  6.640e-02  -5.035 4.78e-07 ***
## defaultyes                   -7.443e+00  1.136e+02  -0.066 0.947736    
## housingunknown               -9.494e-02  1.361e-01  -0.697 0.485597    
## housingyes                    7.138e-04  4.018e-02   0.018 0.985825    
## loanunknown                          NA         NA      NA       NA    
## loanyes                      -4.671e-02  5.566e-02  -0.839 0.401372    
## monthaug                      5.316e-01  9.401e-02   5.654 1.57e-08 ***
## monthdec                      5.226e-01  1.856e-01   2.815 0.004875 ** 
## monthjul                      5.582e-01  8.813e-02   6.333 2.40e-10 ***
## monthjun                      4.322e-01  8.807e-02   4.908 9.20e-07 ***
## monthmar                      1.533e+00  1.160e-01  13.218  < 2e-16 ***
## monthmay                     -6.660e-01  7.212e-02  -9.235  < 2e-16 ***
## monthnov                      3.485e-01  9.278e-02   3.756 0.000173 ***
## monthoct                      5.841e-01  1.152e-01   5.072 3.94e-07 ***
## monthsep                      2.825e-01  1.240e-01   2.279 0.022649 *  
## duration                      4.684e-03  7.342e-05  63.789  < 2e-16 ***
## cons.price.idx                5.011e-01  4.003e-02  12.517  < 2e-16 ***
## cons.conf.idx                 4.190e-02  4.543e-03   9.223  < 2e-16 ***
## euribor3m                    -7.920e-01  1.599e-02 -49.534  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28999  on 41187  degrees of freedom
## Residual deviance: 17895  on 41147  degrees of freedom
## AIC: 17977
## 
## Number of Fisher Scoring iterations: 10

As we noted above, we will make a second model based on variables with low p-values.

glm.fit2 <- glm(y ~ age + job + duration + cons.price.idx + cons.conf.idx + euribor3m, data = bank_data, family = binomial)

summary(glm.fit2)
## 
## Call:
## glm(formula = y ~ age + job + duration + cons.price.idx + cons.conf.idx + 
##     euribor3m, family = binomial, data = bank_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.8961  -0.3573  -0.1986  -0.1442   3.2024  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.913e+01  3.321e+00 -17.806  < 2e-16 ***
## age              -1.078e-03  2.066e-03  -0.522  0.60184    
## jobblue-collar   -6.286e-01  6.196e-02 -10.146  < 2e-16 ***
## jobentrepreneur  -3.447e-01  1.199e-01  -2.876  0.00403 ** 
## jobhousemaid     -1.446e-01  1.336e-01  -1.082  0.27911    
## jobmanagement    -8.095e-02  8.003e-02  -1.011  0.31178    
## jobretired        2.986e-01  9.759e-02   3.060  0.00222 ** 
## jobself-employed -2.101e-01  1.122e-01  -1.872  0.06119 .  
## jobservices      -4.360e-01  7.768e-02  -5.612    2e-08 ***
## jobstudent        2.398e-01  9.911e-02   2.419  0.01555 *  
## jobtechnician    -4.929e-02  6.017e-02  -0.819  0.41270    
## jobunemployed     1.653e-02  1.177e-01   0.140  0.88837    
## jobunknown       -1.984e-02  2.183e-01  -0.091  0.92761    
## duration          4.554e-03  7.119e-05  63.963  < 2e-16 ***
## cons.price.idx    6.542e-01  3.614e-02  18.101  < 2e-16 ***
## cons.conf.idx     7.566e-02  3.468e-03  21.819  < 2e-16 ***
## euribor3m        -8.139e-01  1.428e-02 -56.990  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 28999  on 41187  degrees of freedom
## Residual deviance: 18671  on 41171  degrees of freedom
## AIC: 18705
## 
## Number of Fisher Scoring iterations: 6

In order to speed up the loading process, I have commented out the pairs charts.

#pairs(data = bank_data, y ~ age + job + duration + cons.price.idx + cons.conf.idx + euribor3m ,col = bank_data$y)

Making Predictions

As can be noted below we use the predict function to make predictions based on our model. A sample of 5 values are shown below to illustrate the correctness of our model. The mean of our predictions is 0.9045 which looks great. It means that our model is correct 90.45% of the time.

A confusion matrix is also made to compare how many of our observations are in line with the prediction model. The number of true negatives (35,684), which is the number of people who would not subscribe seems rational. However, in the confusion matrix, the number of false positives is more than true positives (3,068 > 1,572).

glm.probs <- predict(glm.fit2, type = "response")

glm.probs[1: 5]
##          1          2          3          4          5 
## 0.03329431 0.01520298 0.02190812 0.02394977 0.03075755
glm.pred <- ifelse(glm.probs > 0.5, "yes", "no")

attach(bank_data)
table(glm.pred, y)
##         y
## glm.pred    no   yes
##      no  35684  3068
##      yes   864  1572
mean(glm.pred == y)
## [1] 0.9045353

Training and Test Set

Due to the problem of false positives we observed, we will make a training set by using the mean of “age” and dividing the participants into two groups. One below the age 40 and the other above 40.

mean(bank_data$age)
## [1] 40.02406
train <- age < 40

glm.fit <- glm(y ~ age + job + duration + cons.price.idx + cons.conf.idx + euribor3m, data = bank_data, family = binomial, subset = train)

summary(glm.fit)
## 
## Call:
## glm(formula = y ~ age + job + duration + cons.price.idx + cons.conf.idx + 
##     euribor3m, family = binomial, data = bank_data, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.9360  -0.3775  -0.2109  -0.1493   3.2194  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -5.182e+01  4.405e+00 -11.763  < 2e-16 ***
## age              -3.587e-02  6.108e-03  -5.873 4.28e-09 ***
## jobblue-collar   -6.250e-01  8.118e-02  -7.699 1.37e-14 ***
## jobentrepreneur  -3.057e-01  1.710e-01  -1.787   0.0739 .  
## jobhousemaid     -3.745e-01  2.530e-01  -1.480   0.1388    
## jobmanagement     1.014e-01  1.128e-01   0.899   0.3688    
## jobretired       -1.430e+00  1.081e+00  -1.323   0.1860    
## jobself-employed -4.784e-02  1.439e-01  -0.333   0.7395    
## jobservices      -4.300e-01  9.581e-02  -4.488 7.17e-06 ***
## jobstudent        4.317e-02  1.077e-01   0.401   0.6884    
## jobtechnician     6.212e-02  7.362e-02   0.844   0.3988    
## jobunemployed     2.304e-01  1.512e-01   1.523   0.1277    
## jobunknown       -2.890e-01  3.833e-01  -0.754   0.4509    
## duration          4.570e-03  9.459e-05  48.317  < 2e-16 ***
## cons.price.idx    5.856e-01  4.768e-02  12.283  < 2e-16 ***
## cons.conf.idx     7.369e-02  4.571e-03  16.121  < 2e-16 ***
## euribor3m        -7.731e-01  1.902e-02 -40.654  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16286  on 22606  degrees of freedom
## Residual deviance: 10712  on 22590  degrees of freedom
## AIC: 10746
## 
## Number of Fisher Scoring iterations: 6
glm.probs <- predict(glm.fit, newdata = bank_data[!train,], type = "response")

glm.pred <- ifelse(glm.probs > 0.5, "yes", "no")

y.40 <- bank_data$y[!train]

table(glm.pred, y.40)
##         y.40
## glm.pred    no   yes
##      no  16402  1674
##      yes   176   329
mean(glm.pred == y.40)
## [1] 0.9004359

Fitting a smaller model

Next, we try to reduce the number of explanatory variables to see how much our model might improve under the same test set. As we note below, we can see that the correctness of our model is 89.22% (slightly below 90.45% compared to the previous model), which is very good.

train <- age < 40

glm.fit <- glm(y ~ age + job + cons.price.idx + euribor3m, data = bank_data, family = binomial, subset = train)

summary(glm.fit)
## 
## Call:
## glm(formula = y ~ age + job + cons.price.idx + euribor3m, family = binomial, 
##     data = bank_data, subset = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3730  -0.5305  -0.3343  -0.2779   2.7382  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -43.624120   3.968962 -10.991  < 2e-16 ***
## age               -0.029067   0.005286  -5.498 3.83e-08 ***
## jobblue-collar    -0.593823   0.068267  -8.699  < 2e-16 ***
## jobentrepreneur   -0.259193   0.148287  -1.748  0.08048 .  
## jobhousemaid      -0.455672   0.227020  -2.007  0.04473 *  
## jobmanagement      0.024516   0.098196   0.250  0.80285    
## jobretired        -1.381458   1.025140  -1.348  0.17779    
## jobself-employed  -0.102575   0.123676  -0.829  0.40689    
## jobservices       -0.395164   0.082839  -4.770 1.84e-06 ***
## jobstudent         0.249572   0.093979   2.656  0.00792 ** 
## jobtechnician      0.019501   0.063557   0.307  0.75897    
## jobunemployed      0.163193   0.131178   1.244  0.21348    
## jobunknown         0.145939   0.309415   0.472  0.63717    
## cons.price.idx     0.474215   0.042652  11.118  < 2e-16 ***
## euribor3m         -0.565727   0.015600 -36.264  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 16286  on 22606  degrees of freedom
## Residual deviance: 14195  on 22592  degrees of freedom
## AIC: 14225
## 
## Number of Fisher Scoring iterations: 5
glm.probs <- predict(glm.fit, newdata = bank_data[!train,], type = "response")

glm.pred <- ifelse(glm.probs > 0.5, "yes", "no")

y.40 <- bank_data$y[!train]

table(glm.pred, y.40)
##         y.40
## glm.pred    no   yes
##       no 16578  2003
mean(glm.pred == y.40)
## [1] 0.8922017