library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
#library(gam)
#library(logistf)

II: the Problem

a.) This case study uses data from the UCI Machine Learning Repository. The dataset is titled ‘bank-additional.csv’ and is 10% of the examples (4119), randomly selected from 1), and 20 of the inputs from the full dataset (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing). The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed.

b.) The purpose of this case is predict if the client will subscribe a term deposit (variable y) by using a classification method.

c.) What are the best predictors of the dependent variable in the data set, in this case the dependent variable being ‘y’ - has the client subscribed a term deposit? (Last column in the dataset)

III: Review of the literature.

a.)

IV: Methodology

V: Data

a.) data cleaning

Read in the csv.

bank = read.csv('bank-additional.csv', sep=';')
bank_copy = bank
str(bank_copy)
## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : chr  "blue-collar" "services" "services" "services" ...
##  $ marital       : chr  "married" "single" "married" "married" ...
##  $ education     : chr  "basic.9y" "high.school" "high.school" "basic.9y" ...
##  $ default       : chr  "no" "no" "no" "no" ...
##  $ housing       : chr  "yes" "no" "yes" "unknown" ...
##  $ loan          : chr  "no" "no" "no" "unknown" ...
##  $ contact       : chr  "cellular" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "jun" "jun" ...
##  $ day_of_week   : chr  "fri" "fri" "wed" "fri" ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : chr  "no" "no" "no" "no" ...
levels(as.factor(bank_copy$job))
##  [1] "admin."        "blue-collar"   "entrepreneur"  "housemaid"    
##  [5] "management"    "retired"       "self-employed" "services"     
##  [9] "student"       "technician"    "unemployed"    "unknown"
levels(as.factor(bank_copy$marital))
## [1] "divorced" "married"  "single"   "unknown"
levels(as.factor(bank_copy$education))
## [1] "basic.4y"            "basic.6y"            "basic.9y"           
## [4] "high.school"         "illiterate"          "professional.course"
## [7] "university.degree"   "unknown"
levels(as.factor(bank_copy$default))
## [1] "no"      "unknown" "yes"
levels(as.factor(bank_copy$housing))
## [1] "no"      "unknown" "yes"
levels(as.factor(bank_copy$loan))
## [1] "no"      "unknown" "yes"
levels(as.factor(bank_copy$contact))
## [1] "cellular"  "telephone"
levels(as.factor(bank_copy$month))
##  [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"
levels(as.factor(bank_copy$day_of_week))
## [1] "fri" "mon" "thu" "tue" "wed"
levels(as.factor(bank_copy$poutcome))
## [1] "failure"     "nonexistent" "success"
levels(as.factor(bank_copy$y))
## [1] "no"  "yes"

check age for non-int values

sum(bank_copy$age != integer())
## [1] 0

Take out the ‘unknown’ values from the dataset

bank_copy = subset(bank_copy, bank_copy$job != 'unknown')
bank_copy = subset(bank_copy, bank_copy$marital != 'unknown')
bank_copy = subset(bank_copy, bank_copy$education != 'unknown')
bank_copy = subset(bank_copy, bank_copy$default != 'unknown')
bank_copy = subset(bank_copy, bank_copy$housing != 'unknown')
bank_copy = subset(bank_copy, bank_copy$loan != 'unknown')

Check for Null values in integer variables. Results show no change in total observations (records)

bank_copy = subset(bank_copy, !is.na(bank_copy$age))
bank_copy = subset(bank_copy, !is.na(bank_copy$duration))
bank_copy = subset(bank_copy, !is.na(bank_copy$campaign))
bank_copy = subset(bank_copy, !is.na(bank_copy$pdays))
bank_copy = subset(bank_copy, !is.na(bank_copy$previous))
bank_copy = subset(bank_copy, !is.na(bank_copy$emp.var.rate))
bank_copy = subset(bank_copy, !is.na(bank_copy$cons.price.idx))
bank_copy = subset(bank_copy, !is.na(bank_copy$cons.conf.idx))
bank_copy = subset(bank_copy, !is.na(bank_copy$euribor3m))
bank_copy = subset(bank_copy, !is.na(bank_copy$nr.employed))

Check for Null values in character variables. Results show no change in total observations

bank_copy = subset(bank_copy, !is.nan(bank_copy$job))
bank_copy = subset(bank_copy, !is.nan(bank_copy$marital))
bank_copy = subset(bank_copy, !is.nan(bank_copy$education))
bank_copy = subset(bank_copy, !is.nan(bank_copy$default))
bank_copy = subset(bank_copy, !is.nan(bank_copy$housing))
bank_copy = subset(bank_copy, !is.nan(bank_copy$loan))
bank_copy = subset(bank_copy, !is.nan(bank_copy$contact))
bank_copy = subset(bank_copy, !is.nan(bank_copy$month))
bank_copy = subset(bank_copy, !is.nan(bank_copy$day_of_week))
bank_copy = subset(bank_copy, !is.nan(bank_copy$y))
bank_copy = subset(bank_copy, !is.nan(bank_copy$poutcome))

Create dummy variables

But first, another look at the categorical variables to be converted to factored levels.

levels(as.factor(bank_copy$job))
##  [1] "admin."        "blue-collar"   "entrepreneur"  "housemaid"    
##  [5] "management"    "retired"       "self-employed" "services"     
##  [9] "student"       "technician"    "unemployed"
levels(as.factor(bank_copy$marital))
## [1] "divorced" "married"  "single"
levels(as.factor(bank_copy$education))
## [1] "basic.4y"            "basic.6y"            "basic.9y"           
## [4] "high.school"         "illiterate"          "professional.course"
## [7] "university.degree"
levels(as.factor(bank_copy$default))
## [1] "no"  "yes"
levels(as.factor(bank_copy$housing))
## [1] "no"  "yes"
levels(as.factor(bank_copy$loan))
## [1] "no"  "yes"
levels(as.factor(bank_copy$contact))
## [1] "cellular"  "telephone"
levels(as.factor(bank_copy$month))
##  [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"
levels(as.factor(bank_copy$day_of_week))
## [1] "fri" "mon" "thu" "tue" "wed"
levels(as.factor(bank_copy$poutcome))
## [1] "failure"     "nonexistent" "success"
levels(as.factor(bank_copy$y))
## [1] "no"  "yes"

Now we use factor(), and R will dummy the categorical variables for us.

bank_copy$jobF = factor(bank_copy$job)
bank_copy$marriedF = factor(bank_copy$marital)
#bank_copy$yF = factor(bank_copy$y) # this causes error in confusion matrix
bank_copy$y_dummy = ifelse(bank_copy$y == 'yes', 1,0)

For a better understanding of how R is going to deal with the categorical variables, we can use the contrasts() function. This function will show us how the variables have been dummyfied by R and how to interpret them in a model. For instance, you can see that in the variable yF, the no(indicating the customer did not subscribe) will be used as the reference (0). Example seen here: https://datascienceplus.com/perform-logistic-regression-in-r/

contrasts(bank_copy$jobF)
##               blue-collar entrepreneur housemaid management retired
## admin.                  0            0         0          0       0
## blue-collar             1            0         0          0       0
## entrepreneur            0            1         0          0       0
## housemaid               0            0         1          0       0
## management              0            0         0          1       0
## retired                 0            0         0          0       1
## self-employed           0            0         0          0       0
## services                0            0         0          0       0
## student                 0            0         0          0       0
## technician              0            0         0          0       0
## unemployed              0            0         0          0       0
##               self-employed services student technician unemployed
## admin.                    0        0       0          0          0
## blue-collar               0        0       0          0          0
## entrepreneur              0        0       0          0          0
## housemaid                 0        0       0          0          0
## management                0        0       0          0          0
## retired                   0        0       0          0          0
## self-employed             1        0       0          0          0
## services                  0        1       0          0          0
## student                   0        0       1          0          0
## technician                0        0       0          1          0
## unemployed                0        0       0          0          1

Logistic Model

m1 = glm(formula = y_dummy ~ bank_copy$jobF + bank_copy$marriedF + bank_copy$age, data=bank_copy, family = binomial)
summary(m1)
## 
## Call:
## glm(formula = y_dummy ~ bank_copy$jobF + bank_copy$marriedF + 
##     bank_copy$age, family = binomial, data = bank_copy)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9555  -0.5351  -0.4609  -0.3954   2.4019  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -3.051271   0.355371  -8.586  < 2e-16 ***
## bank_copy$jobFblue-collar   -0.484363   0.184277  -2.628 0.008577 ** 
## bank_copy$jobFentrepreneur  -0.814744   0.405472  -2.009 0.044497 *  
## bank_copy$jobFhousemaid     -0.251107   0.374837  -0.670 0.502915    
## bank_copy$jobFmanagement    -0.489531   0.239055  -2.048 0.040582 *  
## bank_copy$jobFretired        0.252587   0.280029   0.902 0.367055    
## bank_copy$jobFself-employed -0.370964   0.310493  -1.195 0.232181    
## bank_copy$jobFservices      -0.461680   0.236791  -1.950 0.051208 .  
## bank_copy$jobFstudent        0.834568   0.336106   2.483 0.013026 *  
## bank_copy$jobFtechnician    -0.139881   0.162777  -0.859 0.390153    
## bank_copy$jobFunemployed     0.380907   0.295693   1.288 0.197682    
## bank_copy$marriedFmarried    0.147205   0.191068   0.770 0.441042    
## bank_copy$marriedFsingle     0.516753   0.213564   2.420 0.015535 *  
## bank_copy$age                0.024432   0.006757   3.616 0.000299 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2264.4  on 3089  degrees of freedom
## Residual deviance: 2205.3  on 3076  degrees of freedom
## AIC: 2233.3
## 
## Number of Fisher Scoring iterations: 5

It looks like those with the job type of blue-collar(-0.48), entrepreneur(-0.81) show a decrease in coefficient, which also means a decrease in the odds of those customers being confirmed as a ‘yes’ to being a subscriber (‘y’). And age is significant, and shows that with an increase in age, we see the odds go up (0.024), increasing the odds that the customer will subscribe.

Now we check the performance of the model

bank_copy$PredProb = predict.glm(m1, newdata = bank_copy, type = 'response')
bank_copy$PredChoice = ifelse(bank_copy$PredProb >= 0.25, 1, 0)
bank_copy$PredChoice = factor(bank_copy$PredChoice)
caret::confusionMatrix(as.factor(bank_copy$y_dummy), as.factor(bank_copy$PredChoice))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2673   47
##          1  345   25
##                                           
##                Accuracy : 0.8731          
##                  95% CI : (0.8609, 0.8847)
##     No Information Rate : 0.9767          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0771          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.88569         
##             Specificity : 0.34722         
##          Pos Pred Value : 0.98272         
##          Neg Pred Value : 0.06757         
##              Prevalence : 0.97670         
##          Detection Rate : 0.86505         
##    Detection Prevalence : 0.88026         
##       Balanced Accuracy : 0.61645         
##                                           
##        'Positive' Class : 0               
##