V: Data

a.) data cleaning

Read in the csv.

bank = read.csv('bank-additional.csv', sep=';')

bank_copy = bank

str(bank_copy)

## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : chr  "blue-collar" "services" "services" "services" ...
##  $ marital       : chr  "married" "single" "married" "married" ...
##  $ education     : chr  "basic.9y" "high.school" "high.school" "basic.9y" ...
##  $ default       : chr  "no" "no" "no" "no" ...
##  $ housing       : chr  "yes" "no" "yes" "unknown" ...
##  $ loan          : chr  "no" "no" "no" "unknown" ...
##  $ contact       : chr  "cellular" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "jun" "jun" ...
##  $ day_of_week   : chr  "fri" "fri" "wed" "fri" ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : chr  "no" "no" "no" "no" ...

levels(as.factor(bank_copy$job))

##  [1] "admin."        "blue-collar"   "entrepreneur"  "housemaid"    
##  [5] "management"    "retired"       "self-employed" "services"     
##  [9] "student"       "technician"    "unemployed"    "unknown"

levels(as.factor(bank_copy$marital))

## [1] "divorced" "married"  "single"   "unknown"

levels(as.factor(bank_copy$education))

## [1] "basic.4y"            "basic.6y"            "basic.9y"           
## [4] "high.school"         "illiterate"          "professional.course"
## [7] "university.degree"   "unknown"

levels(as.factor(bank_copy$default))

## [1] "no"      "unknown" "yes"

levels(as.factor(bank_copy$housing))

## [1] "no"      "unknown" "yes"

levels(as.factor(bank_copy$loan))

## [1] "no"      "unknown" "yes"

levels(as.factor(bank_copy$contact))

## [1] "cellular"  "telephone"

levels(as.factor(bank_copy$month))

##  [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"

levels(as.factor(bank_copy$day_of_week))

## [1] "fri" "mon" "thu" "tue" "wed"

levels(as.factor(bank_copy$poutcome))

## [1] "failure"     "nonexistent" "success"

levels(as.factor(bank_copy$y))

## [1] "no"  "yes"

check age for non-int values

sum(bank_copy$age != integer())

## [1] 0

Take out the ‘unknown’ values from the dataset

bank_copy = subset(bank_copy, bank_copy$job != 'unknown')
bank_copy = subset(bank_copy, bank_copy$marital != 'unknown')
bank_copy = subset(bank_copy, bank_copy$education != 'unknown')
bank_copy = subset(bank_copy, bank_copy$default != 'unknown')
bank_copy = subset(bank_copy, bank_copy$housing != 'unknown')
bank_copy = subset(bank_copy, bank_copy$loan != 'unknown')

Check for Null values in integer variables. Results show no change in total observations (records)

bank_copy = subset(bank_copy, !is.na(bank_copy$age))
bank_copy = subset(bank_copy, !is.na(bank_copy$duration))
bank_copy = subset(bank_copy, !is.na(bank_copy$campaign))
bank_copy = subset(bank_copy, !is.na(bank_copy$pdays))
bank_copy = subset(bank_copy, !is.na(bank_copy$previous))
bank_copy = subset(bank_copy, !is.na(bank_copy$emp.var.rate))
bank_copy = subset(bank_copy, !is.na(bank_copy$cons.price.idx))
bank_copy = subset(bank_copy, !is.na(bank_copy$cons.conf.idx))
bank_copy = subset(bank_copy, !is.na(bank_copy$euribor3m))
bank_copy = subset(bank_copy, !is.na(bank_copy$nr.employed))

Check for Null values in character variables. Results show no change in total observations

bank_copy = subset(bank_copy, !is.nan(bank_copy$job))
bank_copy = subset(bank_copy, !is.nan(bank_copy$marital))
bank_copy = subset(bank_copy, !is.nan(bank_copy$education))
bank_copy = subset(bank_copy, !is.nan(bank_copy$default))
bank_copy = subset(bank_copy, !is.nan(bank_copy$housing))
bank_copy = subset(bank_copy, !is.nan(bank_copy$loan))
bank_copy = subset(bank_copy, !is.nan(bank_copy$contact))
bank_copy = subset(bank_copy, !is.nan(bank_copy$month))
bank_copy = subset(bank_copy, !is.nan(bank_copy$day_of_week))
bank_copy = subset(bank_copy, !is.nan(bank_copy$y))

bank_copy = subset(bank_copy, !is.nan(bank_copy$poutcome))

Create dummy variables

But first, another look at the categorical variables to be converted to factored levels.

levels(as.factor(bank_copy$job))

##  [1] "admin."        "blue-collar"   "entrepreneur"  "housemaid"    
##  [5] "management"    "retired"       "self-employed" "services"     
##  [9] "student"       "technician"    "unemployed"

levels(as.factor(bank_copy$marital))

## [1] "divorced" "married"  "single"

levels(as.factor(bank_copy$education))

## [1] "basic.4y"            "basic.6y"            "basic.9y"           
## [4] "high.school"         "illiterate"          "professional.course"
## [7] "university.degree"

levels(as.factor(bank_copy$default))

## [1] "no"  "yes"

levels(as.factor(bank_copy$housing))

## [1] "no"  "yes"

levels(as.factor(bank_copy$loan))

## [1] "no"  "yes"

levels(as.factor(bank_copy$contact))

## [1] "cellular"  "telephone"

levels(as.factor(bank_copy$month))

##  [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"

levels(as.factor(bank_copy$day_of_week))

## [1] "fri" "mon" "thu" "tue" "wed"

levels(as.factor(bank_copy$poutcome))

## [1] "failure"     "nonexistent" "success"

levels(as.factor(bank_copy$y))

## [1] "no"  "yes"

Now we use factor(), and R will dummy the categorical variables for us.

bank_copy$jobF = factor(bank_copy$job)
bank_copy$marriedF = factor(bank_copy$marital)
#bank_copy$yF = factor(bank_copy$y) # this causes error in confusion matrix

bank_copy$y_dummy = ifelse(bank_copy$y == 'yes', 1,0)

For a better understanding of how R is going to deal with the categorical variables, we can use the contrasts() function. This function will show us how the variables have been dummyfied by R and how to interpret them in a model. For instance, you can see that in the variable yF, the no(indicating the customer did not subscribe) will be used as the reference (0). Example seen here: https://datascienceplus.com/perform-logistic-regression-in-r/

contrasts(bank_copy$jobF)

##               blue-collar entrepreneur housemaid management retired
## admin.                  0            0         0          0       0
## blue-collar             1            0         0          0       0
## entrepreneur            0            1         0          0       0
## housemaid               0            0         1          0       0
## management              0            0         0          1       0
## retired                 0            0         0          0       1
## self-employed           0            0         0          0       0
## services                0            0         0          0       0
## student                 0            0         0          0       0
## technician              0            0         0          0       0
## unemployed              0            0         0          0       0
##               self-employed services student technician unemployed
## admin.                    0        0       0          0          0
## blue-collar               0        0       0          0          0
## entrepreneur              0        0       0          0          0
## housemaid                 0        0       0          0          0
## management                0        0       0          0          0
## retired                   0        0       0          0          0
## self-employed             1        0       0          0          0
## services                  0        1       0          0          0
## student                   0        0       1          0          0
## technician                0        0       0          1          0
## unemployed                0        0       0          0          1

Logistic Model

m1 = glm(formula = y_dummy ~ bank_copy$jobF + bank_copy$marriedF + bank_copy$age, data=bank_copy, family = binomial)

summary(m1)

## 
## Call:
## glm(formula = y_dummy ~ bank_copy$jobF + bank_copy$marriedF + 
##     bank_copy$age, family = binomial, data = bank_copy)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9555  -0.5351  -0.4609  -0.3954   2.4019  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -3.051271   0.355371  -8.586  < 2e-16 ***
## bank_copy$jobFblue-collar   -0.484363   0.184277  -2.628 0.008577 ** 
## bank_copy$jobFentrepreneur  -0.814744   0.405472  -2.009 0.044497 *  
## bank_copy$jobFhousemaid     -0.251107   0.374837  -0.670 0.502915    
## bank_copy$jobFmanagement    -0.489531   0.239055  -2.048 0.040582 *  
## bank_copy$jobFretired        0.252587   0.280029   0.902 0.367055    
## bank_copy$jobFself-employed -0.370964   0.310493  -1.195 0.232181    
## bank_copy$jobFservices      -0.461680   0.236791  -1.950 0.051208 .  
## bank_copy$jobFstudent        0.834568   0.336106   2.483 0.013026 *  
## bank_copy$jobFtechnician    -0.139881   0.162777  -0.859 0.390153    
## bank_copy$jobFunemployed     0.380907   0.295693   1.288 0.197682    
## bank_copy$marriedFmarried    0.147205   0.191068   0.770 0.441042    
## bank_copy$marriedFsingle     0.516753   0.213564   2.420 0.015535 *  
## bank_copy$age                0.024432   0.006757   3.616 0.000299 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2264.4  on 3089  degrees of freedom
## Residual deviance: 2205.3  on 3076  degrees of freedom
## AIC: 2233.3
## 
## Number of Fisher Scoring iterations: 5

It looks like those with the job type of blue-collar(-0.48), entrepreneur(-0.81) show a decrease in coefficient, which also means a decrease in the odds of those customers being confirmed as a ‘yes’ to being a subscriber (‘y’). And age is significant, and shows that with an increase in age, we see the odds go up (0.024), increasing the odds that the customer will subscribe.

Now we check the performance of the model

bank_copy$PredProb = predict.glm(m1, newdata = bank_copy, type = 'response')

bank_copy$PredChoice = ifelse(bank_copy$PredProb >= 0.25, 1, 0)

bank_copy$PredChoice = factor(bank_copy$PredChoice)

caret::confusionMatrix(as.factor(bank_copy$y_dummy), as.factor(bank_copy$PredChoice))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2673   47
##          1  345   25
##                                           
##                Accuracy : 0.8731          
##                  95% CI : (0.8609, 0.8847)
##     No Information Rate : 0.9767          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0771          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.88569         
##             Specificity : 0.34722         
##          Pos Pred Value : 0.98272         
##          Neg Pred Value : 0.06757         
##              Prevalence : 0.97670         
##          Detection Rate : 0.86505         
##    Detection Prevalence : 0.88026         
##       Balanced Accuracy : 0.61645         
##                                           
##        'Positive' Class : 0               
##

Bank Marketing Case

Logan Henslee

9/15/2021

II: the Problem

III: Review of the literature.

IV: Methodology

V: Data

Read in the csv.