library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
#library(gam)
#library(logistf)
a.) This case study uses data from the UCI Machine Learning Repository. The dataset is titled ‘bank-additional.csv’ and is 10% of the examples (4119), randomly selected from 1), and 20 of the inputs from the full dataset (http://archive.ics.uci.edu/ml/datasets/Bank+Marketing). The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed.
b.) The purpose of this case is predict if the client will subscribe a term deposit (variable y) by using a classification method.
c.) What are the best predictors of the dependent variable in the data set, in this case the dependent variable being ‘y’ - has the client subscribed a term deposit? (Last column in the dataset)
a.)
a.) data cleaning
bank = read.csv('bank-additional.csv', sep=';')
bank_copy = bank
str(bank_copy)
## 'data.frame': 4119 obs. of 21 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : chr "blue-collar" "services" "services" "services" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education : chr "basic.9y" "high.school" "high.school" "basic.9y" ...
## $ default : chr "no" "no" "no" "no" ...
## $ housing : chr "yes" "no" "yes" "unknown" ...
## $ loan : chr "no" "no" "no" "unknown" ...
## $ contact : chr "cellular" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "jun" "jun" ...
## $ day_of_week : chr "fri" "fri" "wed" "fri" ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : chr "no" "no" "no" "no" ...
levels(as.factor(bank_copy$job))
## [1] "admin." "blue-collar" "entrepreneur" "housemaid"
## [5] "management" "retired" "self-employed" "services"
## [9] "student" "technician" "unemployed" "unknown"
levels(as.factor(bank_copy$marital))
## [1] "divorced" "married" "single" "unknown"
levels(as.factor(bank_copy$education))
## [1] "basic.4y" "basic.6y" "basic.9y"
## [4] "high.school" "illiterate" "professional.course"
## [7] "university.degree" "unknown"
levels(as.factor(bank_copy$default))
## [1] "no" "unknown" "yes"
levels(as.factor(bank_copy$housing))
## [1] "no" "unknown" "yes"
levels(as.factor(bank_copy$loan))
## [1] "no" "unknown" "yes"
levels(as.factor(bank_copy$contact))
## [1] "cellular" "telephone"
levels(as.factor(bank_copy$month))
## [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"
levels(as.factor(bank_copy$day_of_week))
## [1] "fri" "mon" "thu" "tue" "wed"
levels(as.factor(bank_copy$poutcome))
## [1] "failure" "nonexistent" "success"
levels(as.factor(bank_copy$y))
## [1] "no" "yes"
check age for non-int values
sum(bank_copy$age != integer())
## [1] 0
Take out the ‘unknown’ values from the dataset
bank_copy = subset(bank_copy, bank_copy$job != 'unknown')
bank_copy = subset(bank_copy, bank_copy$marital != 'unknown')
bank_copy = subset(bank_copy, bank_copy$education != 'unknown')
bank_copy = subset(bank_copy, bank_copy$default != 'unknown')
bank_copy = subset(bank_copy, bank_copy$housing != 'unknown')
bank_copy = subset(bank_copy, bank_copy$loan != 'unknown')
Check for Null values in integer variables. Results show no change in total observations (records)
bank_copy = subset(bank_copy, !is.na(bank_copy$age))
bank_copy = subset(bank_copy, !is.na(bank_copy$duration))
bank_copy = subset(bank_copy, !is.na(bank_copy$campaign))
bank_copy = subset(bank_copy, !is.na(bank_copy$pdays))
bank_copy = subset(bank_copy, !is.na(bank_copy$previous))
bank_copy = subset(bank_copy, !is.na(bank_copy$emp.var.rate))
bank_copy = subset(bank_copy, !is.na(bank_copy$cons.price.idx))
bank_copy = subset(bank_copy, !is.na(bank_copy$cons.conf.idx))
bank_copy = subset(bank_copy, !is.na(bank_copy$euribor3m))
bank_copy = subset(bank_copy, !is.na(bank_copy$nr.employed))
Check for Null values in character variables. Results show no change in total observations
bank_copy = subset(bank_copy, !is.nan(bank_copy$job))
bank_copy = subset(bank_copy, !is.nan(bank_copy$marital))
bank_copy = subset(bank_copy, !is.nan(bank_copy$education))
bank_copy = subset(bank_copy, !is.nan(bank_copy$default))
bank_copy = subset(bank_copy, !is.nan(bank_copy$housing))
bank_copy = subset(bank_copy, !is.nan(bank_copy$loan))
bank_copy = subset(bank_copy, !is.nan(bank_copy$contact))
bank_copy = subset(bank_copy, !is.nan(bank_copy$month))
bank_copy = subset(bank_copy, !is.nan(bank_copy$day_of_week))
bank_copy = subset(bank_copy, !is.nan(bank_copy$y))
bank_copy = subset(bank_copy, !is.nan(bank_copy$poutcome))
Create dummy variables
But first, another look at the categorical variables to be converted to factored levels.
levels(as.factor(bank_copy$job))
## [1] "admin." "blue-collar" "entrepreneur" "housemaid"
## [5] "management" "retired" "self-employed" "services"
## [9] "student" "technician" "unemployed"
levels(as.factor(bank_copy$marital))
## [1] "divorced" "married" "single"
levels(as.factor(bank_copy$education))
## [1] "basic.4y" "basic.6y" "basic.9y"
## [4] "high.school" "illiterate" "professional.course"
## [7] "university.degree"
levels(as.factor(bank_copy$default))
## [1] "no" "yes"
levels(as.factor(bank_copy$housing))
## [1] "no" "yes"
levels(as.factor(bank_copy$loan))
## [1] "no" "yes"
levels(as.factor(bank_copy$contact))
## [1] "cellular" "telephone"
levels(as.factor(bank_copy$month))
## [1] "apr" "aug" "dec" "jul" "jun" "mar" "may" "nov" "oct" "sep"
levels(as.factor(bank_copy$day_of_week))
## [1] "fri" "mon" "thu" "tue" "wed"
levels(as.factor(bank_copy$poutcome))
## [1] "failure" "nonexistent" "success"
levels(as.factor(bank_copy$y))
## [1] "no" "yes"
Now we use factor(), and R will dummy the categorical variables for us.
bank_copy$jobF = factor(bank_copy$job)
bank_copy$marriedF = factor(bank_copy$marital)
#bank_copy$yF = factor(bank_copy$y) # this causes error in confusion matrix
bank_copy$y_dummy = ifelse(bank_copy$y == 'yes', 1,0)
For a better understanding of how R is going to deal with the categorical variables, we can use the contrasts() function. This function will show us how the variables have been dummyfied by R and how to interpret them in a model. For instance, you can see that in the variable yF, the no(indicating the customer did not subscribe) will be used as the reference (0). Example seen here: https://datascienceplus.com/perform-logistic-regression-in-r/
contrasts(bank_copy$jobF)
## blue-collar entrepreneur housemaid management retired
## admin. 0 0 0 0 0
## blue-collar 1 0 0 0 0
## entrepreneur 0 1 0 0 0
## housemaid 0 0 1 0 0
## management 0 0 0 1 0
## retired 0 0 0 0 1
## self-employed 0 0 0 0 0
## services 0 0 0 0 0
## student 0 0 0 0 0
## technician 0 0 0 0 0
## unemployed 0 0 0 0 0
## self-employed services student technician unemployed
## admin. 0 0 0 0 0
## blue-collar 0 0 0 0 0
## entrepreneur 0 0 0 0 0
## housemaid 0 0 0 0 0
## management 0 0 0 0 0
## retired 0 0 0 0 0
## self-employed 1 0 0 0 0
## services 0 1 0 0 0
## student 0 0 1 0 0
## technician 0 0 0 1 0
## unemployed 0 0 0 0 1
Logistic Model
m1 = glm(formula = y_dummy ~ bank_copy$jobF + bank_copy$marriedF + bank_copy$age, data=bank_copy, family = binomial)
summary(m1)
##
## Call:
## glm(formula = y_dummy ~ bank_copy$jobF + bank_copy$marriedF +
## bank_copy$age, family = binomial, data = bank_copy)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9555 -0.5351 -0.4609 -0.3954 2.4019
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.051271 0.355371 -8.586 < 2e-16 ***
## bank_copy$jobFblue-collar -0.484363 0.184277 -2.628 0.008577 **
## bank_copy$jobFentrepreneur -0.814744 0.405472 -2.009 0.044497 *
## bank_copy$jobFhousemaid -0.251107 0.374837 -0.670 0.502915
## bank_copy$jobFmanagement -0.489531 0.239055 -2.048 0.040582 *
## bank_copy$jobFretired 0.252587 0.280029 0.902 0.367055
## bank_copy$jobFself-employed -0.370964 0.310493 -1.195 0.232181
## bank_copy$jobFservices -0.461680 0.236791 -1.950 0.051208 .
## bank_copy$jobFstudent 0.834568 0.336106 2.483 0.013026 *
## bank_copy$jobFtechnician -0.139881 0.162777 -0.859 0.390153
## bank_copy$jobFunemployed 0.380907 0.295693 1.288 0.197682
## bank_copy$marriedFmarried 0.147205 0.191068 0.770 0.441042
## bank_copy$marriedFsingle 0.516753 0.213564 2.420 0.015535 *
## bank_copy$age 0.024432 0.006757 3.616 0.000299 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2264.4 on 3089 degrees of freedom
## Residual deviance: 2205.3 on 3076 degrees of freedom
## AIC: 2233.3
##
## Number of Fisher Scoring iterations: 5
It looks like those with the job type of blue-collar(-0.48), entrepreneur(-0.81) show a decrease in coefficient, which also means a decrease in the odds of those customers being confirmed as a ‘yes’ to being a subscriber (‘y’). And age is significant, and shows that with an increase in age, we see the odds go up (0.024), increasing the odds that the customer will subscribe.
Now we check the performance of the model
bank_copy$PredProb = predict.glm(m1, newdata = bank_copy, type = 'response')
bank_copy$PredChoice = ifelse(bank_copy$PredProb >= 0.25, 1, 0)
bank_copy$PredChoice = factor(bank_copy$PredChoice)
caret::confusionMatrix(as.factor(bank_copy$y_dummy), as.factor(bank_copy$PredChoice))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2673 47
## 1 345 25
##
## Accuracy : 0.8731
## 95% CI : (0.8609, 0.8847)
## No Information Rate : 0.9767
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0771
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.88569
## Specificity : 0.34722
## Pos Pred Value : 0.98272
## Neg Pred Value : 0.06757
## Prevalence : 0.97670
## Detection Rate : 0.86505
## Detection Prevalence : 0.88026
## Balanced Accuracy : 0.61645
##
## 'Positive' Class : 0
##