Marketing Case Study

Load necessary libraries

# Load the libraries
library("PerformanceAnalytics")
library(caret)
library(readr)
library(car)
library(dplyr)
library(tidyverse)
library(broom)

Read the CSV file and print the structure

# Read csv file. Use header=TRUE fr header; Use Sep = ";" for the data separator
# Use StringAsFactors to convert character strings to Factors
# Convert all missing values (saved as "unknown" when using read.csv) into NA so that omit function can be used on both Factor and numeric values
bmktg <- read.csv("bank-additional.csv", header=TRUE, sep=";", na.strings = "unknown" , stringsAsFactors = T)

# Get the structure of the dataset
str(bmktg)

## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 NA 1 NA ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 NA 2 1 2 2 1 1 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 NA 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...

Number of observations, n = 4119
Predictors variables, p = 20
1 response variable, y

Summary of input dataset

# Get the summary
summary(bmktg)

##       age                 job           marital                   education   
##  Min.   :18.00   admin.     :1012   divorced: 446   university.degree  :1264  
##  1st Qu.:32.00   blue-collar: 884   married :2509   high.school        : 921  
##  Median :38.00   technician : 691   single  :1153   basic.9y           : 574  
##  Mean   :40.11   services   : 393   NA's    :  11   professional.course: 535  
##  3rd Qu.:47.00   management : 324                   basic.4y           : 429  
##  Max.   :88.00   (Other)    : 776                   (Other)            : 229  
##                  NA's       :  39                   NA's               : 167  
##  default     housing       loan           contact         month     
##  no  :3315   no  :1839   no  :3349   cellular :2652   may    :1378  
##  yes :   1   yes :2175   yes : 665   telephone:1467   jul    : 711  
##  NA's: 803   NA's: 105   NA's: 105                    aug    : 636  
##                                                       jun    : 530  
##                                                       nov    : 446  
##                                                       apr    : 215  
##                                                       (Other): 203  
##  day_of_week    duration         campaign          pdays          previous     
##  fri:768     Min.   :   0.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
##  mon:855     1st Qu.: 103.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
##  thu:860     Median : 181.0   Median : 2.000   Median :999.0   Median :0.0000  
##  tue:841     Mean   : 256.8   Mean   : 2.537   Mean   :960.4   Mean   :0.1903  
##  wed:795     3rd Qu.: 317.0   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
##              Max.   :3643.0   Max.   :35.000   Max.   :999.0   Max.   :6.0000  
##                                                                                
##         poutcome     emp.var.rate      cons.price.idx  cons.conf.idx  
##  failure    : 454   Min.   :-3.40000   Min.   :92.20   Min.   :-50.8  
##  nonexistent:3523   1st Qu.:-1.80000   1st Qu.:93.08   1st Qu.:-42.7  
##  success    : 142   Median : 1.10000   Median :93.75   Median :-41.8  
##                     Mean   : 0.08497   Mean   :93.58   Mean   :-40.5  
##                     3rd Qu.: 1.40000   3rd Qu.:93.99   3rd Qu.:-36.4  
##                     Max.   : 1.40000   Max.   :94.77   Max.   :-26.9  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.635   Min.   :4964   no :3668  
##  1st Qu.:1.334   1st Qu.:5099   yes: 451  
##  Median :4.857   Median :5191             
##  Mean   :3.621   Mean   :5166             
##  3rd Qu.:4.961   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
##

It is confirmed that read.csv function’s NA.strings parameter converted the factor variables null values to NA.

Remove the null values

# Remove the null values using na.omit function
bmktg_clean <- na.omit(bmktg)

Structure of clean dataset

# Get the structure again to know the final n
str(bmktg_clean)

## 'data.frame':    3090 obs. of  21 variables:
##  $ age           : int  30 39 25 47 32 32 31 36 36 47 ...
##  $ job           : Factor w/ 11 levels "admin.","blue-collar",..: 2 8 8 1 8 1 8 7 1 2 ...
##  $ marital       : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 3 1 3 2 2 ...
##  $ education     : Factor w/ 7 levels "basic.4y","basic.6y",..: 3 4 4 7 7 7 6 1 4 1 ...
##  $ default       : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ housing       : Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
##  $ loan          : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ contact       : Factor w/ 2 levels "cellular","telephone": 1 2 2 1 1 1 1 1 2 2 ...
##  $ month         : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 8 10 10 8 4 7 5 ...
##  $ day_of_week   : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 2 3 2 4 3 5 3 ...
##  $ duration      : int  487 346 227 58 128 290 68 148 97 211 ...
##  $ campaign      : int  2 4 1 1 3 4 1 1 2 2 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 2 0 1 0 0 0 ...
##  $ poutcome      : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 1 2 1 2 2 2 ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 -0.1 -1.1 -1.1 -0.1 1.4 1.1 1.4 ...
##  $ cons.price.idx: num  92.9 94 94.5 93.2 94.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -42 -37.5 -37.5 -42 -42.7 -36.4 -41.8 ...
##  $ euribor3m     : num  1.313 4.855 4.962 4.191 0.884 ...
##  $ nr.employed   : num  5099 5191 5228 5196 4964 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int  4 8 10 11 19 21 25 28 29 32 ...
##   ..- attr(*, "names")= chr  "4" "8" "10" "11" ...

Final Number of clean observations in the dataset, n = 3090
Predictors, p = 20

Summary

# Get the summary of the clean dataset
summary(bmktg_clean)

##       age                   job          marital                   education   
##  Min.   :20.00   admin.       :854   divorced: 348   basic.4y           : 243  
##  1st Qu.:31.00   technician   :573   married :1791   basic.6y           : 150  
##  Median :37.00   blue-collar  :554   single  : 951   basic.9y           : 407  
##  Mean   :39.18   services     :276                   high.school        : 728  
##  3rd Qu.:46.00   management   :265                   illiterate         :   1  
##  Max.   :88.00   self-employed:126                   professional.course: 454  
##                  (Other)      :442                   university.degree  :1107  
##  default    housing     loan           contact         month     day_of_week
##  no :3089   no :1402   no :2583   cellular :2108   may    :981   fri:580    
##  yes:   1   yes:1688   yes: 507   telephone: 982   jul    :514   mon:642    
##                                                    aug    :495   thu:630    
##                                                    nov    :387   tue:613    
##                                                    jun    :365   wed:625    
##                                                    apr    :169              
##                                                    (Other):179              
##     duration         campaign          pdays          previous     
##  Min.   :   0.0   Min.   : 1.000   Min.   :  0.0   Min.   :0.0000  
##  1st Qu.: 104.0   1st Qu.: 1.000   1st Qu.:999.0   1st Qu.:0.0000  
##  Median : 181.0   Median : 2.000   Median :999.0   Median :0.0000  
##  Mean   : 259.2   Mean   : 2.509   Mean   :953.4   Mean   :0.2081  
##  3rd Qu.: 315.0   3rd Qu.: 3.000   3rd Qu.:999.0   3rd Qu.:0.0000  
##  Max.   :3643.0   Max.   :35.000   Max.   :999.0   Max.   :6.0000  
##                                                                    
##         poutcome     emp.var.rate     cons.price.idx  cons.conf.idx   
##  failure    : 360   Min.   :-3.4000   Min.   :92.20   Min.   :-50.80  
##  nonexistent:2602   1st Qu.:-1.8000   1st Qu.:93.08   1st Qu.:-42.70  
##  success    : 128   Median : 1.1000   Median :93.44   Median :-41.80  
##                     Mean   :-0.0468   Mean   :93.53   Mean   :-40.62  
##                     3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40  
##                     Max.   : 1.4000   Max.   :94.77   Max.   :-26.90  
##                                                                       
##    euribor3m      nr.employed     y       
##  Min.   :0.635   Min.   :4964   no :2720  
##  1st Qu.:1.313   1st Qu.:5099   yes: 370  
##  Median :4.856   Median :5191             
##  Mean   :3.482   Mean   :5161             
##  3rd Qu.:4.961   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228             
##

This summary indicates the following:
default variable has only 1 observation for the value of ‘yes’.
educator factor variable has only 1 observation for the value of ‘illiterate’.

Correlation

# Get the numeric varaibles and the response variable into a dataframe
mydata <- (bmktg_clean[,c(1,11:14, 16:21)])

# convert the factor response variable into numeric
mydata$y <- as.numeric(bmktg_clean[,c(21)])

# Get the correlation matrix
chart.Correlation(mydata, histogram=TRUE, pch=19)

Correlation matrix also showed that Nr.employed , Euribor3m and Emp.var.rate are highly correlated to the response variable and with each other.
Also, cons.price.idx is correlated with emp.var.rate.

Collinearity

# Run a base glm fit with all variables
glm.fit.all <- glm(y ~., data = bmktg_clean,family = binomial)

# Run vif function to determine collinearity
car::vif(glm.fit.all)

##                      GVIF Df GVIF^(1/(2*Df))
## age              2.292123  1        1.513976
## job              7.661960 10        1.107177
## marital          1.556327  2        1.116928
## education        3.669268  6        1.114418
## default          1.000001  1        1.000001
## housing          1.061807  1        1.030440
## loan             1.046895  1        1.023179
## contact          2.421407  1        1.556087
## month           96.423476  9        1.288939
## day_of_week      1.209362  4        1.024046
## duration         1.342813  1        1.158798
## campaign         1.097849  1        1.047783
## pdays           10.083901  1        3.175516
## previous         4.064878  1        2.016154
## poutcome        22.775383  2        2.184572
## emp.var.rate   128.514810  1       11.336437
## cons.price.idx  69.843243  1        8.357227
## cons.conf.idx    5.867293  1        2.422249
## euribor3m      137.127604  1       11.710150
## nr.employed    179.550903  1       13.399661

Above vif values show that nr.employed , euribor3m , emp.var.rate and cons.price.idx are collinear.
Correlation matrix also showed that nr.employed , euribor3m , emp.var.rate are highly correlated to the response variable and with each other.
Remove the euribor3m and emp.var.rate and run the fit to see the collinearity.

# Run the glm fit after removing the collinear variables - euribor3m and emp.var.rate
glm.fit.few <- glm(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed, data = bmktg_clean,family = binomial)

# Run vif function to determine collinearity
car::vif(glm.fit.few)

##                     GVIF Df GVIF^(1/(2*Df))
## age             2.277849  1        1.509255
## job             7.381590 10        1.105115
## marital         1.551172  2        1.116002
## education       3.625370  6        1.113301
## default         1.000001  1        1.000001
## housing         1.061526  1        1.030304
## loan            1.045110  1        1.022306
## contact         2.031111  1        1.425170
## month           7.212320  9        1.116017
## day_of_week     1.181188  4        1.021033
## duration        1.338770  1        1.157052
## campaign        1.093436  1        1.045675
## pdays           9.994310  1        3.161378
## previous        4.121105  1        2.030050
## poutcome       22.539229  2        2.178887
## cons.price.idx  2.139354  1        1.462653
## cons.conf.idx   2.754270  1        1.659599
## nr.employed     2.259336  1        1.503109

Now the collinearity issue is resolved.

Validate linearity assumption for Logistic regression

# Predict the probability (p) of logistic regression fit ran above
probabilities <- predict(glm.fit.few, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)

##     1     2     3     5     6     7 
## "neg" "neg" "neg" "neg" "neg" "neg"

# Select only numeric predictors
mydata2 <- bmktg_clean %>%
  dplyr::select_if(is.numeric) 
mydata2$emp.var.rate <- NULL
mydata2$euribor3m <- NULL
predictors <- colnames(mydata2)
# Bind the logit and tidying the data for plot
mydata2 <- mydata2 %>%
  mutate(logit = log(probabilities/(1-probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)

# Create the scatter plots
ggplot(mydata2, aes(logit, predictor.value))+
  geom_point(size = 0.5, alpha = 0.5) +
  geom_smooth(method = "loess") + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_y")

## `geom_smooth()` using formula 'y ~ x'

plot(glm.fit.few, which=4, id.n=3)

# Extract model results
glm.fit.few.data <- augment(glm.fit.few) %>% 
  mutate(index = 1:n()) 
glm.fit.few.data %>% top_n(3, .cooksd)

## # A tibble: 3 x 28
##   .rownames y       age job   marital education default housing loan  contact
##   <chr>     <fct> <int> <fct> <fct>   <fct>     <fct>   <fct>   <fct> <fct>  
## 1 1191      no       56 unem… married professi… no      no      no    cellul…
## 2 1547      no       46 admi… divorc… high.sch… no      yes     no    teleph…
## 3 2123      no       31 admi… single  universi… no      no      yes   cellul…
## # … with 18 more variables: month <fct>, day_of_week <fct>, duration <int>,
## #   campaign <int>, pdays <int>, previous <int>, poutcome <fct>,
## #   cons.price.idx <dbl>, cons.conf.idx <dbl>, nr.employed <dbl>,
## #   .fitted <dbl>, .se.fit <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## #   .cooksd <dbl>, .std.resid <dbl>, index <int>

ggplot(glm.fit.few.data, aes(index, .std.resid)) + 
  geom_point(aes(color = y), alpha = .5) +
  theme_bw()

## Warning: Removed 2 rows containing missing values (geom_point).

glm.fit.few.data %>% 
  filter(abs(.std.resid) > 3)

## # A tibble: 5 x 28
##   .rownames y       age job   marital education default housing loan  contact
##   <chr>     <fct> <int> <fct> <fct>   <fct>     <fct>   <fct>   <fct> <fct>  
## 1 1191      no       56 unem… married professi… no      no      no    cellul…
## 2 1311      no       50 tech… married universi… no      yes     yes   cellul…
## 3 1547      no       46 admi… divorc… high.sch… no      yes     no    teleph…
## 4 2502      yes      59 admi… married universi… no      no      no    teleph…
## 5 4110      no       63 reti… married high.sch… no      no      no    cellul…
## # … with 18 more variables: month <fct>, day_of_week <fct>, duration <int>,
## #   campaign <int>, pdays <int>, previous <int>, poutcome <fct>,
## #   cons.price.idx <dbl>, cons.conf.idx <dbl>, nr.employed <dbl>,
## #   .fitted <dbl>, .se.fit <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## #   .cooksd <dbl>, .std.resid <dbl>, index <int>

Cross Validation train control and set seed

# set the seed
set.seed(12345)

# set train control parameter for 10-fold cross validation
train.control <- trainControl(method = "cv", number = 10)

GLM - Fit1 - Logistic fit with 10-fold Cross Validation using Caret package (all except collinear ones)

# Logistic regression with 
glm.fit <- train(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed, data = bmktg_clean, method = "glm",family = binomial, trControl = train.control)

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading

summary(glm.fit)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.5666  -0.3107  -0.1818  -0.1122   3.1221  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   7.484e+01  1.612e+01   4.643 3.43e-06 ***
## age                           4.701e-03  9.087e-03   0.517 0.604932    
## `jobblue-collar`              5.341e-02  3.115e-01   0.171 0.863861    
## jobentrepreneur              -6.020e-01  5.429e-01  -1.109 0.267520    
## jobhousemaid                  5.659e-01  4.942e-01   1.145 0.252190    
## jobmanagement                -3.125e-01  3.109e-01  -1.005 0.314849    
## jobretired                   -5.146e-02  3.900e-01  -0.132 0.895029    
## `jobself-employed`           -4.857e-01  4.313e-01  -1.126 0.260046    
## jobservices                  -7.559e-02  3.302e-01  -0.229 0.818958    
## jobstudent                    2.746e-01  4.566e-01   0.601 0.547683    
## jobtechnician                 2.443e-01  2.429e-01   1.006 0.314505    
## jobunemployed                 3.807e-01  4.337e-01   0.878 0.380075    
## maritalmarried                2.889e-01  2.685e-01   1.076 0.281828    
## maritalsingle                 2.908e-01  3.015e-01   0.964 0.334876    
## educationbasic.6y             4.445e-01  5.020e-01   0.886 0.375829    
## educationbasic.9y             3.786e-01  3.842e-01   0.985 0.324451    
## educationhigh.school          3.977e-01  3.674e-01   1.083 0.278985    
## educationilliterate          -1.119e+01  5.354e+02  -0.021 0.983319    
## educationprofessional.course  2.892e-01  3.902e-01   0.741 0.458705    
## educationuniversity.degree    5.248e-01  3.674e-01   1.429 0.153138    
## defaultyes                   -8.968e+00  5.354e+02  -0.017 0.986636    
## housingyes                    4.614e-02  1.520e-01   0.304 0.761422    
## loanyes                      -1.227e-01  2.076e-01  -0.591 0.554357    
## contacttelephone             -9.835e-01  2.875e-01  -3.421 0.000623 ***
## monthaug                      2.122e-02  3.947e-01   0.054 0.957125    
## monthdec                     -1.115e-01  6.764e-01  -0.165 0.869070    
## monthjul                     -6.436e-02  3.929e-01  -0.164 0.869886    
## monthjun                      1.019e+00  3.557e-01   2.864 0.004180 ** 
## monthmar                      1.889e+00  4.610e-01   4.098 4.18e-05 ***
## monthmay                     -6.220e-01  3.023e-01  -2.058 0.039628 *  
## monthnov                     -5.263e-01  3.662e-01  -1.437 0.150715    
## monthoct                     -9.848e-02  4.653e-01  -0.212 0.832372    
## monthsep                     -6.504e-01  4.874e-01  -1.334 0.182073    
## day_of_weekmon                2.928e-01  2.357e-01   1.243 0.214009    
## day_of_weekthu                3.163e-01  2.369e-01   1.335 0.181834    
## day_of_weektue                1.128e-01  2.447e-01   0.461 0.644872    
## day_of_weekwed                3.172e-01  2.489e-01   1.274 0.202504    
## duration                      4.999e-03  2.889e-04  17.303  < 2e-16 ***
## campaign                     -9.688e-02  5.138e-02  -1.885 0.059366 .  
## pdays                        -4.161e-04  6.953e-04  -0.598 0.549560    
## previous                      1.251e-01  1.894e-01   0.660 0.508952    
## poutcomenonexistent           5.672e-01  3.212e-01   1.766 0.077393 .  
## poutcomesuccess               1.449e+00  6.886e-01   2.105 0.035290 *  
## cons.price.idx               -1.084e-01  1.582e-01  -0.685 0.493367    
## cons.conf.idx                 4.085e-02  1.980e-02   2.064 0.039060 *  
## nr.employed                  -1.318e-02  1.262e-03 -10.441  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2264.4  on 3089  degrees of freedom
## Residual deviance: 1288.0  on 3044  degrees of freedom
## AIC: 1380
## 
## Number of Fisher Scoring iterations: 12

# GLM.fit1 Accuracy 91.17%
pred.glm <- predict(glm.fit, bmktg_clean)
confusionMatrix(pred.glm, bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2646  199
##        yes   74  171
##                                           
##                Accuracy : 0.9117          
##                  95% CI : (0.9011, 0.9214)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 1.293e-08       
##                                           
##                   Kappa : 0.5093          
##                                           
##  Mcnemar's Test P-Value : 6.151e-14       
##                                           
##             Sensitivity : 0.9728          
##             Specificity : 0.4622          
##          Pos Pred Value : 0.9301          
##          Neg Pred Value : 0.6980          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8563          
##    Detection Prevalence : 0.9207          
##       Balanced Accuracy : 0.7175          
##                                           
##        'Positive' Class : no              
##

The above GLM fit gave warning as “prediction from a rank-deficient fit may be misleading”. This is triggered by two categorical varaibles having rows with one particular category which have only one y value.
The summary of dataset indicates the following:
default variable has only 1 observation for the value of ‘yes’.
education factor variable has only 1 observation for the value of ‘illiterate’.
Lets run the glm again removing those two rows.

GLM fit2 - Logistic fit with 10-fold CV using Caret package (all except collinear and rank deficient ones)

# Logistic regression with with 10-fold CV after removing default and education predictors
glm.fit2 <- train(y~age+job+marital+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed, data = bmktg_clean, method = "glm",family = binomial, trControl = train.control)
summary(glm.fit2)

## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.5691  -0.3121  -0.1841  -0.1136   3.1674  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         74.6440453 16.0963305   4.637 3.53e-06 ***
## age                  0.0023283  0.0088376   0.263 0.792204    
## `jobblue-collar`    -0.1023401  0.2462143  -0.416 0.677663    
## jobentrepreneur     -0.6828173  0.5448344  -1.253 0.210112    
## jobhousemaid         0.3914196  0.4785156   0.818 0.413365    
## jobmanagement       -0.2942730  0.3091786  -0.952 0.341204    
## jobretired          -0.1857310  0.3817778  -0.486 0.626620    
## `jobself-employed`  -0.4845239  0.4272992  -1.134 0.256827    
## jobservices         -0.1573227  0.3095477  -0.508 0.611289    
## jobstudent           0.1608898  0.4368502   0.368 0.712653    
## jobtechnician        0.1511583  0.2187764   0.691 0.489612    
## jobunemployed        0.2643250  0.4250602   0.622 0.534038    
## maritalmarried       0.2897844  0.2652000   1.093 0.274525    
## maritalsingle        0.2966331  0.2976717   0.997 0.319002    
## housingyes           0.0618943  0.1513724   0.409 0.682622    
## loanyes             -0.1333712  0.2075180  -0.643 0.520421    
## contacttelephone    -1.0014048  0.2874700  -3.484 0.000495 ***
## monthaug             0.0290765  0.3946493   0.074 0.941268    
## monthdec            -0.0991375  0.6773748  -0.146 0.883641    
## monthjul            -0.0670965  0.3925219  -0.171 0.864273    
## monthjun             1.0270046  0.3555020   2.889 0.003866 ** 
## monthmar             1.8895100  0.4604023   4.104 4.06e-05 ***
## monthmay            -0.6191274  0.3017346  -2.052 0.040180 *  
## monthnov            -0.5227512  0.3657874  -1.429 0.152972    
## monthoct            -0.1132380  0.4656910  -0.243 0.807881    
## monthsep            -0.6777440  0.4873570  -1.391 0.164331    
## day_of_weekmon       0.2827689  0.2356207   1.200 0.230100    
## day_of_weekthu       0.3251656  0.2363573   1.376 0.168903    
## day_of_weektue       0.0977748  0.2439433   0.401 0.688560    
## day_of_weekwed       0.3156885  0.2481429   1.272 0.203300    
## duration             0.0049885  0.0002884  17.297  < 2e-16 ***
## campaign            -0.0970685  0.0512967  -1.892 0.058452 .  
## pdays               -0.0004138  0.0006935  -0.597 0.550705    
## previous             0.1389808  0.1889813   0.735 0.462083    
## poutcomenonexistent  0.5863134  0.3203550   1.830 0.067220 .  
## poutcomesuccess      1.4400390  0.6867091   2.097 0.035992 *  
## cons.price.idx      -0.0989363  0.1578038  -0.627 0.530687    
## cons.conf.idx        0.0423288  0.0198000   2.138 0.032531 *  
## nr.employed         -0.0131979  0.0012591 -10.482  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2264.4  on 3089  degrees of freedom
## Residual deviance: 1290.8  on 3051  degrees of freedom
## AIC: 1368.8
## 
## Number of Fisher Scoring iterations: 7

#GLM.fit2 Accuracy 91.17%
pred.glm2 <- predict(glm.fit2, bmktg_clean)
confusionMatrix(pred.glm2, bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2647  200
##        yes   73  170
##                                           
##                Accuracy : 0.9117          
##                  95% CI : (0.9011, 0.9214)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 1.293e-08       
##                                           
##                   Kappa : 0.5079          
##                                           
##  Mcnemar's Test P-Value : 2.424e-14       
##                                           
##             Sensitivity : 0.9732          
##             Specificity : 0.4595          
##          Pos Pred Value : 0.9298          
##          Neg Pred Value : 0.6996          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8566          
##    Detection Prevalence : 0.9214          
##       Balanced Accuracy : 0.7163          
##                                           
##        'Positive' Class : no              
##

Random Forests without the variables - , Euribor3m and Emp.var.rate and duration

# Run random forests without the variables - Euribor3m and Emp.var.rate and duration
# accuracy - 90.71%
rf.fit = train(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed,
               data=bmktg_clean,
               method='rf',
               trControl=train.control,
               importance=TRUE)
rf.fit$besttune

## NULL

varImp(rf.fit)

## rf variable importance
## 
##   only 20 most important variables shown (out of 44)
## 
##                     Importance
## nr.employed             100.00
## pdays                    78.71
## poutcomesuccess          76.72
## monthmar                 66.91
## previous                 51.29
## contacttelephone         49.94
## cons.conf.idx            49.67
## poutcomenonexistent      46.66
## cons.price.idx           39.77
## age                      39.03
## monthoct                 36.91
## monthmay                 35.62
## jobstudent               30.00
## monthjun                 29.56
## monthdec                 27.30
## monthjul                 26.10
## monthnov                 24.70
## monthsep                 21.90
## jobtechnician            21.65
## maritalsingle            19.74

plot(varImp(rf.fit))

pred.rf<-predict(rf.fit,bmktg_clean)
confusionMatrix(pred.rf,bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2718  284
##        yes    2   86
##                                           
##                Accuracy : 0.9074          
##                  95% CI : (0.8967, 0.9174)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 8.473e-07       
##                                           
##                   Kappa : 0.3454          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9993          
##             Specificity : 0.2324          
##          Pos Pred Value : 0.9054          
##          Neg Pred Value : 0.9773          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8796          
##    Detection Prevalence : 0.9715          
##       Balanced Accuracy : 0.6158          
##                                           
##        'Positive' Class : no              
##

# get variable importance of top 14
plot(varImp(rf.fit),14)

KNN fit - all variables except Euribor3m, Emp.var.rate (collinear) and default and education (rank deficient)

# KNN fit with all variables except Euribor3m, Emp.var.rate (collinear) and default and education (rank deficient)
# accuracy - 91.04%
knn.fit <- 
  train(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed,
                 data=bmktg_clean,
                 method='knn',
                 trControl = train.control,
                 tuneLength=20)
pred.knn.fit <- predict(knn.fit, bmktg_clean)
confusionMatrix(pred.knn.fit,bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2638  192
##        yes   82  178
##                                           
##                Accuracy : 0.9113          
##                  95% CI : (0.9007, 0.9211)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 1.825e-08       
##                                           
##                   Kappa : 0.5174          
##                                           
##  Mcnemar's Test P-Value : 4.551e-11       
##                                           
##             Sensitivity : 0.9699          
##             Specificity : 0.4811          
##          Pos Pred Value : 0.9322          
##          Neg Pred Value : 0.6846          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8537          
##    Detection Prevalence : 0.9159          
##       Balanced Accuracy : 0.7255          
##                                           
##        'Positive' Class : no              
##

KNN.fit1 = 91.17%

# KNN.fit1 with significant precitors - accuracy = 91.17%
set.seed(12345)
knn.fit1 <- 
  train(y~duration+nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,
                 data=bmktg_clean,
                 method='knn',
                 trControl = train.control,
                 tuneLength=20)
pred.knn.fit1 <- predict(knn.fit1, bmktg_clean)
confusionMatrix(pred.knn.fit1,bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2638  191
##        yes   82  179
##                                           
##                Accuracy : 0.9117          
##                  95% CI : (0.9011, 0.9214)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 1.293e-08       
##                                           
##                   Kappa : 0.5198          
##                                           
##  Mcnemar's Test P-Value : 6.299e-11       
##                                           
##             Sensitivity : 0.9699          
##             Specificity : 0.4838          
##          Pos Pred Value : 0.9325          
##          Neg Pred Value : 0.6858          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8537          
##    Detection Prevalence : 0.9155          
##       Balanced Accuracy : 0.7268          
##                                           
##        'Positive' Class : no              
##

glm.fit - with significant parameters accuracy = 91.23%

# glm.fit4 - with significant parameters accuracy = 91.23%
glm.fit4 <- train(y~duration+nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,data=bmktg_clean,method='glm',trControl = train.control)
pred.glm.fit4 <- predict(glm.fit4, bmktg_clean)
confusionMatrix(pred.glm.fit4,bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2649  200
##        yes   71  170
##                                          
##                Accuracy : 0.9123         
##                  95% CI : (0.9018, 0.922)
##     No Information Rate : 0.8803         
##     P-Value [Acc > NIR] : 6.409e-09      
##                                          
##                   Kappa : 0.5102         
##                                          
##  Mcnemar's Test P-Value : 7.518e-15      
##                                          
##             Sensitivity : 0.9739         
##             Specificity : 0.4595         
##          Pos Pred Value : 0.9298         
##          Neg Pred Value : 0.7054         
##              Prevalence : 0.8803         
##          Detection Rate : 0.8573         
##    Detection Prevalence : 0.9220         
##       Balanced Accuracy : 0.7167         
##                                          
##        'Positive' Class : no             
##

GLM fit w/o duration and with significant params

# GLM fit w/o duration and with significant params ; accuracy = 89.45%
glm.fit5 <- train(y~nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,data=bmktg_clean,method='glm',trControl = train.control)
pred.glm.fit5 <- predict(glm.fit5, bmktg_clean)
confusionMatrix(pred.glm.fit5,bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2675  281
##        yes   45   89
##                                           
##                Accuracy : 0.8945          
##                  95% CI : (0.8831, 0.9051)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 0.007218        
##                                           
##                   Kappa : 0.3092          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9835          
##             Specificity : 0.2405          
##          Pos Pred Value : 0.9049          
##          Neg Pred Value : 0.6642          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8657          
##    Detection Prevalence : 0.9566          
##       Balanced Accuracy : 0.6120          
##                                           
##        'Positive' Class : no              
##

KNN fit - w/o duration and with significant params

knn.fit2 <- 
  train(y~nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,
                 data=bmktg_clean,
                 method='knn',
                 trControl = train.control,
                 tuneLength=20)
pred.knn.fit2 <- predict(knn.fit2, bmktg_clean)
confusionMatrix(pred.knn.fit2,bmktg_clean$y)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  2693  299
##        yes   27   71
##                                           
##                Accuracy : 0.8945          
##                  95% CI : (0.8831, 0.9051)
##     No Information Rate : 0.8803          
##     P-Value [Acc > NIR] : 0.007218        
##                                           
##                   Kappa : 0.2666          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9901          
##             Specificity : 0.1919          
##          Pos Pred Value : 0.9001          
##          Neg Pred Value : 0.7245          
##              Prevalence : 0.8803          
##          Detection Rate : 0.8715          
##    Detection Prevalence : 0.9683          
##       Balanced Accuracy : 0.5910          
##                                           
##        'Positive' Class : no              
##

Accuracy_all <- cbind(c(91.23, 91.17,100),c(89.45,89.45,90.71))
colnames(Accuracy_all) <- c("Accuracy w duration","Accuracy w/o duration")
rownames(Accuracy_all) <- c("GLM","KNN","Random Forests")
Accuracy_all

##                Accuracy w duration Accuracy w/o duration
## GLM                          91.23                 89.45
## KNN                          91.17                 89.45
## Random Forests              100.00                 90.71