# Load the libraries
library("PerformanceAnalytics")
library(caret)
library(readr)
library(car)
library(dplyr)
library(tidyverse)
library(broom)
# Read csv file. Use header=TRUE fr header; Use Sep = ";" for the data separator
# Use StringAsFactors to convert character strings to Factors
# Convert all missing values (saved as "unknown" when using read.csv) into NA so that omit function can be used on both Factor and numeric values
bmktg <- read.csv("bank-additional.csv", header=TRUE, sep=";", na.strings = "unknown" , stringsAsFactors = T)
# Get the structure of the dataset
str(bmktg)
## 'data.frame': 4119 obs. of 21 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 2 8 8 8 1 8 1 3 8 2 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 2 3 3 2 1 2 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 3 4 4 3 7 7 7 7 6 3 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 NA 1 NA ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 2 NA 2 1 2 2 1 1 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 NA 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 2 1 1 1 1 1 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 5 8 10 10 8 8 7 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 1 2 3 2 2 4 3 ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 2 1 2 2 1 2 ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
Number of observations, n = 4119
Predictors variables, p = 20
1 response variable, y
# Get the summary
summary(bmktg)
## age job marital education
## Min. :18.00 admin. :1012 divorced: 446 university.degree :1264
## 1st Qu.:32.00 blue-collar: 884 married :2509 high.school : 921
## Median :38.00 technician : 691 single :1153 basic.9y : 574
## Mean :40.11 services : 393 NA's : 11 professional.course: 535
## 3rd Qu.:47.00 management : 324 basic.4y : 429
## Max. :88.00 (Other) : 776 (Other) : 229
## NA's : 39 NA's : 167
## default housing loan contact month
## no :3315 no :1839 no :3349 cellular :2652 may :1378
## yes : 1 yes :2175 yes : 665 telephone:1467 jul : 711
## NA's: 803 NA's: 105 NA's: 105 aug : 636
## jun : 530
## nov : 446
## apr : 215
## (Other): 203
## day_of_week duration campaign pdays previous
## fri:768 Min. : 0.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## mon:855 1st Qu.: 103.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## thu:860 Median : 181.0 Median : 2.000 Median :999.0 Median :0.0000
## tue:841 Mean : 256.8 Mean : 2.537 Mean :960.4 Mean :0.1903
## wed:795 3rd Qu.: 317.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :3643.0 Max. :35.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 454 Min. :-3.40000 Min. :92.20 Min. :-50.8
## nonexistent:3523 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7
## success : 142 Median : 1.10000 Median :93.75 Median :-41.8
## Mean : 0.08497 Mean :93.58 Mean :-40.5
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4
## Max. : 1.40000 Max. :94.77 Max. :-26.9
##
## euribor3m nr.employed y
## Min. :0.635 Min. :4964 no :3668
## 1st Qu.:1.334 1st Qu.:5099 yes: 451
## Median :4.857 Median :5191
## Mean :3.621 Mean :5166
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
It is confirmed that read.csv function’s NA.strings parameter converted the factor variables null values to NA.
# Remove the null values using na.omit function
bmktg_clean <- na.omit(bmktg)
# Get the structure again to know the final n
str(bmktg_clean)
## 'data.frame': 3090 obs. of 21 variables:
## $ age : int 30 39 25 47 32 32 31 36 36 47 ...
## $ job : Factor w/ 11 levels "admin.","blue-collar",..: 2 8 8 1 8 1 8 7 1 2 ...
## $ marital : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 3 1 3 2 2 ...
## $ education : Factor w/ 7 levels "basic.4y","basic.6y",..: 3 4 4 7 7 7 6 1 4 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ housing : Factor w/ 2 levels "no","yes": 2 1 2 2 1 2 1 1 1 2 ...
## $ loan : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ contact : Factor w/ 2 levels "cellular","telephone": 1 2 2 1 1 1 1 1 2 2 ...
## $ month : Factor w/ 10 levels "apr","aug","dec",..: 7 7 5 8 10 10 8 4 7 5 ...
## $ day_of_week : Factor w/ 5 levels "fri","mon","thu",..: 1 1 5 2 3 2 4 3 5 3 ...
## $ duration : int 487 346 227 58 128 290 68 148 97 211 ...
## $ campaign : int 2 4 1 1 3 4 1 1 2 2 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 2 0 1 0 0 0 ...
## $ poutcome : Factor w/ 3 levels "failure","nonexistent",..: 2 2 2 2 1 2 1 2 2 2 ...
## $ emp.var.rate : num -1.8 1.1 1.4 -0.1 -1.1 -1.1 -0.1 1.4 1.1 1.4 ...
## $ cons.price.idx: num 92.9 94 94.5 93.2 94.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -42 -37.5 -37.5 -42 -42.7 -36.4 -41.8 ...
## $ euribor3m : num 1.313 4.855 4.962 4.191 0.884 ...
## $ nr.employed : num 5099 5191 5228 5196 4964 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int 4 8 10 11 19 21 25 28 29 32 ...
## ..- attr(*, "names")= chr "4" "8" "10" "11" ...
Final Number of clean observations in the dataset, n = 3090
Predictors, p = 20
# Get the summary of the clean dataset
summary(bmktg_clean)
## age job marital education
## Min. :20.00 admin. :854 divorced: 348 basic.4y : 243
## 1st Qu.:31.00 technician :573 married :1791 basic.6y : 150
## Median :37.00 blue-collar :554 single : 951 basic.9y : 407
## Mean :39.18 services :276 high.school : 728
## 3rd Qu.:46.00 management :265 illiterate : 1
## Max. :88.00 self-employed:126 professional.course: 454
## (Other) :442 university.degree :1107
## default housing loan contact month day_of_week
## no :3089 no :1402 no :2583 cellular :2108 may :981 fri:580
## yes: 1 yes:1688 yes: 507 telephone: 982 jul :514 mon:642
## aug :495 thu:630
## nov :387 tue:613
## jun :365 wed:625
## apr :169
## (Other):179
## duration campaign pdays previous
## Min. : 0.0 Min. : 1.000 Min. : 0.0 Min. :0.0000
## 1st Qu.: 104.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.0000
## Median : 181.0 Median : 2.000 Median :999.0 Median :0.0000
## Mean : 259.2 Mean : 2.509 Mean :953.4 Mean :0.2081
## 3rd Qu.: 315.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :3643.0 Max. :35.000 Max. :999.0 Max. :6.0000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 360 Min. :-3.4000 Min. :92.20 Min. :-50.80
## nonexistent:2602 1st Qu.:-1.8000 1st Qu.:93.08 1st Qu.:-42.70
## success : 128 Median : 1.1000 Median :93.44 Median :-41.80
## Mean :-0.0468 Mean :93.53 Mean :-40.62
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
##
## euribor3m nr.employed y
## Min. :0.635 Min. :4964 no :2720
## 1st Qu.:1.313 1st Qu.:5099 yes: 370
## Median :4.856 Median :5191
## Mean :3.482 Mean :5161
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
This summary indicates the following:
default variable has only 1 observation for the value of ‘yes’.
educator factor variable has only 1 observation for the value of ‘illiterate’.
# Get the numeric varaibles and the response variable into a dataframe
mydata <- (bmktg_clean[,c(1,11:14, 16:21)])
# convert the factor response variable into numeric
mydata$y <- as.numeric(bmktg_clean[,c(21)])
# Get the correlation matrix
chart.Correlation(mydata, histogram=TRUE, pch=19)
Correlation matrix also showed that Nr.employed , Euribor3m and Emp.var.rate are highly correlated to the response variable and with each other.
Also, cons.price.idx is correlated with emp.var.rate.
# Run a base glm fit with all variables
glm.fit.all <- glm(y ~., data = bmktg_clean,family = binomial)
# Run vif function to determine collinearity
car::vif(glm.fit.all)
## GVIF Df GVIF^(1/(2*Df))
## age 2.292123 1 1.513976
## job 7.661960 10 1.107177
## marital 1.556327 2 1.116928
## education 3.669268 6 1.114418
## default 1.000001 1 1.000001
## housing 1.061807 1 1.030440
## loan 1.046895 1 1.023179
## contact 2.421407 1 1.556087
## month 96.423476 9 1.288939
## day_of_week 1.209362 4 1.024046
## duration 1.342813 1 1.158798
## campaign 1.097849 1 1.047783
## pdays 10.083901 1 3.175516
## previous 4.064878 1 2.016154
## poutcome 22.775383 2 2.184572
## emp.var.rate 128.514810 1 11.336437
## cons.price.idx 69.843243 1 8.357227
## cons.conf.idx 5.867293 1 2.422249
## euribor3m 137.127604 1 11.710150
## nr.employed 179.550903 1 13.399661
Above vif values show that nr.employed , euribor3m , emp.var.rate and cons.price.idx are collinear.
Correlation matrix also showed that nr.employed , euribor3m , emp.var.rate are highly correlated to the response variable and with each other.
Remove the euribor3m and emp.var.rate and run the fit to see the collinearity.
# Run the glm fit after removing the collinear variables - euribor3m and emp.var.rate
glm.fit.few <- glm(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed, data = bmktg_clean,family = binomial)
# Run vif function to determine collinearity
car::vif(glm.fit.few)
## GVIF Df GVIF^(1/(2*Df))
## age 2.277849 1 1.509255
## job 7.381590 10 1.105115
## marital 1.551172 2 1.116002
## education 3.625370 6 1.113301
## default 1.000001 1 1.000001
## housing 1.061526 1 1.030304
## loan 1.045110 1 1.022306
## contact 2.031111 1 1.425170
## month 7.212320 9 1.116017
## day_of_week 1.181188 4 1.021033
## duration 1.338770 1 1.157052
## campaign 1.093436 1 1.045675
## pdays 9.994310 1 3.161378
## previous 4.121105 1 2.030050
## poutcome 22.539229 2 2.178887
## cons.price.idx 2.139354 1 1.462653
## cons.conf.idx 2.754270 1 1.659599
## nr.employed 2.259336 1 1.503109
Now the collinearity issue is resolved.
# Predict the probability (p) of logistic regression fit ran above
probabilities <- predict(glm.fit.few, type = "response")
predicted.classes <- ifelse(probabilities > 0.5, "pos", "neg")
head(predicted.classes)
## 1 2 3 5 6 7
## "neg" "neg" "neg" "neg" "neg" "neg"
# Select only numeric predictors
mydata2 <- bmktg_clean %>%
dplyr::select_if(is.numeric)
mydata2$emp.var.rate <- NULL
mydata2$euribor3m <- NULL
predictors <- colnames(mydata2)
# Bind the logit and tidying the data for plot
mydata2 <- mydata2 %>%
mutate(logit = log(probabilities/(1-probabilities))) %>%
gather(key = "predictors", value = "predictor.value", -logit)
# Create the scatter plots
ggplot(mydata2, aes(logit, predictor.value))+
geom_point(size = 0.5, alpha = 0.5) +
geom_smooth(method = "loess") +
theme_bw() +
facet_wrap(~predictors, scales = "free_y")
## `geom_smooth()` using formula 'y ~ x'
plot(glm.fit.few, which=4, id.n=3)
# Extract model results
glm.fit.few.data <- augment(glm.fit.few) %>%
mutate(index = 1:n())
glm.fit.few.data %>% top_n(3, .cooksd)
## # A tibble: 3 x 28
## .rownames y age job marital education default housing loan contact
## <chr> <fct> <int> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1191 no 56 unem… married professi… no no no cellul…
## 2 1547 no 46 admi… divorc… high.sch… no yes no teleph…
## 3 2123 no 31 admi… single universi… no no yes cellul…
## # … with 18 more variables: month <fct>, day_of_week <fct>, duration <int>,
## # campaign <int>, pdays <int>, previous <int>, poutcome <fct>,
## # cons.price.idx <dbl>, cons.conf.idx <dbl>, nr.employed <dbl>,
## # .fitted <dbl>, .se.fit <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>, index <int>
ggplot(glm.fit.few.data, aes(index, .std.resid)) +
geom_point(aes(color = y), alpha = .5) +
theme_bw()
## Warning: Removed 2 rows containing missing values (geom_point).
glm.fit.few.data %>%
filter(abs(.std.resid) > 3)
## # A tibble: 5 x 28
## .rownames y age job marital education default housing loan contact
## <chr> <fct> <int> <fct> <fct> <fct> <fct> <fct> <fct> <fct>
## 1 1191 no 56 unem… married professi… no no no cellul…
## 2 1311 no 50 tech… married universi… no yes yes cellul…
## 3 1547 no 46 admi… divorc… high.sch… no yes no teleph…
## 4 2502 yes 59 admi… married universi… no no no teleph…
## 5 4110 no 63 reti… married high.sch… no no no cellul…
## # … with 18 more variables: month <fct>, day_of_week <fct>, duration <int>,
## # campaign <int>, pdays <int>, previous <int>, poutcome <fct>,
## # cons.price.idx <dbl>, cons.conf.idx <dbl>, nr.employed <dbl>,
## # .fitted <dbl>, .se.fit <dbl>, .resid <dbl>, .hat <dbl>, .sigma <dbl>,
## # .cooksd <dbl>, .std.resid <dbl>, index <int>
# set the seed
set.seed(12345)
# set train control parameter for 10-fold cross validation
train.control <- trainControl(method = "cv", number = 10)
# Logistic regression with
glm.fit <- train(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed, data = bmktg_clean, method = "glm",family = binomial, trControl = train.control)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
summary(glm.fit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.5666 -0.3107 -0.1818 -0.1122 3.1221
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.484e+01 1.612e+01 4.643 3.43e-06 ***
## age 4.701e-03 9.087e-03 0.517 0.604932
## `jobblue-collar` 5.341e-02 3.115e-01 0.171 0.863861
## jobentrepreneur -6.020e-01 5.429e-01 -1.109 0.267520
## jobhousemaid 5.659e-01 4.942e-01 1.145 0.252190
## jobmanagement -3.125e-01 3.109e-01 -1.005 0.314849
## jobretired -5.146e-02 3.900e-01 -0.132 0.895029
## `jobself-employed` -4.857e-01 4.313e-01 -1.126 0.260046
## jobservices -7.559e-02 3.302e-01 -0.229 0.818958
## jobstudent 2.746e-01 4.566e-01 0.601 0.547683
## jobtechnician 2.443e-01 2.429e-01 1.006 0.314505
## jobunemployed 3.807e-01 4.337e-01 0.878 0.380075
## maritalmarried 2.889e-01 2.685e-01 1.076 0.281828
## maritalsingle 2.908e-01 3.015e-01 0.964 0.334876
## educationbasic.6y 4.445e-01 5.020e-01 0.886 0.375829
## educationbasic.9y 3.786e-01 3.842e-01 0.985 0.324451
## educationhigh.school 3.977e-01 3.674e-01 1.083 0.278985
## educationilliterate -1.119e+01 5.354e+02 -0.021 0.983319
## educationprofessional.course 2.892e-01 3.902e-01 0.741 0.458705
## educationuniversity.degree 5.248e-01 3.674e-01 1.429 0.153138
## defaultyes -8.968e+00 5.354e+02 -0.017 0.986636
## housingyes 4.614e-02 1.520e-01 0.304 0.761422
## loanyes -1.227e-01 2.076e-01 -0.591 0.554357
## contacttelephone -9.835e-01 2.875e-01 -3.421 0.000623 ***
## monthaug 2.122e-02 3.947e-01 0.054 0.957125
## monthdec -1.115e-01 6.764e-01 -0.165 0.869070
## monthjul -6.436e-02 3.929e-01 -0.164 0.869886
## monthjun 1.019e+00 3.557e-01 2.864 0.004180 **
## monthmar 1.889e+00 4.610e-01 4.098 4.18e-05 ***
## monthmay -6.220e-01 3.023e-01 -2.058 0.039628 *
## monthnov -5.263e-01 3.662e-01 -1.437 0.150715
## monthoct -9.848e-02 4.653e-01 -0.212 0.832372
## monthsep -6.504e-01 4.874e-01 -1.334 0.182073
## day_of_weekmon 2.928e-01 2.357e-01 1.243 0.214009
## day_of_weekthu 3.163e-01 2.369e-01 1.335 0.181834
## day_of_weektue 1.128e-01 2.447e-01 0.461 0.644872
## day_of_weekwed 3.172e-01 2.489e-01 1.274 0.202504
## duration 4.999e-03 2.889e-04 17.303 < 2e-16 ***
## campaign -9.688e-02 5.138e-02 -1.885 0.059366 .
## pdays -4.161e-04 6.953e-04 -0.598 0.549560
## previous 1.251e-01 1.894e-01 0.660 0.508952
## poutcomenonexistent 5.672e-01 3.212e-01 1.766 0.077393 .
## poutcomesuccess 1.449e+00 6.886e-01 2.105 0.035290 *
## cons.price.idx -1.084e-01 1.582e-01 -0.685 0.493367
## cons.conf.idx 4.085e-02 1.980e-02 2.064 0.039060 *
## nr.employed -1.318e-02 1.262e-03 -10.441 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2264.4 on 3089 degrees of freedom
## Residual deviance: 1288.0 on 3044 degrees of freedom
## AIC: 1380
##
## Number of Fisher Scoring iterations: 12
# GLM.fit1 Accuracy 91.17%
pred.glm <- predict(glm.fit, bmktg_clean)
confusionMatrix(pred.glm, bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2646 199
## yes 74 171
##
## Accuracy : 0.9117
## 95% CI : (0.9011, 0.9214)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 1.293e-08
##
## Kappa : 0.5093
##
## Mcnemar's Test P-Value : 6.151e-14
##
## Sensitivity : 0.9728
## Specificity : 0.4622
## Pos Pred Value : 0.9301
## Neg Pred Value : 0.6980
## Prevalence : 0.8803
## Detection Rate : 0.8563
## Detection Prevalence : 0.9207
## Balanced Accuracy : 0.7175
##
## 'Positive' Class : no
##
The above GLM fit gave warning as “prediction from a rank-deficient fit may be misleading”. This is triggered by two categorical varaibles having rows with one particular category which have only one y value.
The summary of dataset indicates the following:
default variable has only 1 observation for the value of ‘yes’.
education factor variable has only 1 observation for the value of ‘illiterate’.
Lets run the glm again removing those two rows.
# Logistic regression with with 10-fold CV after removing default and education predictors
glm.fit2 <- train(y~age+job+marital+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed, data = bmktg_clean, method = "glm",family = binomial, trControl = train.control)
summary(glm.fit2)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.5691 -0.3121 -0.1841 -0.1136 3.1674
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 74.6440453 16.0963305 4.637 3.53e-06 ***
## age 0.0023283 0.0088376 0.263 0.792204
## `jobblue-collar` -0.1023401 0.2462143 -0.416 0.677663
## jobentrepreneur -0.6828173 0.5448344 -1.253 0.210112
## jobhousemaid 0.3914196 0.4785156 0.818 0.413365
## jobmanagement -0.2942730 0.3091786 -0.952 0.341204
## jobretired -0.1857310 0.3817778 -0.486 0.626620
## `jobself-employed` -0.4845239 0.4272992 -1.134 0.256827
## jobservices -0.1573227 0.3095477 -0.508 0.611289
## jobstudent 0.1608898 0.4368502 0.368 0.712653
## jobtechnician 0.1511583 0.2187764 0.691 0.489612
## jobunemployed 0.2643250 0.4250602 0.622 0.534038
## maritalmarried 0.2897844 0.2652000 1.093 0.274525
## maritalsingle 0.2966331 0.2976717 0.997 0.319002
## housingyes 0.0618943 0.1513724 0.409 0.682622
## loanyes -0.1333712 0.2075180 -0.643 0.520421
## contacttelephone -1.0014048 0.2874700 -3.484 0.000495 ***
## monthaug 0.0290765 0.3946493 0.074 0.941268
## monthdec -0.0991375 0.6773748 -0.146 0.883641
## monthjul -0.0670965 0.3925219 -0.171 0.864273
## monthjun 1.0270046 0.3555020 2.889 0.003866 **
## monthmar 1.8895100 0.4604023 4.104 4.06e-05 ***
## monthmay -0.6191274 0.3017346 -2.052 0.040180 *
## monthnov -0.5227512 0.3657874 -1.429 0.152972
## monthoct -0.1132380 0.4656910 -0.243 0.807881
## monthsep -0.6777440 0.4873570 -1.391 0.164331
## day_of_weekmon 0.2827689 0.2356207 1.200 0.230100
## day_of_weekthu 0.3251656 0.2363573 1.376 0.168903
## day_of_weektue 0.0977748 0.2439433 0.401 0.688560
## day_of_weekwed 0.3156885 0.2481429 1.272 0.203300
## duration 0.0049885 0.0002884 17.297 < 2e-16 ***
## campaign -0.0970685 0.0512967 -1.892 0.058452 .
## pdays -0.0004138 0.0006935 -0.597 0.550705
## previous 0.1389808 0.1889813 0.735 0.462083
## poutcomenonexistent 0.5863134 0.3203550 1.830 0.067220 .
## poutcomesuccess 1.4400390 0.6867091 2.097 0.035992 *
## cons.price.idx -0.0989363 0.1578038 -0.627 0.530687
## cons.conf.idx 0.0423288 0.0198000 2.138 0.032531 *
## nr.employed -0.0131979 0.0012591 -10.482 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2264.4 on 3089 degrees of freedom
## Residual deviance: 1290.8 on 3051 degrees of freedom
## AIC: 1368.8
##
## Number of Fisher Scoring iterations: 7
#GLM.fit2 Accuracy 91.17%
pred.glm2 <- predict(glm.fit2, bmktg_clean)
confusionMatrix(pred.glm2, bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2647 200
## yes 73 170
##
## Accuracy : 0.9117
## 95% CI : (0.9011, 0.9214)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 1.293e-08
##
## Kappa : 0.5079
##
## Mcnemar's Test P-Value : 2.424e-14
##
## Sensitivity : 0.9732
## Specificity : 0.4595
## Pos Pred Value : 0.9298
## Neg Pred Value : 0.6996
## Prevalence : 0.8803
## Detection Rate : 0.8566
## Detection Prevalence : 0.9214
## Balanced Accuracy : 0.7163
##
## 'Positive' Class : no
##
# Run random forests without the variables - Euribor3m and Emp.var.rate and duration
# accuracy - 90.71%
rf.fit = train(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed,
data=bmktg_clean,
method='rf',
trControl=train.control,
importance=TRUE)
rf.fit$besttune
## NULL
varImp(rf.fit)
## rf variable importance
##
## only 20 most important variables shown (out of 44)
##
## Importance
## nr.employed 100.00
## pdays 78.71
## poutcomesuccess 76.72
## monthmar 66.91
## previous 51.29
## contacttelephone 49.94
## cons.conf.idx 49.67
## poutcomenonexistent 46.66
## cons.price.idx 39.77
## age 39.03
## monthoct 36.91
## monthmay 35.62
## jobstudent 30.00
## monthjun 29.56
## monthdec 27.30
## monthjul 26.10
## monthnov 24.70
## monthsep 21.90
## jobtechnician 21.65
## maritalsingle 19.74
plot(varImp(rf.fit))
pred.rf<-predict(rf.fit,bmktg_clean)
confusionMatrix(pred.rf,bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2718 284
## yes 2 86
##
## Accuracy : 0.9074
## 95% CI : (0.8967, 0.9174)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 8.473e-07
##
## Kappa : 0.3454
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9993
## Specificity : 0.2324
## Pos Pred Value : 0.9054
## Neg Pred Value : 0.9773
## Prevalence : 0.8803
## Detection Rate : 0.8796
## Detection Prevalence : 0.9715
## Balanced Accuracy : 0.6158
##
## 'Positive' Class : no
##
# get variable importance of top 14
plot(varImp(rf.fit),14)
# KNN fit with all variables except Euribor3m, Emp.var.rate (collinear) and default and education (rank deficient)
# accuracy - 91.04%
knn.fit <-
train(y~age+job+marital+education+default+housing+loan+contact+month+day_of_week+duration+campaign+pdays+previous+poutcome+cons.price.idx+cons.conf.idx+nr.employed,
data=bmktg_clean,
method='knn',
trControl = train.control,
tuneLength=20)
pred.knn.fit <- predict(knn.fit, bmktg_clean)
confusionMatrix(pred.knn.fit,bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2638 192
## yes 82 178
##
## Accuracy : 0.9113
## 95% CI : (0.9007, 0.9211)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 1.825e-08
##
## Kappa : 0.5174
##
## Mcnemar's Test P-Value : 4.551e-11
##
## Sensitivity : 0.9699
## Specificity : 0.4811
## Pos Pred Value : 0.9322
## Neg Pred Value : 0.6846
## Prevalence : 0.8803
## Detection Rate : 0.8537
## Detection Prevalence : 0.9159
## Balanced Accuracy : 0.7255
##
## 'Positive' Class : no
##
# KNN.fit1 with significant precitors - accuracy = 91.17%
set.seed(12345)
knn.fit1 <-
train(y~duration+nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,
data=bmktg_clean,
method='knn',
trControl = train.control,
tuneLength=20)
pred.knn.fit1 <- predict(knn.fit1, bmktg_clean)
confusionMatrix(pred.knn.fit1,bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2638 191
## yes 82 179
##
## Accuracy : 0.9117
## 95% CI : (0.9011, 0.9214)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 1.293e-08
##
## Kappa : 0.5198
##
## Mcnemar's Test P-Value : 6.299e-11
##
## Sensitivity : 0.9699
## Specificity : 0.4838
## Pos Pred Value : 0.9325
## Neg Pred Value : 0.6858
## Prevalence : 0.8803
## Detection Rate : 0.8537
## Detection Prevalence : 0.9155
## Balanced Accuracy : 0.7268
##
## 'Positive' Class : no
##
# glm.fit4 - with significant parameters accuracy = 91.23%
glm.fit4 <- train(y~duration+nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,data=bmktg_clean,method='glm',trControl = train.control)
pred.glm.fit4 <- predict(glm.fit4, bmktg_clean)
confusionMatrix(pred.glm.fit4,bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2649 200
## yes 71 170
##
## Accuracy : 0.9123
## 95% CI : (0.9018, 0.922)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 6.409e-09
##
## Kappa : 0.5102
##
## Mcnemar's Test P-Value : 7.518e-15
##
## Sensitivity : 0.9739
## Specificity : 0.4595
## Pos Pred Value : 0.9298
## Neg Pred Value : 0.7054
## Prevalence : 0.8803
## Detection Rate : 0.8573
## Detection Prevalence : 0.9220
## Balanced Accuracy : 0.7167
##
## 'Positive' Class : no
##
# GLM fit w/o duration and with significant params ; accuracy = 89.45%
glm.fit5 <- train(y~nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,data=bmktg_clean,method='glm',trControl = train.control)
pred.glm.fit5 <- predict(glm.fit5, bmktg_clean)
confusionMatrix(pred.glm.fit5,bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2675 281
## yes 45 89
##
## Accuracy : 0.8945
## 95% CI : (0.8831, 0.9051)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 0.007218
##
## Kappa : 0.3092
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9835
## Specificity : 0.2405
## Pos Pred Value : 0.9049
## Neg Pred Value : 0.6642
## Prevalence : 0.8803
## Detection Rate : 0.8657
## Detection Prevalence : 0.9566
## Balanced Accuracy : 0.6120
##
## 'Positive' Class : no
##
knn.fit2 <-
train(y~nr.employed+pdays+poutcome+month+cons.conf.idx+cons.price.idx+contact+age+job+previous+loan+campaign+day_of_week,
data=bmktg_clean,
method='knn',
trControl = train.control,
tuneLength=20)
pred.knn.fit2 <- predict(knn.fit2, bmktg_clean)
confusionMatrix(pred.knn.fit2,bmktg_clean$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 2693 299
## yes 27 71
##
## Accuracy : 0.8945
## 95% CI : (0.8831, 0.9051)
## No Information Rate : 0.8803
## P-Value [Acc > NIR] : 0.007218
##
## Kappa : 0.2666
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9901
## Specificity : 0.1919
## Pos Pred Value : 0.9001
## Neg Pred Value : 0.7245
## Prevalence : 0.8803
## Detection Rate : 0.8715
## Detection Prevalence : 0.9683
## Balanced Accuracy : 0.5910
##
## 'Positive' Class : no
##
Accuracy_all <- cbind(c(91.23, 91.17,100),c(89.45,89.45,90.71))
colnames(Accuracy_all) <- c("Accuracy w duration","Accuracy w/o duration")
rownames(Accuracy_all) <- c("GLM","KNN","Random Forests")
Accuracy_all
## Accuracy w duration Accuracy w/o duration
## GLM 91.23 89.45
## KNN 91.17 89.45
## Random Forests 100.00 90.71