bank_data <- read.table("bank-additional-full.csv", head = TRUE, sep = ";", stringsAsFactors=T)
#Table headers
names(bank_data)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
summary(bank_data)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24928
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## education default housing loan
## university.degree :12168 no :32588 no :18622 no :33950
## high.school : 9515 unknown: 8597 unknown: 990 unknown: 990
## basic.9y : 6045 yes : 3 yes :21576 yes : 6248
## professional.course: 5243
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## contact month day_of_week duration
## cellular :26144 may :13769 fri:7827 Min. : 0.0
## telephone:15044 jul : 7174 mon:8514 1st Qu.: 102.0
## aug : 6178 thu:8623 Median : 180.0
## jun : 5318 tue:8090 Mean : 258.3
## nov : 4101 wed:8134 3rd Qu.: 319.0
## apr : 2632 Max. :4918.0
## (Other): 2016
## campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.000 failure : 4252
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000 nonexistent:35563
## Median : 2.000 Median :999.0 Median :0.000 success : 1373
## Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :56.000 Max. :999.0 Max. :7.000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.40000 Min. :92.20 Min. :-50.8 Min. :0.634
## 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344
## Median : 1.10000 Median :93.75 Median :-41.8 Median :4.857
## Mean : 0.08189 Mean :93.58 Mean :-40.5 Mean :3.621
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961
## Max. : 1.40000 Max. :94.77 Max. :-26.9 Max. :5.045
##
## nr.employed y
## Min. :4964 no :36548
## 1st Qu.:5099 yes: 4640
## Median :5191
## Mean :5167
## 3rd Qu.:5228
## Max. :5228
##
#Number of observations
nrow(bank_data)
## [1] 41188
glm.fit1 <- glm(y ~ age + job + marital + education + default + housing + loan + month + duration + cons.price.idx + cons.conf.idx + euribor3m, data = bank_data, family = binomial)
summary(glm.fit1)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## housing + loan + month + duration + cons.price.idx + cons.conf.idx +
## euribor3m, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0552 -0.3189 -0.1941 -0.1371 3.2645
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.670e+01 3.664e+00 -12.746 < 2e-16 ***
## age 1.365e-03 2.363e-03 0.578 0.563477
## jobblue-collar -2.946e-01 7.819e-02 -3.767 0.000165 ***
## jobentrepreneur -2.263e-01 1.236e-01 -1.830 0.067214 .
## jobhousemaid -3.627e-02 1.421e-01 -0.255 0.798533
## jobmanagement -7.199e-02 8.305e-02 -0.867 0.386038
## jobretired 2.662e-01 1.038e-01 2.565 0.010317 *
## jobself-employed -2.290e-01 1.159e-01 -1.975 0.048229 *
## jobservices -1.892e-01 8.402e-02 -2.252 0.024314 *
## jobstudent 2.629e-01 1.071e-01 2.455 0.014098 *
## jobtechnician -3.095e-02 6.900e-02 -0.449 0.653737
## jobunemployed 5.409e-02 1.224e-01 0.442 0.658686
## jobunknown 1.075e-01 2.278e-01 0.472 0.636879
## maritalmarried 2.387e-02 6.676e-02 0.358 0.720685
## maritalsingle 9.119e-02 7.621e-02 1.196 0.231519
## maritalunknown 1.010e-01 4.038e-01 0.250 0.802465
## educationbasic.6y 1.330e-01 1.180e-01 1.127 0.259843
## educationbasic.9y -1.268e-03 9.326e-02 -0.014 0.989153
## educationhigh.school 1.717e-02 8.964e-02 0.192 0.848104
## educationilliterate 1.163e+00 7.215e-01 1.613 0.106824
## educationprofessional.course 9.849e-02 9.858e-02 0.999 0.317785
## educationuniversity.degree 1.865e-01 8.969e-02 2.080 0.037539 *
## educationunknown 1.020e-01 1.166e-01 0.875 0.381714
## defaultunknown -3.343e-01 6.640e-02 -5.035 4.78e-07 ***
## defaultyes -7.443e+00 1.136e+02 -0.066 0.947736
## housingunknown -9.494e-02 1.361e-01 -0.697 0.485597
## housingyes 7.138e-04 4.018e-02 0.018 0.985825
## loanunknown NA NA NA NA
## loanyes -4.671e-02 5.566e-02 -0.839 0.401372
## monthaug 5.316e-01 9.401e-02 5.654 1.57e-08 ***
## monthdec 5.226e-01 1.856e-01 2.815 0.004875 **
## monthjul 5.582e-01 8.813e-02 6.333 2.40e-10 ***
## monthjun 4.322e-01 8.807e-02 4.908 9.20e-07 ***
## monthmar 1.533e+00 1.160e-01 13.218 < 2e-16 ***
## monthmay -6.660e-01 7.212e-02 -9.235 < 2e-16 ***
## monthnov 3.485e-01 9.278e-02 3.756 0.000173 ***
## monthoct 5.841e-01 1.152e-01 5.072 3.94e-07 ***
## monthsep 2.825e-01 1.240e-01 2.279 0.022649 *
## duration 4.684e-03 7.342e-05 63.789 < 2e-16 ***
## cons.price.idx 5.011e-01 4.003e-02 12.517 < 2e-16 ***
## cons.conf.idx 4.190e-02 4.543e-03 9.223 < 2e-16 ***
## euribor3m -7.920e-01 1.599e-02 -49.534 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 17895 on 41147 degrees of freedom
## AIC: 17977
##
## Number of Fisher Scoring iterations: 10
glm.fit2 <- glm(y ~ age + job + duration + cons.price.idx + cons.conf.idx + euribor3m, data = bank_data, family = binomial)
summary(glm.fit2)
##
## Call:
## glm(formula = y ~ age + job + duration + cons.price.idx + cons.conf.idx +
## euribor3m, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.8961 -0.3573 -0.1986 -0.1442 3.2024
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.913e+01 3.321e+00 -17.806 < 2e-16 ***
## age -1.078e-03 2.066e-03 -0.522 0.60184
## jobblue-collar -6.286e-01 6.196e-02 -10.146 < 2e-16 ***
## jobentrepreneur -3.447e-01 1.199e-01 -2.876 0.00403 **
## jobhousemaid -1.446e-01 1.336e-01 -1.082 0.27911
## jobmanagement -8.095e-02 8.003e-02 -1.011 0.31178
## jobretired 2.986e-01 9.759e-02 3.060 0.00222 **
## jobself-employed -2.101e-01 1.122e-01 -1.872 0.06119 .
## jobservices -4.360e-01 7.768e-02 -5.612 2e-08 ***
## jobstudent 2.398e-01 9.911e-02 2.419 0.01555 *
## jobtechnician -4.929e-02 6.017e-02 -0.819 0.41270
## jobunemployed 1.653e-02 1.177e-01 0.140 0.88837
## jobunknown -1.984e-02 2.183e-01 -0.091 0.92761
## duration 4.554e-03 7.119e-05 63.963 < 2e-16 ***
## cons.price.idx 6.542e-01 3.614e-02 18.101 < 2e-16 ***
## cons.conf.idx 7.566e-02 3.468e-03 21.819 < 2e-16 ***
## euribor3m -8.139e-01 1.428e-02 -56.990 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 18671 on 41171 degrees of freedom
## AIC: 18705
##
## Number of Fisher Scoring iterations: 6
In order to speed up the loading process, I have commented out the pairs charts.
#pairs(data = bank_data, y ~ age + job + duration + cons.price.idx + cons.conf.idx + euribor3m ,col = bank_data$y)
glm.probs <- predict(glm.fit2, type = "response")
glm.probs[1: 5]
## 1 2 3 4 5
## 0.03329431 0.01520298 0.02190812 0.02394977 0.03075755
glm.pred <- ifelse(glm.probs > 0.5, "yes", "no")
attach(bank_data)
table(glm.pred, y)
## y
## glm.pred no yes
## no 35684 3068
## yes 864 1572
mean(glm.pred == y)
## [1] 0.9045353
mean(bank_data$age)
## [1] 40.02406
train <- age < 40
glm.fit <- glm(y ~ age + job + duration + cons.price.idx + cons.conf.idx + euribor3m, data = bank_data, family = binomial, subset = train)
summary(glm.fit)
##
## Call:
## glm(formula = y ~ age + job + duration + cons.price.idx + cons.conf.idx +
## euribor3m, family = binomial, data = bank_data, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.9360 -0.3775 -0.2109 -0.1493 3.2194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.182e+01 4.405e+00 -11.763 < 2e-16 ***
## age -3.587e-02 6.108e-03 -5.873 4.28e-09 ***
## jobblue-collar -6.250e-01 8.118e-02 -7.699 1.37e-14 ***
## jobentrepreneur -3.057e-01 1.710e-01 -1.787 0.0739 .
## jobhousemaid -3.745e-01 2.530e-01 -1.480 0.1388
## jobmanagement 1.014e-01 1.128e-01 0.899 0.3688
## jobretired -1.430e+00 1.081e+00 -1.323 0.1860
## jobself-employed -4.784e-02 1.439e-01 -0.333 0.7395
## jobservices -4.300e-01 9.581e-02 -4.488 7.17e-06 ***
## jobstudent 4.317e-02 1.077e-01 0.401 0.6884
## jobtechnician 6.212e-02 7.362e-02 0.844 0.3988
## jobunemployed 2.304e-01 1.512e-01 1.523 0.1277
## jobunknown -2.890e-01 3.833e-01 -0.754 0.4509
## duration 4.570e-03 9.459e-05 48.317 < 2e-16 ***
## cons.price.idx 5.856e-01 4.768e-02 12.283 < 2e-16 ***
## cons.conf.idx 7.369e-02 4.571e-03 16.121 < 2e-16 ***
## euribor3m -7.731e-01 1.902e-02 -40.654 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16286 on 22606 degrees of freedom
## Residual deviance: 10712 on 22590 degrees of freedom
## AIC: 10746
##
## Number of Fisher Scoring iterations: 6
glm.probs <- predict(glm.fit, newdata = bank_data[!train,], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "yes", "no")
y.40 <- bank_data$y[!train]
table(glm.pred, y.40)
## y.40
## glm.pred no yes
## no 16402 1674
## yes 176 329
mean(glm.pred == y.40)
## [1] 0.9004359
train <- age < 40
glm.fit <- glm(y ~ age + job + cons.price.idx + euribor3m, data = bank_data, family = binomial, subset = train)
summary(glm.fit)
##
## Call:
## glm(formula = y ~ age + job + cons.price.idx + euribor3m, family = binomial,
## data = bank_data, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3730 -0.5305 -0.3343 -0.2779 2.7382
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -43.624120 3.968962 -10.991 < 2e-16 ***
## age -0.029067 0.005286 -5.498 3.83e-08 ***
## jobblue-collar -0.593823 0.068267 -8.699 < 2e-16 ***
## jobentrepreneur -0.259193 0.148287 -1.748 0.08048 .
## jobhousemaid -0.455672 0.227020 -2.007 0.04473 *
## jobmanagement 0.024516 0.098196 0.250 0.80285
## jobretired -1.381458 1.025140 -1.348 0.17779
## jobself-employed -0.102575 0.123676 -0.829 0.40689
## jobservices -0.395164 0.082839 -4.770 1.84e-06 ***
## jobstudent 0.249572 0.093979 2.656 0.00792 **
## jobtechnician 0.019501 0.063557 0.307 0.75897
## jobunemployed 0.163193 0.131178 1.244 0.21348
## jobunknown 0.145939 0.309415 0.472 0.63717
## cons.price.idx 0.474215 0.042652 11.118 < 2e-16 ***
## euribor3m -0.565727 0.015600 -36.264 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 16286 on 22606 degrees of freedom
## Residual deviance: 14195 on 22592 degrees of freedom
## AIC: 14225
##
## Number of Fisher Scoring iterations: 5
glm.probs <- predict(glm.fit, newdata = bank_data[!train,], type = "response")
glm.pred <- ifelse(glm.probs > 0.5, "yes", "no")
y.40 <- bank_data$y[!train]
table(glm.pred, y.40)
## y.40
## glm.pred no yes
## no 16578 2003
mean(glm.pred == y.40)
## [1] 0.8922017