library(ggplot2)
# Load the semicolon-separated data set, using its first row as column names.
# Character columns are read as factors; the summaries and glm() fits below
# work on factor levels.
# `header` is spelled out (no partial argument matching) and TRUE replaces the
# reassignable shorthand T.
bank_data <- read.table(
  "bank-additional-full.csv",
  header = TRUE,
  sep = ";",
  stringsAsFactors = TRUE
)
# Column names of the loaded data frame
colnames(bank_data)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
# Per-column overview: quantiles and mean for numeric columns, level counts
# for factor columns.
summary(bank_data)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24928
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## education default housing loan
## university.degree :12168 no :32588 no :18622 no :33950
## high.school : 9515 unknown: 8597 unknown: 990 unknown: 990
## basic.9y : 6045 yes : 3 yes :21576 yes : 6248
## professional.course: 5243
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## contact month day_of_week duration
## cellular :26144 may :13769 fri:7827 Min. : 0.0
## telephone:15044 jul : 7174 mon:8514 1st Qu.: 102.0
## aug : 6178 thu:8623 Median : 180.0
## jun : 5318 tue:8090 Mean : 258.3
## nov : 4101 wed:8134 3rd Qu.: 319.0
## apr : 2632 Max. :4918.0
## (Other): 2016
## campaign pdays previous poutcome
## Min. : 1.000 Min. : 0.0 Min. :0.000 failure : 4252
## 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000 nonexistent:35563
## Median : 2.000 Median :999.0 Median :0.000 success : 1373
## Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :56.000 Max. :999.0 Max. :7.000
##
## emp.var.rate cons.price.idx cons.conf.idx euribor3m
## Min. :-3.40000 Min. :92.20 Min. :-50.8 Min. :0.634
## 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.344
## Median : 1.10000 Median :93.75 Median :-41.8 Median :4.857
## Mean : 0.08189 Mean :93.58 Mean :-40.5 Mean :3.621
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961
## Max. : 1.40000 Max. :94.77 Max. :-26.9 Max. :5.045
##
## nr.employed y
## Min. :4964 no :36548
## 1st Qu.:5099 yes: 4640
## Median :5191
## Mean :5167
## 3rd Qu.:5228
## Max. :5228
##
# Number of observations (rows of the data frame)
dim(bank_data)[1]
## [1] 41188
# Distribution of age: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(age)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$age)
summary(bank_data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.00 32.00 38.00 40.02 47.00 98.00
# Distribution of duration: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(duration)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$duration)
summary(bank_data$duration)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 102.0 180.0 258.3 319.0 4918.0
# Distribution of campaign: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(campaign)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$campaign)
summary(bank_data$campaign)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.568 3.000 56.000
# Distribution of pdays: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
# NOTE(review): the quartiles, median and maximum are all 999, which looks
# like a sentinel value rather than a real day count - confirm.
ggplot(bank_data, aes(pdays)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$pdays)
summary(bank_data$pdays)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 999.0 999.0 962.5 999.0 999.0
# Distribution of previous: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(previous)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$previous)
summary(bank_data$previous)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.173 0.000 7.000
# Distribution of emp.var.rate: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(emp.var.rate)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$emp.var.rate)
summary(bank_data$emp.var.rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.40000 -1.80000 1.10000 0.08189 1.40000 1.40000
# Distribution of cons.price.idx: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(cons.price.idx)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$cons.price.idx)
summary(bank_data$cons.price.idx)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92.20 93.08 93.75 93.58 93.99 94.77
# Distribution of cons.conf.idx: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(cons.conf.idx)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$cons.conf.idx)
summary(bank_data$cons.conf.idx)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -50.8 -42.7 -41.8 -40.5 -36.4 -26.9
# Distribution of euribor3m: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(euribor3m)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$euribor3m)
summary(bank_data$euribor3m)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.634 1.344 4.857 3.621 4.961 5.045
# Distribution of nr.employed: histogram, boxplot and numeric summary.
# bins = 30 is ggplot2's default, stated explicitly so the plot is unchanged
# while the "Pick better value with `binwidth`" message is no longer emitted.
ggplot(bank_data, aes(nr.employed)) +
  geom_histogram(bins = 30, color = "darkgreen", fill = "lightgreen") +
  theme_classic()
boxplot(bank_data$nr.employed)
summary(bank_data$nr.employed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4964 5099 5191 5167 5228 5228
# Level counts of the response y: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = y)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["y"]])
## no yes
## 36548 4640
# Level counts of job: bar chart (small axis text so the labels fit)
# followed by the count table.
ggplot(data = bank_data, mapping = aes(x = job)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic() +
  theme(axis.text = element_text(size = 6))
# Disabled alternative: rotate the x labels instead of shrinking them.
# + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
summary(bank_data[["job"]])
## admin. blue-collar entrepreneur housemaid management
## 10422 9254 1456 1060 2924
## retired self-employed services student technician
## 1720 1421 3969 875 6743
## unemployed unknown
## 1014 330
# Level counts of marital: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = marital)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["marital"]])
## divorced married single unknown
## 4612 24928 11568 80
# Level counts of education: bar chart (small axis text so the labels fit)
# followed by the count table.
ggplot(data = bank_data, mapping = aes(x = education)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic() +
  theme(axis.text = element_text(size = 6))
summary(bank_data[["education"]])
## basic.4y basic.6y basic.9y high.school
## 4176 2292 6045 9515
## illiterate professional.course university.degree unknown
## 18 5243 12168 1731
# Level counts of default: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = default)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["default"]])
## no unknown yes
## 32588 8597 3
# Level counts of housing: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = housing)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["housing"]])
## no unknown yes
## 18622 990 21576
# Level counts of loan: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = loan)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["loan"]])
## no unknown yes
## 33950 990 6248
# Level counts of contact: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = contact)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["contact"]])
## cellular telephone
## 26144 15044
# Level counts of month: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = month)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["month"]])
## apr aug dec jul jun mar may nov oct sep
## 2632 6178 182 7174 5318 546 13769 4101 718 570
# Level counts of day_of_week: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = day_of_week)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["day_of_week"]])
## fri mon thu tue wed
## 7827 8514 8623 8090 8134
# Level counts of poutcome: bar chart followed by the count table.
ggplot(data = bank_data, mapping = aes(x = poutcome)) +
  geom_bar(fill = "lightgreen", colour = "darkgreen", width = 0.2) +
  theme_classic()
summary(bank_data[["poutcome"]])
## failure nonexistent success
## 4252 35563 1373
# After the univariate analysis, the bivariate analysis compares the "y" variable against each of the other variables in turn.
# Single-predictor logistic regression of the binary response y on age.
glm_mdl <- glm(formula = y ~ age, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ age, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6137 -0.5006 -0.4799 -0.4678 2.1780
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.426596 0.061402 -39.520 < 2e-16 ***
## age 0.008976 0.001456 6.165 7.04e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28961 on 41186 degrees of freedom
## AIC: 28965
##
## Number of Fisher Scoring iterations: 4
coef(glm_mdl)
## (Intercept) age
## -2.426596232 0.008975997
# as.numeric(y) - 1 turns the factor levels "no"/"yes" into 0/1 for plotting.
# Disabled alternative that relabels the levels in place instead:
# levels(bank_data$y) <- c(0, 1)
# levels(bank_data$y)
# Observed 0/1 outcome against age with the fitted logistic curve.
ggplot(bank_data, aes(x = age, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on duration.
glm_mdl <- glm(formula = y ~ duration, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ duration, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.4201 -0.4261 -0.3492 -0.3067 2.5296
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.293e+00 2.827e-02 -116.47 <2e-16 ***
## duration 3.656e-03 5.852e-05 62.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 24106 on 41186 degrees of freedom
## AIC: 24110
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) duration
## -3.293169519 0.003656366
# Observed 0/1 outcome against duration with the fitted logistic curve.
ggplot(bank_data, aes(x = duration, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on campaign.
glm_mdl <- glm(formula = y ~ campaign, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ campaign, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5275 -0.5275 -0.4974 -0.4416 3.0566
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.776717 0.024716 -71.89 <2e-16 ***
## campaign -0.125440 0.009192 -13.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28748 on 41186 degrees of freedom
## AIC: 28752
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) campaign
## -1.7767172 -0.1254402
# Observed 0/1 outcome against campaign with the fitted logistic curve.
ggplot(bank_data, aes(x = campaign, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on pdays.
# NOTE(review): most rows have pdays == 999 (median and quartiles are 999 in
# the summary), which looks like a sentinel rather than a day count - confirm
# before interpreting the slope.
glm_mdl <- glm(formula = y ~ pdays, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ pdays, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4340 -0.4408 -0.4408 -0.4408 2.1816
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5854465 0.0537958 10.88 <2e-16 ***
## pdays -0.0028708 0.0000566 -50.72 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 26458 on 41186 degrees of freedom
## AIC: 26462
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) pdays
## 0.585446486 -0.002870853
# Observed 0/1 outcome against pdays with the fitted logistic curve.
ggplot(bank_data, aes(x = pdays, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on previous.
glm_mdl <- glm(formula = y ~ previous, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ previous, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9526 -0.4339 -0.4339 -0.4339 2.1953
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.31558 0.01798 -128.77 <2e-16 ***
## previous 0.95166 0.02435 39.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 27479 on 41186 degrees of freedom
## AIC: 27483
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) previous
## -2.3155761 0.9516627
# Observed 0/1 outcome against previous with the fitted logistic curve.
ggplot(bank_data, aes(x = previous, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on
# emp.var.rate.
glm_mdl <- glm(formula = y ~ emp.var.rate, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ emp.var.rate, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0047 -0.4422 -0.3193 -0.2941 2.5150
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.33228 0.01939 -120.31 <2e-16 ***
## emp.var.rate -0.56222 0.01018 -55.25 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 25597 on 41186 degrees of freedom
## AIC: 25601
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) emp.var.rate
## -2.332277 -0.562221
# Observed 0/1 outcome against emp.var.rate with the fitted logistic curve.
ggplot(bank_data, aes(x = emp.var.rate, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on
# cons.price.idx.
glm_mdl <- glm(
  formula = y ~ cons.price.idx,
  family = binomial,
  data = bank_data
)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ cons.price.idx, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7516 -0.5386 -0.4192 -0.4081 2.4748
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 66.96533 2.53366 26.43 <2e-16 ***
## cons.price.idx -0.73844 0.02713 -27.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28242 on 41186 degrees of freedom
## AIC: 28246
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) cons.price.idx
## 66.9653338 -0.7384404
# Observed 0/1 outcome against cons.price.idx with the fitted logistic curve.
ggplot(bank_data, aes(x = cons.price.idx, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on
# cons.conf.idx.
glm_mdl <- glm(
  formula = y ~ cons.conf.idx,
  family = binomial,
  data = bank_data
)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ cons.conf.idx, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6139 -0.5221 -0.4738 -0.4402 2.2529
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.580449 0.133370 -4.352 1.35e-05 ***
## cons.conf.idx 0.036907 0.003321 11.114 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28876 on 41186 degrees of freedom
## AIC: 28880
##
## Number of Fisher Scoring iterations: 4
coef(glm_mdl)
## (Intercept) cons.conf.idx
## -0.58044912 0.03690707
# Observed 0/1 outcome against cons.conf.idx with the fitted logistic curve.
ggplot(bank_data, aes(x = cons.conf.idx, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on euribor3m.
glm_mdl <- glm(formula = y ~ euribor3m, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ euribor3m, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8568 -0.3730 -0.2997 -0.2917 2.5380
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.472940 0.027521 -17.18 <2e-16 ***
## euribor3m -0.536582 0.009547 -56.21 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 25343 on 41186 degrees of freedom
## AIC: 25347
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) euribor3m
## -0.4729395 -0.5365818
# Observed 0/1 outcome against euribor3m with the fitted logistic curve.
ggplot(bank_data, aes(x = euribor3m, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Single-predictor logistic regression of the binary response y on
# nr.employed.
glm_mdl <- glm(formula = y ~ nr.employed, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ nr.employed, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2898 -0.3512 -0.3405 -0.2762 2.5633
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 66.0827700 1.0646647 62.07 <2e-16 ***
## nr.employed -0.0132610 0.0002082 -63.69 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 24525 on 41186 degrees of freedom
## AIC: 24529
##
## Number of Fisher Scoring iterations: 5
coef(glm_mdl)
## (Intercept) nr.employed
## 66.08276995 -0.01326102
# Observed 0/1 outcome against nr.employed with the fitted logistic curve.
ggplot(bank_data, aes(x = nr.employed, y = as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE,
    color = "blue"
  )
## `geom_smooth()` using formula 'y ~ x'
# Logistic regression of the binary response y on the factor job
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ job, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ job, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8687 -0.5272 -0.4787 -0.3780 2.3128
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.90339 0.02915 -65.290 < 2e-16 ***
## jobblue-collar -0.69965 0.05033 -13.901 < 2e-16 ***
## jobentrepreneur -0.47077 0.09831 -4.789 1.68e-06 ***
## jobhousemaid -0.29384 0.10645 -2.760 0.00578 **
## jobmanagement -0.16533 0.06545 -2.526 0.01154 *
## jobretired 0.81714 0.06270 13.032 < 2e-16 ***
## jobself-employed -0.24101 0.09136 -2.638 0.00834 **
## jobservices -0.52035 0.06496 -8.010 1.15e-15 ***
## jobstudent 1.12323 0.07844 14.319 < 2e-16 ***
## jobtechnician -0.20525 0.04885 -4.202 2.65e-05 ***
## jobunemployed 0.10471 0.09457 1.107 0.26822
## jobunknown -0.16587 0.17689 -0.938 0.34840
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28187 on 41176 degrees of freedom
## AIC: 28211
##
## Number of Fisher Scoring iterations: 5
coefficients(glm_mdl)
## (Intercept) jobblue-collar jobentrepreneur jobhousemaid
## -1.9033873 -0.6996506 -0.4707680 -0.2938373
## jobmanagement jobretired jobself-employed jobservices
## -0.1653262 0.8171399 -0.2410122 -0.5203463
## jobstudent jobtechnician jobunemployed jobunknown
## 1.1232287 -0.2052473 0.1047074 -0.1658674
# Share of "no"/"yes" outcomes within each job level.
# job stays a factor on the x axis: its integer level codes follow an
# arbitrary ordering for a nominal variable, so a logistic curve drawn through
# as.numeric(job) would not correspond to the per-level model fitted above.
ggplot(bank_data, aes(x = job, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic() +
  theme(axis.text = element_text(size = 6))
# Logistic regression of the binary response y on the factor marital
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ marital, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ marital, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5701 -0.4668 -0.4628 -0.4628 2.1387
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.16207 0.04840 -44.673 < 2e-16 ***
## maritalmarried -0.01781 0.05274 -0.338 0.736
## maritalsingle 0.34712 0.05532 6.275 3.5e-10 ***
## maritalunknown 0.42747 0.31683 1.349 0.177
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28881 on 41184 degrees of freedom
## AIC: 28889
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) maritalmarried maritalsingle maritalunknown
## -2.1620666 -0.0178063 0.3471212 0.4274655
# Share of "no"/"yes" outcomes within each marital level.
# marital stays a factor on the x axis: its integer level codes follow an
# arbitrary ordering for a nominal variable, so a logistic curve drawn through
# as.numeric(marital) would not correspond to the per-level model fitted above.
ggplot(bank_data, aes(x = marital, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the factor education
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ education, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ education, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7090 -0.5434 -0.4789 -0.4137 2.2574
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.16985 0.05102 -42.528 < 2e-16 ***
## educationbasic.6y -0.24530 0.09164 -2.677 0.00743 **
## educationbasic.9y -0.29656 0.06998 -4.238 2.26e-05 ***
## educationhigh.school 0.06220 0.06075 1.024 0.30592
## educationilliterate 0.91709 0.56924 1.611 0.10716
## educationprofessional.course 0.11422 0.06708 1.703 0.08858 .
## educationuniversity.degree 0.33149 0.05742 5.773 7.79e-09 ***
## educationunknown 0.39551 0.08522 4.641 3.47e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28802 on 41180 degrees of freedom
## AIC: 28818
##
## Number of Fisher Scoring iterations: 5
coefficients(glm_mdl)
## (Intercept) educationbasic.6y
## -2.16985445 -0.24529916
## educationbasic.9y educationhigh.school
## -0.29655950 0.06220162
## educationilliterate educationprofessional.course
## 0.91709148 0.11422355
## educationuniversity.degree educationunknown
## 0.33149331 0.39551002
# Share of "no"/"yes" outcomes within each education level.
# education stays a factor on the x axis: its integer level codes do not form
# a meaningful numeric scale, so a logistic curve drawn through
# as.numeric(education) would not correspond to the per-level model fitted
# above.
ggplot(bank_data, aes(x = education, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic() +
  theme(axis.text = element_text(size = 6))
# Logistic regression of the binary response y on the factor default
# (one coefficient per non-reference level).
# NOTE(review): only 3 rows have default == "yes" in the level counts, which
# presumably explains the very large standard error recorded for defaultyes.
glm_mdl <- glm(formula = y ~ default, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ default, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5251 -0.5251 -0.5251 -0.3253 2.4354
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.91170 0.01654 -115.598 <2e-16 ***
## defaultunknown -1.00099 0.05151 -19.432 <2e-16 ***
## defaultyes -8.65433 68.97491 -0.125 0.9
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28523 on 41185 degrees of freedom
## AIC: 28529
##
## Number of Fisher Scoring iterations: 9
coefficients(glm_mdl)
## (Intercept) defaultunknown defaultyes
## -1.911702 -1.000992 -8.654326
# Share of "no"/"yes" outcomes within each default level.
# default stays a factor on the x axis: its integer level codes follow an
# arbitrary ordering for a nominal variable, so a logistic curve drawn through
# as.numeric(default) would not correspond to the per-level model fitted above.
ggplot(bank_data, aes(x = default, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the factor housing
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ housing, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ housing, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.497 -0.497 -0.480 -0.480 2.109
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.103098 0.023533 -89.368 <2e-16 ***
## housingunknown -0.007398 0.105030 -0.070 0.9438
## housingyes 0.074121 0.031704 2.338 0.0194 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28993 on 41185 degrees of freedom
## AIC: 28999
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) housingunknown housingyes
## -2.103098294 -0.007398069 0.074121134
# Share of "no"/"yes" outcomes within each housing level.
# housing stays a factor on the x axis: its integer level codes follow an
# arbitrary ordering for a nominal variable, so a logistic curve drawn through
# as.numeric(housing) would not correspond to the per-level model fitted above.
ggplot(bank_data, aes(x = housing, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the factor loan
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ loan, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ loan, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4906 -0.4906 -0.4906 -0.4812 2.1094
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.05645 0.01712 -120.149 <2e-16 ***
## loanunknown -0.05404 0.10378 -0.521 0.603
## loanyes -0.04131 0.04401 -0.939 0.348
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28998 on 41185 degrees of freedom
## AIC: 29004
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) loanunknown loanyes
## -2.05645202 -0.05404434 -0.04130538
# Share of "no"/"yes" outcomes within each loan level.
# loan stays a factor on the x axis: its integer level codes follow an
# arbitrary ordering for a nominal variable, so a logistic curve drawn through
# as.numeric(loan) would not correspond to the per-level model fitted above.
ggplot(bank_data, aes(x = loan, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the two-level factor
# contact.
glm_mdl <- glm(formula = y ~ contact, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ contact, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5647 -0.5647 -0.5647 -0.3278 2.4292
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.75533 0.01745 -100.61 <2e-16 ***
## contacttelephone -1.14144 0.04056 -28.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28039 on 41186 degrees of freedom
## AIC: 28043
##
## Number of Fisher Scoring iterations: 5
coefficients(glm_mdl)
## (Intercept) contacttelephone
## -1.755331 -1.141444
# Share of "no"/"yes" outcomes within each contact level.
# contact stays a factor on the x axis so the bars carry the level names
# instead of the opaque integer codes produced by as.numeric(contact).
ggplot(bank_data, aes(x = contact, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the factor month
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ month, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ month, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1868 -0.4713 -0.4355 -0.3647 2.3424
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.35664 0.04830 -28.087 <2e-16 ***
## monthaug -0.77540 0.06357 -12.198 <2e-16 ***
## monthdec 1.31268 0.15595 8.417 <2e-16 ***
## monthjul -0.95133 0.06346 -14.991 <2e-16 ***
## monthjun -0.78501 0.06582 -11.927 <2e-16 ***
## monthmar 1.37862 0.09829 14.027 <2e-16 ***
## monthmay -1.32031 0.05949 -22.193 <2e-16 ***
## monthnov -0.82470 0.07077 -11.653 <2e-16 ***
## monthoct 1.11027 0.08938 12.422 <2e-16 ***
## monthsep 1.15242 0.09708 11.871 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 26823 on 41178 degrees of freedom
## AIC: 26843
##
## Number of Fisher Scoring iterations: 5
coefficients(glm_mdl)
## (Intercept) monthaug monthdec monthjul monthjun monthmar
## -1.3566382 -0.7754031 1.3126750 -0.9513254 -0.7850052 1.3786171
## monthmay monthnov monthoct monthsep
## -1.3203088 -0.8247024 1.1102742 1.1524226
# Share of "no"/"yes" outcomes within each month level.
# month stays a factor on the x axis: its integer level codes follow the
# alphabetical level order (apr, aug, dec, ...), not calendar order, so a
# logistic curve drawn through as.numeric(month) would be meaningless and
# would not correspond to the per-level model fitted above.
ggplot(bank_data, aes(x = month, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the factor day_of_week
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ day_of_week, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ day_of_week, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5083 -0.5007 -0.4981 -0.4578 2.1484
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.11043 0.03640 -57.974 < 2e-16 ***
## day_of_weekmon -0.09255 0.05134 -1.803 0.07144 .
## day_of_weekthu 0.12920 0.04913 2.629 0.00855 **
## day_of_weektue 0.09700 0.05015 1.934 0.05308 .
## day_of_weekwed 0.08609 0.05018 1.716 0.08625 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 28972 on 41183 degrees of freedom
## AIC: 28982
##
## Number of Fisher Scoring iterations: 4
coefficients(glm_mdl)
## (Intercept) day_of_weekmon day_of_weekthu day_of_weektue day_of_weekwed
## -2.11042809 -0.09255188 0.12919566 0.09699519 0.08608609
# Share of "no"/"yes" outcomes within each day_of_week level.
# day_of_week stays a factor on the x axis: its integer level codes follow the
# alphabetical level order (fri, mon, thu, ...), not weekday order, so a
# logistic curve drawn through as.numeric(day_of_week) would be meaningless
# and would not correspond to the per-level model fitted above.
ggplot(bank_data, aes(x = day_of_week, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the factor poutcome
# (one coefficient per non-reference level).
glm_mdl <- glm(formula = y ~ poutcome, family = binomial, data = bank_data)
summary(glm_mdl)
##
## Call:
## glm(formula = y ~ poutcome, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.451 -0.430 -0.430 -0.430 2.203
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.79643 0.04390 -40.92 <2e-16 ***
## poutcomenonexistent -0.53786 0.04771 -11.27 <2e-16 ***
## poutcomesuccess 2.42044 0.07165 33.78 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 26496 on 41185 degrees of freedom
## AIC: 26502
##
## Number of Fisher Scoring iterations: 5
coefficients(glm_mdl)
## (Intercept) poutcomenonexistent poutcomesuccess
## -1.7964317 -0.5378643 2.4204369
# Share of "no"/"yes" outcomes within each poutcome level.
# poutcome stays a factor on the x axis: its integer level codes follow an
# arbitrary ordering for a nominal variable, so a logistic curve drawn through
# as.numeric(poutcome) would not correspond to the per-level model fitted
# above.
ggplot(bank_data, aes(x = poutcome, fill = y)) +
  geom_bar(position = "fill") +
  labs(y = "proportion") +
  theme_classic()
# Logistic regression of the binary response y on the twelve predictors named
# in the formula.
# NOTE(review): the recorded summary reports loanunknown as NA ("not defined
# because of singularities"); presumably the same 990 rows are "unknown" for
# both housing and loan - confirm.
glm.fit1 <- glm(
  formula = y ~ age + job + marital + education + default + housing + loan +
    month + duration + cons.price.idx + cons.conf.idx + euribor3m,
  family = binomial,
  data = bank_data
)
summary(glm.fit1)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## housing + loan + month + duration + cons.price.idx + cons.conf.idx +
## euribor3m, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6.0552 -0.3189 -0.1941 -0.1371 3.2645
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.670e+01 3.664e+00 -12.746 < 2e-16 ***
## age 1.365e-03 2.363e-03 0.578 0.563477
## jobblue-collar -2.946e-01 7.819e-02 -3.767 0.000165 ***
## jobentrepreneur -2.263e-01 1.236e-01 -1.830 0.067214 .
## jobhousemaid -3.627e-02 1.421e-01 -0.255 0.798533
## jobmanagement -7.199e-02 8.305e-02 -0.867 0.386038
## jobretired 2.662e-01 1.038e-01 2.565 0.010317 *
## jobself-employed -2.290e-01 1.159e-01 -1.975 0.048229 *
## jobservices -1.892e-01 8.402e-02 -2.252 0.024314 *
## jobstudent 2.629e-01 1.071e-01 2.455 0.014098 *
## jobtechnician -3.095e-02 6.900e-02 -0.449 0.653737
## jobunemployed 5.409e-02 1.224e-01 0.442 0.658686
## jobunknown 1.075e-01 2.278e-01 0.472 0.636879
## maritalmarried 2.387e-02 6.676e-02 0.358 0.720685
## maritalsingle 9.119e-02 7.621e-02 1.196 0.231519
## maritalunknown 1.010e-01 4.038e-01 0.250 0.802465
## educationbasic.6y 1.330e-01 1.180e-01 1.127 0.259843
## educationbasic.9y -1.268e-03 9.326e-02 -0.014 0.989153
## educationhigh.school 1.717e-02 8.964e-02 0.192 0.848104
## educationilliterate 1.163e+00 7.215e-01 1.613 0.106824
## educationprofessional.course 9.849e-02 9.858e-02 0.999 0.317785
## educationuniversity.degree 1.865e-01 8.969e-02 2.080 0.037539 *
## educationunknown 1.020e-01 1.166e-01 0.875 0.381714
## defaultunknown -3.343e-01 6.640e-02 -5.035 4.78e-07 ***
## defaultyes -7.443e+00 1.136e+02 -0.066 0.947736
## housingunknown -9.494e-02 1.361e-01 -0.697 0.485597
## housingyes 7.138e-04 4.018e-02 0.018 0.985825
## loanunknown NA NA NA NA
## loanyes -4.671e-02 5.566e-02 -0.839 0.401372
## monthaug 5.316e-01 9.401e-02 5.654 1.57e-08 ***
## monthdec 5.226e-01 1.856e-01 2.815 0.004875 **
## monthjul 5.582e-01 8.813e-02 6.333 2.40e-10 ***
## monthjun 4.322e-01 8.807e-02 4.908 9.20e-07 ***
## monthmar 1.533e+00 1.160e-01 13.218 < 2e-16 ***
## monthmay -6.660e-01 7.212e-02 -9.235 < 2e-16 ***
## monthnov 3.485e-01 9.278e-02 3.756 0.000173 ***
## monthoct 5.841e-01 1.152e-01 5.072 3.94e-07 ***
## monthsep 2.825e-01 1.240e-01 2.279 0.022649 *
## duration 4.684e-03 7.342e-05 63.789 < 2e-16 ***
## cons.price.idx 5.011e-01 4.003e-02 12.517 < 2e-16 ***
## cons.conf.idx 4.190e-02 4.543e-03 9.223 < 2e-16 ***
## euribor3m -7.920e-01 1.599e-02 -49.534 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 17895 on 41147 degrees of freedom
## AIC: 17977
##
## Number of Fisher Scoring iterations: 10
# Visualise glm.fit1: observed outcome (0 = "no", 1 = "yes") against the
# model's linear predictor (log-odds), with a logistic curve overlaid.
#
# The x aesthetic used to be the sum of as.numeric() over every predictor.
# For factors that adds arbitrary integer level codes to numeric columns on
# unrelated scales, so the axis had no statistical meaning and the smoothed
# curve did not represent glm.fit1. predict(type = "link") returns the fitted
# linear predictor for each row used in the fit; the model summary reports no
# observations dropped for missingness, so it aligns row-for-row with
# bank_data.
ggplot(bank_data, aes(predict(glm.fit1, type = "link"), as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm", color = "blue", se = FALSE,
    method.args = list(family = "binomial")
  ) +
  labs(
    x = "glm.fit1 linear predictor (log-odds)",
    y = "y (0 = no, 1 = yes)"
  )
## `geom_smooth()` using formula 'y ~ x'
# Reduced binomial GLM: the binary response y on six predictors from
# bank_data. Arguments are named and split across lines; the recorded call
# shown by summary() is unchanged.
glm.fit2 <- glm(
  formula = y ~ job + education + duration + cons.price.idx +
    cons.conf.idx + euribor3m,
  family = binomial,
  data = bank_data
)
summary(glm.fit2)
##
## Call:
## glm(formula = y ~ job + education + duration + cons.price.idx +
## cons.conf.idx + euribor3m, family = binomial, data = bank_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.9011 -0.3565 -0.1984 -0.1429 3.2191
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.917e+01 3.330e+00 -17.765 < 2e-16 ***
## jobblue-collar -4.561e-01 7.586e-02 -6.013 1.82e-09 ***
## jobentrepreneur -3.185e-01 1.206e-01 -2.641 0.008270 **
## jobhousemaid -2.510e-02 1.373e-01 -0.183 0.854938
## jobmanagement -1.317e-01 8.038e-02 -1.639 0.101224
## jobretired 3.832e-01 8.639e-02 4.435 9.19e-06 ***
## jobself-employed -2.268e-01 1.130e-01 -2.007 0.044772 *
## jobservices -3.000e-01 8.195e-02 -3.661 0.000251 ***
## jobstudent 3.514e-01 9.869e-02 3.560 0.000370 ***
## jobtechnician -2.732e-02 6.744e-02 -0.405 0.685447
## jobunemployed 1.048e-01 1.194e-01 0.878 0.380156
## jobunknown 3.340e-02 2.214e-01 0.151 0.880097
## educationbasic.6y 6.435e-02 1.150e-01 0.560 0.575773
## educationbasic.9y -1.584e-02 9.007e-02 -0.176 0.860396
## educationhigh.school 5.089e-02 8.612e-02 0.591 0.554619
## educationilliterate 1.156e+00 7.462e-01 1.549 0.121360
## educationprofessional.course 1.702e-01 9.538e-02 1.784 0.074412 .
## educationuniversity.degree 3.234e-01 8.534e-02 3.790 0.000150 ***
## educationunknown 1.822e-01 1.140e-01 1.599 0.109900
## duration 4.563e-03 7.133e-05 63.972 < 2e-16 ***
## cons.price.idx 6.510e-01 3.620e-02 17.984 < 2e-16 ***
## cons.conf.idx 7.405e-02 3.463e-03 21.387 < 2e-16 ***
## euribor3m -8.103e-01 1.428e-02 -56.748 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 28999 on 41187 degrees of freedom
## Residual deviance: 18636 on 41165 degrees of freedom
## AIC: 18682
##
## Number of Fisher Scoring iterations: 6
# Visualise glm.fit2: observed outcome (0 = "no", 1 = "yes") against the
# model's linear predictor (log-odds), with a logistic curve overlaid.
#
# The x aesthetic used to be the sum of as.numeric() over the predictors,
# which mixes arbitrary factor level codes (job, education) with numeric
# columns on unrelated scales; that axis carried no statistical meaning.
# predict(type = "link") returns the fitted linear predictor for each row
# used in the fit; the model summary reports no observations dropped for
# missingness, so it aligns row-for-row with bank_data.
ggplot(bank_data, aes(predict(glm.fit2, type = "link"), as.numeric(y) - 1)) +
  geom_point() +
  geom_smooth(
    method = "glm", color = "blue", se = FALSE,
    method.args = list(family = "binomial")
  ) +
  labs(
    x = "glm.fit2 linear predictor (log-odds)",
    y = "y (0 = no, 1 = yes)"
  )
## `geom_smooth()` using formula 'y ~ x'
# Scatterplot matrix of y and five predictors; points are coloured by y.
pairs(
  y ~ age + job + cons.price.idx + cons.conf.idx + euribor3m,
  data = bank_data,
  col = bank_data$y
)
# Fitted probabilities from glm.fit2 on the response scale, followed by a
# peek at the first five values.
glm.probs <- predict(glm.fit2, type = "response")
head(glm.probs, n = 5)
## 1 2 3 4 5
## 0.03102640 0.01511950 0.02134905 0.02075684 0.03060384
# Classify each observation as "yes" when its fitted probability exceeds 0.5,
# otherwise "no".
glm.pred <- ifelse(glm.probs > 0.5, "yes", "no")
# NOTE(review): attach() is retained only because code outside this section
# may still rely on bare column names; remove it once that is confirmed.
attach(bank_data)
# Confusion matrix of predicted vs. observed outcome. The observed column is
# referenced explicitly as bank_data$y so that a global object named `y`
# cannot silently mask the attached column; naming the argument `y` keeps the
# printed dimension label unchanged.
table(glm.pred, y = bank_data$y)
## y
## glm.pred no yes
## no 35667 3070
## yes 881 1570
# Proportion of observations whose predicted class matches the observed one.
mean(glm.pred == bank_data$y)
## [1] 0.904074