df_full <- read.csv("/Users/apple/Desktop/kun_hw/bank-full.csv", sep = ";")
library(magrittr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.7 ✔ purrr 0.3.4
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract() masks magrittr::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
require(ISLR)
## Loading required package: ISLR
require(grid)
## Loading required package: grid
library(ggmosaic)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
knitr::opts_chunk$set(echo = TRUE)
ncol(df_full)
## [1] 17
nrow(df_full)
## [1] 45211
str(df_full)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
summary(df_full)
## age job marital education
## Min. :18.00 Length:45211 Length:45211 Length:45211
## 1st Qu.:33.00 Class :character Class :character Class :character
## Median :39.00 Mode :character Mode :character Mode :character
## Mean :40.94
## 3rd Qu.:48.00
## Max. :95.00
## default balance housing loan
## Length:45211 Min. : -8019 Length:45211 Length:45211
## Class :character 1st Qu.: 72 Class :character Class :character
## Mode :character Median : 448 Mode :character Mode :character
## Mean : 1362
## 3rd Qu.: 1428
## Max. :102127
## contact day month duration
## Length:45211 Min. : 1.00 Length:45211 Min. : 0.0
## Class :character 1st Qu.: 8.00 Class :character 1st Qu.: 103.0
## Mode :character Median :16.00 Mode :character Median : 180.0
## Mean :15.81 Mean : 258.2
## 3rd Qu.:21.00 3rd Qu.: 319.0
## Max. :31.00 Max. :4918.0
## campaign pdays previous poutcome
## Min. : 1.000 Min. : -1.0 Min. : 0.0000 Length:45211
## 1st Qu.: 1.000 1st Qu.: -1.0 1st Qu.: 0.0000 Class :character
## Median : 2.000 Median : -1.0 Median : 0.0000 Mode :character
## Mean : 2.764 Mean : 40.2 Mean : 0.5803
## 3rd Qu.: 3.000 3rd Qu.: -1.0 3rd Qu.: 0.0000
## Max. :63.000 Max. :871.0 Max. :275.0000
## y
## Length:45211
## Class :character
## Mode :character
##
##
##
Upon the first glance of the dataset, we can see that age, job, marital, education, default, balance, housing, loan, contact, day, month, duration, campaign, pdays, previous and poutcome are the variable. The binary ‘deposit’ is the response variable.
barplot(table(df_full$y), col="cyan", ylim=c(0,5000),
xlab="Deposit",ylab="Frequency")
df_full <- df_full %>%
mutate(y_binary = ifelse(y == "no",0,1))
summary(df_full$y_binary)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 0.000 0.117 0.000 1.000
sum(is.na(df_full))
## [1] 0
sum(!complete.cases(df_full))
## [1] 0
Here we can see there is no missing data.
table(df_full$job)
##
## admin. blue-collar entrepreneur housemaid management
## 5171 9732 1487 1240 9458
## retired self-employed services student technician
## 2264 1579 4154 938 7597
## unemployed unknown
## 1303 288
We have 288 unknown jobs.
ggplot(data = df_full) +
geom_mosaic(aes(x = product(y,job), fill = y))
## Warning: `unite_()` was deprecated in tidyr 1.2.0.
## Please use `unite()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
We can see higher response among students and retired people, also, we can ignore “unknown”. No big effect seen here.
str(df_full$marital)
## chr [1:45211] "married" "single" "married" "married" "single" "married" ...
levels(df_full$marital)
## NULL
plot(table(df_full$marital))
ggplot(data = df_full) +
geom_mosaic(aes(x = product(y,marital), fill = y))
Here we can see no big effect of marriage. Singles slightly more like to say “yes”.
str(df_full$education)
## chr [1:45211] "tertiary" "secondary" "secondary" "unknown" "unknown" ...
levels(df_full$education)
## NULL
table(df_full$education)
##
## primary secondary tertiary unknown
## 6851 23202 13301 1857
plot(table(df_full$education))
ggplot(data = df_full) +
geom_mosaic(aes(x = product(y, education), fill = y))
It appears that a positive correlation exists between the number of years of education and the odds of subscribing to a term deposit.
str(df_full$default)
## chr [1:45211] "no" "no" "no" "no" "no" "no" "no" "yes" "no" "no" "no" "no" ...
levels(df_full$default)
## NULL
table(df_full$default)
##
## no yes
## 44396 815
It seems like this variable is not useble in the final model.
str(df_full$housing)
## chr [1:45211] "yes" "yes" "yes" "yes" "no" "yes" "yes" "yes" "yes" "yes" ...
levels(df_full$housing)
## NULL
table(df_full$housing)
##
## no yes
## 20081 25130
plot(table(df_full$housing))
ggplot(data = df_full) +
geom_mosaic(aes(x = product(y, housing), fill = y))
Not much difference between those who have housing loans and those who do not.
table(df_full$loan)
##
## no yes
## 37967 7244
ggplot(data = df_full) +
geom_mosaic(aes(x = product(y, loan), fill = y))
Those do not have personal loans are more likely to say “yes”.
table(df_full$contact)
##
## cellular telephone unknown
## 29285 2906 13020
ggplot(data = df_full) +
geom_mosaic(aes(x = product(y, contact), fill = y))
There are 13020 unknown contacts. We can see cellular responders are more likely to subscribe to a term deposit.
ggplot(data = df_full) +
aes(x = month, y = ..count../nrow(df_full), fill = y) +
geom_bar()
month_table <- table(df_full$month, df_full$y)
month_tab <- as.data.frame(prop.table(month_table, 2))
colnames(month_tab) <- c("month", "y", "percent")
ggplot(data = month_tab, aes(x = month, y = percent, fill = y)) +
geom_bar(stat = 'identity', position = 'dodge', alpha = 2/3)
We can see month may be a very important varible.
table(df_full$poutcome)
##
## failure other success unknown
## 4901 1840 1511 36959
ggplot(data = df_full) +
geom_mosaic(aes(x = product(poutcome), fill = y))
Here we can see a lot of unknown poutcomes. So we will not use this varible for modeling.
summary(df_full$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 33.00 39.00 40.94 48.00 95.00
ggplot(data = df_full) +
aes(x = age) +
geom_bar() +
geom_vline(xintercept = c(30, 60),
col = "red",
linetype = "dashed") +
facet_grid(y ~ .,
scales = "free_y") +
scale_x_continuous(breaks = seq(0, 100, 5))
df_full = df_full %>%
mutate(age_discrete = if_else(age > 60, "high", if_else(age > 30, "mid", "low")))
Here we can see banks are not very much interested by contacting the older population. Even though, elderly persons are more likely to subscribe to a term deposit.
ggplot(data = df_full) +
aes(x = day, y = ..count../nrow(df_full), fill = y) +
geom_bar()
table(df_full$previous)
##
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## 36954 2772 2106 1142 714 459 277 205 129 92 67 65 44
## 13 14 15 16 17 18 19 20 21 22 23 24 25
## 38 19 20 13 15 6 11 8 4 6 8 5 4
## 26 27 28 29 30 32 35 37 38 40 41 51 55
## 2 5 2 4 3 1 1 2 2 1 1 1 1
## 58 275
## 1 1
ggplot(data = df_full) +
geom_mosaic(aes(x = product(previous), fill = y))
To make it easier to understand.
df_full <- df_full %>%
mutate(previous_binned = if_else(previous >= 2, "2+", if_else(previous == 1, "1", "0")))
ggplot(data = df_full) +
geom_mosaic(aes(x = product(previous_binned), fill = y))
Here we can see it seems 2+ contacts increase probability of subscription.
df_full <- df_full %>%
mutate(pdays_dummy = if_else(pdays == 999, "0", "1"))
ggplot(data = df_full) +
geom_mosaic(aes(x = product(y, pdays_dummy), fill = y))
df_dup <- df_full
df_dup <- df_dup %>%
filter(job != "unknown")
df_dup <- df_dup %>%
filter(marital != "unknown")
df_dup = df_dup %>%
mutate(education = recode(education, "unknown" = "university.degree"))
df_dup$contact <- factor(df_dup$contact, order = TRUE, levels =c('telephone', 'cellular'))
df_dup$education <- factor(df_dup$education, order = TRUE, levels =c('illiterate','basic.4y', 'basic.6y','basic.9y', 'high.school','professional.course','university.degree'))
df_dup$age_discrete <- factor(df_dup$age_discrete, order = TRUE, levels =c('low', 'mid','high'))
df_dup$job <- factor(df_dup$job, order = TRUE, levels =c('blue-collar', 'services','entrepreneur', 'housemaid', 'self-employed','technician', 'management','admin.','unemployed', 'retired','student'))
df_dup$marital <- factor(df_dup$marital, order = TRUE, levels =c('married', 'divorced', 'single'))
df_dup$month <- factor(df_dup$month, order = TRUE, levels =c('mar', 'apr','may', 'jun','jul', 'aug', 'sep','oct', 'nov','dec'))
df_dup$previous_binned <- factor(df_dup$previous_binned, order = TRUE, levels =c('0', '1','2+'))
set.seed(1984)
training <- createDataPartition(df_dup$y_binary, p = 0.8, list=FALSE)
train_data <- df_dup[training,]
test_data <- df_dup[-training,]
model <- glm(y_binary ~ age_discrete + marital + month + job + contact + previous_binned + pdays, family=binomial(link='logit'),data=train_data)
summary(model)
##
## Call:
## glm(formula = y_binary ~ age_discrete + marital + month + job +
## contact + previous_binned + pdays, family = binomial(link = "logit"),
## data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8622 -0.5175 -0.4354 -0.3662 2.6126
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3606942 0.0703407 -5.128 2.93e-07 ***
## age_discrete.L 0.6215273 0.0906480 6.856 7.06e-12 ***
## age_discrete.Q 0.6034711 0.0530182 11.382 < 2e-16 ***
## marital.L 0.1903646 0.0354693 5.367 8.00e-08 ***
## marital.Q -0.0434142 0.0537657 -0.807 0.41940
## month.L -0.0480638 0.1084475 -0.443 0.65762
## month.Q 0.8138362 0.1116577 7.289 3.13e-13 ***
## month.C -0.6268737 0.1078767 -5.811 6.21e-09 ***
## month^4 0.6404933 0.0849781 7.537 4.80e-14 ***
## month^5 0.3796797 0.0804174 4.721 2.34e-06 ***
## month^6 1.5049113 0.0712363 21.126 < 2e-16 ***
## month^7 0.9007997 0.0697739 12.910 < 2e-16 ***
## month^8 -1.0147380 0.0824023 -12.314 < 2e-16 ***
## month^9 0.2826455 0.0660641 4.278 1.88e-05 ***
## job.L 0.6025173 0.0936266 6.435 1.23e-10 ***
## job.Q 0.0917363 0.0814869 1.126 0.26026
## job.C 0.0026683 0.0966357 0.028 0.97797
## job^4 -0.0209705 0.0930948 -0.225 0.82178
## job^5 0.2016689 0.0938861 2.148 0.03171 *
## job^6 0.0673022 0.0913305 0.737 0.46118
## job^7 0.2692993 0.1008115 2.671 0.00756 **
## job^8 0.2250140 0.1021595 2.203 0.02762 *
## job^9 0.0404592 0.0977821 0.414 0.67904
## job^10 0.2300963 0.0764311 3.011 0.00261 **
## contact.L 0.2755660 0.0559369 4.926 8.38e-07 ***
## previous_binned.L 0.8895023 0.0584090 15.229 < 2e-16 ***
## previous_binned.Q -0.3081200 0.0627427 -4.911 9.07e-07 ***
## pdays -0.0025648 0.0003236 -7.926 2.26e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 18713 on 22364 degrees of freedom
## Residual deviance: 16544 on 22337 degrees of freedom
## (13574 observations deleted due to missingness)
## AIC: 16600
##
## Number of Fisher Scoring iterations: 5
Let’s look at the prediction of the model on the test set (test_data).
pred.train <- predict(model,test_data)
pred.train <- ifelse(pred.train > 0.5,1,0)
mean(pred.train == test_data$y_binary)
## [1] NA
t1 <- table(pred.train,test_data$y_binary)
presicion <- t1[1,1]/(sum(t1[1,]))
recall <- t1[1,1]/(sum(t1[,1]))
presicion
## [1] 0.8541629
F1<- 2*presicion*recall/(presicion+recall)
F1
## [1] 0.9177857
According the presicion and recall of the model, we can say this model works well.