# Load the semicolon-delimited bank marketing extract into `bank`.
bank <- read.csv(file = "bank-additional.csv", sep = ";")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS); library(car); library(olsrr)
## Warning: package 'MASS' was built under R version 4.1.2
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Warning: package 'car' was built under R version 4.1.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.1
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Warning: package 'olsrr' was built under R version 4.1.1
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
##
## cement
## The following object is masked from 'package:datasets':
##
## rivers
library(DescTools);library(ResourceSelection)
## Warning: package 'DescTools' was built under R version 4.1.1
##
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
##
## Recode
## Warning: package 'ResourceSelection' was built under R version 4.1.2
## ResourceSelection 0.3-5 2019-07-22
library(caret);library(lattice);
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
##
## MAE, RMSE
library(gam);library(car)
## Warning: package 'gam' was built under R version 4.1.2
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following object is masked from 'package:DescTools':
##
## %:%
## Loaded gam 1.20
library(ROCR);library(gridExtra)
## Warning: package 'ROCR' was built under R version 4.1.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.1.3
## Registered S3 method overwritten by 'gdata':
## method from
## reorder.factor DescTools
## 'data.frame': 4119 obs. of 21 variables:
## $ age : int 30 39 25 38 47 32 32 41 31 35 ...
## $ job : chr "blue-collar" "services" "services" "services" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education : chr "basic.9y" "high.school" "high.school" "basic.9y" ...
## $ default : chr "no" "no" "no" "no" ...
## $ housing : chr "yes" "no" "yes" "unknown" ...
## $ loan : chr "no" "no" "no" "unknown" ...
## $ contact : chr "cellular" "telephone" "telephone" "telephone" ...
## $ month : chr "may" "may" "jun" "jun" ...
## $ day_of_week : chr "fri" "fri" "wed" "fri" ...
## $ duration : int 487 346 227 17 58 128 290 44 68 170 ...
## $ campaign : int 2 4 1 3 1 3 4 2 1 1 ...
## $ pdays : int 999 999 999 999 999 999 999 999 999 999 ...
## $ previous : int 0 0 0 0 0 2 0 0 1 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
## $ cons.price.idx: num 92.9 94 94.5 94.5 93.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
## $ euribor3m : num 1.31 4.86 4.96 4.96 4.19 ...
## $ nr.employed : num 5099 5191 5228 5228 5196 ...
## $ y : chr "no" "no" "no" "no" ...
## age job marital education default housing loan contact
## 1 30 blue-collar married basic.9y no yes no cellular
## 2 39 services single high.school no no no telephone
## 3 25 services married high.school no yes no telephone
## 4 38 services married basic.9y no unknown unknown telephone
## 5 47 admin. married university.degree no yes no cellular
## 6 32 services single university.degree no no no cellular
## month day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 may fri 487 2 999 0 nonexistent -1.8
## 2 may fri 346 4 999 0 nonexistent 1.1
## 3 jun wed 227 1 999 0 nonexistent 1.4
## 4 jun fri 17 3 999 0 nonexistent 1.4
## 5 nov mon 58 1 999 0 nonexistent -0.1
## 6 sep thu 128 3 999 2 failure -1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 92.893 -46.2 1.313 5099.1 no
## 2 93.994 -36.4 4.855 5191.0 no
## 3 94.465 -41.8 4.962 5228.1 no
## 4 94.465 -41.8 4.959 5228.1 no
## 5 93.200 -42.0 4.191 5195.8 no
## 6 94.199 -37.5 0.884 4963.6 no
# Per-column overview of the whole data frame.
summary(object = bank)
## age job marital education
## Min. :18.00 Length:4119 Length:4119 Length:4119
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.11
## 3rd Qu.:47.00
## Max. :88.00
## default housing loan contact
## Length:4119 Length:4119 Length:4119 Length:4119
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:4119 Length:4119 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 103.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 181.0 Median : 2.000
## Mean : 256.8 Mean : 2.537
## 3rd Qu.: 317.0 3rd Qu.: 3.000
## Max. :3643.0 Max. :35.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.0000 Length:4119 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.0000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.0000 Mode :character Median : 1.10000
## Mean :960.4 Mean :0.1903 Mean : 0.08497
## 3rd Qu.:999.0 3rd Qu.:0.0000 3rd Qu.: 1.40000
## Max. :999.0 Max. :6.0000 Max. : 1.40000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Min. :92.20 Min. :-50.8 Min. :0.635 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.334 1st Qu.:5099
## Median :93.75 Median :-41.8 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.5 Mean :3.621 Mean :5166
## 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.9 Max. :5.045 Max. :5228
## y
## Length:4119
## Class :character
## Mode :character
##
##
##
#bank$job<-as.numeric(as.factor(bank$job))
#bank$marital<-as.numeric(as.factor(bank$marital))
#bank$education<-as.numeric(as.factor(bank$education))
#bank$default<-as.numeric(as.factor(bank$default))
#bank$housing<-as.numeric(as.factor(bank$housing))
#bank$loan<-as.numeric(as.factor(bank$loan))
#bank$contact<-as.numeric(as.factor(bank$contact))
#bank$month<-as.numeric(as.factor(bank$month))
#bank$day_of_week<-as.numeric(as.factor(bank$day_of_week))
#bank$campaign<-as.numeric(as.factor(bank$campaign))
#bank$previous<-as.numeric(as.factor(bank$previous))
#bank$poutcome<-as.numeric(as.factor(bank$poutcome))
# Data-quality checks: missing values per column, duplicated rows, and the
# balance of the response variable y.
vapply(bank, function(column) sum(is.na(column)), numeric(1))
## age job marital education default
## 0 0 0 0 0
## housing loan contact month day_of_week
## 0 0 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
length(which(duplicated(bank)))
## [1] 0
table(bank[["y"]])
##
## no yes
## 3668 451
The number of 'no' responses (3668) is far greater than the number of 'yes' responses (451), so the target variable is imbalanced.
The dataset description also mentions that the data contains ‘unknown’ values, so let's take a look at those.
# Count the "unknown" placeholder: over all cells first, then per column.
table(bank == "unknown")
##
## FALSE TRUE
## 85269 1230
vapply(bank, function(column) sum(column == "unknown"), numeric(1))
## age job marital education default
## 0 39 11 167 803
## housing loan contact month day_of_week
## 105 105 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
So job, marital, education, default, housing and loan contain ‘unknown’ values. We can decide how to handle these unknowns after the exploratory data analysis.
# Age: quartiles, distinct values, and Pearson correlation with the response.
summary(bank[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 18.00 32.00 38.00 40.11 47.00 88.00
unique(bank[["age"]])
## [1] 30 39 25 38 47 32 41 31 35 36 29 27 44 46 45 50 55 40 28 34 33 51 48 20 76
## [26] 56 24 58 60 37 52 42 49 54 59 57 43 53 75 82 71 21 22 23 26 81 61 67 73 18
## [51] 64 74 77 86 85 63 88 78 72 68 80 66 19 62 65 69 70
# y is turned into its factor codes (1 = "no", 2 = "yes") so cor() can use it.
cor(x = bank[["age"]], y = as.integer(factor(bank[["y"]])))
## [1] 0.06037408
The distribution of age shows that the majority of the bank’s customers are aged between 25 and 50. More specifically, 50% are between 32 and 47, and the median age is 38. So the bank’s target customers might be middle-aged adults.
# Age plots: histogram with the mean marked, boxplot, and counts split by y.
p1 <- ggplot(bank, aes(x = age)) +
  geom_histogram(color = "black", fill = "grey") +
  ylab("Count") +
  xlab("Age") +
  # A fixed colour must sit outside aes(); inside aes() the string "red" is
  # mapped as data, which adds a legend and uses the default palette colour.
  # The mean is computed once here so a single reference line is drawn.
  geom_vline(xintercept = mean(bank$age), color = "red") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  theme_minimal()
p2 <- ggplot(bank, aes(x = "", y = age)) + geom_boxplot()
p3 <- ggplot(bank, aes(x = age, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Job: basic description, distinct categories, and correlation with y.
summary(bank[["job"]])
## Length Class Mode
## 4119 character character
unique(bank[["job"]])
## [1] "blue-collar" "services" "admin." "entrepreneur"
## [5] "self-employed" "technician" "management" "student"
## [9] "retired" "housemaid" "unemployed" "unknown"
#table(bank$job)
# NOTE(review): factor() sorts the category labels, so the integer codes below
# follow label order rather than a meaningful scale; read the value with care.
cor(as.integer(factor(bank[["job"]])), as.integer(factor(bank[["y"]])))
## [1] 0.02672463
# Job: overall counts (left) and counts split by the response y (right).
# x is mapped to job at the plot level; the earlier plot-level aes(y) was
# overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = job)) + geom_bar()
p2 <- ggplot(bank, aes(x = job, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## job variable classification w.r.t target variable
CrossTable(bank$job, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$job | no | yes | Row Total |
## --------------|-----------|-----------|-----------|
## admin. | 879 | 133 | 1012 |
## | 0.547 | 4.445 | |
## | 0.869 | 0.131 | 0.246 |
## --------------|-----------|-----------|-----------|
## blue-collar | 823 | 61 | 884 |
## | 1.627 | 13.235 | |
## | 0.931 | 0.069 | 0.215 |
## --------------|-----------|-----------|-----------|
## entrepreneur | 140 | 8 | 148 |
## | 0.511 | 4.154 | |
## | 0.946 | 0.054 | 0.036 |
## --------------|-----------|-----------|-----------|
## housemaid | 99 | 11 | 110 |
## | 0.011 | 0.091 | |
## | 0.900 | 0.100 | 0.027 |
## --------------|-----------|-----------|-----------|
## management | 294 | 30 | 324 |
## | 0.104 | 0.845 | |
## | 0.907 | 0.093 | 0.079 |
## --------------|-----------|-----------|-----------|
## retired | 128 | 38 | 166 |
## | 2.659 | 21.622 | |
## | 0.771 | 0.229 | 0.040 |
## --------------|-----------|-----------|-----------|
## self-employed | 146 | 13 | 159 |
## | 0.137 | 1.117 | |
## | 0.918 | 0.082 | 0.039 |
## --------------|-----------|-----------|-----------|
## services | 358 | 35 | 393 |
## | 0.184 | 1.499 | |
## | 0.911 | 0.089 | 0.095 |
## --------------|-----------|-----------|-----------|
## student | 63 | 19 | 82 |
## | 1.375 | 11.186 | |
## | 0.768 | 0.232 | 0.020 |
## --------------|-----------|-----------|-----------|
## technician | 611 | 80 | 691 |
## | 0.031 | 0.249 | |
## | 0.884 | 0.116 | 0.168 |
## --------------|-----------|-----------|-----------|
## unemployed | 92 | 19 | 111 |
## | 0.474 | 3.857 | |
## | 0.829 | 0.171 | 0.027 |
## --------------|-----------|-----------|-----------|
## unknown | 35 | 4 | 39 |
## | 0.002 | 0.017 | |
## | 0.897 | 0.103 | 0.009 |
## --------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## --------------|-----------|-----------|-----------|
##
##
# Marital status: basic description, distinct categories, correlation with y.
summary(bank[["marital"]])
## Length Class Mode
## 4119 character character
unique(bank[["marital"]])
## [1] "married" "single" "divorced" "unknown"
#table(bank$marital)
# NOTE(review): factor() sorts the category labels, so the integer codes below
# follow label order rather than a meaningful scale; read the value with care.
cor(as.integer(factor(bank[["marital"]])), as.integer(factor(bank[["y"]])))
## [1] 0.04383328
# Marital status: overall counts (left) and counts split by y (right).
# x is mapped to marital at the plot level; the earlier plot-level aes(y) was
# overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = marital)) + geom_bar()
p2 <- ggplot(bank, aes(x = marital, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## marital variable classification w.r.t target variable
CrossTable(bank$marital, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$marital | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## divorced | 403 | 43 | 446 |
## | 0.086 | 0.697 | |
## | 0.904 | 0.096 | 0.108 |
## -------------|-----------|-----------|-----------|
## married | 2257 | 252 | 2509 |
## | 0.231 | 1.879 | |
## | 0.900 | 0.100 | 0.609 |
## -------------|-----------|-----------|-----------|
## single | 998 | 155 | 1153 |
## | 0.805 | 6.550 | |
## | 0.866 | 0.134 | 0.280 |
## -------------|-----------|-----------|-----------|
## unknown | 10 | 1 | 11 |
## | 0.004 | 0.035 | |
## | 0.909 | 0.091 | 0.003 |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## -------------|-----------|-----------|-----------|
##
##
# Education: basic description, distinct categories, correlation with y.
summary(bank[["education"]])
## Length Class Mode
## 4119 character character
unique(bank[["education"]])
## [1] "basic.9y" "high.school" "university.degree"
## [4] "professional.course" "basic.6y" "basic.4y"
## [7] "unknown" "illiterate"
# NOTE(review): factor() sorts the category labels, so the integer codes below
# follow label order, not the level of schooling; read the value with care.
cor(as.integer(factor(bank[["education"]])), as.integer(factor(bank[["y"]])))
## [1] 0.06731618
# Education: overall counts (left) and counts split by y (right).
# x is mapped to education at the plot level; the earlier plot-level aes(y)
# was overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = education)) + geom_bar()
p2 <- ggplot(bank, aes(x = education, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## education classification w.r.t target variable
CrossTable(bank$education, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$education | no | yes | Row Total |
## --------------------|-----------|-----------|-----------|
## basic.4y | 391 | 38 | 429 |
## | 0.211 | 1.714 | |
## | 0.911 | 0.089 | 0.104 |
## --------------------|-----------|-----------|-----------|
## basic.6y | 211 | 17 | 228 |
## | 0.312 | 2.541 | |
## | 0.925 | 0.075 | 0.055 |
## --------------------|-----------|-----------|-----------|
## basic.9y | 531 | 43 | 574 |
## | 0.771 | 6.269 | |
## | 0.925 | 0.075 | 0.139 |
## --------------------|-----------|-----------|-----------|
## high.school | 824 | 97 | 921 |
## | 0.018 | 0.146 | |
## | 0.895 | 0.105 | 0.224 |
## --------------------|-----------|-----------|-----------|
## illiterate | 1 | 0 | 1 |
## | 0.013 | 0.109 | |
## | 1.000 | 0.000 | 0.000 |
## --------------------|-----------|-----------|-----------|
## professional.course | 470 | 65 | 535 |
## | 0.087 | 0.704 | |
## | 0.879 | 0.121 | 0.130 |
## --------------------|-----------|-----------|-----------|
## university.degree | 1099 | 165 | 1264 |
## | 0.629 | 5.113 | |
## | 0.869 | 0.131 | 0.307 |
## --------------------|-----------|-----------|-----------|
## unknown | 141 | 26 | 167 |
## | 0.400 | 3.255 | |
## | 0.844 | 0.156 | 0.041 |
## --------------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## --------------------|-----------|-----------|-----------|
##
##
# Credit default flag: basic description, distinct values, correlation with y.
summary(bank[["default"]])
## Length Class Mode
## 4119 character character
unique(bank[["default"]])
## [1] "no" "unknown" "yes"
#table(bank$default)
# NOTE(review): factor() sorts the labels, so "unknown" is coded between "no"
# and "yes"; the resulting coefficient depends on that ordering.
cor(as.integer(factor(bank[["default"]])), as.integer(factor(bank[["y"]])))
## [1] -0.07662722
# Default flag: overall counts (left) and counts split by y (right).
# x is mapped to default at the plot level; the earlier plot-level aes(y) was
# overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = default)) + geom_bar()
p2 <- ggplot(bank, aes(x = default, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## default classification w.r.t target variable
CrossTable(bank$default, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$default | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## no | 2913 | 402 | 3315 |
## | 0.516 | 4.197 | |
## | 0.879 | 0.121 | 0.805 |
## -------------|-----------|-----------|-----------|
## unknown | 754 | 49 | 803 |
## | 2.119 | 17.231 | |
## | 0.939 | 0.061 | 0.195 |
## -------------|-----------|-----------|-----------|
## yes | 1 | 0 | 1 |
## | 0.013 | 0.109 | |
## | 1.000 | 0.000 | 0.000 |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## -------------|-----------|-----------|-----------|
##
##
# Housing loan flag: basic description, distinct values, correlation with y.
summary(bank[["housing"]])
## Length Class Mode
## 4119 character character
unique(bank[["housing"]])
## [1] "yes" "no" "unknown"
#table(bank$housing)
# NOTE(review): factor() sorts the labels, so "unknown" is coded between "no"
# and "yes"; the resulting coefficient depends on that ordering.
cor(as.integer(factor(bank[["housing"]])), as.integer(factor(bank[["y"]])))
## [1] 0.0009566489
# Housing loan: overall counts (left) and counts split by y (right).
# x is mapped to housing at the plot level; the earlier plot-level aes(y) was
# overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = housing)) + geom_bar()
p2 <- ggplot(bank, aes(x = housing, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## housing classification w.r.t target variable
CrossTable(bank$housing, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$housing | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## no | 1637 | 202 | 1839 |
## | 0.000 | 0.002 | |
## | 0.890 | 0.110 | 0.446 |
## -------------|-----------|-----------|-----------|
## unknown | 96 | 9 | 105 |
## | 0.067 | 0.542 | |
## | 0.914 | 0.086 | 0.025 |
## -------------|-----------|-----------|-----------|
## yes | 1935 | 240 | 2175 |
## | 0.002 | 0.014 | |
## | 0.890 | 0.110 | 0.528 |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## -------------|-----------|-----------|-----------|
##
##
# Personal loan flag: basic description, distinct values, correlation with y.
summary(bank[["loan"]])
## Length Class Mode
## 4119 character character
unique(bank[["loan"]])
## [1] "no" "unknown" "yes"
# NOTE(review): factor() sorts the labels, so "unknown" is coded between "no"
# and "yes"; the resulting coefficient depends on that ordering.
cor(as.integer(factor(bank[["loan"]])), as.integer(factor(bank[["y"]])))
## [1] -0.01270932
# Personal loan: overall counts (left) and counts split by y (right).
# x is mapped to loan at the plot level; the earlier plot-level aes(y) was
# overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = loan)) + geom_bar()
p2 <- ggplot(bank, aes(x = loan, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## loan classification w.r.t target variable
CrossTable(bank$loan, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$loan | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## no | 2975 | 374 | 3349 |
## | 0.018 | 0.146 | |
## | 0.888 | 0.112 | 0.813 |
## -------------|-----------|-----------|-----------|
## unknown | 96 | 9 | 105 |
## | 0.067 | 0.542 | |
## | 0.914 | 0.086 | 0.025 |
## -------------|-----------|-----------|-----------|
## yes | 597 | 68 | 665 |
## | 0.039 | 0.318 | |
## | 0.898 | 0.102 | 0.161 |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## -------------|-----------|-----------|-----------|
##
##
# Contact channel: basic description, distinct values, correlation with y.
summary(bank[["contact"]])
## Length Class Mode
## 4119 character character
unique(bank[["contact"]])
## [1] "cellular" "telephone"
# Both variables have two levels here, coded 1/2 in sorted label order.
cor(as.integer(factor(bank[["contact"]])), as.integer(factor(bank[["y"]])))
## [1] -0.1374007
# Contact channel: overall counts (left) and counts split by y (right).
# x is mapped to contact at the plot level; the earlier plot-level aes(y) was
# overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = contact)) + geom_bar()
p2 <- ggplot(bank, aes(x = contact, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## contact classification w.r.t target variable
CrossTable(bank$contact, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$contact | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## cellular | 2277 | 375 | 2652 |
## | 3.032 | 24.663 | |
## | 0.859 | 0.141 | 0.644 |
## -------------|-----------|-----------|-----------|
## telephone | 1391 | 76 | 1467 |
## | 5.482 | 44.585 | |
## | 0.948 | 0.052 | 0.356 |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## -------------|-----------|-----------|-----------|
##
##
# Contact month: basic description, distinct values, correlation with y.
summary(bank[["month"]])
## Length Class Mode
## 4119 character character
unique(bank[["month"]])
## [1] "may" "jun" "nov" "sep" "jul" "aug" "mar" "oct" "apr" "dec"
# NOTE(review): factor() sorts the labels, so months are coded in label order
# ("apr", "aug", "dec", ...) rather than calendar order.
cor(as.integer(factor(bank[["month"]])), as.integer(factor(bank[["y"]])))
## [1] 0.005048514
# Contact month: overall counts (left) and counts split by y (right).
# x is mapped to month at the plot level; the earlier plot-level aes(y) was
# overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = month)) + geom_bar()
p2 <- ggplot(bank, aes(x = month, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## month classification w.r.t target variable
CrossTable(bank$month, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$month | no | yes | Row Total |
## -------------|-----------|-----------|-----------|
## apr | 179 | 36 | 215 |
## | 0.811 | 6.594 | |
## | 0.833 | 0.167 | 0.052 |
## -------------|-----------|-----------|-----------|
## aug | 572 | 64 | 636 |
## | 0.056 | 0.456 | |
## | 0.899 | 0.101 | 0.154 |
## -------------|-----------|-----------|-----------|
## dec | 10 | 12 | 22 |
## | 4.696 | 38.189 | |
## | 0.455 | 0.545 | 0.005 |
## -------------|-----------|-----------|-----------|
## jul | 652 | 59 | 711 |
## | 0.561 | 4.564 | |
## | 0.917 | 0.083 | 0.173 |
## -------------|-----------|-----------|-----------|
## jun | 462 | 68 | 530 |
## | 0.211 | 1.713 | |
## | 0.872 | 0.128 | 0.129 |
## -------------|-----------|-----------|-----------|
## mar | 20 | 28 | 48 |
## | 12.102 | 98.429 | |
## | 0.417 | 0.583 | 0.012 |
## -------------|-----------|-----------|-----------|
## may | 1288 | 90 | 1378 |
## | 3.020 | 24.566 | |
## | 0.935 | 0.065 | 0.335 |
## -------------|-----------|-----------|-----------|
## nov | 403 | 43 | 446 |
## | 0.086 | 0.697 | |
## | 0.904 | 0.096 | 0.108 |
## -------------|-----------|-----------|-----------|
## oct | 44 | 25 | 69 |
## | 4.953 | 40.282 | |
## | 0.638 | 0.362 | 0.017 |
## -------------|-----------|-----------|-----------|
## sep | 38 | 26 | 64 |
## | 6.329 | 51.475 | |
## | 0.594 | 0.406 | 0.016 |
## -------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## -------------|-----------|-----------|-----------|
##
##
# Contact weekday: basic description, distinct values, correlation with y.
summary(bank[["day_of_week"]])
## Length Class Mode
## 4119 character character
unique(bank[["day_of_week"]])
## [1] "fri" "wed" "mon" "thu" "tue"
# NOTE(review): factor() sorts the labels, so weekdays are coded in label
# order ("fri", "mon", "thu", ...) rather than weekday order.
cor(as.integer(factor(bank[["day_of_week"]])), as.integer(factor(bank[["y"]])))
## [1] -0.006369504
# Contact weekday: overall counts (left) and counts split by y (right).
# x is mapped to day_of_week at the plot level; the earlier plot-level aes(y)
# was overridden by the layer mapping and could leave the x-axis titled "y".
p1 <- ggplot(bank, aes(x = day_of_week)) + geom_bar()
p2 <- ggplot(bank, aes(x = day_of_week, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)
## days of week classification w.r.t target variable
CrossTable(bank$day_of_week, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$day_of_week | no | yes | Row Total |
## -----------------|-----------|-----------|-----------|
## fri | 685 | 83 | 768 |
## | 0.002 | 0.014 | |
## | 0.892 | 0.108 | 0.186 |
## -----------------|-----------|-----------|-----------|
## mon | 757 | 98 | 855 |
## | 0.025 | 0.205 | |
## | 0.885 | 0.115 | 0.208 |
## -----------------|-----------|-----------|-----------|
## thu | 764 | 96 | 860 |
## | 0.004 | 0.036 | |
## | 0.888 | 0.112 | 0.209 |
## -----------------|-----------|-----------|-----------|
## tue | 750 | 91 | 841 |
## | 0.002 | 0.013 | |
## | 0.892 | 0.108 | 0.204 |
## -----------------|-----------|-----------|-----------|
## wed | 712 | 83 | 795 |
## | 0.023 | 0.188 | |
## | 0.896 | 0.104 | 0.193 |
## -----------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## -----------------|-----------|-----------|-----------|
##
##
# Duration: quartiles and the distinct values observed.
summary(bank[["duration"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 103.0 181.0 256.8 317.0 3643.0
unique(bank[["duration"]])
## [1] 487 346 227 17 58 128 290 44 68 170 301 148 97 211 553
## [16] 698 191 59 38 849 326 222 626 119 388 479 446 127 109 113
## [31] 393 151 256 42 525 57 499 84 137 31 430 126 340 412 132
## [46] 79 341 157 252 263 215 89 143 40 10 481 233 204 403 180
## [61] 16 447 81 361 1091 395 432 596 77 768 96 357 459 11 264
## [76] 93 374 158 95 835 505 300 390 274 135 257 268 477 91 76
## [91] 103 436 483 250 259 389 7 123 92 297 406 104 854 147 203
## [106] 149 144 394 523 73 197 108 80 114 122 1161 181 239 360 314
## [121] 984 663 141 706 797 311 63 111 49 171 242 279 246 309 168
## [136] 153 152 90 117 640 199 1114 74 190 738 224 344 383 35 772
## [151] 124 345 951 188 809 192 154 100 317 293 30 442 187 64 629
## [166] 423 888 207 265 273 85 261 136 711 88 72 307 39 156 202
## [181] 353 159 347 174 280 686 94 225 474 377 185 121 160 313 219
## [196] 267 228 355 102 116 83 473 605 585 255 1868 846 404 51 87
## [211] 167 440 673 48 236 288 193 318 209 173 503 101 370 1207 262
## [226] 609 806 335 266 434 82 15 155 339 206 178 461 50 56 55
## [241] 142 9 247 130 336 424 617 238 632 86 165 212 54 184 6
## [256] 70 98 106 456 118 241 439 322 417 498 405 99 712 112 223
## [271] 133 258 958 898 282 175 235 372 69 183 270 134 449 115 205
## [286] 145 548 379 105 544 401 549 291 655 179 391 750 454 23 363
## [301] 775 164 988 471 385 125 886 34 334 955 545 659 230 699 1276
## [316] 251 25 696 701 342 161 275 172 139 232 131 36 600 177 217
## [331] 216 329 604 634 107 245 690 286 201 198 249 226 1058 299 441
## [346] 285 195 292 298 1013 248 1319 146 294 575 237 861 618 271 200
## [361] 166 367 218 584 509 27 78 162 651 415 1149 110 240 366 284
## [376] 431 608 244 455 807 420 182 638 641 21 1348 324 331 550 489
## [391] 304 189 728 278 387 29 71 767 1476 176 52 150 32 12 501
## [406] 381 482 14 569 697 581 243 229 408 53 305 316 577 427 214
## [421] 19 65 281 468 67 438 582 721 295 231 221 1170 368 1360 433
## [436] 352 37 650 289 213 22 43 26 532 75 557 541 62 5 941
## [451] 422 319 653 397 1447 999 321 1143 667 1132 60 396 194 1068 337
## [466] 400 140 409 208 13 458 713 820 310 587 320 566 748 599 411
## [481] 1185 398 169 272 66 679 8 18 497 1065 276 716 20 760 253
## [496] 551 675 46 484 333 369 464 362 997 287 649 470 762 591 758
## [511] 1551 480 869 61 129 979 630 234 354 502 451 296 407 120 754
## [526] 589 41 514 919 530 595 526 494 24 1353 332 1234 687 428 488
## [541] 486 413 892 452 614 749 1327 28 47 677 643 2653 302 570 938
## [556] 260 901 138 590 546 371 312 163 328 722 323 611 539 359 671
## [571] 781 1005 303 343 418 45 419 1148 349 3253 606 894 813 891 210
## [586] 1067 543 382 492 1183 903 4 375 1628 840 1167 386 868 327 485
## [601] 506 351 315 529 1720 533 429 766 616 1130 747 496 2301 460 220
## [616] 776 568 448 186 534 1334 1138 1019 364 1090 857 269 637 536 475
## [631] 453 330 338 764 873 1176 384 33 602 476 0 689 718 796 662
## [646] 799 715 633 348 1014 700 1045 1152 725 358 196 493 254 742 504
## [661] 1092 399 952 426 457 3643 1105 838 829 565 644 771 513 646 356
## [676] 693 592 628 556 769 1111 843 668 848 855 517 992 619 867 1441
## [691] 665 1171 542 607 800 1150 1855 1203 723 308 823 1076 837 780 789
## [706] 1002 578 507 508 567 421 1241 373 571 469 527 588 645 1221 704
## [721] 378 1127 818 1062 562 825 435 802 531 306 739 365 325 1432 1806
## [736] 1046 674 740 1119 636 1357 414 727 1009 283 1011 511 1186 402 519
## [751] 490 683 688 1340 472 882 520 515 1332 1820 1311 559 1365 1980 410
## [766] 895 1190 784 376 521 834 450 1128 516 770 1074 1259 1422 1300 1135
## [781] 624 540 657 627 681 491 705 597 1298 1438 277 1087 782 416 1288
## [796] 1424 720 726 537 996 815 805 1468 801 495 463 814 350 702 623
## [811] 980 1195 478 881 445 658 528 522 1012 1590 621 1602 757 593 879
## [826] 580 620 1386
# duration is already numeric, so correlate the raw values with y, exactly as
# is done for age. Wrapping it in as.factor() first replaced every value with
# its level index, so the coefficient was not computed on the durations.
cor(bank$duration, as.numeric(as.factor(bank$y)))
## NOTE: re-run to refresh this output; the earlier value 0.4197168 came from
## the level-index version of the call.
# Duration plots: histogram with the mean marked, boxplot, counts split by y.
p1 <- ggplot(bank, aes(x = duration)) +
  geom_histogram(color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("Duration") +
  # A fixed colour must sit outside aes(); inside aes() the string "red" is
  # mapped as data, which adds a legend and uses the default palette colour.
  geom_vline(xintercept = mean(bank$duration), color = "red") +
  # Breaks cover the observed range (max 3643); the earlier seq(0, 100, 10)
  # only produced ticks in the first 100 units of the axis.
  scale_x_continuous(breaks = seq(0, 4000, 500)) +
  theme_minimal()
p2 <- ggplot(bank, aes(x = "", y = duration)) + geom_boxplot()
p3 <- ggplot(bank, aes(x = duration, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)
# Campaign: quartiles, distinct values, and Pearson correlation with y.
summary(bank$campaign)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.537 3.000 35.000
unique(bank$campaign)
## [1] 2 4 1 3 6 7 27 5 12 14 10 8 11 13 9 15 16 18 17 22 19 23 24 35 29
# campaign is already numeric, so correlate the raw values (as done for age).
# as.factor() replaced each value with its level index, and the observed
# values have gaps, so the indices did not match the actual values.
cor(bank$campaign, as.numeric(as.factor(bank$y)))
## NOTE: re-run to refresh this output; the earlier value -0.07726492 came
## from the level-index version of the call.
# Campaign plots: histogram with the mean marked, boxplot, counts split by y.
p1 <- ggplot(bank, aes(x = campaign)) +
  geom_histogram(color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("Campaign") +
  # A fixed colour must sit outside aes(); inside aes() the string "red" is
  # mapped as data, which adds a legend and uses the default palette colour.
  geom_vline(xintercept = mean(bank$campaign), color = "red") +
  # Breaks sized to the observed range (1 to 35).
  scale_x_continuous(breaks = seq(0, 35, 5)) +
  theme_minimal()
p2 <- ggplot(bank, aes(x = "", y = campaign)) + geom_boxplot()
p3 <- ggplot(bank, aes(x = campaign, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)
# pdays: quartiles, distinct values, and correlation with y.
summary(bank[["pdays"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 999.0 999.0 960.4 999.0 999.0
unique(bank[["pdays"]])
## [1] 999 12 3 6 5 2 10 11 7 1 18 4 15 0 16 9 19 17 13
## [20] 21 14
# NOTE(review): the factor round-trip turns pdays into level indices, so 999
# (presumably a placeholder, since it is recoded separately further down)
# becomes the highest index rather than 999. Confirm this is intended.
cor(as.integer(factor(bank[["pdays"]])), as.integer(factor(bank[["y"]])))
## [1] -0.3292231
# Collapse pdays to a two-level string flag: "0" where it equals 999, "1"
# otherwise, then show the counts of each flag split by y.
bank <- mutate(bank, pdays = if_else(pdays == 999, "0", "1"))
#hist(as.numeric(bank$pdays))
p1 <- ggplot(bank, aes(x = pdays, fill = y)) +
  geom_bar() +
  guides()
print(p1)
# Five-number summary, distinct values and outcome correlation for previous.
summary(bank$previous)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.1903 0.0000 6.0000
unique(bank$previous)
## [1] 0 2 1 3 5 4 6
# NOTE: both arguments are integer factor codes, not the raw numbers.
cor(as.numeric(as.factor(bank$previous)), as.numeric(as.factor(bank$y)))
## [1] 0.2556966
# Histogram with the mean marked in red. `color` is set outside aes() so it is
# a literal colour rather than a mapped value with its own legend.
# previous only takes the values 0..6, so one break per integer is used;
# breaks every 10 units would leave a single tick at 0.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = previous), color = "black", fill = "grey", bins = 30) +
  geom_vline(xintercept = mean(bank$previous), color = "red") +
  scale_x_continuous(breaks = seq(0, max(bank$previous), by = 1)) +
  ylab("Count") +
  xlab("Previous") +
  theme_minimal()
# Counts per value of previous, stacked by outcome.
p2 <- ggplot(data = bank, aes(x = previous, fill = y)) +
  geom_bar()
grid.arrange(p1, p2, ncol = 2)
## 15.Poutcome
# poutcome is a character column with three distinct values.
summary(bank$poutcome)
## Length Class Mode
## 4119 character character
unique(bank$poutcome)
## [1] "nonexistent" "failure" "success"
# Correlation of the alphabetical factor codes of poutcome with those of y.
cor(as.numeric(as.factor(bank$poutcome)), as.numeric(as.factor(bank$y)))
## [1] 0.1233946
# Counts per poutcome level. poutcome is mapped once, at plot level, so the
# x axis is labelled "poutcome"; mapping y at plot level and overriding it in
# the layer leaves the axis titled "y" although the bars show poutcome.
p1 <- ggplot(bank, aes(x = poutcome)) +
  geom_bar()
# The same counts, stacked by outcome.
p2 <- ggplot(data = bank, aes(x = poutcome, fill = y)) +
  geom_bar()
grid.arrange(p1, p2, ncol = 2)
# Contingency table of poutcome by y, without table and column proportions.
CrossTable(bank$poutcome, bank$y, prop.t = FALSE, prop.c = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## |-------------------------|
##
##
## Total Observations in Table: 4119
##
##
## | bank$y
## bank$poutcome | no | yes | Row Total |
## --------------|-----------|-----------|-----------|
## failure | 387 | 67 | 454 |
## | 0.739 | 6.014 | |
## | 0.852 | 0.148 | 0.110 |
## --------------|-----------|-----------|-----------|
## nonexistent | 3231 | 292 | 3523 |
## | 2.801 | 22.781 | |
## | 0.917 | 0.083 | 0.855 |
## --------------|-----------|-----------|-----------|
## success | 50 | 92 | 142 |
## | 46.222 | 375.928 | |
## | 0.352 | 0.648 | 0.034 |
## --------------|-----------|-----------|-----------|
## Column Total | 3668 | 451 | 4119 |
## --------------|-----------|-----------|-----------|
##
##
# Five-number summary, distinct values and outcome correlation for
# emp.var.rate.
summary(bank$emp.var.rate)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.40000 -1.80000 1.10000 0.08497 1.40000 1.40000
unique(bank$emp.var.rate)
## [1] -1.8 1.1 1.4 -0.1 -1.1 -2.9 -1.7 -3.4 -3.0 -0.2
# NOTE: both arguments are integer factor codes (rank of the distinct values),
# not the raw numbers.
cor(as.numeric(as.factor(bank$emp.var.rate)), as.numeric(as.factor(bank$y)))
## [1] -0.2714018
# Histogram with the mean marked in red. `color` is set outside aes() so it is
# a literal colour rather than a mapped value with its own legend.
# Breaks come from the data: the variable spans -3.4..1.4, so breaks every
# 10 units starting at 0 would leave a single tick.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = emp.var.rate), color = "black", fill = "grey",
                 bins = 30) +
  geom_vline(xintercept = mean(bank$emp.var.rate), color = "red") +
  scale_x_continuous(breaks = pretty(bank$emp.var.rate, n = 10)) +
  ylab("Count") +
  xlab("emp.var.rate") +
  theme_minimal()
# Box plot of emp.var.rate (single, unlabelled group on the x axis).
p2 <- ggplot(bank) +
  geom_boxplot(aes(x = "", y = emp.var.rate))
# Counts per value, stacked by outcome.
p3 <- ggplot(data = bank, aes(x = emp.var.rate, fill = y)) +
  geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)
## 17.cons.price.idx
# Five-number summary, distinct values and outcome correlation.
summary(bank$cons.price.idx)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 92.20 93.08 93.75 93.58 93.99 94.77
unique(bank$cons.price.idx)
## [1] 92.893 93.994 94.465 93.200 94.199 93.918 93.444 93.369 92.843 92.963
## [11] 94.601 94.027 92.379 92.431 93.749 93.075 94.055 92.469 94.767 92.201
## [21] 92.649 94.215 93.876 93.798 92.713 92.756
# NOTE: both arguments are integer factor codes (rank of the distinct values),
# not the raw numbers.
cor(as.numeric(as.factor(bank$cons.price.idx)), as.numeric(as.factor(bank$y)))
## [1] -0.102792
# Histogram with the mean of cons.price.idx itself marked in red (the mean of
# emp.var.rate lies far outside this variable's 92..95 range).
# `color` is set outside aes() so it is a literal colour, and the breaks come
# from the data because none of 0, 10, ..., 100 falls inside the range.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = cons.price.idx), color = "black", fill = "grey",
                 bins = 30) +
  geom_vline(xintercept = mean(bank$cons.price.idx), color = "red") +
  scale_x_continuous(breaks = pretty(bank$cons.price.idx, n = 10)) +
  ylab("Count") +
  xlab("cons.price.idx") +
  theme_minimal()
# Box plot of cons.price.idx (single, unlabelled group on the x axis).
p2 <- ggplot(bank) +
  geom_boxplot(aes(x = "", y = cons.price.idx))
# Counts per value, stacked by outcome.
p3 <- ggplot(data = bank, aes(x = cons.price.idx, fill = y)) +
  geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)
## 18.cons.conf.idx
# Five-number summary, distinct values and outcome correlation.
summary(bank$cons.conf.idx)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -50.8 -42.7 -41.8 -40.5 -36.4 -26.9
unique(bank$cons.conf.idx)
## [1] -46.2 -36.4 -41.8 -42.0 -37.5 -42.7 -36.1 -34.8 -50.0 -40.8 -49.5 -38.3
## [13] -29.8 -26.9 -34.6 -47.1 -39.8 -33.6 -50.8 -31.4 -30.1 -40.3 -40.0 -40.4
## [25] -33.0 -45.9
# NOTE: both arguments are integer factor codes (rank of the distinct values),
# not the raw numbers.
cor(as.numeric(as.factor(bank$cons.conf.idx)), as.numeric(as.factor(bank$y)))
## [1] 0.06968329
# Histogram with the mean marked in red. `color` is set outside aes() so it is
# a literal colour rather than a mapped value with its own legend.
# Breaks come from the data: every value is negative, so non-negative breaks
# would leave the axis without ticks.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = cons.conf.idx), color = "black", fill = "grey",
                 bins = 30) +
  geom_vline(xintercept = mean(bank$cons.conf.idx), color = "red") +
  scale_x_continuous(breaks = pretty(bank$cons.conf.idx, n = 10)) +
  ylab("Count") +
  xlab("cons.conf.idx") +
  theme_minimal()
# Box plot of cons.conf.idx (single, unlabelled group on the x axis).
p2 <- ggplot(bank) +
  geom_boxplot(aes(x = "", y = cons.conf.idx))
# Counts per value, stacked by outcome.
p3 <- ggplot(data = bank, aes(x = cons.conf.idx, fill = y)) +
  geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)
## 19.euribor3m
# Five-number summary, distinct values and outcome correlation.
summary(bank$euribor3m)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.635 1.334 4.857 3.621 4.961 5.045
unique(bank$euribor3m)
## [1] 1.313 4.855 4.962 4.959 4.191 0.884 0.879 4.153 4.958 4.968 4.859 4.963
## [13] 4.957 4.965 4.961 0.639 4.967 4.864 4.856 1.299 4.860 1.687 4.865 1.268
## [25] 4.120 1.334 0.977 1.344 0.899 1.327 4.592 4.970 1.260 4.966 0.770 4.866
## [37] 4.964 4.857 0.886 0.739 0.654 1.405 1.281 4.960 0.754 1.291 1.365 4.076
## [49] 1.266 1.410 1.250 4.858 0.702 1.029 1.085 1.392 1.262 1.050 0.851 0.716
## [61] 0.877 0.835 1.048 0.904 1.028 0.637 1.244 1.354 4.021 1.453 0.715 1.778
## [73] 0.773 1.035 0.900 0.898 0.742 0.861 1.264 0.704 1.270 0.695 1.039 1.531
## [85] 0.883 0.748 0.809 4.794 1.479 0.697 0.959 1.032 0.896 0.827 1.483 0.905
## [97] 1.466 0.714 0.644 0.849 0.881 0.834 0.645 0.659 0.885 1.041 0.942 0.737
## [109] 4.947 0.722 1.049 1.415 0.797 0.699 0.810 0.710 1.423 0.707 0.646 1.043
## [121] 4.955 0.668 0.825 1.435 0.720 0.767 0.982 1.602 1.259 1.811 0.859 1.224
## [133] 0.876 0.878 1.099 0.788 0.717 0.838 0.640 0.762 1.663 0.730 0.728 1.372
## [145] 0.782 4.245 1.510 3.329 0.749 4.343 0.893 0.731 0.635 0.700 0.889 0.649
## [157] 0.873 1.445 1.629 0.944 3.853 0.870 0.790 5.045 0.914 0.719 0.735 1.498
## [169] 0.677 0.819 0.652 0.692 0.829 1.726 1.406 0.761 0.846 1.252 4.956 0.953
## [181] 0.803 0.937 0.706 0.869 1.703 0.729 0.709 1.046 0.752 0.921 4.921 0.987
## [193] 1.030 1.031 0.741 0.843 1.044 0.643 0.755 0.724 0.882 1.757 1.215 0.740
## [205] 0.683 1.520 4.663 1.059 0.636 0.771 0.655 1.400 0.650 1.384 0.778 0.682
## [217] 1.614 1.040 1.538 1.072 1.000 1.799 1.640 1.650 0.642 0.718 0.768 0.723
## [229] 0.996 0.721 0.672 0.854 1.016 0.965
# NOTE: both arguments are integer factor codes (rank of the distinct values),
# not the raw numbers.
cor(as.numeric(as.factor(bank$euribor3m)), as.numeric(as.factor(bank$y)))
## [1] -0.370733
# Histogram with the mean marked in red. `color` is set outside aes() so it is
# a literal colour rather than a mapped value with its own legend.
# Breaks come from the data: the variable spans 0.635..5.045, so breaks every
# 10 units would leave the axis without usable ticks.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = euribor3m), color = "black", fill = "grey",
                 bins = 30) +
  geom_vline(xintercept = mean(bank$euribor3m), color = "red") +
  scale_x_continuous(breaks = pretty(bank$euribor3m, n = 10)) +
  ylab("Count") +
  xlab("euribor3m")
# Box plot of euribor3m (single, unlabelled group on the x axis).
p2 <- ggplot(bank) +
  geom_boxplot(aes(x = "", y = euribor3m))
# Counts stacked by outcome, binned in steps of 0.1. Bars of width 0.1 drawn
# at the raw values overlap (distinct rates differ by as little as 0.001) and
# made ggplot2 warn "position_stack requires non-overlapping x intervals";
# histogram bins do not overlap.
p3 <- ggplot(data = bank, aes(x = euribor3m, fill = y)) +
  geom_histogram(binwidth = 0.1)
grid.arrange(p1, p2, p3, ncol = 2)
# Five-number summary, distinct values and outcome correlation for
# nr.employed.
summary(bank$nr.employed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4964 5099 5191 5166 5228 5228
unique(bank$nr.employed)
## [1] 5099.1 5191.0 5228.1 5195.8 4963.6 5008.7 5076.2 4991.6 5017.5 5023.5
## [11] 5176.3
# NOTE: both arguments are integer factor codes (rank of the distinct values),
# not the raw numbers.
cor(as.numeric(as.factor(bank$nr.employed)), as.numeric(as.factor(bank$y)))
## [1] -0.3516595
# Histogram with the mean marked in red. `color` is set outside aes() so it is
# a literal colour rather than a mapped value with its own legend.
# Breaks come from the data: the variable spans roughly 4964..5228, so none of
# 0, 10, ..., 100 would fall on the axis.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = nr.employed), color = "black", fill = "grey",
                 bins = 30) +
  geom_vline(xintercept = mean(bank$nr.employed), color = "red") +
  scale_x_continuous(breaks = pretty(bank$nr.employed, n = 10)) +
  ylab("Count") +
  xlab("nr.employed")
# Box plot of nr.employed (single, unlabelled group on the x axis).
p2 <- ggplot(bank) +
  geom_boxplot(aes(x = "", y = nr.employed))
# Counts per value, stacked by outcome.
p3 <- ggplot(data = bank, aes(x = nr.employed, fill = y)) +
  geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)
library(corrplot)
## corrplot 0.90 loaded
# Numeric copy of `bank` for the correlation matrix: each listed column is
# replaced by its integer factor codes; all other columns are carried over
# unchanged.
bank_mat <- bank %>%
  mutate(across(
    c(
      age, job, marital, education, default, duration, housing, loan,
      contact, pdays, month, day_of_week, campaign, previous, poutcome, y
    ),
    function(column) as.numeric(as.factor(column))
  ))
# Pairwise correlations of all columns, drawn as a matrix of numbers.
mat <- cor(bank_mat)
corrplot(
  mat,
  method = "number",
  tl.cex = 0.7,
  number.cex = 0.5,
  col = colorRampPalette(c("grey", "blue", "black"))(100)
)
# Disabled recodings, kept for reference:
#bank$y <- ifelse(bank$y=='yes',1,0)
#bank$pdays<-as.numeric(as.factor(bank$pdays))
# Scatterplot matrix of every column of bank_mat ...
pairs(bank_mat)
# ... and of a four-column subset.
bank_1 <- bank_mat[c("age", "duration", "month", "campaign")]
pairs(bank_1)
# pdays (mostly the "no value" placeholder) and duration are removed from the
# data set before modelling.
bank <- bank[, !(names(bank) %in% c("pdays", "duration"))]
bank$y <- as.factor(bank$y)
# Drop incomplete rows: first rows that already contain NA, then rows in which
# any column holds the string "unknown" (turned into NA first).
bank2 <- na.omit(bank)
bank2 <- na.omit(replace(bank2, bank2 == "unknown", NA))
str(bank2)
## 'data.frame': 3090 obs. of 19 variables:
## $ age : int 30 39 25 47 32 32 31 36 36 47 ...
## $ job : chr "blue-collar" "services" "services" "admin." ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education : chr "basic.9y" "high.school" "high.school" "university.degree" ...
## $ default : chr "no" "no" "no" "no" ...
## $ housing : chr "yes" "no" "yes" "yes" ...
## $ loan : chr "no" "no" "no" "no" ...
## $ contact : chr "cellular" "telephone" "telephone" "cellular" ...
## $ month : chr "may" "may" "jun" "nov" ...
## $ day_of_week : chr "fri" "fri" "wed" "mon" ...
## $ campaign : int 2 4 1 1 3 4 1 1 2 2 ...
## $ previous : int 0 0 0 0 2 0 1 0 0 0 ...
## $ poutcome : chr "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
## $ emp.var.rate : num -1.8 1.1 1.4 -0.1 -1.1 -1.1 -0.1 1.4 1.1 1.4 ...
## $ cons.price.idx: num 92.9 94 94.5 93.2 94.2 ...
## $ cons.conf.idx : num -46.2 -36.4 -41.8 -42 -37.5 -37.5 -42 -42.7 -36.4 -41.8 ...
## $ euribor3m : num 1.313 4.855 4.962 4.191 0.884 ...
## $ nr.employed : num 5099 5191 5228 5196 4964 ...
## $ y : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
## - attr(*, "na.action")= 'omit' Named int [1:1029] 4 8 10 11 19 21 25 28 29 32 ...
## ..- attr(*, "names")= chr [1:1029] "4" "8" "10" "11" ...
# Levels that remain in `default` after the "unknown" rows were removed.
levels(as.factor(bank2[["default"]]))
## [1] "no" "yes"
# Store `default` as a factor, then summarise every column of bank2.
bank2[["default"]] <- as.factor(bank2[["default"]])
summary(bank2)
## age job marital education
## Min. :20.00 Length:3090 Length:3090 Length:3090
## 1st Qu.:31.00 Class :character Class :character Class :character
## Median :37.00 Mode :character Mode :character Mode :character
## Mean :39.18
## 3rd Qu.:46.00
## Max. :88.00
## default housing loan contact
## no :3089 Length:3090 Length:3090 Length:3090
## yes: 1 Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## month day_of_week campaign previous
## Length:3090 Length:3090 Min. : 1.000 Min. :0.0000
## Class :character Class :character 1st Qu.: 1.000 1st Qu.:0.0000
## Mode :character Mode :character Median : 2.000 Median :0.0000
## Mean : 2.509 Mean :0.2081
## 3rd Qu.: 3.000 3rd Qu.:0.0000
## Max. :35.000 Max. :6.0000
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## Length:3090 Min. :-3.4000 Min. :92.20 Min. :-50.80
## Class :character 1st Qu.:-1.8000 1st Qu.:93.08 1st Qu.:-42.70
## Mode :character Median : 1.1000 Median :93.44 Median :-41.80
## Mean :-0.0468 Mean :93.53 Mean :-40.62
## 3rd Qu.: 1.4000 3rd Qu.:93.99 3rd Qu.:-36.40
## Max. : 1.4000 Max. :94.77 Max. :-26.90
## euribor3m nr.employed y
## Min. :0.635 Min. :4964 no :2720
## 1st Qu.:1.313 1st Qu.:5099 yes: 370
## Median :4.856 Median :5191
## Mean :3.482 Mean :5161
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
# Splitting data train and test
# Seed the random number generator before sampling: the row draw below is the
# only random step of the split, so without a seed every run produces a
# different train/test partition and different model output.
set.seed(1234)
# Draw 75% of the row indices (rounded down to a whole number of rows) for
# training; the remaining rows form the test set.
splitBank <- sort(sample(nrow(bank2), floor(nrow(bank2) * 0.75)))
Btrain <- bank2[splitBank, ]
Btest <- bank2[-splitBank, ]
# AIC model
# Model formula: y against every other column.
form_2 <- as.formula("y ~ .")
form_2
## y ~ .
set.seed(1234)
# No resampling (method = "none"): train() fits a single model on Btrain.
objControl <- trainControl(
  method = "none",
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = TRUE
)
# Remove the hyphen from the two hyphenated labels (presumably so the
# dummy-variable names built from them, e.g. "jobbluecollar", are valid term
# names for the stepwise search -- confirm).
# The same recoding is applied to Btest so both partitions use identical
# labels; otherwise models fitted on Btrain meet labels in Btest they have
# never seen.
Btrain[Btrain == "blue-collar"] <- "bluecollar"
Btrain[Btrain == "self-employed"] <- "selfemployed"
Btest[Btest == "blue-collar"] <- "bluecollar"
Btest[Btest == "self-employed"] <- "selfemployed"
# Forward stepwise logistic regression selected by AIC, fitted through caret.
aicmodel <- train(
  form_2,
  data = Btrain,
  method = "glmStepAIC",
  trControl = objControl,
  metric = "ROC",
  direction = "forward"
)
## Start: AIC=1733.79
## .outcome ~ 1
##
## Df Deviance AIC
## + nr.employed 1 1477.7 1481.7
## + euribor3m 1 1520.7 1524.7
## + emp.var.rate 1 1543.0 1547.0
## + poutcomesuccess 1 1584.5 1588.5
## + previous 1 1625.2 1629.2
## + poutcomenonexistent 1 1651.2 1655.2
## + contacttelephone 1 1665.7 1669.7
## + monthmar 1 1682.8 1686.8
## + cons.price.idx 1 1704.7 1708.7
## + monthoct 1 1705.1 1709.1
## + monthmay 1 1706.0 1710.0
## + campaign 1 1711.4 1715.4
## + monthsep 1 1712.4 1716.4
## + maritalsingle 1 1723.3 1727.3
## + jobstudent 1 1723.3 1727.3
## + maritalmarried 1 1724.1 1728.1
## + cons.conf.idx 1 1724.7 1728.7
## + jobentrepreneur 1 1724.9 1728.9
## + educationuniversity.degree 1 1726.6 1730.6
## + monthdec 1 1726.7 1730.7
## + jobretired 1 1726.9 1730.9
## + age 1 1727.5 1731.5
## + monthjul 1 1727.5 1731.5
## + educationbasic.9y 1 1727.9 1731.9
## + jobbluecollar 1 1728.5 1732.5
## + jobunemployed 1 1728.8 1732.8
## <none> 1731.8 1733.8
## + jobmanagement 1 1730.0 1734.0
## + monthjun 1 1730.2 1734.2
## + jobservices 1 1730.5 1734.5
## + monthnov 1 1730.6 1734.6
## + housingyes 1 1731.0 1735.0
## + educationbasic.6y 1 1731.2 1735.2
## + day_of_weekwed 1 1731.2 1735.2
## + loanyes 1 1731.4 1735.4
## + jobselfemployed 1 1731.4 1735.4
## + defaultyes 1 1731.5 1735.5
## + monthaug 1 1731.7 1735.7
## + educationprofessional.course 1 1731.8 1735.8
## + educationhigh.school 1 1731.8 1735.8
## + day_of_weekmon 1 1731.8 1735.8
## + day_of_weekthu 1 1731.8 1735.8
## + jobtechnician 1 1731.8 1735.8
## + day_of_weektue 1 1731.8 1735.8
## + jobhousemaid 1 1731.8 1735.8
##
## Step: AIC=1481.69
## .outcome ~ nr.employed
##
## Df Deviance AIC
## + poutcomesuccess 1 1434.4 1440.4
## + monthmay 1 1448.5 1454.5
## + contacttelephone 1 1456.5 1462.5
## + monthmar 1 1456.5 1462.5
## + monthjun 1 1469.5 1475.5
## + previous 1 1469.9 1475.9
## + monthaug 1 1472.2 1478.2
## + cons.conf.idx 1 1472.7 1478.7
## + monthjul 1 1472.9 1478.9
## + campaign 1 1473.2 1479.2
## + jobentrepreneur 1 1473.5 1479.5
## + maritalmarried 1 1474.5 1480.5
## + maritalsingle 1 1474.7 1480.7
## + poutcomenonexistent 1 1475.1 1481.1
## + educationuniversity.degree 1 1475.3 1481.3
## <none> 1477.7 1481.7
## + monthsep 1 1476.2 1482.2
## + jobmanagement 1 1476.5 1482.5
## + educationbasic.9y 1 1476.6 1482.6
## + jobtechnician 1 1476.6 1482.6
## + emp.var.rate 1 1476.6 1482.6
## + day_of_weekmon 1 1476.6 1482.6
## + jobservices 1 1476.6 1482.6
## + jobunemployed 1 1476.8 1482.8
## + jobbluecollar 1 1477.0 1483.0
## + cons.price.idx 1 1477.2 1483.2
## + housingyes 1 1477.3 1483.3
## + age 1 1477.3 1483.3
## + jobselfemployed 1 1477.3 1483.3
## + educationhigh.school 1 1477.4 1483.4
## + jobstudent 1 1477.5 1483.5
## + educationprofessional.course 1 1477.5 1483.5
## + monthnov 1 1477.5 1483.5
## + defaultyes 1 1477.6 1483.6
## + monthoct 1 1477.6 1483.6
## + monthdec 1 1477.6 1483.6
## + jobretired 1 1477.6 1483.6
## + euribor3m 1 1477.7 1483.7
## + loanyes 1 1477.7 1483.7
## + day_of_weekthu 1 1477.7 1483.7
## + day_of_weektue 1 1477.7 1483.7
## + educationbasic.6y 1 1477.7 1483.7
## + day_of_weekwed 1 1477.7 1483.7
## + jobhousemaid 1 1477.7 1483.7
##
## Step: AIC=1440.41
## .outcome ~ nr.employed + poutcomesuccess
##
## Df Deviance AIC
## + monthmay 1 1411.0 1419.0
## + monthmar 1 1413.6 1421.6
## + contacttelephone 1 1414.0 1422.0
## + monthjun 1 1425.3 1433.3
## + campaign 1 1429.8 1437.8
## + maritalmarried 1 1430.7 1438.7
## + jobentrepreneur 1 1431.2 1439.2
## + monthjul 1 1431.2 1439.2
## + maritalsingle 1 1431.3 1439.3
## + monthaug 1 1431.3 1439.3
## + emp.var.rate 1 1431.9 1439.9
## + poutcomenonexistent 1 1432.2 1440.2
## <none> 1434.4 1440.4
## + monthsep 1 1432.4 1440.4
## + cons.conf.idx 1 1432.6 1440.6
## + educationuniversity.degree 1 1432.7 1440.7
## + jobtechnician 1 1432.7 1440.7
## + jobmanagement 1 1433.1 1441.1
## + cons.price.idx 1 1433.2 1441.2
## + jobstudent 1 1433.6 1441.6
## + jobservices 1 1433.6 1441.6
## + monthnov 1 1433.6 1441.6
## + day_of_weekmon 1 1433.7 1441.7
## + monthoct 1 1433.8 1441.8
## + educationprofessional.course 1 1434.0 1442.0
## + educationbasic.9y 1 1434.0 1442.0
## + educationhigh.school 1 1434.0 1442.0
## + jobbluecollar 1 1434.1 1442.1
## + jobunemployed 1 1434.1 1442.1
## + euribor3m 1 1434.1 1442.1
## + jobselfemployed 1 1434.1 1442.1
## + jobretired 1 1434.2 1442.2
## + defaultyes 1 1434.3 1442.3
## + previous 1 1434.3 1442.3
## + loanyes 1 1434.3 1442.3
## + age 1 1434.4 1442.4
## + monthdec 1 1434.4 1442.4
## + housingyes 1 1434.4 1442.4
## + day_of_weekwed 1 1434.4 1442.4
## + jobhousemaid 1 1434.4 1442.4
## + day_of_weektue 1 1434.4 1442.4
## + day_of_weekthu 1 1434.4 1442.4
## + educationbasic.6y 1 1434.4 1442.4
##
## Step: AIC=1419.01
## .outcome ~ nr.employed + poutcomesuccess + monthmay
##
## Df Deviance AIC
## + monthmar 1 1394.8 1404.8
## + contacttelephone 1 1396.7 1406.7
## + monthnov 1 1406.6 1416.6
## + campaign 1 1406.6 1416.6
## + monthsep 1 1406.8 1416.8
## + euribor3m 1 1407.0 1417.0
## + emp.var.rate 1 1407.1 1417.1
## + monthjun 1 1407.5 1417.5
## + jobentrepreneur 1 1407.5 1417.5
## + maritalmarried 1 1408.1 1418.1
## + cons.price.idx 1 1408.4 1418.4
## + maritalsingle 1 1408.8 1418.8
## + poutcomenonexistent 1 1408.9 1418.9
## <none> 1411.0 1419.0
## + jobtechnician 1 1409.5 1419.5
## + day_of_weekmon 1 1409.9 1419.9
## + jobmanagement 1 1409.9 1419.9
## + jobretired 1 1410.2 1420.2
## + educationuniversity.degree 1 1410.4 1420.4
## + jobservices 1 1410.6 1420.6
## + educationprofessional.course 1 1410.7 1420.7
## + jobselfemployed 1 1410.7 1420.7
## + monthjul 1 1410.7 1420.7
## + jobstudent 1 1410.7 1420.7
## + educationhigh.school 1 1410.7 1420.7
## + jobunemployed 1 1410.8 1420.8
## + monthdec 1 1410.8 1420.8
## + monthaug 1 1410.8 1420.8
## + defaultyes 1 1410.8 1420.8
## + loanyes 1 1410.9 1420.9
## + previous 1 1410.9 1420.9
## + jobbluecollar 1 1411.0 1421.0
## + educationbasic.6y 1 1411.0 1421.0
## + day_of_weekwed 1 1411.0 1421.0
## + jobhousemaid 1 1411.0 1421.0
## + educationbasic.9y 1 1411.0 1421.0
## + housingyes 1 1411.0 1421.0
## + cons.conf.idx 1 1411.0 1421.0
## + monthoct 1 1411.0 1421.0
## + age 1 1411.0 1421.0
## + day_of_weektue 1 1411.0 1421.0
## + day_of_weekthu 1 1411.0 1421.0
##
## Step: AIC=1404.75
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar
##
## Df Deviance AIC
## + contacttelephone 1 1380.9 1392.9
## + monthjun 1 1389.7 1401.7
## + campaign 1 1390.3 1402.3
## + emp.var.rate 1 1391.0 1403.0
## + jobentrepreneur 1 1391.7 1403.7
## + monthnov 1 1391.7 1403.7
## + monthsep 1 1392.0 1404.0
## + euribor3m 1 1392.0 1404.0
## + maritalmarried 1 1392.3 1404.3
## + poutcomenonexistent 1 1392.4 1404.4
## <none> 1394.8 1404.8
## + maritalsingle 1 1392.8 1404.8
## + cons.price.idx 1 1393.0 1405.0
## + jobtechnician 1 1393.5 1405.5
## + jobmanagement 1 1393.5 1405.5
## + day_of_weekmon 1 1393.6 1405.6
## + jobretired 1 1393.8 1405.8
## + educationuniversity.degree 1 1394.0 1406.0
## + monthaug 1 1394.1 1406.1
## + monthjul 1 1394.1 1406.1
## + jobselfemployed 1 1394.3 1406.3
## + cons.conf.idx 1 1394.3 1406.3
## + jobservices 1 1394.4 1406.4
## + jobunemployed 1 1394.4 1406.4
## + monthoct 1 1394.5 1406.5
## + loanyes 1 1394.6 1406.6
## + educationprofessional.course 1 1394.6 1406.6
## + educationhigh.school 1 1394.6 1406.6
## + defaultyes 1 1394.6 1406.6
## + day_of_weekwed 1 1394.6 1406.6
## + previous 1 1394.6 1406.6
## + monthdec 1 1394.7 1406.7
## + jobbluecollar 1 1394.7 1406.7
## + jobstudent 1 1394.7 1406.7
## + educationbasic.9y 1 1394.7 1406.7
## + age 1 1394.7 1406.7
## + educationbasic.6y 1 1394.7 1406.7
## + day_of_weektue 1 1394.7 1406.7
## + housingyes 1 1394.8 1406.8
## + jobhousemaid 1 1394.8 1406.8
## + day_of_weekthu 1 1394.8 1406.8
##
## Step: AIC=1392.92
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar +
## contacttelephone
##
## Df Deviance AIC
## + monthjun 1 1367.3 1381.3
## + poutcomenonexistent 1 1376.2 1390.2
## + monthnov 1 1376.8 1390.8
## + campaign 1 1377.5 1391.5
## + jobentrepreneur 1 1377.8 1391.8
## + cons.conf.idx 1 1378.7 1392.7
## <none> 1380.9 1392.9
## + maritalmarried 1 1379.0 1393.0
## + jobmanagement 1 1379.1 1393.1
## + maritalsingle 1 1379.5 1393.5
## + monthsep 1 1379.7 1393.7
## + jobtechnician 1 1379.8 1393.8
## + emp.var.rate 1 1379.9 1393.9
## + jobretired 1 1379.9 1393.9
## + day_of_weekmon 1 1380.1 1394.1
## + previous 1 1380.3 1394.3
## + jobunemployed 1 1380.3 1394.3
## + monthoct 1 1380.4 1394.4
## + jobselfemployed 1 1380.4 1394.4
## + jobservices 1 1380.5 1394.5
## + educationuniversity.degree 1 1380.6 1394.6
## + educationprofessional.course 1 1380.7 1394.7
## + loanyes 1 1380.7 1394.7
## + monthjul 1 1380.7 1394.7
## + defaultyes 1 1380.7 1394.7
## + educationbasic.6y 1 1380.8 1394.8
## + jobbluecollar 1 1380.8 1394.8
## + jobstudent 1 1380.8 1394.8
## + educationhigh.school 1 1380.8 1394.8
## + euribor3m 1 1380.8 1394.8
## + day_of_weekwed 1 1380.9 1394.9
## + educationbasic.9y 1 1380.9 1394.9
## + housingyes 1 1380.9 1394.9
## + monthdec 1 1380.9 1394.9
## + jobhousemaid 1 1380.9 1394.9
## + day_of_weekthu 1 1380.9 1394.9
## + monthaug 1 1380.9 1394.9
## + cons.price.idx 1 1380.9 1394.9
## + day_of_weektue 1 1380.9 1394.9
## + age 1 1380.9 1394.9
##
## Step: AIC=1381.28
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar +
## contacttelephone + monthjun
##
## Df Deviance AIC
## + cons.conf.idx 1 1362.8 1378.8
## + poutcomenonexistent 1 1363.1 1379.1
## + campaign 1 1364.1 1380.1
## + jobentrepreneur 1 1364.6 1380.6
## <none> 1367.3 1381.3
## + monthnov 1 1365.3 1381.3
## + monthoct 1 1365.6 1381.6
## + maritalmarried 1 1365.7 1381.7
## + jobmanagement 1 1365.8 1381.8
## + monthjul 1 1366.1 1382.1
## + maritalsingle 1 1366.1 1382.1
## + jobtechnician 1 1366.2 1382.2
## + jobselfemployed 1 1366.3 1382.3
## + jobretired 1 1366.4 1382.4
## + monthaug 1 1366.6 1382.6
## + day_of_weekmon 1 1366.7 1382.7
## + previous 1 1366.8 1382.8
## + jobunemployed 1 1366.8 1382.8
## + jobservices 1 1366.9 1382.9
## + educationprofessional.course 1 1367.0 1383.0
## + euribor3m 1 1367.0 1383.0
## + monthsep 1 1367.0 1383.0
## + educationuniversity.degree 1 1367.1 1383.1
## + loanyes 1 1367.1 1383.1
## + defaultyes 1 1367.1 1383.1
## + emp.var.rate 1 1367.2 1383.2
## + cons.price.idx 1 1367.2 1383.2
## + educationbasic.6y 1 1367.2 1383.2
## + jobstudent 1 1367.2 1383.2
## + jobbluecollar 1 1367.2 1383.2
## + educationhigh.school 1 1367.2 1383.2
## + age 1 1367.2 1383.2
## + housingyes 1 1367.2 1383.2
## + jobhousemaid 1 1367.2 1383.2
## + day_of_weekwed 1 1367.2 1383.2
## + day_of_weektue 1 1367.3 1383.3
## + educationbasic.9y 1 1367.3 1383.3
## + day_of_weekthu 1 1367.3 1383.3
## + monthdec 1 1367.3 1383.3
##
## Step: AIC=1378.75
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar +
## contacttelephone + monthjun + cons.conf.idx
##
## Df Deviance AIC
## + poutcomenonexistent 1 1358.3 1376.3
## + campaign 1 1359.7 1377.7
## + jobentrepreneur 1 1360.1 1378.1
## + monthjul 1 1360.7 1378.7
## <none> 1362.8 1378.8
## + maritalmarried 1 1361.2 1379.2
## + jobmanagement 1 1361.3 1379.3
## + monthnov 1 1361.5 1379.5
## + jobretired 1 1361.5 1379.5
## + maritalsingle 1 1361.5 1379.5
## + jobselfemployed 1 1361.7 1379.7
## + jobtechnician 1 1361.8 1379.8
## + monthoct 1 1361.8 1379.8
## + monthsep 1 1362.1 1380.1
## + previous 1 1362.3 1380.3
## + cons.price.idx 1 1362.3 1380.3
## + day_of_weekmon 1 1362.3 1380.3
## + jobunemployed 1 1362.3 1380.3
## + educationprofessional.course 1 1362.5 1380.5
## + jobbluecollar 1 1362.5 1380.5
## + educationbasic.6y 1 1362.5 1380.5
## + euribor3m 1 1362.5 1380.5
## + jobservices 1 1362.5 1380.5
## + loanyes 1 1362.6 1380.6
## + defaultyes 1 1362.6 1380.6
## + day_of_weektue 1 1362.7 1380.7
## + jobstudent 1 1362.7 1380.7
## + educationuniversity.degree 1 1362.7 1380.7
## + emp.var.rate 1 1362.7 1380.7
## + educationhigh.school 1 1362.7 1380.7
## + jobhousemaid 1 1362.7 1380.7
## + monthdec 1 1362.7 1380.7
## + day_of_weekthu 1 1362.7 1380.7
## + day_of_weekwed 1 1362.7 1380.7
## + housingyes 1 1362.7 1380.7
## + monthaug 1 1362.7 1380.7
## + age 1 1362.8 1380.8
## + educationbasic.9y 1 1362.8 1380.8
##
## Step: AIC=1376.32
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar +
## contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent
##
## Df Deviance AIC
## + campaign 1 1355.1 1375.1
## + jobentrepreneur 1 1355.7 1375.7
## + monthjul 1 1356.2 1376.2
## <none> 1358.3 1376.3
## + previous 1 1356.6 1376.6
## + maritalmarried 1 1356.8 1376.8
## + jobmanagement 1 1356.9 1376.9
## + jobretired 1 1357.0 1377.0
## + monthnov 1 1357.1 1377.1
## + maritalsingle 1 1357.2 1377.2
## + jobselfemployed 1 1357.2 1377.2
## + jobtechnician 1 1357.2 1377.2
## + monthoct 1 1357.4 1377.4
## + monthsep 1 1357.6 1377.6
## + day_of_weekmon 1 1357.8 1377.8
## + cons.price.idx 1 1357.8 1377.8
## + educationprofessional.course 1 1357.9 1377.9
## + jobbluecollar 1 1358.0 1378.0
## + jobunemployed 1 1358.0 1378.0
## + educationbasic.6y 1 1358.1 1378.1
## + jobstudent 1 1358.1 1378.1
## + jobservices 1 1358.1 1378.1
## + euribor3m 1 1358.2 1378.2
## + loanyes 1 1358.2 1378.2
## + defaultyes 1 1358.2 1378.2
## + day_of_weektue 1 1358.2 1378.2
## + emp.var.rate 1 1358.3 1378.3
## + monthaug 1 1358.3 1378.3
## + educationuniversity.degree 1 1358.3 1378.3
## + day_of_weekwed 1 1358.3 1378.3
## + educationhigh.school 1 1358.3 1378.3
## + housingyes 1 1358.3 1378.3
## + monthdec 1 1358.3 1378.3
## + day_of_weekthu 1 1358.3 1378.3
## + jobhousemaid 1 1358.3 1378.3
## + educationbasic.9y 1 1358.3 1378.3
## + age 1 1358.3 1378.3
##
## Step: AIC=1375.06
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar +
## contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent +
## campaign
##
## Df Deviance AIC
## + jobentrepreneur 1 1352.2 1374.2
## + monthjul 1 1352.6 1374.6
## <none> 1355.1 1375.1
## + previous 1 1353.3 1375.3
## + monthnov 1 1353.5 1375.5
## + jobmanagement 1 1353.6 1375.6
## + maritalmarried 1 1353.6 1375.6
## + jobretired 1 1353.7 1375.7
## + maritalsingle 1 1353.9 1375.9
## + jobselfemployed 1 1354.0 1376.0
## + jobtechnician 1 1354.1 1376.1
## + cons.price.idx 1 1354.2 1376.2
## + monthoct 1 1354.3 1376.3
## + monthsep 1 1354.4 1376.4
## + day_of_weekmon 1 1354.6 1376.6
## + educationprofessional.course 1 1354.7 1376.7
## + euribor3m 1 1354.8 1376.8
## + educationbasic.6y 1 1354.8 1376.8
## + jobunemployed 1 1354.8 1376.8
## + jobbluecollar 1 1354.8 1376.8
## + emp.var.rate 1 1354.8 1376.8
## + jobstudent 1 1354.9 1376.9
## + loanyes 1 1354.9 1376.9
## + day_of_weektue 1 1354.9 1376.9
## + defaultyes 1 1355.0 1377.0
## + jobservices 1 1355.0 1377.0
## + educationuniversity.degree 1 1355.0 1377.0
## + housingyes 1 1355.0 1377.0
## + day_of_weekthu 1 1355.0 1377.0
## + educationbasic.9y 1 1355.0 1377.0
## + educationhigh.school 1 1355.1 1377.1
## + monthaug 1 1355.1 1377.1
## + day_of_weekwed 1 1355.1 1377.1
## + jobhousemaid 1 1355.1 1377.1
## + monthdec 1 1355.1 1377.1
## + age 1 1355.1 1377.1
##
## Step: AIC=1374.25
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar +
## contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent +
## campaign + jobentrepreneur
##
## Df Deviance AIC
## + monthjul 1 1350.0 1374.0
## <none> 1352.2 1374.2
## + previous 1 1350.4 1374.4
## + jobmanagement 1 1350.5 1374.5
## + jobretired 1 1350.8 1374.8
## + monthnov 1 1350.9 1374.9
## + maritalmarried 1 1350.9 1374.9
## + jobselfemployed 1 1351.1 1375.1
## + maritalsingle 1 1351.3 1375.3
## + cons.price.idx 1 1351.5 1375.5
## + monthoct 1 1351.5 1375.5
## + jobtechnician 1 1351.5 1375.5
## + monthsep 1 1351.6 1375.6
## + day_of_weekmon 1 1351.8 1375.8
## + educationprofessional.course 1 1351.9 1375.9
## + educationbasic.6y 1 1352.0 1376.0
## + euribor3m 1 1352.0 1376.0
## + jobunemployed 1 1352.0 1376.0
## + emp.var.rate 1 1352.1 1376.1
## + loanyes 1 1352.1 1376.1
## + jobservices 1 1352.1 1376.1
## + jobstudent 1 1352.1 1376.1
## + jobbluecollar 1 1352.1 1376.1
## + day_of_weektue 1 1352.1 1376.1
## + defaultyes 1 1352.1 1376.1
## + educationuniversity.degree 1 1352.2 1376.2
## + day_of_weekthu 1 1352.2 1376.2
## + monthaug 1 1352.2 1376.2
## + housingyes 1 1352.2 1376.2
## + educationbasic.9y 1 1352.2 1376.2
## + educationhigh.school 1 1352.2 1376.2
## + age 1 1352.2 1376.2
## + day_of_weekwed 1 1352.2 1376.2
## + jobhousemaid 1 1352.2 1376.2
## + monthdec 1 1352.2 1376.2
##
## Step: AIC=1374.01
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar +
## contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent +
## campaign + jobentrepreneur + monthjul
##
## Df Deviance AIC
## <none> 1350.0 1374.0
## + previous 1 1348.3 1374.3
## + jobmanagement 1 1348.5 1374.5
## + jobretired 1 1348.6 1374.6
## + jobselfemployed 1 1348.9 1374.9
## + maritalmarried 1 1348.9 1374.9
## + monthoct 1 1349.2 1375.2
## + maritalsingle 1 1349.2 1375.2
## + jobtechnician 1 1349.3 1375.3
## + monthsep 1 1349.4 1375.4
## + monthnov 1 1349.5 1375.5
## + educationbasic.6y 1 1349.7 1375.7
## + day_of_weekmon 1 1349.7 1375.7
## + educationprofessional.course 1 1349.8 1375.8
## + jobunemployed 1 1349.8 1375.8
## + monthaug 1 1349.8 1375.8
## + cons.price.idx 1 1349.8 1375.8
## + jobservices 1 1349.8 1375.8
## + loanyes 1 1349.8 1375.8
## + day_of_weektue 1 1349.8 1375.8
## + educationuniversity.degree 1 1349.9 1375.9
## + jobstudent 1 1349.9 1375.9
## + jobbluecollar 1 1349.9 1375.9
## + defaultyes 1 1349.9 1375.9
## + euribor3m 1 1349.9 1375.9
## + day_of_weekthu 1 1350.0 1376.0
## + educationbasic.9y 1 1350.0 1376.0
## + educationhigh.school 1 1350.0 1376.0
## + emp.var.rate 1 1350.0 1376.0
## + age 1 1350.0 1376.0
## + housingyes 1 1350.0 1376.0
## + day_of_weekwed 1 1350.0 1376.0
## + monthdec 1 1350.0 1376.0
## + jobhousemaid 1 1350.0 1376.0
# Coefficient table and deviance statistics of the model that train() fitted
# with method 'glmStepAIC' (forward direction) on Btrain.
summary(aicmodel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7170 -0.4104 -0.3556 -0.2348 2.7132
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 48.7547659 4.8727168 10.006 < 2e-16 ***
## nr.employed -0.0096611 0.0009683 -9.978 < 2e-16 ***
## poutcomesuccess 1.8134198 0.3003673 6.037 1.57e-09 ***
## monthmay -0.1917847 0.2041473 -0.939 0.3475
## monthmar 1.9170591 0.4181888 4.584 4.56e-06 ***
## contacttelephone -1.0829270 0.2223230 -4.871 1.11e-06 ***
## monthjun 1.0333749 0.2510308 4.117 3.85e-05 ***
## cons.conf.idx 0.0331188 0.0141603 2.339 0.0193 *
## poutcomenonexistent 0.4541331 0.2160839 2.102 0.0356 *
## campaign -0.0777937 0.0432435 -1.799 0.0720 .
## jobentrepreneur -0.8700039 0.6106654 -1.425 0.1542
## monthjul 0.3602712 0.2378341 1.515 0.1298
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.8 on 2316 degrees of freedom
## Residual deviance: 1350.0 on 2305 degrees of freedom
## AIC: 1374
##
## Number of Fisher Scoring iterations: 6
# Full logistic regression of y on every other column of Btrain, used as the
# reference model for the collinearity check.
g1 <- glm(y ~ ., family = binomial, data = Btrain)
summary(g1)
##
## Call:
## glm(formula = y ~ ., family = binomial, data = Btrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9514 -0.4299 -0.3406 -0.2358 2.7311
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.166e+02 1.342e+02 -0.869 0.3850
## age 1.378e-02 8.973e-03 1.536 0.1245
## jobbluecollar 1.715e-01 2.935e-01 0.584 0.5590
## jobentrepreneur -9.667e-01 6.312e-01 -1.532 0.1256
## jobhousemaid 5.649e-02 5.152e-01 0.110 0.9127
## jobmanagement -4.349e-01 3.109e-01 -1.399 0.1619
## jobretired -6.118e-01 4.087e-01 -1.497 0.1344
## jobselfemployed -4.625e-01 4.193e-01 -1.103 0.2701
## jobservices -1.123e-01 3.141e-01 -0.357 0.7207
## jobstudent 1.926e-01 4.522e-01 0.426 0.6702
## jobtechnician 7.799e-02 2.383e-01 0.327 0.7435
## jobunemployed 2.142e-01 4.266e-01 0.502 0.6157
## maritalmarried -1.254e-01 2.531e-01 -0.496 0.6202
## maritalsingle 5.963e-02 2.837e-01 0.210 0.8335
## educationbasic.6y 5.399e-01 4.842e-01 1.115 0.2648
## educationbasic.9y 3.381e-01 3.949e-01 0.856 0.3919
## educationhigh.school 4.634e-01 3.743e-01 1.238 0.2157
## educationprofessional.course 5.214e-01 3.961e-01 1.316 0.1881
## educationuniversity.degree 5.623e-01 3.795e-01 1.482 0.1384
## defaultyes -9.443e+00 3.247e+02 -0.029 0.9768
## housingyes -1.452e-02 1.507e-01 -0.096 0.9232
## loanyes -8.390e-02 2.060e-01 -0.407 0.6838
## contacttelephone -1.281e+00 3.018e-01 -4.245 2.19e-05 ***
## monthaug 2.537e-01 4.684e-01 0.542 0.5881
## monthdec 2.201e-01 8.079e-01 0.272 0.7853
## monthjul 1.415e-01 4.047e-01 0.350 0.7266
## monthjun 4.917e-01 4.705e-01 1.045 0.2960
## monthmar 2.240e+00 5.738e-01 3.903 9.49e-05 ***
## monthmay -6.994e-02 3.334e-01 -0.210 0.8338
## monthnov -2.065e-01 4.650e-01 -0.444 0.6570
## monthoct 4.576e-01 5.868e-01 0.780 0.4355
## monthsep 2.028e-01 6.712e-01 0.302 0.7626
## day_of_weekmon -1.607e-01 2.300e-01 -0.699 0.4847
## day_of_weekthu -6.108e-02 2.324e-01 -0.263 0.7927
## day_of_weektue -1.351e-01 2.369e-01 -0.570 0.5685
## day_of_weekwed -7.853e-02 2.378e-01 -0.330 0.7412
## campaign -7.483e-02 4.367e-02 -1.714 0.0866 .
## previous 2.291e-01 2.015e-01 1.137 0.2555
## poutcomenonexistent 7.824e-01 3.485e-01 2.245 0.0248 *
## poutcomesuccess 1.812e+00 3.162e-01 5.729 1.01e-08 ***
## emp.var.rate -7.817e-01 4.857e-01 -1.609 0.1075
## cons.price.idx 1.178e+00 8.736e-01 1.349 0.1775
## cons.conf.idx 4.049e-02 2.984e-02 1.357 0.1748
## euribor3m 6.934e-02 4.933e-01 0.141 0.8882
## nr.employed 8.207e-04 1.114e-02 0.074 0.9413
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.8 on 2316 degrees of freedom
## Residual deviance: 1331.9 on 2272 degrees of freedom
## AIC: 1421.9
##
## Number of Fisher Scoring iterations: 11
# Generalized variance inflation factors (GVIF, Df and GVIF^(1/(2*Df))) for
# each term of the full model g1.
vif(g1)
## GVIF Df GVIF^(1/(2*Df))
## age 2.042470 1 1.429150
## job 6.443548 10 1.097631
## marital 1.479163 2 1.102818
## education 3.357011 5 1.128744
## default 1.000003 1 1.000002
## housing 1.074458 1 1.036561
## loan 1.035792 1 1.017739
## contact 2.383798 1 1.543955
## month 93.327292 9 1.286604
## day_of_week 1.168171 4 1.019620
## campaign 1.064350 1 1.031673
## previous 4.098425 1 2.024457
## poutcome 4.413599 2 1.449433
## emp.var.rate 129.160437 1 11.364877
## cons.price.idx 62.820665 1 7.925949
## cons.conf.idx 5.839559 1 2.416518
## euribor3m 153.227292 1 12.378501
## nr.employed 185.061496 1 13.603731
# Multicollinearity rule of thumb: square the last VIF column,
# GVIF^(1/(2*Df)), and treat values above 10 as problematic.
# g2 refits the model without emp.var.rate, cons.price.idx, euribor3m
# and nr.employed.
g2 <- glm(
  y ~ age + job + marital + education + default + housing + loan + contact +
    month + day_of_week + campaign + previous + poutcome + cons.conf.idx,
  family = binomial,
  data = Btrain
)
summary(g2)
##
## Call:
## glm(formula = y ~ age + job + marital + education + default +
## housing + loan + contact + month + day_of_week + campaign +
## previous + poutcome + cons.conf.idx, family = binomial, data = Btrain)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0322 -0.4765 -0.3844 -0.2454 2.8590
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.160208 1.119400 -1.036 0.29999
## age 0.018839 0.008877 2.122 0.03383 *
## jobbluecollar 0.168841 0.287526 0.587 0.55705
## jobentrepreneur -1.021741 0.628333 -1.626 0.10393
## jobhousemaid 0.157648 0.491831 0.321 0.74857
## jobmanagement -0.520983 0.304542 -1.711 0.08713 .
## jobretired -0.430118 0.401267 -1.072 0.28377
## jobselfemployed -0.477189 0.413727 -1.153 0.24875
## jobservices -0.065224 0.306366 -0.213 0.83141
## jobstudent 0.793640 0.446381 1.778 0.07541 .
## jobtechnician 0.015248 0.230292 0.066 0.94721
## jobunemployed 0.345398 0.411445 0.839 0.40120
## maritalmarried -0.144562 0.244344 -0.592 0.55410
## maritalsingle 0.108589 0.273317 0.397 0.69115
## educationbasic.6y 0.613411 0.473963 1.294 0.19559
## educationbasic.9y 0.375210 0.393141 0.954 0.33989
## educationhigh.school 0.565601 0.374249 1.511 0.13071
## educationprofessional.course 0.667561 0.394129 1.694 0.09031 .
## educationuniversity.degree 0.747415 0.378561 1.974 0.04834 *
## defaultyes -10.098315 324.744149 -0.031 0.97519
## housingyes -0.004615 0.147049 -0.031 0.97496
## loanyes -0.111740 0.201335 -0.555 0.57890
## contacttelephone -1.763254 0.242901 -7.259 3.90e-13 ***
## monthaug -1.019246 0.389277 -2.618 0.00884 **
## monthdec 0.070405 0.807249 0.087 0.93050
## monthjul -0.687185 0.345542 -1.989 0.04673 *
## monthjun 0.822571 0.360265 2.283 0.02242 *
## monthmar 1.985356 0.487109 4.076 4.59e-05 ***
## monthmay -0.316261 0.317376 -0.996 0.31901
## monthnov -0.783809 0.358960 -2.184 0.02899 *
## monthoct 0.687158 0.482276 1.425 0.15421
## monthsep 0.281533 0.515757 0.546 0.58516
## day_of_weekmon -0.114120 0.224988 -0.507 0.61200
## day_of_weekthu -0.065997 0.227209 -0.290 0.77146
## day_of_weektue -0.106158 0.231208 -0.459 0.64613
## day_of_weekwed -0.063015 0.231705 -0.272 0.78565
## campaign -0.091075 0.041905 -2.173 0.02975 *
## previous 0.522585 0.201434 2.594 0.00948 **
## poutcomenonexistent 0.779988 0.350580 2.225 0.02609 *
## poutcomesuccess 2.204888 0.314099 7.020 2.22e-12 ***
## cons.conf.idx 0.050201 0.018859 2.662 0.00777 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.8 on 2316 degrees of freedom
## Residual deviance: 1390.1 on 2276 degrees of freedom
## AIC: 1472.1
##
## Number of Fisher Scoring iterations: 11
# Re-check the generalized VIFs after removing the collinear predictors
car::vif(g2)
## GVIF Df GVIF^(1/(2*Df))
## age 1.916263 1 1.384292
## job 5.384075 10 1.087816
## marital 1.462097 2 1.099624
## education 3.135346 5 1.121059
## default 1.000003 1 1.000001
## housing 1.063613 1 1.031316
## loan 1.027543 1 1.013678
## contact 1.583114 1 1.258219
## month 3.985011 9 1.079834
## day_of_week 1.130158 4 1.015412
## campaign 1.052298 1 1.025816
## previous 3.990697 1 1.997673
## poutcome 4.465515 2 1.453677
## cons.conf.idx 2.146600 1 1.465128
# Model 3: predictor set taken from the AIC-based selection (per the
# original note), followed by a collinearity check on the reduced model.
g3 <- glm(
  y ~ nr.employed + poutcome + month + contact + cons.conf.idx + campaign,
  family = binomial,
  data = Btrain
)
car::vif(g3)
## GVIF Df GVIF^(1/(2*Df))
## nr.employed 1.895607 1 1.376811
## poutcome 1.353166 2 1.078544
## month 3.540725 9 1.072766
## contact 1.402960 1 1.184466
## cons.conf.idx 1.909875 1 1.381982
## campaign 1.038116 1 1.018880
# Fitted probabilities of y == "yes" from model g2 on the training data,
# classified with the default 0.5 cutoff.
Btrain$PredProb <- predict(g2, newdata = Btrain, type = "response")
Btrain$Predy <- ifelse(Btrain$PredProb >= 0.5, "yes", "no")
# caret::confusionMatrix() expects the PREDICTIONS as `data` and the observed
# classes as `reference`; the earlier call had them swapped, so sensitivity,
# specificity, prevalence and the no-information rate were computed against
# the predictions instead of the truth. The predicted labels are given the
# same levels as Btrain$y so a cutoff that yields a single class still works.
# "yes" is reported as the positive class, matching find_cutoff() usage.
# NOTE: console output captured before this fix will not match; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1996 35
## yes 216 70
##
## Accuracy : 0.8917
## 95% CI : (0.8783, 0.904)
## No Information Rate : 0.9547
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3125
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9024
## Specificity : 0.6667
## Pos Pred Value : 0.9828
## Neg Pred Value : 0.2448
## Prevalence : 0.9547
## Detection Rate : 0.8615
## Detection Prevalence : 0.8766
## Balanced Accuracy : 0.7845
##
## 'Positive' Class : no
##
# ROC curve on the training data for the scores stored in Btrain$PredProb
lgPredObj <- prediction(predictions = Btrain$PredProb, labels = Btrain$y)
lgPerfObj <- performance(lgPredObj, measure = "tpr", x.measure = "fpr")
plot(lgPerfObj, col = 2, lwd = 2, main = "ROC Curve")
# Dotted diagonal: reference line for a classifier with no discrimination
abline(0, 1, lty = 3, lwd = 2, col = "black")
#' Scan probability cutoffs 0.01, 0.02, ..., 1.00 for a binary classifier.
#'
#' For every cutoff, observations with `pred_probability >= cutoff` are
#' labelled `positive_class_name` and all others `negitive_class_name`. The
#' labels are cross-tabulated against `actual_value` and four rates are
#' derived per cutoff:
#'   pred00 = TN / (TN + FP)  (specificity)
#'   pred01 = FP / (TN + FP)  (false-positive rate)
#'   pred10 = FN / (FN + TP)  (false-negative rate)
#'   pred11 = TP / (FN + TP)  (sensitivity)
#' together with Youden's index (pred11 + pred00 - 1) and a weighted
#' misclassification cost (pred10 * pred_10 + pred01 * pred_01).
#'
#' @param actual_value Observed class labels (factor or character).
#' @param positive_class_name Label of the positive class.
#' @param negitive_class_name Label of the negative class.
#' @param pred_probability Predicted probability of the positive class, one
#'   value per element of `actual_value`.
#' @param pred_01 Cost weight of classifying an actual negative as positive.
#' @param pred_10 Cost weight of classifying an actual positive as negative.
#' @return A ggplot object plotting Youden's index and the misclassification
#'   cost against the cutoff. As a side effect the cutoff maximising Youden's
#'   index and the cutoff minimising the cost are printed with cat().
find_cutoff <- function(actual_value, positive_class_name, negitive_class_name, pred_probability, pred_01=1, pred_10=1){
  # Pin the level order (negative first, positive second) for both the
  # observed and the predicted labels. Without this the confusion table
  # loses a row whenever a cutoff produces only one predicted class, and the
  # cell positions depend on the alphabetical order of the label names.
  class_levels <- c(negitive_class_name, positive_class_name)
  actual <- factor(as.character(actual_value), levels = class_levels)
  cutoff <- seq_len(100) / 100

  # One column per cutoff, rows = pred00, pred01, pred10, pred11.
  rates <- vapply(cutoff, function(threshold) {
    predicted <- factor(
      ifelse(pred_probability >= threshold, positive_class_name, negitive_class_name),
      levels = class_levels
    )
    # Rows = predicted class, columns = actual class; always 2 x 2 here.
    tbl <- table(predicted, actual)
    n_negative <- sum(tbl[, 1])
    n_positive <- sum(tbl[, 2])
    c(
      pred00 = tbl[1, 1] / n_negative,
      pred01 = tbl[2, 1] / n_negative,
      pred10 = tbl[1, 2] / n_positive,
      pred11 = tbl[2, 2] / n_positive
    )
  }, numeric(4))

  pred00 <- rates["pred00", ]
  pred01 <- rates["pred01", ]
  pred10 <- rates["pred10", ]
  pred11 <- rates["pred11", ]
  youden_index <- pred11 + pred00 - 1
  msclaf_cost <- pred10 * pred_10 + pred01 * pred_01

  df.cost.table <- data.frame(cutoff, pred10, pred01, pred11, pred00, youden_index, msclaf_cost)
  cat(paste0('The ideal cutoff for:\n Yodens Index approach : ', which.max(df.cost.table$youden_index)/100))
  cat(paste0('\n Cost based approach : ', which.min(df.cost.table$msclaf_cost)/100))
  # The two lines are distinguished by colour, so the legend title must be
  # set on the colour aesthetic (the previous `fill` label had no effect).
  ggplot(df.cost.table, aes(x = cutoff)) +
    geom_line(aes(y = youden_index, color = 'yoden index')) +
    geom_line(aes(y = msclaf_cost, color = 'misclassification cost')) +
    labs(x = 'Cutoff p value', y = 'Index', title = 'Cutoff p value', color = 'Plot') +
    theme_minimal() + theme(legend.position = "bottom")
}
# Cutoff search for the g2 scores: a false positive is weighted 3x a
# false negative in the cost-based criterion.
find_cutoff(
  actual_value = Btrain$y,
  pred_probability = Btrain$PredProb,
  positive_class_name = 'yes',
  negitive_class_name = 'no',
  pred_01 = 3,
  pred_10 = 1
)
## The ideal cutoff for:
## Yodens Index approach : 0.16
## Cost based approach : 0.2
## Warning: Removed 2 row(s) containing missing values (geom_path).
## Warning: Removed 2 row(s) containing missing values (geom_path).
# Re-classify the training rows with a lowered probability cutoff.
# NOTE(review): the captured cutoff search reports 0.16 (Youden) and 0.2
# (cost-based) for these scores, yet 0.14 is applied here - confirm which
# threshold is intended.
Btrain$Predy <- ifelse(Btrain$PredProb >= 0.14, "yes", "no")
# caret::confusionMatrix() takes the predictions as `data` and the observed
# classes as `reference`; the earlier call had them swapped, which made the
# reported sensitivity/specificity/prevalence describe the predictions
# rather than the truth. "yes" is reported as the positive class.
# NOTE: console output captured before this fix will not match; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1766 265
## yes 137 149
##
## Accuracy : 0.8265
## 95% CI : (0.8105, 0.8417)
## No Information Rate : 0.8213
## P-Value [Acc > NIR] : 0.2676
##
## Kappa : 0.3275
##
## Mcnemar's Test P-Value : 2.386e-10
##
## Sensitivity : 0.9280
## Specificity : 0.3599
## Pos Pred Value : 0.8695
## Neg Pred Value : 0.5210
## Prevalence : 0.8213
## Detection Rate : 0.7622
## Detection Prevalence : 0.8766
## Balanced Accuracy : 0.6440
##
## 'Positive' Class : no
##
#### AIC MODEL
# Fitted probabilities of y == "yes" from model g3 on the training data,
# classified with the default 0.5 cutoff.
Btrain$PredProb3 <- predict(g3, newdata = Btrain, type = "response")
Btrain$Predy3 <- ifelse(Btrain$PredProb3 >= 0.5, "yes", "no")
# caret::confusionMatrix() takes the predictions as `data` and the observed
# classes as `reference`; the earlier call had them swapped, so the reported
# statistics were computed against the predictions instead of the truth.
# Predicted labels share the levels of Btrain$y; "yes" is the positive class.
# NOTE: console output captured before this fix will not match; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy3, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1997 34
## yes 216 70
##
## Accuracy : 0.8921
## 95% CI : (0.8788, 0.9044)
## No Information Rate : 0.9551
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3138
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9024
## Specificity : 0.6731
## Pos Pred Value : 0.9833
## Neg Pred Value : 0.2448
## Prevalence : 0.9551
## Detection Rate : 0.8619
## Detection Prevalence : 0.8766
## Balanced Accuracy : 0.7877
##
## 'Positive' Class : no
##
# Cutoff search for the g3 scores, same 3:1 false-positive/false-negative
# cost weighting as used for the other model.
find_cutoff(
  actual_value = Btrain$y,
  pred_probability = Btrain$PredProb3,
  positive_class_name = 'yes',
  negitive_class_name = 'no',
  pred_01 = 3,
  pred_10 = 1
)
## The ideal cutoff for:
## Yodens Index approach : 0.14
## Cost based approach : 0.25
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).
# Re-classify the training rows for model g3 with a lowered cutoff.
# NOTE(review): the captured cutoff search reports 0.14 (Youden) and 0.25
# (cost-based) for these scores, yet 0.13 is applied here - confirm which
# threshold is intended.
Btrain$Predy3 <- ifelse(Btrain$PredProb3 >= 0.13, "yes", "no")
# caret::confusionMatrix() takes the predictions as `data` and the observed
# classes as `reference`; the earlier call had them swapped, which made the
# reported sensitivity/specificity/prevalence describe the predictions
# rather than the truth. "yes" is reported as the positive class.
# NOTE: console output captured before this fix will not match; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy3, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 1667 364
## yes 97 189
##
## Accuracy : 0.801
## 95% CI : (0.7842, 0.8171)
## No Information Rate : 0.7613
## P-Value [Acc > NIR] : 2.719e-06
##
## Kappa : 0.3438
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9450
## Specificity : 0.3418
## Pos Pred Value : 0.8208
## Neg Pred Value : 0.6608
## Prevalence : 0.7613
## Detection Rate : 0.7195
## Detection Prevalence : 0.8766
## Balanced Accuracy : 0.6434
##
## 'Positive' Class : no
##
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.