# Load the data set; fields in the file are separated by semicolons.
bank <- read.csv(file = "bank-additional.csv", sep = ";")
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.1
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(MASS); library(car); library(olsrr)
## Warning: package 'MASS' was built under R version 4.1.2
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Warning: package 'car' was built under R version 4.1.1
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.1.1
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## Warning: package 'olsrr' was built under R version 4.1.1
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:MASS':
## 
##     cement
## The following object is masked from 'package:datasets':
## 
##     rivers
library(DescTools);library(ResourceSelection)
## Warning: package 'DescTools' was built under R version 4.1.1
## 
## Attaching package: 'DescTools'
## The following object is masked from 'package:car':
## 
##     Recode
## Warning: package 'ResourceSelection' was built under R version 4.1.2
## ResourceSelection 0.3-5   2019-07-22
library(caret);library(lattice);
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
library(gam);library(car)
## Warning: package 'gam' was built under R version 4.1.2
## Loading required package: splines
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following object is masked from 'package:DescTools':
## 
##     %:%
## Loaded gam 1.20
library(ROCR);library(gridExtra)
## Warning: package 'ROCR' was built under R version 4.1.2
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.1.3
## Registered S3 method overwritten by 'gdata':
##   method         from     
##   reorder.factor DescTools

Overview (structure and first six rows of the data):

## 'data.frame':    4119 obs. of  21 variables:
##  $ age           : int  30 39 25 38 47 32 32 41 31 35 ...
##  $ job           : chr  "blue-collar" "services" "services" "services" ...
##  $ marital       : chr  "married" "single" "married" "married" ...
##  $ education     : chr  "basic.9y" "high.school" "high.school" "basic.9y" ...
##  $ default       : chr  "no" "no" "no" "no" ...
##  $ housing       : chr  "yes" "no" "yes" "unknown" ...
##  $ loan          : chr  "no" "no" "no" "unknown" ...
##  $ contact       : chr  "cellular" "telephone" "telephone" "telephone" ...
##  $ month         : chr  "may" "may" "jun" "jun" ...
##  $ day_of_week   : chr  "fri" "fri" "wed" "fri" ...
##  $ duration      : int  487 346 227 17 58 128 290 44 68 170 ...
##  $ campaign      : int  2 4 1 3 1 3 4 2 1 1 ...
##  $ pdays         : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ previous      : int  0 0 0 0 0 2 0 0 1 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 1.4 -0.1 -1.1 -1.1 -0.1 -0.1 1.1 ...
##  $ cons.price.idx: num  92.9 94 94.5 94.5 93.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -41.8 -42 -37.5 -37.5 -42 -42 -36.4 ...
##  $ euribor3m     : num  1.31 4.86 4.96 4.96 4.19 ...
##  $ nr.employed   : num  5099 5191 5228 5228 5196 ...
##  $ y             : chr  "no" "no" "no" "no" ...
##   age         job marital         education default housing    loan   contact
## 1  30 blue-collar married          basic.9y      no     yes      no  cellular
## 2  39    services  single       high.school      no      no      no telephone
## 3  25    services married       high.school      no     yes      no telephone
## 4  38    services married          basic.9y      no unknown unknown telephone
## 5  47      admin. married university.degree      no     yes      no  cellular
## 6  32    services  single university.degree      no      no      no  cellular
##   month day_of_week duration campaign pdays previous    poutcome emp.var.rate
## 1   may         fri      487        2   999        0 nonexistent         -1.8
## 2   may         fri      346        4   999        0 nonexistent          1.1
## 3   jun         wed      227        1   999        0 nonexistent          1.4
## 4   jun         fri       17        3   999        0 nonexistent          1.4
## 5   nov         mon       58        1   999        0 nonexistent         -0.1
## 6   sep         thu      128        3   999        2     failure         -1.1
##   cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 1         92.893         -46.2     1.313      5099.1 no
## 2         93.994         -36.4     4.855      5191.0 no
## 3         94.465         -41.8     4.962      5228.1 no
## 4         94.465         -41.8     4.959      5228.1 no
## 5         93.200         -42.0     4.191      5195.8 no
## 6         94.199         -37.5     0.884      4963.6 no
# Column-wise summary; character columns only report length, class and mode.
summary(object = bank)
##       age            job              marital           education        
##  Min.   :18.00   Length:4119        Length:4119        Length:4119       
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.11                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :88.00                                                           
##    default            housing              loan             contact         
##  Length:4119        Length:4119        Length:4119        Length:4119       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:4119        Length:4119        Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 103.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 181.0   Median : 2.000  
##                                        Mean   : 256.8   Mean   : 2.537  
##                                        3rd Qu.: 317.0   3rd Qu.: 3.000  
##                                        Max.   :3643.0   Max.   :35.000  
##      pdays          previous        poutcome          emp.var.rate     
##  Min.   :  0.0   Min.   :0.0000   Length:4119        Min.   :-3.40000  
##  1st Qu.:999.0   1st Qu.:0.0000   Class :character   1st Qu.:-1.80000  
##  Median :999.0   Median :0.0000   Mode  :character   Median : 1.10000  
##  Mean   :960.4   Mean   :0.1903                      Mean   : 0.08497  
##  3rd Qu.:999.0   3rd Qu.:0.0000                      3rd Qu.: 1.40000  
##  Max.   :999.0   Max.   :6.0000                      Max.   : 1.40000  
##  cons.price.idx  cons.conf.idx     euribor3m      nr.employed  
##  Min.   :92.20   Min.   :-50.8   Min.   :0.635   Min.   :4964  
##  1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.334   1st Qu.:5099  
##  Median :93.75   Median :-41.8   Median :4.857   Median :5191  
##  Mean   :93.58   Mean   :-40.5   Mean   :3.621   Mean   :5166  
##  3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961   3rd Qu.:5228  
##  Max.   :94.77   Max.   :-26.9   Max.   :5.045   Max.   :5228  
##       y            
##  Length:4119       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
#bank$job<-as.numeric(as.factor(bank$job))
#bank$marital<-as.numeric(as.factor(bank$marital))
#bank$education<-as.numeric(as.factor(bank$education))
#bank$default<-as.numeric(as.factor(bank$default))
#bank$housing<-as.numeric(as.factor(bank$housing))
#bank$loan<-as.numeric(as.factor(bank$loan))
#bank$contact<-as.numeric(as.factor(bank$contact))
#bank$month<-as.numeric(as.factor(bank$month))
#bank$day_of_week<-as.numeric(as.factor(bank$day_of_week))
#bank$campaign<-as.numeric(as.factor(bank$campaign))
#bank$previous<-as.numeric(as.factor(bank$previous))
#bank$poutcome<-as.numeric(as.factor(bank$poutcome))

Check missing values

# Count NA entries in every column.
colSums(x = is.na(bank))
##            age            job        marital      education        default 
##              0              0              0              0              0 
##        housing           loan        contact          month    day_of_week 
##              0              0              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0

Check duplicate values

# Number of rows that exactly repeat an earlier row.
nrow(bank[duplicated(bank), ])
## [1] 0

Target variable

# Frequency of each value of the target variable y.
table(bank[["y"]])
## 
##   no  yes 
## 3668  451

The number of “no” responses (3668) is far greater than the number of “yes” responses (451), so the target classes are imbalanced.

Unknown values

In the dataset description we also saw that the data contains ‘unknown’ values. Let’s take a look at them.

# Overall number of cells that hold the literal string "unknown".
table(bank == "unknown")
## 
## FALSE  TRUE 
## 85269  1230
# The same count broken down by column.
colSums(bank == "unknown")
##            age            job        marital      education        default 
##              0             39             11            167            803 
##        housing           loan        contact          month    day_of_week 
##            105            105              0              0              0 
##       duration       campaign          pdays       previous       poutcome 
##              0              0              0              0              0 
##   emp.var.rate cons.price.idx  cons.conf.idx      euribor3m    nr.employed 
##              0              0              0              0              0 
##              y 
##              0

So job, marital, education, default, housing and loan have unknown values. We can decide what to do with the unknowns after the exploratory data analysis.

Visualization and Analysis

1.Age

# Summary statistics for age.
summary(bank[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   18.00   32.00   38.00   40.11   47.00   88.00
# Distinct ages, in order of first appearance.
unique(bank[["age"]])
##  [1] 30 39 25 38 47 32 41 31 35 36 29 27 44 46 45 50 55 40 28 34 33 51 48 20 76
## [26] 56 24 58 60 37 52 42 49 54 59 57 43 53 75 82 71 21 22 23 26 81 61 67 73 18
## [51] 64 74 77 86 85 63 88 78 72 68 80 66 19 62 65 69 70
# Correlation of age with the outcome; as.factor() sorts the levels, so y is
# coded no = 1, yes = 2.
cor(x = bank$age, y = as.numeric(as.factor(bank$y)))
## [1] 0.06037408

The distribution of age shows that the majority of the bank’s customers are aged between 25 and 50. More specifically, 50% are between 32 and 47 (the first and third quartiles), and the median age is 38. So the target customers of the bank might be middle-aged adults.

# Histogram of age with the mean marked by a red vertical line.
# bins = 30 is the value stat_bin() falls back to when none is given; stating
# it explicitly keeps the same histogram and silences the "Pick better value"
# message.
# The line colour is a fixed setting, so it is passed outside aes(); inside
# aes() the string "red" is treated as data, mapped through the default colour
# scale, and adds a spurious legend.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = age), bins = 30, color = "black", fill = "grey") +
  ylab("Count") +
  xlab("Age") +
  geom_vline(xintercept = mean(bank$age), color = "red") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  theme_minimal()
# Box plot of age as a single group.
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = age))
# Age counts with bars filled by the outcome y.
p3 <- ggplot(bank, aes(x = age, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)

2.Job

# job is a character column, so summary() only reports length/class/mode.
summary(bank$job)
##    Length     Class      Mode 
##      4119 character character
unique(bank$job)
##  [1] "blue-collar"   "services"      "admin."        "entrepreneur" 
##  [5] "self-employed" "technician"    "management"    "student"      
##  [9] "retired"       "housemaid"     "unemployed"    "unknown"
#table(bank$job)
# NOTE: as.factor() numbers the levels in sorted order, so for a nominal
# variable with more than two levels this Pearson coefficient depends on an
# arbitrary coding; use the cross-tabulation at the end of this section to
# judge the association with y.
cor(as.numeric(as.factor(bank$job)), as.numeric(as.factor(bank$y)))
## [1] 0.02672463
# Left: overall counts per job. Right: the same counts filled by outcome y.
# x is mapped to job at plot level so that the x axis is titled "job".
p1 <- ggplot(bank, aes(x = job)) + geom_bar()
p2 <- ggplot(bank, aes(x = job, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## job variable classification w.r.t target variable

CrossTable(bank$job, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##               | bank$y 
##      bank$job |        no |       yes | Row Total | 
## --------------|-----------|-----------|-----------|
##        admin. |       879 |       133 |      1012 | 
##               |     0.547 |     4.445 |           | 
##               |     0.869 |     0.131 |     0.246 | 
## --------------|-----------|-----------|-----------|
##   blue-collar |       823 |        61 |       884 | 
##               |     1.627 |    13.235 |           | 
##               |     0.931 |     0.069 |     0.215 | 
## --------------|-----------|-----------|-----------|
##  entrepreneur |       140 |         8 |       148 | 
##               |     0.511 |     4.154 |           | 
##               |     0.946 |     0.054 |     0.036 | 
## --------------|-----------|-----------|-----------|
##     housemaid |        99 |        11 |       110 | 
##               |     0.011 |     0.091 |           | 
##               |     0.900 |     0.100 |     0.027 | 
## --------------|-----------|-----------|-----------|
##    management |       294 |        30 |       324 | 
##               |     0.104 |     0.845 |           | 
##               |     0.907 |     0.093 |     0.079 | 
## --------------|-----------|-----------|-----------|
##       retired |       128 |        38 |       166 | 
##               |     2.659 |    21.622 |           | 
##               |     0.771 |     0.229 |     0.040 | 
## --------------|-----------|-----------|-----------|
## self-employed |       146 |        13 |       159 | 
##               |     0.137 |     1.117 |           | 
##               |     0.918 |     0.082 |     0.039 | 
## --------------|-----------|-----------|-----------|
##      services |       358 |        35 |       393 | 
##               |     0.184 |     1.499 |           | 
##               |     0.911 |     0.089 |     0.095 | 
## --------------|-----------|-----------|-----------|
##       student |        63 |        19 |        82 | 
##               |     1.375 |    11.186 |           | 
##               |     0.768 |     0.232 |     0.020 | 
## --------------|-----------|-----------|-----------|
##    technician |       611 |        80 |       691 | 
##               |     0.031 |     0.249 |           | 
##               |     0.884 |     0.116 |     0.168 | 
## --------------|-----------|-----------|-----------|
##    unemployed |        92 |        19 |       111 | 
##               |     0.474 |     3.857 |           | 
##               |     0.829 |     0.171 |     0.027 | 
## --------------|-----------|-----------|-----------|
##       unknown |        35 |         4 |        39 | 
##               |     0.002 |     0.017 |           | 
##               |     0.897 |     0.103 |     0.009 | 
## --------------|-----------|-----------|-----------|
##  Column Total |      3668 |       451 |      4119 | 
## --------------|-----------|-----------|-----------|
## 
## 

3.Marital

# marital is a character column, so summary() only reports length/class/mode.
summary(bank$marital)
##    Length     Class      Mode 
##      4119 character character
unique(bank$marital)
## [1] "married"  "single"   "divorced" "unknown"
#table(bank$marital)
# NOTE: as.factor() numbers the levels in sorted order, so for a nominal
# variable with more than two levels this Pearson coefficient depends on an
# arbitrary coding; use the cross-tabulation at the end of this section to
# judge the association with y.
cor(as.numeric(as.factor(bank$marital)), as.numeric(as.factor(bank$y)))
## [1] 0.04383328
# Left: overall counts per marital status. Right: counts filled by outcome y.
# x is mapped to marital at plot level so that the x axis is titled "marital".
p1 <- ggplot(bank, aes(x = marital)) + geom_bar()
p2 <- ggplot(bank, aes(x = marital, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## marital variable classification w.r.t target variable

CrossTable(bank$marital, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
## bank$marital |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##     divorced |       403 |        43 |       446 | 
##              |     0.086 |     0.697 |           | 
##              |     0.904 |     0.096 |     0.108 | 
## -------------|-----------|-----------|-----------|
##      married |      2257 |       252 |      2509 | 
##              |     0.231 |     1.879 |           | 
##              |     0.900 |     0.100 |     0.609 | 
## -------------|-----------|-----------|-----------|
##       single |       998 |       155 |      1153 | 
##              |     0.805 |     6.550 |           | 
##              |     0.866 |     0.134 |     0.280 | 
## -------------|-----------|-----------|-----------|
##      unknown |        10 |         1 |        11 | 
##              |     0.004 |     0.035 |           | 
##              |     0.909 |     0.091 |     0.003 | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
## -------------|-----------|-----------|-----------|
## 
## 

4.Education

# education is a character column, so summary() only reports
# length/class/mode.
summary(bank$education)
##    Length     Class      Mode 
##      4119 character character
unique(bank$education)
## [1] "basic.9y"            "high.school"         "university.degree"  
## [4] "professional.course" "basic.6y"            "basic.4y"           
## [7] "unknown"             "illiterate"
# NOTE: as.factor() numbers the levels in sorted order (e.g. "illiterate"
# falls between "high.school" and "professional.course"), so this Pearson
# coefficient does not follow any schooling order; use the cross-tabulation at
# the end of this section to judge the association with y.
cor(as.numeric(as.factor(bank$education)), as.numeric(as.factor(bank$y)))
## [1] 0.06731618
# Left: overall counts per education level. Right: counts filled by outcome y.
# x is mapped to education at plot level so that the x axis is titled
# "education".
p1 <- ggplot(bank, aes(x = education)) + geom_bar()
p2 <- ggplot(bank, aes(x = education, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## education classification w.r.t target variable

CrossTable(bank$education, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##                     | bank$y 
##      bank$education |        no |       yes | Row Total | 
## --------------------|-----------|-----------|-----------|
##            basic.4y |       391 |        38 |       429 | 
##                     |     0.211 |     1.714 |           | 
##                     |     0.911 |     0.089 |     0.104 | 
## --------------------|-----------|-----------|-----------|
##            basic.6y |       211 |        17 |       228 | 
##                     |     0.312 |     2.541 |           | 
##                     |     0.925 |     0.075 |     0.055 | 
## --------------------|-----------|-----------|-----------|
##            basic.9y |       531 |        43 |       574 | 
##                     |     0.771 |     6.269 |           | 
##                     |     0.925 |     0.075 |     0.139 | 
## --------------------|-----------|-----------|-----------|
##         high.school |       824 |        97 |       921 | 
##                     |     0.018 |     0.146 |           | 
##                     |     0.895 |     0.105 |     0.224 | 
## --------------------|-----------|-----------|-----------|
##          illiterate |         1 |         0 |         1 | 
##                     |     0.013 |     0.109 |           | 
##                     |     1.000 |     0.000 |     0.000 | 
## --------------------|-----------|-----------|-----------|
## professional.course |       470 |        65 |       535 | 
##                     |     0.087 |     0.704 |           | 
##                     |     0.879 |     0.121 |     0.130 | 
## --------------------|-----------|-----------|-----------|
##   university.degree |      1099 |       165 |      1264 | 
##                     |     0.629 |     5.113 |           | 
##                     |     0.869 |     0.131 |     0.307 | 
## --------------------|-----------|-----------|-----------|
##             unknown |       141 |        26 |       167 | 
##                     |     0.400 |     3.255 |           | 
##                     |     0.844 |     0.156 |     0.041 | 
## --------------------|-----------|-----------|-----------|
##        Column Total |      3668 |       451 |      4119 | 
## --------------------|-----------|-----------|-----------|
## 
## 

5.Default

# default is a character column, so summary() only reports length/class/mode.
summary(bank$default)
##    Length     Class      Mode 
##      4119 character character
unique(bank$default)
## [1] "no"      "unknown" "yes"
#table(bank$default)
# NOTE: as.factor() numbers the levels in sorted order (no = 1, unknown = 2,
# yes = 3), so this Pearson coefficient treats "unknown" as lying between "no"
# and "yes"; use the cross-tabulation at the end of this section to judge the
# association with y.
cor(as.numeric(as.factor(bank$default)), as.numeric(as.factor(bank$y)))
## [1] -0.07662722
# Left: overall counts per default status. Right: counts filled by outcome y.
# x is mapped to default at plot level so that the x axis is titled "default".
p1 <- ggplot(bank, aes(x = default)) + geom_bar()
p2 <- ggplot(bank, aes(x = default, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## default classification w.r.t target variable

CrossTable(bank$default, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
## bank$default |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |      2913 |       402 |      3315 | 
##              |     0.516 |     4.197 |           | 
##              |     0.879 |     0.121 |     0.805 | 
## -------------|-----------|-----------|-----------|
##      unknown |       754 |        49 |       803 | 
##              |     2.119 |    17.231 |           | 
##              |     0.939 |     0.061 |     0.195 | 
## -------------|-----------|-----------|-----------|
##          yes |         1 |         0 |         1 | 
##              |     0.013 |     0.109 |           | 
##              |     1.000 |     0.000 |     0.000 | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
## -------------|-----------|-----------|-----------|
## 
## 

6.Housing

# housing is a character column, so summary() only reports length/class/mode.
summary(bank$housing)
##    Length     Class      Mode 
##      4119 character character
unique(bank$housing)
## [1] "yes"     "no"      "unknown"
#table(bank$housing)
# NOTE: as.factor() numbers the levels in sorted order (no = 1, unknown = 2,
# yes = 3), so this Pearson coefficient treats "unknown" as lying between "no"
# and "yes"; use the cross-tabulation at the end of this section to judge the
# association with y.
cor(as.numeric(as.factor(bank$housing)), as.numeric(as.factor(bank$y)))
## [1] 0.0009566489
# Left: overall counts per housing status. Right: counts filled by outcome y.
# x is mapped to housing at plot level so that the x axis is titled "housing".
p1 <- ggplot(bank, aes(x = housing)) + geom_bar()
p2 <- ggplot(bank, aes(x = housing, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## housing classification w.r.t target variable

CrossTable(bank$housing, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
## bank$housing |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |      1637 |       202 |      1839 | 
##              |     0.000 |     0.002 |           | 
##              |     0.890 |     0.110 |     0.446 | 
## -------------|-----------|-----------|-----------|
##      unknown |        96 |         9 |       105 | 
##              |     0.067 |     0.542 |           | 
##              |     0.914 |     0.086 |     0.025 | 
## -------------|-----------|-----------|-----------|
##          yes |      1935 |       240 |      2175 | 
##              |     0.002 |     0.014 |           | 
##              |     0.890 |     0.110 |     0.528 | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
## -------------|-----------|-----------|-----------|
## 
## 

7. Loan

# loan is a character column, so summary() only reports length/class/mode.
summary(bank$loan)
##    Length     Class      Mode 
##      4119 character character
unique(bank$loan)
## [1] "no"      "unknown" "yes"
# NOTE: as.factor() numbers the levels in sorted order (no = 1, unknown = 2,
# yes = 3), so this Pearson coefficient treats "unknown" as lying between "no"
# and "yes"; use the cross-tabulation at the end of this section to judge the
# association with y.
cor(as.numeric(as.factor(bank$loan)), as.numeric(as.factor(bank$y)))
## [1] -0.01270932
# Left: overall counts per loan status. Right: counts filled by outcome y.
# x is mapped to loan at plot level so that the x axis is titled "loan".
p1 <- ggplot(bank, aes(x = loan)) + geom_bar()
p2 <- ggplot(bank, aes(x = loan, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## loan classification w.r.t target variable

CrossTable(bank$loan, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
##    bank$loan |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |      2975 |       374 |      3349 | 
##              |     0.018 |     0.146 |           | 
##              |     0.888 |     0.112 |     0.813 | 
## -------------|-----------|-----------|-----------|
##      unknown |        96 |         9 |       105 | 
##              |     0.067 |     0.542 |           | 
##              |     0.914 |     0.086 |     0.025 | 
## -------------|-----------|-----------|-----------|
##          yes |       597 |        68 |       665 | 
##              |     0.039 |     0.318 |           | 
##              |     0.898 |     0.102 |     0.161 | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
## -------------|-----------|-----------|-----------|
## 
## 

8.Contact

# contact is a character column, so summary() only reports length/class/mode.
summary(bank$contact)
##    Length     Class      Mode 
##      4119 character character
unique(bank$contact)
## [1] "cellular"  "telephone"
# Both variables have two levels, coded in sorted order (cellular = 1,
# telephone = 2; no = 1, yes = 2), so the sign of the coefficient reflects
# that coding: negative means "telephone" goes with fewer "yes" outcomes.
cor(as.numeric(as.factor(bank$contact)), as.numeric(as.factor(bank$y)))
## [1] -0.1374007
# Left: overall counts per contact type. Right: counts filled by outcome y.
# x is mapped to contact at plot level so that the x axis is titled "contact".
p1 <- ggplot(bank, aes(x = contact)) + geom_bar()
p2 <- ggplot(bank, aes(x = contact, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## contact classification w.r.t target variable

CrossTable(bank$contact, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
## bank$contact |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##     cellular |      2277 |       375 |      2652 | 
##              |     3.032 |    24.663 |           | 
##              |     0.859 |     0.141 |     0.644 | 
## -------------|-----------|-----------|-----------|
##    telephone |      1391 |        76 |      1467 | 
##              |     5.482 |    44.585 |           | 
##              |     0.948 |     0.052 |     0.356 | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
## -------------|-----------|-----------|-----------|
## 
## 

9.Month

# month is a character column, so summary() only reports length/class/mode.
summary(bank$month)
##    Length     Class      Mode 
##      4119 character character
unique(bank$month)
##  [1] "may" "jun" "nov" "sep" "jul" "aug" "mar" "oct" "apr" "dec"
# NOTE: as.factor() numbers the levels in sorted order (apr, aug, dec, ...),
# not calendar order, so this Pearson coefficient depends on an arbitrary
# coding; use the cross-tabulation at the end of this section to judge the
# association with y.
cor(as.numeric(as.factor(bank$month)), as.numeric(as.factor(bank$y)))
## [1] 0.005048514
# Left: overall counts per month. Right: counts filled by outcome y.
# x is mapped to month at plot level so that the x axis is titled "month".
p1 <- ggplot(bank, aes(x = month)) + geom_bar()
p2 <- ggplot(bank, aes(x = month, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## month classification w.r.t target variable

CrossTable(bank$month, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##              | bank$y 
##   bank$month |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##          apr |       179 |        36 |       215 | 
##              |     0.811 |     6.594 |           | 
##              |     0.833 |     0.167 |     0.052 | 
## -------------|-----------|-----------|-----------|
##          aug |       572 |        64 |       636 | 
##              |     0.056 |     0.456 |           | 
##              |     0.899 |     0.101 |     0.154 | 
## -------------|-----------|-----------|-----------|
##          dec |        10 |        12 |        22 | 
##              |     4.696 |    38.189 |           | 
##              |     0.455 |     0.545 |     0.005 | 
## -------------|-----------|-----------|-----------|
##          jul |       652 |        59 |       711 | 
##              |     0.561 |     4.564 |           | 
##              |     0.917 |     0.083 |     0.173 | 
## -------------|-----------|-----------|-----------|
##          jun |       462 |        68 |       530 | 
##              |     0.211 |     1.713 |           | 
##              |     0.872 |     0.128 |     0.129 | 
## -------------|-----------|-----------|-----------|
##          mar |        20 |        28 |        48 | 
##              |    12.102 |    98.429 |           | 
##              |     0.417 |     0.583 |     0.012 | 
## -------------|-----------|-----------|-----------|
##          may |      1288 |        90 |      1378 | 
##              |     3.020 |    24.566 |           | 
##              |     0.935 |     0.065 |     0.335 | 
## -------------|-----------|-----------|-----------|
##          nov |       403 |        43 |       446 | 
##              |     0.086 |     0.697 |           | 
##              |     0.904 |     0.096 |     0.108 | 
## -------------|-----------|-----------|-----------|
##          oct |        44 |        25 |        69 | 
##              |     4.953 |    40.282 |           | 
##              |     0.638 |     0.362 |     0.017 | 
## -------------|-----------|-----------|-----------|
##          sep |        38 |        26 |        64 | 
##              |     6.329 |    51.475 |           | 
##              |     0.594 |     0.406 |     0.016 | 
## -------------|-----------|-----------|-----------|
## Column Total |      3668 |       451 |      4119 | 
## -------------|-----------|-----------|-----------|
## 
## 

10.Day_of_Week

# day_of_week is a character column, so summary() only reports
# length/class/mode.
summary(bank$day_of_week)
##    Length     Class      Mode 
##      4119 character character
unique(bank$day_of_week)
## [1] "fri" "wed" "mon" "thu" "tue"
# NOTE: as.factor() numbers the levels in sorted order (fri, mon, thu, tue,
# wed), not weekday order, so this Pearson coefficient depends on an arbitrary
# coding; use the cross-tabulation at the end of this section to judge the
# association with y.
cor(as.numeric(as.factor(bank$day_of_week)), as.numeric(as.factor(bank$y)))
## [1] -0.006369504
# Left: overall counts per weekday. Right: counts filled by outcome y.
# x is mapped to day_of_week at plot level so that the x axis is titled
# "day_of_week".
p1 <- ggplot(bank, aes(x = day_of_week)) + geom_bar()
p2 <- ggplot(bank, aes(x = day_of_week, fill = y)) + geom_bar()
grid.arrange(p1, p2, ncol = 2)

## days of week classification w.r.t target variable

CrossTable(bank$day_of_week, bank$y, prop.t = FALSE, prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##                  | bank$y 
## bank$day_of_week |        no |       yes | Row Total | 
## -----------------|-----------|-----------|-----------|
##              fri |       685 |        83 |       768 | 
##                  |     0.002 |     0.014 |           | 
##                  |     0.892 |     0.108 |     0.186 | 
## -----------------|-----------|-----------|-----------|
##              mon |       757 |        98 |       855 | 
##                  |     0.025 |     0.205 |           | 
##                  |     0.885 |     0.115 |     0.208 | 
## -----------------|-----------|-----------|-----------|
##              thu |       764 |        96 |       860 | 
##                  |     0.004 |     0.036 |           | 
##                  |     0.888 |     0.112 |     0.209 | 
## -----------------|-----------|-----------|-----------|
##              tue |       750 |        91 |       841 | 
##                  |     0.002 |     0.013 |           | 
##                  |     0.892 |     0.108 |     0.204 | 
## -----------------|-----------|-----------|-----------|
##              wed |       712 |        83 |       795 | 
##                  |     0.023 |     0.188 |           | 
##                  |     0.896 |     0.104 |     0.193 | 
## -----------------|-----------|-----------|-----------|
##     Column Total |      3668 |       451 |      4119 | 
## -----------------|-----------|-----------|-----------|
## 
## 

11.Duration

# Distribution summary of duration, followed by its distinct observed values.
summary(bank$duration)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   103.0   181.0   256.8   317.0  3643.0
unique(bank$duration)
##   [1]  487  346  227   17   58  128  290   44   68  170  301  148   97  211  553
##  [16]  698  191   59   38  849  326  222  626  119  388  479  446  127  109  113
##  [31]  393  151  256   42  525   57  499   84  137   31  430  126  340  412  132
##  [46]   79  341  157  252  263  215   89  143   40   10  481  233  204  403  180
##  [61]   16  447   81  361 1091  395  432  596   77  768   96  357  459   11  264
##  [76]   93  374  158   95  835  505  300  390  274  135  257  268  477   91   76
##  [91]  103  436  483  250  259  389    7  123   92  297  406  104  854  147  203
## [106]  149  144  394  523   73  197  108   80  114  122 1161  181  239  360  314
## [121]  984  663  141  706  797  311   63  111   49  171  242  279  246  309  168
## [136]  153  152   90  117  640  199 1114   74  190  738  224  344  383   35  772
## [151]  124  345  951  188  809  192  154  100  317  293   30  442  187   64  629
## [166]  423  888  207  265  273   85  261  136  711   88   72  307   39  156  202
## [181]  353  159  347  174  280  686   94  225  474  377  185  121  160  313  219
## [196]  267  228  355  102  116   83  473  605  585  255 1868  846  404   51   87
## [211]  167  440  673   48  236  288  193  318  209  173  503  101  370 1207  262
## [226]  609  806  335  266  434   82   15  155  339  206  178  461   50   56   55
## [241]  142    9  247  130  336  424  617  238  632   86  165  212   54  184    6
## [256]   70   98  106  456  118  241  439  322  417  498  405   99  712  112  223
## [271]  133  258  958  898  282  175  235  372   69  183  270  134  449  115  205
## [286]  145  548  379  105  544  401  549  291  655  179  391  750  454   23  363
## [301]  775  164  988  471  385  125  886   34  334  955  545  659  230  699 1276
## [316]  251   25  696  701  342  161  275  172  139  232  131   36  600  177  217
## [331]  216  329  604  634  107  245  690  286  201  198  249  226 1058  299  441
## [346]  285  195  292  298 1013  248 1319  146  294  575  237  861  618  271  200
## [361]  166  367  218  584  509   27   78  162  651  415 1149  110  240  366  284
## [376]  431  608  244  455  807  420  182  638  641   21 1348  324  331  550  489
## [391]  304  189  728  278  387   29   71  767 1476  176   52  150   32   12  501
## [406]  381  482   14  569  697  581  243  229  408   53  305  316  577  427  214
## [421]   19   65  281  468   67  438  582  721  295  231  221 1170  368 1360  433
## [436]  352   37  650  289  213   22   43   26  532   75  557  541   62    5  941
## [451]  422  319  653  397 1447  999  321 1143  667 1132   60  396  194 1068  337
## [466]  400  140  409  208   13  458  713  820  310  587  320  566  748  599  411
## [481] 1185  398  169  272   66  679    8   18  497 1065  276  716   20  760  253
## [496]  551  675   46  484  333  369  464  362  997  287  649  470  762  591  758
## [511] 1551  480  869   61  129  979  630  234  354  502  451  296  407  120  754
## [526]  589   41  514  919  530  595  526  494   24 1353  332 1234  687  428  488
## [541]  486  413  892  452  614  749 1327   28   47  677  643 2653  302  570  938
## [556]  260  901  138  590  546  371  312  163  328  722  323  611  539  359  671
## [571]  781 1005  303  343  418   45  419 1148  349 3253  606  894  813  891  210
## [586] 1067  543  382  492 1183  903    4  375 1628  840 1167  386  868  327  485
## [601]  506  351  315  529 1720  533  429  766  616 1130  747  496 2301  460  220
## [616]  776  568  448  186  534 1334 1138 1019  364 1090  857  269  637  536  475
## [631]  453  330  338  764  873 1176  384   33  602  476    0  689  718  796  662
## [646]  799  715  633  348 1014  700 1045 1152  725  358  196  493  254  742  504
## [661] 1092  399  952  426  457 3643 1105  838  829  565  644  771  513  646  356
## [676]  693  592  628  556  769 1111  843  668  848  855  517  992  619  867 1441
## [691]  665 1171  542  607  800 1150 1855 1203  723  308  823 1076  837  780  789
## [706] 1002  578  507  508  567  421 1241  373  571  469  527  588  645 1221  704
## [721]  378 1127  818 1062  562  825  435  802  531  306  739  365  325 1432 1806
## [736] 1046  674  740 1119  636 1357  414  727 1009  283 1011  511 1186  402  519
## [751]  490  683  688 1340  472  882  520  515 1332 1820 1311  559 1365 1980  410
## [766]  895 1190  784  376  521  834  450 1128  516  770 1074 1259 1422 1300 1135
## [781]  624  540  657  627  681  491  705  597 1298 1438  277 1087  782  416 1288
## [796] 1424  720  726  537  996  815  805 1468  801  495  463  814  350  702  623
## [811]  980 1195  478  881  445  658  528  522 1012 1590  621 1602  757  593  879
## [826]  580  620 1386
# Association screen between duration and y. Note that duration is converted
# to factor level indices first, so this correlates the position of each value
# among the sorted distinct durations, not the raw values.
cor(as.numeric(as.factor(bank$duration)), as.numeric(as.factor(bank$y)))
## [1] 0.4197168
# Histogram with a single red reference line at the mean.
# - colour is set as a constant outside aes(); inside aes() it is treated as a
#   data mapping and produces a legend entry instead of a literal red line.
# - xintercept is a scalar, so one line is drawn rather than one per row.
# - breaks are derived from the data; fixed breaks of 0..100 covered only the
#   far left of the observed range.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = duration), color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("Duration") +
  geom_vline(xintercept = mean(bank$duration), color = "red") +
  scale_x_continuous(breaks = pretty(bank$duration, n = 8)) +
  theme_minimal()
# Box plot of duration (single group).
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = duration))
# Counts per distinct duration value, stacked by the target y.
p3 <- ggplot(bank, aes(x = duration, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)

12.Campaign

# Campaign: summary, distinct values, and an association screen against y.
summary(bank$campaign)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.537   3.000  35.000
unique(bank$campaign)
##  [1]  2  4  1  3  6  7 27  5 12 14 10  8 11 13  9 15 16 18 17 22 19 23 24 35 29
# campaign is converted to factor level indices before correlating.
cor(as.numeric(as.factor(bank$campaign)), as.numeric(as.factor(bank$y)))
## [1] -0.07726492
# Histogram with a single red reference line at the mean. The colour is a
# constant set outside aes(); inside aes() it is treated as a data mapping and
# yields a legend entry rather than a literal red line. A scalar xintercept
# draws one line instead of one per row.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = campaign), color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("Campaign") +
  geom_vline(xintercept = mean(bank$campaign), color = "red") +
  scale_x_continuous(breaks = seq(0, 100, 10)) +
  theme_minimal()
# Box plot of campaign (single group).
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = campaign))
# Counts per campaign value, stacked by the target y.
p3 <- ggplot(bank, aes(x = campaign, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)

13.Pdays

# pdays: summary, distinct values, and an association screen against y
# (computed on factor level indices of both variables).
summary(bank$pdays)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   999.0   999.0   960.4   999.0   999.0
unique(bank$pdays)
##  [1] 999  12   3   6   5   2  10  11   7   1  18   4  15   0  16   9  19  17  13
## [20]  21  14
cor(as.numeric(as.factor(bank$pdays)), as.numeric(as.factor(bank$y)))
## [1] -0.3292231
# Collapse pdays to a two-valued character flag: "0" where the value is 999,
# "1" for every other value.
bank$pdays <- if_else(bank$pdays == 999, "0", "1")
#hist(as.numeric(bank$pdays))
# Counts per flag value, stacked by the target y.
p1 <- ggplot(bank, aes(x = pdays, fill = y)) +
  geom_bar()
print(p1)

poutcome classification w.r.t target variable

# Cross-tabulate poutcome against the target y. Table and column proportions
# are switched off (prop.t / prop.c = FALSE).
CrossTable(bank$poutcome,bank$y,prop.t = FALSE,prop.c = FALSE)
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  4119 
## 
##  
##               | bank$y 
## bank$poutcome |        no |       yes | Row Total | 
## --------------|-----------|-----------|-----------|
##       failure |       387 |        67 |       454 | 
##               |     0.739 |     6.014 |           | 
##               |     0.852 |     0.148 |     0.110 | 
## --------------|-----------|-----------|-----------|
##   nonexistent |      3231 |       292 |      3523 | 
##               |     2.801 |    22.781 |           | 
##               |     0.917 |     0.083 |     0.855 | 
## --------------|-----------|-----------|-----------|
##       success |        50 |        92 |       142 | 
##               |    46.222 |   375.928 |           | 
##               |     0.352 |     0.648 |     0.034 | 
## --------------|-----------|-----------|-----------|
##  Column Total |      3668 |       451 |      4119 | 
## --------------|-----------|-----------|-----------|
## 
## 

16.emp.var.rate

# emp.var.rate: summary, distinct values, and an association screen against y.
summary(bank$emp.var.rate)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -3.40000 -1.80000  1.10000  0.08497  1.40000  1.40000
unique(bank$emp.var.rate)
##  [1] -1.8  1.1  1.4 -0.1 -1.1 -2.9 -1.7 -3.4 -3.0 -0.2
# emp.var.rate is converted to factor level indices before correlating.
cor(as.numeric(as.factor(bank$emp.var.rate)), as.numeric(as.factor(bank$y)))
## [1] -0.2714018
# Histogram with a single red reference line at the mean.
# - colour is a constant outside aes(); inside aes() it becomes a data mapping
#   with a legend rather than a literal red line.
# - breaks come from the data; the fixed 0..100 breaks left only the tick at 0
#   on an axis spanning roughly -3.4 to 1.4.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = emp.var.rate), color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("emp.var.rate") +
  geom_vline(xintercept = mean(bank$emp.var.rate), color = "red") +
  scale_x_continuous(breaks = pretty(bank$emp.var.rate, n = 8)) +
  theme_minimal()
# Box plot of emp.var.rate (single group).
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = emp.var.rate))
# Counts per distinct value, stacked by the target y.
p3 <- ggplot(bank, aes(x = emp.var.rate, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)

## 17.cons.price.idx

# cons.price.idx: summary, distinct values, and an association screen
# against y.
summary(bank$cons.price.idx)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   92.20   93.08   93.75   93.58   93.99   94.77
unique(bank$cons.price.idx)
##  [1] 92.893 93.994 94.465 93.200 94.199 93.918 93.444 93.369 92.843 92.963
## [11] 94.601 94.027 92.379 92.431 93.749 93.075 94.055 92.469 94.767 92.201
## [21] 92.649 94.215 93.876 93.798 92.713 92.756
# cons.price.idx is converted to factor level indices before correlating.
cor(as.numeric(as.factor(bank$cons.price.idx)), as.numeric(as.factor(bank$y)))
## [1] -0.102792
# Histogram with a single red reference line at the mean of cons.price.idx.
# - the line previously used mean(emp.var.rate), a different variable whose
#   mean lies far outside this axis.
# - colour is a constant outside aes(); inside aes() it becomes a data mapping
#   with a legend rather than a literal red line.
# - breaks come from the data; fixed breaks of 0..100 by 10 put no tick inside
#   the observed 92.2-94.8 range.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = cons.price.idx), color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("cons.price.idx") +
  geom_vline(xintercept = mean(bank$cons.price.idx), color = "red") +
  scale_x_continuous(breaks = pretty(bank$cons.price.idx, n = 8)) +
  theme_minimal()
# Box plot of cons.price.idx (single group).
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = cons.price.idx))
# Counts per distinct value, stacked by the target y.
p3 <- ggplot(bank, aes(x = cons.price.idx, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)

## 18.cons.conf.idx

# cons.conf.idx: summary, distinct values, and an association screen against y.
summary(bank$cons.conf.idx)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -50.8   -42.7   -41.8   -40.5   -36.4   -26.9
unique(bank$cons.conf.idx)
##  [1] -46.2 -36.4 -41.8 -42.0 -37.5 -42.7 -36.1 -34.8 -50.0 -40.8 -49.5 -38.3
## [13] -29.8 -26.9 -34.6 -47.1 -39.8 -33.6 -50.8 -31.4 -30.1 -40.3 -40.0 -40.4
## [25] -33.0 -45.9
# cons.conf.idx is converted to factor level indices before correlating.
cor(as.numeric(as.factor(bank$cons.conf.idx)), as.numeric(as.factor(bank$y)))
## [1] 0.06968329
# Histogram with a single red reference line at the mean.
# - colour is a constant outside aes(); inside aes() it becomes a data mapping
#   with a legend rather than a literal red line.
# - breaks come from the data; the fixed 0..100 breaks are all non-negative
#   while every observed value is negative, so the axis had no ticks.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = cons.conf.idx), color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("cons.conf.idx") +
  geom_vline(xintercept = mean(bank$cons.conf.idx), color = "red") +
  scale_x_continuous(breaks = pretty(bank$cons.conf.idx, n = 8)) +
  theme_minimal()
# Box plot of cons.conf.idx (single group).
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = cons.conf.idx))
# Counts per distinct value, stacked by the target y.
p3 <- ggplot(bank, aes(x = cons.conf.idx, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)

## 19.euribor3m

# Distribution summary of euribor3m, followed by its distinct observed values.
summary(bank$euribor3m)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.635   1.334   4.857   3.621   4.961   5.045
unique(bank$euribor3m)
##   [1] 1.313 4.855 4.962 4.959 4.191 0.884 0.879 4.153 4.958 4.968 4.859 4.963
##  [13] 4.957 4.965 4.961 0.639 4.967 4.864 4.856 1.299 4.860 1.687 4.865 1.268
##  [25] 4.120 1.334 0.977 1.344 0.899 1.327 4.592 4.970 1.260 4.966 0.770 4.866
##  [37] 4.964 4.857 0.886 0.739 0.654 1.405 1.281 4.960 0.754 1.291 1.365 4.076
##  [49] 1.266 1.410 1.250 4.858 0.702 1.029 1.085 1.392 1.262 1.050 0.851 0.716
##  [61] 0.877 0.835 1.048 0.904 1.028 0.637 1.244 1.354 4.021 1.453 0.715 1.778
##  [73] 0.773 1.035 0.900 0.898 0.742 0.861 1.264 0.704 1.270 0.695 1.039 1.531
##  [85] 0.883 0.748 0.809 4.794 1.479 0.697 0.959 1.032 0.896 0.827 1.483 0.905
##  [97] 1.466 0.714 0.644 0.849 0.881 0.834 0.645 0.659 0.885 1.041 0.942 0.737
## [109] 4.947 0.722 1.049 1.415 0.797 0.699 0.810 0.710 1.423 0.707 0.646 1.043
## [121] 4.955 0.668 0.825 1.435 0.720 0.767 0.982 1.602 1.259 1.811 0.859 1.224
## [133] 0.876 0.878 1.099 0.788 0.717 0.838 0.640 0.762 1.663 0.730 0.728 1.372
## [145] 0.782 4.245 1.510 3.329 0.749 4.343 0.893 0.731 0.635 0.700 0.889 0.649
## [157] 0.873 1.445 1.629 0.944 3.853 0.870 0.790 5.045 0.914 0.719 0.735 1.498
## [169] 0.677 0.819 0.652 0.692 0.829 1.726 1.406 0.761 0.846 1.252 4.956 0.953
## [181] 0.803 0.937 0.706 0.869 1.703 0.729 0.709 1.046 0.752 0.921 4.921 0.987
## [193] 1.030 1.031 0.741 0.843 1.044 0.643 0.755 0.724 0.882 1.757 1.215 0.740
## [205] 0.683 1.520 4.663 1.059 0.636 0.771 0.655 1.400 0.650 1.384 0.778 0.682
## [217] 1.614 1.040 1.538 1.072 1.000 1.799 1.640 1.650 0.642 0.718 0.768 0.723
## [229] 0.996 0.721 0.672 0.854 1.016 0.965
# Association screen between euribor3m and y; euribor3m is converted to factor
# level indices first, so positions among sorted distinct values are correlated.
cor(as.numeric(as.factor(bank$euribor3m)), as.numeric(as.factor(bank$y)))
## [1] -0.370733
# Histogram with a single red reference line at the mean.
# - colour is a constant outside aes(); inside aes() it becomes a data mapping
#   with a legend rather than a literal red line.
# - breaks come from the data; fixed breaks of 0..100 by 10 put no tick inside
#   the observed 0.6-5.0 range.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = euribor3m), color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("euribor3m") +
  geom_vline(xintercept = mean(bank$euribor3m), color = "red") +
  scale_x_continuous(breaks = pretty(bank$euribor3m, n = 8))
# Box plot of euribor3m (single group).
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = euribor3m))
# Counts per distinct value, stacked by the target y. Bars are 0.1 wide, which
# is wider than the spacing between neighbouring rate values, so bars overlap.
p3 <- ggplot(bank, aes(x = euribor3m, fill = y)) + geom_bar(width = 0.1)
grid.arrange(p1, p2, p3, ncol = 2)
## Warning: position_stack requires non-overlapping x intervals

20.nr.employed

# nr.employed: summary, distinct values, and an association screen against y.
summary(bank$nr.employed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4964    5099    5191    5166    5228    5228
unique(bank$nr.employed)
##  [1] 5099.1 5191.0 5228.1 5195.8 4963.6 5008.7 5076.2 4991.6 5017.5 5023.5
## [11] 5176.3
# nr.employed is converted to factor level indices before correlating.
cor(as.numeric(as.factor(bank$nr.employed)), as.numeric(as.factor(bank$y)))
## [1] -0.3516595
# Histogram with a single red reference line at the mean.
# - colour is a constant outside aes(); inside aes() it becomes a data mapping
#   with a legend rather than a literal red line.
# - breaks come from the data; fixed breaks of 0..100 lie far below the
#   observed 4964-5228 range, so the axis had no ticks.
p1 <- ggplot(bank) +
  geom_histogram(aes(x = nr.employed), color = "black", fill = "grey", bins = 30) +
  ylab("Count") +
  xlab("nr.employed") +
  geom_vline(xintercept = mean(bank$nr.employed), color = "red") +
  scale_x_continuous(breaks = pretty(bank$nr.employed, n = 6))
# Box plot of nr.employed (single group).
p2 <- ggplot(bank) + geom_boxplot(aes(x = "", y = nr.employed))
# Counts per distinct value, stacked by the target y.
p3 <- ggplot(bank, aes(x = nr.employed, fill = y)) + geom_bar()
grid.arrange(p1, p2, p3, ncol = 2)

correlation and pairwise comparison

library(corrplot)
## corrplot 0.90 loaded
# Work on a copy so the encoding below does not touch bank itself.
bank_mat <- bank

# Columns that are replaced by their factor level index. This list includes
# numeric columns (age, duration, campaign, previous), which therefore enter
# the correlation as positions among their sorted distinct values; columns not
# listed here are left as they are.
encode_cols <- c(
  "age", "job", "marital", "education", "default", "duration", "housing",
  "loan", "contact", "pdays", "month", "day_of_week", "campaign", "previous",
  "poutcome", "y"
)
bank_mat[encode_cols] <- lapply(
  bank_mat[encode_cols],
  function(column) as.numeric(as.factor(column))
)

# Pairwise correlation matrix over every column of the encoded copy.
mat <- cor(bank_mat)

# Correlations printed as numbers on a grey -> blue -> black colour ramp.
corrplot(
  mat,
  method = "number",
  tl.cex = 0.7,
  number.cex = 0.5,
  col = colorRampPalette(c("grey", "blue", "black"))(100)
)

#bank$y <- ifelse(bank$y=='yes',1,0)
#bank$pdays<-as.numeric(as.factor(bank$pdays))
# Scatter-plot matrix of all encoded columns.
pairs(bank_mat)

# Scatter-plot matrix restricted to four selected columns.
bank_1 <- bank_mat[, c("age", "duration", "month", "campaign")]
pairs(bank_1)

Model

# Drop pdays (flagged as majority null) and duration from the modelling data.
bank <- bank[, setdiff(names(bank), c("pdays", "duration")), drop = FALSE]

# The target is stored as a factor.
bank$y <- factor(bank$y)

# Remove rows with missing values, then treat the literal string "unknown" as
# missing and remove those rows as well.
bank2 <- na.omit(bank)
is_unknown <- bank2 == "unknown"
bank2[is_unknown] <- NA
bank2 <- na.omit(bank2)
str(bank2)
## 'data.frame':    3090 obs. of  19 variables:
##  $ age           : int  30 39 25 47 32 32 31 36 36 47 ...
##  $ job           : chr  "blue-collar" "services" "services" "admin." ...
##  $ marital       : chr  "married" "single" "married" "married" ...
##  $ education     : chr  "basic.9y" "high.school" "high.school" "university.degree" ...
##  $ default       : chr  "no" "no" "no" "no" ...
##  $ housing       : chr  "yes" "no" "yes" "yes" ...
##  $ loan          : chr  "no" "no" "no" "no" ...
##  $ contact       : chr  "cellular" "telephone" "telephone" "cellular" ...
##  $ month         : chr  "may" "may" "jun" "nov" ...
##  $ day_of_week   : chr  "fri" "fri" "wed" "mon" ...
##  $ campaign      : int  2 4 1 1 3 4 1 1 2 2 ...
##  $ previous      : int  0 0 0 0 2 0 1 0 0 0 ...
##  $ poutcome      : chr  "nonexistent" "nonexistent" "nonexistent" "nonexistent" ...
##  $ emp.var.rate  : num  -1.8 1.1 1.4 -0.1 -1.1 -1.1 -0.1 1.4 1.1 1.4 ...
##  $ cons.price.idx: num  92.9 94 94.5 93.2 94.2 ...
##  $ cons.conf.idx : num  -46.2 -36.4 -41.8 -42 -37.5 -37.5 -42 -42.7 -36.4 -41.8 ...
##  $ euribor3m     : num  1.313 4.855 4.962 4.191 0.884 ...
##  $ nr.employed   : num  5099 5191 5228 5196 4964 ...
##  $ y             : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, "na.action")= 'omit' Named int [1:1029] 4 8 10 11 19 21 25 28 29 32 ...
##   ..- attr(*, "names")= chr [1:1029] "4" "8" "10" "11" ...
# Show the categories present in default, then store the column as a factor so
# summary() reports per-level counts for it.
levels(factor(bank2[["default"]]))
## [1] "no"  "yes"
bank2[["default"]] <- factor(bank2[["default"]])
summary(bank2)
##       age            job              marital           education        
##  Min.   :20.00   Length:3090        Length:3090        Length:3090       
##  1st Qu.:31.00   Class :character   Class :character   Class :character  
##  Median :37.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :39.18                                                           
##  3rd Qu.:46.00                                                           
##  Max.   :88.00                                                           
##  default      housing              loan             contact         
##  no :3089   Length:3090        Length:3090        Length:3090       
##  yes:   1   Class :character   Class :character   Class :character  
##             Mode  :character   Mode  :character   Mode  :character  
##                                                                     
##                                                                     
##                                                                     
##     month           day_of_week           campaign         previous     
##  Length:3090        Length:3090        Min.   : 1.000   Min.   :0.0000  
##  Class :character   Class :character   1st Qu.: 1.000   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median : 2.000   Median :0.0000  
##                                        Mean   : 2.509   Mean   :0.2081  
##                                        3rd Qu.: 3.000   3rd Qu.:0.0000  
##                                        Max.   :35.000   Max.   :6.0000  
##    poutcome          emp.var.rate     cons.price.idx  cons.conf.idx   
##  Length:3090        Min.   :-3.4000   Min.   :92.20   Min.   :-50.80  
##  Class :character   1st Qu.:-1.8000   1st Qu.:93.08   1st Qu.:-42.70  
##  Mode  :character   Median : 1.1000   Median :93.44   Median :-41.80  
##                     Mean   :-0.0468   Mean   :93.53   Mean   :-40.62  
##                     3rd Qu.: 1.4000   3rd Qu.:93.99   3rd Qu.:-36.40  
##                     Max.   : 1.4000   Max.   :94.77   Max.   :-26.90  
##    euribor3m      nr.employed     y       
##  Min.   :0.635   Min.   :4964   no :2720  
##  1st Qu.:1.313   1st Qu.:5099   yes: 370  
##  Median :4.856   Median :5191             
##  Mean   :3.482   Mean   :5161             
##  3rd Qu.:4.961   3rd Qu.:5228             
##  Max.   :5.045   Max.   :5228
# Split bank2 into a 75% training set and a 25% test set.
# The RNG is seeded before sampling so the partition (and everything fitted on
# it) is the same on every run; without a seed ahead of sample() the split
# changed each time the script was executed.
set.seed(1234)
# floor() makes the training size an explicit whole number of rows.
splitBank <- sort(sample(nrow(bank2), floor(nrow(bank2) * 0.75)))
Btrain <- bank2[splitBank, ]
# Rows not drawn for training form the test set.
Btest <- bank2[-splitBank, ]

# Stepwise-AIC logistic model: y regressed on every other column.
form_2 <- as.formula("y ~ .")
form_2
## y ~ .
set.seed(1234)
# method = "none" fits a single model on the whole training set.
objControl <- trainControl(
  method = "none",
  summaryFunction = twoClassSummary,
  classProbs = TRUE,
  savePredictions = TRUE
)

# Strip the hyphen from two job labels in the training data.
# NOTE(review): presumably done so the generated dummy-variable names are
# syntactically valid - confirm. Only Btrain is relabelled here; Btest still
# holds the hyphenated values at this point.
job_relabels <- c("blue-collar" = "bluecollar", "self-employed" = "selfemployed")
for (old_label in names(job_relabels)) {
  Btrain[Btrain == old_label] <- job_relabels[[old_label]]
}

# Forward stepwise selection by AIC through caret's glmStepAIC method.
aicmodel <- train(
  form_2,
  data = Btrain,
  method = "glmStepAIC",
  trControl = objControl,
  metric = "ROC",
  direction = "forward"
)
## Start:  AIC=1733.79
## .outcome ~ 1
## 
##                                Df Deviance    AIC
## + nr.employed                   1   1477.7 1481.7
## + euribor3m                     1   1520.7 1524.7
## + emp.var.rate                  1   1543.0 1547.0
## + poutcomesuccess               1   1584.5 1588.5
## + previous                      1   1625.2 1629.2
## + poutcomenonexistent           1   1651.2 1655.2
## + contacttelephone              1   1665.7 1669.7
## + monthmar                      1   1682.8 1686.8
## + cons.price.idx                1   1704.7 1708.7
## + monthoct                      1   1705.1 1709.1
## + monthmay                      1   1706.0 1710.0
## + campaign                      1   1711.4 1715.4
## + monthsep                      1   1712.4 1716.4
## + maritalsingle                 1   1723.3 1727.3
## + jobstudent                    1   1723.3 1727.3
## + maritalmarried                1   1724.1 1728.1
## + cons.conf.idx                 1   1724.7 1728.7
## + jobentrepreneur               1   1724.9 1728.9
## + educationuniversity.degree    1   1726.6 1730.6
## + monthdec                      1   1726.7 1730.7
## + jobretired                    1   1726.9 1730.9
## + age                           1   1727.5 1731.5
## + monthjul                      1   1727.5 1731.5
## + educationbasic.9y             1   1727.9 1731.9
## + jobbluecollar                 1   1728.5 1732.5
## + jobunemployed                 1   1728.8 1732.8
## <none>                              1731.8 1733.8
## + jobmanagement                 1   1730.0 1734.0
## + monthjun                      1   1730.2 1734.2
## + jobservices                   1   1730.5 1734.5
## + monthnov                      1   1730.6 1734.6
## + housingyes                    1   1731.0 1735.0
## + educationbasic.6y             1   1731.2 1735.2
## + day_of_weekwed                1   1731.2 1735.2
## + loanyes                       1   1731.4 1735.4
## + jobselfemployed               1   1731.4 1735.4
## + defaultyes                    1   1731.5 1735.5
## + monthaug                      1   1731.7 1735.7
## + educationprofessional.course  1   1731.8 1735.8
## + educationhigh.school          1   1731.8 1735.8
## + day_of_weekmon                1   1731.8 1735.8
## + day_of_weekthu                1   1731.8 1735.8
## + jobtechnician                 1   1731.8 1735.8
## + day_of_weektue                1   1731.8 1735.8
## + jobhousemaid                  1   1731.8 1735.8
## 
## Step:  AIC=1481.69
## .outcome ~ nr.employed
## 
##                                Df Deviance    AIC
## + poutcomesuccess               1   1434.4 1440.4
## + monthmay                      1   1448.5 1454.5
## + contacttelephone              1   1456.5 1462.5
## + monthmar                      1   1456.5 1462.5
## + monthjun                      1   1469.5 1475.5
## + previous                      1   1469.9 1475.9
## + monthaug                      1   1472.2 1478.2
## + cons.conf.idx                 1   1472.7 1478.7
## + monthjul                      1   1472.9 1478.9
## + campaign                      1   1473.2 1479.2
## + jobentrepreneur               1   1473.5 1479.5
## + maritalmarried                1   1474.5 1480.5
## + maritalsingle                 1   1474.7 1480.7
## + poutcomenonexistent           1   1475.1 1481.1
## + educationuniversity.degree    1   1475.3 1481.3
## <none>                              1477.7 1481.7
## + monthsep                      1   1476.2 1482.2
## + jobmanagement                 1   1476.5 1482.5
## + educationbasic.9y             1   1476.6 1482.6
## + jobtechnician                 1   1476.6 1482.6
## + emp.var.rate                  1   1476.6 1482.6
## + day_of_weekmon                1   1476.6 1482.6
## + jobservices                   1   1476.6 1482.6
## + jobunemployed                 1   1476.8 1482.8
## + jobbluecollar                 1   1477.0 1483.0
## + cons.price.idx                1   1477.2 1483.2
## + housingyes                    1   1477.3 1483.3
## + age                           1   1477.3 1483.3
## + jobselfemployed               1   1477.3 1483.3
## + educationhigh.school          1   1477.4 1483.4
## + jobstudent                    1   1477.5 1483.5
## + educationprofessional.course  1   1477.5 1483.5
## + monthnov                      1   1477.5 1483.5
## + defaultyes                    1   1477.6 1483.6
## + monthoct                      1   1477.6 1483.6
## + monthdec                      1   1477.6 1483.6
## + jobretired                    1   1477.6 1483.6
## + euribor3m                     1   1477.7 1483.7
## + loanyes                       1   1477.7 1483.7
## + day_of_weekthu                1   1477.7 1483.7
## + day_of_weektue                1   1477.7 1483.7
## + educationbasic.6y             1   1477.7 1483.7
## + day_of_weekwed                1   1477.7 1483.7
## + jobhousemaid                  1   1477.7 1483.7
## 
## Step:  AIC=1440.41
## .outcome ~ nr.employed + poutcomesuccess
## 
##                                Df Deviance    AIC
## + monthmay                      1   1411.0 1419.0
## + monthmar                      1   1413.6 1421.6
## + contacttelephone              1   1414.0 1422.0
## + monthjun                      1   1425.3 1433.3
## + campaign                      1   1429.8 1437.8
## + maritalmarried                1   1430.7 1438.7
## + jobentrepreneur               1   1431.2 1439.2
## + monthjul                      1   1431.2 1439.2
## + maritalsingle                 1   1431.3 1439.3
## + monthaug                      1   1431.3 1439.3
## + emp.var.rate                  1   1431.9 1439.9
## + poutcomenonexistent           1   1432.2 1440.2
## <none>                              1434.4 1440.4
## + monthsep                      1   1432.4 1440.4
## + cons.conf.idx                 1   1432.6 1440.6
## + educationuniversity.degree    1   1432.7 1440.7
## + jobtechnician                 1   1432.7 1440.7
## + jobmanagement                 1   1433.1 1441.1
## + cons.price.idx                1   1433.2 1441.2
## + jobstudent                    1   1433.6 1441.6
## + jobservices                   1   1433.6 1441.6
## + monthnov                      1   1433.6 1441.6
## + day_of_weekmon                1   1433.7 1441.7
## + monthoct                      1   1433.8 1441.8
## + educationprofessional.course  1   1434.0 1442.0
## + educationbasic.9y             1   1434.0 1442.0
## + educationhigh.school          1   1434.0 1442.0
## + jobbluecollar                 1   1434.1 1442.1
## + jobunemployed                 1   1434.1 1442.1
## + euribor3m                     1   1434.1 1442.1
## + jobselfemployed               1   1434.1 1442.1
## + jobretired                    1   1434.2 1442.2
## + defaultyes                    1   1434.3 1442.3
## + previous                      1   1434.3 1442.3
## + loanyes                       1   1434.3 1442.3
## + age                           1   1434.4 1442.4
## + monthdec                      1   1434.4 1442.4
## + housingyes                    1   1434.4 1442.4
## + day_of_weekwed                1   1434.4 1442.4
## + jobhousemaid                  1   1434.4 1442.4
## + day_of_weektue                1   1434.4 1442.4
## + day_of_weekthu                1   1434.4 1442.4
## + educationbasic.6y             1   1434.4 1442.4
## 
## Step:  AIC=1419.01
## .outcome ~ nr.employed + poutcomesuccess + monthmay
## 
##                                Df Deviance    AIC
## + monthmar                      1   1394.8 1404.8
## + contacttelephone              1   1396.7 1406.7
## + monthnov                      1   1406.6 1416.6
## + campaign                      1   1406.6 1416.6
## + monthsep                      1   1406.8 1416.8
## + euribor3m                     1   1407.0 1417.0
## + emp.var.rate                  1   1407.1 1417.1
## + monthjun                      1   1407.5 1417.5
## + jobentrepreneur               1   1407.5 1417.5
## + maritalmarried                1   1408.1 1418.1
## + cons.price.idx                1   1408.4 1418.4
## + maritalsingle                 1   1408.8 1418.8
## + poutcomenonexistent           1   1408.9 1418.9
## <none>                              1411.0 1419.0
## + jobtechnician                 1   1409.5 1419.5
## + day_of_weekmon                1   1409.9 1419.9
## + jobmanagement                 1   1409.9 1419.9
## + jobretired                    1   1410.2 1420.2
## + educationuniversity.degree    1   1410.4 1420.4
## + jobservices                   1   1410.6 1420.6
## + educationprofessional.course  1   1410.7 1420.7
## + jobselfemployed               1   1410.7 1420.7
## + monthjul                      1   1410.7 1420.7
## + jobstudent                    1   1410.7 1420.7
## + educationhigh.school          1   1410.7 1420.7
## + jobunemployed                 1   1410.8 1420.8
## + monthdec                      1   1410.8 1420.8
## + monthaug                      1   1410.8 1420.8
## + defaultyes                    1   1410.8 1420.8
## + loanyes                       1   1410.9 1420.9
## + previous                      1   1410.9 1420.9
## + jobbluecollar                 1   1411.0 1421.0
## + educationbasic.6y             1   1411.0 1421.0
## + day_of_weekwed                1   1411.0 1421.0
## + jobhousemaid                  1   1411.0 1421.0
## + educationbasic.9y             1   1411.0 1421.0
## + housingyes                    1   1411.0 1421.0
## + cons.conf.idx                 1   1411.0 1421.0
## + monthoct                      1   1411.0 1421.0
## + age                           1   1411.0 1421.0
## + day_of_weektue                1   1411.0 1421.0
## + day_of_weekthu                1   1411.0 1421.0
## 
## Step:  AIC=1404.75
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar
## 
##                                Df Deviance    AIC
## + contacttelephone              1   1380.9 1392.9
## + monthjun                      1   1389.7 1401.7
## + campaign                      1   1390.3 1402.3
## + emp.var.rate                  1   1391.0 1403.0
## + jobentrepreneur               1   1391.7 1403.7
## + monthnov                      1   1391.7 1403.7
## + monthsep                      1   1392.0 1404.0
## + euribor3m                     1   1392.0 1404.0
## + maritalmarried                1   1392.3 1404.3
## + poutcomenonexistent           1   1392.4 1404.4
## <none>                              1394.8 1404.8
## + maritalsingle                 1   1392.8 1404.8
## + cons.price.idx                1   1393.0 1405.0
## + jobtechnician                 1   1393.5 1405.5
## + jobmanagement                 1   1393.5 1405.5
## + day_of_weekmon                1   1393.6 1405.6
## + jobretired                    1   1393.8 1405.8
## + educationuniversity.degree    1   1394.0 1406.0
## + monthaug                      1   1394.1 1406.1
## + monthjul                      1   1394.1 1406.1
## + jobselfemployed               1   1394.3 1406.3
## + cons.conf.idx                 1   1394.3 1406.3
## + jobservices                   1   1394.4 1406.4
## + jobunemployed                 1   1394.4 1406.4
## + monthoct                      1   1394.5 1406.5
## + loanyes                       1   1394.6 1406.6
## + educationprofessional.course  1   1394.6 1406.6
## + educationhigh.school          1   1394.6 1406.6
## + defaultyes                    1   1394.6 1406.6
## + day_of_weekwed                1   1394.6 1406.6
## + previous                      1   1394.6 1406.6
## + monthdec                      1   1394.7 1406.7
## + jobbluecollar                 1   1394.7 1406.7
## + jobstudent                    1   1394.7 1406.7
## + educationbasic.9y             1   1394.7 1406.7
## + age                           1   1394.7 1406.7
## + educationbasic.6y             1   1394.7 1406.7
## + day_of_weektue                1   1394.7 1406.7
## + housingyes                    1   1394.8 1406.8
## + jobhousemaid                  1   1394.8 1406.8
## + day_of_weekthu                1   1394.8 1406.8
## 
## Step:  AIC=1392.92
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar + 
##     contacttelephone
## 
##                                Df Deviance    AIC
## + monthjun                      1   1367.3 1381.3
## + poutcomenonexistent           1   1376.2 1390.2
## + monthnov                      1   1376.8 1390.8
## + campaign                      1   1377.5 1391.5
## + jobentrepreneur               1   1377.8 1391.8
## + cons.conf.idx                 1   1378.7 1392.7
## <none>                              1380.9 1392.9
## + maritalmarried                1   1379.0 1393.0
## + jobmanagement                 1   1379.1 1393.1
## + maritalsingle                 1   1379.5 1393.5
## + monthsep                      1   1379.7 1393.7
## + jobtechnician                 1   1379.8 1393.8
## + emp.var.rate                  1   1379.9 1393.9
## + jobretired                    1   1379.9 1393.9
## + day_of_weekmon                1   1380.1 1394.1
## + previous                      1   1380.3 1394.3
## + jobunemployed                 1   1380.3 1394.3
## + monthoct                      1   1380.4 1394.4
## + jobselfemployed               1   1380.4 1394.4
## + jobservices                   1   1380.5 1394.5
## + educationuniversity.degree    1   1380.6 1394.6
## + educationprofessional.course  1   1380.7 1394.7
## + loanyes                       1   1380.7 1394.7
## + monthjul                      1   1380.7 1394.7
## + defaultyes                    1   1380.7 1394.7
## + educationbasic.6y             1   1380.8 1394.8
## + jobbluecollar                 1   1380.8 1394.8
## + jobstudent                    1   1380.8 1394.8
## + educationhigh.school          1   1380.8 1394.8
## + euribor3m                     1   1380.8 1394.8
## + day_of_weekwed                1   1380.9 1394.9
## + educationbasic.9y             1   1380.9 1394.9
## + housingyes                    1   1380.9 1394.9
## + monthdec                      1   1380.9 1394.9
## + jobhousemaid                  1   1380.9 1394.9
## + day_of_weekthu                1   1380.9 1394.9
## + monthaug                      1   1380.9 1394.9
## + cons.price.idx                1   1380.9 1394.9
## + day_of_weektue                1   1380.9 1394.9
## + age                           1   1380.9 1394.9
## 
## Step:  AIC=1381.28
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar + 
##     contacttelephone + monthjun
## 
##                                Df Deviance    AIC
## + cons.conf.idx                 1   1362.8 1378.8
## + poutcomenonexistent           1   1363.1 1379.1
## + campaign                      1   1364.1 1380.1
## + jobentrepreneur               1   1364.6 1380.6
## <none>                              1367.3 1381.3
## + monthnov                      1   1365.3 1381.3
## + monthoct                      1   1365.6 1381.6
## + maritalmarried                1   1365.7 1381.7
## + jobmanagement                 1   1365.8 1381.8
## + monthjul                      1   1366.1 1382.1
## + maritalsingle                 1   1366.1 1382.1
## + jobtechnician                 1   1366.2 1382.2
## + jobselfemployed               1   1366.3 1382.3
## + jobretired                    1   1366.4 1382.4
## + monthaug                      1   1366.6 1382.6
## + day_of_weekmon                1   1366.7 1382.7
## + previous                      1   1366.8 1382.8
## + jobunemployed                 1   1366.8 1382.8
## + jobservices                   1   1366.9 1382.9
## + educationprofessional.course  1   1367.0 1383.0
## + euribor3m                     1   1367.0 1383.0
## + monthsep                      1   1367.0 1383.0
## + educationuniversity.degree    1   1367.1 1383.1
## + loanyes                       1   1367.1 1383.1
## + defaultyes                    1   1367.1 1383.1
## + emp.var.rate                  1   1367.2 1383.2
## + cons.price.idx                1   1367.2 1383.2
## + educationbasic.6y             1   1367.2 1383.2
## + jobstudent                    1   1367.2 1383.2
## + jobbluecollar                 1   1367.2 1383.2
## + educationhigh.school          1   1367.2 1383.2
## + age                           1   1367.2 1383.2
## + housingyes                    1   1367.2 1383.2
## + jobhousemaid                  1   1367.2 1383.2
## + day_of_weekwed                1   1367.2 1383.2
## + day_of_weektue                1   1367.3 1383.3
## + educationbasic.9y             1   1367.3 1383.3
## + day_of_weekthu                1   1367.3 1383.3
## + monthdec                      1   1367.3 1383.3
## 
## Step:  AIC=1378.75
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar + 
##     contacttelephone + monthjun + cons.conf.idx
## 
##                                Df Deviance    AIC
## + poutcomenonexistent           1   1358.3 1376.3
## + campaign                      1   1359.7 1377.7
## + jobentrepreneur               1   1360.1 1378.1
## + monthjul                      1   1360.7 1378.7
## <none>                              1362.8 1378.8
## + maritalmarried                1   1361.2 1379.2
## + jobmanagement                 1   1361.3 1379.3
## + monthnov                      1   1361.5 1379.5
## + jobretired                    1   1361.5 1379.5
## + maritalsingle                 1   1361.5 1379.5
## + jobselfemployed               1   1361.7 1379.7
## + jobtechnician                 1   1361.8 1379.8
## + monthoct                      1   1361.8 1379.8
## + monthsep                      1   1362.1 1380.1
## + previous                      1   1362.3 1380.3
## + cons.price.idx                1   1362.3 1380.3
## + day_of_weekmon                1   1362.3 1380.3
## + jobunemployed                 1   1362.3 1380.3
## + educationprofessional.course  1   1362.5 1380.5
## + jobbluecollar                 1   1362.5 1380.5
## + educationbasic.6y             1   1362.5 1380.5
## + euribor3m                     1   1362.5 1380.5
## + jobservices                   1   1362.5 1380.5
## + loanyes                       1   1362.6 1380.6
## + defaultyes                    1   1362.6 1380.6
## + day_of_weektue                1   1362.7 1380.7
## + jobstudent                    1   1362.7 1380.7
## + educationuniversity.degree    1   1362.7 1380.7
## + emp.var.rate                  1   1362.7 1380.7
## + educationhigh.school          1   1362.7 1380.7
## + jobhousemaid                  1   1362.7 1380.7
## + monthdec                      1   1362.7 1380.7
## + day_of_weekthu                1   1362.7 1380.7
## + day_of_weekwed                1   1362.7 1380.7
## + housingyes                    1   1362.7 1380.7
## + monthaug                      1   1362.7 1380.7
## + age                           1   1362.8 1380.8
## + educationbasic.9y             1   1362.8 1380.8
## 
## Step:  AIC=1376.32
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar + 
##     contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent
## 
##                                Df Deviance    AIC
## + campaign                      1   1355.1 1375.1
## + jobentrepreneur               1   1355.7 1375.7
## + monthjul                      1   1356.2 1376.2
## <none>                              1358.3 1376.3
## + previous                      1   1356.6 1376.6
## + maritalmarried                1   1356.8 1376.8
## + jobmanagement                 1   1356.9 1376.9
## + jobretired                    1   1357.0 1377.0
## + monthnov                      1   1357.1 1377.1
## + maritalsingle                 1   1357.2 1377.2
## + jobselfemployed               1   1357.2 1377.2
## + jobtechnician                 1   1357.2 1377.2
## + monthoct                      1   1357.4 1377.4
## + monthsep                      1   1357.6 1377.6
## + day_of_weekmon                1   1357.8 1377.8
## + cons.price.idx                1   1357.8 1377.8
## + educationprofessional.course  1   1357.9 1377.9
## + jobbluecollar                 1   1358.0 1378.0
## + jobunemployed                 1   1358.0 1378.0
## + educationbasic.6y             1   1358.1 1378.1
## + jobstudent                    1   1358.1 1378.1
## + jobservices                   1   1358.1 1378.1
## + euribor3m                     1   1358.2 1378.2
## + loanyes                       1   1358.2 1378.2
## + defaultyes                    1   1358.2 1378.2
## + day_of_weektue                1   1358.2 1378.2
## + emp.var.rate                  1   1358.3 1378.3
## + monthaug                      1   1358.3 1378.3
## + educationuniversity.degree    1   1358.3 1378.3
## + day_of_weekwed                1   1358.3 1378.3
## + educationhigh.school          1   1358.3 1378.3
## + housingyes                    1   1358.3 1378.3
## + monthdec                      1   1358.3 1378.3
## + day_of_weekthu                1   1358.3 1378.3
## + jobhousemaid                  1   1358.3 1378.3
## + educationbasic.9y             1   1358.3 1378.3
## + age                           1   1358.3 1378.3
## 
## Step:  AIC=1375.06
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar + 
##     contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent + 
##     campaign
## 
##                                Df Deviance    AIC
## + jobentrepreneur               1   1352.2 1374.2
## + monthjul                      1   1352.6 1374.6
## <none>                              1355.1 1375.1
## + previous                      1   1353.3 1375.3
## + monthnov                      1   1353.5 1375.5
## + jobmanagement                 1   1353.6 1375.6
## + maritalmarried                1   1353.6 1375.6
## + jobretired                    1   1353.7 1375.7
## + maritalsingle                 1   1353.9 1375.9
## + jobselfemployed               1   1354.0 1376.0
## + jobtechnician                 1   1354.1 1376.1
## + cons.price.idx                1   1354.2 1376.2
## + monthoct                      1   1354.3 1376.3
## + monthsep                      1   1354.4 1376.4
## + day_of_weekmon                1   1354.6 1376.6
## + educationprofessional.course  1   1354.7 1376.7
## + euribor3m                     1   1354.8 1376.8
## + educationbasic.6y             1   1354.8 1376.8
## + jobunemployed                 1   1354.8 1376.8
## + jobbluecollar                 1   1354.8 1376.8
## + emp.var.rate                  1   1354.8 1376.8
## + jobstudent                    1   1354.9 1376.9
## + loanyes                       1   1354.9 1376.9
## + day_of_weektue                1   1354.9 1376.9
## + defaultyes                    1   1355.0 1377.0
## + jobservices                   1   1355.0 1377.0
## + educationuniversity.degree    1   1355.0 1377.0
## + housingyes                    1   1355.0 1377.0
## + day_of_weekthu                1   1355.0 1377.0
## + educationbasic.9y             1   1355.0 1377.0
## + educationhigh.school          1   1355.1 1377.1
## + monthaug                      1   1355.1 1377.1
## + day_of_weekwed                1   1355.1 1377.1
## + jobhousemaid                  1   1355.1 1377.1
## + monthdec                      1   1355.1 1377.1
## + age                           1   1355.1 1377.1
## 
## Step:  AIC=1374.25
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar + 
##     contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent + 
##     campaign + jobentrepreneur
## 
##                                Df Deviance    AIC
## + monthjul                      1   1350.0 1374.0
## <none>                              1352.2 1374.2
## + previous                      1   1350.4 1374.4
## + jobmanagement                 1   1350.5 1374.5
## + jobretired                    1   1350.8 1374.8
## + monthnov                      1   1350.9 1374.9
## + maritalmarried                1   1350.9 1374.9
## + jobselfemployed               1   1351.1 1375.1
## + maritalsingle                 1   1351.3 1375.3
## + cons.price.idx                1   1351.5 1375.5
## + monthoct                      1   1351.5 1375.5
## + jobtechnician                 1   1351.5 1375.5
## + monthsep                      1   1351.6 1375.6
## + day_of_weekmon                1   1351.8 1375.8
## + educationprofessional.course  1   1351.9 1375.9
## + educationbasic.6y             1   1352.0 1376.0
## + euribor3m                     1   1352.0 1376.0
## + jobunemployed                 1   1352.0 1376.0
## + emp.var.rate                  1   1352.1 1376.1
## + loanyes                       1   1352.1 1376.1
## + jobservices                   1   1352.1 1376.1
## + jobstudent                    1   1352.1 1376.1
## + jobbluecollar                 1   1352.1 1376.1
## + day_of_weektue                1   1352.1 1376.1
## + defaultyes                    1   1352.1 1376.1
## + educationuniversity.degree    1   1352.2 1376.2
## + day_of_weekthu                1   1352.2 1376.2
## + monthaug                      1   1352.2 1376.2
## + housingyes                    1   1352.2 1376.2
## + educationbasic.9y             1   1352.2 1376.2
## + educationhigh.school          1   1352.2 1376.2
## + age                           1   1352.2 1376.2
## + day_of_weekwed                1   1352.2 1376.2
## + jobhousemaid                  1   1352.2 1376.2
## + monthdec                      1   1352.2 1376.2
## 
## Step:  AIC=1374.01
## .outcome ~ nr.employed + poutcomesuccess + monthmay + monthmar + 
##     contacttelephone + monthjun + cons.conf.idx + poutcomenonexistent + 
##     campaign + jobentrepreneur + monthjul
## 
##                                Df Deviance    AIC
## <none>                              1350.0 1374.0
## + previous                      1   1348.3 1374.3
## + jobmanagement                 1   1348.5 1374.5
## + jobretired                    1   1348.6 1374.6
## + jobselfemployed               1   1348.9 1374.9
## + maritalmarried                1   1348.9 1374.9
## + monthoct                      1   1349.2 1375.2
## + maritalsingle                 1   1349.2 1375.2
## + jobtechnician                 1   1349.3 1375.3
## + monthsep                      1   1349.4 1375.4
## + monthnov                      1   1349.5 1375.5
## + educationbasic.6y             1   1349.7 1375.7
## + day_of_weekmon                1   1349.7 1375.7
## + educationprofessional.course  1   1349.8 1375.8
## + jobunemployed                 1   1349.8 1375.8
## + monthaug                      1   1349.8 1375.8
## + cons.price.idx                1   1349.8 1375.8
## + jobservices                   1   1349.8 1375.8
## + loanyes                       1   1349.8 1375.8
## + day_of_weektue                1   1349.8 1375.8
## + educationuniversity.degree    1   1349.9 1375.9
## + jobstudent                    1   1349.9 1375.9
## + jobbluecollar                 1   1349.9 1375.9
## + defaultyes                    1   1349.9 1375.9
## + euribor3m                     1   1349.9 1375.9
## + day_of_weekthu                1   1350.0 1376.0
## + educationbasic.9y             1   1350.0 1376.0
## + educationhigh.school          1   1350.0 1376.0
## + emp.var.rate                  1   1350.0 1376.0
## + age                           1   1350.0 1376.0
## + housingyes                    1   1350.0 1376.0
## + day_of_weekwed                1   1350.0 1376.0
## + monthdec                      1   1350.0 1376.0
## + jobhousemaid                  1   1350.0 1376.0
# Print the coefficient table and deviance summary of the model stored in aicmodel
summary(aicmodel)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7170  -0.4104  -0.3556  -0.2348   2.7132  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         48.7547659  4.8727168  10.006  < 2e-16 ***
## nr.employed         -0.0096611  0.0009683  -9.978  < 2e-16 ***
## poutcomesuccess      1.8134198  0.3003673   6.037 1.57e-09 ***
## monthmay            -0.1917847  0.2041473  -0.939   0.3475    
## monthmar             1.9170591  0.4181888   4.584 4.56e-06 ***
## contacttelephone    -1.0829270  0.2223230  -4.871 1.11e-06 ***
## monthjun             1.0333749  0.2510308   4.117 3.85e-05 ***
## cons.conf.idx        0.0331188  0.0141603   2.339   0.0193 *  
## poutcomenonexistent  0.4541331  0.2160839   2.102   0.0356 *  
## campaign            -0.0777937  0.0432435  -1.799   0.0720 .  
## jobentrepreneur     -0.8700039  0.6106654  -1.425   0.1542    
## monthjul             0.3602712  0.2378341   1.515   0.1298    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.8  on 2316  degrees of freedom
## Residual deviance: 1350.0  on 2305  degrees of freedom
## AIC: 1374
## 
## Number of Fisher Scoring iterations: 6
# Full logistic regression: y modelled on every other column of Btrain
g1 <- glm(y ~ ., family = binomial, data = Btrain)
summary(g1)
## 
## Call:
## glm(formula = y ~ ., family = binomial, data = Btrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9514  -0.4299  -0.3406  -0.2358   2.7311  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -1.166e+02  1.342e+02  -0.869   0.3850    
## age                           1.378e-02  8.973e-03   1.536   0.1245    
## jobbluecollar                 1.715e-01  2.935e-01   0.584   0.5590    
## jobentrepreneur              -9.667e-01  6.312e-01  -1.532   0.1256    
## jobhousemaid                  5.649e-02  5.152e-01   0.110   0.9127    
## jobmanagement                -4.349e-01  3.109e-01  -1.399   0.1619    
## jobretired                   -6.118e-01  4.087e-01  -1.497   0.1344    
## jobselfemployed              -4.625e-01  4.193e-01  -1.103   0.2701    
## jobservices                  -1.123e-01  3.141e-01  -0.357   0.7207    
## jobstudent                    1.926e-01  4.522e-01   0.426   0.6702    
## jobtechnician                 7.799e-02  2.383e-01   0.327   0.7435    
## jobunemployed                 2.142e-01  4.266e-01   0.502   0.6157    
## maritalmarried               -1.254e-01  2.531e-01  -0.496   0.6202    
## maritalsingle                 5.963e-02  2.837e-01   0.210   0.8335    
## educationbasic.6y             5.399e-01  4.842e-01   1.115   0.2648    
## educationbasic.9y             3.381e-01  3.949e-01   0.856   0.3919    
## educationhigh.school          4.634e-01  3.743e-01   1.238   0.2157    
## educationprofessional.course  5.214e-01  3.961e-01   1.316   0.1881    
## educationuniversity.degree    5.623e-01  3.795e-01   1.482   0.1384    
## defaultyes                   -9.443e+00  3.247e+02  -0.029   0.9768    
## housingyes                   -1.452e-02  1.507e-01  -0.096   0.9232    
## loanyes                      -8.390e-02  2.060e-01  -0.407   0.6838    
## contacttelephone             -1.281e+00  3.018e-01  -4.245 2.19e-05 ***
## monthaug                      2.537e-01  4.684e-01   0.542   0.5881    
## monthdec                      2.201e-01  8.079e-01   0.272   0.7853    
## monthjul                      1.415e-01  4.047e-01   0.350   0.7266    
## monthjun                      4.917e-01  4.705e-01   1.045   0.2960    
## monthmar                      2.240e+00  5.738e-01   3.903 9.49e-05 ***
## monthmay                     -6.994e-02  3.334e-01  -0.210   0.8338    
## monthnov                     -2.065e-01  4.650e-01  -0.444   0.6570    
## monthoct                      4.576e-01  5.868e-01   0.780   0.4355    
## monthsep                      2.028e-01  6.712e-01   0.302   0.7626    
## day_of_weekmon               -1.607e-01  2.300e-01  -0.699   0.4847    
## day_of_weekthu               -6.108e-02  2.324e-01  -0.263   0.7927    
## day_of_weektue               -1.351e-01  2.369e-01  -0.570   0.5685    
## day_of_weekwed               -7.853e-02  2.378e-01  -0.330   0.7412    
## campaign                     -7.483e-02  4.367e-02  -1.714   0.0866 .  
## previous                      2.291e-01  2.015e-01   1.137   0.2555    
## poutcomenonexistent           7.824e-01  3.485e-01   2.245   0.0248 *  
## poutcomesuccess               1.812e+00  3.162e-01   5.729 1.01e-08 ***
## emp.var.rate                 -7.817e-01  4.857e-01  -1.609   0.1075    
## cons.price.idx                1.178e+00  8.736e-01   1.349   0.1775    
## cons.conf.idx                 4.049e-02  2.984e-02   1.357   0.1748    
## euribor3m                     6.934e-02  4.933e-01   0.141   0.8882    
## nr.employed                   8.207e-04  1.114e-02   0.074   0.9413    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.8  on 2316  degrees of freedom
## Residual deviance: 1331.9  on 2272  degrees of freedom
## AIC: 1421.9
## 
## Number of Fisher Scoring iterations: 11
# Variance inflation factors for each term of the full model g1
vif(g1)
##                      GVIF Df GVIF^(1/(2*Df))
## age              2.042470  1        1.429150
## job              6.443548 10        1.097631
## marital          1.479163  2        1.102818
## education        3.357011  5        1.128744
## default          1.000003  1        1.000002
## housing          1.074458  1        1.036561
## loan             1.035792  1        1.017739
## contact          2.383798  1        1.543955
## month           93.327292  9        1.286604
## day_of_week      1.168171  4        1.019620
## campaign         1.064350  1        1.031673
## previous         4.098425  1        2.024457
## poutcome         4.413599  2        1.449433
## emp.var.rate   129.160437  1       11.364877
## cons.price.idx  62.820665  1        7.925949
## cons.conf.idx    5.839559  1        2.416518
## euribor3m      153.227292  1       12.378501
## nr.employed    185.061496  1       13.603731
# Multicollinearity rule: a term is flagged when the square of the last VIF
# column, GVIF^(1/(2*Df)), exceeds 10.
# Refit without emp.var.rate, cons.price.idx, euribor3m and nr.employed.
g2 <- glm(
  y ~ age + job + marital + education + default + housing + loan + contact +
    month + day_of_week + campaign + previous + poutcome + cons.conf.idx,
  family = binomial,
  data = Btrain
)
summary(g2)
## 
## Call:
## glm(formula = y ~ age + job + marital + education + default + 
##     housing + loan + contact + month + day_of_week + campaign + 
##     previous + poutcome + cons.conf.idx, family = binomial, data = Btrain)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0322  -0.4765  -0.3844  -0.2454   2.8590  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.160208   1.119400  -1.036  0.29999    
## age                            0.018839   0.008877   2.122  0.03383 *  
## jobbluecollar                  0.168841   0.287526   0.587  0.55705    
## jobentrepreneur               -1.021741   0.628333  -1.626  0.10393    
## jobhousemaid                   0.157648   0.491831   0.321  0.74857    
## jobmanagement                 -0.520983   0.304542  -1.711  0.08713 .  
## jobretired                    -0.430118   0.401267  -1.072  0.28377    
## jobselfemployed               -0.477189   0.413727  -1.153  0.24875    
## jobservices                   -0.065224   0.306366  -0.213  0.83141    
## jobstudent                     0.793640   0.446381   1.778  0.07541 .  
## jobtechnician                  0.015248   0.230292   0.066  0.94721    
## jobunemployed                  0.345398   0.411445   0.839  0.40120    
## maritalmarried                -0.144562   0.244344  -0.592  0.55410    
## maritalsingle                  0.108589   0.273317   0.397  0.69115    
## educationbasic.6y              0.613411   0.473963   1.294  0.19559    
## educationbasic.9y              0.375210   0.393141   0.954  0.33989    
## educationhigh.school           0.565601   0.374249   1.511  0.13071    
## educationprofessional.course   0.667561   0.394129   1.694  0.09031 .  
## educationuniversity.degree     0.747415   0.378561   1.974  0.04834 *  
## defaultyes                   -10.098315 324.744149  -0.031  0.97519    
## housingyes                    -0.004615   0.147049  -0.031  0.97496    
## loanyes                       -0.111740   0.201335  -0.555  0.57890    
## contacttelephone              -1.763254   0.242901  -7.259 3.90e-13 ***
## monthaug                      -1.019246   0.389277  -2.618  0.00884 ** 
## monthdec                       0.070405   0.807249   0.087  0.93050    
## monthjul                      -0.687185   0.345542  -1.989  0.04673 *  
## monthjun                       0.822571   0.360265   2.283  0.02242 *  
## monthmar                       1.985356   0.487109   4.076 4.59e-05 ***
## monthmay                      -0.316261   0.317376  -0.996  0.31901    
## monthnov                      -0.783809   0.358960  -2.184  0.02899 *  
## monthoct                       0.687158   0.482276   1.425  0.15421    
## monthsep                       0.281533   0.515757   0.546  0.58516    
## day_of_weekmon                -0.114120   0.224988  -0.507  0.61200    
## day_of_weekthu                -0.065997   0.227209  -0.290  0.77146    
## day_of_weektue                -0.106158   0.231208  -0.459  0.64613    
## day_of_weekwed                -0.063015   0.231705  -0.272  0.78565    
## campaign                      -0.091075   0.041905  -2.173  0.02975 *  
## previous                       0.522585   0.201434   2.594  0.00948 ** 
## poutcomenonexistent            0.779988   0.350580   2.225  0.02609 *  
## poutcomesuccess                2.204888   0.314099   7.020 2.22e-12 ***
## cons.conf.idx                  0.050201   0.018859   2.662  0.00777 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.8  on 2316  degrees of freedom
## Residual deviance: 1390.1  on 2276  degrees of freedom
## AIC: 1472.1
## 
## Number of Fisher Scoring iterations: 11
# Variance inflation factors for each term of the reduced model g2
vif(g2)
##                   GVIF Df GVIF^(1/(2*Df))
## age           1.916263  1        1.384292
## job           5.384075 10        1.087816
## marital       1.462097  2        1.099624
## education     3.135346  5        1.121059
## default       1.000003  1        1.000001
## housing       1.063613  1        1.031316
## loan          1.027543  1        1.013678
## contact       1.583114  1        1.258219
## month         3.985011  9        1.079834
## day_of_week   1.130158  4        1.015412
## campaign      1.052298  1        1.025816
## previous      3.990697  1        1.997673
## poutcome      4.465515  2        1.453677
## cons.conf.idx 2.146600  1        1.465128
# Model 3: logistic regression on variables picked up by the AIC search,
# entered as whole factors (poutcome, month, contact) rather than single
# dummy levels; job is not included.
g3 <- glm(
  y ~ nr.employed + poutcome + month + contact + cons.conf.idx + campaign,
  family = binomial,
  data = Btrain
)
vif(g3)
##                   GVIF Df GVIF^(1/(2*Df))
## nr.employed   1.895607  1        1.376811
## poutcome      1.353166  2        1.078544
## month         3.540725  9        1.072766
## contact       1.402960  1        1.184466
## cons.conf.idx 1.909875  1        1.381982
## campaign      1.038116  1        1.018880
# Predicted probabilities for the first model (g2) and classes at the default
# 0.5 threshold.
Btrain$PredProb <- predict.glm(g2, newdata = Btrain, type = "response")
Btrain$Predy <- ifelse(Btrain$PredProb >= 0.5, "yes", "no")
# caret::confusionMatrix() takes the predictions as `data` and the truth as
# `reference`; passing them the other way round transposes the table and
# mislabels sensitivity/specificity. Predictions are given the same levels as
# y so a threshold that yields a single class still works, and "yes" is the
# positive class, matching the find_cutoff() calls in this file.
# NOTE: output rendered before this fix shows the transposed table; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1996   35
##        yes  216   70
##                                          
##                Accuracy : 0.8917         
##                  95% CI : (0.8783, 0.904)
##     No Information Rate : 0.9547         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.3125         
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.9024         
##             Specificity : 0.6667         
##          Pos Pred Value : 0.9828         
##          Neg Pred Value : 0.2448         
##              Prevalence : 0.9547         
##          Detection Rate : 0.8615         
##    Detection Prevalence : 0.8766         
##       Balanced Accuracy : 0.7845         
##                                          
##        'Positive' Class : no             
## 
# ROC CURVE MODEL 1
# Pair the first model's predicted probabilities with the observed labels,
# then trace true-positive rate against false-positive rate.
lgPredObj <- prediction(predictions = Btrain$PredProb, labels = Btrain$y)
lgPerfObj <- performance(lgPredObj, measure = "tpr", x.measure = "fpr")
plot(lgPerfObj, col = 2, lwd = 2, main = "ROC Curve")
# Dotted diagonal drawn as a visual reference line (intercept 0, slope 1)
abline(a = 0, b = 1, col = "black", lty = 3, lwd = 2)

# Scan classification cutoffs 0.01, 0.02, ..., 1.00 and report the best one
# under two criteria: Youden's index (sensitivity + specificity - 1, maximised)
# and a weighted misclassification cost (minimised).
#
# Arguments:
#   actual_value        vector of observed class labels
#   positive_class_name label of the positive class
#   negitive_class_name label of the negative class
#   pred_probability    predicted probabilities of the positive class, in the
#                       same order as actual_value
#   pred_01             cost weight of predicting positive for an actual negative
#   pred_10             cost weight of predicting negative for an actual positive
#
# Prints both ideal cutoffs with cat() and returns a ggplot object showing the
# two criteria as a function of the cutoff.
find_cutoff <- function(actual_value, positive_class_name, negitive_class_name, pred_probability, pred_01 = 1, pred_10 = 1) {
  # Fix the level order (negative first, positive second) for both factors so
  # the 2x2 table always has all four cells, even when every prediction falls
  # in one class, and never depends on the alphabetical order of the labels.
  class_levels <- c(negitive_class_name, positive_class_name)
  actual <- factor(as.character(actual_value), levels = class_levels)

  cutoff <- seq_len(100) / 100
  n_cutoffs <- length(cutoff)
  pred00 <- numeric(n_cutoffs) # actual negative predicted negative (specificity)
  pred01 <- numeric(n_cutoffs) # actual negative predicted positive (false positive rate)
  pred10 <- numeric(n_cutoffs) # actual positive predicted negative (false negative rate)
  pred11 <- numeric(n_cutoffs) # actual positive predicted positive (sensitivity)

  for (i in seq_along(cutoff)) {
    predicted <- factor(
      ifelse(pred_probability >= cutoff[i], positive_class_name, negitive_class_name),
      levels = class_levels
    )
    # Rows are predictions, columns are actual classes
    tbl <- table(predicted, actual)
    n_negative <- sum(tbl[, 1])
    n_positive <- sum(tbl[, 2])

    pred00[i] <- tbl[1, 1] / n_negative
    pred01[i] <- tbl[2, 1] / n_negative
    pred10[i] <- tbl[1, 2] / n_positive
    pred11[i] <- tbl[2, 2] / n_positive
  }

  msclaf_cost <- pred10 * pred_10 + pred01 * pred_01
  youden_index <- pred11 + pred00 - 1

  df.cost.table <- data.frame(cutoff, pred10, pred01, pred11, pred00, youden_index, msclaf_cost)
  cat(paste0('The ideal cutoff for:\n Yodens Index approach : ', which.max(df.cost.table$youden_index)/100))
  cat(paste0('\n Cost based approach : ', which.min(df.cost.table$msclaf_cost)/100))
  # The two lines are distinguished by the colour aesthetic, so the legend
  # title has to be set through `color` (a `fill` title is never shown).
  ggplot(df.cost.table, aes(x = cutoff)) +
    geom_line(aes(y = youden_index, color = 'yoden index')) +
    geom_line(aes(y = msclaf_cost, color = 'misclassification cost')) +
    labs(x = 'Cutoff p value', y = 'Index', title = 'Cutoff p value', color = 'Plot') +
    theme_minimal() + theme(legend.position = "bottom")
}

# CUTOFF MODEL
# Search cutoffs for the first model's probabilities; classifying an actual
# "no" as "yes" is weighted three times as heavily as the opposite error.
find_cutoff(
  actual_value = Btrain$y,
  positive_class_name = "yes",
  negitive_class_name = "no",
  pred_probability = Btrain$PredProb,
  pred_01 = 3,
  pred_10 = 1
)
## The ideal cutoff for:
##  Yodens Index approach : 0.16
##  Cost based approach : 0.2
## Warning: Removed 2 row(s) containing missing values (geom_path).

## Warning: Removed 2 row(s) containing missing values (geom_path).

# Re-classify the first model's probabilities at a 0.14 cutoff.
# NOTE(review): the rendered find_cutoff() output above reports 0.16 for
# Youden's index; confirm that 0.14 is the intended threshold.
Btrain$Predy <- ifelse(Btrain$PredProb >= 0.14, "yes", "no")
# caret::confusionMatrix() takes the predictions as `data` and the truth as
# `reference`; the levels of y are reused so both factors always match, and
# "yes" is the positive class, matching the find_cutoff() calls in this file.
# NOTE: output rendered before this fix shows the transposed table; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1766  265
##        yes  137  149
##                                           
##                Accuracy : 0.8265          
##                  95% CI : (0.8105, 0.8417)
##     No Information Rate : 0.8213          
##     P-Value [Acc > NIR] : 0.2676          
##                                           
##                   Kappa : 0.3275          
##                                           
##  Mcnemar's Test P-Value : 2.386e-10       
##                                           
##             Sensitivity : 0.9280          
##             Specificity : 0.3599          
##          Pos Pred Value : 0.8695          
##          Neg Pred Value : 0.5210          
##              Prevalence : 0.8213          
##          Detection Rate : 0.7622          
##    Detection Prevalence : 0.8766          
##       Balanced Accuracy : 0.6440          
##                                           
##        'Positive' Class : no              
## 
#### AIC MODEL
# Predicted probabilities for g3 and classes at the default 0.5 threshold.
Btrain$PredProb3 <- predict.glm(g3, newdata = Btrain, type = "response")
Btrain$Predy3 <- ifelse(Btrain$PredProb3 >= 0.5, "yes", "no")
# caret::confusionMatrix() takes the predictions as `data` and the truth as
# `reference`; the levels of y are reused so both factors always match, and
# "yes" is the positive class, matching the find_cutoff() calls in this file.
# NOTE: output rendered before this fix shows the transposed table; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy3, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1997   34
##        yes  216   70
##                                           
##                Accuracy : 0.8921          
##                  95% CI : (0.8788, 0.9044)
##     No Information Rate : 0.9551          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3138          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9024          
##             Specificity : 0.6731          
##          Pos Pred Value : 0.9833          
##          Neg Pred Value : 0.2448          
##              Prevalence : 0.9551          
##          Detection Rate : 0.8619          
##    Detection Prevalence : 0.8766          
##       Balanced Accuracy : 0.7877          
##                                           
##        'Positive' Class : no              
## 
# Model 3 Cutoff
# Same cutoff search as for the first model, applied to g3's probabilities
# with the same 3:1 cost weighting.
find_cutoff(
  actual_value = Btrain$y,
  positive_class_name = "yes",
  negitive_class_name = "no",
  pred_probability = Btrain$PredProb3,
  pred_01 = 3,
  pred_10 = 1
)
## The ideal cutoff for:
##  Yodens Index approach : 0.14
##  Cost based approach : 0.25
## Warning: Removed 6 row(s) containing missing values (geom_path).
## Warning: Removed 6 row(s) containing missing values (geom_path).

# Re-classify g3's probabilities at a 0.13 cutoff.
# NOTE(review): the rendered find_cutoff() output above reports 0.14 for
# Youden's index; confirm that 0.13 is the intended threshold.
Btrain$Predy3 <- ifelse(Btrain$PredProb3 >= 0.13, "yes", "no")
# caret::confusionMatrix() takes the predictions as `data` and the truth as
# `reference`; the levels of y are reused so both factors always match, and
# "yes" is the positive class, matching the find_cutoff() calls in this file.
# NOTE: output rendered before this fix shows the transposed table; re-knit.
confusionMatrix(
  data = factor(Btrain$Predy3, levels = levels(Btrain$y)),
  reference = Btrain$y,
  positive = "yes"
)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  1667  364
##        yes   97  189
##                                           
##                Accuracy : 0.801           
##                  95% CI : (0.7842, 0.8171)
##     No Information Rate : 0.7613          
##     P-Value [Acc > NIR] : 2.719e-06       
##                                           
##                   Kappa : 0.3438          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.9450          
##             Specificity : 0.3418          
##          Pos Pred Value : 0.8208          
##          Neg Pred Value : 0.6608          
##              Prevalence : 0.7613          
##          Detection Rate : 0.7195          
##    Detection Prevalence : 0.8766          
##       Balanced Accuracy : 0.6434          
##                                           
##        'Positive' Class : no              
## 

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.