OUTLINE



1. Problem Description


ABC Bank wants to sell it’s term deposit product to customers and before launching the product they want to develop a model which help them in understanding whether a particular customer will buy their product or not (based on customer’s past interaction with bank or other Financial Institution).


2. Data understanding

library(kableExtra)
bank<- read.csv(file = "bank-full.csv", head = T, sep=";")
kbl(psych::headTail(bank,10,10), caption="First and last 10 variables in the data", booktabs = T) %>% kable_styling(latex_options = "striped",font_size=10)
First and last 10 variables in the data
age job marital education default balance housing loan contact day month duration campaign pdays previous poutcome y
1 58 management married tertiary no 2143 yes no unknown 5 may 261 1 -1 0 unknown no
2 44 technician single secondary no 29 yes no unknown 5 may 151 1 -1 0 unknown no
3 33 entrepreneur married secondary no 2 yes yes unknown 5 may 76 1 -1 0 unknown no
4 47 blue-collar married unknown no 1506 yes no unknown 5 may 92 1 -1 0 unknown no
5 33 unknown single unknown no 1 no no unknown 5 may 198 1 -1 0 unknown no
6 35 management married tertiary no 231 yes no unknown 5 may 139 1 -1 0 unknown no
7 28 management single tertiary no 447 yes yes unknown 5 may 217 1 -1 0 unknown no
8 42 entrepreneur divorced tertiary yes 2 yes no unknown 5 may 380 1 -1 0 unknown no
9 58 retired married primary no 121 yes no unknown 5 may 50 1 -1 0 unknown no
10 43 technician single secondary no 593 yes no unknown 5 may 55 1 -1 0 unknown no
NA NA NA NA NA NA NA NA NA NA
45202 53 management married tertiary no 583 no no cellular 17 nov 226 1 184 4 success yes
45203 34 admin. single secondary no 557 no no cellular 17 nov 224 1 -1 0 unknown yes
45204 23 student single tertiary no 113 no no cellular 17 nov 266 1 -1 0 unknown yes
45205 73 retired married secondary no 2850 no no cellular 17 nov 300 1 40 8 failure yes
45206 25 technician single secondary no 505 no yes cellular 17 nov 386 2 -1 0 unknown yes
45207 51 technician married tertiary no 825 no no cellular 17 nov 977 3 -1 0 unknown yes
45208 71 retired divorced primary no 1729 no no cellular 17 nov 456 2 -1 0 unknown yes
45209 72 retired married secondary no 5715 no no cellular 17 nov 1127 5 184 3 success yes
45210 57 blue-collar married secondary no 668 no no telephone 17 nov 508 4 -1 0 unknown no
45211 37 entrepreneur married secondary no 2971 no no cellular 17 nov 361 2 188 11 other no
bank$job<-as.factor(bank$job)
bank$marital<-as.factor(bank$marital)
bank$education<-as.factor(bank$education)
bank$default<-as.factor(bank$default)
bank$housing<-as.factor(bank$housing)

bank$loan<-as.factor(bank$loan)
bank$contact<-as.factor(bank$contact)
bank$poutcome<-as.factor(bank$poutcome)
#bank$ y<-as.factor(bank$ y)
dim(bank)
## [1] 45211    17
summary(bank)
##       age                 job           marital          education    
##  Min.   :18.00   blue-collar:9732   divorced: 5207   primary  : 6851  
##  1st Qu.:33.00   management :9458   married :27214   secondary:23202  
##  Median :39.00   technician :7597   single  :12790   tertiary :13301  
##  Mean   :40.94   admin.     :5171                    unknown  : 1857  
##  3rd Qu.:48.00   services   :4154                                     
##  Max.   :95.00   retired    :2264                                     
##                  (Other)    :6835                                     
##  default        balance       housing      loan            contact     
##  no :44396   Min.   : -8019   no :20081   no :37967   cellular :29285  
##  yes:  815   1st Qu.:    72   yes:25130   yes: 7244   telephone: 2906  
##              Median :   448                           unknown  :13020  
##              Mean   :  1362                                            
##              3rd Qu.:  1428                                            
##              Max.   :102127                                            
##                                                                        
##       day           month              duration         campaign     
##  Min.   : 1.00   Length:45211       Min.   :   0.0   Min.   : 1.000  
##  1st Qu.: 8.00   Class :character   1st Qu.: 103.0   1st Qu.: 1.000  
##  Median :16.00   Mode  :character   Median : 180.0   Median : 2.000  
##  Mean   :15.81                      Mean   : 258.2   Mean   : 2.764  
##  3rd Qu.:21.00                      3rd Qu.: 319.0   3rd Qu.: 3.000  
##  Max.   :31.00                      Max.   :4918.0   Max.   :63.000  
##                                                                      
##      pdays          previous           poutcome          y            
##  Min.   : -1.0   Min.   :  0.0000   failure: 4901   Length:45211      
##  1st Qu.: -1.0   1st Qu.:  0.0000   other  : 1840   Class :character  
##  Median : -1.0   Median :  0.0000   success: 1511   Mode  :character  
##  Mean   : 40.2   Mean   :  0.5803   unknown:36959                     
##  3rd Qu.: -1.0   3rd Qu.:  0.0000                                     
##  Max.   :871.0   Max.   :275.0000                                     
## 
str(bank)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : Factor w/ 12 levels "admin.","blue-collar",..: 5 10 3 2 12 5 5 3 6 10 ...
##  $ marital  : Factor w/ 3 levels "divorced","married",..: 2 3 2 2 3 2 3 1 2 3 ...
##  $ education: Factor w/ 4 levels "primary","secondary",..: 3 2 2 4 4 3 3 3 1 2 ...
##  $ default  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 2 1 1 ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : Factor w/ 2 levels "no","yes": 2 2 2 2 1 2 2 2 2 2 ...
##  $ loan     : Factor w/ 2 levels "no","yes": 1 1 2 1 1 1 2 1 1 1 ...
##  $ contact  : Factor w/ 3 levels "cellular","telephone",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : Factor w/ 4 levels "failure","other",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ y        : chr  "no" "no" "no" "no" ...




Output variable (desired target): - 17. y :has the client subscribed a term deposit? (binary: ‘yes’,‘no’)


Exploratory data Analysis


Is there duplicated rows in the data?

library(dplyr)
# get the row numbers of duplicated rows
duplicated_rows <- data_frame(duplicated = duplicated(bank), row = 1:nrow(bank)) %>%
    filter(duplicated == T)

count(duplicated_rows)
## # A tibble: 1 x 1
##       n
##   <int>
## 1     0
  • As seen there is not duplicated rows in the data.


Is there any missing value in the data?

library(naniar)
vis_miss(bank)

  • As seen there is no missing value in the data.


Frequencies of the categorical variables and distributions of the numeric variables

library(funModeling)
profiling_num(bank) 
##   variable         mean     std_dev variation_coef p_01 p_05 p_25 p_50 p_75
## 1      age   40.9362102   10.618762      0.2593978   23   27   33   39   48
## 2  balance 1362.2720577 3044.765829      2.2350644 -627 -172   72  448 1428
## 3      day   15.8064188    8.322476      0.5265251    2    3    8   16   21
## 4 duration  258.1630798  257.527812      0.9975393   11   35  103  180  319
## 5 campaign    2.7638407    3.098021      1.1209115    1    1    1    2    3
## 6    pdays   40.1978280  100.128746      2.4908994   -1   -1   -1   -1   -1
## 7 previous    0.5803234    2.303441      3.9692371    0    0    0    0    0
##   p_95    p_99    skewness    kurtosis  iqr              range_98  range_80
## 1   59    71.0  0.68479520    3.319402   15              [23, 71]  [29, 56]
## 2 5768 13164.9  8.36003095  143.735848 1356       [-627, 13164.9] [0, 3574]
## 3   29    31.0  0.09307593    1.940087   13               [2, 31]   [5, 28]
## 4  751  1269.0  3.14421378   21.151775  216            [11, 1269] [58, 548]
## 5    8    16.0  4.89848764   42.245178    2               [1, 16]    [1, 5]
## 6  317   370.0  2.61562869    9.934296    0             [-1, 370] [-1, 185]
## 7    3     8.9 41.84506609 4509.362118    0 [0, 8.90000000000146]    [0, 2]
plot_num(bank) 

freq(bank) 

##              job frequency percentage cumulative_perc
## 1    blue-collar      9732      21.53           21.53
## 2     management      9458      20.92           42.45
## 3     technician      7597      16.80           59.25
## 4         admin.      5171      11.44           70.69
## 5       services      4154       9.19           79.88
## 6        retired      2264       5.01           84.89
## 7  self-employed      1579       3.49           88.38
## 8   entrepreneur      1487       3.29           91.67
## 9     unemployed      1303       2.88           94.55
## 10     housemaid      1240       2.74           97.29
## 11       student       938       2.07           99.36
## 12       unknown       288       0.64          100.00

##    marital frequency percentage cumulative_perc
## 1  married     27214      60.19           60.19
## 2   single     12790      28.29           88.48
## 3 divorced      5207      11.52          100.00

##   education frequency percentage cumulative_perc
## 1 secondary     23202      51.32           51.32
## 2  tertiary     13301      29.42           80.74
## 3   primary      6851      15.15           95.89
## 4   unknown      1857       4.11          100.00

##   default frequency percentage cumulative_perc
## 1      no     44396       98.2            98.2
## 2     yes       815        1.8           100.0

##   housing frequency percentage cumulative_perc
## 1     yes     25130      55.58           55.58
## 2      no     20081      44.42          100.00

##   loan frequency percentage cumulative_perc
## 1   no     37967      83.98           83.98
## 2  yes      7244      16.02          100.00

##     contact frequency percentage cumulative_perc
## 1  cellular     29285      64.77           64.77
## 2   unknown     13020      28.80           93.57
## 3 telephone      2906       6.43          100.00

##    month frequency percentage cumulative_perc
## 1    may     13766      30.45           30.45
## 2    jul      6895      15.25           45.70
## 3    aug      6247      13.82           59.52
## 4    jun      5341      11.81           71.33
## 5    nov      3970       8.78           80.11
## 6    apr      2932       6.49           86.60
## 7    feb      2649       5.86           92.46
## 8    jan      1403       3.10           95.56
## 9    oct       738       1.63           97.19
## 10   sep       579       1.28           98.47
## 11   mar       477       1.06           99.53
## 12   dec       214       0.47          100.00

##   poutcome frequency percentage cumulative_perc
## 1  unknown     36959      81.75           81.75
## 2  failure      4901      10.84           92.59
## 3    other      1840       4.07           96.66
## 4  success      1511       3.34          100.00

##     y frequency percentage cumulative_perc
## 1  no     39922       88.3            88.3
## 2 yes      5289       11.7           100.0
## [1] "Variables processed: job, marital, education, default, housing, loan, contact, month, poutcome, y"


Does numeric variables have any outlier and what will be shape of the variables exclude the outliers?

num<-bank[,c(1,6,12:13)]
dlookr::plot_outlier(num)

  • As seen all of the 4 numeric variables have outlier.
  • Shapes of the variables changed when outliers removed.


Is there any significant relationship between numeric variables and y, if y taken as numeric (1:no, 0:yes)?

for(i in 1: nrow(bank)){
  if(bank$y[i]=="yes"){
    bank$y[i]=1
  }
  else{
   bank$y[i]=0 
  }
}

bank$y<-as.numeric(bank$y)
# correlations as taking y numeric( 0 and 1)
library(corrplot)
library(PerformanceAnalytics)

numeric<-bank[,c(1,6,12:15,17)]

M <- cor(numeric[,])
res1 <- cor(M, method="spearman")
corrplot::corrplot(res1, method= "color", order = "hclust", addCoef.col = "black", 
         tl.col="black", tl.srt=45
)


Positive correlations are shown in blue and negative correlations in red color. Color intensity is proportional to the correlation coefficients. Lets look at the correlation matrix to examine which variables have strong relationship with response variable y.

  • Between y and duration, there is strong positive relationship.
  • Between y and campaign, there is strong negative relationship.


We can also see the relationship between other variables (covariates).

library(GGally)
ggpairs(numeric)

We can see from this plot we can see that if the relationship between variables significant or not. We can see that all of the relationship between covariates and y are significant.


Is there any significant relationship between categorical variables and y?

\[ H_0: There\ is\ not\ significant\ relationship\ between\ variables\ (Variables\ are\ independent) \]

yes<- bank[bank$y=="yes",]
no<- bank[bank$y=="no",]


y<-yes %>% 
  select(job, y) %>%  
  group_by(job) %>% 
  summarise(n = n())

n<-no %>% 
  select(job, y) %>%  
  group_by(job) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
job<-rbind(No,Yes)

library(ggplot2)
a<-ggplot(data=job, aes(x=job, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=0.1, color="black",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Pastel1")+
  theme_grey()+labs(title = "Job to subscribed a term deposit")+ coord_flip()+ theme(legend.position="top")
y<-yes %>% 
  select(marital, y) %>%  
  group_by(marital) %>% 
  summarise(n = n())

n<-no %>% 
  select(marital, y) %>%  
  group_by(marital) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
marital<-rbind(No,Yes)


b<-ggplot(data=marital, aes(x=marital, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=1, color="black",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Pastel1")+
  theme_grey()+labs(title = "marital to subscribed a term deposit")+ coord_flip()+ theme(legend.position="none")
y<-yes %>% 
  select(education, y) %>%  
  group_by(education) %>% 
  summarise(n = n())

n<-no %>% 
  select(education, y) %>%  
  group_by(education) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
education<-rbind(No,Yes)


c<-ggplot(data=education, aes(x=education, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=0.2, color="black",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Pastel1")+
  theme_grey()+labs(title = "education to subscribed a term deposit")+ coord_flip()+ theme(legend.position="none")
y<-yes %>% 
  select(default, y) %>%  
  group_by(default) %>% 
  summarise(n = n())

n<-no %>% 
  select(default, y) %>%  
  group_by(default) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
default<-rbind(No,Yes)


d<-ggplot(data=default, aes(x=default, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=1, color="black",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Pastel1")+
  theme_grey()+labs(title = "default to subscribed a term deposit")+ coord_flip()+ theme(legend.position="none")
y<-yes %>% 
  select(housing, y) %>%  
  group_by(housing) %>% 
  summarise(n = n())

n<-no %>% 
  select(housing, y) %>%  
  group_by(housing) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
housing<-rbind(No,Yes)


e<-ggplot(data=housing, aes(x=housing, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=1.1, color="white",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Set1")+
  theme_grey()+labs(title = "housing to subscribed a term deposit")+ coord_flip()+ theme(legend.position="top")
y<-yes %>% 
  select(loan, y) %>%  
  group_by(loan) %>% 
  summarise(n = n())

n<-no %>% 
  select(loan, y) %>%  
  group_by(loan) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
loan<-rbind(No,Yes)


f<-ggplot(data=loan, aes(x=loan, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=0.4, color="black",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Set1")+
  theme_grey()+labs(title = "loan to subscribed a term deposit")+ coord_flip()+ theme(legend.position="none")
y<-yes %>% 
  select(contact, y) %>%  
  group_by(contact) %>% 
  summarise(n = n())

n<-no %>% 
  select(contact, y) %>%  
  group_by(contact) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
contact<-rbind(No,Yes)


g<-ggplot(data=contact, aes(x=contact, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=0.2, color="black",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Set1")+
  theme_grey()+labs(title = "contact to subscribed a term deposit")+ coord_flip()+ theme(legend.position="none")
y<-yes %>% 
  select(poutcome, y) %>%  
  group_by(poutcome) %>% 
  summarise(n = n())

n<-no %>% 
  select(poutcome, y) %>%  
  group_by(poutcome) %>% 
  summarise(n = n())


No<-cbind(n,subscribed=rep("No",12))
Yes<-cbind(y,subscribed=rep("Yes",12))
poutcome<-rbind(No,Yes)


h<-ggplot(data=poutcome, aes(x=poutcome, y=n, fill=subscribed)) +
  geom_bar(stat="identity", position=position_dodge())+
  geom_text(aes(label=n), vjust=0.4,hjust=0.2, color="black",
            position = position_dodge(0.9), size=2.7, fontface="bold")+
  scale_fill_brewer(palette="Set1")+
  theme_grey()+labs(title = "poutcome to subscribed a term deposit")+ coord_flip()+ theme(legend.position="none")
library(ggpubr)
ggarrange(a,b,c,d,nrow=2,ncol=2)

ggarrange(e,f,g,h,nrow=2,ncol=2)


chis <- lapply(bank[,c(2:5,7:9,16)], function(x) chisq.test(bank[,c(17)],x,simulate.p.value=TRUE))


kbl(do.call(rbind, chis)[,c(1,3)], caption="Chi-Square table", booktabs = T) %>% kable_styling(latex_options = "striped",font_size=10)
Chi-Square table
statistic p.value
job c(X-squared = 836.105487747197) 0.000499750124937531
marital c(X-squared = 196.49594565604) 0.000499750124937531
education c(X-squared = 238.923506164076) 0.000499750124937531
default c(X-squared = 22.72350213915) 0.000499750124937531
housing c(X-squared = 875.69371810544) 0.000499750124937531
loan c(X-squared = 210.194934196027) 0.000499750124937531
contact c(X-squared = 1035.71422535629) 0.000499750124937531
poutcome c(X-squared = 4391.50658876866) 0.000499750124937531
  • As seen all p-values are smaller than the significance level of 0.05, so there is significant relationship between categorical variables and y.


MODELING

Data Preparation

We see that in EDA part, in response variable, “no” class proportion is 88.3 while “yes” class proportion is 11.7. There is huge difference difference between two class. Thus, we have imbalance data and it causes reduction in accuracy of ML algorithms.

What are the methods to deal with imbalanced data sets ?

The methods are widely known as ‘Sampling Methods’. Generally, these methods aim to modify an imbalanced data into balanced distribution using some mechanism. The modification occurs by altering the size of original data set and provide the same proportion of balance.


Below are the methods used to treat imbalanced datasets:


  • Undersampling

  • Oversampling

  • Synthetic Data Generation

  • Cost Sensitive Learning

  • I applied the both undersampling and oversampling since you we’ve lost significant information from the sample when doing undersampling.

  • In this case, the minority class is oversampled with replacement and majority class is undersampled without replacement.

library(ROSE)
data_balanced_both <- ovun.sample(y ~ ., data = bank, method = "both", p=0.5, N=45211, seed = 1)$data

After under and oversampling number of response class be:

table(data_balanced_both$y)
## 
##     0     1 
## 22628 22583
prop.table(table(data_balanced_both$y))
## 
##         0         1 
## 0.5004977 0.4995023
  • After over and undersampling data divided into two part; training and test set.
  • 80% of the data used as training set and 20% of the data used as test set.
set.seed(123)
split <- initial_split(data_balanced_both, prop = .8)
train1 <- training(split)
test  <- testing(split)
nrow(train1)
## [1] 36169
nrow(test)
## [1] 9042
prop.table(table(train1$y))
## 
##         0         1 
## 0.5006221 0.4993779
  • In all of models, y taken as response variable (by taking 0:No, 1:Yes), all of the other variables taking as covariate.


LOGISTIC REGRESSION

regression<- glm(y~., data = train1, family  = binomial(link = "logit"))
summary(regression)
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.1313  -0.5893  -0.0601   0.5988   2.9451  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -7.460e-01  1.503e-01  -4.964 6.89e-07 ***
## age                -9.109e-04  1.798e-03  -0.507  0.61236    
## jobblue-collar     -3.345e-01  5.796e-02  -5.771 7.87e-09 ***
## jobentrepreneur    -3.902e-01  9.889e-02  -3.946 7.94e-05 ***
## jobhousemaid       -4.098e-01  1.049e-01  -3.909 9.29e-05 ***
## jobmanagement      -9.117e-02  6.011e-02  -1.517  0.12935    
## jobretired          4.064e-01  8.213e-02   4.949 7.47e-07 ***
## jobself-employed   -2.232e-01  9.052e-02  -2.466  0.01365 *  
## jobservices        -2.869e-01  6.739e-02  -4.258 2.07e-05 ***
## jobstudent          7.235e-01  9.822e-02   7.366 1.76e-13 ***
## jobtechnician      -1.004e-01  5.569e-02  -1.803  0.07134 .  
## jobunemployed      -8.228e-02  9.312e-02  -0.884  0.37694    
## jobunknown         -2.553e-01  1.915e-01  -1.333  0.18240    
## maritalmarried     -1.878e-01  4.783e-02  -3.927 8.62e-05 ***
## maritalsingle       1.205e-01  5.497e-02   2.192  0.02838 *  
## educationsecondary  2.280e-01  5.191e-02   4.391 1.13e-05 ***
## educationtertiary   4.179e-01  6.126e-02   6.823 8.93e-12 ***
## educationunknown    2.802e-01  8.518e-02   3.290  0.00100 ** 
## defaultyes          1.003e-01  1.218e-01   0.824  0.41003    
## balance             2.341e-05  5.007e-06   4.675 2.94e-06 ***
## housingyes         -7.052e-01  3.480e-02 -20.267  < 2e-16 ***
## loanyes            -5.427e-01  4.672e-02 -11.615  < 2e-16 ***
## contacttelephone   -4.067e-02  6.089e-02  -0.668  0.50417    
## contactunknown     -1.718e+00  5.404e-02 -31.787  < 2e-16 ***
## day                 4.870e-03  1.976e-03   2.465  0.01371 *  
## monthaug           -9.246e-01  6.292e-02 -14.696  < 2e-16 ***
## monthdec            6.884e-01  1.789e-01   3.847  0.00012 ***
## monthfeb           -1.044e-01  7.124e-02  -1.465  0.14297    
## monthjan           -1.302e+00  9.564e-02 -13.613  < 2e-16 ***
## monthjul           -1.078e+00  6.312e-02 -17.071  < 2e-16 ***
## monthjun            3.044e-01  7.397e-02   4.116 3.86e-05 ***
## monthmar            1.715e+00  1.202e-01  14.264  < 2e-16 ***
## monthmay           -6.591e-01  6.013e-02 -10.962  < 2e-16 ***
## monthnov           -1.025e+00  6.912e-02 -14.826  < 2e-16 ***
## monthoct            1.241e+00  1.022e-01  12.138  < 2e-16 ***
## monthsep            9.476e-01  1.161e-01   8.161 3.33e-16 ***
## duration            5.698e-03  7.200e-05  79.143  < 2e-16 ***
## campaign           -1.067e-01  7.754e-03 -13.760  < 2e-16 ***
## pdays              -4.373e-04  2.416e-04  -1.810  0.07027 .  
## previous            1.928e-02  8.812e-03   2.188  0.02864 *  
## poutcomeother       1.138e-01  7.449e-02   1.527  0.12665    
## poutcomesuccess     2.504e+00  8.438e-02  29.677  < 2e-16 ***
## poutcomeunknown    -2.512e-01  7.915e-02  -3.173  0.00151 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50141  on 36168  degrees of freedom
## Residual deviance: 28827  on 36126  degrees of freedom
## AIC: 28913
## 
## Number of Fisher Scoring iterations: 6
#Construct the Confusion Matrix
prediction <- predict(regression, newdata = test, type = 'response')
pred <- factor(ifelse(prediction <= 0.5,0,1))
result <- caret::confusionMatrix(pred,test$y)
result
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3857  813
##          1  664 3708
##                                           
##                Accuracy : 0.8367          
##                  95% CI : (0.8289, 0.8442)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6733          
##                                           
##  Mcnemar's Test P-Value : 0.0001176       
##                                           
##             Sensitivity : 0.8531          
##             Specificity : 0.8202          
##          Pos Pred Value : 0.8259          
##          Neg Pred Value : 0.8481          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4266          
##    Detection Prevalence : 0.5165          
##       Balanced Accuracy : 0.8367          
##                                           
##        'Positive' Class : 0               
## 
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)
F1-score, Precision and Recall
metrics
Sensitivity 0.8531
Specificity 0.8202
Pos Pred Value 0.8259
Neg Pred Value 0.8481
Precision 0.8259
Recall 0.8531
F1 0.8393
Prevalence 0.5000
Detection Rate 0.4266
Detection Prevalence 0.5165
Balanced Accuracy 0.8367


DECISION TREE

ensemble <- rpart(y~., data = train1, method = 'class')
rpart.plot(ensemble)

#Construct the Confusion Matrix
prediction2 <- predict(ensemble, newdata = test, type = 'class')

result2 <- caret::confusionMatrix(prediction2,test$y)
result2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3377  475
##          1 1144 4046
##                                           
##                Accuracy : 0.8209          
##                  95% CI : (0.8129, 0.8288)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6419          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7470          
##             Specificity : 0.8949          
##          Pos Pred Value : 0.8767          
##          Neg Pred Value : 0.7796          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3735          
##    Detection Prevalence : 0.4260          
##       Balanced Accuracy : 0.8209          
##                                           
##        'Positive' Class : 0               
## 
metrics<-as.data.frame(result2$byClass)
colnames(metrics)<-"metrics"
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)
F1-score, Precision and Recall
metrics
Sensitivity 0.7470
Specificity 0.8949
Pos Pred Value 0.8767
Neg Pred Value 0.7796
Precision 0.8767
Recall 0.7470
F1 0.8066
Prevalence 0.5000
Detection Rate 0.3735
Detection Prevalence 0.4260
Balanced Accuracy 0.8209


XGBOOST

indexes = createDataPartition(bank$y, p=.8, list=F)
train = bank[indexes, ]
test = bank[-indexes, ]

train_x = data.matrix(train[,-17])
train_y = train[,17]
 
test_x = data.matrix(test[,-17])
test_y = test[,17]


xgb_train = xgb.DMatrix(data=train_x, label=train_y)
xgb_test = xgb.DMatrix(data=test_x, label=test_y)

xgbc = xgboost(data=xgb_train, max.depth=3, nrounds=50)
## [1]  train-rmse:0.528343 
## [2]  train-rmse:0.421607 
## [3]  train-rmse:0.355102 
## [4]  train-rmse:0.317497 
## [5]  train-rmse:0.296199 
## [6]  train-rmse:0.284485 
## [7]  train-rmse:0.278101 
## [8]  train-rmse:0.270568 
## [9]  train-rmse:0.268198 
## [10] train-rmse:0.266526 
## [11] train-rmse:0.265473 
## [12] train-rmse:0.263241 
## [13] train-rmse:0.262567 
## [14] train-rmse:0.262051 
## [15] train-rmse:0.261411 
## [16] train-rmse:0.260376 
## [17] train-rmse:0.260021 
## [18] train-rmse:0.258469 
## [19] train-rmse:0.257723 
## [20] train-rmse:0.257425 
## [21] train-rmse:0.257155 
## [22] train-rmse:0.256120 
## [23] train-rmse:0.255367 
## [24] train-rmse:0.255176 
## [25] train-rmse:0.255055 
## [26] train-rmse:0.254855 
## [27] train-rmse:0.254251 
## [28] train-rmse:0.254096 
## [29] train-rmse:0.253733 
## [30] train-rmse:0.253601 
## [31] train-rmse:0.253427 
## [32] train-rmse:0.253130 
## [33] train-rmse:0.253084 
## [34] train-rmse:0.252246 
## [35] train-rmse:0.251716 
## [36] train-rmse:0.251556 
## [37] train-rmse:0.250770 
## [38] train-rmse:0.250254 
## [39] train-rmse:0.250200 
## [40] train-rmse:0.250169 
## [41] train-rmse:0.249942 
## [42] train-rmse:0.249733 
## [43] train-rmse:0.249190 
## [44] train-rmse:0.249003 
## [45] train-rmse:0.248928 
## [46] train-rmse:0.248828 
## [47] train-rmse:0.248693 
## [48] train-rmse:0.248509 
## [49] train-rmse:0.248467 
## [50] train-rmse:0.248442
pred = predict(xgbc, xgb_test)
pred[(pred>3)] = 3
pred_y = as.factor((levels(test_y))[round(pred)])
cm = confusionMatrix(test_y, pred_y)
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7784  200
##          1  644  413
##                                           
##                Accuracy : 0.9066          
##                  95% CI : (0.9005, 0.9126)
##     No Information Rate : 0.9322          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4472          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9236          
##             Specificity : 0.6737          
##          Pos Pred Value : 0.9749          
##          Neg Pred Value : 0.3907          
##              Prevalence : 0.9322          
##          Detection Rate : 0.8610          
##    Detection Prevalence : 0.8831          
##       Balanced Accuracy : 0.7987          
##                                           
##        'Positive' Class : 0               
## 
metrics<-as.data.frame(cm$byClass)
colnames(metrics)<-"metrics"
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)
F1-score, Precision and Recall
metrics
Sensitivity 0.9236
Specificity 0.6737
Pos Pred Value 0.9749
Neg Pred Value 0.3907
Precision 0.9749
Recall 0.9236
F1 0.9486
Prevalence 0.9322
Detection Rate 0.8610
Detection Prevalence 0.8831
Balanced Accuracy 0.7987

Model Selection

f1<-c(0.836,0.82,0.798)
modelss<-c("Logistic Regression", "Decision tree","XGBOOST")
v1=data.frame(f1,modelss)
ggplot(v1, aes(x=modelss, y=f1)) + 
  geom_bar(stat = "identity",fill="gold")+coord_flip()+ggtitle("Accuracy")+geom_text(aes(label = f1), vjust = 0, hjust = 1.2) +labs(x="models",y="Accuracy")

f1<-c(0.839,0.806,0.949)
modelss<-c("Logistic Regression", "Decision tree","XGBOOST")
v1=data.frame(f1,modelss)
ggplot(v1, aes(x=modelss, y=f1)) + 
  geom_bar(stat = "identity",fill="gold")+coord_flip()+ggtitle("F1 Score")+geom_text(aes(label = f1), vjust = 0, hjust = 1.2) +labs(x="models",y="F1 Score")

f1<-c(0.85,0.747,0.923)
modelss<-c("Logistic Regression", "Decision tree","XGBOOST")
v1=data.frame(f1,modelss)
ggplot(v1, aes(x=modelss, y=f1)) + 
  geom_bar(stat = "identity",fill="gold")+coord_flip()+ggtitle("Recall")+geom_text(aes(label = f1), vjust = 0, hjust = 1.2) +labs(x="models",y="Recall")

f1<-c(0.825,0.876,0.975)
modelss<-c("Logistic Regression", "Decision tree","XGBOOST")
v1=data.frame(f1,modelss)
ggplot(v1, aes(x=modelss, y=f1)) + 
  geom_bar(stat = "identity",fill="gold")+coord_flip()+ggtitle("Precision")+geom_text(aes(label = f1), vjust = 0, hjust = 1.2) +labs(x="models",y="Precision")

  • As seen from the plots, F1 score, recall and precision of the model conducted with XGBOOST is the highest.

  • Thus, F1 score, recall and precision suggest that XGBOOST model is better model among 3 models.