Aim:

To Compare results of 2 classification Machine Learning Algorithms for Bank Marketing Data Set .The classification goal is to predict if the client will subscribe a term deposit (variable y).

Literature Survey:

The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed.

Reading Data-set from bank.csv

library(ggplot2)
library(dplyr)
dataset = read.csv("bank.csv", sep=';')
ds=dataset
glimpse(dataset)
## Rows: 4,521
## Columns: 17
## $ age       <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, ~
## $ job       <chr> "unemployed", "services", "management", "management", "blue-~
## $ marital   <chr> "married", "married", "single", "married", "married", "singl~
## $ education <chr> "primary", "secondary", "tertiary", "tertiary", "secondary",~
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
## $ balance   <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26~
## $ housing   <chr> "no", "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes",~
## $ loan      <chr> "no", "yes", "no", "yes", "no", "no", "no", "no", "no", "yes~
## $ contact   <chr> "cellular", "cellular", "cellular", "unknown", "unknown", "c~
## $ day       <int> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, 29,~
## $ month     <chr> "oct", "may", "apr", "jun", "may", "feb", "may", "may", "may~
## $ duration  <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32~
## $ campaign  <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, ~
## $ pdays     <int> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, -1,~
## $ previous  <int> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, ~
## $ poutcome  <chr> "unknown", "failure", "failure", "unknown", "unknown", "fail~
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~

Selecting the features which are useful to us using dplyr package’s select()

dataset = dataset %>% select(job, marital,balance, housing, loan, contact, month, duration, poutcome,age,y)
View(dataset)

Encoding the char data to factor using factor()

dataset$job = factor(dataset$job,labels = c(1:12))
dataset$marital = factor(dataset$marital, labels = c(1:3))
dataset$housing = factor(dataset$housing, labels = c(0,1))
dataset$loan = factor(dataset$loan, labels = c(0,1))
dataset$contact = factor(dataset$contact, labels = c(1:3))
dataset$month = factor(dataset$month ,labels = c(1:12))
dataset$poutcome = factor(dataset$poutcome, labels = c(1:4))
dataset$y= factor(dataset$y, labels = c(0,1))
glimpse(dataset)
## Rows: 4,521
## Columns: 11
## $ job      <fct> 11, 8, 5, 5, 2, 5, 7, 10, 3, 8, 8, 1, 10, 9, 2, 5, 10, 1, 2, ~
## $ marital  <fct> 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1~
## $ balance  <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 264~
## $ housing  <fct> 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1~
## $ loan     <fct> 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0~
## $ contact  <fct> 1, 1, 1, 3, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1~
## $ month    <fct> 11, 9, 1, 7, 9, 4, 9, 9, 9, 1, 9, 1, 2, 1, 5, 2, 2, 1, 9, 6, ~
## $ duration <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 328~
## $ poutcome <fct> 4, 1, 1, 4, 4, 1, 2, 4, 4, 1, 4, 4, 4, 4, 1, 4, 4, 1, 4, 2, 4~
## $ age      <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, 4~
## $ y        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~

Visualizations

  1. Histogram
library(hrbrthemes)
hist(x=dataset$age)

  1. Density
ggplot(data=dataset, aes(x=marital, group=job, fill=job)) +
  geom_density(adjust=1.5, alpha=.4) +
  theme_tinyhand()+
  ggtitle("Density plot for marital status and job type",subtitle = "20MID0139")+
  labs(y="Density", x="Marital Status")

  1. Heat-map
ds_num = ds %>% select(age,balance,day,duration, campaign, pdays, previous)
cormat <- round(cor(ds_num),2)
library(reshape2)
melted_cormat <- melt(cormat)
head(melted_cormat)
##       Var1 Var2 value
## 1      age  age  1.00
## 2  balance  age  0.08
## 3      day  age -0.02
## 4 duration  age  0.00
## 5 campaign  age -0.01
## 6    pdays  age -0.01
ggplot(melted_cormat, aes(Var1, Var2, fill=value)) + 
  geom_tile()+
  ggtitle("Correlation matrix heatmap for bank Marketing data",subtitle = "20MID0139")+
  scale_fill_distiller(palette = "RdPu") +
  theme_ipsum()+
  geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.9, 1.08),
    legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "left", title.hjust = 0.5))

  1. Barplot
ggplot(data=head(dataset,50), aes(x=age, y=duration,fill=y)) +
  geom_bar(stat="identity", position=position_dodge(), width=0.7)+
  scale_fill_brewer(palette="Paired")+
  theme_tinyhand()+
  ggtitle("Barplot for age vs call duration",subtitle = "20MID0139")

  1. Stacked Area Plot
ggplot(dataset, aes(x=age,fill=marital)) + 
  geom_area(stat ="bin",size=0.5,colour="white") +
  theme_tinyhand()+
  ggtitle("Stacked Area plot for marital status and age",subtitle = "20MID0139")

  1. Scatterplot
dataset %>% plot()

Model 1: Logistic Regression

Splitting dataset for training and testing purpose

library(caTools)
set.seed(200)
split = sample.split(Y = dataset$y, SplitRatio = 0.75)
train_set = subset(x= dataset, split == TRUE)
test_set = subset(x= dataset, split == F)

Fitting the model

classifier = glm(formula = y ~ ., family = binomial(), data = train_set)
summary(classifier)
## 
## Call:
## glm(formula = y ~ ., family = binomial(), data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8567  -0.3937  -0.2633  -0.1626   2.8818  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.072e+00  5.228e-01  -3.964 7.38e-05 ***
## job2        -6.405e-01  2.647e-01  -2.420 0.015530 *  
## job3        -1.538e-01  4.123e-01  -0.373 0.709062    
## job4        -4.903e-01  4.717e-01  -1.039 0.298601    
## job5        -8.639e-02  2.383e-01  -0.363 0.716911    
## job6         4.634e-01  3.500e-01   1.324 0.185463    
## job7        -3.143e-01  3.865e-01  -0.813 0.416036    
## job8        -2.107e-01  3.008e-01  -0.701 0.483568    
## job9         3.104e-01  4.055e-01   0.765 0.443985    
## job10       -3.833e-01  2.575e-01  -1.488 0.136668    
## job11       -1.031e+00  4.990e-01  -2.066 0.038851 *  
## job12        3.736e-01  5.995e-01   0.623 0.533153    
## marital2    -4.447e-01  2.027e-01  -2.194 0.028218 *  
## marital3    -1.247e-01  2.332e-01  -0.535 0.592893    
## balance     -3.674e-06  1.900e-05  -0.193 0.846688    
## housing1    -2.030e-01  1.542e-01  -1.317 0.187973    
## loan1       -6.536e-01  2.269e-01  -2.881 0.003969 ** 
## contact2    -6.827e-02  2.604e-01  -0.262 0.793180    
## contact3    -1.284e+00  2.622e-01  -4.898 9.68e-07 ***
## month2      -3.459e-01  2.748e-01  -1.259 0.208080    
## month3       2.247e-01  7.540e-01   0.298 0.765701    
## month4       1.787e-01  3.107e-01   0.575 0.565226    
## month5      -9.546e-01  4.302e-01  -2.219 0.026482 *  
## month6      -8.085e-01  2.784e-01  -2.904 0.003688 ** 
## month7       3.556e-01  3.318e-01   1.072 0.283943    
## month8       1.737e+00  4.480e-01   3.878 0.000105 ***
## month9      -7.538e-01  2.668e-01  -2.825 0.004727 ** 
## month10     -6.857e-01  3.050e-01  -2.248 0.024582 *  
## month11      1.202e+00  3.870e-01   3.105 0.001903 ** 
## month12      4.367e-01  4.570e-01   0.956 0.339315    
## duration     4.088e-03  2.277e-04  17.956  < 2e-16 ***
## poutcome2    5.955e-01  3.053e-01   1.951 0.051101 .  
## poutcome3    2.395e+00  3.094e-01   7.740 9.92e-15 ***
## poutcome4   -1.356e-01  2.142e-01  -0.633 0.526513    
## age         -4.936e-03  8.133e-03  -0.607 0.543933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2424.3  on 3390  degrees of freedom
## Residual deviance: 1666.6  on 3356  degrees of freedom
## AIC: 1736.6
## 
## Number of Fisher Scoring iterations: 6

Prediction

y_predic = predict(classifier, type='response', newdata = test_set)
y_predic= ifelse(y_predic>0.5, 1,0)

Confusion matrix

cm =table(test_set$y, y_predic) 
cm
##    y_predic
##       0   1
##   0 979  21
##   1  81  49

Accuracy

acc = sum(diag(cm))/sum(cm)
sprintf("Accuracy of the above model is : %.2f", acc*100)
## [1] "Accuracy of the above model is : 90.97"

Model 2 : Naïve bayes

Importing e1071 for naive bayes classifier

library(e1071)

Fitting the model

classifier = naiveBayes(x= train_set[-11], y = train_set$y)
summary(classifier)
##           Length Class  Mode     
## apriori    2     table  numeric  
## tables    10     -none- list     
## levels     2     -none- character
## isnumeric 10     -none- logical  
## call       3     -none- call

Prediction

y_pred = predict(object = classifier, newdata = test_set)

Confusion matrix

cm = table(test_set$y, y_pred)
cm
##    y_pred
##       0   1
##   0 955  45
##   1  67  63

Accuracy

acc = sum(diag(cm))/sum(cm)
sprintf("Accuracy of the above model is : %.2f", acc*100)
## [1] "Accuracy of the above model is : 90.09"

Results

We can see that Accuracy of Logistical Regression classifier is more than Naïve Bayes Classifier, hence we can conclude that Logistic Regression is best algorithm for the given dataset.