Classification of bank Marketing dataset

Literature Survey:

The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed.

Reading Data-set from `bank.csv`

library(ggplot2)
library(dplyr)
dataset = read.csv("bank.csv", sep=';')
ds=dataset
glimpse(dataset)

## Rows: 4,521
## Columns: 17
## $ age       <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, ~
## $ job       <chr> "unemployed", "services", "management", "management", "blue-~
## $ marital   <chr> "married", "married", "single", "married", "married", "singl~
## $ education <chr> "primary", "secondary", "tertiary", "tertiary", "secondary",~
## $ default   <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
## $ balance   <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26~
## $ housing   <chr> "no", "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes",~
## $ loan      <chr> "no", "yes", "no", "yes", "no", "no", "no", "no", "no", "yes~
## $ contact   <chr> "cellular", "cellular", "cellular", "unknown", "unknown", "c~
## $ day       <int> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, 29,~
## $ month     <chr> "oct", "may", "apr", "jun", "may", "feb", "may", "may", "may~
## $ duration  <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32~
## $ campaign  <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, ~
## $ pdays     <int> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, -1,~
## $ previous  <int> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, ~
## $ poutcome  <chr> "unknown", "failure", "failure", "unknown", "unknown", "fail~
## $ y         <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~

Selecting the features which are useful to us using `dplyr` package’s `select()`

dataset = dataset %>% select(job, marital,balance, housing, loan, contact, month, duration, poutcome,age,y)
View(dataset)

Encoding the char data to factor using `factor()`

dataset$job = factor(dataset$job,labels = c(1:12))
dataset$marital = factor(dataset$marital, labels = c(1:3))
dataset$housing = factor(dataset$housing, labels = c(0,1))
dataset$loan = factor(dataset$loan, labels = c(0,1))
dataset$contact = factor(dataset$contact, labels = c(1:3))
dataset$month = factor(dataset$month ,labels = c(1:12))
dataset$poutcome = factor(dataset$poutcome, labels = c(1:4))
dataset$y= factor(dataset$y, labels = c(0,1))
glimpse(dataset)

## Rows: 4,521
## Columns: 11
## $ job      <fct> 11, 8, 5, 5, 2, 5, 7, 10, 3, 8, 8, 1, 10, 9, 2, 5, 10, 1, 2, ~
## $ marital  <fct> 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1~
## $ balance  <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 264~
## $ housing  <fct> 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1~
## $ loan     <fct> 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0~
## $ contact  <fct> 1, 1, 1, 3, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1~
## $ month    <fct> 11, 9, 1, 7, 9, 4, 9, 9, 9, 1, 9, 1, 2, 1, 5, 2, 2, 1, 9, 6, ~
## $ duration <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 328~
## $ poutcome <fct> 4, 1, 1, 4, 4, 1, 2, 4, 4, 1, 4, 4, 4, 4, 1, 4, 4, 1, 4, 2, 4~
## $ age      <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, 4~
## $ y        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~

Visualizations

Histogram

library(hrbrthemes)
hist(x=dataset$age)

Density

ggplot(data=dataset, aes(x=marital, group=job, fill=job)) +
  geom_density(adjust=1.5, alpha=.4) +
  theme_tinyhand()+
  ggtitle("Density plot for marital status and job type",subtitle = "20MID0139")+
  labs(y="Density", x="Marital Status")

Heat-map

ds_num = ds %>% select(age,balance,day,duration, campaign, pdays, previous)
cormat <- round(cor(ds_num),2)
library(reshape2)
melted_cormat <- melt(cormat)
head(melted_cormat)

##       Var1 Var2 value
## 1      age  age  1.00
## 2  balance  age  0.08
## 3      day  age -0.02
## 4 duration  age  0.00
## 5 campaign  age -0.01
## 6    pdays  age -0.01

ggplot(melted_cormat, aes(Var1, Var2, fill=value)) + 
  geom_tile()+
  ggtitle("Correlation matrix heatmap for bank Marketing data",subtitle = "20MID0139")+
  scale_fill_distiller(palette = "RdPu") +
  theme_ipsum()+
  geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.ticks = element_blank(),
    legend.justification = c(1, 0),
    legend.position = c(0.9, 1.08),
    legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                               title.position = "left", title.hjust = 0.5))

Barplot

ggplot(data=head(dataset,50), aes(x=age, y=duration,fill=y)) +
  geom_bar(stat="identity", position=position_dodge(), width=0.7)+
  scale_fill_brewer(palette="Paired")+
  theme_tinyhand()+
  ggtitle("Barplot for age vs call duration",subtitle = "20MID0139")

Stacked Area Plot

ggplot(dataset, aes(x=age,fill=marital)) + 
  geom_area(stat ="bin",size=0.5,colour="white") +
  theme_tinyhand()+
  ggtitle("Stacked Area plot for marital status and age",subtitle = "20MID0139")

Scatterplot

dataset %>% plot()

Model 1: Logistic Regression

Splitting dataset for training and testing purpose

library(caTools)
set.seed(200)
split = sample.split(Y = dataset$y, SplitRatio = 0.75)
train_set = subset(x= dataset, split == TRUE)
test_set = subset(x= dataset, split == F)

Fitting the model

classifier = glm(formula = y ~ ., family = binomial(), data = train_set)
summary(classifier)

## 
## Call:
## glm(formula = y ~ ., family = binomial(), data = train_set)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8567  -0.3937  -0.2633  -0.1626   2.8818  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.072e+00  5.228e-01  -3.964 7.38e-05 ***
## job2        -6.405e-01  2.647e-01  -2.420 0.015530 *  
## job3        -1.538e-01  4.123e-01  -0.373 0.709062    
## job4        -4.903e-01  4.717e-01  -1.039 0.298601    
## job5        -8.639e-02  2.383e-01  -0.363 0.716911    
## job6         4.634e-01  3.500e-01   1.324 0.185463    
## job7        -3.143e-01  3.865e-01  -0.813 0.416036    
## job8        -2.107e-01  3.008e-01  -0.701 0.483568    
## job9         3.104e-01  4.055e-01   0.765 0.443985    
## job10       -3.833e-01  2.575e-01  -1.488 0.136668    
## job11       -1.031e+00  4.990e-01  -2.066 0.038851 *  
## job12        3.736e-01  5.995e-01   0.623 0.533153    
## marital2    -4.447e-01  2.027e-01  -2.194 0.028218 *  
## marital3    -1.247e-01  2.332e-01  -0.535 0.592893    
## balance     -3.674e-06  1.900e-05  -0.193 0.846688    
## housing1    -2.030e-01  1.542e-01  -1.317 0.187973    
## loan1       -6.536e-01  2.269e-01  -2.881 0.003969 ** 
## contact2    -6.827e-02  2.604e-01  -0.262 0.793180    
## contact3    -1.284e+00  2.622e-01  -4.898 9.68e-07 ***
## month2      -3.459e-01  2.748e-01  -1.259 0.208080    
## month3       2.247e-01  7.540e-01   0.298 0.765701    
## month4       1.787e-01  3.107e-01   0.575 0.565226    
## month5      -9.546e-01  4.302e-01  -2.219 0.026482 *  
## month6      -8.085e-01  2.784e-01  -2.904 0.003688 ** 
## month7       3.556e-01  3.318e-01   1.072 0.283943    
## month8       1.737e+00  4.480e-01   3.878 0.000105 ***
## month9      -7.538e-01  2.668e-01  -2.825 0.004727 ** 
## month10     -6.857e-01  3.050e-01  -2.248 0.024582 *  
## month11      1.202e+00  3.870e-01   3.105 0.001903 ** 
## month12      4.367e-01  4.570e-01   0.956 0.339315    
## duration     4.088e-03  2.277e-04  17.956  < 2e-16 ***
## poutcome2    5.955e-01  3.053e-01   1.951 0.051101 .  
## poutcome3    2.395e+00  3.094e-01   7.740 9.92e-15 ***
## poutcome4   -1.356e-01  2.142e-01  -0.633 0.526513    
## age         -4.936e-03  8.133e-03  -0.607 0.543933    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2424.3  on 3390  degrees of freedom
## Residual deviance: 1666.6  on 3356  degrees of freedom
## AIC: 1736.6
## 
## Number of Fisher Scoring iterations: 6

Prediction

y_predic = predict(classifier, type='response', newdata = test_set)
y_predic= ifelse(y_predic>0.5, 1,0)

Confusion matrix

cm =table(test_set$y, y_predic) 
cm

##    y_predic
##       0   1
##   0 979  21
##   1  81  49

Accuracy

acc = sum(diag(cm))/sum(cm)
sprintf("Accuracy of the above model is : %.2f", acc*100)

## [1] "Accuracy of the above model is : 90.97"

Classification of bank Marketing dataset

Nirad Yeola

Aim:

Literature Survey:

Reading Data-set from `bank.csv`

Selecting the features which are useful to us using `dplyr` package’s `select()`

Encoding the char data to factor using `factor()`

Visualizations

Model 1: Logistic Regression

Splitting dataset for training and testing purpose

Fitting the model

Prediction

Confusion matrix

Accuracy

Model 2 : Naïve bayes

Importing `e1071` for naive bayes classifier

Fitting the model

Prediction

Confusion matrix

Accuracy

Results

Classification of bank Marketing dataset

Nirad Yeola

Aim:

Literature Survey:

Reading Data-set from bank.csv

Selecting the features which are useful to us using dplyr package’s select()

Encoding the char data to factor using factor()

Visualizations

Model 1: Logistic Regression

Splitting dataset for training and testing purpose

Fitting the model

Prediction

Confusion matrix

Accuracy

Model 2 : Naïve bayes

Importing e1071 for naive bayes classifier

Fitting the model

Prediction

Confusion matrix

Accuracy

Results

Reading Data-set from `bank.csv`

Selecting the features which are useful to us using `dplyr` package’s `select()`

Encoding the char data to factor using `factor()`

Importing `e1071` for naive bayes classifier