To Compare results of 2 classification Machine Learning Algorithms for Bank Marketing Data Set .The classification goal is to predict if the client will subscribe a term deposit (variable y).
The data is related with direct marketing campaigns of a Portuguese
banking institution. The marketing campaigns were based on phone calls.
Often, more than one contact to the same client was required, in order
to access if the product (bank term deposit) would be (‘yes’) or not
(‘no’) subscribed.
bank.csv
library(ggplot2)
library(dplyr)
dataset = read.csv("bank.csv", sep=';')
ds=dataset
glimpse(dataset)
## Rows: 4,521
## Columns: 17
## $ age <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, ~
## $ job <chr> "unemployed", "services", "management", "management", "blue-~
## $ marital <chr> "married", "married", "single", "married", "married", "singl~
## $ education <chr> "primary", "secondary", "tertiary", "tertiary", "secondary",~
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
## $ balance <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26~
## $ housing <chr> "no", "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes",~
## $ loan <chr> "no", "yes", "no", "yes", "no", "no", "no", "no", "no", "yes~
## $ contact <chr> "cellular", "cellular", "cellular", "unknown", "unknown", "c~
## $ day <int> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, 29,~
## $ month <chr> "oct", "may", "apr", "jun", "may", "feb", "may", "may", "may~
## $ duration <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32~
## $ campaign <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, ~
## $ pdays <int> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, -1,~
## $ previous <int> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, ~
## $ poutcome <chr> "unknown", "failure", "failure", "unknown", "unknown", "fail~
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", ~
dplyr
package’s select()
dataset = dataset %>% select(job, marital,balance, housing, loan, contact, month, duration, poutcome,age,y)
View(dataset)
factor()
dataset$job = factor(dataset$job,labels = c(1:12))
dataset$marital = factor(dataset$marital, labels = c(1:3))
dataset$housing = factor(dataset$housing, labels = c(0,1))
dataset$loan = factor(dataset$loan, labels = c(0,1))
dataset$contact = factor(dataset$contact, labels = c(1:3))
dataset$month = factor(dataset$month ,labels = c(1:12))
dataset$poutcome = factor(dataset$poutcome, labels = c(1:4))
dataset$y= factor(dataset$y, labels = c(0,1))
glimpse(dataset)
## Rows: 4,521
## Columns: 11
## $ job <fct> 11, 8, 5, 5, 2, 5, 7, 10, 3, 8, 8, 1, 10, 9, 2, 5, 10, 1, 2, ~
## $ marital <fct> 2, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1~
## $ balance <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 264~
## $ housing <fct> 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1~
## $ loan <fct> 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0~
## $ contact <fct> 1, 1, 1, 3, 3, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1~
## $ month <fct> 11, 9, 1, 7, 9, 4, 9, 9, 9, 1, 9, 1, 2, 1, 5, 2, 2, 1, 9, 6, ~
## $ duration <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 328~
## $ poutcome <fct> 4, 1, 1, 4, 4, 1, 2, 4, 4, 1, 4, 4, 4, 4, 1, 4, 4, 1, 4, 2, 4~
## $ age <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, 4~
## $ y <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~
library(hrbrthemes)
hist(x=dataset$age)
ggplot(data=dataset, aes(x=marital, group=job, fill=job)) +
geom_density(adjust=1.5, alpha=.4) +
theme_tinyhand()+
ggtitle("Density plot for marital status and job type",subtitle = "20MID0139")+
labs(y="Density", x="Marital Status")
ds_num = ds %>% select(age,balance,day,duration, campaign, pdays, previous)
cormat <- round(cor(ds_num),2)
library(reshape2)
melted_cormat <- melt(cormat)
head(melted_cormat)
## Var1 Var2 value
## 1 age age 1.00
## 2 balance age 0.08
## 3 day age -0.02
## 4 duration age 0.00
## 5 campaign age -0.01
## 6 pdays age -0.01
ggplot(melted_cormat, aes(Var1, Var2, fill=value)) +
geom_tile()+
ggtitle("Correlation matrix heatmap for bank Marketing data",subtitle = "20MID0139")+
scale_fill_distiller(palette = "RdPu") +
theme_ipsum()+
geom_text(aes(Var2, Var1, label = value), color = "white", size = 4) +
theme(
axis.title.x = element_blank(),
axis.title.y = element_blank(),
panel.grid.major = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.ticks = element_blank(),
legend.justification = c(1, 0),
legend.position = c(0.9, 1.08),
legend.direction = "horizontal")+
guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
title.position = "left", title.hjust = 0.5))
ggplot(data=head(dataset,50), aes(x=age, y=duration,fill=y)) +
geom_bar(stat="identity", position=position_dodge(), width=0.7)+
scale_fill_brewer(palette="Paired")+
theme_tinyhand()+
ggtitle("Barplot for age vs call duration",subtitle = "20MID0139")
ggplot(dataset, aes(x=age,fill=marital)) +
geom_area(stat ="bin",size=0.5,colour="white") +
theme_tinyhand()+
ggtitle("Stacked Area plot for marital status and age",subtitle = "20MID0139")
dataset %>% plot()
library(caTools)
set.seed(200)
split = sample.split(Y = dataset$y, SplitRatio = 0.75)
train_set = subset(x= dataset, split == TRUE)
test_set = subset(x= dataset, split == F)
classifier = glm(formula = y ~ ., family = binomial(), data = train_set)
summary(classifier)
##
## Call:
## glm(formula = y ~ ., family = binomial(), data = train_set)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8567 -0.3937 -0.2633 -0.1626 2.8818
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.072e+00 5.228e-01 -3.964 7.38e-05 ***
## job2 -6.405e-01 2.647e-01 -2.420 0.015530 *
## job3 -1.538e-01 4.123e-01 -0.373 0.709062
## job4 -4.903e-01 4.717e-01 -1.039 0.298601
## job5 -8.639e-02 2.383e-01 -0.363 0.716911
## job6 4.634e-01 3.500e-01 1.324 0.185463
## job7 -3.143e-01 3.865e-01 -0.813 0.416036
## job8 -2.107e-01 3.008e-01 -0.701 0.483568
## job9 3.104e-01 4.055e-01 0.765 0.443985
## job10 -3.833e-01 2.575e-01 -1.488 0.136668
## job11 -1.031e+00 4.990e-01 -2.066 0.038851 *
## job12 3.736e-01 5.995e-01 0.623 0.533153
## marital2 -4.447e-01 2.027e-01 -2.194 0.028218 *
## marital3 -1.247e-01 2.332e-01 -0.535 0.592893
## balance -3.674e-06 1.900e-05 -0.193 0.846688
## housing1 -2.030e-01 1.542e-01 -1.317 0.187973
## loan1 -6.536e-01 2.269e-01 -2.881 0.003969 **
## contact2 -6.827e-02 2.604e-01 -0.262 0.793180
## contact3 -1.284e+00 2.622e-01 -4.898 9.68e-07 ***
## month2 -3.459e-01 2.748e-01 -1.259 0.208080
## month3 2.247e-01 7.540e-01 0.298 0.765701
## month4 1.787e-01 3.107e-01 0.575 0.565226
## month5 -9.546e-01 4.302e-01 -2.219 0.026482 *
## month6 -8.085e-01 2.784e-01 -2.904 0.003688 **
## month7 3.556e-01 3.318e-01 1.072 0.283943
## month8 1.737e+00 4.480e-01 3.878 0.000105 ***
## month9 -7.538e-01 2.668e-01 -2.825 0.004727 **
## month10 -6.857e-01 3.050e-01 -2.248 0.024582 *
## month11 1.202e+00 3.870e-01 3.105 0.001903 **
## month12 4.367e-01 4.570e-01 0.956 0.339315
## duration 4.088e-03 2.277e-04 17.956 < 2e-16 ***
## poutcome2 5.955e-01 3.053e-01 1.951 0.051101 .
## poutcome3 2.395e+00 3.094e-01 7.740 9.92e-15 ***
## poutcome4 -1.356e-01 2.142e-01 -0.633 0.526513
## age -4.936e-03 8.133e-03 -0.607 0.543933
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2424.3 on 3390 degrees of freedom
## Residual deviance: 1666.6 on 3356 degrees of freedom
## AIC: 1736.6
##
## Number of Fisher Scoring iterations: 6
y_predic = predict(classifier, type='response', newdata = test_set)
y_predic= ifelse(y_predic>0.5, 1,0)
cm =table(test_set$y, y_predic)
cm
## y_predic
## 0 1
## 0 979 21
## 1 81 49
acc = sum(diag(cm))/sum(cm)
sprintf("Accuracy of the above model is : %.2f", acc*100)
## [1] "Accuracy of the above model is : 90.97"
e1071
for naive bayes
classifierlibrary(e1071)
classifier = naiveBayes(x= train_set[-11], y = train_set$y)
summary(classifier)
## Length Class Mode
## apriori 2 table numeric
## tables 10 -none- list
## levels 2 -none- character
## isnumeric 10 -none- logical
## call 3 -none- call
y_pred = predict(object = classifier, newdata = test_set)
cm = table(test_set$y, y_pred)
cm
## y_pred
## 0 1
## 0 955 45
## 1 67 63
acc = sum(diag(cm))/sum(cm)
sprintf("Accuracy of the above model is : %.2f", acc*100)
## [1] "Accuracy of the above model is : 90.09"
We can see that Accuracy of Logistical Regression classifier is more than Naïve Bayes Classifier, hence we can conclude that Logistic Regression is best algorithm for the given dataset.