Data Set Information:

The data is related with direct marketing campaigns of a Portuguese banking institution. The marketing campaigns were based on phone calls. Often, more than one contact to the same client was required, in order to access if the product (bank term deposit) would be (‘yes’) or not (‘no’) subscribed.

The classification goal is to predict if the client will subscribe (yes/no) a term deposit (variable y).

Citation Request:

This dataset is public available for research. The details are described in [Moro et al., 2014]. Please include this citation if you plan to use this database:

[Moro et al., 2014] S. Moro, P. Cortez and P. Rita. A Data-Driven Approach to Predict the Success of Bank Telemarketing. Decision Support Systems, Elsevier, 62:22-31, June 2014

Load Data in Data Frame

df <- read.csv("C:/PERSONAL DATA/UTD classes/BI-SAS-6324/Project/bank-additional-full.csv")

Replace all ‘unknown’ values with NA

df[df=="unknown"] <- NA

Divide age into age group category in a separate colum age_group

for(i in 1 : nrow(df)){
  if (df$age[i] < 20){
    df$age[i] = 'Teenagers'
  } else if (df$age[i] < 35 & df$age[i] > 19){
    df$age[i] = 'Young Adults'
  } else if (df$age[i] < 60 & df$age[i] > 34){
    df$age[i] = 'Adults'
  } else if (df$age[i] > 59){
    df$age[i] = 'Senior Citizens'
  }
  
}
df$age<-as.factor(df$age)

Replace target variables with yes with 1 and no with 0

df$y<-ifelse(df$y =='yes', 1,0)
df$y<-as.factor(df$y)

This Dataset contains 2 sets of customers, 5625 who were a part of previous campaign and 35563 who were contacted for the first time, since there is a huge difference in count we will do separate analysis of both datasets as the variables previous and pdays are valid only for old customers

Slicing datset into 2 groups of customes

old_Cust_df<-subset(df, df$poutcome != "nonexistent")
new_Cust_df<-subset(df, df$poutcome == "nonexistent")

OLD Customers Analysis (old_Cust_df)

Let’s analyze the missing values in this dataset

library(VIM)
## Warning: package 'VIM' was built under R version 3.3.3
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
aggr_plot <- aggr(old_Cust_df, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable       Count
##         default 0.102222222
##       education 0.048000000
##         housing 0.024711111
##            loan 0.024711111
##             job 0.006577778
##         marital 0.003200000
##             age 0.000000000
##         contact 0.000000000
##           month 0.000000000
##     day_of_week 0.000000000
##        duration 0.000000000
##        campaign 0.000000000
##           pdays 0.000000000
##        previous 0.000000000
##        poutcome 0.000000000
##    emp.var.rate 0.000000000
##  cons.price.idx 0.000000000
##   cons.conf.idx 0.000000000
##       euribor3m 0.000000000
##     nr.employed 0.000000000
##               y 0.000000000

Check distribution of target variable, it is observed that there isn’t any data imbalance

counts <- table(old_Cust_df$y)
barplot(counts,col=c("darkblue","red"),legend = rownames(counts), main = "Term Deposit")

So for this dataset we face only the missing value problem, which can be solved by imputation method We will impute these missing values using MICE package

library(mice)
## Warning: package 'mice' was built under R version 3.3.3
old_imp<-mice(old_Cust_df)
## 
##  iter imp variable
##   1   1  job  marital  education  default  housing  loan
##   1   2  job  marital  education  default  housing  loan
##   1   3  job  marital  education  default  housing  loan
##   1   4  job  marital  education  default  housing  loan
##   1   5  job  marital  education  default  housing  loan
##   2   1  job  marital  education  default  housing  loan
##   2   2  job  marital  education  default  housing  loan
##   2   3  job  marital  education  default  housing  loan
##   2   4  job  marital  education  default  housing  loan
##   2   5  job  marital  education  default  housing  loan
##   3   1  job  marital  education  default  housing  loan
##   3   2  job  marital  education  default  housing  loan
##   3   3  job  marital  education  default  housing  loan
##   3   4  job  marital  education  default  housing  loan
##   3   5  job  marital  education  default  housing  loan
##   4   1  job  marital  education  default  housing  loan
##   4   2  job  marital  education  default  housing  loan
##   4   3  job  marital  education  default  housing  loan
##   4   4  job  marital  education  default  housing  loan
##   4   5  job  marital  education  default  housing  loan
##   5   1  job  marital  education  default  housing  loan
##   5   2  job  marital  education  default  housing  loan
##   5   3  job  marital  education  default  housing  loan
##   5   4  job  marital  education  default  housing  loan
##   5   5  job  marital  education  default  housing  loan
old_imp_df<-complete(old_imp)

Check if missing values exists

aggr_plot <- aggr(old_imp_df, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))

## 
##  Variables sorted by number of missings: 
##        Variable Count
##             age     0
##             job     0
##         marital     0
##       education     0
##         default     0
##         housing     0
##            loan     0
##         contact     0
##           month     0
##     day_of_week     0
##        duration     0
##        campaign     0
##           pdays     0
##        previous     0
##        poutcome     0
##    emp.var.rate     0
##  cons.price.idx     0
##   cons.conf.idx     0
##       euribor3m     0
##     nr.employed     0
##               y     0

Now we have to reformat the variables of dataset to bring them back to factor format as the factor variables contains contrasts

old_imp_df$age<-as.character(old_imp_df$age)
old_imp_df$age<-as.factor(old_imp_df$age)
old_imp_df$job<-as.character(old_imp_df$job)
old_imp_df$job<-as.factor(old_imp_df$job)
old_imp_df$marital<-as.character(old_imp_df$marital)
old_imp_df$marital<-as.factor(old_imp_df$marital)
old_imp_df$education<-as.character(old_imp_df$education)
old_imp_df$education<-as.factor(old_imp_df$education)
old_imp_df$default<-as.character(old_imp_df$default)
old_imp_df$default<-as.factor(old_imp_df$default)
old_imp_df$housing<-as.character(old_imp_df$housing)
old_imp_df$housing<-as.factor(old_imp_df$housing)
old_imp_df$loan<-as.character(old_imp_df$loan)
old_imp_df$loan<-as.factor(old_imp_df$loan)
old_imp_df$contact<-as.character(old_imp_df$contact)
old_imp_df$contact<-as.factor(old_imp_df$contact)
old_imp_df$month<-as.character(old_imp_df$month)
old_imp_df$month<-as.factor(old_imp_df$month)
old_imp_df$day_of_week<-as.character(old_imp_df$day_of_week)
old_imp_df$day_of_week<-as.factor(old_imp_df$day_of_week)
old_imp_df$poutcome<-as.character(old_imp_df$poutcome)
old_imp_df$poutcome<-as.factor(old_imp_df$poutcome)
old_imp_df$y<-as.character(old_imp_df$y)
old_imp_df$y<-as.factor(old_imp_df$y)

Split old customer data into train and test

library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
idx <- createDataPartition(old_imp_df$y, p = 2/3,list = FALSE)
old_train <- old_imp_df[idx,]
old_test <- old_imp_df[-idx,]

Check target variable distribution of train and test data

table(old_train$y)
## 
##    0    1 
## 2751 1000
table(old_test$y)
## 
##    0    1 
## 1375  499

Logistic regression analysis

logit_model <- glm(y ~.,family=binomial(link='logit'),data = old_train)
summary(logit_model)
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = old_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -5.0259  -0.4892  -0.2322   0.3456   3.2762  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -9.280e+02  2.131e+02  -4.354 1.33e-05 ***
## ageSenior Citizens            7.453e-01  2.437e-01   3.058 0.002230 ** 
## ageTeenagers                 -8.360e-02  5.442e-01  -0.154 0.877908    
## ageYoung Adults               1.637e-01  1.316e-01   1.244 0.213402    
## jobblue-collar               -2.192e-01  2.150e-01  -1.019 0.308034    
## jobentrepreneur              -9.716e-02  3.570e-01  -0.272 0.785489    
## jobhousemaid                 -3.710e-01  3.909e-01  -0.949 0.342575    
## jobmanagement                 2.230e-01  2.118e-01   1.053 0.292397    
## jobretired                   -2.927e-01  2.760e-01  -1.061 0.288868    
## jobself-employed             -1.282e-01  3.235e-01  -0.396 0.692005    
## jobservices                   4.471e-02  2.195e-01   0.204 0.838572    
## jobstudent                    1.612e-01  2.335e-01   0.690 0.489926    
## jobtechnician                 3.198e-01  1.748e-01   1.830 0.067241 .  
## jobunemployed                 4.241e-01  2.900e-01   1.462 0.143649    
## maritalmarried                2.118e-02  1.736e-01   0.122 0.902901    
## maritalsingle                -1.436e-01  2.023e-01  -0.710 0.477810    
## educationbasic.6y            -1.475e-01  3.292e-01  -0.448 0.653990    
## educationbasic.9y            -5.669e-02  2.496e-01  -0.227 0.820336    
## educationhigh.school         -1.241e-01  2.295e-01  -0.541 0.588696    
## educationilliterate           1.294e+01  5.354e+02   0.024 0.980715    
## educationprofessional.course  1.248e-01  2.469e-01   0.506 0.613173    
## educationuniversity.degree    1.155e-02  2.292e-01   0.050 0.959812    
## defaultyes                   -9.731e+00  3.784e+02  -0.026 0.979482    
## housingyes                   -1.376e-02  1.015e-01  -0.136 0.892156    
## loanyes                       7.655e-02  1.401e-01   0.546 0.584761    
## contacttelephone             -4.026e-01  2.035e-01  -1.979 0.047862 *  
## monthaug                      1.723e+00  3.903e-01   4.413 1.02e-05 ***
## monthdec                      2.037e+00  7.322e-01   2.782 0.005397 ** 
## monthjul                      5.871e-01  3.392e-01   1.731 0.083442 .  
## monthjun                      3.759e-01  3.083e-01   1.219 0.222821    
## monthmar                      2.989e+00  5.572e-01   5.365 8.11e-08 ***
## monthmay                      1.326e-01  2.034e-01   0.652 0.514498    
## monthnov                      2.187e+00  7.246e-01   3.018 0.002548 ** 
## monthoct                      2.914e+00  8.671e-01   3.361 0.000777 ***
## monthsep                      3.441e+00  9.557e-01   3.601 0.000317 ***
## day_of_weekmon               -3.182e-01  1.620e-01  -1.964 0.049509 *  
## day_of_weekthu                1.671e-01  1.559e-01   1.072 0.283766    
## day_of_weektue                2.181e-01  1.610e-01   1.354 0.175706    
## day_of_weekwed                3.260e-01  1.645e-01   1.982 0.047490 *  
## duration                      4.241e-03  2.383e-04  17.797  < 2e-16 ***
## campaign                     -7.367e-02  4.013e-02  -1.836 0.066410 .  
## pdays                        -8.734e-04  2.719e-04  -3.213 0.001315 ** 
## previous                     -6.446e-02  7.789e-02  -0.828 0.407939    
## poutcomesuccess               9.460e-01  2.649e-01   3.572 0.000355 ***
## emp.var.rate                 -2.159e+00  3.952e-01  -5.464 4.65e-08 ***
## cons.price.idx                5.965e+00  1.218e+00   4.898 9.69e-07 ***
## cons.conf.idx                 1.493e-01  3.118e-02   4.787 1.69e-06 ***
## euribor3m                    -3.183e+00  9.550e-01  -3.333 0.000858 ***
## nr.employed                   7.408e-02  2.026e-02   3.656 0.000256 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4350.0  on 3750  degrees of freedom
## Residual deviance: 2571.2  on 3702  degrees of freedom
## AIC: 2669.2
## 
## Number of Fisher Scoring iterations: 12
anova(logit_model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                            3750     4350.0              
## age             3   135.69      3747     4214.3 < 2.2e-16 ***
## job            10   142.17      3737     4072.1 < 2.2e-16 ***
## marital         2     2.42      3735     4069.7 0.2982113    
## education       6    28.84      3729     4040.9 6.511e-05 ***
## default         1     0.76      3728     4040.1 0.3839623    
## housing         1     0.54      3727     4039.6 0.4612731    
## loan            1     0.00      3726     4039.6 0.9838848    
## contact         1     0.00      3725     4039.6 0.9634581    
## month           9   340.54      3716     3699.0 < 2.2e-16 ***
## day_of_week     4    17.59      3712     3681.4 0.0014843 ** 
## duration        1   466.14      3711     3215.3 < 2.2e-16 ***
## campaign        1     7.34      3710     3207.9 0.0067460 ** 
## pdays           1   446.83      3709     2761.1 < 2.2e-16 ***
## previous        1     2.16      3708     2759.0 0.1418111    
## poutcome        1    10.76      3707     2748.2 0.0010348 ** 
## emp.var.rate    1    39.79      3706     2708.4 2.834e-10 ***
## cons.price.idx  1   112.45      3705     2596.0 < 2.2e-16 ***
## cons.conf.idx   1    10.91      3704     2585.0 0.0009542 ***
## euribor3m       1     0.42      3703     2584.6 0.5187268    
## nr.employed     1    13.47      3702     2571.2 0.0002423 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From chi square values we can figure out the significance of each variable

Lets test the accuracy of this model by passing our test data to it

test_result <- predict(logit_model,newdata=old_test,type='response')
test_result <- ifelse(test_result > 0.5,1,0)

error <- mean(test_result != old_test$y)
print(paste('Accuracy',1-error))
## [1] "Accuracy 0.851120597652081"

Receiver Operating Characteristic cureve (ROC) for this model

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.3.3
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
pr <- prediction(test_result, old_test$y)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

Area under ROC curve

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8053438

This suggests that this model is not the best but good for prediction

Lets try random forest analysis

library(randomForest)
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
rf_model<-randomForest(y ~.,data = old_train, importance=TRUE, ntree=1000)
varImpPlot(rf_model)

This shows the importance of each predictor in the dataset, we can observe that duration is the most significant predictor for this model, followed by pdays and rest

Accuracy of this model

test_result <- predict(rf_model,old_test)
error <- mean(test_result != old_test$y)
print(paste('Accuracy',1-error))
## [1] "Accuracy 0.86392742796158"

ROC curve and area under curve for this model

pr <- prediction(as.numeric(test_result), as.numeric(old_test$y))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

Area under curve for this model

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8262

Lets try Decision tree model using party package

library(party)
## Warning: package 'party' was built under R version 3.3.3
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.3.3
tree_model <- ctree(y ~.,data = old_train)
plot(tree_model)

Accuracy of this model

test_result <- predict(tree_model,old_test)
error <- mean(test_result != old_test$y)
print(paste('Accuracy',1-error))
## [1] "Accuracy 0.850053361792956"

ROC curve

pr <- prediction(as.numeric(test_result), as.numeric(old_test$y))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

Area under curve for this model

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8154688

New Customer Analysis: (new_Cust_df)

Explore Data

table(new_Cust_df$poutcome)
## 
##     failure nonexistent     success 
##           0       35563           0

Drop poutcome as all the entries in this sub dataset are having poutcome as nonexistent

new_Cust_df$poutcome<-NULL

table(new_Cust_df$previous)
## 
##     0 
## 35563

Drop previous as all the entries in this sub dataset are having poutcome as 0

new_Cust_df$previous<-NULL

table(new_Cust_df$pdays)
## 
##   999 
## 35563

Drop previous as all the entries in this sub dataset are having poutcome as 999

new_Cust_df$pdays<-NULL

table(new_Cust_df$default)
## 
##      no unknown     yes 
##   27539       0       2
new_Cust_df$default<-NULL

We removed the variables which were relevant for customers who were contacted in previous campaigns

Let’s analyze the missing values in this dataset

library(VIM)
aggr_plot <- aggr(new_Cust_df, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, labels=names(df), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies

## 
##  Variables sorted by number of missings: 
##        Variable       Count
##       education 0.041082023
##         default 0.023929365
##         housing 0.023929365
##             job 0.008238900
##         marital 0.001743385
##             age 0.000000000
##            loan 0.000000000
##         contact 0.000000000
##           month 0.000000000
##     day_of_week 0.000000000
##        duration 0.000000000
##        campaign 0.000000000
##           pdays 0.000000000
##        previous 0.000000000
##        poutcome 0.000000000
##    emp.var.rate 0.000000000
##  cons.price.idx 0.000000000

Check distribution of target variable, it is observed that there is a data imbalance

counts <- table(new_Cust_df$y)
barplot(counts,col=c("darkblue","red"),legend = rownames(counts), main = "Term Deposit")

table(new_Cust_df$y)
## 
##     0     1 
## 32422  3141
prop.table(table(new_Cust_df$y))
## 
##          0          1 
## 0.91167787 0.08832213

So for this dataset we face the missing value problem as well as data imbalance problem. Missing values issue can be solved by imputation method, data imbalance issue can be solved by using Synthetic Minority over sampling technique i.e. SMOTE

We will first impute missing values using MICE

library(mice)
new_imp<-mice(new_Cust_df)
## 
##  iter imp variable
##   1   1  job  marital  education  housing  loan
##   1   2  job  marital  education  housing  loan
##   1   3  job  marital  education  housing  loan
##   1   4  job  marital  education  housing  loan
##   1   5  job  marital  education  housing  loan
##   2   1  job  marital  education  housing  loan
##   2   2  job  marital  education  housing  loan
##   2   3  job  marital  education  housing  loan
##   2   4  job  marital  education  housing  loan
##   2   5  job  marital  education  housing  loan
##   3   1  job  marital  education  housing  loan
##   3   2  job  marital  education  housing  loan
##   3   3  job  marital  education  housing  loan
##   3   4  job  marital  education  housing  loan
##   3   5  job  marital  education  housing  loan
##   4   1  job  marital  education  housing  loan
##   4   2  job  marital  education  housing  loan
##   4   3  job  marital  education  housing  loan
##   4   4  job  marital  education  housing  loan
##   4   5  job  marital  education  housing  loan
##   5   1  job  marital  education  housing  loan
##   5   2  job  marital  education  housing  loan
##   5   3  job  marital  education  housing  loan
##   5   4  job  marital  education  housing  loan
##   5   5  job  marital  education  housing  loan
new_imp_df<-complete(new_imp)

Now we have to reformat the variables of dataset to bring them back to factor format as the factor variables contains contrasts

new_imp_df$age<-as.character(new_imp_df$age)
new_imp_df$age<-as.factor(new_imp_df$age)
new_imp_df$job<-as.character(new_imp_df$job)
new_imp_df$job<-as.factor(new_imp_df$job)
new_imp_df$marital<-as.character(new_imp_df$marital)
new_imp_df$marital<-as.factor(new_imp_df$marital)
new_imp_df$education<-as.character(new_imp_df$education)
new_imp_df$education<-as.factor(new_imp_df$education)
new_imp_df$housing<-as.character(new_imp_df$housing)
new_imp_df$housing<-as.factor(new_imp_df$housing)
new_imp_df$loan<-as.character(new_imp_df$loan)
new_imp_df$loan<-as.factor(new_imp_df$loan)
new_imp_df$contact<-as.character(new_imp_df$contact)
new_imp_df$contact<-as.factor(new_imp_df$contact)
new_imp_df$month<-as.character(new_imp_df$month)
new_imp_df$month<-as.factor(new_imp_df$month)
new_imp_df$day_of_week<-as.character(new_imp_df$day_of_week)
new_imp_df$day_of_week<-as.factor(new_imp_df$day_of_week)
new_imp_df$y<-as.character(new_imp_df$y)
new_imp_df$y<-as.factor(new_imp_df$y)

We will be using SMOTE on training data so first we will split the data into training and test data

library(caret)
idx <- createDataPartition(new_imp_df$y, p = 3/4,list = FALSE)
new_train <- new_imp_df[idx,]
new_test <- new_imp_df[-idx,]

Over sampling of rare event using SMOTE

library(DMwR)
## Warning: package 'DMwR' was built under R version 3.3.3
## 
## Attaching package: 'DMwR'
## The following object is masked from 'package:VIM':
## 
##     kNN
new_train<-SMOTE(y~.,new_train,perc.over = 100, perc.under=200)
prop.table(table(new_train$y))
## 
##   0   1 
## 0.5 0.5

Logistic regression analysis

logit_model <- glm(y ~.,family=binomial(link='logit'),data = new_train)
summary(logit_model)
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = new_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -6.5279  -0.3358  -0.0394   0.4359   2.8732  
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   8.0008756 47.2820632   0.169 0.865627    
## ageSenior Citizens            0.4368669  0.1895442   2.305 0.021176 *  
## ageTeenagers                  1.0205261  0.6262595   1.630 0.103195    
## ageYoung Adults               0.1746305  0.0754227   2.315 0.020593 *  
## jobblue-collar               -0.3440475  0.1205977  -2.853 0.004333 ** 
## jobentrepreneur              -0.1056568  0.1966086  -0.537 0.590994    
## jobhousemaid                  0.2290625  0.2426252   0.944 0.345118    
## jobmanagement                -0.0831294  0.1395340  -0.596 0.551333    
## jobretired                    0.2146011  0.1858386   1.155 0.248184    
## jobself-employed              0.1773948  0.1830468   0.969 0.332484    
## jobservices                  -0.1200018  0.1373855  -0.873 0.382408    
## jobstudent                    0.8210251  0.2043680   4.017 5.88e-05 ***
## jobtechnician                 0.0534548  0.1178543   0.454 0.650140    
## jobunemployed                 0.2886646  0.2153440   1.340 0.180089    
## maritalmarried               -0.3957760  0.1119200  -3.536 0.000406 ***
## maritalsingle                -0.0939305  0.1209166  -0.777 0.437265    
## educationbasic.6y            -0.0779008  0.1904531  -0.409 0.682518    
## educationbasic.9y            -0.2078042  0.1467388  -1.416 0.156732    
## educationhigh.school         -0.0306376  0.1388705  -0.221 0.825388    
## educationilliterate           1.2633479  0.5399572   2.340 0.019298 *  
## educationprofessional.course  0.0445021  0.1579133   0.282 0.778087    
## educationuniversity.degree    0.1320489  0.1378311   0.958 0.338038    
## housingyes                   -0.1038452  0.0688662  -1.508 0.131573    
## loanyes                       0.7976346  0.0846840   9.419  < 2e-16 ***
## contacttelephone              0.2954947  0.0974147   3.033 0.002418 ** 
## monthaug                      0.6355955  0.2043351   3.111 0.001867 ** 
## monthdec                      1.5442840  0.5823417   2.652 0.008005 ** 
## monthjul                      0.0340519  0.1543085   0.221 0.825346    
## monthjun                     -0.4154444  0.1797490  -2.311 0.020819 *  
## monthmar                      1.2281230  0.2479725   4.953 7.32e-07 ***
## monthmay                     -1.1976023  0.1312862  -9.122  < 2e-16 ***
## monthnov                     -0.7756399  0.1864651  -4.160 3.19e-05 ***
## monthoct                      0.5810638  0.2709081   2.145 0.031963 *  
## monthsep                     -0.0349935  0.3098237  -0.113 0.910073    
## day_of_weekmon               -0.2247976  0.1088544  -2.065 0.038911 *  
## day_of_weekthu               -0.1743279  0.1102771  -1.581 0.113920    
## day_of_weektue               -0.2046101  0.1120808  -1.826 0.067917 .  
## day_of_weekwed               -0.0094390  0.1105859  -0.085 0.931980    
## duration                      0.0081567  0.0001839  44.363  < 2e-16 ***
## campaign                     -0.0324716  0.0185676  -1.749 0.080321 .  
## emp.var.rate                 -1.2085608  0.1819316  -6.643 3.07e-11 ***
## cons.price.idx                0.6311633  0.3021717   2.089 0.036730 *  
## cons.conf.idx                -0.0624380  0.0144312  -4.327 1.51e-05 ***
## euribor3m                     0.7152680  0.2037019   3.511 0.000446 ***
## nr.employed                  -0.0145304  0.0042958  -3.382 0.000718 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13064.4  on 9423  degrees of freedom
## Residual deviance:  5657.6  on 9379  degrees of freedom
## AIC: 5747.6
## 
## Number of Fisher Scoring iterations: 6
anova(logit_model, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: y
## 
## Terms added sequentially (first to last)
## 
## 
##                Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                            9423    13064.4              
## age             3    412.5      9420    12651.9 < 2.2e-16 ***
## job            10    223.2      9410    12428.8 < 2.2e-16 ***
## marital         2     86.0      9408    12342.8 < 2.2e-16 ***
## education       6     12.4      9402    12330.4 0.0536240 .  
## housing         1      1.7      9401    12328.6 0.1880552    
## loan            1    207.6      9400    12121.0 < 2.2e-16 ***
## contact         1     86.8      9399    12034.3 < 2.2e-16 ***
## month           9    602.2      9390    11432.1 < 2.2e-16 ***
## day_of_week     4     19.5      9386    11412.6 0.0006373 ***
## duration        1   4420.0      9385     6992.6 < 2.2e-16 ***
## campaign        1     33.6      9384     6959.0 6.746e-09 ***
## emp.var.rate    1   1163.3      9383     5795.7 < 2.2e-16 ***
## cons.price.idx  1    119.0      9382     5676.8 < 2.2e-16 ***
## cons.conf.idx   1      5.7      9381     5671.0 0.0166590 *  
## euribor3m       1      1.8      9380     5669.2 0.1757437    
## nr.employed     1     11.6      9379     5657.6 0.0006624 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From chi square values we can figure out the significance of each variable

Lets test the accuracy of this model by passing our test data to it

test_result <- predict(logit_model,newdata=new_test,type='response')
test_result <- ifelse(test_result > 0.5,1,0)

error <- mean(test_result != new_test$y)
print(paste('Accuracy',1-error))
## [1] "Accuracy 0.861417322834646"

Receiver Operating Characteristic cureve (ROC) for this model

library(ROCR)
pr <- prediction(test_result, new_test$y)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

Area under ROC curve

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8561178

Lets try random forest analysis

library(randomForest)
rf_model<-randomForest(y ~.,data = new_train, importance=TRUE, ntree=1000)
varImpPlot(rf_model)

This shows the importance of each predictor in the dataset, we can observe that duration is the most significant predictor for this model, followed by euribor3m and rest

Accuracy of this model

test_result <- predict(rf_model,new_test)
error <- mean(test_result != new_test$y)
print(paste('Accuracy',1-error))
## [1] "Accuracy 0.868503937007874"

ROC curve and area under curve for this model

pr <- prediction(as.numeric(test_result), as.numeric(new_test$y))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)

Area under curve for this model

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8749608

Lets try Decision tree model using party package

library(party)
tree_model <- ctree(y ~.,data = new_train)
plot(tree_model)