bank<-read.csv('/Users/12816/Desktop/Big data project/BankSep.csv',header = T)
summary(bank)
##       age            job              marital           education        
##  Min.   :18.00   Length:4119        Length:4119        Length:4119       
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.11                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :88.00                                                           
##    default            housing              loan             contact         
##  Length:4119        Length:4119        Length:4119        Length:4119       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:4119        Length:4119        Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 103.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 181.0   Median : 2.000  
##                                        Mean   : 256.8   Mean   : 2.537  
##                                        3rd Qu.: 317.0   3rd Qu.: 3.000  
##                                        Max.   :3643.0   Max.   :35.000  
##      pdays          previous        poutcome          emp.var.rate     
##  Min.   :  0.0   Min.   :0.0000   Length:4119        Min.   :-3.40000  
##  1st Qu.:999.0   1st Qu.:0.0000   Class :character   1st Qu.:-1.80000  
##  Median :999.0   Median :0.0000   Mode  :character   Median : 1.10000  
##  Mean   :960.4   Mean   :0.1903                      Mean   : 0.08497  
##  3rd Qu.:999.0   3rd Qu.:0.0000                      3rd Qu.: 1.40000  
##  Max.   :999.0   Max.   :6.0000                      Max.   : 1.40000  
##  cons.price.idx  cons.conf.idx     euribor3m      nr.employed  
##  Min.   :92.20   Min.   :-50.8   Min.   :0.635   Min.   :4964  
##  1st Qu.:93.08   1st Qu.:-42.7   1st Qu.:1.334   1st Qu.:5099  
##  Median :93.75   Median :-41.8   Median :4.857   Median :5191  
##  Mean   :93.58   Mean   :-40.5   Mean   :3.621   Mean   :5166  
##  3rd Qu.:93.99   3rd Qu.:-36.4   3rd Qu.:4.961   3rd Qu.:5228  
##  Max.   :94.77   Max.   :-26.9   Max.   :5.045   Max.   :5228  
##       y            
##  Length:4119       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Splitting into 75% training data, 25% testing data

train<-floor(0.75*nrow(bank))
traindata<-sample(seq_len(nrow(bank)),size=train)
maintraindata<-bank[traindata,]
maintestdata<-bank[-traindata,]
attach(maintestdata)
table(y)
## y
##  no yes 
## 919 111
head(maintraindata)
##      age           job  marital         education default housing loan
## 3591  53      services  married       high.school unknown     yes   no
## 607   56       retired  married       high.school      no     yes   no
## 2857  31        admin.  married university.degree      no     yes   no
## 4017  42        admin.   single university.degree unknown     yes  yes
## 3732  54    management  married university.degree      no     yes   no
## 1453  61 self-employed divorced university.degree      no      no   no
##        contact month day_of_week duration campaign pdays previous    poutcome
## 3591 telephone   may         mon      107        1   999        0 nonexistent
## 607  telephone   may         mon      167        3   999        0 nonexistent
## 2857  cellular   nov         thu       95        1   999        0 nonexistent
## 4017  cellular   jul         fri      115        1   999        0 nonexistent
## 3732  cellular   jul         wed      281        3   999        0 nonexistent
## 1453  cellular   mar         fri      102        2   999        1     failure
##      emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed  y
## 3591          1.1         93.994         -36.4     4.857      5191.0 no
## 607           1.1         93.994         -36.4     4.857      5191.0 no
## 2857         -0.1         93.200         -42.0     4.076      5195.8 no
## 4017          1.4         93.918         -42.7     4.957      5228.1 no
## 3732          1.4         93.918         -42.7     4.962      5228.1 no
## 1453         -1.8         93.369         -34.8     0.649      5008.7 no

Visualize data

maritalstatus <- table(bank$marital)
maritalstatus
## 
## divorced  married   single  unknown 
##      446     2509     1153       11
barplot(maritalstatus,beside = TRUE,main = "Marital Status", horiz = TRUE, names.arg = c("divorced","married","single","unknown"))

job <- table(bank$job)
job
## 
##        admin.   blue-collar  entrepreneur     housemaid    management 
##          1012           884           148           110           324 
##       retired self-employed      services       student    technician 
##           166           159           393            82           691 
##    unemployed       unknown 
##           111            39
barplot(job,beside = TRUE,main = "Job", horiz = FALSE, names.arg = c("admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown"),cex.axis=0.45, cex.names=0.45)

defaultstatus <- table(bank$default)
defaultstatus
## 
##      no unknown     yes 
##    3315     803       1
barplot(defaultstatus,beside = TRUE,main = "Default", horiz = TRUE, names.arg = c("no","unknown","yes"))

loanstatus <- table(bank$loan)
loanstatus
## 
##      no unknown     yes 
##    3349     105     665
barplot(loanstatus,beside = TRUE,main = "Loan", horiz = TRUE, names.arg = c("no","unknown","yes"))

### Correlation between day of week and duration

library(plyr)
library(corrplot)
## corrplot 0.95 loaded
bank$day_of_week <- revalue(bank$day_of_week,c("mon"="1","tue"="2","wed"="3","thu"="4","fri"="5"))
dayofweek<-as.numeric(bank$day_of_week)
names(dayofweek)<-names(bank)
duration<-as.numeric(bank$duration)
names(duration)<-names(bank)
res<-cor.test(dayofweek,duration,method="pearson")
res
## 
##  Pearson's product-moment correlation
## 
## data:  dayofweek and duration
## t = 0.82141, df = 4117, p-value = 0.4115
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01774674  0.04332414
## sample estimates:
##        cor 
## 0.01280064

The p-value of the test is 0.4115 which is higher than the significant level alpha = 0.05. We could not conclude that day of week and duration of the calls significantly correlated with a correlation of 0.01280064.

change yes and no to 0 and 1 values

Subscribe = rep(0, nrow(bank))
Subscribe[bank$y == 'yes'] = 1

Fit a cart model on the training data to predict y (if client will subscribe) and plot.

library(tree)
set.seed(1)
maintraindata$y = as.factor(maintraindata$y)
summary(maintraindata)
##       age            job              marital           education        
##  Min.   :18.00   Length:3089        Length:3089        Length:3089       
##  1st Qu.:32.00   Class :character   Class :character   Class :character  
##  Median :38.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :40.19                                                           
##  3rd Qu.:47.00                                                           
##  Max.   :86.00                                                           
##    default            housing              loan             contact         
##  Length:3089        Length:3089        Length:3089        Length:3089       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     month           day_of_week           duration         campaign     
##  Length:3089        Length:3089        Min.   :   0.0   Min.   : 1.000  
##  Class :character   Class :character   1st Qu.: 103.0   1st Qu.: 1.000  
##  Mode  :character   Mode  :character   Median : 181.0   Median : 2.000  
##                                        Mean   : 255.5   Mean   : 2.532  
##                                        3rd Qu.: 316.0   3rd Qu.: 3.000  
##                                        Max.   :3643.0   Max.   :35.000  
##      pdays          previous       poutcome          emp.var.rate    
##  Min.   :  0.0   Min.   :0.000   Length:3089        Min.   :-3.4000  
##  1st Qu.:999.0   1st Qu.:0.000   Class :character   1st Qu.:-1.8000  
##  Median :999.0   Median :0.000   Mode  :character   Median : 1.1000  
##  Mean   :961.4   Mean   :0.192                      Mean   : 0.1036  
##  3rd Qu.:999.0   3rd Qu.:0.000                      3rd Qu.: 1.4000  
##  Max.   :999.0   Max.   :6.000                      Max.   : 1.4000  
##  cons.price.idx  cons.conf.idx      euribor3m      nr.employed     y       
##  Min.   :92.20   Min.   :-50.80   Min.   :0.635   Min.   :4964   no :2749  
##  1st Qu.:93.08   1st Qu.:-42.70   1st Qu.:1.344   1st Qu.:5099   yes: 340  
##  Median :93.88   Median :-41.80   Median :4.857   Median :5191             
##  Mean   :93.59   Mean   :-40.43   Mean   :3.646   Mean   :5167             
##  3rd Qu.:93.99   3rd Qu.:-36.40   3rd Qu.:4.961   3rd Qu.:5228             
##  Max.   :94.77   Max.   :-26.90   Max.   :5.045   Max.   :5228
fit.cart = tree(y~., maintraindata)
## Warning in tree(y ~ ., maintraindata): NAs introduced by coercion
summary(fit.cart)
## 
## Classification tree:
## tree(formula = y ~ ., data = maintraindata)
## Variables actually used in tree construction:
## [1] "nr.employed"   "duration"      "cons.conf.idx" "euribor3m"    
## Number of terminal nodes:  9 
## Residual mean deviance:  0.3819 = 1176 / 3080 
## Misclassification error rate: 0.09032 = 279 / 3089
plot(fit.cart)
text(fit.cart)

Accuracy of training data

#train data
fit.p.cart = predict(fit.cart, newdata = maintraindata)
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
head(fit.p.cart)
##             no         yes
## 3591 0.9966796 0.003320421
## 607  0.9966796 0.003320421
## 2857 0.9966796 0.003320421
## 4017 0.9966796 0.003320421
## 3732 0.9619565 0.038043478
## 1453 0.8842975 0.115702479
fit.y.cart = fit.p.cart[,2] > 0.5
head(fit.y.cart)
##  3591   607  2857  4017  3732  1453 
## FALSE FALSE FALSE FALSE FALSE FALSE
confusion.table.cart = table(fit.y.cart, maintraindata$y)
confusion.table.cart
##           
## fit.y.cart   no  yes
##      FALSE 2644  174
##      TRUE   105  166
sum(diag(confusion.table.cart))/nrow(maintraindata)
## [1] 0.9096795
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
fit.roc.cart = roc(maintraindata$y, fit.p.cart[,2])
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc(fit.roc.cart)
## Area under the curve: 0.9293

Accuracy of test data

fit.p.cart = predict(fit.cart, newdata = maintestdata)
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
head(fit.p.cart)
##           no         yes
## 5  0.9966796 0.003320421
## 10 0.9966796 0.003320421
## 19 0.9966796 0.003320421
## 25 0.9619565 0.038043478
## 26 0.8000000 0.200000000
## 35 0.9966796 0.003320421
fit.y.cart = fit.p.cart[,2] > 0.5
head(fit.y.cart)
##     5    10    19    25    26    35 
## FALSE FALSE FALSE FALSE FALSE FALSE
confusion.table.cart = table(fit.y.cart, maintestdata$y)
confusion.table.cart
##           
## fit.y.cart  no yes
##      FALSE 884  63
##      TRUE   35  48
sum(diag(confusion.table.cart))/nrow(maintestdata)
## [1] 0.9048544
fit.roc.cart = roc(maintestdata$y, fit.p.cart[,2])
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc(fit.roc.cart)
## Area under the curve: 0.9315

Prune the tree

set.seed(1)
cv.bank = cv.tree(fit.cart, FUN = prune.misclass)
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
names(cv.bank)
## [1] "size"   "dev"    "k"      "method"
#cv.bank
#par(mfrow = c(1,2))
plot(cv.bank$size, cv.bank$dev / nrow(bank), type = "b", xlab = 'tree size', ylab = 'cv misclass rate')

prune.bank = prune.misclass(fit.cart, best = 3)
plot(prune.bank)
text(prune.bank, pretty = 0)

Random forest

library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
rf.bank = randomForest(formula = y ~., data = maintraindata, mtry = 4, importance = TRUE, na.action = na.exclude)
rf.bank
## 
## Call:
##  randomForest(formula = y ~ ., data = maintraindata, mtry = 4,      importance = TRUE, na.action = na.exclude) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 9.1%
## Confusion matrix:
##       no yes class.error
## no  2668  81  0.02946526
## yes  200 140  0.58823529
varImpPlot(rf.bank)