bank<-read.csv('/Users/12816/Desktop/Big data project/BankSep.csv',header = T)
summary(bank)
## age job marital education
## Min. :18.00 Length:4119 Length:4119 Length:4119
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.11
## 3rd Qu.:47.00
## Max. :88.00
## default housing loan contact
## Length:4119 Length:4119 Length:4119 Length:4119
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:4119 Length:4119 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 103.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 181.0 Median : 2.000
## Mean : 256.8 Mean : 2.537
## 3rd Qu.: 317.0 3rd Qu.: 3.000
## Max. :3643.0 Max. :35.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.0000 Length:4119 Min. :-3.40000
## 1st Qu.:999.0 1st Qu.:0.0000 Class :character 1st Qu.:-1.80000
## Median :999.0 Median :0.0000 Mode :character Median : 1.10000
## Mean :960.4 Mean :0.1903 Mean : 0.08497
## 3rd Qu.:999.0 3rd Qu.:0.0000 3rd Qu.: 1.40000
## Max. :999.0 Max. :6.0000 Max. : 1.40000
## cons.price.idx cons.conf.idx euribor3m nr.employed
## Min. :92.20 Min. :-50.8 Min. :0.635 Min. :4964
## 1st Qu.:93.08 1st Qu.:-42.7 1st Qu.:1.334 1st Qu.:5099
## Median :93.75 Median :-41.8 Median :4.857 Median :5191
## Mean :93.58 Mean :-40.5 Mean :3.621 Mean :5166
## 3rd Qu.:93.99 3rd Qu.:-36.4 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.9 Max. :5.045 Max. :5228
## y
## Length:4119
## Class :character
## Mode :character
##
##
##
train<-floor(0.75*nrow(bank))
traindata<-sample(seq_len(nrow(bank)),size=train)
maintraindata<-bank[traindata,]
maintestdata<-bank[-traindata,]
attach(maintestdata)
table(y)
## y
## no yes
## 919 111
head(maintraindata)
## age job marital education default housing loan
## 3591 53 services married high.school unknown yes no
## 607 56 retired married high.school no yes no
## 2857 31 admin. married university.degree no yes no
## 4017 42 admin. single university.degree unknown yes yes
## 3732 54 management married university.degree no yes no
## 1453 61 self-employed divorced university.degree no no no
## contact month day_of_week duration campaign pdays previous poutcome
## 3591 telephone may mon 107 1 999 0 nonexistent
## 607 telephone may mon 167 3 999 0 nonexistent
## 2857 cellular nov thu 95 1 999 0 nonexistent
## 4017 cellular jul fri 115 1 999 0 nonexistent
## 3732 cellular jul wed 281 3 999 0 nonexistent
## 1453 cellular mar fri 102 2 999 1 failure
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed y
## 3591 1.1 93.994 -36.4 4.857 5191.0 no
## 607 1.1 93.994 -36.4 4.857 5191.0 no
## 2857 -0.1 93.200 -42.0 4.076 5195.8 no
## 4017 1.4 93.918 -42.7 4.957 5228.1 no
## 3732 1.4 93.918 -42.7 4.962 5228.1 no
## 1453 -1.8 93.369 -34.8 0.649 5008.7 no
maritalstatus <- table(bank$marital)
maritalstatus
##
## divorced married single unknown
## 446 2509 1153 11
barplot(maritalstatus,beside = TRUE,main = "Marital Status", horiz = TRUE, names.arg = c("divorced","married","single","unknown"))
job <- table(bank$job)
job
##
## admin. blue-collar entrepreneur housemaid management
## 1012 884 148 110 324
## retired self-employed services student technician
## 166 159 393 82 691
## unemployed unknown
## 111 39
barplot(job,beside = TRUE,main = "Job", horiz = FALSE, names.arg = c("admin.","blue-collar","entrepreneur","housemaid","management","retired","self-employed","services","student","technician","unemployed","unknown"),cex.axis=0.45, cex.names=0.45)
defaultstatus <- table(bank$default)
defaultstatus
##
## no unknown yes
## 3315 803 1
barplot(defaultstatus,beside = TRUE,main = "Default", horiz = TRUE, names.arg = c("no","unknown","yes"))
loanstatus <- table(bank$loan)
loanstatus
##
## no unknown yes
## 3349 105 665
barplot(loanstatus,beside = TRUE,main = "Loan", horiz = TRUE, names.arg = c("no","unknown","yes"))
### Correlation between day of week and duration
library(plyr)
library(corrplot)
## corrplot 0.95 loaded
bank$day_of_week <- revalue(bank$day_of_week,c("mon"="1","tue"="2","wed"="3","thu"="4","fri"="5"))
dayofweek<-as.numeric(bank$day_of_week)
names(dayofweek)<-names(bank)
duration<-as.numeric(bank$duration)
names(duration)<-names(bank)
res<-cor.test(dayofweek,duration,method="pearson")
res
##
## Pearson's product-moment correlation
##
## data: dayofweek and duration
## t = 0.82141, df = 4117, p-value = 0.4115
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.01774674 0.04332414
## sample estimates:
## cor
## 0.01280064
library(tree)
set.seed(1)
maintraindata$y = as.factor(maintraindata$y)
summary(maintraindata)
## age job marital education
## Min. :18.00 Length:3089 Length:3089 Length:3089
## 1st Qu.:32.00 Class :character Class :character Class :character
## Median :38.00 Mode :character Mode :character Mode :character
## Mean :40.19
## 3rd Qu.:47.00
## Max. :86.00
## default housing loan contact
## Length:3089 Length:3089 Length:3089 Length:3089
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## month day_of_week duration campaign
## Length:3089 Length:3089 Min. : 0.0 Min. : 1.000
## Class :character Class :character 1st Qu.: 103.0 1st Qu.: 1.000
## Mode :character Mode :character Median : 181.0 Median : 2.000
## Mean : 255.5 Mean : 2.532
## 3rd Qu.: 316.0 3rd Qu.: 3.000
## Max. :3643.0 Max. :35.000
## pdays previous poutcome emp.var.rate
## Min. : 0.0 Min. :0.000 Length:3089 Min. :-3.4000
## 1st Qu.:999.0 1st Qu.:0.000 Class :character 1st Qu.:-1.8000
## Median :999.0 Median :0.000 Mode :character Median : 1.1000
## Mean :961.4 Mean :0.192 Mean : 0.1036
## 3rd Qu.:999.0 3rd Qu.:0.000 3rd Qu.: 1.4000
## Max. :999.0 Max. :6.000 Max. : 1.4000
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## Min. :92.20 Min. :-50.80 Min. :0.635 Min. :4964 no :2749
## 1st Qu.:93.08 1st Qu.:-42.70 1st Qu.:1.344 1st Qu.:5099 yes: 340
## Median :93.88 Median :-41.80 Median :4.857 Median :5191
## Mean :93.59 Mean :-40.43 Mean :3.646 Mean :5167
## 3rd Qu.:93.99 3rd Qu.:-36.40 3rd Qu.:4.961 3rd Qu.:5228
## Max. :94.77 Max. :-26.90 Max. :5.045 Max. :5228
fit.cart = tree(y~., maintraindata)
## Warning in tree(y ~ ., maintraindata): NAs introduced by coercion
summary(fit.cart)
##
## Classification tree:
## tree(formula = y ~ ., data = maintraindata)
## Variables actually used in tree construction:
## [1] "nr.employed" "duration" "cons.conf.idx" "euribor3m"
## Number of terminal nodes: 9
## Residual mean deviance: 0.3819 = 1176 / 3080
## Misclassification error rate: 0.09032 = 279 / 3089
plot(fit.cart)
text(fit.cart)
#train data
fit.p.cart = predict(fit.cart, newdata = maintraindata)
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
head(fit.p.cart)
## no yes
## 3591 0.9966796 0.003320421
## 607 0.9966796 0.003320421
## 2857 0.9966796 0.003320421
## 4017 0.9966796 0.003320421
## 3732 0.9619565 0.038043478
## 1453 0.8842975 0.115702479
fit.y.cart = fit.p.cart[,2] > 0.5
head(fit.y.cart)
## 3591 607 2857 4017 3732 1453
## FALSE FALSE FALSE FALSE FALSE FALSE
confusion.table.cart = table(fit.y.cart, maintraindata$y)
confusion.table.cart
##
## fit.y.cart no yes
## FALSE 2644 174
## TRUE 105 166
sum(diag(confusion.table.cart))/nrow(maintraindata)
## [1] 0.9096795
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
fit.roc.cart = roc(maintraindata$y, fit.p.cart[,2])
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc(fit.roc.cart)
## Area under the curve: 0.9293
fit.p.cart = predict(fit.cart, newdata = maintestdata)
## Warning in pred1.tree(object, tree.matrix(newdata)): NAs introduced by coercion
head(fit.p.cart)
## no yes
## 5 0.9966796 0.003320421
## 10 0.9966796 0.003320421
## 19 0.9966796 0.003320421
## 25 0.9619565 0.038043478
## 26 0.8000000 0.200000000
## 35 0.9966796 0.003320421
fit.y.cart = fit.p.cart[,2] > 0.5
head(fit.y.cart)
## 5 10 19 25 26 35
## FALSE FALSE FALSE FALSE FALSE FALSE
confusion.table.cart = table(fit.y.cart, maintestdata$y)
confusion.table.cart
##
## fit.y.cart no yes
## FALSE 884 63
## TRUE 35 48
sum(diag(confusion.table.cart))/nrow(maintestdata)
## [1] 0.9048544
fit.roc.cart = roc(maintestdata$y, fit.p.cart[,2])
## Setting levels: control = no, case = yes
## Setting direction: controls < cases
auc(fit.roc.cart)
## Area under the curve: 0.9315
set.seed(1)
cv.bank = cv.tree(fit.cart, FUN = prune.misclass)
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
## Warning in tree(model = m[rand != i, , drop = FALSE]): NAs introduced by
## coercion
## Warning in pred1.tree(tree, tree.matrix(nd)): NAs introduced by coercion
names(cv.bank)
## [1] "size" "dev" "k" "method"
#cv.bank
#par(mfrow = c(1,2))
plot(cv.bank$size, cv.bank$dev / nrow(bank), type = "b", xlab = 'tree size', ylab = 'cv misclass rate')
prune.bank = prune.misclass(fit.cart, best = 3)
plot(prune.bank)
text(prune.bank, pretty = 0)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
rf.bank = randomForest(formula = y ~., data = maintraindata, mtry = 4, importance = TRUE, na.action = na.exclude)
rf.bank
##
## Call:
## randomForest(formula = y ~ ., data = maintraindata, mtry = 4, importance = TRUE, na.action = na.exclude)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 9.1%
## Confusion matrix:
## no yes class.error
## no 2668 81 0.02946526
## yes 200 140 0.58823529
varImpPlot(rf.bank)