library(kableExtra)
bank <- read.csv(file = "bank-full.csv", header = TRUE, sep = ";")
# Convert the categorical predictors to factors
bank$job       <- as.factor(bank$job)
bank$marital   <- as.factor(bank$marital)
bank$education <- as.factor(bank$education)
bank$default   <- as.factor(bank$default)
bank$housing   <- as.factor(bank$housing)
bank$loan      <- as.factor(bank$loan)
bank$contact   <- as.factor(bank$contact)
bank$poutcome  <- as.factor(bank$poutcome)
# Recode the response: "yes" -> 1, "no" -> 0, then store it as a factor
bank$y <- as.factor(ifelse(bank$y == "yes", 1, 0))
library(ROSE)
## Loaded ROSE 0.0-3
data_balanced_both <- ovun.sample(y ~ ., data = bank, method = "both", p=0.5, N=45211, seed = 1)$data
table(data_balanced_both$y)
##
## 0 1
## 22628 22583
prop.table(table(data_balanced_both$y))
##
## 0 1
## 0.5004977 0.4995023
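The "both" method mixes over-sampling of the minority class with under-sampling of the majority class. For comparison, a minimal sketch of the pure alternatives (the counts assume bank-full's usual 39922 "no" / 5289 "yes" split):

data_over  <- ovun.sample(y ~ ., data = bank, method = "over",  N = 2 * 39922, seed = 1)$data
data_under <- ovun.sample(y ~ ., data = bank, method = "under", N = 2 * 5289,  seed = 1)$data
table(data_over$y); table(data_under$y)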
# Install any packages that are missing, then attach them all
inst_pack_func <- function(list.of.packages){
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
  if(length(new.packages)) install.packages(new.packages)
  lapply(list.of.packages, function(x){ library(x, character.only = TRUE) })
}
list.of.packages <- c("ggplot2","dplyr","stats4","splines","VGAM","rsample","rpart","rpart.plot","ipred","caret","MVN")
inst_pack_func(list.of.packages)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:VGAM':
##
## predictors
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
## [[1]]
## [1] "ggplot2" "ROSE" "kableExtra" "stats" "graphics"
## [6] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "dplyr" "ggplot2" "ROSE" "kableExtra" "stats"
## [6] "graphics" "grDevices" "utils" "datasets" "methods"
## [11] "base"
##
## [[3]]
## [1] "stats4" "dplyr" "ggplot2" "ROSE" "kableExtra"
## [6] "stats" "graphics" "grDevices" "utils" "datasets"
## [11] "methods" "base"
##
## [[4]]
## [1] "splines" "stats4" "dplyr" "ggplot2" "ROSE"
## [6] "kableExtra" "stats" "graphics" "grDevices" "utils"
## [11] "datasets" "methods" "base"
##
## [[5]]
## [1] "VGAM" "splines" "stats4" "dplyr" "ggplot2"
## [6] "ROSE" "kableExtra" "stats" "graphics" "grDevices"
## [11] "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "rsample" "VGAM" "splines" "stats4" "dplyr"
## [6] "ggplot2" "ROSE" "kableExtra" "stats" "graphics"
## [11] "grDevices" "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "rpart" "rsample" "VGAM" "splines" "stats4"
## [6] "dplyr" "ggplot2" "ROSE" "kableExtra" "stats"
## [11] "graphics" "grDevices" "utils" "datasets" "methods"
## [16] "base"
##
## [[8]]
## [1] "rpart.plot" "rpart" "rsample" "VGAM" "splines"
## [6] "stats4" "dplyr" "ggplot2" "ROSE" "kableExtra"
## [11] "stats" "graphics" "grDevices" "utils" "datasets"
## [16] "methods" "base"
##
## [[9]]
## [1] "ipred" "rpart.plot" "rpart" "rsample" "VGAM"
## [6] "splines" "stats4" "dplyr" "ggplot2" "ROSE"
## [11] "kableExtra" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[10]]
## [1] "caret" "lattice" "ipred" "rpart.plot" "rpart"
## [6] "rsample" "VGAM" "splines" "stats4" "dplyr"
## [11] "ggplot2" "ROSE" "kableExtra" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "MVN" "caret" "lattice" "ipred" "rpart.plot"
## [6] "rpart" "rsample" "VGAM" "splines" "stats4"
## [11] "dplyr" "ggplot2" "ROSE" "kableExtra" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
set.seed(123)
split <- initial_split(data_balanced_both, prop = .8)
train1 <- training(split)
test <- testing(split)
nrow(train1)
## [1] 36169
nrow(test)
## [1] 9042
prop.table(table(train1$y))
##
## 0 1
## 0.5006221 0.4993779
LOGISTIC REGRESSION
regression<- glm(y~., data = train1, family = binomial(link = "logit"))
summary(regression)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.1313 -0.5893 -0.0601 0.5988 2.9451
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.460e-01 1.503e-01 -4.964 6.89e-07 ***
## age -9.109e-04 1.798e-03 -0.507 0.61236
## jobblue-collar -3.345e-01 5.796e-02 -5.771 7.87e-09 ***
## jobentrepreneur -3.902e-01 9.889e-02 -3.946 7.94e-05 ***
## jobhousemaid -4.098e-01 1.049e-01 -3.909 9.29e-05 ***
## jobmanagement -9.117e-02 6.011e-02 -1.517 0.12935
## jobretired 4.064e-01 8.213e-02 4.949 7.47e-07 ***
## jobself-employed -2.232e-01 9.052e-02 -2.466 0.01365 *
## jobservices -2.869e-01 6.739e-02 -4.258 2.07e-05 ***
## jobstudent 7.235e-01 9.822e-02 7.366 1.76e-13 ***
## jobtechnician -1.004e-01 5.569e-02 -1.803 0.07134 .
## jobunemployed -8.228e-02 9.312e-02 -0.884 0.37694
## jobunknown -2.553e-01 1.915e-01 -1.333 0.18240
## maritalmarried -1.878e-01 4.783e-02 -3.927 8.62e-05 ***
## maritalsingle 1.205e-01 5.497e-02 2.192 0.02838 *
## educationsecondary 2.280e-01 5.191e-02 4.391 1.13e-05 ***
## educationtertiary 4.179e-01 6.126e-02 6.823 8.93e-12 ***
## educationunknown 2.802e-01 8.518e-02 3.290 0.00100 **
## defaultyes 1.003e-01 1.218e-01 0.824 0.41003
## balance 2.341e-05 5.007e-06 4.675 2.94e-06 ***
## housingyes -7.052e-01 3.480e-02 -20.267 < 2e-16 ***
## loanyes -5.427e-01 4.672e-02 -11.615 < 2e-16 ***
## contacttelephone -4.067e-02 6.089e-02 -0.668 0.50417
## contactunknown -1.718e+00 5.404e-02 -31.787 < 2e-16 ***
## day 4.870e-03 1.976e-03 2.465 0.01371 *
## monthaug -9.246e-01 6.292e-02 -14.696 < 2e-16 ***
## monthdec 6.884e-01 1.789e-01 3.847 0.00012 ***
## monthfeb -1.044e-01 7.124e-02 -1.465 0.14297
## monthjan -1.302e+00 9.564e-02 -13.613 < 2e-16 ***
## monthjul -1.078e+00 6.312e-02 -17.071 < 2e-16 ***
## monthjun 3.044e-01 7.397e-02 4.116 3.86e-05 ***
## monthmar 1.715e+00 1.202e-01 14.264 < 2e-16 ***
## monthmay -6.591e-01 6.013e-02 -10.962 < 2e-16 ***
## monthnov -1.025e+00 6.912e-02 -14.826 < 2e-16 ***
## monthoct 1.241e+00 1.022e-01 12.138 < 2e-16 ***
## monthsep 9.476e-01 1.161e-01 8.161 3.33e-16 ***
## duration 5.698e-03 7.200e-05 79.143 < 2e-16 ***
## campaign -1.067e-01 7.754e-03 -13.760 < 2e-16 ***
## pdays -4.373e-04 2.416e-04 -1.810 0.07027 .
## previous 1.928e-02 8.812e-03 2.188 0.02864 *
## poutcomeother 1.138e-01 7.449e-02 1.527 0.12665
## poutcomesuccess 2.504e+00 8.438e-02 29.677 < 2e-16 ***
## poutcomeunknown -2.512e-01 7.915e-02 -3.173 0.00151 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50141 on 36168 degrees of freedom
## Residual deviance: 28827 on 36126 degrees of freedom
## AIC: 28913
##
## Number of Fisher Scoring iterations: 6
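Since the coefficients are on the log-odds scale, exponentiating them gives odds ratios; a minimal sketch:

# Odds ratios with Wald 95% confidence intervals
odds_ratios <- exp(cbind(OR = coef(regression), confint.default(regression)))
round(odds_ratios["housingyes", ], 3)  # exp(-0.7052) ~ 0.49: a housing loan roughly halves the odds of subscribing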
test$y <- as.factor(test$y)
# Construct the confusion matrix using a 0.5 probability cutoff
prediction <- predict(regression, newdata = test, type = 'response')
pred <- factor(ifelse(prediction <= 0.5, 0, 1))
result <- caret::confusionMatrix(pred, test$y)
result
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3857 813
## 1 664 3708
##
## Accuracy : 0.8367
## 95% CI : (0.8289, 0.8442)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6733
##
## Mcnemar's Test P-Value : 0.0001176
##
## Sensitivity : 0.8531
## Specificity : 0.8202
## Pos Pred Value : 0.8259
## Neg Pred Value : 0.8481
## Prevalence : 0.5000
## Detection Rate : 0.4266
## Detection Prevalence : 0.5165
## Balanced Accuracy : 0.8367
##
## 'Positive' Class : 0
##
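The reported metrics can be verified by hand from the confusion matrix counts (note the 'positive' class is 0):

tp <- 3857; fn <- 664; fp <- 813; tn <- 3708
c(sensitivity = tp / (tp + fn),  # 3857/4521 = 0.8531
  specificity = tn / (tn + fp),  # 3708/4521 = 0.8202
  precision   = tp / (tp + fp))  # 3857/4670 = 0.8259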
metrics <- as.data.frame(result$byClass)
colnames(metrics) <- "metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics, 4), caption = "F1-score, Precision and Recall") %>%
  kable_styling(font_size = 16)
Table: F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.8531 |
| Specificity          |  0.8202 |
| Pos Pred Value       |  0.8259 |
| Neg Pred Value       |  0.8481 |
| Precision            |  0.8259 |
| Recall               |  0.8531 |
| F1                   |  0.8393 |
| Prevalence           |  0.5000 |
| Detection Rate       |  0.4266 |
| Detection Prevalence |  0.5165 |
| Balanced Accuracy    |  0.8367 |
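F1 in the table is the harmonic mean of Precision and Recall, e.g. for the logistic model:

2 * 0.8259 * 0.8531 / (0.8259 + 0.8531)  # = 0.8393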
DECISION TREE
library(rpart)
library(rpart.plot)
# Fit a single classification tree (named 'ensemble' here, though it is one tree)
ensemble <- rpart(y ~ ., data = train1, method = 'class')
rpart.plot(ensemble)
[Figure: rpart.plot rendering of the fitted classification tree]
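The tree above uses rpart's default complexity parameter (cp = 0.01). A minimal sketch of checking the cross-validated error and pruning, should a smaller tree be preferred:

printcp(ensemble)  # cp table with cross-validated error (xerror)
best_cp <- ensemble$cptable[which.min(ensemble$cptable[, "xerror"]), "CP"]
pruned <- prune(ensemble, cp = best_cp)
rpart.plot(pruned)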
#Construct the Confusion Matrix
prediction2 <- predict(ensemble, newdata = test, type = 'class')
result2 <- caret::confusionMatrix(prediction2,test$y)
result2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3377 475
## 1 1144 4046
##
## Accuracy : 0.8209
## 95% CI : (0.8129, 0.8288)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6419
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7470
## Specificity : 0.8949
## Pos Pred Value : 0.8767
## Neg Pred Value : 0.7796
## Prevalence : 0.5000
## Detection Rate : 0.3735
## Detection Prevalence : 0.4260
## Balanced Accuracy : 0.8209
##
## 'Positive' Class : 0
##
metrics <- as.data.frame(result2$byClass)
colnames(metrics) <- "metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics, 4), caption = "F1-score, Precision and Recall") %>%
  kable_styling(font_size = 16)
Table: F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.7470 |
| Specificity          |  0.8949 |
| Pos Pred Value       |  0.8767 |
| Neg Pred Value       |  0.7796 |
| Precision            |  0.8767 |
| Recall               |  0.7470 |
| F1                   |  0.8066 |
| Prevalence           |  0.5000 |
| Detection Rate       |  0.3735 |
| Detection Prevalence |  0.4260 |
| Balanced Accuracy    |  0.8209 |
library(rpart)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:VGAM':
##
## wine
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(ensemble, uniform=TRUE, main="y Tree")
[Figure: fancyRpartPlot rendering of the fitted tree]
# In-sample predictions on the training data
predicted <- predict(ensemble, type = "class")
table(train1$y, predicted)
## predicted
## 0 1
## 0 13488 4619
## 1 1910 16152
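The diagonal of this table gives the in-sample accuracy, which can be computed directly:

mean(predicted == train1$y)  # (13488 + 16152) / 36169 ~ 0.82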
XGBOOST
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
##
## xgboost
## The following object is masked from 'package:dplyr':
##
## slice
library(kableExtra)
# Reload the original (imbalanced) data for the XGBoost model
bank <- read.csv(file = "bank-full.csv", header = TRUE, sep = ";")
bank$job       <- as.factor(bank$job)
bank$marital   <- as.factor(bank$marital)
bank$education <- as.factor(bank$education)
bank$default   <- as.factor(bank$default)
bank$housing   <- as.factor(bank$housing)
bank$loan      <- as.factor(bank$loan)
bank$contact   <- as.factor(bank$contact)
bank$poutcome  <- as.factor(bank$poutcome)
# Recode the response: "yes" -> 1, "no" -> 0, then store it as a factor
bank$y <- as.factor(ifelse(bank$y == "yes", 1, 0))
library(xgboost)
library(caret)
indexes = createDataPartition(bank$y, p = .8, list = F)
train = bank[indexes, ]
test = bank[-indexes, ]
train_x = data.matrix(train[, -17])  # column 17 is the response y
train_y = train[, 17]
test_x = data.matrix(test[, -17])
test_y = test[, 17]
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgb_test = xgb.DMatrix(data = test_x, label = test_y)
# No objective is specified, so xgboost defaults to regression (hence the
# train-rmse log below); the factor labels are stored as integer codes 1 and 2.
xgbc = xgboost(data = xgb_train, max.depth = 3, nrounds = 50)
## [1] train-rmse:0.528343
## [2] train-rmse:0.421607
## [3] train-rmse:0.355102
## [4] train-rmse:0.317497
## [5] train-rmse:0.296199
## [6] train-rmse:0.284485
## [7] train-rmse:0.278101
## [8] train-rmse:0.270568
## [9] train-rmse:0.268198
## [10] train-rmse:0.266526
## [11] train-rmse:0.265473
## [12] train-rmse:0.263241
## [13] train-rmse:0.262567
## [14] train-rmse:0.262051
## [15] train-rmse:0.261411
## [16] train-rmse:0.260376
## [17] train-rmse:0.260021
## [18] train-rmse:0.258469
## [19] train-rmse:0.257723
## [20] train-rmse:0.257425
## [21] train-rmse:0.257155
## [22] train-rmse:0.256120
## [23] train-rmse:0.255367
## [24] train-rmse:0.255176
## [25] train-rmse:0.255055
## [26] train-rmse:0.254855
## [27] train-rmse:0.254251
## [28] train-rmse:0.254096
## [29] train-rmse:0.253733
## [30] train-rmse:0.253601
## [31] train-rmse:0.253427
## [32] train-rmse:0.253130
## [33] train-rmse:0.253084
## [34] train-rmse:0.252246
## [35] train-rmse:0.251716
## [36] train-rmse:0.251556
## [37] train-rmse:0.250770
## [38] train-rmse:0.250254
## [39] train-rmse:0.250200
## [40] train-rmse:0.250169
## [41] train-rmse:0.249942
## [42] train-rmse:0.249733
## [43] train-rmse:0.249190
## [44] train-rmse:0.249003
## [45] train-rmse:0.248928
## [46] train-rmse:0.248828
## [47] train-rmse:0.248693
## [48] train-rmse:0.248509
## [49] train-rmse:0.248467
## [50] train-rmse:0.248442
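Because no objective was set, the model above was fit as a regression on the integer class codes. A hedged sketch of the more conventional setup, with 0/1 labels and a logistic objective:

xgb_train2 <- xgb.DMatrix(data = train_x, label = as.numeric(train_y) - 1)  # recode 1/2 -> 0/1
xgbc2 <- xgboost(data = xgb_train2, objective = "binary:logistic",
                 max.depth = 3, nrounds = 50, verbose = 0)
prob <- predict(xgbc2, xgb.DMatrix(data = test_x))  # P(y = 1)
pred_y2 <- factor(ifelse(prob > 0.5, 1, 0), levels = levels(test_y))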
pred = predict(xgbc, xgb_test)
# Predictions are continuous scores near the class codes 1 and 2;
# clamp them to [1, 2] before rounding (the original cap of 3 would
# index past the two factor levels)
pred[pred < 1] = 1
pred[pred > 2] = 2
pred_y = as.factor((levels(test_y))[round(pred)])
# Note: confusionMatrix(data, reference) expects predictions first; the truth
# is passed first here, so in the printout below Sensitivity and Pos Pred
# Value (and likewise Specificity and Neg Pred Value) are interchanged.
cm = confusionMatrix(test_y, pred_y)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7784 200
## 1 644 413
##
## Accuracy : 0.9066
## 95% CI : (0.9005, 0.9126)
## No Information Rate : 0.9322
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4472
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9236
## Specificity : 0.6737
## Pos Pred Value : 0.9749
## Neg Pred Value : 0.3907
## Prevalence : 0.9322
## Detection Rate : 0.8610
## Detection Prevalence : 0.8831
## Balanced Accuracy : 0.7987
##
## 'Positive' Class : 0
##
metrics <- as.data.frame(cm$byClass)
colnames(metrics) <- "metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics, 4), caption = "F1-score, Precision and Recall") %>%
  kable_styling(font_size = 16)
Table: F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.9236 |
| Specificity          |  0.6737 |
| Pos Pred Value       |  0.9749 |
| Neg Pred Value       |  0.3907 |
| Precision            |  0.9749 |
| Recall               |  0.9236 |
| F1                   |  0.9486 |
| Prevalence           |  0.9322 |
| Detection Rate       |  0.8610 |
| Detection Prevalence |  0.8831 |
| Balanced Accuracy    |  0.7987 |
# Precision (Pos Pred Value for class 0) taken from each model's confusion matrix
precision <- c(0.825, 0.876, 0.975)
models <- c("Logistic Regression", "Decision Tree", "XGBoost")
v1 <- data.frame(precision, models)
library(ggplot2)
ggplot(v1, aes(x = models, y = precision)) +
  geom_bar(stat = "identity", fill = "gold") +
  coord_flip() +
  ggtitle("Precision") +
  geom_text(aes(label = precision), vjust = 0, hjust = 1.2) +
  labs(x = "models", y = "Precision")
[Figure: horizontal bar chart of precision by model]