library(kableExtra)
bank <- read.csv(file = "bank-full.csv", header = TRUE, sep = ";")

# Convert the categorical inputs to factors
factor_cols <- c("job", "marital", "education", "default",
                 "housing", "loan", "contact", "poutcome")
bank[factor_cols] <- lapply(bank[factor_cols], as.factor)

# Recode the response from "yes"/"no" to 1/0 and store it as a factor
bank$y <- as.factor(ifelse(bank$y == "yes", 1, 0))
library(ROSE)
## Loaded ROSE 0.0-3
# Balance the classes with combined over- and under-sampling from ROSE
data_balanced_both <- ovun.sample(y ~ ., data = bank, method = "both", p = 0.5, N = 45211, seed = 1)$data
table(data_balanced_both$y)
## 
##     0     1 
## 22628 22583
prop.table(table(data_balanced_both$y))
## 
##         0         1 
## 0.5004977 0.4995023
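For contrast, the same tabulation on the raw bank frame shows how skewed the original response is; this is a quick check, not part of the balanced pipeline:

# Class distribution of the raw response before balancing (class 0 = "no")
prop.table(table(bank$y))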
# Install any packages that are missing, then attach the whole list
inst_pack_func <- function(list.of.packages){
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
  if(length(new.packages)) install.packages(new.packages)
  # invisible() suppresses the printed search-path list that lapply returns
  invisible(lapply(list.of.packages, function(x) library(x, character.only = TRUE)))
}

list.of.packages <- c("ggplot2","dplyr","stats4","splines","VGAM","rsample","rpart","rpart.plot","ipred","caret","MVN")
inst_pack_func(list.of.packages)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
## 
##     group_rows
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:VGAM':
## 
##     predictors
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## sROC 0.1-2 loaded
set.seed(123)
split <- initial_split(data_balanced_both, prop = .8)
train1 <- training(split)
test  <- testing(split)
nrow(train1)
## [1] 36169
nrow(test)
## [1] 9042
prop.table(table(train1$y))
## 
##         0         1 
## 0.5006221 0.4993779
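The split above is simple random sampling; because the classes were balanced beforehand, the training proportions land near 50/50 anyway. rsample can also stratify the split directly, a one-line sketch:

# Stratified 80/20 split that preserves the class ratio in both partitions
split_strat <- initial_split(data_balanced_both, prop = .8, strata = y)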

LOGISTIC REGRESSION

regression <- glm(y ~ ., data = train1, family = binomial(link = "logit"))
summary(regression)
## 
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -7.1313  -0.5893  -0.0601   0.5988   2.9451  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -7.460e-01  1.503e-01  -4.964 6.89e-07 ***
## age                -9.109e-04  1.798e-03  -0.507  0.61236    
## jobblue-collar     -3.345e-01  5.796e-02  -5.771 7.87e-09 ***
## jobentrepreneur    -3.902e-01  9.889e-02  -3.946 7.94e-05 ***
## jobhousemaid       -4.098e-01  1.049e-01  -3.909 9.29e-05 ***
## jobmanagement      -9.117e-02  6.011e-02  -1.517  0.12935    
## jobretired          4.064e-01  8.213e-02   4.949 7.47e-07 ***
## jobself-employed   -2.232e-01  9.052e-02  -2.466  0.01365 *  
## jobservices        -2.869e-01  6.739e-02  -4.258 2.07e-05 ***
## jobstudent          7.235e-01  9.822e-02   7.366 1.76e-13 ***
## jobtechnician      -1.004e-01  5.569e-02  -1.803  0.07134 .  
## jobunemployed      -8.228e-02  9.312e-02  -0.884  0.37694    
## jobunknown         -2.553e-01  1.915e-01  -1.333  0.18240    
## maritalmarried     -1.878e-01  4.783e-02  -3.927 8.62e-05 ***
## maritalsingle       1.205e-01  5.497e-02   2.192  0.02838 *  
## educationsecondary  2.280e-01  5.191e-02   4.391 1.13e-05 ***
## educationtertiary   4.179e-01  6.126e-02   6.823 8.93e-12 ***
## educationunknown    2.802e-01  8.518e-02   3.290  0.00100 ** 
## defaultyes          1.003e-01  1.218e-01   0.824  0.41003    
## balance             2.341e-05  5.007e-06   4.675 2.94e-06 ***
## housingyes         -7.052e-01  3.480e-02 -20.267  < 2e-16 ***
## loanyes            -5.427e-01  4.672e-02 -11.615  < 2e-16 ***
## contacttelephone   -4.067e-02  6.089e-02  -0.668  0.50417    
## contactunknown     -1.718e+00  5.404e-02 -31.787  < 2e-16 ***
## day                 4.870e-03  1.976e-03   2.465  0.01371 *  
## monthaug           -9.246e-01  6.292e-02 -14.696  < 2e-16 ***
## monthdec            6.884e-01  1.789e-01   3.847  0.00012 ***
## monthfeb           -1.044e-01  7.124e-02  -1.465  0.14297    
## monthjan           -1.302e+00  9.564e-02 -13.613  < 2e-16 ***
## monthjul           -1.078e+00  6.312e-02 -17.071  < 2e-16 ***
## monthjun            3.044e-01  7.397e-02   4.116 3.86e-05 ***
## monthmar            1.715e+00  1.202e-01  14.264  < 2e-16 ***
## monthmay           -6.591e-01  6.013e-02 -10.962  < 2e-16 ***
## monthnov           -1.025e+00  6.912e-02 -14.826  < 2e-16 ***
## monthoct            1.241e+00  1.022e-01  12.138  < 2e-16 ***
## monthsep            9.476e-01  1.161e-01   8.161 3.33e-16 ***
## duration            5.698e-03  7.200e-05  79.143  < 2e-16 ***
## campaign           -1.067e-01  7.754e-03 -13.760  < 2e-16 ***
## pdays              -4.373e-04  2.416e-04  -1.810  0.07027 .  
## previous            1.928e-02  8.812e-03   2.188  0.02864 *  
## poutcomeother       1.138e-01  7.449e-02   1.527  0.12665    
## poutcomesuccess     2.504e+00  8.438e-02  29.677  < 2e-16 ***
## poutcomeunknown    -2.512e-01  7.915e-02  -3.173  0.00151 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 50141  on 36168  degrees of freedom
## Residual deviance: 28827  on 36126  degrees of freedom
## AIC: 28913
## 
## Number of Fisher Scoring iterations: 6
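The estimates above are on the log-odds scale. Exponentiating turns them into odds ratios, which are easier to interpret; a small sketch using Wald intervals from confint.default:

# Odds ratios with Wald 95% confidence intervals, first few terms only
round(exp(cbind(OR = coef(regression), confint.default(regression))), 3)[1:6, ]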
test$y <- as.factor(test$y)
# Score the test set and classify at the 0.5 probability cutoff
prediction <- predict(regression, newdata = test, type = 'response')
pred <- factor(ifelse(prediction <= 0.5, 0, 1))
# Confusion matrix: predicted classes vs. true labels
result <- caret::confusionMatrix(pred, test$y)
result
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3857  813
##          1  664 3708
##                                           
##                Accuracy : 0.8367          
##                  95% CI : (0.8289, 0.8442)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6733          
##                                           
##  Mcnemar's Test P-Value : 0.0001176       
##                                           
##             Sensitivity : 0.8531          
##             Specificity : 0.8202          
##          Pos Pred Value : 0.8259          
##          Neg Pred Value : 0.8481          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4266          
##    Detection Prevalence : 0.5165          
##       Balanced Accuracy : 0.8367          
##                                           
##        'Positive' Class : 0               
## 
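The matrix above evaluates a single 0.5 cutoff. Since ROSE is already attached, its roc.curve() gives a threshold-free summary of the same predictions; a quick sketch:

# Area under the ROC curve for the logistic model on the test set
roc.curve(test$y, prediction, plotit = FALSE)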
metrics<-as.data.frame(result$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)
F1-score, Precision and Recall

                        metrics
Sensitivity              0.8531
Specificity              0.8202
Pos Pred Value           0.8259
Neg Pred Value           0.8481
Precision                0.8259
Recall                   0.8531
F1                       0.8393
Prevalence               0.5000
Detection Rate           0.4266
Detection Prevalence     0.5165
Balanced Accuracy        0.8367

DECISION TREE

library(rpart)
library(rpart.plot)
# Fit a single classification tree on the balanced training data
tree_model <- rpart(y ~ ., data = train1, method = 'class')
rpart.plot(tree_model)
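rpart grows this tree under its default complexity parameter. Whether pruning at the cross-validated optimum would change the tree can be checked with the cp table; a sketch:

# Cross-validated error at each complexity-parameter value
printcp(tree_model)
# Prune at the cp with the lowest cross-validated error (xerror)
best_cp <- tree_model$cptable[which.min(tree_model$cptable[, "xerror"]), "CP"]
tree_pruned <- prune(tree_model, cp = best_cp)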

# Construct the confusion matrix on the held-out test set
prediction2 <- predict(tree_model, newdata = test, type = 'class')

result2 <- caret::confusionMatrix(prediction2,test$y)
result2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3377  475
##          1 1144 4046
##                                           
##                Accuracy : 0.8209          
##                  95% CI : (0.8129, 0.8288)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6419          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7470          
##             Specificity : 0.8949          
##          Pos Pred Value : 0.8767          
##          Neg Pred Value : 0.7796          
##              Prevalence : 0.5000          
##          Detection Rate : 0.3735          
##    Detection Prevalence : 0.4260          
##       Balanced Accuracy : 0.8209          
##                                           
##        'Positive' Class : 0               
## 
metrics<-as.data.frame(result2$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)
F1-score, Precision and Recall

                        metrics
Sensitivity              0.7470
Specificity              0.8949
Pos Pred Value           0.8767
Neg Pred Value           0.7796
Precision                0.8767
Recall                   0.7470
F1                       0.8066
Prevalence               0.5000
Detection Rate           0.3735
Detection Prevalence     0.4260
Balanced Accuracy        0.8209
library(rpart)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
## 
## Attaching package: 'rattle'
## The following object is masked from 'package:VGAM':
## 
##     wine
library(rpart.plot)
library(RColorBrewer)

fancyRpartPlot(tree_model, uniform = TRUE, main = "Decision tree for y")

# In-sample check: confusion table on the training data
predicted <- predict(tree_model, type = "class")
table(train1$y, predicted)
##    predicted
##         0     1
##   0 13488  4619
##   1  1910 16152
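From this table the in-sample accuracy is (13488 + 16152) / 36169 ≈ 0.82, essentially the same as the 0.8209 test accuracy, so the tree shows no sign of overfitting. The same figure computed directly:

# In-sample accuracy from the training confusion table
sum(diag(table(train1$y, predicted))) / nrow(train1)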

XGBOOST

library(xgboost)
## 
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
## 
##     xgboost
## The following object is masked from 'package:dplyr':
## 
##     slice
library(kableExtra)
# Re-read the raw data: unlike the earlier models, XGBoost is fit below on the
# original, unbalanced data set
bank <- read.csv(file = "bank-full.csv", header = TRUE, sep = ";")

# Convert the categorical inputs to factors
factor_cols <- c("job", "marital", "education", "default",
                 "housing", "loan", "contact", "poutcome")
bank[factor_cols] <- lapply(bank[factor_cols], as.factor)

# Recode the response from "yes"/"no" to 1/0 and store it as a factor
bank$y <- as.factor(ifelse(bank$y == "yes", 1, 0))
library(caret)

# 80/20 split on the unbalanced data
indexes = createDataPartition(bank$y, p = .8, list = F)
train = bank[indexes, ]
test = bank[-indexes, ]

# Column 17 is the response y; data.matrix() encodes the factor inputs numerically
train_x = data.matrix(train[, -17])
train_y = train[, 17]

test_x = data.matrix(test[, -17])
test_y = test[, 17]

# train_y/test_y are factors, so the labels are coerced to their integer codes (1/2)
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgb_test = xgb.DMatrix(data = test_x, label = test_y)

# No objective is specified, so xgboost defaults to squared-error regression
# on the 1/2-coded labels; hence the train-rmse log below
xgbc = xgboost(data = xgb_train, max.depth = 3, nrounds = 50)
## [1]  train-rmse:0.528343 
## [2]  train-rmse:0.421607 
## [3]  train-rmse:0.355102 
## [4]  train-rmse:0.317497 
## [5]  train-rmse:0.296199 
## [6]  train-rmse:0.284485 
## [7]  train-rmse:0.278101 
## [8]  train-rmse:0.270568 
## [9]  train-rmse:0.268198 
## [10] train-rmse:0.266526 
## [11] train-rmse:0.265473 
## [12] train-rmse:0.263241 
## [13] train-rmse:0.262567 
## [14] train-rmse:0.262051 
## [15] train-rmse:0.261411 
## [16] train-rmse:0.260376 
## [17] train-rmse:0.260021 
## [18] train-rmse:0.258469 
## [19] train-rmse:0.257723 
## [20] train-rmse:0.257425 
## [21] train-rmse:0.257155 
## [22] train-rmse:0.256120 
## [23] train-rmse:0.255367 
## [24] train-rmse:0.255176 
## [25] train-rmse:0.255055 
## [26] train-rmse:0.254855 
## [27] train-rmse:0.254251 
## [28] train-rmse:0.254096 
## [29] train-rmse:0.253733 
## [30] train-rmse:0.253601 
## [31] train-rmse:0.253427 
## [32] train-rmse:0.253130 
## [33] train-rmse:0.253084 
## [34] train-rmse:0.252246 
## [35] train-rmse:0.251716 
## [36] train-rmse:0.251556 
## [37] train-rmse:0.250770 
## [38] train-rmse:0.250254 
## [39] train-rmse:0.250200 
## [40] train-rmse:0.250169 
## [41] train-rmse:0.249942 
## [42] train-rmse:0.249733 
## [43] train-rmse:0.249190 
## [44] train-rmse:0.249003 
## [45] train-rmse:0.248928 
## [46] train-rmse:0.248828 
## [47] train-rmse:0.248693 
## [48] train-rmse:0.248509 
## [49] train-rmse:0.248467 
## [50] train-rmse:0.248442
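The train-rmse log confirms that, with no objective specified, xgboost fit a squared-error regression to the 1/2-coded labels. A cleaner formulation (a sketch, not the model evaluated below) trains on numeric 0/1 labels with the binary logistic objective so that predictions come back as probabilities:

# Sketch: binary classification with explicit 0/1 labels
train_lab <- as.numeric(as.character(train_y))   # factor "0"/"1" -> numeric 0/1
test_lab <- as.numeric(as.character(test_y))
dtrain <- xgb.DMatrix(data = train_x, label = train_lab)
dtest <- xgb.DMatrix(data = test_x, label = test_lab)
fit <- xgboost(data = dtrain, max.depth = 3, nrounds = 50,
               objective = "binary:logistic", verbose = 0)
prob <- predict(fit, dtest)   # predicted P(y = 1)
pred_class <- factor(ifelse(prob > 0.5, 1, 0), levels = c(0, 1))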
pred = predict(xgbc, xgb_test)
# The regression scores cluster near the coded labels 1 ("0") and 2 ("1");
# round back to the nearest code and clamp to the valid index range 1..2
pred = pmin(pmax(round(pred), 1), 2)
pred_y = as.factor(levels(test_y)[pred])
# Note the argument order: caret's convention is confusionMatrix(data = predictions,
# reference = truth); with (test_y, pred_y) the sensitivity/PPV roles are swapped
cm = confusionMatrix(test_y, pred_y)
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 7784  200
##          1  644  413
##                                           
##                Accuracy : 0.9066          
##                  95% CI : (0.9005, 0.9126)
##     No Information Rate : 0.9322          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4472          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.9236          
##             Specificity : 0.6737          
##          Pos Pred Value : 0.9749          
##          Neg Pred Value : 0.3907          
##              Prevalence : 0.9322          
##          Detection Rate : 0.8610          
##    Detection Prevalence : 0.8831          
##       Balanced Accuracy : 0.7987          
##                                           
##        'Positive' Class : 0               
## 
metrics<-as.data.frame(cm$byClass)
colnames(metrics)<-"metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics,4), caption = "F1-score, Precision and Recall ") %>%
  kable_styling(font_size = 16)
F1-score, Precision and Recall

                        metrics
Sensitivity              0.9236
Specificity              0.6737
Pos Pred Value           0.9749
Neg Pred Value           0.3907
Precision                0.9749
Recall                   0.9236
F1                       0.9486
Prevalence               0.9322
Detection Rate           0.8610
Detection Prevalence     0.8831
Balanced Accuracy        0.7987
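To see which inputs the boosted model leans on, xgb.importance() reports per-feature gain; a quick sketch on the fitted booster:

# Gain-based feature importance for the fitted model
importance <- xgb.importance(feature_names = colnames(train_x), model = xgbc)
head(importance)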
# Precision (Pos Pred Value) for the three models, taken from the tables above;
# note that the XGBoost figure comes from a model fit on the unbalanced data,
# so the three values are not measured under identical conditions
precision <- c(0.825, 0.876, 0.975)
models <- c("Logistic Regression", "Decision tree", "XGBOOST")
v1 <- data.frame(precision, models)
library(ggplot2)
ggplot(v1, aes(x = models, y = precision)) +
  geom_bar(stat = "identity", fill = "gold") +
  coord_flip() +
  ggtitle("Precision") +
  geom_text(aes(label = precision), vjust = 0, hjust = 1.2) +
  labs(x = "models", y = "Precision")