library(kableExtra)
bank <- read.csv(file = "bank-full.csv", header = TRUE, sep = ";")
# Convert the categorical predictors to factors
bank$job       <- as.factor(bank$job)
bank$marital   <- as.factor(bank$marital)
bank$education <- as.factor(bank$education)
bank$default   <- as.factor(bank$default)
bank$housing   <- as.factor(bank$housing)
bank$loan      <- as.factor(bank$loan)
bank$contact   <- as.factor(bank$contact)
bank$poutcome  <- as.factor(bank$poutcome)
# Recode the response: "yes" -> 1, "no" -> 0, then store it as a factor
bank$y <- as.factor(ifelse(bank$y == "yes", 1, 0))
library(ROSE)
## Loaded ROSE 0.0-3
data_balanced_both <- ovun.sample(y ~ ., data = bank, method = "both", p=0.5, N=45211, seed = 1)$data
table(data_balanced_both$y)
##
## 0 1
## 22628 22583
prop.table(table(data_balanced_both$y))
##
## 0 1
## 0.5004977 0.4995023
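The "both" method mixes over-sampling of the minority class with under-sampling of the majority class. For comparison, a minimal sketch of the pure alternatives (the counts assume bank-full's usual 39922 "no" / 5289 "yes" split):

data_over  <- ovun.sample(y ~ ., data = bank, method = "over",  N = 2 * 39922, seed = 1)$data
data_under <- ovun.sample(y ~ ., data = bank, method = "under", N = 2 * 5289,  seed = 1)$data
table(data_over$y); table(data_under$y)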
# Install any packages that are missing, then attach them all
inst_pack_func <- function(list.of.packages){
  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
  if(length(new.packages)) install.packages(new.packages)
  lapply(list.of.packages, function(x){ library(x, character.only = TRUE) })
}
list.of.packages <- c("ggplot2","dplyr","stats4","splines","VGAM","rsample","rpart","rpart.plot","ipred","caret","MVN")
inst_pack_func(list.of.packages)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:VGAM':
##
## predictors
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## sROC 0.1-2 loaded
## [[1]]
## [1] "ggplot2" "ROSE" "kableExtra" "stats" "graphics"
## [6] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "dplyr" "ggplot2" "ROSE" "kableExtra" "stats"
## [6] "graphics" "grDevices" "utils" "datasets" "methods"
## [11] "base"
##
## [[3]]
## [1] "stats4" "dplyr" "ggplot2" "ROSE" "kableExtra"
## [6] "stats" "graphics" "grDevices" "utils" "datasets"
## [11] "methods" "base"
##
## [[4]]
## [1] "splines" "stats4" "dplyr" "ggplot2" "ROSE"
## [6] "kableExtra" "stats" "graphics" "grDevices" "utils"
## [11] "datasets" "methods" "base"
##
## [[5]]
## [1] "VGAM" "splines" "stats4" "dplyr" "ggplot2"
## [6] "ROSE" "kableExtra" "stats" "graphics" "grDevices"
## [11] "utils" "datasets" "methods" "base"
##
## [[6]]
## [1] "rsample" "VGAM" "splines" "stats4" "dplyr"
## [6] "ggplot2" "ROSE" "kableExtra" "stats" "graphics"
## [11] "grDevices" "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "rpart" "rsample" "VGAM" "splines" "stats4"
## [6] "dplyr" "ggplot2" "ROSE" "kableExtra" "stats"
## [11] "graphics" "grDevices" "utils" "datasets" "methods"
## [16] "base"
##
## [[8]]
## [1] "rpart.plot" "rpart" "rsample" "VGAM" "splines"
## [6] "stats4" "dplyr" "ggplot2" "ROSE" "kableExtra"
## [11] "stats" "graphics" "grDevices" "utils" "datasets"
## [16] "methods" "base"
##
## [[9]]
## [1] "ipred" "rpart.plot" "rpart" "rsample" "VGAM"
## [6] "splines" "stats4" "dplyr" "ggplot2" "ROSE"
## [11] "kableExtra" "stats" "graphics" "grDevices" "utils"
## [16] "datasets" "methods" "base"
##
## [[10]]
## [1] "caret" "lattice" "ipred" "rpart.plot" "rpart"
## [6] "rsample" "VGAM" "splines" "stats4" "dplyr"
## [11] "ggplot2" "ROSE" "kableExtra" "stats" "graphics"
## [16] "grDevices" "utils" "datasets" "methods" "base"
##
## [[11]]
## [1] "MVN" "caret" "lattice" "ipred" "rpart.plot"
## [6] "rpart" "rsample" "VGAM" "splines" "stats4"
## [11] "dplyr" "ggplot2" "ROSE" "kableExtra" "stats"
## [16] "graphics" "grDevices" "utils" "datasets" "methods"
## [21] "base"
set.seed(123)
split <- initial_split(data_balanced_both, prop = .8)
train1 <- training(split)
test <- testing(split)
nrow(train1)
## [1] 36169
nrow(test)
## [1] 9042
prop.table(table(train1$y))
##
## 0 1
## 0.5006221 0.4993779
LOGISTIC REGRESSION
regression<- glm(y~., data = train1, family = binomial(link = "logit"))
summary(regression)
##
## Call:
## glm(formula = y ~ ., family = binomial(link = "logit"), data = train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.1313 -0.5893 -0.0601 0.5988 2.9451
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.460e-01 1.503e-01 -4.964 6.89e-07 ***
## age -9.109e-04 1.798e-03 -0.507 0.61236
## jobblue-collar -3.345e-01 5.796e-02 -5.771 7.87e-09 ***
## jobentrepreneur -3.902e-01 9.889e-02 -3.946 7.94e-05 ***
## jobhousemaid -4.098e-01 1.049e-01 -3.909 9.29e-05 ***
## jobmanagement -9.117e-02 6.011e-02 -1.517 0.12935
## jobretired 4.064e-01 8.213e-02 4.949 7.47e-07 ***
## jobself-employed -2.232e-01 9.052e-02 -2.466 0.01365 *
## jobservices -2.869e-01 6.739e-02 -4.258 2.07e-05 ***
## jobstudent 7.235e-01 9.822e-02 7.366 1.76e-13 ***
## jobtechnician -1.004e-01 5.569e-02 -1.803 0.07134 .
## jobunemployed -8.228e-02 9.312e-02 -0.884 0.37694
## jobunknown -2.553e-01 1.915e-01 -1.333 0.18240
## maritalmarried -1.878e-01 4.783e-02 -3.927 8.62e-05 ***
## maritalsingle 1.205e-01 5.497e-02 2.192 0.02838 *
## educationsecondary 2.280e-01 5.191e-02 4.391 1.13e-05 ***
## educationtertiary 4.179e-01 6.126e-02 6.823 8.93e-12 ***
## educationunknown 2.802e-01 8.518e-02 3.290 0.00100 **
## defaultyes 1.003e-01 1.218e-01 0.824 0.41003
## balance 2.341e-05 5.007e-06 4.675 2.94e-06 ***
## housingyes -7.052e-01 3.480e-02 -20.267 < 2e-16 ***
## loanyes -5.427e-01 4.672e-02 -11.615 < 2e-16 ***
## contacttelephone -4.067e-02 6.089e-02 -0.668 0.50417
## contactunknown -1.718e+00 5.404e-02 -31.787 < 2e-16 ***
## day 4.870e-03 1.976e-03 2.465 0.01371 *
## monthaug -9.246e-01 6.292e-02 -14.696 < 2e-16 ***
## monthdec 6.884e-01 1.789e-01 3.847 0.00012 ***
## monthfeb -1.044e-01 7.124e-02 -1.465 0.14297
## monthjan -1.302e+00 9.564e-02 -13.613 < 2e-16 ***
## monthjul -1.078e+00 6.312e-02 -17.071 < 2e-16 ***
## monthjun 3.044e-01 7.397e-02 4.116 3.86e-05 ***
## monthmar 1.715e+00 1.202e-01 14.264 < 2e-16 ***
## monthmay -6.591e-01 6.013e-02 -10.962 < 2e-16 ***
## monthnov -1.025e+00 6.912e-02 -14.826 < 2e-16 ***
## monthoct 1.241e+00 1.022e-01 12.138 < 2e-16 ***
## monthsep 9.476e-01 1.161e-01 8.161 3.33e-16 ***
## duration 5.698e-03 7.200e-05 79.143 < 2e-16 ***
## campaign -1.067e-01 7.754e-03 -13.760 < 2e-16 ***
## pdays -4.373e-04 2.416e-04 -1.810 0.07027 .
## previous 1.928e-02 8.812e-03 2.188 0.02864 *
## poutcomeother 1.138e-01 7.449e-02 1.527 0.12665
## poutcomesuccess 2.504e+00 8.438e-02 29.677 < 2e-16 ***
## poutcomeunknown -2.512e-01 7.915e-02 -3.173 0.00151 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 50141 on 36168 degrees of freedom
## Residual deviance: 28827 on 36126 degrees of freedom
## AIC: 28913
##
## Number of Fisher Scoring iterations: 6
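Since the coefficients are on the log-odds scale, exponentiating them gives odds ratios; a minimal sketch:

# Odds ratios with Wald 95% confidence intervals
odds_ratios <- exp(cbind(OR = coef(regression), confint.default(regression)))
round(odds_ratios["housingyes", ], 3)  # exp(-0.7052) ~ 0.49: a housing loan roughly halves the odds of subscribing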
test$y <- as.factor(test$y)
# Construct the confusion matrix using a 0.5 probability cutoff
prediction <- predict(regression, newdata = test, type = 'response')
pred <- factor(ifelse(prediction <= 0.5, 0, 1))
result <- caret::confusionMatrix(pred, test$y)
result
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3857 813
## 1 664 3708
##
## Accuracy : 0.8367
## 95% CI : (0.8289, 0.8442)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6733
##
## Mcnemar's Test P-Value : 0.0001176
##
## Sensitivity : 0.8531
## Specificity : 0.8202
## Pos Pred Value : 0.8259
## Neg Pred Value : 0.8481
## Prevalence : 0.5000
## Detection Rate : 0.4266
## Detection Prevalence : 0.5165
## Balanced Accuracy : 0.8367
##
## 'Positive' Class : 0
##
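The reported metrics can be verified by hand from the confusion matrix counts (note the 'positive' class is 0):

tp <- 3857; fn <- 664; fp <- 813; tn <- 3708
c(sensitivity = tp / (tp + fn),  # 3857/4521 = 0.8531
  specificity = tn / (tn + fp),  # 3708/4521 = 0.8202
  precision   = tp / (tp + fp))  # 3857/4670 = 0.8259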
metrics <- as.data.frame(result$byClass)
colnames(metrics) <- "metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics, 4), caption = "F1-score, Precision and Recall") %>%
  kable_styling(font_size = 16)
Table: F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.8531 |
| Specificity          |  0.8202 |
| Pos Pred Value       |  0.8259 |
| Neg Pred Value       |  0.8481 |
| Precision            |  0.8259 |
| Recall               |  0.8531 |
| F1                   |  0.8393 |
| Prevalence           |  0.5000 |
| Detection Rate       |  0.4266 |
| Detection Prevalence |  0.5165 |
| Balanced Accuracy    |  0.8367 |
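F1 in the table is the harmonic mean of Precision and Recall, e.g. for the logistic model:

2 * 0.8259 * 0.8531 / (0.8259 + 0.8531)  # = 0.8393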
DECISION TREE
library(rpart)
library(rpart.plot)
# Fit a single classification tree (named 'ensemble' here, though it is one tree)
ensemble <- rpart(y ~ ., data = train1, method = 'class')
rpart.plot(ensemble)
[Figure: rpart.plot rendering of the fitted classification tree]
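The tree above uses rpart's default complexity parameter (cp = 0.01). A minimal sketch of checking the cross-validated error and pruning, should a smaller tree be preferred:

printcp(ensemble)  # cp table with cross-validated error (xerror)
best_cp <- ensemble$cptable[which.min(ensemble$cptable[, "xerror"]), "CP"]
pruned <- prune(ensemble, cp = best_cp)
rpart.plot(pruned)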
#Construct the Confusion Matrix
prediction2 <- predict(ensemble, newdata = test, type = 'class')
result2 <- caret::confusionMatrix(prediction2,test$y)
result2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3377 475
## 1 1144 4046
##
## Accuracy : 0.8209
## 95% CI : (0.8129, 0.8288)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6419
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7470
## Specificity : 0.8949
## Pos Pred Value : 0.8767
## Neg Pred Value : 0.7796
## Prevalence : 0.5000
## Detection Rate : 0.3735
## Detection Prevalence : 0.4260
## Balanced Accuracy : 0.8209
##
## 'Positive' Class : 0
##
metrics <- as.data.frame(result2$byClass)
colnames(metrics) <- "metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics, 4), caption = "F1-score, Precision and Recall") %>%
  kable_styling(font_size = 16)
Table: F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.7470 |
| Specificity          |  0.8949 |
| Pos Pred Value       |  0.8767 |
| Neg Pred Value       |  0.7796 |
| Precision            |  0.8767 |
| Recall               |  0.7470 |
| F1                   |  0.8066 |
| Prevalence           |  0.5000 |
| Detection Rate       |  0.3735 |
| Detection Prevalence |  0.4260 |
| Balanced Accuracy    |  0.8209 |
library(rpart)
library(rattle)
## Loading required package: tibble
## Loading required package: bitops
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
## The following object is masked from 'package:VGAM':
##
## wine
library(rpart.plot)
library(RColorBrewer)
fancyRpartPlot(ensemble, uniform=TRUE, main="y Tree")
[Figure: fancyRpartPlot rendering of the fitted tree]
# In-sample predictions on the training data
predicted <- predict(ensemble, type = "class")
table(train1$y, predicted)
## predicted
## 0 1
## 0 13488 4619
## 1 1910 16152
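The diagonal of this table gives the in-sample accuracy, which can be computed directly:

mean(predicted == train1$y)  # (13488 + 16152) / 36169 ~ 0.82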
XGBOOST
library(xgboost)
##
## Attaching package: 'xgboost'
## The following object is masked from 'package:rattle':
##
## xgboost
## The following object is masked from 'package:dplyr':
##
## slice
library(kableExtra)
# Reload the original (imbalanced) data for the XGBoost model
bank <- read.csv(file = "bank-full.csv", header = TRUE, sep = ";")
bank$job       <- as.factor(bank$job)
bank$marital   <- as.factor(bank$marital)
bank$education <- as.factor(bank$education)
bank$default   <- as.factor(bank$default)
bank$housing   <- as.factor(bank$housing)
bank$loan      <- as.factor(bank$loan)
bank$contact   <- as.factor(bank$contact)
bank$poutcome  <- as.factor(bank$poutcome)
# Recode the response: "yes" -> 1, "no" -> 0, then store it as a factor
bank$y <- as.factor(ifelse(bank$y == "yes", 1, 0))
library(xgboost)
library(caret)
indexes = createDataPartition(bank$y, p = .8, list = F)
train = bank[indexes, ]
test = bank[-indexes, ]
train_x = data.matrix(train[, -17])  # column 17 is the response y
train_y = train[, 17]
test_x = data.matrix(test[, -17])
test_y = test[, 17]
xgb_train = xgb.DMatrix(data = train_x, label = train_y)
xgb_test = xgb.DMatrix(data = test_x, label = test_y)
# No objective is specified, so xgboost defaults to regression (hence the
# train-rmse log below); the factor labels are stored as integer codes 1 and 2.
xgbc = xgboost(data = xgb_train, max.depth = 3, nrounds = 50)
## [1] train-rmse:0.528343
## [2] train-rmse:0.421607
## [3] train-rmse:0.355102
## [4] train-rmse:0.317497
## [5] train-rmse:0.296199
## [6] train-rmse:0.284485
## [7] train-rmse:0.278101
## [8] train-rmse:0.270568
## [9] train-rmse:0.268198
## [10] train-rmse:0.266526
## [11] train-rmse:0.265473
## [12] train-rmse:0.263241
## [13] train-rmse:0.262567
## [14] train-rmse:0.262051
## [15] train-rmse:0.261411
## [16] train-rmse:0.260376
## [17] train-rmse:0.260021
## [18] train-rmse:0.258469
## [19] train-rmse:0.257723
## [20] train-rmse:0.257425
## [21] train-rmse:0.257155
## [22] train-rmse:0.256120
## [23] train-rmse:0.255367
## [24] train-rmse:0.255176
## [25] train-rmse:0.255055
## [26] train-rmse:0.254855
## [27] train-rmse:0.254251
## [28] train-rmse:0.254096
## [29] train-rmse:0.253733
## [30] train-rmse:0.253601
## [31] train-rmse:0.253427
## [32] train-rmse:0.253130
## [33] train-rmse:0.253084
## [34] train-rmse:0.252246
## [35] train-rmse:0.251716
## [36] train-rmse:0.251556
## [37] train-rmse:0.250770
## [38] train-rmse:0.250254
## [39] train-rmse:0.250200
## [40] train-rmse:0.250169
## [41] train-rmse:0.249942
## [42] train-rmse:0.249733
## [43] train-rmse:0.249190
## [44] train-rmse:0.249003
## [45] train-rmse:0.248928
## [46] train-rmse:0.248828
## [47] train-rmse:0.248693
## [48] train-rmse:0.248509
## [49] train-rmse:0.248467
## [50] train-rmse:0.248442
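Because no objective was set, the model above was fit as a regression on the integer class codes. A hedged sketch of the more conventional setup, with 0/1 labels and a logistic objective:

xgb_train2 <- xgb.DMatrix(data = train_x, label = as.numeric(train_y) - 1)  # recode 1/2 -> 0/1
xgbc2 <- xgboost(data = xgb_train2, objective = "binary:logistic",
                 max.depth = 3, nrounds = 50, verbose = 0)
prob <- predict(xgbc2, xgb.DMatrix(data = test_x))  # P(y = 1)
pred_y2 <- factor(ifelse(prob > 0.5, 1, 0), levels = levels(test_y))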
pred = predict(xgbc, xgb_test)
# Predictions are continuous scores near the class codes 1 and 2;
# clamp them to [1, 2] before rounding (the original cap of 3 would
# index past the two factor levels)
pred[pred < 1] = 1
pred[pred > 2] = 2
pred_y = as.factor((levels(test_y))[round(pred)])
# Note: confusionMatrix(data, reference) expects predictions first; the truth
# is passed first here, so in the printout below Sensitivity and Pos Pred
# Value (and likewise Specificity and Neg Pred Value) are interchanged.
cm = confusionMatrix(test_y, pred_y)
print(cm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7784 200
## 1 644 413
##
## Accuracy : 0.9066
## 95% CI : (0.9005, 0.9126)
## No Information Rate : 0.9322
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4472
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9236
## Specificity : 0.6737
## Pos Pred Value : 0.9749
## Neg Pred Value : 0.3907
## Prevalence : 0.9322
## Detection Rate : 0.8610
## Detection Prevalence : 0.8831
## Balanced Accuracy : 0.7987
##
## 'Positive' Class : 0
##
metrics <- as.data.frame(cm$byClass)
colnames(metrics) <- "metrics"
library(dplyr)
library(kableExtra)
kable(round(metrics, 4), caption = "F1-score, Precision and Recall") %>%
  kable_styling(font_size = 16)
Table: F1-score, Precision and Recall

|                      | metrics |
|:---------------------|--------:|
| Sensitivity          |  0.9236 |
| Specificity          |  0.6737 |
| Pos Pred Value       |  0.9749 |
| Neg Pred Value       |  0.3907 |
| Precision            |  0.9749 |
| Recall               |  0.9236 |
| F1                   |  0.9486 |
| Prevalence           |  0.9322 |
| Detection Rate       |  0.8610 |
| Detection Prevalence |  0.8831 |
| Balanced Accuracy    |  0.7987 |
# Precision (Pos Pred Value for class 0) taken from each model's confusion matrix
precision <- c(0.825, 0.876, 0.975)
models <- c("Logistic Regression", "Decision Tree", "XGBoost")
v1 <- data.frame(precision, models)
library(ggplot2)
ggplot(v1, aes(x = models, y = precision)) +
  geom_bar(stat = "identity", fill = "gold") +
  coord_flip() +
  ggtitle("Precision") +
  geom_text(aes(label = precision), vjust = 0, hjust = 1.2) +
  labs(x = "models", y = "Precision")
[Figure: horizontal bar chart of precision by model]