This assignment is aiming to resampling models for Bank+Marketing Dataset.

First, we need to import library and prepare dataset.

library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(e1071)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(rsample)
## 
## Attaching package: 'rsample'
## The following object is masked from 'package:e1071':
## 
##     permutations
library(boot)
## 
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
## 
##     melanoma
bank <- read.csv("bank-full.csv", sep = ";")
head(bank)
##   age          job marital education default balance housing loan contact day
## 1  58   management married  tertiary      no    2143     yes   no unknown   5
## 2  44   technician  single secondary      no      29     yes   no unknown   5
## 3  33 entrepreneur married secondary      no       2     yes  yes unknown   5
## 4  47  blue-collar married   unknown      no    1506     yes   no unknown   5
## 5  33      unknown  single   unknown      no       1      no   no unknown   5
## 6  35   management married  tertiary      no     231     yes   no unknown   5
##   month duration campaign pdays previous poutcome  y
## 1   may      261        1    -1        0  unknown no
## 2   may      151        1    -1        0  unknown no
## 3   may       76        1    -1        0  unknown no
## 4   may       92        1    -1        0  unknown no
## 5   may      198        1    -1        0  unknown no
## 6   may      139        1    -1        0  unknown no

Data Wrangling This is to check on whether all the data types are suitable to the column.

str(bank)
## 'data.frame':    45211 obs. of  17 variables:
##  $ age      : int  58 44 33 47 33 35 28 42 58 43 ...
##  $ job      : chr  "management" "technician" "entrepreneur" "blue-collar" ...
##  $ marital  : chr  "married" "single" "married" "married" ...
##  $ education: chr  "tertiary" "secondary" "secondary" "unknown" ...
##  $ default  : chr  "no" "no" "no" "no" ...
##  $ balance  : int  2143 29 2 1506 1 231 447 2 121 593 ...
##  $ housing  : chr  "yes" "yes" "yes" "yes" ...
##  $ loan     : chr  "no" "no" "yes" "no" ...
##  $ contact  : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ day      : int  5 5 5 5 5 5 5 5 5 5 ...
##  $ month    : chr  "may" "may" "may" "may" ...
##  $ duration : int  261 151 76 92 198 139 217 380 50 55 ...
##  $ campaign : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ pdays    : int  -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
##  $ previous : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ poutcome : chr  "unknown" "unknown" "unknown" "unknown" ...
##  $ y        : chr  "no" "no" "no" "no" ...

From above, we could see that some columns don’t have the most appropriate data types such as job, marital, education, default, housing, loan, contact, y.

Thus, we need to change data types of columns.

bank <- bank %>% 
        mutate_at(vars(job, marital, education, default, housing, loan, contact, month, poutcome, y), as.factor)

bank$duration <- as.integer(bank$duration)

Remove previous

bank <- bank %>%  select(-previous)

Check for null values

bank %>% 
  is.na() %>% 
  colSums()
##       age       job   marital education   default   balance   housing      loan 
##         0         0         0         0         0         0         0         0 
##   contact       day     month  duration  campaign     pdays  poutcome         y 
##         0         0         0         0         0         0         0         0

From above, we could see that there are no null values in this data.

# Separate the numeric columns from the original data
bank_numeric <- bank %>% select_if(is.numeric)
bank_non_numeric <- bank %>% select(-c(age, balance, duration, campaign, pdays))

# Scale the numeric columns
bank_numeric_s <- scale(bank_numeric)
# Put them back together
bank_clean <- cbind(bank_non_numeric, bank_numeric_s)
bank_clean <- bank_clean[,c(1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,11)]

Now, we should apply Cross Validation

#Check the proportion of the target data
prop.table(table(bank_clean$y))
## 
##        no       yes 
## 0.8830152 0.1169848

From the result, we could see that data now is not balanced, which means we need to do resampling.

#Split data into 75% train and 25% test
RNGkind(sample.kind = "Rounding")
set.seed(100)

index <- sample(nrow(bank_clean), nrow(bank_clean)*0.75)

data_train <- bank_clean[index,]
data_test <- bank_clean[-index,]
#Downsampling is reducing the majority class observation until it is balanced with the minority class. This is usually done to a large dataset.
RNGkind(sample.kind = "Rounding")
set.seed(100)

library(caret)
data_train <- downSample(x = data_train %>% select(-y),
                         y = data_train$y,
                         yname = "y")

prop.table(table(data_train$y))
## 
##  no yes 
## 0.5 0.5

#Now, the data is well balanced.We could Modelling the training data to the respective models. a. Naive Bayes

bank_naive <- naiveBayes(x = data_train %>% select(-y),
                         y = data_train$y)
bank_naive
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = data_train %>% select(-y), y = data_train$y)
## 
## A-priori probabilities:
## data_train$y
##  no yes 
## 0.5 0.5 
## 
## Conditional probabilities:
##             job
## data_train$y      admin. blue-collar entrepreneur   housemaid  management
##          no  0.108958228 0.229994967  0.037242073 0.032209361 0.199547056
##          yes 0.123553095 0.137393055  0.022647207 0.020130851 0.242576749
##             job
## data_train$y     retired self-employed    services     student  technician
##          no  0.039255159   0.034977353 0.097131354 0.015098138 0.170105687
##          yes 0.100150981   0.035228988 0.068444892 0.048817313 0.155762456
##             job
## data_train$y  unemployed     unknown
##          no  0.027428284 0.008052340
##          yes 0.039255159 0.006039255
## 
##             marital
## data_train$y  divorced   married    single
##          no  0.1149975 0.6114746 0.2735279
##          yes 0.1185204 0.5266734 0.3548062
## 
##             education
## data_train$y    primary  secondary   tertiary    unknown
##          no  0.15601409 0.52063412 0.28309009 0.04026170
##          yes 0.11172622 0.46502265 0.37695018 0.04630096
## 
##             default
## data_train$y         no        yes
##          no  0.97810770 0.02189230
##          yes 0.98968294 0.01031706
## 
##             housing
## data_train$y        no       yes
##          no  0.4272773 0.5727227
##          yes 0.6356316 0.3643684
## 
##             loan
## data_train$y         no        yes
##          no  0.84197282 0.15802718
##          yes 0.90865627 0.09134373
## 
##             contact
## data_train$y   cellular  telephone    unknown
##          no  0.63839960 0.05913437 0.30246603
##          yes 0.82637141 0.07196779 0.10166080
## 
##             day
## data_train$y     [,1]     [,2]
##          no  15.84977 8.367192
##          yes 15.09864 8.542359
## 
##             month
## data_train$y         apr         aug         dec         feb         jan
##          no  0.056114746 0.152994464 0.001761449 0.051836940 0.031454454
##          yes 0.106441872 0.129843986 0.018117765 0.081529945 0.028183191
##             month
## data_train$y         jul         jun         mar         may         nov
##          no  0.158782084 0.118268747 0.003522899 0.325868143 0.080523402
##          yes 0.118772018 0.106945143 0.049068948 0.171867136 0.078258681
##             month
## data_train$y         oct         sep
##          no  0.010568697 0.008303976
##          yes 0.059134373 0.051836940
## 
##             poutcome
## data_train$y    failure      other    success    unknown
##          no  0.10442879 0.04101661 0.01333669 0.84121792
##          yes 0.11499748 0.05712129 0.18595873 0.64192250
## 
##             age
## data_train$y        [,1]      [,2]
##          no  0.002831838 0.9493634
##          yes 0.078165447 1.2821592
## 
##             balance
## data_train$y         [,1]     [,2]
##          no  -0.008521819 1.066287
##          yes  0.140457353 1.101335
## 
##             day.1
## data_train$y         [,1]     [,2]
##          no   0.005209355 1.005373
##          yes -0.085044116 1.026420
## 
##             duration
## data_train$y       [,1]      [,2]
##          no  -0.1484305 0.7969071
##          yes  1.0630080 1.5063952
## 
##             campaign
## data_train$y        [,1]      [,2]
##          no   0.04739435 1.0973483
##          yes -0.19571100 0.6306424
  1. Decision Tree
bank_dt <- ctree(formula = y ~.,
                 data = data_train)
bank_dt
## 
## Model formula:
## y ~ job + marital + education + default + housing + loan + contact + 
##     day + month + poutcome + age + balance + day.1 + duration + 
##     campaign
## 
## Fitted party:
## [1] root
## |   [2] duration <= -0.19867
## |   |   [3] month in apr, dec, feb, mar, oct, sep
## |   |   |   [4] duration <= -0.65299
## |   |   |   |   [5] duration <= -0.76948
## |   |   |   |   |   [6] age <= 1.4186: no (n = 67, err = 0.0%)
## |   |   |   |   |   [7] age > 1.4186: no (n = 7, err = 14.3%)
## |   |   |   |   [8] duration > -0.76948: no (n = 83, err = 34.9%)
## |   |   |   [9] duration > -0.65299
## |   |   |   |   [10] poutcome in failure, other, unknown
## |   |   |   |   |   [11] housing in no
## |   |   |   |   |   |   [12] month in apr, dec, mar, oct: yes (n = 217, err = 23.0%)
## |   |   |   |   |   |   [13] month in feb, sep
## |   |   |   |   |   |   |   [14] day.1 <= -0.93799: no (n = 48, err = 25.0%)
## |   |   |   |   |   |   |   [15] day.1 > -0.93799: yes (n = 53, err = 20.8%)
## |   |   |   |   |   [16] housing in yes
## |   |   |   |   |   |   [17] month in apr, dec, feb: no (n = 119, err = 25.2%)
## |   |   |   |   |   |   [18] month in mar, oct, sep: yes (n = 48, err = 20.8%)
## |   |   |   |   [19] poutcome in success
## |   |   |   |   |   [20] campaign <= -0.24656: yes (n = 78, err = 5.1%)
## |   |   |   |   |   [21] campaign > -0.24656: yes (n = 15, err = 33.3%)
## |   |   [22] month in aug, jan, jul, jun, may, nov
## |   |   |   [23] poutcome in failure, other, unknown
## |   |   |   |   [24] duration <= -0.50932
## |   |   |   |   |   [25] job in admin., blue-collar, entrepreneur, housemaid, management, retired, self-employed, services, technician, unknown
## |   |   |   |   |   |   [26] day.1 <= -1.41862: no (n = 85, err = 14.1%)
## |   |   |   |   |   |   [27] day.1 > -1.41862
## |   |   |   |   |   |   |   [28] poutcome in failure, other
## |   |   |   |   |   |   |   |   [29] contact in cellular
## |   |   |   |   |   |   |   |   |   [30] month in aug, jul, jun: no (n = 11, err = 27.3%)
## |   |   |   |   |   |   |   |   |   [31] month in jan, may, nov: no (n = 116, err = 2.6%)
## |   |   |   |   |   |   |   |   [32] contact in telephone, unknown: no (n = 16, err = 18.8%)
## |   |   |   |   |   |   |   [33] poutcome in unknown: no (n = 1025, err = 1.7%)
## |   |   |   |   |   [34] job in student, unemployed: no (n = 53, err = 22.6%)
## |   |   |   |   [35] duration > -0.50932
## |   |   |   |   |   [36] contact in cellular, telephone
## |   |   |   |   |   |   [37] month in aug, jan, jul, may, nov
## |   |   |   |   |   |   |   [38] housing in no
## |   |   |   |   |   |   |   |   [39] poutcome in failure, other: yes (n = 55, err = 36.4%)
## |   |   |   |   |   |   |   |   [40] poutcome in unknown
## |   |   |   |   |   |   |   |   |   [41] month in aug, jul
## |   |   |   |   |   |   |   |   |   |   [42] job in admin., blue-collar, management, self-employed, services, technician, unknown: no (n = 178, err = 7.3%)
## |   |   |   |   |   |   |   |   |   |   [43] job in entrepreneur, housemaid, retired, student, unemployed: no (n = 40, err = 42.5%)
## |   |   |   |   |   |   |   |   |   [44] month in jan, may, nov
## |   |   |   |   |   |   |   |   |   |   [45] day <= 14: yes (n = 19, err = 15.8%)
## |   |   |   |   |   |   |   |   |   |   [46] day > 14
## |   |   |   |   |   |   |   |   |   |   |   [47] month in jan, nov: no (n = 47, err = 12.8%)
## |   |   |   |   |   |   |   |   |   |   |   [48] month in may: yes (n = 9, err = 33.3%)
## |   |   |   |   |   |   |   [49] housing in yes
## |   |   |   |   |   |   |   |   [50] month in aug, jan, nov: no (n = 104, err = 19.2%)
## |   |   |   |   |   |   |   |   [51] month in jul, may
## |   |   |   |   |   |   |   |   |   [52] balance <= 0.76877: no (n = 189, err = 3.7%)
## |   |   |   |   |   |   |   |   |   [53] balance > 0.76877: no (n = 11, err = 27.3%)
## |   |   |   |   |   |   [54] month in jun: yes (n = 62, err = 29.0%)
## |   |   |   |   |   [55] contact in unknown
## |   |   |   |   |   |   [56] month in jul, nov: no (n = 12, err = 41.7%)
## |   |   |   |   |   |   [57] month in jun, may: no (n = 300, err = 0.0%)
## |   |   |   [58] poutcome in success
## |   |   |   |   [59] duration <= -0.48602: yes (n = 32, err = 40.6%)
## |   |   |   |   [60] duration > -0.48602: yes (n = 99, err = 12.1%)
## |   [61] duration > -0.19867
## |   |   [62] duration <= 1.22642
## |   |   |   [63] month in apr, dec, feb, mar, oct, sep
## |   |   |   |   [64] month in apr, feb
## |   |   |   |   |   [65] housing in no
## |   |   |   |   |   |   [66] day <= 7: yes (n = 92, err = 42.4%)
## |   |   |   |   |   |   [67] day > 7
## |   |   |   |   |   |   |   [68] poutcome in failure, other: yes (n = 34, err = 26.5%)
## |   |   |   |   |   |   |   [69] poutcome in success, unknown: yes (n = 209, err = 6.7%)
## |   |   |   |   |   [70] housing in yes
## |   |   |   |   |   |   [71] day <= 20
## |   |   |   |   |   |   |   [72] month in apr: no (n = 94, err = 29.8%)
## |   |   |   |   |   |   |   [73] month in feb
## |   |   |   |   |   |   |   |   [74] day.1 <= -1.29846: no (n = 32, err = 34.4%)
## |   |   |   |   |   |   |   |   [75] day.1 > -1.29846: yes (n = 19, err = 5.3%)
## |   |   |   |   |   |   [76] day > 20: yes (n = 37, err = 5.4%)
## |   |   |   |   [77] month in dec, mar, oct, sep: yes (n = 423, err = 4.0%)
## |   |   |   [78] month in aug, jan, jul, jun, may, nov
## |   |   |   |   [79] poutcome in failure, other, unknown
## |   |   |   |   |   [80] contact in cellular, telephone
## |   |   |   |   |   |   [81] duration <= 0.58183
## |   |   |   |   |   |   |   [82] month in aug, jan, jul, may, nov
## |   |   |   |   |   |   |   |   [83] housing in no
## |   |   |   |   |   |   |   |   |   [84] job in admin., management, retired, student, unemployed
## |   |   |   |   |   |   |   |   |   |   [85] day <= 27
## |   |   |   |   |   |   |   |   |   |   |   [86] month in aug, jul: yes (n = 147, err = 43.5%)
## |   |   |   |   |   |   |   |   |   |   |   [87] month in jan, may, nov: yes (n = 102, err = 20.6%)
## |   |   |   |   |   |   |   |   |   |   [88] day > 27: no (n = 26, err = 23.1%)
## |   |   |   |   |   |   |   |   |   [89] job in blue-collar, entrepreneur, housemaid, self-employed, services, technician, unknown
## |   |   |   |   |   |   |   |   |   |   [90] day <= 4: yes (n = 20, err = 20.0%)
## |   |   |   |   |   |   |   |   |   |   [91] day > 4
## |   |   |   |   |   |   |   |   |   |   |   [92] balance <= -0.35217
## |   |   |   |   |   |   |   |   |   |   |   |   [93] duration <= 0.36049
## |   |   |   |   |   |   |   |   |   |   |   |   |   [94] month in aug, jan, jul, nov: no (n = 70, err = 2.9%)
## |   |   |   |   |   |   |   |   |   |   |   |   |   [95] month in may: no (n = 8, err = 50.0%)
## |   |   |   |   |   |   |   |   |   |   |   |   [96] duration > 0.36049: yes (n = 15, err = 46.7%)
## |   |   |   |   |   |   |   |   |   |   |   [97] balance > -0.35217: no (n = 140, err = 39.3%)
## |   |   |   |   |   |   |   |   [98] housing in yes
## |   |   |   |   |   |   |   |   |   [99] month in aug
## |   |   |   |   |   |   |   |   |   |   [100] poutcome in failure, other: yes (n = 13, err = 0.0%)
## |   |   |   |   |   |   |   |   |   |   [101] poutcome in unknown: no (n = 28, err = 32.1%)
## |   |   |   |   |   |   |   |   |   [102] month in jan, jul, may, nov
## |   |   |   |   |   |   |   |   |   |   [103] duration <= 0.34108: no (n = 257, err = 16.3%)
## |   |   |   |   |   |   |   |   |   |   [104] duration > 0.34108: no (n = 85, err = 32.9%)
## |   |   |   |   |   |   |   [105] month in jun: yes (n = 92, err = 7.6%)
## |   |   |   |   |   |   [106] duration > 0.58183
## |   |   |   |   |   |   |   [107] housing in no: yes (n = 259, err = 25.1%)
## |   |   |   |   |   |   |   [108] housing in yes: yes (n = 184, err = 42.9%)
## |   |   |   |   |   [109] contact in unknown
## |   |   |   |   |   |   [110] duration <= 0.66726
## |   |   |   |   |   |   |   [111] month in jul, may: no (n = 242, err = 1.2%)
## |   |   |   |   |   |   |   [112] month in jun, nov: no (n = 113, err = 8.8%)
## |   |   |   |   |   |   [113] duration > 0.66726: no (n = 139, err = 41.7%)
## |   |   |   |   [114] poutcome in success: yes (n = 232, err = 3.4%)
## |   |   [115] duration > 1.22642
## |   |   |   [116] contact in cellular, telephone: yes (n = 1279, err = 9.7%)
## |   |   |   [117] contact in unknown: yes (n = 359, err = 18.9%)
## 
## Number of inner nodes:    58
## Number of terminal nodes: 59
  1. Random Forest
# Get rid of the predictors that have near 0 variance or less informative to reduce computational time
n0_var <- nearZeroVar(bank_clean)
bank_new <- bank_clean[,-n0_var]
# Cross validation for Random Forest model
library(rsample)
RNGkind(sample.kind = "Rounding")
set.seed(100)

index <- initial_split(data = bank_new , prop = 0.75, strata = "y")
rf_train <- training(index)
rf_test <- testing(index)
# Check the target proportiton
prop.table(table(rf_train$y))
## 
##       no      yes 
## 0.883033 0.116967
# Downsample the target
RNGkind(sample.kind = "Rounding")
set.seed(100)

library(caret)
rf_train <- downSample(x = rf_train %>% select(-y),
                         y = rf_train$y,
                         yname = "y")

prop.table(table(rf_train$y))
## 
##  no yes 
## 0.5 0.5
#set.seed(417)
#ctrl <- trainControl(method = "repeatedcv",
#                      number = 5, # k-fold
#                      repeats = 3) # repetition
 
#bank_rf <- train(y ~ .,
#                    data = rf_train,
#                    method = "rf", # random forest
#                   trControl = ctrl)
 
#saveRDS(bank_rf, "bank_rf.RDS")
bank_rf <- readRDS("bank_rf.RDS")
library(randomForest)
bank_rf$finalModel
## 
## Call:
##  randomForest(x = x, y = y, mtry = min(param$mtry, ncol(x))) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 21
## 
##         OOB estimate of  error rate: 14.12%
## Confusion matrix:
##       no  yes class.error
## no  3271  695   0.1752395
## yes  425 3541   0.1071609

Model Prediction and Evaluation

  1. Naive Bayes
bank_naive_pred <- predict(object = bank_naive,
                            newdata = data_test,
                            type = "class")
# Confusion matrix
confusionMatrix(data = bank_naive_pred,
                reference = data_test$y) #data actual
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  7861  210
##        yes 2127 1105
##                                           
##                Accuracy : 0.7932          
##                  95% CI : (0.7857, 0.8007)
##     No Information Rate : 0.8837          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3842          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.7870          
##             Specificity : 0.8403          
##          Pos Pred Value : 0.9740          
##          Neg Pred Value : 0.3419          
##              Prevalence : 0.8837          
##          Detection Rate : 0.6955          
##    Detection Prevalence : 0.7141          
##       Balanced Accuracy : 0.8137          
##                                           
##        'Positive' Class : no              
## 

AUC AND ROC

# Take the prediction results in terms of probability
bank_naive_pred_prob <- predict(object = bank_naive,
                           newdata = data_test,
                           type = "raw")
head(bank_naive_pred_prob)
##             no        yes
## [1,] 0.9501237 0.04987630
## [2,] 0.9685047 0.03149529
## [3,] 0.9240637 0.07593627
## [4,] 0.9800660 0.01993405
## [5,] 0.8910018 0.10899816
## [6,] 0.9837691 0.01623089
# menyiapkan pred vs actual
data_roc <- data.frame(pred_prob = bank_naive_pred_prob[, 'yes'],
                       actual = ifelse(data_test$y == 'yes',1, 0))
head(data_roc, 10)
##     pred_prob actual
## 1  0.04987630      0
## 2  0.03149529      0
## 3  0.07593627      0
## 4  0.01993405      0
## 5  0.10899816      0
## 6  0.01623089      0
## 7  0.06470933      0
## 8  0.04177653      0
## 9  0.01599570      0
## 10 0.05203860      0
# Make ROC by preparing prediction() object
library(ROCR)

# objek prediction
bank_naive_roc <- prediction(predictions = data_roc$pred_prob,
                        labels = data_roc$actual)
# ROC curve
plot(performance(bank_naive_roc,"tpr","fpr"))

  1. Decision Tree
bank_dt_pred <- predict(object = bank_dt,
                          newdata = data_test,
                          type = "response")
# Confusion matrix
confusionMatrix(data = bank_dt_pred,
                reference = data_test$y) #data actual
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  8190  147
##        yes 1798 1168
##                                           
##                Accuracy : 0.8279          
##                  95% CI : (0.8208, 0.8348)
##     No Information Rate : 0.8837          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4583          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8200          
##             Specificity : 0.8882          
##          Pos Pred Value : 0.9824          
##          Neg Pred Value : 0.3938          
##              Prevalence : 0.8837          
##          Detection Rate : 0.7246          
##    Detection Prevalence : 0.7376          
##       Balanced Accuracy : 0.8541          
##                                           
##        'Positive' Class : no              
## 
#See whethr the dt model overfits
bank_dt_pred_train <- predict(object = bank_dt,
                          newdata = data_train,
                          type = "response")
#Confusion matrix
confusionMatrix(data = bank_dt_pred_train,
                reference = data_train$y)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3296  449
##        yes  678 3525
##                                           
##                Accuracy : 0.8582          
##                  95% CI : (0.8503, 0.8658)
##     No Information Rate : 0.5             
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.7164          
##                                           
##  Mcnemar's Test P-Value : 1.109e-11       
##                                           
##             Sensitivity : 0.8294          
##             Specificity : 0.8870          
##          Pos Pred Value : 0.8801          
##          Neg Pred Value : 0.8387          
##              Prevalence : 0.5000          
##          Detection Rate : 0.4147          
##    Detection Prevalence : 0.4712          
##       Balanced Accuracy : 0.8582          
##                                           
##        'Positive' Class : no              
## 
  1. Random Forest
bank_rf_pred <- predict(object = bank_rf,
                          newdata = rf_test,
                          type = "raw")
#Confusion matrix
confusionMatrix(data = bank_rf_pred,
                reference = rf_test$y) #data actual
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  8283  125
##        yes 1698 1198
##                                           
##                Accuracy : 0.8387          
##                  95% CI : (0.8318, 0.8455)
##     No Information Rate : 0.883           
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.4852          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.8299          
##             Specificity : 0.9055          
##          Pos Pred Value : 0.9851          
##          Neg Pred Value : 0.4137          
##              Prevalence : 0.8830          
##          Detection Rate : 0.7327          
##    Detection Prevalence : 0.7438          
##       Balanced Accuracy : 0.8677          
##                                           
##        'Positive' Class : no              
## 
#Check some of the most important predictors
varImp(bank_rf)
## rf variable importance
## 
##   only 20 most important variables shown (out of 40)
## 
##                 Overall
## duration        100.000
## balance          19.647
## age              18.239
## poutcomesuccess  15.380
## contactunknown   12.365
## day.1            12.043
## day              11.919
## housingyes        8.888
## campaign          7.215
## poutcomeunknown   5.097
## monthmar          3.798
## monthaug          3.281
## monthjul          3.120
## loanyes           2.937
## monthoct          2.859
## monthmay          2.850
## jobblue-collar    2.473
## monthnov          2.452
## monthjun          2.403
## maritalmarried    2.281