This assignment is aiming to resampling models for Bank+Marketing Dataset.
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(e1071)
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(rsample)
##
## Attaching package: 'rsample'
## The following object is masked from 'package:e1071':
##
## permutations
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
bank <- read.csv("bank-full.csv", sep = ";")
head(bank)
## age job marital education default balance housing loan contact day
## 1 58 management married tertiary no 2143 yes no unknown 5
## 2 44 technician single secondary no 29 yes no unknown 5
## 3 33 entrepreneur married secondary no 2 yes yes unknown 5
## 4 47 blue-collar married unknown no 1506 yes no unknown 5
## 5 33 unknown single unknown no 1 no no unknown 5
## 6 35 management married tertiary no 231 yes no unknown 5
## month duration campaign pdays previous poutcome y
## 1 may 261 1 -1 0 unknown no
## 2 may 151 1 -1 0 unknown no
## 3 may 76 1 -1 0 unknown no
## 4 may 92 1 -1 0 unknown no
## 5 may 198 1 -1 0 unknown no
## 6 may 139 1 -1 0 unknown no
Data Wrangling This is to check on whether all the data types are suitable to the column.
str(bank)
## 'data.frame': 45211 obs. of 17 variables:
## $ age : int 58 44 33 47 33 35 28 42 58 43 ...
## $ job : chr "management" "technician" "entrepreneur" "blue-collar" ...
## $ marital : chr "married" "single" "married" "married" ...
## $ education: chr "tertiary" "secondary" "secondary" "unknown" ...
## $ default : chr "no" "no" "no" "no" ...
## $ balance : int 2143 29 2 1506 1 231 447 2 121 593 ...
## $ housing : chr "yes" "yes" "yes" "yes" ...
## $ loan : chr "no" "no" "yes" "no" ...
## $ contact : chr "unknown" "unknown" "unknown" "unknown" ...
## $ day : int 5 5 5 5 5 5 5 5 5 5 ...
## $ month : chr "may" "may" "may" "may" ...
## $ duration : int 261 151 76 92 198 139 217 380 50 55 ...
## $ campaign : int 1 1 1 1 1 1 1 1 1 1 ...
## $ pdays : int -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 ...
## $ previous : int 0 0 0 0 0 0 0 0 0 0 ...
## $ poutcome : chr "unknown" "unknown" "unknown" "unknown" ...
## $ y : chr "no" "no" "no" "no" ...
From above, we could see that some columns don’t have the most appropriate data types such as job, marital, education, default, housing, loan, contact, y.
Thus, we need to change data types of columns.
bank <- bank %>%
mutate_at(vars(job, marital, education, default, housing, loan, contact, month, poutcome, y), as.factor)
bank$duration <- as.integer(bank$duration)
Remove previous
bank <- bank %>% select(-previous)
Check for null values
bank %>%
is.na() %>%
colSums()
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays poutcome y
## 0 0 0 0 0 0 0 0
From above, we could see that there are no null values in this data.
# Separate the numeric columns from the original data
bank_numeric <- bank %>% select_if(is.numeric)
bank_non_numeric <- bank %>% select(-c(age, balance, duration, campaign, pdays))
# Scale the numeric columns
bank_numeric_s <- scale(bank_numeric)
# Put them back together
bank_clean <- cbind(bank_non_numeric, bank_numeric_s)
bank_clean <- bank_clean[,c(1,2,3,4,5,6,7,8,9,10,12,13,14,15,16,11)]
#Check the proportion of the target data
prop.table(table(bank_clean$y))
##
## no yes
## 0.8830152 0.1169848
From the result, we could see that data now is not balanced, which means we need to do resampling.
#Split data into 75% train and 25% test
RNGkind(sample.kind = "Rounding")
set.seed(100)
index <- sample(nrow(bank_clean), nrow(bank_clean)*0.75)
data_train <- bank_clean[index,]
data_test <- bank_clean[-index,]
#Downsampling is reducing the majority class observation until it is balanced with the minority class. This is usually done to a large dataset.
RNGkind(sample.kind = "Rounding")
set.seed(100)
library(caret)
data_train <- downSample(x = data_train %>% select(-y),
y = data_train$y,
yname = "y")
prop.table(table(data_train$y))
##
## no yes
## 0.5 0.5
#Now, the data is well balanced.We could Modelling the training data to the respective models. a. Naive Bayes
bank_naive <- naiveBayes(x = data_train %>% select(-y),
y = data_train$y)
bank_naive
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = data_train %>% select(-y), y = data_train$y)
##
## A-priori probabilities:
## data_train$y
## no yes
## 0.5 0.5
##
## Conditional probabilities:
## job
## data_train$y admin. blue-collar entrepreneur housemaid management
## no 0.108958228 0.229994967 0.037242073 0.032209361 0.199547056
## yes 0.123553095 0.137393055 0.022647207 0.020130851 0.242576749
## job
## data_train$y retired self-employed services student technician
## no 0.039255159 0.034977353 0.097131354 0.015098138 0.170105687
## yes 0.100150981 0.035228988 0.068444892 0.048817313 0.155762456
## job
## data_train$y unemployed unknown
## no 0.027428284 0.008052340
## yes 0.039255159 0.006039255
##
## marital
## data_train$y divorced married single
## no 0.1149975 0.6114746 0.2735279
## yes 0.1185204 0.5266734 0.3548062
##
## education
## data_train$y primary secondary tertiary unknown
## no 0.15601409 0.52063412 0.28309009 0.04026170
## yes 0.11172622 0.46502265 0.37695018 0.04630096
##
## default
## data_train$y no yes
## no 0.97810770 0.02189230
## yes 0.98968294 0.01031706
##
## housing
## data_train$y no yes
## no 0.4272773 0.5727227
## yes 0.6356316 0.3643684
##
## loan
## data_train$y no yes
## no 0.84197282 0.15802718
## yes 0.90865627 0.09134373
##
## contact
## data_train$y cellular telephone unknown
## no 0.63839960 0.05913437 0.30246603
## yes 0.82637141 0.07196779 0.10166080
##
## day
## data_train$y [,1] [,2]
## no 15.84977 8.367192
## yes 15.09864 8.542359
##
## month
## data_train$y apr aug dec feb jan
## no 0.056114746 0.152994464 0.001761449 0.051836940 0.031454454
## yes 0.106441872 0.129843986 0.018117765 0.081529945 0.028183191
## month
## data_train$y jul jun mar may nov
## no 0.158782084 0.118268747 0.003522899 0.325868143 0.080523402
## yes 0.118772018 0.106945143 0.049068948 0.171867136 0.078258681
## month
## data_train$y oct sep
## no 0.010568697 0.008303976
## yes 0.059134373 0.051836940
##
## poutcome
## data_train$y failure other success unknown
## no 0.10442879 0.04101661 0.01333669 0.84121792
## yes 0.11499748 0.05712129 0.18595873 0.64192250
##
## age
## data_train$y [,1] [,2]
## no 0.002831838 0.9493634
## yes 0.078165447 1.2821592
##
## balance
## data_train$y [,1] [,2]
## no -0.008521819 1.066287
## yes 0.140457353 1.101335
##
## day.1
## data_train$y [,1] [,2]
## no 0.005209355 1.005373
## yes -0.085044116 1.026420
##
## duration
## data_train$y [,1] [,2]
## no -0.1484305 0.7969071
## yes 1.0630080 1.5063952
##
## campaign
## data_train$y [,1] [,2]
## no 0.04739435 1.0973483
## yes -0.19571100 0.6306424
bank_dt <- ctree(formula = y ~.,
data = data_train)
bank_dt
##
## Model formula:
## y ~ job + marital + education + default + housing + loan + contact +
## day + month + poutcome + age + balance + day.1 + duration +
## campaign
##
## Fitted party:
## [1] root
## | [2] duration <= -0.19867
## | | [3] month in apr, dec, feb, mar, oct, sep
## | | | [4] duration <= -0.65299
## | | | | [5] duration <= -0.76948
## | | | | | [6] age <= 1.4186: no (n = 67, err = 0.0%)
## | | | | | [7] age > 1.4186: no (n = 7, err = 14.3%)
## | | | | [8] duration > -0.76948: no (n = 83, err = 34.9%)
## | | | [9] duration > -0.65299
## | | | | [10] poutcome in failure, other, unknown
## | | | | | [11] housing in no
## | | | | | | [12] month in apr, dec, mar, oct: yes (n = 217, err = 23.0%)
## | | | | | | [13] month in feb, sep
## | | | | | | | [14] day.1 <= -0.93799: no (n = 48, err = 25.0%)
## | | | | | | | [15] day.1 > -0.93799: yes (n = 53, err = 20.8%)
## | | | | | [16] housing in yes
## | | | | | | [17] month in apr, dec, feb: no (n = 119, err = 25.2%)
## | | | | | | [18] month in mar, oct, sep: yes (n = 48, err = 20.8%)
## | | | | [19] poutcome in success
## | | | | | [20] campaign <= -0.24656: yes (n = 78, err = 5.1%)
## | | | | | [21] campaign > -0.24656: yes (n = 15, err = 33.3%)
## | | [22] month in aug, jan, jul, jun, may, nov
## | | | [23] poutcome in failure, other, unknown
## | | | | [24] duration <= -0.50932
## | | | | | [25] job in admin., blue-collar, entrepreneur, housemaid, management, retired, self-employed, services, technician, unknown
## | | | | | | [26] day.1 <= -1.41862: no (n = 85, err = 14.1%)
## | | | | | | [27] day.1 > -1.41862
## | | | | | | | [28] poutcome in failure, other
## | | | | | | | | [29] contact in cellular
## | | | | | | | | | [30] month in aug, jul, jun: no (n = 11, err = 27.3%)
## | | | | | | | | | [31] month in jan, may, nov: no (n = 116, err = 2.6%)
## | | | | | | | | [32] contact in telephone, unknown: no (n = 16, err = 18.8%)
## | | | | | | | [33] poutcome in unknown: no (n = 1025, err = 1.7%)
## | | | | | [34] job in student, unemployed: no (n = 53, err = 22.6%)
## | | | | [35] duration > -0.50932
## | | | | | [36] contact in cellular, telephone
## | | | | | | [37] month in aug, jan, jul, may, nov
## | | | | | | | [38] housing in no
## | | | | | | | | [39] poutcome in failure, other: yes (n = 55, err = 36.4%)
## | | | | | | | | [40] poutcome in unknown
## | | | | | | | | | [41] month in aug, jul
## | | | | | | | | | | [42] job in admin., blue-collar, management, self-employed, services, technician, unknown: no (n = 178, err = 7.3%)
## | | | | | | | | | | [43] job in entrepreneur, housemaid, retired, student, unemployed: no (n = 40, err = 42.5%)
## | | | | | | | | | [44] month in jan, may, nov
## | | | | | | | | | | [45] day <= 14: yes (n = 19, err = 15.8%)
## | | | | | | | | | | [46] day > 14
## | | | | | | | | | | | [47] month in jan, nov: no (n = 47, err = 12.8%)
## | | | | | | | | | | | [48] month in may: yes (n = 9, err = 33.3%)
## | | | | | | | [49] housing in yes
## | | | | | | | | [50] month in aug, jan, nov: no (n = 104, err = 19.2%)
## | | | | | | | | [51] month in jul, may
## | | | | | | | | | [52] balance <= 0.76877: no (n = 189, err = 3.7%)
## | | | | | | | | | [53] balance > 0.76877: no (n = 11, err = 27.3%)
## | | | | | | [54] month in jun: yes (n = 62, err = 29.0%)
## | | | | | [55] contact in unknown
## | | | | | | [56] month in jul, nov: no (n = 12, err = 41.7%)
## | | | | | | [57] month in jun, may: no (n = 300, err = 0.0%)
## | | | [58] poutcome in success
## | | | | [59] duration <= -0.48602: yes (n = 32, err = 40.6%)
## | | | | [60] duration > -0.48602: yes (n = 99, err = 12.1%)
## | [61] duration > -0.19867
## | | [62] duration <= 1.22642
## | | | [63] month in apr, dec, feb, mar, oct, sep
## | | | | [64] month in apr, feb
## | | | | | [65] housing in no
## | | | | | | [66] day <= 7: yes (n = 92, err = 42.4%)
## | | | | | | [67] day > 7
## | | | | | | | [68] poutcome in failure, other: yes (n = 34, err = 26.5%)
## | | | | | | | [69] poutcome in success, unknown: yes (n = 209, err = 6.7%)
## | | | | | [70] housing in yes
## | | | | | | [71] day <= 20
## | | | | | | | [72] month in apr: no (n = 94, err = 29.8%)
## | | | | | | | [73] month in feb
## | | | | | | | | [74] day.1 <= -1.29846: no (n = 32, err = 34.4%)
## | | | | | | | | [75] day.1 > -1.29846: yes (n = 19, err = 5.3%)
## | | | | | | [76] day > 20: yes (n = 37, err = 5.4%)
## | | | | [77] month in dec, mar, oct, sep: yes (n = 423, err = 4.0%)
## | | | [78] month in aug, jan, jul, jun, may, nov
## | | | | [79] poutcome in failure, other, unknown
## | | | | | [80] contact in cellular, telephone
## | | | | | | [81] duration <= 0.58183
## | | | | | | | [82] month in aug, jan, jul, may, nov
## | | | | | | | | [83] housing in no
## | | | | | | | | | [84] job in admin., management, retired, student, unemployed
## | | | | | | | | | | [85] day <= 27
## | | | | | | | | | | | [86] month in aug, jul: yes (n = 147, err = 43.5%)
## | | | | | | | | | | | [87] month in jan, may, nov: yes (n = 102, err = 20.6%)
## | | | | | | | | | | [88] day > 27: no (n = 26, err = 23.1%)
## | | | | | | | | | [89] job in blue-collar, entrepreneur, housemaid, self-employed, services, technician, unknown
## | | | | | | | | | | [90] day <= 4: yes (n = 20, err = 20.0%)
## | | | | | | | | | | [91] day > 4
## | | | | | | | | | | | [92] balance <= -0.35217
## | | | | | | | | | | | | [93] duration <= 0.36049
## | | | | | | | | | | | | | [94] month in aug, jan, jul, nov: no (n = 70, err = 2.9%)
## | | | | | | | | | | | | | [95] month in may: no (n = 8, err = 50.0%)
## | | | | | | | | | | | | [96] duration > 0.36049: yes (n = 15, err = 46.7%)
## | | | | | | | | | | | [97] balance > -0.35217: no (n = 140, err = 39.3%)
## | | | | | | | | [98] housing in yes
## | | | | | | | | | [99] month in aug
## | | | | | | | | | | [100] poutcome in failure, other: yes (n = 13, err = 0.0%)
## | | | | | | | | | | [101] poutcome in unknown: no (n = 28, err = 32.1%)
## | | | | | | | | | [102] month in jan, jul, may, nov
## | | | | | | | | | | [103] duration <= 0.34108: no (n = 257, err = 16.3%)
## | | | | | | | | | | [104] duration > 0.34108: no (n = 85, err = 32.9%)
## | | | | | | | [105] month in jun: yes (n = 92, err = 7.6%)
## | | | | | | [106] duration > 0.58183
## | | | | | | | [107] housing in no: yes (n = 259, err = 25.1%)
## | | | | | | | [108] housing in yes: yes (n = 184, err = 42.9%)
## | | | | | [109] contact in unknown
## | | | | | | [110] duration <= 0.66726
## | | | | | | | [111] month in jul, may: no (n = 242, err = 1.2%)
## | | | | | | | [112] month in jun, nov: no (n = 113, err = 8.8%)
## | | | | | | [113] duration > 0.66726: no (n = 139, err = 41.7%)
## | | | | [114] poutcome in success: yes (n = 232, err = 3.4%)
## | | [115] duration > 1.22642
## | | | [116] contact in cellular, telephone: yes (n = 1279, err = 9.7%)
## | | | [117] contact in unknown: yes (n = 359, err = 18.9%)
##
## Number of inner nodes: 58
## Number of terminal nodes: 59
# Get rid of the predictors that have near 0 variance or less informative to reduce computational time
n0_var <- nearZeroVar(bank_clean)
bank_new <- bank_clean[,-n0_var]
# Cross validation for Random Forest model
library(rsample)
RNGkind(sample.kind = "Rounding")
set.seed(100)
index <- initial_split(data = bank_new , prop = 0.75, strata = "y")
rf_train <- training(index)
rf_test <- testing(index)
# Check the target proportiton
prop.table(table(rf_train$y))
##
## no yes
## 0.883033 0.116967
# Downsample the target
RNGkind(sample.kind = "Rounding")
set.seed(100)
library(caret)
rf_train <- downSample(x = rf_train %>% select(-y),
y = rf_train$y,
yname = "y")
prop.table(table(rf_train$y))
##
## no yes
## 0.5 0.5
#set.seed(417)
#ctrl <- trainControl(method = "repeatedcv",
# number = 5, # k-fold
# repeats = 3) # repetition
#bank_rf <- train(y ~ .,
# data = rf_train,
# method = "rf", # random forest
# trControl = ctrl)
#saveRDS(bank_rf, "bank_rf.RDS")
bank_rf <- readRDS("bank_rf.RDS")
library(randomForest)
bank_rf$finalModel
##
## Call:
## randomForest(x = x, y = y, mtry = min(param$mtry, ncol(x)))
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 21
##
## OOB estimate of error rate: 14.12%
## Confusion matrix:
## no yes class.error
## no 3271 695 0.1752395
## yes 425 3541 0.1071609
bank_naive_pred <- predict(object = bank_naive,
newdata = data_test,
type = "class")
# Confusion matrix
confusionMatrix(data = bank_naive_pred,
reference = data_test$y) #data actual
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 7861 210
## yes 2127 1105
##
## Accuracy : 0.7932
## 95% CI : (0.7857, 0.8007)
## No Information Rate : 0.8837
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3842
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.7870
## Specificity : 0.8403
## Pos Pred Value : 0.9740
## Neg Pred Value : 0.3419
## Prevalence : 0.8837
## Detection Rate : 0.6955
## Detection Prevalence : 0.7141
## Balanced Accuracy : 0.8137
##
## 'Positive' Class : no
##
AUC AND ROC
# Take the prediction results in terms of probability
bank_naive_pred_prob <- predict(object = bank_naive,
newdata = data_test,
type = "raw")
head(bank_naive_pred_prob)
## no yes
## [1,] 0.9501237 0.04987630
## [2,] 0.9685047 0.03149529
## [3,] 0.9240637 0.07593627
## [4,] 0.9800660 0.01993405
## [5,] 0.8910018 0.10899816
## [6,] 0.9837691 0.01623089
# menyiapkan pred vs actual
data_roc <- data.frame(pred_prob = bank_naive_pred_prob[, 'yes'],
actual = ifelse(data_test$y == 'yes',1, 0))
head(data_roc, 10)
## pred_prob actual
## 1 0.04987630 0
## 2 0.03149529 0
## 3 0.07593627 0
## 4 0.01993405 0
## 5 0.10899816 0
## 6 0.01623089 0
## 7 0.06470933 0
## 8 0.04177653 0
## 9 0.01599570 0
## 10 0.05203860 0
# Make ROC by preparing prediction() object
library(ROCR)
# objek prediction
bank_naive_roc <- prediction(predictions = data_roc$pred_prob,
labels = data_roc$actual)
# ROC curve
plot(performance(bank_naive_roc,"tpr","fpr"))
bank_dt_pred <- predict(object = bank_dt,
newdata = data_test,
type = "response")
# Confusion matrix
confusionMatrix(data = bank_dt_pred,
reference = data_test$y) #data actual
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 8190 147
## yes 1798 1168
##
## Accuracy : 0.8279
## 95% CI : (0.8208, 0.8348)
## No Information Rate : 0.8837
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4583
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8200
## Specificity : 0.8882
## Pos Pred Value : 0.9824
## Neg Pred Value : 0.3938
## Prevalence : 0.8837
## Detection Rate : 0.7246
## Detection Prevalence : 0.7376
## Balanced Accuracy : 0.8541
##
## 'Positive' Class : no
##
#See whethr the dt model overfits
bank_dt_pred_train <- predict(object = bank_dt,
newdata = data_train,
type = "response")
#Confusion matrix
confusionMatrix(data = bank_dt_pred_train,
reference = data_train$y)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 3296 449
## yes 678 3525
##
## Accuracy : 0.8582
## 95% CI : (0.8503, 0.8658)
## No Information Rate : 0.5
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7164
##
## Mcnemar's Test P-Value : 1.109e-11
##
## Sensitivity : 0.8294
## Specificity : 0.8870
## Pos Pred Value : 0.8801
## Neg Pred Value : 0.8387
## Prevalence : 0.5000
## Detection Rate : 0.4147
## Detection Prevalence : 0.4712
## Balanced Accuracy : 0.8582
##
## 'Positive' Class : no
##
bank_rf_pred <- predict(object = bank_rf,
newdata = rf_test,
type = "raw")
#Confusion matrix
confusionMatrix(data = bank_rf_pred,
reference = rf_test$y) #data actual
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 8283 125
## yes 1698 1198
##
## Accuracy : 0.8387
## 95% CI : (0.8318, 0.8455)
## No Information Rate : 0.883
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.4852
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8299
## Specificity : 0.9055
## Pos Pred Value : 0.9851
## Neg Pred Value : 0.4137
## Prevalence : 0.8830
## Detection Rate : 0.7327
## Detection Prevalence : 0.7438
## Balanced Accuracy : 0.8677
##
## 'Positive' Class : no
##
#Check some of the most important predictors
varImp(bank_rf)
## rf variable importance
##
## only 20 most important variables shown (out of 40)
##
## Overall
## duration 100.000
## balance 19.647
## age 18.239
## poutcomesuccess 15.380
## contactunknown 12.365
## day.1 12.043
## day 11.919
## housingyes 8.888
## campaign 7.215
## poutcomeunknown 5.097
## monthmar 3.798
## monthaug 3.281
## monthjul 3.120
## loanyes 2.937
## monthoct 2.859
## monthmay 2.850
## jobblue-collar 2.473
## monthnov 2.452
## monthjun 2.403
## maritalmarried 2.281