Abstract: The data is related to the direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict whether the client will subscribe to a term deposit (variable y).
Data Set Information: The data is related to the direct marketing campaigns of a Portuguese banking institution. The campaigns were based on phone calls; often more than one contact with the same client was required in order to assess whether the product (a bank term deposit) would be subscribed ('yes') or not ('no').
Dataset from: http://archive.ics.uci.edu/ml/datasets/Bank+Marketing#
library(dplyr)
library(caret)
library(e1071)
library(partykit)
library(ROCR)
library(rpart.plot)
library(randomForest)

bank <- read.csv("data_input/bank-additional-full.csv", sep = ";", stringsAsFactors = T)

head(bank)

glimpse(bank)
#> Rows: 41,188
#> Columns: 21
#> $ age <int> 56, 57, 37, 40, 56, 45, 59, 41, 24, 25, 41, 25, 29, 57,~
#> $ job <fct> housemaid, services, services, admin., services, servic~
#> $ marital <fct> married, married, married, married, married, married, m~
#> $ education <fct> basic.4y, high.school, high.school, basic.6y, high.scho~
#> $ default <fct> no, unknown, no, no, no, unknown, no, unknown, no, no, ~
#> $ housing <fct> no, no, yes, no, no, no, no, no, yes, yes, no, yes, no,~
#> $ loan <fct> no, no, no, no, yes, no, no, no, no, no, no, no, yes, n~
#> $ contact <fct> telephone, telephone, telephone, telephone, telephone, ~
#> $ month <fct> may, may, may, may, may, may, may, may, may, may, may, ~
#> $ day_of_week <fct> mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, mon, ~
#> $ duration <int> 261, 149, 226, 151, 307, 198, 139, 217, 380, 50, 55, 22~
#> $ campaign <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
#> $ pdays <int> 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, ~
#> $ previous <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
#> $ poutcome <fct> nonexistent, nonexistent, nonexistent, nonexistent, non~
#> $ emp.var.rate <dbl> 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, 1.1, ~
#> $ cons.price.idx <dbl> 93.994, 93.994, 93.994, 93.994, 93.994, 93.994, 93.994,~
#> $ cons.conf.idx <dbl> -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4, -36.4,~
#> $ euribor3m <dbl> 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857, 4.857,~
#> $ nr.employed <dbl> 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5191, 5~
#> $ y <fct> no, no, no, no, no, no, no, no, no, no, no, no, no, no,~
anyNA(bank)
#> [1] FALSE
The data has no missing values (no NA entries).
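Note that anyNA() only detects NA values; in this dataset missing information is instead encoded as the factor level "unknown" (visible in the default column above). A small sketch for counting that level in each factor column:

# count the "unknown" level in each factor column
# (missing information in this dataset is not coded as NA)
sapply(Filter(is.factor, bank), function(x) sum(x == "unknown"))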
RNGkind(sample.kind = "Rounding")
set.seed(200)
# index sampling
index <- sample(x = nrow(bank),
                size = nrow(bank) * 0.8)
# splitting
bank_train <- bank[index, ]
bank_test <- bank[-index, ]

prop.table(table(bank_train$y))
#>
#> no yes
#> 0.8881335 0.1118665
The class proportion in the training set is roughly 8:1 (no:yes), so the data is unbalanced; the majority class is therefore downsampled before training.
RNGkind(sample.kind = "Rounding")
set.seed(123)
bank_train <- downSample(x = bank_train %>%
                           select(-y),
                         y = bank_train$y,
                         yname = "y")

prop.table(table(bank_train$y))
#>
#> no yes
#> 0.5 0.5
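Downsampling balances the classes by discarding rows from the majority class, so the training set shrinks considerably; the absolute class counts can be checked with a quick sanity check such as:

# absolute class counts after downsampling
table(bank_train$y)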
model_naive <- naiveBayes(formula = y ~ ., data = bank_train, laplace = 1)

test <- bank_test %>% select(-y)
naive_pred <- predict(object = model_naive, newdata = test)
confusionMatrix(data = naive_pred, reference = bank_test$y, positive = "yes")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction no yes
#> no 5895 228
#> yes 1389 726
#>
#> Accuracy : 0.8037
#> 95% CI : (0.795, 0.8122)
#> No Information Rate : 0.8842
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.373
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.76101
#> Specificity : 0.80931
#> Pos Pred Value : 0.34326
#> Neg Pred Value : 0.96276
#> Prevalence : 0.11580
#> Detection Rate : 0.08813
#> Detection Prevalence : 0.25674
#> Balanced Accuracy : 0.78516
#>
#> 'Positive' Class : yes
#>
The model has fairly good accuracy and specificity, both in the 80% range, but its sensitivity is only around 76%.
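Because the positive class "yes" is the business focus, sensitivity can be traded against specificity by classifying on the predicted probability of "yes" with a custom cutoff instead of the default 0.5. A minimal sketch, where the 0.35 cutoff is only an illustrative value and has not been tuned:

# predicted probability of "yes" from the naive Bayes model
naive_prob <- predict(object = model_naive, newdata = test, type = "raw")[, "yes"]

# a lower cutoff flags more clients as "yes": higher sensitivity, lower specificity
naive_pred_cutoff <- factor(ifelse(naive_prob > 0.35, "yes", "no"),
                            levels = levels(bank_test$y))
confusionMatrix(data = naive_pred_cutoff, reference = bank_test$y, positive = "yes")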
bank_test$pred <- predict(object = model_naive, newdata = bank_test, type = "raw")
bank_test$actual <- ifelse(test = bank_test$y == "yes", 1, 0)

# prediction object
# note: bank_test$pred[, 1] is the predicted probability of the "no" class,
# so the ROC curve and AUC below are inverted with respect to the positive class "yes"
roc_pred <- prediction(predictions = bank_test$pred[,1], labels = bank_test$actual)

# ROC curve
plot(performance(prediction.obj = roc_pred, measure = "tpr", x.measure = "fpr"))
abline(0, 1, lty = 8)

vote_auc <- performance(prediction.obj = roc_pred, measure = "auc")
vote_auc@y.values
#> [[1]]
#> [1] 0.1379993
The AUC of 0.138 above is computed from the probability of the 'no' class; with respect to the positive class 'yes' the AUC is 1 - 0.138 ≈ 0.86, so the model_naive model is reasonably good at separating the 'yes' and 'no' classes.
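As a check, the AUC can be computed directly from the "yes" column of the prediction matrix; a small sketch, assuming the column names follow the factor levels as returned by naiveBayes:

# AUC using the predicted probability of the positive class "yes"
roc_pred_yes <- prediction(predictions = bank_test$pred[, "yes"], labels = bank_test$actual)
auc_yes <- performance(prediction.obj = roc_pred_yes, measure = "auc")
auc_yes@y.values # expected to equal one minus the value reported above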
model_tree <- ctree(formula = y ~ ., data = bank_train)

plot(model_tree, type = "simple")

model_tree
#>
#> Model formula:
#> y ~ age + job + marital + education + default + housing + loan +
#> contact + month + day_of_week + duration + campaign + pdays +
#> previous + poutcome + emp.var.rate + cons.price.idx + cons.conf.idx +
#> euribor3m + nr.employed
#>
#> Fitted party:
#> [1] root
#> | [2] nr.employed <= 5076.2
#> | | [3] duration <= 123
#> | | | [4] poutcome in failure, nonexistent
#> | | | | [5] duration <= 77: no (n = 42, err = 4.8%)
#> | | | | [6] duration > 77
#> | | | | | [7] previous <= 1: no (n = 105, err = 45.7%)
#> | | | | | [8] previous > 1: no (n = 12, err = 0.0%)
#> | | | [9] poutcome in success: yes (n = 41, err = 19.5%)
#> | | [10] duration > 123
#> | | | [11] poutcome in failure, nonexistent
#> | | | | [12] duration <= 261: yes (n = 472, err = 18.9%)
#> | | | | [13] duration > 261: yes (n = 731, err = 6.3%)
#> | | | [14] poutcome in success: yes (n = 613, err = 4.1%)
#> | [15] nr.employed > 5076.2
#> | | [16] duration <= 474
#> | | | [17] month in apr, dec, mar, oct
#> | | | | [18] day_of_week in fri, mon
#> | | | | | [19] month in apr, dec
#> | | | | | | [20] education in basic.4y, university.degree, unknown: yes (n = 70, err = 47.1%)
#> | | | | | | [21] education in basic.6y, basic.9y, high.school, professional.course: no (n = 80, err = 12.5%)
#> | | | | | [22] month in mar, oct: yes (n = 50, err = 14.0%)
#> | | | | [23] day_of_week in thu, tue, wed
#> | | | | | [24] default in no
#> | | | | | | [25] month in apr
#> | | | | | | | [26] duration <= 96: no (n = 22, err = 18.2%)
#> | | | | | | | [27] duration > 96: yes (n = 217, err = 22.6%)
#> | | | | | | [28] month in mar, oct: yes (n = 84, err = 3.6%)
#> | | | | | [29] default in unknown: no (n = 14, err = 28.6%)
#> | | | [30] month in aug, jul, jun, may, nov
#> | | | | [31] duration <= 364
#> | | | | | [32] emp.var.rate <= -1.8
#> | | | | | | [33] duration <= 181
#> | | | | | | | [34] pdays <= 6: no (n = 8, err = 37.5%)
#> | | | | | | | [35] pdays > 6
#> | | | | | | | | [36] duration <= 107
#> | | | | | | | | | [37] job in admin., blue-collar, entrepreneur, management, retired, self-employed, services, technician: no (n = 167, err = 0.0%)
#> | | | | | | | | | [38] job in student, unemployed: no (n = 7, err = 14.3%)
#> | | | | | | | | [39] duration > 107
#> | | | | | | | | | [40] euribor3m <= 1.27: no (n = 52, err = 21.2%)
#> | | | | | | | | | [41] euribor3m > 1.27: no (n = 84, err = 0.0%)
#> | | | | | | [42] duration > 181
#> | | | | | | | [43] day_of_week in fri, mon
#> | | | | | | | | [44] pdays <= 11: yes (n = 7, err = 0.0%)
#> | | | | | | | | [45] pdays > 11: no (n = 97, err = 40.2%)
#> | | | | | | | [46] day_of_week in thu, tue, wed
#> | | | | | | | | [47] euribor3m <= 1.27: no (n = 31, err = 41.9%)
#> | | | | | | | | [48] euribor3m > 1.27: no (n = 82, err = 6.1%)
#> | | | | | [49] emp.var.rate > -1.8
#> | | | | | | [50] month in aug, jul, jun, may
#> | | | | | | | [51] duration <= 355
#> | | | | | | | | [52] duration <= 181: no (n = 1331, err = 0.0%)
#> | | | | | | | | [53] duration > 181: no (n = 636, err = 2.0%)
#> | | | | | | | [54] duration > 355: no (n = 24, err = 12.5%)
#> | | | | | | [55] month in nov
#> | | | | | | | [56] euribor3m <= 4.191
#> | | | | | | | | [57] duration <= 278
#> | | | | | | | | | [58] job in admin., blue-collar, entrepreneur, management, self-employed, services, technician, unemployed: no (n = 231, err = 0.0%)
#> | | | | | | | | | [59] job in housemaid, retired: no (n = 11, err = 9.1%)
#> | | | | | | | | [60] duration > 278: no (n = 28, err = 17.9%)
#> | | | | | | | [61] euribor3m > 4.191: yes (n = 15, err = 20.0%)
#> | | | | [62] duration > 364
#> | | | | | [63] cons.price.idx <= 92.893: yes (n = 86, err = 43.0%)
#> | | | | | [64] cons.price.idx > 92.893
#> | | | | | | [65] contact in cellular: no (n = 137, err = 31.4%)
#> | | | | | | [66] contact in telephone: no (n = 124, err = 12.9%)
#> | | [67] duration > 474
#> | | | [68] duration <= 635
#> | | | | [69] contact in cellular: yes (n = 346, err = 24.0%)
#> | | | | [70] contact in telephone
#> | | | | | [71] cons.conf.idx <= -40.4: yes (n = 69, err = 30.4%)
#> | | | | | [72] cons.conf.idx > -40.4: no (n = 77, err = 41.6%)
#> | | | [73] duration > 635: yes (n = 1169, err = 11.4%)
#>
#> Number of inner nodes: 36
#> Number of terminal nodes: 37
tree_pred <- predict(object = model_tree, newdata = bank_test)
confusionMatrix(data = tree_pred, reference = bank_test$y, positive = "yes")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction no yes
#> no 6073 69
#> yes 1211 885
#>
#> Accuracy : 0.8446
#> 95% CI : (0.8366, 0.8524)
#> No Information Rate : 0.8842
#> P-Value [Acc > NIR] : 1
#>
#> Kappa : 0.5009
#>
#> Mcnemar's Test P-Value : <2e-16
#>
#> Sensitivity : 0.9277
#> Specificity : 0.8337
#> Pos Pred Value : 0.4222
#> Neg Pred Value : 0.9888
#> Prevalence : 0.1158
#> Detection Rate : 0.1074
#> Detection Prevalence : 0.2544
#> Balanced Accuracy : 0.8807
#>
#> 'Positive' Class : yes
#>
The decision tree model has a fairly high accuracy of 84.46%, with sensitivity = 92.77% and specificity = 83.37%.
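The fitted tree is large (36 inner nodes, 37 terminal nodes). If a more compact, interpretable tree is preferred, its growth can be restricted with partykit's ctree_control; a minimal sketch, where maxdepth and mincriterion are illustrative values rather than tuned ones:

# a shallower conditional inference tree on the same downsampled training data
model_tree_small <- ctree(formula = y ~ .,
                          data = bank_train,
                          control = ctree_control(maxdepth = 4,         # limit tree depth
                                                  mincriterion = 0.99)) # stricter split criterion (1 - alpha)
plot(model_tree_small, type = "simple")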
# prediction object (using the decision tree's predicted probability of "yes")
tree_prob <- predict(object = model_tree, newdata = bank_test, type = "prob")
roc_tree <- prediction(predictions = tree_prob[, "yes"], labels = bank_test$actual)

# ROC curve
plot(performance(prediction.obj = roc_tree, measure = "tpr", x.measure = "fpr"))
abline(0, 1, lty = 8)

Before training the random forest, predictors with near-zero variance are removed with nearZeroVar().
n0_var <- nearZeroVar(bank)
bank <- bank[ , -n0_var]

head(bank)

RNGkind(sample.kind = "Rounding")
set.seed(200)
# index sampling
index <- sample(x = nrow(bank),
                size = nrow(bank) * 0.8)
# splitting
bank_train2 <- bank[index, ]
bank_test2 <- bank[-index, ]

set.seed(417)
ctrl <- trainControl(method = "repeatedcv",
                     number = 5,  # k-fold
                     repeats = 3) # repetitions
bank_forest <- train(y ~ .,
                     data = bank_train2,
                     method = "rf", # random forest
                     trControl = ctrl)
saveRDS(bank_forest, "bank_forest.RDS") # save model

b_forest <- readRDS("bank_forest.RDS")

b_forest$finalModel
#>
#> Call:
#> randomForest(x = x, y = y, mtry = min(param$mtry, ncol(x)))
#> Type of random forest: classification
#> Number of trees: 500
#> No. of variables tried at each split: 27
#>
#> OOB estimate of error rate: 8.58%
#> Confusion matrix:
#> no yes class.error
#> no 28117 1147 0.03919492
#> yes 1680 2006 0.45577862
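To see which predictors drive the forest's decisions, caret's varImp() can be applied to the trained model; a small sketch (output not shown here):

# variable importance from the trained random forest (scaled to 0-100 by default)
rf_imp <- varImp(b_forest)
plot(rf_imp, top = 10) # plot the 10 most important predictors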
pred_forest <- predict(object = b_forest, newdata = bank_test2, type = "raw")

confusionMatrix(data = pred_forest, reference = bank_test2$y, positive = "yes")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction no yes
#> no 6978 415
#> yes 306 539
#>
#> Accuracy : 0.9125
#> 95% CI : (0.9062, 0.9185)
#> No Information Rate : 0.8842
#> P-Value [Acc > NIR] : < 2.2e-16
#>
#> Kappa : 0.5503
#>
#> Mcnemar's Test P-Value : 5.767e-05
#>
#> Sensitivity : 0.56499
#> Specificity : 0.95799
#> Pos Pred Value : 0.63787
#> Neg Pred Value : 0.94387
#> Prevalence : 0.11580
#> Detection Rate : 0.06543
#> Detection Prevalence : 0.10257
#> Balanced Accuracy : 0.76149
#>
#> 'Positive' Class : yes
#>
The random forest has a fairly high accuracy of 91.25%, but its sensitivity/recall is low at only about 56.5%.
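The low recall is partly because bank_train2 was not balanced before training. One option, assuming the same caret setup, is to let caret downsample the majority class inside each resample via the sampling argument of trainControl; a sketch (retraining takes a while and is not run here):

set.seed(417)
ctrl_down <- trainControl(method = "repeatedcv",
                          number = 5,
                          repeats = 3,
                          sampling = "down") # downsample within each resample

# bank_forest_down <- train(y ~ ., data = bank_train2, method = "rf", trControl = ctrl_down)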
Based on accuracy alone, the random forest is the best model. In this case, however, recall (sensitivity) is used to compare the models, because false negatives, clients who would subscribe but are predicted not to, are the most costly errors for the business. By that criterion, the decision tree is the best model for this case, with a recall of 92.77%.
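For reference, the recall values can be collected into a single table; a sketch that assumes the three confusionMatrix() results above are first stored in objects (they are only printed, not assigned, in the code above):

# store the confusion matrices, then extract sensitivity (recall) for the positive class "yes"
cm_naive  <- confusionMatrix(data = naive_pred,  reference = bank_test$y,  positive = "yes")
cm_tree   <- confusionMatrix(data = tree_pred,   reference = bank_test$y,  positive = "yes")
cm_forest <- confusionMatrix(data = pred_forest, reference = bank_test2$y, positive = "yes")

data.frame(model  = c("naive bayes", "decision tree", "random forest"),
           recall = c(cm_naive$byClass["Sensitivity"],
                      cm_tree$byClass["Sensitivity"],
                      cm_forest$byClass["Sensitivity"]))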