## Rows: 4,521
## Columns: 17
## $ age <int> 30, 33, 35, 30, 59, 35, 36, 39, 41, 43, 39, 43, 36, 20, 31, …
## $ job <chr> "unemployed", "services", "management", "management", "blue-…
## $ marital <chr> "married", "married", "single", "married", "married", "singl…
## $ education <chr> "primary", "secondary", "tertiary", "tertiary", "secondary",…
## $ default <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
## $ balance <int> 1787, 4789, 1350, 1476, 0, 747, 307, 147, 221, -88, 9374, 26…
## $ housing <chr> "no", "yes", "yes", "yes", "yes", "no", "yes", "yes", "yes",…
## $ loan <chr> "no", "yes", "no", "yes", "no", "no", "no", "no", "no", "yes…
## $ contact <chr> "cellular", "cellular", "cellular", "unknown", "unknown", "c…
## $ day <int> 19, 11, 16, 3, 5, 23, 14, 6, 14, 17, 20, 17, 13, 30, 29, 29,…
## $ month <chr> "oct", "may", "apr", "jun", "may", "feb", "may", "may", "may…
## $ duration <int> 79, 220, 185, 199, 226, 141, 341, 151, 57, 313, 273, 113, 32…
## $ campaign <int> 1, 1, 1, 4, 1, 2, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 5, 1, 1, 1, …
## $ pdays <int> -1, 339, 330, -1, -1, 176, 330, -1, -1, 147, -1, -1, -1, -1,…
## $ previous <int> 0, 4, 1, 0, 0, 3, 2, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 2, 0, 1, …
## $ poutcome <chr> "unknown", "failure", "failure", "unknown", "unknown", "fail…
## $ y <chr> "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", …
#convert categorical variables to a factor
bank <- bank %>%
mutate(y = factor(y, levels = c("no", "yes")),
job = as.factor(job),
marital = as.factor(marital),
education = as.factor(education),
default = as.factor(default),
housing = as.factor(housing),
loan = as.factor(loan),
contact = as.factor(contact),
month = as.factor(month),
poutcome = as.factor(poutcome),
job = as.factor(job))
#Summary statistics
summary(bank)
## age job marital education default
## Min. :19.00 management :969 divorced: 528 primary : 678 no :4445
## 1st Qu.:33.00 blue-collar:946 married :2797 secondary:2306 yes: 76
## Median :39.00 technician :768 single :1196 tertiary :1350
## Mean :41.17 admin. :478 unknown : 187
## 3rd Qu.:49.00 services :417
## Max. :87.00 retired :230
## (Other) :713
## balance housing loan contact day
## Min. :-3313 no :1962 no :3830 cellular :2896 Min. : 1.00
## 1st Qu.: 69 yes:2559 yes: 691 telephone: 301 1st Qu.: 9.00
## Median : 444 unknown :1324 Median :16.00
## Mean : 1423 Mean :15.92
## 3rd Qu.: 1480 3rd Qu.:21.00
## Max. :71188 Max. :31.00
##
## month duration campaign pdays
## may :1398 Min. : 4 Min. : 1.000 Min. : -1.00
## jul : 706 1st Qu.: 104 1st Qu.: 1.000 1st Qu.: -1.00
## aug : 633 Median : 185 Median : 2.000 Median : -1.00
## jun : 531 Mean : 264 Mean : 2.794 Mean : 39.77
## nov : 389 3rd Qu.: 329 3rd Qu.: 3.000 3rd Qu.: -1.00
## apr : 293 Max. :3025 Max. :50.000 Max. :871.00
## (Other): 571
## previous poutcome y
## Min. : 0.0000 failure: 490 no :4000
## 1st Qu.: 0.0000 other : 197 yes: 521
## Median : 0.0000 success: 129
## Mean : 0.5426 unknown:3705
## 3rd Qu.: 0.0000
## Max. :25.0000
##
n_bank <- bank %>%
select(where(is.numeric))
summary(n_bank)
## age balance day duration
## Min. :19.00 Min. :-3313 Min. : 1.00 Min. : 4
## 1st Qu.:33.00 1st Qu.: 69 1st Qu.: 9.00 1st Qu.: 104
## Median :39.00 Median : 444 Median :16.00 Median : 185
## Mean :41.17 Mean : 1423 Mean :15.92 Mean : 264
## 3rd Qu.:49.00 3rd Qu.: 1480 3rd Qu.:21.00 3rd Qu.: 329
## Max. :87.00 Max. :71188 Max. :31.00 Max. :3025
## campaign pdays previous
## Min. : 1.000 Min. : -1.00 Min. : 0.0000
## 1st Qu.: 1.000 1st Qu.: -1.00 1st Qu.: 0.0000
## Median : 2.000 Median : -1.00 Median : 0.0000
## Mean : 2.794 Mean : 39.77 Mean : 0.5426
## 3rd Qu.: 3.000 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :50.000 Max. :871.00 Max. :25.0000
c_bank <- bank %>% select(-where(is.numeric))
summary(c_bank)
## job marital education default housing
## management :969 divorced: 528 primary : 678 no :4445 no :1962
## blue-collar:946 married :2797 secondary:2306 yes: 76 yes:2559
## technician :768 single :1196 tertiary :1350
## admin. :478 unknown : 187
## services :417
## retired :230
## (Other) :713
## loan contact month poutcome y
## no :3830 cellular :2896 may :1398 failure: 490 no :4000
## yes: 691 telephone: 301 jul : 706 other : 197 yes: 521
## unknown :1324 aug : 633 success: 129
## jun : 531 unknown:3705
## nov : 389
## apr : 293
## (Other): 571
colSums(is.na(bank))
## age job marital education default balance housing loan
## 0 0 0 0 0 0 0 0
## contact day month duration campaign pdays previous poutcome
## 0 0 0 0 0 0 0 0
## y
## 0
#numeric distributions
n_bank %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30, fill = "blue", alpha = 0.7) +
facet_wrap(~variable, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Numeric Variables")
c_bank %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value)) +
geom_bar(fill = "blue", alpha = 0.7) +
facet_wrap(~variable, scales = "free") +
theme_minimal() +
labs(title = "Distribution of Categorical Variables") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
n_bank <- n_bank %>%
mutate(y = bank$y)
n_bank <- n_bank %>%
mutate(y = as.factor(y))
n_bank_long <- n_bank %>%
pivot_longer(cols = -y, names_to = "variable", values_to = "value")
ggplot(n_bank_long, aes(x = y, y = value, fill = y)) +
geom_boxplot() +
facet_wrap(~variable, scales = "free") +
labs(title = "Distribution of Numeric Variables by Yes/No in Y",
x = "Y (Outcome)", y = "Value") +
theme_minimal()
c_bank <- c_bank %>%
mutate(y = bank$y) %>%
mutate(y = as.factor(y))
c_bank_long <- c_bank %>%
pivot_longer(cols = -y, names_to = "variable", values_to = "value") %>%
count(variable, value, y)
ggplot(c_bank_long, aes(x = value, y = n, fill = y)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.7) +
facet_wrap(~variable, scales = "free") +
labs(title = "Relationship Between Categorical Variables and Y (Yes/No)",
x = "Category", y = "Count") +
scale_fill_manual(values = c("yes" = "blue", "no" = "red")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
n_bank <- n_bank %>%
mutate(y = as.numeric(as.factor(y)) - 1)
cor_matrix <- cor(n_bank, use = "complete.obs")
ggcorrplot(cor_matrix,
method = "circle",
type = "lower",
lab = TRUE,
lab_size = 3,
colors = c("blue", "white", "red"),
title = "Correlation Heatmap",
ggtheme = theme_minimal())
80% of the data will be used for training and 20% will be used for testing
set.seed(1234)
sample <- sample(nrow(bank), round(nrow(bank)*.8),
replace = FALSE)
bank_train <- bank[sample,]
bank_test <- bank[-sample,]
round(prop.table(table(select(bank, y))),2)
## y
## no yes
## 0.88 0.12
round(prop.table(table(select(bank_train, y))),2)
## y
## no yes
## 0.88 0.12
round(prop.table(table(select(bank_test, y))),2)
## y
## no yes
## 0.89 0.11
First model includes all variables and we will split the data 80/20. We want to understand which variables are significant
bank_mod <-
rpart(
y ~ .,
method = "class",
data = bank_train
)
rpart.plot(bank_mod)
bank_pred <- predict(bank_mod, bank_test, type = "class")
bank_pred_table <- table(bank_test$y, bank_pred)
bank_pred_table
## bank_pred
## no yes
## no 774 29
## yes 74 27
sum(diag(bank_pred_table)) / nrow(bank_test)
## [1] 0.8860619
cm <- confusionMatrix(data = bank_pred, reference = bank_test$y, positive = "yes")
cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 774 74
## yes 29 27
##
## Accuracy : 0.8861
## 95% CI : (0.8635, 0.906)
## No Information Rate : 0.8883
## P-Value [Acc > NIR] : 0.609
##
## Kappa : 0.2871
##
## Mcnemar's Test P-Value : 1.455e-05
##
## Sensitivity : 0.26733
## Specificity : 0.96389
## Pos Pred Value : 0.48214
## Neg Pred Value : 0.91274
## Prevalence : 0.11173
## Detection Rate : 0.02987
## Detection Prevalence : 0.06195
## Balanced Accuracy : 0.61561
##
## 'Positive' Class : yes
##
sensitivity is very low, only 27% of successes were detected. Model predicts yes correct less than 50% of the time.
balanced data using all variables Use Rose to over sample the yes class Hypothesis: A balanced data will improve the accuracy, Kappa, and Pos prediction value
data_balanced <- ROSE(y ~ ., data = bank, seed = 124)$data
table(data_balanced$y)
##
## no yes
## 2313 2208
set.seed(124)
trainIndex <- createDataPartition(data_balanced$y, p = 0.8, list = FALSE)
bank_train2 <- data_balanced[trainIndex, ]
bank_test2 <- data_balanced[-trainIndex, ]
bank_mod2 <-
rpart(
y ~ .,
method = "class",
data = bank_train2
)
rpart.plot(bank_mod2)
bank_pred2 <- predict(bank_mod2, bank_test2, type = "class")
cm2 <- confusionMatrix(data = bank_pred2, reference = bank_test2$y, positive = "yes")
cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 359 82
## yes 103 359
##
## Accuracy : 0.7951
## 95% CI : (0.7673, 0.821)
## No Information Rate : 0.5116
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.5905
##
## Mcnemar's Test P-Value : 0.1414
##
## Sensitivity : 0.8141
## Specificity : 0.7771
## Pos Pred Value : 0.7771
## Neg Pred Value : 0.8141
## Prevalence : 0.4884
## Detection Rate : 0.3976
## Detection Prevalence : 0.5116
## Balanced Accuracy : 0.7956
##
## 'Positive' Class : yes
##
after balancing the data, the models sensitivity improves 4 times the previous model. Kappa - .56 Sensitivity - .81 ## Random forest
Random forest using the balanced data and significant factors; Job
had very low importance. Accuracy, specificity, and Kappa remain high.
Kappa : 0.6037
Sensitivity : 0.8186
Specificity : 0.7857
bank_train2$y <- factor(bank_train2$y, levels = c("yes", "no"))
bank_test2$y <- factor(bank_test2$y, levels = c("yes", "no"))
set.seed(126)
rf_model <- randomForest(y ~ duration + poutcome + job + month , data = bank_train2, ntree = 500, mtry = sqrt(ncol(bank_train2) - 1), importance = TRUE)
print(rf_model)
##
## Call:
## randomForest(formula = y ~ duration + poutcome + job + month, data = bank_train2, ntree = 500, mtry = sqrt(ncol(bank_train2) - 1), importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 19.65%
## Confusion matrix:
## yes no class.error
## yes 1420 347 0.1963780
## no 364 1487 0.1966505
importance(rf_model)
## yes no MeanDecreaseAccuracy MeanDecreaseGini
## duration 174.23918 150.86938 221.1754 1037.9661
## poutcome 95.83095 97.54879 133.3147 181.0547
## job 71.06409 20.08557 69.0880 223.5180
## month 105.64045 81.29794 131.0640 364.9532
varImpPlot(rf_model)
predictions <- predict(rf_model, bank_test2, type = "class")
confusionMatrix(table(predictions, bank_test2$y))
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 361 99
## no 80 363
##
## Accuracy : 0.8018
## 95% CI : (0.7742, 0.8273)
## No Information Rate : 0.5116
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6037
##
## Mcnemar's Test P-Value : 0.1785
##
## Sensitivity : 0.8186
## Specificity : 0.7857
## Pos Pred Value : 0.7848
## Neg Pred Value : 0.8194
## Prevalence : 0.4884
## Detection Rate : 0.3998
## Detection Prevalence : 0.5094
## Balanced Accuracy : 0.8022
##
## 'Positive' Class : yes
##
For our next model we will go back to using all of the variables to understand which are important to predict a success in subcriptions to a term deposit.
Balanced data with all variables. The error rate for yes misclassifications decreased by a couple of percentage points. The Kappa improves to 72%, meaning that the model’s prediction agrees with the actual outcomes very well when accounting for the possibility of agreement by chance. Month along with duration is a significant factor as in the other models.
set.seed(127)
rf_model2 <- randomForest(y ~ . , data = bank_train2, ntree = 500, mtry = sqrt(ncol(bank_train2) - 1), importance = TRUE)
print(rf_model2)
##
## Call:
## randomForest(formula = y ~ ., data = bank_train2, ntree = 500, mtry = sqrt(ncol(bank_train2) - 1), importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 12.6%
## Confusion matrix:
## yes no class.error
## yes 1565 202 0.1143181
## no 254 1597 0.1372231
importance(rf_model2)
## yes no MeanDecreaseAccuracy MeanDecreaseGini
## age 19.820523 21.572753 29.700444 103.723779
## job 49.632793 17.263917 49.445207 126.675076
## marital 24.374131 10.114840 25.286746 28.699770
## education 18.927340 9.791511 21.902100 31.341975
## default 6.901647 2.935399 6.525886 2.891009
## balance 20.221959 8.092355 19.457560 102.214957
## housing 21.785807 11.726264 22.117810 20.989088
## loan 19.174621 8.614734 19.445217 15.450295
## contact 25.387859 32.874001 36.256648 44.063995
## day 18.308964 19.305632 26.024022 103.066523
## month 66.603151 61.593286 83.421355 221.760095
## duration 132.482029 133.421898 158.444418 514.816303
## campaign 25.589388 5.854974 23.598160 101.715509
## pdays 5.182782 34.678797 36.324847 129.501015
## previous 18.917044 42.631203 44.014716 164.431657
## poutcome 24.525720 40.771142 50.914476 94.491496
varImpPlot(rf_model2)
predictions <- predict(rf_model2, bank_test2, type = "class")
confusionMatrix(table(predictions, bank_test2$y))
## Confusion Matrix and Statistics
##
##
## predictions yes no
## yes 388 71
## no 53 391
##
## Accuracy : 0.8627
## 95% CI : (0.8385, 0.8845)
## No Information Rate : 0.5116
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7255
##
## Mcnemar's Test P-Value : 0.1268
##
## Sensitivity : 0.8798
## Specificity : 0.8463
## Pos Pred Value : 0.8453
## Neg Pred Value : 0.8806
## Prevalence : 0.4884
## Detection Rate : 0.4297
## Detection Prevalence : 0.5083
## Balanced Accuracy : 0.8631
##
## 'Positive' Class : yes
##
5 k-fold cross validation 500 trees Hypothesis: remove bias and overfitting
set.seed(128)
ada_model <- train(y ~ duration + month + previous + poutcome , data = bank_train2, method = "ada", trControl = trainControl(method = "cv", number = 5))
print(ada_model)
## Boosted Classification Trees
##
## 3618 samples
## 4 predictor
## 2 classes: 'yes', 'no'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 2894, 2894, 2895, 2895, 2894
## Resampling results across tuning parameters:
##
## maxdepth iter Accuracy Kappa
## 1 50 0.2636857 -0.4647131
## 1 100 0.2426797 -0.5084060
## 1 150 0.2363227 -0.5214948
## 2 50 0.2260945 -0.5425801
## 2 100 0.2222259 -0.5507751
## 2 150 0.2175256 -0.5606495
## 3 50 0.2147609 -0.5659306
## 3 100 0.2059165 -0.5846610
## 3 150 0.2017694 -0.5935630
##
## Tuning parameter 'nu' was held constant at a value of 0.1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were iter = 50, maxdepth = 1 and nu = 0.1.
Month showed no importance
pred <- predict(ada_model, newdata = bank_test2, type = "raw")
confusionMatrix(table(pred, bank_test2$y))
## Confusion Matrix and Statistics
##
##
## pred yes no
## yes 182 394
## no 259 68
##
## Accuracy : 0.2769
## 95% CI : (0.2479, 0.3073)
## No Information Rate : 0.5116
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.4371
##
## Mcnemar's Test P-Value : 1.573e-07
##
## Sensitivity : 0.4127
## Specificity : 0.1472
## Pos Pred Value : 0.3160
## Neg Pred Value : 0.2080
## Prevalence : 0.4884
## Detection Rate : 0.2016
## Detection Prevalence : 0.6379
## Balanced Accuracy : 0.2799
##
## 'Positive' Class : yes
##
plot(varImp(ada_model))
Accuracy : 0.2769 Sensitivity : 0.4127 Kappa : -0.4371
results worse than random prediction ### Model 6 increase K-folds to
reduce variance and get a better performance estimate
set.seed(129)
ada_model2 <- train(y ~ . - job,
data = bank_train2,
method = "ada",
trControl = trainControl(method = "cv", number = 20))
pred <- predict(ada_model2, newdata = bank_test2, type = "raw")
confusionMatrix(table(pred, bank_test2$y))
## Confusion Matrix and Statistics
##
##
## pred yes no
## yes 171 387
## no 270 75
##
## Accuracy : 0.2724
## 95% CI : (0.2436, 0.3027)
## No Information Rate : 0.5116
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.4472
##
## Mcnemar's Test P-Value : 6.023e-06
##
## Sensitivity : 0.3878
## Specificity : 0.1623
## Pos Pred Value : 0.3065
## Neg Pred Value : 0.2174
## Prevalence : 0.4884
## Detection Rate : 0.1894
## Detection Prevalence : 0.6179
## Balanced Accuracy : 0.2750
##
## 'Positive' Class : yes
##
plot(varImp(ada_model2))
Similar results as the first adaboost model.
model_results <- tibble(
Model = c("Decision tree (bank_mod)", "Decision tree (bank_mod2)", "Random Forest (rf_model)","Random Forest (rf_model2)", "Adaboost (ada_model)", "Adaboos (ada_model2"),
Accuracy = c(0.8861, 0.7951, 0.8018, 0.8627, 0.2769, 0.2724),
Sensitivity = c(0.2673, 0.8141, 0.8186, 0.8798, 0.4127, 0.3878),
Specificity = c(0.96389,0.7771,0.7857, 0.8463 ,0.1472,0.1623),
Kappa = c(0.2871, 0.59, 0.6037, 0.7255, -0.4371, -0.4472)
)
print(model_results)
## # A tibble: 6 × 5
## Model Accuracy Sensitivity Specificity Kappa
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Decision tree (bank_mod) 0.886 0.267 0.964 0.287
## 2 Decision tree (bank_mod2) 0.795 0.814 0.777 0.59
## 3 Random Forest (rf_model) 0.802 0.819 0.786 0.604
## 4 Random Forest (rf_model2) 0.863 0.880 0.846 0.726
## 5 Adaboost (ada_model) 0.277 0.413 0.147 -0.437
## 6 Adaboos (ada_model2 0.272 0.388 0.162 -0.447