In this part we train the same algorithms as before, but now with cross-validation and hyperparameter tuning. Cross-validation gives a more reliable estimate of how well each model generalizes to unseen data, while hyperparameter tuning searches for the parameter values that yield the best performance on the resampled data. Together, these practices produce a more robust and accurate model.
library(tidyverse)
library(janitor)
library(readr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(gridExtra)
library(randomForest)
library(reshape2)
library(tidymodels)
# Encoders -----------
region_encoder <- readRDS("region_encoder.rds")
policy_encoder <- readRDS("policy_encoder.rds")
encoder_function <- function(df){
  df %>%
    # Map each region_code to its numeric encoding
    left_join(region_encoder, by = "region_code") %>%
    select(-region_code) %>%
    rename(region_code = region_num) %>%
    # Map each policy_sales_channel to its numeric encoding
    left_join(policy_encoder, by = "policy_sales_channel") %>%
    select(-policy_sales_channel) %>%
    rename(policy_sales_channel = policy_num)
}
# Top @K metrics --------------------------
metrics_at_k_function <- function(model_name, model_results, k){
  # Rank clients by predicted probability of interest
  df_results <- model_results %>%
    arrange(desc(.pred_yes)) %>%
    mutate(
      TP = ifelse(.pred_class == "yes" & response == "yes", 1, 0),
      FP = ifelse(.pred_class == "yes" & response == "no", 1, 0),
      FN = ifelse(.pred_class == "no" & response == "yes", 1, 0),
      TN = ifelse(.pred_class == "no" & response == "no", 1, 0)
    )
  # Create lists for precision and recall
  precision_at_k <- list()
  recall_at_k <- list()
  # Populate the metric lists: both are computed over the top-i ranked rows
  for (i in 1:k) {
    subset_k <- df_results %>%
      slice(1:i)
    precision_at_k[[i]] <- sum(subset_k$TP) / (sum(subset_k$TP) + sum(subset_k$FP))
    recall_at_k[[i]] <- sum(subset_k$TP) / (sum(subset_k$TP) + sum(subset_k$FN))
  }
  # Complete dataframe with one row per rank position up to k
  metrics_at_k_df <- df_results %>%
    slice(1:k) %>%
    mutate(
      precision_at_k = unlist(precision_at_k),
      recall_at_k = unlist(recall_at_k)
    )
  # One-row summary with the metrics at exactly rank k
  final_at_k_df <- tibble(model = model_name, k = k) %>%
    bind_cols(
      metrics_at_k_df %>%
        slice(k) %>%
        select(precision_at_k, recall_at_k)
    )
  return(list(metrics_at_k_df, final_at_k_df))
}
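To make the @K logic concrete, here is a minimal sketch on a hypothetical handful of scored clients (all values invented for illustration):
# Five hypothetical clients, already scored by some model
toy_results <- tibble(
  .pred_yes   = c(0.92, 0.85, 0.60, 0.40, 0.10),
  .pred_class = factor(c("yes", "yes", "yes", "no", "no"), levels = c("yes", "no")),
  response    = factor(c("yes", "no", "yes", "yes", "no"), levels = c("yes", "no"))
)
# Within the top 3 ranked rows: TP = 2, FP = 1, FN = 0,
# so precision@3 = 2/3 and recall@3 (over the top 3) = 1
metrics_at_k_function("Toy model", toy_results, k = 3)[[2]]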
# Gain & Lift Curves -------------
curves_function <- function(model_results){
  gain_plt <- gain_curve(model_results, response, .pred_yes) %>%
    autoplot()
  lift_plt <- lift_curve(model_results, response, .pred_yes) %>%
    autoplot()
  return(gridExtra::grid.arrange(gain_plt, lift_plt, ncol = 2))
}
df_selected <- readRDS("df_selected.rds")
df_preprocessed <- encoder_function(df_selected)
set.seed(123)
df_split <- df_preprocessed %>%
initial_split(strata = response)
df_train <- df_split %>%
training()
df_test <- df_split %>%
testing()
Remember to drop the id column before training!
# Feature engineering recipe
df_recipe <- recipe(response ~ .,
                    data = df_train %>% select(-id)) %>%
  step_normalize(age, days_associated) %>%
  step_scale(health_annual_paid) %>%
  step_dummy(all_nominal(), -all_outcomes())
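As a quick sanity check (a sketch, not part of the training pipeline), the recipe can be prepped and baked to inspect the transformed predictors:
# prep() estimates the centering/scaling statistics from the training data;
# bake(new_data = NULL) returns the processed training set
df_recipe %>%
  prep(training = df_train %>% select(-id)) %>%
  bake(new_data = NULL) %>%
  glimpse()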
df_kfolds <- vfold_cv(df_train %>% select(-id),
                      v = 5, strata = response)
Time to tune: 1.217322 mins
Time to train the final model: 4.43 secs
# Model Specification ----------
lr_model <- logistic_reg(penalty = tune(),
                         mixture = tune()) %>%
  set_engine("glmnet") %>%
  set_mode("classification")
# See parameters ----------
hardhat::extract_parameter_set_dials(lr_model)
# Using dials package
lr_grid <- grid_regular(extract_parameter_set_dials(lr_model), levels = 5)
library(glmnet)
# Parallelize tuning process
doParallel::registerDoParallel()
# Tune package
start_time <- Sys.time()
lr_tune <- tune_grid(lr_model, df_recipe,
resamples = df_kfolds,
grid = lr_grid)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(lr_tune, "lr_tune.rds")
# Select the best hyperparameters -----------
lr_tune <- readRDS("lr_tune.rds")
lr_param <- lr_tune %>%
  select_best(metric = "roc_auc")
# Apply the hyperparameters to the model ----------
tidy_lr_model <- finalize_model(lr_model, lr_param)
# Create workflow ----------
lr_wkfl <- workflow() %>%
add_model(tidy_lr_model) %>%
add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()
start_time <- Sys.time()
# Train the model
lr_res <- last_fit(lr_wkfl, df_split)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(lr_res, "lr_res.rds")
lr_res <- readRDS("lr_res.rds")
# Confusion matrix
lr_res %>%
unnest(.predictions) %>%
conf_mat(truth = response, estimate = .pred_class)
## Truth
## Prediction yes no
## yes 0 0
## no 11678 83600
As we can see, even after hyperparameter tuning the model classified every client's intention to sign up for the new insurance as “no”.
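This is a typical symptom of a heavily imbalanced outcome combined with the default 0.5 probability cutoff: the predicted probabilities still rank clients even though none of them crosses the threshold. As a quick sketch (the 0.10 cutoff below is an arbitrary illustration, not a tuned value), lowering the threshold changes the picture:
# Re-threshold the predicted probabilities at an arbitrary 0.10 cutoff
lr_res %>%
  unnest(.predictions) %>%
  mutate(.pred_class = factor(ifelse(.pred_yes >= 0.10, "yes", "no"),
                              levels = levels(response))) %>%
  conf_mat(truth = response, estimate = .pred_class)
Since the business problem is about whom to call first, the ranked-list metrics below matter more than the thresholded classes.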
lr_results <- lr_res %>%
unnest(.predictions) %>%
select(.pred_yes:response)
lr_metrics_at_k <- metrics_at_k_function("Logistic Regression", lr_results, 2000)
# Metrics @K ------------
lr_metrics_at_k_final <- lr_metrics_at_k[[2]]
lr_metrics_at_k_final
# Gain and lift curves ---------
curves_function(lr_results)
Gain: By contacting the top 25% of the ordered list, ~61% of all interested customers are reached.
Lift: By contacting the top 25% of the ordered list, the model performs ~2.4 times better than a random list.
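These figures can be read straight off the data behind the plots: yardstick's gain_curve() returns the underlying points, so a quick sketch recovers the gain at 25% of the list:
# .percent_tested = share of the ordered list contacted;
# .percent_found  = share of all interested clients captured so far
gain_curve(lr_results, response, .pred_yes) %>%
  filter(.percent_tested <= 25) %>%
  slice_tail(n = 1)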
Time to tune: 3.918506 mins
Time to train the final model: 14.88183 secs
# Model Specification ----------
tree_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("classification")
# See parameters
hardhat::extract_parameter_set_dials(tree_model)
# Using dials package
tree_grid <- grid_regular(extract_parameter_set_dials(tree_model), levels = 3)
library(rpart)
# Parallelize tuning process
doParallel::registerDoParallel()
# Tune package
start_time <- Sys.time()
tree_tune <- tune_grid(tree_model, df_recipe,
resamples = df_kfolds,
grid = tree_grid)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(tree_tune, "tree_tune.rds")
# Select the best hyperparameters -----------
tree_tune <- readRDS("tree_tune.rds")
tree_param <- tree_tune %>%
  select_best(metric = "roc_auc")
# Apply the hyperparameters to the model ----------
tidy_tree_model <- finalize_model(tree_model, tree_param)
# Create workflow ----------
tree_wkfl <- workflow() %>%
add_model(tidy_tree_model) %>%
add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()
start_time <- Sys.time()
# Train the model
tree_res <- last_fit(tree_wkfl, df_split)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(tree_res, "tree_res.rds")
tree_res <- readRDS("tree_res.rds")
# Confusion matrix
tree_res %>%
unnest(.predictions) %>%
conf_mat(truth = response, estimate = .pred_class)
## Truth
## Prediction yes no
## yes 459 992
## no 11219 82608
tree_results <- tree_res %>%
unnest(.predictions) %>%
select(.pred_yes:response)
tree_metrics_at_k <- metrics_at_k_function("Decision Tree", tree_results, 2000)
# Metrics @K ------------
dt_metrics_at_k_final <- tree_metrics_at_k[[2]]
dt_metrics_at_k_final
# Gain and lift curves ---------
curves_function(tree_results)
Gain: By contacting the top 25% of the ordered list, ~62% of all interested customers are reached.
Lift: By contacting the top 25% of the ordered list, the model performs ~2.65 times better than a random list.
Time to tune: 17.70377 mins
Time to train the final model: 1.552191 mins
# Model Specification ----------
rf_model <- rand_forest(
  mtry = tune(),
  trees = 100,
  min_n = tune()
) %>%
  set_engine("ranger") %>%
  set_mode("classification")
# See parameters
hardhat::extract_parameter_set_dials(rf_model)
# Using dials package
rf_grid <- grid_regular(
  mtry(range = c(10, 300)),
  min_n(range = c(100, 1000)),
  levels = 3
)
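The ranges above are set by hand because mtry() has a data-dependent upper bound that dials cannot know ahead of time. As an alternative sketch, the parameter set can be finalized against the predictors so that dials infers the bound itself:
# Alternative (sketch): finalize the data-dependent mtry range from the
# predictor columns, then build the grid from the finalized parameter set
rf_params <- extract_parameter_set_dials(rf_model) %>%
  finalize(df_train %>% select(-id, -response))
rf_grid_auto <- grid_regular(rf_params, levels = 3)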
# Parallelize tuning process
doParallel::registerDoParallel()
# Tune package
start_time <- Sys.time()
rf_tune <- tune_grid(rf_model, df_recipe,
resamples = df_kfolds,
grid = rf_grid)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(rf_tune, "rf_tune.rds")
# Select the best hyperparameters -----------
rf_tune <- readRDS("rf_tune.rds")
rf_param <- rf_tune %>%
  select_best(metric = "roc_auc")
# Apply the hyperparameters to the model ----------
tidy_rf_model <- finalize_model(rf_model, rf_param)
# Create workflow ----------
rf_wkfl <- workflow() %>%
add_model(tidy_rf_model) %>%
add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()
start_time <- Sys.time()
# Train the model
rf_res <- last_fit(rf_wkfl, df_split)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(rf_res, "rf_res.rds")
rf_res <- readRDS("rf_res.rds")
# Confusion matrix
rf_res %>%
unnest(.predictions) %>%
conf_mat(truth = response, estimate = .pred_class)
## Truth
## Prediction yes no
## yes 16 23
## no 11662 83577
rf_results <- rf_res %>%
unnest(.predictions) %>%
select(.pred_yes:response)
rf_metrics_at_k <- metrics_at_k_function("Random Forest", rf_results, 2000)
# Metrics @K ------------
rf_metrics_at_k_final <- rf_metrics_at_k[[2]]
rf_metrics_at_k_final
# Gain and lift curves ---------
curves_function(rf_results)
Gain: By contacting the top 25% of the ordered list, ~68% of all interested customers are reached.
Lift: By contacting the top 25% of the ordered list, the model performs ~2.7 times better than a random list.
Time to tune: 28.93869 mins
Time to train the final model: 5.126836 mins
# Model Specification ----------
xgb_model <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")
# See parameters
hardhat::extract_parameter_set_dials(xgb_model)
# Using dials package
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), df_train),
  learn_rate(),
  size = 10
)
Notice that we had to treat mtry() differently because it depends on the actual number of predictors in the data.
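One subtlety worth flagging: finalize() sets mtry()'s upper bound to the number of columns in the data it receives, and df_train above still contains the id and outcome columns. A stricter sketch passes only the predictors:
# finalize() resolves mtry()'s unknown upper bound from the data's column count;
# excluding id and the outcome keeps the bound at the true number of predictors
mtry() %>%
  finalize(df_train %>% select(-id, -response))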
# Parallelize tuning process
doParallel::registerDoParallel()
# Tune package
start_time <- Sys.time()
xgb_tune <- tune_grid(xgb_model, df_recipe,
resamples = df_kfolds,
grid = xgb_grid)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(xgb_tune, "xgb_tune.rds")
# Select the best hyperparameters -----------
xgb_tune <- readRDS("xgb_tune.rds")
xgb_param <- xgb_tune %>%
  select_best(metric = "roc_auc")
# Apply the hyperparameters to the model ----------
tidy_xgb_model <- finalize_model(xgb_model, xgb_param)
# Create workflow ----------
xgb_wkfl <- workflow() %>%
add_model(tidy_xgb_model) %>%
add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()
start_time <- Sys.time()
# Train the model
xgb_res <- last_fit(xgb_wkfl, df_split)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(xgb_res, "xgb_res.rds")
xgb_res <- readRDS("xgb_res.rds")
# Confusion matrix
xgb_res %>%
unnest(.predictions) %>%
conf_mat(truth = response, estimate = .pred_class)
## Truth
## Prediction yes no
## yes 1 0
## no 11677 83600
xgb_results <- xgb_res %>%
unnest(.predictions) %>%
select(.pred_yes:response)
xgb_metrics_at_k <- metrics_at_k_function("XGBoost", xgb_results, 2000)
# Metrics @K ------------
xgb_metrics_at_k_final <- xgb_metrics_at_k[[2]]
xgb_metrics_at_k_final
# Gain and lift curves ---------
curves_function(xgb_results)
Gain: By contacting the top 25% of the ordered list, ~70% of all interested customers are reached.
Lift: By contacting the top 25% of the ordered list, the model performs ~2.8 times better than a random list.
Time to tune: 1.525734 hours
Time to train the final model: 26.91236 mins
# Model Specification ----------
knn_model <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")
# See parameters
hardhat::extract_parameter_set_dials(knn_model)
# Using dials package
knn_grid <- grid_regular(
  extract_parameter_set_dials(knn_model),
  levels = 3
)
# Parallelize tuning process
doParallel::registerDoParallel()
# Tune package
start_time <- Sys.time()
knn_tune <- tune_grid(knn_model, df_recipe,
resamples = df_kfolds,
grid = knn_grid)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(knn_tune, "knn_tune.rds")
# Select the best hyperparameters -----------
knn_tune <- readRDS("knn_tune.rds")
knn_param <- knn_tune %>%
  select_best(metric = "roc_auc")
# Apply the hyperparameters to the model ----------
tidy_knn_model <- finalize_model(knn_model, knn_param)
# Create workflow ----------
knn_wkfl <- workflow() %>%
add_model(tidy_knn_model) %>%
add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()
start_time <- Sys.time()
# Train the model
knn_res <- last_fit(knn_wkfl, df_split)
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(knn_res, "knn_res.rds")
knn_res <- readRDS("knn_res.rds")
# Confusion matrix
knn_res %>%
unnest(.predictions) %>%
conf_mat(truth = response, estimate = .pred_class)
## Truth
## Prediction yes no
## yes 1487 2869
## no 10191 80731
knn_results <- knn_res %>%
unnest(.predictions) %>%
select(.pred_yes:response)
knn_metrics_at_k <- metrics_at_k_function("KNN", knn_results, 2000)
# Metrics @K ------------
knn_metrics_at_k_final <- knn_metrics_at_k[[2]]
knn_metrics_at_k_final
# Gain and lift curves ---------
curves_function(knn_results)
Gain: By contacting the top 25% of the ordered list, ~62% of all interested customers are reached.
Lift: By contacting the top 25% of the ordered list, the model performs ~2.5 times better than a random list.
model_comparison_df <- bind_rows(
  lr_metrics_at_k_final,
  dt_metrics_at_k_final,
  rf_metrics_at_k_final,
  xgb_metrics_at_k_final,
  knn_metrics_at_k_final
) %>%
  arrange(desc(recall_at_k))
model_comparison_df
Conclusion: After cross-validation and hyperparameter tuning, KNN proved to be the best model in terms of the recall@K metric. Therefore, this model will be used to make predictions and to deploy our application.
Time to train the final model on the full dataset: 28 minutes
# Saving final model
start_time <- Sys.time()
doParallel::registerDoParallel()
final_model <- fit(knn_wkfl, df_preprocessed %>% select(-id)) # drop id, as in training
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(final_model, "final_model.rds")
# Reading final model
final_model <- readRDS("final_model.rds")
final_model$pre$mold$predictors %>%
  colnames() %>%
  tibble(predictor = .)
# Making prediction
predict(final_model,
        tibble(
          "days_associated" = 299,
          "age" = 18,
          "health_annual_paid" = 60000,
          "region_code" = 28,
          "policy_sales_channel" = 100,
          "vehicle_damage" = "yes",
          "previously_insured" = "no"
        ),
        type = "prob") %>%
  pivot_longer(everything()) %>%
  arrange(desc(value))