1 Introduction

In this part we train the same algorithms as before, but now with cross-validation and hyperparameter tuning. Cross-validation is crucial for assessing a model's ability to generalize to unseen data, while hyperparameter tuning optimizes performance by searching for the parameter values that give the best results. Together, these practices yield a more robust and accurate model.

2 Imports

library(tidyverse)
library(janitor)
library(readr)
library(gtsummary)
library(summarytools)
library(kableExtra)
library(knitr)
library(gridExtra)
library(randomForest)
library(reshape2)
library(tidymodels)

3 Helper Functions

# Encoders -----------
region_encoder <- readRDS("region_encoder.rds")
policy_encoder <- readRDS("policy_encoder.rds")

encoder_function <- function(df){
  df %>% 
    # Swap region_code for its numeric encoding
    left_join(region_encoder) %>% 
    select(-region_code) %>% 
    rename(region_code = region_num) %>% 
    # Swap policy_sales_channel for its numeric encoding
    left_join(policy_encoder) %>% 
    select(-policy_sales_channel) %>% 
    rename(policy_sales_channel = policy_num) 
}
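For reference, the encoders are assumed to be two-column lookup tibbles (region_code → region_num, policy_sales_channel → policy_num). A hypothetical sketch of how such a lookup could be built, given a data frame df with a region_code column:

# Hypothetical sketch of a label-encoding lookup table; the real
# encoders were built earlier in the project and loaded from .rds
region_encoder_sketch <- df %>% 
  distinct(region_code) %>% 
  mutate(region_num = as.integer(as.factor(region_code)))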


# Top @K metrics  --------------------------
metrics_at_k_function <- function(model_name, model_results, k){
  
  # Rank predictions by the probability of "yes" and flag each outcome
  df_results <- model_results %>% 
    arrange(desc(.pred_yes)) %>% 
    mutate(
      TP = ifelse(.pred_class == "yes" & response == "yes", 1, 0),
      FP = ifelse(.pred_class == "yes" & response == "no", 1, 0),
      FN = ifelse(.pred_class == "no" & response == "yes", 1, 0),
      TN = ifelse(.pred_class == "no" & response == "no", 1, 0)
    ) 
  
  # Create list for precision and recall
  precision_at_k <- list()
  recall_at_k <- list()

  # Populate the metric lists with the running precision and recall
  for (i in 1:k) {
    subset_k <- df_results %>% 
      slice(1:i)
    
    precision_at_k[[i]] <- sum(subset_k$TP) / (sum(subset_k$TP) + sum(subset_k$FP))
    
    recall_at_k[[i]] <- sum(subset_k$TP) / (sum(subset_k$TP) + sum(subset_k$FN))
  }

  # Top-k dataframe with the running metrics attached
  metrics_at_k_df <- df_results %>% 
    slice(1:k) %>% 
    mutate(
      precision_at_k = unlist(precision_at_k),
      recall_at_k = unlist(recall_at_k)
    )
    
  final_at_k_df <- tibble(model = model_name, k = k) %>% 
    bind_cols(
      metrics_at_k_df %>% 
        slice(k) %>% 
        select(precision_at_k, recall_at_k)
    )
  
  return(list(metrics_at_k_df, final_at_k_df))
}
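Since both metrics depend only on cumulative counts down the ranked list, the loop above can also be replaced by a vectorized cumsum() version; an equivalent sketch:

# Vectorized alternative: cumulative counts down the ranked list
metrics_at_k_vectorized <- function(model_name, model_results, k){
  metrics_at_k_df <- model_results %>% 
    arrange(desc(.pred_yes)) %>% 
    slice(1:k) %>% 
    mutate(
      TP = ifelse(.pred_class == "yes" & response == "yes", 1, 0),
      FP = ifelse(.pred_class == "yes" & response == "no", 1, 0),
      FN = ifelse(.pred_class == "no" & response == "yes", 1, 0),
      precision_at_k = cumsum(TP) / (cumsum(TP) + cumsum(FP)),
      recall_at_k = cumsum(TP) / (cumsum(TP) + cumsum(FN))
    )
  
  final_at_k_df <- tibble(model = model_name, k = k) %>% 
    bind_cols(metrics_at_k_df %>% slice(k) %>% select(precision_at_k, recall_at_k))
  
  return(list(metrics_at_k_df, final_at_k_df))
}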

# Gain & Lift Curves -------------
curves_function <- function(model_results){
  gain_plt <- gain_curve(model_results, response, .pred_yes) %>% 
    autoplot()
  
  lift_plt <- lift_curve(model_results, response, .pred_yes) %>% 
    autoplot()
  
  return(gridExtra::grid.arrange(gain_plt, lift_plt, ncol = 2))
}

4 Data Collection

df_selected <- readRDS("df_selected.rds")
df_preprocessed <- encoder_function(df_selected)

5 Splitting into train and test sets

set.seed(123)

df_split <- df_preprocessed %>% 
  initial_split(strata = response)

df_train <- df_split %>% 
  training()

df_test <- df_split %>% 
  testing()
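Because the split is stratified on response, both sets should keep roughly the same share of interested customers; a quick sanity check:

# Check that stratification preserved the class balance
df_train %>% count(response) %>% mutate(prop = n / sum(n))
df_test %>% count(response) %>% mutate(prop = n / sum(n))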

6 Preprocessing

REMEMBER TO DROP THE ID BEFORE TRAINING!

# Feature engineering recipe
df_recipe <- recipe(response ~ .,
       data = df_train %>% select(-id)) %>% 
  step_normalize(age, days_associated) %>% 
  step_scale(health_annual_paid) %>% 
  step_dummy(all_nominal(), -all_outcomes())
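To verify what the recipe produces, it can be prepped and baked on the training data; a minimal sketch:

# Prep the recipe and inspect the transformed training data
df_recipe %>% 
  prep() %>% 
  bake(new_data = NULL) %>% 
  glimpse()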

7 K-fold cross-validation

df_kfolds <- vfold_cv(df_train %>% select(-id),
                      v = 5, strata = response)

8 Model definition and hyperparameter tuning

8.1 Logistic Regression 💻

Time to tune: 1.217322 mins

Time to train the final model: 4.43 secs

# Model Specification ----------
lr_model <- logistic_reg(penalty = tune(), 
                         mixture = tune()) %>% 
  set_engine("glmnet") %>% 
  set_mode("classification")

# See parameters ----------
hardhat::extract_parameter_set_dials(lr_model)
# Create a regular tuning grid with the dials package
lr_grid <- grid_regular(extract_parameter_set_dials(lr_model), levels = 5)

# glmnet must be available for the engine
library(glmnet)

# Parallelize tuning process
doParallel::registerDoParallel()

# Tune package
start_time <- Sys.time()

lr_tune <- tune_grid(lr_model, df_recipe, 
                     resamples = df_kfolds,
                     grid = lr_grid)

end_time <- Sys.time()
print(end_time - start_time)

saveRDS(lr_tune, "lr_tune.rds")
# Select the best hyperparameters -----------
lr_tune <- readRDS("lr_tune.rds")

lr_param <- lr_tune %>% 
  select_best(metric = "roc_auc")

# Apply the hyperparameters to the model ----------
tidy_lr_model <- finalize_model(lr_model, lr_param)

# Create workflow ----------
lr_wkfl <- workflow() %>% 
  add_model(tidy_lr_model) %>% 
  add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()

start_time <- Sys.time()

# Train the model
lr_res <- last_fit(lr_wkfl, df_split)

end_time <- Sys.time()
print(end_time - start_time)

saveRDS(lr_res, "lr_res.rds")
lr_res <- readRDS("lr_res.rds")

# Confusion matrix
lr_res %>% 
  unnest(.predictions) %>% 
  conf_mat(truth = response, estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     0     0
##        no  11678 83600

As we can see, the model classified every customer as not interested ("no"), even after hyperparameter tuning.
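Even so, the probability column still ranks customers usefully. As a quick check, the test-set metrics computed by last_fit() (which include roc_auc by default) can be collected:

# Collect the default test-set metrics from last_fit()
# (accuracy and roc_auc) to confirm the ranking carries signal
lr_res %>% collect_metrics()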

lr_results <- lr_res %>% 
  unnest(.predictions) %>% 
  select(.pred_yes:response)

lr_metrics_at_k <- metrics_at_k_function("Logistic Regression", lr_results, 2000)

# Metrics @K ------------
lr_metrics_at_k_final <- lr_metrics_at_k[[2]]
lr_metrics_at_k_final
# Gain and lift curves ---------
curves_function(lr_results)

Gain: By approaching the top 25% of the ordered list, ~61% of all interested customers are reached.

Lift: By approaching the top 25% of the ordered list, the model performs ~2.4 times better than a random list.
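The ~61% gain figure can also be read directly off the data behind the curve; a minimal sketch, assuming yardstick's gain_curve() output columns .percent_tested and .percent_found:

# Read the gain at ~25% of the list from the curve data
gain_curve(lr_results, response, .pred_yes) %>% 
  filter(.percent_tested <= 25) %>% 
  slice_tail(n = 1) %>% 
  select(.percent_tested, .percent_found)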

8.2 Decision Tree 💻

Time to tune: 3.918506 mins

Time to train the final model: 14.88183 secs

# Model Specification ----------
tree_model <- decision_tree(
  cost_complexity = tune(),
  tree_depth = tune(),
  min_n = tune()
) %>%
  set_engine("rpart") %>%
  set_mode("classification")

# See parameters
hardhat::extract_parameter_set_dials(tree_model)
# Create a regular tuning grid with the dials package
tree_grid <- grid_regular(extract_parameter_set_dials(tree_model), levels = 3)

# rpart must be available for the engine
library(rpart)

# Parallelize tuning process
doParallel::registerDoParallel()

# Tune package
start_time <- Sys.time()

tree_tune <- tune_grid(tree_model, df_recipe, 
                     resamples = df_kfolds,
                     grid = tree_grid)

end_time <- Sys.time()

print(end_time - start_time)

saveRDS(tree_tune, "tree_tune.rds")
# Select the best hyperparameters -----------
tree_tune <- readRDS("tree_tune.rds")

tree_param <- tree_tune %>% 
  select_best(metric = "roc_auc")

# Apply the hyperparameters to the model ----------
tidy_tree_model <- finalize_model(tree_model, tree_param)

# Create workflow ----------
tree_wkfl <- workflow() %>% 
  add_model(tidy_tree_model) %>% 
  add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()

start_time <- Sys.time()

# Train the model
tree_res <- last_fit(tree_wkfl, df_split)

end_time <- Sys.time()
print(end_time - start_time)

saveRDS(tree_res, "tree_res.rds")
tree_res <- readRDS("tree_res.rds")

# Confusion matrix
tree_res %>% 
  unnest(.predictions) %>% 
  conf_mat(truth = response, estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes   459   992
##        no  11219 82608
tree_results <- tree_res %>% 
  unnest(.predictions) %>% 
  select(.pred_yes:response)

tree_metrics_at_k <- metrics_at_k_function("Decision Tree", tree_results, 2000)

# Metrics @K ------------
dt_metrics_at_k_final <- tree_metrics_at_k[[2]]
dt_metrics_at_k_final
# Gain and lift curves ---------
curves_function(tree_results)

Gain: By approaching the top 25% of the ordered list, ~62% of all interested customers are reached.

Lift: By approaching the top 25% of the ordered list, the model performs ~2.65 times better than a random list.

8.3 Random Forest 💻

Time to tune: 17.70377 mins

Time to train the final model: 1.552191 mins

# Model Specification ----------
rf_model <- rand_forest(
  mtry = tune(),
  trees = 100,
  min_n = tune()
) %>%
  set_engine("ranger") %>%
  set_mode("classification")

# See parameters
hardhat::extract_parameter_set_dials(rf_model)
# Manually bounded grid (mtry's default upper limit is unknown
# until finalized against the data)
rf_grid <- grid_regular(
  mtry(range = c(10, 300)),
  min_n(range = c(100, 1000)),
  levels = 3
  )
# Parallelize tuning process
doParallel::registerDoParallel()

# Tune package
start_time <- Sys.time()

rf_tune <- tune_grid(rf_model, df_recipe, 
                     resamples = df_kfolds,
                     grid = rf_grid)

end_time <- Sys.time()

print(end_time - start_time)

saveRDS(rf_tune, "rf_tune.rds")
# Select the best hyperparameters -----------
rf_tune <- readRDS("rf_tune.rds")

rf_param <- rf_tune %>% 
  select_best(metric = "roc_auc")

# Apply the hyperparameters to the model ----------
tidy_rf_model <- finalize_model(rf_model, rf_param)

# Create workflow ----------
rf_wkfl <- workflow() %>% 
  add_model(tidy_rf_model) %>% 
  add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()

start_time <- Sys.time()

# Train the model
rf_res <- last_fit(rf_wkfl, df_split)

end_time <- Sys.time()
print(end_time - start_time)

saveRDS(rf_res, "rf_res.rds")
rf_res <- readRDS("rf_res.rds")

# Confusion matrix
rf_res %>% 
  unnest(.predictions) %>% 
  conf_mat(truth = response, estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes    16    23
##        no  11662 83577
rf_results <- rf_res %>% 
  unnest(.predictions) %>% 
  select(.pred_yes:response)

rf_metrics_at_k <- metrics_at_k_function("Random Forest", rf_results, 2000)

# Metrics @K ------------
rf_metrics_at_k_final <- rf_metrics_at_k[[2]]
rf_metrics_at_k_final
# Gain and lift curves ---------
curves_function(rf_results)

Gain: By approaching the top 25% of the ordered list, ~68% of all interested customers are reached.

Lift: By approaching the top 25% of the ordered list, the model performs ~2.7 times better than a random list.

8.4 XGBoost 💻

Time to tune: 28.93869 mins

Time to train the final model: 5.126836 mins

# Model Specification ----------
xgb_model <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# See parameters
hardhat::extract_parameter_set_dials(xgb_model)
# Using dials package
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), df_train),
  learn_rate(),
  size = 10
)

Notice that we had to treat mtry() differently because it depends on the actual number of predictors in the data.
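A minimal illustration of what finalize() does here (a sketch; the exact upper bound depends on the columns of df_train):

# mtry() starts with an unknown upper bound...
mtry()
# ...and finalize() fills it in from the data's dimensions
finalize(mtry(), df_train)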

# Parallelize tuning process
doParallel::registerDoParallel()

# Tune package
start_time <- Sys.time()

xgb_tune <- tune_grid(xgb_model, df_recipe, 
                     resamples = df_kfolds,
                     grid = xgb_grid)

end_time <- Sys.time()

print(end_time - start_time)

saveRDS(xgb_tune, "xgb_tune.rds")
# Select the best hyperparameters -----------
xgb_tune <- readRDS("xgb_tune.rds")

xgb_param <- xgb_tune %>% 
  select_best(metric = "roc_auc")

# Apply the hyperparameters to the model ----------
tidy_xgb_model <- finalize_model(xgb_model, xgb_param)

# Create workflow ----------
xgb_wkfl <- workflow() %>% 
  add_model(tidy_xgb_model) %>% 
  add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()

start_time <- Sys.time()

# Train the model
xgb_res <- last_fit(xgb_wkfl, df_split)

end_time <- Sys.time()
print(end_time - start_time)

saveRDS(xgb_res, "xgb_res.rds")
xgb_res <- readRDS("xgb_res.rds")

# Confusion matrix
xgb_res %>% 
  unnest(.predictions) %>% 
  conf_mat(truth = response, estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes     1     0
##        no  11677 83600
xgb_results <- xgb_res %>% 
  unnest(.predictions) %>% 
  select(.pred_yes:response)

xgb_metrics_at_k <- metrics_at_k_function("XGBoost", xgb_results, 2000)

# Metrics @K ------------
xgb_metrics_at_k_final <- xgb_metrics_at_k[[2]]
xgb_metrics_at_k_final
# Gain and lift curves ---------
curves_function(xgb_results)

Gain: By approaching the top 25% of the ordered list, ~70% of all interested customers are reached.

Lift: By approaching the top 25% of the ordered list, the model performs ~2.8 times better than a random list.

8.5 KNN 💻

Time to tune: 1.525734 hours

Time to train the final model: 26.91236 mins

# Model Specification ----------
knn_model <- nearest_neighbor(neighbors = tune()) %>%
  set_engine("kknn") %>%
  set_mode("classification")

# See parameters
hardhat::extract_parameter_set_dials(knn_model)
# Using dials package
knn_grid <- grid_regular(
  extract_parameter_set_dials(knn_model),
  levels = 3
)
# Parallelize tuning process
doParallel::registerDoParallel()

# Tune package
start_time <- Sys.time()

knn_tune <- tune_grid(knn_model, df_recipe, 
                     resamples = df_kfolds,
                     grid = knn_grid)

end_time <- Sys.time()

print(end_time - start_time)

saveRDS(knn_tune, "knn_tune.rds")
# Select the best hyperparameters -----------
knn_tune <- readRDS("knn_tune.rds")

knn_param <- knn_tune %>% 
  select_best(metric = "roc_auc")

# Apply the hyperparameters to the model ----------
tidy_knn_model <- finalize_model(knn_model, knn_param)

# Create workflow ----------
knn_wkfl <- workflow() %>% 
  add_model(tidy_knn_model) %>% 
  add_recipe(df_recipe)
# Train the final model ----------------
doParallel::registerDoParallel()

start_time <- Sys.time()

# Train the model
knn_res <- last_fit(knn_wkfl, df_split)

end_time <- Sys.time()
print(end_time - start_time)

saveRDS(knn_res, "knn_res.rds")
knn_res <- readRDS("knn_res.rds")

# Confusion matrix
knn_res %>% 
  unnest(.predictions) %>% 
  conf_mat(truth = response, estimate = .pred_class)
##           Truth
## Prediction   yes    no
##        yes  1487  2869
##        no  10191 80731
knn_results <- knn_res %>% 
  unnest(.predictions) %>% 
  select(.pred_yes:response)

knn_metrics_at_k <- metrics_at_k_function("KNN", knn_results, 2000)

# Metrics @K ------------
knn_metrics_at_k_final <- knn_metrics_at_k[[2]]
knn_metrics_at_k_final
# Gain and lift curves ---------
curves_function(knn_results)

Gain: By approaching the top 25% of the ordered list, ~62% of all interested customers are reached.

Lift: By approaching the top 25% of the ordered list, the model performs ~2.5 times better than a random list.

9 Model Comparison

model_comparison_df <- bind_rows(
  lr_metrics_at_k_final,
  dt_metrics_at_k_final,
  rf_metrics_at_k_final,
  xgb_metrics_at_k_final,
  knn_metrics_at_k_final
) %>% 
  arrange(desc(recall_at_k))

model_comparison_df
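For the rendered report, the comparison can also be displayed as a formatted table; a sketch using the already-loaded knitr/kableExtra:

# Display the comparison as a formatted table
model_comparison_df %>% 
  kable(digits = 3) %>% 
  kable_styling(full_width = FALSE)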

Conclusion: after cross-validation and hyperparameter tuning, KNN has proven to be the best model in terms of the recall@K metric. This model will therefore be used to make predictions and to deploy our application.

10 Saving the best model and testing a prediction

Time: 28 minutes

# Saving final model (fit on the full dataset, dropping id as noted above)
start_time <- Sys.time()
doParallel::registerDoParallel()
final_model <- fit(knn_wkfl, df_preprocessed %>% select(-id))
end_time <- Sys.time()
print(end_time - start_time)
saveRDS(final_model, "final_model.rds")

10.1 Making predictions

# Reading final model 
final_model <- readRDS("final_model.rds")

# List the predictors the final model was trained on
final_model$pre$mold$predictors %>% 
  colnames() %>% 
  tibble(predictor = .)
# Define a hypothetical new customer
new_customer <- tibble(
  days_associated = 299,
  age = 18,
  health_annual_paid = 60000,
  region_code = 28,
  policy_sales_channel = 100,
  vehicle_damage = "yes",
  previously_insured = "no"
)

# Predict class probabilities and rank them
predict(final_model, new_customer, type = "prob") %>%
  pivot_longer(everything(), names_to = "class", values_to = "prob") %>%
  arrange(desc(prob))
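The hard class label for the same customer comes from type = "class":

# Hard class prediction for the same hypothetical customer
predict(final_model, new_customer, type = "class")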