This data comes from an old Kaggle dataset - https://www.kaggle.com/datasets/blastchar/telco-customer-churn
The data set includes information about:
- Customers who left within the last month (the Churn column)
- Services each customer signed up for (phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies)
- Customer account information (tenure, contract, payment method, paperless billing, monthly charges, and total charges)
- Demographics (gender, senior citizen status, partner, and dependents)
I’ve recently started using tidymodels instead of caret, so I wanted to run through an example.
library(tidymodels)
#broom, dials, dplyr, ggplot2, infer, parsnip, purrr, recipes, rsample, tibble, tune, workflows, yardstick
tidymodels_prefer()
library(knitr)
Initial Data Review
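The chunk that loads the data isn’t shown, so here’s a minimal sketch of what the summary below assumes - the CSV name is the one Kaggle publishes, and blanks are read as NA (which is why TotalCharges shows 11 NAs):
library(readr)
# assumed load step; treat blank TotalCharges entries as NA
data <- read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv", na = c("", "NA", " "))
summary(data)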
## customerID gender SeniorCitizen Partner
## Length:7043 Length:7043 Min. :0.0000 Length:7043
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :0.0000 Mode :character
## Mean :0.1621
## 3rd Qu.:0.0000
## Max. :1.0000
##
## Dependents tenure PhoneService MultipleLines
## Length:7043 Min. : 0.00 Length:7043 Length:7043
## Class :character 1st Qu.: 9.00 Class :character Class :character
## Mode :character Median :29.00 Mode :character Mode :character
## Mean :32.37
## 3rd Qu.:55.00
## Max. :72.00
##
## InternetService OnlineSecurity OnlineBackup DeviceProtection
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## TechSupport StreamingTV StreamingMovies Contract
## Length:7043 Length:7043 Length:7043 Length:7043
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## PaperlessBilling PaymentMethod MonthlyCharges TotalCharges
## Length:7043 Length:7043 Min. : 18.25 Min. : 18.8
## Class :character Class :character 1st Qu.: 35.50 1st Qu.: 401.4
## Mode :character Mode :character Median : 70.35 Median :1397.5
## Mean : 64.76 Mean :2283.3
## 3rd Qu.: 89.85 3rd Qu.:3794.7
## Max. :118.75 Max. :8684.8
## NA's :11
## Churn
## Length:7043
## Class :character
## Mode :character
##
##
##
##
Update the data
df_churn <- data |>
# update SeniorCitizen to a character value
mutate(SeniorCitizen = ifelse(SeniorCitizen == 0, "No","Yes")) |>
# replace NAs in TotalCharges with 0. When filtering the NAs, all these customers had a tenure of 0, meaning they haven't paid anything...yet
mutate(TotalCharges = ifelse(is.na(TotalCharges),0,TotalCharges)) |>
# I also want to create variables for Phone Only, Internet Only, or Both
  mutate(Package = ifelse(PhoneService %in% 'Yes' & InternetService %in% 'No', 'PhoneOnly',
                   ifelse(InternetService %in% c('DSL','Fiber optic') & PhoneService %in% 'No', 'InternetOnly', 'Both'))) |> # note: the data spells the value 'Fiber optic'
# Put Churn at the front
  select(Churn, everything())
This code uses a recipe to create an indicator column for each of the factor variables. Rather than creating standard dummy variables, setting one_hot = TRUE keeps an indicator column for every level. This is better here because I want to see how each level compares to the Churn variable.
If I had used one_hot = FALSE, then the columns for contract type would NOT include all of month-to-month, one year, and two year - they would show only 2 of the 3 options.
corr_obj <- df_churn |> select(-customerID) |>
recipe(~ .) |>
step_dummy(all_nominal(), one_hot = TRUE )
corr_prep <- prep(corr_obj)
corr_df <- bake(corr_prep, new_data = df_churn)
Review the correlation between Churn and all the variables.
# Creates a matrix by expanding factor variables to a set of 'dummy variables'
churn_corr <- as.data.frame(model.matrix(~., data = corr_df)) |> select(-1)
m <- cor(churn_corr)
m_churn <- m['Churn_Yes', ]
m_churn_df <- data.frame(variable = names(m_churn), correlation = m_churn)
m_churn_df |>
  filter(!variable %in% c('Churn_Yes','Churn_No')) |>
ggplot(aes(x = reorder(variable,-correlation), y = correlation)) +
geom_bar(stat = "identity", position = "identity", fill = 'steelblue') +
geom_text(aes(label = sprintf("%.2f", correlation)),size = 2, vjust = -0.5,
position = position_dodge(width = 0.8)) +
labs(title = 'Correlation with Churn', y = 'Correlation',x = 'Variable') +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, size = 8)) +
  scale_y_continuous(limits = c(-0.4, 0.4), breaks = seq(-0.4, 0.4, by = 0.1))
From this bar plot, we can see that month-to-month contracts, no online security, no tech support, and fiber optic internet have the strongest positive correlations with churning, while tenure and two-year contracts have the strongest negative correlations - meaning they are associated with not churning.
# month to month is the most common contract
df_churn %>%
ggplot(aes(x = Contract, fill = Contract)) +
geom_bar() +
theme(legend.position="none") +
labs(x ="") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# having no online security is the most common option
df_churn %>%
ggplot(aes(x = OnlineSecurity, fill = OnlineSecurity)) +
geom_bar() +
theme(legend.position="none") +
labs(x ="") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# month to month and no online security is the most common combination
df_churn %>%
ggplot(aes(x = Contract, fill = Contract)) +
geom_bar() +
facet_wrap(vars(OnlineSecurity)) +
theme(legend.position="none") +
labs(x ="") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# much of the churn appears to come from month-to-month contracts with no online security
df_churn %>%
ggplot(aes(x = Contract, fill = Contract)) +
geom_bar() +
facet_wrap(vars(Churn, OnlineSecurity)) +
theme(legend.position="none") +
labs(x ="") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# there are a lot of shorter tenures
df_churn %>%
ggplot(aes(x = tenure)) +
geom_bar(color = "blue") +
theme(legend.position="none") +
labs(x ="") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# month-to-month contracts have shorter tenures, two-year contracts have longer tenures
df_churn %>%
ggplot(aes(x = tenure)) +
geom_bar(color = "blue") +
facet_wrap(vars(Contract)) +
theme(legend.position="none") +
labs(x ="") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# much of the churn appears to come from month-to-month clients with a shorter tenure
df_churn %>%
ggplot(aes(x = tenure)) +
geom_bar(color = "blue") +
facet_wrap(vars(Churn, Contract)) +
theme(legend.position="none") +
labs(x ="") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
There are a few different models I want to look at for predicting Churn:
- Penalized logistic regression (glmnet)
- Random forest (ranger)
- Gradient boosted trees (xgboost)
- Polynomial support vector machine (kernlab)
- K-nearest neighbors (kknn)
I’ll split the data on the Churn variable and use 10-fold cross-validation for the training and tuning processes.
# create a model data frame and change values to factors
model_df <- df_churn |> mutate_if(is.character, as.factor) |> mutate(customerID = as.character(customerID))
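The chunk that creates the split and resamples isn’t shown, so here’s a minimal sketch of what the later code assumes (churn_split, churn_train, and churn_folds are all used below); the seed and the default 3:1 split proportion are my assumptions:
set.seed(123)  # assumed seed
churn_split <- initial_split(model_df, strata = Churn)
churn_train <- training(churn_split)
churn_test  <- testing(churn_split)
churn_folds <- vfold_cv(churn_train, v = 10, strata = Churn)
The models that use distance functions will need to be normalized, so I’ll create two different recipes.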
model_recipe_no_norm <-
recipe(Churn ~ ., churn_train) |>
# keeps the custID in case we have to explore a specific customer
  update_role(customerID, new_role = "ID") |>
#Creates dummy variables
step_dummy(all_nominal_predictors()) |>
#removes variables with zero variance, if any
step_zv(all_predictors())
model_recipe_norm <- model_recipe_no_norm |>
# Normalizes the data
  step_normalize(all_numeric_predictors())
I’ve created model specifications below. These will be used in a workflow for each model. For all except glmnet, I need to set the mode explicitly to classification.
glm_spec <-
logistic_reg(penalty = tune(),
mixture = tune() # range from 0 to 1, with 1 being a pure Lasso model
) |>
set_engine("glmnet")
rf_spec <-
rand_forest(mtry = tune(),
min_n = tune(),
trees = 1000 # trees of 1000 should be sufficient
) |>
set_engine("ranger") |>
set_mode("classification")
xgb_spec <-
boost_tree(trees = 1000, # trees of 1000 should be sufficient
tree_depth = tune(),
min_n = tune(),
learn_rate = tune(),
loss_reduction = tune(),
mtry = tune(),
sample_size = tune()
) |>
set_engine("xgboost") |>
set_mode("classification")
svm_spec <-
svm_poly(cost = tune(), degree = tune()) |>
set_engine("kernlab") |>
set_mode("classification")
knn_spec <-
nearest_neighbor(neighbors = tune(),
dist_power = tune(),
weight_func = tune()) |>
set_engine("kknn") |>
set_mode("classification")I want to create a function that displays the metrics
model_metrics <- function(final_model) {
#ROC Curve
model_roc <- final_model |>
collect_predictions() |>
roc_curve(Churn, .pred_No) |>
autoplot()
print(model_roc)
#Metrics
metrics <- final_model |>
collect_metrics()
print(kable(metrics))
#Predictions
predictions <- final_model |>
collect_predictions()
# Create a confusion matrix
final_model |>
collect_predictions() |>
conf_mat(truth = Churn, estimate = .pred_class) |>
autoplot(type = "heatmap") |>
print()
props <- predictions |>
group_by(Churn, .pred_class) |>
summarize(total = n(), .groups = "drop_last") |>
mutate(prop = total / sum(total))
  # plot the proportions as a heatmap (conf_mat() needs raw predictions,
  # so build the proportion plot directly with ggplot2)
  props |>
    ggplot(aes(x = .pred_class, y = Churn, fill = prop)) +
    geom_tile() +
    geom_text(aes(label = scales::percent(prop, accuracy = 1))) +
    scale_fill_gradient(low = "white", high = "steelblue") +
    labs(x = "Prediction", y = "Truth", fill = "Proportion")
}
Create a workflow
Create a grid
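The workflow and grid chunks aren’t shown, so here’s a minimal sketch of what the tuning below assumes; the normalized recipe and levels = 5 (which matches the 25 model configs in the results) are my guesses:
glmnet_wf <- workflow() |>
  add_recipe(model_recipe_norm) |>
  add_model(glm_spec)

glmnet_grid <- grid_regular(penalty(), # default range: 10^-10 to 10^0
                            mixture(),
                            levels = 5)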
Tune the hyperparameters
set.seed(345)
glmnet_tune <-
glmnet_wf %>%
tune_grid(
resamples = churn_folds,
control = control_grid(save_pred = TRUE),
grid = glmnet_grid
)
saveRDS(glmnet_tune, file = "Saved_Model_Results/glmnet_initial_tune.rds")
Results of tuning
options(scipen = 4)
glmnet_tune |>
collect_metrics() |>
filter(.metric == "roc_auc") |>
arrange(-mean,penalty, mixture) |>
head(10) |>
  kable()

| penalty | mixture | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|
| 0.00000 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model21 |
| 0.00000 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model22 |
| 0.00001 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model23 |
| 0.00000 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model06 |
| 0.00000 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model07 |
| 0.00001 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model08 |
| 0.00000 | 0.50 | roc_auc | binary | 0.8411199 | 10 | 0.0051826 | Preprocessor1_Model11 |
| 0.00000 | 0.50 | roc_auc | binary | 0.8411199 | 10 | 0.0051826 | Preprocessor1_Model12 |
| 0.00001 | 0.50 | roc_auc | binary | 0.8411199 | 10 | 0.0051826 | Preprocessor1_Model13 |
| 0.00000 | 0.75 | roc_auc | binary | 0.8411145 | 10 | 0.0051871 | Preprocessor1_Model16 |
Plot of tuning results
glmnet_tune %>%
collect_metrics() %>%
mutate(mixture = factor(mixture)) %>%
ggplot(aes(penalty, mean, color = mixture)) +
geom_line(size = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
  scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
Lower penalties seem to do well; mixture isn’t as clear-cut - all values have roc_aucs of around 0.83 to 0.84. Since smaller penalties appear to do better, I can re-tune with a narrower range of penalty values.
# rebuild the grid
glmnet_grid <- grid_regular(penalty(range = c(-10,-5)), # range is on the log10 scale, i.e., 1e-10 to 1e-5
mixture(),
levels = 5)
# Modeling
set.seed(345)
glmnet_retune <-
glmnet_wf %>%
tune_grid(
resamples = churn_folds,
control = control_grid(save_pred = TRUE),
grid = glmnet_grid
)
saveRDS(glmnet_retune, file = "Saved_Model_Results/glmnet_retune.rds")
options(scipen = 9)
# Collect metrics
glmnet_retune |>
collect_metrics() |>
filter(.metric == "roc_auc") |>
arrange(-mean,penalty, mixture) |>
head(10) |>
  kable()

| penalty | mixture | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|
| 0.0000000 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model21 |
| 0.0000000 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model22 |
| 0.0000000 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model23 |
| 0.0000006 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model24 |
| 0.0000100 | 1.00 | roc_auc | binary | 0.8411495 | 10 | 0.0051941 | Preprocessor1_Model25 |
| 0.0000000 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model06 |
| 0.0000000 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model07 |
| 0.0000000 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model08 |
| 0.0000006 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model09 |
| 0.0000100 | 0.25 | roc_auc | binary | 0.8411255 | 10 | 0.0051760 | Preprocessor1_Model10 |
Not much of a change using smaller penalties, but a mixture of 1 (pure lasso) appears to be the best.
Select the best parameters
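A sketch of the assumed selection step producing the table below, using roc_auc:
best_glmnet <- select_best(glmnet_retune, metric = "roc_auc")
best_glmnet |> kable()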
| penalty | mixture | .config |
|---|---|---|
| 0 | 1 | Preprocessor1_Model21 |
Finalized glmnet model
Create a workflow using the best penalty and mixture - a low penalty and a pure lasso regularization.
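A minimal sketch of that finalization, assuming the workflow name is reused (the last_fit() call below is on glmnet_wf):
glmnet_wf <- glmnet_wf |>
  finalize_workflow(best_glmnet)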
Final Model
glmnet_final <-
glmnet_wf %>%
last_fit(churn_split)
saveRDS(glmnet_final, file = "Saved_Model_Results/glmnet_final.rds")
Metrics
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.807 Preprocessor1_Model1
## 2 roc_auc binary 0.858 Preprocessor1_Model1
Create a workflow
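The workflow chunk isn’t shown; a sketch of what the tuning assumes (tree-based models don’t need normalized predictors, so this uses the no-norm recipe):
rf_wf <- workflow() |>
  add_recipe(model_recipe_no_norm) |>
  add_model(rf_spec)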
Tuning the Hyperparameters
doParallel::registerDoParallel()
set.seed(12542)
rf_tune <- tune_grid(
rf_wf,
resamples = churn_folds,
grid = 30
)
## i Creating pre-processing data to finalize unknown parameter: mtry
Metrics
rf_tune |>
collect_metrics() |>
filter(.metric == "roc_auc") |>
arrange(-mean) |>
head(10) |>
  kable()

| mtry | min_n | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|
| 5 | 28 | roc_auc | binary | 0.8419630 | 10 | 0.0049751 | Preprocessor1_Model26 |
| 3 | 17 | roc_auc | binary | 0.8410658 | 10 | 0.0047453 | Preprocessor1_Model05 |
| 7 | 30 | roc_auc | binary | 0.8406773 | 10 | 0.0050237 | Preprocessor1_Model02 |
| 9 | 32 | roc_auc | binary | 0.8395159 | 10 | 0.0051026 | Preprocessor1_Model18 |
| 8 | 23 | roc_auc | binary | 0.8387182 | 10 | 0.0049896 | Preprocessor1_Model27 |
| 12 | 33 | roc_auc | binary | 0.8379905 | 10 | 0.0051707 | Preprocessor1_Model12 |
| 10 | 22 | roc_auc | binary | 0.8369434 | 10 | 0.0050522 | Preprocessor1_Model28 |
| 14 | 35 | roc_auc | binary | 0.8368982 | 10 | 0.0052511 | Preprocessor1_Model14 |
| 6 | 7 | roc_auc | binary | 0.8363543 | 10 | 0.0047926 | Preprocessor1_Model22 |
| 2 | 29 | roc_auc | binary | 0.8354133 | 10 | 0.0045600 | Preprocessor1_Model21 |
Plots
rf_tune %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
select(mean, min_n, mtry) %>%
pivot_longer(min_n:mtry,
values_to = "value",
names_to = "parameter"
) %>%
ggplot(aes(value, mean, color = parameter)) +
geom_point(show.legend = FALSE) +
facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")
Generally, it looks like a min_n between 23 and 33 and an mtry between 3 and 9 perform best.
Now that we have a range of values to use, grid_regular can be utilized.
Re-tuning and Plotting
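The rf_grid chunk isn’t shown; a sketch using the ranges read off the plot above - the number of levels is a guess (the results reference a Model44, so the grid has more than 25 candidates):
rf_grid <- grid_regular(
  mtry(range = c(3, 9)),
  min_n(range = c(23, 33)),
  levels = 7
)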
set.seed(456)
rf_retune <- tune_grid(
rf_wf,
resamples = churn_folds,
grid = rf_grid
)
saveRDS(rf_retune, file = "Saved_Model_Results/rf_retune.rds")
rf_retune %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
mutate(min_n = factor(min_n)) %>%
ggplot(aes(mtry, mean, color = min_n)) +
geom_line(alpha = 0.5, size = 1.5) +
geom_point() +
labs(y = "AUC") +
  scale_x_continuous(breaks = seq(0, 12, by = 1))
An mtry of 4 and a min_n of 33 perform the best.
Best Model
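A sketch of the assumed selection step producing the result below:
best_rf <- select_best(rf_retune, metric = "roc_auc")
best_rf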
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 4 33 Preprocessor1_Model44
Finalized Model
rf_wf <-
rf_wf |>
finalize_workflow(best_rf)
rf_final <-
rf_wf %>%
last_fit(churn_split)
saveRDS(rf_final, file = "Saved_Model_Results/rf_final.rds")

## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.804 Preprocessor1_Model1
## 2 roc_auc binary 0.854 Preprocessor1_Model1
Variable Importance Plot
library(vip)
rf_vip <-
rf_spec %>%
  finalize_model(select_best(rf_retune, metric = "roc_auc")) %>%
  set_engine("ranger", importance = "permutation") # the engine needs a way to compute importance, so it is re-specified with permutation importance
workflow() |>
add_recipe(model_recipe_no_norm) |>
add_model(rf_vip) |>
fit(churn_train) %>%
  extract_fit_parsnip() |>
  vip(geom = "point")
Tenure and TotalCharges are important, but they’re definitely correlated - TotalCharges is roughly the product of tenure and MonthlyCharges. Fiber optic internet is also important, as is MonthlyCharges, which by itself is a good indicator.
Arguments for the xgboost
This method is a little different from the others since there are 6 hyperparameters. Instead of building a regular grid over every combination, or racing, we create a space-filling (Latin hypercube) design that spreads the candidate points evenly through the 6-dimensional parameter space.
xgb_grid <- grid_latin_hypercube(
tree_depth(),
min_n(),
loss_reduction(),
  sample_size = sample_prop(), # this needs to be a proportion
  mtry = finalize(mtry(), churn_train), # mtry() has an unknown upper bound; finalize() sets it using heuristics based on the churn_train data
learn_rate(),
size = 40
)
Alternatively, a racing method could be used.
Racing methods:
We take the 10 resamples from cross-validation and evaluate the candidate hyperparameter combinations on an initial subset of resamples. ANOVA is then used to decide whether any combinations are statistically much worse than the current best; those are thrown away. This repeats as more resamples are processed, until the best combinations remain.
We don’t evaluate all of the hyperparameter combinations on all 10 resamples - it’s likely that some combinations are thrown away after the first few resamples.
# uses an ANOVA model to see if some hyperparameter combinations perform differently across resamples
# set.seed(1235)
# xgb_tune <-
# tune_race_anova(
#     xgb_wf,
# resamples = churn_folds,
# grid = 25,
# control = control_race(verbose_elim = TRUE, save_pred = TRUE)
# )
# saveRDS(xgb_tune, file = "Saved_Model_Results/xgb_mod1.rds")
#
# xgb_tune$.metrics[[1]] |> filter(.metric == 'roc_auc') |> arrange(-.estimate)
# plot_race(xgb_tune) +
#   scale_x_continuous(breaks = seq(0, 10, by = 1), labels = seq(0, 10, by = 1))
Create a workflow
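The workflow chunk isn’t shown; a sketch of what the tuning assumes (the variable importance section fits with model_recipe_no_norm, so I use it here too):
xgb_wf <- workflow() |>
  add_recipe(model_recipe_no_norm) |>
  add_model(xgb_spec)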
Tune the model
doParallel::registerDoParallel()
set.seed(43624)
xgb_tune <- tune_grid(
xgb_wf,
resamples = churn_folds,
grid = xgb_grid,
control = control_grid(save_pred = TRUE)
)
saveRDS(xgb_tune, file = "Saved_Model_Results/xgb_initial_tune.rds")
Plots
xgb_tune %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
select(mean, mtry:sample_size) %>%
pivot_longer( mtry:sample_size,
values_to = "value",
names_to = "parameter"
) %>%
ggplot(aes(value, mean, color = parameter)) +
geom_point(show.legend = FALSE) +
facet_wrap(~parameter, scales = "free_x")
This plot shows the results for each parameter. There’s not really a clear indication of a good range for each argument - maybe a higher tree_depth. For example, it’s hard to tell whether a higher or lower mtry will generally give a higher mean.
We can look at the parameters for the best models based on roc_auc.
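A sketch of the assumed call that produces the table:
xgb_tune |>
  show_best(metric = "roc_auc", n = 5) |>
  kable()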
| mtry | min_n | tree_depth | learn_rate | loss_reduction | sample_size | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 16 | 37 | 5 | 0.0042413 | 0.0000002 | 0.7407727 | roc_auc | binary | 0.8459393 | 10 | 0.0055821 | Preprocessor1_Model21 |
| 5 | 18 | 10 | 0.0060422 | 1.0659622 | 0.7772719 | roc_auc | binary | 0.8454827 | 10 | 0.0052444 | Preprocessor1_Model16 |
| 21 | 34 | 9 | 0.0077551 | 0.0000214 | 0.6841453 | roc_auc | binary | 0.8452450 | 10 | 0.0055234 | Preprocessor1_Model32 |
| 19 | 32 | 12 | 0.0132575 | 0.0000021 | 0.5042190 | roc_auc | binary | 0.8444505 | 10 | 0.0053582 | Preprocessor1_Model26 |
| 6 | 16 | 7 | 0.0012132 | 0.5612216 | 0.9770812 | roc_auc | binary | 0.8436718 | 10 | 0.0052316 | Preprocessor1_Model35 |
Then we can select the best parameters and plug the finalized workflow into the testing data.
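best_xgb isn’t defined in the chunks shown; a sketch, assuming selection by roc_auc:
best_xgb <- select_best(xgb_tune, metric = "roc_auc")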
xgb_final <-
xgb_wf |>
finalize_workflow(best_xgb) |>
last_fit(churn_split)
saveRDS(xgb_final, file = "Saved_Model_Results/xgb_final.rds")

## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.811 Preprocessor1_Model1
## 2 roc_auc binary 0.856 Preprocessor1_Model1
The best roc_auc was 0.856, which is slightly worse than penalized logistic regression. If we look at the confusion matrix, the model does better at predicting no churn (84%) than churn (69%).
xgb_vip <-
xgb_spec %>%
  finalize_model(select_best(xgb_tune, metric = "roc_auc"))
workflow() |>
add_recipe(model_recipe_no_norm) |>
add_model(xgb_vip) |>
fit(churn_train) %>%
  extract_fit_parsnip() |>
  vip(geom = "point")
Tenure, fiber optic internet, and two-year contracts are the most important.
Model information
svm_grid <- grid_regular(cost(),
degree(range = c(1,3)),
levels = c(cost = 5, degree = 3))
head(svm_grid, 5) |> kable()

| cost | degree |
|---|---|
| 0.0009766 | 1 |
| 0.0131390 | 1 |
| 0.1767767 | 1 |
| 2.3784142 | 1 |
| 32.0000000 | 1 |
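The SVM workflow chunk isn’t shown; a sketch of what the tuning assumes (SVMs are distance-based, so this uses the normalized recipe):
svm_wf <- workflow() |>
  add_recipe(model_recipe_norm) |>
  add_model(svm_spec)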
doParallel::registerDoParallel()
set.seed(345)
svm_tune <-
svm_wf %>%
tune_grid(
resamples = churn_folds,
control = control_grid(save_pred = TRUE),
grid = svm_grid
)
saveRDS(svm_tune, file = "Saved_Model_Results/svm_initial_tune.rds")

| cost | degree | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|
| 0.0009766 | 1 | roc_auc | binary | 0.8305303 | 10 | 0.0058982 | Preprocessor1_Model01 |
| 0.0131390 | 1 | roc_auc | binary | 0.8300172 | 10 | 0.0060825 | Preprocessor1_Model02 |
| 0.1767767 | 1 | roc_auc | binary | 0.8284232 | 10 | 0.0060003 | Preprocessor1_Model03 |
| 2.3784142 | 1 | roc_auc | binary | 0.8281469 | 10 | 0.0060189 | Preprocessor1_Model04 |
| 32.0000000 | 1 | roc_auc | binary | 0.8281065 | 10 | 0.0059997 | Preprocessor1_Model05 |
| 0.0009766 | 2 | roc_auc | binary | 0.8100025 | 10 | 0.0074573 | Preprocessor1_Model06 |
| 0.0131390 | 2 | roc_auc | binary | 0.8028656 | 10 | 0.0080810 | Preprocessor1_Model07 |
| 0.1767767 | 2 | roc_auc | binary | 0.8015521 | 10 | 0.0081695 | Preprocessor1_Model08 |
| 2.3784142 | 2 | roc_auc | binary | 0.8009954 | 10 | 0.0082861 | Preprocessor1_Model09 |
| 32.0000000 | 2 | roc_auc | binary | 0.7881594 | 10 | 0.0091714 | Preprocessor1_Model10 |
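best_svm isn’t defined in the chunks shown; a sketch, assuming selection by roc_auc:
best_svm <- select_best(svm_tune, metric = "roc_auc")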
svm_final <-
svm_wf |>
finalize_workflow(best_svm) |>
last_fit(churn_split)
saveRDS(svm_final, file = "Saved_Model_Results/svm_final.rds")

## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.801 Preprocessor1_Model1
## 2 roc_auc binary 0.846 Preprocessor1_Model1
Here are the arguments that will be tuned in the knn_spec defined earlier: neighbors, dist_power, and weight_func.
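The KNN workflow chunk isn’t shown; a sketch of what the tuning assumes (KNN is distance-based, so this uses the normalized recipe):
knn_wf <- workflow() |>
  add_recipe(model_recipe_norm) |>
  add_model(knn_spec)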
knn_grid <- grid_regular(neighbors(),
dist_power(range = c(1,3)),
weight_func(),
  levels = c(neighbors = 3, dist_power = 3, weight_func = 10))
doParallel::registerDoParallel()
set.seed(4575)
knn_tune <-
knn_wf %>%
tune_grid(
resamples = churn_folds,
control = control_grid(save_pred = TRUE),
grid = knn_grid
)
saveRDS(knn_tune, file = "Saved_Model_Results/knn_initial_tune.rds")
Since the grid was limited, only 4 of the 10 distance weighting functions were included. Rectangular performs the best, and more neighbors perform better across the board. dist_power doesn’t appear to matter much.
Create a new grid and retune
knn_grid <- grid_regular(neighbors(range = c(10, 60)),
                         dist_power(range = c(1, 3)),
                         weight_func(values = 'rectangular'),
                         levels = c(neighbors = 8, dist_power = 3, weight_func = 1))
knn_retune <-
knn_wf %>%
tune_grid(
resamples = churn_folds,
control = control_grid(save_pred = TRUE),
grid = knn_grid
)
saveRDS(knn_retune, file = "Saved_Model_Results/knn_retuned.rds")
#autoplot(knn_retune)
A Minkowski distance of 1 indicates Manhattan distance, and the best roc_auc values all use rectangular weighting with dist_power = 1. 45 neighbors has one of the best roc_auc values while 31 neighbors has the best accuracy, so there’s a bit of a tradeoff between the metrics.
| neighbors | weight_func | dist_power | .metric | .estimator | mean | n | std_err | .config |
|---|---|---|---|---|---|---|---|---|
| 60 | rectangular | 1 | roc_auc | binary | 0.8293796 | 10 | 0.0045258 | Preprocessor1_Model08 |
| 38 | rectangular | 1 | roc_auc | binary | 0.8290584 | 10 | 0.0054406 | Preprocessor1_Model05 |
| 45 | rectangular | 1 | roc_auc | binary | 0.8289684 | 10 | 0.0050726 | Preprocessor1_Model06 |
| 52 | rectangular | 1 | roc_auc | binary | 0.8285914 | 10 | 0.0044145 | Preprocessor1_Model07 |
| 31 | rectangular | 1 | roc_auc | binary | 0.8276035 | 10 | 0.0049272 | Preprocessor1_Model04 |
| 60 | rectangular | 2 | roc_auc | binary | 0.8263033 | 10 | 0.0048223 | Preprocessor1_Model16 |
| 52 | rectangular | 2 | roc_auc | binary | 0.8249508 | 10 | 0.0046678 | Preprocessor1_Model15 |
| 38 | rectangular | 2 | roc_auc | binary | 0.8239603 | 10 | 0.0047623 | Preprocessor1_Model13 |
| 24 | rectangular | 1 | roc_auc | binary | 0.8238411 | 10 | 0.0048074 | Preprocessor1_Model03 |
| 31 | rectangular | 2 | roc_auc | binary | 0.8237956 | 10 | 0.0042344 | Preprocessor1_Model12 |
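best_knn isn’t defined in the chunks shown; a sketch, assuming selection by roc_auc:
best_knn <- select_best(knn_retune, metric = "roc_auc")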
knn_final <-
knn_wf |>
finalize_workflow(best_knn) |>
last_fit(churn_split)
saveRDS(knn_final, file = "Saved_Model_Results/knn_final.rds")

## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.796 Preprocessor1_Model1
## 2 roc_auc binary 0.841 Preprocessor1_Model1
model_results <- rbind(
collect_metrics(glmnet_final) |> select(.metric, .estimate) |> mutate(model = 'glmnet'),
collect_metrics(rf_final) |> select(.metric, .estimate) |> mutate(model = 'RF'),
collect_metrics(xgb_final) |> select(.metric, .estimate) |> mutate(model = 'xgb'),
collect_metrics(svm_final) |> select(.metric, .estimate) |> mutate(model = 'svm'),
collect_metrics(knn_final) |> select(.metric, .estimate) |> mutate(model = 'knn')
)
model_results |> arrange(desc(.metric), desc(.estimate)) |> kable(align = 'c', caption = "All Model Results")

| .metric | .estimate | model |
|---|---|---|
| roc_auc | 0.8580794 | glmnet |
| roc_auc | 0.8564933 | xgb |
| roc_auc | 0.8537547 | RF |
| roc_auc | 0.8455214 | svm |
| roc_auc | 0.8410522 | knn |
| accuracy | 0.8110102 | xgb |
| accuracy | 0.8070375 | glmnet |
| accuracy | 0.8036322 | RF |
| accuracy | 0.8013621 | svm |
| accuracy | 0.7956867 | knn |
The lasso model produced the best area under the ROC curve, while XGBoost produced the best accuracy. The lasso’s edge in roc_auc over XGBoost is small (0.858 vs. 0.856).