With the task at hand to help Regork analyze customer retention data and find a model that can predict churn based on historical data, I started by exploring the customer distribution — who left and who stayed. This was the start of the data wrangling and predictive modeling to represent customer retention.
The following packages were used to help with the task at hand it determining predicting factors for subscription turnover:
library(tidyverse)
library(tidymodels)
library(skimr)
library(janitor)
library(vip)
data <- read_csv("customer_retention.csv") %>%
clean_names()
skim(data)
| Name | data |
| Number of rows | 6999 |
| Number of columns | 20 |
| _______________________ | |
| Column type frequency: | |
| character | 16 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| gender | 0 | 1 | 4 | 6 | 0 | 2 | 0 |
| partner | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| dependents | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| phone_service | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| multiple_lines | 0 | 1 | 2 | 16 | 0 | 3 | 0 |
| internet_service | 0 | 1 | 2 | 11 | 0 | 3 | 0 |
| online_security | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| online_backup | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| device_protection | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| tech_support | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| streaming_tv | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| streaming_movies | 0 | 1 | 2 | 19 | 0 | 3 | 0 |
| contract | 0 | 1 | 8 | 14 | 0 | 3 | 0 |
| paperless_billing | 0 | 1 | 2 | 3 | 0 | 2 | 0 |
| payment_method | 0 | 1 | 12 | 25 | 0 | 4 | 0 |
| status | 0 | 1 | 4 | 7 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| senior_citizen | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| tenure | 0 | 1 | 32.38 | 24.55 | 0.00 | 9.00 | 29.00 | 55.00 | 72.00 | ▇▃▃▃▆ |
| monthly_charges | 0 | 1 | 64.75 | 30.10 | 18.25 | 35.48 | 70.35 | 89.85 | 118.75 | ▇▅▆▇▅ |
| total_charges | 11 | 1 | 2283.10 | 2266.22 | 18.80 | 401.92 | 1397.47 | 3796.91 | 8684.80 | ▇▂▂▂▁ |
colSums(is.na(data))
## gender senior_citizen partner dependents
## 0 0 0 0
## tenure phone_service multiple_lines internet_service
## 0 0 0 0
## online_security online_backup device_protection tech_support
## 0 0 0 0
## streaming_tv streaming_movies contract paperless_billing
## 0 0 0 0
## payment_method monthly_charges total_charges status
## 0 0 11 0
data <- data %>%
mutate(total_charges = as.numeric(total_charges)) %>%
drop_na(total_charges)
data <- data %>%
mutate(across(where(is.character), as.factor))
Here we wanted to see the overall distribution on customers who have maintained subscriptions vs those who have left the subcription service.
status_summary <- data %>% count(status) %>% mutate(percent = n / sum(n))
status_summary
## # A tibble: 2 × 3
## status n percent
## <fct> <int> <dbl>
## 1 Current 5132 0.734
## 2 Left 1856 0.266
ggplot(data, aes(x = status)) +
geom_bar(fill = "steelblue") +
labs(title = "Customer Status Distribution", x = "Status", y = "Count")
ggplot(data, aes(x = contract, fill = status)) +
geom_bar(position = "fill") +
labs(title = "Churn Rate by Contract Type", y = "Proportion")
ggplot(data, aes(x = gender, fill = status)) +
geom_bar(position = "fill") +
labs(title = "Churn Rate by Gender", y = "Proportion")
ggplot(data, aes(x = status, y = monthly_charges, fill = status)) +
geom_boxplot() +
labs(title = "Monthly Charges by Status", y = "Monthly Charges")
ggplot(data, aes(x = status, y = tenure, fill = status)) +
geom_boxplot() +
labs(title = "Tenure by Status", y = "Tenure (Months)")
set.seed(123)
split <- initial_split(data, prop = 0.7, strata = status)
train_data <- training(split)
test_data <- testing(split)
churn_recipe <- recipe(status ~ ., data = train_data) %>%
step_dummy(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors())
log_model <- logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
tree_model <- decision_tree(cost_complexity = tune(), tree_depth = tune()) %>%
set_engine("rpart") %>%
set_mode("classification")
rf_model <- rand_forest(mtry = tune(), trees = 500, min_n = tune()) %>%
set_engine("ranger", importance = "impurity") %>%
set_mode("classification")
log_wf <- workflow() %>% add_model(log_model) %>% add_recipe(churn_recipe)
tree_wf <- workflow() %>% add_model(tree_model) %>% add_recipe(churn_recipe)
rf_wf <- workflow() %>% add_model(rf_model) %>% add_recipe(churn_recipe)
set.seed(234)
cv_folds <- vfold_cv(train_data, v = 5, strata = status)
log_results <- fit_resamples(log_wf, resamples = cv_folds, metrics = metric_set(roc_auc))
tree_grid <- grid_regular(cost_complexity(), tree_depth(), levels = 5)
tree_results <- tune_grid(tree_wf, resamples = cv_folds, grid = tree_grid, metrics = metric_set(roc_auc))
rf_grid <- grid_random(mtry(range = c(2,10)), min_n(), size = 10)
rf_results <- tune_grid(rf_wf, resamples = cv_folds, grid = rf_grid, metrics = metric_set(roc_auc))
## Warning: package 'ranger' was built under R version 4.4.3
collect_metrics(log_results)
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 roc_auc binary 0.844 5 0.00504 Preprocessor1_Model1
collect_metrics(tree_results)
## # A tibble: 25 × 8
## cost_complexity tree_depth .metric .estimator mean n std_err .config
## <dbl> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 0.0000000001 1 roc_auc binary 0.5 5 0 Preprocess…
## 2 0.0000000178 1 roc_auc binary 0.5 5 0 Preprocess…
## 3 0.00000316 1 roc_auc binary 0.5 5 0 Preprocess…
## 4 0.000562 1 roc_auc binary 0.5 5 0 Preprocess…
## 5 0.1 1 roc_auc binary 0.5 5 0 Preprocess…
## 6 0.0000000001 4 roc_auc binary 0.719 5 0.0120 Preprocess…
## 7 0.0000000178 4 roc_auc binary 0.719 5 0.0120 Preprocess…
## 8 0.00000316 4 roc_auc binary 0.719 5 0.0120 Preprocess…
## 9 0.000562 4 roc_auc binary 0.719 5 0.0120 Preprocess…
## 10 0.1 4 roc_auc binary 0.657 5 0.0394 Preprocess…
## # ℹ 15 more rows
collect_metrics(rf_results)
## # A tibble: 10 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 3 24 roc_auc binary 0.844 5 0.00397 Preprocessor1_Model01
## 2 8 28 roc_auc binary 0.842 5 0.00478 Preprocessor1_Model02
## 3 9 4 roc_auc binary 0.833 5 0.00551 Preprocessor1_Model03
## 4 4 30 roc_auc binary 0.845 5 0.00414 Preprocessor1_Model04
## 5 4 25 roc_auc binary 0.845 5 0.00396 Preprocessor1_Model05
## 6 9 22 roc_auc binary 0.840 5 0.00502 Preprocessor1_Model06
## 7 6 10 roc_auc binary 0.840 5 0.00457 Preprocessor1_Model07
## 8 4 21 roc_auc binary 0.844 5 0.00409 Preprocessor1_Model08
## 9 6 35 roc_auc binary 0.844 5 0.00454 Preprocessor1_Model09
## 10 4 22 roc_auc binary 0.844 5 0.00437 Preprocessor1_Model10
final_rf <- finalize_workflow(rf_wf, select_best(rf_results, metric = "roc_auc"))
# Fit the final model on the training data
final_rf_fit <- fit(final_rf, data = train_data)
final_rf_fit %>%
extract_fit_parsnip() %>%
vip()
test_preds <- predict(final_rf_fit, test_data, type = "prob") %>%
bind_cols(predict(final_rf_fit, test_data)) %>%
bind_cols(test_data)
likely_to_leave <- test_preds %>% filter(.pred_class == "Left")
nrow(likely_to_leave)
## [1] 430
confusion_matrix <- test_preds %>%
conf_mat(truth = status, estimate = .pred_class)
confusion_matrix
## Truth
## Prediction Current Left
## Current 1392 275
## Left 148 282
roc_curve(test_preds, truth = status, .pred_Left) %>%
autoplot() +
labs(title = "AUC Curve for Model Performance")
predicted_loss <- sum(likely_to_leave$monthly_charges)
predicted_loss
## [1] 33016.5
cost_of_discount <- 20 * nrow(likely_to_leave)
cost_of_discount
## [1] 8600
In conclusion we found that Tenure and Total charges have a big impact on wether an individual will continue their sbscription or cancel it. These factors are important to know to help predict turnover rates and how we can be better at less churning in subscriptions.