1 Introduction

With the task at hand to help Regork analyze customer retention data and find a model that can predict churn based on historical data, I started by exploring the customer distribution — who left and who stayed. This was the start of the data wrangling and predictive modeling to represent customer retention.

2 Data Preparation

The following packages were used to help with the task at hand it determining predicting factors for subscription turnover:

library(tidyverse)
library(tidymodels)
library(skimr)
library(janitor)
library(vip)

data <- read_csv("customer_retention.csv") %>%
  clean_names()

skim(data)
Data summary
Name data
Number of rows 6999
Number of columns 20
_______________________
Column type frequency:
character 16
numeric 4
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
gender 0 1 4 6 0 2 0
partner 0 1 2 3 0 2 0
dependents 0 1 2 3 0 2 0
phone_service 0 1 2 3 0 2 0
multiple_lines 0 1 2 16 0 3 0
internet_service 0 1 2 11 0 3 0
online_security 0 1 2 19 0 3 0
online_backup 0 1 2 19 0 3 0
device_protection 0 1 2 19 0 3 0
tech_support 0 1 2 19 0 3 0
streaming_tv 0 1 2 19 0 3 0
streaming_movies 0 1 2 19 0 3 0
contract 0 1 8 14 0 3 0
paperless_billing 0 1 2 3 0 2 0
payment_method 0 1 12 25 0 4 0
status 0 1 4 7 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
senior_citizen 0 1 0.16 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
tenure 0 1 32.38 24.55 0.00 9.00 29.00 55.00 72.00 ▇▃▃▃▆
monthly_charges 0 1 64.75 30.10 18.25 35.48 70.35 89.85 118.75 ▇▅▆▇▅
total_charges 11 1 2283.10 2266.22 18.80 401.92 1397.47 3796.91 8684.80 ▇▂▂▂▁

2.1 Data Cleaning

colSums(is.na(data))
##            gender    senior_citizen           partner        dependents 
##                 0                 0                 0                 0 
##            tenure     phone_service    multiple_lines  internet_service 
##                 0                 0                 0                 0 
##   online_security     online_backup device_protection      tech_support 
##                 0                 0                 0                 0 
##      streaming_tv  streaming_movies          contract paperless_billing 
##                 0                 0                 0                 0 
##    payment_method   monthly_charges     total_charges            status 
##                 0                 0                11                 0
data <- data %>%
  mutate(total_charges = as.numeric(total_charges)) %>%
  drop_na(total_charges)

data <- data %>%
  mutate(across(where(is.character), as.factor))

3 Analysis

3.1 Customer Status Distribution

Here we wanted to see the overall distribution on customers who have maintained subscriptions vs those who have left the subcription service.

status_summary <- data %>% count(status) %>% mutate(percent = n / sum(n))
status_summary
## # A tibble: 2 × 3
##   status      n percent
##   <fct>   <int>   <dbl>
## 1 Current  5132   0.734
## 2 Left     1856   0.266
ggplot(data, aes(x = status)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Customer Status Distribution", x = "Status", y = "Count")

3.2 Turnover Rate by type of Contract

ggplot(data, aes(x = contract, fill = status)) +
  geom_bar(position = "fill") +
  labs(title = "Churn Rate by Contract Type", y = "Proportion")

3.3 Turnover Rate by Gender

ggplot(data, aes(x = gender, fill = status)) +
  geom_bar(position = "fill") +
  labs(title = "Churn Rate by Gender", y = "Proportion")

3.4 Charges by month per status

ggplot(data, aes(x = status, y = monthly_charges, fill = status)) +
  geom_boxplot() +
  labs(title = "Monthly Charges by Status", y = "Monthly Charges")

3.5 Length of status

ggplot(data, aes(x = status, y = tenure, fill = status)) +
  geom_boxplot() +
  labs(title = "Tenure by Status", y = "Tenure (Months)")

4 Modeling the Data

4.1 Data Splitting and Wrangling

set.seed(123)
split <- initial_split(data, prop = 0.7, strata = status)
train_data <- training(split)
test_data <- testing(split)
churn_recipe <- recipe(status ~ ., data = train_data) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

4.2 Specification

log_model <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")

tree_model <- decision_tree(cost_complexity = tune(), tree_depth = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")

rf_model <- rand_forest(mtry = tune(), trees = 500, min_n = tune()) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")
log_wf <- workflow() %>% add_model(log_model) %>% add_recipe(churn_recipe)
tree_wf <- workflow() %>% add_model(tree_model) %>% add_recipe(churn_recipe)
rf_wf <- workflow() %>% add_model(rf_model) %>% add_recipe(churn_recipe)

4.3 Cross Validation

set.seed(234)
cv_folds <- vfold_cv(train_data, v = 5, strata = status)

4.4 Tuning up the model

log_results <- fit_resamples(log_wf, resamples = cv_folds, metrics = metric_set(roc_auc))

tree_grid <- grid_regular(cost_complexity(), tree_depth(), levels = 5)
tree_results <- tune_grid(tree_wf, resamples = cv_folds, grid = tree_grid, metrics = metric_set(roc_auc))

rf_grid <- grid_random(mtry(range = c(2,10)), min_n(), size = 10)
rf_results <- tune_grid(rf_wf, resamples = cv_folds, grid = rf_grid, metrics = metric_set(roc_auc))
## Warning: package 'ranger' was built under R version 4.4.3
collect_metrics(log_results)
## # A tibble: 1 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 roc_auc binary     0.844     5 0.00504 Preprocessor1_Model1
collect_metrics(tree_results)
## # A tibble: 25 × 8
##    cost_complexity tree_depth .metric .estimator  mean     n std_err .config    
##              <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
##  1    0.0000000001          1 roc_auc binary     0.5       5  0      Preprocess…
##  2    0.0000000178          1 roc_auc binary     0.5       5  0      Preprocess…
##  3    0.00000316            1 roc_auc binary     0.5       5  0      Preprocess…
##  4    0.000562              1 roc_auc binary     0.5       5  0      Preprocess…
##  5    0.1                   1 roc_auc binary     0.5       5  0      Preprocess…
##  6    0.0000000001          4 roc_auc binary     0.719     5  0.0120 Preprocess…
##  7    0.0000000178          4 roc_auc binary     0.719     5  0.0120 Preprocess…
##  8    0.00000316            4 roc_auc binary     0.719     5  0.0120 Preprocess…
##  9    0.000562              4 roc_auc binary     0.719     5  0.0120 Preprocess…
## 10    0.1                   4 roc_auc binary     0.657     5  0.0394 Preprocess…
## # ℹ 15 more rows
collect_metrics(rf_results)
## # A tibble: 10 × 8
##     mtry min_n .metric .estimator  mean     n std_err .config              
##    <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1     3    24 roc_auc binary     0.844     5 0.00397 Preprocessor1_Model01
##  2     8    28 roc_auc binary     0.842     5 0.00478 Preprocessor1_Model02
##  3     9     4 roc_auc binary     0.833     5 0.00551 Preprocessor1_Model03
##  4     4    30 roc_auc binary     0.845     5 0.00414 Preprocessor1_Model04
##  5     4    25 roc_auc binary     0.845     5 0.00396 Preprocessor1_Model05
##  6     9    22 roc_auc binary     0.840     5 0.00502 Preprocessor1_Model06
##  7     6    10 roc_auc binary     0.840     5 0.00457 Preprocessor1_Model07
##  8     4    21 roc_auc binary     0.844     5 0.00409 Preprocessor1_Model08
##  9     6    35 roc_auc binary     0.844     5 0.00454 Preprocessor1_Model09
## 10     4    22 roc_auc binary     0.844     5 0.00437 Preprocessor1_Model10
final_rf <- finalize_workflow(rf_wf, select_best(rf_results, metric = "roc_auc"))

# Fit the final model on the training data
final_rf_fit <- fit(final_rf, data = train_data)
final_rf_fit %>%
  extract_fit_parsnip() %>%
  vip()

5 Predictions

test_preds <- predict(final_rf_fit, test_data, type = "prob") %>%
  bind_cols(predict(final_rf_fit, test_data)) %>%
  bind_cols(test_data)

likely_to_leave <- test_preds %>% filter(.pred_class == "Left")
nrow(likely_to_leave)
## [1] 430
confusion_matrix <- test_preds %>%
  conf_mat(truth = status, estimate = .pred_class)

confusion_matrix
##           Truth
## Prediction Current Left
##    Current    1392  275
##    Left        148  282
roc_curve(test_preds, truth = status, .pred_Left) %>%
  autoplot() +
  labs(title = "AUC Curve for Model Performance")

5.1 Analysis

predicted_loss <- sum(likely_to_leave$monthly_charges)
predicted_loss
## [1] 33016.5
cost_of_discount <- 20 * nrow(likely_to_leave)

cost_of_discount
## [1] 8600

6 Conclusion

In conclusion we found that Tenure and Total charges have a big impact on wether an individual will continue their sbscription or cancel it. These factors are important to know to help predict turnover rates and how we can be better at less churning in subscriptions.