1 Introduction

With the task at hand to help Regork analyze customer retention data and find a model that can predict churn based on historical data, I started by exploring the customer distribution — who left and who stayed. This was the start of the data wrangling and predictive modeling to represent customer retention.

2 Data Preparation

The following packages were used to help with the task at hand it determining predicting factors for subscription turnover:

library(tidyverse)
library(tidymodels)
library(skimr)
library(janitor)
library(vip)

data <- read_csv("customer_retention.csv") %>%
  clean_names()

skim(data)

Data summary
Name	data
Number of rows	6999
Number of columns	20
_______________________
Column type frequency:
character	16
numeric	4
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
gender	1	4	6	2
partner	1	2	3	2
dependents	1	2	3	2
phone_service	1	2	3	2
multiple_lines	1	2	16	3
internet_service	1	2	11	3
online_security	1	2	19	3
online_backup	1	2	19	3
device_protection	1	2	19	3
tech_support	1	2	19	3
streaming_tv	1	2	19	3
streaming_movies	1	2	19	3
contract	1	8	14	3
paperless_billing	1	2	3	2
payment_method	1	12	25	4
status	1	4	7	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
senior_citizen	0	1	0.16	0.37	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
tenure	0	1	32.38	24.55	0.00	9.00	29.00	55.00	72.00	▇▃▃▃▆
monthly_charges	0	1	64.75	30.10	18.25	35.48	70.35	89.85	118.75	▇▅▆▇▅
total_charges	11	1	2283.10	2266.22	18.80	401.92	1397.47	3796.91	8684.80	▇▂▂▂▁

2.1 Data Cleaning

colSums(is.na(data))

##            gender    senior_citizen           partner        dependents 
##                 0                 0                 0                 0 
##            tenure     phone_service    multiple_lines  internet_service 
##                 0                 0                 0                 0 
##   online_security     online_backup device_protection      tech_support 
##                 0                 0                 0                 0 
##      streaming_tv  streaming_movies          contract paperless_billing 
##                 0                 0                 0                 0 
##    payment_method   monthly_charges     total_charges            status 
##                 0                 0                11                 0

data <- data %>%
  mutate(total_charges = as.numeric(total_charges)) %>%
  drop_na(total_charges)

data <- data %>%
  mutate(across(where(is.character), as.factor))

3 Analysis

3.1 Customer Status Distribution

Here we wanted to see the overall distribution on customers who have maintained subscriptions vs those who have left the subcription service.

status_summary <- data %>% count(status) %>% mutate(percent = n / sum(n))
status_summary

## # A tibble: 2 × 3
##   status      n percent
##   <fct>   <int>   <dbl>
## 1 Current  5132   0.734
## 2 Left     1856   0.266

ggplot(data, aes(x = status)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Customer Status Distribution", x = "Status", y = "Count")

3.2 Turnover Rate by type of Contract

ggplot(data, aes(x = contract, fill = status)) +
  geom_bar(position = "fill") +
  labs(title = "Churn Rate by Contract Type", y = "Proportion")

3.3 Turnover Rate by Gender

ggplot(data, aes(x = gender, fill = status)) +
  geom_bar(position = "fill") +
  labs(title = "Churn Rate by Gender", y = "Proportion")

3.4 Charges by month per status

ggplot(data, aes(x = status, y = monthly_charges, fill = status)) +
  geom_boxplot() +
  labs(title = "Monthly Charges by Status", y = "Monthly Charges")

3.5 Length of status

ggplot(data, aes(x = status, y = tenure, fill = status)) +
  geom_boxplot() +
  labs(title = "Tenure by Status", y = "Tenure (Months)")

4 Modeling the Data

4.1 Data Splitting and Wrangling

set.seed(123)
split <- initial_split(data, prop = 0.7, strata = status)
train_data <- training(split)
test_data <- testing(split)

churn_recipe <- recipe(status ~ ., data = train_data) %>%
  step_dummy(all_nominal_predictors()) %>%
  step_normalize(all_numeric_predictors())

4.2 Specification

log_model <- logistic_reg() %>% 
  set_engine("glm") %>% 
  set_mode("classification")

tree_model <- decision_tree(cost_complexity = tune(), tree_depth = tune()) %>%
  set_engine("rpart") %>%
  set_mode("classification")

rf_model <- rand_forest(mtry = tune(), trees = 500, min_n = tune()) %>%
  set_engine("ranger", importance = "impurity") %>%
  set_mode("classification")

log_wf <- workflow() %>% add_model(log_model) %>% add_recipe(churn_recipe)
tree_wf <- workflow() %>% add_model(tree_model) %>% add_recipe(churn_recipe)
rf_wf <- workflow() %>% add_model(rf_model) %>% add_recipe(churn_recipe)

4.3 Cross Validation

set.seed(234)
cv_folds <- vfold_cv(train_data, v = 5, strata = status)

4.4 Tuning up the model

log_results <- fit_resamples(log_wf, resamples = cv_folds, metrics = metric_set(roc_auc))

tree_grid <- grid_regular(cost_complexity(), tree_depth(), levels = 5)
tree_results <- tune_grid(tree_wf, resamples = cv_folds, grid = tree_grid, metrics = metric_set(roc_auc))

rf_grid <- grid_random(mtry(range = c(2,10)), min_n(), size = 10)
rf_results <- tune_grid(rf_wf, resamples = cv_folds, grid = rf_grid, metrics = metric_set(roc_auc))

## Warning: package 'ranger' was built under R version 4.4.3

collect_metrics(log_results)

## # A tibble: 1 × 6
##   .metric .estimator  mean     n std_err .config             
##   <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
## 1 roc_auc binary     0.844     5 0.00504 Preprocessor1_Model1

collect_metrics(tree_results)

## # A tibble: 25 × 8
##    cost_complexity tree_depth .metric .estimator  mean     n std_err .config    
##              <dbl>      <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>      
##  1    0.0000000001          1 roc_auc binary     0.5       5  0      Preprocess…
##  2    0.0000000178          1 roc_auc binary     0.5       5  0      Preprocess…
##  3    0.00000316            1 roc_auc binary     0.5       5  0      Preprocess…
##  4    0.000562              1 roc_auc binary     0.5       5  0      Preprocess…
##  5    0.1                   1 roc_auc binary     0.5       5  0      Preprocess…
##  6    0.0000000001          4 roc_auc binary     0.719     5  0.0120 Preprocess…
##  7    0.0000000178          4 roc_auc binary     0.719     5  0.0120 Preprocess…
##  8    0.00000316            4 roc_auc binary     0.719     5  0.0120 Preprocess…
##  9    0.000562              4 roc_auc binary     0.719     5  0.0120 Preprocess…
## 10    0.1                   4 roc_auc binary     0.657     5  0.0394 Preprocess…
## # ℹ 15 more rows

collect_metrics(rf_results)

## # A tibble: 10 × 8
##     mtry min_n .metric .estimator  mean     n std_err .config              
##    <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
##  1     3    24 roc_auc binary     0.844     5 0.00397 Preprocessor1_Model01
##  2     8    28 roc_auc binary     0.842     5 0.00478 Preprocessor1_Model02
##  3     9     4 roc_auc binary     0.833     5 0.00551 Preprocessor1_Model03
##  4     4    30 roc_auc binary     0.845     5 0.00414 Preprocessor1_Model04
##  5     4    25 roc_auc binary     0.845     5 0.00396 Preprocessor1_Model05
##  6     9    22 roc_auc binary     0.840     5 0.00502 Preprocessor1_Model06
##  7     6    10 roc_auc binary     0.840     5 0.00457 Preprocessor1_Model07
##  8     4    21 roc_auc binary     0.844     5 0.00409 Preprocessor1_Model08
##  9     6    35 roc_auc binary     0.844     5 0.00454 Preprocessor1_Model09
## 10     4    22 roc_auc binary     0.844     5 0.00437 Preprocessor1_Model10

final_rf <- finalize_workflow(rf_wf, select_best(rf_results, metric = "roc_auc"))

# Fit the final model on the training data
final_rf_fit <- fit(final_rf, data = train_data)

final_rf_fit %>%
  extract_fit_parsnip() %>%
  vip()

5 Predictions

test_preds <- predict(final_rf_fit, test_data, type = "prob") %>%
  bind_cols(predict(final_rf_fit, test_data)) %>%
  bind_cols(test_data)

likely_to_leave <- test_preds %>% filter(.pred_class == "Left")
nrow(likely_to_leave)

## [1] 430

confusion_matrix <- test_preds %>%
  conf_mat(truth = status, estimate = .pred_class)

confusion_matrix

##           Truth
## Prediction Current Left
##    Current    1392  275
##    Left        148  282

roc_curve(test_preds, truth = status, .pred_Left) %>%
  autoplot() +
  labs(title = "AUC Curve for Model Performance")

5.1 Analysis

predicted_loss <- sum(likely_to_leave$monthly_charges)
predicted_loss

## [1] 33016.5

cost_of_discount <- 20 * nrow(likely_to_leave)

cost_of_discount

## [1] 8600

6 Conclusion

In conclusion we found that Tenure and Total charges have a big impact on wether an individual will continue their sbscription or cancel it. These factors are important to know to help predict turnover rates and how we can be better at less churning in subscriptions.

Data Mining Final Project - Andy Boros

Andy Boros

2025-04-27