Introduction

Customer retention is crucial for Regork, a growing telecommunications company. With offerings like internet, phone, and streaming services, it is more cost-effective to retain existing customers than acquire new ones. Identifying customers at risk of leaving (churn) is vital to maintaining stable revenues and fostering loyalty.

This project explores customer data, builds predictive models to estimate churn risk, and provides actionable recommendations to enhance retention. Different variables are analyzed to bring fresh insights.

Libraries and Data Loading

library(tidymodels)
library(tidyverse)
library(baguette)
library(vip)
library(pdp)
library(here)
library(kernlab)
library(ggplot2)
library(earth)
library(rpart.plot)
library(ranger)

c_retention <- read.csv("customer_retention.csv")
c_retention <- c_retention %>%
  mutate(Status = factor(Status)) %>%
  na.omit()

Data Exploration

Internet Service Type

service_counts <- c_retention %>%
  count(InternetService) %>%
  arrange(desc(n))

ggplot(service_counts, aes(x = fct_reorder(InternetService, n), y = n)) +
  geom_col(fill = "steelblue", width = 0.6) +
  theme_minimal() +
  labs(title = "Distribution of Internet Service Types", x = "Internet Service", y = "Number of Customers") +
  coord_flip()

Payment Method

payment_counts <- c_retention %>%
  count(PaymentMethod) %>%
  arrange(desc(n))

ggplot(payment_counts, aes(x = fct_reorder(PaymentMethod, n), y = n)) +
  geom_col(fill = "darkred", width = 0.6) +
  theme_minimal() +
  labs(title = "Distribution of Payment Methods", x = "Payment Method", y = "Number of Customers") +
  coord_flip()

Monthly Charges Distribution

ggplot(c_retention, aes(x = MonthlyCharges)) +
  geom_histogram(binwidth = 5, fill = "darkgreen", color = "black") +
  theme_minimal() +
  labs(title = "Distribution of Monthly Charges", x = "Monthly Charges", y = "Count")

Senior Citizen Churn Analysis

ggplot(c_retention, aes(x = factor(SeniorCitizen), fill = Status)) +
  geom_bar(position = "fill") +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal() +
  labs(title = "Churn Rate by Senior Citizen Status", x = "Senior Citizen", y = "Proportion", fill = "Status")

Machine Learning Models

Logistic Regression

set.seed(123)
log_split <- initial_split(c_retention, prop = 0.5, strata = Status)
log_train <- training(log_split)
log_test <- testing(log_split)

log_kfolds <- vfold_cv(log_train, v = 5, strata = Status)

results <- logistic_reg() %>%
  fit_resamples(Status ~ ., log_kfolds)
collect_metrics(results)
## # A tibble: 3 × 6
##   .metric     .estimator  mean     n std_err .config             
##   <chr>       <chr>      <dbl> <int>   <dbl> <chr>               
## 1 accuracy    binary     0.811     5 0.00691 Preprocessor1_Model1
## 2 brier_class binary     0.135     5 0.00282 Preprocessor1_Model1
## 3 roc_auc     binary     0.846     5 0.00780 Preprocessor1_Model1
final_log_fit <- logistic_reg() %>%
  fit(Status ~ ., data = log_train)

final_log_fit %>%
  predict(log_test) %>%
  bind_cols(log_test %>% select(Status)) %>%
  conf_mat(truth = Status, estimate = .pred_class)
##           Truth
## Prediction Current Left
##    Current    2272  412
##    Left        294  516

MARS Model

mars_recipe <- recipe(Status ~ ., data = log_train) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_normalize(all_numeric_predictors())

mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
  set_mode('classification')

mars_grid <- grid_regular(num_terms(range = c(1,30)), prod_degree(), levels = 40)

mars_wf <- workflow() %>%
  add_recipe(mars_recipe) %>%
  add_model(mars_mod)

tuning_results <- mars_wf %>%
  tune_grid(resamples = log_kfolds, grid = mars_grid)

autoplot(tuning_results)

show_best(tuning_results, metric = "roc_auc")
## # A tibble: 5 × 8
##   num_terms prod_degree .metric .estimator  mean     n std_err .config          
##       <int>       <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>            
## 1        17           1 roc_auc binary     0.850     5 0.00841 Preprocessor1_Mo…
## 2        18           1 roc_auc binary     0.850     5 0.00841 Preprocessor1_Mo…
## 3        19           1 roc_auc binary     0.850     5 0.00841 Preprocessor1_Mo…
## 4        20           1 roc_auc binary     0.850     5 0.00841 Preprocessor1_Mo…
## 5        21           1 roc_auc binary     0.850     5 0.00841 Preprocessor1_Mo…

Decision Tree

dt_mod <- decision_tree(mode = 'classification') %>%
  set_engine("rpart")
model_recipe <- recipe(Status ~ ., data = log_train)
dt_fit <- workflow() %>%
  add_recipe(model_recipe) %>%
  add_model(dt_mod) %>%
  fit(data = log_train)

rpart.plot::rpart.plot(dt_fit$fit$fit$fit)
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
##     Call rpart.plot with roundint=FALSE,
##     or rebuild the rpart model with model=TRUE.

Random Forest

rf_mod <- rand_forest(mode = "classification", trees = tune(), mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", importance = "impurity")

rf_hyper_grid <- grid_regular(trees(range = c(50, 200)), mtry(range = c(2, 10)), min_n(range = c(1, 5)), levels = 3)

rf_results <- tune_grid(rf_mod, model_recipe, resamples = log_kfolds, grid = rf_hyper_grid)

show_best(rf_results, metric = "roc_auc")
## # A tibble: 5 × 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1     2   200     5 roc_auc binary     0.843     5 0.00789 Preprocessor1_Model21
## 2     2   125     3 roc_auc binary     0.843     5 0.00794 Preprocessor1_Model11
## 3     2   125     5 roc_auc binary     0.842     5 0.00893 Preprocessor1_Model20
## 4     2   200     1 roc_auc binary     0.841     5 0.00838 Preprocessor1_Model03
## 5     2   200     3 roc_auc binary     0.841     5 0.00773 Preprocessor1_Model12

Conclusion

Our analysis uncovered fresh insights about Regork’s customer churn. MonthlyCharges, InternetService, PaymentMethod, and SeniorCitizen status emerged as influential churn factors. Among the models, MARS again achieved the highest AUC, but Random Forest also performed competitively.

We recommend Regork to target at-risk groups, such as Fiber Optic users and customers with Electronic Check payments, while offering loyalty incentives to senior customers. Future improvements could involve longitudinal data collection and exploration of advanced ensemble models.

By embracing these strategies, Regork can enhance customer satisfaction, loyalty, and profitability in the competitive telecom market.