Introduction

This analysis focuses on predicting customer retention for Regork’s telecommunications business using various machine learning models and comprehensive data visualization techniques.

Load Required Packages

library(tidymodels)
library(tidyverse)
library(knitr)
library(baguette)
library(vip)
library(pdp)
library(here)
library(kernlab)
library(ggplot2)

Data Preparation

# Read and prepare the data
df <- read.csv("customer_retention.csv")
df <- mutate(df, Status = factor(Status))
df <- na.omit(df)

Exploratory Data Visualization

# Customer Tenure by Contract Type
p1 <- ggplot(df, aes(Tenure)) + 
  geom_bar(fill = "green") +
  facet_wrap(~Contract) +
  coord_flip() +
  ggtitle("Customer Tenure and Length of Contract") +
  labs(y = "Count of Contract Type", x = "Customer Tenure") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    strip.text = element_text(face = "bold")
  )
print(p1)

# Age Group and Paperless Billing Analysis
p2 <- ggplot(df, aes(x = factor(SeniorCitizen, labels = c("Non-Senior", "Senior")), 
               fill = PaperlessBilling)) +
  geom_bar(position = position_dodge(width = 0.9), width = 0.8) +
  scale_fill_manual(values = c("Blue", "LightGreen")) +
  ggtitle("Customers by Age Group and Paperless Billing") +
  labs(y = "Number of Customers", x = "Age Group", fill = "Paperless Billing") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "bottom",
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(face = "bold")
  )
print(p2)

# Multiple Lines Analysis by Contract Type
p3 <- ggplot(df, aes(MultipleLines)) +
  geom_bar(fill = "blue", color = "green") +
  facet_wrap(~Contract) +
  ggtitle("Multiple Lines Distribution by Contract Type") +
  labs(y = "Count of Customers", x = "Multiple Lines") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    strip.text = element_text(face = "bold")
  )
print(p3)

Model Preparation

# Set seed for reproducibility
set.seed(123)

# Create training/testing splits
logistic_split <- initial_split(df, prop = .5, strata = Status)
logistic_train <- training(logistic_split)
logistic_test <- testing(logistic_split)

# Create cross-validation folds
logistic_kfolds <- vfold_cv(logistic_train, v = 4, strata = Status)

Recipe Creation

# Create the model recipe
retention_recipe <- recipe(Status ~ ., data = logistic_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

Decision Tree Model

# Create and fit decision tree model
dt_mod <- decision_tree(mode = "classification") %>%
  set_engine("rpart")

dt_workflow <- workflow() %>%
  add_recipe(retention_recipe) %>%
  add_model(dt_mod)

dt_fit <- dt_workflow %>%
  fit(data = logistic_train)

dt_results <- fit_resamples(dt_mod, retention_recipe, logistic_kfolds)

Random Forest Model

# Create and fit random forest model
rf_split <- initial_split(df, prop = 0.5, strata = Status)
rf_train <- training(rf_split)
rf_test <- testing(rf_split)

rf_recipe <- recipe(Status ~ ., data = rf_train)
rf_mod <- rand_forest(mode = "classification") %>%
  set_engine("ranger")

rf_workflow <- workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(rf_mod)

rf_fit <- rf_workflow %>%
  fit(data = rf_train)

Model Evaluation

# Collect and display model metrics
model_metrics <- collect_metrics(dt_results) %>%
  mutate(model = "Decision Tree")

knitr::kable(model_metrics, 
             caption = "Model Performance Metrics",
             digits = 3)
Model Performance Metrics
.metric .estimator mean n std_err .config model
accuracy binary 0.792 4 0.004 Preprocessor1_Model1 Decision Tree
brier_class binary 0.158 4 0.002 Preprocessor1_Model1 Decision Tree
roc_auc binary 0.716 4 0.013 Preprocessor1_Model1 Decision Tree