Customer Retention Analysis

Introduction

This analysis focuses on predicting customer retention for Regork’s telecommunications business using various machine learning models and comprehensive data visualization techniques.

Load Required Packages

library(tidymodels)
library(tidyverse)
library(knitr)
library(baguette)
library(vip)
library(pdp)
library(here)
library(kernlab)
library(ggplot2)

Data Preparation

# Read and prepare the data
df <- read.csv("customer_retention.csv")
df <- mutate(df, Status = factor(Status))
df <- na.omit(df)

Exploratory Data Visualization

# Customer Tenure by Contract Type
p1 <- ggplot(df, aes(Tenure)) + 
  geom_bar(fill = "green") +
  facet_wrap(~Contract) +
  coord_flip() +
  ggtitle("Customer Tenure and Length of Contract") +
  labs(y = "Count of Contract Type", x = "Customer Tenure") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    strip.text = element_text(face = "bold")
  )
print(p1)

# Age Group and Paperless Billing Analysis
p2 <- ggplot(df, aes(x = factor(SeniorCitizen, labels = c("Non-Senior", "Senior")), 
               fill = PaperlessBilling)) +
  geom_bar(position = position_dodge(width = 0.9), width = 0.8) +
  scale_fill_manual(values = c("Blue", "LightGreen")) +
  ggtitle("Customers by Age Group and Paperless Billing") +
  labs(y = "Number of Customers", x = "Age Group", fill = "Paperless Billing") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "bottom",
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(face = "bold")
  )
print(p2)

# Multiple Lines Analysis by Contract Type
p3 <- ggplot(df, aes(MultipleLines)) +
  geom_bar(fill = "blue", color = "green") +
  facet_wrap(~Contract) +
  ggtitle("Multiple Lines Distribution by Contract Type") +
  labs(y = "Count of Customers", x = "Multiple Lines") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    strip.text = element_text(face = "bold")
  )
print(p3)

Model Preparation

# Set seed for reproducibility
set.seed(123)

# Create training/testing splits
logistic_split <- initial_split(df, prop = .5, strata = Status)
logistic_train <- training(logistic_split)
logistic_test <- testing(logistic_split)

# Create cross-validation folds
logistic_kfolds <- vfold_cv(logistic_train, v = 4, strata = Status)

Recipe Creation

# Create the model recipe
retention_recipe <- recipe(Status ~ ., data = logistic_train) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors())

Decision Tree Model

# Create and fit decision tree model
dt_mod <- decision_tree(mode = "classification") %>%
  set_engine("rpart")

dt_workflow <- workflow() %>%
  add_recipe(retention_recipe) %>%
  add_model(dt_mod)

dt_fit <- dt_workflow %>%
  fit(data = logistic_train)

dt_results <- fit_resamples(dt_mod, retention_recipe, logistic_kfolds)

Random Forest Model

# Create and fit random forest model
rf_split <- initial_split(df, prop = 0.5, strata = Status)
rf_train <- training(rf_split)
rf_test <- testing(rf_split)

rf_recipe <- recipe(Status ~ ., data = rf_train)
rf_mod <- rand_forest(mode = "classification") %>%
  set_engine("ranger")

rf_workflow <- workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(rf_mod)

rf_fit <- rf_workflow %>%
  fit(data = rf_train)

Model Evaluation

# Collect and display model metrics
model_metrics <- collect_metrics(dt_results) %>%
  mutate(model = "Decision Tree")

knitr::kable(model_metrics, 
             caption = "Model Performance Metrics",
             digits = 3)

Model Performance Metrics
.metric	.estimator	mean	n	std_err	.config	model
accuracy	binary	0.792	4	0.004	Preprocessor1_Model1	Decision Tree
brier_class	binary	0.158	4	0.002	Preprocessor1_Model1	Decision Tree
roc_auc	binary	0.716	4	0.013	Preprocessor1_Model1	Decision Tree

Customer Retention Analysis - Final Project

Atherva Patil

2024-12-08

Introduction

Load Required Packages

Data Preparation

Exploratory Data Visualization

Model Preparation

Recipe Creation

Decision Tree Model

Random Forest Model

Model Evaluation