knitr::opts_chunk$set(echo = TRUE)
We were brought on by Regork to analyze customer data and develop a model that predicts which customers are likely to leave, allowing Regork to proactively retain them. This insight is highly valuable for the CEO, as the cost of acquiring new customers often exceeds the cost of retaining current ones. By leveraging these findings, Regork can improve profitability through better customer retention strategies.
# Load libraries
library(tidyverse)
library(tidymodels)
library(baguette)
library(vip)
library(pdp)
library(here)
library(kernlab)
library(ggplot2)
library(ranger)
library(earth)
library(scales)
library(DT)
# Load data
data <- read_csv("/Users/Dhruv/Desktop/customer_retention.csv")
To address this problem, we applied three machine learning models: a Decision Tree, a Regularized Regression model, and a Multiple Adaptive Regression Splines (MARS) model. For each model, we split the data, evaluated key metrics, and analyzed coefficient significance. We then compared model performance based on the area under the curve (AUC). The MARS model showed the highest AUC score, making it the most accurate. We used this model to predict customer churn and estimate the associated revenue loss.
# Display data preview
viewable_data <- data
colnames(viewable_data)[colnames(viewable_data) == "PaperlessBilling"] <- "Paperless Billing"
colnames(viewable_data)[colnames(viewable_data) == "MonthlyCharges"] <- "Monthly Charges"
colnames(viewable_data)[colnames(viewable_data) == "TotalCharges"] <- "Total Charges"
selected_columns <- c("Status", "Monthly Charges", "Total Charges", "Tenure", "Contract", "Paperless Billing")
display_data <- viewable_data[selected_columns]
datatable(display_data, options = list(pageLength = 25, autoWidth = TRUE)) %>%
formatCurrency(c('Monthly Charges', 'Total Charges'), currency = "$") %>%
formatStyle('Status', target = 'row',
backgroundColor = styleEqual(c("Left", "Current"), c('#e5cdf1', '#dcf1f3')))
# Averages by status
status_avg_df <- data %>%
group_by(Status) %>%
summarise(Avg_Monthly = mean(MonthlyCharges, na.rm = TRUE),
Avg_Total = mean(TotalCharges, na.rm = TRUE),
Avg_Tenure = mean(Tenure, na.rm = TRUE))
status_avg_pivot <- status_avg_df %>%
pivot_longer(cols = starts_with("Avg"), names_to = "Metric", values_to = "Value")
status_avg_pivot$Metric <- factor(
status_avg_pivot$Metric,
levels = c("Avg_Monthly", "Avg_Total", "Avg_Tenure"),
labels = c("Avg Monthly Charges ($)", "Avg Total Charges ($)", "Avg Tenure (Months)")
)
# Plot averages
ggplot(status_avg_pivot, aes(x = Status, y = Value, fill = Status)) +
geom_col(width = 0.7) +
facet_wrap(~ Metric, scales = "free_y") +
scale_fill_manual(values = c("Left" = "#FF6F61", "Current" = "#6BAED6")) +
theme_light(base_size = 13) +
labs(title = "Key Averages by Customer Status", x = "Status", y = "Value") +
scale_y_continuous(labels = scales::comma)
Our findings will support the Regork Telecom CEO by identifying customers most at risk of leaving and highlighting the common traits among them. Based on these insights, we recommend two main strategies. First, a loyalty rewards program focused on customer tenure, offering perks and milestone-based incentives to encourage newer customers to stay longer. Second, improving direct customer service efforts — by consistently reaching out to customers to address concerns and contract questions early, Regork can further reduce churn.
# Tenure vs Monthly Charges
tenure_df <- data %>%
group_by(Tenure) %>%
summarize(average = mean(MonthlyCharges, na.rm = TRUE))
ggplot(tenure_df, aes(x = Tenure, y = average)) +
geom_line(color = "#6A51A3", size = 1.2) +
theme_minimal(base_size = 13) +
labs(title = "Trend: Monthly Charges Over Tenure",
subtitle = "Average Monthly Charges Across Tenure",
x = "Tenure (Months)",
y = "Avg Monthly Charges ($)") +
scale_y_continuous(labels = scales::dollar)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
There is a positive relationship between tenure and average monthly billing, showing that customers who stay longer generally pay more each month.
# Contract type by Status
ggplot(data, aes(x = Contract, fill = Status)) +
geom_bar(position = "fill") +
theme_minimal(base_size = 13) +
scale_fill_manual(values = c("Left" = "#FF6F61", "Current" = "#6BAED6")) +
labs(title = "Contract Type by Customer Status",
subtitle = "Proportion of Customers by Contract Type",
x = "Contract Type", y = "Proportion") +
scale_y_continuous(labels = scales::percent)
Month-to-month contract customers experience much higher churn rates
than those on one- or two-year agreements.
# Payment method by Status
ggplot(data, aes(x = PaymentMethod, fill = Status)) +
geom_bar(position = position_dodge(width = 0.8)) +
theme_bw(base_size = 13) +
scale_fill_manual(values = c("Left" = "#FC8D59", "Current" = "#91BFDB")) +
labs(title = "Payment Method vs Customer Status",
subtitle = "Count of Customers by Payment Method",
x = "Payment Method", y = "Count") +
theme(axis.text.x = element_text(angle = 30, hjust = 1))
Customers paying with electronic checks have a notably higher churn rate
compared to those using automatic payment methods.
# Paperless billing by Status
ggplot(data, aes(x = PaperlessBilling, fill = Status)) +
geom_bar(position = position_dodge(width = 0.7)) +
theme_classic(base_size = 13) +
scale_fill_manual(values = c("Left" = "#F8766D", "Current" = "#00BFC4")) +
labs(title = "Paperless Billing Preference vs Customer Status",
subtitle = "Comparison by Paperless Billing Preference",
x = "Paperless Billing", y = "Number of Customers")
Churn is significantly higher among paperless users, suggesting a
possible connection between paperless billing and higher churn.
# Train/test split
set.seed(123)
split <- initial_split(data, prop = 0.7, strata = Status)
train <- training(split) %>% na.omit()
test <- testing(split) %>% na.omit()
# Preprocessing recipe
recipe <- recipe(Status ~ ., data = train) %>%
step_YeoJohnson(all_numeric_predictors()) %>%
step_normalize(all_numeric_predictors())
# MARS model
mars_mod <- mars(num_terms = tune(), prod_degree = tune()) %>%
set_mode('classification')
# Cross-validation
set.seed(123)
kfolds <- vfold_cv(train, v = 5, strata = Status)
# Grid search
mars_grid <- grid_regular(num_terms(c(1,30)), prod_degree(), levels = 25)
# Workflow
customer_retention_wf <- workflow() %>%
add_recipe(recipe) %>%
add_model(mars_mod)
# Hyperparameter tuning
tuning_results <- customer_retention_wf %>%
tune_grid(resamples = kfolds, grid = mars_grid)
# Best model
best_hyperparameters <- select_best(tuning_results, metric = "roc_auc")
final_wf <- customer_retention_wf %>%
finalize_workflow(best_hyperparameters)
mars_final_fit <- final_wf %>%
fit(data = train)
# VIP plot
mars_final_fit %>%
extract_fit_parsnip() %>%
vip(aesthetics = list(fill = "darkorchid3"))
The MARS model achieved the highest AUC score among the tested models,
identifying tenure, contract type, and spending patterns as major
factors.
# Predictions
churn_probabilities <- predict(mars_final_fit, new_data = data, type = "prob") %>%
bind_cols(data) %>%
select(everything()) %>%
arrange(desc(.pred_Left))
# Average churn probability
churn_probabilities %>%
group_by(Status) %>%
summarise(average = mean(.pred_Left, na.rm = TRUE))
## # A tibble: 2 × 2
## Status average
## <chr> <dbl>
## 1 Current 0.181
## 2 Left 0.499
# Customers likely to churn
churnned_cust <- churn_probabilities %>%
na.omit() %>%
filter(Status == "Current") %>%
filter(.pred_Left > 0.70)
# Revenue loss
churnned_cust %>%
summarise(total = sum(MonthlyCharges))
## # A tibble: 1 × 1
## total
## <dbl>
## 1 9803.
If no interventions are made, we anticipate 117 customers leaving within the next month, resulting in a potential revenue loss of $9,803.25.
We recommend introducing a loyalty program with milestone-based rewards and investing in improved customer service outreach.