This analysis focuses on predicting customer retention for Regork’s telecommunications business using various machine learning models and comprehensive data visualization techniques.
library(tidymodels)
library(tidyverse)
library(knitr)
library(baguette)
library(vip)
library(pdp)
library(here)
library(kernlab)
library(ggplot2)
# Read and prepare the data
df <- read.csv("customer_retention.csv")
df <- mutate(df, Status = factor(Status))
df <- na.omit(df)
# Customer Tenure by Contract Type
p1 <- ggplot(df, aes(Tenure)) +
geom_bar(fill = "green") +
facet_wrap(~Contract) +
coord_flip() +
ggtitle("Customer Tenure and Length of Contract") +
labs(y = "Count of Contract Type", x = "Customer Tenure") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
strip.text = element_text(face = "bold")
)
print(p1)
# Age Group and Paperless Billing Analysis
p2 <- ggplot(df, aes(x = factor(SeniorCitizen, labels = c("Non-Senior", "Senior")),
fill = PaperlessBilling)) +
geom_bar(position = position_dodge(width = 0.9), width = 0.8) +
scale_fill_manual(values = c("Blue", "LightGreen")) +
ggtitle("Customers by Age Group and Paperless Billing") +
labs(y = "Number of Customers", x = "Age Group", fill = "Paperless Billing") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "bottom",
panel.grid.major.x = element_blank(),
axis.text.x = element_text(face = "bold")
)
print(p2)
# Multiple Lines Analysis by Contract Type
p3 <- ggplot(df, aes(MultipleLines)) +
geom_bar(fill = "blue", color = "green") +
facet_wrap(~Contract) +
ggtitle("Multiple Lines Distribution by Contract Type") +
labs(y = "Count of Customers", x = "Multiple Lines") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold"),
strip.text = element_text(face = "bold")
)
print(p3)
# Set seed for reproducibility
set.seed(123)
# Create training/testing splits
logistic_split <- initial_split(df, prop = .5, strata = Status)
logistic_train <- training(logistic_split)
logistic_test <- testing(logistic_split)
# Create cross-validation folds
logistic_kfolds <- vfold_cv(logistic_train, v = 4, strata = Status)
# Create the model recipe
retention_recipe <- recipe(Status ~ ., data = logistic_train) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors())
# Create and fit decision tree model
dt_mod <- decision_tree(mode = "classification") %>%
set_engine("rpart")
dt_workflow <- workflow() %>%
add_recipe(retention_recipe) %>%
add_model(dt_mod)
dt_fit <- dt_workflow %>%
fit(data = logistic_train)
dt_results <- fit_resamples(dt_mod, retention_recipe, logistic_kfolds)
# Create and fit random forest model
rf_split <- initial_split(df, prop = 0.5, strata = Status)
rf_train <- training(rf_split)
rf_test <- testing(rf_split)
rf_recipe <- recipe(Status ~ ., data = rf_train)
rf_mod <- rand_forest(mode = "classification") %>%
set_engine("ranger")
rf_workflow <- workflow() %>%
add_recipe(rf_recipe) %>%
add_model(rf_mod)
rf_fit <- rf_workflow %>%
fit(data = rf_train)
# Collect and display model metrics
model_metrics <- collect_metrics(dt_results) %>%
mutate(model = "Decision Tree")
knitr::kable(model_metrics,
caption = "Model Performance Metrics",
digits = 3)
.metric | .estimator | mean | n | std_err | .config | model |
---|---|---|---|---|---|---|
accuracy | binary | 0.792 | 4 | 0.004 | Preprocessor1_Model1 | Decision Tree |
brier_class | binary | 0.158 | 4 | 0.002 | Preprocessor1_Model1 | Decision Tree |
roc_auc | binary | 0.716 | 4 | 0.013 | Preprocessor1_Model1 | Decision Tree |