#Introduction Regork Telecom recently entered the telecommunications market, where customer retention is critical due to the high cost of acquiring new customers compared to retaining existing ones. This report addresses the pressing issue of customer churn and proposes a predictive modeling approach to identify customers likely to leave. By focusing on influential predictors of churn, we aim to enable targeted retention strategies, minimize revenue loss, and enhance customer satisfaction.
This analysis leverages a dataset containing customer demographics, service usage, and billing information. Using exploratory data analysis (EDA) and machine learning, we identify trends, build predictive models, and propose actionable business solutions to reduce churn. #Analysis/EDA
library(tidyverse)
library(caret)
library(ggplot2)
getwd()
## [1] "C:/Users/Aaron/OneDrive/Documents"
data <- read.csv("customer_retention.csv")
data$TotalCharges <- as.numeric(as.character(data$TotalCharges))
data <- na.omit(data)
# Summary of dataset
summary(data)
## Gender SeniorCitizen Partner Dependents
## Length:6988 Min. :0.0000 Length:6988 Length:6988
## Class :character 1st Qu.:0.0000 Class :character Class :character
## Mode :character Median :0.0000 Mode :character Mode :character
## Mean :0.1621
## 3rd Qu.:0.0000
## Max. :1.0000
## Tenure PhoneService MultipleLines InternetService
## Min. : 1.00 Length:6988 Length:6988 Length:6988
## 1st Qu.: 9.00 Class :character Class :character Class :character
## Median :29.00 Mode :character Mode :character Mode :character
## Mean :32.43
## 3rd Qu.:55.00
## Max. :72.00
## OnlineSecurity OnlineBackup DeviceProtection TechSupport
## Length:6988 Length:6988 Length:6988 Length:6988
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## StreamingTV StreamingMovies Contract PaperlessBilling
## Length:6988 Length:6988 Length:6988 Length:6988
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## PaymentMethod MonthlyCharges TotalCharges Status
## Length:6988 Min. : 18.25 Min. : 18.8 Length:6988
## Class :character 1st Qu.: 35.54 1st Qu.: 401.9 Class :character
## Mode :character Median : 70.35 Median :1397.5 Mode :character
## Mean : 64.79 Mean :2283.1
## 3rd Qu.: 89.90 3rd Qu.:3796.9
## Max. :118.75 Max. :8684.8
churn_plot <- ggplot(data, aes(x = Status)) +
geom_bar(fill = "steelblue") +
labs(title = "Churn Distribution", x = "Customer Status", y = "Count") +
theme_minimal()
print(churn_plot)
tenure_payment_plot <- ggplot(data, aes(x = PaymentMethod, y = Tenure)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Tenure by Payment Method", x = "Payment Method", y = "Tenure (Months)") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(tenure_payment_plot)
monthly_charges_plot <- ggplot(data, aes(x = Status, y = MonthlyCharges, fill = Status)) +
geom_boxplot() +
labs(title = "Monthly Charges by Churn Status", x = "Customer Status", y = "Monthly Charges") +
theme_minimal()
print(monthly_charges_plot)
internet_service_plot <- ggplot(data, aes(x = InternetService, fill = Status)) +
geom_bar(position = "fill") +
labs(title = "Churn Proportion by Internet Service", x = "Internet Service", y = "Proportion") +
scale_fill_manual(values = c("steelblue", "orange")) +
theme_minimal()
print(internet_service_plot)
#Machine Learning
set.seed(123)
# Train-test split
trainIndex <- createDataPartition(data$Status, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
logit_model <- train(Status ~ ., data = trainData, method = "glm", family = "binomial")
# Model Summary
summary(logit_model)
##
## Call:
## NULL
##
## Coefficients: (7 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.622e+00 9.102e-01 1.782 0.0747
## GenderMale -4.861e-02 7.306e-02 -0.665 0.5059
## SeniorCitizen 2.415e-01 9.532e-02 2.533 0.0113
## PartnerYes -4.389e-02 8.690e-02 -0.505 0.6135
## DependentsYes -1.269e-01 1.001e-01 -1.267 0.2051
## Tenure -6.737e-02 7.157e-03 -9.413 < 2e-16
## PhoneServiceYes 2.171e-01 7.258e-01 0.299 0.7648
## `MultipleLinesNo phone service` NA NA NA NA
## MultipleLinesYes 4.588e-01 1.982e-01 2.315 0.0206
## `InternetServiceFiber optic` 1.943e+00 8.927e-01 2.177 0.0295
## InternetServiceNo -2.010e+00 9.029e-01 -2.227 0.0260
## `OnlineSecurityNo internet service` NA NA NA NA
## OnlineSecurityYes -1.936e-01 1.993e-01 -0.972 0.3312
## `OnlineBackupNo internet service` NA NA NA NA
## OnlineBackupYes 3.680e-02 1.963e-01 0.187 0.8513
## `DeviceProtectionNo internet service` NA NA NA NA
## DeviceProtectionYes 2.006e-01 1.974e-01 1.016 0.3096
## `TechSupportNo internet service` NA NA NA NA
## TechSupportYes -1.157e-01 2.023e-01 -0.572 0.5675
## `StreamingTVNo internet service` NA NA NA NA
## StreamingTVYes 6.550e-01 3.652e-01 1.793 0.0729
## `StreamingMoviesNo internet service` NA NA NA NA
## StreamingMoviesYes 7.040e-01 3.640e-01 1.934 0.0531
## `ContractOne year` -6.147e-01 1.200e-01 -5.123 3.00e-07
## `ContractTwo year` -1.312e+00 1.955e-01 -6.709 1.96e-11
## PaperlessBillingYes 3.703e-01 8.363e-02 4.428 9.50e-06
## `PaymentMethodCredit card (automatic)` -8.184e-02 1.264e-01 -0.648 0.5173
## `PaymentMethodElectronic check` 2.188e-01 1.050e-01 2.084 0.0372
## `PaymentMethodMailed check` -2.259e-01 1.290e-01 -1.751 0.0800
## MonthlyCharges -4.877e-02 3.549e-02 -1.374 0.1694
## TotalCharges 3.927e-04 8.051e-05 4.878 1.07e-06
##
## (Intercept) .
## GenderMale
## SeniorCitizen *
## PartnerYes
## DependentsYes
## Tenure ***
## PhoneServiceYes
## `MultipleLinesNo phone service`
## MultipleLinesYes *
## `InternetServiceFiber optic` *
## InternetServiceNo *
## `OnlineSecurityNo internet service`
## OnlineSecurityYes
## `OnlineBackupNo internet service`
## OnlineBackupYes
## `DeviceProtectionNo internet service`
## DeviceProtectionYes
## `TechSupportNo internet service`
## TechSupportYes
## `StreamingTVNo internet service`
## StreamingTVYes .
## `StreamingMoviesNo internet service`
## StreamingMoviesYes .
## `ContractOne year` ***
## `ContractTwo year` ***
## PaperlessBillingYes ***
## `PaymentMethodCredit card (automatic)`
## `PaymentMethodElectronic check` *
## `PaymentMethodMailed check` .
## MonthlyCharges
## TotalCharges ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6472.6 on 5590 degrees of freedom
## Residual deviance: 4597.7 on 5567 degrees of freedom
## AIC: 4645.7
##
## Number of Fisher Scoring iterations: 6
library(rpart.plot)
tree_model <- train(Status ~ ., data = trainData, method = "rpart", trControl = trainControl(method = "cv", number = 5))
# Plot Decision Tree
rpart.plot::rpart.plot(tree_model$finalModel)
# Print the tree model summary
print(tree_model)
## CART
##
## 5591 samples
## 19 predictor
## 2 classes: 'Current', 'Left'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4473, 4473, 4473, 4472, 4473
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.003367003 0.7905540 0.4008617
## 0.005836139 0.7909124 0.4073335
## 0.107407407 0.7524625 0.1424784
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005836139.
### Random Forest
# Ensure Random Forest model works with tuning and correct parameters
library(randomForest)
# Train the Random Forest model
rf_model <- train(
Status ~ .,
data = trainData,
method = "rf",
trControl = trainControl(method = "cv", number = 5), # 5-fold cross-validation
tuneGrid = expand.grid(mtry = c(1:5)), # Hyperparameter tuning for mtry
importance = TRUE
)
# Print the Random Forest model summary
print(rf_model)
## Random Forest
##
## 5591 samples
## 19 predictor
## 2 classes: 'Current', 'Left'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 4472, 4473, 4473, 4473, 4473
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 1 0.7351103 0.006434559
## 2 0.7901983 0.358283757
## 3 0.8009301 0.437008487
## 4 0.8012882 0.449593691
## 5 0.8002157 0.448879265
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
### Model Comparison
# Ensure all the necessary libraries are loaded
# Set seed for reproducibility
set.seed(123)
# Logistic Regression
log_reg_model <- train(
Status ~ .,
data = trainData,
method = "glm",
family = "binomial",
trControl = trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary),
metric = "ROC"
)
# Decision Tree
tree_model <- train(
Status ~ .,
data = trainData,
method = "rpart",
trControl = trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary),
tuneLength = 10,
metric = "ROC"
)
# Random Forest
rf_model <- train(
Status ~ .,
data = trainData,
method = "rf",
trControl = trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary),
tuneGrid = expand.grid(mtry = c(1:5)),
metric = "ROC",
importance = TRUE
)
# Collect model results for comparison
results <- resamples(list(
Logistic_Regression = log_reg_model,
Decision_Tree = tree_model,
Random_Forest = rf_model
))
# Print model comparison summary
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: Logistic_Regression, Decision_Tree, Random_Forest
## Number of resamples: 5
##
## ROC
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Logistic_Regression 0.8239090 0.8488006 0.8499308 0.8478857 0.8530863 0.8637020
## Decision_Tree 0.7970673 0.7981047 0.8137444 0.8104168 0.8152352 0.8279322
## Random_Forest 0.8198899 0.8260566 0.8316970 0.8401187 0.8565333 0.8664169
## NA's
## Logistic_Regression 0
## Decision_Tree 0
## Random_Forest 0
##
## Sens
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Logistic_Regression 0.8842875 0.8844282 0.8867235 0.8918674 0.8989038 0.9049939
## Decision_Tree 0.8952497 0.9013398 0.9013398 0.9030685 0.9038929 0.9135201
## Random_Forest 0.8915956 0.8990268 0.9037759 0.9074546 0.9159562 0.9269184
## NA's
## Logistic_Regression 0
## Decision_Tree 0
## Random_Forest 0
##
## Spec
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## Logistic_Regression 0.4949495 0.5185185 0.5488215 0.5515152 0.5959596 0.5993266
## Decision_Tree 0.4646465 0.4680135 0.4848485 0.4895623 0.4915825 0.5387205
## Random_Forest 0.4747475 0.4983165 0.5185185 0.5111111 0.5218855 0.5420875
## NA's
## Logistic_Regression 0
## Decision_Tree 0
## Random_Forest 0
# Plot the comparison of models based on ROC (AUC)
bwplot(results)
# Predict churn probability
probabilities <- predict(rf_model, newdata = testData, type = "prob")[, "Left"]
testData$Predicted_Prob <- probabilities
testData$Predicted_Status <- ifelse(probabilities > 0.5, "Left", "Current")
# Revenue loss calculation
churners <- testData %>% filter(Predicted_Status == "Left")
monthly_loss <- sum(churners$MonthlyCharges)
cat("Predicted monthly revenue loss: $", round(monthly_loss, 2))
## Predicted monthly revenue loss: $ 22357.5
# Cost of incentives
discount_rate <- 0.2
incentive_cost <- sum(churners$MonthlyCharges * discount_rate * 3)
cat("Incentive cost for 3 months: $", round(incentive_cost, 2))
## Incentive cost for 3 months: $ 13414.5
# Retained revenue
retained_revenue <- sum(churners$MonthlyCharges)
cat("Potential retained revenue: $", round(retained_revenue, 2))
## Potential retained revenue: $ 22357.5
#Reccomondation Focus retention efforts on customers with short tenures, high monthly charges, and contract flexibility. Offer targeted discounts to customers using month-to-month contracts or high-risk payment methods. Monitor and improve services tied to internet and phone lines, as these show strong churn correlations. #Summary and conclusion Churn Predictors: Key predictors include contract type, tenure, and monthly charges. Proposed Action: Offer targeted discounts to high-risk customers, reducing churn and preserving revenue. Expected ROI: The benefit of retained revenue outweighs the cost of incentives. Limitations: Future models can incorporate real-time data and customer feedback for improved accuracy.
Our analysis highlights critical factors driving customer churn and provides actionable recommendations to Regork Telecom. Implementing these insights can significantly improve customer retention, reduce revenue loss, and build long-term customer loyalty
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.