#Introduction Regork Telecom recently entered the telecommunications market, where customer retention is critical due to the high cost of acquiring new customers compared to retaining existing ones. This report addresses the pressing issue of customer churn and proposes a predictive modeling approach to identify customers likely to leave. By focusing on influential predictors of churn, we aim to enable targeted retention strategies, minimize revenue loss, and enhance customer satisfaction.

This analysis leverages a dataset containing customer demographics, service usage, and billing information. Using exploratory data analysis (EDA) and machine learning, we identify trends, build predictive models, and propose actionable business solutions to reduce churn. #Analysis/EDA

library(tidyverse)
library(caret)
library(ggplot2)
getwd()
## [1] "C:/Users/Aaron/OneDrive/Documents"
data <- read.csv("customer_retention.csv")

data$TotalCharges <- as.numeric(as.character(data$TotalCharges))
data <- na.omit(data)

# Summary of dataset
summary(data)
##     Gender          SeniorCitizen      Partner           Dependents       
##  Length:6988        Min.   :0.0000   Length:6988        Length:6988       
##  Class :character   1st Qu.:0.0000   Class :character   Class :character  
##  Mode  :character   Median :0.0000   Mode  :character   Mode  :character  
##                     Mean   :0.1621                                        
##                     3rd Qu.:0.0000                                        
##                     Max.   :1.0000                                        
##      Tenure      PhoneService       MultipleLines      InternetService   
##  Min.   : 1.00   Length:6988        Length:6988        Length:6988       
##  1st Qu.: 9.00   Class :character   Class :character   Class :character  
##  Median :29.00   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :32.43                                                           
##  3rd Qu.:55.00                                                           
##  Max.   :72.00                                                           
##  OnlineSecurity     OnlineBackup       DeviceProtection   TechSupport       
##  Length:6988        Length:6988        Length:6988        Length:6988       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  StreamingTV        StreamingMovies      Contract         PaperlessBilling  
##  Length:6988        Length:6988        Length:6988        Length:6988       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  PaymentMethod      MonthlyCharges    TotalCharges       Status         
##  Length:6988        Min.   : 18.25   Min.   :  18.8   Length:6988       
##  Class :character   1st Qu.: 35.54   1st Qu.: 401.9   Class :character  
##  Mode  :character   Median : 70.35   Median :1397.5   Mode  :character  
##                     Mean   : 64.79   Mean   :2283.1                     
##                     3rd Qu.: 89.90   3rd Qu.:3796.9                     
##                     Max.   :118.75   Max.   :8684.8
churn_plot <- ggplot(data, aes(x = Status)) +
  geom_bar(fill = "steelblue") +
  labs(title = "Churn Distribution", x = "Customer Status", y = "Count") +
  theme_minimal()

print(churn_plot)

tenure_payment_plot <- ggplot(data, aes(x = PaymentMethod, y = Tenure)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Tenure by Payment Method", x = "Payment Method", y = "Tenure (Months)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(tenure_payment_plot)

monthly_charges_plot <- ggplot(data, aes(x = Status, y = MonthlyCharges, fill = Status)) +
  geom_boxplot() +
  labs(title = "Monthly Charges by Churn Status", x = "Customer Status", y = "Monthly Charges") +
  theme_minimal()

print(monthly_charges_plot)

internet_service_plot <- ggplot(data, aes(x = InternetService, fill = Status)) +
  geom_bar(position = "fill") +
  labs(title = "Churn Proportion by Internet Service", x = "Internet Service", y = "Proportion") +
  scale_fill_manual(values = c("steelblue", "orange")) +
  theme_minimal()

print(internet_service_plot)

#Machine Learning

set.seed(123)

# Train-test split
trainIndex <- createDataPartition(data$Status, p = 0.8, list = FALSE)
trainData <- data[trainIndex, ]
testData <- data[-trainIndex, ]
logit_model <- train(Status ~ ., data = trainData, method = "glm", family = "binomial")

# Model Summary
summary(logit_model)
## 
## Call:
## NULL
## 
## Coefficients: (7 not defined because of singularities)
##                                          Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             1.622e+00  9.102e-01   1.782   0.0747
## GenderMale                             -4.861e-02  7.306e-02  -0.665   0.5059
## SeniorCitizen                           2.415e-01  9.532e-02   2.533   0.0113
## PartnerYes                             -4.389e-02  8.690e-02  -0.505   0.6135
## DependentsYes                          -1.269e-01  1.001e-01  -1.267   0.2051
## Tenure                                 -6.737e-02  7.157e-03  -9.413  < 2e-16
## PhoneServiceYes                         2.171e-01  7.258e-01   0.299   0.7648
## `MultipleLinesNo phone service`                NA         NA      NA       NA
## MultipleLinesYes                        4.588e-01  1.982e-01   2.315   0.0206
## `InternetServiceFiber optic`            1.943e+00  8.927e-01   2.177   0.0295
## InternetServiceNo                      -2.010e+00  9.029e-01  -2.227   0.0260
## `OnlineSecurityNo internet service`            NA         NA      NA       NA
## OnlineSecurityYes                      -1.936e-01  1.993e-01  -0.972   0.3312
## `OnlineBackupNo internet service`              NA         NA      NA       NA
## OnlineBackupYes                         3.680e-02  1.963e-01   0.187   0.8513
## `DeviceProtectionNo internet service`          NA         NA      NA       NA
## DeviceProtectionYes                     2.006e-01  1.974e-01   1.016   0.3096
## `TechSupportNo internet service`               NA         NA      NA       NA
## TechSupportYes                         -1.157e-01  2.023e-01  -0.572   0.5675
## `StreamingTVNo internet service`               NA         NA      NA       NA
## StreamingTVYes                          6.550e-01  3.652e-01   1.793   0.0729
## `StreamingMoviesNo internet service`           NA         NA      NA       NA
## StreamingMoviesYes                      7.040e-01  3.640e-01   1.934   0.0531
## `ContractOne year`                     -6.147e-01  1.200e-01  -5.123 3.00e-07
## `ContractTwo year`                     -1.312e+00  1.955e-01  -6.709 1.96e-11
## PaperlessBillingYes                     3.703e-01  8.363e-02   4.428 9.50e-06
## `PaymentMethodCredit card (automatic)` -8.184e-02  1.264e-01  -0.648   0.5173
## `PaymentMethodElectronic check`         2.188e-01  1.050e-01   2.084   0.0372
## `PaymentMethodMailed check`            -2.259e-01  1.290e-01  -1.751   0.0800
## MonthlyCharges                         -4.877e-02  3.549e-02  -1.374   0.1694
## TotalCharges                            3.927e-04  8.051e-05   4.878 1.07e-06
##                                           
## (Intercept)                            .  
## GenderMale                                
## SeniorCitizen                          *  
## PartnerYes                                
## DependentsYes                             
## Tenure                                 ***
## PhoneServiceYes                           
## `MultipleLinesNo phone service`           
## MultipleLinesYes                       *  
## `InternetServiceFiber optic`           *  
## InternetServiceNo                      *  
## `OnlineSecurityNo internet service`       
## OnlineSecurityYes                         
## `OnlineBackupNo internet service`         
## OnlineBackupYes                           
## `DeviceProtectionNo internet service`     
## DeviceProtectionYes                       
## `TechSupportNo internet service`          
## TechSupportYes                            
## `StreamingTVNo internet service`          
## StreamingTVYes                         .  
## `StreamingMoviesNo internet service`      
## StreamingMoviesYes                     .  
## `ContractOne year`                     ***
## `ContractTwo year`                     ***
## PaperlessBillingYes                    ***
## `PaymentMethodCredit card (automatic)`    
## `PaymentMethodElectronic check`        *  
## `PaymentMethodMailed check`            .  
## MonthlyCharges                            
## TotalCharges                           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6472.6  on 5590  degrees of freedom
## Residual deviance: 4597.7  on 5567  degrees of freedom
## AIC: 4645.7
## 
## Number of Fisher Scoring iterations: 6
library(rpart.plot)
tree_model <- train(Status ~ ., data = trainData, method = "rpart", trControl = trainControl(method = "cv", number = 5))

# Plot Decision Tree
rpart.plot::rpart.plot(tree_model$finalModel)

# Print the tree model summary
print(tree_model)
## CART 
## 
## 5591 samples
##   19 predictor
##    2 classes: 'Current', 'Left' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4473, 4473, 4473, 4472, 4473 
## Resampling results across tuning parameters:
## 
##   cp           Accuracy   Kappa    
##   0.003367003  0.7905540  0.4008617
##   0.005836139  0.7909124  0.4073335
##   0.107407407  0.7524625  0.1424784
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.005836139.
### Random Forest

# Ensure Random Forest model works with tuning and correct parameters
library(randomForest)

# Train the Random Forest model
rf_model <- train(
  Status ~ ., 
  data = trainData, 
  method = "rf", 
  trControl = trainControl(method = "cv", number = 5), # 5-fold cross-validation
  tuneGrid = expand.grid(mtry = c(1:5)),  # Hyperparameter tuning for mtry
  importance = TRUE
)

# Print the Random Forest model summary
print(rf_model)
## Random Forest 
## 
## 5591 samples
##   19 predictor
##    2 classes: 'Current', 'Left' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 4472, 4473, 4473, 4473, 4473 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa      
##   1     0.7351103  0.006434559
##   2     0.7901983  0.358283757
##   3     0.8009301  0.437008487
##   4     0.8012882  0.449593691
##   5     0.8002157  0.448879265
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 4.
### Model Comparison

# Ensure all the necessary libraries are loaded

# Set seed for reproducibility
set.seed(123)

# Logistic Regression
log_reg_model <- train(
  Status ~ ., 
  data = trainData, 
  method = "glm", 
  family = "binomial", 
  trControl = trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary),
  metric = "ROC"
)

# Decision Tree
tree_model <- train(
  Status ~ ., 
  data = trainData, 
  method = "rpart", 
  trControl = trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary),
  tuneLength = 10,
  metric = "ROC"
)

# Random Forest
rf_model <- train(
  Status ~ ., 
  data = trainData, 
  method = "rf", 
  trControl = trainControl(method = "cv", number = 5, classProbs = TRUE, summaryFunction = twoClassSummary),
  tuneGrid = expand.grid(mtry = c(1:5)), 
  metric = "ROC",
  importance = TRUE
)

# Collect model results for comparison
results <- resamples(list(
  Logistic_Regression = log_reg_model,
  Decision_Tree = tree_model,
  Random_Forest = rf_model
))

# Print model comparison summary
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: Logistic_Regression, Decision_Tree, Random_Forest 
## Number of resamples: 5 
## 
## ROC 
##                          Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Logistic_Regression 0.8239090 0.8488006 0.8499308 0.8478857 0.8530863 0.8637020
## Decision_Tree       0.7970673 0.7981047 0.8137444 0.8104168 0.8152352 0.8279322
## Random_Forest       0.8198899 0.8260566 0.8316970 0.8401187 0.8565333 0.8664169
##                     NA's
## Logistic_Regression    0
## Decision_Tree          0
## Random_Forest          0
## 
## Sens 
##                          Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Logistic_Regression 0.8842875 0.8844282 0.8867235 0.8918674 0.8989038 0.9049939
## Decision_Tree       0.8952497 0.9013398 0.9013398 0.9030685 0.9038929 0.9135201
## Random_Forest       0.8915956 0.8990268 0.9037759 0.9074546 0.9159562 0.9269184
##                     NA's
## Logistic_Regression    0
## Decision_Tree          0
## Random_Forest          0
## 
## Spec 
##                          Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
## Logistic_Regression 0.4949495 0.5185185 0.5488215 0.5515152 0.5959596 0.5993266
## Decision_Tree       0.4646465 0.4680135 0.4848485 0.4895623 0.4915825 0.5387205
## Random_Forest       0.4747475 0.4983165 0.5185185 0.5111111 0.5218855 0.5420875
##                     NA's
## Logistic_Regression    0
## Decision_Tree          0
## Random_Forest          0
# Plot the comparison of models based on ROC (AUC)
bwplot(results)

# Predict churn probability
probabilities <- predict(rf_model, newdata = testData, type = "prob")[, "Left"]
testData$Predicted_Prob <- probabilities
testData$Predicted_Status <- ifelse(probabilities > 0.5, "Left", "Current")

# Revenue loss calculation
churners <- testData %>% filter(Predicted_Status == "Left")
monthly_loss <- sum(churners$MonthlyCharges)
cat("Predicted monthly revenue loss: $", round(monthly_loss, 2))
## Predicted monthly revenue loss: $ 22357.5
# Cost of incentives
discount_rate <- 0.2
incentive_cost <- sum(churners$MonthlyCharges * discount_rate * 3)
cat("Incentive cost for 3 months: $", round(incentive_cost, 2))
## Incentive cost for 3 months: $ 13414.5
# Retained revenue
retained_revenue <- sum(churners$MonthlyCharges)
cat("Potential retained revenue: $", round(retained_revenue, 2))
## Potential retained revenue: $ 22357.5

#Reccomondation Focus retention efforts on customers with short tenures, high monthly charges, and contract flexibility. Offer targeted discounts to customers using month-to-month contracts or high-risk payment methods. Monitor and improve services tied to internet and phone lines, as these show strong churn correlations. #Summary and conclusion Churn Predictors: Key predictors include contract type, tenure, and monthly charges. Proposed Action: Offer targeted discounts to high-risk customers, reducing churn and preserving revenue. Expected ROI: The benefit of retained revenue outweighs the cost of incentives. Limitations: Future models can incorporate real-time data and customer feedback for improved accuracy.

Our analysis highlights critical factors driving customer churn and provides actionable recommendations to Regork Telecom. Implementing these insights can significantly improve customer retention, reduce revenue loss, and build long-term customer loyalty

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.