1. Data Preparation & EDA

data <- read.csv("customer_retention.csv", stringsAsFactors = TRUE)
summary(data)
##     Gender     SeniorCitizen    Partner    Dependents     Tenure     
##  Female:3467   Min.   :0.0000   No :3613   No :4894   Min.   : 0.00  
##  Male  :3532   1st Qu.:0.0000   Yes:3386   Yes:2105   1st Qu.: 9.00  
##                Median :0.0000                         Median :29.00  
##                Mean   :0.1619                         Mean   :32.38  
##                3rd Qu.:0.0000                         3rd Qu.:55.00  
##                Max.   :1.0000                         Max.   :72.00  
##                                                                      
##  PhoneService          MultipleLines     InternetService
##  No : 676     No              :3371   DSL        :2405  
##  Yes:6323     No phone service: 676   Fiber optic:3075  
##               Yes             :2952   No         :1519  
##                                                         
##                                                         
##                                                         
##                                                         
##              OnlineSecurity              OnlineBackup 
##  No                 :3471   No                 :3070  
##  No internet service:1519   No internet service:1519  
##  Yes                :2009   Yes                :2410  
##                                                       
##                                                       
##                                                       
##                                                       
##             DeviceProtection              TechSupport  
##  No                 :3074    No                 :3448  
##  No internet service:1519    No internet service:1519  
##  Yes                :2406    Yes                :2032  
##                                                        
##                                                        
##                                                        
##                                                        
##               StreamingTV              StreamingMovies           Contract   
##  No                 :2792   No                 :2762   Month-to-month:3847  
##  No internet service:1519   No internet service:1519   One year      :1465  
##  Yes                :2688   Yes                :2718   Two year      :1687  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  PaperlessBilling                   PaymentMethod  MonthlyCharges  
##  No :2862         Bank transfer (automatic):1534   Min.   : 18.25  
##  Yes:4137         Credit card (automatic)  :1512   1st Qu.: 35.48  
##                   Electronic check         :2350   Median : 70.35  
##                   Mailed check             :1603   Mean   : 64.75  
##                                                    3rd Qu.: 89.85  
##                                                    Max.   :118.75  
##                                                                    
##   TotalCharges        Status    
##  Min.   :  18.8   Current:5143  
##  1st Qu.: 401.9   Left   :1856  
##  Median :1397.5                 
##  Mean   :2283.1                 
##  3rd Qu.:3796.9                 
##  Max.   :8684.8                 
##  NA's   :11
# Handle missing values in TotalCharges
data <- data[!is.na(data$TotalCharges), ]

# Remove irrelevant levels
cols_to_clean <- c("MultipleLines", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
                   "TechSupport", "StreamingTV", "StreamingMovies")
for (col in cols_to_clean) {
  levels(data[[col]]) <- gsub("No internet service|No phone service", "No", levels(data[[col]]))
}

# Convert target to factor with levels 0 (Current), 1 (Left)
data$Status <- factor(ifelse(data$Status == "Left", 1, 0), levels = c(0, 1))
# Churn rate
prop.table(table(data$Status))
## 
##         0         1 
## 0.7344018 0.2655982
# Churn by contract type
ggplot(data, aes(x = Contract, fill = Status)) + 
  geom_bar(position = "fill") + 
  ylab("Proportion") + 
  ggtitle("Churn Rate by Contract Type")

# MonthlyCharges vs Status
ggplot(data, aes(x = Status, y = MonthlyCharges, fill = Status)) + 
  geom_boxplot() + 
  ggtitle("Monthly Charges by Customer Status")

2. Machine Learning

set.seed(123)
trainIndex <- createDataPartition(data$Status, p = 0.7, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
log_model <- glm(Status ~ ., data = train, family = binomial)
summary(log_model)
## 
## Call:
## glm(formula = Status ~ ., family = binomial, data = train)
## 
## Coefficients:
##                                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                           8.631e-01  9.774e-01   0.883 0.377194    
## GenderMale                           -5.320e-02  7.839e-02  -0.679 0.497336    
## SeniorCitizen                         1.670e-01  1.023e-01   1.633 0.102424    
## PartnerYes                           -9.239e-02  9.311e-02  -0.992 0.321075    
## DependentsYes                        -7.577e-02  1.081e-01  -0.701 0.483469    
## Tenure                               -6.247e-02  7.684e-03  -8.130 4.31e-16 ***
## PhoneServiceYes                      -3.578e-01  7.810e-01  -0.458 0.646828    
## MultipleLinesYes                      3.833e-01  2.114e-01   1.813 0.069767 .  
## InternetServiceFiber optic            1.305e+00  9.581e-01   1.362 0.173047    
## InternetServiceNo                    -1.259e+00  9.701e-01  -1.298 0.194333    
## OnlineSecurityYes                    -3.364e-01  2.139e-01  -1.573 0.115734    
## OnlineBackupYes                      -1.085e-01  2.099e-01  -0.517 0.605244    
## DeviceProtectionYes                   1.578e-01  2.117e-01   0.746 0.455881    
## TechSupportYes                       -2.552e-01  2.171e-01  -1.176 0.239764    
## StreamingTVYes                        3.414e-01  3.909e-01   0.873 0.382555    
## StreamingMoviesYes                    4.594e-01  3.905e-01   1.176 0.239492    
## ContractOne year                     -7.783e-01  1.315e-01  -5.920 3.22e-09 ***
## ContractTwo year                     -1.450e+00  2.106e-01  -6.887 5.70e-12 ***
## PaperlessBillingYes                   3.377e-01  9.002e-02   3.752 0.000176 ***
## PaymentMethodCredit card (automatic) -1.508e-01  1.372e-01  -1.099 0.271908    
## PaymentMethodElectronic check         2.561e-01  1.121e-01   2.285 0.022323 *  
## PaymentMethodMailed check            -1.726e-01  1.377e-01  -1.254 0.210012    
## MonthlyCharges                       -2.069e-02  3.806e-02  -0.544 0.586691    
## TotalCharges                          3.514e-04  8.677e-05   4.050 5.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5665.3  on 4892  degrees of freedom
## Residual deviance: 3995.9  on 4869  degrees of freedom
## AIC: 4043.9
## 
## Number of Fisher Scoring iterations: 6
log_pred <- predict(log_model, newdata = test, type = "response")
log_roc <- roc(test$Status, log_pred)
log_auc <- auc(log_roc)
log_auc
## Area under the curve: 0.8344
tree_model <- rpart(Status ~ ., data = train, method = "class", cp = 0.01)
prp(tree_model)

tree_pred <- predict(tree_model, test, type = "prob")[,2]
tree_roc <- roc(test$Status, tree_pred)
tree_auc <- auc(tree_roc)
tree_auc
## Area under the curve: 0.7953
rf_model <- randomForest(Status ~ ., data = train, ntree = 100)
rf_pred <- predict(rf_model, test, type = "prob")[,2]
rf_roc <- roc(test$Status, rf_pred)
rf_auc <- auc(rf_roc)
rf_auc
## Area under the curve: 0.8176
varImpPlot(rf_model)

data.frame(Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
           AUC = c(log_auc, tree_auc, rf_auc))
##                 Model       AUC
## 1 Logistic Regression 0.8344476
## 2       Decision Tree 0.7953456
## 3       Random Forest 0.8176126

3. Business Analysis & Conclusion

# Predicted probabilities and classification using optimal model (Random Forest)
predicted <- ifelse(rf_pred > 0.5, 1, 0)
churn_customers <- test[predicted == 1, ]
predicted_loss <- sum(churn_customers$MonthlyCharges)
predicted_loss
## [1] 33908.4
# Assume $20 incentive/month to retain each customer
cost <- 20 * nrow(churn_customers)
benefit <- predicted_loss
net_benefit <- benefit - cost
net_benefit
## [1] 25088.4
## Based on my analysis, the Random Forest model achieved the highest AUC of 0.818 . I recommend targeting customers with month-to-month contracts and high monthly charges, as these features strongly predict churn. Offering a $20/month retention incentive to predicted churners is expected to yield a net monthly savings of $ 25088.4 .