1. Data Preparation & EDA
data <- read.csv("customer_retention.csv", stringsAsFactors = TRUE)
summary(data)
## Gender SeniorCitizen Partner Dependents Tenure
## Female:3467 Min. :0.0000 No :3613 No :4894 Min. : 0.00
## Male :3532 1st Qu.:0.0000 Yes:3386 Yes:2105 1st Qu.: 9.00
## Median :0.0000 Median :29.00
## Mean :0.1619 Mean :32.38
## 3rd Qu.:0.0000 3rd Qu.:55.00
## Max. :1.0000 Max. :72.00
##
## PhoneService MultipleLines InternetService
## No : 676 No :3371 DSL :2405
## Yes:6323 No phone service: 676 Fiber optic:3075
## Yes :2952 No :1519
##
##
##
##
## OnlineSecurity OnlineBackup
## No :3471 No :3070
## No internet service:1519 No internet service:1519
## Yes :2009 Yes :2410
##
##
##
##
## DeviceProtection TechSupport
## No :3074 No :3448
## No internet service:1519 No internet service:1519
## Yes :2406 Yes :2032
##
##
##
##
## StreamingTV StreamingMovies Contract
## No :2792 No :2762 Month-to-month:3847
## No internet service:1519 No internet service:1519 One year :1465
## Yes :2688 Yes :2718 Two year :1687
##
##
##
##
## PaperlessBilling PaymentMethod MonthlyCharges
## No :2862 Bank transfer (automatic):1534 Min. : 18.25
## Yes:4137 Credit card (automatic) :1512 1st Qu.: 35.48
## Electronic check :2350 Median : 70.35
## Mailed check :1603 Mean : 64.75
## 3rd Qu.: 89.85
## Max. :118.75
##
## TotalCharges Status
## Min. : 18.8 Current:5143
## 1st Qu.: 401.9 Left :1856
## Median :1397.5
## Mean :2283.1
## 3rd Qu.:3796.9
## Max. :8684.8
## NA's :11
# Handle missing values in TotalCharges
data <- data[!is.na(data$TotalCharges), ]
# Remove irrelevant levels
cols_to_clean <- c("MultipleLines", "OnlineSecurity", "OnlineBackup", "DeviceProtection",
"TechSupport", "StreamingTV", "StreamingMovies")
for (col in cols_to_clean) {
levels(data[[col]]) <- gsub("No internet service|No phone service", "No", levels(data[[col]]))
}
# Convert target to factor with levels 0 (Current), 1 (Left)
data$Status <- factor(ifelse(data$Status == "Left", 1, 0), levels = c(0, 1))
# Churn rate
prop.table(table(data$Status))
##
## 0 1
## 0.7344018 0.2655982
# Churn by contract type
ggplot(data, aes(x = Contract, fill = Status)) +
geom_bar(position = "fill") +
ylab("Proportion") +
ggtitle("Churn Rate by Contract Type")

# MonthlyCharges vs Status
ggplot(data, aes(x = Status, y = MonthlyCharges, fill = Status)) +
geom_boxplot() +
ggtitle("Monthly Charges by Customer Status")

2. Machine Learning
set.seed(123)
trainIndex <- createDataPartition(data$Status, p = 0.7, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
log_model <- glm(Status ~ ., data = train, family = binomial)
summary(log_model)
##
## Call:
## glm(formula = Status ~ ., family = binomial, data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.631e-01 9.774e-01 0.883 0.377194
## GenderMale -5.320e-02 7.839e-02 -0.679 0.497336
## SeniorCitizen 1.670e-01 1.023e-01 1.633 0.102424
## PartnerYes -9.239e-02 9.311e-02 -0.992 0.321075
## DependentsYes -7.577e-02 1.081e-01 -0.701 0.483469
## Tenure -6.247e-02 7.684e-03 -8.130 4.31e-16 ***
## PhoneServiceYes -3.578e-01 7.810e-01 -0.458 0.646828
## MultipleLinesYes 3.833e-01 2.114e-01 1.813 0.069767 .
## InternetServiceFiber optic 1.305e+00 9.581e-01 1.362 0.173047
## InternetServiceNo -1.259e+00 9.701e-01 -1.298 0.194333
## OnlineSecurityYes -3.364e-01 2.139e-01 -1.573 0.115734
## OnlineBackupYes -1.085e-01 2.099e-01 -0.517 0.605244
## DeviceProtectionYes 1.578e-01 2.117e-01 0.746 0.455881
## TechSupportYes -2.552e-01 2.171e-01 -1.176 0.239764
## StreamingTVYes 3.414e-01 3.909e-01 0.873 0.382555
## StreamingMoviesYes 4.594e-01 3.905e-01 1.176 0.239492
## ContractOne year -7.783e-01 1.315e-01 -5.920 3.22e-09 ***
## ContractTwo year -1.450e+00 2.106e-01 -6.887 5.70e-12 ***
## PaperlessBillingYes 3.377e-01 9.002e-02 3.752 0.000176 ***
## PaymentMethodCredit card (automatic) -1.508e-01 1.372e-01 -1.099 0.271908
## PaymentMethodElectronic check 2.561e-01 1.121e-01 2.285 0.022323 *
## PaymentMethodMailed check -1.726e-01 1.377e-01 -1.254 0.210012
## MonthlyCharges -2.069e-02 3.806e-02 -0.544 0.586691
## TotalCharges 3.514e-04 8.677e-05 4.050 5.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5665.3 on 4892 degrees of freedom
## Residual deviance: 3995.9 on 4869 degrees of freedom
## AIC: 4043.9
##
## Number of Fisher Scoring iterations: 6
log_pred <- predict(log_model, newdata = test, type = "response")
log_roc <- roc(test$Status, log_pred)
log_auc <- auc(log_roc)
log_auc
## Area under the curve: 0.8344
tree_model <- rpart(Status ~ ., data = train, method = "class", cp = 0.01)
prp(tree_model)

tree_pred <- predict(tree_model, test, type = "prob")[,2]
tree_roc <- roc(test$Status, tree_pred)
tree_auc <- auc(tree_roc)
tree_auc
## Area under the curve: 0.7953
rf_model <- randomForest(Status ~ ., data = train, ntree = 100)
rf_pred <- predict(rf_model, test, type = "prob")[,2]
rf_roc <- roc(test$Status, rf_pred)
rf_auc <- auc(rf_roc)
rf_auc
## Area under the curve: 0.8176
varImpPlot(rf_model)

data.frame(Model = c("Logistic Regression", "Decision Tree", "Random Forest"),
AUC = c(log_auc, tree_auc, rf_auc))
## Model AUC
## 1 Logistic Regression 0.8344476
## 2 Decision Tree 0.7953456
## 3 Random Forest 0.8176126
3. Business Analysis & Conclusion
# Predicted probabilities and classification using optimal model (Random Forest)
predicted <- ifelse(rf_pred > 0.5, 1, 0)
churn_customers <- test[predicted == 1, ]
predicted_loss <- sum(churn_customers$MonthlyCharges)
predicted_loss
## [1] 33908.4
# Assume $20 incentive/month to retain each customer
cost <- 20 * nrow(churn_customers)
benefit <- predicted_loss
net_benefit <- benefit - cost
net_benefit
## [1] 25088.4
## Based on my analysis, the Random Forest model achieved the highest AUC of 0.818 . I recommend targeting customers with month-to-month contracts and high monthly charges, as these features strongly predict churn. Offering a $20/month retention incentive to predicted churners is expected to yield a net monthly savings of $ 25088.4 .