library(tidyverse)
library(caret)
library(randomForest)
library(knitr)
library(ggplot2)
library(corrplot)
library(pROC)
  1. Introduction

Customer churn is one of the most critical problems for subscription-based businesses. This project focuses on:

Understanding drivers of churn

Translating findings into marketing insights

Recommending strategic actions to reduce churn

Building predictive models to identify at-risk customers

The dataset contains information on customer demographics, service usage, billing, and churn outcome.

# Load the churn dataset.
# The location can be overridden with the CHURN_DATA_PATH environment
# variable so the report can be knit on another machine; when the variable is
# unset, the original author's path is used.
data_path <- Sys.getenv(
  "CHURN_DATA_PATH",
  unset = "C:\\Users\\kosci\\Documents\\rstudy\\churn_dataset.csv"
)
# Fail early with a readable message rather than read.csv()'s connection error.
if (!file.exists(data_path)) {
  stop("Churn dataset not found at: ", data_path, call. = FALSE)
}
df <- read.csv(data_path)
# Show column names, types and the first few values of each column.
str(df)
## 'data.frame':    500 obs. of  9 variables:
##  $ customer_id     : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ tenure          : int  45 48 65 68 68 10 22 37 71 13 ...
##  $ monthly_charges : num  45.7 58.2 89.7 109.7 32.1 ...
##  $ total_charges   : num  2057 2793 5831 7460 2183 ...
##  $ contract_type   : chr  "Month-to-month" "Month-to-month" "One year" "One year" ...
##  $ internet_service: chr  "DSL" "Fiber optic" "DSL" "Fiber optic" ...
##  $ support_calls   : int  2 3 1 1 3 3 7 2 1 3 ...
##  $ payment_method  : chr  "Bank transfer" "Bank transfer" "Bank transfer" "Electronic check" ...
##  $ churn           : int  0 0 0 1 1 0 1 0 1 0 ...
# Preview the first six rows of the raw data.
head(df, n = 6L)
##   customer_id tenure monthly_charges total_charges  contract_type
## 1           1     45           45.72       2057.40 Month-to-month
## 2           2     48           58.18       2792.64 Month-to-month
## 3           3     65           89.71       5831.15       One year
## 4           4     68          109.71       7460.28       One year
## 5           5     68           32.11       2183.48 Month-to-month
## 6           6     10           44.15        441.50       Two year
##   internet_service support_calls   payment_method churn
## 1              DSL             2    Bank transfer     0
## 2      Fiber optic             3    Bank transfer     0
## 3              DSL             1    Bank transfer     0
## 4      Fiber optic             1 Electronic check     1
## 5      Fiber optic             3    Bank transfer     1
## 6              DSL             3    Bank transfer     0
  2. Data Cleaning & Preparation
# Recode the modelling columns in a single pass:
#   * churn (0/1) becomes a factor labelled "No"/"Yes";
#   * the three character columns become factors;
#   * support_calls is converted from integer to double.
df <- transform(
  df,
  churn = factor(churn, levels = c(0, 1), labels = c("No", "Yes")),
  contract_type = factor(contract_type),
  internet_service = factor(internet_service),
  payment_method = factor(payment_method),
  support_calls = as.numeric(support_calls)
)

# Per-column summary: quartiles for numeric columns, level counts for factors.
summary(df)
##   customer_id        tenure     monthly_charges  total_charges    
##  Min.   :  1.0   Min.   : 1.0   Min.   : 20.12   Min.   :  22.45  
##  1st Qu.:125.8   1st Qu.:19.0   1st Qu.: 46.45   1st Qu.:1015.79  
##  Median :250.5   Median :37.0   Median : 71.73   Median :2247.24  
##  Mean   :250.5   Mean   :36.5   Mean   : 70.41   Mean   :2636.70  
##  3rd Qu.:375.2   3rd Qu.:54.0   3rd Qu.: 92.53   3rd Qu.:3859.70  
##  Max.   :500.0   Max.   :71.0   Max.   :119.74   Max.   :8351.00  
##         contract_type    internet_service support_calls  
##  Month-to-month:295   DSL        :202     Min.   :0.000  
##  One year      :125   Fiber optic:247     1st Qu.:1.000  
##  Two year      : 80   None       : 51     Median :2.000  
##                                           Mean   :2.026  
##                                           3rd Qu.:3.000  
##                                           Max.   :7.000  
##           payment_method churn    
##  Bank transfer   :172    No :384  
##  Credit card     :161    Yes:116  
##  Electronic check:167             
##                                   
##                                   
## 
  3. Exploratory Data Analysis (EDA)

3.1 Churn Distribution

# Bar chart: number of customers in each churn class.
ggplot(data = df) +
  geom_bar(mapping = aes(x = churn, fill = churn)) +
  labs(title = "Churn Distribution", y = "Count")

3.2 Churn by Contract Type

# Stacked bars scaled to 1, so each bar shows the churn share per contract type.
ggplot(data = df) +
  geom_bar(
    mapping = aes(x = contract_type, fill = churn),
    position = "fill"
  ) +
  labs(title = "Churn Rate by Contract Type", y = "Proportion")

Marketing Insight: Month-to-month contracts typically show the highest churn. Customers with yearly contracts are more committed and less price-sensitive.

3.3 Churn by Payment Method

# Stacked bars scaled to 1, so each bar shows the churn share per payment method.
# position = "fill" plots proportions, so the y axis is labelled accordingly
# (the default label would read "count"), matching the contract-type plot.
ggplot(df, aes(x = payment_method, fill = churn)) +
  geom_bar(position = "fill") +
  labs(title = "Churn by Payment Method", y = "Proportion")

Marketing Insight: Electronic check users often have the highest churn → payment friction might be contributing.

3.4 Relationship Between Monthly Charges & Churn

# Overlaid, semi-transparent density curves of monthly charges, one per churn
# class, so the two distributions can be compared directly.
ggplot(data = df) +
  geom_density(
    mapping = aes(x = monthly_charges, fill = churn),
    alpha = 0.5
  ) +
  labs(title = "Distribution of Monthly Charges by Churn Status")

Insight: Customers paying higher monthly fees churn more often.

  4. Feature Correlation
# corrplot is attached by the library() calls at the top of the file, so it is
# neither installed nor re-attached here: installing a package on every knit is
# an unwanted side effect, and the earlier library(corrplot) would already have
# stopped the script if the package were missing.

# Keep only the numeric predictors and show their pairwise Pearson
# correlations as numbers.
numeric_df <- df %>%
  dplyr::select(tenure, monthly_charges, total_charges, support_calls)
corrplot(cor(numeric_df), method = "number")

  5. Predictive Modeling

5.1 Train/Test Split

# Fix the RNG state so the partition is reproducible.
set.seed(123)

# Stratified 70/30 split on the churn outcome; list = FALSE returns the
# selected row indices as a one-column matrix rather than a list.
train_index <- createDataPartition(y = df$churn, p = 0.7, list = FALSE)

train <- df[train_index, ]   # rows selected for model fitting
test <- df[-train_index, ]   # remaining rows, held out for evaluation

5.2 Logistic Regression

# Logistic regression of churn on usage, contract and billing attributes.
# customer_id and total_charges are not part of the formula.
log_model <- glm(
  formula = churn ~ tenure + monthly_charges + support_calls +
    contract_type + internet_service + payment_method,
  family = "binomial",
  data = train
)

# Coefficient table, deviances and AIC.
summary(log_model)
## 
## Call:
## glm(formula = churn ~ tenure + monthly_charges + support_calls + 
##     contract_type + internet_service + payment_method, family = "binomial", 
##     data = train)
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)
## (Intercept)                    -0.793469   0.493631  -1.607    0.108
## tenure                         -0.007070   0.006367  -1.110    0.267
## monthly_charges                -0.007339   0.004611  -1.592    0.111
## support_calls                   0.120736   0.088110   1.370    0.171
## contract_typeOne year          -0.280390   0.327418  -0.856    0.392
## contract_typeTwo year           0.299766   0.341165   0.879    0.380
## internet_serviceFiber optic     0.279937   0.274182   1.021    0.307
## internet_serviceNone           -0.106925   0.477559  -0.224    0.823
## payment_methodCredit card       0.235382   0.305372   0.771    0.441
## payment_methodElectronic check -0.358653   0.331874  -1.081    0.280
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 381.62  on 350  degrees of freedom
## Residual deviance: 369.54  on 341  degrees of freedom
## AIC: 389.54
## 
## Number of Fisher Scoring iterations: 4

Predicting & Evaluating

# Predicted probability of churn = "Yes" for each held-out customer.
log_pred <- predict(log_model, newdata = test, type = "response")

# Build the ROC curve with the class order and comparison direction stated
# explicitly. Left to auto-detect, roc() chose "controls > cases", which scores
# the model as if LOWER predicted probabilities indicated churn. For a predicted
# probability of the "Yes" class, controls ("No") must be lower than cases
# ("Yes"), hence direction = "<".
log_roc <- roc(
  response = test$churn,
  predictor = log_pred,
  levels = c("No", "Yes"),
  direction = "<"
)
auc(log_roc)
# NOTE: the value below was recorded with the auto-detected direction
# ("controls > cases"); re-knit to refresh it.
## Area under the curve: 0.4941
plot(log_roc)

5.3 Random Forest Model

# Random forest on all predictors except customer_id. The identifier is an
# arbitrary row number (1..500 in the data preview), so letting the trees split
# on it only adds noise and distorts the variable-importance plot.
rf_train <- train[, names(train) != "customer_id", drop = FALSE]
rf_model <- randomForest(churn ~ ., data = rf_train, importance = TRUE)
rf_model
# NOTE: the output below was recorded when customer_id was still included as a
# predictor; re-knit to refresh it.
## 
## Call:
##  randomForest(formula = churn ~ ., data = train, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 25.64%
## Confusion matrix:
##      No Yes class.error
## No  259  10  0.03717472
## Yes  80   2  0.97560976
# Permutation- and Gini-based importance of each predictor.
varImpPlot(rf_model)

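6. Marketing Interpretation of Results

🔑 Key Findings

Note: the findings below are hypotheses drawn from the exploratory plots and from common industry patterns. The models in Section 5 do not confirm them on this dataset: no logistic-regression coefficient is statistically significant (all p > 0.1), the test AUC is approximately 0.5 (chance level), and the random forest correctly identifies only 2 of 82 churners in its out-of-bag confusion matrix. Validate each finding on more data before acting on it.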

Short-tenure customers churn the most → onboarding and early engagement must improve.

Month-to-month contracts are risky → encourage annual plans.

Higher monthly charges correlate with churn → customers may perceive low value for cost.

Frequent support calls are a red flag → improve customer service efficiency.

Electronic check payment users churn more → payment process redesign needed.

  7. Marketing Recommendations

  7.1 Develop a Retention-Focused Welcome Journey

First 30 days are the highest-risk period.

Send automated onboarding emails, tutorials, and personalized tips.

Offer a check-in call for new high-value customers.

  7.2 Introduce Incentives for Annual Contracts

Offer discounted yearly plans.

Provide value-add services (bonus support, faster speeds, or loyalty points).

Message: “Save more when you stay longer.”

  7.3 Redesign Pricing Tiers

Identify customers with high monthly charges and offer personalized downgrade options.

Introduce “loyalty pricing” for vulnerable segments.

  7.4 Fix Customer Support Pain Points

Analyze support logs to identify common issues.

Reduce friction by improving FAQ, self-service portals, or chatbot assistance.

Offer priority support for at-risk customers.

  7.5 Improve Payment Experience

Encourage switch to automatic bank transfer or credit card.

Provide incentives such as fee waivers or loyalty rewards.

  7.6 Deploy a Churn Prediction System

Use the random forest output to flag customers with:

High support calls

High monthly charges

Short tenure

Electronic check payment

Send these customers:

Personalized retention offers

Satisfaction surveys

Targeted upgrade/downgrade recommendations

  8. Conclusion

This marketing-oriented churn analysis identifies actionable strategies to reduce customer attrition and improve long-term profitability. By understanding churn drivers and applying predictive analytics, organizations can enhance retention programs and increase customer lifetime value.