library(readxl)
library(readr)
library(dplyr)
library(ggplot2)
# install.packages("caret")
library(caret)
# install.packages("tidymodels")
library(tidymodels)
library(vip)
library(pROC)
library (kableExtra)
# install.packages("cli")
data = read_excel("~/Documents/r files/E Commerce Dataset.xlsx", sheet = "E Comm")
This analysis explores factors influencing customer churn - when customers stop engaging with a business.
Research goal: Identify the strongest predictors of churn among high-frequency customers and to build a logistic regression model that can support retention strategy decisions.
Analytical focus: This analysis focuses specifically on high-frequency customers — those with above-average order counts. Churn drivers differ significantly across customer segments. High-frequency customers who churn represent a more actionable and higher-value retention problem since something disrupted an established pattern of interacting with the company. Identifying those triggers enables more targeted intervention rather than with low-frequency customers.
Business application: Business simulation estimating the impact of a loyalty programme on churn reduction was made based on the findings.
data = data %>%
mutate(Churn = as.factor(Churn),
Complain = as.factor(Complain),
Gender = as.factor(Gender),
PreferredPaymentMode = as.factor(PreferredPaymentMode),
PreferredLoginDevice = as.factor(PreferredLoginDevice),
Tenure = as.numeric(Tenure),
SatisfactionScore = as.numeric(SatisfactionScore),
OrderCount = as.numeric(OrderCount)) %>%
na.omit()
mean_orders = mean(data$OrderCount)
mean_orders
## [1] 2.825384
data_high = data %>%
mutate(segment = factor(case_when(
OrderCount > mean_orders ~ "high-frequency",
TRUE ~ "low-frequency")))
data2 = data_high %>% group_by (segment) %>%
summarise (count = n()) %>%
mutate(pct = round(count / sum(count) * 100, 1))
ggplot(data2,
aes(x = segment, y = count)) +
geom_bar(stat = "identity", fill = "#104E8B") +
geom_text(aes(label = paste0(count,
" (", pct, "%)")),
vjust = -0.5, size = 3.5) +
labs(x = "Segment of customers") +
theme_classic()
data_high = data_high %>% filter(segment == "high-frequency")
cat("Working dataset:", nrow(data_high), "high-frequency customers\n")
## Working dataset: 1077 high-frequency customers
churn_rate = data_high %>%
group_by(Churn) %>%
summarise(count = n()) %>%
mutate(pct = round(count / sum(count) * 100, 1))
ggplot(churn_rate, aes(x = Churn, y = pct)) +
geom_bar(stat = "identity", fill = "#104E8B", width = 0.5) +
geom_text(aes(label = paste0(pct, "%")),
vjust = -0.5, size = 4) +
labs(title = "Churn rate — high-frequency customer segment",
x = "Churn (0 = Stayed, 1 = Churned)",
y = "Percentage of customers") +
theme_classic()
ggplot() +
geom_boxplot(data = data_high, aes(x=Churn, y=Tenure), fill="#BFEFFF") +
labs(x = "Churn (0 = Stayed, 1 = Churned)", y = "Tenure (months)") +
ggtitle ("Customer tenure by churn status") +
theme_classic()
t.test(Tenure ~ Churn, data = data_high)
##
## Welch Two Sample t-test
##
## data: Tenure by Churn
## t = 12.718, df = 269.42, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## 5.771528 7.885827
## sample estimates:
## mean in group 0 mean in group 1
## 10.851934 4.023256
t.test
## function (x, ...)
## UseMethod("t.test")
## <bytecode: 0x12d1fca78>
## <environment: namespace:stats>
Tenure significantly differs between churned and customers who stayed (p < 0.01). Churned customers tend to have shorter tenure, confirming that newer customers — even among frequent buyers — are at higher risk.
complain_churn = data_high %>%
group_by(Complain, Churn) %>%
summarise(count = n(), .groups = "drop") %>%
group_by(Complain) %>%
mutate(pct = round(count / sum(count) * 100, 1)) %>%
filter(Churn == 1)
ggplot(complain_churn, aes(x = Complain, y = pct)) +
geom_bar(stat = "identity", fill = "#104E8B", width = 0.5) +
geom_text(aes(label = paste0(pct, "%")),
vjust = -0.5, size = 4) +
labs(title = "Churn rate by complaint status",
x = "Complaint filed (0 = No, 1 = Yes)",
y = "Churn rate (%)") +
theme_classic()
ch.test = chisq.test(data_high$Churn, data_high$Complain)
ch.test
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: data_high$Churn and data_high$Complain
## X-squared = 94.988, df = 1, p-value < 2.2e-16
Customers who filed a complaint in the last month show significantly higher churn rates (p-value < 0.01).
t.test(SatisfactionScore ~ Churn, data = data_high)
##
## Welch Two Sample t-test
##
## data: SatisfactionScore by Churn
## t = -1.2897, df = 243.5, p-value = 0.1984
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.37765556 0.07879907
## sample estimates:
## mean in group 0 mean in group 1
## 2.966851 3.116279
ggplot() +
geom_boxplot(data = data_high, aes(x=Churn, y=SatisfactionScore), fill="#BFEFFF") +
labs(x = "Churn (0 = Stayed, 1 = Churned)", y = "Satisfaction score (1–5)") +
ggtitle ("Differences in satisfaction scores") +
theme_classic()
T-test shows no statistically significant difference between means of satisfaction scores between left customers and those who stayed (p-value > 0.01).
set.seed(1234)
ind = createDataPartition(data_high$Churn, p = 0.8, list = F)
train = data_high[ind,]
test = data_high[-ind,]
model_spec = logistic_reg()
model = model_spec %>%
fit(Churn ~ Tenure + Gender + Complain +
PreferredPaymentMode + SatisfactionScore +
PreferredLoginDevice + OrderCount,
data = train)
summary(model$fit)
##
## Call:
## stats::glm(formula = Churn ~ Tenure + Gender + Complain + PreferredPaymentMode +
## SatisfactionScore + PreferredLoginDevice + OrderCount, family = stats::binomial,
## data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.329018 1.061176 -2.195 0.02818 *
## Tenure -0.213650 0.024510 -8.717 < 2e-16 ***
## GenderMale -0.090865 0.224460 -0.405 0.68561
## Complain1 1.520799 0.219597 6.925 4.35e-12 ***
## PreferredPaymentModeCC -11.529187 544.103433 -0.021 0.98309
## PreferredPaymentModeCOD 0.575391 0.988728 0.582 0.56060
## PreferredPaymentModeCredit Card 0.912764 0.940269 0.971 0.33167
## PreferredPaymentModeDebit Card 0.697752 0.939324 0.743 0.45759
## PreferredPaymentModeE wallet 1.159899 0.988414 1.173 0.24060
## PreferredPaymentModeUPI 0.775202 1.025914 0.756 0.44988
## SatisfactionScore 0.210784 0.078953 2.670 0.00759 **
## PreferredLoginDeviceMobile Phone 0.124496 0.245072 0.508 0.61146
## PreferredLoginDevicePhone -0.544948 0.377787 -1.442 0.14917
## OrderCount 0.009516 0.044245 0.215 0.82971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 758.26 on 861 degrees of freedom
## Residual deviance: 559.44 on 848 degrees of freedom
## AIC: 587.44
##
## Number of Fisher Scoring iterations: 13
Final model with statistically significant predictors.
model_full = model_spec %>%
fit(Churn ~ Tenure + Complain + SatisfactionScore,
data = train)
summary(model_full$fit)
##
## Call:
## stats::glm(formula = Churn ~ Tenure + Complain + SatisfactionScore,
## family = stats::binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.46357 0.29531 -4.956 7.19e-07 ***
## Tenure -0.20205 0.02322 -8.702 < 2e-16 ***
## Complain1 1.46924 0.21400 6.866 6.61e-12 ***
## SatisfactionScore 0.17902 0.07579 2.362 0.0182 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 758.26 on 861 degrees of freedom
## Residual deviance: 567.14 on 858 degrees of freedom
## AIC: 575.14
##
## Number of Fisher Scoring iterations: 6
Interpretation of the model: There are three predictors with statistically significant results:
predlog = predict(model_full, test)
table(predlog$.pred_class, test$Churn)
##
## 0 1
## 0 179 21
## 1 2 13
metrics_full = test %>%
mutate(pred =predlog$.pred_class) %>%
conf_mat(estimate = pred, truth = Churn) %>%
summary()
metrics_full
## # A tibble: 13 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.893
## 2 kap binary 0.480
## 3 sens binary 0.989
## 4 spec binary 0.382
## 5 ppv binary 0.895
## 6 npv binary 0.867
## 7 mcc binary 0.532
## 8 j_index binary 0.371
## 9 bal_accuracy binary 0.686
## 10 detection_prevalence binary 0.930
## 11 precision binary 0.895
## 12 recall binary 0.989
## 13 f_meas binary 0.940
pred_prob = predict(model, test, type = "prob")
pred_class = predict(model_full, test, type = "class")
results = bind_cols(test, pred_prob, pred_class)
roc_auc(results,
truth = Churn,
.pred_1,
event_level = "second")
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 roc_auc binary 0.775
AUC = 0.77 — indicates moderate to good probability ability. Given class imbalance in the dataset, AUC is a more informative metric than overall accuracy.
vip(model_full,
aesthetics = list(fill = "#104E8B")) +
labs(title = "Variable importance — full logistic regression") +
theme_classic()
Tenure is a significant negative predictor of churn - longer-tenured customers are less likely to leave. Simulation will be based on this finding.
Scenario: Suppose the company introduces a loyalty programme targeting customers in their first months of engagement — offering discounts, rewards, and personalised incentives that make continued purchasing more attractive. We model this as effectively adding 5 months of engagement depth to customers currently below mean tenure.
Research question: How much would predicted churn decrease if we could extend customer relationship depth through a loyalty intervention?
mean_tenure = mean(test$Tenure)
pred_baseline = predict(model_full, test, type = "prob")
churn_baseline = mean(pred_baseline$.pred_1)
test_simulated = test %>%
mutate(Tenure = case_when(
Tenure < mean_tenure ~ Tenure + 5,
TRUE ~ Tenure))
pred_simulated = predict(model_full, test_simulated, type = "prob")
churn_simulated = mean(pred_simulated$.pred_1)
reduction = round((churn_baseline - churn_simulated) * 100, 1)
graph = data.frame(scenario = c("Baseline", "With loyalty programme"),
churn_rate = c(churn_baseline, churn_simulated) * 100)
ggplot(graph, aes(x = scenario, y = churn_rate)) +
geom_bar(stat = "identity", fill = c("#BFEFFF", "#FFAEB9"), width = 0.5) +
geom_text(aes(label = paste0(round(churn_rate, 1), "%")), vjust = -0.5, size = 4.5) +
labs(title = "Simulated impact of loyalty programme on churn rate",
x = NULL,
y = "Predicted churn rate (%)") +
theme_classic()
cat("Baseline churn rate: ", round(churn_baseline * 100, 1), "%\n")
## Baseline churn rate: 13.7 %
cat("Simulated churn rate: ", round(churn_simulated * 100, 1), "%\n")
## Simulated churn rate: 7.5 %
cat("Estimated reduction: ", reduction, "%")
## Estimated reduction: 6.2 %
Simulation findings: The simulation estimates a reduction in average predicted churn probability from 13.7 % to 7.5 %, corresponding to a 6.2 percentage point decrease under the assumption that increased tenure is achievable through a loyalty intervention. This represents a scenario-based estimate rather than a causal effect and real-world impact depends on programme execution.
findings <- data.frame(
Finding = c(
"Short tenure = high churn risk even among frequent buyers",
"Complaint is high predictor of churn",
"Loyalty programme could reduce churn by estimated 6,2%",
"Low probability power of satisfaction score"),
`Business Recommendation` = c(
"Invest in engagement programmes for customers in first 3 months",
"Immediately react when a complaint is logged",
"A tenure-focused loyalty programme targeting the first 5 months to be considered based on this model",
"Retention strategy should prioritise behavioural indicators (complaint filing, tenure) over satisfaction ratings for high-frequency customers"), check.names = FALSE)
kable(findings, caption = "Summary of findings and business recommendations")
| Finding | Business Recommendation |
|---|---|
| Short tenure = high churn risk even among frequent buyers | Invest in engagement programmes for customers in first 3 months |
| Complaint is high predictor of churn | Immediately react when a complaint is logged |
| Loyalty programme could reduce churn by estimated 6,2% | A tenure-focused loyalty programme targeting the first 5 months to be considered based on this model |
| Low probability power of satisfaction score | Retention strategy should prioritise behavioural indicators (complaint filing, tenure) over satisfaction ratings for high-frequency customers |