library(readxl)
library(readr)
library(dplyr)
library(ggplot2)
# install.packages("caret")
library(caret)
# install.packages("tidymodels")
library(tidymodels)
library(vip)
library(pROC)
library (kableExtra)
# install.packages("cli")
data = read_excel("~/Documents/r files/E Commerce Dataset.xlsx", sheet = "E Comm")

Overview

This analysis explores factors influencing customer churn - when customers stop engaging with a business.

Research goal: Identify the strongest predictors of churn among high-frequency customers and to build a logistic regression model that can support retention strategy decisions.

Analytical focus: This analysis focuses specifically on high-frequency customers — those with above-average order counts. Churn drivers differ significantly across customer segments. High-frequency customers who churn represent a more actionable and higher-value retention problem since something disrupted an established pattern of interacting with the company. Identifying those triggers enables more targeted intervention rather than with low-frequency customers.

Business application: Business simulation estimating the impact of a loyalty programme on churn reduction was made based on the findings.

data = data %>%
  mutate(Churn = as.factor(Churn),
    Complain = as.factor(Complain),
    Gender = as.factor(Gender),
    PreferredPaymentMode = as.factor(PreferredPaymentMode),
    PreferredLoginDevice = as.factor(PreferredLoginDevice),
    Tenure = as.numeric(Tenure),
    SatisfactionScore = as.numeric(SatisfactionScore),
    OrderCount = as.numeric(OrderCount)) %>%
  na.omit()

Segment Definition: High-Frequency Customers

mean_orders = mean(data$OrderCount)
mean_orders
## [1] 2.825384
data_high = data %>%
  mutate(segment = factor(case_when(
    OrderCount > mean_orders ~ "high-frequency",
    TRUE ~ "low-frequency")))
data2 = data_high %>% group_by (segment) %>% 
  summarise (count = n()) %>% 
  mutate(pct = round(count / sum(count) * 100, 1))
ggplot(data2, 
       aes(x = segment, y = count)) +
  geom_bar(stat = "identity", fill = "#104E8B") +
  geom_text(aes(label = paste0(count, 
                               " (", pct, "%)")),
            vjust = -0.5, size = 3.5) +
  labs(x = "Segment of customers") +
  theme_classic()

data_high = data_high %>% filter(segment == "high-frequency") 
cat("Working dataset:", nrow(data_high), "high-frequency customers\n")
## Working dataset: 1077 high-frequency customers

Exploratory Analysis

Overall churn rate in segment

churn_rate = data_high %>%
  group_by(Churn) %>%
  summarise(count = n()) %>%
  mutate(pct = round(count / sum(count) * 100, 1))

ggplot(churn_rate, aes(x = Churn, y = pct)) +
  geom_bar(stat = "identity", fill = "#104E8B", width = 0.5) +
  geom_text(aes(label = paste0(pct, "%")), 
            vjust = -0.5, size = 4) +
  labs(title = "Churn rate — high-frequency customer segment",
    x = "Churn (0 = Stayed, 1 = Churned)",
    y = "Percentage of customers") +
  theme_classic() 

Relationship between Tenure and Churn

ggplot() + 
  geom_boxplot(data = data_high, aes(x=Churn, y=Tenure), fill="#BFEFFF") +
    labs(x = "Churn (0 = Stayed, 1 = Churned)", y = "Tenure (months)") +
  ggtitle ("Customer tenure by churn status") +
  theme_classic()

t.test(Tenure ~ Churn, data = data_high)
## 
##  Welch Two Sample t-test
## 
## data:  Tenure by Churn
## t = 12.718, df = 269.42, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  5.771528 7.885827
## sample estimates:
## mean in group 0 mean in group 1 
##       10.851934        4.023256
t.test
## function (x, ...) 
## UseMethod("t.test")
## <bytecode: 0x12d1fca78>
## <environment: namespace:stats>

Tenure significantly differs between churned and customers who stayed (p < 0.01). Churned customers tend to have shorter tenure, confirming that newer customers — even among frequent buyers — are at higher risk.

Relationship between Complain and Churn

complain_churn = data_high %>%
  group_by(Complain, Churn) %>%
  summarise(count = n(), .groups = "drop") %>%
  group_by(Complain) %>%
  mutate(pct = round(count / sum(count) * 100, 1)) %>%
  filter(Churn == 1)
ggplot(complain_churn, aes(x = Complain, y = pct)) +
  geom_bar(stat = "identity", fill = "#104E8B", width = 0.5) +
  geom_text(aes(label = paste0(pct, "%")), 
            vjust = -0.5, size = 4) +
  labs(title = "Churn rate by complaint status",
    x = "Complaint filed (0 = No, 1 = Yes)",
    y = "Churn rate (%)") +
  theme_classic() 

ch.test = chisq.test(data_high$Churn, data_high$Complain)
ch.test
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  data_high$Churn and data_high$Complain
## X-squared = 94.988, df = 1, p-value < 2.2e-16

Customers who filed a complaint in the last month show significantly higher churn rates (p-value < 0.01).

Satisfaction score distribution

t.test(SatisfactionScore ~ Churn, data = data_high)
## 
##  Welch Two Sample t-test
## 
## data:  SatisfactionScore by Churn
## t = -1.2897, df = 243.5, p-value = 0.1984
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.37765556  0.07879907
## sample estimates:
## mean in group 0 mean in group 1 
##        2.966851        3.116279
ggplot() + 
  geom_boxplot(data = data_high, aes(x=Churn, y=SatisfactionScore), fill="#BFEFFF") +
    labs(x = "Churn (0 = Stayed, 1 = Churned)", y = "Satisfaction score (1–5)") +
  ggtitle ("Differences in satisfaction scores") +
  theme_classic()

T-test shows no statistically significant difference between means of satisfaction scores between left customers and those who stayed (p-value > 0.01).

Logistic Regression Model

set.seed(1234)
ind = createDataPartition(data_high$Churn, p = 0.8, list = F)
train = data_high[ind,]
test = data_high[-ind,]
model_spec = logistic_reg()

model = model_spec %>%
  fit(Churn ~ Tenure + Gender + Complain + 
        PreferredPaymentMode + SatisfactionScore + 
        PreferredLoginDevice + OrderCount,
      data = train)

summary(model$fit)
## 
## Call:
## stats::glm(formula = Churn ~ Tenure + Gender + Complain + PreferredPaymentMode + 
##     SatisfactionScore + PreferredLoginDevice + OrderCount, family = stats::binomial, 
##     data = data)
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                       -2.329018   1.061176  -2.195  0.02818 *  
## Tenure                            -0.213650   0.024510  -8.717  < 2e-16 ***
## GenderMale                        -0.090865   0.224460  -0.405  0.68561    
## Complain1                          1.520799   0.219597   6.925 4.35e-12 ***
## PreferredPaymentModeCC           -11.529187 544.103433  -0.021  0.98309    
## PreferredPaymentModeCOD            0.575391   0.988728   0.582  0.56060    
## PreferredPaymentModeCredit Card    0.912764   0.940269   0.971  0.33167    
## PreferredPaymentModeDebit Card     0.697752   0.939324   0.743  0.45759    
## PreferredPaymentModeE wallet       1.159899   0.988414   1.173  0.24060    
## PreferredPaymentModeUPI            0.775202   1.025914   0.756  0.44988    
## SatisfactionScore                  0.210784   0.078953   2.670  0.00759 ** 
## PreferredLoginDeviceMobile Phone   0.124496   0.245072   0.508  0.61146    
## PreferredLoginDevicePhone         -0.544948   0.377787  -1.442  0.14917    
## OrderCount                         0.009516   0.044245   0.215  0.82971    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 758.26  on 861  degrees of freedom
## Residual deviance: 559.44  on 848  degrees of freedom
## AIC: 587.44
## 
## Number of Fisher Scoring iterations: 13

Final model with statistically significant predictors.

model_full = model_spec %>%
  fit(Churn ~ Tenure + Complain + SatisfactionScore,
      data = train)

summary(model_full$fit)
## 
## Call:
## stats::glm(formula = Churn ~ Tenure + Complain + SatisfactionScore, 
##     family = stats::binomial, data = data)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -1.46357    0.29531  -4.956 7.19e-07 ***
## Tenure            -0.20205    0.02322  -8.702  < 2e-16 ***
## Complain1          1.46924    0.21400   6.866 6.61e-12 ***
## SatisfactionScore  0.17902    0.07579   2.362   0.0182 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 758.26  on 861  degrees of freedom
## Residual deviance: 567.14  on 858  degrees of freedom
## AIC: 575.14
## 
## Number of Fisher Scoring iterations: 6

Interpretation of the model: There are three predictors with statistically significant results:

  • Tenure: The longer a customer has been buying from the store, the less likely they are to leave
  • Complain: Customers who have filed a complaint in the last month are more likely to leave than those who have not filed a complaint
  • Satisfaction score: Although satisfaction score is statistically significant in the multivariate model (p < 0.05), its lack of significance in the t-test suggests that its predictive contribution is context-dependent and likely mediated by behavioural variables such as tenure and complaint history. This indicates that satisfaction alone is insufficient as a standalone churn indicator in high-frequency customers.

Evaluation of model

predlog = predict(model_full, test)

table(predlog$.pred_class, test$Churn)
##    
##       0   1
##   0 179  21
##   1   2  13
metrics_full = test %>% 
  mutate(pred =predlog$.pred_class) %>% 
  conf_mat(estimate = pred, truth = Churn) %>% 
  summary()

metrics_full
## # A tibble: 13 × 3
##    .metric              .estimator .estimate
##    <chr>                <chr>          <dbl>
##  1 accuracy             binary         0.893
##  2 kap                  binary         0.480
##  3 sens                 binary         0.989
##  4 spec                 binary         0.382
##  5 ppv                  binary         0.895
##  6 npv                  binary         0.867
##  7 mcc                  binary         0.532
##  8 j_index              binary         0.371
##  9 bal_accuracy         binary         0.686
## 10 detection_prevalence binary         0.930
## 11 precision            binary         0.895
## 12 recall               binary         0.989
## 13 f_meas               binary         0.940
pred_prob = predict(model, test, type = "prob")
pred_class = predict(model_full, test, type = "class")

results = bind_cols(test, pred_prob, pred_class)

roc_auc(results,
  truth = Churn,
  .pred_1,
  event_level = "second")
## # A tibble: 1 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 roc_auc binary         0.775

AUC = 0.77 — indicates moderate to good probability ability. Given class imbalance in the dataset, AUC is a more informative metric than overall accuracy.

Variable importance

vip(model_full,
    aesthetics = list(fill = "#104E8B")) +
  labs(title = "Variable importance — full logistic regression") +
  theme_classic()

Tenure is a significant negative predictor of churn - longer-tenured customers are less likely to leave. Simulation will be based on this finding.

Business Simulation: Impact of a Loyalty Programme.

Scenario: Suppose the company introduces a loyalty programme targeting customers in their first months of engagement — offering discounts, rewards, and personalised incentives that make continued purchasing more attractive. We model this as effectively adding 5 months of engagement depth to customers currently below mean tenure.

Research question: How much would predicted churn decrease if we could extend customer relationship depth through a loyalty intervention?

mean_tenure = mean(test$Tenure)
pred_baseline  = predict(model_full, test, type = "prob")
churn_baseline = mean(pred_baseline$.pred_1)
test_simulated = test %>%
  mutate(Tenure = case_when(
    Tenure < mean_tenure ~ Tenure + 5,
    TRUE ~ Tenure))

pred_simulated = predict(model_full, test_simulated, type = "prob")
churn_simulated = mean(pred_simulated$.pred_1)

reduction = round((churn_baseline - churn_simulated) * 100, 1)
graph = data.frame(scenario = c("Baseline", "With loyalty programme"),
  churn_rate = c(churn_baseline, churn_simulated) * 100)

ggplot(graph, aes(x = scenario, y = churn_rate)) +
  geom_bar(stat = "identity", fill = c("#BFEFFF", "#FFAEB9"), width = 0.5) +
  geom_text(aes(label = paste0(round(churn_rate, 1), "%")), vjust = -0.5, size = 4.5) +
  labs(title = "Simulated impact of loyalty programme on churn rate",
    x  = NULL,
    y  = "Predicted churn rate (%)") +
  theme_classic() 

cat("Baseline churn rate:   ", round(churn_baseline * 100, 1), "%\n")
## Baseline churn rate:    13.7 %
cat("Simulated churn rate:  ", round(churn_simulated * 100, 1), "%\n")
## Simulated churn rate:   7.5 %
cat("Estimated reduction:   ", reduction, "%")
## Estimated reduction:    6.2 %

Simulation findings: The simulation estimates a reduction in average predicted churn probability from 13.7 % to 7.5 %, corresponding to a 6.2 percentage point decrease under the assumption that increased tenure is achievable through a loyalty intervention. This represents a scenario-based estimate rather than a causal effect and real-world impact depends on programme execution.

Key Findings and Recommendations

findings <- data.frame(
  Finding = c(
    "Short tenure = high churn risk even among frequent buyers",
    "Complaint is high predictor of churn",
    "Loyalty programme could reduce churn by estimated 6,2%",
    "Low probability power of satisfaction score"),
  `Business Recommendation` = c(
    "Invest in engagement programmes for customers in first 3 months",
    "Immediately react when a complaint is logged",
    "A tenure-focused loyalty programme targeting the first 5 months to be considered based on this model", 
    "Retention strategy should prioritise behavioural indicators (complaint filing, tenure) over satisfaction ratings for high-frequency customers"), check.names = FALSE)

kable(findings, caption = "Summary of findings and business recommendations")
Summary of findings and business recommendations
Finding Business Recommendation
Short tenure = high churn risk even among frequent buyers Invest in engagement programmes for customers in first 3 months
Complaint is high predictor of churn Immediately react when a complaint is logged
Loyalty programme could reduce churn by estimated 6,2% A tenure-focused loyalty programme targeting the first 5 months to be considered based on this model
Low probability power of satisfaction score Retention strategy should prioritise behavioural indicators (complaint filing, tenure) over satisfaction ratings for high-frequency customers