In the name of Allah, the Most Gracious, the Most Merciful

Demand Forecast

Business Understanding

  1. Business Question: The goal is to improve sales performance and customer retention for an online retailer by analyzing transaction history from 2009–2011. Two main objectives:
  1. forecast sales for the next 6 months to support inventory and promotion planning, and
  2. build a Customer 360 model to identify high-value and churn-risk customers for targeted campaigns.

Data Read

The table below shows the first few rows of the data.

df_raw <- read_csv(file = "retail_transaction_data.csv", col_names = TRUE)
kable(head(df_raw))
order_id product_id product_description quantity order_date unit_price customer_id country
489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 2009-12-01 07:45:00 6.95 13085 United Kingdom
489434 79323P PINK CHERRY LIGHTS 12 2009-12-01 07:45:00 6.75 13085 United Kingdom
489434 79323W WHITE CHERRY LIGHTS 12 2009-12-01 07:45:00 6.75 13085 United Kingdom
489434 22041 RECORD FRAME 7” SINGLE SIZE 48 2009-12-01 07:45:00 2.10 13085 United Kingdom
489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 2009-12-01 07:45:00 1.25 13085 United Kingdom
489434 22064 PINK DOUGHNUT TRINKET POT 24 2009-12-01 07:45:00 1.65 13085 United Kingdom

Preliminary Checking

Check the dimensions and metadata of the raw data.

str(df_raw)
## spc_tbl_ [1,067,371 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ order_id           : chr [1:1067371] "489434" "489434" "489434" "489434" ...
##  $ product_id         : chr [1:1067371] "85048" "79323P" "79323W" "22041" ...
##  $ product_description: chr [1:1067371] "15CM CHRISTMAS GLASS BALL 20 LIGHTS" "PINK CHERRY LIGHTS" "WHITE CHERRY LIGHTS" "RECORD FRAME 7\" SINGLE SIZE" ...
##  $ quantity           : num [1:1067371] 12 12 12 48 24 24 24 10 12 12 ...
##  $ order_date         : POSIXct[1:1067371], format: "2009-12-01 07:45:00" "2009-12-01 07:45:00" ...
##  $ unit_price         : num [1:1067371] 6.95 6.75 6.75 2.1 1.25 1.65 1.25 5.95 2.55 3.75 ...
##  $ customer_id        : num [1:1067371] 13085 13085 13085 13085 13085 ...
##  $ country            : chr [1:1067371] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   order_id = col_character(),
##   ..   product_id = col_character(),
##   ..   product_description = col_character(),
##   ..   quantity = col_double(),
##   ..   order_date = col_datetime(format = ""),
##   ..   unit_price = col_double(),
##   ..   customer_id = col_double(),
##   ..   country = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Basic counts
n_product  <- length(unique(df_raw$product_id))
n_customer <- length(unique(df_raw$customer_id))
n_country  <- length(unique(df_raw$country))

# Print with clear formatting
kable(
  tibble(
    Metric = c("Unique Products", "Unique Customers", "Unique Countries"),
    Count  = c(n_product, n_customer, n_country)
  ),
  caption = "Summary of Unique Entities in Retail Transactions",
  align = "lc",
  col.names = c("Metric", "Count")
)
Summary of Unique Entities in Retail Transactions
Metric Count
Unique Products 5304
Unique Customers 5943
Unique Countries 43
## **Transaction Period:** from 2009-12-01 07:45:00 to 2011-12-09 12:50:00
Distribution of Transactions by Country
Country Number of Transactions
United Kingdom 981330
EIRE 17866
Germany 17624
France 14330
Netherlands 5140
Spain 3811
Switzerland 3189
Belgium 3123
Portugal 2620
Australia 1913
Channel Islands 1664
Italy 1534
Norway 1455
Sweden 1364
Cyprus 1176
Finland 1049
Austria 938
Denmark 817
Unspecified 756
Greece 663
Japan 582
Poland 535
USA 535
United Arab Emirates 500
Israel 371
Hong Kong 364
Singapore 346
Malta 299
Iceland 253
Canada 228
Lithuania 189
RSA 169
Bahrain 126
Brazil 94
Thailand 76
Korea 63
European Community 61
Lebanon 58
West Indies 54
Bermuda 34
Nigeria 32
Czech Republic 30
Saudi Arabia 10
Sample of Transaction Data with Calculated Revenue & Cancellation Flag
order_id product_id quantity unit_price revenue is_cancelled
489434 85048 12 6.95 83.4 FALSE
489434 79323P 12 6.75 81.0 FALSE
489434 79323W 12 6.75 81.0 FALSE
489434 22041 48 2.10 100.8 FALSE
489434 21232 24 1.25 30.0 FALSE
489434 22064 24 1.65 39.6 FALSE
489434 21871 24 1.25 30.0 FALSE
489434 21523 10 5.95 59.5 FALSE
489435 22350 12 2.55 30.6 FALSE
489435 22349 12 3.75 45.0 FALSE

Data Cleaning

# 🧩 Data Cleaning Pipeline
df <- df_raw %>%
  drop_na(customer_id) %>%
  mutate(
    is_cancelled = grepl('^C', as.character(order_id)),
    revenue = quantity * unit_price
  ) %>%
  filter(!is_cancelled, revenue > 0)

# ✅ Summary after cleaning
n_before <- nrow(df_raw)
n_after  <- nrow(df)
pct_kept <- round((n_after / n_before) * 100, 2)

glue("
**Rows before cleaning:** {n_before}  
**Rows after cleaning:** {n_after}  
**Data retained:** {pct_kept}% of original transactions
")
## **Rows before cleaning:** 1067371  
## **Rows after cleaning:** 805549  
## **Data retained:** 75.47% of original transactions
# 🪞 Peek at cleaned data
clean_sample <- df %>%
  select(order_id, customer_id, product_id, quantity, unit_price, revenue, country) %>%
  head(10)

kable(
  clean_sample,
  caption = "Sample of Cleaned Transaction Data (Valid Orders Only)",
  align = "c"
)
Sample of Cleaned Transaction Data (Valid Orders Only)
order_id customer_id product_id quantity unit_price revenue country
489434 13085 85048 12 6.95 83.4 United Kingdom
489434 13085 79323P 12 6.75 81.0 United Kingdom
489434 13085 79323W 12 6.75 81.0 United Kingdom
489434 13085 22041 48 2.10 100.8 United Kingdom
489434 13085 21232 24 1.25 30.0 United Kingdom
489434 13085 22064 24 1.65 39.6 United Kingdom
489434 13085 21871 24 1.25 30.0 United Kingdom
489434 13085 21523 10 5.95 59.5 United Kingdom
489435 13085 22350 12 2.55 30.6 United Kingdom
489435 13085 22349 12 3.75 45.0 United Kingdom

Weekly Aggregation

# 🧾 Aggregate daily transactions into weekly revenue
weekly_sales <- df %>%
  mutate(week = floor_date(order_date, unit = "week", week_start = 1)) %>%
  group_by(week) %>%
  summarise(revenue = sum(revenue, na.rm = TRUE), .groups = "drop") %>%
  complete(
    week = seq(min(week), max(week), by = "1 week"),
    fill = list(revenue = 0)
  ) %>%
  arrange(week)

# 📊 Summary info
glue("
**Weekly data coverage:** {min(weekly_sales$week)} to {max(weekly_sales$week)}  
**Total weeks:** {nrow(weekly_sales)}  
**Average weekly revenue:** {round(mean(weekly_sales$revenue, na.rm = TRUE), 2)}  
**Maximum weekly revenue:** {round(max(weekly_sales$revenue, na.rm = TRUE), 2)}  
")
## **Weekly data coverage:** 2009-11-30 to 2011-12-05  
## **Total weeks:** 106  
## **Average weekly revenue:** 167390.84  
## **Maximum weekly revenue:** 412154.51
# 🪞 Preview
kable(
  head(weekly_sales, 10),
  caption = "Weekly Aggregated Revenue (First 10 Weeks)",
  col.names = c("Week Starting", "Total Revenue"),
  align = "lc"
)
Weekly Aggregated Revenue (First 10 Weeks)
Week Starting Total Revenue
2009-11-30 232917.9
2009-12-07 209021.3
2009-12-14 208936.8
2009-12-21 35778.2
2009-12-28 0.0
2010-01-04 167928.5
2010-01-11 125897.8
2010-01-18 119968.8
2010-01-25 143524.0
2010-02-01 112879.7

Weekly aggregation is chosen to balance granularity and stability. Daily data is often noisy due to holidays, weekends, and random short events, while monthly aggregation would make the dataset too short and could hide short-term variation, as shown below. Weekly frequency captures seasonal effects like holiday spikes and marketing cycles, and aligns with common business reporting (52 weeks/year).

knitr::include_graphics("assets/daily_data_inconsistency.png")

knitr::include_graphics("assets/daily_data_inconsistency_2.png")

Code Explanation:

  • floor_date(order_date, unit = "week", week_start = 1) → assigns each transaction to its weekly period (Monday start).
  • group_by(week) + sum(revenue) → totals sales per week.
  • complete() → fills missing weeks with 0 to keep a continuous time series (important for forecasting).
  • arrange(week) → ensures time-ordered output.

EDA

Creating Time series object

sales_ts <- ts(data = weekly_sales$revenue, 
               start = c(2009,48), 
               frequency = 52)

Descriptive Analytics

This is the revenue data plotted by week.
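The plot itself is rendered from a chunk that is not echoed in the report. A minimal sketch that would reproduce it from sales_ts (using the forecast package's autoplot() method already used later in this report) is:

# Sketch of the weekly revenue plot (assumes sales_ts from the chunk above)
autoplot(sales_ts) +
  labs(title = "Weekly Revenue, Dec 2009 - Dec 2011", x = "Year", y = "Revenue") +
  theme_minimal()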

Decomposition

Decomposition is a step in time series analysis used to break a series into its underlying components.

Components of a time series:

  • Trend: the general direction of the data, tending upward or downward. If the trend still shows a (fluctuating) pattern, some structure has not yet been captured well.

  • Seasonal: recurring patterns that repeat over a fixed period.

  • Residual/Error: variation that cannot be captured by the trend and seasonal components.

Before building forecasting models we need to inspect the decomposed time series object. The main idea of decomposition is to separate the three components of the ts object (trend, seasonal, residual).

If the decomposed trend still shows a pattern, seasonality has not been fully captured; with less than 2 years of data this is common. We note this as an adjustment to make during modeling.

sales_ts %>% decompose(type = "additive") %>% autoplot()

Modeling

Preparation (Data split for 12 weeks of test)

## freq = 52  length = 106  seasons_in_train = 2
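The split itself is produced by a chunk that is not echoed. Below is a minimal sketch under the assumption of a 12-week holdout (test_n = 12), reusing the object names (freq, full_ts, train_ts, test_ts) referenced by later chunks; the summary line above is the original chunk's printed output and the original code may differ slightly.

# Sketch of the 12-week holdout split (assumed test_n = 12; object names
# mirror those used by later chunks)
freq     <- frequency(sales_ts)   # 52 weeks per year
full_ts  <- sales_ts
test_n   <- 12
n        <- length(full_ts)
train_ts <- window(full_ts, end   = time(full_ts)[n - test_n])
test_ts  <- window(full_ts, start = time(full_ts)[n - test_n + 1])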

Baseline Model

After reviewing the descriptive analysis, two characteristics stand out: a short seasonal history (barely two full cycles) and weekly rather than monthly periodization.

Reaching straight for deep learning methods would be overkill here and would cost interpretability, both for readers and for me. With limited time at hand, I think it is best to start with baseline models and improve them with inventory considerations, aiming for an actionable analysis rather than the most elaborate model.

That said, I will note some pointers for improvement along the way.

Naive, Seasonal Naive & ARIMA

fc_naive <- naive(train_ts, h = test_n)
fc_snaive <- snaive(train_ts, h = test_n)
fit_arima <- auto.arima(train_ts, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
fc_arima <- forecast(fit_arima, h = test_n)

The Seasonal Naïve (snaive) model serves as the benchmark: it assumes each week's sales repeat the pattern of the same week in the previous year. It's transparent, simple, and provides a clear baseline for measuring improvements.

Fourier ARIMA

ARIMA + Fourier (fallback for < 2 seasons): When the training window contains less than two full seasonal cycles (e.g., < 104 weeks for weekly data), stl() inside stlf() will fail. Fourier terms provide a compact, smooth representation of seasonality without needing full cycles in-sample. We model the deterministic seasonal shape with fourier() as external regressors (xreg), while ARIMA captures autocorrelation and short‑term shocks. This makes ARIMA+Fourier suitable for long seasonality with limited history.

Understanding Fourier() in Seasonal ARIMA Models

The fourier() function adds pairs of sine and cosine waves that approximate seasonal patterns.
Each pair (controlled by K) captures one “harmonic” of the seasonal cycle.

  • Low K (1–3): captures broad, smooth seasonality (one or two main cycles per year).
  • High K (4–13): captures finer seasonal fluctuations and short-term peaks.
  • Too high K → risk of overfitting or instability, especially with short training data.

To make your ARIMA Fourier model more seasonal:

  • Increase K gradually (e.g., from 3 → 6 → 10 → 13).
  • Keep seasonal = FALSE in auto.arima() because Fourier terms replace seasonal differencing.
  • Check AICc and residual plots — stop increasing K when improvement flattens.

In short:

The value of K indicates how strongly the model captures seasonal patterns.

  • Small K → broad, smooth patterns.
  • Large K → finer seasonal patterns are also captured (more "seasonal").

To make the model more seasonal:

  • Increase K gradually.
  • Check AICc or validation RMSE.
  • Pick the K that gives the best result without overfitting.

Source: Forecasting: Principles and Practice (OTexts.com)

aic_table <- data.frame(K = integer(), AICc = numeric())
# Frequency = 52 (weekly)
freq <- 52
max_K <- floor(freq / 2)  # theoretical maximum

for (k in 1:max_K) {
  xreg_train <- fourier(train_ts, K = k)
  fit <- auto.arima(train_ts, xreg = xreg_train, seasonal = FALSE)
  aic_table <- rbind(aic_table, data.frame(K = k, AICc = fit$aicc))
}
library(ggplot2)
ggplot(aic_table, aes(x = K, y = AICc)) +
  geom_line() + geom_point() +
  labs(title = "Fourier Term Tuning", y = "AICc (lower is better)")

We are going to use K = 7.

K <- min(7, floor(freq/2))       # tune K if needed
xreg_train <- fourier(train_ts, K = K)
xreg_test <- fourier(train_ts, K = K, h = test_n)
fit_arima2 <- auto.arima(train_ts, xreg = xreg_train, seasonal = FALSE)
aic_table <- rbind(aic_table, data.frame(K = K, AICc = fit_arima2$aicc))
fc_arima2  <- forecast(fit_arima2, xreg = xreg_test, h = test_n)
pred <- fc_arima2$mean

Metrics

## # A tibble: 4 × 3
##   model             MAE  WAPE
##   <chr>           <dbl> <dbl>
## 1 naive          61761. 0.231
## 2 snaive         53975. 0.202
## 3 arima         106564. 0.398
## 4 arima_fourier  55399. 0.207
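The accuracy table above is computed in an unechoed chunk. A minimal sketch of how MAE and one common definition of WAPE could be derived from the forecasts above is given here; the wape() and mae() helpers are illustrative, not from the original report.

# Sketch of the accuracy table (wape() and mae() are illustrative helpers)
wape <- function(actual, pred) sum(abs(actual - pred)) / sum(abs(actual))
mae  <- function(actual, pred) mean(abs(actual - pred))
tibble(
  model = c("naive", "snaive", "arima", "arima_fourier"),
  MAE   = c(mae(test_ts, fc_naive$mean),  mae(test_ts, fc_snaive$mean),
            mae(test_ts, fc_arima$mean),  mae(test_ts, fc_arima2$mean)),
  WAPE  = c(wape(test_ts, fc_naive$mean), wape(test_ts, fc_snaive$mean),
            wape(test_ts, fc_arima$mean), wape(test_ts, fc_arima2$mean))
)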

Fitted Model

train_ts %>% 
  autoplot() +
  autolayer(fc_snaive$fitted, series = "fitted (Seasonal Naive)") +
  autolayer(fc_arima2$fitted, series = "fitted (ARIMA Fourier)")

Visualization

autoplot(fc_arima2) +
  autolayer(test_ts, series = "Actual") +
  autolayer(fc_snaive$mean, series = "Seasonal Naive") +
  labs(title = "Forecast vs Actual (ARIMA Fourier)",
       x = "Week",
       y = "Revenue") +
  theme_minimal()

# ---- Fourier-Enhanced ARIMA Forecast ----
# Step 1: Determine if there are enough full seasonal cycles in training data
seasons_in_train <- floor(length(train_ts) / freq)
use_stlf <- seasons_in_train >= 2

# Step 2: Define maximum Fourier terms (K)
K_max <- floor(freq / 2)                     # theoretical upper limit
K_by_len <- max(1, floor(length(full_ts) / 20)) # adjust for data length
K <- max(1, min(6, K_max, K_by_len))         # practical upper bound (prevent overfitting)

# Step 3: Create Fourier terms as external regressors
xreg_full <- fourier(full_ts, K = K)

# Step 4: Fit ARIMA model with Fourier terms
fit_full <- auto.arima(
  full_ts,
  xreg = xreg_full,
  seasonal = FALSE
)

# Step 5: Forecast using future Fourier terms (26 periods ahead)
final_fc <- forecast(
  fit_full,
  xreg = fourier(full_ts, K = K, h = 26),
  h = 26
)

# Step 6: Display summary and plot
summary(final_fc)
## 
## Forecast method: Regression with ARIMA(0,0,1) errors
## 
## Model Information:
## Series: full_ts 
## Regression with ARIMA(0,0,1) errors 
## 
## Coefficients:
##          ma1   intercept       S1-52      C1-52       S2-52      C2-52
##       0.3442  166288.522  -29029.263  44612.699  -31452.465  31635.879
## s.e.  0.0959    6006.043    8557.342   8406.057    8504.932   8387.534
##            S3-52      C3-52      S4-52      C4-52      S5-52      C5-52
##       -17369.463  13635.713   318.1864  12872.452  12556.594  10750.335
## s.e.    8422.552   8352.472  8315.6181   8296.043   8190.226   8212.199
## 
## sigma^2 = 2.386e+09:  log likelihood = -1288.54
## AIC=2603.07   AICc=2607.03   BIC=2637.7
## 
## Error measures:
##                    ME    RMSE     MAE  MPE MAPE      MASE        ACF1
## Training set 52.21969 46003.3 33000.6 -Inf  Inf 0.8688257 -0.01493591
## 
## Forecasts:
##          Point Forecast     Lo 80    Hi 80      Lo 95    Hi 95
## 3.038462      262007.68 199401.96 324613.4 166260.486 357754.9
## 3.057692      164800.37  98590.02 231010.7  63540.375 266060.4
## 3.076923      129612.65  63402.30 195823.0  28352.657 230872.6
## 3.096154      103204.49  36994.14 169414.8   1944.492 204464.5
## 3.115385       89329.33  23118.98 155539.7 -11930.664 190589.3
## 3.134615       88702.34  22491.99 154912.7 -12557.657 189962.3
## 3.153846       98886.11  32675.76 165096.5  -2373.888 200146.1
## 3.173077      115095.80  48885.45 181306.1  13835.802 216355.8
## 3.192308      131664.42  65454.06 197874.8  30404.421 232924.4
## 3.211538      143701.04  77490.69 209911.4  42441.043 244961.0
## 3.230769      148421.89  82211.54 214632.2  47161.894 249681.9
## 3.250000      145752.79  79542.43 211963.1  44492.791 247012.8
## 3.269231      138049.31  71838.96 204259.7  36789.314 239309.3
## 3.288462      129073.20  62862.85 195283.6  27813.206 230333.2
## 3.307692      122600.77  56390.41 188811.1  21340.769 223860.8
## 3.326923      121143.01  54932.66 187353.4  19883.014 222403.0
## 3.346154      125196.33  58985.98 191406.7  23936.332 226456.3
## 3.365385      133242.14  67031.79 199452.5  31982.145 234502.1
## 3.384615      142447.11  76236.76 208657.5  41187.117 243707.1
## 3.403846      149776.20  83565.85 215986.6  48516.207 251036.2
## 3.423077      153101.10  86890.75 219311.5  51841.103 254361.1
## 3.442308      151908.36  85698.01 218118.7  50648.363 253168.4
## 3.461538      147372.49  81162.14 213582.8  46112.497 248632.5
## 3.480769      141798.11  75587.75 208008.5  40538.110 243058.1
## 3.500000      137664.95  71454.60 203875.3  36404.957 238924.9
## 3.519231      136647.95  70437.60 202858.3  35387.953 237907.9
autoplot(final_fc) + ggtitle("Fourier-Enhanced ARIMA Forecast") +
  theme_minimal()

What the metrics mean:

  • MA(1), the moving-average term: captures short-term randomness (a small noise correction), which suits our goal of understanding ups and downs.
  • Intercept: the base (average) level of the model, roughly the series mean.
  • Fourier terms (S, C): seasonal harmonics that represent the recurring cyclical patterns (e.g., weekly/annual).
  • AIC / BIC: model-quality indicators; lower is better, without overfitting.
  • RMSE / MAE: error magnitude; the average forecast miss is roughly 33–46K revenue units, a small error at this scale.
  • MPE / MAPE: not meaningful here because some weeks have zero sales.
  • MASE = 0.87: relative to the seasonal naïve benchmark, the model is about 13% better.
  • ACF1 ≈ 0: residuals are essentially uncorrelated, a good sign of a well-fit model.

From the Fourier-Enhanced ARIMA model, we can infer how well our time series captures complex seasonal structures beyond standard ARIMA’s built-in seasonality.

  1. Forecast Implications. The resulting forecast (final_fc), visualized through autoplot(), demonstrates:
  • A smooth continuation of recurring patterns consistent with past cycles.

  • Clearer peaks and troughs aligned with observed seasonal behavior.

  • Reduced residual autocorrelation (confirmable via the Ljung–Box test; see the residual-check sketch after this list), meaning the model captured most of the repeating structure.

  2. Model Improvements. From a business standpoint:
  • A well-tuned K helps detect periodic spikes in demand, sales, or transactions.

  • Increasing K improves fit but can lead to overfitting.

  • The optimal K balances accuracy (low AICc / RMSE) and stability over future periods.

  3. The model already provides a reasonable baseline forecast capturing recurring demand patterns. Improving it further will sharpen:
  • Inventory planning (reducing overstock in low weeks), and

  • Budget allocation (timing promotions to high-season peaks).
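As a quick check on the reduced residual autocorrelation noted above, the forecast package's built-in diagnostics (Ljung–Box test plus residual ACF and histogram) can be run on the fitted model. This is a sketch, not part of the original report's output.

# Residual diagnostics for the final model: Ljung-Box test, ACF, histogram
forecast::checkresiduals(fit_full)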

Business Results and Actions

Convert the model output to a clean forecast table with dates, point forecasts, and 80% intervals; display a readable preview and a quick summary for executives. We will then translate weekly total forecasts into actionable SKU plans using historical mix shares (simple and seasonality-aware), convert revenue → units (via median price), and add a safety stock overlay (sketched after the horizon table below).

This bridges top-down demand to bottom-up inventory and promotion targeting.

# Build nicely formatted forecast table (26 weeks ahead)
forecast_tbl <- tibble(
  date     = seq(max(weekly_sales$week) + weeks(1), by = "1 week", length.out = 26),
  forecast = as.numeric(final_fc$mean),
  lo80     = as.numeric(final_fc$lower[,"80%"]),
  hi80     = as.numeric(final_fc$upper[,"80%"])
)

# Executive summary
glue("
**Forecast horizon:** {min(forecast_tbl$date)} → {max(forecast_tbl$date)}  
**Total forecast revenue (26w):** {round(sum(forecast_tbl$forecast), 2)}  
**Avg weekly forecast:** {round(mean(forecast_tbl$forecast), 2)}  
**Peak week (point forecast):** {forecast_tbl$date[which.max(forecast_tbl$forecast)]}  
")
## **Forecast horizon:** 2011-12-12 → 2012-06-04  
## **Total forecast revenue (26w):** 3551199.93  
## **Avg weekly forecast:** 136584.61  
## **Peak week (point forecast):** 2011-12-12
# Preview first 12 rows
kable(
  head(forecast_tbl, 12),
  caption = "Weekly Forecast (First 12 Weeks, with 80% Intervals)",
  col.names = c("Week", "Forecast", "Lo80", "Hi80"),
  align = "lrrr",
  digits = 2
)
Weekly Forecast (First 12 Weeks, with 80% Intervals)
Week Forecast Lo80 Hi80
2011-12-12 262007.68 199401.96 324613.4
2011-12-19 164800.37 98590.02 231010.7
2011-12-26 129612.65 63402.30 195823.0
2012-01-02 103204.49 36994.14 169414.8
2012-01-09 89329.33 23118.98 155539.7
2012-01-16 88702.34 22491.99 154912.7
2012-01-23 98886.11 32675.76 165096.5
2012-01-30 115095.80 48885.45 181306.1
2012-02-06 131664.42 65454.06 197874.8
2012-02-13 143701.04 77490.69 209911.4
2012-02-20 148421.89 82211.54 214632.2
2012-02-27 145752.79 79542.43 211963.1

Business Strategist Note

  • What this means: Our next 26 weeks show distinct peaks and baseline weeks. Peaks should drive pre-build inventory, supplier booking, and marketing calendar (bundle pushes before peak weeks).
  • How to use it:
    1. Inventory — set PO dates backward from peak weeks by lead time; apply MOQ and safety stock for high-variability SKUs.
    2. Cash flow — sequence procurement to align with forecasted revenue inflows; avoid tying up cash in long-tail SKUs during low weeks.
    3. Promotions — schedule pull-forward offers for slow weeks; protect margin during peak weeks (use value adds over discounts).
    4. Ops & CX — staff and logistics scale based on forecasted volume; pre-stage packaging and pick-paths for top movers.
  • Next step: Break this total forecast down by SKU using recent sales mix (overall and week-of-year seasonal mix) to get units to stock and SKU-level actions.

# 0) Lookback window for mix shares (52 weeks standard)
lookback_weeks <- 52
cutoff_date <- max(df$order_date, na.rm = TRUE) - weeks(lookback_weeks)
recent <- df %>% filter(order_date > cutoff_date)

# 1) Top-10 SKUs by recent revenue
top10 <- recent %>%
  group_by(product_id) %>%
  summarise(revenue_52w = sum(revenue), .groups = "drop") %>%
  arrange(desc(revenue_52w)) %>%
  slice_head(n = 10)

# Optional: descriptions (adapt column name if yours differs)
sku_meta <- df %>%
  filter(product_id %in% top10$product_id) %>%
  group_by(product_id) %>%
  summarise(product_description = dplyr::first(product_description[!is.na(product_description)]),
            .groups = "drop")

# 2A) SIMPLE mix (single share across last 52 weeks)
mix_simple <- recent %>%
  filter(product_id %in% top10$product_id) %>%
  group_by(product_id) %>%
  summarise(rev_sku = sum(revenue), .groups = "drop") %>%
  mutate(share = rev_sku / sum(rev_sku)) %>%
  left_join(sku_meta, by = "product_id")

# 2B) SEASONAL mix (share by week-of-year)
mix_seasonal <- recent %>%
  filter(product_id %in% top10$product_id) %>%
  mutate(woy = isoweek(order_date)) %>%
  group_by(woy) %>%
  mutate(total_woy = sum(revenue)) %>%
  ungroup() %>%
  group_by(product_id, woy) %>%
  summarise(rev_sku_woy = sum(revenue),
            total_woy = first(total_woy),
            .groups = "drop") %>%
  mutate(share_woy = ifelse(total_woy > 0, rev_sku_woy / total_woy, 0)) %>%
  left_join(sku_meta, by = "product_id")

# 3) Price proxy → median unit price (robust to outliers)
sku_price <- recent %>%
  filter(product_id %in% top10$product_id) %>%
  group_by(product_id) %>%
  summarise(median_price = median(unit_price, na.rm = TRUE), .groups = "drop")

# 4) Allocation A: SIMPLE mix
alloc_simple <- tidyr::crossing(
    date = forecast_tbl$date,
    product_id = top10$product_id
  ) %>%
  left_join(forecast_tbl %>% select(date, forecast), by = "date") %>%
  left_join(mix_simple %>% select(product_id, share), by = "product_id") %>%
  left_join(sku_price, by = "product_id") %>%
  mutate(
    sku_revenue_fc = forecast * share,
    units_fc = ifelse(median_price > 0, sku_revenue_fc / median_price, NA_real_)
  ) %>%
  left_join(sku_meta, by = "product_id") %>%
  arrange(date, desc(sku_revenue_fc))

# 4) Allocation B: SEASONAL mix (by week-of-year of the forecasted date)
alloc_seasonal <- tidyr::crossing(
    date = forecast_tbl$date,
    product_id = top10$product_id
  ) %>%
  mutate(woy = isoweek(date)) %>%
  left_join(forecast_tbl %>% select(date, forecast), by = "date") %>%
  left_join(mix_seasonal %>% select(product_id, woy, share_woy), by = c("product_id", "woy")) %>%
  mutate(share_woy = replace_na(share_woy, 0)) %>%
  left_join(sku_price, by = "product_id") %>%
  mutate(
    sku_revenue_fc = forecast * share_woy,
    units_fc = ifelse(median_price > 0, sku_revenue_fc / median_price, NA_real_)
  ) %>%
  left_join(sku_meta, by = "product_id") %>%
  arrange(date, desc(sku_revenue_fc))

# 5) Stocking summaries
weekly_stock_plan <- alloc_seasonal %>%
  select(date, product_id, product_description, sku_revenue_fc, units_fc) %>%
  arrange(date, desc(sku_revenue_fc))

horizon_stock_plan <- alloc_seasonal %>%
  group_by(product_id, product_description) %>%
  summarise(
    total_rev_fc   = sum(sku_revenue_fc, na.rm = TRUE),
    total_units_fc = sum(units_fc, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_rev_fc))

# Preview knitr tables
glue("
**Top-10 SKU coverage window:** last {lookback_weeks} weeks  
**Weekly plan rows:** {nrow(weekly_stock_plan)}  
")
## **Top-10 SKU coverage window:** last 52 weeks  
## **Weekly plan rows:** 260
kable(head(weekly_stock_plan, 12),
      caption = "Weekly Stock Plan — Top-10 SKUs (Seasonality-Aware Mix, Preview)",
      col.names = c("Week", "Product ID", "Description", "Rev Forecast", "Units"),
      digits = 2)
Weekly Stock Plan — Top-10 SKUs (Seasonality-Aware Mix, Preview)
Week Product ID Description Rev Forecast Units
2011-12-12 22423 REGENCY CAKESTAND 3 TIER 99004.48 7765.06
2011-12-12 85123A WHITE HANGING HEART T-LIGHT HOLDER 88652.71 30051.77
2011-12-12 POST POSTAGE 31373.16 1742.95
2011-12-12 84879 ASSORTED COLOUR BIRD ORNAMENT 22048.43 13046.41
2011-12-12 85099B JUMBO BAG RED WHITE SPOTTY 17212.78 8275.38
2011-12-12 47566 PARTY BUNTING 2561.26 517.43
2011-12-12 M Manual 1154.86 769.91
2011-12-12 23084 RABBIT NIGHT LIGHT 0.00 0.00
2011-12-12 23166 MEDIUM CERAMIC TOP STORAGE JAR 0.00 0.00
2011-12-12 23843 PAPER CRAFT , LITTLE BIRDIE 0.00 0.00
2011-12-19 22423 REGENCY CAKESTAND 3 TIER 66317.59 5201.38
2011-12-19 POST POSTAGE 36956.57 2053.14
kable(head(horizon_stock_plan, 10),
      caption = "26-Week Horizon Totals — Top-10 SKUs",
      col.names = c("Product ID", "Description", "Revenue (26w)", "Units (26w)"),
      digits = 2)
26-Week Horizon Totals — Top-10 SKUs
Product ID Description Revenue (26w) Units (26w)
22423 REGENCY CAKESTAND 3 TIER 959959.70 75290.96
85123A WHITE HANGING HEART T-LIGHT HOLDER 629996.75 213558.22
47566 PARTY BUNTING 425016.60 85861.94
POST POSTAGE 419113.67 23284.09
85099B JUMBO BAG RED WHITE SPOTTY 410203.57 197213.25
84879 ASSORTED COLOUR BIRD ORNAMENT 251802.71 148995.68
M Manual 201830.57 134553.71
23166 MEDIUM CERAMIC TOP STORAGE JAR 95153.13 76122.50
23084 RABBIT NIGHT LIGHT 28510.59 13707.01
23843 PAPER CRAFT , LITTLE BIRDIE 0.00 0.00
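The safety stock overlay mentioned in the section intro is not shown in the code above. A minimal sketch built on alloc_seasonal follows; the z = 1.64 service factor (roughly a 95% service level) is my assumption, not a figure from the report.

# Illustrative safety-stock overlay on the 26-week unit plan (assumed z = 1.64)
z <- 1.64
safety_plan <- alloc_seasonal %>%
  group_by(product_id, product_description) %>%
  summarise(
    mean_units = mean(units_fc, na.rm = TRUE),   # average weekly units
    sd_units   = sd(units_fc, na.rm = TRUE),     # weekly variability
    .groups = "drop"
  ) %>%
  mutate(
    safety_stock = z * sd_units,                      # buffer against variability
    stock_target = ceiling(mean_units + safety_stock) # per-week stocking target
  ) %>%
  arrange(desc(stock_target))

kable(head(safety_plan, 10), digits = 1,
      caption = "Illustrative Weekly Stock Targets with Safety Stock (Assumed 95% Service Level)")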

🎯 Strategy Checklist

  • Before peak weeks: place POs early (≥ lead time), add safety stock for high-volatility SKUs.
  • During peak weeks: prioritize high-margin and fast-turn items; protect price.
  • After peak weeks: run sell-through on residuals (bundles/clearance).
  • Promo pairing: use co-purchase pairs from recent history to define smart bundles (see the sketch after this checklist).
  • Reviewing Period: roll this allocation weekly; compare planned vs actual; re-estimate mix monthly.
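For the promo-pairing item above, a minimal way to surface frequently co-purchased SKU pairs from the recent window (reusing the recent and top10 objects defined earlier) could look like this sketch:

# Sketch: co-purchase pairs among the top-10 SKUs in the last 52 weeks
baskets <- recent %>%
  filter(product_id %in% top10$product_id) %>%
  distinct(order_id, product_id)

co_pairs <- baskets %>%
  inner_join(baskets, by = "order_id", suffix = c("_a", "_b")) %>%
  filter(product_id_a < product_id_b) %>%              # count each pair once
  count(product_id_a, product_id_b, sort = TRUE, name = "orders_together")

kable(head(co_pairs, 10),
      caption = "Illustrative Co-Purchase Pairs (Top-10 SKUs, Last 52 Weeks)")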

Customer 360

Business Understanding

The Sales Director wants to launch targeted campaigns to retain high-value customers and rekindle dormant ones. We need to build models to identify (a) customers likely to provide high future value, and (b) customers at high risk of churn. Additionally, propose pilot interventions and optionally product-recommendation logic for personalized offers.

Methodological Outline

Approach:

Use historical transaction data to engineer features: RFM (Recency, Frequency, Monetary) plus tenure, product-category usage, channel, and geography. Define two target variables: high-value prediction (e.g., customers in the top 20% of future spend over a 6–12 month look-ahead) and churn prediction (e.g., customers with no purchase in the next 90 days, a non-contractual setting). As before, start with a baseline logistic regression for interpretability, then an improved model such as gradient boosting (XGBoost/LightGBM) for better performance. Use a time-based split (train up to T-1 month, test on the last month). Evaluate with ROC-AUC, PR-AUC (for class imbalance), and precision in the top decile. Translate predictions into business actions (e.g., expected incremental revenue, cost of retention). Pilot plan: segment customers by risk and value, and define offers (high-value: upsell/cross-sell bundles; at-risk: win-back vouchers).

Load Data

From the raw data, compute a revenue column by multiplying quantity and unit_price, then filter out rows with non-positive revenue so that downstream analysis only includes meaningful transactions. Here I assume cancelled purchases still count toward retention; cancellations are left for a separate analysis.

# --- Customer 360: Clean and compute revenue -------------------------

df_360 <- df_raw %>%
  mutate(
    revenue = quantity * unit_price      # Calculate revenue
  ) %>%
  filter(
    revenue > 0                          # Keep only positive revenue
  )

# Display first few rows of cleaned data
knitr::kable(
  head(df_360),
  caption = "Preview of Cleaned Customer 360 Dataset"
)
Preview of Cleaned Customer 360 Dataset
order_id product_id product_description quantity order_date unit_price customer_id country revenue
489434 85048 15CM CHRISTMAS GLASS BALL 20 LIGHTS 12 2009-12-01 07:45:00 6.95 13085 United Kingdom 83.4
489434 79323P PINK CHERRY LIGHTS 12 2009-12-01 07:45:00 6.75 13085 United Kingdom 81.0
489434 79323W WHITE CHERRY LIGHTS 12 2009-12-01 07:45:00 6.75 13085 United Kingdom 81.0
489434 22041 RECORD FRAME 7” SINGLE SIZE 48 2009-12-01 07:45:00 2.10 13085 United Kingdom 100.8
489434 21232 STRAWBERRY CERAMIC TRINKET BOX 24 2009-12-01 07:45:00 1.25 13085 United Kingdom 30.0
489434 22064 PINK DOUGHNUT TRINKET POT 24 2009-12-01 07:45:00 1.65 13085 United Kingdom 39.6

🧾 Customer Feature Engineering (RFM + Tenure)

We define a snapshot_date as 90 days before the latest transaction date in the dataset.
Then, we compute essential customer-level features: - Recency – how many days since their last order
- Frequency – how many unique orders they placed
- Monetary – total and average spending
- Tenure – how long between their first and last order

# --- Define snapshot date (90 days before latest order) ----------------------
snapshot_date <- max(df_360$order_date) - days(90)

# --- Compute RFM + tenure features ------------------------------------------
cust_feat <- df_360 %>%
  group_by(customer_id) %>%
  summarise(
    last_date   = max(order_date),           # Most recent order date
    first_date  = min(order_date),           # Earliest order date
    frequency   = n_distinct(order_id),      # Number of unique orders
    monetary    = sum(revenue),              # Total spending
    avg_basket  = mean(revenue),             # Average order value
    .groups = "drop"
  ) %>%
  mutate(
    recency_days = as.integer(difftime(snapshot_date, last_date, units = "days")),  # Time since last order
    recency_days = pmax(recency_days, 0),
    tenure_days  = as.integer(difftime(last_date, first_date, units = "days"))      # Customer tenure duration
  )

# --- Display resulting features in a table ----------------------------------
knitr::kable(
  head(cust_feat),
  caption = "Customer RFM and Tenure Features (Sample Preview)"
)
Customer RFM and Tenure Features (Sample Preview)
customer_id last_date first_date frequency monetary avg_basket recency_days tenure_days
12346 2011-01-18 10:01:00 2009-12-14 08:34:00 12 77556.46 2281.07235 235 400
12347 2011-12-07 15:52:00 2010-10-31 14:20:00 8 5633.32 22.26609 0 402
12348 2011-09-25 13:13:00 2010-09-27 14:59:00 5 2019.40 39.59608 0 362
12349 2011-11-21 09:51:00 2010-04-29 13:20:00 4 4428.69 25.30680 0 570
12350 2011-02-02 16:01:00 2011-02-02 16:01:00 1 334.40 19.67059 219 0
12351 2010-11-29 15:23:00 2010-11-29 15:23:00 1 300.93 14.33000 284 0

Recency is the measure of how recently a customer made a purchase or an event occurred, while tenure is the length of time a customer has been active with a company. In a business context, a higher recency score (a recent purchase) is seen as a good indicator of future purchasing, and a longer tenure can signify a loyal customer.

🧩 Define Churn Label

We classify each customer as churned (1) if they made no purchase within 90 days after the snapshot date,
and retained (0) otherwise.

# --- Define churn observation window ---------------------------------------
future_90  <- snapshot_date + days(90)

# --- Identify customers who made purchases within the next 90 days ---------
next_orders <- df_360 %>%
  filter(order_date > snapshot_date & order_date <= future_90)

# --- Label churn: 1 = churned, 0 = retained --------------------------------
churn_label <- cust_feat %>%
  mutate(
    churn = if_else(customer_id %in% next_orders$customer_id, 0, 1)
  )
kable(
  head(churn_label),
  caption = "Labeled as Churn"
)
Labeled as Churn
customer_id last_date first_date frequency monetary avg_basket recency_days tenure_days churn
12346 2011-01-18 10:01:00 2009-12-14 08:34:00 12 77556.46 2281.07235 235 400 1
12347 2011-12-07 15:52:00 2010-10-31 14:20:00 8 5633.32 22.26609 0 402 0
12348 2011-09-25 13:13:00 2010-09-27 14:59:00 5 2019.40 39.59608 0 362 0
12349 2011-11-21 09:51:00 2010-04-29 13:20:00 4 4428.69 25.30680 0 570 0
12350 2011-02-02 16:01:00 2011-02-02 16:01:00 1 334.40 19.67059 219 0 1
12351 2010-11-29 15:23:00 2010-11-29 15:23:00 1 300.93 14.33000 284 0 1
# --- Inspect churn distribution --------------------------------------------
churn_summary <- table(churn_label$churn)
knitr::kable(
  as.data.frame(churn_summary),
  col.names = c("Churn Label", "Count"),
  caption = "Distribution of Churn vs. Retained Customers"
)
Distribution of Churn vs. Retained Customers
Churn Label Count
0 2890
1 2989

The next 90 days after the snapshot date is the window we will monitor.

kable(
  head(next_orders),
  caption = "Orders that will be tested"
)
Orders that will be tested
order_id product_id product_description quantity order_date unit_price customer_id country revenue
566225 22384 LUNCH BAG PINK POLKADOT 6 2011-09-11 10:35:00 1.65 16899 United Kingdom 9.90
566225 20725 LUNCH BAG RED RETROSPOT 6 2011-09-11 10:35:00 1.65 16899 United Kingdom 9.90
566225 23169 CLASSIC GLASS COOKIE JAR 3 2011-09-11 10:35:00 4.15 16899 United Kingdom 12.45
566225 84029E RED WOOLLY HOTTIE WHITE HEART. 3 2011-09-11 10:35:00 4.25 16899 United Kingdom 12.75
566225 21479 WHITE SKULL HOT WATER BOTTLE 3 2011-09-11 10:35:00 4.25 16899 United Kingdom 12.75
566225 82484 WOOD BLACK BOARD ANT WHITE FINISH 2 2011-09-11 10:35:00 7.95 16899 United Kingdom 15.90

Define High-Value Customers (Top 20% by Future Spend)

We identify high-value customers as those who belong to the top 20% of total spending in the next 12 months following the snapshot date: 1. Aggregate each customer’s revenue over the next year.
2. Compute their percentile rank (percent_rank).
3. Label top 20% as high_value = 1.

# --- Define future 12-month window ------------------------------------------
future_12mo <- snapshot_date + years(1)

# --- Calculate each customer's spend in the next 12 months ------------------
future_spend <- df_360 %>%
  filter(order_date > snapshot_date & order_date <= future_12mo) %>%
  group_by(customer_id) %>%
  summarise(fut_spend = sum(revenue), .groups = "drop")

# --- Rank customers and label top 20% as high-value -------------------------
hv_label <- future_spend %>%
  mutate(rank = percent_rank(fut_spend)) %>%
  mutate(high_value = if_else(rank >= 0.80, 1, 0)) %>%
  select(customer_id, high_value)

# --- Display a few labeled examples -----------------------------------------
knitr::kable(
  head(hv_label, 10),
  caption = "Example: High-Value Customer Labels (Top 20% by Future Spend)"
)
Example: High-Value Customer Labels (Top 20% by Future Spend)
customer_id high_value
12347 1
12348 0
12349 1
12352 0
12356 0
12357 1
12358 0
12359 1
12360 0
12362 1

Data Splitting for Model Training and Testing

To evaluate the predictive performance of our model, we split the dataset into: - Training set (70%) — used to fit or “teach” the model. - Testing set (30%) — held out to evaluate how well the model generalizes.

We also stratify by the churn variable to preserve the same proportion of churned vs retained customers in both subsets.

data_model <- cust_feat %>%
  left_join(churn_label %>% select(customer_id, churn), by = "customer_id") %>%
  left_join(hv_label, by = "customer_id") %>%
  replace_na(list(high_value = 0)) %>%
  mutate(
    churn = factor(churn, levels = c(0, 1)),
    high_value = factor(high_value, levels = c(0, 1))
  )

# --- Split data into training and testing sets -------------------------------
set.seed(305)  # ensures reproducibility

split <- initial_split(data_model, prop = 0.7, strata = churn)
train <- training(split)
test  <- testing(split)

# --- Display dataset sizes ---------------------------------------------------
split_summary <- tibble(
  Dataset = c("Training", "Testing"),
  Records = c(nrow(train), nrow(test)),
  Percentage = c("70%", "30%")
)

knitr::kable(
  split_summary,
  caption = "Training and Testing Dataset Split Summary"
)
Training and Testing Dataset Split Summary
Dataset Records Percentage
Training 4114 70%
Testing 1765 30%

Model Recipe — Data Preprocessing

Define the preprocessing recipe.
This recipe specifies how features (predictors) should be cleaned and standardized for modeling.

Steps included: 1. Select predictors: recency_days, frequency, monetary, avg_basket, tenure_days 2. Impute missing values: Replace missing numeric values with their median 3. Normalize predictors: Scale and center all numeric predictors to ensure balanced model input

# --- Define preprocessing recipe --------------------------------------------
rec <- recipe(
  churn ~ recency_days + frequency + monetary + avg_basket + tenure_days,
  data = train
) %>%
  step_impute_median(all_numeric_predictors()) %>%   # handle missing numeric values
  step_normalize(all_numeric_predictors())            # scale & center all numeric features

# --- Display recipe summary --------------------------------------------------
knitr::kable(
  summary(rec),
  caption = "Summary of Preprocessing Recipe for Churn Model"
)
Summary of Preprocessing Recipe for Churn Model
variable type role source
recency_days double , numeric predictor original
frequency integer, numeric predictor original
monetary double , numeric predictor original
avg_basket double , numeric predictor original
tenure_days integer, numeric predictor original
churn factor , unordered, nominal outcome original

Model Specification — XGBoost Classifier

Extreme Gradient Boosting (XGBoost) is a tree-based algorithm that I mainly use when I need robust yet reasonably interpretable results. I use it frequently for churn prediction because of its ability to handle nonlinear relationships and interaction effects among features.

Model configuration: - trees = 500 → Builds 500 decision trees (boosting rounds) - learn_rate = 0.05 → Controls how fast the model learns; smaller values reduce overfitting - tree_depth = 5 → Limits how deep each tree grows to balance bias and variance

The model is integrated into a tidymodels workflow, combining preprocessing (recipe) and modeling (boost_tree) steps for cleaner, reproducible pipelines.

library(xgboost)

# --- Define model specification ---------------------------------------------
xgb_spec <- boost_tree(
  trees = 500,
  learn_rate = 0.05,
  tree_depth = 5
) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

# --- Create workflow: combine recipe and model ------------------------------
wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(xgb_spec)

# --- Fit model on training data ---------------------------------------------
fit_xgb <- wf %>% fit(data = train)

# --- Display model summary --------------------------------------------------
xgb_summary <- tibble(
  Model = "XGBoost Classifier",
  Trees = 500,
  Learning_Rate = 0.05,
  Max_Depth = 5,
  Engine = "xgboost",
  Mode = "classification"
)

knitr::kable(
  xgb_summary,
  caption = "XGBoost Model Specification Summary"
)
XGBoost Model Specification Summary
Model Trees Learning_Rate Max_Depth Engine Mode
XGBoost Classifier 500 0.05 5 xgboost classification

Predict & Evaluate

# --- Predict churn probabilities --------------------------------------------

pred <- predict(fit_xgb, new_data = test, type = "prob")$.pred_1

# --- Compute performance metrics --------------------------------------------
# NOTE: yardstick treats the first factor level ("0") as the event by default,
# so the ROC value below is reported for the non-churn class; to score churn
# ("1") as the positive class, pass event_level = "second" (ROC = ~1 - value).

roc  <- roc_auc_vec(test$churn, pred)
pr   <- pr_auc_vec(test$churn, pred)

# --- Display metrics summary ------------------------------------------------

metric_tbl <- tibble(
Metric = c("ROC_AUC", "PR_AUC"),
Value = c(roc, pr)
)

knitr::kable(
metric_tbl,
caption = "Model Evaluation Metrics for Churn Prediction"
)
Model Evaluation Metrics for Churn Prediction
Metric Value
ROC_AUC 0.0003917
PR_AUC 0.3004528
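The methodological outline also calls for a baseline logistic regression for interpretability. It is not fitted in this report, but a minimal tidymodels sketch reusing the same recipe and split would be:

# Sketch: logistic-regression baseline (not part of the fitted results above)
log_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(logistic_reg() %>% set_engine("glm") %>% set_mode("classification"))

fit_log  <- log_wf %>% fit(data = train)
pred_log <- predict(fit_log, new_data = test, type = "prob")$.pred_1

# churn = "1" is the event of interest (the second factor level)
roc_auc_vec(test$churn, pred_log, event_level = "second")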
# --- Segment customers by predicted churn probability -----------------------

test <- test %>%
mutate(churn_prob = pred) %>%
arrange(desc(churn_prob))

top10pct <- test %>%
slice_head(n = floor(0.10 * nrow(test)))

# --- Display top 10% high-churn customers -----------------------------------

knitr::kable(
head(top10pct, 10),
caption = "Top 10 Customers by Predicted Churn Probability"
)
Top 10 Customers by Predicted Churn Probability
customer_id last_date first_date frequency monetary avg_basket recency_days tenure_days churn high_value churn_prob
14325 2010-05-19 09:20:00 2010-05-19 09:20:00 1 245.48 13.63778 479 0 1 0 0.9999882
17766 2010-11-23 14:29:00 2010-11-23 14:17:00 2 299.61 13.61864 290 0 1 0 0.9999882
15851 2010-07-01 12:34:00 2010-07-01 12:34:00 1 109.73 13.71625 436 0 1 0 0.9997503
17260 2010-10-24 13:21:00 2010-10-24 13:21:00 1 120.95 20.15833 320 0 1 0 0.9997238
17045 2011-08-18 17:09:00 2010-10-06 14:52:00 5 827.68 15.61660 22 316 1 0 0.9996808
17874 2011-06-07 12:54:00 2011-03-29 11:25:00 6 683.46 14.85783 94 70 1 0 0.9996808
12561 2011-02-10 11:39:00 2011-02-10 11:39:00 1 238.85 14.92812 212 0 1 0 0.9996714
12710 2011-09-04 12:00:00 2011-05-22 14:05:00 3 1149.02 14.92234 6 104 1 0 0.9996714
12738 2010-12-02 18:27:00 2010-12-02 18:27:00 1 310.70 14.12273 281 0 1 0 0.9996714
12776 2010-02-10 12:11:00 2010-02-10 12:11:00 1 55.50 13.87500 577 0 1 0 0.9996714

Classification Threshold, Confusion Matrices & Top-10% Targeting

We convert probabilities to classes using a decision threshold (default 0.5). Then we compute confusion matrices for train and test to see true/false positives/negatives, plus key metrics (Precision, Recall, F1). We also simulate a budgeted campaign by only targeting the top 10% highest-risk customers.

Pushing the threshold lower (toward 0) favors recall: more customers are flagged as churn risks, which is acceptable if you do not mind running CRM outreach on some customers who would have been retained anyway.

library(caret)
library(dplyr)
library(knitr)

# --- Safety: ensure factor levels are c("0","1") ----------------------------
train <- train %>% mutate(churn = factor(churn, levels = c("0","1")))
test  <- test  %>% mutate(churn  = factor(churn,  levels = c("0","1")))

# --- 1) Probabilities from fitted model -------------------------------------
pred_train_prob <- predict(fit_xgb, new_data = train, type = "prob")$.pred_1
pred_test_prob  <- predict(fit_xgb, new_data = test,  type = "prob")$.pred_1

# --- 2) Choose threshold(s) --------------------------------------------------
thr <- 0.5                                  # main operating point
thr_top10 <- quantile(pred_test_prob, 0.90, na.rm = TRUE)   # top-10% by risk

# --- 3) Class labels at chosen threshold ------------------------------------
train_pred_cls <- factor(ifelse(pred_train_prob >= thr, "1", "0"), levels = c("0","1"))
test_pred_cls  <- factor(ifelse(pred_test_prob  >= thr, "1", "0"), levels = c("0","1"))

scored <- test %>%
  mutate(
    churn_prob = pred_test_prob,
    high_value = factor(high_value, levels = c("0", "1"))
  )

# --- 4) Confusion matrices (standard + PR mode) -----------------------------
cm_train <- caret::confusionMatrix(data = train_pred_cls, reference = train$churn, positive = "1")
cm_test  <- caret::confusionMatrix(data = test_pred_cls,  reference = test$churn,  positive = "1")

cm_train_pr <- caret::confusionMatrix(data = train_pred_cls, reference = train$churn,
                                      positive = "1", mode = "prec_recall")
cm_test_pr  <- caret::confusionMatrix(data = test_pred_cls,  reference = test$churn,
                                      positive = "1", mode = "prec_recall")

# --- 5) Pretty tables: confusion matrices -----------------------------------
kable(as.data.frame(cm_train$table),
      caption = "Confusion Matrix — TRAIN (threshold = 0.5)",
      col.names = c("Predicted", "Actual", "Freq"))
Confusion Matrix — TRAIN (threshold = 0.5)
Predicted Actual Freq
0 0 2022
1 0 0
0 1 2
1 1 2090
kable(as.data.frame(cm_test$table),
      caption = "Confusion Matrix — TEST (threshold = 0.5)",
      col.names = c("Predicted", "Actual", "Freq"))
Confusion Matrix — TEST (threshold = 0.5)
Predicted Actual Freq
0 0 868
1 0 0
0 1 2
1 1 895
# --- 6) Key metrics summary (Accuracy, Precision, Recall, F1) ---------------
metrics_train <- data.frame(
  Dataset   = "Train",
  Accuracy  = cm_train$overall[["Accuracy"]],
  Precision = cm_train_pr$byClass[["Precision"]],
  Recall    = cm_train_pr$byClass[["Recall"]],
  F1        = cm_train_pr$byClass[["F1"]]
)

metrics_test <- data.frame(
  Dataset   = "Test",
  Accuracy  = cm_test$overall[["Accuracy"]],
  Precision = cm_test_pr$byClass[["Precision"]],
  Recall    = cm_test_pr$byClass[["Recall"]],
  F1        = cm_test_pr$byClass[["F1"]]
)

kable(rbind(metrics_train, metrics_test),
      digits = 3,
      caption = "Classification Metrics at Threshold = 0.5")
Classification Metrics at Threshold = 0.5
Dataset Accuracy Precision Recall F1
Train 1.000 1 0.999 1.000
Test 0.999 1 0.998 0.999
# --- 7) Target only Top 10% highest-risk on TEST ----------------------------
test_pred_top10 <- factor(ifelse(pred_test_prob >= thr_top10, "1", "0"), levels = c("0","1"))
cm_test_top10 <- caret::confusionMatrix(data = test_pred_top10, reference = test$churn,
                                        positive = "1", mode = "prec_recall")

kable(as.data.frame(cm_test_top10$table),
      caption = "Confusion Matrix — TEST (Targeting Top 10% Highest Risk)",
      col.names = c("Predicted", "Actual", "Freq"))
Confusion Matrix — TEST (Targeting Top 10% Highest Risk)
Predicted Actual Freq
0 0 868
1 0 0
0 1 639
1 1 258
kable(data.frame(
        Dataset   = "Test (Top 10%)",
        Precision = cm_test_top10$byClass[["Precision"]],
        Recall    = cm_test_top10$byClass[["Recall"]],
        F1        = cm_test_top10$byClass[["F1"]]
      ),
      digits = 3,
      caption = "Precision/Recall/F1 when Targeting Top 10% Highest Risk"
)
Precision/Recall/F1 when Targeting Top 10% Highest Risk
Dataset Precision Recall F1
Test (Top 10%) 1 0.288 0.447

How to Read

  • Confusion Matrix:
    • TP = churners correctly flagged; FP = non-churners you’d contact by mistake.
    • TN = safe non-churners; FN = churners you missed.
  • Precision: Of those you contact, how many are truly churners?
  • Recall: Of all actual churners, how many did we catch?
  • F1: Balance of Precision and Recall (higher is better).
  • Top-10% Targeting: Use when you have a fixed campaign budget—it shows performance if you only act on the riskiest 10%.

A further analysis question: if I target the 10% highest-risk customers who are also high value, I am looking for big spenders we have not heard from in a while; yet the most recent purchasers naturally accumulate more revenue than the riskiest ones. How do we reconcile these two signals?

I will break this down into two competing goals:

Factor Description Desired Direction
Churn Risk (churn_prob) Probability that the customer will churn Higher → more risky
Customer Value (monetary) Potential revenue impact if they churn Higher → more valuable
Recency (recency_days) Time since last engagement Smaller → more recently engaged

So we’re looking for customers with high value, high churn probability, and not recently contacted — these are strategic targets: people you can still save, who are worth saving, and who might churn soon.

Strategies that I will pursue

  • HVHR (High-Value High-Risk): allocate most budget; concierge outreach, premium perks, early-access bundles, flexible returns.

  • Win-back (At-Risk Non-HV): time-limited voucher (10–15%), cart-starter bundles, reduce friction (COD, one-click reorder). Broad reach, lower cost per user.

  • LRHV (Low-Risk High-Value): protect with light nudges—cross-sell via co-purchase, loyalty accelerators; avoid discounts.

  • Target size: start with top 10% churn risk, tune threshold to budget.

Measurement: A/B control; secondary = 60–90‑day repeat rate; refresh model monthly.

library(dplyr)
library(knitr)
library(glue)

# --- Build segments & expected risk revenue ---------------------------------
segments <- scored %>%
  mutate(
    seg = case_when(
      churn_prob >= thr_top10 & high_value == "1" ~ "HVHR",
      churn_prob >= thr_top10 & high_value == "0" ~ "WinBack",
      churn_prob <  thr_top10 & high_value == "1" ~ "LRHV",
      TRUE                                        ~ "Other"
    ),
    exp_risk_revenue = churn_prob * monetary
  )

# --- Segment counts (quick dashboard numbers) --------------------------------
seg_counts <- segments %>%
  count(seg, name = "customers") %>%
  arrange(desc(customers))

kable(seg_counts, caption = "Customer Counts by Segment")
Customer Counts by Segment
seg customers
Other 1343
WinBack 258
LRHV 164
# --- HVHR: prioritize by value-at-risk --------------------------------------
hvhr <- segments %>%
  filter(seg == "HVHR") %>%
  arrange(desc(exp_risk_revenue)) %>%
  select(customer_id, monetary, recency_days, frequency, churn_prob, exp_risk_revenue) %>%
  head(25)

kable(hvhr,
      caption = "HVHR — Top 25 by Expected Risk Revenue",
      col.names = c("Customer", "Monetary", "Recency (days)", "Frequency", "Churn Prob", "Expected Risk Rev"),
      digits = 2)
HVHR — Top 25 by Expected Risk Revenue
Customer Monetary Recency (days) Frequency Churn Prob Expected Risk Rev
# --- WinBack: cheapest to act on, sort by risk -------------------------------
winback <- segments %>%
  filter(seg == "WinBack") %>%
  arrange(desc(churn_prob)) %>%
  select(customer_id, monetary, recency_days, frequency, churn_prob) %>%
  head(50)

kable(winback,
      caption = "WinBack — Top 50 by Churn Probability",
      col.names = c("Customer", "Monetary", "Recency (days)", "Frequency", "Churn Prob"),
      digits = 2)
WinBack — Top 50 by Churn Probability
Customer Monetary Recency (days) Frequency Churn Prob
14325 245.48 479 1 1
17766 299.61 290 2 1
15851 109.73 436 1 1
17260 120.95 320 1 1
17045 827.68 22 5 1
17874 683.46 94 6 1
12561 238.85 212 1 1
12710 1149.02 6 3 1
12738 310.70 281 1 1
12776 55.50 577 1 1
12799 219.35 438 1 1
12892 263.38 306 1 1
12922 405.38 70 1 1
12941 377.49 633 1 1
12972 1161.20 323 4 1
13161 478.88 222 2 1
13304 1217.99 260 4 1
13385 534.57 239 1 1
13837 271.24 121 2 1
13859 515.86 235 2 1
13981 331.14 325 1 1
14011 457.82 389 2 1
14026 713.11 380 2 1
14187 123.50 383 1 1
14347 82.60 646 1 1
14353 376.50 124 3 1
14762 417.57 173 2 1
14772 139.26 21 1 1
14773 140.55 502 1 1
14900 13.92 431 1 1
14991 314.67 339 1 1
15047 343.62 179 1 1
15064 1254.41 319 4 1
15082 93.35 578 1 1
15104 1027.44 106 2 1
15135 305.68 148 2 1
15139 178.96 149 2 1
15174 481.46 232 2 1
15209 178.98 639 1 1
15323 434.60 472 3 1
15362 613.08 358 2 1
15554 217.20 187 1 1
15560 153.52 307 2 1
15585 455.30 86 1 1
15828 1108.82 284 1 1
15979 329.22 512 1 1
16137 422.70 589 1 1
16297 301.90 178 1 1
16346 224.51 294 1 1
16465 746.24 330 2 1
# --- LRHV: protect valuable base, sort by value ------------------------------
lrhv <- segments %>%
  filter(seg == "LRHV") %>%
  arrange(desc(monetary)) %>%
  select(customer_id, monetary, recency_days, frequency, churn_prob) %>%
  head(50)

kable(lrhv,
      caption = "LRHV — Top 50 by Monetary Value",
      col.names = c("Customer", "Monetary", "Recency (days)", "Frequency", "Churn Prob"),
      digits = 2)
LRHV — Top 50 by Monetary Value
Customer Monetary Recency (days) Frequency Churn Prob
13694 196482.81 0 143 0
17511 175603.55 0 60 0
16029 122209.14 0 107 0
15311 116771.16 0 208 0
13798 75836.87 0 110 0
15838 74163.00 0 34 0
14088 64036.80 0 16 0
13081 59205.15 0 28 0
16013 57446.44 0 84 0
17404 47781.12 0 23 0
16705 43515.05 0 42 0
14031 40999.17 0 44 0
17381 40002.37 0 38 0
12471 39963.79 0 79 0
16210 38489.51 0 35 0
17675 38259.60 0 70 0
15856 34220.34 0 52 0
14258 32051.34 0 25 0
14527 27792.30 0 122 0
17865 26374.39 0 46 0
18251 26278.86 0 9 0
12731 25775.71 0 17 0
14062 25604.20 0 24 0
12939 25264.80 0 20 0
14895 25119.54 0 38 0
17920 24516.46 0 54 0
14607 24137.63 0 22 0
12682 24033.91 0 52 0
16133 22992.26 0 56 0
14051 22434.00 0 37 0
14667 21927.14 0 54 0
12753 21429.39 0 6 0
15615 21373.97 0 47 0
17139 19584.46 0 25 0
17581 18757.75 0 43 0
17377 18319.41 0 78 0
12472 17880.59 0 19 0
12901 17654.54 0 28 0
14849 15763.45 0 43 0
16191 15688.02 0 24 0
17133 14622.33 0 12 0
15482 14592.56 0 19 0
16713 14560.69 0 32 0
12476 14115.41 0 23 0
13178 14059.19 0 21 0
12435 13925.93 0 6 0
13097 13389.22 0 28 0
17858 12797.65 0 30 0
12437 12683.40 0 39 0
13418 12233.60 0 31 0
# --- Tiny summary blurb for the report --------------------------------------
glue("
**Operational note:** Target HVHR with concierge outreach & tailored offers,
WinBack with low-cost reactivation, LRHV with loyalty & cross-sell. 
Size segments based on budget (start with top 10% risk threshold).")
## **Operational note:** Target HVHR with concierge outreach & tailored offers,
## WinBack with low-cost reactivation, LRHV with loyalty & cross-sell. 
## Size segments based on budget (start with top 10% risk threshold).