The table below shows the first rows of the data.
df_raw <- read_csv(file = "retail_transaction_data.csv", col_names = TRUE)
kable(head(df_raw))
| order_id | product_id | product_description | quantity | order_date | unit_price | customer_id | country |
|---|---|---|---|---|---|---|---|
| 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085 | United Kingdom |
| 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom |
| 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom |
| 489434 | 22041 | RECORD FRAME 7” SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.10 | 13085 | United Kingdom |
| 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085 | United Kingdom |
| 489434 | 22064 | PINK DOUGHNUT TRINKET POT | 24 | 2009-12-01 07:45:00 | 1.65 | 13085 | United Kingdom |
Check the dimensions and column metadata:
str(df_raw)
## spc_tbl_ [1,067,371 × 8] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ order_id : chr [1:1067371] "489434" "489434" "489434" "489434" ...
## $ product_id : chr [1:1067371] "85048" "79323P" "79323W" "22041" ...
## $ product_description: chr [1:1067371] "15CM CHRISTMAS GLASS BALL 20 LIGHTS" "PINK CHERRY LIGHTS" "WHITE CHERRY LIGHTS" "RECORD FRAME 7\" SINGLE SIZE" ...
## $ quantity : num [1:1067371] 12 12 12 48 24 24 24 10 12 12 ...
## $ order_date : POSIXct[1:1067371], format: "2009-12-01 07:45:00" "2009-12-01 07:45:00" ...
## $ unit_price : num [1:1067371] 6.95 6.75 6.75 2.1 1.25 1.65 1.25 5.95 2.55 3.75 ...
## $ customer_id : num [1:1067371] 13085 13085 13085 13085 13085 ...
## $ country : chr [1:1067371] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## - attr(*, "spec")=
## .. cols(
## .. order_id = col_character(),
## .. product_id = col_character(),
## .. product_description = col_character(),
## .. quantity = col_double(),
## .. order_date = col_datetime(format = ""),
## .. unit_price = col_double(),
## .. customer_id = col_double(),
## .. country = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Basic counts
n_product <- length(unique(df_raw$product_id))
n_customer <- length(unique(df_raw$customer_id))
n_country <- length(unique(df_raw$country))
# Print with clear formatting
kable(
tibble(
Metric = c("Unique Products", "Unique Customers", "Unique Countries"),
Count = c(n_product, n_customer, n_country)
),
caption = "Summary of Unique Entities in Retail Transactions",
align = "lc",
col.names = c("Metric", "Count")
)
| Metric | Count |
|---|---|
| Unique Products | 5304 |
| Unique Customers | 5943 |
| Unique Countries | 43 |
## **Transaction Period:** from 2009-12-01 07:45:00 to 2011-12-09 12:50:00
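The code that produced the transaction period and the per-country transaction counts below is not shown; a minimal sketch of how they could be computed from df_raw (output formatting assumed):

```r
# Transaction period: earliest and latest order timestamps
range(df_raw$order_date)

# Transactions per country, sorted descending
country_counts <- df_raw %>%
  count(country, sort = TRUE, name = "n_transactions")

kable(country_counts, col.names = c("Country", "Number of Transactions"))
```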
| Country | Number of Transactions |
|---|---|
| United Kingdom | 981330 |
| EIRE | 17866 |
| Germany | 17624 |
| France | 14330 |
| Netherlands | 5140 |
| Spain | 3811 |
| Switzerland | 3189 |
| Belgium | 3123 |
| Portugal | 2620 |
| Australia | 1913 |
| Channel Islands | 1664 |
| Italy | 1534 |
| Norway | 1455 |
| Sweden | 1364 |
| Cyprus | 1176 |
| Finland | 1049 |
| Austria | 938 |
| Denmark | 817 |
| Unspecified | 756 |
| Greece | 663 |
| Japan | 582 |
| Poland | 535 |
| USA | 535 |
| United Arab Emirates | 500 |
| Israel | 371 |
| Hong Kong | 364 |
| Singapore | 346 |
| Malta | 299 |
| Iceland | 253 |
| Canada | 228 |
| Lithuania | 189 |
| RSA | 169 |
| Bahrain | 126 |
| Brazil | 94 |
| Thailand | 76 |
| Korea | 63 |
| European Community | 61 |
| Lebanon | 58 |
| West Indies | 54 |
| Bermuda | 34 |
| Nigeria | 32 |
| Czech Republic | 30 |
| Saudi Arabia | 10 |
| order_id | product_id | quantity | unit_price | revenue | is_cancelled |
|---|---|---|---|---|---|
| 489434 | 85048 | 12 | 6.95 | 83.4 | FALSE |
| 489434 | 79323P | 12 | 6.75 | 81.0 | FALSE |
| 489434 | 79323W | 12 | 6.75 | 81.0 | FALSE |
| 489434 | 22041 | 48 | 2.10 | 100.8 | FALSE |
| 489434 | 21232 | 24 | 1.25 | 30.0 | FALSE |
| 489434 | 22064 | 24 | 1.65 | 39.6 | FALSE |
| 489434 | 21871 | 24 | 1.25 | 30.0 | FALSE |
| 489434 | 21523 | 10 | 5.95 | 59.5 | FALSE |
| 489435 | 22350 | 12 | 2.55 | 30.6 | FALSE |
| 489435 | 22349 | 12 | 3.75 | 45.0 | FALSE |
# 🧩 Data Cleaning Pipeline
df <- df_raw %>%
drop_na(customer_id) %>%
mutate(
is_cancelled = grepl('^C', as.character(order_id)),
revenue = quantity * unit_price
) %>%
filter(!is_cancelled, revenue > 0)
# ✅ Summary after cleaning
n_before <- nrow(df_raw)
n_after <- nrow(df)
pct_kept <- round((n_after / n_before) * 100, 2)
glue("
**Rows before cleaning:** {n_before}
**Rows after cleaning:** {n_after}
**Data retained:** {pct_kept}% of original transactions
")
## **Rows before cleaning:** 1067371
## **Rows after cleaning:** 805549
## **Data retained:** 75.47% of original transactions
# 🪞 Peek at cleaned data
clean_sample <- df %>%
select(order_id, customer_id, product_id, quantity, unit_price, revenue, country) %>%
head(10)
kable(
clean_sample,
caption = "Sample of Cleaned Transaction Data (Valid Orders Only)",
align = "c"
)
| order_id | customer_id | product_id | quantity | unit_price | revenue | country |
|---|---|---|---|---|---|---|
| 489434 | 13085 | 85048 | 12 | 6.95 | 83.4 | United Kingdom |
| 489434 | 13085 | 79323P | 12 | 6.75 | 81.0 | United Kingdom |
| 489434 | 13085 | 79323W | 12 | 6.75 | 81.0 | United Kingdom |
| 489434 | 13085 | 22041 | 48 | 2.10 | 100.8 | United Kingdom |
| 489434 | 13085 | 21232 | 24 | 1.25 | 30.0 | United Kingdom |
| 489434 | 13085 | 22064 | 24 | 1.65 | 39.6 | United Kingdom |
| 489434 | 13085 | 21871 | 24 | 1.25 | 30.0 | United Kingdom |
| 489434 | 13085 | 21523 | 10 | 5.95 | 59.5 | United Kingdom |
| 489435 | 13085 | 22350 | 12 | 2.55 | 30.6 | United Kingdom |
| 489435 | 13085 | 22349 | 12 | 3.75 | 45.0 | United Kingdom |
# 🧾 Aggregate daily transactions into weekly revenue
weekly_sales <- df %>%
mutate(week = floor_date(order_date, unit = "week", week_start = 1)) %>%
group_by(week) %>%
summarise(revenue = sum(revenue, na.rm = TRUE), .groups = "drop") %>%
complete(
week = seq(min(week), max(week), by = "1 week"),
fill = list(revenue = 0)
) %>%
arrange(week)
# 📊 Summary info
glue("
**Weekly data coverage:** {min(weekly_sales$week)} to {max(weekly_sales$week)}
**Total weeks:** {nrow(weekly_sales)}
**Average weekly revenue:** {round(mean(weekly_sales$revenue, na.rm = TRUE), 2)}
**Maximum weekly revenue:** {round(max(weekly_sales$revenue, na.rm = TRUE), 2)}
")
## **Weekly data coverage:** 2009-11-30 to 2011-12-05
## **Total weeks:** 106
## **Average weekly revenue:** 167390.84
## **Maximum weekly revenue:** 412154.51
# 🪞 Preview
kable(
head(weekly_sales, 10),
caption = "Weekly Aggregated Revenue (First 10 Weeks)",
col.names = c("Week Starting", "Total Revenue"),
align = "lc"
)
| Week Starting | Total Revenue |
|---|---|
| 2009-11-30 | 232917.9 |
| 2009-12-07 | 209021.3 |
| 2009-12-14 | 208936.8 |
| 2009-12-21 | 35778.2 |
| 2009-12-28 | 0.0 |
| 2010-01-04 | 167928.5 |
| 2010-01-11 | 125897.8 |
| 2010-01-18 | 119968.8 |
| 2010-01-25 | 143524.0 |
| 2010-02-01 | 112879.7 |
Weekly aggregation is chosen to balance granularity and stability. Daily data is often noisy due to holidays, weekends, and random short events, while monthly aggregation would make the dataset too short and could hide short-term variation, as shown below. Weekly frequency captures seasonal effects such as holiday spikes and marketing cycles, and aligns with common business reporting (52 weeks/year).
knitr::include_graphics("assets/daily_data_inconsistency.png")
knitr::include_graphics("assets/daily_data_inconsistency_2.png")
Code explanation: build the time series object from the weekly revenue (start = week 48 of 2009, seasonal frequency = 52).
sales_ts <- ts(data = weekly_sales$revenue,
start = c(2009,48),
frequency = 52)
This is the data plotted by week.
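The plotting code is not shown; a minimal sketch using forecast::autoplot (the axis labels are my own):

```r
# Plot the weekly revenue series
library(forecast)
library(ggplot2)

autoplot(sales_ts) +
  labs(title = "Weekly Revenue", x = "Year", y = "Revenue")
```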
Decomposition is a step in time series analysis used to break the series down into its components.
Components of a time series:
- Trend: the general direction of the data, tending upward or downward. If the trend still shows a (fluctuating) pattern, some structure has not yet been captured well.
- Seasonal: a recurring pattern that repeats over a fixed period.
- Residual/Error: whatever the trend and seasonal components cannot capture.
Before fitting a forecasting model we should inspect the time series object produced by decompose. The main idea of decomposition is to separate the three components of the ts object (trend, seasonal, residual).
If the decomposed trend still forms a pattern, seasonality has not been fully captured; with less than two years of data this is common, as the series often cannot reveal the full seasonality. We note this as an adjustment to keep in mind during modeling.
sales_ts %>% decompose(type = "additive") %>% autoplot()  # type must be a single value; additive decomposition is used here
## freq = 52 length = 106 seasons_in_train = 2
After reviewing the descriptive properties of the data, namely the short seasonal history and the use of weekly rather than monthly periodization, I will keep the modeling simple. Going straight to deep learning methods would sacrifice interpretability, both for the reader and for me, and with limited time it is best to start with a baseline model and improve it with inventory considerations for a more actionable analysis, rather than chase a more elaborate model.
Keep in mind that I will give some pointers for improvements ahead.
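The objects used in the modeling code below (train_ts, test_ts, test_n, and later full_ts) are not defined in the shown chunks; a minimal sketch of a chronological hold-out split, where the 12-week test window is an assumption rather than the original choice:

```r
# Chronological train/test split on the weekly series
full_ts <- sales_ts                 # full weekly series (106 weeks)
n       <- length(full_ts)
test_n  <- 12                       # assumed hold-out length; original value not shown

train_ts <- window(full_ts, end   = time(full_ts)[n - test_n])
test_ts  <- window(full_ts, start = time(full_ts)[n - test_n + 1])
```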
fc_naive <- naive(train_ts, h = test_n)
fc_snaive <- snaive(train_ts, h = test_n)
fit_arima <- auto.arima(train_ts, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
fc_arima <- forecast(fit_arima, h = test_n)
The Seasonal Naïve (snaive) model serves as the benchmark by assuming each week's sales repeat the pattern of the same week in the previous year. It's transparent, simple, and provides a clear baseline for measuring improvements.
ARIMA + Fourier (fallback for < 2 seasons): When the training window contains less than two full seasonal cycles (e.g., < 104 weeks for weekly data), stl() inside stlf() will fail. Fourier terms provide a compact, smooth representation of seasonality without needing full cycles in-sample. We model the deterministic seasonal shape with fourier() as external regressors (xreg), while ARIMA captures autocorrelation and short‑term shocks. This makes ARIMA+Fourier suitable for long seasonality with limited history.
The fourier() function adds pairs of sine and cosine waves that approximate seasonal patterns. Each pair (controlled by K) captures one "harmonic" of the seasonal cycle.
- Increase K gradually (e.g., from 3 → 6 → 10 → 13).
- Set seasonal = FALSE in auto.arima() because the Fourier terms replace seasonal differencing.
- Watch AICc and residual plots — stop increasing K when improvement flattens.
The value of K indicates how strongly the model captures the seasonal pattern. To make the model more seasonal:
- increase K little by little,
- monitor AICc or validation RMSE, and
- pick the K that gives the best result without overfitting.
aic_table <- data.frame(K = integer(), AICc = numeric())
# Frequency = 52 (weekly)
freq <- 52
max_K <- floor(freq / 2) # theoretical maximum
for (k in 1:max_K) {
xreg_train <- fourier(train_ts, K = k)
fit <- auto.arima(train_ts, xreg = xreg_train, seasonal = FALSE)
aic_table <- rbind(aic_table, data.frame(K = k, AICc = fit$aicc))
}
library(ggplot2)
ggplot(aic_table, aes(x = K, y = AICc)) +
geom_line() + geom_point() +
labs(title = "Fourier Term Tuning", y = "AICc (lower is better)")
We will use K = 7.
K <- min(7, floor(freq/2)) # tune K if needed
xreg_train <- fourier(train_ts, K = K)
xreg_test <- fourier(train_ts, K = K, h = test_n)
fit_arima2 <- auto.arima(train_ts, xreg = xreg_train, seasonal = FALSE)
aic_table <- rbind(aic_table, data.frame(K = K, AICc = fit_arima2$aicc))
fc_arima2 <- forecast(fit_arima2, xreg = xreg_test, h = test_n)
pred <- fc_arima2$mean
## # A tibble: 4 × 3
## model MAE WAPE
## <chr> <dbl> <dbl>
## 1 naive 61761. 0.231
## 2 snaive 53975. 0.202
## 3 arima 106564. 0.398
## 4 arima_fourier 55399. 0.207
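The code that produced the MAE/WAPE table above is not shown; a minimal sketch, assuming test_ts holds the held-out actuals and the forecast objects from the previous chunks are in scope:

```r
# Score each forecast against the held-out actuals
actual <- as.numeric(test_ts)

score <- function(fc_mean) {
  err <- actual - as.numeric(fc_mean)
  tibble(MAE = mean(abs(err)), WAPE = sum(abs(err)) / sum(abs(actual)))
}

bind_rows(
  naive         = score(fc_naive$mean),
  snaive        = score(fc_snaive$mean),
  arima         = score(fc_arima$mean),
  arima_fourier = score(fc_arima2$mean),
  .id = "model"
)
```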
train_ts %>%
autoplot() +
autolayer(fc_snaive$fitted, series = "fitted (Seasonal Naive)") +
autolayer(fc_arima2$fitted, series = "fitted (ARIMA Fourier)")
autoplot(fc_arima2) +
autolayer(test_ts, series = "Actual") +
autolayer(fc_snaive$mean, series = "Seasonal Naive") +
labs(title = "Forecast vs Actual (ARIMA Fourier)",
x = "Week",
y = "Revenue") +
theme_minimal()
# ---- Fourier-Enhanced ARIMA Forecast ----
# Step 1: Determine if there are enough full seasonal cycles in training data
seasons_in_train <- floor(length(train_ts) / freq)
use_stlf <- seasons_in_train >= 2
# Step 2: Define maximum Fourier terms (K)
K_max <- floor(freq / 2) # theoretical upper limit
K_by_len <- max(1, floor(length(full_ts) / 20)) # adjust for data length
K <- max(1, min(6, K_max, K_by_len)) # practical upper bound (prevent overfitting)
# Step 3: Create Fourier terms as external regressors
xreg_full <- fourier(full_ts, K = K)
# Step 4: Fit ARIMA model with Fourier terms
fit_full <- auto.arima(
full_ts,
xreg = xreg_full,
seasonal = FALSE
)
# Step 5: Forecast using future Fourier terms (26 periods ahead)
final_fc <- forecast(
fit_full,
xreg = fourier(full_ts, K = K, h = 26),
h = 26
)
# Step 6: Display summary and plot
summary(final_fc)
##
## Forecast method: Regression with ARIMA(0,0,1) errors
##
## Model Information:
## Series: full_ts
## Regression with ARIMA(0,0,1) errors
##
## Coefficients:
## ma1 intercept S1-52 C1-52 S2-52 C2-52
## 0.3442 166288.522 -29029.263 44612.699 -31452.465 31635.879
## s.e. 0.0959 6006.043 8557.342 8406.057 8504.932 8387.534
## S3-52 C3-52 S4-52 C4-52 S5-52 C5-52
## -17369.463 13635.713 318.1864 12872.452 12556.594 10750.335
## s.e. 8422.552 8352.472 8315.6181 8296.043 8190.226 8212.199
##
## sigma^2 = 2.386e+09: log likelihood = -1288.54
## AIC=2603.07 AICc=2607.03 BIC=2637.7
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 52.21969 46003.3 33000.6 -Inf Inf 0.8688257 -0.01493591
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 3.038462 262007.68 199401.96 324613.4 166260.486 357754.9
## 3.057692 164800.37 98590.02 231010.7 63540.375 266060.4
## 3.076923 129612.65 63402.30 195823.0 28352.657 230872.6
## 3.096154 103204.49 36994.14 169414.8 1944.492 204464.5
## 3.115385 89329.33 23118.98 155539.7 -11930.664 190589.3
## 3.134615 88702.34 22491.99 154912.7 -12557.657 189962.3
## 3.153846 98886.11 32675.76 165096.5 -2373.888 200146.1
## 3.173077 115095.80 48885.45 181306.1 13835.802 216355.8
## 3.192308 131664.42 65454.06 197874.8 30404.421 232924.4
## 3.211538 143701.04 77490.69 209911.4 42441.043 244961.0
## 3.230769 148421.89 82211.54 214632.2 47161.894 249681.9
## 3.250000 145752.79 79542.43 211963.1 44492.791 247012.8
## 3.269231 138049.31 71838.96 204259.7 36789.314 239309.3
## 3.288462 129073.20 62862.85 195283.6 27813.206 230333.2
## 3.307692 122600.77 56390.41 188811.1 21340.769 223860.8
## 3.326923 121143.01 54932.66 187353.4 19883.014 222403.0
## 3.346154 125196.33 58985.98 191406.7 23936.332 226456.3
## 3.365385 133242.14 67031.79 199452.5 31982.145 234502.1
## 3.384615 142447.11 76236.76 208657.5 41187.117 243707.1
## 3.403846 149776.20 83565.85 215986.6 48516.207 251036.2
## 3.423077 153101.10 86890.75 219311.5 51841.103 254361.1
## 3.442308 151908.36 85698.01 218118.7 50648.363 253168.4
## 3.461538 147372.49 81162.14 213582.8 46112.497 248632.5
## 3.480769 141798.11 75587.75 208008.5 40538.110 243058.1
## 3.500000 137664.95 71454.60 203875.3 36404.957 238924.9
## 3.519231 136647.95 70437.60 202858.3 35387.953 237907.9
autoplot(final_fc) + ggtitle("Fourier-Enhanced ARIMA Forecast") +
theme_minimal()
What the metrics mean:
- MA(1): a moving-average term that captures short-term randomness (a small noise correction). This is adequate here since we mainly want to understand the ups and downs.
- Intercept: the base (average) level of the model, roughly the mean weekly revenue.
- Fourier terms (S, C): seasonal harmonics that represent the cyclical patterns (e.g., weekly/annual).
- AIC / BIC: model quality indicators; we want the lowest value without overfitting.
- RMSE / MAE: error magnitude; the average forecast miss is roughly 33–46K revenue units, a small error relative to weekly revenue.
- MPE / MAPE: not relevant because we have zero-sales weeks in the series.
- MASE = 0.87: relative to a seasonal naïve benchmark, the model is about 13% better. ✅
- ACF1 ≈ 0: residual correlation is essentially random, a good sign of a well-fit model.
From the Fourier-Enhanced ARIMA model, we can infer how well our time series captures complex seasonal structures beyond standard ARIMA's built-in seasonality. A good fit shows:
- a smooth continuation of recurring patterns consistent with past cycles;
- clearer peaks and troughs aligned with observed seasonal behavior;
- reduced residual autocorrelation (confirmable via a Ljung–Box test, see the sketch after this list), meaning the model captured most of the repeating structure.
A well-tuned K helps detect periodic spikes in demand, sales, or transactions. Increasing K improves fit but can lead to overfitting, so the optimal K balances accuracy (low AICc / RMSE) and stability over future periods. These forecasts feed into:
- inventory planning (reducing overstock in low weeks), and
- budget allocation (timing promos to high-season peaks).
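A minimal sketch of the residual check referenced above: forecast::checkresiduals() plots the residual diagnostics and runs a Ljung–Box test on the fitted model.

```r
# Residual diagnostics + Ljung–Box test for the full-data Fourier-ARIMA fit
library(forecast)
checkresiduals(fit_full)
```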
Convert the model output to a clean forecast table with dates, point forecasts, and 80% intervals; display a readable preview and a quick summary for executives. We will then translate the weekly total forecasts into actionable SKU plans using historical mix shares (simple and seasonality-aware), convert revenue → units (via median price), and add a safety stock overlay.
This bridges top-down demand to bottom-up inventory and promotion targeting.
# Build nicely formatted forecast table (26 weeks ahead)
forecast_tbl <- tibble(
date = seq(max(weekly_sales$week) + weeks(1), by = "1 week", length.out = 26),
forecast = as.numeric(final_fc$mean),
lo80 = as.numeric(final_fc$lower[,"80%"]),
hi80 = as.numeric(final_fc$upper[,"80%"])
)
# Executive summary
glue("
**Forecast horizon:** {min(forecast_tbl$date)} → {max(forecast_tbl$date)}
**Total forecast revenue (26w):** {round(sum(forecast_tbl$forecast), 2)}
**Avg weekly forecast:** {round(mean(forecast_tbl$forecast), 2)}
**Peak week (point forecast):** {forecast_tbl$date[which.max(forecast_tbl$forecast)]}
")
## **Forecast horizon:** 2011-12-12 → 2012-06-04
## **Total forecast revenue (26w):** 3551199.93
## **Avg weekly forecast:** 136584.61
## **Peak week (point forecast):** 2011-12-12
# Preview first 12 rows
kable(
head(forecast_tbl, 12),
caption = "Weekly Forecast (First 12 Weeks, with 80% Intervals)",
col.names = c("Week", "Forecast", "Lo80", "Hi80"),
align = "lrrr",
digits = 2
)
| Week | Forecast | Lo80 | Hi80 |
|---|---|---|---|
| 2011-12-12 | 262007.68 | 199401.96 | 324613.4 |
| 2011-12-19 | 164800.37 | 98590.02 | 231010.7 |
| 2011-12-26 | 129612.65 | 63402.30 | 195823.0 |
| 2012-01-02 | 103204.49 | 36994.14 | 169414.8 |
| 2012-01-09 | 89329.33 | 23118.98 | 155539.7 |
| 2012-01-16 | 88702.34 | 22491.99 | 154912.7 |
| 2012-01-23 | 98886.11 | 32675.76 | 165096.5 |
| 2012-01-30 | 115095.80 | 48885.45 | 181306.1 |
| 2012-02-06 | 131664.42 | 65454.06 | 197874.8 |
| 2012-02-13 | 143701.04 | 77490.69 | 209911.4 |
| 2012-02-20 | 148421.89 | 82211.54 | 214632.2 |
| 2012-02-27 | 145752.79 | 79542.43 | 211963.1 |
# 0) Lookback window for mix shares (52 weeks standard)
lookback_weeks <- 52
cutoff_date <- max(df$order_date, na.rm = TRUE) - weeks(lookback_weeks)
recent <- df %>% filter(order_date > cutoff_date)
# 1) Top-10 SKUs by recent revenue
top10 <- recent %>%
group_by(product_id) %>%
summarise(revenue_52w = sum(revenue), .groups = "drop") %>%
arrange(desc(revenue_52w)) %>%
slice_head(n = 10)
# Optional: descriptions (adapt column name if yours differs)
sku_meta <- df %>%
filter(product_id %in% top10$product_id) %>%
group_by(product_id) %>%
summarise(product_description = dplyr::first(product_description[!is.na(product_description)]),
.groups = "drop")
# 2A) SIMPLE mix (single share across last 52 weeks)
mix_simple <- recent %>%
filter(product_id %in% top10$product_id) %>%
group_by(product_id) %>%
summarise(rev_sku = sum(revenue), .groups = "drop") %>%
mutate(share = rev_sku / sum(rev_sku)) %>%
left_join(sku_meta, by = "product_id")
# 2B) SEASONAL mix (share by week-of-year)
mix_seasonal <- recent %>%
filter(product_id %in% top10$product_id) %>%
mutate(woy = isoweek(order_date)) %>%
group_by(woy) %>%
mutate(total_woy = sum(revenue)) %>%
ungroup() %>%
group_by(product_id, woy) %>%
summarise(rev_sku_woy = sum(revenue),
total_woy = first(total_woy),
.groups = "drop") %>%
mutate(share_woy = ifelse(total_woy > 0, rev_sku_woy / total_woy, 0)) %>%
left_join(sku_meta, by = "product_id")
# 3) Price proxy → median unit price (robust to outliers)
sku_price <- recent %>%
filter(product_id %in% top10$product_id) %>%
group_by(product_id) %>%
summarise(median_price = median(unit_price, na.rm = TRUE), .groups = "drop")
# 4) Allocation A: SIMPLE mix
alloc_simple <- tidyr::crossing(
date = forecast_tbl$date,
product_id = top10$product_id
) %>%
left_join(forecast_tbl %>% select(date, forecast), by = "date") %>%
left_join(mix_simple %>% select(product_id, share), by = "product_id") %>%
left_join(sku_price, by = "product_id") %>%
mutate(
sku_revenue_fc = forecast * share,
units_fc = ifelse(median_price > 0, sku_revenue_fc / median_price, NA_real_)
) %>%
left_join(sku_meta, by = "product_id") %>%
arrange(date, desc(sku_revenue_fc))
# 4) Allocation B: SEASONAL mix (by week-of-year of the forecasted date)
alloc_seasonal <- tidyr::crossing(
date = forecast_tbl$date,
product_id = top10$product_id
) %>%
mutate(woy = isoweek(date)) %>%
left_join(forecast_tbl %>% select(date, forecast), by = "date") %>%
left_join(mix_seasonal %>% select(product_id, woy, share_woy), by = c("product_id", "woy")) %>%
mutate(share_woy = replace_na(share_woy, 0)) %>%
left_join(sku_price, by = "product_id") %>%
mutate(
sku_revenue_fc = forecast * share_woy,
units_fc = ifelse(median_price > 0, sku_revenue_fc / median_price, NA_real_)
) %>%
left_join(sku_meta, by = "product_id") %>%
arrange(date, desc(sku_revenue_fc))
# 5) Stocking summaries
weekly_stock_plan <- alloc_seasonal %>%
select(date, product_id, product_description, sku_revenue_fc, units_fc) %>%
arrange(date, desc(sku_revenue_fc))
horizon_stock_plan <- alloc_seasonal %>%
group_by(product_id, product_description) %>%
summarise(
total_rev_fc = sum(sku_revenue_fc, na.rm = TRUE),
total_units_fc = sum(units_fc, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(total_rev_fc))
# Preview knitr tables
glue("
**Top-10 SKU coverage window:** last {lookback_weeks} weeks
**Weekly plan rows:** {nrow(weekly_stock_plan)}
")
## **Top-10 SKU coverage window:** last 52 weeks
## **Weekly plan rows:** 260
kable(head(weekly_stock_plan, 12),
caption = "Weekly Stock Plan — Top-10 SKUs (Seasonality-Aware Mix, Preview)",
col.names = c("Week", "Product ID", "Description", "Rev Forecast", "Units"),
digits = 2)
| Week | Product ID | Description | Rev Forecast | Units |
|---|---|---|---|---|
| 2011-12-12 | 22423 | REGENCY CAKESTAND 3 TIER | 99004.48 | 7765.06 |
| 2011-12-12 | 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 88652.71 | 30051.77 |
| 2011-12-12 | POST | POSTAGE | 31373.16 | 1742.95 |
| 2011-12-12 | 84879 | ASSORTED COLOUR BIRD ORNAMENT | 22048.43 | 13046.41 |
| 2011-12-12 | 85099B | JUMBO BAG RED WHITE SPOTTY | 17212.78 | 8275.38 |
| 2011-12-12 | 47566 | PARTY BUNTING | 2561.26 | 517.43 |
| 2011-12-12 | M | Manual | 1154.86 | 769.91 |
| 2011-12-12 | 23084 | RABBIT NIGHT LIGHT | 0.00 | 0.00 |
| 2011-12-12 | 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 0.00 | 0.00 |
| 2011-12-12 | 23843 | PAPER CRAFT , LITTLE BIRDIE | 0.00 | 0.00 |
| 2011-12-19 | 22423 | REGENCY CAKESTAND 3 TIER | 66317.59 | 5201.38 |
| 2011-12-19 | POST | POSTAGE | 36956.57 | 2053.14 |
kable(head(horizon_stock_plan, 10),
caption = "26-Week Horizon Totals — Top-10 SKUs",
col.names = c("Product ID", "Description", "Revenue (26w)", "Units (26w)"),
digits = 2)
| Product ID | Description | Revenue (26w) | Units (26w) |
|---|---|---|---|
| 22423 | REGENCY CAKESTAND 3 TIER | 959959.70 | 75290.96 |
| 85123A | WHITE HANGING HEART T-LIGHT HOLDER | 629996.75 | 213558.22 |
| 47566 | PARTY BUNTING | 425016.60 | 85861.94 |
| POST | POSTAGE | 419113.67 | 23284.09 |
| 85099B | JUMBO BAG RED WHITE SPOTTY | 410203.57 | 197213.25 |
| 84879 | ASSORTED COLOUR BIRD ORNAMENT | 251802.71 | 148995.68 |
| M | Manual | 201830.57 | 134553.71 |
| 23166 | MEDIUM CERAMIC TOP STORAGE JAR | 95153.13 | 76122.50 |
| 23084 | RABBIT NIGHT LIGHT | 28510.59 | 13707.01 |
| 23843 | PAPER CRAFT , LITTLE BIRDIE | 0.00 | 0.00 |
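The safety stock overlay mentioned earlier is not computed in the shown code; a minimal sketch, where the 95% service level and the two-week lead time are assumptions, based on weekly unit-sales variability per SKU over the same 52-week lookback:

```r
# Safety stock ≈ z * sd(weekly demand in units) * sqrt(lead time in weeks)
z_service       <- qnorm(0.95)   # assumed 95% service level
lead_time_weeks <- 2             # assumed replenishment lead time

safety_stock <- recent %>%
  filter(product_id %in% top10$product_id) %>%
  mutate(week = floor_date(order_date, unit = "week", week_start = 1)) %>%
  group_by(product_id, week) %>%
  summarise(units = sum(quantity), .groups = "drop") %>%
  group_by(product_id) %>%
  summarise(sd_weekly_units = sd(units), .groups = "drop") %>%
  mutate(safety_stock_units = ceiling(z_service * sd_weekly_units * sqrt(lead_time_weeks))) %>%
  left_join(sku_meta, by = "product_id")

kable(safety_stock,
      caption = "Safety Stock Overlay — Top-10 SKUs (Sketch, Assumed Parameters)",
      digits = 0)
```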
The Sales Director wants to launch targeted campaigns to retain high-value customers and rekindle dormant ones. We need to build models to identify (a) customers likely to provide high future value, and (b) customers at high risk of churn. Additionally, propose pilot interventions and optionally product-recommendation logic for personalized offers.
Approach:
Use historical transaction data to engineer features: RFM (Recency, Frequency, Monetary) plus tenure, product category usage, channel, and geography. Define the target variables:
- High-value prediction: e.g., customers in the top 20% of future spend (6–12 month look-ahead).
- Churn prediction: e.g., customers with no purchase in the next 90 days (non-contractual setting).
We again start from a baseline logistic regression for interpretability, with an improved model such as gradient boosting (XGBoost/LightGBM) for better performance. Use a time-based split (train up to T-1 month, test on the last month). We will evaluate with ROC-AUC, PR-AUC (for class imbalance), and precision at the top decile, then translate predictions into business actions (e.g., expected incremental revenue, cost of retention). Pilot plan: segment customers by risk and value, and define offers (high-value: upsell/cross-sell bundles; at-risk: win-back vouchers).
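The plan calls for a time-based split, while the classification modeling later in this section uses a random stratified split; a minimal sketch of the time-based variant, with the one-month cutoff and the object names train_txn/test_txn being my own:

```r
# Chronological split: train on everything up to T-1 month, test on the last month
cutoff_ts <- max(df_raw$order_date, na.rm = TRUE) - months(1)   # assumed cutoff

train_txn <- df_raw %>% filter(order_date <= cutoff_ts)
test_txn  <- df_raw %>% filter(order_date >  cutoff_ts)
```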
From the raw data we compute the revenue column by multiplying quantity and unit_price, then filter out rows with non-positive revenue values. This ensures that our downstream analysis only includes meaningful transactions. I assume cancelled purchases still count toward retention, as cancellation behaviour is a separate analysis.
# --- Customer 360: Clean and compute revenue -------------------------
df_360 <- df_raw %>%
mutate(
revenue = quantity * unit_price # Calculate revenue
) %>%
filter(
revenue > 0 # Keep only positive revenue
)
# Display first few rows of cleaned data
knitr::kable(
head(df_360),
caption = "Preview of Cleaned Customer 360 Dataset"
)
| order_id | product_id | product_description | quantity | order_date | unit_price | customer_id | country | revenue |
|---|---|---|---|---|---|---|---|---|
| 489434 | 85048 | 15CM CHRISTMAS GLASS BALL 20 LIGHTS | 12 | 2009-12-01 07:45:00 | 6.95 | 13085 | United Kingdom | 83.4 |
| 489434 | 79323P | PINK CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom | 81.0 |
| 489434 | 79323W | WHITE CHERRY LIGHTS | 12 | 2009-12-01 07:45:00 | 6.75 | 13085 | United Kingdom | 81.0 |
| 489434 | 22041 | RECORD FRAME 7” SINGLE SIZE | 48 | 2009-12-01 07:45:00 | 2.10 | 13085 | United Kingdom | 100.8 |
| 489434 | 21232 | STRAWBERRY CERAMIC TRINKET BOX | 24 | 2009-12-01 07:45:00 | 1.25 | 13085 | United Kingdom | 30.0 |
| 489434 | 22064 | PINK DOUGHNUT TRINKET POT | 24 | 2009-12-01 07:45:00 | 1.65 | 13085 | United Kingdom | 39.6 |
We define a snapshot_date as 90 days before the latest transaction date in the dataset. Then, we compute essential customer-level features:
- Recency – how many days since their last order
- Frequency – how many unique orders they placed
- Monetary – total and average spending
- Tenure – how long between their first and last order
# --- Define snapshot date (90 days before latest order) ----------------------
snapshot_date <- max(df_360$order_date) - days(90)
# --- Compute RFM + tenure features ------------------------------------------
cust_feat <- df_360 %>%
group_by(customer_id) %>%
summarise(
last_date = max(order_date), # Most recent order date
first_date = min(order_date), # Earliest order date
frequency = n_distinct(order_id), # Number of unique orders
monetary = sum(revenue), # Total spending
avg_basket = mean(revenue), # Average order value
.groups = "drop"
) %>%
mutate(
recency_days = as.integer(difftime(snapshot_date, last_date, units = "days")), # Time since last order
recency_days = pmax(recency_days, 0),
tenure_days = as.integer(difftime(last_date, first_date, units = "days")) # Customer tenure duration
)
# --- Display resulting features in a table ----------------------------------
knitr::kable(
head(cust_feat),
caption = "Customer RFM and Tenure Features (Sample Preview)"
)
| customer_id | last_date | first_date | frequency | monetary | avg_basket | recency_days | tenure_days |
|---|---|---|---|---|---|---|---|
| 12346 | 2011-01-18 10:01:00 | 2009-12-14 08:34:00 | 12 | 77556.46 | 2281.07235 | 235 | 400 |
| 12347 | 2011-12-07 15:52:00 | 2010-10-31 14:20:00 | 8 | 5633.32 | 22.26609 | 0 | 402 |
| 12348 | 2011-09-25 13:13:00 | 2010-09-27 14:59:00 | 5 | 2019.40 | 39.59608 | 0 | 362 |
| 12349 | 2011-11-21 09:51:00 | 2010-04-29 13:20:00 | 4 | 4428.69 | 25.30680 | 0 | 570 |
| 12350 | 2011-02-02 16:01:00 | 2011-02-02 16:01:00 | 1 | 334.40 | 19.67059 | 219 | 0 |
| 12351 | 2010-11-29 15:23:00 | 2010-11-29 15:23:00 | 1 | 300.93 | 14.33000 | 284 | 0 |
Recency is the measure of how recently a customer made a purchase or an event occurred, while tenure is the length of time a customer has been active with a company. In a business context, a higher recency score (a recent purchase) is seen as a good indicator of future purchasing, and a longer tenure can signify a loyal customer.
We classify each customer as churned (1) if they made no purchase within the 90 days after the snapshot date, and retained (0) otherwise.
# --- Define churn observation window ---------------------------------------
future_90 <- snapshot_date + days(90)
# --- Identify customers who made purchases within the next 90 days ---------
next_orders <- df_360 %>%
filter(order_date > snapshot_date & order_date <= future_90)
# --- Label churn: 1 = churned, 0 = retained --------------------------------
churn_label <- cust_feat %>%
mutate(
churn = if_else(customer_id %in% next_orders$customer_id, 0, 1)
)
kable(
head(churn_label),
caption = "Labeled as Churn"
)
| customer_id | last_date | first_date | frequency | monetary | avg_basket | recency_days | tenure_days | churn |
|---|---|---|---|---|---|---|---|---|
| 12346 | 2011-01-18 10:01:00 | 2009-12-14 08:34:00 | 12 | 77556.46 | 2281.07235 | 235 | 400 | 1 |
| 12347 | 2011-12-07 15:52:00 | 2010-10-31 14:20:00 | 8 | 5633.32 | 22.26609 | 0 | 402 | 0 |
| 12348 | 2011-09-25 13:13:00 | 2010-09-27 14:59:00 | 5 | 2019.40 | 39.59608 | 0 | 362 | 0 |
| 12349 | 2011-11-21 09:51:00 | 2010-04-29 13:20:00 | 4 | 4428.69 | 25.30680 | 0 | 570 | 0 |
| 12350 | 2011-02-02 16:01:00 | 2011-02-02 16:01:00 | 1 | 334.40 | 19.67059 | 219 | 0 | 1 |
| 12351 | 2010-11-29 15:23:00 | 2010-11-29 15:23:00 | 1 | 300.93 | 14.33000 | 284 | 0 | 1 |
# --- Inspect churn distribution --------------------------------------------
churn_summary <- table(churn_label$churn)
knitr::kable(
as.data.frame(churn_summary),
col.names = c("Churn Label", "Count"),
caption = "Distribution of Churn vs. Retained Customers"
)
| Churn Label | Count |
|---|---|
| 0 | 2890 |
| 1 | 2989 |
The 90 days after the snapshot date is the window we will monitor; below are the orders that fall in that window.
kable(
head(next_orders),
caption = "Orders that will be tested"
)
| order_id | product_id | product_description | quantity | order_date | unit_price | customer_id | country | revenue |
|---|---|---|---|---|---|---|---|---|
| 566225 | 22384 | LUNCH BAG PINK POLKADOT | 6 | 2011-09-11 10:35:00 | 1.65 | 16899 | United Kingdom | 9.90 |
| 566225 | 20725 | LUNCH BAG RED RETROSPOT | 6 | 2011-09-11 10:35:00 | 1.65 | 16899 | United Kingdom | 9.90 |
| 566225 | 23169 | CLASSIC GLASS COOKIE JAR | 3 | 2011-09-11 10:35:00 | 4.15 | 16899 | United Kingdom | 12.45 |
| 566225 | 84029E | RED WOOLLY HOTTIE WHITE HEART. | 3 | 2011-09-11 10:35:00 | 4.25 | 16899 | United Kingdom | 12.75 |
| 566225 | 21479 | WHITE SKULL HOT WATER BOTTLE | 3 | 2011-09-11 10:35:00 | 4.25 | 16899 | United Kingdom | 12.75 |
| 566225 | 82484 | WOOD BLACK BOARD ANT WHITE FINISH | 2 | 2011-09-11 10:35:00 | 7.95 | 16899 | United Kingdom | 15.90 |
We identify high-value customers as those who belong to the top 20% of total spending in the next 12 months following the snapshot date:
1. Aggregate each customer's revenue over the next year.
2. Compute their percentile rank (percent_rank).
3. Label the top 20% as high_value = 1.
# --- Define future 12-month window ------------------------------------------
future_12mo <- snapshot_date + years(1)
# --- Calculate each customer's spend in the next 12 months ------------------
future_spend <- df_360 %>%
filter(order_date > snapshot_date & order_date <= future_12mo) %>%
group_by(customer_id) %>%
summarise(fut_spend = sum(revenue), .groups = "drop")
# --- Rank customers and label top 20% as high-value -------------------------
hv_label <- future_spend %>%
mutate(rank = percent_rank(fut_spend)) %>%
mutate(high_value = if_else(rank >= 0.80, 1, 0)) %>%
select(customer_id, high_value)
# --- Display a few labeled examples -----------------------------------------
knitr::kable(
head(hv_label, 10),
caption = "Example: High-Value Customer Labels (Top 20% by Future Spend)"
)
| customer_id | high_value |
|---|---|
| 12347 | 1 |
| 12348 | 0 |
| 12349 | 1 |
| 12352 | 0 |
| 12356 | 0 |
| 12357 | 1 |
| 12358 | 0 |
| 12359 | 1 |
| 12360 | 0 |
| 12362 | 1 |
To evaluate the predictive performance of our model, we split the dataset into:
- Training set (70%) — used to fit or "teach" the model.
- Testing set (30%) — held out to evaluate how well the model generalizes.
We also stratify by the churn variable to preserve the same proportion of churned vs retained customers in both subsets.
data_model <- cust_feat %>%
left_join(churn_label %>% select(customer_id, churn), by = "customer_id") %>%
left_join(hv_label, by = "customer_id") %>%
replace_na(list(high_value = 0)) %>%
mutate(
churn = factor(churn, levels = c(0, 1)),
high_value = factor(high_value, levels = c(0, 1))
)
# --- Split data into training and testing sets -------------------------------
set.seed(305) # ensures reproducibility
split <- initial_split(data_model, prop = 0.7, strata = churn)
train <- training(split)
test <- testing(split)
# --- Display dataset sizes ---------------------------------------------------
split_summary <- tibble(
Dataset = c("Training", "Testing"),
Records = c(nrow(train), nrow(test)),
Percentage = c("70%", "30%")
)
knitr::kable(
split_summary,
caption = "Training and Testing Dataset Split Summary"
)
| Dataset | Records | Percentage |
|---|---|---|
| Training | 4114 | 70% |
| Testing | 1765 | 30% |
Define the preprocessing recipe. This recipe specifies how the features (predictors) should be cleaned and standardized for modeling. Steps included:
1. Select predictors: recency_days, frequency, monetary, avg_basket, tenure_days
2. Impute missing values: replace missing numeric values with their median
3. Normalize predictors: scale and center all numeric predictors to ensure balanced model input
# --- Define preprocessing recipe --------------------------------------------
rec <- recipe(
churn ~ recency_days + frequency + monetary + avg_basket + tenure_days,
data = train
) %>%
step_impute_median(all_numeric_predictors()) %>% # handle missing numeric values
step_normalize(all_numeric_predictors()) # scale & center all numeric features
# --- Display recipe summary --------------------------------------------------
knitr::kable(
summary(rec),
caption = "Summary of Preprocessing Recipe for Churn Model"
)
| variable | type | role | source |
|---|---|---|---|
| recency_days | double , numeric | predictor | original |
| frequency | integer, numeric | predictor | original |
| monetary | double , numeric | predictor | original |
| avg_basket | double , numeric | predictor | original |
| tenure_days | integer, numeric | predictor | original |
| churn | factor , unordered, nominal | outcome | original |
Extreme Gradient Boosting (XGBoost) is a tree-based algorithm that I mainly use when I need a robust yet interpretable model. I use it for churn prediction because of its ability to handle nonlinear relationships and interaction effects among features.
Model configuration:
- trees = 500 → builds 500 decision trees (boosting rounds)
- learn_rate = 0.05 → controls how fast the model learns; smaller values reduce overfitting
- tree_depth = 5 → limits how deep each tree grows to balance bias and variance
The model is integrated into a tidymodels workflow, combining the preprocessing (recipe) and modeling (boost_tree) steps for a cleaner, reproducible pipeline.
library(xgboost)
# --- Define model specification ---------------------------------------------
xgb_spec <- boost_tree(
trees = 500,
learn_rate = 0.05,
tree_depth = 5
) %>%
set_engine("xgboost") %>%
set_mode("classification")
# --- Create workflow: combine recipe and model ------------------------------
wf <- workflow() %>%
add_recipe(rec) %>%
add_model(xgb_spec)
# --- Fit model on training data ---------------------------------------------
fit_xgb <- wf %>% fit(data = train)
# --- Display model summary --------------------------------------------------
xgb_summary <- tibble(
Model = "XGBoost Classifier",
Trees = 500,
Learning_Rate = 0.05,
Max_Depth = 5,
Engine = "xgboost",
Mode = "classification"
)
knitr::kable(
xgb_summary,
caption = "XGBoost Model Specification Summary"
)
| Model | Trees | Learning_Rate | Max_Depth | Engine | Mode |
|---|---|---|---|---|---|
| XGBoost Classifier | 500 | 0.05 | 5 | xgboost | classification |
# --- Predict churn probabilities --------------------------------------------
pred <- predict(fit_xgb, new_data = test, type = "prob")$.pred_1
# --- Compute performance metrics --------------------------------------------
roc <- roc_auc_vec(test$churn, pred)
pr <- pr_auc_vec(test$churn, pred)
# --- Display metrics summary ------------------------------------------------
metric_tbl <- tibble(
Metric = c("ROC_AUC", "PR_AUC"),
Value = c(roc, pr)
)
knitr::kable(
metric_tbl,
caption = "Model Evaluation Metrics for Churn Prediction"
)
| Metric | Value |
|---|---|
| ROC_AUC | 0.0003917 |
| PR_AUC | 0.3004528 |
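The ROC AUC of ~0.0004 is almost certainly an orientation artifact rather than a poor model: yardstick's *_vec() metrics treat the first factor level ("0") as the event by default, while pred holds probabilities for class "1". A minimal sketch that re-scores the same predictions with the positive class made explicit (the near-perfect separation itself is worth scrutinizing, since features such as recency_days are derived from dates that overlap the label window):

```r
# Re-score with the positive class ("1" = churn) set explicitly
roc_pos <- roc_auc_vec(test$churn, pred, event_level = "second")
pr_pos  <- pr_auc_vec(test$churn, pred, event_level = "second")

tibble(
  Metric = c("ROC_AUC (event = '1')", "PR_AUC (event = '1')"),
  Value  = c(roc_pos, pr_pos)
)
```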
# --- Segment customers by predicted churn probability -----------------------
test <- test %>%
mutate(churn_prob = pred) %>%
arrange(desc(churn_prob))
top10pct <- test %>%
slice_head(n = floor(0.10 * nrow(test)))
# --- Display top 10% high-churn customers -----------------------------------
knitr::kable(
head(top10pct, 10),
caption = "Top 10 Customers by Predicted Churn Probability"
)
| customer_id | last_date | first_date | frequency | monetary | avg_basket | recency_days | tenure_days | churn | high_value | churn_prob |
|---|---|---|---|---|---|---|---|---|---|---|
| 14325 | 2010-05-19 09:20:00 | 2010-05-19 09:20:00 | 1 | 245.48 | 13.63778 | 479 | 0 | 1 | 0 | 0.9999882 |
| 17766 | 2010-11-23 14:29:00 | 2010-11-23 14:17:00 | 2 | 299.61 | 13.61864 | 290 | 0 | 1 | 0 | 0.9999882 |
| 15851 | 2010-07-01 12:34:00 | 2010-07-01 12:34:00 | 1 | 109.73 | 13.71625 | 436 | 0 | 1 | 0 | 0.9997503 |
| 17260 | 2010-10-24 13:21:00 | 2010-10-24 13:21:00 | 1 | 120.95 | 20.15833 | 320 | 0 | 1 | 0 | 0.9997238 |
| 17045 | 2011-08-18 17:09:00 | 2010-10-06 14:52:00 | 5 | 827.68 | 15.61660 | 22 | 316 | 1 | 0 | 0.9996808 |
| 17874 | 2011-06-07 12:54:00 | 2011-03-29 11:25:00 | 6 | 683.46 | 14.85783 | 94 | 70 | 1 | 0 | 0.9996808 |
| 12561 | 2011-02-10 11:39:00 | 2011-02-10 11:39:00 | 1 | 238.85 | 14.92812 | 212 | 0 | 1 | 0 | 0.9996714 |
| 12710 | 2011-09-04 12:00:00 | 2011-05-22 14:05:00 | 3 | 1149.02 | 14.92234 | 6 | 104 | 1 | 0 | 0.9996714 |
| 12738 | 2010-12-02 18:27:00 | 2010-12-02 18:27:00 | 1 | 310.70 | 14.12273 | 281 | 0 | 1 | 0 | 0.9996714 |
| 12776 | 2010-02-10 12:11:00 | 2010-02-10 12:11:00 | 1 | 55.50 | 13.87500 | 577 | 0 | 1 | 0 | 0.9996714 |
We convert probabilities to classes using a decision threshold (default 0.5). Then we compute confusion matrices for train and test to see true/false positives/negatives, plus key metrics (Precision, Recall, F1). We also simulate a budgeted campaign by only targeting the top 10% highest-risk customers.
Move the threshold toward 0 if you want higher recall, that is, if you do not mind also running CRM outreach on customers who would have been retained anyway.
library(caret)
library(dplyr)
library(knitr)
# --- Safety: ensure factor levels are c("0","1") ----------------------------
train <- train %>% mutate(churn = factor(churn, levels = c("0","1")))
test <- test %>% mutate(churn = factor(churn, levels = c("0","1")))
# --- 1) Probabilities from fitted model -------------------------------------
pred_train_prob <- predict(fit_xgb, new_data = train, type = "prob")$.pred_1
pred_test_prob <- predict(fit_xgb, new_data = test, type = "prob")$.pred_1
# --- 2) Choose threshold(s) --------------------------------------------------
thr <- 0.5 # main operating point
thr_top10 <- quantile(pred_test_prob, 0.90, na.rm = TRUE) # top-10% by risk
# --- 3) Class labels at chosen threshold ------------------------------------
train_pred_cls <- factor(ifelse(pred_train_prob >= thr, "1", "0"), levels = c("0","1"))
test_pred_cls <- factor(ifelse(pred_test_prob >= thr, "1", "0"), levels = c("0","1"))
scored <- test %>%
mutate(
churn_prob = pred_test_prob,
high_value = factor(high_value, levels = c("0", "1"))
)
# --- 4) Confusion matrices (standard + PR mode) -----------------------------
cm_train <- caret::confusionMatrix(data = train_pred_cls, reference = train$churn, positive = "1")
cm_test <- caret::confusionMatrix(data = test_pred_cls, reference = test$churn, positive = "1")
cm_train_pr <- caret::confusionMatrix(data = train_pred_cls, reference = train$churn,
positive = "1", mode = "prec_recall")
cm_test_pr <- caret::confusionMatrix(data = test_pred_cls, reference = test$churn,
positive = "1", mode = "prec_recall")
# --- 5) Pretty tables: confusion matrices -----------------------------------
kable(as.data.frame(cm_train$table),
caption = "Confusion Matrix — TRAIN (threshold = 0.5)",
col.names = c("Predicted", "Actual", "Freq"))
| Predicted | Actual | Freq |
|---|---|---|
| 0 | 0 | 2022 |
| 1 | 0 | 0 |
| 0 | 1 | 2 |
| 1 | 1 | 2090 |
kable(as.data.frame(cm_test$table),
caption = "Confusion Matrix — TEST (threshold = 0.5)",
col.names = c("Predicted", "Actual", "Freq"))
| Predicted | Actual | Freq |
|---|---|---|
| 0 | 0 | 868 |
| 1 | 0 | 0 |
| 0 | 1 | 2 |
| 1 | 1 | 895 |
# --- 6) Key metrics summary (Accuracy, Precision, Recall, F1) ---------------
metrics_train <- data.frame(
Dataset = "Train",
Accuracy = cm_train$overall[["Accuracy"]],
Precision = cm_train_pr$byClass[["Precision"]],
Recall = cm_train_pr$byClass[["Recall"]],
F1 = cm_train_pr$byClass[["F1"]]
)
metrics_test <- data.frame(
Dataset = "Test",
Accuracy = cm_test$overall[["Accuracy"]],
Precision = cm_test_pr$byClass[["Precision"]],
Recall = cm_test_pr$byClass[["Recall"]],
F1 = cm_test_pr$byClass[["F1"]]
)
kable(rbind(metrics_train, metrics_test),
digits = 3,
caption = "Classification Metrics at Threshold = 0.5")
| Dataset | Accuracy | Precision | Recall | F1 |
|---|---|---|---|---|
| Train | 1.000 | 1 | 0.999 | 1.000 |
| Test | 0.999 | 1 | 0.998 | 0.999 |
# --- 7) Target only Top 10% highest-risk on TEST ----------------------------
test_pred_top10 <- factor(ifelse(pred_test_prob >= thr_top10, "1", "0"), levels = c("0","1"))
cm_test_top10 <- caret::confusionMatrix(data = test_pred_top10, reference = test$churn,
positive = "1", mode = "prec_recall")
kable(as.data.frame(cm_test_top10$table),
caption = "Confusion Matrix — TEST (Targeting Top 10% Highest Risk)",
col.names = c("Predicted", "Actual", "Freq"))
| Predicted | Actual | Freq |
|---|---|---|
| 0 | 0 | 868 |
| 1 | 0 | 0 |
| 0 | 1 | 639 |
| 1 | 1 | 258 |
kable(data.frame(
Dataset = "Test (Top 10%)",
Precision = cm_test_top10$byClass[["Precision"]],
Recall = cm_test_top10$byClass[["Recall"]],
F1 = cm_test_top10$byClass[["F1"]]
),
digits = 3,
caption = "Precision/Recall/F1 when Targeting Top 10% Highest Risk"
)
| Dataset | Precision | Recall | F1 |
|---|---|---|---|
| Test (Top 10%) | 1 | 0.288 | 0.447 |
One point worth analysing: if I target the 10% highest-risk customers who are also high value, I am looking for big spenders we have not heard from in a while. But the most recent purchasers naturally accumulate more revenue than the riskiest (least recent) ones, so risk and value pull in opposite directions. How should we handle this?
I will break this down into two competing goals:
| Factor | Description | Desired Direction |
|---|---|---|
| Churn Risk (churn_prob) | Probability that a customer will churn | Higher → more risky |
| Customer Value (monetary_value or total_purchase) | Potential revenue impact if they churn | Higher → more valuable |
| Recency (days_since_last_contact) | Time since last engagement | Smaller → more recently contacted |
So we’re looking for customers with high value, high churn probability, and not recently contacted — these are strategic targets: people you can still save, who are worth saving, and who might churn soon.
HVHR (High-Value High-Risk): allocate most budget; concierge outreach, premium perks, early-access bundles, flexible returns.
Win-back (At-Risk Non-HV): time-limited voucher (10–15%), cart-starter bundles, reduce friction (COD, one-click reorder). Broad reach, lower cost per user.
LRHV (Low-Risk High-Value): protect with light nudges—cross-sell via co-purchase, loyalty accelerators; avoid discounts.
Target size: start with top 10% churn risk, tune threshold to budget.
Measurement: A/B control; secondary metric = 60–90‑day repeat rate (sketched below); refresh the model monthly.
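A minimal sketch of that secondary metric, the share of a targeted group that purchases again within a post-campaign window. The inputs treated_ids, control_ids, and campaign_start are hypothetical and would come from the campaign tool, not from this dataset:

```r
# Repeat rate: share of a customer group placing any order within
# `window_days` after the campaign start date.
repeat_rate <- function(ids, campaign_start, window_days = 90, orders = df_360) {
  buyers <- orders %>%
    filter(order_date > campaign_start,
           order_date <= campaign_start + days(window_days)) %>%
    distinct(customer_id)
  mean(ids %in% buyers$customer_id)
}

# Hypothetical usage: uplift = treated repeat rate minus control repeat rate
# repeat_rate(treated_ids, campaign_start) - repeat_rate(control_ids, campaign_start)
```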
library(dplyr)
library(knitr)
library(glue)
# --- Build segments & expected risk revenue ---------------------------------
segments <- scored %>%
mutate(
seg = case_when(
churn_prob >= thr_top10 & high_value == "1" ~ "HVHR",
churn_prob >= thr_top10 & high_value == "0" ~ "WinBack",
churn_prob < thr_top10 & high_value == "1" ~ "LRHV",
TRUE ~ "Other"
),
exp_risk_revenue = churn_prob * monetary
)
# --- Segment counts (quick dashboard numbers) --------------------------------
seg_counts <- segments %>%
count(seg, name = "customers") %>%
arrange(desc(customers))
kable(seg_counts, caption = "Customer Counts by Segment")
| seg | customers |
|---|---|
| Other | 1343 |
| WinBack | 258 |
| LRHV | 164 |
# --- HVHR: prioritize by value-at-risk --------------------------------------
hvhr <- segments %>%
filter(seg == "HVHR") %>%
arrange(desc(exp_risk_revenue)) %>%
select(customer_id, monetary, recency_days, frequency, churn_prob, exp_risk_revenue) %>%
head(25)
kable(hvhr,
caption = "HVHR — Top 25 by Expected Risk Revenue",
col.names = c("Customer", "Monetary", "Recency (days)", "Frequency", "Churn Prob", "Expected Risk Rev"),
digits = 2)
| Customer | Monetary | Recency (days) | Frequency | Churn Prob | Expected Risk Rev |
|---|---|---|---|---|---|

No customers fall into the HVHR segment in the test split at this risk threshold (consistent with the segment counts above, which contain no HVHR row).
# --- WinBack: cheapest to act on, sort by risk -------------------------------
winback <- segments %>%
filter(seg == "WinBack") %>%
arrange(desc(churn_prob)) %>%
select(customer_id, monetary, recency_days, frequency, churn_prob) %>%
head(50)
kable(winback,
caption = "WinBack — Top 50 by Churn Probability",
col.names = c("Customer", "Monetary", "Recency (days)", "Frequency", "Churn Prob"),
digits = 2)
| Customer | Monetary | Recency (days) | Frequency | Churn Prob |
|---|---|---|---|---|
| 14325 | 245.48 | 479 | 1 | 1 |
| 17766 | 299.61 | 290 | 2 | 1 |
| 15851 | 109.73 | 436 | 1 | 1 |
| 17260 | 120.95 | 320 | 1 | 1 |
| 17045 | 827.68 | 22 | 5 | 1 |
| 17874 | 683.46 | 94 | 6 | 1 |
| 12561 | 238.85 | 212 | 1 | 1 |
| 12710 | 1149.02 | 6 | 3 | 1 |
| 12738 | 310.70 | 281 | 1 | 1 |
| 12776 | 55.50 | 577 | 1 | 1 |
| 12799 | 219.35 | 438 | 1 | 1 |
| 12892 | 263.38 | 306 | 1 | 1 |
| 12922 | 405.38 | 70 | 1 | 1 |
| 12941 | 377.49 | 633 | 1 | 1 |
| 12972 | 1161.20 | 323 | 4 | 1 |
| 13161 | 478.88 | 222 | 2 | 1 |
| 13304 | 1217.99 | 260 | 4 | 1 |
| 13385 | 534.57 | 239 | 1 | 1 |
| 13837 | 271.24 | 121 | 2 | 1 |
| 13859 | 515.86 | 235 | 2 | 1 |
| 13981 | 331.14 | 325 | 1 | 1 |
| 14011 | 457.82 | 389 | 2 | 1 |
| 14026 | 713.11 | 380 | 2 | 1 |
| 14187 | 123.50 | 383 | 1 | 1 |
| 14347 | 82.60 | 646 | 1 | 1 |
| 14353 | 376.50 | 124 | 3 | 1 |
| 14762 | 417.57 | 173 | 2 | 1 |
| 14772 | 139.26 | 21 | 1 | 1 |
| 14773 | 140.55 | 502 | 1 | 1 |
| 14900 | 13.92 | 431 | 1 | 1 |
| 14991 | 314.67 | 339 | 1 | 1 |
| 15047 | 343.62 | 179 | 1 | 1 |
| 15064 | 1254.41 | 319 | 4 | 1 |
| 15082 | 93.35 | 578 | 1 | 1 |
| 15104 | 1027.44 | 106 | 2 | 1 |
| 15135 | 305.68 | 148 | 2 | 1 |
| 15139 | 178.96 | 149 | 2 | 1 |
| 15174 | 481.46 | 232 | 2 | 1 |
| 15209 | 178.98 | 639 | 1 | 1 |
| 15323 | 434.60 | 472 | 3 | 1 |
| 15362 | 613.08 | 358 | 2 | 1 |
| 15554 | 217.20 | 187 | 1 | 1 |
| 15560 | 153.52 | 307 | 2 | 1 |
| 15585 | 455.30 | 86 | 1 | 1 |
| 15828 | 1108.82 | 284 | 1 | 1 |
| 15979 | 329.22 | 512 | 1 | 1 |
| 16137 | 422.70 | 589 | 1 | 1 |
| 16297 | 301.90 | 178 | 1 | 1 |
| 16346 | 224.51 | 294 | 1 | 1 |
| 16465 | 746.24 | 330 | 2 | 1 |
# --- LRHV: protect valuable base, sort by value ------------------------------
lrhv <- segments %>%
filter(seg == "LRHV") %>%
arrange(desc(monetary)) %>%
select(customer_id, monetary, recency_days, frequency, churn_prob) %>%
head(50)
kable(lrhv,
caption = "LRHV — Top 50 by Monetary Value",
col.names = c("Customer", "Monetary", "Recency (days)", "Frequency", "Churn Prob"),
digits = 2)
| Customer | Monetary | Recency (days) | Frequency | Churn Prob |
|---|---|---|---|---|
| 13694 | 196482.81 | 0 | 143 | 0 |
| 17511 | 175603.55 | 0 | 60 | 0 |
| 16029 | 122209.14 | 0 | 107 | 0 |
| 15311 | 116771.16 | 0 | 208 | 0 |
| 13798 | 75836.87 | 0 | 110 | 0 |
| 15838 | 74163.00 | 0 | 34 | 0 |
| 14088 | 64036.80 | 0 | 16 | 0 |
| 13081 | 59205.15 | 0 | 28 | 0 |
| 16013 | 57446.44 | 0 | 84 | 0 |
| 17404 | 47781.12 | 0 | 23 | 0 |
| 16705 | 43515.05 | 0 | 42 | 0 |
| 14031 | 40999.17 | 0 | 44 | 0 |
| 17381 | 40002.37 | 0 | 38 | 0 |
| 12471 | 39963.79 | 0 | 79 | 0 |
| 16210 | 38489.51 | 0 | 35 | 0 |
| 17675 | 38259.60 | 0 | 70 | 0 |
| 15856 | 34220.34 | 0 | 52 | 0 |
| 14258 | 32051.34 | 0 | 25 | 0 |
| 14527 | 27792.30 | 0 | 122 | 0 |
| 17865 | 26374.39 | 0 | 46 | 0 |
| 18251 | 26278.86 | 0 | 9 | 0 |
| 12731 | 25775.71 | 0 | 17 | 0 |
| 14062 | 25604.20 | 0 | 24 | 0 |
| 12939 | 25264.80 | 0 | 20 | 0 |
| 14895 | 25119.54 | 0 | 38 | 0 |
| 17920 | 24516.46 | 0 | 54 | 0 |
| 14607 | 24137.63 | 0 | 22 | 0 |
| 12682 | 24033.91 | 0 | 52 | 0 |
| 16133 | 22992.26 | 0 | 56 | 0 |
| 14051 | 22434.00 | 0 | 37 | 0 |
| 14667 | 21927.14 | 0 | 54 | 0 |
| 12753 | 21429.39 | 0 | 6 | 0 |
| 15615 | 21373.97 | 0 | 47 | 0 |
| 17139 | 19584.46 | 0 | 25 | 0 |
| 17581 | 18757.75 | 0 | 43 | 0 |
| 17377 | 18319.41 | 0 | 78 | 0 |
| 12472 | 17880.59 | 0 | 19 | 0 |
| 12901 | 17654.54 | 0 | 28 | 0 |
| 14849 | 15763.45 | 0 | 43 | 0 |
| 16191 | 15688.02 | 0 | 24 | 0 |
| 17133 | 14622.33 | 0 | 12 | 0 |
| 15482 | 14592.56 | 0 | 19 | 0 |
| 16713 | 14560.69 | 0 | 32 | 0 |
| 12476 | 14115.41 | 0 | 23 | 0 |
| 13178 | 14059.19 | 0 | 21 | 0 |
| 12435 | 13925.93 | 0 | 6 | 0 |
| 13097 | 13389.22 | 0 | 28 | 0 |
| 17858 | 12797.65 | 0 | 30 | 0 |
| 12437 | 12683.40 | 0 | 39 | 0 |
| 13418 | 12233.60 | 0 | 31 | 0 |
# --- Tiny summary blurb for the report --------------------------------------
glue("
**Operational note:** Target HVHR with concierge outreach & tailored offers,
WinBack with low-cost reactivation, LRHV with loyalty & cross-sell.
Size segments based on budget (start with top 10% risk threshold).")
## **Operational note:** Target HVHR with concierge outreach & tailored offers,
## WinBack with low-cost reactivation, LRHV with loyalty & cross-sell.
## Size segments based on budget (start with top 10% risk threshold).