The below work has been implemented at my work at Eurostar Nested Bucket Analysis with 156K rows of data, containing 4 month travel date and each travel dates’ 3 month lead-in analysis.
To express, I took 15 days travel date and each travel date’s each service’s each nested buckets’ 2 month lead-in window.
Median is chosen because it only cares about order, not distance. As long as <50% of days are extreme, the median barely moves. However, mean can take into account the extreme outliers and can disrupt the analysis.
I changed the bucket names and fares for Eurostar’s confidentiality and generated synthetic data to express the methodology for four services: Morning, Afternoon, Evening, Night.
(Important) The code got very lengthy because lot of synthetic data was created for illustration and the parts 2) 3) 4) 5) 6) are just based to mask the real data. However, the real data extracted will be easier to analyse.
If interested in results (plots) only, please click on 8)Plots in LHS menu, which shows, per service, the median open→close and a faint band for the 25–75% spread (the typical spread). If to see the code as well, please click on the Show all code in the upper right corner.
suppressPackageStartupMessages({
pacman::p_load("pacman","tidyverse","ggthemes","stats","purrr","scales")
})
set.seed(42)
Travel date 1 August - 15 August has been taken.
OVERLAP_PROBS <- c(Morning=0.40, Afternoon=0.35, Evening=0.30, Night=0.28)
MIN_WIDTH_RNG <- list(Morning=c(10,18), Afternoon=c(9,16), Evening=c(8,14), Night=c(7,12))
LEAD_MAX_RNG <- list(Morning=c(55,75), Afternoon=c(48,68), Evening=c(45,65), Night=c(42,60))
LF_RNG <- list(Morning=c(0.72,0.95), Afternoon=c(0.70,0.92),
Evening=c(0.66,0.90), Night=c(0.64,0.88))
TRAVEL_START <- as.Date("2025-08-01")
TRAVEL_END <- as.Date("2025-08-15")
fare_map <- tibble::tibble(
s3_bucket = sprintf("K%02d", 1:18), # K01 .. K18
fare = seq(230, 60, length.out = 18) |> round() # strictly decreasing
) |>
dplyr::mutate(bucket_order = dplyr::dense_rank(fare))
fare_map
4 Services were taken across different times of the day.
services <- tibble::tribble(
~service_id, ~service_name, ~dep_band, ~dep_time,
1, "Service 1", "Morning", "08:00",
2, "Service 2", "Afternoon", "13:00",
3, "Service 3", "Evening", "17:00",
4, "Service 4", "Night", "21:00"
)
travel_days <- seq(TRAVEL_START, TRAVEL_END, by = "day")
band_bias_map <- c(Morning = 0.6, Afternoon = 0.3, Evening = 0.15, Night = 0.0)
This illustration is purely synthetic to show the ideology. For real analysis, just need to replace with real data.
Create an open/close window (in D−X) for each bucket for one service-day. - lead_max: horizon for that day - min_width: minimum (open - close) - overlap_p: chance to nudge more overlap - Business rule: K01 closes at 0 D−X.
gen_windows <- function(lead_max, min_width, overlap_p = 0.30) {
# Baseline: higher fares open earlier; add small randomness
base_open <- pmax(
min_width + 2,
round(
lead_max -
(fare_map$bucket_order - 1) * runif(1, 1.6, 3.7) + # slope
rnorm(nrow(fare_map), 0, 2.5) # noise
)
)
# Close after open, enforcing width >= min_width
base_close <- pmax(0L, base_open - round(runif(nrow(fare_map), min_width, min_width + 15)))
width_now <- base_open - base_close
too_narrow <- which(width_now < min_width)
if (length(too_narrow)) {
base_close[too_narrow] <- pmax(0L, base_open[too_narrow] - min_width - sample(0:5, length(too_narrow), TRUE))
}
# Optional overlaps
if (runif(1) < overlap_p) {
bump_idx <- sample(seq_along(base_open), size = sample(2:3, 1))
base_close[bump_idx] <- pmax(0L, base_close[bump_idx] + sample(6:14, length(bump_idx), TRUE))
}
# Rare inversions at the low end
if (runif(1) < 0.35) {
low_ids <- which(fare_map$s3_bucket %in% c("K12","K14","K16","K18"))
if (length(low_ids) >= 2) {
pair <- sort(sample(low_ids, 2))
a <- pair[1]; b <- pair[2]
base_open[a] <- max(0L, base_close[b] - sample(0:2,1))
base_close[a] <- pmax(0L, base_open[a] - sample(min_width:(min_width+6),1))
}
}
# Force K01 to close at 0 D−X (median close at departure)
idx_k01 <- which(fare_map$s3_bucket == "K01")
if (length(idx_k01) == 1) {
base_close[idx_k01] <- 0L
base_open[idx_k01] <- max(base_open[idx_k01], min_width)
}
# Bounds and return
base_open <- pmin(base_open, lead_max)
base_close <- pmax(base_close, 0L)
base_close <- pmin(base_close, base_open)
tibble(
s3_bucket = fare_map$s3_bucket,
open_L = base_open,
close_L = base_close
)
}
# Distribute seats across open..close using a gamma shape shifted by band
distribute_by_lead <- function(total_seats, open_L, close_L, band_bias = 0) {
if (is.na(open_L) || is.na(close_L) || open_L < close_L || total_seats <= 0) {
return(tibble(lead_days = integer(0), seats = integer(0)))
}
days <- seq(close_L, open_L)
if (!length(days)) return(tibble(lead_days = integer(0), seats = integer(0)))
shape <- 1.6 + band_bias # higher → earlier bookings
scale <- 10 - band_bias * 2
inten <- dgamma(days + 1, shape = shape, scale = scale) + 1e-6
probs <- inten / sum(inten)
alloc <- as.vector(rmultinom(1, size = total_seats, prob = probs))
tibble(lead_days = days, seats = alloc) %>% filter(seats > 0)
}
This sets the numerical synthetic values.
synthetic <- purrr::pmap_dfr(
list(services$service_name, services$dep_band, services$dep_time),
function(service_name, dep_band, dep_time) {
lead_max_rng <- LEAD_MAX_RNG[[dep_band]]
lf_rng <- LF_RNG[[dep_band]]
min_width_rng<- MIN_WIDTH_RNG[[dep_band]]
overlap_p <- OVERLAP_PROBS[[dep_band]]
purrr::map_dfr(travel_days, function(td) {
# Capacity & LF → total sold
ref_cap <- sample(c(536, 600, 672, 720), 1, prob = c(0.35,0.25,0.3,0.1))
lf <- runif(1, lf_rng[1], lf_rng[2])
sold_tot <- round(ref_cap * lf)
# Windows for this day
lead_max <- sample(seq(lead_max_rng[1], lead_max_rng[2]), 1)
min_width <- sample(seq(min_width_rng[1], min_width_rng[2]), 1)
windows <- gen_windows(lead_max, min_width, overlap_p) %>%
left_join(fare_map, by = "s3_bucket")
# Allocate more share to lower fares; slight band-dependent slope
band_slope <- switch(dep_band, "Morning"=1.7, "Afternoon"=1.5, "Evening"=1.3, "Night"=1.2)
wts <- rexp(nrow(fare_map), rate = scales::rescale(fare_map$bucket_order, to = c(band_slope, 0.45)))
wts <- wts / sum(wts)
bucket_seats <- round(wts * sold_tot)
band_bias <- band_bias_map[[dep_band]]
purrr::map_dfr(seq_len(nrow(windows)), function(i) {
s <- bucket_seats[i]
if (s <= 0) return(tibble())
day_alloc <- distribute_by_lead(s, windows$open_L[i], windows$close_L[i], band_bias)
if (!nrow(day_alloc)) return(tibble())
day_alloc %>%
transmute(
booking_route_od = "SYN_OD",
cos_group = "Economy",
train_no = service_name, # labels: Service 1..4
dep_band = dep_band,
dep_time = dep_time,
travel_date_raw = format(td, "%d/%m/%Y"),
booking_date_raw = format(td - lead_days, "%d/%m/%Y"),
volume = seats,
bookings_n = seats,
bucket = windows$s3_bucket[i],
revenue_local = seats * windows$fare[i],
ref_leg_capacity_b = ref_cap,
service_code = paste0("SYN", sample(100:999,1))
)
})
})
}
)
# Peek
dplyr::glimpse(synthetic)
## Rows: 11,225
## Columns: 13
## $ booking_route_od <chr> "SYN_OD", "SYN_OD", "SYN_OD", "SYN_OD", "SYN_OD", "…
## $ cos_group <chr> "Economy", "Economy", "Economy", "Economy", "Econom…
## $ train_no <chr> "Service 1", "Service 1", "Service 1", "Service 1",…
## $ dep_band <chr> "Morning", "Morning", "Morning", "Morning", "Mornin…
## $ dep_time <chr> "08:00", "08:00", "08:00", "08:00", "08:00", "08:00…
## $ travel_date_raw <chr> "01/08/2025", "01/08/2025", "01/08/2025", "01/08/20…
## $ booking_date_raw <chr> "28/07/2025", "25/07/2025", "24/07/2025", "23/07/20…
## $ volume <int> 2, 4, 3, 2, 1, 1, 2, 1, 2, 1, 3, 2, 3, 3, 5, 7, 7, …
## $ bookings_n <int> 2, 4, 3, 2, 1, 1, 2, 1, 2, 1, 3, 2, 3, 3, 5, 7, 7, …
## $ bucket <chr> "K01", "K01", "K01", "K01", "K01", "K01", "K01", "K…
## $ revenue_local <dbl> 460, 920, 690, 460, 230, 230, 460, 230, 460, 230, 6…
## $ ref_leg_capacity_b <dbl> 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 7…
## $ service_code <chr> "SYN459", "SYN459", "SYN459", "SYN459", "SYN459", "…
df <- synthetic %>%
mutate(
travel_date = as.Date(travel_date_raw, format = "%d/%m/%Y"),
booking_date = as.Date(booking_date_raw, format = "%d/%m/%Y"),
s3_bucket = toupper(trimws(as.character(bucket))),
lead_days = as.integer(travel_date - booking_date)
) %>%
filter(!is.na(lead_days), lead_days >= 0) %>%
filter(travel_date >= TRAVEL_START, travel_date <= TRAVEL_END)
df_b <- df %>%
inner_join(fare_map, by = "s3_bucket") %>%
group_by(train_no, dep_band, dep_time, travel_date, booking_date, lead_days,
s3_bucket, fare, bucket_order) %>%
summarise(
seats = sum(replace_na(volume, 0), na.rm = TRUE),
revenue = sum(replace_na(revenue_local, 0), na.rm = TRUE),
.groups = "drop"
)
service_bucket <- df_b %>%
group_by(train_no, dep_band, dep_time, travel_date, s3_bucket, fare, bucket_order) %>%
summarise(
open_L = { tmp <- lead_days[seats > 0]; if (length(tmp)) max(tmp) else NA_integer_ },
close_L = { tmp <- lead_days[seats > 0]; if (length(tmp)) min(tmp) else NA_integer_ },
sold = sum(seats, na.rm = TRUE),
.groups = "drop"
)
stats_by_train <- service_bucket %>%
group_by(train_no, dep_band, dep_time, s3_bucket, fare, bucket_order) %>%
summarise(
open_p25 = suppressWarnings(quantile(open_L, 0.25, na.rm = TRUE)),
open_p50 = suppressWarnings(quantile(open_L, 0.50, na.rm = TRUE)),
open_p75 = suppressWarnings(quantile(open_L, 0.75, na.rm = TRUE)),
close_p25 = suppressWarnings(quantile(close_L, 0.25, na.rm = TRUE)),
close_p50 = suppressWarnings(quantile(close_L, 0.50, na.rm = TRUE)), # K01 median → 0
close_p75 = suppressWarnings(quantile(close_L, 0.75, na.rm = TRUE)),
n_services_with_sales = sum(!is.na(open_L) | !is.na(close_L)),
.groups = "drop"
) %>%
filter(n_services_with_sales > 0) %>%
arrange(train_no, bucket_order)
head(stats_by_train, 10)
services_to_plot <- stats_by_train |>
dplyr::distinct(train_no, dep_band, dep_time)
plots_by_train <- purrr::pmap(
list(
svc = services_to_plot$train_no,
band = services_to_plot$dep_band,
time = services_to_plot$dep_time
),
function(svc, band, time) {
dat <- stats_by_train |>
dplyr::filter(train_no == svc, dep_band == band, dep_time == time) |>
dplyr::arrange(dplyr::desc(fare)) |>
dplyr::mutate(bucket = factor(s3_bucket, levels = unique(s3_bucket)))
if (nrow(dat) == 0) {
return(ggplot() + labs(title = paste0(svc, " — no data in window")))
}
maxL <- suppressWarnings(max(c(dat$open_p75, dat$close_p75), na.rm = TRUE))
if (!is.finite(maxL)) maxL <- 60
maxL <- max(14, ceiling(maxL / 7) * 7)
ggplot(dat, aes(y = bucket)) +
geom_segment(
aes(x = open_p25, xend = close_p75, yend = bucket,
color = "IQR (25–75%)", linewidth = "IQR (25–75%)"),
alpha = 0.15, lineend = "square"
) +
geom_segment(
aes(x = open_p50, xend = close_p50, yend = bucket,
color = "Median window", linewidth = "Median window"),
lineend = "round"
) +
geom_point(
aes(x = open_p50, fill = "Open (median)"),
shape = 21, stroke = 0.6, size = 2, color = "black"
) +
geom_point(
aes(x = close_p50, fill = "Close (median)"),
shape = 21, stroke = 0.6, size = 2, color = "black"
) +
scale_color_manual(NULL, values = c("IQR (25–75%)" = "darkorange",
"Median window" = "deeppink")) +
scale_linewidth_manual(NULL, values = c("IQR (25–75%)" = 5,
"Median window" = 1.2)) +
scale_fill_manual(NULL, values = c("Open (median)" = "darkorchid1",
"Close (median)" = "turquoise")) +
guides(
linewidth = "none",
color = guide_legend(order = 1),
fill = guide_legend(order = 2, override.aes = list(shape = 21, size = 3, color = "black"))
) +
scale_x_reverse(limits = c(maxL, 0), breaks = seq(0, maxL, by = 7)) +
labs(
title = paste0(svc, " — ", band, " (", time, ") — K-Bucket Opening/Closing Median"),
x = "Lead-in days (D−X, 0 = departure day)",
y = "Bucket (nested K only)"
) +
ggthemes::theme_solarized(base_size = 11) +
theme(legend.position = "bottom", plot.caption.position = "plot")
}
)
invisible(purrr::walk(plots_by_train, print))