What this notebook does & Important Notes regarding the analysis.

  • The below work has been implemented at my work at Eurostar Nested Bucket Analysis with 156K rows of data, containing 4 month travel date and each travel dates’ 3 month lead-in analysis.

  • To express, I took 15 days travel date and each travel date’s each service’s each nested buckets’ 2 month lead-in window.

  • Median is chosen because it only cares about order, not distance. As long as <50% of days are extreme, the median barely moves. However, mean can take into account the extreme outliers and can disrupt the analysis.

  • I changed the bucket names and fares for Eurostar’s confidentiality and generated synthetic data to express the methodology for four services: Morning, Afternoon, Evening, Night.

  • (Important) The code got very lengthy because lot of synthetic data was created for illustration and the parts 2) 3) 4) 5) 6) are just based to mask the real data. However, the real data extracted will be easier to analyse.

  • If interested in results (plots) only, please click on 8)Plots in LHS menu, which shows, per service, the median open→close and a faint band for the 25–75% spread (the typical spread). If to see the code as well, please click on the Show all code in the upper right corner.

1) Packages

suppressPackageStartupMessages({
  pacman::p_load("pacman","tidyverse","ggthemes","stats","purrr","scales")
})
set.seed(42)

2) Tunable parameters

Travel date 1 August - 15 August has been taken.

OVERLAP_PROBS <- c(Morning=0.40, Afternoon=0.35, Evening=0.30, Night=0.28)
MIN_WIDTH_RNG <- list(Morning=c(10,18), Afternoon=c(9,16), Evening=c(8,14), Night=c(7,12))
LEAD_MAX_RNG  <- list(Morning=c(55,75), Afternoon=c(48,68), Evening=c(45,65), Night=c(42,60))
LF_RNG        <- list(Morning=c(0.72,0.95), Afternoon=c(0.70,0.92),
                      Evening=c(0.66,0.90), Night=c(0.64,0.88))

TRAVEL_START <- as.Date("2025-08-01")
TRAVEL_END   <- as.Date("2025-08-15")

3) Synthetic Buckets & fare ladder

fare_map <- tibble::tibble(
  s3_bucket = sprintf("K%02d", 1:18),                  # K01 .. K18
  fare      = seq(230, 60, length.out = 18) |> round() # strictly decreasing
) |>
  dplyr::mutate(bucket_order = dplyr::dense_rank(fare))

fare_map

4) Services and travel window

4 Services were taken across different times of the day.

services <- tibble::tribble(
  ~service_id, ~service_name, ~dep_band,   ~dep_time,
  1,           "Service 1",   "Morning",   "08:00",
  2,           "Service 2",   "Afternoon", "13:00",
  3,           "Service 3",   "Evening",   "17:00",
  4,           "Service 4",   "Night",     "21:00"
)

travel_days <- seq(TRAVEL_START, TRAVEL_END, by = "day")

band_bias_map <- c(Morning = 0.6, Afternoon = 0.3, Evening = 0.15, Night = 0.0)

5) Helper functions

This illustration is purely synthetic to show the ideology. For real analysis, just need to replace with real data.

Create an open/close window (in D−X) for each bucket for one service-day. - lead_max: horizon for that day - min_width: minimum (open - close) - overlap_p: chance to nudge more overlap - Business rule: K01 closes at 0 D−X.

gen_windows <- function(lead_max, min_width, overlap_p = 0.30) {

  # Baseline: higher fares open earlier; add small randomness
  base_open <- pmax(
    min_width + 2,
    round(
      lead_max -
        (fare_map$bucket_order - 1) * runif(1, 1.6, 3.7) +  # slope
        rnorm(nrow(fare_map), 0, 2.5)                        # noise
    )
  )

  # Close after open, enforcing width >= min_width
  base_close <- pmax(0L, base_open - round(runif(nrow(fare_map), min_width, min_width + 15)))

  width_now  <- base_open - base_close
  too_narrow <- which(width_now < min_width)
  if (length(too_narrow)) {
    base_close[too_narrow] <- pmax(0L, base_open[too_narrow] - min_width - sample(0:5, length(too_narrow), TRUE))
  }

  # Optional overlaps
  if (runif(1) < overlap_p) {
    bump_idx <- sample(seq_along(base_open), size = sample(2:3, 1))
    base_close[bump_idx] <- pmax(0L, base_close[bump_idx] + sample(6:14, length(bump_idx), TRUE))
  }

  # Rare inversions at the low end
  if (runif(1) < 0.35) {
    low_ids <- which(fare_map$s3_bucket %in% c("K12","K14","K16","K18"))
    if (length(low_ids) >= 2) {
      pair <- sort(sample(low_ids, 2))
      a <- pair[1]; b <- pair[2]
      base_open[a]  <- max(0L, base_close[b] - sample(0:2,1))
      base_close[a] <- pmax(0L, base_open[a] - sample(min_width:(min_width+6),1))
    }
  }

  # Force K01 to close at 0 D−X (median close at departure)
  idx_k01 <- which(fare_map$s3_bucket == "K01")
  if (length(idx_k01) == 1) {
    base_close[idx_k01] <- 0L
    base_open[idx_k01]  <- max(base_open[idx_k01], min_width)
  }

  # Bounds and return
  base_open  <- pmin(base_open, lead_max)
  base_close <- pmax(base_close, 0L)
  base_close <- pmin(base_close, base_open)

  tibble(
    s3_bucket = fare_map$s3_bucket,
    open_L    = base_open,
    close_L   = base_close
  )
}

# Distribute seats across open..close using a gamma shape shifted by band
distribute_by_lead <- function(total_seats, open_L, close_L, band_bias = 0) {
  if (is.na(open_L) || is.na(close_L) || open_L < close_L || total_seats <= 0) {
    return(tibble(lead_days = integer(0), seats = integer(0)))
  }
  days <- seq(close_L, open_L)
  if (!length(days)) return(tibble(lead_days = integer(0), seats = integer(0)))

  shape <- 1.6 + band_bias       # higher → earlier bookings
  scale <- 10  - band_bias * 2
  inten <- dgamma(days + 1, shape = shape, scale = scale) + 1e-6
  probs <- inten / sum(inten)
  alloc <- as.vector(rmultinom(1, size = total_seats, prob = probs))
  tibble(lead_days = days, seats = alloc) %>% filter(seats > 0)
}

6) Build synthetic bookings

This sets the numerical synthetic values.

synthetic <- purrr::pmap_dfr(
  list(services$service_name, services$dep_band, services$dep_time),
  function(service_name, dep_band, dep_time) {

    lead_max_rng <- LEAD_MAX_RNG[[dep_band]]
    lf_rng       <- LF_RNG[[dep_band]]
    min_width_rng<- MIN_WIDTH_RNG[[dep_band]]
    overlap_p    <- OVERLAP_PROBS[[dep_band]]

    purrr::map_dfr(travel_days, function(td) {
      # Capacity & LF → total sold
      ref_cap  <- sample(c(536, 600, 672, 720), 1, prob = c(0.35,0.25,0.3,0.1))
      lf       <- runif(1, lf_rng[1], lf_rng[2])
      sold_tot <- round(ref_cap * lf)

      # Windows for this day
      lead_max  <- sample(seq(lead_max_rng[1], lead_max_rng[2]), 1)
      min_width <- sample(seq(min_width_rng[1], min_width_rng[2]), 1)
      windows   <- gen_windows(lead_max, min_width, overlap_p) %>%
        left_join(fare_map, by = "s3_bucket")

      # Allocate more share to lower fares; slight band-dependent slope
      band_slope <- switch(dep_band, "Morning"=1.7, "Afternoon"=1.5, "Evening"=1.3, "Night"=1.2)
      wts <- rexp(nrow(fare_map), rate = scales::rescale(fare_map$bucket_order, to = c(band_slope, 0.45)))
      wts <- wts / sum(wts)
      bucket_seats <- round(wts * sold_tot)

      band_bias <- band_bias_map[[dep_band]]

      purrr::map_dfr(seq_len(nrow(windows)), function(i) {
        s  <- bucket_seats[i]
        if (s <= 0) return(tibble())
        day_alloc <- distribute_by_lead(s, windows$open_L[i], windows$close_L[i], band_bias)
        if (!nrow(day_alloc)) return(tibble())
        day_alloc %>%
          transmute(
            booking_route_od   = "SYN_OD",
            cos_group          = "Economy",
            train_no           = service_name,      # labels: Service 1..4
            dep_band           = dep_band,
            dep_time           = dep_time,
            travel_date_raw    = format(td, "%d/%m/%Y"),
            booking_date_raw   = format(td - lead_days, "%d/%m/%Y"),
            volume             = seats,
            bookings_n         = seats,
            bucket             = windows$s3_bucket[i],
            revenue_local      = seats * windows$fare[i],
            ref_leg_capacity_b = ref_cap,
            service_code       = paste0("SYN", sample(100:999,1))
          )
      })
    })
  }
)

# Peek
dplyr::glimpse(synthetic)
## Rows: 11,225
## Columns: 13
## $ booking_route_od   <chr> "SYN_OD", "SYN_OD", "SYN_OD", "SYN_OD", "SYN_OD", "…
## $ cos_group          <chr> "Economy", "Economy", "Economy", "Economy", "Econom…
## $ train_no           <chr> "Service 1", "Service 1", "Service 1", "Service 1",…
## $ dep_band           <chr> "Morning", "Morning", "Morning", "Morning", "Mornin…
## $ dep_time           <chr> "08:00", "08:00", "08:00", "08:00", "08:00", "08:00…
## $ travel_date_raw    <chr> "01/08/2025", "01/08/2025", "01/08/2025", "01/08/20…
## $ booking_date_raw   <chr> "28/07/2025", "25/07/2025", "24/07/2025", "23/07/20…
## $ volume             <int> 2, 4, 3, 2, 1, 1, 2, 1, 2, 1, 3, 2, 3, 3, 5, 7, 7, …
## $ bookings_n         <int> 2, 4, 3, 2, 1, 1, 2, 1, 2, 1, 3, 2, 3, 3, 5, 7, 7, …
## $ bucket             <chr> "K01", "K01", "K01", "K01", "K01", "K01", "K01", "K…
## $ revenue_local      <dbl> 460, 920, 690, 460, 230, 230, 460, 230, 460, 230, 6…
## $ ref_leg_capacity_b <dbl> 720, 720, 720, 720, 720, 720, 720, 720, 720, 720, 7…
## $ service_code       <chr> "SYN459", "SYN459", "SYN459", "SYN459", "SYN459", "…

7) Clean data and infer open/close

df <- synthetic %>%
  mutate(
    travel_date  = as.Date(travel_date_raw,  format = "%d/%m/%Y"),
    booking_date = as.Date(booking_date_raw, format = "%d/%m/%Y"),
    s3_bucket    = toupper(trimws(as.character(bucket))),
    lead_days    = as.integer(travel_date - booking_date)
  ) %>%
  filter(!is.na(lead_days), lead_days >= 0) %>%
  filter(travel_date >= TRAVEL_START, travel_date <= TRAVEL_END)

df_b <- df %>%
  inner_join(fare_map, by = "s3_bucket") %>%
  group_by(train_no, dep_band, dep_time, travel_date, booking_date, lead_days,
           s3_bucket, fare, bucket_order) %>%
  summarise(
    seats   = sum(replace_na(volume, 0), na.rm = TRUE),
    revenue = sum(replace_na(revenue_local, 0), na.rm = TRUE),
    .groups = "drop"
  )

service_bucket <- df_b %>%
  group_by(train_no, dep_band, dep_time, travel_date, s3_bucket, fare, bucket_order) %>%
  summarise(
    open_L  = { tmp <- lead_days[seats > 0]; if (length(tmp)) max(tmp) else NA_integer_ },
    close_L = { tmp <- lead_days[seats > 0]; if (length(tmp)) min(tmp) else NA_integer_ },
    sold    = sum(seats, na.rm = TRUE),
    .groups = "drop"
  )

stats_by_train <- service_bucket %>%
  group_by(train_no, dep_band, dep_time, s3_bucket, fare, bucket_order) %>%
  summarise(
    open_p25  = suppressWarnings(quantile(open_L,  0.25, na.rm = TRUE)),
    open_p50  = suppressWarnings(quantile(open_L,  0.50, na.rm = TRUE)),
    open_p75  = suppressWarnings(quantile(open_L,  0.75, na.rm = TRUE)),
    close_p25 = suppressWarnings(quantile(close_L, 0.25, na.rm = TRUE)),
    close_p50 = suppressWarnings(quantile(close_L, 0.50, na.rm = TRUE)),  # K01 median → 0
    close_p75 = suppressWarnings(quantile(close_L, 0.75, na.rm = TRUE)),
    n_services_with_sales = sum(!is.na(open_L) | !is.na(close_L)),
    .groups = "drop"
  ) %>%
  filter(n_services_with_sales > 0) %>%
  arrange(train_no, bucket_order)

head(stats_by_train, 10)

8) Plots

services_to_plot <- stats_by_train |>
  dplyr::distinct(train_no, dep_band, dep_time)

plots_by_train <- purrr::pmap(
  list(
    svc  = services_to_plot$train_no,
    band = services_to_plot$dep_band,
    time = services_to_plot$dep_time
  ),
  function(svc, band, time) {

    dat <- stats_by_train |>
      dplyr::filter(train_no == svc, dep_band == band, dep_time == time) |>
      dplyr::arrange(dplyr::desc(fare)) |>
      dplyr::mutate(bucket = factor(s3_bucket, levels = unique(s3_bucket)))

    if (nrow(dat) == 0) {
      return(ggplot() + labs(title = paste0(svc, " — no data in window")))
    }

    maxL <- suppressWarnings(max(c(dat$open_p75, dat$close_p75), na.rm = TRUE))
    if (!is.finite(maxL)) maxL <- 60
    maxL <- max(14, ceiling(maxL / 7) * 7)

    ggplot(dat, aes(y = bucket)) +
      geom_segment(
        aes(x = open_p25, xend = close_p75, yend = bucket,
            color = "IQR (25–75%)", linewidth = "IQR (25–75%)"),
        alpha = 0.15, lineend = "square"
      ) +
      geom_segment(
        aes(x = open_p50, xend = close_p50, yend = bucket,
            color = "Median window", linewidth = "Median window"),
        lineend = "round"
      ) +
      geom_point(
        aes(x = open_p50, fill = "Open (median)"),
        shape = 21, stroke = 0.6, size = 2, color = "black"
      ) +
      geom_point(
        aes(x = close_p50, fill = "Close (median)"),
        shape = 21, stroke = 0.6, size = 2, color = "black"
      ) +
      scale_color_manual(NULL, values = c("IQR (25–75%)" = "darkorange",
                                          "Median window" = "deeppink")) +
      scale_linewidth_manual(NULL, values = c("IQR (25–75%)" = 5,
                                              "Median window" = 1.2)) +
      scale_fill_manual(NULL, values = c("Open (median)"  = "darkorchid1",
                                         "Close (median)" = "turquoise")) +
      guides(
        linewidth = "none",
        color = guide_legend(order = 1),
        fill  = guide_legend(order = 2, override.aes = list(shape = 21, size = 3, color = "black"))
      ) +
      scale_x_reverse(limits = c(maxL, 0), breaks = seq(0, maxL, by = 7)) +
      labs(
        title = paste0(svc, " — ", band, " (", time, ") — K-Bucket Opening/Closing Median"),
        x = "Lead-in days (D−X, 0 = departure day)",
        y = "Bucket (nested K only)"
      ) +
      ggthemes::theme_solarized(base_size = 11) +
      theme(legend.position = "bottom", plot.caption.position = "plot")
  }
)

invisible(purrr::walk(plots_by_train, print))