Introduction

This report analyzes travel data for October and November, comparing key metrics such as total revenue, revenue per seat (RPS), load factor, and yield between 2023 and 2024.

Data Preparation

library(tidyverse)
library(lubridate)
library(readxl)
library(scales)
library(ggtext)
library(showtext)
library(colorspace)
library(ggalt)
library(ggthemes)
library(ggrepel)
library(readxl)

# Load and preprocess data
df <- read_excel("C:/Users/Veerpal Kaur/OneDrive/Desktop/Revenue Management Task/Revenue_Management_Analyst_intern_-_Business_Case.xlsx", sheet = 2)

df <- df %>%
  mutate(`Travel Date` = as.Date(`Travel Date`),
         Year = year(`Travel Date`),
         Month = month(`Travel Date`, label = TRUE),
         LoadFactor = BookedVolume / 882,
         RPS = Revenue / 882)

# Filter to Oct and Nov only and summarize
df_summary <- df %>%
  filter(Month %in% c("Oct", "Nov")) %>%
  group_by(Year) %>%
  summarise(
    TotalRevenue = sum(Revenue, na.rm = TRUE),
    AvgLoadFactor = mean(LoadFactor, na.rm = TRUE),
    AvgRPS = mean(RPS, na.rm = TRUE),
    .groups = "drop"
  )

Total Revenue Analysis

We compare the total revenue for October and November between 2023 and 2024.

ggplot(df_summary, aes(x = factor(Year), y = TotalRevenue, fill = factor(Year))) +
  geom_col(width = 0.5) +
  labs(title = "Total Revenue: Oct–Nov (2023 vs 2024)",
       x = "Year", y = "Total Revenue (€)") +
  scale_y_continuous(labels = comma, breaks = seq(0, 30000000, by = 5000000)) +
  theme_solarized() +
  theme(legend.position = "none")

revenue_2023 <- df_summary %>% filter(Year == 2023) %>% pull(TotalRevenue)
revenue_2024 <- df_summary %>% filter(Year == 2024) %>% pull(TotalRevenue)
revenue_change_pct <- ((revenue_2024 - revenue_2023) / revenue_2023) * 100

Revenue Change: Revenue changed by -10.36% from 2023 to 2024.

Average Revenue Per Seat (RPS)

This section examines the average revenue per seat (RPS) for October and November.

ggplot(df_summary, aes(x = factor(Year), y = AvgRPS, fill = factor(Year))) +
  geom_col(width = 0.5) +
  labs(title = "Average Revenue Per Seat (RPS): Oct–Nov (2023 vs 2024)",
       x = "Year", y = "RPS (€)") +
  scale_y_continuous(labels = scales::dollar_format(prefix = "€")) +
  theme_solarized_2() +
  theme(legend.position = "none")

rps_summary <- df %>%
  filter(Month %in% c("Oct", "Nov")) %>%
  group_by(Year) %>%
  summarise(AvgRPS = mean(RPS, na.rm = TRUE), .groups = "drop")

rps_2023 <- rps_summary %>% filter(Year == 2023) %>% pull(AvgRPS)
rps_2024 <- rps_summary %>% filter(Year == 2024) %>% pull(AvgRPS)
rps_change_pct <- ((rps_2024 - rps_2023) / rps_2023) * 100

RPS Change: Average RPS decreased by 4.17% from 2023 to 2024.

Average Load Factor

The load factor (BookedVolume / 882) is analyzed for October and November.

ggplot(df_summary, aes(x = factor(Year), y = AvgLoadFactor, fill = factor(Year))) +
  geom_col(width = 0.5) +
  geom_text(aes(label = scales::percent(AvgLoadFactor, accuracy = 0.1)),
            vjust = -0.5, size = 4) +
  labs(title = "Average Load Factor: Oct–Nov (2023 vs 2024)",
       x = "Year", y = "Load Factor") +
  scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.2)) +
  theme_solarized() +
  theme(legend.position = "none")

Average Yield Per Customer

We calculate the average yield (Revenue / BookedVolume) by year and month.

yield_summary <- df %>%
  filter(Month %in% c("Oct", "Nov")) %>%
  mutate(Scope = as.character(Month)) %>%
  group_by(Year) %>%
  summarise(AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE), Scope = "Total", .groups = "drop") %>%
  bind_rows(
    df %>%
      filter(Month %in% c("Oct", "Nov")) %>%
      group_by(Year, Month) %>%
      summarise(AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE), .groups = "drop") %>%
      rename(Scope = Month)
  )

yield_summary <- yield_summary %>%
  mutate(Scope = factor(Scope, levels = c("Total", "Oct", "Nov")))

color_map <- c(
  "Total" = "#E91E63",
  "Oct"   = "#7F8C8D",
  "Nov"   = "#4D5656"
)

ggplot(yield_summary, aes(x = factor(Year), y = AvgYield, fill = Scope)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6, aes(group = Scope)) +
  labs(title = "Average Revenue per Booked Seat (Yield): Oct–Nov Breakdown",
       x = "Year", y = "Avg Yield (€)", fill = "Scope") +
  scale_y_continuous(labels = dollar_format(prefix = "€"), breaks = pretty(yield_summary$AvgYield, n = 6)) +
  scale_fill_manual(values = color_map, labels = c("Total", "October", "November"), name = "Scope") +
  theme_solarized()

yield_change <- yield_summary %>%
  select(Year, Scope, AvgYield) %>%
  pivot_wider(names_from = Year, values_from = AvgYield) %>%
  mutate(
    PctChange = (`2024` - `2023`) / `2023` * 100,
    Direction = ifelse(PctChange >= 0, "increase", "decrease")
  ) %>%
  mutate(PctChange = round(PctChange, 2)) %>%
  arrange(desc(Scope))

yield_change
## # A tibble: 3 × 5
##   Scope `2023` `2024` PctChange Direction
##   <fct>  <dbl>  <dbl>     <dbl> <chr>    
## 1 Nov     102.   123.     20.9  increase 
## 2 Oct     153.   130.    -14.9  decrease 
## 3 Total   140.   129.     -7.57 decrease

Load Factor and Revenue by Day of Week

We analyze how load factor and yield vary by day of the week.

df_tradeoff <- df %>%
  filter(Year %in% c(2023, 2024), Month %in% c("Oct", "Nov")) %>%
  mutate(
    Weekday = wday(`Travel Date`, label = TRUE, abbr = FALSE, week_start = 1),
    LoadFactor = BookedVolume / 882,
    Yield = Revenue / BookedVolume
  ) %>%
  group_by(Year, Weekday) %>%
  summarise(
    AvgYield = mean(Yield, na.rm = TRUE),
    AvgLoadFactor = mean(LoadFactor, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_longer(cols = c(AvgYield, AvgLoadFactor),
               names_to = "Metric", values_to = "Value")

format_metric <- function(metric) {
  if (metric == "AvgYield") {
    scales::dollar_format(prefix = "€", accuracy = 1)
  } else {
    scales::percent_format(accuracy = 1)
  }
}

ggplot(df_tradeoff, aes(x = Weekday, y = Value, fill = factor(Year))) +
  geom_col(position = "dodge", width = 0.6) +
  facet_wrap(
    ~Metric,
    scales = "free_y",
    labeller = as_labeller(
      c(AvgYield = "Yield (€)", AvgLoadFactor = "Load Factor")
    ),
    nrow = 1
  ) +
  scale_y_continuous(
    breaks = function(limits) {
      if (diff(limits) <= 1) {
        seq(0, 1, by = 0.2)
      } else {
        pretty(limits, n = 6)
      }
    },
    labels = function(x) {
      if (all(x <= 1, na.rm = TRUE)) {
        scales::percent(x, accuracy = 1)
      } else {
        scales::dollar(x, prefix = "€", accuracy = 1)
      }
    }
  ) +
  scale_fill_manual(values = c("2023" = "#95A5A6", "2024" = "#EC407A")) +
  labs(
    title = "Yield vs Load Factor by Weekday (Oct–Nov, 2023 vs 2024)",
    x = "Day of Week", y = NULL, fill = "Year"
  ) +
  theme_solarized() +
  theme(strip.text = element_text(size = 13))

Train-Level Analysis (2024)

We examine the commercial performance of trains in 2024, focusing on yield and load factor.

train_summary_2024 <- df %>%
  filter(Month %in% c("Oct", "Nov"), year(`Travel Date`) == 2024) %>%
  group_by(`Train Number`) %>%
  summarise(
    AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE),
    AvgLF = mean(BookedVolume / 882, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(AvgYield)) %>%
  mutate(`Train Number` = factor(`Train Number`, levels = `Train Number`))

scaling_factor <- max(train_summary_2024$AvgYield) / max(train_summary_2024$AvgLF)

ggplot(train_summary_2024, aes(x = `Train Number`)) +
  geom_col(aes(y = AvgYield), fill = "#E91E63") +
  geom_line(aes(y = AvgLF * scaling_factor, group = 1), color = "#2E86C1", linewidth = 1.2) +
  geom_point(aes(y = AvgLF * scaling_factor), color = "#2E86C1", size = 2) +
  scale_y_continuous(
    name = "Avg Yield (€)",
    labels = dollar_format(prefix = "€"),
    sec.axis = sec_axis(~ . / scaling_factor,
                        name = "Load Factor (%)",
                        labels = percent_format(accuracy = 1))
  ) +
  labs(title = "Train-Level Commercial Performance (2024)",
       x = "Train Number") +
  theme_solarized() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5),
    axis.title.y = element_text(color = "#E91E63", size = 11, face = "bold"),
    axis.title.y.right = element_text(color = "#2E86C1", size = 11, face = "bold"),
    axis.text.y = element_text(color = "#E91E63", size = 10, face = "bold"),
    axis.text.y.right = element_text(color = "#2E86C1", size = 10, face = "bold")
  )

Train-Level Yield Change (2023 vs 2024)

This section visualizes the change in yield per train, with labels colored by revenue trend.

yield_dumbbell <- df %>%
  filter(Month %in% c("Oct", "Nov"),
         !is.na(BookedVolume), BookedVolume > 0) %>%
  group_by(`Train Number`, Year) %>%
  summarise(
    AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE),
    TotalRevenue = sum(Revenue, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  pivot_wider(names_from = Year, values_from = c(AvgYield, TotalRevenue)) %>%
  drop_na() %>%
  mutate(
    YieldChangePct = round(((AvgYield_2024 - AvgYield_2023) / AvgYield_2023) * 100, 1),
    YieldChangeLabel = ifelse(
      is.finite(YieldChangePct),
      paste0(ifelse(YieldChangePct > 0, "+", ""), YieldChangePct, "%"),
      "NA"
    ),
    RevTrend = ifelse(TotalRevenue_2024 > TotalRevenue_2023, "Increased", "Decreased"),
    LabelColor = ifelse(RevTrend == "Increased", "green", "red"),
    Train_Label = paste0("<span style='color:", LabelColor, "'>", `Train Number`, "</span>")
  ) %>%
  arrange(desc(RevTrend), desc(AvgYield_2024)) %>%
  mutate(Train_Label = factor(Train_Label, levels = unique(Train_Label)))

ggplot(yield_dumbbell, aes(y = Train_Label)) +
  geom_segment(aes(x = AvgYield_2023, xend = AvgYield_2024, yend = Train_Label),
               color = "#BDC3C7", size = 1.5) +
  geom_point(aes(x = AvgYield_2023, color = "2023"), size = 4) +
  geom_point(aes(x = AvgYield_2024, color = "2024"), size = 4) +
  geom_text(
    aes(x = pmax(AvgYield_2023, AvgYield_2024) + 1.5, label = YieldChangeLabel),
    color = "black",
    size = 3.5,
    family = "Times New Roman",
    fontface = "plain",
    hjust = 0
  ) +
  geom_point(data = data.frame(x = 0, y = Inf, RevTrend = c("Increased", "Decreased")),
             aes(x = x, y = y, color = RevTrend),
             size = 0, show.legend = TRUE) +
  scale_color_manual(
    name = NULL,
    values = c(
      "2023" = "#26A69A",
      "2024" = "#FF00FF",
      "Increased" = "green",
      "Decreased" = "red"
    ),
    labels = c(
      "2023 Yield",
      "2024 Yield",
      "Revenue Increased (Train Label → Green)",
      "Revenue Decreased (Train Label → Red)"
    ),
    guide = guide_legend(override.aes = list(size = 4))
  ) +
  scale_x_continuous(
    limits = c(100, 175),
    labels = dollar_format(prefix = "€")
  ) +
  labs(
    title = "Train-Level Yield Change with Revenue-Based Labels (2023 vs 2024)",
    x = "Average Yield (€)", y = "Train Number",
    caption = "Train label color reflects change in Total Revenue (2023 → 2024)"
  ) +
  theme_solarized() +
  theme(
    axis.text.y = ggtext::element_markdown(size = 9, face = "bold"),
    axis.title.x = element_text(face = "bold"),
    axis.title.y = element_text(face = "bold"),
    plot.title = element_text(size = 14, face = "bold"),
    legend.position = "top"
  )

RPS vs Yield Scatter Plot

This scatter plot examines the relationship between RPS and yield, highlighting pricing vs efficiency.

ggplot(df %>% filter(Month %in% c("Oct", "Nov")),
       aes(x = Revenue / 882, y = Revenue / BookedVolume, color = factor(Year))) +
  geom_point(alpha = 0.6, size = 3) +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE, linewidth = 1.2) +
  labs(title = "RPS vs Yield: Pricing vs Efficiency (Oct–Nov)",
       x = "Revenue per Available Seat (RPS €)",
       y = "Revenue per Booked Seat (Yield €)",
       color = "Year") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "€", accuracy = 1)) +
  scale_y_continuous(labels = scales::dollar_format(prefix = "€", accuracy = 1)) +
  theme_solarized()

Individual Train Analysis (Train 9031)

We analyze the yield and volume for Train 9031 in 2024 by day of the week.

target_train <- 9031
train_trend <- df %>%
  filter(`Train Number` == target_train,
         year(`Travel Date`) == 2024) %>%
  mutate(
    DOW = wday(`Travel Date`, label = TRUE),
    Yield = Revenue / BookedVolume
  ) %>%
  group_by(DOW) %>%
  summarise(
    AvgYield = mean(Yield, na.rm = TRUE),
    AvgVolume = mean(BookedVolume, na.rm = TRUE),
    .groups = "drop"
  )

yield_max_display <- 175
volume_max_display <- 900
scaling_factor <- yield_max_display / volume_max_display

ggplot(train_trend, aes(x = DOW)) +
  geom_col(aes(y = AvgYield), fill = "#EC407A") +
  geom_line(aes(y = AvgVolume * scaling_factor), 
            group = 1, color = "#0277BD", size = 1.2) +
  geom_point(aes(y = AvgVolume * scaling_factor), 
             color = "#0277BD", size = 3) +
  scale_y_continuous(
    name = expression(paste("Average Yield (", euro, ")")),
    labels = scales::dollar_format(prefix = "€"),
    limits = c(0, yield_max_display),
    breaks = seq(0, yield_max_display, by = 25),
    sec.axis = sec_axis(
      trans = ~ . / scaling_factor,
      name = "Average Booked Volume",
      breaks = seq(0, 1000, by = 200),
      labels = scales::comma
    )
  ) +
  labs(title = "Train 9031 – Yield vs Volume by Day of Week (2024)",
       x = "Day of Week") +
  theme_solarized() +
  theme(
    axis.title.y.left = element_text(color = "#EC407A", size = 12, face = "bold"),
    axis.text.y.left  = element_text(color = "#EC407A"),
    axis.title.y.right = element_text(color = "#0277BD", size = 12, face = "bold"),
    axis.text.y.right  = element_text(color = "#0277BD")
  )

2024 Scatter Plot with Top Performing Trains

This scatter plot identifies trains in 2024 with the highest residuals from the RPS-yield regression.

df_scatter <- df %>%
  filter(Month %in% c("Oct", "Nov"),
         year(`Travel Date`) == 2024) %>%
  mutate(
    RPS = Revenue / 882,
    Yield = Revenue / BookedVolume,
    Year = year(`Travel Date`)
  )

model_2024 <- lm(Yield ~ RPS, data = df_scatter)
df_scatter <- df_scatter %>%
  mutate(
    PredictedYield = predict(model_2024, newdata = df_scatter),
    Residual = Yield - PredictedYield
  )

top_train_labels <- df_scatter %>%
  group_by(`Train Number`) %>%
  summarise(
    MaxResidual = max(Residual, na.rm = TRUE),
    RPS = RPS[which.max(Residual)],
    Yield = Yield[which.max(Residual)]
  ) %>%
  arrange(desc(MaxResidual)) %>%
  slice_head(n = 5)

ggplot(df_scatter, aes(x = RPS, y = Yield)) +
  geom_point(alpha = 0.6, color = "#00bfc4", size = 3) +
  geom_smooth(method = "lm", se = FALSE, fullrange = TRUE, color = "#00bfc4", linewidth = 1.2) +
  geom_text_repel(data = top_train_labels,
                  aes(x = RPS, y = Yield, label = `Train Number`),
                  size = 3.5,
                  color = "forestgreen",
                  box.padding = 0.5,
                  min.segment.length = 0) +
  labs(title = "Strong Performing Trains Above RPS–Yield Regression (2024)",
       x = "Revenue per Available Seat (RPS €)",
       y = "Revenue per Booked Seat (Yield €)") +
  scale_x_continuous(labels = scales::dollar_format(prefix = "€")) +
  scale_y_continuous(labels = scales::dollar_format(prefix = "€")) +
  theme_minimal()

Passenger Volume Analysis

We compare passenger volumes for October and November between 2023 and 2024.

passenger_summary <- df %>%
  filter(Month %in% c("Oct", "Nov")) %>%
  mutate(
    Year = as.factor(year(`Travel Date`)),
    Month = month(`Travel Date`, label = TRUE)
  ) %>%
  group_by(Year, Month) %>%
  summarise(Passengers = sum(BookedVolume), .groups = "drop")

total_passengers <- passenger_summary %>%
  group_by(Year) %>%
  summarise(Total = sum(Passengers), .groups = "drop")

ggplot(passenger_summary, aes(x = Year, y = Passengers, fill = Month)) +
  geom_col(position = position_dodge(width = 0.7), width = 0.6) +
  geom_text(data = total_passengers,
            aes(x = Year, y = Total * 1.05,
                label = paste0("Total: ", scales::comma(Total))),
            inherit.aes = FALSE,
            fontface = "bold", size = 5, color = "black") +
  labs(title = "Passenger Volume Comparison by Year (Oct vs Nov)",
       x = "Year", y = "Total Passengers", fill = "Month") +
  scale_y_continuous(labels = scales::comma) +
  theme_solarized_2()

Revenue by Train

This section compares total revenue by train for October and November across 2023 and 2024. Note: The dataset train_revenue is assumed to be derived from df or provided separately.

train_revenue <- df %>%
  filter(Month %in% c("Oct", "Nov")) %>%
  group_by(`Train Number`, Year) %>%
  summarise(TotalRevenue = sum(Revenue, na.rm = TRUE), .groups = "drop")

train_order <- train_revenue %>%
  group_by(`Train Number`) %>%
  summarise(AllRevenue = sum(TotalRevenue, na.rm = TRUE)) %>%
  arrange(desc(AllRevenue)) %>%
  pull(`Train Number`)

train_revenue$`Train Number` <- factor(train_revenue$`Train Number`, levels = train_order)

ggplot(train_revenue, aes(x = `Train Number`, y = TotalRevenue, fill = factor(Year))) +
  geom_col(position = "dodge", width = 0.7) +
  labs(title = "Revenue Comparison by Train (Oct–Nov 2023 vs 2024)",
       x = "Train Number",
       y = "Total Revenue (€)",
       fill = "Year") +
  scale_y_continuous(labels = scales::comma) +
  theme_solarized() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))