This report analyzes travel data for October and November, comparing key metrics such as total revenue, revenue per seat (RPS), load factor, and yield between 2023 and 2024.
library(tidyverse)
library(lubridate)
library(readxl)
library(scales)
library(ggtext)
library(showtext)
library(colorspace)
library(ggalt)
library(ggthemes)
library(ggrepel)
library(readxl)
# Load and preprocess data
df <- read_excel("C:/Users/Veerpal Kaur/OneDrive/Desktop/Revenue Management Task/Revenue_Management_Analyst_intern_-_Business_Case.xlsx", sheet = 2)
df <- df %>%
mutate(`Travel Date` = as.Date(`Travel Date`),
Year = year(`Travel Date`),
Month = month(`Travel Date`, label = TRUE),
LoadFactor = BookedVolume / 882,
RPS = Revenue / 882)
# Filter to Oct and Nov only and summarize
df_summary <- df %>%
filter(Month %in% c("Oct", "Nov")) %>%
group_by(Year) %>%
summarise(
TotalRevenue = sum(Revenue, na.rm = TRUE),
AvgLoadFactor = mean(LoadFactor, na.rm = TRUE),
AvgRPS = mean(RPS, na.rm = TRUE),
.groups = "drop"
)
We compare the total revenue for October and November between 2023 and 2024.
ggplot(df_summary, aes(x = factor(Year), y = TotalRevenue, fill = factor(Year))) +
geom_col(width = 0.5) +
labs(title = "Total Revenue: Oct–Nov (2023 vs 2024)",
x = "Year", y = "Total Revenue (€)") +
scale_y_continuous(labels = comma, breaks = seq(0, 30000000, by = 5000000)) +
theme_solarized() +
theme(legend.position = "none")
revenue_2023 <- df_summary %>% filter(Year == 2023) %>% pull(TotalRevenue)
revenue_2024 <- df_summary %>% filter(Year == 2024) %>% pull(TotalRevenue)
revenue_change_pct <- ((revenue_2024 - revenue_2023) / revenue_2023) * 100
Revenue Change: Revenue changed by -10.36% from 2023 to 2024.
This section examines the average revenue per seat (RPS) for October and November.
ggplot(df_summary, aes(x = factor(Year), y = AvgRPS, fill = factor(Year))) +
geom_col(width = 0.5) +
labs(title = "Average Revenue Per Seat (RPS): Oct–Nov (2023 vs 2024)",
x = "Year", y = "RPS (€)") +
scale_y_continuous(labels = scales::dollar_format(prefix = "€")) +
theme_solarized_2() +
theme(legend.position = "none")
rps_summary <- df %>%
filter(Month %in% c("Oct", "Nov")) %>%
group_by(Year) %>%
summarise(AvgRPS = mean(RPS, na.rm = TRUE), .groups = "drop")
rps_2023 <- rps_summary %>% filter(Year == 2023) %>% pull(AvgRPS)
rps_2024 <- rps_summary %>% filter(Year == 2024) %>% pull(AvgRPS)
rps_change_pct <- ((rps_2024 - rps_2023) / rps_2023) * 100
RPS Change: Average RPS decreased by 4.17% from 2023 to 2024.
The load factor (BookedVolume / 882) is analyzed for October and November.
ggplot(df_summary, aes(x = factor(Year), y = AvgLoadFactor, fill = factor(Year))) +
geom_col(width = 0.5) +
geom_text(aes(label = scales::percent(AvgLoadFactor, accuracy = 0.1)),
vjust = -0.5, size = 4) +
labs(title = "Average Load Factor: Oct–Nov (2023 vs 2024)",
x = "Year", y = "Load Factor") +
scale_y_continuous(labels = scales::percent, breaks = seq(0, 1, by = 0.2)) +
theme_solarized() +
theme(legend.position = "none")
We calculate the average yield (Revenue / BookedVolume) by year and month.
yield_summary <- df %>%
filter(Month %in% c("Oct", "Nov")) %>%
mutate(Scope = as.character(Month)) %>%
group_by(Year) %>%
summarise(AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE), Scope = "Total", .groups = "drop") %>%
bind_rows(
df %>%
filter(Month %in% c("Oct", "Nov")) %>%
group_by(Year, Month) %>%
summarise(AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE), .groups = "drop") %>%
rename(Scope = Month)
)
yield_summary <- yield_summary %>%
mutate(Scope = factor(Scope, levels = c("Total", "Oct", "Nov")))
color_map <- c(
"Total" = "#E91E63",
"Oct" = "#7F8C8D",
"Nov" = "#4D5656"
)
ggplot(yield_summary, aes(x = factor(Year), y = AvgYield, fill = Scope)) +
geom_col(position = position_dodge(width = 0.7), width = 0.6, aes(group = Scope)) +
labs(title = "Average Revenue per Booked Seat (Yield): Oct–Nov Breakdown",
x = "Year", y = "Avg Yield (€)", fill = "Scope") +
scale_y_continuous(labels = dollar_format(prefix = "€"), breaks = pretty(yield_summary$AvgYield, n = 6)) +
scale_fill_manual(values = color_map, labels = c("Total", "October", "November"), name = "Scope") +
theme_solarized()
yield_change <- yield_summary %>%
select(Year, Scope, AvgYield) %>%
pivot_wider(names_from = Year, values_from = AvgYield) %>%
mutate(
PctChange = (`2024` - `2023`) / `2023` * 100,
Direction = ifelse(PctChange >= 0, "increase", "decrease")
) %>%
mutate(PctChange = round(PctChange, 2)) %>%
arrange(desc(Scope))
yield_change
## # A tibble: 3 × 5
## Scope `2023` `2024` PctChange Direction
## <fct> <dbl> <dbl> <dbl> <chr>
## 1 Nov 102. 123. 20.9 increase
## 2 Oct 153. 130. -14.9 decrease
## 3 Total 140. 129. -7.57 decrease
We analyze how load factor and yield vary by day of the week.
df_tradeoff <- df %>%
filter(Year %in% c(2023, 2024), Month %in% c("Oct", "Nov")) %>%
mutate(
Weekday = wday(`Travel Date`, label = TRUE, abbr = FALSE, week_start = 1),
LoadFactor = BookedVolume / 882,
Yield = Revenue / BookedVolume
) %>%
group_by(Year, Weekday) %>%
summarise(
AvgYield = mean(Yield, na.rm = TRUE),
AvgLoadFactor = mean(LoadFactor, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_longer(cols = c(AvgYield, AvgLoadFactor),
names_to = "Metric", values_to = "Value")
format_metric <- function(metric) {
if (metric == "AvgYield") {
scales::dollar_format(prefix = "€", accuracy = 1)
} else {
scales::percent_format(accuracy = 1)
}
}
ggplot(df_tradeoff, aes(x = Weekday, y = Value, fill = factor(Year))) +
geom_col(position = "dodge", width = 0.6) +
facet_wrap(
~Metric,
scales = "free_y",
labeller = as_labeller(
c(AvgYield = "Yield (€)", AvgLoadFactor = "Load Factor")
),
nrow = 1
) +
scale_y_continuous(
breaks = function(limits) {
if (diff(limits) <= 1) {
seq(0, 1, by = 0.2)
} else {
pretty(limits, n = 6)
}
},
labels = function(x) {
if (all(x <= 1, na.rm = TRUE)) {
scales::percent(x, accuracy = 1)
} else {
scales::dollar(x, prefix = "€", accuracy = 1)
}
}
) +
scale_fill_manual(values = c("2023" = "#95A5A6", "2024" = "#EC407A")) +
labs(
title = "Yield vs Load Factor by Weekday (Oct–Nov, 2023 vs 2024)",
x = "Day of Week", y = NULL, fill = "Year"
) +
theme_solarized() +
theme(strip.text = element_text(size = 13))
We examine the commercial performance of trains in 2024, focusing on yield and load factor.
train_summary_2024 <- df %>%
filter(Month %in% c("Oct", "Nov"), year(`Travel Date`) == 2024) %>%
group_by(`Train Number`) %>%
summarise(
AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE),
AvgLF = mean(BookedVolume / 882, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(AvgYield)) %>%
mutate(`Train Number` = factor(`Train Number`, levels = `Train Number`))
scaling_factor <- max(train_summary_2024$AvgYield) / max(train_summary_2024$AvgLF)
ggplot(train_summary_2024, aes(x = `Train Number`)) +
geom_col(aes(y = AvgYield), fill = "#E91E63") +
geom_line(aes(y = AvgLF * scaling_factor, group = 1), color = "#2E86C1", linewidth = 1.2) +
geom_point(aes(y = AvgLF * scaling_factor), color = "#2E86C1", size = 2) +
scale_y_continuous(
name = "Avg Yield (€)",
labels = dollar_format(prefix = "€"),
sec.axis = sec_axis(~ . / scaling_factor,
name = "Load Factor (%)",
labels = percent_format(accuracy = 1))
) +
labs(title = "Train-Level Commercial Performance (2024)",
x = "Train Number") +
theme_solarized() +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5),
axis.title.y = element_text(color = "#E91E63", size = 11, face = "bold"),
axis.title.y.right = element_text(color = "#2E86C1", size = 11, face = "bold"),
axis.text.y = element_text(color = "#E91E63", size = 10, face = "bold"),
axis.text.y.right = element_text(color = "#2E86C1", size = 10, face = "bold")
)
This section visualizes the change in yield per train, with labels colored by revenue trend.
yield_dumbbell <- df %>%
filter(Month %in% c("Oct", "Nov"),
!is.na(BookedVolume), BookedVolume > 0) %>%
group_by(`Train Number`, Year) %>%
summarise(
AvgYield = mean(Revenue / BookedVolume, na.rm = TRUE),
TotalRevenue = sum(Revenue, na.rm = TRUE),
.groups = "drop"
) %>%
pivot_wider(names_from = Year, values_from = c(AvgYield, TotalRevenue)) %>%
drop_na() %>%
mutate(
YieldChangePct = round(((AvgYield_2024 - AvgYield_2023) / AvgYield_2023) * 100, 1),
YieldChangeLabel = ifelse(
is.finite(YieldChangePct),
paste0(ifelse(YieldChangePct > 0, "+", ""), YieldChangePct, "%"),
"NA"
),
RevTrend = ifelse(TotalRevenue_2024 > TotalRevenue_2023, "Increased", "Decreased"),
LabelColor = ifelse(RevTrend == "Increased", "green", "red"),
Train_Label = paste0("<span style='color:", LabelColor, "'>", `Train Number`, "</span>")
) %>%
arrange(desc(RevTrend), desc(AvgYield_2024)) %>%
mutate(Train_Label = factor(Train_Label, levels = unique(Train_Label)))
ggplot(yield_dumbbell, aes(y = Train_Label)) +
geom_segment(aes(x = AvgYield_2023, xend = AvgYield_2024, yend = Train_Label),
color = "#BDC3C7", size = 1.5) +
geom_point(aes(x = AvgYield_2023, color = "2023"), size = 4) +
geom_point(aes(x = AvgYield_2024, color = "2024"), size = 4) +
geom_text(
aes(x = pmax(AvgYield_2023, AvgYield_2024) + 1.5, label = YieldChangeLabel),
color = "black",
size = 3.5,
family = "Times New Roman",
fontface = "plain",
hjust = 0
) +
geom_point(data = data.frame(x = 0, y = Inf, RevTrend = c("Increased", "Decreased")),
aes(x = x, y = y, color = RevTrend),
size = 0, show.legend = TRUE) +
scale_color_manual(
name = NULL,
values = c(
"2023" = "#26A69A",
"2024" = "#FF00FF",
"Increased" = "green",
"Decreased" = "red"
),
labels = c(
"2023 Yield",
"2024 Yield",
"Revenue Increased (Train Label → Green)",
"Revenue Decreased (Train Label → Red)"
),
guide = guide_legend(override.aes = list(size = 4))
) +
scale_x_continuous(
limits = c(100, 175),
labels = dollar_format(prefix = "€")
) +
labs(
title = "Train-Level Yield Change with Revenue-Based Labels (2023 vs 2024)",
x = "Average Yield (€)", y = "Train Number",
caption = "Train label color reflects change in Total Revenue (2023 → 2024)"
) +
theme_solarized() +
theme(
axis.text.y = ggtext::element_markdown(size = 9, face = "bold"),
axis.title.x = element_text(face = "bold"),
axis.title.y = element_text(face = "bold"),
plot.title = element_text(size = 14, face = "bold"),
legend.position = "top"
)
This scatter plot examines the relationship between RPS and yield, highlighting pricing vs efficiency.
ggplot(df %>% filter(Month %in% c("Oct", "Nov")),
aes(x = Revenue / 882, y = Revenue / BookedVolume, color = factor(Year))) +
geom_point(alpha = 0.6, size = 3) +
geom_smooth(method = "lm", se = FALSE, fullrange = TRUE, linewidth = 1.2) +
labs(title = "RPS vs Yield: Pricing vs Efficiency (Oct–Nov)",
x = "Revenue per Available Seat (RPS €)",
y = "Revenue per Booked Seat (Yield €)",
color = "Year") +
scale_x_continuous(labels = scales::dollar_format(prefix = "€", accuracy = 1)) +
scale_y_continuous(labels = scales::dollar_format(prefix = "€", accuracy = 1)) +
theme_solarized()
We analyze the yield and volume for Train 9031 in 2024 by day of the week.
target_train <- 9031
train_trend <- df %>%
filter(`Train Number` == target_train,
year(`Travel Date`) == 2024) %>%
mutate(
DOW = wday(`Travel Date`, label = TRUE),
Yield = Revenue / BookedVolume
) %>%
group_by(DOW) %>%
summarise(
AvgYield = mean(Yield, na.rm = TRUE),
AvgVolume = mean(BookedVolume, na.rm = TRUE),
.groups = "drop"
)
yield_max_display <- 175
volume_max_display <- 900
scaling_factor <- yield_max_display / volume_max_display
ggplot(train_trend, aes(x = DOW)) +
geom_col(aes(y = AvgYield), fill = "#EC407A") +
geom_line(aes(y = AvgVolume * scaling_factor),
group = 1, color = "#0277BD", size = 1.2) +
geom_point(aes(y = AvgVolume * scaling_factor),
color = "#0277BD", size = 3) +
scale_y_continuous(
name = expression(paste("Average Yield (", euro, ")")),
labels = scales::dollar_format(prefix = "€"),
limits = c(0, yield_max_display),
breaks = seq(0, yield_max_display, by = 25),
sec.axis = sec_axis(
trans = ~ . / scaling_factor,
name = "Average Booked Volume",
breaks = seq(0, 1000, by = 200),
labels = scales::comma
)
) +
labs(title = "Train 9031 – Yield vs Volume by Day of Week (2024)",
x = "Day of Week") +
theme_solarized() +
theme(
axis.title.y.left = element_text(color = "#EC407A", size = 12, face = "bold"),
axis.text.y.left = element_text(color = "#EC407A"),
axis.title.y.right = element_text(color = "#0277BD", size = 12, face = "bold"),
axis.text.y.right = element_text(color = "#0277BD")
)
This scatter plot identifies trains in 2024 with the highest residuals from the RPS-yield regression.
df_scatter <- df %>%
filter(Month %in% c("Oct", "Nov"),
year(`Travel Date`) == 2024) %>%
mutate(
RPS = Revenue / 882,
Yield = Revenue / BookedVolume,
Year = year(`Travel Date`)
)
model_2024 <- lm(Yield ~ RPS, data = df_scatter)
df_scatter <- df_scatter %>%
mutate(
PredictedYield = predict(model_2024, newdata = df_scatter),
Residual = Yield - PredictedYield
)
top_train_labels <- df_scatter %>%
group_by(`Train Number`) %>%
summarise(
MaxResidual = max(Residual, na.rm = TRUE),
RPS = RPS[which.max(Residual)],
Yield = Yield[which.max(Residual)]
) %>%
arrange(desc(MaxResidual)) %>%
slice_head(n = 5)
ggplot(df_scatter, aes(x = RPS, y = Yield)) +
geom_point(alpha = 0.6, color = "#00bfc4", size = 3) +
geom_smooth(method = "lm", se = FALSE, fullrange = TRUE, color = "#00bfc4", linewidth = 1.2) +
geom_text_repel(data = top_train_labels,
aes(x = RPS, y = Yield, label = `Train Number`),
size = 3.5,
color = "forestgreen",
box.padding = 0.5,
min.segment.length = 0) +
labs(title = "Strong Performing Trains Above RPS–Yield Regression (2024)",
x = "Revenue per Available Seat (RPS €)",
y = "Revenue per Booked Seat (Yield €)") +
scale_x_continuous(labels = scales::dollar_format(prefix = "€")) +
scale_y_continuous(labels = scales::dollar_format(prefix = "€")) +
theme_minimal()
We compare passenger volumes for October and November between 2023 and 2024.
passenger_summary <- df %>%
filter(Month %in% c("Oct", "Nov")) %>%
mutate(
Year = as.factor(year(`Travel Date`)),
Month = month(`Travel Date`, label = TRUE)
) %>%
group_by(Year, Month) %>%
summarise(Passengers = sum(BookedVolume), .groups = "drop")
total_passengers <- passenger_summary %>%
group_by(Year) %>%
summarise(Total = sum(Passengers), .groups = "drop")
ggplot(passenger_summary, aes(x = Year, y = Passengers, fill = Month)) +
geom_col(position = position_dodge(width = 0.7), width = 0.6) +
geom_text(data = total_passengers,
aes(x = Year, y = Total * 1.05,
label = paste0("Total: ", scales::comma(Total))),
inherit.aes = FALSE,
fontface = "bold", size = 5, color = "black") +
labs(title = "Passenger Volume Comparison by Year (Oct vs Nov)",
x = "Year", y = "Total Passengers", fill = "Month") +
scale_y_continuous(labels = scales::comma) +
theme_solarized_2()
This section compares total revenue by train for October and November
across 2023 and 2024. Note: The dataset train_revenue is
assumed to be derived from df or provided separately.
train_revenue <- df %>%
filter(Month %in% c("Oct", "Nov")) %>%
group_by(`Train Number`, Year) %>%
summarise(TotalRevenue = sum(Revenue, na.rm = TRUE), .groups = "drop")
train_order <- train_revenue %>%
group_by(`Train Number`) %>%
summarise(AllRevenue = sum(TotalRevenue, na.rm = TRUE)) %>%
arrange(desc(AllRevenue)) %>%
pull(`Train Number`)
train_revenue$`Train Number` <- factor(train_revenue$`Train Number`, levels = train_order)
ggplot(train_revenue, aes(x = `Train Number`, y = TotalRevenue, fill = factor(Year))) +
geom_col(position = "dodge", width = 0.7) +
labs(title = "Revenue Comparison by Train (Oct–Nov 2023 vs 2024)",
x = "Train Number",
y = "Total Revenue (€)",
fill = "Year") +
scale_y_continuous(labels = scales::comma) +
theme_solarized() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))