This R Markdown document analyzes an airline booking dataset to uncover insights into route performance, booking behavior, and the factors that predict booking completion. We explore booking trends, geographic patterns, and ancillary service preferences, and use machine learning to identify the key drivers of successful bookings. The analysis first highlights geographic and predictive insights, followed by detailed booking trends.
Actual revenue metrics such as RASK, CASK, load factor (LF), RPK, ASK, and yield could not be sourced, owing to confidentiality restrictions and the unavailability of commercial data products such as OAG and IATA DDS.
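For reference, if such a route-level revenue table were available, the standard unit metrics could be derived in a few lines of R. The sketch below uses made-up figures and assumed column names (revenue, cost, passengers_carried, seats_offered, distance_km) purely for illustration; none of it reflects real data.
# Illustrative only: hypothetical route-level revenue table and the standard unit metrics
route_metrics <- data.frame(
  route              = c("Route A", "Route B"),
  revenue            = c(1.8e6, 1.2e6),   # assumed figures, not real data
  cost               = c(1.5e6, 1.1e6),
  passengers_carried = c(5200, 4100),
  seats_offered      = c(6000, 5000),
  distance_km        = c(12300, 8700)
)
route_metrics <- within(route_metrics, {
  ASK   <- seats_offered * distance_km       # Available Seat Kilometres
  RPK   <- passengers_carried * distance_km  # Revenue Passenger Kilometres
  LF    <- RPK / ASK                         # Load Factor
  RASK  <- revenue / ASK                     # Revenue per ASK
  CASK  <- cost / ASK                        # Cost per ASK
  Yield <- revenue / RPK                     # Yield (revenue per RPK)
})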
# Load packages using pacman
pacman::p_load(
pacman, rio, tidyverse, forcats, stringr, corrplot, magrittr, vcd, caret,
parallel, randomForest, skimr, psych, doParallel, sf, rnaturalearth,
rnaturalearthdata, tidyr, ggrepel, DescTools, car, broom, effects, ranger, here
)
# Load dataset using rio::import
data <- rio::import("C:/Users/Veerpal Kaur/OneDrive/Desktop/Airline Route Analysis/Airline Route Data.csv")
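# A more portable alternative (sketch only, assuming the script lives in an
# RStudio project with the CSV in a "data" sub-folder): the already-loaded
# here package avoids the machine-specific absolute path above.
# data <- rio::import(here::here("data", "Airline Route Data.csv"))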
# Create numeric version for mean/sum operations
data_numeric <- data %>%
mutate(
booking_complete = as.numeric(booking_complete),
wants_extra_baggage = as.numeric(wants_extra_baggage),
wants_preferred_seat = as.numeric(wants_preferred_seat),
wants_in_flight_meals = as.numeric(wants_in_flight_meals)
)
# Overview of dataset structure
glimpse(data)
## Rows: 50,000
## Columns: 14
## $ num_passengers <int> 2, 1, 2, 1, 2, 1, 3, 2, 1, 1, 2, 1, 4, 1, 1, 1, …
## $ sales_channel <chr> "Internet", "Internet", "Internet", "Internet", …
## $ trip_type <chr> "RoundTrip", "RoundTrip", "RoundTrip", "RoundTri…
## $ purchase_lead <int> 262, 112, 243, 96, 68, 3, 201, 238, 80, 378, 185…
## $ length_of_stay <int> 19, 20, 22, 31, 22, 48, 33, 19, 22, 30, 25, 43, …
## $ flight_hour <int> 7, 3, 17, 4, 15, 20, 6, 14, 4, 12, 14, 2, 19, 14…
## $ flight_day <chr> "Sat", "Sat", "Wed", "Sat", "Wed", "Thu", "Thu",…
## $ route <chr> "AKLDEL", "AKLDEL", "AKLDEL", "AKLDEL", "AKLDEL"…
## $ booking_origin <chr> "New Zealand", "New Zealand", "India", "New Zeal…
## $ wants_extra_baggage <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ wants_preferred_seat <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ wants_in_flight_meals <int> 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ flight_duration <dbl> 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, …
## $ booking_complete <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
# Identify top 10 countries by booking volume
top10 <- data %>%
count(booking_origin, sort = TRUE) %>%
slice_max(order_by = n, n = 10) %>%
pull(booking_origin)
# Summarize booking counts
df_top <- data %>% filter(booking_origin %in% top10)
country_counts <- df_top %>%
group_by(booking_origin, booking_complete) %>%
summarise(count = n(), .groups = "drop") %>%
pivot_wider(names_from = booking_complete, values_from = count, values_fill = 0, names_prefix = "status_") %>%
rename(Abandoned = status_0, Completed = status_1) %>%
mutate(
Abandoned = as.numeric(coalesce(Abandoned, 0)),
Completed = as.numeric(coalesce(Completed, 0)),
Total = Abandoned + Completed,
ConversionRate = round(100 * Completed / Total, 1)
)
# Prepare map data with centroids
world <- ne_countries(scale = "medium", returnclass = "sf")
world_top <- world %>% filter(name %in% top10)
world_centroids <- suppressWarnings(st_point_on_surface(world_top))
coords <- as.data.frame(st_coordinates(world_centroids))
world_coords <- bind_cols(world_top, coords)
# Merge booking and map data
map_data <- left_join(world_coords, country_counts, by = c("name" = "booking_origin")) %>%
mutate(
Completed = as.numeric(coalesce(Completed, 0)),
Abandoned = as.numeric(coalesce(Abandoned, 0)),
Total = as.numeric(coalesce(Total, 0))
)
# Reshape for plotting
map_data_long <- map_data %>%
select(name, X, Y, Completed, Abandoned) %>%
pivot_longer(cols = c(Completed, Abandoned), names_to = "Status", values_to = "Count") %>%
mutate(Count = as.numeric(coalesce(Count, 0)))
# Split for layering
abandoned_data <- map_data_long %>% filter(Status == "Abandoned")
completed_data <- map_data_long %>% filter(Status == "Completed")
# Plot proportional symbol map
status_colors <- c("Completed" = "#008080", "Abandoned" = "pink")
ggplot() +
geom_sf(data = world, fill = "gray98", color = "gray80", lwd = 0.2) +
geom_point(data = abandoned_data, aes(x = X, y = Y, size = Count, fill = Status),
shape = 21, color = "black", alpha = 0.8) +
geom_point(data = completed_data, aes(x = X, y = Y, size = Count, fill = Status),
shape = 21, color = "black", alpha = 0.8) +
scale_size_continuous(range = c(3, 12), name = "Booking Count") +
scale_fill_manual(values = status_colors, name = "Booking Status") +
coord_sf(xlim = c(-180, 180), ylim = c(-60, 90), expand = FALSE) +
theme_minimal(base_size = 13) +
labs(
title = "Booking Behavior – Top 10 Countries by Volume",
subtitle = "Completed vs. Abandoned Bookings",
caption = "Circles show booking volume; color indicates status; % shows completion rate",
x = NULL, y = NULL
) +
theme(
legend.position = "bottom", legend.box = "horizontal",
panel.spacing = unit(10, "lines"), aspect.ratio = 0.35,
legend.spacing.x = unit(0.5, 'cm'), legend.title = element_text(face = "bold"),
legend.text = element_text(size = 10), panel.grid = element_line(color = "gray90"),
plot.caption = element_text(size = 10, face = "italic", hjust = 0)
) +
guides(
size = guide_legend(order = 1, direction = "horizontal", title.position = "top"),
fill = guide_legend(order = 2, direction = "horizontal", title.position = "top")
) +
geom_text_repel(
data = map_data, aes(x = X, y = Y, label = paste0(name, " (", ConversionRate, "%)")),
size = 3, max.overlaps = 10, box.padding = 0.3, point.padding = 0.2, segment.color = 'grey50'
)
# ranger is used in place of randomForest because my PC cannot handle
# randomForest's memory and runtime demands on this dataset; ranger gives
# comparable results far more efficiently.
# Convert variables to factors
data <- data %>%
mutate(
sales_channel = as.factor(sales_channel),
trip_type = as.factor(trip_type),
flight_day = as.factor(flight_day),
route = as.factor(route),
booking_origin = as.factor(booking_origin),
wants_extra_baggage = as.factor(wants_extra_baggage),
wants_preferred_seat = as.factor(wants_preferred_seat),
wants_in_flight_meals = as.factor(wants_in_flight_meals),
booking_complete = as.factor(booking_complete)
)
# Select features and remove NAs
feature_data <- data %>%
select(
booking_complete, num_passengers, sales_channel, trip_type, purchase_lead,
length_of_stay, flight_hour, flight_day, route, booking_origin,
wants_extra_baggage, wants_preferred_seat, wants_in_flight_meals, flight_duration
) %>%
na.omit()
# Split into training (70%) and testing (30%)
set.seed(123)
train_index <- createDataPartition(feature_data$booking_complete, p = 0.7, list = FALSE)
train_data <- feature_data[train_index, ]
test_data <- feature_data[-train_index, ]
# Train random forest
rf_model <- ranger(
booking_complete ~ ., data = train_data, num.trees = 500,
importance = "impurity", probability = TRUE, seed = 42
)
# Predict and evaluate
# Predicted probability of the positive class ("1")
rf_probs <- predict(rf_model, data = test_data)$predictions[, "1"]
rf_preds <- ifelse(rf_probs > 0.5, "1", "0") |> factor(levels = c("0", "1"))
actuals <- test_data$booking_complete
conf_matrix <- confusionMatrix(rf_preds, actuals, positive = "1")
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 12656 2097
## 1 100 146
##
## Accuracy : 0.8535
## 95% CI : (0.8478, 0.8591)
## No Information Rate : 0.8505
## P-Value [Acc > NIR] : 0.1487
##
## Kappa : 0.0904
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.065091
## Specificity : 0.992161
## Pos Pred Value : 0.593496
## Neg Pred Value : 0.857859
## Prevalence : 0.149543
## Detection Rate : 0.009734
## Detection Prevalence : 0.016401
## Balanced Accuracy : 0.528626
##
## 'Positive' Class : 1
##
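# For comparison, the heavier randomForest call that ranger stands in for would
# look roughly like this (not run here, for the resource reasons noted above):
# rf_classic <- randomForest::randomForest(
#   booking_complete ~ ., data = train_data,
#   ntree = 500, importance = TRUE
# )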
# Extract and plot feature importance
importance_df <- data.frame(
Feature = names(rf_model$variable.importance),
Importance = rf_model$variable.importance
) %>%
arrange(desc(Importance)) %>%
slice_head(n = 10)
ggplot(importance_df, aes(x = fct_reorder(Feature, Importance), y = Importance)) +
geom_col(fill = "#008080") +
coord_flip() +
labs(
title = "Top 10 Important Features for Booking Completion",
x = "Feature", y = "Gini-Based Importance"
) +
theme_minimal(base_size = 13)
# Prepare data with limited factor levels
logit_data <- data %>%
mutate(
route = fct_lump(route, n = 20),
booking_origin = fct_lump(booking_origin, n = 20)
) %>%
select(
booking_complete, num_passengers, sales_channel, trip_type, purchase_lead,
length_of_stay, flight_hour, flight_day, route, booking_origin,
wants_extra_baggage, wants_preferred_seat, wants_in_flight_meals, flight_duration
) %>%
na.omit()
# Fit logistic regression
logit_model <- glm(booking_complete ~ ., data = logit_data, family = binomial(link = "logit"))
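# Optional addition: broom (already loaded) expresses the coefficients as odds
# ratios, which complements the effect plots below.
logit_odds <- broom::tidy(logit_model, exponentiate = TRUE) %>%
arrange(desc(estimate))
head(logit_odds, 10)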
# Compute and plot route effects
eff <- allEffects(logit_model)
route_effects <- eff[["route"]] %>%
as.data.frame() %>%
rename(route = route, estimate = fit, conf.low = lower, conf.high = upper) %>%
arrange(desc(estimate)) %>%
slice_head(n = 10)
ggplot(route_effects, aes(x = fct_reorder(route, estimate), y = estimate)) +
geom_point(color = "#008080", size = 3) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
coord_flip() +
labs(
title = "Top 10 Routes by Booking Completion Odds",
x = "Route", y = "Predicted Probability"
) +
theme_minimal(base_size = 13)
# df_1 was not defined above; use the full dataset for the exploratory plots that follow
df_1 <- data
df_1 %>%
ggplot(aes(x = as.factor(flight_hour), fill = factor(booking_complete))) +
geom_bar(position = "stack") +
scale_fill_manual(
values = c("0" = "orange", "1" = "forestgreen"),
labels = c("Abandoned", "Completed"),
name = "Booking Status"
) +
labs(
title = "Bookings by Flight Hour (Stacked by Completion)",
x = "Hour of Day", y = "Number of Bookings"
) +
theme_classic()
data_numeric %>%
group_by(flight_hour) %>%
summarise(conversion = mean(booking_complete, na.rm = TRUE)) %>%
ggplot(aes(x = as.factor(flight_hour), y = conversion)) +
geom_col(fill = "darkcyan") +
labs(title = "Booking Conversion Rate by Flight Hour", x = "Hour", y = "Conversion Rate") +
scale_y_continuous(labels = scales::percent)
df_1 %>%
ggplot(aes(x = flight_day, fill = factor(booking_complete))) +
geom_bar(position = "stack") +
scale_fill_manual(
values = c("0" = "grey50", "1" = "black"),
labels = c("Abandoned", "Completed"),
name = "Booking Status"
) +
labs(
title = "Bookings by Day of Week (Stacked by Completion)",
x = "Day", y = "Number of Bookings"
) +
theme_minimal()
df_1 %>%
count(booking_origin, sort = TRUE) %>%
slice_head(n = 10) %>%
ggplot(aes(x = reorder(booking_origin, n), y = n)) +
geom_col(fill = "tomato") +
coord_flip() +
labs(title = "Top 10 Booking Origins", x = "Country", y = "Bookings")df_1 %>%
ggplot(aes(x = purchase_lead, fill = factor(booking_complete))) +
geom_histogram(bins = 50, position = "stack") +
scale_fill_manual(
values = c("0" = "darkred", "1" = "darkgreen"),
labels = c("Abandoned", "Completed"),
name = "Booking Status"
) +
scale_x_continuous(limits = c(0, 450), breaks = c(seq(0, 100, by = 10), seq(150, 450, by = 50))) +
scale_y_continuous(breaks = seq(0, 9000, 250)) +
labs(
title = "Purchase Lead Time Distribution (Stacked by Completion)",
x = "Days Before Departure", y = "Frequency",
caption = "Purchase lead = Days booked in advance"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 25 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
data_numeric %>%
group_by(route) %>%
summarise(total_stay = sum(length_of_stay, na.rm = TRUE)) %>%
slice_max(total_stay, n = 10) %>%
ggplot(aes(x = reorder(route, total_stay), y = total_stay, fill = route)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(x = "Route", y = "Total Length of Stay", title = "Top 10 Routes by Length of Stay") +
theme_grey()
data_numeric %>%
group_by(route) %>%
summarise(avg_purchase_lead = mean(purchase_lead, na.rm = TRUE)) %>%
slice_max(avg_purchase_lead, n = 10) %>%
ggplot(aes(x = fct_reorder(route, avg_purchase_lead), y = avg_purchase_lead, fill = route)) +
geom_col(show.legend = FALSE) +
coord_flip() +
labs(
x = "Route", y = "Avg Purchase Lead (Days)",
title = "Top 10 Routes by Purchase Lead",
caption = "Avg purchase lead = Days booked in advance"
) +
theme_minimal()
# Examine ancillary service preferences for top 6 routes.
top_routes_by_booking <- data_numeric %>%
group_by(route) %>%
summarise(completed_bookings = sum(booking_complete, na.rm = TRUE)) %>%
slice_max(completed_bookings, n = 6) %>%
pull(route)
ancillary_data <- data_numeric %>%
filter(route %in% top_routes_by_booking) %>%
select(route, wants_extra_baggage, wants_in_flight_meals, wants_preferred_seat) %>%
pivot_longer(cols = -route, names_to = "service", values_to = "selected")
ggplot(ancillary_data, aes(x = route, fill = factor(selected))) +
geom_bar(position = "fill") +
scale_fill_manual(values = c("0" = "gray80", "1" = "steelblue"), name = "Selected") +
scale_y_continuous(labels = scales::percent) +
facet_wrap(~service, ncol = 1) +
labs(
title = "Ancillary Preference Distribution by Top 6 Routes",
x = "Route", y = "Percentage"
) +
theme_minimal() +
theme(legend.position = "bottom")
# Explore sales channels vs. trip types with a mosaic plot.
df_1 %>%
select(sales_channel, trip_type) %>%
table() %>%
mosaicplot(color = TRUE, main = "Sales Channel vs. Trip Type", las = 1)
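# Optional addition: vcd (already loaded) quantifies the association shown in
# the mosaic plot, e.g. via Cramer's V.
vcd::assocstats(table(df_1$sales_channel, df_1$trip_type))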
# Analyze ancillary service demand by flight duration.
df_3 <- data_numeric %>%
mutate(
flight_type = case_when(
flight_duration < 3 ~ "Short-haul",
flight_duration >= 3 & flight_duration <= 6 ~ "Medium-haul",
flight_duration > 6 ~ "Long-haul"
),
flight_type = factor(flight_type, levels = c("Short-haul", "Medium-haul", "Long-haul"))
)
addons_by_duration <- df_3 %>%
group_by(flight_type) %>%
summarise(
Extra_Baggage = mean(wants_extra_baggage, na.rm = TRUE),
Preferred_Seat = mean(wants_preferred_seat, na.rm = TRUE),
Inflight_Meals = mean(wants_in_flight_meals, na.rm = TRUE)
) %>%
pivot_longer(cols = -flight_type, names_to = "Service", values_to = "Proportion")
ggplot(addons_by_duration, aes(x = flight_type, y = Proportion, fill = Service)) +
geom_col(position = "dodge") +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
labs(title = "Add-on Service Demand by Flight Duration",
y = "Proportion of Users", x = "Flight Type") +
theme_bw()
# Examine booking completion rates by lead time buckets.
data_numeric %>%
mutate(lead_bucket = cut(
purchase_lead,
breaks = c(0, 7, 14, 30, 60, 90, 180, 365, Inf),
labels = c("0–7", "8–14", "15–30", "31–60", "61–90", "91–180", "181–365", "365+"),
include.lowest = TRUE  # keep same-day bookings (purchase_lead = 0) in the first bucket
)) %>%
group_by(lead_bucket) %>%
summarise(conversion = mean(booking_complete, na.rm = TRUE)) %>%
ggplot(aes(x = lead_bucket, y = conversion)) +
geom_col(fill = "dodgerblue") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Booking Propensity by Lead-Time Bucket",
x = "Purchase Lead Bucket", y = "Conversion Rate")Key insights - Geographic Focus: Focus countries can be considered to solve region specific problems. - Booking Trends: Optimize for peak hours/days and enhance long-haul add-ons. - Service Offerings: Focus routes can be adjusted.
In a nutshell, it is the problem question that needs to be solved, which ensuring availability of the data. ```