This R Markdown document analyzes an airline dataset to uncover insights into route performance, booking behaviors, and predictive factors for booking completion. We explore booking trends, geographic patterns, ancillary service preferences, and use machine learning to identify key drivers of successful bookings. The analysis is structured to first highlight geographic and predictive insights, followed by detailed booking trends.

The actual revenue data with RASK, CASK, LF, RPK, ASK, Yield and other metrics were difficult to source due to confidentiality and unavailability of data products like OAG, IATA DDS etc.

Setup and Data Loading

# Load packages using pacman
pacman::p_load(
  pacman, rio, tidyverse, forcats, stringr, corrplot, magrittr, vcd, caret,
  parallel, randomForest, skimr, psych, doParallel, sf, rnaturalearth,
  rnaturalearthdata, tidyr, ggrepel, DescTools, car, broom, effects, ranger, here
)

# Load dataset using rio::import
data <- rio::import("C:/Users/Veerpal Kaur/OneDrive/Desktop/Airline Route Analysis/Airline Route Data.csv")

# Create numeric version for mean/sum operations
data_numeric <- data %>%
  mutate(
    booking_complete = as.numeric(booking_complete),
    wants_extra_baggage = as.numeric(wants_extra_baggage),
    wants_preferred_seat = as.numeric(wants_preferred_seat),
    wants_in_flight_meals = as.numeric(wants_in_flight_meals)
  )

# Overview of dataset structure
glimpse(data)
## Rows: 50,000
## Columns: 14
## $ num_passengers        <int> 2, 1, 2, 1, 2, 1, 3, 2, 1, 1, 2, 1, 4, 1, 1, 1, …
## $ sales_channel         <chr> "Internet", "Internet", "Internet", "Internet", …
## $ trip_type             <chr> "RoundTrip", "RoundTrip", "RoundTrip", "RoundTri…
## $ purchase_lead         <int> 262, 112, 243, 96, 68, 3, 201, 238, 80, 378, 185…
## $ length_of_stay        <int> 19, 20, 22, 31, 22, 48, 33, 19, 22, 30, 25, 43, …
## $ flight_hour           <int> 7, 3, 17, 4, 15, 20, 6, 14, 4, 12, 14, 2, 19, 14…
## $ flight_day            <chr> "Sat", "Sat", "Wed", "Sat", "Wed", "Thu", "Thu",…
## $ route                 <chr> "AKLDEL", "AKLDEL", "AKLDEL", "AKLDEL", "AKLDEL"…
## $ booking_origin        <chr> "New Zealand", "New Zealand", "India", "New Zeal…
## $ wants_extra_baggage   <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ wants_preferred_seat  <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ wants_in_flight_meals <int> 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ flight_duration       <dbl> 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, …
## $ booking_complete      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Geographic Analysis

Booking Behavior by Top 10 Countries

Visualize booking volumes and completion rates for top 10 countries on a world map.

# Identify top 10 countries by booking volume
top10 <- data %>%
  count(booking_origin, sort = TRUE) %>%
  slice_max(order_by = n, n = 10) %>%
  pull(booking_origin)

# Summarize booking counts
df_top <- data %>% filter(booking_origin %in% top10)
country_counts <- df_top %>%
  group_by(booking_origin, booking_complete) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = booking_complete, values_from = count, values_fill = 0, names_prefix = "status_") %>%
  rename(Abandoned = status_0, Completed = status_1) %>%
  mutate(
    Abandoned = as.numeric(coalesce(Abandoned, 0)),
    Completed = as.numeric(coalesce(Completed, 0)),
    Total = Abandoned + Completed,
    ConversionRate = round(100 * Completed / Total, 1)
  )

# Prepare map data with centroids
world <- ne_countries(scale = "medium", returnclass = "sf")
world_top <- world %>% filter(name %in% top10)
world_centroids <- suppressWarnings(st_point_on_surface(world_top))
coords <- as.data.frame(st_coordinates(world_centroids))
world_coords <- bind_cols(world_top, coords)

# Merge booking and map data
map_data <- left_join(world_coords, country_counts, by = c("name" = "booking_origin")) %>%
  mutate(
    Completed = as.numeric(coalesce(Completed, 0)),
    Abandoned = as.numeric(coalesce(Abandoned, 0)),
    Total = as.numeric(coalesce(Total, 0))
  )

# Reshape for plotting
map_data_long <- map_data %>%
  select(name, X, Y, Completed, Abandoned) %>%
  pivot_longer(cols = c(Completed, Abandoned), names_to = "Status", values_to = "Count") %>%
  mutate(Count = as.numeric(coalesce(Count, 0)))

# Split for layering
abandoned_data <- map_data_long %>% filter(Status == "Abandoned")
completed_data <- map_data_long %>% filter(Status == "Completed")

# Plot proportional symbol map
status_colors <- c("Completed" = "#008080", "Abandoned" = "pink")
ggplot() +
  geom_sf(data = world, fill = "gray98", color = "gray80", lwd = 0.2) +
  geom_point(data = abandoned_data, aes(x = X, y = Y, size = Count, fill = Status),
             shape = 21, color = "black", alpha = 0.8) +
  geom_point(data = completed_data, aes(x = X, y = Y, size = Count, fill = Status),
             shape = 21, color = "black", alpha = 0.8) +
  scale_size_continuous(range = c(3, 12), name = "Booking Count") +
  scale_fill_manual(values = status_colors, name = "Booking Status") +
  coord_sf(xlim = c(-180, 180), ylim = c(-60, 90), expand = FALSE) +
  theme_minimal(base_size = 13) +
  labs(
    title = "Booking Behavior – Top 10 Countries by Volume",
    subtitle = "Completed vs. Abandoned Bookings",
    caption = "Circles show booking volume; color indicates status; % shows completion rate",
    x = NULL, y = NULL
  ) +
  theme(
    legend.position = "bottom", legend.box = "horizontal",
    panel.spacing = unit(10, "lines"), aspect.ratio = 0.35,
    legend.spacing.x = unit(0.5, 'cm'), legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 10), panel.grid = element_line(color = "gray90"),
    plot.caption = element_text(size = 10, face = "italic", hjust = 0)
  ) +
  guides(
    size = guide_legend(order = 1, direction = "horizontal", title.position = "top"),
    fill = guide_legend(order = 2, direction = "horizontal", title.position = "top")
  ) +
  geom_text_repel(
    data = map_data, aes(x = X, y = Y, label = paste0(name, " (", ConversionRate, "%)")),
    size = 3, max.overlaps = 10, box.padding = 0.3, point.padding = 0.2, segment.color = 'grey50'
  )

Predictive Modeling

Random Forest Model

Train a random forest model to predict booking completion.

# I used ranger as my PC does not support the complexity of Random Forest, but can gather similar results with Ranger 

# Convert variables to factors
data <- data %>%
  mutate(
    sales_channel = as.factor(sales_channel),
    trip_type = as.factor(trip_type),
    flight_day = as.factor(flight_day),
    route = as.factor(route),
    booking_origin = as.factor(booking_origin),
    wants_extra_baggage = as.factor(wants_extra_baggage),
    wants_preferred_seat = as.factor(wants_preferred_seat),
    wants_in_flight_meals = as.factor(wants_in_flight_meals),
    booking_complete = as.factor(booking_complete)
  )

# Select features and remove NAs
feature_data <- data %>%
  select(
    booking_complete, num_passengers, sales_channel, trip_type, purchase_lead,
    length_of_stay, flight_hour, flight_day, route, booking_origin,
    wants_extra_baggage, wants_preferred_seat, wants_in_flight_meals, flight_duration
  ) %>%
  na.omit()

# Split into training (70%) and testing (30%)
set.seed(123)
train_index <- createDataPartition(feature_data$booking_complete, p = 0.7, list = FALSE)
train_data <- feature_data[train_index, ]
test_data <- feature_data[-train_index, ]

# Train random forest
rf_model <- ranger(
  booking_complete ~ ., data = train_data, num.trees = 500,
  importance = "impurity", probability = TRUE, seed = 42
)

# Predict and evaluate
rf_probs <- predict(rf_model, data = test_data)$predictions[, 2]
rf_preds <- ifelse(rf_probs > 0.5, "1", "0") |> factor(levels = c("0", "1"))
actuals <- test_data$booking_complete
conf_matrix <- confusionMatrix(rf_preds, actuals, positive = "1")
print(conf_matrix)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 12656  2097
##          1   100   146
##                                           
##                Accuracy : 0.8535          
##                  95% CI : (0.8478, 0.8591)
##     No Information Rate : 0.8505          
##     P-Value [Acc > NIR] : 0.1487          
##                                           
##                   Kappa : 0.0904          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.065091        
##             Specificity : 0.992161        
##          Pos Pred Value : 0.593496        
##          Neg Pred Value : 0.857859        
##              Prevalence : 0.149543        
##          Detection Rate : 0.009734        
##    Detection Prevalence : 0.016401        
##       Balanced Accuracy : 0.528626        
##                                           
##        'Positive' Class : 1               
## 

Feature Importance

Plot top 10 features influencing booking completion.

# Extract and plot feature importance
importance_df <- data.frame(
  Feature = names(rf_model$variable.importance),
  Importance = rf_model$variable.importance
) %>%
  arrange(desc(Importance)) %>%
  slice_head(n = 10)

ggplot(importance_df, aes(x = fct_reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "#008080") +
  coord_flip() +
  labs(
    title = "Top 10 Important Features for Booking Completion",
    x = "Feature", y = "Gini-Based Importance"
  ) +
  theme_minimal(base_size = 13)

Logistic Regression: Route Effects

Estimate booking completion odds for top routes.

# Prepare data with limited factor levels
logit_data <- data %>%
  mutate(
    route = fct_lump(route, n = 20),
    booking_origin = fct_lump(booking_origin, n = 20)
  ) %>%
  select(
    booking_complete, num_passengers, sales_channel, trip_type, purchase_lead,
    length_of_stay, flight_hour, flight_day, route, booking_origin,
    wants_extra_baggage, wants_preferred_seat, wants_in_flight_meals, flight_duration
  ) %>%
  na.omit()

# Fit logistic regression
logit_model <- glm(booking_complete ~ ., data = logit_data, family = binomial(link = "logit"))

# Compute and plot route effects
eff <- allEffects(logit_model)
route_effects <- eff[["route"]] %>%
  as.data.frame() %>%
  rename(route = route, estimate = fit, conf.low = lower, conf.high = upper) %>%
  arrange(desc(estimate)) %>%
  slice_head(n = 10)

ggplot(route_effects, aes(x = fct_reorder(route, estimate), y = estimate)) +
  geom_point(color = "#008080", size = 3) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  coord_flip() +
  labs(
    title = "Top 10 Routes by Booking Completion Odds",
    x = "Route", y = "Predicted Probability"
  ) +
  theme_minimal(base_size = 13)

Visualize booking volumes by hour, stacked by completion status.

df_1 %>% 
  ggplot(aes(x = as.factor(flight_hour), fill = factor(booking_complete))) +
  geom_bar(position = "stack") +
  scale_fill_manual(
    values = c("0" = "orange", "1" = "forestgreen"),
    labels = c("Abandoned", "Completed"),
    name = "Booking Status"
  ) +
  labs(
    title = "Bookings by Flight Hour (Stacked by Completion)",
    x = "Hour of Day", y = "Number of Bookings"
  ) +
  theme_classic()

Conversion Rate by Flight Hour

Show percentage of bookings completed by hour.

data_numeric %>%
  group_by(flight_hour) %>%
  summarise(conversion = mean(booking_complete, na.rm = TRUE)) %>%
  ggplot(aes(x = as.factor(flight_hour), y = conversion)) +
  geom_col(fill = "darkcyan") +
  labs(title = "Booking Conversion Rate by Flight Hour", x = "Hour", y = "Conversion Rate") +
  scale_y_continuous(labels = scales::percent)

Visualize booking volumes by day of week, stacked by completion status.

df_1 %>% 
  ggplot(aes(x = flight_day, fill = factor(booking_complete))) +
  geom_bar(position = "stack") +
  scale_fill_manual(
    values = c("0" = "grey50", "1" = "black"),
    labels = c("Abandoned", "Completed"),
    name = "Booking Status"
  ) +
  labs(
    title = "Bookings by Day of Week (Stacked by Completion)",
    x = "Day", y = "Number of Bookings"
  ) +
  theme_minimal()

Booking Origin Breakdown

Identify top 10 countries by booking volume.

df_1 %>%
  count(booking_origin, sort = TRUE) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = reorder(booking_origin, n), y = n)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  labs(title = "Top 10 Booking Origins", x = "Country", y = "Bookings")

Purchase Lead Time

Analyze booking lead times, stacked by completion status.

df_1 %>% 
  ggplot(aes(x = purchase_lead, fill = factor(booking_complete))) +
  geom_histogram(bins = 50, position = "stack") +
  scale_fill_manual(
    values = c("0" = "darkred", "1" = "darkgreen"),
    labels = c("Abandoned", "Completed"),
    name = "Booking Status"
  ) +
  scale_x_continuous(limits = c(0, 450), breaks = c(seq(0, 100, by = 10), seq(150, 450, by = 50))) +
  scale_y_continuous(breaks = seq(0, 9000, 250)) +
  labs(
    title = "Purchase Lead Time Distribution (Stacked by Completion)",
    x = "Days Before Departure", y = "Frequency",
    caption = "Purchase lead = Days booked in advance"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
## Warning: Removed 25 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Length of Stay by Route

Show total length of stay for top 10 routes.

data_numeric %>%
  group_by(route) %>%
  summarise(total_stay = sum(length_of_stay, na.rm = TRUE)) %>%
  slice_max(total_stay, n = 10) %>%
  ggplot(aes(x = reorder(route, total_stay), y = total_stay, fill = route)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = "Route", y = "Total Length of Stay", title = "Top 10 Routes by Length of Stay") +
  theme_grey()

Purchase Lead by Route

Highlight routes with longest average booking lead times.

data_numeric %>%
  group_by(route) %>% 
  summarise(avg_purchase_lead = mean(purchase_lead, na.rm = TRUE)) %>%
  slice_max(avg_purchase_lead, n = 10) %>%
  ggplot(aes(x = fct_reorder(route, avg_purchase_lead), y = avg_purchase_lead, fill = route)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    x = "Route", y = "Avg Purchase Lead (Days)", 
    title = "Top 10 Routes by Purchase Lead",
    caption = "Avg purchase lead = Days booked in advance"
  ) +
  theme_minimal()

Ancillary Preferences by Route

# Examine ancillary service preferences for top 6 routes.

top_routes_by_booking <- data_numeric %>%
  group_by(route) %>%
  summarise(completed_bookings = sum(booking_complete, na.rm = TRUE)) %>%
  slice_max(completed_bookings, n = 6) %>%
  pull(route)

ancillary_data <- data_numeric %>%
  filter(route %in% top_routes_by_booking) %>%
  select(route, wants_extra_baggage, wants_in_flight_meals, wants_preferred_seat) %>%
  pivot_longer(cols = -route, names_to = "service", values_to = "selected")

ggplot(ancillary_data, aes(x = route, fill = factor(selected))) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("0" = "gray80", "1" = "steelblue"), name = "Selected") +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~service, ncol = 1) +
  labs(
    title = "Ancillary Preference Distribution by Top 6 Routes",
    x = "Route", y = "Percentage"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")

Sales Channel vs. Trip Type

# Explore sales channels vs. trip types with a mosaic plot.

df_1 %>%
  select(sales_channel, trip_type) %>%
  table() %>%
  mosaicplot(color = TRUE, main = "Sales Channel vs. Trip Type", las = 1)

Add-on Demand by Flight Duration

# Analyze ancillary service demand by flight duration.

df_3 <- data_numeric %>%
  mutate(
    flight_type = case_when(
      flight_duration < 3 ~ "Short-haul",
      flight_duration >= 3 & flight_duration <= 6 ~ "Medium-haul",
      flight_duration > 6 ~ "Long-haul"
    ),
    flight_type = factor(flight_type, levels = c("Short-haul", "Medium-haul", "Long-haul"))
  )

addons_by_duration <- df_3 %>%
  group_by(flight_type) %>%
  summarise(
    Extra_Baggage = mean(wants_extra_baggage, na.rm = TRUE),
    Preferred_Seat = mean(wants_preferred_seat, na.rm = TRUE),
    Inflight_Meals = mean(wants_in_flight_meals, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = -flight_type, names_to = "Service", values_to = "Proportion")

ggplot(addons_by_duration, aes(x = flight_type, y = Proportion, fill = Service)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(title = "Add-on Service Demand by Flight Duration",
       y = "Proportion of Users", x = "Flight Type") +
  theme_bw()

Booking Propensity by Lead-Time Bucket

# Examine booking completion rates by lead time buckets.

data_numeric %>%
  mutate(lead_bucket = cut(
    purchase_lead,
    breaks = c(0, 7, 14, 30, 60, 90, 180, 365, Inf),
    labels = c("0–7", "8–14", "15–30", "31–60", "61–90", "91–180", "181–365", "365+")
  )) %>%
  group_by(lead_bucket) %>%
  summarise(conversion = mean(booking_complete, na.rm = TRUE)) %>%
  ggplot(aes(x = lead_bucket, y = conversion)) +
  geom_col(fill = "dodgerblue") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Booking Propensity by Lead-Time Bucket",
       x = "Purchase Lead Bucket", y = "Conversion Rate")

Conclusion

Key insights - Geographic Focus: Focus countries can be considered to solve region specific problems. - Booking Trends: Optimize for peak hours/days and enhance long-haul add-ons. - Service Offerings: Focus routes can be adjusted.

In a nutshell, it is the problem question that needs to be solved, which ensuring availability of the data. ```