This dataset was sourced from a course offered by an airline for analytical practice.

This R Markdown document analyzes an airline dataset to uncover insights into route performance, booking behaviors, and predictive factors for booking completion. We explore booking trends, geographic patterns, and ancillary service preferences to identify key drivers of successful bookings. The analysis is structured to first highlight geographic and predictive insights, followed by detailed booking trends, with additional analyses on propensity scores, temporal decay, survival curves, and booking intensity.

The actual revenue data was difficult to source due to confidentiality and unavailability of data products like OAG, IATA DDS, etc.

Setup and Data Loading

pacman::p_load(
  pacman, rio, tidyverse, forcats, stringr, corrplot, magrittr, vcd, caret,
  parallel, randomForest, skimr, psych, doParallel, sf, rnaturalearth,
  rnaturalearthdata, tidyr, ggrepel, DescTools, car, broom, effects, ranger, scales, glue,
  here, ggthemes, ggeffects, survival, survminer, MatchIt, minpack.lm, scales, mgcv
)


data <- rio::import("C:/Users/Veerpal Kaur/OneDrive/Desktop/Airline Route Analysis/Airline Route Data.csv")

data_numeric <- data %>%
  mutate(
    booking_complete = as.numeric(booking_complete),
    wants_extra_baggage = as.numeric(wants_extra_baggage),
    wants_preferred_seat = as.numeric(wants_preferred_seat),
    wants_in_flight_meals = as.numeric(wants_in_flight_meals),
    sales_channel = factor(sales_channel, levels = c("Internet", "Mobile")),
    trip_type = factor(trip_type, levels = c("RoundTrip", "OneWay", "CircleTrip")),
    purchase_lead = pmin(purchase_lead, 365) # Cap lead time at 365 days
  )

# Overview of dataset structure
glimpse(data)

## Rows: 50,000
## Columns: 14
## $ num_passengers        <int> 2, 1, 2, 1, 2, 1, 3, 2, 1, 1, 2, 1, 4, 1, 1, 1, …
## $ sales_channel         <chr> "Internet", "Internet", "Internet", "Internet", …
## $ trip_type             <chr> "RoundTrip", "RoundTrip", "RoundTrip", "RoundTri…
## $ purchase_lead         <int> 262, 112, 243, 96, 68, 3, 201, 238, 80, 378, 185…
## $ length_of_stay        <int> 19, 20, 22, 31, 22, 48, 33, 19, 22, 30, 25, 43, …
## $ flight_hour           <int> 7, 3, 17, 4, 15, 20, 6, 14, 4, 12, 14, 2, 19, 14…
## $ flight_day            <chr> "Sat", "Sat", "Wed", "Sat", "Wed", "Thu", "Thu",…
## $ route                 <chr> "AKLDEL", "AKLDEL", "AKLDEL", "AKLDEL", "AKLDEL"…
## $ booking_origin        <chr> "New Zealand", "New Zealand", "India", "New Zeal…
## $ wants_extra_baggage   <int> 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, …
## $ wants_preferred_seat  <int> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, …
## $ wants_in_flight_meals <int> 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, …
## $ flight_duration       <dbl> 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, 5.52, …
## $ booking_complete      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …

Geographic Analysis

Booking Behavior by Top 10 Countries

Visualize booking volumes and completion rates for top 10 countries on a world map.

# Identify top 10 countries by booking volume
top10 <- data %>%
  count(booking_origin, sort = TRUE) %>%
  slice_max(order_by = n, n = 10) %>%
  pull(booking_origin)

# Summarize booking counts
df_top <- data %>% filter(booking_origin %in% top10)
country_counts <- df_top %>%
  group_by(booking_origin, booking_complete) %>%
  summarise(count = n(), .groups = "drop") %>%
  pivot_wider(names_from = booking_complete, values_from = count, values_fill = 0, names_prefix = "status_") %>%
  rename(Abandoned = status_0, Completed = status_1) %>%
  mutate(
    Abandoned = as.numeric(coalesce(Abandoned, 0)),
    Completed = as.numeric(coalesce(Completed, 0)),
    Total = Abandoned + Completed,
    ConversionRate = round(100 * Completed / Total, 1)
  )

# Prepare map data with centroids
world <- ne_countries(scale = "medium", returnclass = "sf")
world_top <- world %>% filter(name %in% top10)
world_centroids <- suppressWarnings(st_point_on_surface(world_top))
coords <- as.data.frame(st_coordinates(world_centroids))
world_coords <- bind_cols(world_top, coords)

# Merge booking and map data
map_data <- left_join(world_coords, country_counts, by = c("name" = "booking_origin")) %>%
  mutate(
    Completed = as.numeric(coalesce(Completed, 0)),
    Abandoned = as.numeric(coalesce(Abandoned, 0)),
    Total = as.numeric(coalesce(Total, 0))
  )

# Reshape for plotting
map_data_long <- map_data %>%
  select(name, X, Y, Completed, Abandoned) %>%
  pivot_longer(cols = c(Completed, Abandoned), names_to = "Status", values_to = "Count") %>%
  mutate(Count = as.numeric(coalesce(Count, 0)))

# Split for layering
abandoned_data <- map_data_long %>% filter(Status == "Abandoned")
completed_data <- map_data_long %>% filter(Status == "Completed")

# Plot proportional symbol map
status_colors <- c("Completed" = "#008080", "Abandoned" = "pink")
ggplot() +
  geom_sf(data = world, fill = "gray98", color = "gray80", lwd = 0.2) +
  geom_point(data = abandoned_data, aes(x = X, y = Y, size = Count, fill = Status),
             shape = 21, color = "black", alpha = 0.8) +
  geom_point(data = completed_data, aes(x = X, y = Y, size = Count, fill = Status),
             shape = 21, color = "black", alpha = 0.8) +
  scale_size_continuous(range = c(3, 12), name = "Booking Count") +
  scale_fill_manual(values = status_colors, name = "Booking Status") +
  coord_sf(xlim = c(-180, 180), ylim = c(-60, 90), expand = FALSE) +
  labs(
    title = "Booking Behavior – Top 10 Countries by Volume",
    subtitle = "Completed vs. Abandoned Bookings",
    caption = "Circles show booking volume; color indicates status; % shows completion rate",
    x = NULL, y = NULL
  ) +
  ggthemes::theme_solarized_2(base_size = 13) +
  theme(
    legend.position = "bottom", legend.box = "horizontal",
    panel.spacing = unit(10, "lines"), aspect.ratio = 0.35,
    legend.spacing.x = unit(0.5, 'cm'), legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 10), panel.grid = element_line(color = "gray90"),
    plot.caption = element_text(size = 10, face = "italic", hjust = 0)
  ) +
  guides(
    size = guide_legend(order = 1, direction = "horizontal", title.position = "top"),
    fill = guide_legend(order = 2, direction = "horizontal", title.position = "top")
  ) +
  geom_text_repel(
    data = map_data, aes(x = X, y = Y, label = paste0(name, " (", ConversionRate, "%)")),
    size = 3, max.overlaps = 10, box.padding = 0.3, point.padding = 0.2, segment.color = 'grey50'
  )

Compare each country’s booking share within its IATA subregion with its conversion rate.

# Summarise country-level stats
country_summary <- data %>%
  group_by(booking_origin) %>%
  summarise(
    total_bookings = n(),
    completed = sum(booking_complete, na.rm = TRUE),
    conversion_rate = mean(booking_complete, na.rm = TRUE),
    .groups = "drop"
  )

# Filter out countries with very low bookings
min_booking_threshold <- 100
filtered_data <- country_summary %>%
  filter(total_bookings >= min_booking_threshold)

# Create IATA region + subregion mapping
iata_region_map <- tribble(
  ~booking_origin,     ~iata_region,       ~subregion,
  "Malaysia",          "TC3",              "Southeast Asia",
  "Indonesia",         "TC3",              "Southeast Asia",
  "Singapore",         "TC3",              "Southeast Asia",
  "Thailand",          "TC3",              "Southeast Asia",
  "Vietnam",           "TC3",              "Southeast Asia",
  "Philippines",       "TC3",              "Southeast Asia",
  "Cambodia",          "TC3",              "Southeast Asia",
  "Brunei",            "TC3",              "Southeast Asia",
  "China",             "TC3",              "East Asia",
  "Hong Kong",         "TC3",              "East Asia",
  "Macau",             "TC3",              "East Asia",
  "Japan",             "TC3",              "East Asia",
  "South Korea",       "TC3",              "East Asia",
  "Taiwan",            "TC3",              "East Asia",
  "India",             "TC3",              "South Asia",
  "Australia",         "TC3",              "Oceania",
  "New Zealand",       "TC3",              "Oceania",
  "United States",     "TC1",              "North America",
  "United Kingdom",    "TC2",              "Europe"
)

# Join region info
filtered_data <- filtered_data %>%
  left_join(iata_region_map, by = "booking_origin") %>%
  mutate(
    subregion = fct_explicit_na(subregion, na_level = "Other"),
    label_text = booking_origin
  )

# Calculate intra-regional share within subregions
filtered_data <- filtered_data %>%
  group_by(subregion) %>%
  mutate(
    intra_regional_share = total_bookings / sum(total_bookings)
  ) %>%
  ungroup()

# Averages for dashed reference lines
avg_conv <- mean(filtered_data$conversion_rate, na.rm = TRUE)
avg_share <- mean(filtered_data$intra_regional_share, na.rm = TRUE)

# Plot
ggplot(filtered_data, aes(x = intra_regional_share, y = conversion_rate)) +
  geom_point(aes(size = total_bookings, fill = subregion), shape = 21, color = "black", alpha = 0.85) +
  geom_hline(yintercept = avg_conv, linetype = "dashed", color = "gray50") +
  geom_vline(xintercept = avg_share, linetype = "dashed", color = "gray50") +
  geom_text_repel(
    aes(label = label_text),
    size = 3.2, max.overlaps = Inf, box.padding = 0.25, point.padding = 0.2
  ) +
  scale_fill_brewer(palette = "Set2", name = "Subregion (IATA)") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_size_continuous(name = "Booking Volume") +
  labs(
    title = "Intra-Regional Booking Share vs. Conversion Rate by Country",
    subtitle = "Each country’s share of bookings within its IATA subregion",
    x = "Booking Share (Within IATA Subregion)",
    y = "Booking Conversion Rate",
    caption = paste(
      "Dashed lines = Global Averages | Each point labeled by country name",
      "\nThis matrix compares each country’s share of total bookings within its IATA-defined subregion,",
      "\nwith its booking completion rate (excludes <", min_booking_threshold, " bookings).",
      "\nIt identifies dominant, efficient, or underperforming players relative to their regional peers."
    )
  ) +
  ggthemes::theme_solarized_2(light = TRUE, base_size = 13)

Booking Activity Towards Departure

Logistic growth model (sigmoid curve) to reflect how bookings typically ramp up non-linearly near departure.

# ---- Prepare Time Data ----
lead_dist <- data_numeric %>%
  filter(booking_complete == 1, purchase_lead > 0 & purchase_lead <= 365) %>%
  count(purchase_lead) %>%
  mutate(
    lead_day = 365 - purchase_lead  # 0 = departure
  ) %>%
  arrange(lead_day) %>%
  mutate(
    cumulative_bookings = cumsum(n),
    cum_pct = cumulative_bookings / sum(n)
  )

# ---- Fit Logistic Model ----
logistic_fit <- nlsLM(
  cum_pct ~ 1 / (1 + exp(-(a + b * lead_day))),
  data = lead_dist,
  start = list(a = -5, b = 0.05)
)
log_params <- coef(logistic_fit)

# ---- Milestones (Extended) ----
milestones <- c(0.25, 0.33, 0.50, 0.66, 0.75, 0.90)
milestone_days <- sapply(milestones, function(p) {
  lead_dist %>%
    filter(cum_pct >= p) %>%
    slice(1) %>%
    pull(lead_day)
})
milestone_df <- tibble(
  `Cumulative Bookings` = percent(milestones),
  `Days Before Flight` = 365 - milestone_days
)

# ---- Annotation Table ----
table_text <- c(
  "Cumulative Bookings  |  Days Before Flight",
  "---------------------|--------------------",
  sprintf("%-18s | %1d", milestone_df$`Cumulative Bookings`, milestone_df$`Days Before Flight`)
)

annotation_label <- paste(table_text, collapse = "\n")


# ---- Final Plot ----
ggplot(lead_dist, aes(x = 365 - lead_day, y = cum_pct)) +
  # ---- Annotated Time Bands (with dummy fill key) ----
annotate("rect", xmin = 0, xmax = 7, ymin = 0, ymax = Inf, alpha = 0.1, fill = "red") +
  annotate("rect", xmin = 8, xmax = 30, ymin = 0, ymax = Inf, alpha = 0.08, fill = "purple") +
  annotate("rect", xmin = 31, xmax = 90, ymin = 0, ymax = Inf, alpha = 0.08, fill = "green") +
  
  # ---- Actual Data (Scatters) ----
geom_point(aes(color = "Actual Data"), alpha = 0.6, size = 2) +
  
  # ---- Logistic Fit (Line) ----
stat_function(
  aes(color = "Logistic Fit"),
  fun = function(x) {
    x2 <- 365 - x
    1 / (1 + exp(-(log_params["a"] + log_params["b"] * x2)))
  },
  linewidth = 1.2
) +
  annotate("text", 
           x = 100,    # Far from departure (left side of plot)
           y = 0.7,    # Mid-upper height
           label = annotation_label,
           hjust = 0, family = "mono", size = 3.5)+
  
  
  # ---- Axes and Labels ----
scale_y_continuous(labels = percent_format()) +
  scale_color_manual(
    name = "Lines",
    values = c("Actual Data" = "steelblue", "Logistic Fit" = "firebrick")
  ) +
  
  # ---- Manual Legend for Zones (via guide/patch) ----
guides(
  fill = guide_legend(
    override.aes = list(alpha = c(0.1, 0.08, 0.08)),
    title = "Booking Windows"
  ),
  color = guide_legend(order = 1)
) +
  # text for shaded areas
  annotate("text", x = 3, y = 0.3, label = "0–7 days\nLast-Week", color = "red", size = 3.5) +
  
  annotate("text", x = 18, y = 0.17, label = "8–30 days\n", color = "purple", size = 3.5) +
  
  annotate("text", x = 60, y = 0.1, label = "31–90 days\n", color = "darkgreen", size = 3.5)+

  labs(
    title = "Booking Activity Increases Toward Departure",
    subtitle = glue("Logistic Model: f(t) = 1 / (1 + exp(-(a + b * t)))\n  a = {round(log_params['a'], 3)}, b = {round(log_params['b'], 3)}"),
    x = "Days Before Flight (Purchase Lead)",
    y = "Cumulative Booking Share",
    caption = glue(
      "This chart models how cumulative booking activity accelerates as the departure date approaches.\n\n",
      "We use a logistic growth model (sigmoid curve) to reflect how bookings typically ramp up non-linearly near departure.\n",
      "Unlike an exponential decay model (last plot of page), which assumes early peak activity, logistic models capture   \n",
      "slow initial growth, followed by rapid acceleration near the event (flight), then tapering off as saturation is reached.\n\n",
      "Booking activity was aggregated by day, and normalized to produce a cumulative proportion of total bookings \n",
      "(cumulative booking share). It is for all routes but it can be focused on one specific country if asked.\n\n",
      "Example: 50% of bookings occur within {milestone_df$`Days Before Flight`[3]} days of flight; ",
      "90% within {milestone_df$`Days Before Flight`[6]}."
    )
  ) +
  ggthemes::theme_solarized_2(base_size = 13) +
  theme(
    legend.position = c(0.9, 0.2),  
    legend.background = element_rect(fill = "#B2DFDB", color = "pink"),
    legend.text = element_text(size = 9, color = "black")
  )

Predictive Modeling

Random Forest Model

Train a random forest model to predict booking completion.

# Convert variables to factors
data <- data %>%
  mutate(
    sales_channel = as.factor(sales_channel),
    trip_type = as.factor(trip_type),
    flight_day = as.factor(flight_day),
    route = as.factor(route),
    booking_origin = as.factor(booking_origin),
    wants_extra_baggage = as.factor(wants_extra_baggage),
    wants_preferred_seat = as.factor(wants_preferred_seat),
    wants_in_flight_meals = as.factor(wants_in_flight_meals),
    booking_complete = as.factor(booking_complete)
  )

# Select features and remove NAs
feature_data <- data %>%
  select(
    booking_complete, num_passengers, sales_channel, trip_type, purchase_lead,
    length_of_stay, flight_hour, flight_day, route, booking_origin,
    wants_extra_baggage, wants_preferred_seat, wants_in_flight_meals, flight_duration
  ) %>%
  na.omit()

# Split into training (70%) and testing (30%)
set.seed(123)
train_index <- createDataPartition(feature_data$booking_complete, p = 0.7, list = FALSE)
train_data <- feature_data[train_index, ]
test_data <- feature_data[-train_index, ]

# Train random forest
rf_model <- ranger(
  booking_complete ~ ., data = train_data, num.trees = 500,
  importance = "impurity", probability = TRUE, seed = 42
)

# Predict and evaluate
rf_probs <- predict(rf_model, data = test_data)$predictions[, 2]
rf_preds <- ifelse(rf_probs > 0.5, "1", "0") %>% factor(levels = c("0", "1"))
actuals <- test_data$booking_complete
conf_matrix <- confusionMatrix(rf_preds, actuals, positive = "1")
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 12656  2097
##          1   100   146
##                                           
##                Accuracy : 0.8535          
##                  95% CI : (0.8478, 0.8591)
##     No Information Rate : 0.8505          
##     P-Value [Acc > NIR] : 0.1487          
##                                           
##                   Kappa : 0.0904          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.065091        
##             Specificity : 0.992161        
##          Pos Pred Value : 0.593496        
##          Neg Pred Value : 0.857859        
##              Prevalence : 0.149543        
##          Detection Rate : 0.009734        
##    Detection Prevalence : 0.016401        
##       Balanced Accuracy : 0.528626        
##                                           
##        'Positive' Class : 1               
##

Feature Importance

Plot top 10 features influencing booking completion.

# Extract and plot feature importance
importance_df <- data.frame(
  Feature = names(rf_model$variable.importance),
  Importance = rf_model$variable.importance
) %>%
  arrange(desc(Importance)) %>%
  slice_head(n = 10)

ggplot(importance_df, aes(x = fct_reorder(Feature, Importance), y = Importance)) +
  geom_col(fill = "#008080") +
  coord_flip() +
  labs(
    title = "Top 10 Important Features for Booking Completion",
    x = "Feature", y = "Gini-Based Importance"
  ) +
  ggthemes::theme_solarized_2(base_size = 13)

Logistic Regression: Route Effects

Estimate booking completion odds for top routes.

# Prepare data with limited factor levels
logit_data <- data %>%
  mutate(
    route = fct_lump(route, n = 20),
    booking_origin = fct_lump(booking_origin, n = 20)
  ) %>%
  select(
    booking_complete, num_passengers, sales_channel, trip_type, purchase_lead,
    length_of_stay, flight_hour, flight_day, route, booking_origin,
    wants_extra_baggage, wants_preferred_seat, wants_in_flight_meals, flight_duration
  ) %>%
  na.omit()

# Fit logistic regression
logit_model <- glm(booking_complete ~ ., data = logit_data, family = binomial(link = "logit"))

# Compute and plot route effects
eff <- allEffects(logit_model)
route_effects <- eff[["route"]] %>%
  as.data.frame() %>%
  rename(route = route, estimate = fit, conf.low = lower, conf.high = upper) %>%
  arrange(desc(estimate)) %>%
  slice_head(n = 10)

ggplot(route_effects, aes(x = fct_reorder(route, estimate), y = estimate)) +
  geom_point(color = "#008080", size = 3) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  coord_flip() +
  labs(
    title = "Top 10 Routes by Booking Completion Odds",
    x = "Route", y = "Predicted Probability"
  ) +
  ggthemes::theme_solarized_2(base_size = 13)

Booking Trends

Data Preparation

# Convert categorical variables to factors for analysis
df_1 <- data %>%
  mutate(
    flight_day = as.factor(flight_day),
    sales_channel = as.factor(sales_channel),
    trip_type = as.factor(trip_type),
    booking_origin = as.factor(booking_origin)
  )

Booking Conversion Rate

Calculate overall percentage of completed bookings.

data_numeric %>%
  summarise(conversion_rate = mean(booking_complete, na.rm = TRUE)) %>%
  print()

##   conversion_rate
## 1         0.14956

Booking Hour Trends

Visualize booking volumes by hour, stacked by completion status.

df_1 %>%
  ggplot(aes(x = as.factor(flight_hour), fill = factor(booking_complete))) +
  geom_bar(position = "stack") +
  scale_fill_manual(
    values = c("0" = "orange", "1" = "forestgreen"),
    labels = c("Abandoned", "Completed"),
    name = "Booking Status"
  ) +
  labs(
    title = "Bookings by Flight Hour (Stacked by Completion)",
    x = "Hour of Day", y = "Number of Bookings"
  ) +
  ggthemes::theme_solarized_2(base_size = 13)

Conversion Rate by Flight Hour

Show percentage of bookings completed by hour.

data_numeric %>%
  group_by(flight_hour) %>%
  summarise(conversion = mean(booking_complete, na.rm = TRUE)) %>%
  ggplot(aes(x = as.factor(flight_hour), y = conversion)) +
  geom_col(fill = "darkcyan") +
  labs(title = "Booking Conversion Rate by Flight Hour", x = "Hour", y = "Conversion Rate") +
  scale_y_continuous(labels = scales::percent) +
  ggthemes::theme_solarized_2(base_size = 13)

Booking Day Trends

Visualize booking volumes by day of week, stacked by completion status.

df_1 %>%
  ggplot(aes(x = flight_day, fill = factor(booking_complete))) +
  geom_bar(position = "stack") +
  scale_fill_manual(
    values = c("0" = "grey50", "1" = "black"),
    labels = c("Abandoned", "Completed"),
    name = "Booking Status"
  ) +
  labs(
    title = "Bookings by Day of Week (Stacked by Completion)",
    x = "Day", y = "Number of Bookings"
  ) +
  ggthemes::theme_solarized_2(base_size = 13)

Booking Origin Breakdown

Identify top 10 countries by booking volume.

df_1 %>%
  count(booking_origin, sort = TRUE) %>%
  slice_head(n = 10) %>%
  ggplot(aes(x = reorder(booking_origin, n), y = n)) +
  geom_col(fill = "tomato") +
  coord_flip() +
  labs(title = "Top 10 Booking Origins", x = "Country", y = "Bookings") +
  ggthemes::theme_solarized_2(base_size = 13)

Purchase Lead Time

Analyze booking lead times, stacked by completion status.

df_1 %>%
  ggplot(aes(x = purchase_lead, fill = factor(booking_complete))) +
  geom_histogram(bins = 50, position = "stack") +
  scale_fill_manual(
    values = c("0" = "darkred", "1" = "darkgreen"),
    labels = c("Abandoned", "Completed"),
    name = "Booking Status"
  ) +
  scale_x_continuous(limits = c(0, 450), breaks = c(seq(0, 100, by = 10), seq(150, 450, by = 50))) +
  scale_y_continuous(breaks = seq(0, 9000, 250)) +
  labs(
    title = "Purchase Lead Time Distribution (Stacked by Completion)",
    x = "Days Before Departure", y = "Frequency",
    caption = "Purchase lead = Days booked in advance"
  ) +
  ggthemes::theme_solarized_2(base_size = 13) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Length of Stay by Route

Show total length of stay for top 10 routes.

data_numeric %>%
  group_by(route) %>%
  summarise(total_stay = sum(length_of_stay, na.rm = TRUE)) %>%
  slice_max(total_stay, n = 10) %>%
  ggplot(aes(x = reorder(route, total_stay), y = total_stay, fill = route)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = "Route", y = "Total Length of Stay", title = "Top 10 Routes by Length of Stay") +
  ggthemes::theme_solarized_2(base_size = 13)

Purchase Lead by Route

Highlight routes with longest average booking lead times.

data_numeric %>%
  group_by(route) %>%
  summarise(avg_purchase_lead = mean(purchase_lead, na.rm = TRUE)) %>%
  slice_max(avg_purchase_lead, n = 10) %>%
  ggplot(aes(x = fct_reorder(route, avg_purchase_lead), y = avg_purchase_lead, fill = route)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(
    x = "Route", y = "Avg Purchase Lead (Days)",
    title = "Top 10 Routes by Purchase Lead",
    caption = "Avg purchase lead = Days booked in advance"
  ) +
  ggthemes::theme_solarized_2(base_size = 13)

Ancillary Preferences by Route

Examine ancillary service preferences for top 6 routes.

top_routes_by_booking <- data_numeric %>%
  group_by(route) %>%
  summarise(completed_bookings = sum(booking_complete, na.rm = TRUE)) %>%
  slice_max(completed_bookings, n = 6) %>%
  pull(route)

ancillary_data <- data_numeric %>%
  filter(route %in% top_routes_by_booking) %>%
  select(route, wants_extra_baggage, wants_in_flight_meals, wants_preferred_seat) %>%
  pivot_longer(cols = -route, names_to = "service", values_to = "selected")

ggplot(ancillary_data, aes(x = route, fill = factor(selected))) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("0" = "gray80", "1" = "steelblue"), name = "Selected") +
  scale_y_continuous(labels = scales::percent) +
  facet_wrap(~service, ncol = 1) +
  labs(
    title = "Ancillary Preference Distribution by Top 6 Routes",
    x = "Route", y = "Percentage"
  ) +
  ggthemes::theme_solarized_2(base_size = 13) +
  theme(legend.position = "bottom")

Sales Channel vs. Trip Type

Explore sales channels vs. trip types with a mosaic plot.

df_1 %>%
  select(sales_channel, trip_type) %>%
  table() %>%
  mosaicplot(color = TRUE, main = "Sales Channel vs. Trip Type", las = 1)

###Booking Behavior Across Lead Time & Channel

#Visualize smoothed booking probability curves by trip type and sales channel.

# Prep data
df_god <- data_numeric %>%
  filter(purchase_lead > 0, purchase_lead <= 365) %>%
  mutate(
    log_lead = log(purchase_lead),
    trip_type = fct_lump(trip_type, n = 3),
    sales_channel = fct_lump(sales_channel, n = 3)
  )

# Fit model only if there's enough data per group
plot_data <- df_god %>%
  group_by(trip_type, sales_channel) %>%
  group_modify(~ {
    df <- .x
    if (n_distinct(df$log_lead) < 6) return(tibble())
    
    model <- gam(booking_complete ~ s(log_lead, k = 5), data = df, family = binomial)
    
    tibble(log_lead = seq(log(1), log(365), length.out = 100)) %>%
      mutate(
        booking_prob = predict(model, newdata = pick(everything()), type = "response")
      )
  }) %>%
  ungroup()

# Plot
ggplot(plot_data, aes(x = exp(log_lead), y = booking_prob, color = trip_type)) +
  geom_line(size = 1.2) +
  facet_wrap(~sales_channel) +
  scale_x_log10(breaks = c(1, 3, 7, 14, 30, 90, 180, 365)) +
  scale_y_continuous(labels = scales::percent_format()) +
  labs(
    title = "Multiverse of Booking Behavior Across Lead Time & Channel",
    subtitle = "Smoothed booking probability curves by trip type and sales channel (log-scaled lead time)",
    x = "Days Before Flight (log scale)",
    y = "Booking Completion Probability",
    color = "Trip Type"
  ) +
  ggthemes::theme_solarized(base_size = 13)

Add-on Demand by Flight Duration

Analyze ancillary service demand by flight duration.

df_3 <- data_numeric %>%
  mutate(
    flight_type = case_when(
      flight_duration < 3 ~ "Short-haul",
      flight_duration >= 3 & flight_duration <= 6 ~ "Medium-haul",
      flight_duration > 6 ~ "Long-haul"
    ),
    flight_type = factor(flight_type, levels = c("Short-haul", "Medium-haul", "Long-haul"))
  )

addons_by_duration <- df_3 %>%
  group_by(flight_type) %>%
  summarise(
    Extra_Baggage = mean(wants_extra_baggage, na.rm = TRUE),
    Preferred_Seat = mean(wants_preferred_seat, na.rm = TRUE),
    Inflight_Meals = mean(wants_in_flight_meals, na.rm = TRUE)
  ) %>%
  pivot_longer(cols = -flight_type, names_to = "Service", values_to = "Proportion")

ggplot(addons_by_duration, aes(x = flight_type, y = Proportion, fill = Service)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(title = "Add-on Service Demand by Flight Duration",
       y = "Proportion of Users", x = "Flight Type") +
  ggthemes::theme_solarized_2(base_size = 13)

Propensity Score Analysis for Booking Completion

Analyze propensity score distribution by sales channel and booking status.

# Fit logistic regression for propensity scores
ps_model <- glm(
  booking_complete ~ sales_channel + num_passengers + purchase_lead + length_of_stay +
    flight_hour + wants_extra_baggage + wants_preferred_seat + wants_in_flight_meals +
    flight_duration + trip_type,
  family = binomial(),
  data = data_numeric
)

# Add propensity scores to dataset
data_numeric$prop_score <- predict(ps_model, type = "response")

# Prepare data for density plot
ps_data <- data_numeric %>%
  select(sales_channel, booking_complete, prop_score) %>%
  mutate(
    booking_complete = factor(booking_complete, levels = c(0, 1), labels = c("Abandoned", "Completed"))
  )

# Plot density plot of propensity scores
ggplot(ps_data, aes(x = prop_score, fill = sales_channel, linetype = booking_complete)) +
  geom_density(alpha = 0.4) +
  scale_fill_manual(values = c("Internet" = "#1b9e77", "Mobile" = "#d95f02"), name = "Sales Channel") +
  scale_linetype_manual(values = c("Abandoned" = "dashed", "Completed" = "solid"), name = "Booking Status") +
  scale_x_continuous(labels = scales::percent) +
  labs(
    title = "Propensity Score Distribution for Booking Completion",
    subtitle = "By Sales Channel and Booking Status",
    x = "Propensity Score",
    y = "Density"
  ) +
  ggthemes::theme_solarized_2(base_size = 13) +
  theme(
    legend.position = "bottom",
    legend.box = "vertical",
    plot.subtitle = element_text(size = 10, face = "italic"),
    panel.grid.minor = element_blank()
  )

Elasticity of Add-on Demand by Flight Duration

Analyze predicted probability of add-on selection by flight duration using logistic models.

# Fit logistic models for each add-on
logit_baggage <- glm(
  wants_extra_baggage ~ flight_duration,
  data = data_numeric,
  family = binomial
)

logit_meals <- glm(
  wants_in_flight_meals ~ flight_duration,
  data = data_numeric,
  family = binomial
)

logit_seat <- glm(
  wants_preferred_seat ~ flight_duration,
  data = data_numeric,
  family = binomial
)

# Get actual flight duration range from data
duration_range <- range(data_numeric$flight_duration, na.rm = TRUE)
duration_seq <- seq(duration_range[1], duration_range[2], length.out = 300)

# Predict probabilities over the range
df_pred <- tibble(
  flight_duration = duration_seq,
  Extra_Baggage = predict(logit_baggage, newdata = tibble(flight_duration = duration_seq), type = "response"),
  Inflight_Meals = predict(logit_meals, newdata = tibble(flight_duration = duration_seq), type = "response"),
  Preferred_Seat = predict(logit_seat, newdata = tibble(flight_duration = duration_seq), type = "response")
)

# Reshape and plot
df_pred %>%
  pivot_longer(cols = -flight_duration, names_to = "Service", values_to = "Demand_Prob") %>%
  ggplot(aes(x = flight_duration, y = Demand_Prob, color = Service)) +
  geom_line(linewidth = 1.2) +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  scale_x_continuous(limits = duration_range) +
  labs(
    title = "Elasticity of Add-on Demand by Flight Duration",
    subtitle = "Predicted Add-on Selection Probability (Logistic Model)",
    x = "Flight Duration (Hours)", y = "Predicted Demand (%)",
    color = "Ancillary Service"
  ) +
  ggthemes::theme_solarized(base_size = 13)

Booking Propensity by Lead-Time Bucket

Examine booking completion rates by lead time buckets.

data_numeric %>%
  mutate(lead_bucket = cut(
    purchase_lead,
    breaks = c(0, 7, 14, 30, 60, 90, 180, 365, Inf),
    labels = c("0–7", "8–14", "15–30", "31–60", "61–90", "91–180", "181–365", "365+")
  )) %>%
  group_by(lead_bucket) %>%
  summarise(conversion = mean(booking_complete, na.rm = TRUE)) %>%
  ggplot(aes(x = lead_bucket, y = conversion)) +
  geom_col(fill = "dodgerblue") +
  scale_y_continuous(labels = scales::percent) +
  labs(title = "Booking Propensity by Lead-Time Bucket",
       x = "Purchase Lead Bucket", y = "Conversion Rate") +
  ggthemes::theme_solarized(base_size = 13)

Temporal Booking Decay by Trip Type

Analyze cumulative booking completion rates by trip type for the top booking origin.

# Select top booking origin by volume
top_origins <- data_numeric %>%
  count(booking_origin) %>%
  arrange(desc(n)) %>%
  slice_head(n = 1) %>%
  pull(booking_origin)

# Prepare data for temporal decay analysis
decay_data <- data_numeric %>%
  filter(booking_origin %in% top_origins) %>%
  group_by(booking_origin, trip_type, purchase_lead) %>%
  summarise(
    total_bookings = n(),
    completed_bookings = sum(booking_complete, na.rm = TRUE),
    completion_rate = completed_bookings / total_bookings,
    .groups = "drop"
  ) %>%
  arrange(booking_origin, trip_type, purchase_lead) %>%
  group_by(booking_origin, trip_type) %>%
  mutate(
    cumulative_completion = cumsum(completed_bookings) / sum(total_bookings)
  )

# Calculate median lead time for completion
median_lead_times <- decay_data %>%
  group_by(booking_origin, trip_type) %>%
  summarise(
    median_lead = purchase_lead[which.min(abs(cumulative_completion - 0.5))],
    .groups = "drop"
  )

# Perform log-rank test for differences between trip types
log_rank_p <- decay_data %>%
  group_by(booking_origin) %>%
  summarise(
    p_value = tryCatch(
      {
        surv_diff <- survdiff(
          Surv(purchase_lead, booking_complete) ~ trip_type,
          data = data_numeric[data_numeric$booking_origin == booking_origin, ]
        )
        pchisq(surv_diff$chisq, df = length(unique(trip_type)) - 1, lower.tail = FALSE)
      },
      error = function(e) NA_real_
    ),
    .groups = "drop"
  ) %>%
  mutate(p_label = sprintf("Log-rank p = %.3f", p_value))

# Plot step-function decay curves
ggplot(decay_data, aes(x = purchase_lead, y = cumulative_completion, color = trip_type)) +
  geom_step(linewidth = 1) +
  geom_vline(
    data = median_lead_times,
    aes(xintercept = median_lead, color = trip_type),
    linetype = "dashed",
    alpha = 0.5
  ) +
  geom_text(
    data = median_lead_times,
    aes(x = median_lead, y = 0.9, label = sprintf("Median: %d days", median_lead), color = trip_type),
    size = 3.5, angle = 90, hjust = 1, vjust = -0.5
  ) +
  geom_text(
    data = log_rank_p,
    aes(x = 300, y = 0.1, label = p_label),
    color = "black", size = 3.5, inherit.aes = FALSE
  ) +
  facet_wrap(~booking_origin, nrow = 2, scales = "free_y") +
  scale_y_continuous(labels = scales::percent, limits = c(0, 0.06)) +
  scale_color_manual(
    values = c("RoundTrip" = "#1b9e77", "OneWay" = "#d95f02", "CircleTrip" = "#7570b3"),
    name = "Trip Type"
  ) +
  labs(
    title = "Country Case: Temporal Booking Decay by Trip Type",
    subtitle = "Cumulative Completion Rate Over Purchase Lead Time (Booking Origin: Australia)",
    x = "Purchase Lead Time (Days)",
    y = "Cumulative Completion Rate",
    caption = "Dashed lines indicate median lead time; Log-rank p-values test trip type differences"
  ) +
  ggthemes::theme_solarized(base_size = 12) +
  theme(
    legend.position = "bottom",
    plot.subtitle = element_text(size = 10, face = "italic"),
    plot.caption = element_text(size = 8),
    strip.text = element_text(face = "bold"),
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Survival Curve of Booking Completion

Visualize the probability of not completing a booking over purchase lead time.

# Build survival frame from data_numeric to ensure consistency
surv_df <- data_numeric %>%
  mutate(
    status = as.numeric(booking_complete),  # Ensure status is 0 or 1
    time = pmin(purchase_lead, 365)        # Cap purchase_lead at 365 days
  ) %>%
  filter(
    !is.na(time),      # Remove NA times
    !is.na(status),    # Remove NA statuses
    time > 0           # Remove non-positive times
  )

# Fit survival model
surv_fit <- survfit(Surv(time, status) ~ 1, data = surv_df)

# Plot survival curve with risk table
plot_obj <- ggsurvplot(
  surv_fit,
  conf.int = TRUE,
  risk.table = TRUE,
  title = "Survival Curve of Booking Completion Over Lead Time",
  subtitle = "Model: Kaplan–Meier Estimator",
  caption = "Probability of not completing a booking over time. Based on purchase lead in days.",
  xlab = "Purchase Lead Time (Days)",
  ylab = "Survival Probability",
  risk.table.height = 0.25,
  ggtheme = ggthemes::theme_solarized(base_size = 13)
)

# Ensure theme_solarized is applied to all plot components
plot_obj$plot <- plot_obj$plot + ggthemes::theme_solarized(base_size = 13)
plot_obj$table <- plot_obj$table + ggthemes::theme_solarized(base_size = 13)
if (!is.null(plot_obj$cumcensor)) {
  plot_obj$cumcensor <- plot_obj$cumcensor + ggthemes::theme_solarized(base_size = 13)
}

# Print the final themed survival plot
print(plot_obj)

Exponential Decay of Booking Activity Over Time

Model booking activity decline using an exponential decay model.

# Prepare and process
lead_dist <- data_numeric %>%
  filter(booking_complete == 1, purchase_lead > 0 & purchase_lead <= 365) %>%
  count(purchase_lead) %>%
  rename(day = purchase_lead, bookings = n) %>%
  mutate(booking_density = bookings / sum(bookings)) %>%
  arrange(day) %>%
  mutate(cum_pct = cumsum(bookings) / sum(bookings))

# Fit exponential model
fit <- nlsLM(
  booking_density ~ a * exp(-b * day),
  data = lead_dist,
  start = list(a = 0.1, b = 0.01)
)
params <- coef(fit)

# Milestones
milestones <- c(0.25, 0.5, 0.75, 0.9)
milestone_days <- sapply(milestones, function(p) {
  lead_dist %>% filter(cum_pct >= p) %>% slice(1) %>% pull(day)
})
milestone_df <- tibble(
  `Bookings Completed` = scales::percent(milestones),
  `Days Before Flight` = milestone_days
)

# Format table for chart annotation
table_text <- c(
  "Bookings Completed | Days Before Flight",
  "-------------------|-------------------",
  sprintf("%-18s | %3d", milestone_df$`Bookings Completed`, milestone_df$`Days Before Flight`)
)
annotation_label <- paste(table_text, collapse = "\n")

# Plot with table and explanatory caption
ggplot(lead_dist, aes(x = day, y = booking_density)) +
  geom_point(color = "steelblue", alpha = 0.6) +
  stat_function(
    fun = function(x) params["a"] * exp(-params["b"] * x),
    color = "firebrick", linewidth = 1.2
  ) +
  annotate("rect", xmin = 0, xmax = 7, ymin = 0, ymax = Inf, alpha = 0.1, fill = "red") +
  annotate("rect", xmin = 8, xmax = 30, ymin = 0, ymax = Inf, alpha = 0.08, fill = "yellow") +
  annotate("rect", xmin = 31, xmax = 90, ymin = 0, ymax = Inf, alpha = 0.08, fill = "green") +
  annotate("text", x = 180, y = 0.012, label = annotation_label, hjust = 0, family = "mono", size = 3.5) +
  labs(
    title = "Exponential Decay of Booking Activity Over Time",
    subtitle = glue::glue("Fitted Model: f(t) = {round(params['a'], 4)} * exp(-{round(params['b'], 4)} * t)"),
    caption = glue::glue(
      "This chart shows how booking activity declines as departure gets further away.\n",
      "\n",
      "Exponential model: f(t) = a × exp(–b × t), where a is initial booking intensity and b is the decay rate.\n",
      "• a = {round(params['a'], 4)}: booking intensity at day 0 (max activity), • b = {round(params['b'], 4)}: decay rate (how fast demand falls over time).\n",
      "For example, 50% of all bookings are made by day {milestone_days[2]},while 90% are completed by day {milestone_days[4]} before the flight."
    ),
    x = "Days Before Flight (Purchase Lead)",
    y = "Normalized Booking Intensity"
  ) +
  ggthemes::theme_solarized_2(base_size = 13)

Conclusion

Key insights - Geographic Focus: Focus countries can be considered to solve region specific problems. - Booking Trends: Optimize for peak hours/days and enhance long-haul add-ons. - Service Offerings: Focus routes can be adjusted.

In a nutshell, it is the problem question that needs to be solved. ```

Airline Route Performance Analysis

Arshdeep (Arsène) Kainth