library(dplyr); library(tidyr); library(ggplot2)
library(ggthemes); library(scales); library(lubridate)

# Raw inputs, read from the working directory. check.names = FALSE keeps the
# CSV header text as the column names instead of passing it through make.names().
df_flights <- read.csv(file = "flight delays.csv", check.names = FALSE)
df_airlines <- read.csv(file = "airlines.csv", check.names = FALSE)

## Introduction

This project is part of my Data Visualization and Decision Making course at Loyola University Maryland.
Using RStudio and tidyverse tools, I transform U.S. flight data into visual insights that explain airline performance patterns.
Through multiple visualizations—including delay distributions, airline identity charts, weekday trends, monthly volumes, and market share analyses—this report demonstrates how data visualization supports clear, evidence-based decision making.

## 1) Departure Delay Distribution (Trimmed)

This chart shows the distribution of departure delays across airlines after trimming extreme values (–20 to 120 minutes).
Dashed vertical lines mark the mean delay and solid lines mark the median.
For United (UA), the labels are nudged slightly to avoid overlapping.

library(grid)   # for unit()

# 1) Trim and prep data: keep non-missing delays inside the plotted window
#    (-20 to 120 minutes) so extreme values do not flatten the densities
df_two_trim <- df_flights %>%
  dplyr::filter(!is.na(DEPARTURE_DELAY),
                DEPARTURE_DELAY >= -20,
                DEPARTURE_DELAY <= 120)

# 2) Mean/median per airline for reference lines (computed on the trimmed data,
#    so they describe the plotted window, not the full delay distribution)
stats_trim <- df_two_trim %>%
  dplyr::group_by(AIRLINE) %>%
  dplyr::summarise(mean_delay   = mean(DEPARTURE_DELAY),
                   median_delay = median(DEPARTURE_DELAY),
                   .groups = "drop")

# 3) Labels for mean/median (nudge UA a bit so tags don't overlap)
#    One row per airline x statistic. The label text is built from the true
#    value BEFORE xpos is nudged, so only the tag position moves.
label_df <- stats_trim %>%
  tidyr::pivot_longer(c(mean_delay, median_delay),
                      names_to = "stat", values_to = "xpos") %>%
  dplyr::mutate(
    stat_lab = ifelse(stat == "mean_delay", "Mean", "Median"),
    label    = paste0(AIRLINE, " ", stat_lab, ": ", round(xpos, 1)),
    # base y positions for density scale; tweak if your plot is taller/shorter
    ypos     = ifelse(stat_lab == "Mean", 0.22, 0.20),
    xpos     = dplyr::case_when(
      AIRLINE == "UA" & stat_lab == "Mean"   ~ xpos - 2,  # nudge left
      AIRLINE == "UA" & stat_lab == "Median" ~ xpos + 2,  # nudge right
      TRUE ~ xpos
    )
  )

# 4) Plot
#    geom_vline() does not inherit the plot-level aesthetics, so the airline
#    colour has to be mapped explicitly on each reference-line layer; otherwise
#    every mean/median line is drawn in the same default colour.
#    show.legend = FALSE on the line and label layers keeps the legend keys as
#    plain density swatches.
ggplot(df_two_trim, aes(DEPARTURE_DELAY, fill = AIRLINE, color = AIRLINE)) +
  geom_density(alpha = 0.35) +
  geom_vline(data = stats_trim,
             aes(xintercept = mean_delay, color = AIRLINE),
             linetype = "dashed", show.legend = FALSE) +
  geom_vline(data = stats_trim,
             aes(xintercept = median_delay, color = AIRLINE),
             linetype = "solid", show.legend = FALSE) +
  geom_label(data = label_df,
             aes(x = xpos, y = ypos, label = label, fill = AIRLINE),
             color = "white", size = 3.5, fontface = "bold",
             label.padding = unit(0.15, "lines"),
             show.legend = FALSE) +
  coord_cartesian(xlim = c(-20, 120)) +
  labs(title = "Departure Delay Distribution (Trimmed)",
       subtitle = "Dashed = Mean | Solid = Median",
       x = "Delay (minutes)", y = "Density",
       caption = "UA labels nudged left/right to avoid overlap") +
  theme_minimal()

## 2) Economist-styled list

This chart lists each airline’s IATA code with its full name.
We apply `coord_flip()` so the names are easy to read, and use `theme_economist()` for a clean editorial look.

library(ggthemes)

# One unit-length bar per IATA code; the AIRLINE value is printed just past the
# end of each bar and also drives the fill colour.
ggplot(data = df_airlines,
       mapping = aes(x = IATA_CODE, y = 1, fill = AIRLINE)) +
  geom_col(width = 0.6) +
  geom_text(mapping = aes(y = 1.03, label = AIRLINE), hjust = 0, size = 3.6) +
  # flip so the codes run down the side; clip = "off" lets labels leave the panel
  coord_flip(clip = "off") +
  scale_y_continuous(limits = c(0, 1.18), expand = expansion(mult = 0)) +
  scale_fill_economist() +
  labs(title = "Airline Codes and Names", x = "IATA Code", y = NULL) +
  theme_economist() +
  theme(
    legend.position = "none",
    axis.title.y = element_text(margin = margin(r = 6)),
    # wide right margin leaves room for long names
    plot.margin = margin(t = 10, r = 280, b = 10, l = 10)
  )

### 2.2 Custom palette (distinct colors)

# Manual fill palette; scale_fill_manual() assigns these in order to the
# AIRLINE values.
my_colors <- c(
  "#1f78b4", "#33a02c", "#e31a1c", "#ff7f00", "#6a3d9a", "#b15928", "#a6cee3",
  "#b2df8a", "#fb9a99", "#fdbf6f", "#cab2d6", "#ffff99", "#8dd3c7", "#ffffb3"
)

# Same code/name list as the Economist-styled chart, with outlined bars, the
# manual palette, and no value axis.
ggplot(data = df_airlines,
       mapping = aes(x = IATA_CODE, y = 1, fill = AIRLINE)) +
  geom_col(width = 0.55, color = "black", linewidth = 0.3) +
  geom_text(mapping = aes(y = 1.04, label = AIRLINE), hjust = 0, size = 3.8) +
  scale_fill_manual(values = my_colors) +
  # the bar length carries no information, so drop breaks and labels entirely
  scale_y_continuous(
    limits = c(0, 1.16),
    breaks = NULL,
    labels = NULL,
    expand = expansion(mult = 0, add = 0)
  ) +
  coord_flip(clip = "off") +
  labs(title = "Airline Codes and Names (Distinct Colors)", x = "IATA Code", y = NULL) +
  theme_economist(base_size = 12) +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank(),
    axis.title.y = element_text(margin = margin(r = 8)),
    plot.margin = margin(t = 10, r = 280, b = 10, l = 12)
  )

## 3) Flights by Day of Week — Total vs Top Origin Airport

This chart compares total flights (solid line) with flights departing from the busiest origin airport (dashed line) for each weekday and year in the data.

library(lubridate); library(scales)

# 1) Build weekday safely from whatever columns exist
#    day_num is 1..7 with Monday = 1, matching the order of dow_levels.
dow_levels <- c("Mon","Tue","Wed","Thu","Fri","Sat","Sun")

flights_day <- df_flights
if ("DAY_OF_WEEK" %in% names(flights_day)) {
  # NOTE(review): assumes DAY_OF_WEEK is coded 1 = Monday ... 7 = Sunday - confirm
  flights_day <- dplyr::mutate(flights_day, day_num = as.integer(DAY_OF_WEEK))
} else if ("FL_DATE" %in% names(flights_day)) {
  flights_day <- dplyr::mutate(flights_day, day_num = wday(as.Date(FL_DATE), week_start = 1))
} else if (all(c("YEAR","MONTH","DAY") %in% names(flights_day))) {
  flights_day <- dplyr::mutate(flights_day, day_num = wday(make_date(YEAR, MONTH, DAY), week_start = 1))
} else {
  stop("No DAY_OF_WEEK or FL_DATE or YEAR/MONTH/DAY columns found in df_flights.")
}

# Both summaries below group by YEAR. Data that only carries FL_DATE has no
# YEAR column, so derive it from the date instead of failing inside count().
if (!"YEAR" %in% names(flights_day)) {
  if (!"FL_DATE" %in% names(flights_day)) {
    stop("No YEAR or FL_DATE column found in df_flights; cannot summarise flights by year.")
  }
  flights_day <- dplyr::mutate(flights_day, YEAR = lubridate::year(as.Date(FL_DATE)))
}

# Map the day number to an ordered label; numbers outside 1..7 (or missing)
# index to NA and those rows are dropped.
flights_day <- flights_day %>%
  dplyr::mutate(day = factor(dow_levels[day_num], levels = dow_levels)) %>%
  dplyr::filter(!is.na(day))

# 2) Pick origin airport column name
origin_col <- if ("ORIGIN_AIRPORT" %in% names(flights_day)) {
  "ORIGIN_AIRPORT"
} else if ("ORIGIN" %in% names(flights_day)) {
  "ORIGIN"
} else {
  stop("No origin airport column found (expected ORIGIN_AIRPORT or ORIGIN).")
}

# 3) Series A: total flights per YEAR x weekday
totals_day <- flights_day %>%
  dplyr::count(YEAR, day, name = "value") %>%
  dplyr::mutate(series = "Total flights")

# 4) Series B: top origin airport per YEAR x weekday
#    The origin code is copied into a character column up front so that
#    (a) the counted column has a fixed, known name, and
#    (b) it can be row-bound with the NA_character_ top_code of the totals
#        series even when the origin column was read as numeric.
#    with_ties = FALSE keeps exactly one airport per YEAR x weekday.
top_origin_day <- flights_day %>%
  dplyr::mutate(top_code = as.character(.data[[origin_col]])) %>%
  dplyr::count(YEAR, day, top_code, name = "n_org") %>%
  dplyr::group_by(YEAR, day) %>%
  dplyr::slice_max(n_org, n = 1, with_ties = FALSE) %>%
  dplyr::ungroup() %>%
  dplyr::transmute(YEAR, day,
                   series   = "Top origin airport",
                   value    = n_org,
                   top_code)

# 5) Combine and plot
#    One line per series x YEAR; both series share the same y axis.
plot_df <- dplyr::bind_rows(
  totals_day %>% dplyr::mutate(top_code = NA_character_),
  top_origin_day
)

ggplot(plot_df,
       aes(x = day, y = value,
           group = interaction(series, YEAR),
           color = series, linetype = series)) +
  geom_line(linewidth = 1.2) +
  geom_point(shape = 21, fill = "white", color = "black", stroke = 0.7, size = 3) +
  # label the dashed line with the origin airport code
  geom_text(
    data = subset(plot_df, series == "Top origin airport"),
    aes(label = top_code),
    vjust = 1.6, size = 3.1, color = "black", show.legend = FALSE
  ) +
  scale_color_manual(values = c("Total flights" = "#FB8C00", "Top origin airport" = "black"), name = NULL) +
  scale_linetype_manual(values = c("Total flights" = "solid", "Top origin airport" = "dashed"), name = NULL) +
  scale_y_continuous(labels = comma, expand = expansion(mult = c(0.05, 0.12))) +
  labs(title = "Flights by Day of Week — Total vs Top Origin Airport",
       x = NULL, y = "Number of Flights") +
  theme_minimal(base_size = 12) +
  theme(panel.grid.minor = element_blank(),
        plot.title = element_text(hjust = 0.5, face = "bold", size = 16))

## 4) Flights per Month by Airline (Top 5)

This chart shows monthly flight volumes for the five busiest airlines.

library(scales)

# AIRLINE values of the five carriers with the most rows in df_flights
top_air <- df_flights %>%
  dplyr::count(AIRLINE, sort = TRUE) %>%
  head(5) %>%
  dplyr::pull(AIRLINE)

# Flight count per airline x month for those carriers. `months` is an ordered
# factor so the x axis runs Jan..Dec rather than alphabetically.
by_airline <- df_flights %>%
  dplyr::filter(AIRLINE %in% top_air) %>%
  dplyr::group_by(AIRLINE, MONTH) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::mutate(months = ordered(month.abb[MONTH], levels = month.abb))

# Dodged bars: one cluster per month, one bar per airline
ggplot(data = by_airline,
       mapping = aes(x = months, y = n, fill = AIRLINE)) +
  geom_col(width = 0.8, position = position_dodge(width = 0.85)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Flights per Month by Airline (Top 5, 2015)",
    x     = "Month",
    y     = "Number of Flights",
    fill  = "Airline"
  ) +
  theme_minimal(base_size = 12) +
  theme(panel.grid.minor = element_blank())

## 5) Donut — Share of Flights by Airline

This interactive donut groups smaller carriers into “Other” so the top airlines are easy to compare.

# Optional chunk: the donut is built only when both plotly and htmlwidgets can
# be loaded; otherwise a one-line install hint is printed instead.
if (requireNamespace("plotly", quietly = TRUE) &&
    requireNamespace("htmlwidgets", quietly = TRUE)) {

  library(plotly); library(htmlwidgets); library(dplyr); library(scales)

  # Flights per airline, largest first; everything after the 8th row is
  # relabelled "Other" and then re-totalled per label.
  air_share <- df_flights %>%
    count(AIRLINE, name = "flights", sort = TRUE) %>%
    mutate(group = if_else(row_number() <= 8, as.character(AIRLINE), "Other")) %>%
    group_by(group) %>%
    summarise(flights = sum(flights), .groups = "drop") %>%
    mutate(pct = flights / sum(flights))

  # Pie trace with a hole = donut. The hover text is built per slice from the
  # formula; <extra></extra> suppresses the secondary hover box.
  fig <- plot_ly(
    air_share,
    type          = "pie",
    hole          = 0.5,
    labels        = ~group,
    values        = ~flights,
    textposition  = "inside",
    textinfo      = "percent+label",
    hovertemplate = ~paste0(
      "<b>", group, "</b><br>",
      "Flights: ", comma(flights), "<br>",
      "Share: ", percent(pct, accuracy = 0.1),
      "<extra></extra>"
    )
  )

  # Title, margins, and a centred annotation inside the hole
  fig <- layout(
    fig,
    title       = "Share of Flights by Airline (2015)",
    showlegend  = FALSE,
    margin      = list(l = 40, r = 40, b = 40, t = 60),
    annotations = list(
      list(text = "2015", x = 0.5, y = 0.5, showarrow = FALSE, font = list(size = 16))
    )
  )

  fig
} else {
  cat("Note: plotly/htmlwidgets not installed. Install with install.packages(c('plotly','htmlwidgets')) to see the interactive donut.")
}

## Conclusion

Overall, this report demonstrates how effective visualization techniques in RStudio can turn large flight datasets into actionable business insights — a core learning goal of Loyola University Maryland’s MBA course in Data Visualization and Decision Making.