library(dplyr); library(tidyr); library(ggplot2)
library(ggthemes); library(scales); library(lubridate)
# Load the two raw inputs. check.names = FALSE keeps the CSV header names
# exactly as written instead of rewriting them into syntactic R names.
df_flights <- read.csv(file = "flight delays.csv", check.names = FALSE)
df_airlines <- read.csv(file = "airlines.csv", check.names = FALSE)
This project is part of my Data Visualization and Decision
Making course at Loyola University
Maryland.
Using RStudio and tidyverse tools, I
transform U.S. flight data into visual insights that explain airline
performance patterns.
Through multiple visualizations—including delay distributions, airline
identity charts, weekday trends, monthly volumes, and market share
analyses—this report demonstrates how data visualization supports clear,
evidence-based decision making.
This chart shows the distribution of departure
delays across airlines after trimming extreme values (–20 to
120 minutes).
Dashed vertical lines mark the mean delay and solid
lines mark the median.
For United (UA), the labels are nudged slightly to avoid
overlapping.
library(grid) # for unit()

# Density of departure delays per airline, annotated with each airline's
# mean (dashed line) and median (solid line) delay.

# 1) Trim and prep data: drop missing delays and keep -20..120 minutes
df_two_trim <- df_flights %>%
  dplyr::filter(
    !is.na(DEPARTURE_DELAY),
    DEPARTURE_DELAY >= -20,
    DEPARTURE_DELAY <= 120
  )

# 2) Mean/median per airline for the reference lines
stats_trim <- df_two_trim %>%
  dplyr::group_by(AIRLINE) %>%
  dplyr::summarise(
    mean_delay = mean(DEPARTURE_DELAY),
    median_delay = median(DEPARTURE_DELAY),
    .groups = "drop"
  )

# 3) One label row per airline x statistic (UA is nudged so tags don't overlap)
label_df <- stats_trim %>%
  tidyr::pivot_longer(
    c(mean_delay, median_delay),
    names_to = "stat", values_to = "xpos"
  ) %>%
  dplyr::mutate(
    stat_lab = ifelse(stat == "mean_delay", "Mean", "Median"),
    # Built before xpos is nudged, so the text shows the true statistic.
    label = paste0(AIRLINE, " ", stat_lab, ": ", round(xpos, 1)),
    # Base y positions on the density scale; tweak if the plot is taller/shorter.
    ypos = ifelse(stat_lab == "Mean", 0.22, 0.20),
    xpos = dplyr::case_when(
      AIRLINE == "UA" & stat_lab == "Mean" ~ xpos - 2, # nudge left
      AIRLINE == "UA" & stat_lab == "Median" ~ xpos + 2, # nudge right
      TRUE ~ xpos
    )
  )

# 4) Plot
ggplot(df_two_trim, aes(DEPARTURE_DELAY, fill = AIRLINE, color = AIRLINE)) +
  geom_density(alpha = 0.35) +
  # geom_vline() does not inherit the plot-level aes(), so the airline colour
  # has to be mapped on the layer itself; otherwise every reference line is
  # drawn in the default colour and cannot be matched to its airline.
  # show.legend = FALSE keeps the legend keys as plain density swatches.
  geom_vline(
    data = stats_trim,
    aes(xintercept = mean_delay, color = AIRLINE),
    linetype = "dashed", show.legend = FALSE
  ) +
  geom_vline(
    data = stats_trim,
    aes(xintercept = median_delay, color = AIRLINE),
    linetype = "solid", show.legend = FALSE
  ) +
  geom_label(
    data = label_df,
    aes(x = xpos, y = ypos, label = label, fill = AIRLINE),
    color = "white", size = 3.5, fontface = "bold",
    label.padding = unit(0.15, "lines")
  ) +
  coord_cartesian(xlim = c(-20, 120)) +
  labs(
    title = "Departure Delay Distribution (Trimmed)",
    subtitle = "Dashed = Mean | Solid = Median",
    x = "Delay (minutes)", y = "Density",
    caption = "UA labels nudged left/right to avoid overlap"
  ) +
  theme_minimal()
## 2) Economist-styled list
This chart lists each airline’s IATA code with its full name.
We use `coord_flip()` so the names are easy to read, and
`theme_economist()` for a clean editorial look.
library(ggthemes)

# One unit-length bar per IATA code, with the airline's full name written
# just past the end of the bar (Economist theme and fill palette).
ggplot(data = df_airlines, mapping = aes(x = IATA_CODE, y = 1, fill = AIRLINE)) +
  geom_col(width = 0.6) +
  geom_text(mapping = aes(y = 1.03, label = AIRLINE), hjust = 0, size = 3.6) +
  # Horizontal bars; clipping is off so names may run outside the panel.
  coord_flip(clip = "off") +
  scale_fill_economist() +
  scale_y_continuous(limits = c(0, 1.18), expand = expansion(mult = 0)) +
  labs(title = "Airline Codes and Names", x = "IATA Code", y = NULL) +
  theme_economist() +
  theme(
    axis.title.y = element_text(margin = margin(r = 6)),
    # Wide right margin leaves extra room for long names.
    plot.margin = margin(t = 10, r = 280, b = 10, l = 10),
    legend.position = "none"
  )
# Fourteen hand-picked fill colours for the airline bars below.
my_colors <- c(
  "#1f78b4", "#33a02c", "#e31a1c", "#ff7f00",
  "#6a3d9a", "#b15928", "#a6cee3", "#b2df8a",
  "#fb9a99", "#fdbf6f", "#cab2d6", "#ffff99",
  "#8dd3c7", "#ffffb3"
)

# Same code/name list as a bar chart, but with outlined bars and the manual
# palette above instead of the Economist fill scale.
ggplot(data = df_airlines, mapping = aes(x = IATA_CODE, y = 1, fill = AIRLINE)) +
  geom_col(width = 0.55, color = "black", linewidth = 0.3) +
  geom_text(mapping = aes(y = 1.04, label = AIRLINE), hjust = 0, size = 3.8) +
  coord_flip(clip = "off") +
  scale_fill_manual(values = my_colors) +
  # The bar length carries no information, so the value axis shows no
  # breaks or labels and gets no padding.
  scale_y_continuous(
    limits = c(0, 1.16),
    breaks = NULL,
    labels = NULL,
    expand = expansion(mult = 0, add = 0)
  ) +
  labs(
    title = "Airline Codes and Names (Distinct Colors)",
    x = "IATA Code",
    y = NULL
  ) +
  theme_economist(base_size = 12) +
  theme(
    axis.title.y = element_text(margin = margin(r = 8)),
    panel.grid.major.y = element_blank(),
    plot.margin = margin(t = 10, r = 280, b = 10, l = 12),
    legend.position = "none"
  )
## 3) Flights by Day of Week — Total vs Top Origin Airport
This chart compares total flights (solid) to the busiest origin airport (dashed) for each weekday and year in the data.
library(lubridate); library(scales)

# Builds plot_df: for every YEAR x weekday, the total number of flights and
# the flight count of the single busiest origin airport.

# 1) Build weekday number (1 = Mon ... 7 = Sun) from whatever columns exist
dow_levels <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
flights_day <- df_flights
if ("DAY_OF_WEEK" %in% names(flights_day)) {
  # NOTE(review): assumes DAY_OF_WEEK is coded 1 = Monday ... 7 = Sunday,
  # matching week_start = 1 in the fallbacks below; confirm against the data.
  flights_day <- dplyr::mutate(flights_day, day_num = as.integer(DAY_OF_WEEK))
} else if ("FL_DATE" %in% names(flights_day)) {
  flights_day <- dplyr::mutate(
    flights_day,
    day_num = wday(as.Date(FL_DATE), week_start = 1)
  )
} else if (all(c("YEAR", "MONTH", "DAY") %in% names(flights_day))) {
  flights_day <- dplyr::mutate(
    flights_day,
    day_num = wday(make_date(YEAR, MONTH, DAY), week_start = 1)
  )
} else {
  stop("No DAY_OF_WEEK or FL_DATE or YEAR/MONTH/DAY columns found in df_flights.")
}

# The counts below group by YEAR. A file that only carries FL_DATE passes the
# weekday step above but has no YEAR column, so derive it here instead of
# failing later inside count().
if (!"YEAR" %in% names(flights_day)) {
  if ("FL_DATE" %in% names(flights_day)) {
    flights_day <- dplyr::mutate(flights_day, YEAR = year(as.Date(FL_DATE)))
  } else {
    stop("No YEAR or FL_DATE column found in df_flights; cannot count flights per year.")
  }
}

# factor(levels = 1:7, labels = ...) maps any code outside 1..7 to NA.
# Indexing dow_levels[day_num] would silently drop elements for a code of 0
# and break the column length.
flights_day <- flights_day %>%
  dplyr::mutate(day = factor(day_num, levels = 1:7, labels = dow_levels)) %>%
  dplyr::filter(!is.na(day))

# 2) Pick origin airport column name
origin_col <- if ("ORIGIN_AIRPORT" %in% names(flights_day)) {
  "ORIGIN_AIRPORT"
} else if ("ORIGIN" %in% names(flights_day)) {
  "ORIGIN"
} else {
  stop("No origin airport column found (expected ORIGIN_AIRPORT or ORIGIN).")
}

# 3) Series A: total flights per YEAR x weekday
totals_day <- flights_day %>%
  dplyr::count(YEAR, day, name = "value") %>%
  dplyr::mutate(series = "Total flights")

# 4) Series B: top origin airport per YEAR x weekday
top_origin_day <- flights_day %>%
  dplyr::count(YEAR, day, .data[[origin_col]], name = "n_org") %>%
  dplyr::group_by(YEAR, day) %>%
  # with_ties = FALSE guarantees exactly one airport per YEAR x weekday.
  dplyr::slice_max(n_org, n = 1, with_ties = FALSE) %>%
  dplyr::ungroup() %>%
  dplyr::transmute(
    YEAR, day,
    series = "Top origin airport",
    value = n_org,
    # Airport codes may be read as integers; force character so the column
    # combines with the NA_character_ placeholder in bind_rows() below.
    top_code = as.character(.data[[origin_col]])
  )

# 5) Combine both series into one long table for plotting
plot_df <- dplyr::bind_rows(
  totals_day %>% dplyr::mutate(top_code = NA_character_),
  top_origin_day
)
# Line chart of flights per weekday: one line per series x YEAR, with the
# busiest origin airport's code printed under its points.
ggplot(
  data = plot_df,
  mapping = aes(
    x = day,
    y = value,
    color = series,
    linetype = series,
    group = interaction(series, YEAR)
  )
) +
  geom_line(linewidth = 1.2) +
  # Fixed black outline / white fill so the markers look the same on both lines.
  geom_point(shape = 21, size = 3, stroke = 0.7, color = "black", fill = "white") +
  # Label only the top-origin series with its airport code.
  geom_text(
    data = dplyr::filter(plot_df, series == "Top origin airport"),
    mapping = aes(label = top_code),
    color = "black", size = 3.1, vjust = 1.6, show.legend = FALSE
  ) +
  scale_linetype_manual(
    name = NULL,
    values = c("Total flights" = "solid", "Top origin airport" = "dashed")
  ) +
  scale_color_manual(
    name = NULL,
    values = c("Total flights" = "#FB8C00", "Top origin airport" = "black")
  ) +
  scale_y_continuous(
    labels = scales::comma,
    expand = expansion(mult = c(0.05, 0.12))
  ) +
  labs(
    title = "Flights by Day of Week — Total vs Top Origin Airport",
    x = NULL,
    y = "Number of Flights"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
    panel.grid.minor = element_blank()
  )
This chart shows monthly flight volumes for the five busiest airlines.
library(scales)

# Codes of the five airlines with the most rows in df_flights
# (count() sorts in descending order, so the first five are the busiest).
top_air <- head(dplyr::count(df_flights, AIRLINE, sort = TRUE)$AIRLINE, 5)

# Flights per airline and month for those five airlines, with the month
# number turned into an ordered Jan..Dec factor for the x axis.
by_airline <- df_flights %>%
  dplyr::filter(AIRLINE %in% top_air) %>%
  dplyr::group_by(AIRLINE, MONTH) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::mutate(months = ordered(month.abb[MONTH], levels = month.abb))

# Dodged bars: one bar per airline within each month.
ggplot(data = by_airline, mapping = aes(x = months, y = n, fill = AIRLINE)) +
  geom_col(width = 0.8, position = position_dodge(width = 0.85)) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Flights per Month by Airline (Top 5, 2015)",
    x = "Month",
    y = "Number of Flights",
    fill = "Airline"
  ) +
  theme_minimal(base_size = 12) +
  theme(panel.grid.minor = element_blank())
## 5) Donut — Share of Flights by Airline
This interactive donut groups smaller carriers into “Other” so the top airlines are easy to compare.
# The donut needs plotly + htmlwidgets. When either is missing, print a note
# instead of failing so the document still knits.
if (!requireNamespace("plotly", quietly = TRUE) ||
    !requireNamespace("htmlwidgets", quietly = TRUE)) {
  cat("Note: plotly/htmlwidgets not installed. Install with install.packages(c('plotly','htmlwidgets')) to see the interactive donut.")
} else {
  library(plotly); library(htmlwidgets); library(dplyr); library(scales)

  # Flights per airline, largest first; everything after the eighth row is
  # pooled into a single "Other" slice, then shares are computed.
  air_share <- df_flights %>%
    count(AIRLINE, name = "flights", sort = TRUE) %>%
    mutate(
      group = if_else(row_number() <= 8, as.character(AIRLINE), "Other")
    ) %>%
    group_by(group) %>%
    summarise(flights = sum(flights), .groups = "drop") %>%
    mutate(pct = flights / sum(flights))

  # Hover text: airline, formatted flight count and share.
  # <extra></extra> suppresses plotly's secondary hover box.
  fig <- plot_ly(
    data = air_share,
    type = "pie",
    hole = 0.5,
    labels = ~group,
    values = ~flights,
    textinfo = "percent+label",
    textposition = "inside",
    hovertemplate = ~paste0(
      "<b>", group, "</b><br>",
      "Flights: ", comma(flights), "<br>",
      "Share: ", percent(pct, accuracy = 0.1),
      "<extra></extra>"
    )
  ) %>%
    layout(
      title = "Share of Flights by Airline (2015)",
      showlegend = FALSE,
      margin = list(l = 40, r = 40, b = 40, t = 60),
      # Year caption centred in the donut hole.
      annotations = list(
        list(
          text = "2015", x = 0.5, y = 0.5,
          showarrow = FALSE, font = list(size = 16)
        )
      )
    )

  fig
}
Delays cluster around small positive values;
medians are near zero for most airlines.
High-volume carriers such as Southwest
(WN) and Delta (DL) dominate total flights,
while others serve regional routes.
Atlanta (ATL) consistently appears as the
busiest origin airport across weekdays.
Monthly flight trends show only modest seasonal
variations.
Market share visualizations reveal that a few large airlines control the majority of U.S. air traffic.
ATL (Atlanta) remains the busiest origin airport across weekdays.
Data visualization enables evidence-based insights into airline performance, supporting informed decision-making in aviation analysis.
Overall, this report demonstrates how effective visualization techniques in RStudio can turn large flight datasets into actionable business insights — a core learning goal of Loyola University Maryland’s MBA course in Data Visualization and Decision Making.