load necessary R libraries
library(tidyverse)
library(here)
library(scales)
set default plot theme
# Make ggplot2's classic theme the default for every plot drawn below.
theme_set(theme_classic())
ingest airport data
# here() builds the path from the project root, so reading the file does not
# depend on the current working directory.
plot_dc_airports <- read_csv(here("data", "dc_airport_traffic.csv"))
reformat year column as a factor
# Store year as a factor so it is treated as a discrete variable.
plot_dc_airports <- plot_dc_airports %>%
  mutate(year = as.factor(year))
take a quick peek at the data format
# Print the first six rows to check the column names and types.
head(plot_dc_airports)
## # A tibble: 6 × 6
## year month domestic international total airport
## <fct> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 2003 1 441982 9103 451085 DCA
## 2 2003 1 451331 139746 591077 IAD
## 3 2003 1 634628 15912 650540 BWI
## 4 2003 2 441931 8006 449937 DCA
## 5 2003 2 390886 107879 498765 IAD
## 6 2003 2 557207 11668 568875 BWI
start off by seeing which airport has the highest total passenger numbers. this data is pretty coarse… we can do better
# Sum every monthly total per airport and compare the airports as columns.
plot_dc_airports %>%
  group_by(airport) %>%
  summarise(total_pass = sum(total)) %>%
  ggplot(aes(x = airport, y = total_pass)) +
  geom_col() +
  # comma() prints the large passenger counts with thousands separators
  scale_y_continuous(labels = comma) +
  labs(
    title = "Total Passengers at Major DC Area Airports",
    subtitle = "2003 to 2023",
    caption = "data from Bureau of Transportation Statistics",
    x = "Airport",
    y = "Total passengers"
  )
when we take a closer look at the boxplot showing the spread of the monthly data for each airport, we can clearly see several low outliers from 2020
# Distribution of the monthly totals per airport: a boxplot for the summary,
# with every month overlaid as a point coloured by year.
plot_dc_airports %>%
  ggplot(aes(x = airport, y = total)) +
  # the jitter layer already draws every observation, so hide the boxplot's
  # own outlier markers to avoid plotting those months twice
  geom_boxplot(outlier.shape = NA) +
  # jitter horizontally only, so the plotted passenger counts stay exact
  geom_jitter(aes(color = year), height = 0) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Distribution of monthly passenger totals at Major DC Area Airports",
    subtitle = "2003 to 2023",
    caption = "data from Bureau of Transportation Statistics",
    x = "Airport",
    y = "Total passengers",
    color = "Year"
  )
when we plot the data by year, we can see all three airports had large declines in passengers in 2020
# Yearly passenger total for each airport.
# `.groups = "drop"` returns an ungrouped tibble, so later steps do not
# inherit a leftover `airport` grouping; it also silences the regrouping
# message that summarise() prints otherwise.
by_year <- plot_dc_airports %>%
  group_by(airport, year) %>%
  summarise(total_pass = sum(total), .groups = "drop")
# One line per airport showing its yearly passenger total.
by_year %>%
  # year is a factor, so `group = airport` is needed for geom_line() to
  # connect the points of each airport across years
  ggplot(aes(x = year, y = total_pass, group = airport, color = airport)) +
  geom_line() +
  geom_point() +
  # start the y axis at zero; NA leaves the upper limit to the data
  scale_y_continuous(labels = comma, limits = c(0, NA)) +
  # angle the year labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Total Passengers at Major DC Area Airports by Year",
    subtitle = "2003 to 2023",
    caption = "data from Bureau of Transportation Statistics",
    x = "Year",
    y = "Total passengers",
    color = "Airport"
  )
when breaking out the data by month, we first have to filter out 2023 since not all of its months have data. we can see that passenger traffic typically peaks in the summer months, when international tourism is at its highest.
# Passenger total for each airport and calendar month, summed over the years
# that remain after the filter.
# `.groups = "drop"` returns an ungrouped tibble and silences the regrouping
# message that summarise() prints otherwise.
by_month <- plot_dc_airports %>%
  # there is not a full year of 2023 data; year is a factor, so compare it
  # against the level label rather than a number
  filter(year != "2023") %>%
  group_by(airport, month) %>%
  summarise(total_pass = sum(total), .groups = "drop")
# One line per airport showing its passenger total for each calendar month.
by_month %>%
  ggplot(aes(x = month, y = total_pass, group = airport, color = airport)) +
  geom_line() +
  geom_point() +
  # month is numeric (1-12); label each break with the full month name
  scale_x_continuous(breaks = 1:12, labels = month.name) +
  # angle the month labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = comma) +
  labs(
    title = "Total Passengers at Major DC Area Airports by Month",
    subtitle = "2003 to 2022",
    caption = "data from Bureau of Transportation Statistics",
    x = "Month",
    y = "Total passengers",
    color = "Airport"
  )
when we look at the ratio of domestic passengers to international passengers over time we see a big spike in 2020 at DCA. Additional analysis will be needed to explore this uptick although it likely has to do with travel restrictions from COVID-19. We can see both BWI and IAD live up to their names and serve more international passengers than DCA.
# Yearly domestic and international passenger totals for each airport.
# `.groups = "drop"` returns an ungrouped tibble and silences the regrouping
# message that summarise() prints otherwise.
by_origin <- plot_dc_airports %>%
  group_by(airport, year) %>%
  summarise(
    dom_pass = sum(domestic),
    intl_pass = sum(international),
    .groups = "drop"
  )
# Ratio of domestic to international passengers per year, one panel per
# airport. The plot is kept in a variable so the exact object that is shown
# is also the one written to disk.
ratio_plot <- by_origin %>%
  # year is a factor, so `group = airport` is needed for geom_line() to
  # connect the points across years
  ggplot(aes(x = year, y = dom_pass / intl_pass, group = airport)) +
  geom_line() +
  facet_wrap(~ airport) +
  # angle the year labels so they do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(labels = comma) +
  labs(
    # broken over two lines so the title is not clipped in the saved image
    title = paste0(
      "Ratio of Domestic Passengers to International Passengers\n",
      "at Major DC Area Airports by Year"
    ),
    subtitle = "2003 to 2023",
    caption = "data from Bureau of Transportation Statistics",
    x = "Year",
    y = "Domestic Passengers / International Passengers"
  )

# display the plot
ratio_plot

# make sure the output folder exists before writing into it
dir.create(here("output"), showWarnings = FALSE, recursive = TRUE)

# pass the plot explicitly instead of relying on whichever plot was drawn
# last; the size is left at ggsave()'s default
ggsave(here("output", "intl_pass_ratio_by_year.jpg"), plot = ratio_plot)
## Saving 7 x 5 in image
help provided from: