library(rvest)
library(stringi)
library(pdftools)
library(hrbrthemes)
library(tidyverse)
some URLs generate infinite redirection loops so be safe out there
safe_read_html <- safely(read_html)
grab the individual page URLs for each month available in each year
c("https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2017",
"https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2016",
"https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2015") %>%
map(function(x) {
read_html(x) %>%
html_nodes("a[href*='air-travel-consumer-report']") %>%
html_attr('href')
}) %>%
flatten_chr() %>%
discard(stri_detect_regex, "feedback|/air-travel-consumer-reports") %>% # filter out URLs we don't need
sprintf("https://www.transportation.gov%s", .) -> main_urls # make them useful
now, read in all the individual pages. do this separate from URL grabbing above and the PDF URL extraction below just to be even safer.
map(main_urls, safe_read_html) -> pages
URLs that generate said redirection loops will not have a valid result so ignor ethem and find the URLs for the monthly reports
discard(pages, ~is.null(.$result)) %>%
map("result") %>%
map(~html_nodes(., "a[href*='pdf']") %>%
html_attr('href') %>%
keep(stri_detect_fixed, "ATCR")) %>%
flatten_chr() -> pdf_urls
download them, being kind to the DoT server and not re-downloading anything we’ve successfully downloaded already. I really wish this was built-in functionality to download.file()
dir.create("atcr_pdfs")
walk(pdf_urls, ~if (!file.exists(file.path("atcr_pdfs", basename(.))))
download.file(., file.path("atcr_pdfs", basename(.))))
read in each PDF; find the pages with the tables we need to scrape; enable the text table to be read with read.table() and save the results
c("2017MarchATCR.pdf", "2016MarchATCR_2.pdf", "2015MarchATCR_1.pdf") %>%
file.path("atcr_pdfs", .) %>%
map(pdf_text) %>%
map(~keep(.x, stri_detect_fixed, "PASSENGERS DENIED BOARDING")[[2]]) %>%
map(stri_split_lines) %>%
map(flatten_chr) %>%
map(function(x) {
y <- which(stri_detect_regex(x, "Rank|RANK|TOTAL"))
grep("^\ +[[:digit:]]", x[y[1]:y[2]], value=TRUE) %>%
stri_trim() %>%
stri_replace_all_regex("([[:alpha:]])\\*+", "$1") %>%
stri_replace_all_regex(" ([[:alpha:]])", "_$1") %>%
paste0(collapse="\n") %>%
read.table(text=., header=FALSE, stringsAsFactors=FALSE)
}) -> denied
denied
## [[1]]
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 _HAWAIIAN_AIRLINES 326 49 10,824,495 0.05 358 29
## 2 2 _DELTA_AIR_LINES 129,825 1,238 129,281,098 0.10 145,406 1,938
## 3 3 _VIRGIN_AMERICA 2,375 94 7,945,329 0.12 1,722 80
## 4 4 _ALASKA_AIRLINES 6,806 931 23,390,900 0.40 5,412 740
## 5 5 _UNITED_AIRLINES 62,895 3,765 86,836,527 0.43 81,390 6,317
## 6 6 _SPIRIT_AIRLINES 10,444 1,117 19,418,650 0.58 6,589 496
## 7 7 _FRONTIER_AIRLINES 2,096 851 14,666,332 0.58 2,744 1,232
## 8 8 _AMERICAN_AIRLINES 54,259 8,312 130,894,653 0.64 50,317 7,504
## 9 9 _JETBLUE_AIRWAYS 1,705 3,176 34,710,003 0.92 1,841 73
## 10 10 _SKYWEST_AIRLINES 41,476 2,935 29,986,918 0.98 51,829 5,079
## 11 11 _SOUTHWEST_AIRLINES 88,628 14,979 150,655,354 0.99 96,513 15,608
## 12 12 _EXPRESSJET_AIRLINES 33,590 3,182 21,139,038 1.51 42,933 4,608
## V9 V10
## 1 10,462,344 0.03
## 2 125,044,855 0.15
## 3 6,928,805 0.12
## 4 22,095,126 0.33
## 5 82,081,914 0.77
## 6 16,010,164 0.31
## 7 12,343,540 1.00
## 8 97,091,951 0.77
## 9 31,949,251 0.02
## 10 28,562,760 1.78
## 11 143,932,752 1.08
## 12 24,736,601 1.86
##
## [[2]]
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 _JETBLUE_AIRWAYS 1,841 73 31,949,251 0.02 2,006 650
## 2 2 _HAWAIIAN_AIRLINES 358 29 10,462,344 0.03 366 116
## 3 3 _VIRGIN_AMERICA 1,722 80 6,928,805 0.12 910 57
## 4 4 _DELTA_AIR_LINES 145,406 1,938 125,044,855 0.16 107,706 4,052
## 5 5 _SPIRIT_AIRLINES 6,589 496 16,010,164 0.31 **** ****
## 6 6 _ALASKA_AIRLINES 5,412 740 22,095,126 0.33 4,176 864
## 7 7 _UNITED_AIRLINES 81,390 6,317 82,081,914 0.77 64,968 9,078
## 8 8 _AMERICAN_AIRLINES 50,317 7,504 97,091,951 0.77 35,152 3,188
## 9 9 _FRONTIER_AIRLINES 2,744 1,232 12,343,540 1.00 3,864 1,616
## 10 10 _SOUTHWEST_AIRLINES 96,513 15,608 143,932,752 1.08 82,039 12,041
## 11 11 _SKYWEST_AIRLINES 51,829 5,079 28,562,760 1.78 42,446 7,170
## 12 12 _EXPRESSJET_AIRLINES 42,933 4,608 24,736,601 1.86 55,525 7,961
## 13 13 _ENVOY_AIR 18,125 2,792 11,901,028 2.35 18,615 2,501
## V9 V10
## 1 29,264,332 0.22
## 2 10,084,811 0.12
## 3 6,438,023 0.09
## 4 115,737,180 0.35
## 5 **** ****
## 6 19,838,878 0.44
## 7 77,317,281 1.17
## 8 77,065,600 0.41
## 9 11,787,602 1.37
## 10 116,809,601 1.03
## 11 26,420,593 2.71
## 12 29,344,974 2.71
## 13 15,441,723 1.62
##
## [[3]]
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 1 _VIRGIN_AMERICA 910 57 6,438,023 0.09 351 26
## 2 2 _HAWAIIAN_AIRLINES 366 116 10,084,811 0.12 1,147 172
## 3 3 _JETBLUE_AIRWAYS 2,006 650 29,264,332 0.22 502 19
## 4 4 _DELTA_AIR_LINES 107,706 4,052 115,737,180 0.35 81,025 6,070
## 5 5 _AMERICAN_AIRLINES 60,924 7,471 135,748,581 0.55 ** **
## 6 6 _ALASKA_AIRLINES 4,176 864 19,838,878 0.44 3,834 714
## 7 7 _SOUTHWEST_AIRLINES 88,921 13,899 125,381,374 1.11 *** ***
## 8 8 _UNITED_AIRLINES 64,968 9,078 77,317,281 1.17 57,716 9,015
## 9 9 _FRONTIER_AIRLINES 3,864 1,616 11,787,602 1.37 3,493 1,272
## 10 10 _ENVOY_AIR 18,615 2,501 15,441,723 1.62 19,659 1,923
## 11 11 _EXPRESSJET_AIRLINES 55,525 7,961 29,344,974 2.71 47,844 6,422
## 12 12 _SKYWEST_AIRLINES 42,446 7,170 26,420,593 2.71 35,942 6,768
## V9 V10
## 1 6,244,574 0.04
## 2 9,928,830 0.17
## 3 28,166,771 0.01
## 4 106,783,155 0.57
## 5 ** **
## 6 18,517,953 0.39
## 7 *** ***
## 8 77,212,471 1.17
## 9 10,361,896 1.23
## 10 16,939,092 1.14
## 11 31,356,714 2.05
## 12 26,518,312 2.55
map2_df(2016:2014, denied, ~{
.y$year <- .x
set_names(.y[,c(1:6,11)],
c("rank", "airline", "voluntary_denied", "involuntary_denied",
"enplaned_ct", "involuntary_db_per_10k", "year")) %>%
mutate(airline = stri_trans_totitle(stri_trim(stri_replace_all_fixed(airline, "_", " ")))) %>%
readr::type_convert() %>%
tbl_df()
}) %>%
select(-rank) -> denied
glimpse(denied)
## Observations: 37
## Variables: 6
## $ airline <chr> "Hawaiian Airlines", "Delta Air Lines",...
## $ voluntary_denied <dbl> 326, 129825, 2375, 6806, 62895, 10444, ...
## $ involuntary_denied <dbl> 49, 1238, 94, 931, 3765, 1117, 851, 831...
## $ enplaned_ct <dbl> 10824495, 129281098, 7945329, 23390900,...
## $ involuntary_db_per_10k <dbl> 0.05, 0.10, 0.12, 0.40, 0.43, 0.58, 0.5...
## $ year <int> 2016, 2016, 2016, 2016, 2016, 2016, 201...
denied
## # A tibble: 37 × 6
## airline voluntary_denied involuntary_denied enplaned_ct
## <chr> <dbl> <dbl> <dbl>
## 1 Hawaiian Airlines 326 49 10824495
## 2 Delta Air Lines 129825 1238 129281098
## 3 Virgin America 2375 94 7945329
## 4 Alaska Airlines 6806 931 23390900
## 5 United Airlines 62895 3765 86836527
## 6 Spirit Airlines 10444 1117 19418650
## 7 Frontier Airlines 2096 851 14666332
## 8 American Airlines 54259 8312 130894653
## 9 Jetblue Airways 1705 3176 34710003
## 10 Skywest Airlines 41476 2935 29986918
## # ... with 27 more rows, and 2 more variables:
## # involuntary_db_per_10k <dbl>, year <int>
select(denied, airline, year, involuntary_db_per_10k) %>%
group_by(airline) %>%
mutate(yr_ct = n()) %>%
ungroup() %>%
filter(yr_ct == 3) %>%
select(-yr_ct) %>%
mutate(year = factor(year, rev(c(max(year)+1, unique(year))))) -> plot_df
update_geom_font_defaults(font_rc, size = 3)
ggplot() +
geom_line(data = plot_df, aes(year, involuntary_db_per_10k, group=airline, colour=airline)) +
geom_text(data = filter(plot_df, year=='2016') %>% mutate(lbl = sprintf("%s (%s)", airline, involuntary_db_per_10k)),
aes(x=year, y=involuntary_db_per_10k, label=lbl, colour=airline), hjust=0,
nudge_y=c(0,0,0,0,0,0,0,0,-0.0005,0.03,0), nudge_x=0.015) +
scale_x_discrete(expand=c(0,0), labels=c(2014:2016, ""), drop=FALSE) +
scale_y_continuous(trans="log1p") +
ggthemes::scale_color_tableau() +
labs(x=NULL, y=NULL,
title="Involuntary Disembark Rate Per 10K Passengers",
subtitle="Y-axis log scale; Only included airlines with 3-year span data",
caption="Source: U.S. DoT Air Travel Consumer Reports <https://www.transportation.gov/airconsumer/air-travel-consumer-reports>") +
theme_ipsum_rc(grid="X") +
theme(plot.caption=element_text(hjust=0)) +
theme(legend.position="none")