library(rvest)
library(stringi)
library(pdftools)
library(hrbrthemes)
library(tidyverse)

some URLs generate infinite redirection loops so be safe out there

safe_read_html <- safely(read_html)

grab the individual page URLs for each month available in each year

c("https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2017",
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2016",
  "https://www.transportation.gov/airconsumer/air-travel-consumer-reports-2015") %>%
  map(function(x) {
    read_html(x) %>%
      html_nodes("a[href*='air-travel-consumer-report']") %>%
      html_attr('href')
  }) %>%
  flatten_chr() %>%
  discard(stri_detect_regex, "feedback|/air-travel-consumer-reports") %>% # filter out URLs we don't need
  sprintf("https://www.transportation.gov%s", .) -> main_urls # make them useful

now, read in all the individual pages. do this separate from URL grabbing above and the PDF URL extraction below just to be even safer.

map(main_urls, safe_read_html) -> pages

URLs that generate said redirection loops will not have a valid result so ignor ethem and find the URLs for the monthly reports

discard(pages, ~is.null(.$result)) %>%
  map("result") %>%
  map(~html_nodes(., "a[href*='pdf']") %>%
        html_attr('href') %>%
        keep(stri_detect_fixed, "ATCR")) %>%
  flatten_chr() -> pdf_urls

download them, being kind to the DoT server and not re-downloading anything we’ve successfully downloaded already. I really wish this was built-in functionality to download.file()

dir.create("atcr_pdfs")
walk(pdf_urls, ~if (!file.exists(file.path("atcr_pdfs", basename(.))))
  download.file(., file.path("atcr_pdfs", basename(.))))

read in each PDF; find the pages with the tables we need to scrape; enable the text table to be read with read.table() and save the results

c("2017MarchATCR.pdf", "2016MarchATCR_2.pdf", "2015MarchATCR_1.pdf") %>%
  file.path("atcr_pdfs", .) %>%
  map(pdf_text) %>%
  map(~keep(.x, stri_detect_fixed, "PASSENGERS DENIED BOARDING")[[2]]) %>%
  map(stri_split_lines) %>%
  map(flatten_chr) %>%
  map(function(x) {
    y <- which(stri_detect_regex(x, "Rank|RANK|TOTAL"))
    grep("^\ +[[:digit:]]", x[y[1]:y[2]], value=TRUE) %>%
      stri_trim() %>%
      stri_replace_all_regex("([[:alpha:]])\\*+", "$1") %>%
      stri_replace_all_regex(" ([[:alpha:]])", "_$1") %>%
      paste0(collapse="\n") %>%
      read.table(text=., header=FALSE, stringsAsFactors=FALSE)
  }) -> denied

denied
## [[1]]
##    V1                   V2      V3     V4          V5   V6      V7     V8
## 1   1   _HAWAIIAN_AIRLINES     326     49  10,824,495 0.05     358     29
## 2   2     _DELTA_AIR_LINES 129,825  1,238 129,281,098 0.10 145,406  1,938
## 3   3      _VIRGIN_AMERICA   2,375     94   7,945,329 0.12   1,722     80
## 4   4     _ALASKA_AIRLINES   6,806    931  23,390,900 0.40   5,412    740
## 5   5     _UNITED_AIRLINES  62,895  3,765  86,836,527 0.43  81,390  6,317
## 6   6     _SPIRIT_AIRLINES  10,444  1,117  19,418,650 0.58   6,589    496
## 7   7   _FRONTIER_AIRLINES   2,096    851  14,666,332 0.58   2,744  1,232
## 8   8   _AMERICAN_AIRLINES  54,259  8,312 130,894,653 0.64  50,317  7,504
## 9   9     _JETBLUE_AIRWAYS   1,705  3,176  34,710,003 0.92   1,841     73
## 10 10    _SKYWEST_AIRLINES  41,476  2,935  29,986,918 0.98  51,829  5,079
## 11 11  _SOUTHWEST_AIRLINES  88,628 14,979 150,655,354 0.99  96,513 15,608
## 12 12 _EXPRESSJET_AIRLINES  33,590  3,182  21,139,038 1.51  42,933  4,608
##             V9  V10
## 1   10,462,344 0.03
## 2  125,044,855 0.15
## 3    6,928,805 0.12
## 4   22,095,126 0.33
## 5   82,081,914 0.77
## 6   16,010,164 0.31
## 7   12,343,540 1.00
## 8   97,091,951 0.77
## 9   31,949,251 0.02
## 10  28,562,760 1.78
## 11 143,932,752 1.08
## 12  24,736,601 1.86
## 
## [[2]]
##    V1                   V2      V3     V4          V5   V6      V7     V8
## 1   1     _JETBLUE_AIRWAYS   1,841     73  31,949,251 0.02   2,006    650
## 2   2   _HAWAIIAN_AIRLINES     358     29  10,462,344 0.03     366    116
## 3   3      _VIRGIN_AMERICA   1,722     80   6,928,805 0.12     910     57
## 4   4     _DELTA_AIR_LINES 145,406  1,938 125,044,855 0.16 107,706  4,052
## 5   5     _SPIRIT_AIRLINES   6,589    496  16,010,164 0.31    ****   ****
## 6   6     _ALASKA_AIRLINES   5,412    740  22,095,126 0.33   4,176    864
## 7   7     _UNITED_AIRLINES  81,390  6,317  82,081,914 0.77  64,968  9,078
## 8   8   _AMERICAN_AIRLINES  50,317  7,504  97,091,951 0.77  35,152  3,188
## 9   9   _FRONTIER_AIRLINES   2,744  1,232  12,343,540 1.00   3,864  1,616
## 10 10  _SOUTHWEST_AIRLINES  96,513 15,608 143,932,752 1.08  82,039 12,041
## 11 11    _SKYWEST_AIRLINES  51,829  5,079  28,562,760 1.78  42,446  7,170
## 12 12 _EXPRESSJET_AIRLINES  42,933  4,608  24,736,601 1.86  55,525  7,961
## 13 13           _ENVOY_AIR  18,125  2,792  11,901,028 2.35  18,615  2,501
##             V9  V10
## 1   29,264,332 0.22
## 2   10,084,811 0.12
## 3    6,438,023 0.09
## 4  115,737,180 0.35
## 5         **** ****
## 6   19,838,878 0.44
## 7   77,317,281 1.17
## 8   77,065,600 0.41
## 9   11,787,602 1.37
## 10 116,809,601 1.03
## 11  26,420,593 2.71
## 12  29,344,974 2.71
## 13  15,441,723 1.62
## 
## [[3]]
##    V1                   V2      V3     V4          V5   V6     V7    V8
## 1   1      _VIRGIN_AMERICA     910     57   6,438,023 0.09    351    26
## 2   2   _HAWAIIAN_AIRLINES     366    116  10,084,811 0.12  1,147   172
## 3   3     _JETBLUE_AIRWAYS   2,006    650  29,264,332 0.22    502    19
## 4   4     _DELTA_AIR_LINES 107,706  4,052 115,737,180 0.35 81,025 6,070
## 5   5   _AMERICAN_AIRLINES  60,924  7,471 135,748,581 0.55     **    **
## 6   6     _ALASKA_AIRLINES   4,176    864  19,838,878 0.44  3,834   714
## 7   7  _SOUTHWEST_AIRLINES  88,921 13,899 125,381,374 1.11    ***   ***
## 8   8     _UNITED_AIRLINES  64,968  9,078  77,317,281 1.17 57,716 9,015
## 9   9   _FRONTIER_AIRLINES   3,864  1,616  11,787,602 1.37  3,493 1,272
## 10 10           _ENVOY_AIR  18,615  2,501  15,441,723 1.62 19,659 1,923
## 11 11 _EXPRESSJET_AIRLINES  55,525  7,961  29,344,974 2.71 47,844 6,422
## 12 12    _SKYWEST_AIRLINES  42,446  7,170  26,420,593 2.71 35,942 6,768
##             V9  V10
## 1    6,244,574 0.04
## 2    9,928,830 0.17
## 3   28,166,771 0.01
## 4  106,783,155 0.57
## 5           **   **
## 6   18,517,953 0.39
## 7          ***  ***
## 8   77,212,471 1.17
## 9   10,361,896 1.23
## 10  16,939,092 1.14
## 11  31,356,714 2.05
## 12  26,518,312 2.55
map2_df(2016:2014, denied, ~{
  .y$year <- .x
  set_names(.y[,c(1:6,11)],
            c("rank", "airline", "voluntary_denied", "involuntary_denied",
              "enplaned_ct", "involuntary_db_per_10k", "year")) %>%
    mutate(airline = stri_trans_totitle(stri_trim(stri_replace_all_fixed(airline, "_", " ")))) %>%
    readr::type_convert() %>%
    tbl_df()
}) %>%
  select(-rank) -> denied

glimpse(denied)
## Observations: 37
## Variables: 6
## $ airline                <chr> "Hawaiian Airlines", "Delta Air Lines",...
## $ voluntary_denied       <dbl> 326, 129825, 2375, 6806, 62895, 10444, ...
## $ involuntary_denied     <dbl> 49, 1238, 94, 931, 3765, 1117, 851, 831...
## $ enplaned_ct            <dbl> 10824495, 129281098, 7945329, 23390900,...
## $ involuntary_db_per_10k <dbl> 0.05, 0.10, 0.12, 0.40, 0.43, 0.58, 0.5...
## $ year                   <int> 2016, 2016, 2016, 2016, 2016, 2016, 201...
denied
## # A tibble: 37 × 6
##              airline voluntary_denied involuntary_denied enplaned_ct
##                <chr>            <dbl>              <dbl>       <dbl>
## 1  Hawaiian Airlines              326                 49    10824495
## 2    Delta Air Lines           129825               1238   129281098
## 3     Virgin America             2375                 94     7945329
## 4    Alaska Airlines             6806                931    23390900
## 5    United Airlines            62895               3765    86836527
## 6    Spirit Airlines            10444               1117    19418650
## 7  Frontier Airlines             2096                851    14666332
## 8  American Airlines            54259               8312   130894653
## 9    Jetblue Airways             1705               3176    34710003
## 10  Skywest Airlines            41476               2935    29986918
## # ... with 27 more rows, and 2 more variables:
## #   involuntary_db_per_10k <dbl>, year <int>
select(denied, airline, year, involuntary_db_per_10k) %>%
  group_by(airline) %>%
  mutate(yr_ct = n()) %>%
  ungroup() %>%
  filter(yr_ct == 3) %>%
  select(-yr_ct) %>%
  mutate(year = factor(year, rev(c(max(year)+1, unique(year))))) -> plot_df

update_geom_font_defaults(font_rc, size = 3)
ggplot() +
  geom_line(data = plot_df, aes(year, involuntary_db_per_10k, group=airline, colour=airline)) +
  geom_text(data = filter(plot_df, year=='2016') %>% mutate(lbl = sprintf("%s (%s)", airline, involuntary_db_per_10k)),
            aes(x=year, y=involuntary_db_per_10k, label=lbl, colour=airline), hjust=0,
            nudge_y=c(0,0,0,0,0,0,0,0,-0.0005,0.03,0), nudge_x=0.015) +
  scale_x_discrete(expand=c(0,0), labels=c(2014:2016, ""), drop=FALSE) +
  scale_y_continuous(trans="log1p") +
  ggthemes::scale_color_tableau() +
  labs(x=NULL, y=NULL,
       title="Involuntary Disembark Rate Per 10K Passengers",
       subtitle="Y-axis log scale; Only included airlines with 3-year span data",
       caption="Source: U.S. DoT Air Travel Consumer Reports <https://www.transportation.gov/airconsumer/air-travel-consumer-reports>") +
  theme_ipsum_rc(grid="X") +
  theme(plot.caption=element_text(hjust=0)) +
  theme(legend.position="none")