13.4.6

Exercise 1

airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
    borders("state") +
    geom_point() +
    coord_quickmap()

avg_delay <- flights %>%
  group_by(dest) %>%
  summarize(delay=mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c(dest = "faa"))
avg_delay %>%
  ggplot(aes(lon, lat, color=delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()

Exercise 2

airport_locations <- airports %>%
  select(faa, lat, lon)

flights %>%
  select(origin, dest) %>%
  left_join(airport_locations, by = c("origin" = "faa")) %>%
  left_join(airport_locations, by = c("dest" = "faa"), suffix = c(".origin", ".dest"))
## # A tibble: 336,776 x 6
##    origin dest  lat.origin lon.origin lat.dest lon.dest
##    <chr>  <chr>      <dbl>      <dbl>    <dbl>    <dbl>
##  1 EWR    IAH         40.7      -74.2     30.0    -95.3
##  2 LGA    IAH         40.8      -73.9     30.0    -95.3
##  3 JFK    MIA         40.6      -73.8     25.8    -80.3
##  4 JFK    BQN         40.6      -73.8     NA       NA  
##  5 LGA    ATL         40.8      -73.9     33.6    -84.4
##  6 EWR    ORD         40.7      -74.2     42.0    -87.9
##  7 EWR    FLL         40.7      -74.2     26.1    -80.2
##  8 LGA    IAD         40.8      -73.9     38.9    -77.5
##  9 JFK    MCO         40.6      -73.8     28.4    -81.3
## 10 LGA    ORD         40.8      -73.9     42.0    -87.9
## # … with 336,766 more rows

Exercise 3

Arrival Delay

delays_by_age <- inner_join(flights, select(planes, tailnum, plane_year=year), by =  "tailnum") %>%
  mutate(age = 2013 - plane_year) %>%
  filter(!is.na(age)) %>%
  group_by(age) %>%
  summarize(avg_delay = mean(arr_delay, na.rm = TRUE))

ggplot(delays_by_age, aes(x=age, y=avg_delay)) +
  geom_point()

There does seem to be a strong positive correlation between plane age and arrival delay until the plane is about 10 years old, at which point the trend reverses to a negative correlation between plane age and arrival delay.

Departure Delay

delays_by_age <- inner_join(flights, select(planes, tailnum, plane_year=year), by =  "tailnum") %>%
  mutate(age = 2013 - plane_year) %>%
  filter(!is.na(age)) %>%
  group_by(age) %>%
  summarize(avg_delay = mean(dep_delay, na.rm = TRUE))

ggplot(delays_by_age, aes(x=age, y=avg_delay)) +
  geom_point()

Graphing age against departure delays shows similar trends to arrival delays.

Exercise 4

Precipitation and wind speed seem to have a positive correlation with departure delays - higher levels of precipitation and higher wind speeds correspond to delayed flights.

Gust speed did not appear to have much of a relationship with departure delay, except in the one outlier case of an over 60mph gust. Visibility had a negative correlation with departure delays - low visibility meant the flight was more likely to be delayed.

Precipitation

weather_delay <- flights %>%
  inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
  group_by(precip) %>%
  summarize(precip_delay=mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x=precip, y=precip_delay)) +
  geom_point()

Wind Speed

weather_delay <- flights %>%
  inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
  group_by(wind_speed) %>%
  summarize(wind_delay=mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x=wind_speed, y=wind_delay)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

Gust Speed

weather_delay <- flights %>%
  inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
  group_by(wind_gust) %>%
  summarize(gust_delay=mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x=wind_gust, y=gust_delay)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).

Visibility

weather_delay <- flights %>%
  inner_join(weather, by = c("origin", "year", "month", "day", "hour"))
weather_delay %>%
  group_by(visib) %>%
  summarize(visib_delay=mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x=visib, y=visib_delay)) +
  geom_point()

Exercise 5

On June 13, 2013, there was a severe storm in the mid- to south-east part of the United States (https://en.wikipedia.org/wiki/June_12%E2%80%9313,_2013_derecho_series).

flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarize(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()
## Warning: Removed 3 rows containing missing values (geom_point).

13.5.1

Exercise 1

flights %>%
  filter(is.na(tailnum))
## # A tibble: 2,512 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     2       NA           1545        NA       NA           1910
##  2  2013     1     2       NA           1601        NA       NA           1735
##  3  2013     1     3       NA            857        NA       NA           1209
##  4  2013     1     3       NA            645        NA       NA            952
##  5  2013     1     4       NA            845        NA       NA           1015
##  6  2013     1     4       NA           1830        NA       NA           2044
##  7  2013     1     5       NA            840        NA       NA           1001
##  8  2013     1     7       NA            820        NA       NA            958
##  9  2013     1     8       NA           1645        NA       NA           1838
## 10  2013     1     9       NA            755        NA       NA           1012
## # … with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Flights without tail numbers do not have departure times or air times, which may mean that they were cancelled. If the flights were cancelled, there was no plane to record since it did not take off.

Exercise 2

top_flyers <- flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  count() %>%
  filter(n>=100)
top_flyers
## # A tibble: 1,217 x 2
## # Groups:   tailnum [1,217]
##    tailnum     n
##    <chr>   <int>
##  1 N0EGMQ    371
##  2 N10156    153
##  3 N10575    289
##  4 N11106    129
##  5 N11107    148
##  6 N11109    148
##  7 N11113    138
##  8 N11119    148
##  9 N11121    154
## 10 N11127    124
## # … with 1,207 more rows

Exercise 3

fueleconomy::vehicles %>%
  semi_join(fueleconomy::common, by = c("make", "model"))
## # A tibble: 14,531 x 12
##       id make  model   year class   trans  drive     cyl displ fuel    hwy   cty
##    <dbl> <chr> <chr>  <dbl> <chr>   <chr>  <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
##  1  1833 Acura Integ…  1986 Subcom… Autom… Front-…     4   1.6 Regu…    28    22
##  2  1834 Acura Integ…  1986 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  3  3037 Acura Integ…  1987 Subcom… Autom… Front-…     4   1.6 Regu…    28    22
##  4  3038 Acura Integ…  1987 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  5  4183 Acura Integ…  1988 Subcom… Autom… Front-…     4   1.6 Regu…    27    22
##  6  4184 Acura Integ…  1988 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  7  5303 Acura Integ…  1989 Subcom… Autom… Front-…     4   1.6 Regu…    27    22
##  8  5304 Acura Integ…  1989 Subcom… Manua… Front-…     4   1.6 Regu…    28    23
##  9  6442 Acura Integ…  1990 Subcom… Autom… Front-…     4   1.8 Regu…    24    20
## 10  6443 Acura Integ…  1990 Subcom… Manua… Front-…     4   1.8 Regu…    26    21
## # … with 14,521 more rows

Exercise 4

delay_weather <- flights %>% 
  inner_join(weather, by= c("origin", "year", "month", "day", "hour")) %>%
  arrange(desc(dep_delay)) %>%
  slice(1:48)
delay_weather
## # A tibble: 48 x 29
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # … with 38 more rows, and 21 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour.x <dttm>, temp <dbl>,
## #   dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour.y <dttm>

I do not see any strong patterns.

Exercise 5

anti_join(flights, airports, by = c("dest" = "faa"))
## # A tibble: 7,602 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      615            615         0     1039           1100
##  3  2013     1     1      628            630        -2     1137           1140
##  4  2013     1     1      701            700         1     1123           1154
##  5  2013     1     1      711            715        -4     1151           1206
##  6  2013     1     1      820            820         0     1254           1310
##  7  2013     1     1      820            820         0     1249           1329
##  8  2013     1     1      840            845        -5     1311           1350
##  9  2013     1     1      909            810        59     1331           1315
## 10  2013     1     1      913            918        -5     1346           1416
## # … with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

The anti join here shows rows in flights that had destinations that are not on the FAA destinations list.

anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 x 8
##    faa   name                       lat    lon   alt    tz dst   tzone          
##    <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
##  1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo…
##  2 06A   Moton Field Municipal A…  32.5  -85.7   264    -6 A     America/Chicago
##  3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
##  4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo…
##  5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo…
##  6 0A9   Elizabethton Municipal …  36.4  -82.2  1593    -5 A     America/New_Yo…
##  7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo…
##  8 0G7   Finger Lakes Regional A…  42.9  -76.8   492    -5 A     America/New_Yo…
##  9 0P2   Shoestring Aviation Air…  39.8  -76.6  1000    -5 U     America/New_Yo…
## 10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An…
## # … with 1,347 more rows

This anti join shows FAA destinations that no flights on our flights list arrived in.

Exercise 6

plane_airline <- flights %>%
  filter(!is.na(tailnum)) %>%
  distinct(tailnum, carrier)
multi_airline <- plane_airline %>%
  group_by(tailnum) %>%
  filter(n() > 1) %>%
  arrange(tailnum)
multi_airline
## # A tibble: 34 x 2
## # Groups:   tailnum [17]
##    carrier tailnum
##    <chr>   <chr>  
##  1 9E      N146PQ 
##  2 EV      N146PQ 
##  3 9E      N153PQ 
##  4 EV      N153PQ 
##  5 9E      N176PQ 
##  6 EV      N176PQ 
##  7 9E      N181PQ 
##  8 EV      N181PQ 
##  9 9E      N197PQ 
## 10 EV      N197PQ 
## # … with 24 more rows

There is not necessarily an implicit relationship between plane and airline; the data show that planes can be flown by multiple carriers (potentially due to airline partnerships such as the OneWorld Alliance, which could mean that a British Airways flight could be operated by American Airlines, for example).