pacman::p_load(tidyverse, nycflights13, maps, RColorBrewer, lubridate, knitr)

Exercises 13.4.6

https://r4ds.had.co.nz/relational-data.html#exercises-30

1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays.

delays <- filter(flights, arr_delay > 0) %>%        # delays data frame of late arrivals
   group_by (dest)  %>%                             # group by month
   summarize (delay = mean(arr_delay)) %>%          # the mean arrival delay
  arrange(delay)

# plot the delays on a US map

delays %>%
  left_join(airports, c("dest" = "faa")) %>%        # get the longitude and latitude for destination airports
   ggplot(aes(lon, lat, color = delay, size = 1.5, alpha = 0.6) ) +          
    borders("state") +
    geom_point(na.rm = TRUE) +
    scale_color_gradient(low = "grey", high = "darkred") +
    theme_light() +
    coord_quickmap()

2. Add the location of the origin and destination (i.e. the lat and lon) to flights.

latlon <- select(airports, faa, lat, lon)            # select the columns we want from airports
flights %>%
  left_join(latlon, c("origin" = "faa")) %>%          # join to get origin lat and lon
  rename(lat_orig = lat, lon_orig = lon) %>%          # rename the new columns for origin
  left_join(latlon, c("origin" = "faa")) %>%          # join to get destination lat & lon
  rename(lat_dest = lat, lon_dest = lon) %>%          # rename the new columns for destination
  head(5) %>% 
  select(origin:lon_dest) %>% 
  kable()
origin dest air_time distance hour minute time_hour lat_orig lon_orig lat_dest lon_dest
EWR IAH 227 1400 5 15 2013-01-01 05:00:00 40.69250 -74.16867 40.69250 -74.16867
LGA IAH 227 1416 5 29 2013-01-01 05:00:00 40.77725 -73.87261 40.77725 -73.87261
JFK MIA 160 1089 5 40 2013-01-01 05:00:00 40.63975 -73.77893 40.63975 -73.77893
JFK BQN 183 1576 5 45 2013-01-01 05:00:00 40.63975 -73.77893 40.63975 -73.77893
LGA ATL 116 762 6 0 2013-01-01 06:00:00 40.77725 -73.87261 40.77725 -73.87261

3. Is there a relationship between the age of a plane and its delays?

There does not appear to be a relationship between the age of a plane and it’s delays, when looking at the entire data set. Graphs of the total delay = departure delay + arrival delay by year don’t show a linear relationship.

However looking at the graph in time segments reveals some correlations. From about 1997 to around 2004 there is a positive relationship between the age the plane and total delays. With older planes having fewer delays. And from 2004 to 2013 there is a negative relationship with younger planes having fewer total delays.

flights %>% na.omit() %>%                                   # remove observations w/ NA
            select(tailnum, dep_delay, arr_delay) %>%
            mutate(tot_delay = dep_delay + arr_delay) %>%   # sum up the total delay for each flight
            filter(tot_delay > 0) %>%                       # use the records with a positive total delay
            left_join(planes) %>%                           # join with planes to get year put in service
            arrange(year) %>% 
                    ggplot(aes(x=year, y=tot_delay)) +      # plot a line graph
                    geom_smooth(na.rm = TRUE)
## Joining, by = "tailnum"
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

4. What weather conditions make it more likely to see a delay?

Precipitation appears to be the largest factor in longer delays. Greater delays happen with precipitation around 1.18, dew point around 79, wind gust above 65, visibility below 2.5, pressure about 1000, wind speed about 33, humidity at 100, and/or wind direction around 125.

flights %>% na.omit() %>%   
            left_join(weather, by = c("origin", "time_hour")) %>%
            select(arr_delay, dep_delay, temp:visib) %>%
            mutate(tot_delay = arr_delay + dep_delay) %>%
            arrange(desc(tot_delay)) %>%
            filter(tot_delay > 0) %>%
              pivot_longer(cols = temp:visib,
              names_to = "wthr",
              values_to = "amt"  ) %>%
            ggplot(aes(x = amt, y = tot_delay, color = wthr)) +
              geom_smooth(na.rm = TRUE) +
              facet_wrap(vars(wthr), scales = "free")
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

5. What happened on June 13 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.

Storms in the mid-Atlantic caused cancellation of 900 flights and 7400 delays.

fl2 <- na.omit (flights)  %>%                        # remove observations with NA values
       filter(month == 6 & day == 13) %>%
       mutate(tot_delay = arr_delay + dep_delay) %>% # create in-flight delay column
       filter(tot_delay > 0) %>%
       left_join(airports, c("dest" = "faa"))

fl2 %>%  ggplot(aes(lon, lat, color = tot_delay, size = 2)) +
    borders("state") +
    geom_point(na.rm = TRUE) +
    scale_color_gradient(low = "grey", high = "darkred") +
    theme_light() +
    coord_quickmap()

13.5.1 Exercises

https://r4ds.had.co.nz/relational-data.html#exercises-31

1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.)

Flights with a missing tailnum are flights that did not occur, they were cancelled. Flights that don’t have a matching record in planes are primarily flown by Envoy Air or American Airlines.

filter(flights, is.na(tailnum))                    # find flights with missing tailnum
## # A tibble: 2,512 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     2       NA           1545        NA       NA           1910
##  2  2013     1     2       NA           1601        NA       NA           1735
##  3  2013     1     3       NA            857        NA       NA           1209
##  4  2013     1     3       NA            645        NA       NA            952
##  5  2013     1     4       NA            845        NA       NA           1015
##  6  2013     1     4       NA           1830        NA       NA           2044
##  7  2013     1     5       NA            840        NA       NA           1001
##  8  2013     1     7       NA            820        NA       NA            958
##  9  2013     1     8       NA           1645        NA       NA           1838
## 10  2013     1     9       NA            755        NA       NA           1012
## # ... with 2,502 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
anti_join(flights, planes, by = "tailnum") %>%     # find flights w/o matching record in planes
  count(carrier) %>%                               # count by carrier
  left_join(airlines)                              # find the airline name
## Joining, by = "carrier"
## # A tibble: 10 x 3
##    carrier     n name                       
##    <chr>   <int> <chr>                      
##  1 9E       1044 Endeavor Air Inc.          
##  2 AA      22558 American Airlines Inc.     
##  3 B6        830 JetBlue Airways            
##  4 DL        110 Delta Air Lines Inc.       
##  5 F9         50 Frontier Airlines Inc.     
##  6 FL        187 AirTran Airways Corporation
##  7 MQ      25397 Envoy Air                  
##  8 UA       1693 United Air Lines Inc.      
##  9 US        699 US Airways Inc.            
## 10 WN         38 Southwest Airlines Co.

2. Filter flights to only show flights with planes that have flown at least 100 flights.

f100 <- flights %>% 
        count(tailnum) %>%           # find the planes that have flown 100+ flights
        filter(n >= 100)

flights %>% semi_join(f100)             # filter flights for planes that have flown 100+ flights
## Joining, by = "tailnum"
## # A tibble: 230,902 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      555            600        -5      913            854
##  6  2013     1     1      557            600        -3      709            723
##  7  2013     1     1      557            600        -3      838            846
##  8  2013     1     1      558            600        -2      849            851
##  9  2013     1     1      558            600        -2      853            856
## 10  2013     1     1      558            600        -2      923            937
## # ... with 230,892 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

3. Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.

vehicles contains 33.442 rows, common contains 347 rows

The semi_join of vehicles to common selects 14,531 rows from vehicles.

pacman::p_load(fueleconomy)
count(vehicles)
## # A tibble: 1 x 1
##       n
##   <int>
## 1 33442
count(common)
## # A tibble: 1 x 1
##       n
##   <int>
## 1   347
semi_join(vehicles, common) %>%
  count()
## Joining, by = c("make", "model")
## # A tibble: 1 x 1
##       n
##   <int>
## 1 14531

4. Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?

The full year plot shows the greatest total delays in early July. Focusing into a few week period from late June to early July pinpoints the largest delays to around July 1. And the weather history does show heavy rain and fog on the morning of July 1.

fl2 <- na.omit (flights)  %>%                           # remove observations with NA values
        mutate(tot_delay = arr_delay + dep_delay)        # create total delay column

fl2 %>% ggplot(aes(x=time_hour, y=tot_delay)) +          # plot total delay for the entire year
          geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# Focus in on a few weeks around peak total delays in the year

fl2 <- fl2 %>% filter(time_hour > "2013-06-23 00:30:00" & time_hour < "2013-07-04 23:59:59" )  
fl2 %>% ggplot(aes(x=time_hour, y=tot_delay)) +
                 geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

# narrow down to the 48 hours around peak total delays in the year

fl2 <- fl2 %>% filter(time_hour > "2013-06-30 13:30:00" & time_hour < "2013-07-02 12:29:59" )  
fl2 %>% ggplot(aes(x=time_hour, y=tot_delay)) +
                 geom_smooth()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

Looks like a storm, with fog perhaps, was coming in on the afternoon of June 30th. Visibility was low, pressure increasing, humidity was high and there was a sharp increase in rain during the morning of July 1

fl2 <- left_join(fl2,weather, by = c("origin", "time_hour")) 
fl2$wind_gust <-  replace_na(0) 
fl2 %>% pivot_longer(cols = temp:visib,
              names_to = "wthr",
              values_to = "amt"  ) %>%
            ggplot(aes(x = time_hour, y = amt, color = wthr)) +
              geom_smooth(na.rm = TRUE) +
              facet_wrap(vars(wthr), scales = "free") +
              theme(axis.text.x = element_text(angle = 65, vjust = .55))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
## Warning: Computation failed in `stat_smooth()`:
## NA/NaN/Inf in foreign function call (arg 3)

5. What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?

There are 7,602 flights where the destination is not included in the airports dataset. There are 1,357 airports that are not destinations for a flight in the flights dataset.

anti_join(flights, airports, by = c("dest" = "faa")) 
## # A tibble: 7,602 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      615            615         0     1039           1100
##  3  2013     1     1      628            630        -2     1137           1140
##  4  2013     1     1      701            700         1     1123           1154
##  5  2013     1     1      711            715        -4     1151           1206
##  6  2013     1     1      820            820         0     1254           1310
##  7  2013     1     1      820            820         0     1249           1329
##  8  2013     1     1      840            845        -5     1311           1350
##  9  2013     1     1      909            810        59     1331           1315
## 10  2013     1     1      913            918        -5     1346           1416
## # ... with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
anti_join(airports, flights, by = c("faa" = "dest")) 
## # A tibble: 1,357 x 8
##    faa   name                       lat    lon   alt    tz dst   tzone          
##    <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
##  1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo~
##  2 06A   Moton Field Municipal A~  32.5  -85.7   264    -6 A     America/Chicago
##  3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
##  4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo~
##  5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo~
##  6 0A9   Elizabethton Municipal ~  36.4  -82.2  1593    -5 A     America/New_Yo~
##  7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo~
##  8 0G7   Finger Lakes Regional A~  42.9  -76.8   492    -5 A     America/New_Yo~
##  9 0P2   Shoestring Aviation Air~  39.8  -76.6  1000    -5 U     America/New_Yo~
## 10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An~
## # ... with 1,347 more rows

6. You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above.

There are 17 planes that are each flown by two carriers, the hypothesis is rejected.

pf <- select(flights, tailnum, carrier) %>%     # select desired columns, get unique rows
      unique() %>% 
      right_join(planes, by = "tailnum")        # join to planes

p1 <- pf %>% 
      count(tailnum, sort = TRUE) %>%           
      filter(n > 1)                             # find planes w/more than one carrier
p1 %>%  left_join(pf) %>%                       # join to get the carrier codes
        kable()
## Joining, by = "tailnum"
tailnum n carrier year type manufacturer model engines seats speed engine
N146PQ 2 9E 2007 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N146PQ 2 EV 2007 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N153PQ 2 9E 2007 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N153PQ 2 EV 2007 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N176PQ 2 9E 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N176PQ 2 EV 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N181PQ 2 9E 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N181PQ 2 EV 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N197PQ 2 9E 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N197PQ 2 EV 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N200PQ 2 9E 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N200PQ 2 EV 2008 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N228PQ 2 9E 2009 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N228PQ 2 EV 2009 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N232PQ 2 9E 2009 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N232PQ 2 EV 2009 Fixed wing multi engine BOMBARDIER INC CL-600-2D24 2 95 NA Turbo-fan
N933AT 2 FL 2000 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N933AT 2 DL 2000 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N935AT 2 FL 2000 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N935AT 2 DL 2000 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N977AT 2 FL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N977AT 2 DL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N978AT 2 FL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N978AT 2 DL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N979AT 2 FL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N979AT 2 DL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N981AT 2 FL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N981AT 2 DL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N989AT 2 FL 2001 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N989AT 2 DL 2001 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N990AT 2 FL 2001 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N990AT 2 DL 2001 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N994AT 2 FL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan
N994AT 2 DL 2002 Fixed wing multi engine BOEING 717-200 2 100 NA Turbo-fan