## Warning: package 'viridis' was built under R version 4.2.1
## Warning: package 'fueleconomy' was built under R version 4.2.1


13.4 Mutating Joins


1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here’s an easy way to draw a map of the United States:

airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
  borders("state") +
  geom_point() +
  coord_quickmap()

avg_dest_delays <-
  flights %>%
  group_by(dest) %>%
  # arrival delay NA's are cancelled flights
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c(dest = "faa"))

avg_dest_delays %>%
  ggplot(aes(lon, lat, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()


2. Add the location of the origin and destination (i.e. the lat and lon) to flights.

airport_locations <- airports %>%
  select(faa, lat, lon)

flights %>%
  select(year:day, hour, origin, dest) %>%
  left_join(
    airport_locations,
    by = c("origin" = "faa")
  ) %>%
  left_join(
    airport_locations,
    by = c("dest" = "faa"),
    suffix = c("_origin", "_dest")
    # existing lat and lon variables in tibble gain the _origin suffix
    # new lat and lon variables are given _dest suffix
  )
## # A tibble: 336,776 × 10
##     year month   day  hour origin dest  lat_origin lon_origin lat_dest lon_dest
##    <int> <int> <int> <dbl> <chr>  <chr>      <dbl>      <dbl>    <dbl>    <dbl>
##  1  2013     1     1     5 EWR    IAH         40.7      -74.2     30.0    -95.3
##  2  2013     1     1     5 LGA    IAH         40.8      -73.9     30.0    -95.3
##  3  2013     1     1     5 JFK    MIA         40.6      -73.8     25.8    -80.3
##  4  2013     1     1     5 JFK    BQN         40.6      -73.8     NA       NA  
##  5  2013     1     1     6 LGA    ATL         40.8      -73.9     33.6    -84.4
##  6  2013     1     1     5 EWR    ORD         40.7      -74.2     42.0    -87.9
##  7  2013     1     1     6 EWR    FLL         40.7      -74.2     26.1    -80.2
##  8  2013     1     1     6 LGA    IAD         40.8      -73.9     38.9    -77.5
##  9  2013     1     1     6 JFK    MCO         40.6      -73.8     28.4    -81.3
## 10  2013     1     1     6 LGA    ORD         40.8      -73.9     42.0    -87.9
## # … with 336,766 more rows


5.What happened on June 13, 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.

flights %>%
  filter(year == 2013, month == 6, day == 13) %>%
  group_by(dest) %>%
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c("dest" = "faa")) %>%
  ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap() +
  scale_colour_viridis()
## Warning: Removed 3 rows containing missing values (geom_point).


2. Filter flights to only show flights with planes that have flown at least 100 flights.

#Find all planes that have flown at least 100 flights. 
planes_gte100 <- flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  count() %>%
  filter(n >= 100)

#semi join the data frame of planes that have flown at least 100 flights to the data frame of flights to select the flights by those planes.
flights %>%
  semi_join(planes_gte100, by = "tailnum")
## # A tibble: 228,390 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      555            600        -5      913            854
##  6  2013     1     1      557            600        -3      709            723
##  7  2013     1     1      557            600        -3      838            846
##  8  2013     1     1      558            600        -2      849            851
##  9  2013     1     1      558            600        -2      853            856
## 10  2013     1     1      558            600        -2      923            937
## # … with 228,380 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>


3.Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.

fueleconomy::vehicles %>%
  semi_join(fueleconomy::common, by = c("make", "model"))
## # A tibble: 14,531 × 12
##       id make  model    year class     trans drive   cyl displ fuel    hwy   cty
##    <dbl> <chr> <chr>   <dbl> <chr>     <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
##  1  1833 Acura Integra  1986 Subcompa… Auto… Fron…     4   1.6 Regu…    28    22
##  2  1834 Acura Integra  1986 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  3  3037 Acura Integra  1987 Subcompa… Auto… Fron…     4   1.6 Regu…    28    22
##  4  3038 Acura Integra  1987 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  5  4183 Acura Integra  1988 Subcompa… Auto… Fron…     4   1.6 Regu…    27    22
##  6  4184 Acura Integra  1988 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  7  5303 Acura Integra  1989 Subcompa… Auto… Fron…     4   1.6 Regu…    27    22
##  8  5304 Acura Integra  1989 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  9  6442 Acura Integra  1990 Subcompa… Auto… Fron…     4   1.8 Regu…    24    20
## 10  6443 Acura Integra  1990 Subcompa… Manu… Fron…     4   1.8 Regu…    26    21
## # … with 14,521 more rows


4.Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?

#Group flights by hour of scheduled departure time and calculate the average delay. Then select the 48 observations (hours) with the highest average delay.
worst_hours <- flights %>%
  mutate(hour = sched_dep_time %/% 100) %>%
  group_by(origin, year, month, day, hour) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(dep_delay)) %>%
  slice(1:48)
## `summarise()` has grouped output by 'origin', 'year', 'month', 'day'. You can
## override using the `.groups` argument.
#Then use semi_join() to get the weather for these hours.
weather_most_delayed <- semi_join(weather, worst_hours, 
                                  by = c("origin", "year",
                                         "month", "day", "hour"))
select(weather_most_delayed, temp, wind_speed, precip) %>%
  print(n = 48)
## # A tibble: 48 × 3
##     temp wind_speed precip
##    <dbl>      <dbl>  <dbl>
##  1  27.0      13.8    0   
##  2  28.0      19.6    0   
##  3  28.9      28.8    0   
##  4  33.8       9.21   0.06
##  5  34.0       8.06   0.05
##  6  80.1       8.06   0   
##  7  86        13.8    0   
##  8  73.4       6.90   0.08
##  9  84.0       5.75   0   
## 10  78.8      18.4    0.23
## 11  53.6       0      0   
## 12  60.8      31.1    0.11
## 13  55.4      17.3    0.14
## 14  53.1       9.21   0.01
## 15  55.9      11.5    0.1 
## 16  55.4       8.06   0.15
## 17  57.0      29.9    0   
## 18  33.8      20.7    0.02
## 19  34.0      19.6    0.01
## 20  36.0      21.9    0.01
## 21  37.9      16.1    0   
## 22  32        13.8    0.12
## 23  60.1      33.4    0.14
## 24  60.8      11.5    0.02
## 25  62.1      17.3    0   
## 26  66.9      10.4    0   
## 27  66.9      13.8    0   
## 28  79.0      10.4    0   
## 29  77        16.1    0.07
## 30  75.9      13.8    0   
## 31  82.4       8.06   0   
## 32  86         9.21   0   
## 33  80.1       9.21   0   
## 34  80.6      11.5    0   
## 35  78.1       6.90   0   
## 36  75.2      10.4    0.01
## 37  73.9       5.75   0.03
## 38  73.9       8.06   0   
## 39  75.0       4.60   0   
## 40  75.0       4.60   0.01
## 41  80.1       0      0.01
## 42  80.1       0      0   
## 43  77        10.4    0   
## 44  82.0      10.4    0   
## 45  72.0      13.8    0.3 
## 46  72.0       4.60   0.03
## 47  51.1       4.60   0   
## 48  54.0       6.90   0
ggplot(weather_most_delayed, aes(x = precip, y = wind_speed, color = temp)) +
  geom_point()