Relational data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)

Mutating joins

flights2 <- flights %>% 
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 × 8
##     year month   day  hour origin dest  tailnum carrier
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
##  1  2013     1     1     5 EWR    IAH   N14228  UA     
##  2  2013     1     1     5 LGA    IAH   N24211  UA     
##  3  2013     1     1     5 JFK    MIA   N619AA  AA     
##  4  2013     1     1     5 JFK    BQN   N804JB  B6     
##  5  2013     1     1     6 LGA    ATL   N668DN  DL     
##  6  2013     1     1     5 EWR    ORD   N39463  UA     
##  7  2013     1     1     6 EWR    FLL   N516JB  B6     
##  8  2013     1     1     6 LGA    IAD   N829AS  EV     
##  9  2013     1     1     6 JFK    MCO   N593JB  B6     
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA     
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 8
#>    year month   day  hour origin dest  tailnum carrier
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
#> 1  2013     1     1     5 EWR    IAH   N14228  UA     
#> 2  2013     1     1     5 LGA    IAH   N24211  UA     
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA     
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6     
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL     
#> 6  2013     1     1     5 EWR    ORD   N39463  UA     
#> # ℹ 336,770 more rows
flights2 %>%
  select(-origin, -dest) %>% 
  left_join(airlines, by = "carrier")
## # A tibble: 336,776 × 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 7
#>    year month   day  hour tailnum carrier name                  
#>   <int> <int> <int> <dbl> <chr>   <chr>   <chr>                 
#> 1  2013     1     1     5 N14228  UA      United Air Lines Inc. 
#> 2  2013     1     1     5 N24211  UA      United Air Lines Inc. 
#> 3  2013     1     1     5 N619AA  AA      American Airlines Inc.
#> 4  2013     1     1     5 N804JB  B6      JetBlue Airways       
#> 5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.  
#> 6  2013     1     1     5 N39463  UA      United Air Lines Inc. 
#> # ℹ 336,770 more rows
flights2 %>%
  select(-origin, -dest) %>% 
  mutate(name = airlines$name[match(carrier, airlines$carrier)])
## # A tibble: 336,776 × 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 7
#>    year month   day  hour tailnum carrier name                  
#>   <int> <int> <int> <dbl> <chr>   <chr>   <chr>                 
#> 1  2013     1     1     5 N14228  UA      United Air Lines Inc. 
#> 2  2013     1     1     5 N24211  UA      United Air Lines Inc. 
#> 3  2013     1     1     5 N619AA  AA      American Airlines Inc.
#> 4  2013     1     1     5 N804JB  B6      JetBlue Airways       
#> 5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.  
#> 6  2013     1     1     5 N39463  UA      United Air Lines Inc. 
#> # ℹ 336,770 more rows
x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)
x %>% 
  inner_join(y, by = "key")
## # A tibble: 2 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2
#> # A tibble: 2 × 3
#>     key val_x val_y
#>   <dbl> <chr> <chr>
#> 1     1 x1    y1   
#> 2     2 x2    y2
x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     1, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2"
)
left_join(x, y, by = "key")
## # A tibble: 4 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x3    y2   
## 4     1 x4    y1
#> # A tibble: 4 × 3
#>     key val_x val_y
#>   <dbl> <chr> <chr>
#> 1     1 x1    y1   
#> 2     2 x2    y2   
#> 3     2 x3    y2   
#> 4     1 x4    y1
x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     2, "x3",
     3, "x4"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     2, "y3",
     3, "y4"
)
left_join(x, y, by = "key")
## Warning in left_join(x, y, by = "key"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 2 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
## # A tibble: 6 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     2 x2    y3   
## 4     2 x3    y2   
## 5     2 x3    y3   
## 6     3 x4    y4
#> Warning in left_join(x, y, by = "key"): Detected an unexpected many-to-many relationship between `x` and `y`.
#> ℹ Row 2 of `x` matches multiple rows in `y`.
#> ℹ Row 2 of `y` matches multiple rows in `x`.
#> ℹ If a many-to-many relationship is expected, set `relationship =
#>   "many-to-many"` to silence this warning.
#> # A tibble: 6 × 3
#>     key val_x val_y
#>   <dbl> <chr> <chr>
#> 1     1 x1    y1   
#> 2     2 x2    y2   
#> 3     2 x2    y3   
#> 4     2 x3    y2   
#> 5     2 x3    y3   
#> 6     3 x4    y4
flights2 %>% 
  left_join(weather)
## Joining with `by = join_by(year, month, day, hour, origin)`
## # A tibble: 336,776 × 18
##     year month   day  hour origin dest  tailnum carrier  temp  dewp humid
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
##  2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
##  3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
##  4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
##  5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
##  6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4
##  7  2013     1     1     6 EWR    FLL   N516JB  B6       37.9  28.0  67.2
##  8  2013     1     1     6 LGA    IAD   N829AS  EV       39.9  25.0  54.8
##  9  2013     1     1     6 JFK    MCO   N593JB  B6       37.9  27.0  64.3
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA       39.9  25.0  54.8
## # ℹ 336,766 more rows
## # ℹ 7 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
#> Joining with `by = join_by(year, month, day, hour, origin)`
#> # A tibble: 336,776 × 18
#>    year month   day  hour origin dest  tailnum carrier  temp  dewp humid
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
#> 1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
#> 2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
#> 6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4
#> # ℹ 336,770 more rows
#> # ℹ 7 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
#> #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
flights2 %>% 
  left_join(planes, by = "tailnum")
## # A tibble: 336,776 × 16
##    year.x month   day  hour origin dest  tailnum carrier year.y type            
##     <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>           
##  1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixed wing mult…
##  2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixed wing mult…
##  3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixed wing mult…
##  4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixed wing mult…
##  5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixed wing mult…
##  6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixed wing mult…
##  7   2013     1     1     6 EWR    FLL   N516JB  B6        2000 Fixed wing mult…
##  8   2013     1     1     6 LGA    IAD   N829AS  EV        1998 Fixed wing mult…
##  9   2013     1     1     6 JFK    MCO   N593JB  B6        2004 Fixed wing mult…
## 10   2013     1     1     6 LGA    ORD   N3ALAA  AA          NA <NA>            
## # ℹ 336,766 more rows
## # ℹ 6 more variables: manufacturer <chr>, model <chr>, engines <int>,
## #   seats <int>, speed <int>, engine <chr>
#> # A tibble: 336,776 × 16
#>   year.x month   day  hour origin dest  tailnum carrier year.y type             
#>    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>            
#> 1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixed wing multi…
#> 2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixed wing multi…
#> 3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixed wing multi…
#> 4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixed wing multi…
#> 5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixed wing multi…
#> 6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixed wing multi…
#> # ℹ 336,770 more rows
#> # ℹ 6 more variables: manufacturer <chr>, model <chr>, engines <int>,
#> #   seats <int>, speed <int>, engine <chr>
flights2 %>% 
  left_join(airports, c("dest" = "faa"))
## # A tibble: 336,776 × 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Georg…  30.0 -95.3    97
##  2  2013     1     1     5 LGA    IAH   N24211  UA      Georg…  30.0 -95.3    97
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      Miami…  25.8 -80.3     8
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>    NA    NA      NA
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Harts…  33.6 -84.4  1026
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Chica…  42.0 -87.9   668
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Fort …  26.1 -80.2     9
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      Washi…  38.9 -77.5   313
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      Orlan…  28.4 -81.3    96
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      Chica…  42.0 -87.9   668
## # ℹ 336,766 more rows
## # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>
#> # A tibble: 336,776 × 15
#>    year month   day  hour origin dest  tailnum carrier name      lat   lon   alt
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
#> 1  2013     1     1     5 EWR    IAH   N14228  UA      George…  30.0 -95.3    97
#> 2  2013     1     1     5 LGA    IAH   N24211  UA      George…  30.0 -95.3    97
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA      Miami …  25.8 -80.3     8
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>     NA    NA      NA
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL      Hartsf…  33.6 -84.4  1026
#> 6  2013     1     1     5 EWR    ORD   N39463  UA      Chicag…  42.0 -87.9   668
#> # ℹ 336,770 more rows
#> # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>

flights2 %>% 
  left_join(airports, c("origin" = "faa"))
## # A tibble: 336,776 × 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Newar…  40.7 -74.2    18
##  2  2013     1     1     5 LGA    IAH   N24211  UA      La Gu…  40.8 -73.9    22
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      John …  40.6 -73.8    13
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      John …  40.6 -73.8    13
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      La Gu…  40.8 -73.9    22
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Newar…  40.7 -74.2    18
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Newar…  40.7 -74.2    18
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      La Gu…  40.8 -73.9    22
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      John …  40.6 -73.8    13
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      La Gu…  40.8 -73.9    22
## # ℹ 336,766 more rows
## # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>
#> # A tibble: 336,776 × 15
#>    year month   day  hour origin dest  tailnum carrier name      lat   lon   alt
#>   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
#> 1  2013     1     1     5 EWR    IAH   N14228  UA      Newark…  40.7 -74.2    18
#> 2  2013     1     1     5 LGA    IAH   N24211  UA      La Gua…  40.8 -73.9    22
#> 3  2013     1     1     5 JFK    MIA   N619AA  AA      John F…  40.6 -73.8    13
#> 4  2013     1     1     5 JFK    BQN   N804JB  B6      John F…  40.6 -73.8    13
#> 5  2013     1     1     6 LGA    ATL   N668DN  DL      La Gua…  40.8 -73.9    22
#> 6  2013     1     1     5 EWR    ORD   N39463  UA      Newark…  40.7 -74.2    18
#> # ℹ 336,770 more rows
#> # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>

Exercises

  1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays. Here’s an easy way to draw a map of the United States:

airports %>% semi_join(flights, c(“faa” = “dest”)) %>% ggplot(aes(lon, lat)) + borders(“state”) + geom_point() + coord_quickmap()

airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
  borders("state") +
  geom_point() +
  coord_quickmap()

avg_dest_delays <-
  flights %>%
  group_by(dest) %>%
  # arrival delay NA's are cancelled flights
  summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
  inner_join(airports, by = c(dest = "faa"))
#> `summarise()` ungrouping output (override with `.groups` argument)
avg_dest_delays %>%
  ggplot(aes(lon, lat, colour = delay)) +
  borders("state") +
  geom_point() +
  coord_quickmap()

  1. Add the location of the origin and destination (i.e. the lat and lon) to flights
airport_locations <- airports %>%
  select(faa, lat, lon)

flights %>%
  select(year:day, hour, origin, dest) %>%
  left_join(
    airport_locations,
    by = c("origin" = "faa")
  ) %>%
  left_join(
    airport_locations,
    by = c("dest" = "faa")
  )
## # A tibble: 336,776 × 10
##     year month   day  hour origin dest  lat.x lon.x lat.y lon.y
##    <int> <int> <int> <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH    40.7 -74.2  30.0 -95.3
##  2  2013     1     1     5 LGA    IAH    40.8 -73.9  30.0 -95.3
##  3  2013     1     1     5 JFK    MIA    40.6 -73.8  25.8 -80.3
##  4  2013     1     1     5 JFK    BQN    40.6 -73.8  NA    NA  
##  5  2013     1     1     6 LGA    ATL    40.8 -73.9  33.6 -84.4
##  6  2013     1     1     5 EWR    ORD    40.7 -74.2  42.0 -87.9
##  7  2013     1     1     6 EWR    FLL    40.7 -74.2  26.1 -80.2
##  8  2013     1     1     6 LGA    IAD    40.8 -73.9  38.9 -77.5
##  9  2013     1     1     6 JFK    MCO    40.6 -73.8  28.4 -81.3
## 10  2013     1     1     6 LGA    ORD    40.8 -73.9  42.0 -87.9
## # ℹ 336,766 more rows
#> # A tibble: 336,776 x 10
#>    year month   day  hour origin dest  lat.x lon.x lat.y lon.y
#>   <int> <int> <int> <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <dbl>
#> 1  2013     1     1     5 EWR    IAH    40.7 -74.2  30.0 -95.3
#> 2  2013     1     1     5 LGA    IAH    40.8 -73.9  30.0 -95.3
#> 3  2013     1     1     5 JFK    MIA    40.6 -73.8  25.8 -80.3
#> 4  2013     1     1     5 JFK    BQN    40.6 -73.8  NA    NA  
#> 5  2013     1     1     6 LGA    ATL    40.8 -73.9  33.6 -84.4
#> 6  2013     1     1     5 EWR    ORD    40.7 -74.2  42.0 -87.9
#> # … with 336,770 more rows
airport_locations <- airports %>%
  select(faa, lat, lon)

flights %>%
  select(year:day, hour, origin, dest) %>%
  left_join(
    airport_locations,
    by = c("origin" = "faa")
  ) %>%
  left_join(
    airport_locations,
    by = c("dest" = "faa"),
    suffix = c("_origin", "_dest")
    # existing lat and lon variables in tibble gain the _origin suffix
    # new lat and lon variables are given _dest suffix
  )
## # A tibble: 336,776 × 10
##     year month   day  hour origin dest  lat_origin lon_origin lat_dest lon_dest
##    <int> <int> <int> <dbl> <chr>  <chr>      <dbl>      <dbl>    <dbl>    <dbl>
##  1  2013     1     1     5 EWR    IAH         40.7      -74.2     30.0    -95.3
##  2  2013     1     1     5 LGA    IAH         40.8      -73.9     30.0    -95.3
##  3  2013     1     1     5 JFK    MIA         40.6      -73.8     25.8    -80.3
##  4  2013     1     1     5 JFK    BQN         40.6      -73.8     NA       NA  
##  5  2013     1     1     6 LGA    ATL         40.8      -73.9     33.6    -84.4
##  6  2013     1     1     5 EWR    ORD         40.7      -74.2     42.0    -87.9
##  7  2013     1     1     6 EWR    FLL         40.7      -74.2     26.1    -80.2
##  8  2013     1     1     6 LGA    IAD         40.8      -73.9     38.9    -77.5
##  9  2013     1     1     6 JFK    MCO         40.6      -73.8     28.4    -81.3
## 10  2013     1     1     6 LGA    ORD         40.8      -73.9     42.0    -87.9
## # ℹ 336,766 more rows
#> # A tibble: 336,776 x 10
#>    year month   day  hour origin dest  lat_origin lon_origin lat_dest lon_dest
#>   <int> <int> <int> <dbl> <chr>  <chr>      <dbl>      <dbl>    <dbl>    <dbl>
#> 1  2013     1     1     5 EWR    IAH         40.7      -74.2     30.0    -95.3
#> 2  2013     1     1     5 LGA    IAH         40.8      -73.9     30.0    -95.3
#> 3  2013     1     1     5 JFK    MIA         40.6      -73.8     25.8    -80.3
#> 4  2013     1     1     5 JFK    BQN         40.6      -73.8     NA       NA  
#> 5  2013     1     1     6 LGA    ATL         40.8      -73.9     33.6    -84.4
#> 6  2013     1     1     5 EWR    ORD         40.7      -74.2     42.0    -87.9
#> # … with 336,770 more rows

3.Is there a relationship between the age of a plane and its delays? To compare the age of the plane to flights delay, I merge flights with the planes, which contains a variable plane_year, with the year in which the plane was built. To look at the relationship between plane age and departure delay

plane_cohorts <- inner_join(flights,
  select(planes, tailnum, plane_year = year),
  by = "tailnum"
) %>%
  mutate(age = year - plane_year) %>%
  filter(!is.na(age)) %>%
  mutate(age = if_else(age > 25, 25L, age)) %>%
  group_by(age) %>%
  summarise(
    dep_delay_mean = mean(dep_delay, na.rm = TRUE),
    dep_delay_sd = sd(dep_delay, na.rm = TRUE),
    arr_delay_mean = mean(arr_delay, na.rm = TRUE),
    arr_delay_sd = sd(arr_delay, na.rm = TRUE),
    n_arr_delay = sum(!is.na(arr_delay)),
    n_dep_delay = sum(!is.na(dep_delay))
  )
#> `summarise()` ungrouping output (override with `.groups` argument)
ggplot(plane_cohorts, aes(x = age, y = dep_delay_mean)) +
  geom_point() +
  scale_x_continuous("Age of plane (years)", breaks = seq(0, 30, by = 10)) +
  scale_y_continuous("Mean Departure Delay (minutes)")

ggplot(plane_cohorts, aes(x = age, y = arr_delay_mean)) +
  geom_point() +
  scale_x_continuous("Age of Plane (years)", breaks = seq(0, 30, by = 10)) +
  scale_y_continuous("Mean Arrival Delay (minutes)")

  1. What weather conditions make it more likely to see a delay? Almost any amount of precipitation is associated with a delay. However, there is not a strong a trend above 0.02 in. of precipitation.
flight_weather <-
  flights %>%
  inner_join(weather, by = c(
    "origin" = "origin",
    "year" = "year",
    "month" = "month",
    "day" = "day",
    "hour" = "hour"
  ))
flight_weather %>%
  group_by(precip) %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = precip, y = delay)) +
  geom_line() + geom_point()

#> `summarise()` ungrouping output (override with `.groups` argument)
flight_weather %>%
  ungroup() %>%
  mutate(visib_cat = cut_interval(visib, n = 10)) %>%
  group_by(visib_cat) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = visib_cat, y = dep_delay)) +
  geom_point()

#> `summarise()` ungrouping output (override with `.groups` argument)
  1. What happened on June 13, 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather. There was a large series of storms (derechos) in the southeastern US

##Filtering Joins

top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
top_dest
## # A tibble: 10 × 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705
#> # A tibble: 10 × 2
#>   dest      n
#>   <chr> <int>
#> 1 ORD   17283
#> 2 ATL   17215
#> 3 LAX   16174
#> 4 BOS   15508
#> 5 MCO   14082
#> 6 CLT   14064
#> # ℹ 4 more rows
flights %>% 
  filter(dest %in% top_dest$dest)
## # A tibble: 141,145 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      555            600        -5      913            854
##  5  2013     1     1      557            600        -3      838            846
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            559         0      702            706
## 10  2013     1     1      600            600         0      851            858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#> # A tibble: 141,145 × 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
#> 1  2013     1     1      542            540         2      923            850
#> 2  2013     1     1      554            600        -6      812            837
#> 3  2013     1     1      554            558        -4      740            728
#> 4  2013     1     1      555            600        -5      913            854
#> 5  2013     1     1      557            600        -3      838            846
#> 6  2013     1     1      558            600        -2      753            745
#> # ℹ 141,139 more rows
#> # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>% 
  semi_join(top_dest)
## Joining with `by = join_by(dest)`
## # A tibble: 141,145 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      555            600        -5      913            854
##  5  2013     1     1      557            600        -3      838            846
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            559         0      702            706
## 10  2013     1     1      600            600         0      851            858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#> Joining with `by = join_by(dest)`
#> # A tibble: 141,145 × 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
#> 1  2013     1     1      542            540         2      923            850
#> 2  2013     1     1      554            600        -6      812            837
#> 3  2013     1     1      554            558        -4      740            728
#> 4  2013     1     1      555            600        -5      913            854
#> 5  2013     1     1      557            600        -3      838            846
#> 6  2013     1     1      558            600        -2      753            745
#> # ℹ 141,139 more rows
#> # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>% 
  semi_join(top_dest)
## Joining with `by = join_by(dest)`
## # A tibble: 141,145 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      555            600        -5      913            854
##  5  2013     1     1      557            600        -3      838            846
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            559         0      702            706
## 10  2013     1     1      600            600         0      851            858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#> Joining with `by = join_by(dest)`
#> # A tibble: 141,145 × 19
#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
#> 1  2013     1     1      542            540         2      923            850
#> 2  2013     1     1      554            600        -6      812            837
#> 3  2013     1     1      554            558        -4      740            728
#> 4  2013     1     1      555            600        -5      913            854
#> 5  2013     1     1      557            600        -3      838            846
#> 6  2013     1     1      558            600        -2      753            745
#> # ℹ 141,139 more rows
#> # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#> #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#> #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
  anti_join(planes, by = "tailnum") %>%
  count(tailnum, sort = TRUE)
## # A tibble: 722 × 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # ℹ 712 more rows
#> # A tibble: 722 × 2
#>   tailnum     n
#>   <chr>   <int>
#> 1 <NA>     2512
#> 2 N725MQ    575
#> 3 N722MQ    513
#> 4 N723MQ    507
#> 5 N713MQ    483
#> 6 N735MQ    396
#> # ℹ 716 more rows

Exercises

  1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common? (Hint: one variable explains ~90% of the problems.) Flights that have a missing tailnum all have missing values of arr_time, meaning that the flight was canceled. Many of the tail numbers that don’t have a matching value in planes are registered to American Airlines (AA) or Envoy Airlines (MQ). The documentation for planes states

2.Filter flights to only show flights with planes that have flown at least 100 flights.

flights %>%
  filter(!is.na(tailnum)) %>%
  group_by(tailnum) %>%
  mutate(n = n()) %>%
  filter(n >= 100)
## # A tibble: 228,390 × 20
## # Groups:   tailnum [1,217]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      544            545        -1     1004           1022
##  4  2013     1     1      554            558        -4      740            728
##  5  2013     1     1      555            600        -5      913            854
##  6  2013     1     1      557            600        -3      709            723
##  7  2013     1     1      557            600        -3      838            846
##  8  2013     1     1      558            600        -2      849            851
##  9  2013     1     1      558            600        -2      853            856
## 10  2013     1     1      558            600        -2      923            937
## # ℹ 228,380 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, n <int>
#> # A tibble: 228,390 x 20
#> # Groups:   tailnum [1,217]
#>    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
#>   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
#> 1  2013     1     1      517            515         2      830            819
#> 2  2013     1     1      533            529         4      850            830
#> 3  2013     1     1      544            545        -1     1004           1022
#> 4  2013     1     1      554            558        -4      740            728
#> 5  2013     1     1      555            600        -5      913            854
#> 6  2013     1     1      557            600        -3      709            723
#> # … with 228,384 more rows, and 12 more variables: arr_delay <dbl>,
#> #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#> #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
#> #   n <int>

3.Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.

fueleconomy::vehicles %>%
  semi_join(fueleconomy::common, by = c("make", "model"))
## # A tibble: 14,531 × 12
##       id make  model    year class     trans drive   cyl displ fuel    hwy   cty
##    <dbl> <chr> <chr>   <dbl> <chr>     <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
##  1  1833 Acura Integra  1986 Subcompa… Auto… Fron…     4   1.6 Regu…    28    22
##  2  1834 Acura Integra  1986 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  3  3037 Acura Integra  1987 Subcompa… Auto… Fron…     4   1.6 Regu…    28    22
##  4  3038 Acura Integra  1987 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  5  4183 Acura Integra  1988 Subcompa… Auto… Fron…     4   1.6 Regu…    27    22
##  6  4184 Acura Integra  1988 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  7  5303 Acura Integra  1989 Subcompa… Auto… Fron…     4   1.6 Regu…    27    22
##  8  5304 Acura Integra  1989 Subcompa… Manu… Fron…     4   1.6 Regu…    28    23
##  9  6442 Acura Integra  1990 Subcompa… Auto… Fron…     4   1.8 Regu…    24    20
## 10  6443 Acura Integra  1990 Subcompa… Manu… Fron…     4   1.8 Regu…    26    21
## # ℹ 14,521 more rows
#> # A tibble: 14,531 x 12
#>      id make  model   year class   trans   drive     cyl displ fuel    hwy   cty
#>   <dbl> <chr> <chr>  <dbl> <chr>   <chr>   <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
#> 1  1833 Acura Integ…  1986 Subcom… Automa… Front-…     4   1.6 Regu…    28    22
#> 2  1834 Acura Integ…  1986 Subcom… Manual… Front-…     4   1.6 Regu…    28    23
#> 3  3037 Acura Integ…  1987 Subcom… Automa… Front-…     4   1.6 Regu…    28    22
#> 4  3038 Acura Integ…  1987 Subcom… Manual… Front-…     4   1.6 Regu…    28    23
#> 5  4183 Acura Integ…  1988 Subcom… Automa… Front-…     4   1.6 Regu…    27    22
#> 6  4184 Acura Integ…  1988 Subcom… Manual… Front-…     4   1.6 Regu…    28    23
#> # … with 14,525 more rows
fueleconomy::vehicles %>%
  distinct(model, make) %>%
  group_by(model) %>%
  filter(n() > 1) %>%
  arrange(model)
## # A tibble: 126 × 2
## # Groups:   model [60]
##    model               make                  
##    <chr>               <chr>                 
##  1 200                 Audi                  
##  2 200                 Chrysler              
##  3 240 DL/240 GL Wagon Mcevoy Motors         
##  4 240 DL/240 GL Wagon Volvo                 
##  5 300E                Lambda Control Systems
##  6 300E                Mercedes-Benz         
##  7 300SL               J.K. Motors           
##  8 300SL               Mercedes-Benz         
##  9 500SE               Mercedes-Benz         
## 10 500SE               Texas Coach Company   
## # ℹ 116 more rows
#> # A tibble: 126 x 2
#> # Groups:   model [60]
#>   make                   model              
#>   <chr>                  <chr>              
#> 1 Audi                   200                
#> 2 Chrysler               200                
#> 3 Mcevoy Motors          240 DL/240 GL Wagon
#> 4 Volvo                  240 DL/240 GL Wagon
#> 5 Lambda Control Systems 300E               
#> 6 Mercedes-Benz          300E               
#> # … with 120 more rows
fueleconomy::common %>%
  distinct(model, make) %>%
  group_by(model) %>%
  filter(n() > 1) %>%
  arrange(model)
## # A tibble: 8 × 2
## # Groups:   model [3]
##   model     make      
##   <chr>     <chr>     
## 1 Colt      Dodge     
## 2 Colt      Plymouth  
## 3 Truck 2WD Mitsubishi
## 4 Truck 2WD Nissan    
## 5 Truck 2WD Toyota    
## 6 Truck 4WD Mitsubishi
## 7 Truck 4WD Nissan    
## 8 Truck 4WD Toyota
#> # A tibble: 8 x 2
#> # Groups:   model [3]
#>   make       model    
#>   <chr>      <chr>    
#> 1 Dodge      Colt     
#> 2 Plymouth   Colt     
#> 3 Mitsubishi Truck 2WD
#> 4 Nissan     Truck 2WD
#> 5 Toyota     Truck 2WD
#> 6 Mitsubishi Truck 4WD
#> # … with 2 more rows

4.Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patterns?

worst_hours <- flights %>%
  mutate(hour = sched_dep_time %/% 100) %>%
  group_by(origin, year, month, day, hour) %>%
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ungroup() %>%
  arrange(desc(dep_delay)) %>%
  slice(1:48)
## `summarise()` has grouped output by 'origin', 'year', 'month', 'day'. You can
## override using the `.groups` argument.
#> `summarise()` regrouping output by 'origin', 'year', 'month', 'day' (override with `.groups` argument)
weather_most_delayed <- semi_join(weather, worst_hours, 
                                  by = c("origin", "year",
                                         "month", "day", "hour"))
select(weather_most_delayed, temp, wind_speed, precip) %>%
  print(n = 48)
## # A tibble: 48 × 3
##     temp wind_speed precip
##    <dbl>      <dbl>  <dbl>
##  1  27.0      13.8    0   
##  2  28.0      19.6    0   
##  3  28.9      28.8    0   
##  4  33.8       9.21   0.06
##  5  34.0       8.06   0.05
##  6  80.1       8.06   0   
##  7  86        13.8    0   
##  8  73.4       6.90   0.08
##  9  84.0       5.75   0   
## 10  78.8      18.4    0.23
## 11  53.6       0      0   
## 12  60.8      31.1    0.11
## 13  55.4      17.3    0.14
## 14  53.1       9.21   0.01
## 15  55.9      11.5    0.1 
## 16  55.4       8.06   0.15
## 17  57.0      29.9    0   
## 18  33.8      20.7    0.02
## 19  34.0      19.6    0.01
## 20  36.0      21.9    0.01
## 21  37.9      16.1    0   
## 22  32        13.8    0.12
## 23  60.1      33.4    0.14
## 24  60.8      11.5    0.02
## 25  62.1      17.3    0   
## 26  66.9      10.4    0   
## 27  66.9      13.8    0   
## 28  79.0      10.4    0   
## 29  77        16.1    0.07
## 30  75.9      13.8    0   
## 31  82.4       8.06   0   
## 32  86         9.21   0   
## 33  80.1       9.21   0   
## 34  80.6      11.5    0   
## 35  78.1       6.90   0   
## 36  75.2      10.4    0.01
## 37  73.9       5.75   0.03
## 38  73.9       8.06   0   
## 39  75.0       4.60   0   
## 40  75.0       4.60   0.01
## 41  80.1       0      0.01
## 42  80.1       0      0   
## 43  77        10.4    0   
## 44  82.0      10.4    0   
## 45  72.0      13.8    0.3 
## 46  72.0       4.60   0.03
## 47  51.1       4.60   0   
## 48  54.0       6.90   0
#> # A tibble: 48 x 3
#>     temp wind_speed precip
#>    <dbl>      <dbl>  <dbl>
#>  1  27.0      13.8    0   
#>  2  28.0      19.6    0   
#>  3  28.9      28.8    0   
#>  4  33.8       9.21   0.06
#>  5  34.0       8.06   0.05
#>  6  80.1       8.06   0   
#>  7  86        13.8    0   
#>  8  73.4       6.90   0.08
#>  9  84.0       5.75   0   
#> 10  78.8      18.4    0.23
#> 11  53.6       0      0   
#> 12  60.8      31.1    0.11
#> 13  55.4      17.3    0.14
#> 14  53.1       9.21   0.01
#> 15  55.9      11.5    0.1 
#> 16  55.4       8.06   0.15
#> 17  57.0      29.9    0   
#> 18  33.8      20.7    0.02
#> 19  34.0      19.6    0.01
#> 20  36.0      21.9    0.01
#> 21  37.9      16.1    0   
#> 22  32        13.8    0.12
#> 23  60.1      33.4    0.14
#> 24  60.8      11.5    0.02
#> 25  62.1      17.3    0   
#> 26  66.9      10.4    0   
#> 27  66.9      13.8    0   
#> 28  79.0      10.4    0   
#> 29  77        16.1    0.07
#> 30  75.9      13.8    0   
#> 31  82.4       8.06   0   
#> 32  86         9.21   0   
#> 33  80.1       9.21   0   
#> 34  80.6      11.5    0   
#> 35  78.1       6.90   0   
#> 36  75.2      10.4    0.01
#> 37  73.9       5.75   0.03
#> 38  73.9       8.06   0   
#> 39  75.0       4.60   0   
#> 40  75.0       4.60   0.01
#> 41  80.1       0      0.01
#> 42  80.1       0      0   
#> 43  77        10.4    0   
#> 44  82.0      10.4    0   
#> 45  72.0      13.8    0.3 
#> 46  72.0       4.60   0.03
#> 47  51.1       4.60   0   
#> 48  54.0       6.90   0
ggplot(weather_most_delayed, aes(x = precip, y = wind_speed, color = temp)) +
  geom_point()

5.What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?

anti_join(flights, airports, by = c("dest" = "faa")) %>% 
  distinct(dest)
## # A tibble: 4 × 1
##   dest 
##   <chr>
## 1 BQN  
## 2 SJU  
## 3 STT  
## 4 PSE
#> # A tibble: 4 x 1
#>   dest 
#>   <chr>
#> 1 BQN  
#> 2 SJU  
#> 3 STT  
#> 4 PSE
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 × 8
##    faa   name                             lat    lon   alt    tz dst   tzone    
##    <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>    
##  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/…
##  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/…
##  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/…
##  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/…
##  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/…
##  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/…
##  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/…
##  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/…
##  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/…
## 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/…
## # ℹ 1,347 more rows
#> # A tibble: 1,357 x 8
#>   faa   name                          lat   lon   alt    tz dst   tzone         
#>   <chr> <chr>                       <dbl> <dbl> <dbl> <dbl> <chr> <chr>         
#> 1 04G   Lansdowne Airport            41.1 -80.6  1044    -5 A     America/New_Y…
#> 2 06A   Moton Field Municipal Airp…  32.5 -85.7   264    -6 A     America/Chica…
#> 3 06C   Schaumburg Regional          42.0 -88.1   801    -6 A     America/Chica…
#> 4 06N   Randall Airport              41.4 -74.4   523    -5 A     America/New_Y…
#> 5 09J   Jekyll Island Airport        31.1 -81.4    11    -5 A     America/New_Y…
#> 6 0A9   Elizabethton Municipal Air…  36.4 -82.2  1593    -5 A     America/New_Y…
#> # … with 1,351 more rows
  1. You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above.
planes_carriers <-
  flights %>%
  filter(!is.na(tailnum)) %>%
  distinct(tailnum, carrier)
planes_carriers %>%
  count(tailnum) %>%
  filter(n > 1) %>%
  nrow()
## [1] 17
#> [1] 17
carrier_transfer_tbl <- planes_carriers %>%
  # keep only planes which have flown for more than one airline
  group_by(tailnum) %>%
  filter(n() > 1) %>%
  # join with airlines to get airline names
  left_join(airlines, by = "carrier") %>%
  arrange(tailnum, carrier)

carrier_transfer_tbl
## # A tibble: 34 × 3
## # Groups:   tailnum [17]
##    tailnum carrier name                    
##    <chr>   <chr>   <chr>                   
##  1 N146PQ  9E      Endeavor Air Inc.       
##  2 N146PQ  EV      ExpressJet Airlines Inc.
##  3 N153PQ  9E      Endeavor Air Inc.       
##  4 N153PQ  EV      ExpressJet Airlines Inc.
##  5 N176PQ  9E      Endeavor Air Inc.       
##  6 N176PQ  EV      ExpressJet Airlines Inc.
##  7 N181PQ  9E      Endeavor Air Inc.       
##  8 N181PQ  EV      ExpressJet Airlines Inc.
##  9 N197PQ  9E      Endeavor Air Inc.       
## 10 N197PQ  EV      ExpressJet Airlines Inc.
## # ℹ 24 more rows
#> # A tibble: 34 x 3
#> # Groups:   tailnum [17]
#>   carrier tailnum name                    
#>   <chr>   <chr>   <chr>                   
#> 1 9E      N146PQ  Endeavor Air Inc.       
#> 2 EV      N146PQ  ExpressJet Airlines Inc.
#> 3 9E      N153PQ  Endeavor Air Inc.       
#> 4 EV      N153PQ  ExpressJet Airlines Inc.
#> 5 9E      N176PQ  Endeavor Air Inc.       
#> 6 EV      N176PQ  ExpressJet Airlines Inc.
#> # … with 28 more rows

Join Problems

airports %>% count(alt, lon) %>% filter(n > 1)
## # A tibble: 0 × 3
## # ℹ 3 variables: alt <dbl>, lon <dbl>, n <int>
#> # A tibble: 0 × 3
#> # ℹ 3 variables: alt <dbl>, lon <dbl>, n <int>

Set operations

df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)
df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)
intersect(df1, df2)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
#> # A tibble: 1 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     1

# Note that we get 3 rows, not 4
union(df1, df2)
## # A tibble: 3 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     1     2
#> # A tibble: 3 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     1
#> 2     2     1
#> 3     1     2

setdiff(df1, df2)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     2     1
#> # A tibble: 1 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     2     1

setdiff(df2, df1)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     2
#> # A tibble: 1 × 2
#>       x     y
#>   <dbl> <dbl>
#> 1     1     2

Celtics 2025 nba champs