library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)

band <- tribble(
   ~name,     ~band,
  "Mick",  "Stones",
  "John", "Beatles",
  "Paul", "Beatles"
)

instrument <- tribble(
    ~name,   ~plays,
   "John", "guitar",
   "Paul",   "bass",
  "Keith", "guitar"
)

instrument2 <- tribble(
    ~artist,   ~plays,
   "John", "guitar",
   "Paul",   "bass",
  "Keith", "guitar"
)

nycflights13

#View(flights)
#View(airlines)

mutating joins

band %>% left_join(instrument, by = "name")
## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass
band %>% right_join(instrument, by = "name")
## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar
band %>% full_join(instrument, by = "name")
## # A tibble: 4 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar
band %>% inner_join(instrument, by = "name")
## # A tibble: 2 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

Your Turn 1

Which airlines had the largest arrival delays? Complete the code below.

  1. Join airlines to flights
  2. Compute and order the average arrival delays by airline. Display full names, no codes.

(Hint: Be sure to remove each _ before running the code)

flights %>%
  filter(!is.na(arr_delay)) %>%
  left_join(airlines, by = "carrier") %>%
  group_by(name) %>%
  summarise(delay = mean(arr_delay)) %>%
  arrange(delay)
## # A tibble: 16 × 2
##    name                         delay
##    <chr>                        <dbl>
##  1 Alaska Airlines Inc.        -9.93 
##  2 Hawaiian Airlines Inc.      -6.92 
##  3 American Airlines Inc.       0.364
##  4 Delta Air Lines Inc.         1.64 
##  5 Virgin America               1.76 
##  6 US Airways Inc.              2.13 
##  7 United Air Lines Inc.        3.56 
##  8 Endeavor Air Inc.            7.38 
##  9 JetBlue Airways              9.46 
## 10 Southwest Airlines Co.       9.65 
## 11 Envoy Air                   10.8  
## 12 SkyWest Airlines Inc.       11.9  
## 13 Mesa Airlines Inc.          15.6  
## 14 ExpressJet Airlines Inc.    15.8  
## 15 AirTran Airways Corporation 20.1  
## 16 Frontier Airlines Inc.      21.9

Your Turn 2

Join flights and airports by dest and faa.

Then for each name, compute the distance from NYC and the average arr_delay. Hint: use first() to get the first value of distance.

Order by average delay, worst to best.

(Hint: Be sure to remove each _ before running the code)

flights %>%
  filter(!is.na(arr_delay)) %>%
  left_join(airports, by = c("dest" = "faa")) %>%
  group_by(name) %>%
  summarise(distance = first(distance),
  delay = mean(arr_delay)) %>%
  arrange(desc(delay))
## # A tibble: 101 × 3
##    name                          distance delay
##    <chr>                            <dbl> <dbl>
##  1 Columbia Metropolitan              602  41.8
##  2 Tulsa Intl                        1215  33.7
##  3 Will Rogers World                 1325  30.6
##  4 Jackson Hole Airport              1874  28.1
##  5 Mc Ghee Tyson                      631  24.1
##  6 Dane Co Rgnl Truax Fld             799  20.2
##  7 Richmond Intl                      277  20.1
##  8 Akron Canton Regional Airport      397  19.7
##  9 Des Moines Intl                   1017  19.0
## 10 Gerald R Ford Intl                 605  18.2
## # … with 91 more rows

filtering joins

band %>% semi_join(instrument, by = "name")
## # A tibble: 2 × 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles
band %>% anti_join(instrument, by = "name")
## # A tibble: 1 × 2
##   name  band  
##   <chr> <chr> 
## 1 Mick  Stones

Your Turn 3

How many airports in airports are serviced by flights in flights? (i.e. how many places can you fly to direct from New York?)

Notice that the column to join on is named faa in the airports data set and dest in the flights data set.

airports %>%
  semi_join(flights, by = c("faa" = "dest")) %>%
  select(faa)
## # A tibble: 101 × 1
##    faa  
##    <chr>
##  1 ABQ  
##  2 ACK  
##  3 ALB  
##  4 ANC  
##  5 ATL  
##  6 AUS  
##  7 AVL  
##  8 BDL  
##  9 BGR  
## 10 BHM  
## # … with 91 more rows

Take aways