Joins and dplyr practice

Load the "nycflights13" package. You’ll also need to load dplyr or tidyverse, if you didn’t above.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages("nycflights13")
library(nycflights13)
my_flights <- flights # store my own version
my_flights <- na.omit(my_flights) ## this dataset has data on cancelled flights too, 
                                  ## so we 'll remove those for now
my_flights %>% head()
## # A tibble: 6 × 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#my_flights %>% summary()
#any(is.na(my_flights))

Create a dataframe of the average arrival delays for each destination

arr_delay <- my_flights %>% group_by(dest) %>% summarise(avg_delay = mean(arr_delay ))

Then use left_join() to join on the “airports” dataframe, which has the airport information.

#airports
arr_delay <- inner_join(arr_delay, airports, join_by(dest == faa))
arr_delay
## # A tibble: 100 × 9
##    dest  avg_delay name                       lat    lon   alt    tz dst   tzone
##    <chr>     <dbl> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>
##  1 ABQ        4.38 Albuquerque Internation…  35.0 -107.   5355    -7 A     Amer…
##  2 ACK        4.85 Nantucket Mem             41.3  -70.1    48    -5 A     Amer…
##  3 ALB       14.4  Albany Intl               42.7  -73.8   285    -5 A     Amer…
##  4 ANC       -2.5  Ted Stevens Anchorage I…  61.2 -150.    152    -9 A     Amer…
##  5 ATL       11.3  Hartsfield Jackson Atla…  33.6  -84.4  1026    -5 A     Amer…
##  6 AUS        6.02 Austin Bergstrom Intl     30.2  -97.7   542    -6 A     Amer…
##  7 AVL        8.00 Asheville Regional Airp…  35.4  -82.5  2165    -5 A     Amer…
##  8 BDL        7.05 Bradley Intl              41.9  -72.7   173    -5 A     Amer…
##  9 BGR        8.03 Bangor Intl               44.8  -68.8   192    -5 A     Amer…
## 10 BHM       16.9  Birmingham Intl           33.6  -86.8   644    -6 A     Amer…
## # ℹ 90 more rows

Note here: the column names don’t match anymore!

Which airport had the largest average arrival delay?

largest_avg_arrival_delay <- arr_delay %>% summarise(Airport = arr_delay$name.x, max_delay = max(avg_delay))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `Airport = arr_delay$name.x`.
## Caused by warning:
## ! Unknown or uninitialised column: `name.x`.
largest_avg_arrival_delay
## # A tibble: 1 × 1
##   max_delay
##       <dbl>
## 1      41.8

Which airline had the smallest average departure delay?

To answer this, you need to create a dataframe of the average departure delay for each airline, then use left_join() to join on the “airlines” dataframe

smallest_departure <- my_flights %>% group_by(carrier) %>% summarise(avg_depart_delay = mean(dep_delay))
smallest_departure
## # A tibble: 16 × 2
##    carrier avg_depart_delay
##    <chr>              <dbl>
##  1 9E                 16.4 
##  2 AA                  8.57
##  3 AS                  5.83
##  4 B6                 13.0 
##  5 DL                  9.22
##  6 EV                 19.8 
##  7 F9                 20.2 
##  8 FL                 18.6 
##  9 HA                  4.90
## 10 MQ                 10.4 
## 11 OO                 12.6 
## 12 UA                 12.0 
## 13 US                  3.74
## 14 VX                 12.8 
## 15 WN                 17.7 
## 16 YV                 18.9
smallest_departure <- left_join(smallest_departure, airlines, by = "carrier")
smallest_departure
## # A tibble: 16 × 3
##    carrier avg_depart_delay name                       
##    <chr>              <dbl> <chr>                      
##  1 9E                 16.4  Endeavor Air Inc.          
##  2 AA                  8.57 American Airlines Inc.     
##  3 AS                  5.83 Alaska Airlines Inc.       
##  4 B6                 13.0  JetBlue Airways            
##  5 DL                  9.22 Delta Air Lines Inc.       
##  6 EV                 19.8  ExpressJet Airlines Inc.   
##  7 F9                 20.2  Frontier Airlines Inc.     
##  8 FL                 18.6  AirTran Airways Corporation
##  9 HA                  4.90 Hawaiian Airlines Inc.     
## 10 MQ                 10.4  Envoy Air                  
## 11 OO                 12.6  SkyWest Airlines Inc.      
## 12 UA                 12.0  United Air Lines Inc.      
## 13 US                  3.74 US Airways Inc.            
## 14 VX                 12.8  Virgin America             
## 15 WN                 17.7  Southwest Airlines Co.     
## 16 YV                 18.9  Mesa Airlines Inc.
smallest_avg_depart_airport <- smallest_departure %>% slice_min(avg_depart_delay)
smallest_avg_depart_airport
## # A tibble: 1 × 3
##   carrier avg_depart_delay name           
##   <chr>              <dbl> <chr>          
## 1 US                  3.74 US Airways Inc.

Examples from R4DS:

top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
top_dest
## # A tibble: 10 × 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705

Now you want to find each flight that went to one of those destinations. You could, of course, filter. But! Now you can semi-join:

flights %>% 
  semi_join(top_dest)
## Joining with `by = join_by(dest)`
## # A tibble: 141,145 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      555            600        -5      913            854
##  5  2013     1     1      557            600        -3      838            846
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            559         0      702            706
## 10  2013     1     1      600            600         0      851            858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?

# return all information of flights where dest not match with faa in airports
anti_join(flights, airports, by = c("dest" = "faa"))
## # A tibble: 7,602 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      615            615         0     1039           1100
##  3  2013     1     1      628            630        -2     1137           1140
##  4  2013     1     1      701            700         1     1123           1154
##  5  2013     1     1      711            715        -4     1151           1206
##  6  2013     1     1      820            820         0     1254           1310
##  7  2013     1     1      820            820         0     1249           1329
##  8  2013     1     1      840            845        -5     1311           1350
##  9  2013     1     1      909            810        59     1331           1315
## 10  2013     1     1      913            918        -5     1346           1416
## # ℹ 7,592 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# return all information of airlines where faa not match with dest in flights
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 × 8
##    faa   name                             lat    lon   alt    tz dst   tzone    
##    <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>    
##  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/…
##  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/…
##  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/…
##  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/…
##  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/…
##  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/…
##  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/…
##  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/…
##  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/…
## 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/…
## # ℹ 1,347 more rows