Practicing joins with R

Exercises using R’s `tidyverse` and the `nycflights13` dataset

library(tidyverse)
library(nycflights13)
library(glue)

Dataset preparation

Before dealing with the data, I will create a *_dt (datetime) column with precise date and time of the flight departure and arrival dates and times, for both scheduled and real ones.

# function to create datetimes from a military time column (522 --> 17:22 hours) 
create_dt <- function(dataframe, time){
    dataframe |>
    mutate(datetime = make_datetime(
      year = year, month = month, day = day,
      hour = {{time}} %/% 100,
      min = {{time}} %% 100)
    ) |> 
    pull(datetime)
}

I will make some changes and reorder the columns.

flights <- flights |>
  mutate(dep_dt = create_dt(flights, dep_time),
         arr_dt = create_dt(flights, arr_time),
         sched_dep_dt = create_dt(flights, sched_dep_time),
         sched_arr_dt = create_dt(flights, sched_arr_time),
    .before=dep_time) |> 
  select(-c(dep_time, arr_time, sched_arr_time, sched_dep_time, time_hour)) |> 
  select(origin, dest, flight, tailnum, carrier, year:day, hour, minute,
         sched_dep_dt, dep_dt, sched_arr_dt, arr_dt,
         dep_delay:distance)

Level 1: Mutating Joins & Equijoins

These focus on adding information from one table to another using exact key matches.

1. Carrier names

Use a join to add the full name of the airline (from the airlines table) to the flights table. Which carrier has the highest average departure delay?

flights |> left_join(airlines) |> 
  rename(carrier_code = carrier, carrier = name) |> 
  group_by(carrier) |> 
  summarise(avg_dep_delay = round( mean(dep_delay, na.rm = T), 2)) |> 
  arrange(-avg_dep_delay) |> 
  head(1)

## # A tibble: 1 × 2
##   carrier                avg_dep_delay
##   <chr>                          <dbl>
## 1 Frontier Airlines Inc.          20.2

2. Airports metadata

Join flights and airports so that you only keep flights where the destination airport has a corresponding entry in the airports table. Why might some flights be dropped? (Hint: check the dest and faa columns).

flights |> inner_join(airports, join_by(dest==faa))

## # A tibble: 329,174 × 25
##    origin dest  flight tailnum carrier  year month   day  hour minute
##    <chr>  <chr>  <int> <chr>   <chr>   <int> <int> <int> <dbl>  <dbl>
##  1 EWR    IAH     1545 N14228  UA       2013     1     1     5     15
##  2 LGA    IAH     1714 N24211  UA       2013     1     1     5     29
##  3 JFK    MIA     1141 N619AA  AA       2013     1     1     5     40
##  4 LGA    ATL      461 N668DN  DL       2013     1     1     6      0
##  5 EWR    ORD     1696 N39463  UA       2013     1     1     5     58
##  6 EWR    FLL      507 N516JB  B6       2013     1     1     6      0
##  7 LGA    IAD     5708 N829AS  EV       2013     1     1     6      0
##  8 JFK    MCO       79 N593JB  B6       2013     1     1     6      0
##  9 LGA    ORD      301 N3ALAA  AA       2013     1     1     6      0
## 10 JFK    PBI       49 N793JB  B6       2013     1     1     6      0
## # ℹ 329,164 more rows
## # ℹ 15 more variables: sched_dep_dt <dttm>, dep_dt <dttm>, sched_arr_dt <dttm>,
## #   arr_dt <dttm>, dep_delay <dbl>, arr_delay <dbl>, air_time <dbl>,
## #   distance <dbl>, name <chr>, lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>,
## #   dst <chr>, tzone <chr>

total_flights <- nrow(flights)
valid_faa_destinations_count <- nrow(flights |> inner_join(airports, join_by(dest == faa)))
glue("There are a total of {total_flights} flights.")

## There are a total of 336776 flights.

glue("Only {valid_faa_destinations_count} ({round(valid_faa_destinations_count / total_flights * 100,2)}%) of them have a valid destination according to the FAA code.")

## Only 329174 (97.74%) of them have a valid destination according to the FAA code.

3. Average delay versus plane age

Join flights with the planes table. Calculate the age of the plane at the time of each flight (Flight Year - Manufacture Year) and find the correlation between plane age and arr_delay.

data <- flights |> left_join(planes, join_by(tailnum),
                             suffix=c("_flight", "_plane"))|>
  select(flight, dep_dt, arr_delay, tailnum, year_flight, year_plane) |> 
  mutate(plane_age = year_flight - year_plane) |> 
  select(-c(year_plane, year_flight)) |> 
  relocate(arr_delay, .after=last_col()) |> 
  arrange(dep_dt, flight)

data

## # A tibble: 336,776 × 5
##    flight dep_dt              tailnum plane_age arr_delay
##     <int> <dttm>              <chr>       <int>     <dbl>
##  1   1545 2013-01-01 05:17:00 N14228         14        11
##  2   1714 2013-01-01 05:33:00 N24211         15        20
##  3   1141 2013-01-01 05:42:00 N619AA         23        33
##  4    725 2013-01-01 05:44:00 N804JB          1       -18
##  5    461 2013-01-01 05:54:00 N668DN         22       -25
##  6   1696 2013-01-01 05:54:00 N39463          1        12
##  7    507 2013-01-01 05:55:00 N516JB         13        19
##  8     79 2013-01-01 05:57:00 N593JB          9        -8
##  9   5708 2013-01-01 05:57:00 N829AS         15       -14
## 10     49 2013-01-01 05:58:00 N793JB          2        -2
## # ℹ 336,766 more rows

data |> group_by(plane_age) |>
  summarise(avg_arrival_delay = mean(arr_delay, na.rm=T)) |> 
  ggplot(aes(x=plane_age, y=avg_arrival_delay))+
  geom_point(color="blue") +
  geom_smooth(method="lm", se=F, color="red") +
  scale_y_continuous(limits = c(-5,15)) +
  labs(x="Plane age", y="average arrival delay, minutes") +
  theme_light()

4. Missing weather records

Use a full_join between a subset of weather (e.g., only Jan 1st) and a subset of flights (only Jan 1st). Identify if there are any hours where we have weather data but no flights, or flights but no weather data.

flights |> filter(month==1 & day==1) |>
  select(flight, month, day, dep_dt) |> 
  full_join(weather|> filter(month==1 & day==1)) |>
  select( -year, -hour) |> 
  filter(if_any(everything(), is.na)) |> 
  select(-c(month, day, wind_gust))

## # A tibble: 34,626 × 12
##    flight dep_dt              origin  temp  dewp humid wind_dir wind_speed
##     <int> <dttm>              <chr>  <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1   1545 2013-01-01 05:17:00 EWR     39.0  26.1  59.4      270      10.4 
##  2   1545 2013-01-01 05:17:00 EWR     39.0  27.0  61.6      250       8.06
##  3   1545 2013-01-01 05:17:00 EWR     39.0  28.0  64.4      240      11.5 
##  4   1545 2013-01-01 05:17:00 EWR     39.9  28.0  62.2      250      12.7 
##  5   1545 2013-01-01 05:17:00 EWR     39.0  28.0  64.4      260      12.7 
##  6   1545 2013-01-01 05:17:00 EWR     37.9  28.0  67.2      240      11.5 
##  7   1545 2013-01-01 05:17:00 EWR     39.0  28.0  64.4      240      15.0 
##  8   1545 2013-01-01 05:17:00 EWR     39.9  28.0  62.2      250      10.4 
##  9   1545 2013-01-01 05:17:00 EWR     39.9  28.0  62.2      260      15.0 
## 10   1545 2013-01-01 05:17:00 EWR     41    28.0  59.6      260      13.8 
## # ℹ 34,616 more rows
## # ℹ 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>

5. Multi-key equijoins

Join flights and weather using all common time variables (year, month, day, hour) and the origin airport. Calculate the average delay for flights departing in “high wind” conditions (e.g., wind speed > 20).

flights |> inner_join(weather, join_by(year, month, day, hour, origin)) |>
  filter(wind_speed > 20) |> 
  summarise(avg_dep_delay_minutes = round(mean(dep_delay, na.rm=T),2) )

## # A tibble: 1 × 1
##   avg_dep_delay_minutes
##                   <dbl>
## 1                  16.9

Level 2: Filtering Joins

These focus on using one table to filter another without adding new columns.

6. Top 10 most popular destinations

Create a table of the top 10 most frequent destination airports. Use a semi_join to filter the flights table to only include trips to these top 10 destinations.

most_frequent_destinations <- flights |> count(dest, sort=T, name="flights") |> head(10)
most_frequent_destinations

## # A tibble: 10 × 2
##    dest  flights
##    <chr>   <int>
##  1 ORD     17283
##  2 ATL     17215
##  3 LAX     16174
##  4 BOS     15508
##  5 MCO     14082
##  6 CLT     14064
##  7 SFO     13331
##  8 FLL     12055
##  9 MIA     11728
## 10 DCA      9705

flights |> semi_join(most_frequent_destinations) |> 
  select(flight, origin, dest, tailnum, dep_dt) |> 
  arrange(dep_dt, flight)

## # A tibble: 141,145 × 5
##    flight origin dest  tailnum dep_dt             
##     <int> <chr>  <chr> <chr>   <dttm>             
##  1   1141 JFK    MIA   N619AA  2013-01-01 05:42:00
##  2    461 LGA    ATL   N668DN  2013-01-01 05:54:00
##  3   1696 EWR    ORD   N39463  2013-01-01 05:54:00
##  4    507 EWR    FLL   N516JB  2013-01-01 05:55:00
##  5     79 JFK    MCO   N593JB  2013-01-01 05:57:00
##  6    194 JFK    LAX   N29129  2013-01-01 05:58:00
##  7    301 LGA    ORD   N3ALAA  2013-01-01 05:58:00
##  8   1124 EWR    SFO   N53441  2013-01-01 05:58:00
##  9   1806 JFK    BOS   N708JB  2013-01-01 05:59:00
## 10    371 LGA    FLL   N595JB  2013-01-01 06:00:00
## # ℹ 141,135 more rows

7. Checking data integrity, using anti-joins

Use an anti_join to find all flights that go to a destination (dest) that is not listed in the airports table. How many unique destinations are missing from the metadata?

flights |> anti_join(airports, join_by(dest==faa)) |> 
  select(flight, tailnum, origin, dest, dep_dt) |> 
  arrange(dep_dt, flight)

## # A tibble: 7,602 × 5
##    flight tailnum origin dest  dep_dt             
##     <int> <chr>   <chr>  <chr> <dttm>             
##  1    725 N804JB  JFK    BQN   2013-01-01 05:44:00
##  2    709 N794JB  JFK    SJU   2013-01-01 06:15:00
##  3    413 N3BAAA  JFK    SJU   2013-01-01 06:28:00
##  4   1203 N77296  EWR    SJU   2013-01-01 07:01:00
##  5    715 N651JB  JFK    SJU   2013-01-01 07:11:00
##  6    301 N900PC  JFK    SJU   2013-01-01 08:20:00
##  7    717 N527JB  JFK    SJU   2013-01-01 08:20:00
##  8   1357 N5FSAA  JFK    SJU   2013-01-01 08:40:00
##  9    655 N5EXAA  JFK    STT   2013-01-01 09:09:00
## 10   1519 N24715  EWR    STT   2013-01-01 09:13:00
## # ℹ 7,592 more rows

How many unique destinations are missing from the metadata? (destinations which are not listed in the airports dataset).

flights |> anti_join(airports, join_by(dest == faa)) |> 
  distinct(dest)

## # A tibble: 4 × 1
##   dest 
##   <chr>
## 1 BQN  
## 2 SJU  
## 3 STT  
## 4 PSE

Level 3: Non-Equi Joins

Note: These require dplyr 1.1.0 or later using the join_by() function.

8. Weather records prior to flight departure, using inequality joins

For every flight, find all weather observations at the origin airport that occurred up to 5 hours before the flight’s departure. (Use join_by(origin, time_hour > time_hour_weather)).

# weather dataset only records the datetime rounded to hours
# I need to mutate the dep_dt from flights to round to hours
flights |> mutate(dep_dt = round_date(dep_dt, unit="hour"),
                  last_48h = dep_dt - hours(5)) |>
  select(flight, dep_dt, origin, last_48h) |> 
  left_join(weather, join_by(origin,
                             dep_dt >= time_hour,
                             last_48h <= time_hour)
) |> 
  select(-c(last_48h, year, month, day, hour)) |> 
  select(flight, origin, dep_dt, weather_dt = time_hour,
         temp:last_col())

## # A tibble: 1,970,678 × 13
##    flight origin dep_dt              weather_dt           temp  dewp humid
##     <int> <chr>  <dttm>              <dttm>              <dbl> <dbl> <dbl>
##  1   1545 EWR    2013-01-01 05:00:00 NA                   NA    NA    NA  
##  2   1714 LGA    2013-01-01 06:00:00 2013-01-01 01:00:00  39.9  26.1  57.3
##  3   1141 JFK    2013-01-01 06:00:00 2013-01-01 01:00:00  39.0  26.1  59.4
##  4    725 JFK    2013-01-01 06:00:00 2013-01-01 01:00:00  39.0  26.1  59.4
##  5    461 LGA    2013-01-01 06:00:00 2013-01-01 01:00:00  39.9  26.1  57.3
##  6   1696 EWR    2013-01-01 06:00:00 2013-01-01 01:00:00  39.0  26.1  59.4
##  7    507 EWR    2013-01-01 06:00:00 2013-01-01 01:00:00  39.0  26.1  59.4
##  8   5708 LGA    2013-01-01 06:00:00 2013-01-01 01:00:00  39.9  26.1  57.3
##  9     79 JFK    2013-01-01 06:00:00 2013-01-01 01:00:00  39.0  26.1  59.4
## 10    301 LGA    2013-01-01 06:00:00 2013-01-01 01:00:00  39.9  26.1  57.3
## # ℹ 1,970,668 more rows
## # ℹ 6 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>

9. Rolling/range joins, using overlap

Sometimes weather isn’t recorded exactly on the hour. For each flight, join the weather data where the weather timestamp is within a 30-minute window (before or after) of the flight’s scheduled time_hour.

flights |>
  mutate(
    # Create a proper datetime object
    before_30min = sched_dep_dt - minutes(30),
    after_30min = sched_dep_dt + minutes(30)) |> 
  left_join(weather,
            join_by(origin,
                    before_30min <= time_hour,
                    after_30min >= time_hour))  |> 
  select(flight, dep_dt, weather_dt = time_hour,
         !ends_with(c(".x", ".y", "time", "min")))

## # A tibble: 370,484 × 23
##    flight dep_dt              weather_dt          origin dest  tailnum carrier
##     <int> <dttm>              <dttm>              <chr>  <chr> <chr>   <chr>  
##  1   1545 2013-01-01 05:17:00 NA                  EWR    IAH   N14228  UA     
##  2   1714 2013-01-01 05:33:00 NA                  LGA    IAH   N24211  UA     
##  3   1141 2013-01-01 05:42:00 2013-01-01 01:00:00 JFK    MIA   N619AA  AA     
##  4    725 2013-01-01 05:44:00 2013-01-01 01:00:00 JFK    BQN   N804JB  B6     
##  5    461 2013-01-01 05:54:00 2013-01-01 01:00:00 LGA    ATL   N668DN  DL     
##  6   1696 2013-01-01 05:54:00 2013-01-01 01:00:00 EWR    ORD   N39463  UA     
##  7    507 2013-01-01 05:55:00 2013-01-01 01:00:00 EWR    FLL   N516JB  B6     
##  8   5708 2013-01-01 05:57:00 2013-01-01 01:00:00 LGA    IAD   N829AS  EV     
##  9     79 2013-01-01 05:57:00 2013-01-01 01:00:00 JFK    MCO   N593JB  B6     
## 10    301 2013-01-01 05:58:00 2013-01-01 01:00:00 LGA    ORD   N3ALAA  AA     
## # ℹ 370,474 more rows
## # ℹ 16 more variables: minute <dbl>, sched_dep_dt <dttm>, sched_arr_dt <dttm>,
## #   arr_dt <dttm>, dep_delay <dbl>, arr_delay <dbl>, distance <dbl>,
## #   temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>,
## #   wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>

10. Records for next flight, using self joins

For each flight, find the next flight departing from the same origin. Use a non-equi join where flight_A$time_hour < flight_B$time_hour.

flights |>
  mutate(
    next_flight_within_15min = dep_dt + minutes(15)
    ) |> 
  select(origin, flight, tailnum, dep_dt, next_flight_within_15min) |> 
  left_join(flights, join_by(origin,
                    dep_dt < dep_dt,
                    next_flight_within_15min > dep_dt),
            suffix=c("", "_next")) |> 
  select(origin, flight, tailnum, dep_dt,
         flight_next, tailnum_next, dep_dt_next)

## # A tibble: 1,468,999 × 7
##    origin flight tailnum dep_dt              flight_next tailnum_next
##    <chr>   <int> <chr>   <dttm>                    <int> <chr>       
##  1 EWR      1545 N14228  2013-01-01 05:17:00          NA <NA>        
##  2 LGA      1714 N24211  2013-01-01 05:33:00          NA <NA>        
##  3 JFK      1141 N619AA  2013-01-01 05:42:00         725 N804JB      
##  4 JFK       725 N804JB  2013-01-01 05:44:00          79 N593JB      
##  5 JFK       725 N804JB  2013-01-01 05:44:00          49 N793JB      
##  6 JFK       725 N804JB  2013-01-01 05:44:00          71 N657JB      
##  7 JFK       725 N804JB  2013-01-01 05:44:00         194 N29129      
##  8 LGA       461 N668DN  2013-01-01 05:54:00        5708 N829AS      
##  9 LGA       461 N668DN  2013-01-01 05:54:00         301 N3ALAA      
## 10 LGA       461 N668DN  2013-01-01 05:54:00         707 N3DUAA      
## # ℹ 1,468,989 more rows
## # ℹ 1 more variable: dep_dt_next <dttm>

Calculate the time difference between consecutive departures at JFK.

flights_JFK <- flights |> filter(origin == "JFK") |> 
  mutate(
    next_dt_within_15min = dep_dt + minutes(30)
    ) |>
  select(flight, tailnum, dep_dt, next_dt_within_15min)

flights_JFK |> left_join(flights_JFK, join_by(dep_dt < dep_dt,
                                              next_dt_within_15min >= dep_dt),
                         suffix=c("", "_next")) |>
  arrange(dep_dt, flight, dep_dt_next) |> 
  select(!starts_with("next_dt_within_15min"))

## # A tibble: 1,010,452 × 6
##    flight tailnum dep_dt              flight_next tailnum_next
##     <int> <chr>   <dttm>                    <int> <chr>       
##  1   1141 N619AA  2013-01-01 05:42:00         725 N804JB      
##  2   1141 N619AA  2013-01-01 05:42:00          79 N593JB      
##  3   1141 N619AA  2013-01-01 05:42:00          49 N793JB      
##  4   1141 N619AA  2013-01-01 05:42:00          71 N657JB      
##  5   1141 N619AA  2013-01-01 05:42:00         194 N29129      
##  6   1141 N619AA  2013-01-01 05:42:00        1806 N708JB      
##  7   1141 N619AA  2013-01-01 05:42:00        1743 N3739P      
##  8   1141 N619AA  2013-01-01 05:42:00         303 N532UA      
##  9    725 N804JB  2013-01-01 05:44:00          79 N593JB      
## 10    725 N804JB  2013-01-01 05:44:00          49 N793JB      
## # ℹ 1,010,442 more rows
## # ℹ 1 more variable: dep_dt_next <dttm>