Introduction

NYC Flights 13

library(nycflights13)

Keys

Mutating Joins

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)
inner_join(x, y)
## Joining, by = "key"
## # A tibble: 2 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2

Outer Join

left_join(x, y, by = "key")
## # A tibble: 3 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>
right_join(x, y)
## Joining, by = "key"
## # A tibble: 3 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     4 <NA>  y3
full_join(x, y)
## Joining, by = "key"
## # A tibble: 4 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## 4     4 <NA>  y3

Defining the key columns

flights %>% left_join (planes)
## Joining, by = c("year", "tailnum")
## # A tibble: 336,776 × 26
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 336,766 more rows, 16 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, type <chr>, manufacturer <chr>,
## #   model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>, and
## #   abbreviated variable names ¹​sched_dep_time, ²​dep_delay, ³​arr_time,
## #   ⁴​sched_arr_time, ⁵​arr_delay
flights %>% left_join (planes, by = "tailnum")
## # A tibble: 336,776 × 27
##    year.x month   day dep_time sched_d…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##     <int> <int> <int>    <int>     <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1   2013     1     1      517       515       2     830     819      11 UA     
##  2   2013     1     1      533       529       4     850     830      20 UA     
##  3   2013     1     1      542       540       2     923     850      33 AA     
##  4   2013     1     1      544       545      -1    1004    1022     -18 B6     
##  5   2013     1     1      554       600      -6     812     837     -25 DL     
##  6   2013     1     1      554       558      -4     740     728      12 UA     
##  7   2013     1     1      555       600      -5     913     854      19 B6     
##  8   2013     1     1      557       600      -3     709     723     -14 EV     
##  9   2013     1     1      557       600      -3     838     846      -8 B6     
## 10   2013     1     1      558       600      -2     753     745       8 AA     
## # … with 336,766 more rows, 17 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, year.y <int>, type <chr>,
## #   manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
## #   engine <chr>, and abbreviated variable names ¹​sched_dep_time, ²​dep_delay,
## #   ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

Filtering Joins

semi_join(x, y)
## Joining, by = "key"
## # A tibble: 2 × 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2
semi_join(y, x)
## Joining, by = "key"
## # A tibble: 2 × 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2
anti_join(x, y)
## Joining, by = "key"
## # A tibble: 1 × 2
##     key val_x
##   <dbl> <chr>
## 1     3 x3
anti_join(y, x)
## Joining, by = "key"
## # A tibble: 1 × 2
##     key val_y
##   <dbl> <chr>
## 1     4 y3

Join Problems

airports %>% count(name) %>% arrange(desc(n))
## # A tibble: 1,440 × 2
##    name                             n
##    <chr>                        <int>
##  1 Municipal Airport                5
##  2 All Airports                     3
##  3 Capital City Airport             2
##  4 Dillingham                       2
##  5 Douglas Municipal Airport        2
##  6 Executive                        2
##  7 Grand Canyon West Airport        2
##  8 Jefferson County Intl            2
##  9 Marshfield Municipal Airport     2
## 10 Penn Station                     2
## # … with 1,430 more rows
left_join (x, y)
## Joining, by = "key"
## # A tibble: 3 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>

Set Operations

df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)
df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)
intersect(df1, df2)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
union(df1, df2)
## # A tibble: 3 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     1     2
setdiff(df1, df2)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     2     1