Introduction

nycflights13

airlines
## # A tibble: 16 × 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.
airports
## # A tibble: 1,458 × 8
##    faa   name                             lat    lon   alt    tz dst   tzone    
##    <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>    
##  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/…
##  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/…
##  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/…
##  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/…
##  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/…
##  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/…
##  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/…
##  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/…
##  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/…
## 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/…
## # ℹ 1,448 more rows
planes
## # A tibble: 3,322 × 9
##    tailnum  year type              manufacturer model engines seats speed engine
##    <chr>   <int> <chr>             <chr>        <chr>   <int> <int> <int> <chr> 
##  1 N10156   2004 Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  2 N102UW   1998 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  3 N103US   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  4 N104UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  5 N10575   2002 Fixed wing multi… EMBRAER      EMB-…       2    55    NA Turbo…
##  6 N105UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  7 N107US   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  8 N108UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
##  9 N109UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
## 10 N110UW   1999 Fixed wing multi… AIRBUS INDU… A320…       2   182    NA Turbo…
## # ℹ 3,312 more rows
weather
## # A tibble: 26,115 × 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
##  2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
##  3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
##  4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
##  5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
##  6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
##  7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0 
##  8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4 
##  9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0 
## 10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8 
## # ℹ 26,105 more rows
## # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, time_hour <dttm>

Keys

# A primary key uniquely identifies an observation in its own table. For example, planes$tailnum is a primary key because it uniquely identifies each plane in the planes table.

# A foreign key uniquely identifies an observation in another table. For example, flights$tailnum is a foreign key because it appears in the flights table where it matches each flight to a unique plane.

# A variable can be both a primary key and a foreign key. For example, origin is part of the weather primary key, and is also a foreign key for the airports table.

Mutating Joins

Inner Join

x <- tribble(
  ~key, ~val_x,
     1, "x1",
     2, "x2",
     3, "x3"
)
y <- tribble(
  ~key, ~val_y,
     1, "y1",
     2, "y2",
     4, "y3"
)

inner_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2

Outer Join

left_join(x, y, by = "key")
## # A tibble: 3 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>
right_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 3 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     4 <NA>  y3
full_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 4 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA> 
## 4     4 <NA>  y3

Defining the Key Columns

flights %>% left_join(planes)
## Joining with `by = join_by(year, tailnum)`
## # A tibble: 336,776 × 26
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 18 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, type <chr>, manufacturer <chr>,
## #   model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>
flights %>% left_join(planes, by = "tailnum")
## # A tibble: 336,776 × 27
##    year.x month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##     <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1   2013     1     1      517            515         2      830            819
##  2   2013     1     1      533            529         4      850            830
##  3   2013     1     1      542            540         2      923            850
##  4   2013     1     1      544            545        -1     1004           1022
##  5   2013     1     1      554            600        -6      812            837
##  6   2013     1     1      554            558        -4      740            728
##  7   2013     1     1      555            600        -5      913            854
##  8   2013     1     1      557            600        -3      709            723
##  9   2013     1     1      557            600        -3      838            846
## 10   2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 19 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, year.y <int>, type <chr>,
## #   manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
## #   engine <chr>

Filtering Joins

semi_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 2
##     key val_x
##   <dbl> <chr>
## 1     1 x1   
## 2     2 x2
semi_join(y, x)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 2
##     key val_y
##   <dbl> <chr>
## 1     1 y1   
## 2     2 y2
anti_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 1 × 2
##     key val_x
##   <dbl> <chr>
## 1     3 x3
anti_join(y, x)
## Joining with `by = join_by(key)`
## # A tibble: 1 × 2
##     key val_y
##   <dbl> <chr>
## 1     4 y3

Join Problems

airports %>% count(faa)
## # A tibble: 1,458 × 2
##    faa       n
##    <chr> <int>
##  1 04G       1
##  2 06A       1
##  3 06C       1
##  4 06N       1
##  5 09J       1
##  6 0A9       1
##  7 0G6       1
##  8 0G7       1
##  9 0P2       1
## 10 0S9       1
## # ℹ 1,448 more rows
airports %>% count(faa) %>% arrange(desc(n))
## # A tibble: 1,458 × 2
##    faa       n
##    <chr> <int>
##  1 04G       1
##  2 06A       1
##  3 06C       1
##  4 06N       1
##  5 09J       1
##  6 0A9       1
##  7 0G6       1
##  8 0G7       1
##  9 0P2       1
## 10 0S9       1
## # ℹ 1,448 more rows
airports %>% count(name) %>% arrange(desc(n))
## # A tibble: 1,440 × 2
##    name                             n
##    <chr>                        <int>
##  1 Municipal Airport                5
##  2 All Airports                     3
##  3 Capital City Airport             2
##  4 Dillingham                       2
##  5 Douglas Municipal Airport        2
##  6 Executive                        2
##  7 Grand Canyon West Airport        2
##  8 Jefferson County Intl            2
##  9 Marshfield Municipal Airport     2
## 10 Penn Station                     2
## # ℹ 1,430 more rows
airports %>% count(lat, lon) %>% arrange(desc(n))
## # A tibble: 1,458 × 3
##      lat   lon     n
##    <dbl> <dbl> <int>
##  1  19.7 -155.     1
##  2  19.7 -156.     1
##  3  19.8 -156.     1
##  4  19.9 -156.     1
##  5  20.0 -156.     1
##  6  20.3 -156.     1
##  7  20.8 -157.     1
##  8  20.8 -156.     1
##  9  20.9 -156.     1
## 10  21.0 -157.     1
## # ℹ 1,448 more rows
left_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 3 × 3
##     key val_x val_y
##   <dbl> <chr> <chr>
## 1     1 x1    y1   
## 2     2 x2    y2   
## 3     3 x3    <NA>

Set Operations

df1 <- tribble(
  ~x, ~y,
   1,  1,
   2,  1
)
df2 <- tribble(
  ~x, ~y,
   1,  1,
   1,  2
)
intersect(df1, df2)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
union(df1, df2)
## # A tibble: 3 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     1
## 2     2     1
## 3     1     2
setdiff(df1, df2)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     2     1
setdiff(df2, df1)
## # A tibble: 1 × 2
##       x     y
##   <dbl> <dbl>
## 1     1     2