Introduction
nycflights13
airlines
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
airports
## # A tibble: 1,458 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
## # ℹ 1,448 more rows
planes
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 2 N102UW 1998 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 3 N103US 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 4 N104UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 5 N10575 2002 Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 6 N105UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 7 N107US 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 8 N108UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 9 N109UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 10 N110UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## # ℹ 3,312 more rows
weather
## # A tibble: 26,115 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # ℹ 26,105 more rows
## # ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
Keys
# A primary key uniquely identifies an observation in its own table. For example, planes$tailnum is a primary key because it uniquely identifies each plane in the planes table.
# A foreign key uniquely identifies an observation in another table. For example, flights$tailnum is a foreign key because it appears in the flights table where it matches each flight to a unique plane.
# A variable can be both a primary key and a foreign key. For example, origin is part of the weather primary key, and is also a foreign key for the airports table.
Mutating Joins
Inner Join
x <- tribble(
~key, ~val_x,
1, "x1",
2, "x2",
3, "x3"
)
y <- tribble(
~key, ~val_y,
1, "y1",
2, "y2",
4, "y3"
)
inner_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
Outer Join
left_join(x, y, by = "key")
## # A tibble: 3 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
right_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 3 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 4 <NA> y3
full_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 4 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## 4 4 <NA> y3
Defining the Key Columns
flights %>% left_join(planes)
## Joining with `by = join_by(year, tailnum)`
## # A tibble: 336,776 × 26
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 18 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, type <chr>, manufacturer <chr>,
## # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>
flights %>% left_join(planes, by = "tailnum")
## # A tibble: 336,776 × 27
## year.x month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 19 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, year.y <int>, type <chr>,
## # manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
## # engine <chr>
Filtering Joins
semi_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
semi_join(y, x)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 2
## key val_y
## <dbl> <chr>
## 1 1 y1
## 2 2 y2
anti_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 1 × 2
## key val_x
## <dbl> <chr>
## 1 3 x3
anti_join(y, x)
## Joining with `by = join_by(key)`
## # A tibble: 1 × 2
## key val_y
## <dbl> <chr>
## 1 4 y3
Join Problems
airports %>% count(faa)
## # A tibble: 1,458 × 2
## faa n
## <chr> <int>
## 1 04G 1
## 2 06A 1
## 3 06C 1
## 4 06N 1
## 5 09J 1
## 6 0A9 1
## 7 0G6 1
## 8 0G7 1
## 9 0P2 1
## 10 0S9 1
## # ℹ 1,448 more rows
airports %>% count(faa) %>% arrange(desc(n))
## # A tibble: 1,458 × 2
## faa n
## <chr> <int>
## 1 04G 1
## 2 06A 1
## 3 06C 1
## 4 06N 1
## 5 09J 1
## 6 0A9 1
## 7 0G6 1
## 8 0G7 1
## 9 0P2 1
## 10 0S9 1
## # ℹ 1,448 more rows
airports %>% count(name) %>% arrange(desc(n))
## # A tibble: 1,440 × 2
## name n
## <chr> <int>
## 1 Municipal Airport 5
## 2 All Airports 3
## 3 Capital City Airport 2
## 4 Dillingham 2
## 5 Douglas Municipal Airport 2
## 6 Executive 2
## 7 Grand Canyon West Airport 2
## 8 Jefferson County Intl 2
## 9 Marshfield Municipal Airport 2
## 10 Penn Station 2
## # ℹ 1,430 more rows
airports %>% count(lat, lon) %>% arrange(desc(n))
## # A tibble: 1,458 × 3
## lat lon n
## <dbl> <dbl> <int>
## 1 19.7 -155. 1
## 2 19.7 -156. 1
## 3 19.8 -156. 1
## 4 19.9 -156. 1
## 5 20.0 -156. 1
## 6 20.3 -156. 1
## 7 20.8 -157. 1
## 8 20.8 -156. 1
## 9 20.9 -156. 1
## 10 21.0 -157. 1
## # ℹ 1,448 more rows
left_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 3 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
Set Operations
df1 <- tribble(
~x, ~y,
1, 1,
2, 1
)
df2 <- tribble(
~x, ~y,
1, 1,
1, 2
)
intersect(df1, df2)
## # A tibble: 1 × 2
## x y
## <dbl> <dbl>
## 1 1 1
union(df1, df2)
## # A tibble: 3 × 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 2 1
## 3 1 2
setdiff(df1, df2)
## # A tibble: 1 × 2
## x y
## <dbl> <dbl>
## 1 2 1
setdiff(df2, df1)
## # A tibble: 1 × 2
## x y
## <dbl> <dbl>
## 1 1 2