Introduction
nycflights13
Keys
Mutating joins
Inner Join
# Two small example tables sharing a `key` column: keys 1 and 2 occur in both
# tables, key 3 occurs only in `x`, and key 4 occurs only in `y`.
x <- tribble(
  ~key, ~val_x,
  1, "x1",
  2, "x2",
  3, "x3"
)
y <- tribble(
  ~key, ~val_y,
  1, "y1",
  2, "y2",
  4, "y3"
)
# Keep only the rows whose key is present in both tables.
inner_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
Outer Join
# Keep every row of `x`; columns coming from `y` are NA where no key matches.
left_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 3 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
# Keep every row of `y`; columns coming from `x` are NA where no key matches.
right_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 3 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 4 <NA> y3
# Keep every row of both tables, filling the unmatched side with NA.
full_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 4 × 3
## key val_x val_y
## <dbl> <chr> <chr>
## 1 1 x1 y1
## 2 2 x2 y2
## 3 3 x3 <NA>
## 4 4 <NA> y3
Defining the key columns
# The key columns have different names in the two tables, so they are paired
# explicitly: `airports$faa` is matched against `flights$dest`. The semi-join
# keeps only airports that appear as a flight destination, which are then
# plotted by longitude/latitude on top of the state borders.
airports %>%
  semi_join(flights, by = c("faa" = "dest")) %>%
  ggplot(aes(x = lon, y = lat)) +
  borders("state") +
  geom_point() +
  coord_quickmap()

Filtering joins
# Rows of `x` that have a match in `y`; no columns from `y` are added.
semi_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 2
## key val_x
## <dbl> <chr>
## 1 1 x1
## 2 2 x2
# Rows of `y` that have a match in `x`; no columns from `x` are added.
semi_join(y, x)
## Joining with `by = join_by(key)`
## # A tibble: 2 × 2
## key val_y
## <dbl> <chr>
## 1 1 y1
## 2 2 y2
# Rows of `x` that have no match in `y`.
anti_join(x, y)
## Joining with `by = join_by(key)`
## # A tibble: 1 × 2
## key val_x
## <dbl> <chr>
## 1 3 x3
# Rows of `y` that have no match in `x`.
anti_join(y, x)
## Joining with `by = join_by(key)`
## # A tibble: 1 × 2
## key val_y
## <dbl> <chr>
## 1 4 y3
Join problems
# Count airports per (alt, lon) pair and keep the pairs that occur more than
# once; an empty result means the pair identifies each airport uniquely.
airports %>%
  count(alt, lon) %>%
  filter(n > 1)
## # A tibble: 0 × 3
## # ℹ 3 variables: alt <dbl>, lon <dbl>, n <int>
#> # A tibble: 0 × 3
#> # ℹ 3 variables: alt <dbl>, lon <dbl>, n <int>
Set operations
# Two tables with identical columns; they have exactly one row in common
# (x = 1, y = 1).
df1 <- tribble(
  ~x, ~y,
  1, 1,
  2, 1
)
df2 <- tribble(
  ~x, ~y,
  1, 1,
  1, 2
)
# Rows that appear in both `df1` and `df2`.
intersect(df1, df2)
## # A tibble: 1 × 2
## x y
## <dbl> <dbl>
## 1 1 1
#> # A tibble: 1 × 2
#> x y
#> <dbl> <dbl>
#> 1 1 1
# Rows that appear in either table. Note that we get 3 rows, not 4: the row
# shared by `df1` and `df2` is returned only once.
union(df1, df2)
## # A tibble: 3 × 2
## x y
## <dbl> <dbl>
## 1 1 1
## 2 2 1
## 3 1 2
#> # A tibble: 3 × 2
#> x y
#> <dbl> <dbl>
#> 1 1 1
#> 2 2 1
#> 3 1 2
# Rows of `df1` that do not appear in `df2`.
setdiff(df1, df2)
## # A tibble: 1 × 2
## x y
## <dbl> <dbl>
## 1 2 1
#> # A tibble: 1 × 2
#> x y
#> <dbl> <dbl>
#> 1 2 1
# Rows of `df2` that do not appear in `df1`.
setdiff(df2, df1)
## # A tibble: 1 × 2
## x y
## <dbl> <dbl>
## 1 1 2
#> # A tibble: 1 × 2
#> x y
#> <dbl> <dbl>
#> 1 1 2