library(nycflights13)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#Mutating joins
flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier)
flights2 %>%
left_join(airlines)
## Joining with `by = join_by(carrier)`
## # A tibble: 336,776 × 9
## year month day hour origin dest tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 LGA IAH N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 JFK MIA N619AA AA American Airlines Inc.
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlue Airways
## 5 2013 1 1 6 LGA ATL N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 EWR ORD N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 EWR FLL N516JB B6 JetBlue Airways
## 8 2013 1 1 6 LGA IAD N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 JFK MCO N593JB B6 JetBlue Airways
## 10 2013 1 1 6 LGA ORD N3ALAA AA American Airlines Inc.
## # ℹ 336,766 more rows
flights2 %>% left_join(weather)
## Joining with `by = join_by(year, month, day, hour, origin)`
## # A tibble: 336,776 × 18
## year month day hour origin dest tailnum carrier temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.9 25.0 54.8
## # ℹ 336,766 more rows
## # ℹ 7 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>
flights2 %>% left_join(planes, by = "tailnum")
## # A tibble: 336,776 × 16
## year.x month day hour origin dest tailnum carrier year.y type
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixed wing mult…
## 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixed wing mult…
## 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixed wing mult…
## 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixed wing mult…
## 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixed wing mult…
## 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixed wing mult…
## 7 2013 1 1 6 EWR FLL N516JB B6 2000 Fixed wing mult…
## 8 2013 1 1 6 LGA IAD N829AS EV 1998 Fixed wing mult…
## 9 2013 1 1 6 JFK MCO N593JB B6 2004 Fixed wing mult…
## 10 2013 1 1 6 LGA ORD N3ALAA AA NA <NA>
## # ℹ 336,766 more rows
## # ℹ 6 more variables: manufacturer <chr>, model <chr>, engines <int>,
## # seats <int>, speed <int>, engine <chr>
flights2 %>% left_join(airports, c("dest" = "faa"))
## # A tibble: 336,776 × 15
## year month day hour origin dest tailnum carrier name lat lon alt
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Georg… 30.0 -95.3 97
## 2 2013 1 1 5 LGA IAH N24211 UA Georg… 30.0 -95.3 97
## 3 2013 1 1 5 JFK MIA N619AA AA Miami… 25.8 -80.3 8
## 4 2013 1 1 5 JFK BQN N804JB B6 <NA> NA NA NA
## 5 2013 1 1 6 LGA ATL N668DN DL Harts… 33.6 -84.4 1026
## 6 2013 1 1 5 EWR ORD N39463 UA Chica… 42.0 -87.9 668
## 7 2013 1 1 6 EWR FLL N516JB B6 Fort … 26.1 -80.2 9
## 8 2013 1 1 6 LGA IAD N829AS EV Washi… 38.9 -77.5 313
## 9 2013 1 1 6 JFK MCO N593JB B6 Orlan… 28.4 -81.3 96
## 10 2013 1 1 6 LGA ORD N3ALAA AA Chica… 42.0 -87.9 668
## # ℹ 336,766 more rows
## # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>
df1 <- tibble(x = c(1, 2), y = 2:1)
df2 <- tibble(x = c(3, 1), a = 10, b = "a")
df1 %>% inner_join(df2) %>% knitr::kable()
## Joining with `by = join_by(x)`
| x | y | a | b |
|---|---|---|---|
| 1 | 2 | 10 | a |
df1 %>% left_join(df2)
## Joining with `by = join_by(x)`
## # A tibble: 2 × 4
## x y a b
## <dbl> <int> <dbl> <chr>
## 1 1 2 10 a
## 2 2 1 NA <NA>
df1 %>% right_join(df2)
## Joining with `by = join_by(x)`
## # A tibble: 2 × 4
## x y a b
## <dbl> <int> <dbl> <chr>
## 1 1 2 10 a
## 2 3 NA 10 a
df1 %>% full_join(df2)
## Joining with `by = join_by(x)`
## # A tibble: 3 × 4
## x y a b
## <dbl> <int> <dbl> <chr>
## 1 1 2 10 a
## 2 2 1 NA <NA>
## 3 3 NA 10 a
df1 <- tibble(x = c(1, 1, 2), y = 1:3)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))
df1 %>% left_join(df2)
## Joining with `by = join_by(x)`
## Warning in left_join(., df2): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## # A tibble: 5 × 3
## x y z
## <dbl> <int> <chr>
## 1 1 1 a
## 2 1 1 b
## 3 1 2 a
## 4 1 2 b
## 5 2 3 a
#Filtering Joins
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
## # A tibble: 722 × 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # ℹ 712 more rows
df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))
df1 %>% inner_join(df2, by = "x") %>% nrow()
## Warning in inner_join(., df2, by = "x"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
## [1] 4
df1 %>% semi_join(df2, by = "x") %>% nrow()
## [1] 2
df1 %>% nrow()
## [1] 4
#Set Operations
(df1 <- tibble(x = 1:2, y = c(1L, 1L)))
## # A tibble: 2 × 2
## x y
## <int> <int>
## 1 1 1
## 2 2 1
(df2 <- tibble(x = 1:2, y = 1:2))
## # A tibble: 2 × 2
## x y
## <int> <int>
## 1 1 1
## 2 2 2
intersect(df1, df2)
## # A tibble: 1 × 2
## x y
## <int> <int>
## 1 1 1
union(df1, df2)
## # A tibble: 3 × 2
## x y
## <int> <int>
## 1 1 1
## 2 2 1
## 3 2 2
setdiff(df1, df2)
## # A tibble: 1 × 2
## x y
## <int> <int>
## 1 2 1
setdiff(df2, df1)
## # A tibble: 1 × 2
## x y
## <int> <int>
## 1 2 2
Line 15-17 are controlling the table match with the default manner, which uses all variables in both tables
Line 19 controls the table join with character vector argument “by =”x” ” to only join by common variables An example would be joining a table of people with jobs, where the job column was the job title, and a table of projects with jobs, where the job column was an assignment name, and you would want to join the table on the jobs column, but maybe you would want to join by manager name columns for both tables
Line 21 shows you can join tables rows that have the same character vector value
for line 24, df1 has a row with the values 2,1, and df2 has a row with the values 3,10,a but they don’t match and don’t get joined Line 25 shows the left join which includes all observations of x in the argument (x,y) Line 26 shows the left join which includes all observations of y in the argument (x,y) Line 27 shows the left join which includes all observations from x and y in (x,y)
These joins are called outerjoins Line 32 shows that a join will add all combinitations of duplicate matches, going from 3 rows in df1 to 5 rows in the joined result
Line 34 shows using anti-join to find mismatches, when two tables should have matching values. An example is if you had a data set of two regions of a large companies employees, and the employer field was not the same for each entry. The way semi-join would work would be to join the data sets and match by the same name, to find employees that are listed as being in both region, when they should just be in one
Lines 45-50 show set operations in action, and a set is basically a list of unique values Explanation of the set functions are below: intersect(x, y): return only observations in both x and y union(x, y): return unique observations in x and y setdiff(x, y): return observations in x, but not in y. setdiff(y, x): return observations in y, but not in x.