HW: Extra Credit

library(nycflights13)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#Mutating joins
flights2 <- flights %>% select(year:day, hour, origin, dest, tailnum, carrier)
flights2 %>% 
  left_join(airlines)

## Joining with `by = join_by(carrier)`

## # A tibble: 336,776 × 9
##     year month   day  hour origin dest  tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 EWR    IAH   N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 LGA    IAH   N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 EWR    ORD   N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      American Airlines Inc.  
## # ℹ 336,766 more rows

flights2 %>% left_join(weather)

## Joining with `by = join_by(year, month, day, hour, origin)`

## # A tibble: 336,776 × 18
##     year month   day  hour origin dest  tailnum carrier  temp  dewp humid
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
##  2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
##  3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
##  4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
##  5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
##  6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4
##  7  2013     1     1     6 EWR    FLL   N516JB  B6       37.9  28.0  67.2
##  8  2013     1     1     6 LGA    IAD   N829AS  EV       39.9  25.0  54.8
##  9  2013     1     1     6 JFK    MCO   N593JB  B6       37.9  27.0  64.3
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA       39.9  25.0  54.8
## # ℹ 336,766 more rows
## # ℹ 7 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>

flights2 %>% left_join(planes, by = "tailnum")

## # A tibble: 336,776 × 16
##    year.x month   day  hour origin dest  tailnum carrier year.y type            
##     <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>           
##  1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixed wing mult…
##  2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixed wing mult…
##  3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixed wing mult…
##  4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixed wing mult…
##  5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixed wing mult…
##  6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixed wing mult…
##  7   2013     1     1     6 EWR    FLL   N516JB  B6        2000 Fixed wing mult…
##  8   2013     1     1     6 LGA    IAD   N829AS  EV        1998 Fixed wing mult…
##  9   2013     1     1     6 JFK    MCO   N593JB  B6        2004 Fixed wing mult…
## 10   2013     1     1     6 LGA    ORD   N3ALAA  AA          NA <NA>            
## # ℹ 336,766 more rows
## # ℹ 6 more variables: manufacturer <chr>, model <chr>, engines <int>,
## #   seats <int>, speed <int>, engine <chr>

flights2 %>% left_join(airports, c("dest" = "faa"))

## # A tibble: 336,776 × 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Georg…  30.0 -95.3    97
##  2  2013     1     1     5 LGA    IAH   N24211  UA      Georg…  30.0 -95.3    97
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      Miami…  25.8 -80.3     8
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>    NA    NA      NA
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Harts…  33.6 -84.4  1026
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Chica…  42.0 -87.9   668
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Fort …  26.1 -80.2     9
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      Washi…  38.9 -77.5   313
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      Orlan…  28.4 -81.3    96
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      Chica…  42.0 -87.9   668
## # ℹ 336,766 more rows
## # ℹ 3 more variables: tz <dbl>, dst <chr>, tzone <chr>

df1 <- tibble(x = c(1, 2), y = 2:1)
df2 <- tibble(x = c(3, 1), a = 10, b = "a")
df1 %>% inner_join(df2) %>% knitr::kable()

## Joining with `by = join_by(x)`

x	y	a	b
1	2	10	a

df1 %>% left_join(df2)

## Joining with `by = join_by(x)`

## # A tibble: 2 × 4
##       x     y     a b    
##   <dbl> <int> <dbl> <chr>
## 1     1     2    10 a    
## 2     2     1    NA <NA>

df1 %>% right_join(df2)

## Joining with `by = join_by(x)`

## # A tibble: 2 × 4
##       x     y     a b    
##   <dbl> <int> <dbl> <chr>
## 1     1     2    10 a    
## 2     3    NA    10 a

df1 %>% full_join(df2)

## Joining with `by = join_by(x)`

## # A tibble: 3 × 4
##       x     y     a b    
##   <dbl> <int> <dbl> <chr>
## 1     1     2    10 a    
## 2     2     1    NA <NA> 
## 3     3    NA    10 a

df1 <- tibble(x = c(1, 1, 2), y = 1:3)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))

df1 %>% left_join(df2)

## Joining with `by = join_by(x)`

## Warning in left_join(., df2): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## # A tibble: 5 × 3
##       x     y z    
##   <dbl> <int> <chr>
## 1     1     1 a    
## 2     1     1 b    
## 3     1     2 a    
## 4     1     2 b    
## 5     2     3 a

#Filtering Joins
flights %>% 
  anti_join(planes, by = "tailnum") %>% 
  count(tailnum, sort = TRUE)

## # A tibble: 722 × 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # ℹ 712 more rows

df1 <- tibble(x = c(1, 1, 3, 4), y = 1:4)
df2 <- tibble(x = c(1, 1, 2), z = c("a", "b", "a"))
df1 %>% inner_join(df2, by = "x") %>% nrow()

## Warning in inner_join(., df2, by = "x"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 1 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## [1] 4

df1 %>% semi_join(df2, by = "x") %>% nrow()

## [1] 2

df1 %>% nrow()

## [1] 4

#Set Operations
(df1 <- tibble(x = 1:2, y = c(1L, 1L)))

## # A tibble: 2 × 2
##       x     y
##   <int> <int>
## 1     1     1
## 2     2     1

(df2 <- tibble(x = 1:2, y = 1:2))

## # A tibble: 2 × 2
##       x     y
##   <int> <int>
## 1     1     1
## 2     2     2

intersect(df1, df2)

## # A tibble: 1 × 2
##       x     y
##   <int> <int>
## 1     1     1

union(df1, df2)

## # A tibble: 3 × 2
##       x     y
##   <int> <int>
## 1     1     1
## 2     2     1
## 3     2     2

setdiff(df1, df2)

## # A tibble: 1 × 2
##       x     y
##   <int> <int>
## 1     2     1

setdiff(df2, df1)

## # A tibble: 1 × 2
##       x     y
##   <int> <int>
## 1     2     2

Line 15-17 are controlling the table match with the default manner, which uses all variables in both tables

Line 19 controls the table join with character vector argument “by =”x” ” to only join by common variables An example would be joining a table of people with jobs, where the job column was the job title, and a table of projects with jobs, where the job column was an assignment name, and you would want to join the table on the jobs column, but maybe you would want to join by manager name columns for both tables

Line 21 shows you can join tables rows that have the same character vector value

for line 24, df1 has a row with the values 2,1, and df2 has a row with the values 3,10,a but they don’t match and don’t get joined Line 25 shows the left join which includes all observations of x in the argument (x,y) Line 26 shows the left join which includes all observations of y in the argument (x,y) Line 27 shows the left join which includes all observations from x and y in (x,y)

These joins are called outerjoins Line 32 shows that a join will add all combinitations of duplicate matches, going from 3 rows in df1 to 5 rows in the joined result

Line 34 shows using anti-join to find mismatches, when two tables should have matching values. An example is if you had a data set of two regions of a large companies employees, and the employer field was not the same for each entry. The way semi-join would work would be to join the data sets and match by the same name, to find employees that are listed as being in both region, when they should just be in one

Lines 45-50 show set operations in action, and a set is basically a list of unique values Explanation of the set functions are below: intersect(x, y): return only observations in both x and y union(x, y): return unique observations in x and y setdiff(x, y): return observations in x, but not in y. setdiff(y, x): return observations in y, but not in x.

HW: Extra Credit

Pierre Aba

2023-08-18