library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
band <- tribble(
~name, ~band,
"Mick", "Stones",
"John", "Beatles",
"Paul", "Beatles"
)
instrument <- tribble(
~name, ~plays,
"John", "guitar",
"Paul", "bass",
"Keith", "guitar"
)
instrument2 <- tribble(
~artist, ~plays,
"John", "guitar",
"Paul", "bass",
"Keith", "guitar"
)
#View(flights)
#View(airlines)
band %>% left_join(instrument, by = "name")
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
band %>% right_join(instrument, by = "name")
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
## 3 Keith <NA> guitar
band %>% full_join(instrument, by = "name")
## # A tibble: 4 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
## 4 Keith <NA> guitar
band %>% inner_join(instrument, by = "name")
## # A tibble: 2 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
Which airlines had the largest arrival delays? Complete the code below.
airlines
to flights
(Hint: Be sure to remove each _
before running the code)
flights %>%
filter(!is.na(arr_delay)) %>%
left_join(airlines, by = "carrier") %>%
group_by(name) %>%
summarise(delay = mean(arr_delay)) %>%
arrange(delay)
## # A tibble: 16 × 2
## name delay
## <chr> <dbl>
## 1 Alaska Airlines Inc. -9.93
## 2 Hawaiian Airlines Inc. -6.92
## 3 American Airlines Inc. 0.364
## 4 Delta Air Lines Inc. 1.64
## 5 Virgin America 1.76
## 6 US Airways Inc. 2.13
## 7 United Air Lines Inc. 3.56
## 8 Endeavor Air Inc. 7.38
## 9 JetBlue Airways 9.46
## 10 Southwest Airlines Co. 9.65
## 11 Envoy Air 10.8
## 12 SkyWest Airlines Inc. 11.9
## 13 Mesa Airlines Inc. 15.6
## 14 ExpressJet Airlines Inc. 15.8
## 15 AirTran Airways Corporation 20.1
## 16 Frontier Airlines Inc. 21.9
Join flights
and airports
by dest
and faa
.
Then for each name
, compute the distance
from NYC and the average arr_delay
. Hint: use first()
to get the first value of distance.
Order by average delay, worst to best.
(Hint: Be sure to remove each _
before running the code)
flights %>%
filter(!is.na(arr_delay)) %>%
left_join(airports, by = c("dest" = "faa")) %>%
group_by(name) %>%
summarise(distance = first(distance),
delay = mean(arr_delay)) %>%
arrange(desc(delay))
## # A tibble: 101 × 3
## name distance delay
## <chr> <dbl> <dbl>
## 1 Columbia Metropolitan 602 41.8
## 2 Tulsa Intl 1215 33.7
## 3 Will Rogers World 1325 30.6
## 4 Jackson Hole Airport 1874 28.1
## 5 Mc Ghee Tyson 631 24.1
## 6 Dane Co Rgnl Truax Fld 799 20.2
## 7 Richmond Intl 277 20.1
## 8 Akron Canton Regional Airport 397 19.7
## 9 Des Moines Intl 1017 19.0
## 10 Gerald R Ford Intl 605 18.2
## # … with 91 more rows
band %>% semi_join(instrument, by = "name")
## # A tibble: 2 × 2
## name band
## <chr> <chr>
## 1 John Beatles
## 2 Paul Beatles
band %>% anti_join(instrument, by = "name")
## # A tibble: 1 × 2
## name band
## <chr> <chr>
## 1 Mick Stones
How many airports in airports
are serviced by flights in flights
? (i.e. how many places can you fly to direct from New York?)
Notice that the column to join on is named faa
in the airports data set and dest
in the flights data set.
airports %>%
semi_join(flights, by = c("faa" = "dest")) %>%
select(faa)
## # A tibble: 101 × 1
## faa
## <chr>
## 1 ABQ
## 2 ACK
## 3 ALB
## 4 ANC
## 5 ATL
## 6 AUS
## 7 AVL
## 8 BDL
## 9 BGR
## 10 BHM
## # … with 91 more rows
left_join()
retains all cases in left data set
right_join()
retains all cases in right data set
full_join()
retains all cases in either data set
inner_join()
retains only cases in both data sets
semi_join()
extracts cases that have a match
anti_join()
extracts cases that do not have a match