dplyr practiceLoad the "nycflights13" package. You’ll also need to
load dplyr or tidyverse, if you didn’t
above.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#install.packages("nycflights13")
library(nycflights13)
my_flights <- flights # store my own version
my_flights <- na.omit(my_flights) ## this dataset has data on cancelled flights too,
## so we 'll remove those for now
my_flights %>% head()
## # A tibble: 6 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
#my_flights %>% summary()
#any(is.na(my_flights))
Create a dataframe of the average arrival delays for each destination
arr_delay <- my_flights %>% group_by(dest) %>% summarise(avg_delay = mean(arr_delay ))
Then use left_join() to join on the “airports”
dataframe, which has the airport information.
#airports
arr_delay <- inner_join(arr_delay, airports, join_by(dest == faa))
arr_delay
## # A tibble: 100 × 9
## dest avg_delay name lat lon alt tz dst tzone
## <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 ABQ 4.38 Albuquerque Internation… 35.0 -107. 5355 -7 A Amer…
## 2 ACK 4.85 Nantucket Mem 41.3 -70.1 48 -5 A Amer…
## 3 ALB 14.4 Albany Intl 42.7 -73.8 285 -5 A Amer…
## 4 ANC -2.5 Ted Stevens Anchorage I… 61.2 -150. 152 -9 A Amer…
## 5 ATL 11.3 Hartsfield Jackson Atla… 33.6 -84.4 1026 -5 A Amer…
## 6 AUS 6.02 Austin Bergstrom Intl 30.2 -97.7 542 -6 A Amer…
## 7 AVL 8.00 Asheville Regional Airp… 35.4 -82.5 2165 -5 A Amer…
## 8 BDL 7.05 Bradley Intl 41.9 -72.7 173 -5 A Amer…
## 9 BGR 8.03 Bangor Intl 44.8 -68.8 192 -5 A Amer…
## 10 BHM 16.9 Birmingham Intl 33.6 -86.8 644 -6 A Amer…
## # ℹ 90 more rows
Note here: the column names don’t match anymore!
Which airport had the largest average arrival delay?
largest_avg_arrival_delay <- arr_delay %>% summarise(Airport = arr_delay$name.x, max_delay = max(avg_delay))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `Airport = arr_delay$name.x`.
## Caused by warning:
## ! Unknown or uninitialised column: `name.x`.
largest_avg_arrival_delay
## # A tibble: 1 × 1
## max_delay
## <dbl>
## 1 41.8
Which airline had the smallest average departure delay?
To answer this, you need to create a dataframe of the average
departure delay for each airline, then use
left_join() to join on the “airlines” dataframe
left_join() to join on the “airlines”
dataframesmallest_departure <- my_flights %>% group_by(carrier) %>% summarise(avg_depart_delay = mean(dep_delay))
smallest_departure
## # A tibble: 16 × 2
## carrier avg_depart_delay
## <chr> <dbl>
## 1 9E 16.4
## 2 AA 8.57
## 3 AS 5.83
## 4 B6 13.0
## 5 DL 9.22
## 6 EV 19.8
## 7 F9 20.2
## 8 FL 18.6
## 9 HA 4.90
## 10 MQ 10.4
## 11 OO 12.6
## 12 UA 12.0
## 13 US 3.74
## 14 VX 12.8
## 15 WN 17.7
## 16 YV 18.9
smallest_departure <- left_join(smallest_departure, airlines, by = "carrier")
smallest_departure
## # A tibble: 16 × 3
## carrier avg_depart_delay name
## <chr> <dbl> <chr>
## 1 9E 16.4 Endeavor Air Inc.
## 2 AA 8.57 American Airlines Inc.
## 3 AS 5.83 Alaska Airlines Inc.
## 4 B6 13.0 JetBlue Airways
## 5 DL 9.22 Delta Air Lines Inc.
## 6 EV 19.8 ExpressJet Airlines Inc.
## 7 F9 20.2 Frontier Airlines Inc.
## 8 FL 18.6 AirTran Airways Corporation
## 9 HA 4.90 Hawaiian Airlines Inc.
## 10 MQ 10.4 Envoy Air
## 11 OO 12.6 SkyWest Airlines Inc.
## 12 UA 12.0 United Air Lines Inc.
## 13 US 3.74 US Airways Inc.
## 14 VX 12.8 Virgin America
## 15 WN 17.7 Southwest Airlines Co.
## 16 YV 18.9 Mesa Airlines Inc.
smallest_avg_depart_airport <- smallest_departure %>% slice_min(avg_depart_delay)
smallest_avg_depart_airport
## # A tibble: 1 × 3
## carrier avg_depart_delay name
## <chr> <dbl> <chr>
## 1 US 3.74 US Airways Inc.
Examples from R4DS:
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
## # A tibble: 10 × 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
Now you want to find each flight that went to one of those destinations. You could, of course, filter. But! Now you can semi-join:
flights %>%
semi_join(top_dest)
## Joining with `by = join_by(dest)`
## # A tibble: 141,145 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 542 540 2 923 850
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 554 558 -4 740 728
## 4 2013 1 1 555 600 -5 913 854
## 5 2013 1 1 557 600 -3 838 846
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 559 0 702 706
## 10 2013 1 1 600 600 0 851 858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?
# return all information of flights where dest not match with faa in airports
anti_join(flights, airports, by = c("dest" = "faa"))
## # A tibble: 7,602 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 615 615 0 1039 1100
## 3 2013 1 1 628 630 -2 1137 1140
## 4 2013 1 1 701 700 1 1123 1154
## 5 2013 1 1 711 715 -4 1151 1206
## 6 2013 1 1 820 820 0 1254 1310
## 7 2013 1 1 820 820 0 1249 1329
## 8 2013 1 1 840 845 -5 1311 1350
## 9 2013 1 1 909 810 59 1331 1315
## 10 2013 1 1 913 918 -5 1346 1416
## # ℹ 7,592 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
# return all information of airlines where faa not match with dest in flights
anti_join(airports, flights, by = c("faa" = "dest"))
## # A tibble: 1,357 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
## # ℹ 1,347 more rows