–metadata title=“R for Data Science”
download the package and data
if(!require(tidyverse))
{install.packages("tidyverse")}
## Loading required package: tidyverse
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'forcats' was built under R version 4.5.2
## Warning: package 'lubridate' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.2
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyverse)
if(!require(nycflights13))
{install.packages("nycflights13")} #https://rdrr.io/cran/nycflights13/
## Loading required package: nycflights13
## Warning: package 'nycflights13' was built under R version 4.5.2
library(nycflights13)
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
In a single pipeline for each condition, find all flights that meet the condition:
flights %>%
filter(arr_delay >=120)%>%
arrange(desc(arr_delay))
## # A tibble: 10,200 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 7 22 2257 759 898 121 1026
## 9 2013 12 5 756 1700 896 1058 2020
## 10 2013 5 3 1133 2055 878 1250 2215
## # ℹ 10,190 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(dest == "IAH" | dest == "HOU")
## # A tibble: 9,313 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # ℹ 9,303 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(carrier == "UA"| carrier == "AA" |carrier == "DL")
## # A tibble: 139,504 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 554 600 -6 812 837
## 5 2013 1 1 554 558 -4 740 728
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 600 -1 941 910
## 10 2013 1 1 559 600 -1 854 902
## # ℹ 139,494 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(month %in% c(7:9))
## # A tibble: 86,326 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 1 1 2029 212 236 2359
## 2 2013 7 1 2 2359 3 344 344
## 3 2013 7 1 29 2245 104 151 1
## 4 2013 7 1 43 2130 193 322 14
## 5 2013 7 1 44 2150 174 300 100
## 6 2013 7 1 46 2051 235 304 2358
## 7 2013 7 1 48 2001 287 308 2305
## 8 2013 7 1 58 2155 183 335 43
## 9 2013 7 1 100 2146 194 327 30
## 10 2013 7 1 100 2245 135 337 135
## # ℹ 86,316 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(arr_delay > 120 & dep_delay <=0)
## # A tibble: 29 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 27 1419 1420 -1 1754 1550
## 2 2013 10 7 1350 1350 0 1736 1526
## 3 2013 10 7 1357 1359 -2 1858 1654
## 4 2013 10 16 657 700 -3 1258 1056
## 5 2013 11 1 658 700 -2 1329 1015
## 6 2013 3 18 1844 1847 -3 39 2219
## 7 2013 4 17 1635 1640 -5 2049 1845
## 8 2013 4 18 558 600 -2 1149 850
## 9 2013 4 18 655 700 -5 1213 950
## 10 2013 5 22 1827 1830 -3 2217 2010
## # ℹ 19 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
filter(dep_delay >= 60 & (dep_delay - arr_delay)>= 30 )
## # A tibble: 2,074 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 1716 1545 91 2140 2039
## 2 2013 1 1 2205 1720 285 46 2040
## 3 2013 1 1 2326 2130 116 131 18
## 4 2013 1 3 1503 1221 162 1803 1555
## 5 2013 1 3 1821 1530 171 2131 1910
## 6 2013 1 3 1839 1700 99 2056 1950
## 7 2013 1 3 1850 1745 65 2148 2120
## 8 2013 1 3 1923 1815 68 2036 1958
## 9 2013 1 3 1941 1759 102 2246 2139
## 10 2013 1 3 1950 1845 65 2228 2227
## # ℹ 2,064 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Sort flights to find the flights with the longest departure delays. Find the flights that left earliest in the morning.
Sort flights to find the fastest flights. (Hint: Try including a math calculation inside of your function.)
Was there a flight on every day of 2013?
Which flights traveled the farthest distance? Which traveled the least distance?
Does it matter what order you used filter() and arrange() if you’re using both? Why/why not? Think about the results and how much work the functions would have to do.
flights %>%
arrange(desc(dep_delay))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
arrange(hour, minute)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 27 NA 106 NA NA 245
## 2 2013 1 2 458 500 -2 703 650
## 3 2013 1 3 458 500 -2 650 650
## 4 2013 1 4 456 500 -4 631 650
## 5 2013 1 5 458 500 -2 640 650
## 6 2013 1 6 458 500 -2 718 650
## 7 2013 1 7 454 500 -6 637 648
## 8 2013 1 8 454 500 -6 625 648
## 9 2013 1 9 457 500 -3 647 648
## 10 2013 1 10 450 500 -10 634 648
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
fastest_flights <- flights %>%
mutate(speed = distance/air_time,
.after = day,
.keep = "used", year, month, day) %>%
arrange(desc(speed), na.rm = TRUE)
# Count unique dates
unique_days <- flights %>%
distinct(year, month, day) %>%
nrow()
# Is it equal to 365?
unique_days == 365
## [1] TRUE
farthest <- flights %>%
slice_max(order_by = distance, n = 1)
farthest %>%
select(carrier, flight, distance)%>%
distinct
## # A tibble: 1 × 3
## carrier flight distance
## <chr> <int> <dbl>
## 1 HA 51 4983
flights %>%
slice_min(order_by = distance, n = 1)%>%
select(carrier, flight, distance)%>%
distinct
## # A tibble: 1 × 3
## carrier flight distance
## <chr> <int> <dbl>
## 1 US 1632 17
Compare dep_time, sched_dep_time, and dep_delay. How would you expect those three numbers to be related?
flights %>%
mutate(any_diff = (dep_time - sched_dep_time) ,
.keep = "used", dep_delay)
## # A tibble: 336,776 × 4
## dep_time sched_dep_time dep_delay any_diff
## <int> <int> <dbl> <int>
## 1 517 515 2 2
## 2 533 529 4 4
## 3 542 540 2 2
## 4 544 545 -1 -1
## 5 554 600 -6 -46
## 6 554 558 -4 -4
## 7 555 600 -5 -45
## 8 557 600 -3 -43
## 9 557 600 -3 -43
## 10 558 600 -2 -42
## # ℹ 336,766 more rows
Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.
flights%>%
select(dep_time, dep_delay, arr_time, arr_delay)
## # A tibble: 336,776 × 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ℹ 336,766 more rows
flights%>%
select(dep_time:arr_delay, -contains("sched"))
## # A tibble: 336,776 × 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ℹ 336,766 more rows
flights%>%
select(4, 6, 7, 9)
## # A tibble: 336,776 × 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ℹ 336,766 more rows
flights%>%
select((starts_with("dep_") | starts_with("arr_")) &
ends_with("time")|ends_with("delay"))
## # A tibble: 336,776 × 4
## dep_time arr_time dep_delay arr_delay
## <int> <int> <dbl> <dbl>
## 1 517 830 2 11
## 2 533 850 4 20
## 3 542 923 2 33
## 4 544 1004 -1 -18
## 5 554 812 -6 -25
## 6 554 740 -4 12
## 7 555 913 -5 19
## 8 557 709 -3 -14
## 9 557 838 -3 -8
## 10 558 753 -2 8
## # ℹ 336,766 more rows
What happens if you specify the name of the same variable multiple times in a select() call?
flights %>%
select(dep_time, dep_time)
## # A tibble: 336,776 × 1
## dep_time
## <int>
## 1 517
## 2 533
## 3 542
## 4 544
## 5 554
## 6 554
## 7 555
## 8 557
## 9 557
## 10 558
## # ℹ 336,766 more rows
It only appears once.
What does the any_of() function do? Why might it be helpful in conjunction with this vector?
variables <- c(“year”, “month”, “day”, “dep_delay”, “arr_delay”)
variables <- c("year", "month", "day", "dep_delay", "arr_delay", "test")
flights %>%
select(any_of(variables))
## # A tibble: 336,776 × 5
## year month day dep_delay arr_delay
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 2 11
## 2 2013 1 1 4 20
## 3 2013 1 1 2 33
## 4 2013 1 1 -1 -18
## 5 2013 1 1 -6 -25
## 6 2013 1 1 -4 12
## 7 2013 1 1 -5 19
## 8 2013 1 1 -3 -14
## 9 2013 1 1 -3 -8
## 10 2013 1 1 -2 8
## # ℹ 336,766 more rows
Does the result of running the following code surprise you? How do the select helpers deal with upper and lower case by default? How can you change that default?
flights |> select(contains(“TIME”))
flights %>%
select(contains("TIME"))
## # A tibble: 336,776 × 6
## dep_time sched_dep_time arr_time sched_arr_time air_time time_hour
## <int> <int> <int> <int> <dbl> <dttm>
## 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 533 529 850 830 227 2013-01-01 05:00:00
## 3 542 540 923 850 160 2013-01-01 05:00:00
## 4 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 554 600 812 837 116 2013-01-01 06:00:00
## 6 554 558 740 728 150 2013-01-01 05:00:00
## 7 555 600 913 854 158 2013-01-01 06:00:00
## 8 557 600 709 723 53 2013-01-01 06:00:00
## 9 557 600 838 846 140 2013-01-01 06:00:00
## 10 558 600 753 745 138 2013-01-01 06:00:00
## # ℹ 336,766 more rows
flights %>%
select(contains("TIME", ignore.case = FALSE))
## # A tibble: 336,776 × 0
default: ignore.case = TRUE
Rename air_time to air_time_min to indicate units of measurement and move it to the beginning of the data frame.
flights %>%
rename(air_time_min = air_time) %>%
relocate(air_time_min)
## # A tibble: 336,776 × 19
## air_time_min year month day dep_time sched_dep_time dep_delay arr_time
## <dbl> <int> <int> <int> <int> <int> <dbl> <int>
## 1 227 2013 1 1 517 515 2 830
## 2 227 2013 1 1 533 529 4 850
## 3 160 2013 1 1 542 540 2 923
## 4 183 2013 1 1 544 545 -1 1004
## 5 116 2013 1 1 554 600 -6 812
## 6 150 2013 1 1 554 558 -4 740
## 7 158 2013 1 1 555 600 -5 913
## 8 53 2013 1 1 557 600 -3 709
## 9 140 2013 1 1 557 600 -3 838
## 10 138 2013 1 1 558 600 -2 753
## # ℹ 336,766 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Why doesn’t the following work, and what does the error mean?
flights |> select(tailnum) |> arrange(arr_delay)
flights |>
arrange(arr_delay) |>
select(tailnum)
## # A tibble: 336,776 × 1
## tailnum
## <chr>
## 1 N843VA
## 2 N840VA
## 3 N851UA
## 4 N3KCAA
## 5 N551AS
## 6 N24212
## 7 N3760C
## 8 N806UA
## 9 N805JB
## 10 N855VA
## # ℹ 336,766 more rows
#flights %>%
#select(tailnum) %>%
#arrange(arr_delay)
daily <- flights |>
group_by(year, month, day)
daily
## # A tibble: 336,776 × 19
## # Groups: year, month, day [365]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
daily_flights <- daily |>
summarize(n = n(),
.groups = "drop_last") #It removes only the innermost (the last) layer of grouping
daily_flights
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day n
## <int> <int> <int> <int>
## 1 2013 1 1 842
## 2 2013 1 2 943
## 3 2013 1 3 914
## 4 2013 1 4 915
## 5 2013 1 5 720
## 6 2013 1 6 832
## 7 2013 1 7 933
## 8 2013 1 8 899
## 9 2013 1 9 902
## 10 2013 1 10 932
## # ℹ 355 more rows
daily |>
ungroup() |>
summarize(
avg_delay = mean(dep_delay, na.rm = TRUE),
flights = n()
)
## # A tibble: 1 × 2
## avg_delay flights
## <dbl> <int>
## 1 12.6 336776
meandelay <- flights |>
summarize(
delay = mean(dep_delay, na.rm = TRUE),
n = n(),
.by = month
)
meandelay
## # A tibble: 12 × 3
## month delay n
## <int> <dbl> <int>
## 1 1 10.0 27004
## 2 10 6.24 28889
## 3 11 5.44 27268
## 4 12 16.6 28135
## 5 2 10.8 24951
## 6 3 13.2 28834
## 7 4 13.9 28330
## 8 5 13.0 28796
## 9 6 20.8 28243
## 10 7 21.7 29425
## 11 8 12.6 29327
## 12 9 6.72 27574
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Which carrier has the worst average delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights |> group_by(carrier, dest) |> summarize(n()))
flights |>
group_by(carrier) |>
summarise(mean_arr_delay = mean(arr_delay, na.rm = TRUE),
n= n(),
.groups = "drop")
## # A tibble: 16 × 3
## carrier mean_arr_delay n
## <chr> <dbl> <int>
## 1 9E 7.38 18460
## 2 AA 0.364 32729
## 3 AS -9.93 714
## 4 B6 9.46 54635
## 5 DL 1.64 48110
## 6 EV 15.8 54173
## 7 F9 21.9 685
## 8 FL 20.1 3260
## 9 HA -6.92 342
## 10 MQ 10.8 26397
## 11 OO 11.9 32
## 12 UA 3.56 58665
## 13 US 2.13 20536
## 14 VX 1.76 5162
## 15 WN 9.65 12275
## 16 YV 15.6 601
flights |> group_by(carrier, dest) |>
summarize(n(),
.groups = "drop")
## # A tibble: 314 × 3
## carrier dest `n()`
## <chr> <chr> <int>
## 1 9E ATL 59
## 2 9E AUS 2
## 3 9E AVL 10
## 4 9E BGR 1
## 5 9E BNA 474
## 6 9E BOS 914
## 7 9E BTV 2
## 8 9E BUF 833
## 9 9E BWI 856
## 10 9E CAE 3
## # ℹ 304 more rows
Find the flights that are most delayed upon departure to each destination.
flights |>
group_by(dest) |>
summarise(max(arr_delay, na.rm = TRUE),
n = n(),
.groups = "drop")
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `max(arr_delay, na.rm = TRUE)`.
## ℹ In group 52: `dest = "LGA"`.
## Caused by warning in `max()`:
## ! no non-missing arguments to max; returning -Inf
## # A tibble: 105 × 3
## dest `max(arr_delay, na.rm = TRUE)` n
## <chr> <dbl> <int>
## 1 ABQ 153 254
## 2 ACK 221 265
## 3 ALB 328 439
## 4 ANC 39 8
## 5 ATL 895 17215
## 6 AUS 349 2439
## 7 AVL 228 275
## 8 BDL 266 443
## 9 BGR 238 375
## 10 BHM 291 297
## # ℹ 95 more rows
flights |>
filter(dest == "LGA") |>
arrange(desc(arr_delay))
## # A tibble: 1 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 7 27 NA 106 NA NA 245
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
How do delays vary over the course of the day? Illustrate your answer with a plot.
delay <- flights |>
group_by(hour) |>
summarise(average_dep_delay = mean(dep_delay, na.rm = TRUE),
average_arr_delay = mean(arr_delay, na.rm = TRUE),
n = n(),
.groups = "drop") |>
drop_na(average_dep_delay, average_arr_delay)
delay
## # A tibble: 19 × 4
## hour average_dep_delay average_arr_delay n
## <dbl> <dbl> <dbl> <int>
## 1 5 0.688 -4.80 1953
## 2 6 1.64 -3.38 25951
## 3 7 1.91 -5.30 22821
## 4 8 4.13 -1.11 27242
## 5 9 4.58 -1.45 20312
## 6 10 6.50 0.954 16708
## 7 11 7.19 1.48 16033
## 8 12 8.61 3.49 18181
## 9 13 11.4 6.54 19956
## 10 14 13.8 9.20 21706
## 11 15 16.9 12.3 23888
## 12 16 18.8 12.6 23002
## 13 17 21.1 16.0 24426
## 14 18 21.1 14.8 21783
## 15 19 24.8 16.7 21441
## 16 20 24.3 16.7 16739
## 17 21 24.2 18.4 10933
## 18 22 18.8 16.0 2639
## 19 23 14.0 11.8 1061
slice_min(delay, order_by = average_dep_delay, n= 1)
## # A tibble: 1 × 4
## hour average_dep_delay average_arr_delay n
## <dbl> <dbl> <dbl> <int>
## 1 5 0.688 -4.80 1953
# same as delay |> slice_min(average_dep_delay, n= 1)
slice_min(delay, order_by = average_dep_delay, n= -1) # show all but sorted
## # A tibble: 18 × 4
## hour average_dep_delay average_arr_delay n
## <dbl> <dbl> <dbl> <int>
## 1 5 0.688 -4.80 1953
## 2 6 1.64 -3.38 25951
## 3 7 1.91 -5.30 22821
## 4 8 4.13 -1.11 27242
## 5 9 4.58 -1.45 20312
## 6 10 6.50 0.954 16708
## 7 11 7.19 1.48 16033
## 8 12 8.61 3.49 18181
## 9 13 11.4 6.54 19956
## 10 14 13.8 9.20 21706
## 11 23 14.0 11.8 1061
## 12 15 16.9 12.3 23888
## 13 16 18.8 12.6 23002
## 14 22 18.8 16.0 2639
## 15 17 21.1 16.0 24426
## 16 18 21.1 14.8 21783
## 17 21 24.2 18.4 10933
## 18 20 24.3 16.7 16739
delay |>
pivot_longer(
cols = c(average_dep_delay, average_arr_delay),
names_to = "delay_type",
values_to = "delay_minutes"
) |>
ggplot(aes(x = hour, y = delay_minutes, color = delay_type)) +
geom_line() +
geom_point() +
labs(
title = "Average Flight Delays by Hour",
x = "Hour of Day",
y = "Average Delay (minutes)",
color = "Delay Type"
)
Suppose we have the following tiny data frame:
df <- tibble(
x = 1:5,
y = c("a", "b", "a", "a", "b"),
z = c("K", "K", "L", "L", "K")
)
#Write down what you think the output will look like, then check if you were correct, and describe what group_by() does.
df |>
group_by(y)
## # A tibble: 5 × 3
## # Groups: y [2]
## x y z
## <int> <chr> <chr>
## 1 1 a K
## 2 2 b K
## 3 3 a L
## 4 4 a L
## 5 5 b K
#Write down what you think the output will look like, then check if you were correct, and describe what arrange() does. Also, comment on how it’s different from the group_by() in part (a).
df |>
arrange(y)
## # A tibble: 5 × 3
## x y z
## <int> <chr> <chr>
## 1 1 a K
## 2 3 a L
## 3 4 a L
## 4 2 b K
## 5 5 b K
#Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.
df |>
group_by(y) |>
summarize(mean_x = mean(x))
## # A tibble: 2 × 2
## y mean_x
## <chr> <dbl>
## 1 a 2.67
## 2 b 3.5
#Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does. Then, comment on what the message says.
df |>
group_by(y, z) |>
summarize(mean_x = mean(x))
## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.
## # A tibble: 3 × 3
## # Groups: y [2]
## y z mean_x
## <chr> <chr> <dbl>
## 1 a K 1
## 2 a L 3.5
## 3 b K 3.5
#Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does. How is the output different from the one in part (d)?
df |>
group_by(y, z) |>
summarize(mean_x = mean(x), .groups = "drop")
## # A tibble: 3 × 3
## y z mean_x
## <chr> <chr> <dbl>
## 1 a K 1
## 2 a L 3.5
## 3 b K 3.5
#Write down what you think the outputs will look like, then check if you were correct, and describe what each pipeline does. How are the outputs of the two pipelines different?
df |>
group_by(y, z) |>
summarize(mean_x = mean(x))
## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.
## # A tibble: 3 × 3
## # Groups: y [2]
## y z mean_x
## <chr> <chr> <dbl>
## 1 a K 1
## 2 a L 3.5
## 3 b K 3.5
df |>
group_by(y, z) |>
mutate(mean_x = mean(x))
## # A tibble: 5 × 4
## # Groups: y, z [3]
## x y z mean_x
## <int> <chr> <chr> <dbl>
## 1 1 a K 1
## 2 2 b K 3.5
## 3 3 a L 3.5
## 4 4 a L 3.5
## 5 5 b K 3.5
if(!require(Lahman))
{install.packages("Lahman")}
## Loading required package: Lahman
## Warning: package 'Lahman' was built under R version 4.5.2
library(Lahman)
batters <- Lahman::Batting |>
group_by(playerID) |>
summarize(
performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
n = sum(AB, na.rm = TRUE)
)
batters
## # A tibble: 20,985 × 3
## playerID performance n
## <chr> <dbl> <int>
## 1 aardsda01 0 4
## 2 aaronha01 0.305 12364
## 3 aaronto01 0.229 944
## 4 aasedo01 0 5
## 5 abadan01 0.0952 21
## 6 abadfe01 0.111 9
## 7 abadijo01 0.224 49
## 8 abbated01 0.254 3044
## 9 abbeybe01 0.169 225
## 10 abbeych01 0.281 1756
## # ℹ 20,975 more rows
batters |>
filter(n > 100) |>
ggplot(aes(x = n, y = performance)) +
geom_point(alpha = 1 / 10) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
batters |>
arrange(desc(performance))
## # A tibble: 20,985 × 3
## playerID performance n
## <chr> <dbl> <int>
## 1 abramge01 1 1
## 2 alberan01 1 1
## 3 banisje01 1 1
## 4 bartocl01 1 1
## 5 bassdo01 1 1
## 6 birasst01 1 2
## 7 bruneju01 1 1
## 8 burnscb01 1 1
## 9 cammaer01 1 1
## 10 campsh01 1 1
## # ℹ 20,975 more rows
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.