Prerequisites
library(nycflights13)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Names
# Strive for: a descriptive snake_case name that says what the object holds.
# Here: the rows of `flights` whose air_time is below 60.
short_flights <- flights |>
  filter(air_time < 60)
short_flights
## # A tibble: 52,433 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 557 600 -3 709 723
## 2 2013 1 1 559 559 0 702 706
## 3 2013 1 1 629 630 -1 721 740
## 4 2013 1 1 632 608 24 740 728
## 5 2013 1 1 639 640 -1 739 749
## 6 2013 1 1 733 736 -3 854 850
## 7 2013 1 1 801 805 -4 900 919
## 8 2013 1 1 803 810 -7 903 925
## 9 2013 1 1 820 830 -10 940 954
## 10 2013 1 1 821 825 -4 932 945
## # ℹ 52,423 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Spaces
# Strive for: a space on either side of `<-`, `+` and `/`,
# but no spaces around `^`.
a <- 5
b <- 6
d <- 7
# (5 + 6)^2 / 7
z <- (a + b)^2 / d
# A bare name at top level prints the value.
z
## [1] 17.28571
# Add three derived columns to `flights`; one named argument per line,
# with the closing parenthesis on its own line.
flights |>
  mutate(
    speed = distance / air_time,
    # Integer division and remainder by 100 split dep_time into two parts
    # (presumably hour and minute of an HHMM-encoded time; confirm against
    # the nycflights13 documentation).
    dep_hour = dep_time %/% 100,
    dep_minute = dep_time %% 100
  )
## # A tibble: 336,776 × 22
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 14 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, speed <dbl>, dep_hour <dbl>,
## # dep_minute <dbl>
Pipes
# Count flights per destination, ignoring rows where arr_delay or tailnum
# is missing. One pipeline step per line, `|>` at the end of the line,
# continuation lines indented by two spaces.
flights |>
  filter(!is.na(arr_delay), !is.na(tailnum)) |>
  count(dest)
## # A tibble: 104 × 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ℹ 94 more rows
# Per tailnum: mean arrival delay (missing values removed) and row count.
# Named arguments of summarise() each get their own line.
flights |>
  group_by(tailnum) |>
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )
## # A tibble: 4,044 × 3
## tailnum delay n
## <chr> <dbl> <int>
## 1 D942DN 31.5 4
## 2 N0EGMQ 9.98 371
## 3 N10156 12.7 153
## 4 N102UW 2.94 48
## 5 N103US -6.93 46
## 6 N104UW 1.80 47
## 7 N10575 20.7 289
## 8 N105UW -0.267 45
## 9 N107US -5.73 41
## 10 N108UW -1.25 60
## # ℹ 4,034 more rows
ggplot2
# Mean arrival delay per month, drawn as points joined by a line.
# Data steps are chained with `|>`; once ggplot() is reached, layers are
# added with `+`, formatted the same way (operator at the end of the line).
flights |>
  group_by(month) |>
  summarise(
    delay = mean(arr_delay, na.rm = TRUE)
  ) |>
  ggplot(aes(x = month, y = delay)) +
  geom_point() +
  geom_line()

Exercises
Restyle the following pipelines according to the guidelines above.
# Exercise input 1 -- deliberately left unformatted; this is the pipeline the
# exercise asks to be restyled, so do not tidy it here.
# It keeps flights with dest "IAH", groups by year/month/day, computes the
# row count and mean arr_delay per group, then keeps groups with n > 10.
flights|>filter(dest=="IAH")|>group_by(year,month,day)|>summarize(n=n(),
delay=mean(arr_delay,na.rm=TRUE))|>filter(n>10)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day n delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 20 17.8
## 2 2013 1 2 20 7
## 3 2013 1 3 19 18.3
## 4 2013 1 4 20 -3.2
## 5 2013 1 5 13 20.2
## 6 2013 1 6 18 9.28
## 7 2013 1 7 19 -7.74
## 8 2013 1 8 19 7.79
## 9 2013 1 9 19 18.1
## 10 2013 1 10 19 6.68
## # ℹ 355 more rows
# Exercise input 2 -- deliberately left unformatted; this is the pipeline the
# exercise asks to be restyled, so do not tidy it here.
# It keeps carrier "UA" flights to "IAH" or "HOU" with sched_dep_time > 0900
# and sched_arr_time < 2000, then per flight number computes mean arr_delay,
# the count of rows with missing arr_delay, and the row count, keeping
# groups with n > 10.
flights|>filter(carrier=="UA",dest%in%c("IAH","HOU"),sched_dep_time>
0900,sched_arr_time<2000)|>group_by(flight)|>summarize(delay=mean(
arr_delay,na.rm=TRUE),cancelled=sum(is.na(arr_delay)),n=n())|>filter(n>10)
## # A tibble: 74 × 4
## flight delay cancelled n
## <int> <dbl> <int> <int>
## 1 53 12.5 2 18
## 2 112 14.1 0 14
## 3 205 -1.71 0 14
## 4 235 -5.36 0 14
## 5 255 -9.47 0 15
## 6 268 38.6 1 15
## 7 292 6.57 0 21
## 8 318 10.7 1 20
## 9 337 20.1 2 21
## 10 370 17.5 0 11
## # ℹ 64 more rows
# Exercise 1, restyled: for flights with dest "IAH", compute the number of
# flights and the mean arrival delay (missing values removed) per
# year/month/day, then keep only the days with more than 10 flights.
#
# Style points applied: a space after every comma, one pipeline step per
# line, continuation lines indented by two spaces, and each named argument
# of summarise() on its own line with the closing `)` unindented.
flights |>
  filter(dest == "IAH") |>
  group_by(year, month, day) |>
  summarise(
    n = n(),
    delay = mean(arr_delay, na.rm = TRUE)
  ) |>
  filter(n > 10)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day n delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 20 17.8
## 2 2013 1 2 20 7
## 3 2013 1 3 19 18.3
## 4 2013 1 4 20 -3.2
## 5 2013 1 5 13 20.2
## 6 2013 1 6 18 9.28
## 7 2013 1 7 19 -7.74
## 8 2013 1 8 19 7.79
## 9 2013 1 9 19 18.1
## 10 2013 1 10 19 6.68
## # ℹ 355 more rows
# Exercise 2, restyled: for carrier "UA" flights to "IAH" or "HOU" with
# sched_dep_time above 0900 and sched_arr_time below 2000, compute per
# flight number the mean arrival delay, the number of rows with a missing
# arr_delay (labelled `cancelled`), and the row count; keep flight numbers
# with more than 10 rows.
#
# Style points applied: no space between a function name and `(`, each
# argument of a multi-argument call on its own line, closing `)` on its own
# line, and two-space indentation of pipeline continuation lines.
flights |>
  filter(
    carrier == "UA",
    dest %in% c("IAH", "HOU"),
    # R parses 0900 as the number 900; the leading zero is kept from the
    # exercise text for readability only.
    sched_dep_time > 0900,
    sched_arr_time < 2000
  ) |>
  group_by(flight) |>
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    cancelled = sum(is.na(arr_delay)),
    n = n()
  ) |>
  filter(n > 10)
## # A tibble: 74 × 4
## flight delay cancelled n
## <int> <dbl> <int> <int>
## 1 53 12.5 2 18
## 2 112 14.1 0 14
## 3 205 -1.71 0 14
## 4 235 -5.36 0 14
## 5 255 -9.47 0 15
## 6 268 38.6 1 15
## 7 292 6.57 0 21
## 8 318 10.7 1 20
## 9 337 20.1 2 21
## 10 370 17.5 0 11
## # ℹ 64 more rows