1 Prerequisites

library(nycflights13)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)

2 Names

# Strive for: a descriptive snake_case name that says what the object holds.

# Keep only the rows of `flights` whose air_time is below 60
# (presumably minutes -- confirm against the nycflights13 documentation).
short_flights <- flights |> filter(air_time < 60)

# Print the filtered tibble.
short_flights
## # A tibble: 52,433 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      557            600        -3      709            723
##  2  2013     1     1      559            559         0      702            706
##  3  2013     1     1      629            630        -1      721            740
##  4  2013     1     1      632            608        24      740            728
##  5  2013     1     1      639            640        -1      739            749
##  6  2013     1     1      733            736        -3      854            850
##  7  2013     1     1      801            805        -4      900            919
##  8  2013     1     1      803            810        -7      903            925
##  9  2013     1     1      820            830       -10      940            954
## 10  2013     1     1      821            825        -4      932            945
## # ℹ 52,423 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

3 Spaces

# Strive for: spaces around `<-`, `+` and `/`, but none around `^`.

# Operands for the spacing example.
a <- 5
b <- 6
d <- 7

# Square of the sum, divided by d.
z <- (a + b)^2 / d

# Print z.
z
## [1] 17.28571
# Add three derived columns to `flights`. The extra spaces before `=` are
# there only to line the assignments up vertically.
flights |> 
  mutate(
    # distance divided by air_time
    speed      = distance / air_time,
    # integer quotient of dep_time by 100
    dep_hour   = dep_time %/% 100,
    # remainder of dep_time after dividing by 100
    dep_minute = dep_time %% 100
  )
## # A tibble: 336,776 × 22
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 14 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, speed <dbl>, dep_hour <dbl>,
## #   dep_minute <dbl>

4 Pipes

# One pipeline step per line, with `|>` at the end of the line.
# Keep rows where both arr_delay and tailnum are non-missing, then count the
# remaining rows per dest.
flights |> 
  filter(!is.na(arr_delay), !is.na(tailnum)) |> 
  count(dest)
## # A tibble: 104 × 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ℹ 94 more rows
# For each tailnum, compute the mean arr_delay (missing values dropped via
# na.rm = TRUE) and the number of rows. Each named argument of summarise()
# sits on its own line.
flights |>
  group_by(tailnum) |> 
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )
## # A tibble: 4,044 × 3
##    tailnum  delay     n
##    <chr>    <dbl> <int>
##  1 D942DN  31.5       4
##  2 N0EGMQ   9.98    371
##  3 N10156  12.7     153
##  4 N102UW   2.94     48
##  5 N103US  -6.93     46
##  6 N104UW   1.80     47
##  7 N10575  20.7     289
##  8 N105UW  -0.267    45
##  9 N107US  -5.73     41
## 10 N108UW  -1.25     60
## # ℹ 4,034 more rows

5 ggplot2

# Mean arr_delay per month (missing values dropped), piped straight into a
# plot of month against delay drawn with both points and a connecting line.
# Note the switch from `|>` to `+` once ggplot() is reached: ggplot2 layers
# are combined with `+`, not with the pipe.
flights |> 
  group_by(month) |> 
  summarise(
    delay = mean(arr_delay, na.rm = TRUE)
  ) |> 
  ggplot(aes(x = month, y = delay)) +
  geom_point() +
  geom_line()

6 Exercises

6.1 Restyle the following pipelines according to the guidelines above.

# Exercise input 1: intentionally unformatted -- this is the code to restyle.
flights|>filter(dest=="IAH")|>group_by(year,month,day)|>summarize(n=n(),
delay=mean(arr_delay,na.rm=TRUE))|>filter(n>10)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups:   year, month [12]
##     year month   day     n delay
##    <int> <int> <int> <int> <dbl>
##  1  2013     1     1    20 17.8 
##  2  2013     1     2    20  7   
##  3  2013     1     3    19 18.3 
##  4  2013     1     4    20 -3.2 
##  5  2013     1     5    13 20.2 
##  6  2013     1     6    18  9.28
##  7  2013     1     7    19 -7.74
##  8  2013     1     8    19  7.79
##  9  2013     1     9    19 18.1 
## 10  2013     1    10    19  6.68
## # ℹ 355 more rows
# Exercise input 2: intentionally unformatted -- this is the code to restyle.
flights|>filter(carrier=="UA",dest%in%c("IAH","HOU"),sched_dep_time>
0900,sched_arr_time<2000)|>group_by(flight)|>summarize(delay=mean(
arr_delay,na.rm=TRUE),cancelled=sum(is.na(arr_delay)),n=n())|>filter(n>10)
## # A tibble: 74 × 4
##    flight delay cancelled     n
##     <int> <dbl>     <int> <int>
##  1     53 12.5          2    18
##  2    112 14.1          0    14
##  3    205 -1.71         0    14
##  4    235 -5.36         0    14
##  5    255 -9.47         0    15
##  6    268 38.6          1    15
##  7    292  6.57         0    21
##  8    318 10.7          1    20
##  9    337 20.1          2    21
## 10    370 17.5          0    11
## # ℹ 64 more rows
# Answer to exercise input 1, restyled: one pipeline step per line, a space
# after every comma and around `==` / `=`, and each named argument of
# summarise() on its own line, indented by two spaces.
#
# Flights to "IAH" are grouped by year, month and day; for each group the
# number of rows and the mean arr_delay (missing values dropped) are
# computed, and only groups with more than 10 rows are kept.
# summarise() drops only the last grouping level here, so the result is
# still grouped by year and month (hence the message printed below).
flights |>
  filter(dest == "IAH") |>
  group_by(year, month, day) |>
  summarise(
    n = n(),
    delay = mean(arr_delay, na.rm = TRUE)
  ) |>
  filter(n > 10)
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups:   year, month [12]
##     year month   day     n delay
##    <int> <int> <int> <int> <dbl>
##  1  2013     1     1    20 17.8 
##  2  2013     1     2    20  7   
##  3  2013     1     3    19 18.3 
##  4  2013     1     4    20 -3.2 
##  5  2013     1     5    13 20.2 
##  6  2013     1     6    18  9.28
##  7  2013     1     7    19 -7.74
##  8  2013     1     8    19  7.79
##  9  2013     1     9    19 18.1 
## 10  2013     1    10    19  6.68
## # ℹ 355 more rows
# Answer to exercise input 2, restyled: one pipeline step per line, one
# argument per line inside multi-argument calls (indented by two spaces, with
# the closing parenthesis on its own line), and no space between a function
# name and its opening parenthesis.
#
# Keeps carrier "UA" flights to "IAH" or "HOU" whose scheduled departure is
# above 0900 and scheduled arrival is below 2000, then, per flight number,
# computes the mean arr_delay (missing values dropped), the count of rows with
# a missing arr_delay, and the row count; only flight numbers with more than
# 10 rows are kept.
flights |>
  filter(
    carrier == "UA",
    dest %in% c("IAH", "HOU"),
    # R reads the literal 0900 as the number 900; the leading zero is kept
    # for readability (presumably an HHMM clock time -- confirm against the
    # nycflights13 documentation).
    sched_dep_time > 0900,
    sched_arr_time < 2000
  ) |>
  group_by(flight) |>
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    cancelled = sum(is.na(arr_delay)),
    n = n()
  ) |>
  filter(n > 10)
## # A tibble: 74 × 4
##    flight delay cancelled     n
##     <int> <dbl>     <int> <int>
##  1     53 12.5          2    18
##  2    112 14.1          0    14
##  3    205 -1.71         0    14
##  4    235 -5.36         0    14
##  5    255 -9.47         0    15
##  6    268 38.6          1    15
##  7    292  6.57         0    21
##  8    318 10.7          1    20
##  9    337 20.1          2    21
## 10    370 17.5          0    11
## # ℹ 64 more rows