R for Data Science Chapter 3: Data Transformation

1 Prerequisites

library(nycflights13)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)

2 dplyr basics

glimpse(flights)

## Rows: 336,776
## Columns: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

flights |>
  filter(dest == "IAH") |>
  group_by(year, month, day) |>
  summarise(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )

## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.

## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day arr_delay
##    <int> <int> <int>     <dbl>
##  1  2013     1     1     17.8 
##  2  2013     1     2      7   
##  3  2013     1     3     18.3 
##  4  2013     1     4     -3.2 
##  5  2013     1     5     20.2 
##  6  2013     1     6      9.28
##  7  2013     1     7     -7.74
##  8  2013     1     8      7.79
##  9  2013     1     9     18.1 
## 10  2013     1    10      6.68
## # ℹ 355 more rows

3 Rows

# Flights that are more than 120 minutes late

flights |>
  filter(dep_delay > 120)

## # A tibble: 9,723 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      848           1835       853     1001           1950
##  2  2013     1     1      957            733       144     1056            853
##  3  2013     1     1     1114            900       134     1447           1222
##  4  2013     1     1     1540           1338       122     2020           1825
##  5  2013     1     1     1815           1325       290     2120           1542
##  6  2013     1     1     1842           1422       260     1958           1535
##  7  2013     1     1     1856           1645       131     2212           2005
##  8  2013     1     1     1934           1725       129     2126           1855
##  9  2013     1     1     1938           1703       155     2109           1823
## 10  2013     1     1     1942           1705       157     2124           1830
## # ℹ 9,713 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Flights that departed on January 1st

flights |>
  filter(month == 1 & day == 1)

## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Flights that departed in January or February

flights |>
  filter(month == 1 | month == 2)

## # A tibble: 51,955 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 51,945 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# A shorter way to do the above

flights |>
  filter(month %in% c(1, 2))

## # A tibble: 51,955 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 51,945 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

jan1 <- flights |>
  filter(month == 1 & day == 1)

4 arrange()

flights |>
  arrange(year, month, day, dep_time)

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

flights |> 
  arrange(desc(dep_delay))

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

5 distinct()

# Remove any duplicate rows

flights |>
  distinct()

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

# Find all unique origin and destination pairs

flights |> 
  distinct(origin, dest)

## # A tibble: 224 × 2
##    origin dest 
##    <chr>  <chr>
##  1 EWR    IAH  
##  2 LGA    IAH  
##  3 JFK    MIA  
##  4 JFK    BQN  
##  5 LGA    ATL  
##  6 EWR    ORD  
##  7 EWR    FLL  
##  8 LGA    IAD  
##  9 JFK    MCO  
## 10 LGA    ORD  
## # ℹ 214 more rows

flights |>
  count(origin, dest, sort = TRUE)

## # A tibble: 224 × 3
##    origin dest      n
##    <chr>  <chr> <int>
##  1 JFK    LAX   11262
##  2 LGA    ATL   10263
##  3 LGA    ORD    8857
##  4 JFK    SFO    8204
##  5 LGA    CLT    6168
##  6 EWR    ORD    6100
##  7 JFK    BOS    5898
##  8 LGA    MIA    5781
##  9 JFK    MCO    5464
## 10 EWR    BOS    5327
## # ℹ 214 more rows

6 Exercises

6.1 In a single pipeline for each condition, find all flights that meet the condition:

flights |>
  filter(arr_delay > 120,
         dest %in% c("IAH", "HOU"),
         carrier %in% c("UA", "AA", "Dl"),
         month %in% c(7,8,9),
         arr_delay > 120 & dep_delay <=0,
         dep_delay >= 60 & (dep_delay - arr_delay) >= 30)

## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

6.2 Sort flights to find the flights with the longest departure delays. Find the flights that left earliest in the morning.

flights |> 
  arrange(desc(arr_delay)) |>
  head(10)

## # A tibble: 10 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     7    22     2257            759       898      121           1026
##  9  2013    12     5      756           1700       896     1058           2020
## 10  2013     5     3     1133           2055       878     1250           2215
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

flights |>
  arrange(time_hour) |>
  head(10)

## # A tibble: 10 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            558        -4      740            728
##  6  2013     1     1      559            559         0      702            706
##  7  2013     1     1      554            600        -6      812            837
##  8  2013     1     1      555            600        -5      913            854
##  9  2013     1     1      557            600        -3      709            723
## 10  2013     1     1      557            600        -3      838            846
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

6.3 Sort flights to find the fastest flights. (Hint: Try including a math calculation inside of your function.)

flights |>
  arrange((dep_time - arr_time)) |>
  head(10)

## # A tibble: 10 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     3    23      959            920        39     2129           1240
##  2  2013     6    23     1259           1300        -1     2358           1555
##  3  2013     9     3      653            659        -6     1704            857
##  4  2013    10    19      625            630        -5     1635            922
##  5  2013     5    21      755            800        -5     1737           1025
##  6  2013     1    24      644            655       -11     1624           1030
##  7  2013     7     6     1252           1255        -3     2208           1605
##  8  2013    12     3     1112           1114        -2     2046           1407
##  9  2013     7    23      843            835         8     1755           1024
## 10  2013    11     2      733            735        -2     1642           1039
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

6.4 Was there a flight on every day of 2013?

flights |>
  mutate(date = as.Date(time_hour)) |>
  group_by(date) |>
  summarise(n_flights = n()) |>
  summarise(total_days = n()) |>
  pull(total_days)

## [1] 366

Yes, there was a flight everyday in 2013.

6.5 Which flights traveled the farthest distance? Which traveled the least distance?

flights |>
  arrange(distance)

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     7    27       NA            106        NA       NA            245
##  2  2013     1     3     2127           2129        -2     2222           2224
##  3  2013     1     4     1240           1200        40     1333           1306
##  4  2013     1     4     1829           1615       134     1937           1721
##  5  2013     1     4     2128           2129        -1     2218           2224
##  6  2013     1     5     1155           1200        -5     1241           1306
##  7  2013     1     6     2125           2129        -4     2224           2224
##  8  2013     1     7     2124           2129        -5     2212           2224
##  9  2013     1     8     2127           2130        -3     2304           2225
## 10  2013     1     9     2126           2129        -3     2217           2224
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Flight 51 traveled the farthest distance and flight 1632 traveled the least.

6.6 Does it matter what order you used filter() and arrange() if you’re using both? Why/why not? Think about the results and how much work the functions would have to do.

I’d argue it is better practice to filter your dataset first before applying the arrange function. This reduces the numbers of rows that will be sorted by the arrange function. Doing this later seems inefficient.

7 The pipe

7.1 We’ve shown you simple examples of the pipe above, but its real power arises when you start to combine multiple verbs. For example, imagine that you wanted to find the fastest flights to Houston’s IAH airport: you need to combine filter(), mutate(), select(), and arrange():

flights |>
  filter(dest == "IAH") |>
  mutate(speed = distance / air_time * 60) |>
  select(year:day, dep_time, carrier, flight, speed) |>
  arrange(desc(speed))

## # A tibble: 7,198 × 7
##     year month   day dep_time carrier flight speed
##    <int> <int> <int>    <int> <chr>    <int> <dbl>
##  1  2013     7     9      707 UA         226  522.
##  2  2013     8    27     1850 UA        1128  521.
##  3  2013     8    28      902 UA        1711  519.
##  4  2013     8    28     2122 UA        1022  519.
##  5  2013     6    11     1628 UA        1178  515.
##  6  2013     8    27     1017 UA         333  515.
##  7  2013     8    27     1205 UA        1421  515.
##  8  2013     8    27     1758 UA         302  515.
##  9  2013     9    27      521 UA         252  515.
## 10  2013     8    28      625 UA         559  515.
## # ℹ 7,188 more rows

7.2 Groups

flights |> 
  group_by(month)

## # A tibble: 336,776 × 19
## # Groups:   month [12]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

flights |> 
  group_by(month) |> 
  summarise(
    avg_delay = mean(dep_delay, na.rm = TRUE)
  )

## # A tibble: 12 × 2
##    month avg_delay
##    <int>     <dbl>
##  1     1     10.0 
##  2     2     10.8 
##  3     3     13.2 
##  4     4     13.9 
##  5     5     13.0 
##  6     6     20.8 
##  7     7     21.7 
##  8     8     12.6 
##  9     9      6.72
## 10    10      6.24
## 11    11      5.44
## 12    12     16.6

flights |> 
  group_by(month) |> 
  summarise(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    n = n ()
  )

## # A tibble: 12 × 3
##    month avg_delay     n
##    <int>     <dbl> <int>
##  1     1     10.0  27004
##  2     2     10.8  24951
##  3     3     13.2  28834
##  4     4     13.9  28330
##  5     5     13.0  28796
##  6     6     20.8  28243
##  7     7     21.7  29425
##  8     8     12.6  29327
##  9     9      6.72 27574
## 10    10      6.24 28889
## 11    11      5.44 27268
## 12    12     16.6  28135

flights |> 
  group_by(dest) |>
  slice_max(arr_delay, n = 1) |> 
  relocate(dest)

## # A tibble: 108 × 19
## # Groups:   dest [105]
##    dest   year month   day dep_time sched_dep_time dep_delay arr_time
##    <chr> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1 ABQ    2013     7    22     2145           2007        98      132
##  2 ACK    2013     7    23     1139            800       219     1250
##  3 ALB    2013     1    25      123           2000       323      229
##  4 ANC    2013     8    17     1740           1625        75     2042
##  5 ATL    2013     7    22     2257            759       898      121
##  6 AUS    2013     7    10     2056           1505       351     2347
##  7 AVL    2013     8    13     1156            832       204     1417
##  8 BDL    2013     2    21     1728           1316       252     1839
##  9 BGR    2013    12     1     1504           1056       248     1628
## 10 BHM    2013     4    10       25           1900       325      136
## # ℹ 98 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

daily <- flights |> 
  group_by(year, month, day)

daily

## # A tibble: 336,776 × 19
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

daily_flights <- daily |> 
  summarise(n = n(),
            .groups = "drop_last"
  )

daily |> 
  ungroup()

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

daily |> 
  ungroup() |>
  summarise(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    flights = n()
  )

## # A tibble: 1 × 2
##   avg_delay flights
##       <dbl>   <int>
## 1      12.6  336776

flights |> 
  summarise(
    delay = mean(dep_delay, na.rm = TRUE),
    n = n(),
    .by = month
  )

## # A tibble: 12 × 3
##    month delay     n
##    <int> <dbl> <int>
##  1     1 10.0  27004
##  2    10  6.24 28889
##  3    11  5.44 27268
##  4    12 16.6  28135
##  5     2 10.8  24951
##  6     3 13.2  28834
##  7     4 13.9  28330
##  8     5 13.0  28796
##  9     6 20.8  28243
## 10     7 21.7  29425
## 11     8 12.6  29327
## 12     9  6.72 27574

8 Exercises

8.1 Which carrier has the worst average delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not? (Hint: think about flights |> group_by(carrier, dest) |> summarize(n()))

flights |> 
  group_by(carrier) |>
  summarise(
    avg_delay = mean(dep_delay, na.rm = TRUE),
    n = n()
  ) |>
  arrange(avg_delay)

## # A tibble: 16 × 3
##    carrier avg_delay     n
##    <chr>       <dbl> <int>
##  1 US           3.78 20536
##  2 HA           4.90   342
##  3 AS           5.80   714
##  4 AA           8.59 32729
##  5 DL           9.26 48110
##  6 MQ          10.6  26397
##  7 UA          12.1  58665
##  8 OO          12.6     32
##  9 VX          12.9   5162
## 10 B6          13.0  54635
## 11 9E          16.7  18460
## 12 WN          17.7  12275
## 13 FL          18.7   3260
## 14 YV          19.0    601
## 15 EV          20.0  54173
## 16 F9          20.2    685

The worst average delay was by the carrier F9 with an average delay of 20 minutes.

8.2 Find the flights that are most delayed upon departure from each destination.

flights |>
  filter(dep_delay > 0) |> 
  group_by(dest) |> 
  slice_max(dep_delay, n = 1) |> 
  relocate(dest)

## # A tibble: 103 × 19
## # Groups:   dest [103]
##    dest   year month   day dep_time sched_dep_time dep_delay arr_time
##    <chr> <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1 ABQ    2013    12    14     2223           2001       142      133
##  2 ACK    2013     7    23     1139            800       219     1250
##  3 ALB    2013     1    25      123           2000       323      229
##  4 ANC    2013     8    17     1740           1625        75     2042
##  5 ATL    2013     7    22     2257            759       898      121
##  6 AUS    2013     7    10     2056           1505       351     2347
##  7 AVL    2013     6    14     1158            816       222     1335
##  8 BDL    2013     2    21     1728           1316       252     1839
##  9 BGR    2013    12     1     1504           1056       248     1628
## 10 BHM    2013     4    10       25           1900       325      136
## # ℹ 93 more rows
## # ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

8.3 How do delays vary over the course of the day? Illustrate your answer with a plot.

flights |> 
  ggplot(aes(x = dep_time, y = dep_delay), na.rm = TRUE) +
  geom_point()

## Warning: Removed 8255 rows containing missing values or values outside the scale range
## (`geom_point()`).

There is a lot of delays around 5-10am and a noticeable trend of increased delays during the night.

8.4 What happens if you supply a negative n to slice_min() and friends?

flights |>
  slice_min(dep_delay, n = -1)

## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013    10    23     1907           1932       -25     2143           2143
##  8  2013     3    30     2030           2055       -25     2213           2250
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

If a -1 is provided then just random observations are returned instead of being in order of the smallest first.

####Explain what count() does in terms of the dplyr verbs you just learned. What does the sort argument to count() do?

result <- flights |> 
    count(carrier, sort = TRUE)

result

## # A tibble: 16 × 2
##    carrier     n
##    <chr>   <int>
##  1 UA      58665
##  2 B6      54635
##  3 EV      54173
##  4 DL      48110
##  5 AA      32729
##  6 MQ      26397
##  7 US      20536
##  8 9E      18460
##  9 WN      12275
## 10 VX       5162
## 11 FL       3260
## 12 AS        714
## 13 F9        685
## 14 YV        601
## 15 HA        342
## 16 OO         32

Count() counts the number of observations of a category within a dataframe. As shown above, it counts the number of times each carrier is shown within the dataframe.The sort argument allows the user to sort putting the largest value first. As shown the example, this shows us that UA has the most flights.

8.5 Suppose we have the following tiny data frame:

df <- tibble(
  x = 1:5,
  y = c("a", "b", "a", "a", "b"),
  z = c("K", "K", "L", "L", "K")
)

df |> 
  group_by(y)

## # A tibble: 5 × 3
## # Groups:   y [2]
##       x y     z    
##   <int> <chr> <chr>
## 1     1 a     K    
## 2     2 b     K    
## 3     3 a     L    
## 4     4 a     L    
## 5     5 b     K

df |> 
  arrange(y)

## # A tibble: 5 × 3
##       x y     z    
##   <int> <chr> <chr>
## 1     1 a     K    
## 2     3 a     L    
## 3     4 a     L    
## 4     2 b     K    
## 5     5 b     K

df |> 
  group_by(y) |> 
  summarise(mean_x = mean(x))

## # A tibble: 2 × 2
##   y     mean_x
##   <chr>  <dbl>
## 1 a       2.67
## 2 b       3.5

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.

## # A tibble: 3 × 3
## # Groups:   y [2]
##   y     z     mean_x
##   <chr> <chr>  <dbl>
## 1 a     K        1  
## 2 a     L        3.5
## 3 b     K        3.5

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x), .groups = "drop")

## # A tibble: 3 × 3
##   y     z     mean_x
##   <chr> <chr>  <dbl>
## 1 a     K        1  
## 2 a     L        3.5
## 3 b     K        3.5

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

## `summarise()` has grouped output by 'y'. You can override using the `.groups`
## argument.

## # A tibble: 3 × 3
## # Groups:   y [2]
##   y     z     mean_x
##   <chr> <chr>  <dbl>
## 1 a     K        1  
## 2 a     L        3.5
## 3 b     K        3.5

df |>
  group_by(y, z) |>
  mutate(mean_x = mean(x))

## # A tibble: 5 × 4
## # Groups:   y, z [3]
##       x y     z     mean_x
##   <int> <chr> <chr>  <dbl>
## 1     1 a     K        1  
## 2     2 b     K        3.5
## 3     3 a     L        3.5
## 4     4 a     L        3.5
## 5     5 b     K        3.5

I think that grouping by Y will create two groups based on the letters present.
For the arrange, I predict that this will arrange the y column in alphabetical order.
This will calculate the mean of the x column and create a group based on the y column.
This will calculate the mean of the x column and create a group based on the y and z column.
This output will put a mean_x column in the dataframe and then ungroups the y and z groups.
The first code runs and replaces the x column with the mean_x column. While the other function keeps the x column and has the mean_x column added too.

9 Case study: aggregates and sample size

9.1 Whenever you do any aggregation, it’s always a good idea to include a count (n()). That way, you can ensure that you’re not drawing conclusions based on very small amounts of data. We’ll demonstrate this with some baseball data from the Lahman package. Specifically, we will compare what proportion of times a player gets a hit (H) vs. the number of times they try to put the ball in play (AB):

batters <- Lahman::Batting |> 
  group_by(playerID) |> 
  summarize(
    performance = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    n = sum(AB, na.rm = TRUE)
  )

batters

## # A tibble: 20,730 × 3
##    playerID  performance     n
##    <chr>           <dbl> <int>
##  1 aardsda01      0          4
##  2 aaronha01      0.305  12364
##  3 aaronto01      0.229    944
##  4 aasedo01       0          5
##  5 abadan01       0.0952    21
##  6 abadfe01       0.111      9
##  7 abadijo01      0.224     49
##  8 abbated01      0.254   3044
##  9 abbeybe01      0.169    225
## 10 abbeych01      0.281   1756
## # ℹ 20,720 more rows

batters |> 
  filter(n > 100) |> 
  ggplot(aes(x = n, y = performance)) +
  geom_point(alpha = 1 / 10) +
  geom_smooth(se = FALSE) +
  labs(title = "Baseball Players")

## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

batters |> 
  arrange(desc(performance))

## # A tibble: 20,730 × 3
##    playerID  performance     n
##    <chr>           <dbl> <int>
##  1 abramge01           1     1
##  2 alberan01           1     1
##  3 banisje01           1     1
##  4 bartocl01           1     1
##  5 bassdo01            1     1
##  6 birasst01           1     2
##  7 bruneju01           1     1
##  8 burnscb01           1     1
##  9 cammaer01           1     1
## 10 campsh01            1     1
## # ℹ 20,720 more rows