Data101_Project_2

Author

Ask Moystad

PROJECT 2: DATA TRANSFORMATION

Or: Why you should not fly through New York

Appendix

Packages

install.packages("tidyverse")

Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
(as 'lib' is unspecified)

install.packages("nycflights13")

Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.3'
(as 'lib' is unspecified)

Libraries

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(nycflights13)

glimpse(flights)

Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

glimpse(airlines)

Rows: 16
Columns: 2
$ carrier <chr> "9E", "AA", "AS", "B6", "DL", "EV", "F9", "FL", "HA", "MQ", "O…
$ name    <chr> "Endeavor Air Inc.", "American Airlines Inc.", "Alaska Airline…

flights |> filter(month == 3)

# A tibble: 28,834 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     3     1        4           2159       125      318             56
 2  2013     3     1       50           2358        52      526            438
 3  2013     3     1      117           2245       152      223           2354
 4  2013     3     1      454            500        -6      633            648
 5  2013     3     1      505            515       -10      746            810
 6  2013     3     1      521            530        -9      813            827
 7  2013     3     1      537            540        -3      856            850
 8  2013     3     1      541            545        -4     1014           1023
 9  2013     3     1      549            600       -11      639            703
10  2013     3     1      550            600       -10      747            801
# ℹ 28,824 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

flights |> select(month, day, carrier, dep_time) |> filter(month == 5, carrier == "WN")

# A tibble: 1,006 × 4
   month   day carrier dep_time
   <int> <int> <chr>      <int>
 1     5     1 WN           557
 2     5     1 WN           559
 3     5     1 WN           620
 4     5     1 WN           632
 5     5     1 WN           652
 6     5     1 WN           708
 7     5     1 WN           718
 8     5     1 WN           732
 9     5     1 WN           739
10     5     1 WN           926
# ℹ 996 more rows

flights |> select(month, day, carrier, dep_time, dep_delay) |> filter (month %in% c(5,6,7,8,9), carrier == "WN") |> group_by(month) |> summarise(total_flights=n())

# A tibble: 5 × 2
  month total_flights
  <int>         <int>
1     5          1006
2     6          1028
3     7          1076
4     8          1047
5     9          1010

flights |>
  filter(dest == "IAH") |> 
  group_by(year, month, day) |> 
  summarize(
    arr_delay = mean(arr_delay, na.rm = TRUE)
  )

`summarise()` has grouped output by 'year', 'month'. You can override using the
`.groups` argument.

# A tibble: 365 × 4
# Groups:   year, month [12]
    year month   day arr_delay
   <int> <int> <int>     <dbl>
 1  2013     1     1     17.8 
 2  2013     1     2      7   
 3  2013     1     3     18.3 
 4  2013     1     4     -3.2 
 5  2013     1     5     20.2 
 6  2013     1     6      9.28
 7  2013     1     7     -7.74
 8  2013     1     8      7.79
 9  2013     1     9     18.1 
10  2013     1    10      6.68
# ℹ 355 more rows

3.2.5 Exercises

1: In a single pipeline for each condition, find all flights that meet the condition:

> Had an arrival delay of two or more hours

flights |> filter(arr_delay >= 120)

# A tibble: 10,200 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      811            630       101     1047            830
 2  2013     1     1      848           1835       853     1001           1950
 3  2013     1     1      957            733       144     1056            853
 4  2013     1     1     1114            900       134     1447           1222
 5  2013     1     1     1505           1310       115     1638           1431
 6  2013     1     1     1525           1340       105     1831           1626
 7  2013     1     1     1549           1445        64     1912           1656
 8  2013     1     1     1558           1359       119     1718           1515
 9  2013     1     1     1732           1630        62     2028           1825
10  2013     1     1     1803           1620       103     2008           1750
# ℹ 10,190 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

> Flew to Houston (IAH or HOU)

flights |> filter(dest %in% c("IAH","HOU"))

# A tibble: 9,313 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      623            627        -4      933            932
 4  2013     1     1      728            732        -4     1041           1038
 5  2013     1     1      739            739         0     1104           1038
 6  2013     1     1      908            908         0     1228           1219
 7  2013     1     1     1028           1026         2     1350           1339
 8  2013     1     1     1044           1045        -1     1352           1351
 9  2013     1     1     1114            900       134     1447           1222
10  2013     1     1     1205           1200         5     1503           1505
# ℹ 9,303 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

> Were operated by United, American, or Delta.

flights |> filter(carrier %in% c("DL","AA","UA"))

# A tibble: 139,504 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      554            600        -6      812            837
 5  2013     1     1      554            558        -4      740            728
 6  2013     1     1      558            600        -2      753            745
 7  2013     1     1      558            600        -2      924            917
 8  2013     1     1      558            600        -2      923            937
 9  2013     1     1      559            600        -1      941            910
10  2013     1     1      559            600        -1      854            902
# ℹ 139,494 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

> Departed in summer (July, August, and September)

flights |> 
  filter(month %in% c(7,8,9))

# A tibble: 86,326 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     7     1        1           2029       212      236           2359
 2  2013     7     1        2           2359         3      344            344
 3  2013     7     1       29           2245       104      151              1
 4  2013     7     1       43           2130       193      322             14
 5  2013     7     1       44           2150       174      300            100
 6  2013     7     1       46           2051       235      304           2358
 7  2013     7     1       48           2001       287      308           2305
 8  2013     7     1       58           2155       183      335             43
 9  2013     7     1      100           2146       194      327             30
10  2013     7     1      100           2245       135      337            135
# ℹ 86,316 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

> Arrived more than two hours late, but didnt leave late

flights |> 
  filter(arr_delay > 120) |> filter(dep_delay == 0)

# A tibble: 3 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2013    10     7     1350           1350         0     1736           1526
2  2013     5    23     1810           1810         0     2208           2000
3  2013     7     1      905            905         0     1443           1223
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

> Were delayed by at least an hour, but made up over 30 minutes in flight

flights |> 
  filter(arr_delay > 60) |> filter(dep_delay - arr_delay > 30)

# A tibble: 1,066 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1     2205           1720       285       46           2040
 2  2013     1     1     2326           2130       116      131             18
 3  2013     1     3     1503           1221       162     1803           1555
 4  2013     1     3     1839           1700        99     2056           1950
 5  2013     1     3     1941           1759       102     2246           2139
 6  2013     1     3     2257           2000       177       45           2224
 7  2013     1     4     1917           1700       137     2135           1950
 8  2013     1     4     2010           1745       145     2257           2120
 9  2013     1     4     2058           1730       208        2           2110
10  2013     1     4     2100           1920       100     2224           2121
# ℹ 1,056 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

2. UNSOLVED: Sort flights to find the flights with longest departure delays. Find the flights that left earliest in the morning.

3. Sort flights to find the fastest flights

flights |> 
  mutate(
    speed = distance / air_time
  ,.before = 1) -> FlightsOnSpeed

FlightsOnSpeed |> arrange(desc(speed))

# A tibble: 336,776 × 20
   speed  year month   day dep_time sched_dep_time dep_delay arr_time
   <dbl> <int> <int> <int>    <int>          <int>     <dbl>    <int>
 1 11.7   2013     5    25     1709           1700         9     1923
 2 10.8   2013     7     2     1558           1513        45     1745
 3 10.8   2013     5    13     2040           2025        15     2225
 4 10.7   2013     3    23     1914           1910         4     2045
 5  9.86  2013     1    12     1559           1600        -1     1849
 6  9.4   2013    11    17      650            655        -5     1059
 7  9.29  2013     2    21     2355           2358        -3      412
 8  9.27  2013    11    17      759            800        -1     1212
 9  9.24  2013    11    16     2003           1925        38       17
10  9.24  2013    11    16     2349           2359       -10      402
# ℹ 336,766 more rows
# ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
#   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

# Figure out what the units are.

4. Was there a flight on every day of 2013?

flights |> 
  count(month, day, sort = TRUE) |> arrange(n)

# A tibble: 365 × 3
   month   day     n
   <int> <int> <int>
 1    11    28   634
 2    11    29   661
 3     1    19   674
 4    10    12   676
 5     1    26   680
 6     8    31   680
 7     2     2   682
 8     9    28   682
 9     2     9   684
10    10    19   684
# ℹ 355 more rows

Yes there were flights every day

5. Which flights traveled the farthest distance? Which traveled the least distance?

The flights that traveled the furthest were the direct flights to Hanoi, while the shortest distances were covered by the flights to philadelphia

flights |> arrange(desc(distance))

# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      857            900        -3     1516           1530
 2  2013     1     2      909            900         9     1525           1530
 3  2013     1     3      914            900        14     1504           1530
 4  2013     1     4      900            900         0     1516           1530
 5  2013     1     5      858            900        -2     1519           1530
 6  2013     1     6     1019            900        79     1558           1530
 7  2013     1     7     1042            900       102     1620           1530
 8  2013     1     8      901            900         1     1504           1530
 9  2013     1     9      641            900      1301     1242           1530
10  2013     1    10      859            900        -1     1449           1530
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

flights |> arrange((distance))

# A tibble: 336,776 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     7    27       NA            106        NA       NA            245
 2  2013     1     3     2127           2129        -2     2222           2224
 3  2013     1     4     1240           1200        40     1333           1306
 4  2013     1     4     1829           1615       134     1937           1721
 5  2013     1     4     2128           2129        -1     2218           2224
 6  2013     1     5     1155           1200        -5     1241           1306
 7  2013     1     6     2125           2129        -4     2224           2224
 8  2013     1     7     2124           2129        -5     2212           2224
 9  2013     1     8     2127           2130        -3     2304           2225
10  2013     1     9     2126           2129        -3     2217           2224
# ℹ 336,766 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

6. Does it matter what order you used filter() and arrange() if you’re using both? Why/why not? Think about the results and how much work the functions would have to do.

Yes, it might matter, if there is some way to filter the information quickly then the arrange function would use less computing power to sort the remaining results.

3.3.5

2. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.

3. What happens if you specify the name of the same variable multiple times in a select() call?

flights |> select(arr_time, arr_time, arr_time)

# A tibble: 336,776 × 1
   arr_time
      <int>
 1      830
 2      850
 3      923
 4     1004
 5      812
 6      740
 7      913
 8      709
 9      838
10      753
# ℹ 336,766 more rows

4. What does the any_of() function do? Why might it be helpful in conjunction with this vector?

variables <- c("year", "month", "day", "dep_delay", "arr_delay")

it returns true as long as it can identify any of the elements,

flights |> select(variables)

Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
ℹ Please use `all_of()` or `any_of()` instead.
  # Was:
  data %>% select(variables)

  # Now:
  data %>% select(all_of(variables))

See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.

# A tibble: 336,776 × 5
    year month   day dep_delay arr_delay
   <int> <int> <int>     <dbl>     <dbl>
 1  2013     1     1         2        11
 2  2013     1     1         4        20
 3  2013     1     1         2        33
 4  2013     1     1        -1       -18
 5  2013     1     1        -6       -25
 6  2013     1     1        -4        12
 7  2013     1     1        -5        19
 8  2013     1     1        -3       -14
 9  2013     1     1        -3        -8
10  2013     1     1        -2         8
# ℹ 336,766 more rows

flights |> select(any_of(variables))

# A tibble: 336,776 × 5
    year month   day dep_delay arr_delay
   <int> <int> <int>     <dbl>     <dbl>
 1  2013     1     1         2        11
 2  2013     1     1         4        20
 3  2013     1     1         2        33
 4  2013     1     1        -1       -18
 5  2013     1     1        -6       -25
 6  2013     1     1        -4        12
 7  2013     1     1        -5        19
 8  2013     1     1        -3       -14
 9  2013     1     1        -3        -8
10  2013     1     1        -2         8
# ℹ 336,766 more rows

5. Does the result of running the following code surprise you? How do the select helpers deal with upper and lower case by default? How can you change that default?

It did suprise me. One can change the setting by using a select helper called ingore.case.

flights |> select(contains("TIME", ignore.case = FALSE))

# A tibble: 336,776 × 0

6. Rename air_time to air_time_min to indicate units of measurement and move it to the beginning of the data frame.

FlightsOnSpeed |> 
  rename(air_time_min = air_time) |> relocate(air_time_min)

# A tibble: 336,776 × 20
   air_time_min speed  year month   day dep_time sched_dep_time dep_delay
          <dbl> <dbl> <int> <int> <int>    <int>          <int>     <dbl>
 1          227  6.17  2013     1     1      517            515         2
 2          227  6.24  2013     1     1      533            529         4
 3          160  6.81  2013     1     1      542            540         2
 4          183  8.61  2013     1     1      544            545        -1
 5          116  6.57  2013     1     1      554            600        -6
 6          150  4.79  2013     1     1      554            558        -4
 7          158  6.74  2013     1     1      555            600        -5
 8           53  4.32  2013     1     1      557            600        -3
 9          140  6.74  2013     1     1      557            600        -3
10          138  5.31  2013     1     1      558            600        -2
# ℹ 336,766 more rows
# ℹ 12 more variables: arr_time <int>, sched_arr_time <int>, arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

7. Why doesn’t the following work, and what does the error mean?

The issue seems to be that the code is looking first for the tailnumber and then asking to arrange, issue with syntax and appears to be resolved by changing order. This is probably because select filters the data, brings us the tail number set in which there is no arr_delay variable present.

flights |>
  arrange(arr_delay) |> select(tailnum)

# A tibble: 336,776 × 1
   tailnum
   <chr>  
 1 N843VA 
 2 N840VA 
 3 N851UA 
 4 N3KCAA 
 5 N551AS 
 6 N24212 
 7 N3760C 
 8 N806UA 
 9 N805JB 
10 N855VA 
# ℹ 336,766 more rows

3.5.7 Exercises

1. Which carrier has the worst average delays? Challenge: can you disentangle the effects of bad airports vs. bad carriers? Why/why not?

That might take a considerable amount of time. I guess I could using the mean delay

flights |> 
  group_by(carrier) |> 
  summarize(
    avg_delay = mean(dep_delay, na.rm = TRUE)) |> 
  arrange(avg_delay)

# A tibble: 16 × 2
   carrier avg_delay
   <chr>       <dbl>
 1 US           3.78
 2 HA           4.90
 3 AS           5.80
 4 AA           8.59
 5 DL           9.26
 6 MQ          10.6 
 7 UA          12.1 
 8 OO          12.6 
 9 VX          12.9 
10 B6          13.0 
11 9E          16.7 
12 WN          17.7 
13 FL          18.7 
14 YV          19.0 
15 EV          20.0 
16 F9          20.2

However I think this problem might simply take me too long to solve in the way that I originally inteded.

2. Find the flights that are most delayed upon departure from each destination.

flights |> 
  group_by(origin) |> 
  summarize(
    arr_avg_delay = mean(arr_delay, na.rm = TRUE)) |> 
  arrange(arr_avg_delay)

# A tibble: 3 × 2
  origin arr_avg_delay
  <chr>          <dbl>
1 JFK             5.55
2 LGA             5.78
3 EWR             9.11

3. How do delays vary over the course of the day. Illustrate your answer with a plot.

flights |> summarise(hour)

Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
dplyr 1.1.0.
ℹ Please use `reframe()` instead.
ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
  always returns an ungrouped data frame and adjust accordingly.

# A tibble: 336,776 × 1
    hour
   <dbl>
 1     5
 2     5
 3     5
 4     5
 5     6
 6     5
 7     6
 8     6
 9     6
10     6
# ℹ 336,766 more rows

avg_fuckups_by_hour <- flights |> 
  group_by(hour) |> 
  summarize(
    arr_avg_delay = mean(dep_delay, na.rm = TRUE)) |> 
  arrange(arr_avg_delay)

ggplot(avg_fuckups_by_hour, aes(x = hour, y = arr_avg_delay)) + 
    geom_point(na.rm = TRUE) +
    labs(title = "Average Departure Delay by Hour of Day",
       y = "Average Departure Delay (minutes)",
       x = "Hour of Day") + 
    scale_x_continuous(breaks = seq(0, 25, 1))

4. What happens if you supply a negative n to slice_min() and friends?

flights |> 
  group_by(dest) |> 
  slice_min(arr_delay, n = -10 ) |>
  relocate(dest)

# A tibble: 336,509 × 19
# Groups:   dest [101]
   dest   year month   day dep_time sched_dep_time dep_delay arr_time
   <chr> <int> <int> <int>    <int>          <int>     <dbl>    <int>
 1 ABQ    2013    11    13     1953           2000        -7     2202
 2 ABQ    2013     5    14     1958           2001        -3     2210
 3 ABQ    2013     6     4     1957           2001        -4     2212
 4 ABQ    2013     5     4     1956           2001        -5     2213
 5 ABQ    2013     8    25     1959           2007        -8     2204
 6 ABQ    2013     8    24     2001           2007        -6     2207
 7 ABQ    2013     8    27     1959           2007        -8     2208
 8 ABQ    2013     6    15     1954           2001        -7     2219
 9 ABQ    2013     5    17     1955           2001        -6     2223
10 ABQ    2013     7    12     2005           2007        -2     2214
# ℹ 336,499 more rows
# ℹ 11 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
#   flight <int>, tailnum <chr>, origin <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

This was hard to see, and i didn’t notice what was happening in the data. But looking it up i found that “A negative value of n or prop will be subtracted from the group size. For example, n = -2 with a group of 5 rows will select 5 - 2 = 3 rows; prop = -0.25 with 8 rows will select 8 * (1 - 0.25) = 6 rows.”

5. Explain what count() does in terms of the dplyr verbs you just learned. What does the sort argument to count() do?

It lets me create a list of all the unique variables and then count the number of times they crop up. For example:

flights |> count(hour)

# A tibble: 20 × 2
    hour     n
   <dbl> <int>
 1     1     1
 2     5  1953
 3     6 25951
 4     7 22821
 5     8 27242
 6     9 20312
 7    10 16708
 8    11 16033
 9    12 18181
10    13 19956
11    14 21706
12    15 23888
13    16 23002
14    17 24426
15    18 21783
16    19 21441
17    20 16739
18    21 10933
19    22  2639
20    23  1061

6. Suppose we have the following tiny data frame:

df <- tibble(
  x = 1:5,
  y = c("a", "b", "a", "a", "b"),
  z = c("K", "K", "L", "L", "K")
)

I think it will combine these two vectors, putting them next to each other without changing the order. I assume that x will show up as the integers 1 to 5, creating a numbered table.

a. Write down what you think the output will look like, then check if you were correct, and describe what group_by() does.

df |>
  group_by(y)

# A tibble: 5 × 3
# Groups:   y [2]
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     2 b     K    
3     3 a     L    
4     4 a     L    
5     5 b     K

I assume what the group_by() function has done is to create this as a “grouped by y” table, and that operations will now work by “y”. Which may be important to the rest of the arguments used on the data.

b. Write down what you think the output will look like, then check if you were correct, and describe what arrange() does. Also comment on how it’s different from the group_by() in part (a)?

I did not know what it would look like, however I should have guessed that it would arrange the values of y alphabetically and then it created a table, all the values are therefore arranged according to their relation to y.

df |>
  arrange(y)

# A tibble: 5 × 3
      x y     z    
  <int> <chr> <chr>
1     1 a     K    
2     3 a     L    
3     4 a     L    
4     2 b     K    
5     5 b     K

c. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does.

This function will first create a table and group them by the column values of the y. It will then create a mean from the integers in the column of x and the summarize function will show the means against the variables in y.

df |>
  group_by(y) |>
  summarize(mean_x = mean(x))

# A tibble: 2 × 2
  y     mean_x
  <chr>  <dbl>
1 a       2.67
2 b       3.5

d. Write down what you think the output will look like, then check if you were correct, and describe what the pipeline does. How is the output different from the one in part (d).

First of all it now also groups based on z and so z shows up. What it is doing is grouping according to the three different combinations of x and y values.

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x), .groups = "drop")

# A tibble: 3 × 3
  y     z     mean_x
  <chr> <chr>  <dbl>
1 a     K        1  
2 a     L        3.5
3 b     K        3.5

f. Write down what you think the outputs will look like, then check if you were correct, and describe what each pipeline does. How are the outputs of the two pipelines different?

So I got this warning earlier, and it is because we are not using the last argument that was present in the previous question. namely the .groups = “drop”. The previous argument had not grouped the table by y or z at all, but neither. This table however does still assign y as the group value.

The second table creates a new column through the mutate function and groupes by x and y, however it also keeps the x column.

df |>
  group_by(y, z) |>
  summarize(mean_x = mean(x))

`summarise()` has grouped output by 'y'. You can override using the `.groups`
argument.

# A tibble: 3 × 3
# Groups:   y [2]
  y     z     mean_x
  <chr> <chr>  <dbl>
1 a     K        1  
2 a     L        3.5
3 b     K        3.5

df |>
  group_by(y, z) |>
  mutate(mean_x = mean(x))

# A tibble: 5 × 4
# Groups:   y, z [3]
      x y     z     mean_x
  <int> <chr> <chr>  <dbl>
1     1 a     K        1  
2     2 b     K        3.5
3     3 a     L        3.5
4     4 a     L        3.5
5     5 b     K        3.5