Chapter 13 1 Chapter 13 - Numbers

library(tidyverse)

Warning: package 'ggplot2' was built under R version 4.3.3

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.0     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(nycflights13)

Warning: package 'nycflights13' was built under R version 4.3.3

flights %>% glimpse()

Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…

1.1 13.3.1 Exercises

How can you use count() to count the number of rows with a missing value for a given variable?

flights %>% 
  count(is.na(dep_time))

# A tibble: 2 × 2
  `is.na(dep_time)`      n
  <lgl>              <int>
1 FALSE             328521
2 TRUE                8255

Expand the following calls to count() to instead use group_by(), summarize(), and arrange():

a. flights |> count(dest, sort = TRUE)

flights |> count(dest, sort = TRUE)

# A tibble: 105 × 2
   dest      n
   <chr> <int>
 1 ORD   17283
 2 ATL   17215
 3 LAX   16174
 4 BOS   15508
 5 MCO   14082
 6 CLT   14064
 7 SFO   13331
 8 FLL   12055
 9 MIA   11728
10 DCA    9705
# ℹ 95 more rows

Check with revised code

flights %>% 
  group_by(dest) %>% 
  summarize(dest1 = n()) %>% 
  arrange(desc(dest1))

# A tibble: 105 × 2
   dest  dest1
   <chr> <int>
 1 ORD   17283
 2 ATL   17215
 3 LAX   16174
 4 BOS   15508
 5 MCO   14082
 6 CLT   14064
 7 SFO   13331
 8 FLL   12055
 9 MIA   11728
10 DCA    9705
# ℹ 95 more rows

flights |> count(tailnum, wt = distance)

flights |> count(tailnum, wt = distance)

# A tibble: 4,044 × 2
   tailnum      n
   <chr>    <dbl>
 1 D942DN    3418
 2 N0EGMQ  250866
 3 N10156  115966
 4 N102UW   25722
 5 N103US   24619
 6 N104UW   25157
 7 N10575  150194
 8 N105UW   23618
 9 N107US   21677
10 N108UW   32070
# ℹ 4,034 more rows

Check with revised code

flights %>% 
  group_by(tailnum) %>% 
  summarize(dist_travel= sum(distance))

# A tibble: 4,044 × 2
   tailnum dist_travel
   <chr>         <dbl>
 1 D942DN         3418
 2 N0EGMQ       250866
 3 N10156       115966
 4 N102UW        25722
 5 N103US        24619
 6 N104UW        25157
 7 N10575       150194
 8 N105UW        23618
 9 N107US        21677
10 N108UW        32070
# ℹ 4,034 more rows

1.2 Flight example for data of only two month (Jan and Feb) using filter

flight1 = flights |> 
  filter(month ==1 | month == 2)

flight1

# A tibble: 51,955 × 19
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      542            540         2      923            850
 4  2013     1     1      544            545        -1     1004           1022
 5  2013     1     1      554            600        -6      812            837
 6  2013     1     1      554            558        -4      740            728
 7  2013     1     1      555            600        -5      913            854
 8  2013     1     1      557            600        -3      709            723
 9  2013     1     1      557            600        -3      838            846
10  2013     1     1      558            600        -2      753            745
# ℹ 51,945 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

see the table now is 51,955 rows and 19 columns

1.3 Exercse-2, 13.4.8

Round dep_time and arr_time to the nearest five minutes.

grouping into hours and minutes

flights |> 
  mutate(
    hour = dep_time %/% 100,
    minute = dep_time %% 100,
        .keep = "used"
  )

# A tibble: 336,776 × 3
   dep_time  hour minute
      <int> <dbl>  <dbl>
 1      517     5     17
 2      533     5     33
 3      542     5     42
 4      544     5     44
 5      554     5     54
 6      554     5     54
 7      555     5     55
 8      557     5     57
 9      557     5     57
10      558     5     58
# ℹ 336,766 more rows

Changing the minutes to the nearest 5 minutes

flights |> 
  mutate(
    hour = dep_time %/% 100,
    minute = dep_time %% 100,
    minute = round (minute/5)*5,
    .keep = "used"
  )

# A tibble: 336,776 × 3
   dep_time  hour minute
      <int> <dbl>  <dbl>
 1      517     5     15
 2      533     5     35
 3      542     5     40
 4      544     5     45
 5      554     5     55
 6      554     5     55
 7      555     5     55
 8      557     5     55
 9      557     5     55
10      558     5     60
# ℹ 336,766 more rows

keeping in the Orginal way

flights |> 
  mutate(
    hour = dep_time %/% 100,
    minute = dep_time %% 100,
    minute = round (minute/5)*5,
    dep_time_new= ((hour*100) + minute),
    .keep = "used"
  )

# A tibble: 336,776 × 4
   dep_time  hour minute dep_time_new
      <int> <dbl>  <dbl>        <dbl>
 1      517     5     15          515
 2      533     5     35          535
 3      542     5     40          540
 4      544     5     45          545
 5      554     5     55          555
 6      554     5     55          555
 7      555     5     55          555
 8      557     5     55          555
 9      557     5     55          555
10      558     5     60          560
# ℹ 336,766 more rows

Other way of doing the same thing

flights |> 
  mutate(
    dep_time_new= round (dep_time/5)*5,
    .keep = "used"
  )

# A tibble: 336,776 × 2
   dep_time dep_time_new
      <int>        <dbl>
 1      517          515
 2      533          535
 3      542          540
 4      544          545
 5      554          555
 6      554          555
 7      555          555
 8      557          555
 9      557          555
10      558          560
# ℹ 336,766 more rows

1.4 Exercise 3 , 13.5.4

Find the 10 most delayed flights using a ranking function. How do you want to handle ties? Carefully read the documentation for min_rank().

flights |> 
  group_by(flight) %>% 
  summarize(delay= sum(dep_delay)) %>% 
  arrange (desc(delay)) %>% 
  mutate(rank = min_rank(desc(delay)))

# A tibble: 3,844 × 3
   flight delay  rank
    <int> <dbl> <int>
 1   1373  7055     1
 2    580  7038     2
 3    141  6571     3
 4    201  6252     4
 5   1295  5707     5
 6   1091  4767     6
 7   1680  4705     7
 8   1284  4008     8
 9   1105  3966     9
10    263  3955    10
# ℹ 3,834 more rows

To deal with ties

flights |> 
  group_by(flight) %>% 
  summarize(delay= sum(dep_delay)) %>% 
  arrange (desc(delay)) %>% 
  mutate(rank = dense_rank(desc(delay))) %>% 
  top_n(-10, rank) # for top ten ranks ( 1 to 10), -10 is for top 10 less values

# A tibble: 10 × 3
   flight delay  rank
    <int> <dbl> <int>
 1   1373  7055     1
 2    580  7038     2
 3    141  6571     3
 4    201  6252     4
 5   1295  5707     5
 6   1091  4767     6
 7   1680  4705     7
 8   1284  4008     8
 9   1105  3966     9
10    263  3955    10