Sample codes from R4DS.

Load the required libraries

library(tidyverse)
library(nycflights13)

Peek into data

head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515        2.      830
## 2  2013     1     1      533            529        4.      850
## 3  2013     1     1      542            540        2.      923
## 4  2013     1     1      544            545       -1.     1004
## 5  2013     1     1      554            600       -6.      812
## 6  2013     1     1      554            558       -4.      740
## # ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>

Lets filter just the not-cancelled flights i.e remove rows with missing dep_delay or arry_delay

not_cancelled <- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

Whats the mean arrival delay for flights not cancelled grouped by year, month & day ?

not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    avg_delay1=mean(arr_delay), 
    avg_delay2=mean(arr_delay[arr_delay > 0])
  )
## # A tibble: 365 x 5
## # Groups:   year, month [?]
##     year month   day avg_delay1 avg_delay2
##    <int> <int> <int>      <dbl>      <dbl>
##  1  2013     1     1     12.7         32.5
##  2  2013     1     2     12.7         32.0
##  3  2013     1     3      5.73        27.7
##  4  2013     1     4     -1.93        28.3
##  5  2013     1     5     -1.53        22.6
##  6  2013     1     6      4.24        24.4
##  7  2013     1     7     -4.95        27.8
##  8  2013     1     8     -3.23        20.8
##  9  2013     1     9     -0.264       25.6
## 10  2013     1    10     -5.90        27.3
## # ... with 355 more rows

Whats the departure time for first and last flight? grouped by Y/M/D

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )
## # A tibble: 365 x 5
## # Groups:   year, month [?]
##     year month   day first  last
##    <int> <int> <int> <dbl> <dbl>
##  1  2013     1     1  517. 2356.
##  2  2013     1     2   42. 2354.
##  3  2013     1     3   32. 2349.
##  4  2013     1     4   25. 2358.
##  5  2013     1     5   14. 2357.
##  6  2013     1     6   16. 2355.
##  7  2013     1     7   49. 2359.
##  8  2013     1     8  454. 2351.
##  9  2013     1     9    2. 2252.
## 10  2013     1    10    3. 2320.
## # ... with 355 more rows

Or the same can be achieved using ‘first’ and ‘last’ instead of min and max

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time)
  )
## # A tibble: 365 x 5
## # Groups:   year, month [?]
##     year month   day first_dep last_dep
##    <int> <int> <int>     <int>    <int>
##  1  2013     1     1       517     2356
##  2  2013     1     2        42     2354
##  3  2013     1     3        32     2349
##  4  2013     1     4        25     2358
##  5  2013     1     5        14     2357
##  6  2013     1     6        16     2355
##  7  2013     1     7        49     2359
##  8  2013     1     8       454     2351
##  9  2013     1     9         2     2252
## 10  2013     1    10         3     2320
## # ... with 355 more rows

Or using the min_rank() and range() utility functions

not_cancelled %>% 
  group_by(year, month, day) %>% 
  mutate(r = min_rank(desc(dep_time))) %>% 
  filter(r %in% range(r))
## # A tibble: 770 x 20
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515        2.      830
##  2  2013     1     1     2356           2359       -3.      425
##  3  2013     1     2       42           2359       43.      518
##  4  2013     1     2     2354           2359       -5.      413
##  5  2013     1     3       32           2359       33.      504
##  6  2013     1     3     2349           2359      -10.      434
##  7  2013     1     4       25           2359       26.      505
##  8  2013     1     4     2358           2359       -1.      429
##  9  2013     1     4     2358           2359       -1.      436
## 10  2013     1     5       14           2359       15.      503
## # ... with 760 more rows, and 13 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, r <int>
head(with(not_cancelled, min_rank(desc(dep_time))))
## [1] 325665 325407 324801 324665 322407 322407
nc <- not_cancelled %>% 
  group_by(year, month, day) %>% 
  mutate(r = min_rank(desc(dep_time))) %>% 
  filter(r %in% range(r))
head(nc)
## # A tibble: 6 x 20
## # Groups:   year, month, day [3]
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515        2.      830
## 2  2013     1     1     2356           2359       -3.      425
## 3  2013     1     2       42           2359       43.      518
## 4  2013     1     2     2354           2359       -5.      413
## 5  2013     1     3       32           2359       33.      504
## 6  2013     1     3     2349           2359      -10.      434
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, r <int>

As range() picks the min and max from min_rank() output, min_rank(desc(dep_time)) is not required, just min_rank(dep_time) will do

nc2 <- not_cancelled %>% 
  group_by(year, month, day) %>% 
  mutate(r = min_rank(dep_time)) %>% 
  filter(r %in% range(r))
head(nc2)
## # A tibble: 6 x 20
## # Groups:   year, month, day [3]
##    year month   day dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>
## 1  2013     1     1      517            515        2.      830
## 2  2013     1     1     2356           2359       -3.      425
## 3  2013     1     2       42           2359       43.      518
## 4  2013     1     2     2354           2359       -5.      413
## 5  2013     1     3       32           2359       33.      504
## 6  2013     1     3     2349           2359      -10.      434
## # ... with 13 more variables: sched_arr_time <int>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
## #   time_hour <dttm>, r <int>

Bring the cols: year, month, day, dep_time, r to front and display the remaining afterwards

select(nc, year, month, day, dep_time, r, everything())
## # A tibble: 770 x 20
## # Groups:   year, month, day [365]
##     year month   day dep_time     r sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int> <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517   831            515        2.      830
##  2  2013     1     1     2356     1           2359       -3.      425
##  3  2013     1     2       42   928           2359       43.      518
##  4  2013     1     2     2354     1           2359       -5.      413
##  5  2013     1     3       32   900           2359       33.      504
##  6  2013     1     3     2349     1           2359      -10.      434
##  7  2013     1     4       25   908           2359       26.      505
##  8  2013     1     4     2358     1           2359       -1.      429
##  9  2013     1     4     2358     1           2359       -1.      436
## 10  2013     1     5       14   717           2359       15.      503
## # ... with 760 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
select(nc2, year, month, day, dep_time, r, everything())
## # A tibble: 770 x 20
## # Groups:   year, month, day [365]
##     year month   day dep_time     r sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int> <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517     1            515        2.      830
##  2  2013     1     1     2356   831           2359       -3.      425
##  3  2013     1     2       42     1           2359       43.      518
##  4  2013     1     2     2354   928           2359       -5.      413
##  5  2013     1     3       32     1           2359       33.      504
##  6  2013     1     3     2349   900           2359      -10.      434
##  7  2013     1     4       25     1           2359       26.      505
##  8  2013     1     4     2358   907           2359       -1.      429
##  9  2013     1     4     2358   907           2359       -1.      436
## 10  2013     1     5       14     1           2359       15.      503
## # ... with 760 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>

How many distinct carriers are available for a given dest?

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(
    first_dep = first(dep_time), 
    last_dep = last(dep_time)
  )
## # A tibble: 365 x 5
## # Groups:   year, month [?]
##     year month   day first_dep last_dep
##    <int> <int> <int>     <int>    <int>
##  1  2013     1     1       517     2356
##  2  2013     1     2        42     2354
##  3  2013     1     3        32     2349
##  4  2013     1     4        25     2358
##  5  2013     1     5        14     2357
##  6  2013     1     6        16     2355
##  7  2013     1     7        49     2359
##  8  2013     1     8       454     2351
##  9  2013     1     9         2     2252
## 10  2013     1    10         3     2320
## # ... with 355 more rows
not_cancelled %>% 
  group_by(dest) %>% 
  summarise(carriers = n_distinct(carrier)) %>% 
  arrange(desc(carriers))
## # A tibble: 104 x 2
##    dest  carriers
##    <chr>    <int>
##  1 ATL          7
##  2 BOS          7
##  3 CLT          7
##  4 ORD          7
##  5 TPA          7
##  6 AUS          6
##  7 DCA          6
##  8 DTW          6
##  9 IAD          6
## 10 MSP          6
## # ... with 94 more rows

How many destinations in total?

not_cancelled %>% 
  count(dest)
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows
not_cancelled %>% 
  count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
##    tailnum       n
##    <chr>     <dbl>
##  1 D942DN    3418.
##  2 N0EGMQ  239143.
##  3 N10156  109664.
##  4 N102UW   25722.
##  5 N103US   24619.
##  6 N104UW   24616.
##  7 N10575  139903.
##  8 N105UW   23618.
##  9 N107US   21677.
## 10 N108UW   32070.
## # ... with 4,027 more rows

Or this way

not_cancelled %>%
  group_by(tailnum) %>%
  summarise(dist= sum(distance))
## # A tibble: 4,037 x 2
##    tailnum    dist
##    <chr>     <dbl>
##  1 D942DN    3418.
##  2 N0EGMQ  239143.
##  3 N10156  109664.
##  4 N102UW   25722.
##  5 N103US   24619.
##  6 N104UW   24616.
##  7 N10575  139903.
##  8 N105UW   23618.
##  9 N107US   21677.
## 10 N108UW   32070.
## # ... with 4,027 more rows

How many flights left before 5am? (these usually indicate delayed flights from the previous day) grouped by Y/M/D

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(n_early = sum(dep_time < 500))
## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day n_early
##    <int> <int> <int>   <int>
##  1  2013     1     1       0
##  2  2013     1     2       3
##  3  2013     1     3       4
##  4  2013     1     4       3
##  5  2013     1     5       3
##  6  2013     1     6       2
##  7  2013     1     7       2
##  8  2013     1     8       1
##  9  2013     1     9       3
## 10  2013     1    10       3
## # ... with 355 more rows

Or if you want just the total count

not_cancelled %>%
  summarise(n_early=sum(dep_time < 500))
## # A tibble: 1 x 1
##   n_early
##     <int>
## 1    1476

What proportion of flights are delayed by more than an hour?

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(hour_perc = mean(arr_delay > 60))
## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day hour_perc
##    <int> <int> <int>     <dbl>
##  1  2013     1     1    0.0722
##  2  2013     1     2    0.0851
##  3  2013     1     3    0.0567
##  4  2013     1     4    0.0396
##  5  2013     1     5    0.0349
##  6  2013     1     6    0.0470
##  7  2013     1     7    0.0333
##  8  2013     1     8    0.0213
##  9  2013     1     9    0.0202
## 10  2013     1    10    0.0183
## # ... with 355 more rows

by year, month

not_cancelled %>%
  group_by(year, month) %>%
  summarise(hr_perc= mean(arr_delay > 60))
## # A tibble: 12 x 3
## # Groups:   year [?]
##     year month hr_perc
##    <int> <int>   <dbl>
##  1  2013     1  0.0705
##  2  2013     2  0.0689
##  3  2013     3  0.0837
##  4  2013     4  0.102 
##  5  2013     5  0.0795
##  6  2013     6  0.142 
##  7  2013     7  0.145 
##  8  2013     8  0.0835
##  9  2013     9  0.0476
## 10  2013    10  0.0451
## 11  2013    11  0.0415
## 12  2013    12  0.107

Total flight count daily

daily <- group_by(flights, year, month, day)

flight count for each day, month , year

(per_day   <- summarise(daily, flights = n()))
## # A tibble: 365 x 4
## # Groups:   year, month [?]
##     year month   day flights
##    <int> <int> <int>   <int>
##  1  2013     1     1     842
##  2  2013     1     2     943
##  3  2013     1     3     914
##  4  2013     1     4     915
##  5  2013     1     5     720
##  6  2013     1     6     832
##  7  2013     1     7     933
##  8  2013     1     8     899
##  9  2013     1     9     902
## 10  2013     1    10     932
## # ... with 355 more rows
(per_month <- summarise(per_day, flights = sum(flights)))
## # A tibble: 12 x 3
## # Groups:   year [?]
##     year month flights
##    <int> <int>   <int>
##  1  2013     1   27004
##  2  2013     2   24951
##  3  2013     3   28834
##  4  2013     4   28330
##  5  2013     5   28796
##  6  2013     6   28243
##  7  2013     7   29425
##  8  2013     8   29327
##  9  2013     9   27574
## 10  2013    10   28889
## 11  2013    11   27268
## 12  2013    12   28135
(per_year  <- summarise(per_month, flights = sum(flights)))
## # A tibble: 1 x 2
##    year flights
##   <int>   <int>
## 1  2013  336776

Flight count for each destinatio

not_cancelled %>% count(dest)
## # A tibble: 104 x 2
##    dest      n
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows

or

not_cancelled %>%
  group_by(dest) %>%
  summarise(N=n())
## # A tibble: 104 x 2
##    dest      N
##    <chr> <int>
##  1 ABQ     254
##  2 ACK     264
##  3 ALB     418
##  4 ANC       8
##  5 ATL   16837
##  6 AUS    2411
##  7 AVL     261
##  8 BDL     412
##  9 BGR     358
## 10 BHM     269
## # ... with 94 more rows

Total distance covered for each tailnum using count() and weight ‘wt’ parameter

not_cancelled %>% count(tailnum, wt = distance)
## # A tibble: 4,037 x 2
##    tailnum       n
##    <chr>     <dbl>
##  1 D942DN    3418.
##  2 N0EGMQ  239143.
##  3 N10156  109664.
##  4 N102UW   25722.
##  5 N103US   24619.
##  6 N104UW   24616.
##  7 N10575  139903.
##  8 N105UW   23618.
##  9 N107US   21677.
## 10 N108UW   32070.
## # ... with 4,027 more rows

or

not_cancelled %>%
  group_by(tailnum) %>%
  summarise(sum(distance))
## # A tibble: 4,037 x 2
##    tailnum `sum(distance)`
##    <chr>             <dbl>
##  1 D942DN            3418.
##  2 N0EGMQ          239143.
##  3 N10156          109664.
##  4 N102UW           25722.
##  5 N103US           24619.
##  6 N104UW           24616.
##  7 N10575          139903.
##  8 N105UW           23618.
##  9 N107US           21677.
## 10 N108UW           32070.
## # ... with 4,027 more rows

Count tailnum

not_cancelled %>%
  count(tailnum)
## # A tibble: 4,037 x 2
##    tailnum     n
##    <chr>   <int>
##  1 D942DN      4
##  2 N0EGMQ    352
##  3 N10156    145
##  4 N102UW     48
##  5 N103US     46
##  6 N104UW     46
##  7 N10575    269
##  8 N105UW     45
##  9 N107US     41
## 10 N108UW     60
## # ... with 4,027 more rows

Total number of fights by carrier for each destination

flights %>% 
  group_by(carrier, dest) %>% 
  summarise(N=n())
## # A tibble: 314 x 3
## # Groups:   carrier [?]
##    carrier dest      N
##    <chr>   <chr> <int>
##  1 9E      ATL      59
##  2 9E      AUS       2
##  3 9E      AVL      10
##  4 9E      BGR       1
##  5 9E      BNA     474
##  6 9E      BOS     914
##  7 9E      BTV       2
##  8 9E      BUF     833
##  9 9E      BWI     856
## 10 9E      CAE       3
## # ... with 304 more rows