Import data

flights
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

Filter rows

filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 832 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
filter(flights, month == 1 & day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 832 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
filter(flights, month == 1 | day == 1)
## # A tibble: 37,198 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013     1     1      517        515       2     830     819      11 UA     
##  2  2013     1     1      533        529       4     850     830      20 UA     
##  3  2013     1     1      542        540       2     923     850      33 AA     
##  4  2013     1     1      544        545      -1    1004    1022     -18 B6     
##  5  2013     1     1      554        600      -6     812     837     -25 DL     
##  6  2013     1     1      554        558      -4     740     728      12 UA     
##  7  2013     1     1      555        600      -5     913     854      19 B6     
##  8  2013     1     1      557        600      -3     709     723     -14 EV     
##  9  2013     1     1      557        600      -3     838     846      -8 B6     
## 10  2013     1     1      558        600      -2     753     745       8 AA     
## # … with 37,188 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013    11     1        5       2359       6     352     345       7 B6     
##  2  2013    11     1       35       2250     105     123    2356      87 B6     
##  3  2013    11     1      455        500      -5     641     651     -10 US     
##  4  2013    11     1      539        545      -6     856     827      29 UA     
##  5  2013    11     1      542        545      -3     831     855     -24 AA     
##  6  2013    11     1      549        600     -11     912     923     -11 UA     
##  7  2013    11     1      550        600     -10     705     659       6 US     
##  8  2013    11     1      554        600      -6     659     701      -2 US     
##  9  2013    11     1      554        600      -6     826     827      -1 DL     
## 10  2013    11     1      554        600      -6     749     751      -2 DL     
## # … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

Arrange rows

arrange(flights, desc(month), desc(day))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
##    <int> <int> <int>    <int>      <int>   <dbl>   <int>   <int>   <dbl> <chr>  
##  1  2013    12    31       13       2359      14     439     437       2 B6     
##  2  2013    12    31       18       2359      19     449     444       5 DL     
##  3  2013    12    31       26       2245     101     129    2353      96 B6     
##  4  2013    12    31      459        500      -1     655     651       4 US     
##  5  2013    12    31      514        515      -1     814     812       2 UA     
##  6  2013    12    31      549        551      -2     925     900      25 UA     
##  7  2013    12    31      550        600     -10     725     745     -20 AA     
##  8  2013    12    31      552        600      -8     811     826     -15 EV     
##  9  2013    12    31      553        600      -7     741     754     -13 DL     
## 10  2013    12    31      554        550       4    1024    1027      -3 B6     
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>, and abbreviated variable names
## #   ¹​sched_dep_time, ²​dep_delay, ³​arr_time, ⁴​sched_arr_time, ⁵​arr_delay

Select columns

select(flights, year:dep_time)
## # A tibble: 336,776 × 4
##     year month   day dep_time
##    <int> <int> <int>    <int>
##  1  2013     1     1      517
##  2  2013     1     1      533
##  3  2013     1     1      542
##  4  2013     1     1      544
##  5  2013     1     1      554
##  6  2013     1     1      554
##  7  2013     1     1      555
##  8  2013     1     1      557
##  9  2013     1     1      557
## 10  2013     1     1      558
## # … with 336,766 more rows
select(flights, year, month, day, dep_time)
## # A tibble: 336,776 × 4
##     year month   day dep_time
##    <int> <int> <int>    <int>
##  1  2013     1     1      517
##  2  2013     1     1      533
##  3  2013     1     1      542
##  4  2013     1     1      544
##  5  2013     1     1      554
##  6  2013     1     1      554
##  7  2013     1     1      555
##  8  2013     1     1      557
##  9  2013     1     1      557
## 10  2013     1     1      558
## # … with 336,766 more rows
select(flights, year, month, day, dep_time, dep_delay)
## # A tibble: 336,776 × 5
##     year month   day dep_time dep_delay
##    <int> <int> <int>    <int>     <dbl>
##  1  2013     1     1      517         2
##  2  2013     1     1      533         4
##  3  2013     1     1      542         2
##  4  2013     1     1      544        -1
##  5  2013     1     1      554        -6
##  6  2013     1     1      554        -4
##  7  2013     1     1      555        -5
##  8  2013     1     1      557        -3
##  9  2013     1     1      557        -3
## 10  2013     1     1      558        -2
## # … with 336,766 more rows
select(flights, year, month, day, starts_with("dep"))
## # A tibble: 336,776 × 5
##     year month   day dep_time dep_delay
##    <int> <int> <int>    <int>     <dbl>
##  1  2013     1     1      517         2
##  2  2013     1     1      533         4
##  3  2013     1     1      542         2
##  4  2013     1     1      544        -1
##  5  2013     1     1      554        -6
##  6  2013     1     1      554        -4
##  7  2013     1     1      555        -5
##  8  2013     1     1      557        -3
##  9  2013     1     1      557        -3
## 10  2013     1     1      558        -2
## # … with 336,766 more rows
select(flights, year, month, day, contains("time"))
## # A tibble: 336,776 × 9
##     year month   day dep_t…¹ sched…² arr_t…³ sched…⁴ air_t…⁵ time_hour          
##    <int> <int> <int>   <int>   <int>   <int>   <int>   <dbl> <dttm>             
##  1  2013     1     1     517     515     830     819     227 2013-01-01 05:00:00
##  2  2013     1     1     533     529     850     830     227 2013-01-01 05:00:00
##  3  2013     1     1     542     540     923     850     160 2013-01-01 05:00:00
##  4  2013     1     1     544     545    1004    1022     183 2013-01-01 05:00:00
##  5  2013     1     1     554     600     812     837     116 2013-01-01 06:00:00
##  6  2013     1     1     554     558     740     728     150 2013-01-01 05:00:00
##  7  2013     1     1     555     600     913     854     158 2013-01-01 06:00:00
##  8  2013     1     1     557     600     709     723      53 2013-01-01 06:00:00
##  9  2013     1     1     557     600     838     846     140 2013-01-01 06:00:00
## 10  2013     1     1     558     600     753     745     138 2013-01-01 06:00:00
## # … with 336,766 more rows, and abbreviated variable names ¹​dep_time,
## #   ²​sched_dep_time, ³​arr_time, ⁴​sched_arr_time, ⁵​air_time
select(flights, year, month, day, ends_with("time"))
## # A tibble: 336,776 × 8
##     year month   day dep_time sched_dep_time arr_time sched_arr_time air_time
##    <int> <int> <int>    <int>          <int>    <int>          <int>    <dbl>
##  1  2013     1     1      517            515      830            819      227
##  2  2013     1     1      533            529      850            830      227
##  3  2013     1     1      542            540      923            850      160
##  4  2013     1     1      544            545     1004           1022      183
##  5  2013     1     1      554            600      812            837      116
##  6  2013     1     1      554            558      740            728      150
##  7  2013     1     1      555            600      913            854      158
##  8  2013     1     1      557            600      709            723       53
##  9  2013     1     1      557            600      838            846      140
## 10  2013     1     1      558            600      753            745      138
## # … with 336,766 more rows
select(flights, year, month, day, contains("time"), everything())
## # A tibble: 336,776 × 19
##     year month   day dep_t…¹ sched…² arr_t…³ sched…⁴ air_t…⁵ time_hour          
##    <int> <int> <int>   <int>   <int>   <int>   <int>   <dbl> <dttm>             
##  1  2013     1     1     517     515     830     819     227 2013-01-01 05:00:00
##  2  2013     1     1     533     529     850     830     227 2013-01-01 05:00:00
##  3  2013     1     1     542     540     923     850     160 2013-01-01 05:00:00
##  4  2013     1     1     544     545    1004    1022     183 2013-01-01 05:00:00
##  5  2013     1     1     554     600     812     837     116 2013-01-01 06:00:00
##  6  2013     1     1     554     558     740     728     150 2013-01-01 05:00:00
##  7  2013     1     1     555     600     913     854     158 2013-01-01 06:00:00
##  8  2013     1     1     557     600     709     723      53 2013-01-01 06:00:00
##  9  2013     1     1     557     600     838     846     140 2013-01-01 06:00:00
## 10  2013     1     1     558     600     753     745     138 2013-01-01 06:00:00
## # … with 336,766 more rows, 10 more variables: dep_delay <dbl>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## #   dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
## #   variable names ¹​dep_time, ²​sched_dep_time, ³​arr_time, ⁴​sched_arr_time,
## #   ⁵​air_time

Add columns

mutate(flights,
       gain = dep_delay - arr_delay) %>%
   
    # Select year, month, day, and gain
    select(year:day, gain)
## # A tibble: 336,776 × 4
##     year month   day  gain
##    <int> <int> <int> <dbl>
##  1  2013     1     1    -9
##  2  2013     1     1   -16
##  3  2013     1     1   -31
##  4  2013     1     1    17
##  5  2013     1     1    19
##  6  2013     1     1   -16
##  7  2013     1     1   -24
##  8  2013     1     1    11
##  9  2013     1     1     5
## 10  2013     1     1   -10
## # … with 336,766 more rows
# Just keep gain
mutate(flights,
       gain = dep_delay - arr_delay) %>%
   
    # Select year, month, day, and gain
    select(gain)
## # A tibble: 336,776 × 1
##     gain
##    <dbl>
##  1    -9
##  2   -16
##  3   -31
##  4    17
##  5    19
##  6   -16
##  7   -24
##  8    11
##  9     5
## 10   -10
## # … with 336,766 more rows
# alternative using transmute()
transmute(flights,
          gain = dep_delay - arr_delay)
## # A tibble: 336,776 × 1
##     gain
##    <dbl>
##  1    -9
##  2   -16
##  3   -31
##  4    17
##  5    19
##  6   -16
##  7   -24
##  8    11
##  9     5
## 10   -10
## # … with 336,766 more rows
# lag()
select(flights, dep_time) %>%
   
    mutate(dep_time_lag1 = lag(dep_time))
## # A tibble: 336,776 × 2
##    dep_time dep_time_lag1
##       <int>         <int>
##  1      517            NA
##  2      533           517
##  3      542           533
##  4      544           542
##  5      554           544
##  6      554           554
##  7      555           554
##  8      557           555
##  9      557           557
## 10      558           557
## # … with 336,766 more rows
# cumsum()
select(flights, minute) %>%
   
    mutate(minute_cumsum = cumsum(minute))
## # A tibble: 336,776 × 2
##    minute minute_cumsum
##     <dbl>         <dbl>
##  1     15            15
##  2     29            44
##  3     40            84
##  4     45           129
##  5      0           129
##  6     58           187
##  7      0           187
##  8      0           187
##  9      0           187
## 10      0           187
## # … with 336,766 more rows

Summarize by groups