Import data

flights
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Filter rows

filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month ==12, day ==25)
## # A tibble: 719 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12    25      456            500        -4      649            651
##  2  2013    12    25      524            515         9      805            814
##  3  2013    12    25      542            540         2      832            850
##  4  2013    12    25      546            550        -4     1022           1027
##  5  2013    12    25      556            600        -4      730            745
##  6  2013    12    25      557            600        -3      743            752
##  7  2013    12    25      557            600        -3      818            831
##  8  2013    12    25      559            600        -1      855            856
##  9  2013    12    25      559            600        -1      849            855
## 10  2013    12    25      600            600         0      850            846
## # ℹ 709 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    11     1        5           2359         6      352            345
##  2  2013    11     1       35           2250       105      123           2356
##  3  2013    11     1      455            500        -5      641            651
##  4  2013    11     1      539            545        -6      856            827
##  5  2013    11     1      542            545        -3      831            855
##  6  2013    11     1      549            600       -11      912            923
##  7  2013    11     1      550            600       -10      705            659
##  8  2013    11     1      554            600        -6      659            701
##  9  2013    11     1      554            600        -6      826            827
## 10  2013    11     1      554            600        -6      749            751
## # ℹ 55,393 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, !(arr_delay > 120 | dep_delay > 120))
## # A tibble: 316,050 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 316,040 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, carrier == "UA")
## # A tibble: 58,665 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      558            600        -2      924            917
##  5  2013     1     1      558            600        -2      923            937
##  6  2013     1     1      559            600        -1      854            902
##  7  2013     1     1      607            607         0      858            915
##  8  2013     1     1      611            600        11      945            931
##  9  2013     1     1      623            627        -4      933            932
## 10  2013     1     1      628            630        -2     1016            947
## # ℹ 58,655 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dest == "IAH", dest == "HOU")
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month == 7, month == 8, month == 9)
## # A tibble: 0 × 19
## # ℹ 19 variables: year <int>, month <int>, day <int>, dep_time <int>,
## #   sched_dep_time <int>, dep_delay <dbl>, arr_time <int>,
## #   sched_arr_time <int>, arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Arrange rows

arrange(flights, desc(month), desc(day))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12    31       13           2359        14      439            437
##  2  2013    12    31       18           2359        19      449            444
##  3  2013    12    31       26           2245       101      129           2353
##  4  2013    12    31      459            500        -1      655            651
##  5  2013    12    31      514            515        -1      814            812
##  6  2013    12    31      549            551        -2      925            900
##  7  2013    12    31      550            600       -10      725            745
##  8  2013    12    31      552            600        -8      811            826
##  9  2013    12    31      553            600        -7      741            754
## 10  2013    12    31      554            550         4     1024           1027
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, desc(dep_delay), desc(arr_delay))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     6    27      959           1900       899     1236           2226
##  9  2013     7    22     2257            759       898      121           1026
## 10  2013    12     5      756           1700       896     1058           2020
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
arrange(flights, desc(air_time))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     3    17     1337           1335         2     1937           1836
##  2  2013     2     6      853            900        -7     1542           1540
##  3  2013     3    15     1001           1000         1     1551           1530
##  4  2013     3    17     1006           1000         6     1607           1530
##  5  2013     3    16     1001           1000         1     1544           1530
##  6  2013     2     5      900            900         0     1555           1540
##  7  2013    11    12      936            930         6     1630           1530
##  8  2013     3    14      958           1000        -2     1542           1530
##  9  2013    11    20     1006           1000         6     1639           1555
## 10  2013     3    15     1342           1335         7     1924           1836
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Select columns

select(flights, year:dep_time)
## # A tibble: 336,776 × 4
##     year month   day dep_time
##    <int> <int> <int>    <int>
##  1  2013     1     1      517
##  2  2013     1     1      533
##  3  2013     1     1      542
##  4  2013     1     1      544
##  5  2013     1     1      554
##  6  2013     1     1      554
##  7  2013     1     1      555
##  8  2013     1     1      557
##  9  2013     1     1      557
## 10  2013     1     1      558
## # ℹ 336,766 more rows
select(flights, year, month, day, dep_time)
## # A tibble: 336,776 × 4
##     year month   day dep_time
##    <int> <int> <int>    <int>
##  1  2013     1     1      517
##  2  2013     1     1      533
##  3  2013     1     1      542
##  4  2013     1     1      544
##  5  2013     1     1      554
##  6  2013     1     1      554
##  7  2013     1     1      555
##  8  2013     1     1      557
##  9  2013     1     1      557
## 10  2013     1     1      558
## # ℹ 336,766 more rows
select(flights, year, month, day, dep_time, dep_delay)
## # A tibble: 336,776 × 5
##     year month   day dep_time dep_delay
##    <int> <int> <int>    <int>     <dbl>
##  1  2013     1     1      517         2
##  2  2013     1     1      533         4
##  3  2013     1     1      542         2
##  4  2013     1     1      544        -1
##  5  2013     1     1      554        -6
##  6  2013     1     1      554        -4
##  7  2013     1     1      555        -5
##  8  2013     1     1      557        -3
##  9  2013     1     1      557        -3
## 10  2013     1     1      558        -2
## # ℹ 336,766 more rows
select(flights, year, month, day, starts_with("dep"))
## # A tibble: 336,776 × 5
##     year month   day dep_time dep_delay
##    <int> <int> <int>    <int>     <dbl>
##  1  2013     1     1      517         2
##  2  2013     1     1      533         4
##  3  2013     1     1      542         2
##  4  2013     1     1      544        -1
##  5  2013     1     1      554        -6
##  6  2013     1     1      554        -4
##  7  2013     1     1      555        -5
##  8  2013     1     1      557        -3
##  9  2013     1     1      557        -3
## 10  2013     1     1      558        -2
## # ℹ 336,766 more rows
select(flights, year, month, day, contains("time"))
## # A tibble: 336,776 × 9
##     year month   day dep_time sched_dep_time arr_time sched_arr_time air_time
##    <int> <int> <int>    <int>          <int>    <int>          <int>    <dbl>
##  1  2013     1     1      517            515      830            819      227
##  2  2013     1     1      533            529      850            830      227
##  3  2013     1     1      542            540      923            850      160
##  4  2013     1     1      544            545     1004           1022      183
##  5  2013     1     1      554            600      812            837      116
##  6  2013     1     1      554            558      740            728      150
##  7  2013     1     1      555            600      913            854      158
##  8  2013     1     1      557            600      709            723       53
##  9  2013     1     1      557            600      838            846      140
## 10  2013     1     1      558            600      753            745      138
## # ℹ 336,766 more rows
## # ℹ 1 more variable: time_hour <dttm>
select(flights, year, month, day, ends_with("time"))
## # A tibble: 336,776 × 8
##     year month   day dep_time sched_dep_time arr_time sched_arr_time air_time
##    <int> <int> <int>    <int>          <int>    <int>          <int>    <dbl>
##  1  2013     1     1      517            515      830            819      227
##  2  2013     1     1      533            529      850            830      227
##  3  2013     1     1      542            540      923            850      160
##  4  2013     1     1      544            545     1004           1022      183
##  5  2013     1     1      554            600      812            837      116
##  6  2013     1     1      554            558      740            728      150
##  7  2013     1     1      555            600      913            854      158
##  8  2013     1     1      557            600      709            723       53
##  9  2013     1     1      557            600      838            846      140
## 10  2013     1     1      558            600      753            745      138
## # ℹ 336,766 more rows
select(flights, year, month, day, contains("time"),everything())
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time arr_time sched_arr_time air_time
##    <int> <int> <int>    <int>          <int>    <int>          <int>    <dbl>
##  1  2013     1     1      517            515      830            819      227
##  2  2013     1     1      533            529      850            830      227
##  3  2013     1     1      542            540      923            850      160
##  4  2013     1     1      544            545     1004           1022      183
##  5  2013     1     1      554            600      812            837      116
##  6  2013     1     1      554            558      740            728      150
##  7  2013     1     1      555            600      913            854      158
##  8  2013     1     1      557            600      709            723       53
##  9  2013     1     1      557            600      838            846      140
## 10  2013     1     1      558            600      753            745      138
## # ℹ 336,766 more rows
## # ℹ 11 more variables: time_hour <dttm>, dep_delay <dbl>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   distance <dbl>, hour <dbl>, minute <dbl>
rename(flights, tail_num = tailnum)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tail_num <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights, time_hour, air_time, everything())
## # A tibble: 336,776 × 19
##    time_hour           air_time  year month   day dep_time sched_dep_time
##    <dttm>                 <dbl> <int> <int> <int>    <int>          <int>
##  1 2013-01-01 05:00:00      227  2013     1     1      517            515
##  2 2013-01-01 05:00:00      227  2013     1     1      533            529
##  3 2013-01-01 05:00:00      160  2013     1     1      542            540
##  4 2013-01-01 05:00:00      183  2013     1     1      544            545
##  5 2013-01-01 06:00:00      116  2013     1     1      554            600
##  6 2013-01-01 05:00:00      150  2013     1     1      554            558
##  7 2013-01-01 06:00:00      158  2013     1     1      555            600
##  8 2013-01-01 06:00:00       53  2013     1     1      557            600
##  9 2013-01-01 06:00:00      140  2013     1     1      557            600
## 10 2013-01-01 06:00:00      138  2013     1     1      558            600
## # ℹ 336,766 more rows
## # ℹ 12 more variables: dep_delay <dbl>, arr_time <int>, sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## #   dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>
select(flights, time_hour, time_hour )
## # A tibble: 336,776 × 1
##    time_hour          
##    <dttm>             
##  1 2013-01-01 05:00:00
##  2 2013-01-01 05:00:00
##  3 2013-01-01 05:00:00
##  4 2013-01-01 05:00:00
##  5 2013-01-01 06:00:00
##  6 2013-01-01 05:00:00
##  7 2013-01-01 06:00:00
##  8 2013-01-01 06:00:00
##  9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # ℹ 336,766 more rows

Add columns

mutate(flights,
       gain = dep_delay - arr_delay) %>%
    # Select year, month, day, and gain
    select(year:day, gain)
## # A tibble: 336,776 × 4
##     year month   day  gain
##    <int> <int> <int> <dbl>
##  1  2013     1     1    -9
##  2  2013     1     1   -16
##  3  2013     1     1   -31
##  4  2013     1     1    17
##  5  2013     1     1    19
##  6  2013     1     1   -16
##  7  2013     1     1   -24
##  8  2013     1     1    11
##  9  2013     1     1     5
## 10  2013     1     1   -10
## # ℹ 336,766 more rows
# Just keep gain
mutate(flights,
       gain = dep_delay - arr_delay) %>%
    #select year, month, day, and gain
    select(gain)
## # A tibble: 336,776 × 1
##     gain
##    <dbl>
##  1    -9
##  2   -16
##  3   -31
##  4    17
##  5    19
##  6   -16
##  7   -24
##  8    11
##  9     5
## 10   -10
## # ℹ 336,766 more rows
# Alternative using transmute()
transmute(flights, 
          gain = dep_delay - arr_delay)
## # A tibble: 336,776 × 1
##     gain
##    <dbl>
##  1    -9
##  2   -16
##  3   -31
##  4    17
##  5    19
##  6   -16
##  7   -24
##  8    11
##  9     5
## 10   -10
## # ℹ 336,766 more rows
# Lag()
select(flights, dep_time) %>%
    
    mutate(dep_time_lag1 = lag(dep_time))
## # A tibble: 336,776 × 2
##    dep_time dep_time_lag1
##       <int>         <int>
##  1      517            NA
##  2      533           517
##  3      542           533
##  4      544           542
##  5      554           544
##  6      554           554
##  7      555           554
##  8      557           555
##  9      557           557
## 10      558           557
## # ℹ 336,766 more rows
#cumsum()
select(flights, minute)%>%
    
    mutate(minute_cumsum = cumsum(minute))
## # A tibble: 336,776 × 2
##    minute minute_cumsum
##     <dbl>         <dbl>
##  1     15            15
##  2     29            44
##  3     40            84
##  4     45           129
##  5      0           129
##  6     58           187
##  7      0           187
##  8      0           187
##  9      0           187
## 10      0           187
## # ℹ 336,766 more rows

Summarize by groups

Collapsing data to a single row

flights
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# average departure delay
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1  12.6

Summarize by group

flights %>%
    
    # Group by airlines
    group_by(carrier) %>%
    
    # Calculate average departure delay
    summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
    
    # Sort it
    arrange(delay)
## # A tibble: 16 × 2
##    carrier delay
##    <chr>   <dbl>
##  1 US       3.78
##  2 HA       4.90
##  3 AS       5.80
##  4 AA       8.59
##  5 DL       9.26
##  6 MQ      10.6 
##  7 UA      12.1 
##  8 OO      12.6 
##  9 VX      12.9 
## 10 B6      13.0 
## 11 9E      16.7 
## 12 WN      17.7 
## 13 FL      18.7 
## 14 YV      19.0 
## 15 EV      20.0 
## 16 F9      20.2
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")

Delays increase with distance up to ~750 miles and then decrease

flights %>% 
  group_by(dest) %>% 
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  
    # Plot
    ggplot(mapping = aes(x = dist, y = delay)) + geom_point(aes(size = count), alpha = 0.3) + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Grouping multiple variables and Ungrouping

flights %>%
    group_by(year, month, day)%>%
    summarise(count = n()) %>%
    ungroup()
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
##     year month   day count
##    <int> <int> <int> <int>
##  1  2013     1     1   842
##  2  2013     1     2   943
##  3  2013     1     3   914
##  4  2013     1     4   915
##  5  2013     1     5   720
##  6  2013     1     6   832
##  7  2013     1     7   933
##  8  2013     1     8   899
##  9  2013     1     9   902
## 10  2013     1    10   932
## # ℹ 355 more rows

Missing values

flights %>%
    
    # Remove missing values 
    filter(!is.na(dep_delay))
## # A tibble: 328,521 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 328,511 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1    NA
##  2  2013     1     2    NA
##  3  2013     1     3    NA
##  4  2013     1     4    NA
##  5  2013     1     5    NA
##  6  2013     1     6    NA
##  7  2013     1     7    NA
##  8  2013     1     8    NA
##  9  2013     1     9    NA
## 10  2013     1    10    NA
## # ℹ 355 more rows
flights %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows
not_cancelled <- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ℹ 355 more rows

Counts

delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay)
  )

ggplot(data = delays, mapping = aes(x = delay)) + 
  geom_freqpoly(binwidth = 10)

delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )

ggplot(data = delays, mapping = aes(x = n, y = delay)) + 
  geom_point(alpha = 1/10)

delays %>% 
  filter(n > 25) %>% 
  ggplot(mapping = aes(x = n, y = delay)) + 
    geom_point(alpha = 1/10)

batting <- as_tibble(Lahman::Batting)

batters <- batting %>% 
  group_by(playerID) %>% 
  summarise(
    ba = sum(H, na.rm = TRUE) / sum(AB, na.rm = TRUE),
    ab = sum(AB, na.rm = TRUE)
  )

# Performance of batters in baseball

batters %>% 
  filter(ab > 100) %>% 
  ggplot(mapping = aes(x = ab, y = ba)) +
    geom_point() + 
    geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

batters %>% 
  arrange(desc(ba))
## # A tibble: 20,469 × 3
##    playerID     ba    ab
##    <chr>     <dbl> <int>
##  1 abramge01     1     1
##  2 alberan01     1     1
##  3 banisje01     1     1
##  4 bartocl01     1     1
##  5 bassdo01      1     1
##  6 birasst01     1     2
##  7 bruneju01     1     1
##  8 burnscb01     1     1
##  9 cammaer01     1     1
## 10 campsh01      1     1
## # ℹ 20,459 more rows

Mutate by groups

flights %>% 
  group_by(year, month, day) %>%
  filter(rank(desc(arr_delay)) < 10)
## # A tibble: 3,306 × 19
## # Groups:   year, month, day [365]
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      848           1835       853     1001           1950
##  2  2013     1     1     1815           1325       290     2120           1542
##  3  2013     1     1     1842           1422       260     1958           1535
##  4  2013     1     1     1942           1705       157     2124           1830
##  5  2013     1     1     2006           1630       216     2230           1848
##  6  2013     1     1     2115           1700       255     2330           1920
##  7  2013     1     1     2205           1720       285       46           2040
##  8  2013     1     1     2312           2000       192       21           2110
##  9  2013     1     1     2343           1724       379      314           1938
## 10  2013     1     2     1244            900       224     1431           1104
## # ℹ 3,296 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
popular_dests <- flights %>% 
  group_by(dest) %>% 
  filter(n() > 365)

popular_dests %>% 
  filter(arr_delay > 0) %>% 
  mutate(prop_delay = arr_delay / sum(arr_delay)) %>% 
  select(year:day, dest, arr_delay, prop_delay)
## # A tibble: 131,106 × 6
## # Groups:   dest [77]
##     year month   day dest  arr_delay prop_delay
##    <int> <int> <int> <chr>     <dbl>      <dbl>
##  1  2013     1     1 IAH          11  0.000111 
##  2  2013     1     1 IAH          20  0.000201 
##  3  2013     1     1 MIA          33  0.000235 
##  4  2013     1     1 ORD          12  0.0000424
##  5  2013     1     1 FLL          19  0.0000938
##  6  2013     1     1 ORD           8  0.0000283
##  7  2013     1     1 LAX           7  0.0000344
##  8  2013     1     1 DFW          31  0.000282 
##  9  2013     1     1 ATL          12  0.0000400
## 10  2013     1     1 DTW          16  0.000116 
## # ℹ 131,096 more rows