Import data

flights
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Filter rows

filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Arrange rows

arrange(flights, dep_delay, year, month, day)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12     7     2040           2123       -43       40           2352
##  2  2013     2     3     2022           2055       -33     2240           2338
##  3  2013    11    10     1408           1440       -32     1549           1559
##  4  2013     1    11     1900           1930       -30     2233           2243
##  5  2013     1    29     1703           1730       -27     1947           1957
##  6  2013     8     9      729            755       -26     1002            955
##  7  2013     3    30     2030           2055       -25     2213           2250
##  8  2013    10    23     1907           1932       -25     2143           2143
##  9  2013     3     2     1431           1455       -24     1601           1631
## 10  2013     5     5      934            958       -24     1225           1309
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Select columns

select(flights, day, month, year)
## # A tibble: 336,776 × 3
##      day month  year
##    <int> <int> <int>
##  1     1     1  2013
##  2     1     1  2013
##  3     1     1  2013
##  4     1     1  2013
##  5     1     1  2013
##  6     1     1  2013
##  7     1     1  2013
##  8     1     1  2013
##  9     1     1  2013
## 10     1     1  2013
## # ℹ 336,766 more rows

Add columns

flights_sml <- select(flights, 
  year:day, 
  ends_with("delay"), 
  distance, 
  air_time
)
mutate(flights_sml,
  speed = distance / air_time * 60
) 
## # A tibble: 336,776 × 8
##     year month   day dep_delay arr_delay distance air_time speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227  370.
##  2  2013     1     1         4        20     1416      227  374.
##  3  2013     1     1         2        33     1089      160  408.
##  4  2013     1     1        -1       -18     1576      183  517.
##  5  2013     1     1        -6       -25      762      116  394.
##  6  2013     1     1        -4        12      719      150  288.
##  7  2013     1     1        -5        19     1065      158  404.
##  8  2013     1     1        -3       -14      229       53  259.
##  9  2013     1     1        -3        -8      944      140  405.
## 10  2013     1     1        -2         8      733      138  319.
## # ℹ 336,766 more rows

Summarize by groups

#average departure delay
summarise(flights, avg_delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 × 1
##   avg_delay
##       <dbl>
## 1      12.6
#grouped by airline
flights %>%
    group_by(carrier) %>%
#average delay by airline
    summarise(avg_delay = mean(dep_delay, na.rm = TRUE)) %>%
    arrange(avg_delay)
## # A tibble: 16 × 2
##    carrier avg_delay
##    <chr>       <dbl>
##  1 US           3.78
##  2 HA           4.90
##  3 AS           5.80
##  4 AA           8.59
##  5 DL           9.26
##  6 MQ          10.6 
##  7 UA          12.1 
##  8 OO          12.6 
##  9 VX          12.9 
## 10 B6          13.0 
## 11 9E          16.7 
## 12 WN          17.7 
## 13 FL          18.7 
## 14 YV          19.0 
## 15 EV          20.0 
## 16 F9          20.2
#delays by distance
delays <- flights %>% 
  group_by(dest) %>% 
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>% 
  filter(count > 20, dest != "HNL")
ggplot(data = delays, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#missing values
flights %>%
    filter(!is.na(dep_delay))
## # A tibble: 328,521 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 328,511 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#delays by tail number
not_cancelled <- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

delays <- not_cancelled %>% 
  group_by(tailnum) %>% 
  summarise(
    delay = mean(arr_delay)
  )

ggplot(data = delays, mapping = aes(x = delay)) + 
  geom_freqpoly(binwidth = 10)

Grouping Multiple Variables

flights %>%
    group_by(year, month, day) %>%
    summarise(count = n()) %>%
    ungroup()
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
##     year month   day count
##    <int> <int> <int> <int>
##  1  2013     1     1   842
##  2  2013     1     2   943
##  3  2013     1     3   914
##  4  2013     1     4   915
##  5  2013     1     5   720
##  6  2013     1     6   832
##  7  2013     1     7   933
##  8  2013     1     8   899
##  9  2013     1     9   902
## 10  2013     1    10   932
## # ℹ 355 more rows