Import data

flights 
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Filter rows

filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month == 1 & day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month == 1 | day == 1)
## # A tibble: 37,198 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 37,188 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    11     1        5           2359         6      352            345
##  2  2013    11     1       35           2250       105      123           2356
##  3  2013    11     1      455            500        -5      641            651
##  4  2013    11     1      539            545        -6      856            827
##  5  2013    11     1      542            545        -3      831            855
##  6  2013    11     1      549            600       -11      912            923
##  7  2013    11     1      550            600       -10      705            659
##  8  2013    11     1      554            600        -6      659            701
##  9  2013    11     1      554            600        -6      826            827
## 10  2013    11     1      554            600        -6      749            751
## # ℹ 55,393 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Arrange rows

arrange(flights, desc(month), desc(day))
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013    12    31       13           2359        14      439            437
##  2  2013    12    31       18           2359        19      449            444
##  3  2013    12    31       26           2245       101      129           2353
##  4  2013    12    31      459            500        -1      655            651
##  5  2013    12    31      514            515        -1      814            812
##  6  2013    12    31      549            551        -2      925            900
##  7  2013    12    31      550            600       -10      725            745
##  8  2013    12    31      552            600        -8      811            826
##  9  2013    12    31      553            600        -7      741            754
## 10  2013    12    31      554            550         4     1024           1027
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Select columns

select(flights, year:dep_time)
## # A tibble: 336,776 × 4
##     year month   day dep_time
##    <int> <int> <int>    <int>
##  1  2013     1     1      517
##  2  2013     1     1      533
##  3  2013     1     1      542
##  4  2013     1     1      544
##  5  2013     1     1      554
##  6  2013     1     1      554
##  7  2013     1     1      555
##  8  2013     1     1      557
##  9  2013     1     1      557
## 10  2013     1     1      558
## # ℹ 336,766 more rows
select(flights, year, month, day, dep_time)
## # A tibble: 336,776 × 4
##     year month   day dep_time
##    <int> <int> <int>    <int>
##  1  2013     1     1      517
##  2  2013     1     1      533
##  3  2013     1     1      542
##  4  2013     1     1      544
##  5  2013     1     1      554
##  6  2013     1     1      554
##  7  2013     1     1      555
##  8  2013     1     1      557
##  9  2013     1     1      557
## 10  2013     1     1      558
## # ℹ 336,766 more rows
select(flights, year, month, day,dep_time, dep_delay)
## # A tibble: 336,776 × 5
##     year month   day dep_time dep_delay
##    <int> <int> <int>    <int>     <dbl>
##  1  2013     1     1      517         2
##  2  2013     1     1      533         4
##  3  2013     1     1      542         2
##  4  2013     1     1      544        -1
##  5  2013     1     1      554        -6
##  6  2013     1     1      554        -4
##  7  2013     1     1      555        -5
##  8  2013     1     1      557        -3
##  9  2013     1     1      557        -3
## 10  2013     1     1      558        -2
## # ℹ 336,766 more rows
select(flights, year, month, day, starts_with("dep"))
## # A tibble: 336,776 × 5
##     year month   day dep_time dep_delay
##    <int> <int> <int>    <int>     <dbl>
##  1  2013     1     1      517         2
##  2  2013     1     1      533         4
##  3  2013     1     1      542         2
##  4  2013     1     1      544        -1
##  5  2013     1     1      554        -6
##  6  2013     1     1      554        -4
##  7  2013     1     1      555        -5
##  8  2013     1     1      557        -3
##  9  2013     1     1      557        -3
## 10  2013     1     1      558        -2
## # ℹ 336,766 more rows
select(flights, year, month, day, contains("time"))
## # A tibble: 336,776 × 9
##     year month   day dep_time sched_dep_time arr_time sched_arr_time air_time
##    <int> <int> <int>    <int>          <int>    <int>          <int>    <dbl>
##  1  2013     1     1      517            515      830            819      227
##  2  2013     1     1      533            529      850            830      227
##  3  2013     1     1      542            540      923            850      160
##  4  2013     1     1      544            545     1004           1022      183
##  5  2013     1     1      554            600      812            837      116
##  6  2013     1     1      554            558      740            728      150
##  7  2013     1     1      555            600      913            854      158
##  8  2013     1     1      557            600      709            723       53
##  9  2013     1     1      557            600      838            846      140
## 10  2013     1     1      558            600      753            745      138
## # ℹ 336,766 more rows
## # ℹ 1 more variable: time_hour <dttm>
select(flights, year, month, day, ends_with("time"))
## # A tibble: 336,776 × 8
##     year month   day dep_time sched_dep_time arr_time sched_arr_time air_time
##    <int> <int> <int>    <int>          <int>    <int>          <int>    <dbl>
##  1  2013     1     1      517            515      830            819      227
##  2  2013     1     1      533            529      850            830      227
##  3  2013     1     1      542            540      923            850      160
##  4  2013     1     1      544            545     1004           1022      183
##  5  2013     1     1      554            600      812            837      116
##  6  2013     1     1      554            558      740            728      150
##  7  2013     1     1      555            600      913            854      158
##  8  2013     1     1      557            600      709            723       53
##  9  2013     1     1      557            600      838            846      140
## 10  2013     1     1      558            600      753            745      138
## # ℹ 336,766 more rows
select(flights, year, month, day, contains("time"), everything())
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time arr_time sched_arr_time air_time
##    <int> <int> <int>    <int>          <int>    <int>          <int>    <dbl>
##  1  2013     1     1      517            515      830            819      227
##  2  2013     1     1      533            529      850            830      227
##  3  2013     1     1      542            540      923            850      160
##  4  2013     1     1      544            545     1004           1022      183
##  5  2013     1     1      554            600      812            837      116
##  6  2013     1     1      554            558      740            728      150
##  7  2013     1     1      555            600      913            854      158
##  8  2013     1     1      557            600      709            723       53
##  9  2013     1     1      557            600      838            846      140
## 10  2013     1     1      558            600      753            745      138
## # ℹ 336,766 more rows
## # ℹ 11 more variables: time_hour <dttm>, dep_delay <dbl>, arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   distance <dbl>, hour <dbl>, minute <dbl>

Add columns

flights_sml <- select(flights, 
  year:day, 
  ends_with("delay"), 
  distance, 
  air_time
)

mutate(flights_sml,
  gain = dep_delay - arr_delay,
  speed = distance / air_time * 60
)
## # A tibble: 336,776 × 9
##     year month   day dep_delay arr_delay distance air_time  gain speed
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9  370.
##  2  2013     1     1         4        20     1416      227   -16  374.
##  3  2013     1     1         2        33     1089      160   -31  408.
##  4  2013     1     1        -1       -18     1576      183    17  517.
##  5  2013     1     1        -6       -25      762      116    19  394.
##  6  2013     1     1        -4        12      719      150   -16  288.
##  7  2013     1     1        -5        19     1065      158   -24  404.
##  8  2013     1     1        -3       -14      229       53    11  259.
##  9  2013     1     1        -3        -8      944      140     5  405.
## 10  2013     1     1        -2         8      733      138   -10  319.
## # ℹ 336,766 more rows
mutate(flights_sml,
  gain = dep_delay - arr_delay,
  hours = air_time / 60,
  gain_per_hour = gain / hours
)
## # A tibble: 336,776 × 10
##     year month   day dep_delay arr_delay distance air_time  gain hours
##    <int> <int> <int>     <dbl>     <dbl>    <dbl>    <dbl> <dbl> <dbl>
##  1  2013     1     1         2        11     1400      227    -9 3.78 
##  2  2013     1     1         4        20     1416      227   -16 3.78 
##  3  2013     1     1         2        33     1089      160   -31 2.67 
##  4  2013     1     1        -1       -18     1576      183    17 3.05 
##  5  2013     1     1        -6       -25      762      116    19 1.93 
##  6  2013     1     1        -4        12      719      150   -16 2.5  
##  7  2013     1     1        -5        19     1065      158   -24 2.63 
##  8  2013     1     1        -3       -14      229       53    11 0.883
##  9  2013     1     1        -3        -8      944      140     5 2.33 
## 10  2013     1     1        -2         8      733      138   -10 2.3  
## # ℹ 336,766 more rows
## # ℹ 1 more variable: gain_per_hour <dbl>
transmute(flights,
  dep_time,
  hour = dep_time %/% 100,
  minute = dep_time %% 100
)
## # A tibble: 336,776 × 3
##    dep_time  hour minute
##       <int> <dbl>  <dbl>
##  1      517     5     17
##  2      533     5     33
##  3      542     5     42
##  4      544     5     44
##  5      554     5     54
##  6      554     5     54
##  7      555     5     55
##  8      557     5     57
##  9      557     5     57
## 10      558     5     58
## # ℹ 336,766 more rows

Summarize by groups

flights
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#average departure delay
summarise(flights, delay = mean(dep_delay, na.rn = TRUE))
## # A tibble: 1 × 1
##   delay
##   <dbl>
## 1    NA
flights %>%
    # group by airline
    group_by(carrier) %>%
    
    #calculate average departure delay 
    summarise (delay = mean (dep_delay, na.rn = TRUE)) %>%
    
    #Sort it 
    arrange(delay)
## # A tibble: 16 × 2
##    carrier delay
##    <chr>   <dbl>
##  1 HA       4.90
##  2 9E      NA   
##  3 AA      NA   
##  4 AS      NA   
##  5 B6      NA   
##  6 DL      NA   
##  7 EV      NA   
##  8 F9      NA   
##  9 FL      NA   
## 10 MQ      NA   
## 11 OO      NA   
## 12 UA      NA   
## 13 US      NA   
## 14 VX      NA   
## 15 WN      NA   
## 16 YV      NA
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")

# It looks like delays increase with distance up to ~750 miles 
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
  geom_point(aes(size = count), alpha = 1/3) +
  geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
not_cancelled <- flights %>% 
  filter(!is.na(dep_delay), !is.na(arr_delay))

not_cancelled %>% 
  group_by(year, month, day) %>% 
  summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.4 
##  2  2013     1     2 13.7 
##  3  2013     1     3 10.9 
##  4  2013     1     4  8.97
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.56
##  9  2013     1     9  2.30
## 10  2013     1    10  2.84
## # ℹ 355 more rows
daily <- group_by(flights, year, month, day)
(per_day   <- summarise(daily, flights = n()))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day flights
##    <int> <int> <int>   <int>
##  1  2013     1     1     842
##  2  2013     1     2     943
##  3  2013     1     3     914
##  4  2013     1     4     915
##  5  2013     1     5     720
##  6  2013     1     6     832
##  7  2013     1     7     933
##  8  2013     1     8     899
##  9  2013     1     9     902
## 10  2013     1    10     932
## # ℹ 355 more rows
daily %>% 
  ungroup() %>%             
  summarise(flights = n())  
## # A tibble: 1 × 1
##   flights
##     <int>
## 1  336776