library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(nycflights13)
flights_sml <- select(flights, year:day, ends_with("delay"), distance, air_time)
mutate(flights_sml, gain = arr_delay - dep_delay, speed = distance/air_time * 60)
## # A tibble: 336,776 × 9
## year month day dep_delay arr_delay distance air_time gain speed
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 370.0441
## 2 2013 1 1 4 20 1416 227 16 374.2731
## 3 2013 1 1 2 33 1089 160 31 408.3750
## 4 2013 1 1 -1 -18 1576 183 -17 516.7213
## 5 2013 1 1 -6 -25 762 116 -19 394.1379
## 6 2013 1 1 -4 12 719 150 16 287.6000
## 7 2013 1 1 -5 19 1065 158 24 404.4304
## 8 2013 1 1 -3 -14 229 53 -11 259.2453
## 9 2013 1 1 -3 -8 944 140 -5 404.5714
## 10 2013 1 1 -2 8 733 138 10 318.6957
## # ... with 336,766 more rows
mutate(flights_sml,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
## # A tibble: 336,776 × 10
## year month day dep_delay arr_delay distance air_time gain hours
## <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 2 11 1400 227 9 3.7833333
## 2 2013 1 1 4 20 1416 227 16 3.7833333
## 3 2013 1 1 2 33 1089 160 31 2.6666667
## 4 2013 1 1 -1 -18 1576 183 -17 3.0500000
## 5 2013 1 1 -6 -25 762 116 -19 1.9333333
## 6 2013 1 1 -4 12 719 150 16 2.5000000
## 7 2013 1 1 -5 19 1065 158 24 2.6333333
## 8 2013 1 1 -3 -14 229 53 -11 0.8833333
## 9 2013 1 1 -3 -8 944 140 -5 2.3333333
## 10 2013 1 1 -2 8 733 138 10 2.3000000
## # ... with 336,766 more rows, and 1 more variables: gain_per_hour <dbl>
transmute(flights,
gain = arr_delay - dep_delay,
hours = air_time / 60,
gain_per_hour = gain / hours
)
## # A tibble: 336,776 × 3
## gain hours gain_per_hour
## <dbl> <dbl> <dbl>
## 1 9 3.7833333 2.378855
## 2 16 3.7833333 4.229075
## 3 31 2.6666667 11.625000
## 4 -17 3.0500000 -5.573770
## 5 -19 1.9333333 -9.827586
## 6 16 2.5000000 6.400000
## 7 24 2.6333333 9.113924
## 8 -11 0.8833333 -12.452830
## 9 -5 2.3333333 -2.142857
## 10 10 2.3000000 4.347826
## # ... with 336,766 more rows
x <- 1:10
lag(x)
## [1] NA 1 2 3 4 5 6 7 8 9
lead(x)
## [1] 2 3 4 5 6 7 8 9 10 NA
cumsum(x)
## [1] 1 3 6 10 15 21 28 36 45 55
cummean(x)
## [1] 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
Grouped Summaries with Summarise()
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.63907
by_day <- group_by(flights, year, month, day)
summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
## Source: local data frame [365 x 4]
## Groups: year, month [?]
##
## year month day delay
## <int> <int> <int> <dbl>
## 1 2013 1 1 11.548926
## 2 2013 1 2 13.858824
## 3 2013 1 3 10.987832
## 4 2013 1 4 8.951595
## 5 2013 1 5 5.732218
## 6 2013 1 6 7.148014
## 7 2013 1 7 5.417204
## 8 2013 1 8 2.553073
## 9 2013 1 9 2.276477
## 10 2013 1 10 2.844995
## # ... with 355 more rows
Combining multiple operations with the pipe
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess'
