library("tidyverse")
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("lubridate")
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("nycflights13")
#' Build a date-time from a calendar date and an HHMM-encoded clock time
#'
#' @param year,month,day Date components, forwarded unchanged to
#'   `make_datetime()`.
#' @param time Clock time packed into one number: the part above the last two
#'   digits is the hour, the last two digits are the minute (517 -> 5 h 17 min).
#' @return Whatever `make_datetime()` returns for that date, hour and minute.
make_datetime_100 <- function(year, month, day, time) {
  hour_part <- time %/% 100
  minute_part <- time %% 100
  make_datetime(year, month, day, hour = hour_part, min = minute_part)
}
# Flights with a recorded departure and arrival time. The four clock-time
# columns are rebuilt as date-times on the flight's year/month/day, and only
# origin, dest, the *delay columns and the *time columns are kept.
flights_dt <- flights %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    across(
      c(dep_time, arr_time, sched_dep_time, sched_arr_time),
      ~ make_datetime_100(year, month, day, .x)
    )
  ) %>%
  select(origin, dest, ends_with("delay"), ends_with("time"))
# Distribution of departure time within the day, one line per month, drawn as
# a density so that months with different flight counts can be compared.
flights_dt %>%
  # Defensive guard: flights_dt is already built without missing dep_time.
  filter(!is.na(dep_time)) %>%
  mutate(
    # Move every departure onto day 1 of its year so only time of day varies.
    dep_hour = update(dep_time, yday = 1),
    month = factor(month(dep_time))
  ) %>%
  ggplot(aes(dep_hour, color = month)) +
  # after_stat() replaces the `..density..` notation deprecated in
  # ggplot2 3.4.0. binwidth is in seconds, i.e. one-hour bins.
  geom_freqpoly(aes(y = after_stat(density)), binwidth = 60 * 60)

# For each minute-of-the-hour of the scheduled departure time: the mean
# arrival delay (missing delays ignored) and the number of flights.
sched_dep <- flights_dt %>%
  group_by(minute = minute(sched_dep_time)) %>%
  summarise(
    avg_delay = mean(arr_delay, na.rm = TRUE),
    n = n()
  )
# Number of departures per one-hour bin of the day, one line per month
# (raw counts, not normalised).
flights_dt %>%
  filter(!is.na(dep_time)) %>%
  mutate(
    # Move every departure onto day 1 of its year so only time of day varies.
    dep_hour = update(dep_time, yday = 1),
    month = factor(month(dep_time))
  ) %>%
  ggplot(aes(dep_hour, color = month)) +
  geom_freqpoly(binwidth = 60 * 60) # binwidth is in seconds

# Consistency check: list flights whose recorded dep_time differs from
# sched_dep_time plus dep_delay (delay is in minutes, date-times in seconds).
flights_dt %>%
  select(dep_time, sched_dep_time, dep_delay) %>%
  mutate(dep_time_ = sched_dep_time + dep_delay * 60, .before = 1) %>%
  filter(dep_time != dep_time_)
## # A tibble: 1,205 x 4
## dep_time_ dep_time sched_dep_time dep_delay
## <dttm> <dttm> <dttm> <dbl>
## 1 2013-01-02 08:48:00 2013-01-01 08:48:00 2013-01-01 18:35:00 853
## 2 2013-01-03 00:42:00 2013-01-02 00:42:00 2013-01-02 23:59:00 43
## 3 2013-01-03 01:26:00 2013-01-02 01:26:00 2013-01-02 22:50:00 156
## 4 2013-01-04 00:32:00 2013-01-03 00:32:00 2013-01-03 23:59:00 33
## 5 2013-01-04 00:50:00 2013-01-03 00:50:00 2013-01-03 21:45:00 185
## 6 2013-01-04 02:35:00 2013-01-03 02:35:00 2013-01-03 23:59:00 156
## 7 2013-01-05 00:25:00 2013-01-04 00:25:00 2013-01-04 23:59:00 26
## 8 2013-01-05 01:06:00 2013-01-04 01:06:00 2013-01-04 22:45:00 141
## 9 2013-01-06 00:14:00 2013-01-05 00:14:00 2013-01-05 23:59:00 15
## 10 2013-01-06 00:37:00 2013-01-05 00:37:00 2013-01-05 22:30:00 127
## # ... with 1,195 more rows
# Mean departure delay for each scheduled departure hour, with a smoother
# drawn through the hourly means.
flights_dt %>%
  group_by(sched_dep_hour = hour(sched_dep_time)) %>%
  # na.rm = TRUE matches the other aggregates in this script and stops a
  # single missing delay from turning a whole hour's mean into NA.
  summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(y = dep_delay, x = sched_dep_hour)) +
  geom_point() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Mean departure delay per day of the week (labelled), shown as bars whose
# height is the mean itself.
flights_dt %>%
  group_by(wday = wday(dep_time, label = TRUE)) %>%
  summarise(ave_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  ggplot(aes(x = wday, y = ave_dep_delay)) +
  geom_col()

# For each minute-of-the-hour of the actual departure time: the share of
# flights that left early (negative dep_delay) and the flight count; the
# share is plotted as a line.
flights_dt %>%
  group_by(minute = minute(dep_time)) %>%
  summarise(
    early = mean(dep_delay < 0, na.rm = TRUE),
    n = n()
  ) %>%
  ggplot(aes(minute, early)) +
  geom_line()
