library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.4 v readr 2.1.6
## v forcats 1.0.1 v stringr 1.6.0
## v ggplot2 4.0.1 v tibble 3.3.1
## v lubridate 1.9.4 v tidyr 1.3.2
## v purrr 1.2.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
library(lubridate)
# An element that cannot be parsed comes back as NA with a warning.
c("2010-10-10", "bananas") %>%
  ymd()
## Warning: 1 failed to parse.
## [1] "2010-10-10" NA
# For each string, use the helper whose letters follow the order of the
# year / month / day fields in the text.
d1 <- "January 1, 2010"
d1 %>% mdy()
## [1] "2010-01-01"
d2 <- "2015-Mar-07"
d2 %>% ymd()
## [1] "2015-03-07"
d3 <- "06-Jun-2017"
d3 %>% dmy()
## [1] "2017-06-06"
d4 <- c("August 19 (2015)", "July 1 (2015)")
d4 %>% mdy()
## [1] "2015-08-19" "2015-07-01"
d5 <- "12/30/14"
d5 %>% mdy()
## [1] "2014-12-30"
#' Build a date-time from date parts and an HHMM clock value
#'
#' @param year,month,day Calendar components passed straight through to
#'   `make_datetime()`.
#' @param time Clock time encoded as a number of the form `HHMM`; it is split
#'   into hours (`time %/% 100`) and minutes (`time %% 100`).
#' @param tz Time zone handed to `make_datetime()`; defaults to `"EST"`.
#' @return The result of `make_datetime()` with the seconds fixed at 0.
make_datetime_100 <- function(year, month, day, time, tz = "EST") {
  hours <- time %/% 100
  minutes <- time %% 100
  make_datetime(
    year = year, month = month, day = day,
    hour = hours, min = minutes, sec = 0, tz = tz
  )
}
# Keep only flights that have both a departure and an arrival time, then
# replace the four HHMM clock columns with date-times (default zone of
# make_datetime_100()).
flights %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    across(
      c(dep_time, arr_time, sched_dep_time, sched_arr_time),
      function(hhmm) make_datetime_100(year, month, day, hhmm)
    )
  )
## # A tibble: 328,063 x 19
## year month day dep_time sched_dep_time dep_delay
## <int> <int> <int> <dttm> <dttm> <dbl>
## 1 2013 1 1 2013-01-01 05:17:00 2013-01-01 05:15:00 2
## 2 2013 1 1 2013-01-01 05:33:00 2013-01-01 05:29:00 4
## 3 2013 1 1 2013-01-01 05:42:00 2013-01-01 05:40:00 2
## 4 2013 1 1 2013-01-01 05:44:00 2013-01-01 05:45:00 -1
## 5 2013 1 1 2013-01-01 05:54:00 2013-01-01 06:00:00 -6
## 6 2013 1 1 2013-01-01 05:54:00 2013-01-01 05:58:00 -4
## 7 2013 1 1 2013-01-01 05:55:00 2013-01-01 06:00:00 -5
## 8 2013 1 1 2013-01-01 05:57:00 2013-01-01 06:00:00 -3
## 9 2013 1 1 2013-01-01 05:57:00 2013-01-01 06:00:00 -3
## 10 2013 1 1 2013-01-01 05:58:00 2013-01-01 06:00:00 -2
## # i 328,053 more rows
## # i 13 more variables: arr_time <dttm>, sched_arr_time <dttm>, arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Lookup table: one time zone name per FAA airport code.
airports1 <- airports %>%
  select(faa, tzone)

# Attach the destination and origin time zones in a single left-to-right
# pipeline. Each join adds a `tzone` column, which is renamed immediately so
# the second join does not collide with the first.
flights1 <- flights %>%
  left_join(airports1, by = c("dest" = "faa")) %>%
  rename(dest_tzone = tzone) %>%
  left_join(airports1, by = c("origin" = "faa")) %>%
  rename(origin_tzone = tzone)
# Build the working table: departure columns are converted with the origin
# airport's zone, arrival columns with the destination's zone.
# NOTE(review): `tz` receives a whole column here, while a POSIXct vector
# carries a single `tzone` attribute -- confirm make_datetime() treats
# per-row zones the way this analysis expects.
flights_dt <- flights1 %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    across(
      c(dep_time, sched_dep_time),
      function(hhmm) make_datetime_100(year, month, day, hhmm, tz = origin_tzone)
    ),
    across(
      c(arr_time, sched_arr_time),
      function(hhmm) make_datetime_100(year, month, day, hhmm, tz = dest_tzone)
    )
  ) %>%
  select(
    origin, dest, ends_with("delay"), ends_with("time"), air_time,
    origin_tzone, dest_tzone
  )
flights_dt
## # A tibble: 328,063 x 11
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00
## 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00
## 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00
## 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00
## 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00
## 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00
## 7 EWR FLL -5 19 2013-01-01 05:55:00 2013-01-01 06:00:00
## 8 LGA IAD -3 -14 2013-01-01 05:57:00 2013-01-01 06:00:00
## 9 JFK MCO -3 -8 2013-01-01 05:57:00 2013-01-01 06:00:00
## 10 LGA ORD -2 8 2013-01-01 05:58:00 2013-01-01 06:00:00
## # i 328,053 more rows
## # i 5 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>,
## # origin_tzone <chr>, dest_tzone <chr>
# Cutoff: midnight at the start of 1 June 2013. nycflights13 covers departures
# from New York City airports, so the zone is pinned to New York rather than
# taken from Sys.timezone(), which made the row count depend on the machine
# running the script. Both queries below share the same cutoff.
june_start <- ymd("2013-06-01", tz = "America/New_York")

# Flights departing after the cutoff, in chronological order.
flights_dt %>%
  filter(dep_time > june_start) %>%
  arrange(dep_time)
## # A tibble: 194,177 x 11
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 JFK PSE 3 -9 2013-06-01 00:02:00 2013-06-01 23:59:00
## 2 EWR CLT -9 -16 2013-06-01 04:51:00 2013-06-01 05:00:00
## 3 EWR IAH -9 -45 2013-06-01 05:06:00 2013-06-01 05:15:00
## 4 LGA IAH -11 -29 2013-06-01 05:34:00 2013-06-01 05:45:00
## 5 JFK BQN -7 3 2013-06-01 05:38:00 2013-06-01 05:45:00
## 6 JFK MIA -1 -8 2013-06-01 05:39:00 2013-06-01 05:40:00
## 7 EWR RSW -14 -20 2013-06-01 05:46:00 2013-06-01 06:00:00
## 8 LGA DFW -9 -22 2013-06-01 05:51:00 2013-06-01 06:00:00
## 9 LGA PHL -8 -8 2013-06-01 05:52:00 2013-06-01 06:00:00
## 10 JFK IAD -7 -11 2013-06-01 05:53:00 2013-06-01 06:00:00
## # i 194,167 more rows
## # i 5 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>,
## # origin_tzone <chr>, dest_tzone <chr>
# Same filter without arrange(): rows keep the table's existing order.
flights_dt %>%
  filter(dep_time > june_start)
## # A tibble: 194,177 x 11
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR CLT -13 -34 2013-10-01 04:47:00 2013-10-01 05:00:00
## 2 EWR IAH 5 -22 2013-10-01 05:22:00 2013-10-01 05:17:00
## 3 JFK MIA -9 -46 2013-10-01 05:36:00 2013-10-01 05:45:00
## 4 LGA IAH -6 -26 2013-10-01 05:39:00 2013-10-01 05:45:00
## 5 JFK SJU -6 -16 2013-10-01 05:39:00 2013-10-01 05:45:00
## 6 JFK BQN -6 -20 2013-10-01 05:44:00 2013-10-01 05:50:00
## 7 JFK IAD -11 -23 2013-10-01 05:49:00 2013-10-01 06:00:00
## 8 LGA PHL -10 -12 2013-10-01 05:50:00 2013-10-01 06:00:00
## 9 LGA DCA -10 -10 2013-10-01 05:50:00 2013-10-01 06:00:00
## 10 EWR ORD -9 -3 2013-10-01 05:51:00 2013-10-01 06:00:00
## # i 194,167 more rows
## # i 5 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>,
## # origin_tzone <chr>, dest_tzone <chr>
10; the first index is 1. The observations may appear in an unsorted or irregular order.
# Record the clock hour of each actual departure.
flights_dt <- mutate(flights_dt, hour = hour(dep_time))

# Mean arrival delay for each departure hour, largest first.
group_by(flights_dt, hour) %>%
  summarise(
    mean_arr_delay = mean(arr_delay, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_arr_delay))
## # A tibble: 24 x 2
## hour mean_arr_delay
## <int> <dbl>
## 1 3 288.
## 2 2 225.
## 3 1 196.
## 4 0 119.
## 5 23 90.7
## 6 22 62.1
## 7 21 35.6
## 8 20 21.6
## 9 19 14.6
## 10 18 13.4
## # i 14 more rows
# Mean air time for each departure hour, longest first.
group_by(flights_dt, hour) %>%
  summarise(
    mean_air_time = mean(air_time, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(mean_air_time))
## # A tibble: 24 x 2
## hour mean_air_time
## <int> <dbl>
## 1 18 170.
## 2 17 169.
## 3 7 166.
## 4 9 161.
## 5 10 161.
## 6 19 156.
## 7 16 153.
## 8 6 151.
## 9 13 149.
## 10 15 149.
## # i 14 more rows
# Mean air time for each departure hour, shortest first.
group_by(flights_dt, hour) %>%
  summarise(
    mean_air_time = mean(air_time, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(mean_air_time)
## # A tibble: 24 x 2
## hour mean_air_time
## <int> <dbl>
## 1 4 84.2
## 2 3 99
## 3 22 104.
## 4 23 129.
## 5 1 130.
## 6 21 132.
## 7 14 133.
## 8 5 133.
## 9 2 135.
## 10 12 136.
## # i 14 more rows
# Parse a 12-hour ("pm") timestamp in the machine's own time zone.
dt2 <- "2023-04-05 03:12:34pm" %>%
  ymd_hms(tz = Sys.timezone())
# Truncate to the start of the year.
floor_date(dt2, unit = "year")
## [1] "2023-01-01 EST"
# Round to the nearest week boundary.
round_date(dt2, unit = "week")
## [1] "2023-04-09 EDT"
The result is the first day of that week.
# Elapsed time since a birth timestamp, with both instants in Shanghai time.
# NOTE(review): the globals `now` and `diff` mask lubridate::now() and
# base::diff(). Calls such as now() still resolve to the functions, but
# consider renaming these objects if nothing else depends on the names.
birth <- "2007-05-17 09:36" %>%
  ymd_hm(tz = "Asia/Shanghai")
now <- now("Asia/Shanghai")
diff <- (now - birth) %>%
  as.duration()
# Express the duration as a number of days.
diff / ddays(1)
## [1] 6910.519
# Printing the duration shows seconds with an approximate larger unit.
diff
## [1] "597068804.077531s (~18.92 years)"