Creating Date/Times


There are three types of date/time data.
  1. A date - <date>
  2. A time - <time>
  3. A date-time - <dttm> or POSIXct

For the current date or date-time use today() or now().

> library(tidyverse)
> library(lubridate)
> library(nycflights13)
> today()
[1] "2020-09-30"
> now()
[1] "2020-09-30 10:02:53 EDT"

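To create dates/times from strings, parse them with the helper functions whose names list the components in the order they appear in the string (for example ymd(), mdy(), dmy(), ymd_hms(), mdy_hm()):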

  • y = year
  • m = month
  • d = day
  • _h = hours
  • _m = minutes
  • _s = seconds
> ymd("2017-01-31")
[1] "2017-01-31"
> mdy("January 31st, 2017")
[1] "2017-01-31"
> mdy("01-05-2005")
[1] "2005-01-05"
> dmy("31-Jan-2017")
[1] "2017-01-31"
> # it can also take unquoted numbers
> ymd(20170131)
[1] "2017-01-31"
> ymd_hms("2017-01-31 20:11:59")
[1] "2017-01-31 20:11:59 UTC"
> mdy_hm("01/31/2017 08:01")
[1] "2017-01-31 08:01:00 UTC"
> # add a timezone to force creation
> # of a date-time
> ymd(20170131, tz = "UTC")
[1] "2017-01-31 UTC"

Or you can create them from individual components.

> flights %>% 
+   select(year, month, day, hour, minute)
# A tibble: 336,776 x 5
    year month   day  hour minute
   <int> <int> <int> <dbl>  <dbl>
 1  2013     1     1     5     15
 2  2013     1     1     5     29
 3  2013     1     1     5     40
 4  2013     1     1     5     45
 5  2013     1     1     6      0
 6  2013     1     1     5     58
 7  2013     1     1     6      0
 8  2013     1     1     6      0
 9  2013     1     1     6      0
10  2013     1     1     6      0
# ... with 336,766 more rows

Use make_date() for dates and make_datetime() for date-times.

> flights %>% 
+   select(year, month, day, hour, minute) %>% 
+   mutate(departure = make_datetime(year, 
+                      month, day, hour, minute))
# A tibble: 336,776 x 6
    year month   day  hour minute departure          
   <int> <int> <int> <dbl>  <dbl> <dttm>             
 1  2013     1     1     5     15 2013-01-01 05:15:00
 2  2013     1     1     5     29 2013-01-01 05:29:00
 3  2013     1     1     5     40 2013-01-01 05:40:00
 4  2013     1     1     5     45 2013-01-01 05:45:00
 5  2013     1     1     6      0 2013-01-01 06:00:00
 6  2013     1     1     5     58 2013-01-01 05:58:00
 7  2013     1     1     6      0 2013-01-01 06:00:00
 8  2013     1     1     6      0 2013-01-01 06:00:00
 9  2013     1     1     6      0 2013-01-01 06:00:00
10  2013     1     1     6      0 2013-01-01 06:00:00
# ... with 336,766 more rows

We can do the same for the arrival and departure times. Those columns hold the time as a single number in HHMM form, so we need to pull out the hour (time %/% 100) and the minutes (time %% 100) with modulus arithmetic.

> make_datetime_100 <- function(year, month, day, time) {
+   make_datetime(year, month, day, time %/% 100, time %% 100)
+ }
> 
> flights_dt <- flights %>% 
+   filter(!is.na(dep_time), !is.na(arr_time)) %>% 
+   mutate(
+     dep_time = make_datetime_100(year, month, day, dep_time),
+     arr_time = make_datetime_100(year, month, day, arr_time),
+     sched_dep_time = make_datetime_100(year, month, day, 
+                                        sched_dep_time),
+     sched_arr_time = make_datetime_100(year, month, day, 
+                                        sched_arr_time)
+   ) %>% 
+   select(origin, dest, ends_with("delay"), ends_with("time"))
> 
> flights_dt
# A tibble: 328,063 x 9
   origin dest  dep_delay arr_delay dep_time            sched_dep_time     
   <chr>  <chr>     <dbl>     <dbl> <dttm>              <dttm>             
 1 EWR    IAH           2        11 2013-01-01 05:17:00 2013-01-01 05:15:00
 2 LGA    IAH           4        20 2013-01-01 05:33:00 2013-01-01 05:29:00
 3 JFK    MIA           2        33 2013-01-01 05:42:00 2013-01-01 05:40:00
 4 JFK    BQN          -1       -18 2013-01-01 05:44:00 2013-01-01 05:45:00
 5 LGA    ATL          -6       -25 2013-01-01 05:54:00 2013-01-01 06:00:00
 6 EWR    ORD          -4        12 2013-01-01 05:54:00 2013-01-01 05:58:00
 7 EWR    FLL          -5        19 2013-01-01 05:55:00 2013-01-01 06:00:00
 8 LGA    IAD          -3       -14 2013-01-01 05:57:00 2013-01-01 06:00:00
 9 JFK    MCO          -3        -8 2013-01-01 05:57:00 2013-01-01 06:00:00
10 LGA    ORD          -2         8 2013-01-01 05:58:00 2013-01-01 06:00:00
# ... with 328,053 more rows, and 3 more variables: arr_time <dttm>,
#   sched_arr_time <dttm>, air_time <dbl>

You can now view the frequency by day.

> flights_dt %>% 
+   ggplot(aes(dep_time)) + 
+   geom_freqpoly(binwidth = 86400,
+                 color="firebrick") # 86400 seconds = 1 day

Or the frequency across a single day.

> flights_dt %>% 
+   filter(dep_time < ymd(20130102)) %>% 
+   ggplot(aes(dep_time)) + 
+   geom_freqpoly(binwidth = 600,
+                 color="firebrick") # 600 s = 10 minutes

To switch between a date-time and a date use as_datetime() and as_date().

> as_datetime(today())
[1] "2020-09-30 UTC"
> as_date(now())
[1] "2020-09-30"

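Or, if you have offsets from the “Unix Epoch” (1970-01-01), you can convert them as well: as_datetime() takes an offset in seconds and as_date() takes an offset in days.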

> # offset in seconds
> as_datetime(60 * 60 * 10)
[1] "1970-01-01 10:00:00 UTC"
> # offset in days
> as_date(365 * 10 + 2)
[1] "1980-01-01"

Date-Time Components


It’s also useful to retrieve and set individual components.
> datetime <- ymd_hms("2016-07-08 12:34:56")
> year(datetime) # get year
[1] 2016
> month(datetime) # get month
[1] 7
> mday(datetime) # get day of the month
[1] 8
> yday(datetime) # get day of the year
[1] 190
> wday(datetime) # get day of the week
[1] 6

For month() and wday() you can also get an abbreviated or complete label.

> #abbreviated label
> month(datetime, label = TRUE)
[1] Jul
12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
> #full label
> wday(datetime, label = TRUE, abbr = FALSE)
[1] Friday
7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday

More flights depart during the week than on the weekend.

> flights_dt %>% 
+   mutate(wday = wday(dep_time, label = TRUE)) %>% 
+   ggplot(aes(x = wday)) +
+   geom_bar(fill="firebrick")

It looks like flights that actually leave in minutes 20-30 and 50-60 have lower delays than the rest of the hour, but there’s no such pattern when we look at scheduled departure times.

> library(gridExtra)
> 
> flights_dt2 <- flights_dt %>% 
+   mutate(minute = minute(dep_time)) %>% 
+   group_by(minute) %>% 
+   summarize(
+     avg_delay = mean(arr_delay, na.rm = TRUE),
+     n = n()) 
> 
> flights_dt2 %>% ggplot(aes(minute, avg_delay)) +
+   geom_line(color="firebrick")+
+   labs(y="avg dep delay")->p1
> 
> sched_dep <- flights_dt %>% 
+   mutate(minute = minute(sched_dep_time)) %>% 
+   group_by(minute) %>% 
+   summarise(
+     avg_delay = mean(arr_delay, na.rm = TRUE),
+     n = n())
> 
> sched_dep %>% ggplot(aes(minute, avg_delay)) +
+   geom_line(color="firebrick")+
+   labs(y="avg sched dep delay")->p2
> 
> grid.arrange(p1, p2, ncol = 2)

When we look at number of flights by minute we can see that most are in 5 minute intervals, with on-the-hour and on-the-half-hour as the most popular.

> ggplot(sched_dep, aes(minute, n)) +
+   geom_line(color="firebrick")

You can also round to a nearby unit of time with floor_date(), round_date(), and ceiling_date().

> #round to start of the week
> flights_dt %>% 
+   mutate(week = floor_date(dep_time, "week")) %>% 
+   select(dep_time,week)
# A tibble: 328,063 x 2
   dep_time            week               
   <dttm>              <dttm>             
 1 2013-01-01 05:17:00 2012-12-30 00:00:00
 2 2013-01-01 05:33:00 2012-12-30 00:00:00
 3 2013-01-01 05:42:00 2012-12-30 00:00:00
 4 2013-01-01 05:44:00 2012-12-30 00:00:00
 5 2013-01-01 05:54:00 2012-12-30 00:00:00
 6 2013-01-01 05:54:00 2012-12-30 00:00:00
 7 2013-01-01 05:55:00 2012-12-30 00:00:00
 8 2013-01-01 05:57:00 2012-12-30 00:00:00
 9 2013-01-01 05:57:00 2012-12-30 00:00:00
10 2013-01-01 05:58:00 2012-12-30 00:00:00
# ... with 328,053 more rows
> #round to nearest hour
> flights_dt %>% 
+   mutate(rdhour = round_date(dep_time, "hour")) %>% 
+   select(dep_time,rdhour)
# A tibble: 328,063 x 2
   dep_time            rdhour             
   <dttm>              <dttm>             
 1 2013-01-01 05:17:00 2013-01-01 05:00:00
 2 2013-01-01 05:33:00 2013-01-01 06:00:00
 3 2013-01-01 05:42:00 2013-01-01 06:00:00
 4 2013-01-01 05:44:00 2013-01-01 06:00:00
 5 2013-01-01 05:54:00 2013-01-01 06:00:00
 6 2013-01-01 05:54:00 2013-01-01 06:00:00
 7 2013-01-01 05:55:00 2013-01-01 06:00:00
 8 2013-01-01 05:57:00 2013-01-01 06:00:00
 9 2013-01-01 05:57:00 2013-01-01 06:00:00
10 2013-01-01 05:58:00 2013-01-01 06:00:00
# ... with 328,053 more rows
> #plot flights per week
> flights_dt %>% 
+   count(week = floor_date(dep_time, "week")) %>% 
+   ggplot(aes(week, n)) +
+   geom_line(color="firebrick")

You can also set the components of a date/time.

> (datetime <- ymd_hms("2016-07-08 12:34:56"))
[1] "2016-07-08 12:34:56 UTC"
> year(datetime) <- 2020
> datetime
[1] "2020-07-08 12:34:56 UTC"
> month(datetime) <- 01
> datetime
[1] "2020-01-08 12:34:56 UTC"
> hour(datetime) <- hour(datetime) + 1
> datetime
[1] "2020-01-08 13:34:56 UTC"

Or you can create a new date with update(), which allows you to set multiple values at once. If the values are too big the date will roll over.

> update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
[1] "2020-02-02 02:34:56 UTC"
> ymd("2015-02-01") %>% 
+   update(mday = 30)
[1] "2015-03-02"
> ymd("2015-02-01") %>% 
+   update(hour = 400)
[1] "2015-02-17 16:00:00 UTC"

By changing every day to Jan 1st with update(dep_time, yday = 1) we can view the frequency of flights across the course of a day for the entire year.

> flights_dt %>% 
+   mutate(dep_hour = update(dep_time, yday = 1)) %>% 
+   select(dep_time,dep_hour) %>% 
+   arrange(desc(dep_time))
# A tibble: 328,063 x 2
   dep_time            dep_hour           
   <dttm>              <dttm>             
 1 2013-12-31 23:56:00 2013-01-01 23:56:00
 2 2013-12-31 23:55:00 2013-01-01 23:55:00
 3 2013-12-31 23:32:00 2013-01-01 23:32:00
 4 2013-12-31 23:28:00 2013-01-01 23:28:00
 5 2013-12-31 23:21:00 2013-01-01 23:21:00
 6 2013-12-31 23:10:00 2013-01-01 23:10:00
 7 2013-12-31 22:45:00 2013-01-01 22:45:00
 8 2013-12-31 22:35:00 2013-01-01 22:35:00
 9 2013-12-31 22:18:00 2013-01-01 22:18:00
10 2013-12-31 22:11:00 2013-01-01 22:11:00
# ... with 328,053 more rows
> flights_dt %>% 
+   mutate(dep_hour = update(dep_time, yday = 1)) %>% 
+   ggplot(aes(dep_hour)) +
+   geom_freqpoly(binwidth = 300,
+                 color="firebrick")

Time Spans


There are three classes that represent time spans.
  1. Durations - represent exact number of seconds.
  2. Periods - represent human units like weeks and months.
  3. Intervals - represent a starting and ending point.

When you subtract two dates you get a difftime object. Its units could be seconds, minutes, hours, days, etc.

> # How old is Hadley?
> (h_age <- today() - ymd(19791014))
Time difference of 14962 days

To keep it consistent you could use duration (seconds).

> as.duration(h_age)
[1] "1292716800s (~40.96 years)"
> dseconds(15)
[1] "15s"
> dminutes(10)
[1] "600s (~10 minutes)"
> dhours(c(12, 24))
[1] "43200s (~12 hours)" "86400s (~1 days)"  
> ddays(0:5)
[1] "0s"                "86400s (~1 days)"  "172800s (~2 days)"
[4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
> dweeks(3)
[1] "1814400s (~3 weeks)"
> dyears(1)
[1] "31557600s (~1 years)"

You can add, multiply, and subtract durations.

> 2 * dyears(1)
[1] "63115200s (~2 years)"
> dyears(1) + dweeks(12) + dhours(15)
[1] "38869200s (~1.23 years)"
> tomorrow <- today() + ddays(1)
> last_year <- today() - dyears(1)

Since durations are converted at fixed rates - 60 seconds/min, 24 hours/day, 365.25 days/year (as the dyears(1) output above shows), etc. - you can get unusual results. Some days don’t have 24 hours (daylight saving time changes) and some years don’t have 365 days (leap years).

> # March 13th had only 23 hours - clocks moved ahead
> one_pm <- ymd_hms("2016-03-12 13:00:00", 
+                   tz = "America/New_York")
> one_pm + ddays(1)
[1] "2016-03-13 14:00:00 EDT"

Periods don’t have a fixed length in seconds, so they can be more intuitive.

> one_pm
[1] "2016-03-12 13:00:00 EST"
> one_pm + days(1)
[1] "2016-03-13 13:00:00 EDT"

The period functions have the same names as the duration functions, but without the extra “d” at the start.

> seconds(15)
[1] "15S"
> minutes(10)
[1] "10M 0S"
> hours(c(12, 24))
[1] "12H 0M 0S" "24H 0M 0S"
> days(7)
[1] "7d 0H 0M 0S"
> months(1:6)
[1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
[5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
> weeks(3)
[1] "21d 0H 0M 0S"
> years(1)
[1] "1y 0m 0d 0H 0M 0S"

Arithmetic is also possible.

> 10 * (months(6) + days(1))
[1] "60m 10d 0H 0M 0S"
> days(50) + hours(25) + minutes(2)
[1] "50d 25H 2M 0S"

Unlike durations, the results of period arithmetic aren’t thrown off by daylight saving time or leap years.

> # A leap year
> ymd("2016-01-01") + dyears(1)
[1] "2016-12-31 06:00:00 UTC"
> ymd("2016-01-01") + years(1)
[1] "2017-01-01"
> # Daylight Savings Time
> one_pm + ddays(1)
[1] "2016-03-13 14:00:00 EDT"
> one_pm + days(1)
[1] "2016-03-13 13:00:00 EDT"

There are a number of flights with arrival times less than departure times. They are overnight flights, but we used the departure day when setting the date-time. We can use days() to fix it.

> flights_dt %>% 
+   filter(arr_time < dep_time)
# A tibble: 10,633 x 9
   origin dest  dep_delay arr_delay dep_time            sched_dep_time     
   <chr>  <chr>     <dbl>     <dbl> <dttm>              <dttm>             
 1 EWR    BQN           9        -4 2013-01-01 19:29:00 2013-01-01 19:20:00
 2 JFK    DFW          59        NA 2013-01-01 19:39:00 2013-01-01 18:40:00
 3 EWR    TPA          -2         9 2013-01-01 20:58:00 2013-01-01 21:00:00
 4 EWR    SJU          -6       -12 2013-01-01 21:02:00 2013-01-01 21:08:00
 5 EWR    SFO          11       -14 2013-01-01 21:08:00 2013-01-01 20:57:00
 6 LGA    FLL         -10        -2 2013-01-01 21:20:00 2013-01-01 21:30:00
 7 EWR    MCO          41        43 2013-01-01 21:21:00 2013-01-01 20:40:00
 8 JFK    LAX          -7       -24 2013-01-01 21:28:00 2013-01-01 21:35:00
 9 EWR    FLL          49        28 2013-01-01 21:34:00 2013-01-01 20:45:00
10 EWR    FLL          -9       -14 2013-01-01 21:36:00 2013-01-01 21:45:00
# ... with 10,623 more rows, and 3 more variables: arr_time <dttm>,
#   sched_arr_time <dttm>, air_time <dbl>
> flights_dt <- flights_dt %>% 
+   mutate(
+     overnight = arr_time < dep_time,
+     arr_time = arr_time + days(overnight * 1),
+     sched_arr_time = sched_arr_time + days(overnight * 1)
+   )

Now it doesn’t seem like the planes traveled back in time.

> flights_dt %>% 
+   filter(overnight, arr_time < dep_time) 
# A tibble: 0 x 10
# ... with 10 variables: origin <chr>, dest <chr>, dep_delay <dbl>,
#   arr_delay <dbl>, dep_time <dttm>, sched_dep_time <dttm>, arr_time <dttm>,
#   sched_arr_time <dttm>, air_time <dbl>, overnight <lgl>

Sometimes period arithmetic can be unclear. Every four years we have an extra day, and R can’t be sure which year you are referring to, so dividing one period by another only gives an estimate.

> years(1) / days(1)
[1] 365.25

Instead we can use an interval.

> next_year <- today() + years(1)
> (today() %--% next_year)
[1] 2020-09-30 UTC--2021-09-30 UTC
> (today() %--% next_year) / ddays(1)
[1] 365

To find out how many whole periods fall into an interval, we need integer division.

> (today() %--% next_year) %/% days(1)
[1] 365

Time Zones


Timezones in R are represented by the form <continent>/<city>.

You can find your current time zone with:

> Sys.timezone()
[1] "America/New_York"

And see the list of all timezones with OlsonNames().

> length(OlsonNames())
[1] 594
> head(OlsonNames())
[1] "Africa/Abidjan"     "Africa/Accra"       "Africa/Addis_Ababa"
[4] "Africa/Algiers"     "Africa/Asmara"      "Africa/Asmera"     

These three objects represent the same instant in time, but with different time zones:

> (x1 <- ymd_hms("2015-06-01 12:00:00", tz = "America/New_York"))
[1] "2015-06-01 12:00:00 EDT"
> (x2 <- ymd_hms("2015-06-01 18:00:00", tz = "Europe/Copenhagen"))
[1] "2015-06-01 18:00:00 CEST"
> (x3 <- ymd_hms("2015-06-02 04:00:00", tz = "Pacific/Auckland"))
[1] "2015-06-02 04:00:00 NZST"

This can be confirmed with subtraction.

> x1 - x2
Time difference of 0 secs
> x1 - x3
Time difference of 0 secs

When they’re combined the timezone is dropped, so they all convert to local time.

> (x4 <- c(x1, x2, x3))
[1] "2015-06-01 12:00:00 EDT" "2015-06-01 12:00:00 EDT"
[3] "2015-06-01 12:00:00 EDT"

You can change the time zone in which the values are displayed using with_tz(). The underlying instant in time stays the same.

> (x4a <- with_tz(x4, tzone = "Australia/Lord_Howe"))
[1] "2015-06-02 02:30:00 +1030" "2015-06-02 02:30:00 +1030"
[3] "2015-06-02 02:30:00 +1030"
> x4a - x4
Time differences in secs
[1] 0 0 0

Note that with_tz() converts the displayed clock time from your local time zone to the time zone that you specified. If you want to keep the clock time as it is and just relabel it with a different time zone (which changes the underlying instant), you can do so with force_tz().

> (x4b <- force_tz(x4, tzone = "Australia/Lord_Howe"))
[1] "2015-06-01 12:00:00 +1030" "2015-06-01 12:00:00 +1030"
[3] "2015-06-01 12:00:00 +1030"
> x4b - x4
Time differences in hours
[1] -14.5 -14.5 -14.5