Ch 15
Introduction
Prerequisites
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Creating Factors
x1 <- c("Dec", "Apr", "Jan", "Mar")
month_levels <- c(
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
y1 <- factor(x1, levels = month_levels)
y1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
#> [1] Dec Apr Jan Mar
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
sort(y1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
#> [1] Jan Mar Apr Dec
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
factor(x1)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
#> [1] Dec Apr Jan Mar
#> Levels: Apr Dec Jan Mar
f1 <- factor(x1, levels = unique(x1))
f1
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
#> [1] Dec Apr Jan Mar
#> Levels: Dec Apr Jan Mar
f2 <- x1 %>% factor() %>% fct_inorder()
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
#> [1] Dec Apr Jan Mar
#> Levels: Dec Apr Jan Mar
levels(f2)
## [1] "Dec" "Apr" "Jan" "Mar"
#> [1] "Dec" "Apr" "Jan" "Mar"
General Social Survey
gss_cat
## # A tibble: 21,483 × 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near … Prot… Sout… 12
## 2 2000 Divorced 48 White $8000 to 9999 Not str r… Prot… Bapt… NA
## 3 2000 Widowed 67 White Not applicable Independe… Prot… No d… 2
## 4 2000 Never married 39 White Not applicable Ind,near … Orth… Not … 4
## 5 2000 Divorced 25 White Not applicable Not str d… None Not … 1
## 6 2000 Married 25 White $20000 - 24999 Strong de… Prot… Sout… NA
## 7 2000 Never married 36 White $25000 or more Not str r… Chri… Not … 3
## 8 2000 Divorced 44 White $7000 to 7999 Ind,near … Prot… Luth… NA
## 9 2000 Married 44 White $25000 or more Not str d… Prot… Other 0
## 10 2000 Married 47 White $25000 or more Strong re… Prot… Sout… 3
## # ℹ 21,473 more rows
#> # A tibble: 21,483 × 9
#> year marital age race rincome partyid relig denom tvhours
#> <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
#> 1 2000 Never married 26 White $8000 to 9999 Ind,near r… Prot… Sout… 12
#> 2 2000 Divorced 48 White $8000 to 9999 Not str re… Prot… Bapt… NA
#> 3 2000 Widowed 67 White Not applicable Independent Prot… No d… 2
#> 4 2000 Never married 39 White Not applicable Ind,near r… Orth… Not … 4
#> 5 2000 Divorced 25 White Not applicable Not str de… None Not … 1
#> 6 2000 Married 25 White $20000 - 24999 Strong dem… Prot… Sout… NA
#> # ℹ 21,477 more rows
gss_cat %>%
count(race)
## # A tibble: 3 × 2
## race n
## <fct> <int>
## 1 Other 1959
## 2 Black 3129
## 3 White 16395
#> # A tibble: 3 × 2
#> race n
#> <fct> <int>
#> 1 Other 1959
#> 2 Black 3129
#> 3 White 16395
ggplot(gss_cat, aes(race)) +
geom_bar()

ggplot(gss_cat, aes(race)) +
geom_bar() +
scale_x_discrete(drop = FALSE)

Modifying factor order
relig_summary <- gss_cat %>%
group_by(relig) %>%
summarise(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(relig_summary, aes(tvhours, relig)) + geom_point()

ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) +
geom_point()

relig_summary %>%
mutate(relig = fct_reorder(relig, tvhours)) %>%
ggplot(aes(tvhours, relig)) +
geom_point()

rincome_summary <- gss_cat %>%
group_by(rincome) %>%
summarise(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(rincome_summary, aes(age, fct_reorder(rincome, age))) + geom_point()

ggplot(rincome_summary, aes(age, fct_relevel(rincome, "Not applicable"))) +
geom_point()

by_age <- gss_cat %>%
filter(!is.na(age)) %>%
count(age, marital) %>%
group_by(age) %>%
mutate(prop = n / sum(n))
ggplot(by_age, aes(age, prop, colour = marital)) +
geom_line(na.rm = TRUE)

ggplot(by_age, aes(age, prop, colour = fct_reorder2(marital, age, prop))) +
geom_line() +
labs(colour = "marital")

Modifying factor levels
gss_cat %>% count(partyid)
## # A tibble: 10 × 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Strong republican 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
#> # A tibble: 10 × 2
#> partyid n
#> <fct> <int>
#> 1 No answer 154
#> 2 Don't know 1
#> 3 Other party 393
#> 4 Strong republican 2314
#> 5 Not str republican 3032
#> 6 Ind,near rep 1791
#> # ℹ 4 more rows
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat"
)) %>%
count(partyid)
## # A tibble: 10 × 2
## partyid n
## <fct> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Republican, strong 2314
## 5 Republican, weak 3032
## 6 Independent, near rep 1791
## 7 Independent 4119
## 8 Independent, near dem 2499
## 9 Democrat, weak 3690
## 10 Democrat, strong 3490
#> # A tibble: 10 × 2
#> partyid n
#> <fct> <int>
#> 1 No answer 154
#> 2 Don't know 1
#> 3 Other party 393
#> 4 Republican, strong 2314
#> 5 Republican, weak 3032
#> 6 Independent, near rep 1791
#> # ℹ 4 more rows
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat",
"Other" = "No answer",
"Other" = "Don't know",
"Other" = "Other party"
)) %>%
count(partyid)
## # A tibble: 8 × 2
## partyid n
## <fct> <int>
## 1 Other 548
## 2 Republican, strong 2314
## 3 Republican, weak 3032
## 4 Independent, near rep 1791
## 5 Independent 4119
## 6 Independent, near dem 2499
## 7 Democrat, weak 3690
## 8 Democrat, strong 3490
#> # A tibble: 8 × 2
#> partyid n
#> <fct> <int>
#> 1 Other 548
#> 2 Republican, strong 2314
#> 3 Republican, weak 3032
#> 4 Independent, near rep 1791
#> 5 Independent 4119
#> 6 Independent, near dem 2499
#> # ℹ 2 more rows
gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)) %>%
count(partyid)
## # A tibble: 4 × 2
## partyid n
## <fct> <int>
## 1 other 548
## 2 rep 5346
## 3 ind 8409
## 4 dem 7180
#> # A tibble: 4 × 2
#> partyid n
#> <fct> <int>
#> 1 other 548
#> 2 rep 5346
#> 3 ind 8409
#> 4 dem 7180
gss_cat %>%
mutate(relig = fct_lump(relig)) %>%
count(relig)
## # A tibble: 2 × 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Other 10637
#> # A tibble: 2 × 2
#> relig n
#> <fct> <int>
#> 1 Protestant 10846
#> 2 Other 10637
gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count(relig, sort = TRUE) %>%
print(n = Inf)
## # A tibble: 10 × 2
## relig n
## <fct> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Christian 689
## 5 Other 458
## 6 Jewish 388
## 7 Buddhism 147
## 8 Inter-nondenominational 109
## 9 Moslem/islam 104
## 10 Orthodox-christian 95
#> # A tibble: 10 × 2
#> relig n
#> <fct> <int>
#> 1 Protestant 10846
#> 2 Catholic 5124
#> 3 None 3523
#> 4 Christian 689
#> 5 Other 458
#> 6 Jewish 388
#> 7 Buddhism 147
#> 8 Inter-nondenominational 109
#> 9 Moslem/islam 104
#> 10 Orthodox-christian 95
Ch 16
Prerequisites
library(tidyverse)
library(lubridate)
library(nycflights13)
Creating date/times
today()
## [1] "2025-04-09"
#> [1] "2025-02-18"
now()
## [1] "2025-04-09 17:09:05 EDT"
#> [1] "2025-02-18 17:31:20 UTC"
From Strings
ymd("2017-01-31")
## [1] "2017-01-31"
#> [1] "2017-01-31"
mdy("January 31st, 2017")
## [1] "2017-01-31"
#> [1] "2017-01-31"
dmy("31-Jan-2017")
## [1] "2017-01-31"
#> [1] "2017-01-31"
ymd(20170131)
## [1] "2017-01-31"
#> [1] "2017-01-31"
ymd_hms("2017-01-31 20:11:59")
## [1] "2017-01-31 20:11:59 UTC"
#> [1] "2017-01-31 20:11:59 UTC"
mdy_hm("01/31/2017 08:01")
## [1] "2017-01-31 08:01:00 UTC"
#> [1] "2017-01-31 08:01:00 UTC"
ymd(20170131, tz = "UTC")
## [1] "2017-01-31 UTC"
#> [1] "2017-01-31 UTC"
From individual Components
flights %>%
select(year, month, day, hour, minute)
## # A tibble: 336,776 × 5
## year month day hour minute
## <int> <int> <int> <dbl> <dbl>
## 1 2013 1 1 5 15
## 2 2013 1 1 5 29
## 3 2013 1 1 5 40
## 4 2013 1 1 5 45
## 5 2013 1 1 6 0
## 6 2013 1 1 5 58
## 7 2013 1 1 6 0
## 8 2013 1 1 6 0
## 9 2013 1 1 6 0
## 10 2013 1 1 6 0
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 5
#> year month day hour minute
#> <int> <int> <int> <dbl> <dbl>
#> 1 2013 1 1 5 15
#> 2 2013 1 1 5 29
#> 3 2013 1 1 5 40
#> 4 2013 1 1 5 45
#> 5 2013 1 1 6 0
#> 6 2013 1 1 5 58
#> # ℹ 336,770 more rows
flights %>%
select(year, month, day, hour, minute) %>%
mutate(departure = make_datetime(year, month, day, hour, minute))
## # A tibble: 336,776 × 6
## year month day hour minute departure
## <int> <int> <int> <dbl> <dbl> <dttm>
## 1 2013 1 1 5 15 2013-01-01 05:15:00
## 2 2013 1 1 5 29 2013-01-01 05:29:00
## 3 2013 1 1 5 40 2013-01-01 05:40:00
## 4 2013 1 1 5 45 2013-01-01 05:45:00
## 5 2013 1 1 6 0 2013-01-01 06:00:00
## 6 2013 1 1 5 58 2013-01-01 05:58:00
## 7 2013 1 1 6 0 2013-01-01 06:00:00
## 8 2013 1 1 6 0 2013-01-01 06:00:00
## 9 2013 1 1 6 0 2013-01-01 06:00:00
## 10 2013 1 1 6 0 2013-01-01 06:00:00
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 6
#> year month day hour minute departure
#> <int> <int> <int> <dbl> <dbl> <dttm>
#> 1 2013 1 1 5 15 2013-01-01 05:15:00
#> 2 2013 1 1 5 29 2013-01-01 05:29:00
#> 3 2013 1 1 5 40 2013-01-01 05:40:00
#> 4 2013 1 1 5 45 2013-01-01 05:45:00
#> 5 2013 1 1 6 0 2013-01-01 06:00:00
#> 6 2013 1 1 5 58 2013-01-01 05:58:00
#> # ℹ 336,770 more rows
make_datetime_100 <- function(year, month, day, time) {
make_datetime(year, month, day, time %/% 100, time %% 100)
}
flights_dt <- flights %>%
filter(!is.na(dep_time), !is.na(arr_time)) %>%
mutate(
dep_time = make_datetime_100(year, month, day, dep_time),
arr_time = make_datetime_100(year, month, day, arr_time),
sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
) %>%
select(origin, dest, ends_with("delay"), ends_with("time"))
flights_dt
## # A tibble: 328,063 × 9
## origin dest dep_delay arr_delay dep_time sched_dep_time
## <chr> <chr> <dbl> <dbl> <dttm> <dttm>
## 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00
## 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00
## 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00
## 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00
## 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00
## 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00
## 7 EWR FLL -5 19 2013-01-01 05:55:00 2013-01-01 06:00:00
## 8 LGA IAD -3 -14 2013-01-01 05:57:00 2013-01-01 06:00:00
## 9 JFK MCO -3 -8 2013-01-01 05:57:00 2013-01-01 06:00:00
## 10 LGA ORD -2 8 2013-01-01 05:58:00 2013-01-01 06:00:00
## # ℹ 328,053 more rows
## # ℹ 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>
#> # A tibble: 328,063 × 9
#> origin dest dep_delay arr_delay dep_time sched_dep_time
#> <chr> <chr> <dbl> <dbl> <dttm> <dttm>
#> 1 EWR IAH 2 11 2013-01-01 05:17:00 2013-01-01 05:15:00
#> 2 LGA IAH 4 20 2013-01-01 05:33:00 2013-01-01 05:29:00
#> 3 JFK MIA 2 33 2013-01-01 05:42:00 2013-01-01 05:40:00
#> 4 JFK BQN -1 -18 2013-01-01 05:44:00 2013-01-01 05:45:00
#> 5 LGA ATL -6 -25 2013-01-01 05:54:00 2013-01-01 06:00:00
#> 6 EWR ORD -4 12 2013-01-01 05:54:00 2013-01-01 05:58:00
#> # ℹ 328,057 more rows
#> # ℹ 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>
flights_dt %>%
ggplot(aes(dep_time)) +
geom_freqpoly(binwidth = 86400) # 86400 seconds = 1 day

From other types
as_datetime(today())
## [1] "2025-04-09 UTC"
#> [1] "2025-02-18 UTC"
as_date(now())
## [1] "2025-04-09"
#> [1] "2025-02-18"
as_datetime(60 * 60 * 10)
## [1] "1970-01-01 10:00:00 UTC"
#> [1] "1970-01-01 10:00:00 UTC"
as_date(365 * 10 + 2)
## [1] "1980-01-01"
#> [1] "1980-01-01"
Date-time components
datetime <- ymd_hms("2016-07-08 12:34:56")
year(datetime)
## [1] 2016
#> [1] 2016
month(datetime)
## [1] 7
#> [1] 7
mday(datetime)
## [1] 8
#> [1] 8
yday(datetime)
## [1] 190
#> [1] 190
wday(datetime)
## [1] 6
#> [1] 6
month(datetime, label = TRUE)
## [1] Jul
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
#> [1] Jul
#> 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(datetime, label = TRUE, abbr = FALSE)
## [1] Friday
## 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
#> [1] Friday
#> 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
flights_dt %>%
mutate(wday = wday(dep_time, label = TRUE)) %>%
ggplot(aes(x = wday)) +
geom_bar()

flights_dt %>%
count(week = floor_date(dep_time, "week")) %>%
ggplot(aes(week, n)) +
geom_line()

(datetime <- ymd_hms("2016-07-08 12:34:56"))
## [1] "2016-07-08 12:34:56 UTC"
#> [1] "2016-07-08 12:34:56 UTC"
year(datetime) <- 2020
datetime
## [1] "2020-07-08 12:34:56 UTC"
#> [1] "2020-07-08 12:34:56 UTC"
month(datetime) <- 01
datetime
## [1] "2020-01-08 12:34:56 UTC"
#> [1] "2020-01-08 12:34:56 UTC"
hour(datetime) <- hour(datetime) + 1
datetime
## [1] "2020-01-08 13:34:56 UTC"
#> [1] "2020-01-08 13:34:56 UTC"
update(datetime, year = 2020, month = 2, mday = 2, hour = 2)
## [1] "2020-02-02 02:34:56 UTC"
#> [1] "2020-02-02 02:34:56 UTC"
ymd("2015-02-01") %>%
update(mday = 30)
## [1] "2015-03-02"
#> [1] "2015-03-02"
ymd("2015-02-01") %>%
update(hour = 400)
## [1] "2015-02-17 16:00:00 UTC"
#> [1] "2015-02-17 16:00:00 UTC"
flights_dt %>%
mutate(dep_hour = update(dep_time, yday = 1)) %>%
ggplot(aes(dep_hour)) +
geom_freqpoly(binwidth = 300)

Time spans
# How old is Hadley?
h_age <- today() - ymd(19791014)
h_age
## Time difference of 16614 days
#> Time difference of 16564 days
2 * dyears(1)
## [1] "63115200s (~2 years)"
#> [1] "63115200s (~2 years)"
dyears(1) + dweeks(12) + dhours(15)
## [1] "38869200s (~1.23 years)"
#> [1] "38869200s (~1.23 years)"
one_pm <- ymd_hms("2016-03-12 13:00:00", tz = "America/New_York")
one_pm
## [1] "2016-03-12 13:00:00 EST"
#> [1] "2016-03-12 13:00:00 EST"
one_pm + ddays(1)
## [1] "2016-03-13 14:00:00 EDT"
#> [1] "2016-03-13 14:00:00 EDT"
Periods
one_pm
## [1] "2016-03-12 13:00:00 EST"
#> [1] "2016-03-12 13:00:00 EST"
one_pm + days(1)
## [1] "2016-03-13 13:00:00 EDT"
#> [1] "2016-03-13 13:00:00 EDT"
seconds(15)
## [1] "15S"
#> [1] "15S"
minutes(10)
## [1] "10M 0S"
#> [1] "10M 0S"
hours(c(12, 24))
## [1] "12H 0M 0S" "24H 0M 0S"
#> [1] "12H 0M 0S" "24H 0M 0S"
days(7)
## [1] "7d 0H 0M 0S"
#> [1] "7d 0H 0M 0S"
months(1:6)
## [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
## [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
#> [1] "1m 0d 0H 0M 0S" "2m 0d 0H 0M 0S" "3m 0d 0H 0M 0S" "4m 0d 0H 0M 0S"
#> [5] "5m 0d 0H 0M 0S" "6m 0d 0H 0M 0S"
weeks(3)
## [1] "21d 0H 0M 0S"
#> [1] "21d 0H 0M 0S"
years(1)
## [1] "1y 0m 0d 0H 0M 0S"
#> [1] "1y 0m 0d 0H 0M 0S"
10 * (months(6) + days(1))
## [1] "60m 10d 0H 0M 0S"
#> [1] "60m 10d 0H 0M 0S"
days(50) + hours(25) + minutes(2)
## [1] "50d 25H 2M 0S"
#> [1] "50d 25H 2M 0S"
Intervals
years(1) / days(1)
## [1] 365.25
#> [1] 365.25
next_year <- today() + years(1)
(today() %--% next_year) / ddays(1)
## [1] 365
#> [1] 365
Time zones
Sys.timezone()
## [1] "America/New_York"
#> [1] "UTC"
length(OlsonNames())
## [1] 597
#> [1] 597
head(OlsonNames())
## [1] "Africa/Abidjan" "Africa/Accra" "Africa/Addis_Ababa"
## [4] "Africa/Algiers" "Africa/Asmara" "Africa/Asmera"
#> [1] "Africa/Abidjan" "Africa/Accra" "Africa/Addis_Ababa"
#> [4] "Africa/Algiers" "Africa/Asmara" "Africa/Asmera"
(x1 <- ymd_hms("2015-06-01 12:00:00", tz = "America/New_York"))
## [1] "2015-06-01 12:00:00 EDT"
#> [1] "2015-06-01 12:00:00 EDT"
(x2 <- ymd_hms("2015-06-01 18:00:00", tz = "Europe/Copenhagen"))
## [1] "2015-06-01 18:00:00 CEST"
#> [1] "2015-06-01 18:00:00 CEST"
(x3 <- ymd_hms("2015-06-02 04:00:00", tz = "Pacific/Auckland"))
## [1] "2015-06-02 04:00:00 NZST"
#> [1] "2015-06-02 04:00:00 NZST"