Chapter 15

Creating Factors

x1 <- c("Dec", "Apr", "Jan", "Mar")
x2 <- c("Dec", "Apr", "Jam", "Mar")
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
month_levels <- c(
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)

# orders months of x1 
y1 <- factor(x1, levels = month_levels)
y1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
sort(y1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# orders months of x2 correcting incorrect values with NA
y2 <- factor(x2, levels = month_levels)
y2
## [1] Dec  Apr  <NA> Mar 
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
y2 <- parse_factor(x2, levels = month_levels)
## Warning: 1 parsing failure.
## row col           expected actual
##   3  -- value in level set    Jam
factor(x1)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
f1 <- factor(x1, levels = unique(x1))
f1
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
f2 <- x1 %>% factor() %>% fct_inorder()
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar

General Social Survey

ggplot(gss_cat, aes(race)) +
    geom_bar()

ggplot(gss_cat, aes(race)) +
    geom_bar() +
    scale_x_discrete(drop = FALSE)

relig_summary <- gss_cat %>%
    group_by(relig) %>%
    summarise(
        age = mean(age, na.rm = TRUE),
        tvhours = mean(tvhours, na.rm = TRUE),
        n = n()
    )

ggplot(relig_summary, aes(tvhours, relig)) + geom_point()

Modifying Factor Order

relig_summary %>%
    mutate(relig = fct_reorder(relig, tvhours)) %>%
    ggplot(aes(tvhours, relig)) +
    geom_point()

rincome_summary <- gss_cat %>%
    group_by(rincome) %>%
    summarise(
        age = mean(age, na.rm = TRUE),
        tvhours = mean(tvhours, na.rm = TRUE),
        n = n()
    )

ggplot(rincome_summary, aes(age, fct_reorder(rincome, age))) + geom_point()

ggplot(rincome_summary, aes(age, fct_relevel(rincome, "Not applicable"))) +
    geom_point()

Chapter 16

Creating Date/Times

today()
## [1] "2025-04-12"
now()
## [1] "2025-04-12 19:32:00 EDT"
ymd("2025-04-12")
## [1] "2025-04-12"
mdy("April 12th, 2025")
## [1] "2025-04-12"
dmy("12-Apr-2025")
## [1] "2025-04-12"
ymd(20250412)
## [1] "2025-04-12"
ymd_hms("2025-04-12 20:11:59")
## [1] "2025-04-12 20:11:59 UTC"
mdy_hm("04/12/2025 08:01")
## [1] "2025-04-12 08:01:00 UTC"
ymd(20250412, tz = "UTC")
## [1] "2025-04-12 UTC"
flights %>%
    select(year, month, day, hour, minute)
## # A tibble: 336,776 × 5
##     year month   day  hour minute
##    <int> <int> <int> <dbl>  <dbl>
##  1  2013     1     1     5     15
##  2  2013     1     1     5     29
##  3  2013     1     1     5     40
##  4  2013     1     1     5     45
##  5  2013     1     1     6      0
##  6  2013     1     1     5     58
##  7  2013     1     1     6      0
##  8  2013     1     1     6      0
##  9  2013     1     1     6      0
## 10  2013     1     1     6      0
## # ℹ 336,766 more rows
flights %>%
    select(year, month, day, hour, minute) %>%
    mutate(departure = make_datetime(year, month, day, hour, minute))
## # A tibble: 336,776 × 6
##     year month   day  hour minute departure          
##    <int> <int> <int> <dbl>  <dbl> <dttm>             
##  1  2013     1     1     5     15 2013-01-01 05:15:00
##  2  2013     1     1     5     29 2013-01-01 05:29:00
##  3  2013     1     1     5     40 2013-01-01 05:40:00
##  4  2013     1     1     5     45 2013-01-01 05:45:00
##  5  2013     1     1     6      0 2013-01-01 06:00:00
##  6  2013     1     1     5     58 2013-01-01 05:58:00
##  7  2013     1     1     6      0 2013-01-01 06:00:00
##  8  2013     1     1     6      0 2013-01-01 06:00:00
##  9  2013     1     1     6      0 2013-01-01 06:00:00
## 10  2013     1     1     6      0 2013-01-01 06:00:00
## # ℹ 336,766 more rows
make_datetime_100 <- function(year, month, day, time) {
    make_datetime(year, month, day, time %/% 100, time %% 100)
}

flights_dt <- flights %>%
    filter(!is.na(dep_time), !is.na(arr_time)) %>%
    mutate(
        dep_time = make_datetime_100(year, month, day, dep_time),
        arr_time = make_datetime_100(year, month, day, arr_time),
        sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
        sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
    ) %>%
    select(origin, dest, ends_with("delay"), ends_with("time"))

flights_dt
## # A tibble: 328,063 × 9
##    origin dest  dep_delay arr_delay dep_time            sched_dep_time     
##    <chr>  <chr>     <dbl>     <dbl> <dttm>              <dttm>             
##  1 EWR    IAH           2        11 2013-01-01 05:17:00 2013-01-01 05:15:00
##  2 LGA    IAH           4        20 2013-01-01 05:33:00 2013-01-01 05:29:00
##  3 JFK    MIA           2        33 2013-01-01 05:42:00 2013-01-01 05:40:00
##  4 JFK    BQN          -1       -18 2013-01-01 05:44:00 2013-01-01 05:45:00
##  5 LGA    ATL          -6       -25 2013-01-01 05:54:00 2013-01-01 06:00:00
##  6 EWR    ORD          -4        12 2013-01-01 05:54:00 2013-01-01 05:58:00
##  7 EWR    FLL          -5        19 2013-01-01 05:55:00 2013-01-01 06:00:00
##  8 LGA    IAD          -3       -14 2013-01-01 05:57:00 2013-01-01 06:00:00
##  9 JFK    MCO          -3        -8 2013-01-01 05:57:00 2013-01-01 06:00:00
## 10 LGA    ORD          -2         8 2013-01-01 05:58:00 2013-01-01 06:00:00
## # ℹ 328,053 more rows
## # ℹ 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>
flights_dt %>%
    ggplot(aes(dep_time)) +
    geom_freqpoly(binwidth = 86400)

flights_dt %>% 
    filter(dep_time < ymd(20130102)) %>%
    ggplot(aes(dep_time)) +
    geom_freqpoly(binwidth = 600) # 600 s = 10 min

Date-Time Components

datetime <- ymd_hms("2016-07-08 12:34:56")

year(datetime)
## [1] 2016
month(datetime)
## [1] 7
mday(datetime)
## [1] 8
yday(datetime)
## [1] 190
wday(datetime)
## [1] 6
month(datetime, label = TRUE)
## [1] Jul
## 12 Levels: Jan < Feb < Mar < Apr < May < Jun < Jul < Aug < Sep < ... < Dec
wday(datetime, label = TRUE, abbr = FALSE)
## [1] Friday
## 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
flights_dt %>%
    mutate(wday = wday(dep_time, label = TRUE)) %>%
    ggplot(aes(x = wday)) +
    geom_bar()

flights_dt %>%
    mutate(minute = minute(dep_time)) %>%
    group_by(minute) %>%
    summarise(
        avg_delay = mean(arr_delay, na.rm = TRUE),
        n = n()) %>%
    ggplot(aes(minute, avg_delay)) +
    geom_line()

sched_dep <- flights_dt %>%
    mutate(minute = minute(sched_dep_time)) %>%
    group_by(minute) %>%
    summarise(
        avg_delay = mean(arr_delay, na.rm = TRUE),
        n = n())

ggplot(sched_dep, aes(minute, avg_delay)) +
    geom_line()

ggplot(sched_dep, aes(minute, n)) +
    geom_line()

flights_dt %>%
    mutate(dep_hour = update(dep_time, yday = 1)) %>%
    ggplot(aes(dep_hour)) +
    geom_freqpoly(binwidth = 300)

Time Spans

h_age <- today() - ymd(20050911)
h_age
## Time difference of 7153 days
as.duration(h_age)
## [1] "618019200s (~19.58 years)"
dseconds(15)
## [1] "15s"
dminutes(10)
## [1] "600s (~10 minutes)"
dhours(c(12, 24))
## [1] "43200s (~12 hours)" "86400s (~1 days)"
ddays(0:5)
## [1] "0s"                "86400s (~1 days)"  "172800s (~2 days)"
## [4] "259200s (~3 days)" "345600s (~4 days)" "432000s (~5 days)"
dweeks(3)
## [1] "1814400s (~3 weeks)"
dyears(1)
## [1] "31557600s (~1 years)"
2 * dyears(1)
## [1] "63115200s (~2 years)"
dyears(1) + dweeks(12) + dhours(15)
## [1] "38869200s (~1.23 years)"
tomorrow <- today() + ddays(1)
last_year <- today() - dyears(1)

tomorrow
## [1] "2025-04-13"
last_year
## [1] "2024-04-11 18:00:00 UTC"