Chap15 Factors

Creating factors

# x1 holds valid month abbreviations; x2 is the same vector with a deliberate
# typo ("Jam") in the third position.
x1 <- c("Dec", "Apr", "Jan", "Mar")
x2 <- replace(x1, 3, "Jam")

# Sorting a character vector is alphabetical, which is not a useful order
# for months
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
# Valid levels in calendar order (month.abb is base R's built-in vector of
# three-letter English month abbreviations, "Jan" to "Dec")
month_levels <- month.abb

# Create the factor against that fixed set of levels
y1 <- factor(x = x1, levels = month_levels)
y1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
sort(y1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# Values that are not among the levels (e.g. typos) are silently converted
# to NA:
y2 <- factor(x = x2, levels = month_levels)
y2
## [1] Dec  Apr  <NA> Mar 
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# Alternatively, make the order of the levels match the order of first
# appearance in the data: either set levels = unique(x) when creating the
# factor, or apply fct_inorder() after the fact.
f1 <- factor(x = x1, levels = unique(x1))
f1
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
f2 <- fct_inorder(factor(x1))
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar

General Social Survey

gss_cat
## # A tibble: 21,483 × 9
##     year marital         age race  rincome        partyid    relig denom tvhours
##    <int> <fct>         <int> <fct> <fct>          <fct>      <fct> <fct>   <int>
##  1  2000 Never married    26 White $8000 to 9999  Ind,near … Prot… Sout…      12
##  2  2000 Divorced         48 White $8000 to 9999  Not str r… Prot… Bapt…      NA
##  3  2000 Widowed          67 White Not applicable Independe… Prot… No d…       2
##  4  2000 Never married    39 White Not applicable Ind,near … Orth… Not …       4
##  5  2000 Divorced         25 White Not applicable Not str d… None  Not …       1
##  6  2000 Married          25 White $20000 - 24999 Strong de… Prot… Sout…      NA
##  7  2000 Never married    36 White $25000 or more Not str r… Chri… Not …       3
##  8  2000 Divorced         44 White $7000 to 7999  Ind,near … Prot… Luth…      NA
##  9  2000 Married          44 White $25000 or more Not str d… Prot… Other       0
## 10  2000 Married          47 White $25000 or more Strong re… Prot… Sout…       3
## # ℹ 21,473 more rows

Modifying factor order

Unordered factor levels

# Transform data: average daily TV hours for each religion; missing tvhours
# values are excluded from the mean.
tvhours_by_relig <- summarise(
  group_by(gss_cat, relig),
  avg_tvhours = mean(tvhours, na.rm = TRUE)
)

tvhours_by_relig
## # A tibble: 15 × 2
##    relig                   avg_tvhours
##    <fct>                         <dbl>
##  1 No answer                      2.72
##  2 Don't know                     4.62
##  3 Inter-nondenominational        2.87
##  4 Native american                3.46
##  5 Christian                      2.79
##  6 Orthodox-christian             2.42
##  7 Moslem/islam                   2.44
##  8 Other eastern                  1.67
##  9 Hinduism                       1.89
## 10 Buddhism                       2.38
## 11 Other                          2.73
## 12 None                           2.71
## 13 Jewish                         2.52
## 14 Catholic                       2.96
## 15 Protestant                     3.15
# Plot: average TV hours per religion, with relig in its default level order
ggplot(tvhours_by_relig, aes(x = avg_tvhours, y = relig)) +
  geom_point()

This plot is difficult to interpret because there is no overall pattern, so we reorder the levels of relig with fct_reorder(), like this:

Ordered factor levels

# Reorder the relig levels by average TV hours before plotting, so the points
# follow a readable trend along the y axis.
tvhours_by_relig %>%
  mutate(relig = fct_reorder(.f = relig, .x = avg_tvhours)) %>%
  ggplot(aes(x = avg_tvhours, y = relig)) +
  geom_point() +
  # Labeling (the y-axis title is dropped)
  labs(y = NULL, x = "Mean Daily Hours Watching TV")

Moving a single level to the front

# Reorder the relig levels by average TV hours, then move "Don't know" to the
# front of the level order with fct_relevel() before plotting.
tvhours_by_relig %>%
  mutate(
    relig = relig %>%
      fct_reorder(.x = avg_tvhours) %>%
      fct_relevel("Don't know")
  ) %>%
  ggplot(aes(x = avg_tvhours, y = relig)) +
  geom_point() +
  # Labeling (the y-axis title is dropped)
  labs(y = NULL, x = "Mean Daily Hours Watching TV")

Modifying factor levels

# Rename the party-affiliation levels to a consistent "<Party>, <strength>"
# form. fct_recode() takes pairs written as "new level" = "old level"; levels
# that are not mentioned are left as they are. Then count rows per level.
gss_cat %>%
  mutate(
    partyid = fct_recode(
      partyid,
      "Republican, strong"    = "Strong republican",
      "Republican, weak"      = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak"        = "Not str democrat",
      "Democrat, strong"      = "Strong democrat"
    )
  ) %>%
  count(partyid)
## # A tibble: 10 × 2
##    partyid                   n
##    <fct>                 <int>
##  1 No answer               154
##  2 Don't know                1
##  3 Other party             393
##  4 Republican, strong     2314
##  5 Republican, weak       3032
##  6 Independent, near rep  1791
##  7 Independent            4119
##  8 Independent, near dem  2499
##  9 Democrat, weak         3690
## 10 Democrat, strong       3490
# fct_recode() pairs read new = old. Before recoding race, list the values
# that actually occur in the data.
distinct(gss_cat, race)
## # A tibble: 3 × 1
##   race 
##   <fct>
## 1 White
## 2 Black
## 3 Other
# Recode: rename the "Black" level to "African American" in a new column and
# show it next to the original for the affected rows.
gss_cat %>%
  filter(race == "Black") %>%
  select(race) %>%
  mutate(race_rev = fct_recode(race, "African American" = "Black"))
## # A tibble: 3,129 × 2
##    race  race_rev        
##    <fct> <fct>           
##  1 Black African American
##  2 Black African American
##  3 Black African American
##  4 Black African American
##  5 Black African American
##  6 Black African American
##  7 Black African American
##  8 Black African American
##  9 Black African American
## 10 Black African American
## # ℹ 3,119 more rows
# Collapse multiple levels into one: "Black" and "Other" both become
# "Minority" in a new column, shown next to the original for non-White rows.
gss_cat %>%
  filter(race != "White") %>%
  select(race) %>%
  mutate(race_col = fct_collapse(race, "Minority" = c("Black", "Other")))
## # A tibble: 5,088 × 2
##    race  race_col
##    <fct> <fct>   
##  1 Black Minority
##  2 Black Minority
##  3 Black Minority
##  4 Other Minority
##  5 Black Minority
##  6 Other Minority
##  7 Black Minority
##  8 Other Minority
##  9 Black Minority
## 10 Black Minority
## # ℹ 5,078 more rows
# Lump small levels into "Other". First look at how many rows each level has:
gss_cat %>% count(race)
## # A tibble: 3 × 2
##   race      n
##   <fct> <int>
## 1 Other  1959
## 2 Black  3129
## 3 White 16395
# fct_lump() is superseded; when called without n or prop it dispatches to
# fct_lump_lowfreq(), so call that directly. It lumps the least frequent
# levels together while keeping "Other" the smallest level.
gss_cat %>%
  mutate(race_lump = fct_lump_lowfreq(race)) %>%
  distinct(race_lump)
## # A tibble: 2 × 1
##   race_lump
##   <fct>    
## 1 White    
## 2 Other

Chap16 Dates and times

Introduction

Creating dates/times

From strings

# From strings ("-" works as a separator as well as "/")
ymd("2022/10/28")
## [1] "2022-10-28"

# From numbers
ymd(20221028)
## [1] "2022-10-28"
# Date-times from strings: use the _hms variant of the parser
ymd_hms("2022-10-28 4-41-30")
## [1] "2022-10-28 04:41:30 UTC"

From individual components

# Assemble a departure date-time from the separate year, month, day, hour and
# minute columns of flights.
flights %>%
  select(year:day, hour, minute) %>%
  mutate(departure = make_datetime(year, month, day, hour, minute))
## # A tibble: 336,776 × 6
##     year month   day  hour minute departure          
##    <int> <int> <int> <dbl>  <dbl> <dttm>             
##  1  2013     1     1     5     15 2013-01-01 05:15:00
##  2  2013     1     1     5     29 2013-01-01 05:29:00
##  3  2013     1     1     5     40 2013-01-01 05:40:00
##  4  2013     1     1     5     45 2013-01-01 05:45:00
##  5  2013     1     1     6      0 2013-01-01 06:00:00
##  6  2013     1     1     5     58 2013-01-01 05:58:00
##  7  2013     1     1     6      0 2013-01-01 06:00:00
##  8  2013     1     1     6      0 2013-01-01 06:00:00
##  9  2013     1     1     6      0 2013-01-01 06:00:00
## 10  2013     1     1     6      0 2013-01-01 06:00:00
## # ℹ 336,766 more rows

From other types

# Date -> date-time
as_datetime(today())
## [1] "2026-04-02 UTC"
# Date-time -> date
as_date(now())
## [1] "2026-04-02"

Date-time components

Getting components

# Parse a reference date-time, then pull individual components out of it
date_time <- "2022-10-28 18-18-18" %>% ymd_hms()
date_time
## [1] "2022-10-28 18:18:18 UTC"
# Year
date_time %>% year()
## [1] 2022
# Month, as a full-name label
date_time %>% month(label = TRUE, abbr = FALSE)
## [1] October
## 12 Levels: January < February < March < April < May < June < ... < December
# Day of the year
date_time %>% yday()
## [1] 301
# Day of the month
date_time %>% mday()
## [1] 28
# Day of the week, as a full-name label
date_time %>% wday(label = TRUE, abbr = FALSE)
## [1] Friday
## 7 Levels: Sunday < Monday < Tuesday < Wednesday < Thursday < ... < Saturday
#' Build a date-time from a date and a time encoded as a single number
#'
#' The time is split with integer arithmetic: `time %/% 100` is passed to
#' `make_datetime()` as the hour and `time %% 100` as the minute
#' (e.g. 517 -> hour 5, minute 17).
#'
#' @param year,month,day Date components, passed straight to
#'   `make_datetime()`.
#' @param time Numeric time whose last two digits are the minutes.
#' @return The value of `make_datetime()` for those components.
make_datetime_100 <- function(year, month, day, time) {
  hh <- time %/% 100
  mm <- time %% 100
  make_datetime(year, month, day, hh, mm)
}

# Build flights_dt: drop flights with a missing departure or arrival time,
# convert the four numeric time columns to date-times with
# make_datetime_100(), and keep only origin, dest, the delay columns and the
# time columns.
flights_dt <- flights %>%
  filter(!(is.na(dep_time) | is.na(arr_time))) %>%
  mutate(
    dep_time       = make_datetime_100(year, month, day, dep_time),
    arr_time       = make_datetime_100(year, month, day, arr_time),
    sched_dep_time = make_datetime_100(year, month, day, sched_dep_time),
    sched_arr_time = make_datetime_100(year, month, day, sched_arr_time)
  ) %>%
  select(
    origin,
    dest,
    ends_with("delay"),
    ends_with("time")
  )

flights_dt
## # A tibble: 328,063 × 9
##    origin dest  dep_delay arr_delay dep_time            sched_dep_time     
##    <chr>  <chr>     <dbl>     <dbl> <dttm>              <dttm>             
##  1 EWR    IAH           2        11 2013-01-01 05:17:00 2013-01-01 05:15:00
##  2 LGA    IAH           4        20 2013-01-01 05:33:00 2013-01-01 05:29:00
##  3 JFK    MIA           2        33 2013-01-01 05:42:00 2013-01-01 05:40:00
##  4 JFK    BQN          -1       -18 2013-01-01 05:44:00 2013-01-01 05:45:00
##  5 LGA    ATL          -6       -25 2013-01-01 05:54:00 2013-01-01 06:00:00
##  6 EWR    ORD          -4        12 2013-01-01 05:54:00 2013-01-01 05:58:00
##  7 EWR    FLL          -5        19 2013-01-01 05:55:00 2013-01-01 06:00:00
##  8 LGA    IAD          -3       -14 2013-01-01 05:57:00 2013-01-01 06:00:00
##  9 JFK    MCO          -3        -8 2013-01-01 05:57:00 2013-01-01 06:00:00
## 10 LGA    ORD          -2         8 2013-01-01 05:58:00 2013-01-01 06:00:00
## # ℹ 328,053 more rows
## # ℹ 3 more variables: arr_time <dttm>, sched_arr_time <dttm>, air_time <dbl>
# Bar chart of the number of departures per day of the week
flights_dt %>%
  mutate(wday = wday(dep_time, label = TRUE)) %>%
  ggplot(aes(x = wday)) +
  geom_bar()

Rounding

# floor_date() rounds down to the start of the given unit, here the month.
# The new column is named after that unit (it was previously called "week"
# although it holds the first instant of the month).
flights_dt %>%
  mutate(month = floor_date(dep_time, "month")) %>%
  select(dep_time, month) %>%
  slice(2000:2010) %>%
  sample_n(10)
## # A tibble: 10 × 2
##    dep_time            month              
##    <dttm>              <dttm>             
##  1 2013-01-03 09:12:00 2013-01-01 00:00:00
##  2 2013-01-03 09:14:00 2013-01-01 00:00:00
##  3 2013-01-03 09:17:00 2013-01-01 00:00:00
##  4 2013-01-03 09:13:00 2013-01-01 00:00:00
##  5 2013-01-03 09:23:00 2013-01-01 00:00:00
##  6 2013-01-03 09:14:00 2013-01-01 00:00:00
##  7 2013-01-03 09:23:00 2013-01-01 00:00:00
##  8 2013-01-03 09:23:00 2013-01-01 00:00:00
##  9 2013-01-03 09:16:00 2013-01-01 00:00:00
## 10 2013-01-03 09:12:00 2013-01-01 00:00:00
# ceiling_date() rounds up to the next boundary of the given unit, here the
# start of the following month. The new column is named after that unit (it
# was previously called "week" although the unit is "month").
flights_dt %>%
  mutate(month = ceiling_date(dep_time, "month")) %>%
  select(dep_time, month) %>%
  slice(2000:2010) %>%
  sample_n(10)
## # A tibble: 10 × 2
##    dep_time            month              
##    <dttm>              <dttm>             
##  1 2013-01-03 09:23:00 2013-02-01 00:00:00
##  2 2013-01-03 09:16:00 2013-02-01 00:00:00
##  3 2013-01-03 09:17:00 2013-02-01 00:00:00
##  4 2013-01-03 09:23:00 2013-02-01 00:00:00
##  5 2013-01-03 09:13:00 2013-02-01 00:00:00
##  6 2013-01-03 09:12:00 2013-02-01 00:00:00
##  7 2013-01-03 09:12:00 2013-02-01 00:00:00
##  8 2013-01-03 09:11:00 2013-02-01 00:00:00
##  9 2013-01-03 09:14:00 2013-02-01 00:00:00
## 10 2013-01-03 09:14:00 2013-02-01 00:00:00

Setting components

# update() sets components of a date-time. With yday = 1 every departure is
# moved to the first day of the year while its time of day is kept.
flights_dt %>%
  select(dep_time) %>%
  mutate(dep_hour = update(dep_time, yday = 1)) %>%
  sample_n(10)
## # A tibble: 10 × 2
##    dep_time            dep_hour           
##    <dttm>              <dttm>             
##  1 2013-03-01 18:32:00 2013-01-01 18:32:00
##  2 2013-12-19 14:49:00 2013-01-01 14:49:00
##  3 2013-01-25 19:26:00 2013-01-01 19:26:00
##  4 2013-10-30 06:30:00 2013-01-01 06:30:00
##  5 2013-12-28 16:20:00 2013-01-01 16:20:00
##  6 2013-02-24 13:39:00 2013-01-01 13:39:00
##  7 2013-09-21 06:51:00 2013-01-01 06:51:00
##  8 2013-03-28 07:58:00 2013-01-01 07:58:00
##  9 2013-01-25 23:32:00 2013-01-01 23:32:00
## 10 2013-01-10 19:41:00 2013-01-01 19:41:00
# Put all departures on the same day (yday = 1) and plot how they are
# distributed over the course of that day.
flights_dt %>%
  mutate(dep_hour = update(dep_time, yday = 1)) %>%
  ggplot(aes(x = dep_hour)) +
  geom_freqpoly(binwidth = 300) # date-time bins are in seconds: 300 s = 5 min

Time spans

# 1 pm New York time on 2016-03-12 (printed as EST below)
one_pm <- "2016-03-12 13:00:00" %>% ymd_hms(tz = "America/New_York")

one_pm
## [1] "2016-03-12 13:00:00 EST"
#> [1] "2016-03-12 13:00:00 EST"
# Add a duration of one day
one_pm + ddays(1)
## [1] "2016-03-13 14:00:00 EDT"
#> [1] "2016-03-13 14:00:00 EDT"

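# The result is 2 pm rather than 1 pm the next day because of daylight saving
# time: ddays(1) adds exactly 86,400 seconds, and the clocks moved forward one
# hour (EST -> EDT) on 2016-03-13.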