Objective

Understand and usage of functions from the package forcats (a short name for “For categorical variables”) and some base R functions. We also touch upon the use of lubridate and aweek for special factor cases related to epidemiological weeks.

Cat ^^

Load packages

pacman::p_load(
  rio,           # import/export
  here,          # filepaths
  lubridate,     # working with dates
  forcats,       # factors
  aweek,         # create epiweeks with automatic factor levels
  janitor,       # tables
  tidyverse      # data mgmt and viz
  )

Import data

linelist <- import(here("data", "linelist_cleaned.rds"))

Create new categorical variable

linelist <- linelist %>% 
  mutate(delay_cat = case_when(
    days_onset_hosp < 2                        ~ "<2 days",
    days_onset_hosp >= 2 & days_onset_hosp < 5 ~ "2-5 days",
    days_onset_hosp >= 5                       ~ ">5 days",
    is.na(days_onset_hosp)                     ~ NA_character_,
    TRUE                                       ~ "Check me"
  ))
table(linelist$delay_cat, useNA = "always")
## 
##  <2 days  >5 days 2-5 days     <NA> 
##     3182      603     2103        0
ggplot(data = linelist) +
  geom_bar(mapping = aes(x = delay_cat))

# Convert to factor Using fct_relevel.

linelist <- linelist %>% 
  mutate(delay_cat = fct_relevel(delay_cat))

The unique “values” in this column are now considered “levels” of the factor. The levels have an order. By default, the order of the levels will be alpha-numeric, as before. Note that NA is not a factor level.

levels(linelist$delay_cat)
## [1] "<2 days"  ">5 days"  "2-5 days"

The function fct_relevel() has the additional utility of allowing you to manually specify the level order.

# manualy define the levels 
linelist <- linelist %>% 
  mutate(delay_cat = fct_relevel(delay_cat, "<2 days", "2-5 days", ">5 days"))

linelist <- linelist %>% 
  mutate(delay_cat = fct_relevel(delay_cat, "2-5 days", ">5 days", "<2 days")) 

levels(linelist$delay_cat)
## [1] "2-5 days" ">5 days"  "<2 days"
ggplot(data = linelist) +
  geom_bar(mapping = aes(x = delay_cat))

Add or drop levels

Add

linelist <- linelist %>% 
  mutate(delay_cat = fct_expand(
    delay_cat, # factor variable
    "Not admitted to hospital",   # add new level 
    "Transfer to other jurisdiction"))  # add new level 

table(linelist$delay_cat, useNA = "always")
## 
##                       2-5 days                        >5 days 
##                           2103                            603 
##                        <2 days       Not admitted to hospital 
##                           3182                              0 
## Transfer to other jurisdiction                           <NA> 
##                              0                              0
# or
linelist %>% 
  mutate(delay_cat = fct_expand(
    delay_cat, # factor variable
    "Not admitted to hospital",   # add new level 
    "Transfer to other jurisdiction")) %>%  # add new level 
  janitor::tabyl(delay_cat)
##                       delay_cat    n   percent
##                        2-5 days 2103 0.3571671
##                         >5 days  603 0.1024117
##                         <2 days 3182 0.5404212
##        Not admitted to hospital    0 0.0000000
##  Transfer to other jurisdiction    0 0.0000000

Drop

linelist %>% 
  mutate(delay_cat = fct_drop(delay_cat)) %>% 
  tabyl(delay_cat)
##  delay_cat    n   percent
##   2-5 days 2103 0.3571671
##    >5 days  603 0.1024117
##    <2 days 3182 0.5404212
# "Not admitted to hospital", "Transfer to other jurisdiction" with 0 value are drop 

Adjust level order

Re-define level order

linelist <- linelist %>% 
  mutate(delay_cat = fct_relevel(delay_cat, c("<2 days", "2-5 days", ">5 days")))
# re-define level order
linelist %>% 
  mutate(delay_cat = fct_relevel(delay_cat, "<2 days", after = 1)) %>% 
  tabyl(delay_cat)
##                       delay_cat    n   percent
##                        2-5 days 2103 0.3571671
##                         <2 days 3182 0.5404212
##                         >5 days  603 0.1024117
##        Not admitted to hospital    0 0.0000000
##  Transfer to other jurisdiction    0 0.0000000
ggplot(data = linelist) +
  geom_bar(mapping = aes(x = delay_cat))

# Factor level order adjusted within ggplot 
ggplot(data = linelist) +
  geom_bar(mapping = aes(x = fct_relevel(delay_cat, c("2-5 days", ">5 days", "<2 days"))))

Reverse and By frequency

# ordered by frequency
a<- ggplot(data = linelist, aes(x = fct_infreq(delay_cat))) +
  geom_bar() +
  labs(x = "Delay onset to admission (days)",
       title = "Ordered by frequency")
# reversed frequency
b <- ggplot(data = linelist, aes(x = fct_rev(fct_infreq(delay_cat)))) +
  geom_bar() +
  labs(x = "Delay onset to admission (days)",
       title = "Reverse of order by frequency")
gridExtra::grid.arrange(a, b, ncol = 2, nrow = 2) # arrange plot in 1 same page

### By summary statistic of another column The x-axis is delay_cat; The y-axis is numeric column ct_blood (cycle-threshold value); Box plots show the CT value distribution by delay_cat group. We want to order the box plots in ascending order by the group median CT value.

# boxplots ordered by original factor levels
ggplot(data = linelist)+
  geom_boxplot(
    aes(x = delay_cat,
        y = ct_blood, 
        fill = delay_cat))+
  labs(x = "Delay onset to admission (days)",
       title = "Ordered by original alpha-numeric levels")+
  theme_classic()+
  theme(legend.position = "none")

# boxplots ordered by median CT value
ggplot(data = linelist)+
  geom_boxplot(
    aes(x = fct_reorder(delay_cat, ct_blood, "median"),
        y = ct_blood,
        fill = delay_cat))+
  labs(x = "Delay onset to admission (days)",
       title = "Ordered by median CT value in group")+
  theme_classic()+
  theme(legend.position = "none")

By “end” value

Using lubridate::floor_date

x <- ymd_hms("2023-04-18 12:01:59.23")
lubridate::floor_date(x, "day")
## [1] "2023-04-18 UTC"
lubridate::floor_date(x, "week")
## [1] "2023-04-16 UTC"
lubridate::floor_date(x, "month")
## [1] "2023-04-01 UTC"

Using fct_reorder2()

epidemic_data <- linelist %>% 
  filter(date_onset < as.Date("2014-09-21")) %>% # cut-off date, for visual clarity
  count(epiweek = lubridate::floor_date(date_onset, "week"), hospital) # get case counts per week and by hospital

view(epidemic_data)
plotweek <- ggplot(epidemic_data) +
  geom_line(
    aes(
      x = epiweek,
      y = n,
      color = fct_reorder2(hospital, epiweek, n))) +
  labs(title = "Factor levels (and legend display) by line, height at the end of plot",
       color = "Hospital")

plotweek

Missing values

Using fct_explicit_na()

linelist %>% 
  mutate(delay_cat = fct_na_value_to_level(delay_cat)) %>% 
  tabyl(delay_cat)
##                       delay_cat    n   percent
##                         <2 days 3182 0.5404212
##                        2-5 days 2103 0.3571671
##                         >5 days  603 0.1024117
##        Not admitted to hospital    0 0.0000000
##  Transfer to other jurisdiction    0 0.0000000
##                            <NA>    0 0.0000000

Combine levels

Factor recode

Using fct_recode() for the creation of new factor levels. fct_level use NEW = OLD

levels(linelist$delay_cat)
## [1] "<2 days"                        "2-5 days"                      
## [3] ">5 days"                        "Not admitted to hospital"      
## [5] "Transfer to other jurisdiction"

Syntax fct_recode(column, "new" = "old", "new" = "old", "new" = "old")

linelist %>% 
  mutate(delay_cat = fct_recode(
    delay_cat,
    "Less than 2 days" = "<2 days",
    "2 to 5 days" = "2-5 days",
    "More than 5 days" = ">5 days"
  )) %>% 
  tabyl(delay_cat)
##                       delay_cat    n   percent
##                Less than 2 days 3182 0.5404212
##                     2 to 5 days 2103 0.3571671
##                More than 5 days  603 0.1024117
##        Not admitted to hospital    0 0.0000000
##  Transfer to other jurisdiction    0 0.0000000

Reduce to “Other”

linelist %>% 
  mutate(hospital = fct_other(
    hospital,    # column, variable 
    keep = c("Port Hospital", "Central Hospital"),
    other_level = "Other Hospital"
  )) %>% 
  tabyl(hospital)
##          hospital    n    percent
##  Central Hospital  454 0.07710598
##     Port Hospital 1762 0.29925272
##    Other Hospital 3672 0.62364130

Reduce by frequency

Combine the least-frequent factor levels automatically using fct_lump(). To “lump” together many low-frequency levels into an “Other” group.

There are 2 ways: - Set n = as the number of groups you want to keep. The n most-frequent levels will be kept, and all others will combine into “Other”. - Set prop = as the threshold frequency proportion for levels above which you want to keep. All other values will combine into “Other”.

linelist %>% 
  mutate(hospital = fct_lump(
    hospital,
    n = 2,                         # keep top 2 levels
    other_level = "Other Hospital"
  )) %>% 
  tabyl(hospital)
##        hospital    n   percent
##         Missing 1469 0.2494905
##   Port Hospital 1762 0.2992527
##  Other Hospital 2657 0.4512568
linelist %>% 
  mutate(hospital = fct_lump(
    hospital, 
    prop = 0.29,
    other_level = "Other Hospital"
  )) %>% 
  tabyl(hospital)
##        hospital    n   percent
##   Port Hospital 1762 0.2992527
##  Other Hospital 4126 0.7007473

Epiweeks

Epiweek in plot

epiweek_date <- linelist %>% 
  mutate(epiweek_date = floor_date(date_onset, 'week')) %>% 
  ggplot() +
  geom_histogram(mapping = aes(x = epiweek_date)) +
  scale_x_date(date_labels = '%Y-W%W')

epiweek_date

# arrange plot in 1 same page - horizontal 
gridExtra::grid.arrange(epiweek_date, plotweek, ncol = 2, nrow = 2) 

# arrange plot in 1 same page - vertical 
if (!require("pacman")) install.packages("pacman")
pacman::p_load(ggplot2, cowplot)
plot_grid(
  epiweek_date,
  plotweek,
  align = "hv",
  ncol = 1
)

Epiweek in data

# Option 1
linelist <- linelist %>% 
  mutate(epiweek_date = floor_date(date_onset, 'week'),
         epiweek_formatted = format(epiweek_date, '%Y-W%W'),
         epiweek_formatted = factor(epiweek_formatted))

levels(linelist$epiweek_formatted)
##  [1] "2014-W13" "2014-W14" "2014-W15" "2014-W16" "2014-W17" "2014-W18"
##  [7] "2014-W19" "2014-W20" "2014-W21" "2014-W22" "2014-W23" "2014-W24"
## [13] "2014-W25" "2014-W26" "2014-W27" "2014-W28" "2014-W29" "2014-W30"
## [19] "2014-W31" "2014-W32" "2014-W33" "2014-W34" "2014-W35" "2014-W36"
## [25] "2014-W37" "2014-W38" "2014-W39" "2014-W40" "2014-W41" "2014-W42"
## [31] "2014-W43" "2014-W44" "2014-W45" "2014-W46" "2014-W47" "2014-W48"
## [37] "2014-W49" "2014-W50" "2014-W51" "2015-W00" "2015-W01" "2015-W02"
## [43] "2015-W03" "2015-W04" "2015-W05" "2015-W06" "2015-W07" "2015-W08"
## [49] "2015-W09" "2015-W10" "2015-W11" "2015-W12" "2015-W13" "2015-W14"
## [55] "2015-W15" "2015-W16"
# Option 2
df <- linelist %>% 
  mutate(epiweek = date2week(date_onset, week_start = 'Monday', factor = TRUE))

levels(df$epiweek)
##  [1] "2014-W15" "2014-W16" "2014-W17" "2014-W18" "2014-W19" "2014-W20"
##  [7] "2014-W21" "2014-W22" "2014-W23" "2014-W24" "2014-W25" "2014-W26"
## [13] "2014-W27" "2014-W28" "2014-W29" "2014-W30" "2014-W31" "2014-W32"
## [19] "2014-W33" "2014-W34" "2014-W35" "2014-W36" "2014-W37" "2014-W38"
## [25] "2014-W39" "2014-W40" "2014-W41" "2014-W42" "2014-W43" "2014-W44"
## [31] "2014-W45" "2014-W46" "2014-W47" "2014-W48" "2014-W49" "2014-W50"
## [37] "2014-W51" "2014-W52" "2015-W01" "2015-W02" "2015-W03" "2015-W04"
## [43] "2015-W05" "2015-W06" "2015-W07" "2015-W08" "2015-W09" "2015-W10"
## [49] "2015-W11" "2015-W12" "2015-W13" "2015-W14" "2015-W15" "2015-W16"
## [55] "2015-W17" "2015-W18"

References

  1. forcats
  2. aweek means ‘any week’
  3. How to vertically arrange ggplots with single set of axes and legend?
  4. R for Applied Epidemiologist