Understand and usage of functions from the package
forcats (a short name for “For categorical variables”) and
some base R functions. We also touch upon the use of lubridate and aweek
for special factor cases related to epidemiological weeks.
Cat ^^
pacman::p_load(
rio, # import/export
here, # filepaths
lubridate, # working with dates
forcats, # factors
aweek, # create epiweeks with automatic factor levels
janitor, # tables
tidyverse # data mgmt and viz
)
linelist <- import(here("data", "linelist_cleaned.rds"))
linelist <- linelist %>%
mutate(delay_cat = case_when(
days_onset_hosp < 2 ~ "<2 days",
days_onset_hosp >= 2 & days_onset_hosp < 5 ~ "2-5 days",
days_onset_hosp >= 5 ~ ">5 days",
is.na(days_onset_hosp) ~ NA_character_,
TRUE ~ "Check me"
))
table(linelist$delay_cat, useNA = "always")
##
## <2 days >5 days 2-5 days <NA>
## 3182 603 2103 0
ggplot(data = linelist) +
geom_bar(mapping = aes(x = delay_cat))
# Convert to factor Using
fct_relevel.
linelist <- linelist %>%
mutate(delay_cat = fct_relevel(delay_cat))
The unique “values” in this column are now considered “levels” of the factor. The levels have an order. By default, the order of the levels will be alpha-numeric, as before. Note that NA is not a factor level.
levels(linelist$delay_cat)
## [1] "<2 days" ">5 days" "2-5 days"
The function fct_relevel() has the additional utility of
allowing you to manually specify the level order.
# manualy define the levels
linelist <- linelist %>%
mutate(delay_cat = fct_relevel(delay_cat, "<2 days", "2-5 days", ">5 days"))
linelist <- linelist %>%
mutate(delay_cat = fct_relevel(delay_cat, "2-5 days", ">5 days", "<2 days"))
levels(linelist$delay_cat)
## [1] "2-5 days" ">5 days" "<2 days"
ggplot(data = linelist) +
geom_bar(mapping = aes(x = delay_cat))
linelist <- linelist %>%
mutate(delay_cat = fct_expand(
delay_cat, # factor variable
"Not admitted to hospital", # add new level
"Transfer to other jurisdiction")) # add new level
table(linelist$delay_cat, useNA = "always")
##
## 2-5 days >5 days
## 2103 603
## <2 days Not admitted to hospital
## 3182 0
## Transfer to other jurisdiction <NA>
## 0 0
# or
linelist %>%
mutate(delay_cat = fct_expand(
delay_cat, # factor variable
"Not admitted to hospital", # add new level
"Transfer to other jurisdiction")) %>% # add new level
janitor::tabyl(delay_cat)
## delay_cat n percent
## 2-5 days 2103 0.3571671
## >5 days 603 0.1024117
## <2 days 3182 0.5404212
## Not admitted to hospital 0 0.0000000
## Transfer to other jurisdiction 0 0.0000000
linelist %>%
mutate(delay_cat = fct_drop(delay_cat)) %>%
tabyl(delay_cat)
## delay_cat n percent
## 2-5 days 2103 0.3571671
## >5 days 603 0.1024117
## <2 days 3182 0.5404212
# "Not admitted to hospital", "Transfer to other jurisdiction" with 0 value are drop
linelist <- linelist %>%
mutate(delay_cat = fct_relevel(delay_cat, c("<2 days", "2-5 days", ">5 days")))
# re-define level order
linelist %>%
mutate(delay_cat = fct_relevel(delay_cat, "<2 days", after = 1)) %>%
tabyl(delay_cat)
## delay_cat n percent
## 2-5 days 2103 0.3571671
## <2 days 3182 0.5404212
## >5 days 603 0.1024117
## Not admitted to hospital 0 0.0000000
## Transfer to other jurisdiction 0 0.0000000
ggplot(data = linelist) +
geom_bar(mapping = aes(x = delay_cat))
# Factor level order adjusted within ggplot
ggplot(data = linelist) +
geom_bar(mapping = aes(x = fct_relevel(delay_cat, c("2-5 days", ">5 days", "<2 days"))))
# ordered by frequency
a<- ggplot(data = linelist, aes(x = fct_infreq(delay_cat))) +
geom_bar() +
labs(x = "Delay onset to admission (days)",
title = "Ordered by frequency")
# reversed frequency
b <- ggplot(data = linelist, aes(x = fct_rev(fct_infreq(delay_cat)))) +
geom_bar() +
labs(x = "Delay onset to admission (days)",
title = "Reverse of order by frequency")
gridExtra::grid.arrange(a, b, ncol = 2, nrow = 2) # arrange plot in 1 same page
### By summary statistic of another column The x-axis is delay_cat; The
y-axis is numeric column ct_blood (cycle-threshold value); Box plots
show the CT value distribution by delay_cat
group. We want to order the box plots in ascending order by the
group median CT value.
# boxplots ordered by original factor levels
ggplot(data = linelist)+
geom_boxplot(
aes(x = delay_cat,
y = ct_blood,
fill = delay_cat))+
labs(x = "Delay onset to admission (days)",
title = "Ordered by original alpha-numeric levels")+
theme_classic()+
theme(legend.position = "none")
# boxplots ordered by median CT value
ggplot(data = linelist)+
geom_boxplot(
aes(x = fct_reorder(delay_cat, ct_blood, "median"),
y = ct_blood,
fill = delay_cat))+
labs(x = "Delay onset to admission (days)",
title = "Ordered by median CT value in group")+
theme_classic()+
theme(legend.position = "none")
Using lubridate::floor_date
x <- ymd_hms("2023-04-18 12:01:59.23")
lubridate::floor_date(x, "day")
## [1] "2023-04-18 UTC"
lubridate::floor_date(x, "week")
## [1] "2023-04-16 UTC"
lubridate::floor_date(x, "month")
## [1] "2023-04-01 UTC"
Using fct_reorder2()
epidemic_data <- linelist %>%
filter(date_onset < as.Date("2014-09-21")) %>% # cut-off date, for visual clarity
count(epiweek = lubridate::floor_date(date_onset, "week"), hospital) # get case counts per week and by hospital
view(epidemic_data)
plotweek <- ggplot(epidemic_data) +
geom_line(
aes(
x = epiweek,
y = n,
color = fct_reorder2(hospital, epiweek, n))) +
labs(title = "Factor levels (and legend display) by line, height at the end of plot",
color = "Hospital")
plotweek
Using fct_explicit_na()
linelist %>%
mutate(delay_cat = fct_na_value_to_level(delay_cat)) %>%
tabyl(delay_cat)
## delay_cat n percent
## <2 days 3182 0.5404212
## 2-5 days 2103 0.3571671
## >5 days 603 0.1024117
## Not admitted to hospital 0 0.0000000
## Transfer to other jurisdiction 0 0.0000000
## <NA> 0 0.0000000
Using fct_recode() for the creation of new factor levels. fct_level use NEW = OLD
levels(linelist$delay_cat)
## [1] "<2 days" "2-5 days"
## [3] ">5 days" "Not admitted to hospital"
## [5] "Transfer to other jurisdiction"
Syntax
fct_recode(column, "new" = "old", "new" = "old", "new" = "old")
linelist %>%
mutate(delay_cat = fct_recode(
delay_cat,
"Less than 2 days" = "<2 days",
"2 to 5 days" = "2-5 days",
"More than 5 days" = ">5 days"
)) %>%
tabyl(delay_cat)
## delay_cat n percent
## Less than 2 days 3182 0.5404212
## 2 to 5 days 2103 0.3571671
## More than 5 days 603 0.1024117
## Not admitted to hospital 0 0.0000000
## Transfer to other jurisdiction 0 0.0000000
linelist %>%
mutate(hospital = fct_other(
hospital, # column, variable
keep = c("Port Hospital", "Central Hospital"),
other_level = "Other Hospital"
)) %>%
tabyl(hospital)
## hospital n percent
## Central Hospital 454 0.07710598
## Port Hospital 1762 0.29925272
## Other Hospital 3672 0.62364130
Combine the least-frequent factor levels
automatically using fct_lump(). To “lump” together many
low-frequency levels into an “Other” group.
There are 2 ways: - Set n = as the number of groups you
want to keep. The n most-frequent levels will be kept, and all others
will combine into “Other”. - Set prop = as the threshold
frequency proportion for levels above which you want to keep. All other
values will combine into “Other”.
linelist %>%
mutate(hospital = fct_lump(
hospital,
n = 2, # keep top 2 levels
other_level = "Other Hospital"
)) %>%
tabyl(hospital)
## hospital n percent
## Missing 1469 0.2494905
## Port Hospital 1762 0.2992527
## Other Hospital 2657 0.4512568
linelist %>%
mutate(hospital = fct_lump(
hospital,
prop = 0.29,
other_level = "Other Hospital"
)) %>%
tabyl(hospital)
## hospital n percent
## Port Hospital 1762 0.2992527
## Other Hospital 4126 0.7007473
epiweek_date <- linelist %>%
mutate(epiweek_date = floor_date(date_onset, 'week')) %>%
ggplot() +
geom_histogram(mapping = aes(x = epiweek_date)) +
scale_x_date(date_labels = '%Y-W%W')
epiweek_date
# arrange plot in 1 same page - horizontal
gridExtra::grid.arrange(epiweek_date, plotweek, ncol = 2, nrow = 2)
# arrange plot in 1 same page - vertical
if (!require("pacman")) install.packages("pacman")
pacman::p_load(ggplot2, cowplot)
plot_grid(
epiweek_date,
plotweek,
align = "hv",
ncol = 1
)
# Option 1
linelist <- linelist %>%
mutate(epiweek_date = floor_date(date_onset, 'week'),
epiweek_formatted = format(epiweek_date, '%Y-W%W'),
epiweek_formatted = factor(epiweek_formatted))
levels(linelist$epiweek_formatted)
## [1] "2014-W13" "2014-W14" "2014-W15" "2014-W16" "2014-W17" "2014-W18"
## [7] "2014-W19" "2014-W20" "2014-W21" "2014-W22" "2014-W23" "2014-W24"
## [13] "2014-W25" "2014-W26" "2014-W27" "2014-W28" "2014-W29" "2014-W30"
## [19] "2014-W31" "2014-W32" "2014-W33" "2014-W34" "2014-W35" "2014-W36"
## [25] "2014-W37" "2014-W38" "2014-W39" "2014-W40" "2014-W41" "2014-W42"
## [31] "2014-W43" "2014-W44" "2014-W45" "2014-W46" "2014-W47" "2014-W48"
## [37] "2014-W49" "2014-W50" "2014-W51" "2015-W00" "2015-W01" "2015-W02"
## [43] "2015-W03" "2015-W04" "2015-W05" "2015-W06" "2015-W07" "2015-W08"
## [49] "2015-W09" "2015-W10" "2015-W11" "2015-W12" "2015-W13" "2015-W14"
## [55] "2015-W15" "2015-W16"
# Option 2
df <- linelist %>%
mutate(epiweek = date2week(date_onset, week_start = 'Monday', factor = TRUE))
levels(df$epiweek)
## [1] "2014-W15" "2014-W16" "2014-W17" "2014-W18" "2014-W19" "2014-W20"
## [7] "2014-W21" "2014-W22" "2014-W23" "2014-W24" "2014-W25" "2014-W26"
## [13] "2014-W27" "2014-W28" "2014-W29" "2014-W30" "2014-W31" "2014-W32"
## [19] "2014-W33" "2014-W34" "2014-W35" "2014-W36" "2014-W37" "2014-W38"
## [25] "2014-W39" "2014-W40" "2014-W41" "2014-W42" "2014-W43" "2014-W44"
## [31] "2014-W45" "2014-W46" "2014-W47" "2014-W48" "2014-W49" "2014-W50"
## [37] "2014-W51" "2014-W52" "2015-W01" "2015-W02" "2015-W03" "2015-W04"
## [43] "2015-W05" "2015-W06" "2015-W07" "2015-W08" "2015-W09" "2015-W10"
## [49] "2015-W11" "2015-W12" "2015-W13" "2015-W14" "2015-W15" "2015-W16"
## [55] "2015-W17" "2015-W18"