1. Purpose.

The purpose of this notebook is to illustrate how the forcats package can be used to work with factors (categorical variables).

2. Load libraries and view practice dataset.

# forcats supplies the fct_* factor helpers and the gss_cat practice dataset.
library(forcats)
# dplyr supplies the data verbs (mutate, group_by, count, ...) and the %>% pipe.
library(dplyr)
# ggplot2 is used for the plots in the final section.
library(ggplot2)
# Print the practice dataset.
gss_cat
## # A tibble: 21,483 x 9
##     year marital         age race  rincome  partyid  relig  denom  tvhours
##    <int> <fct>         <int> <fct> <fct>    <fct>    <fct>  <fct>    <int>
##  1  2000 Never married    26 White $8000 t… Ind,nea… Prote… South…      12
##  2  2000 Divorced         48 White $8000 t… Not str… Prote… Bapti…      NA
##  3  2000 Widowed          67 White Not app… Indepen… Prote… No de…       2
##  4  2000 Never married    39 White Not app… Ind,nea… Ortho… Not a…       4
##  5  2000 Divorced         25 White Not app… Not str… None   Not a…       1
##  6  2000 Married          25 White $20000 … Strong … Prote… South…      NA
##  7  2000 Never married    36 White $25000 … Not str… Chris… Not a…       3
##  8  2000 Divorced         44 White $7000 t… Ind,nea… Prote… Luthe…      NA
##  9  2000 Married          44 White $25000 … Not str… Prote… Other        0
## 10  2000 Married          47 White $25000 … Strong … Prote… South…       3
## # ... with 21,473 more rows

3. Use factors where there is a limited set of possible values to prevent typos and support more useful sorting.

# Months stored as plain strings: nothing guards against typos, and sorting
# is alphabetical rather than chronological.
x1 <- c("Dec", "Apr", "Jan", "Mar")
x1
## [1] "Dec" "Apr" "Jan" "Mar"
class(x1)
## [1] "character"
# Character vectors sort alphabetically, so "Apr" comes before "Dec" and "Jan".
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
# Valid levels in calendar order. month.abb is base R's built-in vector of
# the twelve English three-letter month abbreviations, "Jan" through "Dec".
month_levels <- month.abb

# Convert to a factor whose levels are the months in calendar order.
x1 <- factor(x1, levels = month_levels)
x1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
class(x1)
## [1] "factor"
# Sorting a factor follows the level order, so the result is chronological.
sort(x1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# "Jam" is a typo that is not among month_levels, so factor() turns it into NA.
x2 <- c("Dec", "Apr", "Jam", "Mar")
factor(x2, levels = month_levels)
## [1] Dec  Apr  <NA> Mar 
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

4. Change the order of levels in a factor.

# Column-wise overview of gss_cat: each variable's type and first few values.
glimpse(gss_cat)
## Observations: 21,483
## Variables: 9
## $ year    <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, ...
## $ marital <fct> Never married, Divorced, Widowed, Never married, Divor...
## $ age     <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51...
## $ race    <fct> White, White, White, White, White, White, White, White...
## $ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not appl...
## $ partyid <fct> Ind,near rep, Not str republican, Independent, Ind,nea...
## $ relig   <fct> Protestant, Protestant, Protestant, Orthodox-christian...
## $ denom   <fct> Southern baptist, Baptist-dk which, No denomination, N...
## $ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, ...
# Give every race level an explicit position, then show the first 50 values.
gss_cat$race %>%
  fct_relevel("White", "Black", "Other", "Not applicable") %>%
  head(50)
##  [1] White White White White White White White White White White White
## [12] White White White White Black White White White Black White White
## [23] White White White White White White White White White White Black
## [34] White White Other White White White White White White Black White
## [45] White White White White White White
## Levels: White Black Other Not applicable
# Naming a single level moves it to the front; the rest keep their order.
gss_cat$race %>%
  fct_relevel("Black") %>%
  head(50)
##  [1] White White White White White White White White White White White
## [12] White White White White Black White White White Black White White
## [23] White White White White White White White White White White Black
## [34] White White Other White White White White White White Black White
## [45] White White White White White White
## Levels: Black Other White Not applicable
# Move "Not applicable" to the front; the other levels keep their order.
gss_cat$race %>%
  fct_relevel("Not applicable") %>%
  head(50)
##  [1] White White White White White White White White White White White
## [12] White White White White Black White White White Black White White
## [23] White White White White White White White White White White Black
## [34] White White Other White White White White White White Black White
## [45] White White White White White White
## Levels: Not applicable Other Black White

5. Change the values (level labels) in a factor.

# Rename levels with new = "old" pairs; levels not mentioned are left as is.
gss_cat$race %>%
  fct_recode(W = "White", B = "Black", O = "Other") %>%
  head(50)
##  [1] W W W W W W W W W W W W W W W B W W W B W W W W W W W W W W W W B W W
## [36] O W W W W W W B W W W W W W W
## Levels: O B W Not applicable

6. Lump minority values into an Other category.

# Respondents per religion, largest group first, with each group's share of
# the total and the running (cumulative) share.
gss_cat %>%
  count(relig, sort = TRUE) %>%
  mutate(prop = n / sum(n), cumsum(prop))
## # A tibble: 15 x 4
##    relig                       n     prop `cumsum(prop)`
##    <fct>                   <int>    <dbl>          <dbl>
##  1 Protestant              10846 0.505             0.505
##  2 Catholic                 5124 0.239             0.743
##  3 None                     3523 0.164             0.907
##  4 Christian                 689 0.0321            0.939
##  5 Jewish                    388 0.0181            0.958
##  6 Other                     224 0.0104            0.968
##  7 Buddhism                  147 0.00684           0.975
##  8 Inter-nondenominational   109 0.00507           0.980
##  9 Moslem/islam              104 0.00484           0.985
## 10 Orthodox-christian         95 0.00442           0.989
## 11 No answer                  93 0.00433           0.993
## 12 Hinduism                   71 0.00330           0.997
## 13 Other eastern              32 0.00149           0.998
## 14 Native american            23 0.00107           0.999
## 15 Don't know                 15 0.000698          1
# Keep the 3 most common religions and pool all remaining levels into
# "Other", then count respondents per lumped level, largest first.
gss_cat %>%
  group_by(relig_lump = fct_lump(relig, n = 3)) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 4 x 2
## # Groups:   relig_lump [4]
##   relig_lump     n
##   <fct>      <int>
## 1 Protestant 10846
## 2 Catholic    5124
## 3 None        3523
## 4 Other       1990
# Pool the religions whose share of respondents falls below the 5% threshold
# into "Other", then count respondents per lumped level, largest first.
gss_cat %>%
  group_by(relig_lump = fct_lump(relig, prop = 0.05)) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 4 x 2
## # Groups:   relig_lump [4]
##   relig_lump     n
##   <fct>      <int>
## 1 Protestant 10846
## 2 Catholic    5124
## 3 None        3523
## 4 Other       1990

7. Change the order of factor levels in a visualisation.

# Per-religion summary: mean of age, mean of tvhours, and the number of
# respondents. Missing values are dropped before averaging.
relig_summary <- gss_cat %>%
  group_by(relig) %>%
  summarise(
    # Spell out TRUE: T is an ordinary variable that can be reassigned.
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )
relig_summary
## # A tibble: 15 x 4
##    relig                     age tvhours     n
##    <fct>                   <dbl>   <dbl> <int>
##  1 No answer                49.5    2.72    93
##  2 Don't know               35.9    4.62    15
##  3 Inter-nondenominational  40.0    2.87   109
##  4 Native american          38.9    3.46    23
##  5 Christian                40.1    2.79   689
##  6 Orthodox-christian       50.4    2.42    95
##  7 Moslem/islam             37.6    2.44   104
##  8 Other eastern            45.9    1.67    32
##  9 Hinduism                 37.7    1.89    71
## 10 Buddhism                 44.7    2.38   147
## 11 Other                    41.0    2.73   224
## 12 None                     41.2    2.71  3523
## 13 Jewish                   52.4    2.52   388
## 14 Catholic                 46.9    2.96  5124
## 15 Protestant               49.9    3.15 10846
# Scatter of tvhours against religion, with the religions in their default
# level order on the y axis.
relig_summary %>%
  ggplot(aes(x = tvhours, y = relig)) +
  geom_point()

# Same scatter, but with the religions ordered by their tvhours value so the
# points form a readable trend.
relig_summary %>%
  ggplot(aes(x = tvhours, y = fct_reorder(relig, tvhours))) +
  geom_point()