# forcats training
# The purpose of this notebook is to illustrate how the forcats package can be used to work with factors (categorical variables).
library(forcats)
library(dplyr)
library(ggplot2)
# Print the gss_cat tibble used throughout this notebook
# (21,483 rows x 9 columns, most of them factors, per the output below).
gss_cat
## # A tibble: 21,483 x 9
## year marital age race rincome partyid relig denom tvhours
## <int> <fct> <int> <fct> <fct> <fct> <fct> <fct> <int>
## 1 2000 Never married 26 White $8000 t… Ind,nea… Prote… South… 12
## 2 2000 Divorced 48 White $8000 t… Not str… Prote… Bapti… NA
## 3 2000 Widowed 67 White Not app… Indepen… Prote… No de… 2
## 4 2000 Never married 39 White Not app… Ind,nea… Ortho… Not a… 4
## 5 2000 Divorced 25 White Not app… Not str… None Not a… 1
## 6 2000 Married 25 White $20000 … Strong … Prote… South… NA
## 7 2000 Never married 36 White $25000 … Not str… Chris… Not a… 3
## 8 2000 Divorced 44 White $7000 t… Ind,nea… Prote… Luthe… NA
## 9 2000 Married 44 White $25000 … Not str… Prote… Other 0
## 10 2000 Married 47 White $25000 … Strong … Prote… South… 3
## # ... with 21,473 more rows
# Month abbreviations stored as plain character data.
x1 <- c("Dec", "Apr", "Jan", "Mar")
x1
## [1] "Dec" "Apr" "Jan" "Mar"
class(x1)
## [1] "character"
# Character vectors sort alphabetically, which is not calendar order.
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
# The valid levels, in calendar order. Base R's `month.abb` constant holds
# exactly the twelve English abbreviations "Jan" ... "Dec".
month_levels <- month.abb
# Converting to a factor with explicit levels fixes the ordering.
x1 <- factor(x1, levels = month_levels)
x1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
class(x1)
## [1] "factor"
# Sorting a factor follows the level order, not the alphabet.
sort(x1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# A value that is not among the levels (the typo "Jam") silently becomes NA.
x2 <- c("Dec", "Apr", "Jam", "Mar")
factor(x2, levels = month_levels)
## [1] Dec Apr <NA> Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# Compact column-wise overview of gss_cat: one line per variable with its
# type and first few values.
dplyr::glimpse(gss_cat)
## Observations: 21,483
## Variables: 9
## $ year <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, ...
## $ marital <fct> Never married, Divorced, Widowed, Never married, Divor...
## $ age <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51...
## $ race <fct> White, White, White, White, White, White, White, White...
## $ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not appl...
## $ partyid <fct> Ind,near rep, Not str republican, Independent, Ind,nea...
## $ relig <fct> Protestant, Protestant, Protestant, Orthodox-christian...
## $ denom <fct> Southern baptist, Baptist-dk which, No denomination, N...
## $ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, ...
# Reorder the levels of `race` by listing the complete order explicitly.
gss_cat$race %>%
  fct_relevel(c("White", "Black", "Other", "Not applicable")) %>%
  head(50)
## [1] White White White White White White White White White White White
## [12] White White White White Black White White White Black White White
## [23] White White White White White White White White White White Black
## [34] White White Other White White White White White White Black White
## [45] White White White White White White
## Levels: White Black Other Not applicable
# Naming a single level moves it to the front; the rest keep their order.
gss_cat$race %>%
  fct_relevel("Black") %>%
  head(50)
## [1] White White White White White White White White White White White
## [12] White White White White Black White White White Black White White
## [23] White White White White White White White White White White Black
## [34] White White Other White White White White White White Black White
## [45] White White White White White White
## Levels: Black Other White Not applicable
gss_cat$race %>%
  fct_relevel("Not applicable") %>%
  head(50)
## [1] White White White White White White White White White White White
## [12] White White White White Black White White White Black White White
## [23] White White White White White White White White White White Black
## [34] White White Other White White White White White White Black White
## [45] White White White White White White
## Levels: Not applicable Other Black White
# Rename levels (new = "old"); levels not mentioned are left untouched and
# the level order is unchanged.
gss_cat$race %>%
  fct_recode(W = "White", B = "Black", O = "Other") %>%
  head(50)
## [1] W W W W W W W W W W W W W W W B W W W B W W W W W W W W W W W W B W W
## [36] O W W W W W W B W W W W W W W
## Levels: O B W Not applicable
# Other category.
# Rank the religions by frequency and add each one's share of respondents
# plus the running (cumulative) share.
gss_cat %>%
  # count(sort = TRUE) groups, tallies, sorts by descending n and returns an
  # ungrouped tibble in one step.
  count(relig, sort = TRUE) %>%
  mutate(prop = n / sum(n)) %>%
  # Left unnamed on purpose so the column keeps its `cumsum(prop)` header.
  mutate(cumsum(prop))
## # A tibble: 15 x 4
## relig n prop `cumsum(prop)`
## <fct> <int> <dbl> <dbl>
## 1 Protestant 10846 0.505 0.505
## 2 Catholic 5124 0.239 0.743
## 3 None 3523 0.164 0.907
## 4 Christian 689 0.0321 0.939
## 5 Jewish 388 0.0181 0.958
## 6 Other 224 0.0104 0.968
## 7 Buddhism 147 0.00684 0.975
## 8 Inter-nondenominational 109 0.00507 0.980
## 9 Moslem/islam 104 0.00484 0.985
## 10 Orthodox-christian 95 0.00442 0.989
## 11 No answer 93 0.00433 0.993
## 12 Hinduism 71 0.00330 0.997
## 13 Other eastern 32 0.00149 0.998
## 14 Native american 23 0.00107 0.999
## 15 Don't know 15 0.000698 1
# Keep the 3 most frequent religions and lump every other level into "Other",
# then count the lumped levels, largest first.
gss_cat %>%
  mutate(relig_lump = fct_lump(relig, n = 3)) %>%
  group_by(relig_lump) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 4 x 2
## # Groups: relig_lump [4]
## relig_lump n
## <fct> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Other 1990
# Lump by share instead of rank: levels below the 5% threshold are collapsed
# into "Other", then the lumped levels are counted, largest first.
gss_cat %>%
  mutate(relig_lump = fct_lump(relig, prop = 0.05)) %>%
  group_by(relig_lump) %>%
  count() %>%
  arrange(desc(n))
## # A tibble: 4 x 2
## # Groups: relig_lump [4]
## relig_lump n
## <fct> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Other 1990
# Per-religion summary: mean age, mean daily TV hours and respondent count.
# Missing values are dropped from both means.
relig_summary <- gss_cat %>%
  group_by(relig) %>%
  summarise(
    # Spell out TRUE: `T` is an ordinary variable that can be reassigned.
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )
relig_summary
## # A tibble: 15 x 4
## relig age tvhours n
## <fct> <dbl> <dbl> <int>
## 1 No answer 49.5 2.72 93
## 2 Don't know 35.9 4.62 15
## 3 Inter-nondenominational 40.0 2.87 109
## 4 Native american 38.9 3.46 23
## 5 Christian 40.1 2.79 689
## 6 Orthodox-christian 50.4 2.42 95
## 7 Moslem/islam 37.6 2.44 104
## 8 Other eastern 45.9 1.67 32
## 9 Hinduism 37.7 1.89 71
## 10 Buddhism 44.7 2.38 147
## 11 Other 41.0 2.73 224
## 12 None 41.2 2.71 3523
## 13 Jewish 52.4 2.52 388
## 14 Catholic 46.9 2.96 5124
## 15 Protestant 49.9 3.15 10846
# Mean TV hours per religion with the y axis in the factor's default level
# order.
relig_summary %>%
  ggplot(aes(tvhours, relig)) +
  geom_point()
# Same plot with the religions reordered by their mean TV hours.
relig_summary %>%
  ggplot(aes(tvhours, fct_reorder(relig, tvhours))) +
  geom_point()