It is often helpful to convert characters into factors when working with categorical variables. With factors you can set levels, which allow you to control how they are displayed.
> library(tidyverse)
> library(forcats)
>
> #Using a string to record a variable
> x1 <- c("Dec", "Apr", "Jan", "Mar")The two problems with using a string:
[1] "Apr" "Dec" "Jan" "Mar"
These problems can be fixed by creating levels, which can be used when converting to a factor.
> month_levels <- c(
+ "Jan", "Feb", "Mar", "Apr", "May", "Jun",
+ "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+ )[1] Dec Apr Jan Mar
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
[1] Jan Mar Apr Dec
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Values that are not in the set of levels will be converted to NA.
[1] Dec Apr <NA> Mar
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
Warning: 1 parsing failure.
row col expected actual
3 -- value in level set Jam
[1] Dec Apr <NA> Mar
attr(,"problems")
# A tibble: 1 x 4
row col expected actual
<int> <int> <chr> <chr>
1 3 NA value in level set Jam
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
If you do not specify the levels it will create them alphabetically.
[1] Dec Apr Jan Mar
Levels: Apr Dec Jan Mar
To set the order of the levels according to the order of the data you can use unique(), or if the levels have already been set you can use fct_inorder().
[1] Dec Apr Jan Mar
Levels: Dec Apr Jan Mar
[1] Dec Apr Jan Mar
Levels: Dec Apr Jan Mar
To access the set of levels you can use levels()
[1] "Dec" "Apr" "Jan" "Mar"
Changing the level order can be very useful, especially in visualizations.
The plot below is difficult to interpret because there is no clear pattern.
> relig_summary <- gss_cat %>%
+ group_by(relig) %>%
+ summarize(
+ age = mean(age, na.rm = TRUE),
+ tvhours = mean(tvhours, na.rm = TRUE),
+ n = n()
+ )
>
> ggplot(relig_summary, aes(tvhours, relig)) +
+ geom_point(color="firebrick", size=2)Instead, we can reorder the levels with fct_reorder(), which takes three arguments:
f - The factor that will be modifiedx - A numeric vector that will be used to reorderx values for a factor. The default is median.> ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) +
+ geom_point(color="firebrick", size=2)It is often easier to reorder before ggplot() by using mutate().
> relig_summary %>%
+ mutate(relig = fct_reorder(relig, tvhours)) %>%
+ ggplot(aes(tvhours, relig)) +
+ geom_point(color="firebrick", size=2)However, sometimes reordering the levels isn’t a good idea if there’s already an established order. We can display average age by income, but if we reorder by age it will shuffle the income ranges in an unusual way.
> rincome_summary <- gss_cat %>%
+ group_by(rincome) %>%
+ summarize(
+ age = mean(age, na.rm = TRUE),
+ tvhours = mean(tvhours, na.rm = TRUE),
+ n = n()
+ )
>
> ggplot(rincome_summary, aes(age, fct_reorder(rincome, age)))+
+ geom_point(color="firebrick", size=2)Instead, we can leave the income levels and shift “Not applicable” to the front with fct_relevel.
> ggplot(rincome_summary, aes(age, fct_relevel(rincome,
+ "Not applicable"))) +
+ geom_point(color="firebrick", size=2)When coloring the lines on a plot you can use fct_reorder2(), which will help the lines match up with legend. It reorders the factor by the \(y\) values associated with the largest \(x\) values.
> library(gridExtra)
>
> by_age <- gss_cat %>%
+ filter(!is.na(age)) %>%
+ count(age, marital) %>%
+ group_by(age) %>%
+ mutate(prop = n / sum(n))
>
> ggplot(by_age, aes(age, prop, color = marital)) +
+ geom_line(na.rm = TRUE) -> p1
>
> ggplot(by_age, aes(age, prop,
+ color = fct_reorder2(marital, age, prop))) +
+ geom_line() +
+ labs(color = "marital") -> p2
>
> grid.arrange(p1, p2, ncol = 2)For bar plots fct_infreg() can be used to order the levels by frequency. It sets the levels in descending order, with the largest first, and so fct_rev() can be used to reverse the order.
> gss_cat %>%
+ mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
+ ggplot(aes(marital)) +
+ geom_bar(fill="firebrick")tvhours. Is the mean a good summary? Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.000 1.000 2.000 2.981 4.000 24.000 10146
> gss_cat %>%
+ filter(!is.na(tvhours)) %>%
+ ggplot(aes(x = tvhours)) +
+ geom_histogram( binwidth = 1,
+ fill="firebrick")+
+ geom_vline(aes(xintercept=mean(tvhours)),
+ color="blue", linetype="dashed", size=1)+
+ geom_vline(aes(xintercept=median(tvhours)),
+ color="green", linetype="dashed", size=1)gss_cat identify whether the order of the levels is arbitrary or principled.Rows: 21,483
Columns: 9
$ year <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,...
$ marital <fct> Never married, Divorced, Widowed, Never married, Divorced, ...
$ age <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52,...
$ race <fct> White, White, White, White, White, White, White, White, Whi...
$ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicabl...
$ partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,n...
$ relig <fct> Protestant, Protestant, Protestant, Orthodox-christian, Non...
$ denom <fct> Southern baptist, Baptist-dk which, No denomination, Not ap...
$ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3...
marital -arbitrary
[1] "No answer" "Never married" "Separated" "Divorced"
[5] "Widowed" "Married"
# A tibble: 6 x 2
marital n
<fct> <int>
1 No answer 17
2 Never married 5416
3 Separated 743
4 Divorced 3383
5 Widowed 1807
6 Married 10117
race -principled
[1] "Other" "Black" "White" "Not applicable"
# A tibble: 3 x 2
race n
<fct> <int>
1 Other 1959
2 Black 3129
3 White 16395
income-principled
[1] "No answer" "Don't know" "Refused" "$25000 or more"
[5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
[9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
[13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
# A tibble: 16 x 2
rincome n
<fct> <int>
1 No answer 183
2 Don't know 267
3 Refused 975
4 $25000 or more 7363
5 $20000 - 24999 1283
6 $15000 - 19999 1048
7 $10000 - 14999 1168
8 $8000 to 9999 340
9 $7000 to 7999 188
10 $6000 to 6999 215
11 $5000 to 5999 227
12 $4000 to 4999 226
13 $3000 to 3999 276
14 $1000 to 2999 395
15 Lt $1000 286
16 Not applicable 7043
partyid-arbitrary
[1] "No answer" "Don't know" "Other party"
[4] "Strong republican" "Not str republican" "Ind,near rep"
[7] "Independent" "Ind,near dem" "Not str democrat"
[10] "Strong democrat"
# A tibble: 10 x 2
partyid n
<fct> <int>
1 No answer 154
2 Don't know 1
3 Other party 393
4 Strong republican 2314
5 Not str republican 3032
6 Ind,near rep 1791
7 Independent 4119
8 Ind,near dem 2499
9 Not str democrat 3690
10 Strong democrat 3490
relig-arbitrary
[1] "No answer" "Don't know"
[3] "Inter-nondenominational" "Native american"
[5] "Christian" "Orthodox-christian"
[7] "Moslem/islam" "Other eastern"
[9] "Hinduism" "Buddhism"
[11] "Other" "None"
[13] "Jewish" "Catholic"
[15] "Protestant" "Not applicable"
# A tibble: 15 x 2
relig n
<fct> <int>
1 No answer 93
2 Don't know 15
3 Inter-nondenominational 109
4 Native american 23
5 Christian 689
6 Orthodox-christian 95
7 Moslem/islam 104
8 Other eastern 32
9 Hinduism 71
10 Buddhism 147
11 Other 224
12 None 3523
13 Jewish 388
14 Catholic 5124
15 Protestant 10846
denom-arbitrary
[1] "No answer" "Don't know" "No denomination"
[4] "Other" "Episcopal" "Presbyterian-dk wh"
[7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
[10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
[13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
[16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
[19] "Other methodist" "United methodist" "Afr meth ep zion"
[22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
[25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
[28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
# A tibble: 30 x 2
denom n
<fct> <int>
1 No answer 117
2 Don't know 52
3 No denomination 1683
4 Other 2534
5 Episcopal 397
6 Presbyterian-dk wh 244
7 Presbyterian, merged 67
8 Other presbyterian 47
9 United pres ch in us 110
10 Presbyterian c in us 104
# ... with 20 more rows
Sometimes you’d rather change the level values rather than just their order.
For example, these values could be better:
# A tibble: 10 x 2
partyid n
<fct> <int>
1 No answer 154
2 Don't know 1
3 Other party 393
4 Strong republican 2314
5 Not str republican 3032
6 Ind,near rep 1791
7 Independent 4119
8 Ind,near dem 2499
9 Not str democrat 3690
10 Strong democrat 3490
You can use fct_recode() to change them. Levels that aren’t specified will remain the same.
> gss_cat %>%
+ mutate(partyid = fct_recode(partyid,
+ "Republican, strong" = "Strong republican",
+ "Republican, weak" = "Not str republican",
+ "Independent, near rep" = "Ind,near rep",
+ "Independent, near dem" = "Ind,near dem",
+ "Democrat, weak" = "Not str democrat",
+ "Democrat, strong" = "Strong democrat"
+ )) %>%
+ count(partyid)# A tibble: 10 x 2
partyid n
<fct> <int>
1 No answer 154
2 Don't know 1
3 Other party 393
4 Republican, strong 2314
5 Republican, weak 3032
6 Independent, near rep 1791
7 Independent 4119
8 Independent, near dem 2499
9 Democrat, weak 3690
10 Democrat, strong 3490
To combine groups just give the same name to multiple levels.
> gss_cat %>%
+ mutate(partyid = fct_recode(partyid,
+ "Republican, strong" = "Strong republican",
+ "Republican, weak" = "Not str republican",
+ "Independent, near rep" = "Ind,near rep",
+ "Independent, near dem" = "Ind,near dem",
+ "Democrat, weak" = "Not str democrat",
+ "Democrat, strong" = "Strong democrat",
+ "Other" = "No answer",
+ "Other" = "Don't know",
+ "Other" = "Other party"
+ )) %>%
+ count(partyid)# A tibble: 8 x 2
partyid n
<fct> <int>
1 Other 548
2 Republican, strong 2314
3 Republican, weak 3032
4 Independent, near rep 1791
5 Independent 4119
6 Independent, near dem 2499
7 Democrat, weak 3690
8 Democrat, strong 3490
fct_collapse() works similarly, but for each value you can provide a vector of old levels.
> gss_cat %>%
+ mutate(partyid = fct_collapse(partyid,
+ other = c("No answer", "Don't know", "Other party"),
+ rep = c("Strong republican", "Not str republican"),
+ ind = c("Ind,near rep", "Independent", "Ind,near dem"),
+ dem = c("Not str democrat", "Strong democrat")
+ )) %>%
+ count(partyid)# A tibble: 4 x 2
partyid n
<fct> <int>
1 other 548
2 rep 5346
3 ind 8409
4 dem 7180
Finally, you can lump together many small groups with fct_lump(). It makes sure that the aggregate is still the smallest group, but that doesn’t help much here:
# A tibble: 2 x 2
relig n
<fct> <int>
1 Protestant 10846
2 Other 10637
Instead, we can specify the number of groups.
> gss_cat %>%
+ mutate(relig = fct_lump(relig, n = 10)) %>%
+ count(relig, sort = TRUE) %>%
+ print(n = Inf)# A tibble: 10 x 2
relig n
<fct> <int>
1 Protestant 10846
2 Catholic 5124
3 None 3523
4 Christian 689
5 Other 458
6 Jewish 388
7 Buddhism 147
8 Inter-nondenominational 109
9 Moslem/islam 104
10 Orthodox-christian 95
> gss_cat %>%
+ mutate(partyid =
+ fct_collapse(partyid,
+ other = c("No answer", "Don't know", "Other party"),
+ rep = c("Strong republican", "Not str republican"),
+ ind = c("Ind,near rep", "Independent", "Ind,near dem"),
+ dem = c("Not str democrat", "Strong democrat")
+ )
+ ) %>%
+ count(year, partyid) %>%
+ group_by(year) %>%
+ mutate(prop = n / sum(n)) %>%
+ ggplot(aes(
+ x = year, y = prop,
+ color = fct_reorder2(partyid, year, prop)
+ )) +
+ geom_point() +
+ geom_line() +
+ scale_x_continuous(breaks =
+ seq(from = 2000, to = 2014, by = 2))+
+ theme(axis.text.x =
+ element_text(angle = 90, vjust=0.5, hjust=1))+
+ labs(color = "Party", y = "Proportion")rincome into a small set of categories?> library("stringr")
>
> #Combine groups
> no_answer <- c("No answer", "Don't know",
+ "Refused", "Not applicable")
>
> ltfivek <- c("Lt $1000",str_c("$", c("1000", "3000", "4000"),
+ " to ", c("2999", "3999", "4999")))
>
> lttenk <- str_c("$", c("5000", "6000", "7000", "8000"),
+ " to ", c("5999", "6999", "7999", "9999"))
>
> #Plot combined groups
> gss_cat %>%
+ mutate(rincome = fct_collapse(rincome,
+ `Unknown` = no_answer,
+ `Lt $5000` = ltfivek,
+ `$5000 to 10000` = lttenk)) %>%
+ ggplot(aes(x = rincome)) +
+ geom_bar(fill="firebrick") +
+ coord_flip()