Creating Factors


It is often helpful to convert characters into factors when working with categorical variables. With factors you can set levels, which allow you to control how they are displayed.

> library(tidyverse)
> library(forcats)
> 
> #Using a string to record a variable
> x1 <- c("Dec", "Apr", "Jan", "Mar")

The two problems with using a string:

> # 1. You could spell something wrong
> x2 <- c("Dec", "Apr", "Jam", "Mar")
> # 2. It sorts by alphabetical order, not chronologically
> sort(x1)
[1] "Apr" "Dec" "Jan" "Mar"

These problems can be fixed by creating levels, which can be used when converting to a factor.

> month_levels <- c(
+   "Jan", "Feb", "Mar", "Apr", "May", "Jun", 
+   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
+ )
> #Convert to a factor
> (y1 <- factor(x1, levels = month_levels))
[1] Dec Apr Jan Mar
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
> #They can now be sorted chronologically
> sort(y1)
[1] Jan Mar Apr Dec
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

Values that are not in the set of levels will be converted to NA.

> # The misspelling Jam is not in the set
> (y2 <- factor(x2, levels = month_levels))
[1] Dec  Apr  <NA> Mar 
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
> # To display an error you can use parse_factor
> (y2 <- parse_factor(x2, levels = month_levels))
Warning: 1 parsing failure.
row col           expected actual
  3  -- value in level set    Jam
[1] Dec  Apr  <NA> Mar 
attr(,"problems")
# A tibble: 1 x 4
    row   col expected           actual
  <int> <int> <chr>              <chr> 
1     3    NA value in level set Jam   
Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

If you do not specify the levels it will create them alphabetically.

> factor(x1) 
[1] Dec Apr Jan Mar
Levels: Apr Dec Jan Mar

To set the order of the levels according to the order of the data you can use unique(), or if the levels have already been set you can use fct_inorder().

> (f1 <- factor(x1, levels = unique(x1)))
[1] Dec Apr Jan Mar
Levels: Dec Apr Jan Mar
> (f2 <- x1 %>% factor() %>% fct_inorder())
[1] Dec Apr Jan Mar
Levels: Dec Apr Jan Mar

To access the set of levels you can use levels()

> levels(f2)
[1] "Dec" "Apr" "Jan" "Mar"

General Social Survey


forecats::gss_cat is sample data from the General Social Survey.

> gss_cat %>% glimpse
Rows: 21,483
Columns: 9
$ year    <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,...
$ marital <fct> Never married, Divorced, Widowed, Never married, Divorced, ...
$ age     <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52,...
$ race    <fct> White, White, White, White, White, White, White, White, Whi...
$ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicabl...
$ partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,n...
$ relig   <fct> Protestant, Protestant, Protestant, Orthodox-christian, Non...
$ denom   <fct> Southern baptist, Baptist-dk which, No denomination, Not ap...
$ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3...

One way to see a factor’s levels is with count().

> #race levels
> levels(gss_cat$race)
[1] "Other"          "Black"          "White"          "Not applicable"
> gss_cat %>% count(race)
# A tibble: 3 x 2
  race      n
  <fct> <int>
1 Other  1959
2 Black  3129
3 White 16395

Or with a bar chart.

> ggplot(gss_cat, aes(race)) +
+   geom_bar(fill="firebrick")

ggplot() will drop levels that don’t have values, but you can force them to be displayed.

> ggplot(gss_cat, aes(race)) +
+   geom_bar(fill="firebrick") +
+   scale_x_discrete(drop = FALSE)

General Social Survey - Exercises

  1. Explore the distribution of rincome (reported income). What makes the default bar chart hard to understand? How could you improve the plot?
  • It is difficult to read the labels
> library(tidyverse)
> ggplot(gss_cat)+geom_bar(aes(rincome),fill="firebrick")

  • We can switch the axis, which helps, but we still have “Not applicable” and several categories without values.
> ggplot(gss_cat)+geom_bar(aes(rincome),fill="firebrick")+
+   coord_flip()

  • We can get rid of “Not applicable” and combine the other categories.
> gss_cat %>%
+   filter(rincome!="Not applicable") %>% 
+   # Collapse into Other
+   mutate(rincome=fct_collapse(rincome,
+     Other=c("Refused",
+             "Don't know","No answer"))) %>% 
+   # Change formatting
+   mutate(rincome = fct_recode(rincome,
+           "Less than $1000" = "Lt $1000")) %>% 
+   # Add column to fill by
+   mutate(rincome_na = rincome =="Other") %>%
+   ggplot() +
+   geom_bar(aes(rincome, fill=rincome_na))+
+   coord_flip()+
+   scale_y_continuous("Survey Participants", 
+                      labels = scales::comma) +
+   scale_x_discrete("Income Levels") +
+   scale_fill_manual(values = c("FALSE" = "firebrick", 
+                                "TRUE" = "gray"))+
+   theme(legend.position = "None")

  1. What is the most common relig in this survey? What’s the most common partyid?
> # Top 5 relig
> gss_cat %>% count(relig, sort=TRUE) %>% 
+   head(5)
# A tibble: 5 x 2
  relig          n
  <fct>      <int>
1 Protestant 10846
2 Catholic    5124
3 None        3523
4 Christian    689
5 Jewish       388
> # Top 5 partyid
> gss_cat %>% count(partyid, sort=TRUE) %>% 
+   head(5)
# A tibble: 5 x 2
  partyid                n
  <fct>              <int>
1 Independent         4119
2 Not str democrat    3690
3 Strong democrat     3490
4 Not str republican  3032
5 Ind,near dem        2499
  1. Which relig does denom (denomination) apply to? How can you find out with a table? How can you find out with a visualization?
  • Answer = Protestant

Table

> gss_cat %>% 
+   select(relig,denom) %>% 
+   unique() %>% 
+   filter(!denom %in% c(
+     "No answer", "Other", "Don't know", 
+     "Not applicable","No denomination")) 
# A tibble: 25 x 2
   relig      denom               
   <fct>      <fct>               
 1 Protestant Southern baptist    
 2 Protestant Baptist-dk which    
 3 Protestant Lutheran-mo synod   
 4 Protestant United methodist    
 5 Protestant Episcopal           
 6 Protestant Other lutheran      
 7 Protestant Afr meth ep zion    
 8 Protestant Am bapt ch in usa   
 9 Protestant Other methodist     
10 Protestant Presbyterian c in us
# ... with 15 more rows

Visualization

> gss_cat %>%
+   count(relig, denom) %>%
+   ggplot(aes(x = relig, y = denom, size = n)) +
+   geom_point(color="firebrick") +
+   theme(axis.text.x = element_text(angle = 90))

Modifying Factor Order


Changing the level order can be very useful, especially in visualizations.

The plot below is difficult to interpret because there is no clear pattern.

> relig_summary <- gss_cat %>%
+   group_by(relig) %>%
+   summarize(
+     age = mean(age, na.rm = TRUE),
+     tvhours = mean(tvhours, na.rm = TRUE),
+     n = n()
+   )
> 
> ggplot(relig_summary, aes(tvhours, relig)) + 
+   geom_point(color="firebrick", size=2)

Instead, we can reorder the levels with fct_reorder(), which takes three arguments:

  1. f - The factor that will be modified
  2. x - A numeric vector that will be used to reorder
  3. An optional function to be used if there are multiple x values for a factor. The default is median.
> ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) +
+   geom_point(color="firebrick", size=2)

It is often easier to reorder before ggplot() by using mutate().

> relig_summary %>%
+   mutate(relig = fct_reorder(relig, tvhours)) %>% 
+   ggplot(aes(tvhours, relig)) +
+   geom_point(color="firebrick", size=2)

However, sometimes reordering the levels isn’t a good idea if there’s already an established order. We can display average age by income, but if we reorder by age it will shuffle the income ranges in an unusual way.

> rincome_summary <- gss_cat %>%
+   group_by(rincome) %>%
+   summarize(
+     age = mean(age, na.rm = TRUE),
+     tvhours = mean(tvhours, na.rm = TRUE),
+     n = n()
+   )
> 
> ggplot(rincome_summary, aes(age, fct_reorder(rincome, age)))+
+   geom_point(color="firebrick", size=2)

Instead, we can leave the income levels and shift “Not applicable” to the front with fct_relevel.

> ggplot(rincome_summary, aes(age, fct_relevel(rincome, 
+                                   "Not applicable"))) +
+   geom_point(color="firebrick", size=2)

When coloring the lines on a plot you can use fct_reorder2(), which will help the lines match up with legend. It reorders the factor by the \(y\) values associated with the largest \(x\) values.

> library(gridExtra)
> 
> by_age <- gss_cat %>%
+   filter(!is.na(age)) %>%
+   count(age, marital) %>%
+   group_by(age) %>%
+   mutate(prop = n / sum(n))
> 
> ggplot(by_age, aes(age, prop, color = marital)) +
+   geom_line(na.rm = TRUE) -> p1
> 
> ggplot(by_age, aes(age, prop, 
+                    color = fct_reorder2(marital, age, prop))) +
+   geom_line() +
+   labs(color = "marital") -> p2
> 
> grid.arrange(p1, p2, ncol = 2)

For bar plots fct_infreg() can be used to order the levels by frequency. It sets the levels in descending order, with the largest first, and so fct_rev() can be used to reverse the order.

> gss_cat %>%
+   mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
+   ggplot(aes(marital)) +
+   geom_bar(fill="firebrick")

Modifying Factor Order - Exercises

  1. There are some suspiciously high numbers in tvhours. Is the mean a good summary?
  • The data is right skewed. The median would be better.
> summary(gss_cat$tvhours)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
  0.000   1.000   2.000   2.981   4.000  24.000   10146 
> gss_cat %>%
+   filter(!is.na(tvhours)) %>%
+   ggplot(aes(x = tvhours)) +
+   geom_histogram( binwidth = 1,
+                  fill="firebrick")+
+   geom_vline(aes(xintercept=mean(tvhours)),
+              color="blue", linetype="dashed", size=1)+
+   geom_vline(aes(xintercept=median(tvhours)),
+              color="green", linetype="dashed", size=1)

  1. For each factor in gss_cat identify whether the order of the levels is arbitrary or principled.
> gss_cat %>% glimpse()
Rows: 21,483
Columns: 9
$ year    <int> 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000, 2000,...
$ marital <fct> Never married, Divorced, Widowed, Never married, Divorced, ...
$ age     <int> 26, 48, 67, 39, 25, 25, 36, 44, 44, 47, 53, 52, 52, 51, 52,...
$ race    <fct> White, White, White, White, White, White, White, White, Whi...
$ rincome <fct> $8000 to 9999, $8000 to 9999, Not applicable, Not applicabl...
$ partyid <fct> "Ind,near rep", "Not str republican", "Independent", "Ind,n...
$ relig   <fct> Protestant, Protestant, Protestant, Orthodox-christian, Non...
$ denom   <fct> Southern baptist, Baptist-dk which, No denomination, Not ap...
$ tvhours <int> 12, NA, 2, 4, 1, NA, 3, NA, 0, 3, 2, NA, 1, NA, 1, 7, NA, 3...

marital -arbitrary

  • Not alphabetical
  • Not by Count
> levels(gss_cat$marital)
[1] "No answer"     "Never married" "Separated"     "Divorced"     
[5] "Widowed"       "Married"      
> gss_cat %>% count(marital)
# A tibble: 6 x 2
  marital           n
  <fct>         <int>
1 No answer        17
2 Never married  5416
3 Separated       743
4 Divorced       3383
5 Widowed        1807
6 Married       10117

race -principled

  • By count
> levels(gss_cat$race)
[1] "Other"          "Black"          "White"          "Not applicable"
> gss_cat %>% count(race)
# A tibble: 3 x 2
  race      n
  <fct> <int>
1 Other  1959
2 Black  3129
3 White 16395

income-principled

  • By income amount
> levels(gss_cat$rincome)
 [1] "No answer"      "Don't know"     "Refused"        "$25000 or more"
 [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" 
 [9] "$7000 to 7999"  "$6000 to 6999"  "$5000 to 5999"  "$4000 to 4999" 
[13] "$3000 to 3999"  "$1000 to 2999"  "Lt $1000"       "Not applicable"
> gss_cat %>% count(rincome)
# A tibble: 16 x 2
   rincome            n
   <fct>          <int>
 1 No answer        183
 2 Don't know       267
 3 Refused          975
 4 $25000 or more  7363
 5 $20000 - 24999  1283
 6 $15000 - 19999  1048
 7 $10000 - 14999  1168
 8 $8000 to 9999    340
 9 $7000 to 7999    188
10 $6000 to 6999    215
11 $5000 to 5999    227
12 $4000 to 4999    226
13 $3000 to 3999    276
14 $1000 to 2999    395
15 Lt $1000         286
16 Not applicable  7043

partyid-arbitrary

  • Not alphabetical
  • Not by count
> levels(gss_cat$partyid)
 [1] "No answer"          "Don't know"         "Other party"       
 [4] "Strong republican"  "Not str republican" "Ind,near rep"      
 [7] "Independent"        "Ind,near dem"       "Not str democrat"  
[10] "Strong democrat"   
> gss_cat %>% count(partyid)
# A tibble: 10 x 2
   partyid                n
   <fct>              <int>
 1 No answer            154
 2 Don't know             1
 3 Other party          393
 4 Strong republican   2314
 5 Not str republican  3032
 6 Ind,near rep        1791
 7 Independent         4119
 8 Ind,near dem        2499
 9 Not str democrat    3690
10 Strong democrat     3490

relig-arbitrary

  • Not alphabetical
  • Not by count
> levels(gss_cat$relig)
 [1] "No answer"               "Don't know"             
 [3] "Inter-nondenominational" "Native american"        
 [5] "Christian"               "Orthodox-christian"     
 [7] "Moslem/islam"            "Other eastern"          
 [9] "Hinduism"                "Buddhism"               
[11] "Other"                   "None"                   
[13] "Jewish"                  "Catholic"               
[15] "Protestant"              "Not applicable"         
> gss_cat %>% count(relig)
# A tibble: 15 x 2
   relig                       n
   <fct>                   <int>
 1 No answer                  93
 2 Don't know                 15
 3 Inter-nondenominational   109
 4 Native american            23
 5 Christian                 689
 6 Orthodox-christian         95
 7 Moslem/islam              104
 8 Other eastern              32
 9 Hinduism                   71
10 Buddhism                  147
11 Other                     224
12 None                     3523
13 Jewish                    388
14 Catholic                 5124
15 Protestant              10846

denom-arbitrary

  • Not alphabetical
  • Not by count
> levels(gss_cat$denom)
 [1] "No answer"            "Don't know"           "No denomination"     
 [4] "Other"                "Episcopal"            "Presbyterian-dk wh"  
 [7] "Presbyterian, merged" "Other presbyterian"   "United pres ch in us"
[10] "Presbyterian c in us" "Lutheran-dk which"    "Evangelical luth"    
[13] "Other lutheran"       "Wi evan luth synod"   "Lutheran-mo synod"   
[16] "Luth ch in america"   "Am lutheran"          "Methodist-dk which"  
[19] "Other methodist"      "United methodist"     "Afr meth ep zion"    
[22] "Afr meth episcopal"   "Baptist-dk which"     "Other baptists"      
[25] "Southern baptist"     "Nat bapt conv usa"    "Nat bapt conv of am" 
[28] "Am bapt ch in usa"    "Am baptist asso"      "Not applicable"      
> gss_cat %>% count(denom)
# A tibble: 30 x 2
   denom                    n
   <fct>                <int>
 1 No answer              117
 2 Don't know              52
 3 No denomination       1683
 4 Other                 2534
 5 Episcopal              397
 6 Presbyterian-dk wh     244
 7 Presbyterian, merged    67
 8 Other presbyterian      47
 9 United pres ch in us   110
10 Presbyterian c in us   104
# ... with 20 more rows
  1. Why did moving “Not applicable” to the front of the levels move it to the bottom of the plot?
  • “Not applicable” became an integer value of 1.

Modifying Factor Levels


Sometimes you’d rather change the level values rather than just their order.

For example, these values could be better:

> gss_cat %>% count(partyid)
# A tibble: 10 x 2
   partyid                n
   <fct>              <int>
 1 No answer            154
 2 Don't know             1
 3 Other party          393
 4 Strong republican   2314
 5 Not str republican  3032
 6 Ind,near rep        1791
 7 Independent         4119
 8 Ind,near dem        2499
 9 Not str democrat    3690
10 Strong democrat     3490

You can use fct_recode() to change them. Levels that aren’t specified will remain the same.

> gss_cat %>%
+   mutate(partyid = fct_recode(partyid,
+     "Republican, strong"    = "Strong republican",
+     "Republican, weak"      = "Not str republican",
+     "Independent, near rep" = "Ind,near rep",
+     "Independent, near dem" = "Ind,near dem",
+     "Democrat, weak"        = "Not str democrat",
+     "Democrat, strong"      = "Strong democrat"
+   )) %>%
+   count(partyid)
# A tibble: 10 x 2
   partyid                   n
   <fct>                 <int>
 1 No answer               154
 2 Don't know                1
 3 Other party             393
 4 Republican, strong     2314
 5 Republican, weak       3032
 6 Independent, near rep  1791
 7 Independent            4119
 8 Independent, near dem  2499
 9 Democrat, weak         3690
10 Democrat, strong       3490

To combine groups just give the same name to multiple levels.

> gss_cat %>%
+   mutate(partyid = fct_recode(partyid,
+     "Republican, strong"    = "Strong republican",
+     "Republican, weak"      = "Not str republican",
+     "Independent, near rep" = "Ind,near rep",
+     "Independent, near dem" = "Ind,near dem",
+     "Democrat, weak"        = "Not str democrat",
+     "Democrat, strong"      = "Strong democrat",
+     "Other"                 = "No answer",
+     "Other"                 = "Don't know",
+     "Other"                 = "Other party"
+   )) %>%
+   count(partyid)
# A tibble: 8 x 2
  partyid                   n
  <fct>                 <int>
1 Other                   548
2 Republican, strong     2314
3 Republican, weak       3032
4 Independent, near rep  1791
5 Independent            4119
6 Independent, near dem  2499
7 Democrat, weak         3690
8 Democrat, strong       3490

fct_collapse() works similarly, but for each value you can provide a vector of old levels.

> gss_cat %>%
+   mutate(partyid = fct_collapse(partyid,
+     other = c("No answer", "Don't know", "Other party"),
+     rep = c("Strong republican", "Not str republican"),
+     ind = c("Ind,near rep", "Independent", "Ind,near dem"),
+     dem = c("Not str democrat", "Strong democrat")
+   )) %>%
+   count(partyid)
# A tibble: 4 x 2
  partyid     n
  <fct>   <int>
1 other     548
2 rep      5346
3 ind      8409
4 dem      7180

Finally, you can lump together many small groups with fct_lump(). It makes sure that the aggregate is still the smallest group, but that doesn’t help much here:

> gss_cat %>%
+   mutate(relig = fct_lump(relig)) %>%
+   count(relig)
# A tibble: 2 x 2
  relig          n
  <fct>      <int>
1 Protestant 10846
2 Other      10637

Instead, we can specify the number of groups.

> gss_cat %>%
+   mutate(relig = fct_lump(relig, n = 10)) %>%
+   count(relig, sort = TRUE) %>%
+   print(n = Inf)
# A tibble: 10 x 2
   relig                       n
   <fct>                   <int>
 1 Protestant              10846
 2 Catholic                 5124
 3 None                     3523
 4 Christian                 689
 5 Other                     458
 6 Jewish                    388
 7 Buddhism                  147
 8 Inter-nondenominational   109
 9 Moslem/islam              104
10 Orthodox-christian         95

Modifying Factor Levels - Exercises

  1. How have the proportions of people identifying as Democrat, Republican, and Independent changed over time?
> gss_cat %>%
+   mutate(partyid =
+     fct_collapse(partyid,
+       other = c("No answer", "Don't know", "Other party"),
+       rep = c("Strong republican", "Not str republican"),
+       ind = c("Ind,near rep", "Independent", "Ind,near dem"),
+       dem = c("Not str democrat", "Strong democrat")
+       )
+   ) %>%
+   count(year, partyid) %>%
+   group_by(year) %>%
+   mutate(prop = n / sum(n)) %>%
+   ggplot(aes(
+     x = year, y = prop,
+     color = fct_reorder2(partyid, year, prop)
+   )) +
+   geom_point() +
+   geom_line() +
+   scale_x_continuous(breaks = 
+             seq(from = 2000, to = 2014, by = 2))+
+   theme(axis.text.x = 
+           element_text(angle = 90, vjust=0.5, hjust=1))+
+   labs(color = "Party", y = "Proportion")

  1. How could you collapse rincome into a small set of categories?
> library("stringr")
> 
> #Combine groups
> no_answer <- c("No answer", "Don't know", 
+   "Refused", "Not applicable")
> 
> ltfivek <- c("Lt $1000",str_c("$", c("1000", "3000", "4000"),
+             " to ", c("2999", "3999", "4999")))
> 
> lttenk <- str_c("$", c("5000", "6000", "7000", "8000"),
+        " to ", c("5999", "6999", "7999", "9999"))
> 
> #Plot combined groups
> gss_cat %>%
+   mutate(rincome = fct_collapse(rincome,
+         `Unknown` = no_answer,
+         `Lt $5000` = ltfivek,
+         `$5000 to 10000` = lttenk)) %>% 
+   ggplot(aes(x = rincome)) +
+   geom_bar(fill="firebrick") +
+   coord_flip()