The {forcats} R package

Processing Data with R

DragonflyStats.github.io

library(forcats)

library(magrittr)

library(dplyr)
## Warning: The package `vctrs` (>= 0.3.8) is required as of rlang 1.0.0.
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Functions and data in {forcats}

  • as_factor
  • fct_anon
  • fct_c
  • fct_collapse
  • fct_count
  • fct_drop
  • fct_expand
  • fct_explicit_na
  • fct_inorder
  • fct_lump
  • fct_other
  • fct_recode
  • fct_relabel
  • fct_relevel
  • fct_reorder
  • fct_rev
  • fct_shift
  • fct_shuffle
  • fct_unify
  • fct_unique
  • gss_cat
  • lvls
  • lvls_union
# Example factor with four distinct values (A, B, C, D); C and D occur once each.
x <- factor(c( "A", "D", "B", "A", "B", "A", "A", "B", "B", "A", "C"))

# Recoding both "C" and "D" to the same new name merges them into one level "O".
fct_recode(x, O = "C", O = "D") # O for other
##  [1] A O B A B A A B B A O
## Levels: A B O
# Dropping Unused Levels

# "c" is declared as a level but never occurs in the data, so it is unused.
f <- factor(c("a", "b"), levels = c("a", "b", "c"))

f
## [1] a b
## Levels: a b c
# With no further arguments, fct_drop() removes every unused level ("c" here).
fct_drop(f)
## [1] a b
## Levels: a b
# Use the `only` argument to restrict which levels may be dropped.

# "a" is in use, so nothing is dropped; "c" stays because it is not listed in `only`.
fct_drop(f, only = "a")
## [1] a b
## Levels: a b c
# "c" is unused and listed in `only`, so it is dropped.
fct_drop(f, only = "c")
## [1] a b
## Levels: a b
# gss_cat: example dataset (a tibble) that ships with forcats; most of its
# columns are factors, which makes it convenient for the examples below.
gss_cat
## # A tibble: 21,483 x 9
##     year marital     age race  rincome    partyid     relig     denom    tvhours
##    <int> <fct>     <int> <fct> <fct>      <fct>       <fct>     <fct>      <int>
##  1  2000 Never ma~    26 White $8000 to ~ Ind,near r~ Protesta~ Souther~      12
##  2  2000 Divorced     48 White $8000 to ~ Not str re~ Protesta~ Baptist~      NA
##  3  2000 Widowed      67 White Not appli~ Independent Protesta~ No deno~       2
##  4  2000 Never ma~    39 White Not appli~ Ind,near r~ Orthodox~ Not app~       4
##  5  2000 Divorced     25 White Not appli~ Not str de~ None      Not app~       1
##  6  2000 Married      25 White $20000 - ~ Strong dem~ Protesta~ Souther~      NA
##  7  2000 Never ma~    36 White $25000 or~ Not str re~ Christian Not app~       3
##  8  2000 Divorced     44 White $7000 to ~ Ind,near d~ Protesta~ Luthera~      NA
##  9  2000 Married      44 White $25000 or~ Not str de~ Protesta~ Other          0
## 10  2000 Married      47 White $25000 or~ Strong rep~ Protesta~ Souther~       3
## # ... with 21,473 more rows
# Base R frequency table of the partyid factor: one count per level, in level order.
table(gss_cat$partyid)
## 
##          No answer         Don't know        Other party  Strong republican 
##                154                  1                393               2314 
## Not str republican       Ind,near rep        Independent       Ind,near dem 
##               3032               1791               4119               2499 
##   Not str democrat    Strong democrat 
##               3690               3490
# The forcats equivalent: fct_count() returns a tibble with one row per level
# (column f holds the level, column n its count).
fct_count(gss_cat$partyid)
## # A tibble: 10 x 2
##    f                      n
##    <fct>              <int>
##  1 No answer            154
##  2 Don't know             1
##  3 Other party          393
##  4 Strong republican   2314
##  5 Not str republican  3032
##  6 Ind,near rep        1791
##  7 Independent         4119
##  8 Ind,near dem        2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490
# Collapse the ten partyid levels into five broader groups.
# Each argument has the form new_level = c(existing levels to merge).
partyid2 <- fct_collapse(gss_cat$partyid,
   missing = c("No answer", "Don't know"),
   other = "Other party",
   rep = c("Strong republican", "Not str republican"),
   ind = c("Ind,near rep", "Independent", "Ind,near dem"),
   dem = c("Not str democrat", "Strong democrat")
)

# Count the observations in each collapsed group.
fct_count(partyid2)
## # A tibble: 5 x 2
##   f           n
##   <fct>   <int>
## 1 missing   155
## 2 other     393
## 3 rep      5346
## 4 ind      8409
## 5 dem      7180
# Nine levels A-I with very unequal frequencies (40, 30, 7, 5, 2, 1, 1, 1, 1).
x <- factor(rep(LETTERS[1:9], times = c(40, 30, 7, 5, 2, 1, 1, 1, 1)))

# keep: preserve the listed levels and replace every other level with "Other".
fct_other(x, keep = c("A", "B"))
##  [1] A     A     A     A     A     A     A     A     A     A     A     A    
## [13] A     A     A     A     A     A     A     A     A     A     A     A    
## [25] A     A     A     A     A     A     A     A     A     A     A     A    
## [37] A     A     A     A     B     B     B     B     B     B     B     B    
## [49] B     B     B     B     B     B     B     B     B     B     B     B    
## [61] B     B     B     B     B     B     B     B     B     B     Other Other
## [73] Other Other Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other Other
## Levels: A B Other
# drop: the reverse of keep - replace the listed levels with "Other" and
# leave all remaining levels untouched.
fct_other(x, drop = c("A", "B"))
##  [1] Other Other Other Other Other Other Other Other Other Other Other Other
## [13] Other Other Other Other Other Other Other Other Other Other Other Other
## [25] Other Other Other Other Other Other Other Other Other Other Other Other
## [37] Other Other Other Other Other Other Other Other Other Other Other Other
## [49] Other Other Other Other Other Other Other Other Other Other Other Other
## [61] Other Other Other Other Other Other Other Other Other Other C     C    
## [73] C     C     C     C     C     D     D     D     D     D     E     E    
## [85] F     G     H     I    
## Levels: C D E F G H I Other
f <- factor(c("a", "b", "c"))

# fct_shuffle() randomly permutes the order of the levels; the values stay the same.
# No seed is set before these calls, so the level order differs between calls
# (compare the two outputs) and will differ again when the code is re-run.
fct_shuffle(f)
## [1] a b c
## Levels: b c a
fct_shuffle(f)
## [1] a b c
## Levels: b a c
# Simulate a 1,000-row example data frame; fixing the seed makes the draws
# reproducible.
set.seed(1337)

# Fact: Binomial(size = 6, prob = 0.25) draws (0-6) shifted to 1-7 and mapped
# to the letters A-G. All seven letters are declared as levels, so letters
# that are never drawn still appear as (unused) levels.
Fact <- factor(
  LETTERS[rbinom(n = 1000, size = 6, prob = 0.25) + 1],
  levels = LETTERS[1:7]
)

# Var1: integers sampled with replacement from 100 to 200.
Var1 <- sample(100:200, size = 1000, replace = TRUE)

# Var2: exponential draws with rate 1.25, rounded to two decimal places.
Var2 <- round(rexp(n = 1000, rate = 1.25), digits = 2)

myDF <- data.frame(Fact = Fact, Var1 = Var1, Var2 = Var2)
f <- factor(c("b", "b", "a", "c", "c", "c"))
# By default factor() sorts the levels: a b c.
f
## [1] b b a c c c
## Levels: a b c
# fct_inorder(): order the levels by first appearance in the data (b, a, c).
fct_inorder(f)
## [1] b b a c c c
## Levels: b a c
# fct_infreq(): order the levels by frequency, most frequent first
# (c occurs 3 times, b twice, a once).
fct_infreq(f)
## [1] b b a c c c
## Levels: c b a
# ordered = TRUE additionally makes the result an ordered factor (b < a < c).
fct_inorder(f, ordered = TRUE)
## [1] b b a c c c
## Levels: b < a < c

fct_count : Count entries in a factor

fct_drop : Drop unused levels

fct_anon : Anonymise factor levels

fct_collapse : Collapse factor levels into manually defined groups

fct_c : Concatenate factors, combining levels

fct_other : Replace levels with "other"

fct_inorder

Reorder factor levels by first appearance or frequency