library(forcats)

library(magrittr)

library(dplyr)

## Warning: The package `vctrs` (>= 0.3.8) is required as of rlang 1.0.0.

## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'tibble'

## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

R Markdown

as_factor
fct_anon
fct_c
fct_collapse
fct_count
fct_drop
fct_expand
fct_explicit_na
fct_inorder
fct_lump
fct_other
fct_recode
fct_relabel
fct_relevel
fct_reorder
fct_rev
fct_shift
fct_shuffle
fct_unify
fct_unique
gss_cat
lvls
lvls_union

# Eleven observations drawn from four distinct values: A, B, C and D.
x <- factor(c( "A", "D", "B", "A", "B", "A", "A", "B", "B", "A", "C"))

# Recode both "C" and "D" to the single new level "O"; "A" and "B" are left
# untouched, so the result has the three levels A, B and O.
fct_recode(x, O = "C", O = "D") # O for other

##  [1] A O B A B A A B B A O
## Levels: A B O

# Dropping Unused Levels

# "c" is declared as a level but never occurs in the data, so it is unused.
f <- factor(c("a", "b"), levels = c("a", "b", "c"))

f

## [1] a b
## Levels: a b c

# With no further arguments, every unused level (here "c") is removed.
fct_drop(f)

## [1] a b
## Levels: a b

# Use the `only` argument to restrict which levels may be dropped: a level is
# removed only if it is both unused and named in `only`.

# "a" is in use, so nothing is dropped and the unused "c" is kept.
fct_drop(f, only = "a")

## [1] a b
## Levels: a b c

# "c" is unused and named in `only`, so it is dropped.
fct_drop(f, only = "c")

## [1] a b
## Levels: a b

# Print the gss_cat tibble; its factor columns are used in the examples below.
gss_cat

## # A tibble: 21,483 x 9
##     year marital     age race  rincome    partyid     relig     denom    tvhours
##    <int> <fct>     <int> <fct> <fct>      <fct>       <fct>     <fct>      <int>
##  1  2000 Never ma~    26 White $8000 to ~ Ind,near r~ Protesta~ Souther~      12
##  2  2000 Divorced     48 White $8000 to ~ Not str re~ Protesta~ Baptist~      NA
##  3  2000 Widowed      67 White Not appli~ Independent Protesta~ No deno~       2
##  4  2000 Never ma~    39 White Not appli~ Ind,near r~ Orthodox~ Not app~       4
##  5  2000 Divorced     25 White Not appli~ Not str de~ None      Not app~       1
##  6  2000 Married      25 White $20000 - ~ Strong dem~ Protesta~ Souther~      NA
##  7  2000 Never ma~    36 White $25000 or~ Not str re~ Christian Not app~       3
##  8  2000 Divorced     44 White $7000 to ~ Ind,near d~ Protesta~ Luthera~      NA
##  9  2000 Married      44 White $25000 or~ Not str de~ Protesta~ Other          0
## 10  2000 Married      47 White $25000 or~ Strong rep~ Protesta~ Souther~       3
## # ... with 21,473 more rows

# Base R tabulation of partyid: counts are printed side by side under each
# level name.
table(gss_cat$partyid)

## 
##          No answer         Don't know        Other party  Strong republican 
##                154                  1                393               2314 
## Not str republican       Ind,near rep        Independent       Ind,near dem 
##               3032               1791               4119               2499 
##   Not str democrat    Strong democrat 
##               3690               3490

# The same counts via forcats, returned as a tibble with one row per level
# (column f holds the level, column n the count).
fct_count(gss_cat$partyid)

## # A tibble: 10 x 2
##    f                      n
##    <fct>              <int>
##  1 No answer            154
##  2 Don't know             1
##  3 Other party          393
##  4 Strong republican   2314
##  5 Not str republican  3032
##  6 Ind,near rep        1791
##  7 Independent         4119
##  8 Ind,near dem        2499
##  9 Not str democrat    3690
## 10 Strong democrat     3490

# Merge the ten partyid levels into five broader groups. Each argument has the
# form new_level = <old level(s)>; the collapsed factor is stored in partyid2.
partyid2 <- fct_collapse(gss_cat$partyid,
   missing = c("No answer", "Don't know"),
   other = "Other party",
   rep = c("Strong republican", "Not str republican"),
   ind = c("Ind,near rep", "Independent", "Ind,near dem"),
   dem = c("Not str democrat", "Strong democrat")
)

# Count the collapsed factor: each group's n is the sum of the counts of the
# original levels merged into it (e.g. missing = 154 + 1).
fct_count(partyid2)

## # A tibble: 5 x 2
##   f           n
##   <fct>   <int>
## 1 missing   155
## 2 other     393
## 3 rep      5346
## 4 ind      8409
## 5 dem      7180

# Rebind x to 88 values with a skewed distribution over the levels A..I:
# A x 40, B x 30, C x 7, D x 5, E x 2, and F, G, H, I once each.
x <- factor(rep(LETTERS[1:9], times = c(40, 30, 7, 5, 2, 1, 1, 1, 1)))

# keep: every level NOT listed here is replaced by the single level "Other".
fct_other(x, keep = c("A", "B"))

##  [1] A     A     A     A     A     A     A     A     A     A     A     A    
## [13] A     A     A     A     A     A     A     A     A     A     A     A    
## [25] A     A     A     A     A     A     A     A     A     A     A     A    
## [37] A     A     A     A     B     B     B     B     B     B     B     B    
## [49] B     B     B     B     B     B     B     B     B     B     B     B    
## [61] B     B     B     B     B     B     B     B     B     B     Other Other
## [73] Other Other Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other Other
## Levels: A B Other

# drop: the listed levels are replaced by "Other"; all remaining levels are
# kept as they are, with "Other" placed last among the levels.
fct_other(x, drop = c("A", "B"))

##  [1] Other Other Other Other Other Other Other Other Other Other Other Other
## [13] Other Other Other Other Other Other Other Other Other Other Other Other
## [25] Other Other Other Other Other Other Other Other Other Other Other Other
## [37] Other Other Other Other Other Other Other Other Other Other Other Other
## [49] Other Other Other Other Other Other Other Other Other Other Other Other
## [61] Other Other Other Other Other Other Other Other Other Other C     C    
## [73] C     C     C     C     C     D     D     D     D     D     E     E    
## [85] F     G     H     I    
## Levels: C D E F G H I Other

# Rebind f to a three-level factor whose default level order is a, b, c.
f <- factor(c("a", "b", "c"))

# fct_shuffle() puts the levels in a random order; the values themselves are
# unchanged (still a b c), only the "Levels:" line differs.
fct_shuffle(f)

## [1] a b c
## Levels: b c a

# No seed is set before these calls, so a repeated call can give a different
# level order, as the two recorded outputs show.
fct_shuffle(f)

## [1] a b c
## Levels: b a c

# Simulated example data: one factor column (Fact) and two numeric columns
# (Var1, Var2), collected in the data frame myDF.
#
# The seed makes the random draws repeatable. The draws must stay in this
# order (rbinom, then sample, then rexp) to reproduce the same data.
set.seed(1337)

# Number of simulated rows; shared by all three columns so their lengths
# always agree when they are combined with data.frame().
n_obs <- 1000

# rbinom(n, size = 6, ...) yields integers 0..6; adding 1 turns them into the
# indices 1..7, i.e. the letters "A".."G".
Fact <- LETTERS[rbinom(n_obs, size = 6, prob = 0.25) + 1]

# Declare all seven possible letters as levels, so a letter that happens never
# to be drawn is still a (then unused) level of the factor.
Fact <- factor(Fact, levels = LETTERS[1:7])

# Integers between 100 and 200, drawn with replacement.
Var1 <- sample(100:200, size = n_obs, replace = TRUE)

# Exponential draws with rate 1.25, rounded to two decimal places.
Var2 <- round(rexp(n_obs, rate = 1.25), 2)

myDF <- data.frame(Fact, Var1, Var2)

# Rebind f: "b" appears first in the data and "c" is the most frequent value.
# By default factor() sorts the levels alphabetically (a, b, c).
f <- factor(c("b", "b", "a", "c", "c", "c"))
f

## [1] b b a c c c
## Levels: a b c

# Order the levels by first appearance in the data: b, then a, then c.
fct_inorder(f)

## [1] b b a c c c
## Levels: b a c

# Order the levels by frequency, most common first: c (3), b (2), a (1).
fct_infreq(f)

## [1] b b a c c c
## Levels: c b a

# ordered = TRUE additionally makes the result an ordered factor, shown by the
# "<" between the levels in the output.
fct_inorder(f, ordered = TRUE)

## [1] b b a c c c
## Levels: b < a < c

The {forcats} R package

Processing Data with R

DragonflyStats.github.io

R Markdown

fct_count : Count entries in a factor

fct_drop : Drop unused levels

fct_anon : Anonymise factor levels

fct_collapse : Collapse factor levels into manually defined groups

fct_c : Concatenate factors, combining levels

fct_other : Replace levels with "other"

fct_inorder

Reorder factor levels by first appearance (fct_inorder) or by frequency (fct_infreq)