使用R包forcats操作因子变量-几个有用函数

as_factor:转换因子

library(tidyverse)

## -- Attaching packages -----------------

## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0

## -- Conflicts --------------------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

x <- c("A","z","g",NA)
x %>% as.factor()  # 默认按照字母排序

## [1] A    z    g    <NA>
## Levels: A g z

x %>% as_factor()  # 默认按照出现顺序排列

## [1] A    z    g    <NA>
## Levels: A z g

y <- c("1.1", "11", "2.2", "22")
y

## [1] "1.1" "11"  "2.2" "22"

y %>% as.factor()

## [1] 1.1 11  2.2 22 
## Levels: 1.1 11 2.2 22

y %>% as_factor()

## [1] 1.1 11  2.2 22 
## Levels: 1.1 11 2.2 22

fct_anon:用任意数字标识符替换因子级别

gss_cat$relig %>% fct_count() %>% arrange(-n)

## # A tibble: 16 x 2
##    f                           n
##    <fct>                   <int>
##  1 Protestant              10846
##  2 Catholic                 5124
##  3 None                     3523
##  4 Christian                 689
##  5 Jewish                    388
##  6 Other                     224
##  7 Buddhism                  147
##  8 Inter-nondenominational   109
##  9 Moslem/islam              104
## 10 Orthodox-christian         95
## 11 No answer                  93
## 12 Hinduism                   71
## 13 Other eastern              32
## 14 Native american            23
## 15 Don't know                 15
## 16 Not applicable              0

gss_cat$relig %>% fct_anon() %>% fct_count() %>% arrange(-n)

## # A tibble: 16 x 2
##    f         n
##    <fct> <int>
##  1 02    10846
##  2 14     5124
##  3 06     3523
##  4 16      689
##  5 04      388
##  6 15      224
##  7 07      147
##  8 12      109
##  9 10      104
## 10 13       95
## 11 11       93
## 12 05       71
## 13 01       32
## 14 09       23
## 15 08       15
## 16 03        0

gss_cat$relig %>% fct_anon("X") %>% fct_count()

## # A tibble: 16 x 2
##    f         n
##    <fct> <int>
##  1 X01     689
##  2 X02     104
##  3 X03     147
##  4 X04    5124
##  5 X05      71
##  6 X06      23
##  7 X07     388
##  8 X08    3523
##  9 X09       0
## 10 X10      32
## 11 X11      95
## 12 X12      93
## 13 X13     109
## 14 X14      15
## 15 X15     224
## 16 X16   10846

fct_c:合并级别，连接因子

fa <- factor("a")
fb <- factor("b")
fab <- factor(c("a", "b"))
# Base c() on factors, shown for contrast with fct_c() below.
# NOTE(review): the integer-code output pasted below is what the R version used
# for this write-up printed; R >= 4.1.0 reportedly makes c() combine factors
# into a factor instead — confirm on the R version you run.
c(fa, fb, fab)

## [1] 1 1 1 2

fct_c(fa, fb, fab) # 连接因子很有意思

## [1] a b a b
## Levels: a b

# You can also pass a list of factors with !!!
fs <- list(fa, fb, fab)
fct_c(!!!fs)

## [1] a b a b
## Levels: a b

fct_collapse：因子转换，将多个因子合并

一个有用的函数，可以将多个因子水平合并成较少的几个水平。

fct_count(gss_cat$partyid) %>% arrange(n)

## # A tibble: 10 x 2
##    f                      n
##    <fct>              <int>
##  1 Don't know             1
##  2 No answer            154
##  3 Other party          393
##  4 Ind,near rep        1791
##  5 Strong republican   2314
##  6 Ind,near dem        2499
##  7 Not str republican  3032
##  8 Strong democrat     3490
##  9 Not str democrat    3690
## 10 Independent         4119

partyid2 <- fct_collapse(gss_cat$partyid,
                          missing = c("No answer", "Don't know"),
                          other = "Other party",
                          rep = c("Strong republican", "Not str republican"),
                          ind = c("Ind,near rep", "Independent", "Ind,near dem"),
                          dem = c("Not str democrat", "Strong democrat")
)
fct_count(partyid2)  # 五个因子水平了

## # A tibble: 5 x 2
##   f           n
##   <fct>   <int>
## 1 missing   155
## 2 other     393
## 3 rep      5346
## 4 ind      8409
## 5 dem      7180

fct_count:统计每个因子水平出现的次数

sample(letters)[rpois(1000, 10)] %>% class()

## [1] "character"

f <- factor(sample(letters)[rpois(1000, 10)])
f %>% fct_count() %>% arrange(-n)

## # A tibble: 23 x 2
##    f         n
##    <fct> <int>
##  1 h       144
##  2 t       130
##  3 e       129
##  4 m       103
##  5 j        94
##  6 c        79
##  7 v        63
##  8 g        62
##  9 x        51
## 10 d        32
## # ... with 13 more rows

fct_count(f, sort = TRUE)

## # A tibble: 23 x 2
##    f         n
##    <fct> <int>
##  1 h       144
##  2 t       130
##  3 e       129
##  4 m       103
##  5 j        94
##  6 c        79
##  7 v        63
##  8 g        62
##  9 x        51
## 10 d        32
## # ... with 13 more rows

fct_count(f, sort = TRUE, prop = TRUE)

## # A tibble: 23 x 3
##    f         n     p
##    <fct> <int> <dbl>
##  1 h       144 0.144
##  2 t       130 0.13 
##  3 e       129 0.129
##  4 m       103 0.103
##  5 j        94 0.094
##  6 c        79 0.079
##  7 v        63 0.063
##  8 g        62 0.062
##  9 x        51 0.051
## 10 d        32 0.032
## # ... with 13 more rows

fct_cross:交互处理多个因子数据

fruit <- factor(c("apple", "kiwi", "apple", "apple"))
colour <- factor(c("red","green", "green",  "green"))
eaten <- c("yes", "no", "yes", "no")
fruit

## [1] apple kiwi  apple apple
## Levels: apple kiwi

colour

## [1] red   green green green
## Levels: green red

eaten

## [1] "yes" "no"  "yes" "no"

fct_cross(fruit, colour)

## [1] apple:red   kiwi:green  apple:green apple:green
## Levels: apple:green apple:red kiwi:green

fct_cross(fruit, colour, eaten)

## [1] apple:red:yes   kiwi:green:no   apple:green:yes apple:green:no 
## Levels: apple:green:no apple:green:yes apple:red:yes kiwi:green:no

fct_cross(fruit, colour, keep_empty = TRUE) # If TRUE, keep combinations with no observations as levels

## [1] apple:red   kiwi:green  apple:green apple:green
## Levels: apple:green kiwi:green apple:red kiwi:red

fct_drop:删除未使用的因子水平

f <- factor(c("a", "b"), levels = c("a", "b", "c"))
f

## [1] a b
## Levels: a b c

fct_drop(f)

## [1] a b
## Levels: a b

# Set only to restrict which levels to drop
fct_drop(f, only = "a")

## [1] a b
## Levels: a b c

fct_drop(f, only = "c")

## [1] a b
## Levels: a b

fct_expand:扩展因子

个人感觉用处不大

f <- factor(sample(letters[1:3], 20, replace = TRUE))
f

##  [1] b a a c c b a c a b c c a c a c c b b b
## Levels: a b c

fct_expand(f, "d", "e", "f")

##  [1] b a a c c b a c a b c c a c a c c b b b
## Levels: a b c d e f

fct_expand(f, letters[1:6])

##  [1] b a a c c b a c a b c c a c a c c b b b
## Levels: a b c d e f

fct_explicit_na:直接显示缺失值

f1 <- factor(c("a", "a", NA, NA, "a", "b", NA, "c", "a", "c", "b"))
table(f1,useNA = "ifany")

## f1
##    a    b    c <NA> 
##    4    2    2    3

# Turn the NA entries of f1 into an explicit level (printed as "(Missing)" in
# the table output below) so that table() counts them without useNA.
# NOTE(review): fct_explicit_na() appears to be deprecated since forcats 1.0.0
# in favour of fct_na_value_to_level(); this write-up ran on forcats 0.4.0 —
# confirm before running on a newer forcats.
f2 <- fct_explicit_na(f1)
table(f2)

## f2
##         a         b         c (Missing) 
##         4         2         2         3

fct_inorder:按照因子出现顺序排列因子

f <- factor(c("b", "b", "a", "c", "c", "c"))
f                   # Levels: a b c

## [1] b b a c c c
## Levels: a b c

fct_inorder(f)      # Levels: b a c

## [1] b b a c c c
## Levels: b a c

fct_infreq(f)       # Levels: c b a (levels ordered by how often each occurs)

## [1] b b a c c c
## Levels: c b a

fct_inorder(f, ordered = TRUE)

## [1] b b a c c c
## Levels: b < a < c

f <- factor(sample(1:10))
fct_inseq(f)

##  [1] 8  4  6  9  1  3  5  7  2  10
## Levels: 1 2 3 4 5 6 7 8 9 10

fct_lump:合并出现少的因子

其实可以用fct_collapse操作

x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x %>% table()

## .
##  A  B  C  D  E  F  G  H  I 
## 40 10  5 27  1  1  1  1  1

x %>% fct_lump() %>% table()

## .
##     A     D Other 
##    40    27    20

x %>% fct_lump() %>% fct_inorder() %>% table()

## .
##     A Other     D 
##    40    20    27

x <- factor(letters[rpois(100, 5)])
x

##   [1] e d o d e d b f d j d b c b e d d e c j b e d c a e e d b d i g g d d g e
##  [38] f e f h g f g a e c f d d e e f c g g e d c c d d d e g f b e a a b e c f
##  [75] d l b e g c d a h e c h c b d g h d b h d j d g k i
## Levels: a b c d e f g h i j k l o

table(x)

## x
##  a  b  c  d  e  f  g  h  i  j  k  l  o 
##  5 10 11 24 18  8 11  5  2  3  1  1  1

table(fct_lump(x))

## 
##  a  b  c  d  e  f  g  h  i  j  k  l  o 
##  5 10 11 24 18  8 11  5  2  3  1  1  1

# Use positive values to collapse the rarest
fct_lump(x, n = 3) %>% table()

## .
##     c     d     e     g Other 
##    11    24    18    11    36

fct_lump(x, prop = 0.1)

##   [1] e     d     Other d     e     d     Other Other d     Other d     Other
##  [13] c     Other e     d     d     e     c     Other Other e     d     c    
##  [25] Other e     e     d     Other d     Other g     g     d     d     g    
##  [37] e     Other e     Other Other g     Other g     Other e     c     Other
##  [49] d     d     e     e     Other c     g     g     e     d     c     c    
##  [61] d     d     d     e     g     Other Other e     Other Other Other e    
##  [73] c     Other d     Other Other e     g     c     d     Other Other e    
##  [85] c     Other c     Other d     g     Other d     Other Other d     Other
##  [97] d     g     Other Other
## Levels: c d e g Other

# Use negative values to collapse the most common
fct_lump(x, n = -3) %>% table()

## .
##     k     l     o Other 
##     1     1     1    97

fct_lump(x, prop = -0.1)

##   [1] Other Other o     Other Other Other b     f     Other j     Other b    
##  [13] Other b     Other Other Other Other Other j     b     Other Other Other
##  [25] a     Other Other Other b     Other i     Other Other Other Other Other
##  [37] Other f     Other f     h     Other f     Other a     Other Other f    
##  [49] Other Other Other Other f     Other Other Other Other Other Other Other
##  [61] Other Other Other Other Other f     b     Other a     a     b     Other
##  [73] Other f     Other l     b     Other Other Other Other a     h     Other
##  [85] Other h     Other b     Other Other h     Other b     h     Other j    
##  [97] Other Other k     i    
## Levels: a b f h i j k l o Other

# Use weighted frequencies
w <- c(rep(2, 50), rep(1, 50))
fct_lump(x, n = 5, w = w)

##   [1] e     d     Other d     e     d     b     Other d     Other d     b    
##  [13] c     b     e     d     d     e     c     Other b     e     d     c    
##  [25] Other e     e     d     b     d     Other g     g     d     d     g    
##  [37] e     Other e     Other Other g     Other g     Other e     c     Other
##  [49] d     d     e     e     Other c     g     g     e     d     c     c    
##  [61] d     d     d     e     g     Other b     e     Other Other b     e    
##  [73] c     Other d     Other b     e     g     c     d     Other Other e    
##  [85] c     Other c     b     d     g     Other d     b     Other d     Other
##  [97] d     g     Other Other
## Levels: b c d e g Other

# Use ties.method to control how tied factors are collapsed
fct_lump(x, n = 6)

##   [1] e     d     Other d     e     d     b     f     d     Other d     b    
##  [13] c     b     e     d     d     e     c     Other b     e     d     c    
##  [25] Other e     e     d     b     d     Other g     g     d     d     g    
##  [37] e     f     e     f     Other g     f     g     Other e     c     f    
##  [49] d     d     e     e     f     c     g     g     e     d     c     c    
##  [61] d     d     d     e     g     f     b     e     Other Other b     e    
##  [73] c     f     d     Other b     e     g     c     d     Other Other e    
##  [85] c     Other c     b     d     g     Other d     b     Other d     Other
##  [97] d     g     Other Other
## Levels: b c d e f g Other

fct_lump(x, n = 6, ties.method = "max")

##   [1] e     d     Other d     e     d     b     f     d     Other d     b    
##  [13] c     b     e     d     d     e     c     Other b     e     d     c    
##  [25] Other e     e     d     b     d     Other g     g     d     d     g    
##  [37] e     f     e     f     Other g     f     g     Other e     c     f    
##  [49] d     d     e     e     f     c     g     g     e     d     c     c    
##  [61] d     d     d     e     g     f     b     e     Other Other b     e    
##  [73] c     f     d     Other b     e     g     c     d     Other Other e    
##  [85] c     Other c     b     d     g     Other d     b     Other d     Other
##  [97] d     g     Other Other
## Levels: b c d e f g Other

x <- factor(letters[rpois(100, 5)])
fct_lump_min(x, min = 10)

##  [1] Other c     c     e     Other d     g     f     e     c     Other f    
## [13] f     Other c     g     f     e     c     g     e     c     g     g    
## [25] f     e     e     f     f     d     e     d     Other c     e     Other
## [37] Other Other c     c     f     d     d     g     Other f     g     e    
## [49] d     Other d     Other Other c     Other g     Other f     e     Other
## [61] Other Other Other c     g     e     Other e     c     e     e     d    
## [73] Other d     e     Other d     f     Other Other d     d     g     g    
## [85] c     e     g     d     f     d     Other d     e     e     d     d    
## [97] Other Other c    
## Levels: c d e f g Other

fct_match:因子匹配

gss_cat$marital %>% table

## .
##     No answer Never married     Separated      Divorced       Widowed 
##            17          5416           743          3383          1807 
##       Married 
##         10117

(fct_match(gss_cat$marital, c("Married", "Divorced"))) %>% table()

## .
## FALSE  TRUE 
##  7983 13500

fct_other:留下需要的因子，其它other

x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, 1, 1, 1, 1, 1)))
x

##  [1] A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A
## [39] A A B B B B B B B B B B C C C C C D D D D D D D D D D D D D D D D D D D D D
## [77] D D D D D D E F G H I
## Levels: A B C D E F G H I

fct_other(x, keep = c("A", "B"))

##  [1] A     A     A     A     A     A     A     A     A     A     A     A    
## [13] A     A     A     A     A     A     A     A     A     A     A     A    
## [25] A     A     A     A     A     A     A     A     A     A     A     A    
## [37] A     A     A     A     B     B     B     B     B     B     B     B    
## [49] B     B     Other Other Other Other Other Other Other Other Other Other
## [61] Other Other Other Other Other Other Other Other Other Other Other Other
## [73] Other Other Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other
## Levels: A B Other

fct_other(x, drop = c("A", "B"))

##  [1] Other Other Other Other Other Other Other Other Other Other Other Other
## [13] Other Other Other Other Other Other Other Other Other Other Other Other
## [25] Other Other Other Other Other Other Other Other Other Other Other Other
## [37] Other Other Other Other Other Other Other Other Other Other Other Other
## [49] Other Other C     C     C     C     C     D     D     D     D     D    
## [61] D     D     D     D     D     D     D     D     D     D     D     D    
## [73] D     D     D     D     D     D     D     D     D     D     E     F    
## [85] G     H     I    
## Levels: C D E F G H I Other

fct_recode:重新编码

x <- factor(c("apple", "bear", "banana", "dear"))
fct_recode(x, fruit = "apple", fruit = "banana")

## [1] fruit bear  fruit dear 
## Levels: fruit bear dear

# If you make a mistake you'll get a warning
# "bananana" is deliberately misspelled: it matches no level of x, so
# fct_recode() warns and the real "banana" level is left unchanged.
fct_recode(x, fruit = "apple", fruit = "bananana")

## Warning: Unknown levels in `f`: bananana
## [1] fruit  bear   banana dear  
## Levels: fruit banana bear dear

# If you name the level NULL it will be removed
fct_recode(x, NULL = "apple", fruit = "banana")

## [1] <NA>  bear  fruit dear 
## Levels: fruit bear dear

# When passing a named vector to rename levels use !!! to splice
x <- factor(c("apple", "bear", "banana", "dear"))
levels <- c(fruit = "apple", fruit = "banana")
fct_recode(x, !!!levels)

## [1] fruit bear  fruit dear 
## Levels: fruit bear dear

fct_reorder:通过对另一个变量排序来重新排序因子级别

fct_reorder() 适用于一维展示，即因子被映射到位置（例如坐标轴）的情形；fct_reorder2() 适用于二维展示，即因子被映射到非位置的图形属性（例如颜色）的情形。

# Base-graphics boxplots of Sepal.Width per species: first with the factor's
# existing level order, then with the levels reordered by Sepal.Width via
# fct_reorder(), ascending and (with .desc = TRUE) descending.
boxplot(Sepal.Width ~ Species, data = iris)
boxplot(Sepal.Width ~ fct_reorder(Species, Sepal.Width), data = iris)
boxplot(
  Sepal.Width ~ fct_reorder(Species, Sepal.Width, .desc = TRUE),
  data = iris
)

# Keep the chicks whose integer level code is below 10, then pass Chick
# through fct_shuffle() (presumably randomising its level order — see the
# forcats documentation) so the default legend order carries no meaning.
chks <- subset(ChickWeight, as.integer(Chick) < 10)
chks <- transform(chks, Chick = fct_shuffle(Chick))

# ggplot2 is already attached by library(tidyverse) at the top of this file,
# so no require() guard is needed. Each plot is a top-level expression so it
# is auto-printed; wrapped in a single `if () { }` block, only the value of
# the last expression would be displayed.
ggplot(chks, aes(Time, weight, colour = Chick)) +
  geom_point() +
  geom_line()

# Note that lines match order in legend
ggplot(chks, aes(Time, weight, colour = fct_reorder2(Chick, Time, weight))) +
  geom_point() +
  geom_line() +
  labs(colour = "Chick")

fct_rev:翻转因子

f <- factor(c("a", "b", "c"))
fct_rev(f)

## [1] a b c
## Levels: c b a

f <- factor(letters[rpois(100, 10)])
f

##   [1] k r e l f i j i m k i k n d n l h f g e j j i l f i p k m i k l k h l d i
##  [38] f g m g k i k n j j o j n h p m l j i g i n m g e l f i g j h i g j l h n
##  [75] g h k e h l i i k h g k m j b h m h i j g h d q n i
## Levels: b d e f g h i j k l m n o p q r

unique(f) # in order of appearance

##  [1] k r e l f i j m n d h g p o b q
## Levels: b d e f g h i j k l m n o p q r

fct_unique(f) # in order of levels

##  [1] b d e f g h i j k l m n o p q r
## Levels: b d e f g h i j k l m n o p q r

使用R包forcats操作因子变量-几个有用函数

LJJ

2020/2/23

as_factor:转换因子

fct_anon:用任意数字标识符替换因子级别

fct_c:合并级别，连接因子

fct_collapse：因子转换，将多个因子合并

fct_count:统计每个因子水平出现的次数

fct_cross:交互处理多个因子数据

fct_drop:删除未使用的因子水平

fct_expand:扩展因子

fct_explicit_na:直接显示缺失值

fct_inorder:按照因子出现顺序排列因子

fct_lump:合并出现少的因子

fct_match:因子匹配

fct_other:留下需要的因子，其它other

fct_recode:重新编码

fct_reorder:通过对另一个变量排序来重新排序因子级别

fct_rev:翻转因子