library(tidyverse)
## -- Attaching packages -----------------
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts --------------------------
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Compare base as.factor() with forcats as_factor() on a character vector
# that contains a missing value.
x <- c("A", "z", "g", NA)
# as.factor(): the levels come out sorted.
as.factor(x)
## [1] A z g <NA>
## Levels: A g z
# as_factor(): the levels keep their order of first appearance.
as_factor(x)
## [1] A z g <NA>
## Levels: A z g
# Numeric-looking strings: for this input both conversions print the same
# level order.
y <- c("1.1", "11", "2.2", "22")
print(y)
## [1] "1.1" "11" "2.2" "22"
as.factor(y)
## [1] 1.1 11 2.2 22
## Levels: 1.1 11 2.2 22
as_factor(y)
## [1] 1.1 11 2.2 22
## Levels: 1.1 11 2.2 22
# Count each level of gss_cat$relig and list the most frequent first.
arrange(fct_count(gss_cat$relig), desc(n))
## # A tibble: 16 x 2
## f n
## <fct> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Christian 689
## 5 Jewish 388
## 6 Other 224
## 7 Buddhism 147
## 8 Inter-nondenominational 109
## 9 Moslem/islam 104
## 10 Orthodox-christian 95
## 11 No answer 93
## 12 Hinduism 71
## 13 Other eastern 32
## 14 Native american 23
## 15 Don't know 15
## 16 Not applicable 0
# fct_anon() swaps the level labels for arbitrary numeric identifiers;
# the per-level counts stay the same.
arrange(fct_count(fct_anon(gss_cat$relig)), desc(n))
## # A tibble: 16 x 2
## f n
## <fct> <int>
## 1 02 10846
## 2 14 5124
## 3 06 3523
## 4 16 689
## 5 04 388
## 6 15 224
## 7 07 147
## 8 12 109
## 9 10 104
## 10 13 95
## 11 11 93
## 12 05 71
## 13 01 32
## 14 09 23
## 15 08 15
## 16 03 0
# Same anonymisation, with every identifier prefixed by "X".
fct_count(fct_anon(gss_cat$relig, prefix = "X"))
## # A tibble: 16 x 2
## f n
## <fct> <int>
## 1 X01 689
## 2 X02 104
## 3 X03 147
## 4 X04 5124
## 5 X05 71
## 6 X06 23
## 7 X07 388
## 8 X08 3523
## 9 X09 0
## 10 X10 32
## 11 X11 95
## 12 X12 93
## 13 X13 109
## 14 X14 15
## 15 X15 224
## 16 X16 10846
# Combining factors: base c() versus forcats fct_c().
fa <- factor("a")
fb <- factor("b")
fab <- factor(letters[1:2])
# In the session that produced this output, c() returned the integer codes
# rather than a factor.
c(fa, fb, fab)
## [1] 1 1 1 2
# fct_c() concatenates the values and keeps them as a factor.
fct_c(fa, fb, fab)
## [1] a b a b
## Levels: a b
# A list of factors can be spliced into fct_c() with !!!
fs <- list(fa, fb, fab)
fct_c(!!!fs)
## [1] a b a b
## Levels: a b
fct_collapse() 是一个很有用的函数,可以将多个因子水平合并成较少的几个水平。
# Level counts of gss_cat$partyid, rarest first.
gss_cat$partyid %>%
  fct_count() %>%
  arrange(n)
## # A tibble: 10 x 2
## f n
## <fct> <int>
## 1 Don't know 1
## 2 No answer 154
## 3 Other party 393
## 4 Ind,near rep 1791
## 5 Strong republican 2314
## 6 Ind,near dem 2499
## 7 Not str republican 3032
## 8 Strong democrat 3490
## 9 Not str democrat 3690
## 10 Independent 4119
# Collapse the ten party-id levels into five broader groups.
partyid2 <- gss_cat$partyid %>%
  fct_collapse(
    missing = c("No answer", "Don't know"),
    other = "Other party",
    rep = c("Strong republican", "Not str republican"),
    ind = c("Ind,near rep", "Independent", "Ind,near dem"),
    dem = c("Not str democrat", "Strong democrat")
  )
# Only five levels remain after collapsing.
partyid2 %>% fct_count()
## # A tibble: 5 x 2
## f n
## <fct> <int>
## 1 missing 155
## 2 other 393
## 3 rep 5346
## 4 ind 8409
## 5 dem 7180
# Indexing a shuffled alphabet with Poisson draws yields a character vector.
class(sample(letters)[rpois(1000, 10)])
## [1] "character"
# A fresh draw of the same kind, stored as a factor.
f <- sample(letters)[rpois(1000, 10)] %>% factor()
# Level counts, largest first.
arrange(fct_count(f), desc(n))
## # A tibble: 23 x 2
## f n
## <fct> <int>
## 1 h 144
## 2 t 130
## 3 e 129
## 4 m 103
## 5 j 94
## 6 c 79
## 7 v 63
## 8 g 62
## 9 x 51
## 10 d 32
## # ... with 13 more rows
# sort = TRUE orders the counts from largest to smallest directly.
f %>% fct_count(sort = TRUE)
## # A tibble: 23 x 2
## f n
## <fct> <int>
## 1 h 144
## 2 t 130
## 3 e 129
## 4 m 103
## 5 j 94
## 6 c 79
## 7 v 63
## 8 g 62
## 9 x 51
## 10 d 32
## # ... with 13 more rows
# prop = TRUE adds a column p with each level's share of the total.
f %>% fct_count(sort = TRUE, prop = TRUE)
## # A tibble: 23 x 3
## f n p
## <fct> <int> <dbl>
## 1 h 144 0.144
## 2 t 130 0.13
## 3 e 129 0.129
## 4 m 103 0.103
## 5 j 94 0.094
## 6 c 79 0.079
## 7 v 63 0.063
## 8 g 62 0.062
## 9 x 51 0.051
## 10 d 32 0.032
## # ... with 13 more rows
# Inputs for fct_cross(): two factors and one plain character vector.
fruit <- factor(c("apple", "kiwi", "apple", "apple"))
colour <- factor(c("red", rep("green", 3)))
eaten <- rep(c("yes", "no"), times = 2)
print(fruit)
## [1] apple kiwi apple apple
## Levels: apple kiwi
print(colour)
## [1] red green green green
## Levels: green red
print(eaten)
## [1] "yes" "no" "yes" "no"
# Cross two inputs: each value becomes "fruit:colour".
fruit %>% fct_cross(colour)
## [1] apple:red kiwi:green apple:green apple:green
## Levels: apple:green apple:red kiwi:green
# Cross three inputs.
fruit %>% fct_cross(colour, eaten)
## [1] apple:red:yes kiwi:green:no apple:green:yes apple:green:no
## Levels: apple:green:no apple:green:yes apple:red:yes kiwi:green:no
# keep_empty = TRUE also keeps combinations that never occur (kiwi:red here)
# as levels.
fruit %>% fct_cross(colour, keep_empty = TRUE)
## [1] apple:red kiwi:green apple:green apple:green
## Levels: apple:green kiwi:green apple:red kiwi:red
# A factor with one declared but unused level ("c").
f <- factor(letters[1:2], levels = letters[1:3])
print(f)
## [1] a b
## Levels: a b c
# fct_drop() removes the unused level.
f %>% fct_drop()
## [1] a b
## Levels: a b
# `only` restricts which levels may be dropped.
# "a" is in use, so nothing is removed.
f %>% fct_drop(only = "a")
## [1] a b
## Levels: a b c
# "c" is unused, so it is removed.
f %>% fct_drop(only = "c")
## [1] a b
## Levels: a b
fct_expand() 用于给因子添加额外的水平,个人感觉用处不大。
# Twenty random draws from "a", "b", "c", stored as a factor.
f <- sample(letters[1:3], size = 20, replace = TRUE) %>% factor()
print(f)
## [1] b a a c c b a c a b c c a c a c c b b b
## Levels: a b c
# Add new levels one by one ...
f %>% fct_expand("d", "e", "f")
## [1] b a a c c b a c a b c c a c a c c b b b
## Levels: a b c d e f
# ... or from a vector; levels that already exist are not duplicated.
f %>% fct_expand(letters[1:6])
## [1] b a a c c b a c a b c c a c a c c b b b
## Levels: a b c d e f
# A factor containing three missing values.
f1 <- factor(
  c("a", "a", NA, NA, "a", "b", NA, "c", "a", "c", "b")
)
# table() only reports the NAs when asked to via useNA.
table(f1, useNA = "ifany")
## f1
## a b c <NA>
## 4 2 2 3
# fct_explicit_na() turns NA into a real level, "(Missing)", so it is
# counted without any extra argument.
# NOTE(review): newer forcats releases deprecate fct_explicit_na() in favour
# of fct_na_value_to_level() - confirm before upgrading forcats.
f2 <- f1 %>% fct_explicit_na()
table(f2)
## f2
## a b c (Missing)
## 4 2 2 3
# Values b, b, a, c, c, c; the default levels are sorted: a b c.
f <- factor(rep(c("b", "a", "c"), times = c(2, 1, 3)))
print(f)
## [1] b b a c c c
## Levels: a b c
# Levels in order of first appearance: b a c.
f %>% fct_inorder()
## [1] b b a c c c
## Levels: b a c
# Levels ordered by how often each occurs, most frequent first: c b a.
f %>% fct_infreq()
## [1] b b a c c c
## Levels: c b a
# ordered = TRUE additionally returns an ordered factor.
f %>% fct_inorder(ordered = TRUE)
## [1] b b a c c c
## Levels: b < a < c
# A random permutation of 1..10 as a factor; fct_inseq() orders the levels
# by their numeric value.
f <- sample(seq_len(10)) %>% factor()
f %>% fct_inseq()
## [1] 8 4 6 9 1 3 5 7 2 10
## Levels: 1 2 3 4 5 6 7 8 9 10
fct_lump() 会把出现次数较少的水平合并为 "Other";其实这类操作也可以用 fct_collapse() 手动完成。
# Two dominant levels (A, D), two mid-sized ones (B, C) and five singletons.
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, rep(1, 5))))
x %>% table()
## .
## A B C D E F G H I
## 40 10 5 27 1 1 1 1 1
# With default settings everything except A and D ends up in "Other".
fct_lump(x) %>% table()
## .
## A D Other
## 40 27 20
# Re-level by first appearance so "Other" sits between A and D.
fct_inorder(fct_lump(x)) %>% table()
## .
## A Other D
## 40 20 27
# Build a random factor by indexing `letters` with 100 Poisson(5) draws.
# NOTE(review): rpois() can return 0, and a zero index selects nothing, so x
# may be shorter than 100 elements; no seed is set, so the pasted output is
# one particular run.
x <- factor(letters[rpois(100, 5)])
x
## [1] e d o d e d b f d j d b c b e d d e c j b e d c a e e d b d i g g d d g e
## [38] f e f h g f g a e c f d d e e f c g g e d c c d d d e g f b e a a b e c f
## [75] d l b e g c d a h e c h c b d g h d b h d j d g k i
## Levels: a b c d e f g h i j k l o
# Frequency of each level.
table(x)
## x
## a b c d e f g h i j k l o
## 5 10 11 24 18 8 11 5 2 3 1 1 1
# fct_lump() with default arguments; for this sample the pasted result shows
# the same counts as table(x), i.e. no level was lumped.
table(fct_lump(x))
##
## a b c d e f g h i j k l o
## 5 10 11 24 18 8 11 5 2 3 1 1 1
# Positive n / prop: keep the most common levels and lump the rarest.
# c and g tie at 11 observations, so n = 3 keeps four levels here.
x %>%
  fct_lump(n = 3) %>%
  table()
## .
## c d e g Other
## 11 24 18 11 36
x %>% fct_lump(prop = 0.1)
## [1] e d Other d e d Other Other d Other d Other
## [13] c Other e d d e c Other Other e d c
## [25] Other e e d Other d Other g g d d g
## [37] e Other e Other Other g Other g Other e c Other
## [49] d d e e Other c g g e d c c
## [61] d d d e g Other Other e Other Other Other e
## [73] c Other d Other Other e g c d Other Other e
## [85] c Other c Other d g Other d Other Other d Other
## [97] d g Other Other
## Levels: c d e g Other
# Negative n / prop: keep the rarest levels and lump the most common.
x %>%
  fct_lump(n = -3) %>%
  table()
## .
## k l o Other
## 1 1 1 97
x %>% fct_lump(prop = -0.1)
## [1] Other Other o Other Other Other b f Other j Other b
## [13] Other b Other Other Other Other Other j b Other Other Other
## [25] a Other Other Other b Other i Other Other Other Other Other
## [37] Other f Other f h Other f Other a Other Other f
## [49] Other Other Other Other f Other Other Other Other Other Other Other
## [61] Other Other Other Other Other f b Other a a b Other
## [73] Other f Other l b Other Other Other Other a h Other
## [85] Other h Other b Other Other h Other b h Other j
## [97] Other Other k i
## Levels: a b f h i j k l o Other
# Use weighted frequencies
# The first 50 observations get weight 2, the rest weight 1. The weights are
# cut to length(x): a zero drawn by rpois() is silently dropped when used as
# an index into `letters`, so x can be shorter than 100, and fct_lump()
# rejects a weight vector whose length differs from the factor's.
w <- rep(c(2, 1), each = 50)[seq_along(x)]
fct_lump(x, n = 5, w = w)
## [1] e d Other d e d b Other d Other d b
## [13] c b e d d e c Other b e d c
## [25] Other e e d b d Other g g d d g
## [37] e Other e Other Other g Other g Other e c Other
## [49] d d e e Other c g g e d c c
## [61] d d d e g Other b e Other Other b e
## [73] c Other d Other b e g c d Other Other e
## [85] c Other c b d g Other d b Other d Other
## [97] d g Other Other
## Levels: b c d e g Other
# ties.method controls how levels with tied frequencies are collapsed;
# this first call uses the default method.
x %>% fct_lump(n = 6)
## [1] e d Other d e d b f d Other d b
## [13] c b e d d e c Other b e d c
## [25] Other e e d b d Other g g d d g
## [37] e f e f Other g f g Other e c f
## [49] d d e e f c g g e d c c
## [61] d d d e g f b e Other Other b e
## [73] c f d Other b e g c d Other Other e
## [85] c Other c b d g Other d b Other d Other
## [97] d g Other Other
## Levels: b c d e f g Other
# Same lumping with tied frequencies ranked by ties.method = "max".
x %>% fct_lump(n = 6, ties.method = "max")
## [1] e d Other d e d b f d Other d b
## [13] c b e d d e c Other b e d c
## [25] Other e e d b d Other g g d d g
## [37] e f e f Other g f g Other e c f
## [49] d d e e f c g g e d c c
## [61] d d d e g f b e Other Other b e
## [73] c f d Other b e g c d Other Other e
## [85] c Other c b d g Other d b Other d Other
## [97] d g Other Other
## Levels: b c d e f g Other
# New random factor. The pasted output holds 99 values rather than 100,
# presumably because one Poisson draw was 0 and a zero index selects nothing.
x <- letters[rpois(100, 5)] %>% factor()
# Lump every level that occurs fewer than `min` times into "Other".
x %>% fct_lump_min(min = 10)
## [1] Other c c e Other d g f e c Other f
## [13] f Other c g f e c g e c g g
## [25] f e e f f d e d Other c e Other
## [37] Other Other c c f d d g Other f g e
## [49] d Other d Other Other c Other g Other f e Other
## [61] Other Other Other c g e Other e c e e d
## [73] Other d e Other d f Other Other d d g g
## [85] c e g d f d Other d e e d d
## [97] Other Other c
## Levels: c d e f g Other
# Distribution of marital status.
gss_cat$marital %>% table()
## .
## No answer Never married Separated Divorced Widowed
## 17 5416 743 3383 1807
## Married
## 10117
# fct_match() flags the observations whose level is one of the given values;
# the TRUE count equals Married + Divorced above.
gss_cat$marital %>%
  fct_match(c("Married", "Divorced")) %>%
  table()
## .
## FALSE TRUE
## 7983 13500
# Rebuild the fixed example factor (A x40, B x10, C x5, D x27, E..I once each).
x <- factor(rep(LETTERS[1:9], times = c(40, 10, 5, 27, rep(1, 5))))
print(x)
## [1] A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A A
## [39] A A B B B B B B B B B B C C C C C D D D D D D D D D D D D D D D D D D D D D
## [77] D D D D D D E F G H I
## Levels: A B C D E F G H I
# keep: every level except the listed ones becomes "Other".
x %>% fct_other(keep = c("A", "B"))
## [1] A A A A A A A A A A A A
## [13] A A A A A A A A A A A A
## [25] A A A A A A A A A A A A
## [37] A A A A B B B B B B B B
## [49] B B Other Other Other Other Other Other Other Other Other Other
## [61] Other Other Other Other Other Other Other Other Other Other Other Other
## [73] Other Other Other Other Other Other Other Other Other Other Other Other
## [85] Other Other Other
## Levels: A B Other
# drop: only the listed levels become "Other"; all others are kept.
x %>% fct_other(drop = c("A", "B"))
## [1] Other Other Other Other Other Other Other Other Other Other Other Other
## [13] Other Other Other Other Other Other Other Other Other Other Other Other
## [25] Other Other Other Other Other Other Other Other Other Other Other Other
## [37] Other Other Other Other Other Other Other Other Other Other Other Other
## [49] Other Other C C C C C D D D D D
## [61] D D D D D D D D D D D D
## [73] D D D D D D D D D D E F
## [85] G H I
## Levels: C D E F G H I Other
# fct_recode() renames levels by hand, written as new_name = "old_name".
x <- factor(c("apple", "bear", "banana", "dear"))
# Several old levels may be mapped onto the same new level.
fct_recode(x, fruit = "apple", fruit = "banana")
## [1] fruit bear fruit dear
## Levels: fruit bear dear
# If you make a mistake you'll get a warning
# ("bananana" is misspelled on purpose: it matches no level of x, so
# "banana" is left untouched and a warning is raised)
fct_recode(x, fruit = "apple", fruit = "bananana")
## Warning: Unknown levels in `f`: bananana
## [1] fruit bear banana dear
## Levels: fruit banana bear dear
# If you name the level NULL it will be removed
fct_recode(x, NULL = "apple", fruit = "banana")
## [1] <NA> bear fruit dear
## Levels: fruit bear dear
# When passing a named vector to rename levels use !!! to splice
x <- factor(c("apple", "bear", "banana", "dear"))
# Note: this variable shares its name with base::levels(); calls such as
# levels(x) still resolve to the function.
levels <- c(fruit = "apple", fruit = "banana")
fct_recode(x, !!!levels)
## [1] fruit bear fruit dear
## Levels: fruit bear dear
fct_reorder() 适用于因子被映射到位置(坐标轴)的一维展示;fct_reorder2() 适用于因子被映射到非位置图形属性(如颜色)的二维展示。
# Three boxplots of Sepal.Width per Species from the iris data.
# 1) Species in its existing level order.
boxplot(Sepal.Width ~ Species, data = iris)
# 2) Species levels reordered by Sepal.Width via fct_reorder()
#    (per the forcats documentation the default summary is the median).
boxplot(Sepal.Width ~ fct_reorder(Species, Sepal.Width), data = iris)
# 3) Same reordering, reversed with .desc = TRUE.
boxplot(Sepal.Width ~ fct_reorder(Species, Sepal.Width, .desc = TRUE), data = iris)
# Take the ChickWeight rows whose Chick factor code is below 10 and shuffle
# the Chick levels at random, so the legend order starts out arbitrary.
chks <- subset(ChickWeight, as.integer(Chick) < 10)
chks <- transform(chks, Chick = fct_shuffle(Chick))
# requireNamespace() checks that ggplot2 is available without attaching it;
# the ggplot2 functions are therefore called with an explicit namespace.
if (requireNamespace("ggplot2", quietly = TRUE)) {
  # Inside a braced block only the value of the last expression is
  # auto-printed, so each plot is printed explicitly; otherwise the first
  # plot would be built and silently discarded.
  print(
    ggplot2::ggplot(chks, ggplot2::aes(Time, weight, colour = Chick)) +
      ggplot2::geom_point() +
      ggplot2::geom_line()
  )
  # Note that lines match order in legend
  print(
    ggplot2::ggplot(
      chks,
      ggplot2::aes(Time, weight, colour = fct_reorder2(Chick, Time, weight))
    ) +
      ggplot2::geom_point() +
      ggplot2::geom_line() +
      ggplot2::labs(colour = "Chick")
  )
}
# fct_rev() reverses the order of the levels; the values are unchanged.
f <- factor(letters[1:3])
f %>% fct_rev()
## [1] a b c
## Levels: c b a
# Random factor built from letters indexed by 100 Poisson(10) draws.
f <- letters[rpois(100, 10)] %>% factor()
print(f)
## [1] k r e l f i j i m k i k n d n l h f g e j j i l f i p k m i k l k h l d i
## [38] f g m g k i k n j j o j n h p m l j i g i n m g e l f i g j h i g j l h n
## [75] g h k e h l i i k h g k m j b h m h i j g h d q n i
## Levels: b d e f g h i j k l m n o p q r
# unique(): distinct values in order of appearance.
f %>% unique()
## [1] k r e l f i j m n d h g p o b q
## Levels: b d e f g h i j k l m n o p q r
# fct_unique(): distinct values in order of the levels.
f %>% fct_unique()
## [1] b d e f g h i j k l m n o p q r
## Levels: b d e f g h i j k l m n o p q r