# 12章 forcatsでファクタ
# 「nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading
# 「personal/sakai · master · nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading/tree/master/personal/sakai
# 「R for Data Science」 http://r4ds.had.co.nz/factors.html
# 「r4ds-exercise-solutions/factors.Rmd at master · jrnold/r4ds-exercise-solutions」 https://github.com/jrnold/r4ds-exercise-solutions/blob/master/factors.Rmd
# 「R for Data Science Solutions」 https://jrnold.github.io/r4ds-exercise-solutions/factors.html
# 「RPubs - r4ds_ch12」 http://rpubs.com/tocci36/r4ds_ch12
# 12.1 はじめに
# Rにおいてファクタは、固定した既知集合の可能値をとるカテゴリ変数の処理に使われます。
# ファクタは、文字列順以外で文字ベクトルを表示するにも役立ちます。
# In R, factors are used to work with categorical variables, variables that have a fixed and known set of possible values.
# They are also useful when you want to display character vectors in a non-alphabetical order.
# 突然難しくなった。
# カテゴリ変数:棒グラフの軸に使われるような変数
# Roger Pengの「stringsAsFactors: Anunauthorized biography」http://bit.ly/stringsfactorsbio https://simplystatistics.org/2015/07/24/stringsasfactors-an-unauthorized-biography/
# Thomas Lumleyの「stringsAsFactors= <sigh>」http://bit.ly/stringsfactorsigh https://simplystatistics.org/2015/07/24/stringsasfactors-an-unauthorized-biography/
# 12.1.1 用意するもの
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(forcats)
# 12.2 ファクタを作る
x1 <- c("Dec", "Apr", "Jan", "Mar")
# 2つの問題があります。
# 1. 12か月しかないのに、誤字脱字を防ぐ手立てがない。
x2 <- c("Dec", "Apr", "Jam", "Mar")
# 2. 有効な整列ができない。
sort(x1)
## [1] "Apr" "Dec" "Jan" "Mar"
# こういった問題がファクタで処理できます。ファクタを作るには、まず正しい水準のリストを作成します。
# (creating a list of the valid levels:)
# 水準=levels
month_levels <- c(
"Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
# 次にファクタを作成します。
y1 <- factor(x1, levels = month_levels)
y1
## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
sort(y1)
## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# 水準集合にない値は暗黙にNAに変換されます。
y2 <- factor(x2, levels = month_levels)
y2
## [1] Dec Apr <NA> Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# 警告メッセージを出したい場合はreadr::parse_factor()を使います。
y2 <- parse_factor(x2, levels = month_levels)
## Warning: 1 parsing failure.
## row # A tibble: 1 x 4 col row col expected actual expected <int> <int> <chr> <chr> actual 1 3 NA value in level set Jam
# 水準集合を与えないと、データの文字列順に作られます。
factor(x1)
## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar
#a <- read.csv("test.csv", stringsAsFactors = FALSE)
# 水準の順序をデータの出現順に
f1 <- factor(x1, levels = unique(x1))
f1
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
f2 <- x1 %>% factor() %>% fct_inorder()
f2
## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar
# 正しい水準に直接アクセス
levels(f2)
## [1] "Dec" "Apr" "Jan" "Mar"
# 12.3 総合的社会調査(15.3 General Social Survey)
# 総合的社会調査 http://gss.norc.org/
# 「総合的社会調査 - Wikipedia」 https://ja.wikipedia.org/wiki/%E7%B7%8F%E5%90%88%E7%9A%84%E7%A4%BE%E4%BC%9A%E8%AA%BF%E6%9F%BB
gss_cat
## # A tibble: 21,483 x 9
## year marital age race rincome partyid
## <int> <fctr> <int> <fctr> <fctr> <fctr>
## 1 2000 Never married 26 White $8000 to 9999 Ind,near rep
## 2 2000 Divorced 48 White $8000 to 9999 Not str republican
## 3 2000 Widowed 67 White Not applicable Independent
## 4 2000 Never married 39 White Not applicable Ind,near rep
## 5 2000 Divorced 25 White Not applicable Not str democrat
## 6 2000 Married 25 White $20000 - 24999 Strong democrat
## 7 2000 Never married 36 White $25000 or more Not str republican
## 8 2000 Divorced 44 White $7000 to 7999 Ind,near dem
## 9 2000 Married 44 White $25000 or more Not str democrat
## 10 2000 Married 47 White $25000 or more Strong republican
## # ... with 21,473 more rows, and 3 more variables: relig <fctr>,
## # denom <fctr>, tvhours <int>
#?gss_cat
# 日本版もあるようだ
# 「JGSS | Center for Social Research and Data Archives (CSRDA)」 http://csrda.iss.u-tokyo.ac.jp/joint/jgss/
gss_cat %>%
count(race)
## # A tibble: 3 x 2
## race n
## <fctr> <int>
## 1 Other 1959
## 2 Black 3129
## 3 White 16395
gss_cat %>%
count(year)
## # A tibble: 8 x 2
## year n
## <int> <int>
## 1 2000 2817
## 2 2002 2765
## 3 2004 2812
## 4 2006 4510
## 5 2008 2023
## 6 2010 2044
## 7 2012 1974
## 8 2014 2538
ggplot(gss_cat, aes(race)) +
geom_bar()

ggplot(gss_cat, aes(race)) +
geom_bar() +
scale_x_discrete(drop = FALSE)

# dplyrにはこのdropオプションがまだありませんが将来はできる予定です。
# →いつ?
#?dplyr
# 「A Grammar of Data Manipulation • dplyr」 http://dplyr.tidyverse.org/
# 「tidyverse/dplyr: Dplyr: A grammar of data manipulation」 https://github.com/tidyverse/dplyr
# 練習問題 p199 (15.3.1 Exercise)
# 1. rincome(申告所得)の分布を調べなさい。なぜデフォルトの棒グラフが理解しがたいのだろう
# か。図を改善するにはどうするか。
rincome_plot <- ggplot(gss_cat, aes(rincome)) +
geom_bar()
rincome_plot

rincome_plot +
theme(axis.text.x = element_text(angle = 90))

rincome_plot +
theme(axis.text.x = element_text(angle = -90))

rincome_plot +
coord_flip()

# 2. 総合的社会調査で最も多いreligは何か。最も多いpartyidは何か。
gss_cat %>%
count(relig) %>%
arrange(-n) %>%
head(1)
## # A tibble: 1 x 2
## relig n
## <fctr> <int>
## 1 Protestant 10846
# count_(): nseの処理
# 8位がよく知らないものだった
# 8 Inter-nondenominational 109
gss_cat %>%
count(partyid) %>%
arrange(-n) %>%
head(1)
## # A tibble: 1 x 2
## partyid n
## <fctr> <int>
## 1 Independent 4119
# 区分がいまいちわからず
gss_cat %>%
count(partyid) %>%
arrange(-n)
## # A tibble: 10 x 2
## partyid n
## <fctr> <int>
## 1 Independent 4119
## 2 Not str democrat 3690
## 3 Strong democrat 3490
## 4 Not str republican 3032
## 5 Ind,near dem 2499
## 6 Strong republican 2314
## 7 Ind,near rep 1791
## 8 Other party 393
## 9 No answer 154
## 10 Don't know 1
# 3. どのreligにdenom(宗派)が適用されるか。表からどのように探し出すか。可視化してどう探すか。
levels(gss_cat$denom)
## [1] "No answer" "Don't know" "No denomination"
## [4] "Other" "Episcopal" "Presbyterian-dk wh"
## [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
## [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
## [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
## [16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
## [19] "Other methodist" "United methodist" "Afr meth ep zion"
## [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
## [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
## [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
gss_cat %>%
filter(!denom %in% c("No answer", "Other", "Don't know", "Not applicable",
"No denomination")) %>%
count(relig)
## # A tibble: 1 x 2
## relig n
## <fctr> <int>
## 1 Protestant 7025
gss_cat %>%
count(relig, denom) %>%
ggplot(aes(x = relig, y = denom, size = n)) +
geom_point() +
theme(axis.text.x = element_text(angle = 90))

# いまいちよくわからず→プロテスタントが多いということ
# 12.4 ファクタ順序の変更
relig <- gss_cat %>%
group_by(relig) %>%
summarize(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(relig, aes(tvhours, relig)) + geom_point()

ggplot(relig, aes(tvhours, fct_reorder(relig, tvhours))) +
geom_point()

# オプションfun
relig %>%
mutate(relig = fct_reorder(relig, tvhours)) %>%
ggplot(aes(tvhours, relig)) +
geom_point()

rincome <- gss_cat %>%
group_by(rincome) %>%
summarize(
age = mean(age, na.rm = TRUE),
tvhours = mean(tvhours, na.rm = TRUE),
n = n()
)
ggplot(
rincome,
aes(age, fct_reorder(rincome, age))
) + geom_point()

ggplot(
rincome,
aes(age, fct_relevel(rincome, "Not applicable"))
) +
geom_point()

# 本のまま(グラフがおかしい。propが整数)
by_age <- gss_cat %>%
filter(!is.na(age)) %>%
group_by(age, marital) %>%
count() %>%
mutate(prop = n / sum(n))
# 修正版
by_age <- gss_cat %>%
filter(!is.na(age)) %>%
group_by(age, marital) %>%
count() %>%
ungroup() %>%
group_by(age) %>%
mutate(prop = n / sum(n))
ggplot(by_age, aes(age, prop, color = marital)) +
geom_line(na.rm = TRUE)

ggplot(
by_age,
aes(age, prop, color = fct_reorder2(marital, age, prop))
) +
geom_line() +
labs(color = "marital")

#?fct_reorder2
# ソース
fct_reorder
## function (f, x, fun = median, ..., .desc = FALSE)
## {
## f <- check_factor(f)
## stopifnot(length(f) == length(x))
## summary <- tapply(x, f, fun, ...)
## if (!is.numeric(summary)) {
## stop("`fun` must return a single number per group", call. = FALSE)
## }
## lvls_reorder(f, order(summary, decreasing = .desc))
## }
## <environment: namespace:forcats>
fct_reorder2
## function (f, x, y, fun = last2, ..., .desc = TRUE)
## {
## f <- check_factor(f)
## stopifnot(length(f) == length(x), length(x) == length(y))
## summary <- tapply(seq_along(x), f, function(i) fun(x[i],
## y[i], ...))
## if (!is.numeric(summary)) {
## stop("`fun` must return a single number per group", call. = FALSE)
## }
## lvls_reorder(f, order(summary, decreasing = .desc))
## }
## <environment: namespace:forcats>
last
## function (x, order_by = NULL, default = default_missing(x))
## {
## nth(x, -1L, order_by = order_by, default = default)
## }
## <environment: namespace:dplyr>
#last2
forcats:::last2
## function (x, y)
## {
## y[order(x, na.last = FALSE)][length(y)]
## }
## <environment: namespace:forcats>
#?last
#??last
#??last2
# 「{forcats}パッケージでカテゴリカル変数(factor型データ)をいじってみる」 https://kazutan.github.io/kazutanR/forcats_test.html
# #他の2変数に従ってソートするようlevelsを再整列(fct_reorder2)
# fct_reorder2(f, x, y, fun = last2, ..., .desc = FALSE):
# 他の変数を使って,levelsをソートします。fct_reorderとの違いは,ソートに利用する変数が2つだという点です。
# もちろんlast2の代わりにmedianやmeanなどの関数を当てることも可能
# last2(x, y):xの昇順に従ってyをソートし,それの一番下の要素を返す
# R4DSの本の例では、凡例をグラフの右端(ageがMax)のときのy軸(prop)順に並べたかったということ。
# 少し前にlast2がexportされるようなコミットがある。
# 「last2 referenced in fct_reorder2 but not exported · Issue #108 · tidyverse/forcats · GitHub」 https://github.com/tidyverse/forcats/issues/108
gss_cat %>%
mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
ggplot(aes(marital)) +
geom_bar()

# 練習問題 p203 ()
# 1. tvhoursの高い値は怪しい。平均は要約として有用か。
summary(gss_cat[["tvhours"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 1.000 2.000 2.981 4.000 24.000 10146
gss_cat %>%
filter(!is.na(tvhours)) %>%
ggplot(aes(x = tvhours)) +
geom_histogram(binwidth = 1)

# Whether the mean is the best summary depends on what you are using it for :-), i.e. your objective.
# But probably the median would be what most people prefer. And the hours of TV doesn’t look that surprising to me.
# 2. gss_catの各ファクタについて、水準の順序が任意か定まっているか調べなさい。
keep(gss_cat, is.factor) %>% names()
## [1] "marital" "race" "rincome" "partyid" "relig" "denom"
#?keep
levels(gss_cat[["marital"]])
## [1] "No answer" "Never married" "Separated" "Divorced"
## [5] "Widowed" "Married"
gss_cat %>%
ggplot(aes(x = marital)) +
geom_bar()

levels(gss_cat$race)
## [1] "Other" "Black" "White" "Not applicable"
gss_cat %>%
ggplot(aes(race)) +
geom_bar(drop = FALSE)
## Warning: Ignoring unknown parameters: drop

levels(gss_cat$rincome)
## [1] "No answer" "Don't know" "Refused" "$25000 or more"
## [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
## [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
## [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
levels(gss_cat$relig)
## [1] "No answer" "Don't know"
## [3] "Inter-nondenominational" "Native american"
## [5] "Christian" "Orthodox-christian"
## [7] "Moslem/islam" "Other eastern"
## [9] "Hinduism" "Buddhism"
## [11] "Other" "None"
## [13] "Jewish" "Catholic"
## [15] "Protestant" "Not applicable"
gss_cat %>%
ggplot(aes(relig)) +
geom_bar() +
coord_flip()

levels(gss_cat$denom)
## [1] "No answer" "Don't know" "No denomination"
## [4] "Other" "Episcopal" "Presbyterian-dk wh"
## [7] "Presbyterian, merged" "Other presbyterian" "United pres ch in us"
## [10] "Presbyterian c in us" "Lutheran-dk which" "Evangelical luth"
## [13] "Other lutheran" "Wi evan luth synod" "Lutheran-mo synod"
## [16] "Luth ch in america" "Am lutheran" "Methodist-dk which"
## [19] "Other methodist" "United methodist" "Afr meth ep zion"
## [22] "Afr meth episcopal" "Baptist-dk which" "Other baptists"
## [25] "Southern baptist" "Nat bapt conv usa" "Nat bapt conv of am"
## [28] "Am bapt ch in usa" "Am baptist asso" "Not applicable"
levels(gss_cat$partyid)
## [1] "No answer" "Don't know" "Other party"
## [4] "Strong republican" "Not str republican" "Ind,near rep"
## [7] "Independent" "Ind,near dem" "Not str democrat"
## [10] "Strong democrat"
# 3. "Not applicable"を水準の前に移動することがグラフの最下段に移すことになったのはなぜか。
# Because that gives the level “Not applicable” an integer value of 1.
# 12.5 ファクタ水準の変更 (15.5 Modifying factor levels)
gss_cat %>% count(partyid)
## # A tibble: 10 x 2
## partyid n
## <fctr> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Strong republican 2314
## 5 Not str republican 3032
## 6 Ind,near rep 1791
## 7 Independent 4119
## 8 Ind,near dem 2499
## 9 Not str democrat 3690
## 10 Strong democrat 3490
# Independent: 無党派
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat"
)) %>%
count(partyid)
## # A tibble: 10 x 2
## partyid n
## <fctr> <int>
## 1 No answer 154
## 2 Don't know 1
## 3 Other party 393
## 4 Republican, strong 2314
## 5 Republican, weak 3032
## 6 Independent, near rep 1791
## 7 Independent 4119
## 8 Independent, near dem 2499
## 9 Democrat, weak 3690
## 10 Democrat, strong 3490
gss_cat %>%
mutate(partyid = fct_recode(partyid,
"Republican, strong" = "Strong republican",
"Republican, weak" = "Not str republican",
"Independent, near rep" = "Ind,near rep",
"Independent, near dem" = "Ind,near dem",
"Democrat, weak" = "Not str democrat",
"Democrat, strong" = "Strong democrat",
"Other" = "No answer",
"Other" = "Don't know",
"Other" = "Other party"
)) %>%
count(partyid)
## # A tibble: 8 x 2
## partyid n
## <fctr> <int>
## 1 Other 548
## 2 Republican, strong 2314
## 3 Republican, weak 3032
## 4 Independent, near rep 1791
## 5 Independent 4119
## 6 Independent, near dem 2499
## 7 Democrat, weak 3690
## 8 Democrat, strong 3490
gss_cat %>%
mutate(partyid = fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat")
)) %>%
count(partyid)
## # A tibble: 4 x 2
## partyid n
## <fctr> <int>
## 1 other 548
## 2 rep 5346
## 3 ind 8409
## 4 dem 7180
gss_cat %>%
mutate(relig = fct_lump(relig)) %>%
count(relig)
## # A tibble: 2 x 2
## relig n
## <fctr> <int>
## 1 Protestant 10846
## 2 Other 10637
gss_cat %>%
mutate(relig = fct_lump(relig, n = 10)) %>%
count(relig, sort = TRUE) %>%
print(n = Inf)
## # A tibble: 10 x 2
## relig n
## <fctr> <int>
## 1 Protestant 10846
## 2 Catholic 5124
## 3 None 3523
## 4 Christian 689
## 5 Other 458
## 6 Jewish 388
## 7 Buddhism 147
## 8 Inter-nondenominational 109
## 9 Moslem/islam 104
## 10 Orthodox-christian 95
#?print
# 練習問題 p206
# 1. 民主党員、共和党員、独立党員と称する人の割合は時間とともにどう変化してきたか。
levels(gss_cat$partyid)
## [1] "No answer" "Don't know" "Other party"
## [4] "Strong republican" "Not str republican" "Ind,near rep"
## [7] "Independent" "Ind,near dem" "Not str democrat"
## [10] "Strong democrat"
gss_cat %>%
mutate(partyid =
fct_collapse(partyid,
other = c("No answer", "Don't know", "Other party"),
rep = c("Strong republican", "Not str republican"),
ind = c("Ind,near rep", "Independent", "Ind,near dem"),
dem = c("Not str democrat", "Strong democrat"))) %>%
count(year, partyid) %>%
group_by(year) %>%
mutate(p = n / sum(n)) %>%
ggplot(aes(x = year, y = p,
colour = fct_reorder2(partyid, year, p))) +
geom_point() +
geom_line() +
labs(colour = "Party ID.")

# 2. rincomeをより少数のカテゴリにまとめるにはどうするか。
levels(gss_cat$rincome)
## [1] "No answer" "Don't know" "Refused" "$25000 or more"
## [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999"
## [9] "$7000 to 7999" "$6000 to 6999" "$5000 to 5999" "$4000 to 4999"
## [13] "$3000 to 3999" "$1000 to 2999" "Lt $1000" "Not applicable"
library("stringr")
gss_cat %>%
mutate(rincome =
fct_collapse(
rincome,
`Unknown` = c("No answer", "Don't know", "Refused", "Not applicable"),
`Lt $5000` = c("Lt $1000", str_c("$", c("1000", "3000", "4000"),
" to ", c("2999", "3999", "4999"))),
`$5000 to 10000` = str_c("$", c("5000", "6000", "7000", "8000"),
" to ", c("5999", "6999", "7999", "9999"))
)) %>%
ggplot(aes(x = rincome)) +
geom_bar() +
coord_flip()

# forcats内の関数一覧
ls("package:forcats")
## [1] "%>%" "as_factor" "fct_anon"
## [4] "fct_c" "fct_collapse" "fct_count"
## [7] "fct_drop" "fct_expand" "fct_explicit_na"
## [10] "fct_infreq" "fct_inorder" "fct_lump"
## [13] "fct_other" "fct_recode" "fct_relabel"
## [16] "fct_relevel" "fct_reorder" "fct_reorder2"
## [19] "fct_rev" "fct_shift" "fct_shuffle"
## [22] "fct_unify" "fct_unique" "gss_cat"
## [25] "lvls_expand" "lvls_reorder" "lvls_revalue"
## [28] "lvls_union"