r4ds_ch12.R

# 12章　forcatsでファクタ

# 「nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading
# 「personal/sakai · master · nhn-techorus.datascienceteam / bookreading · GitLab」 https://gitlab.com/nhn-techorus.datascienceteam/bookreading/tree/master/personal/sakai

# 「R for Data Science」 http://r4ds.had.co.nz/factors.html

# 「r4ds-exercise-solutions/factors.Rmd at master · jrnold/r4ds-exercise-solutions」 https://github.com/jrnold/r4ds-exercise-solutions/blob/master/factors.Rmd
# 「R for Data Science Solutions」 https://jrnold.github.io/r4ds-exercise-solutions/factors.html

# 「RPubs - r4ds_ch12」 http://rpubs.com/tocci36/r4ds_ch12

# 12.1　はじめに

# Rにおいてファクタは、固定した既知集合の可能値をとるカテゴリ変数の処理に使われます。
# ファクタは、文字列順以外で文字ベクトルを表示するにも役立ちます。
# In R, factors are used to work with categorical variables, variables that have a fixed and known set of possible values. 
# They are also useful when you want to display character vectors in a non-alphabetical order.

# 突然難しくなった。
# カテゴリ変数：棒グラフの軸に使われるような変数

# Roger Pengの「stringsAsFactors: Anunauthorized biography」http://bit.ly/stringsfactorsbio https://simplystatistics.org/2015/07/24/stringsasfactors-an-unauthorized-biography/
# Thomas Lumleyの「stringsAsFactors= <sigh>」http://bit.ly/stringsfactorsigh https://simplystatistics.org/2015/07/24/stringsasfactors-an-unauthorized-biography/

# 12.1.1　用意するもの

library(tidyverse)

## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

library(forcats)

# 12.2　ファクタを作る
x1 <- c("Dec", "Apr", "Jan", "Mar")

# 2つの問題があります。
# 1. 12か月しかないのに、誤字脱字を防ぐ手立てがない。
x2 <- c("Dec", "Apr", "Jam", "Mar")
# 2. 有効な整列ができない。
sort(x1)

## [1] "Apr" "Dec" "Jan" "Mar"

# こういった問題がファクタで処理できます。ファクタを作るには、まず正しい水準のリストを作成します。
# (creating a list of the valid levels:)
# 水準=levels
month_levels <- c(
  "Jan", "Feb", "Mar", "Apr", "May", "Jun",
  "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"
)
# 次にファクタを作成します。
y1 <- factor(x1, levels = month_levels)
y1

## [1] Dec Apr Jan Mar
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

sort(y1)

## [1] Jan Mar Apr Dec
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

# 水準集合にない値は暗黙にNAに変換されます。
y2 <- factor(x2, levels = month_levels)
y2

## [1] Dec  Apr  <NA> Mar 
## Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

# 警告メッセージを出したい場合はreadr::parse_factor()を使います。
y2 <- parse_factor(x2, levels = month_levels)

## Warning: 1 parsing failure.
## row # A tibble: 1 x 4 col     row   col           expected actual expected   <int> <int>              <chr>  <chr> actual 1     3    NA value in level set    Jam

# 水準集合を与えないと、データの文字列順に作られます。
factor(x1)

## [1] Dec Apr Jan Mar
## Levels: Apr Dec Jan Mar

#a <- read.csv("test.csv", stringsAsFactors = FALSE)

# 水準の順序をデータの出現順に
f1 <- factor(x1, levels = unique(x1))
f1

## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar

f2 <- x1 %>% factor() %>% fct_inorder()
f2

## [1] Dec Apr Jan Mar
## Levels: Dec Apr Jan Mar

# 正しい水準に直接アクセス
levels(f2)

## [1] "Dec" "Apr" "Jan" "Mar"

# 12.3　総合的社会調査(15.3 General Social Survey)

# 総合的社会調査 http://gss.norc.org/
# 「総合的社会調査 - Wikipedia」 https://ja.wikipedia.org/wiki/%E7%B7%8F%E5%90%88%E7%9A%84%E7%A4%BE%E4%BC%9A%E8%AA%BF%E6%9F%BB

gss_cat

## # A tibble: 21,483 x 9
##     year       marital   age   race        rincome            partyid
##    <int>        <fctr> <int> <fctr>         <fctr>             <fctr>
##  1  2000 Never married    26  White  $8000 to 9999       Ind,near rep
##  2  2000      Divorced    48  White  $8000 to 9999 Not str republican
##  3  2000       Widowed    67  White Not applicable        Independent
##  4  2000 Never married    39  White Not applicable       Ind,near rep
##  5  2000      Divorced    25  White Not applicable   Not str democrat
##  6  2000       Married    25  White $20000 - 24999    Strong democrat
##  7  2000 Never married    36  White $25000 or more Not str republican
##  8  2000      Divorced    44  White  $7000 to 7999       Ind,near dem
##  9  2000       Married    44  White $25000 or more   Not str democrat
## 10  2000       Married    47  White $25000 or more  Strong republican
## # ... with 21,473 more rows, and 3 more variables: relig <fctr>,
## #   denom <fctr>, tvhours <int>

#?gss_cat

# 日本版もあるようだ
# 「JGSS | Center for Social Research and Data Archives (CSRDA)」 http://csrda.iss.u-tokyo.ac.jp/joint/jgss/

gss_cat %>%
  count(race)

## # A tibble: 3 x 2
##     race     n
##   <fctr> <int>
## 1  Other  1959
## 2  Black  3129
## 3  White 16395

gss_cat %>%
  count(year)

## # A tibble: 8 x 2
##    year     n
##   <int> <int>
## 1  2000  2817
## 2  2002  2765
## 3  2004  2812
## 4  2006  4510
## 5  2008  2023
## 6  2010  2044
## 7  2012  1974
## 8  2014  2538

ggplot(gss_cat, aes(race)) +
  geom_bar()

ggplot(gss_cat, aes(race)) +
  geom_bar() +
  scale_x_discrete(drop = FALSE)

# dplyrにはこのdropオプションがまだありませんが将来はできる予定です。
# →いつ？
#?dplyr
# 「A Grammar of Data Manipulation • dplyr」 http://dplyr.tidyverse.org/
# 「tidyverse/dplyr: Dplyr: A grammar of data manipulation」 https://github.com/tidyverse/dplyr



# 練習問題 p199 (15.3.1 Exercise)
# 1. rincome（申告所得）の分布を調べなさい。なぜデフォルトの棒グラフが理解しがたいのだろう
#  か。図を改善するにはどうするか。
rincome_plot <- ggplot(gss_cat, aes(rincome)) +
  geom_bar()
rincome_plot

rincome_plot +
  theme(axis.text.x = element_text(angle = 90))

rincome_plot +
  theme(axis.text.x = element_text(angle = -90))

rincome_plot +
  coord_flip()

# 2. 総合的社会調査で最も多いreligは何か。最も多いpartyidは何か。
gss_cat %>%
  count(relig) %>%
  arrange(-n) %>%
  head(1)

## # A tibble: 1 x 2
##        relig     n
##       <fctr> <int>
## 1 Protestant 10846

# count_(): nseの処理

# 8位がよく知らないものだった 
#  8 Inter-nondenominational   109

gss_cat %>%
  count(partyid) %>%
  arrange(-n) %>%
  head(1)

## # A tibble: 1 x 2
##       partyid     n
##        <fctr> <int>
## 1 Independent  4119

# 区分がいまいちわからず
gss_cat %>%
  count(partyid) %>%
  arrange(-n)

## # A tibble: 10 x 2
##               partyid     n
##                <fctr> <int>
##  1        Independent  4119
##  2   Not str democrat  3690
##  3    Strong democrat  3490
##  4 Not str republican  3032
##  5       Ind,near dem  2499
##  6  Strong republican  2314
##  7       Ind,near rep  1791
##  8        Other party   393
##  9          No answer   154
## 10         Don't know     1

# 3. どのreligにdenom（宗派）が適用されるか。表からどのように探し出すか。可視化してどう探すか。
levels(gss_cat$denom)

##  [1] "No answer"            "Don't know"           "No denomination"     
##  [4] "Other"                "Episcopal"            "Presbyterian-dk wh"  
##  [7] "Presbyterian, merged" "Other presbyterian"   "United pres ch in us"
## [10] "Presbyterian c in us" "Lutheran-dk which"    "Evangelical luth"    
## [13] "Other lutheran"       "Wi evan luth synod"   "Lutheran-mo synod"   
## [16] "Luth ch in america"   "Am lutheran"          "Methodist-dk which"  
## [19] "Other methodist"      "United methodist"     "Afr meth ep zion"    
## [22] "Afr meth episcopal"   "Baptist-dk which"     "Other baptists"      
## [25] "Southern baptist"     "Nat bapt conv usa"    "Nat bapt conv of am" 
## [28] "Am bapt ch in usa"    "Am baptist asso"      "Not applicable"

gss_cat %>%
  filter(!denom %in% c("No answer", "Other", "Don't know", "Not applicable",
                       "No denomination")) %>%
  count(relig)

## # A tibble: 1 x 2
##        relig     n
##       <fctr> <int>
## 1 Protestant  7025

gss_cat %>%
  count(relig, denom) %>%
  ggplot(aes(x = relig, y = denom, size = n)) +
  geom_point() +
  theme(axis.text.x = element_text(angle = 90))

# いまいちよくわからず→プロテスタントが多いということ

# 12.4　ファクタ順序の変更
relig <- gss_cat %>%
  group_by(relig) %>%
  summarize(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )
ggplot(relig, aes(tvhours, relig)) + geom_point()

ggplot(relig, aes(tvhours, fct_reorder(relig, tvhours))) +
  geom_point()

# オプションfun

relig %>%
  mutate(relig = fct_reorder(relig, tvhours)) %>%
  ggplot(aes(tvhours, relig)) +
  geom_point()

rincome <- gss_cat %>%
  group_by(rincome) %>%
  summarize(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )

ggplot(
  rincome,
  aes(age, fct_reorder(rincome, age))
) + geom_point()

ggplot(
  rincome,
  aes(age, fct_relevel(rincome, "Not applicable"))
) +
  geom_point()

# 本のまま（グラフがおかしい。propが整数）
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  group_by(age, marital) %>%
  count() %>%
  mutate(prop = n / sum(n))

# 修正版
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  group_by(age, marital) %>%
  count() %>%
  ungroup() %>%
  group_by(age) %>%
  mutate(prop = n / sum(n))

ggplot(by_age, aes(age, prop, color = marital)) +
  geom_line(na.rm = TRUE)

ggplot(
  by_age,
  aes(age, prop, color = fct_reorder2(marital, age, prop))
) +
  geom_line() +
  labs(color = "marital")

#?fct_reorder2
# ソース
fct_reorder

## function (f, x, fun = median, ..., .desc = FALSE) 
## {
##     f <- check_factor(f)
##     stopifnot(length(f) == length(x))
##     summary <- tapply(x, f, fun, ...)
##     if (!is.numeric(summary)) {
##         stop("`fun` must return a single number per group", call. = FALSE)
##     }
##     lvls_reorder(f, order(summary, decreasing = .desc))
## }
## <environment: namespace:forcats>

fct_reorder2

## function (f, x, y, fun = last2, ..., .desc = TRUE) 
## {
##     f <- check_factor(f)
##     stopifnot(length(f) == length(x), length(x) == length(y))
##     summary <- tapply(seq_along(x), f, function(i) fun(x[i], 
##         y[i], ...))
##     if (!is.numeric(summary)) {
##         stop("`fun` must return a single number per group", call. = FALSE)
##     }
##     lvls_reorder(f, order(summary, decreasing = .desc))
## }
## <environment: namespace:forcats>

last

## function (x, order_by = NULL, default = default_missing(x)) 
## {
##     nth(x, -1L, order_by = order_by, default = default)
## }
## <environment: namespace:dplyr>

#last2
forcats:::last2

## function (x, y) 
## {
##     y[order(x, na.last = FALSE)][length(y)]
## }
## <environment: namespace:forcats>

#?last
#??last
#??last2

# 「{forcats}パッケージでカテゴリカル変数(factor型データ)をいじってみる」 https://kazutan.github.io/kazutanR/forcats_test.html
# #他の2変数に従ってソートするようlevelsを再整列(fct_reorder2)

# fct_reorder2(f, x, y, fun = last2, ..., .desc = FALSE)：
#   他の変数を使って，levelsをソートします。fct_reorderとの違いは，ソートに利用する変数が2つだという点です。
#   もちろんlast2の代わりにmedianやmeanなどの関数を当てることも可能
# last2(x, y)：xの昇順に従ってyをソートし，それの一番下の要素を返す
# R4DSの本の例では、凡例をグラフの右端（ageがMax）のときのy軸（prop）順に並べたかったということ。

# 少し前にlast2がexportされるようなコミットがある。
# 「last2 referenced in fct_reorder2 but not exported · Issue #108 · tidyverse/forcats · GitHub」 https://github.com/tidyverse/forcats/issues/108

gss_cat %>%
  mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
  ggplot(aes(marital)) +
  geom_bar()

# 練習問題 p203 ()
# 1. tvhoursの高い値は怪しい。平均は要約として有用か。
summary(gss_cat[["tvhours"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   1.000   2.000   2.981   4.000  24.000   10146

gss_cat %>%
  filter(!is.na(tvhours)) %>%
  ggplot(aes(x = tvhours)) +
  geom_histogram(binwidth = 1)

# Whether the mean is the best summary depends on what you are using it for :-), i.e. your objective.
# But probably the median would be what most people prefer. And the hours of TV doesn’t look that surprising to me.

# 2. gss_catの各ファクタについて、水準の順序が任意か定まっているか調べなさい。
keep(gss_cat, is.factor) %>% names()

## [1] "marital" "race"    "rincome" "partyid" "relig"   "denom"

#?keep

levels(gss_cat[["marital"]])

## [1] "No answer"     "Never married" "Separated"     "Divorced"     
## [5] "Widowed"       "Married"

gss_cat %>%
  ggplot(aes(x = marital)) +
  geom_bar()

levels(gss_cat$race)

## [1] "Other"          "Black"          "White"          "Not applicable"

gss_cat %>%
  ggplot(aes(race)) +
  geom_bar(drop = FALSE)

## Warning: Ignoring unknown parameters: drop

levels(gss_cat$rincome)

##  [1] "No answer"      "Don't know"     "Refused"        "$25000 or more"
##  [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" 
##  [9] "$7000 to 7999"  "$6000 to 6999"  "$5000 to 5999"  "$4000 to 4999" 
## [13] "$3000 to 3999"  "$1000 to 2999"  "Lt $1000"       "Not applicable"

levels(gss_cat$relig)

##  [1] "No answer"               "Don't know"             
##  [3] "Inter-nondenominational" "Native american"        
##  [5] "Christian"               "Orthodox-christian"     
##  [7] "Moslem/islam"            "Other eastern"          
##  [9] "Hinduism"                "Buddhism"               
## [11] "Other"                   "None"                   
## [13] "Jewish"                  "Catholic"               
## [15] "Protestant"              "Not applicable"

gss_cat %>%
  ggplot(aes(relig)) +
  geom_bar() +
  coord_flip()

levels(gss_cat$denom)

##  [1] "No answer"            "Don't know"           "No denomination"     
##  [4] "Other"                "Episcopal"            "Presbyterian-dk wh"  
##  [7] "Presbyterian, merged" "Other presbyterian"   "United pres ch in us"
## [10] "Presbyterian c in us" "Lutheran-dk which"    "Evangelical luth"    
## [13] "Other lutheran"       "Wi evan luth synod"   "Lutheran-mo synod"   
## [16] "Luth ch in america"   "Am lutheran"          "Methodist-dk which"  
## [19] "Other methodist"      "United methodist"     "Afr meth ep zion"    
## [22] "Afr meth episcopal"   "Baptist-dk which"     "Other baptists"      
## [25] "Southern baptist"     "Nat bapt conv usa"    "Nat bapt conv of am" 
## [28] "Am bapt ch in usa"    "Am baptist asso"      "Not applicable"

levels(gss_cat$partyid)

##  [1] "No answer"          "Don't know"         "Other party"       
##  [4] "Strong republican"  "Not str republican" "Ind,near rep"      
##  [7] "Independent"        "Ind,near dem"       "Not str democrat"  
## [10] "Strong democrat"

# 3. "Not applicable"を水準の前に移動することがグラフの最下段に移すことになったのはなぜか。

# Because that gives the level “Not applicable” an integer value of 1.


# 12.5　ファクタ水準の変更 (15.5 Modifying factor levels)
gss_cat %>% count(partyid)

## # A tibble: 10 x 2
##               partyid     n
##                <fctr> <int>
##  1          No answer   154
##  2         Don't know     1
##  3        Other party   393
##  4  Strong republican  2314
##  5 Not str republican  3032
##  6       Ind,near rep  1791
##  7        Independent  4119
##  8       Ind,near dem  2499
##  9   Not str democrat  3690
## 10    Strong democrat  3490

# Independent: 無党派
gss_cat %>%
  mutate(partyid = fct_recode(partyid,
                              "Republican, strong" = "Strong republican",
                              "Republican, weak" = "Not str republican",
                              "Independent, near rep" = "Ind,near rep",
                              "Independent, near dem" = "Ind,near dem",
                              "Democrat, weak" = "Not str democrat",
                              "Democrat, strong" = "Strong democrat"
  )) %>%
  count(partyid)

## # A tibble: 10 x 2
##                  partyid     n
##                   <fctr> <int>
##  1             No answer   154
##  2            Don't know     1
##  3           Other party   393
##  4    Republican, strong  2314
##  5      Republican, weak  3032
##  6 Independent, near rep  1791
##  7           Independent  4119
##  8 Independent, near dem  2499
##  9        Democrat, weak  3690
## 10      Democrat, strong  3490

gss_cat %>%
  mutate(partyid = fct_recode(partyid,
                              "Republican, strong" = "Strong republican",
                              "Republican, weak" = "Not str republican",
                              "Independent, near rep" = "Ind,near rep",
                              "Independent, near dem" = "Ind,near dem",
                              "Democrat, weak" = "Not str democrat",
                              "Democrat, strong" = "Strong democrat",
                              "Other" = "No answer",
                              "Other" = "Don't know",
                              "Other" = "Other party"
  )) %>%
  count(partyid)

## # A tibble: 8 x 2
##                 partyid     n
##                  <fctr> <int>
## 1                 Other   548
## 2    Republican, strong  2314
## 3      Republican, weak  3032
## 4 Independent, near rep  1791
## 5           Independent  4119
## 6 Independent, near dem  2499
## 7        Democrat, weak  3690
## 8      Democrat, strong  3490

gss_cat %>%
  mutate(partyid = fct_collapse(partyid,
                                other = c("No answer", "Don't know", "Other party"),
                                rep = c("Strong republican", "Not str republican"),
                                ind = c("Ind,near rep", "Independent", "Ind,near dem"),
                                dem = c("Not str democrat", "Strong democrat")
  )) %>%
  count(partyid)

## # A tibble: 4 x 2
##   partyid     n
##    <fctr> <int>
## 1   other   548
## 2     rep  5346
## 3     ind  8409
## 4     dem  7180

gss_cat %>%
  mutate(relig = fct_lump(relig)) %>%
  count(relig)

## # A tibble: 2 x 2
##        relig     n
##       <fctr> <int>
## 1 Protestant 10846
## 2      Other 10637

gss_cat %>%
  mutate(relig = fct_lump(relig, n = 10)) %>%
  count(relig, sort = TRUE) %>%
  print(n = Inf)

## # A tibble: 10 x 2
##                      relig     n
##                     <fctr> <int>
##  1              Protestant 10846
##  2                Catholic  5124
##  3                    None  3523
##  4               Christian   689
##  5                   Other   458
##  6                  Jewish   388
##  7                Buddhism   147
##  8 Inter-nondenominational   109
##  9            Moslem/islam   104
## 10      Orthodox-christian    95

#?print


# 練習問題 p206
# 1. 民主党員、共和党員、独立党員と称する人の割合は時間とともにどう変化してきたか。
levels(gss_cat$partyid)

##  [1] "No answer"          "Don't know"         "Other party"       
##  [4] "Strong republican"  "Not str republican" "Ind,near rep"      
##  [7] "Independent"        "Ind,near dem"       "Not str democrat"  
## [10] "Strong democrat"

gss_cat %>%
  mutate(partyid =
           fct_collapse(partyid,
                        other = c("No answer", "Don't know", "Other party"),
                        rep = c("Strong republican", "Not str republican"),
                        ind = c("Ind,near rep", "Independent", "Ind,near dem"),
                        dem = c("Not str democrat", "Strong democrat"))) %>%
  count(year, partyid)  %>%
  group_by(year) %>%
  mutate(p = n / sum(n)) %>%
  ggplot(aes(x = year, y = p,
             colour = fct_reorder2(partyid, year, p))) +
  geom_point() +
  geom_line() +
  labs(colour = "Party ID.")

# 2. rincomeをより少数のカテゴリにまとめるにはどうするか。
levels(gss_cat$rincome)

##  [1] "No answer"      "Don't know"     "Refused"        "$25000 or more"
##  [5] "$20000 - 24999" "$15000 - 19999" "$10000 - 14999" "$8000 to 9999" 
##  [9] "$7000 to 7999"  "$6000 to 6999"  "$5000 to 5999"  "$4000 to 4999" 
## [13] "$3000 to 3999"  "$1000 to 2999"  "Lt $1000"       "Not applicable"

library("stringr")

gss_cat %>%
  mutate(rincome =
           fct_collapse(
             rincome,
             `Unknown` = c("No answer", "Don't know", "Refused", "Not applicable"),
             `Lt $5000` = c("Lt $1000", str_c("$", c("1000", "3000", "4000"),
                                              " to ", c("2999", "3999", "4999"))),
             `$5000 to 10000` = str_c("$", c("5000", "6000", "7000", "8000"),
                                      " to ", c("5999", "6999", "7999", "9999"))
           )) %>%
  ggplot(aes(x = rincome)) +
  geom_bar() +
  coord_flip()

# forcats内の関数一覧
ls("package:forcats")

##  [1] "%>%"             "as_factor"       "fct_anon"       
##  [4] "fct_c"           "fct_collapse"    "fct_count"      
##  [7] "fct_drop"        "fct_expand"      "fct_explicit_na"
## [10] "fct_infreq"      "fct_inorder"     "fct_lump"       
## [13] "fct_other"       "fct_recode"      "fct_relabel"    
## [16] "fct_relevel"     "fct_reorder"     "fct_reorder2"   
## [19] "fct_rev"         "fct_shift"       "fct_shuffle"    
## [22] "fct_unify"       "fct_unique"      "gss_cat"        
## [25] "lvls_expand"     "lvls_reorder"    "lvls_revalue"   
## [28] "lvls_union"

r4ds_ch12.R

toshiyuki.sakai

Tue Mar 13 17:38:48 2018