library(car)
library(vcdExtra)
## Loading required package: vcd
## Loading required package: grid
## Loading required package: gnm
## 
## Attaching package: 'vcdExtra'
## The following object is masked from 'package:car':
## 
##     Burt
library(FactoMineR)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter():    dplyr, stats
## lag():       dplyr, stats
## recode():    dplyr, car
## some():      purrr, car
## summarise(): dplyr, vcdExtra

Package car(John Fox 先生の“Companion for Applied Regression”のパッケージ)についているデータをload

data("Womenlf")
Womenlf %>% tbl_df()
## # A tibble: 263 x 4
##      partic hincome children  region
##  *   <fctr>   <int>   <fctr>  <fctr>
##  1 not.work      15  present Ontario
##  2 not.work      13  present Ontario
##  3 not.work      45  present Ontario
##  4 not.work      23  present Ontario
##  5 not.work      19  present Ontario
##  6 not.work       7  present Ontario
##  7 not.work      15  present Ontario
##  8 fulltime       7  present Ontario
##  9 not.work      15  present Ontario
## 10 not.work      23  present Ontario
## # ... with 253 more rows
str(Womenlf)
## 'data.frame':    263 obs. of  4 variables:
##  $ partic  : Factor w/ 3 levels "fulltime","not.work",..: 2 2 2 2 2 2 2 1 2 2 ...
##  $ hincome : int  15 13 45 23 19 7 15 7 15 23 ...
##  $ children: Factor w/ 2 levels "absent","present": 2 2 2 2 2 2 2 2 2 2 ...
##  $ region  : Factor w/ 5 levels "Atlantic","BC",..: 3 3 3 3 3 3 3 3 3 3 ...

このデータは、263 行(ケース)、4 列(変数)のデータフレーム data の詳細は、

?Womenlf

である。

変数名を確認する

colnames(Womenlf)
## [1] "partic"   "hincome"  "children" "region"

partic (勤務形態:フルタイム、仕事してない、パートタイム Labour-Force Participation. A factor with levels (note: out of order): fulltime, Working full-time; not.work, Not working outside the home; parttime, Working part-time.

hincome 夫の収入 単位$1000 Husband’s income, $1000s.

children 子供:いない、いる Presence of children in the household. A factor with levels: absent, present.

region 居住地域: A factor with levels: Atlantic, Atlantic Canada; BC, British Columbia; Ontario; Prairie, Prairie provinces; Quebec.

クロス表をつくる

  • table(行変数,列変数) とすることで、行変数と列変数のクロス表がつくられる。
  • ?table でhelp
  • exclude=NULL を指定しないと NAが除去された集計となる。
table(Womenlf$partic,Womenlf$children)
##           
##            absent present
##   fulltime     46      20
##   not.work     26     129
##   parttime      7      35

別の表記

with(Womenlf, table(partic,children))
##           children
## partic     absent present
##   fulltime     46      20
##   not.work     26     129
##   parttime      7      35

xtabs を使う

xtabs(~partic + children, data=Womenlf)
##           children
## partic     absent present
##   fulltime     46      20
##   not.work     26     129
##   parttime      7      35
# カイ二乗検定を行う。chisq.test(

)

chisq.test に投げ込めばよい

chisq.test(table(Womenlf$partic,Womenlf$children))
## 
##  Pearson's Chi-squared test
## 
## data:  table(Womenlf$partic, Womenlf$children)
## X-squared = 65.945, df = 2, p-value = 4.788e-15

なにか変数に付値して、それを投げ込んでもよい。

.tbl <- xtabs(~partic + children, data=Womenlf)
chisq.test(.tbl)
## 
##  Pearson's Chi-squared test
## 
## data:  .tbl
## X-squared = 65.945, df = 2, p-value = 4.788e-15
mosaic(.tbl,main = "pattic vs children")

## ピアソン残差を色分けする。

mosaic(.tbl,main = "pattic vs children",shade=TRUE)

mosaicplot も使える

ただし、表での行と列の位置がひっくり返っている。使うならvcd::mosaic()を。

mosaic() で日本語使う方法は、slidshareを参照してください。

mosaicplot(.tbl)

データの属性を確認する

summary(Womenlf)
##       partic       hincome         children        region   
##  fulltime: 66   Min.   : 1.00   absent : 79   Atlantic: 30  
##  not.work:155   1st Qu.:10.00   present:184   BC      : 29  
##  parttime: 42   Median :14.00                 Ontario :108  
##                 Mean   :14.76                 Prairie : 31  
##                 3rd Qu.:19.00                 Quebec  : 65  
##                 Max.   :45.00
str(Womenlf)
## 'data.frame':    263 obs. of  4 variables:
##  $ partic  : Factor w/ 3 levels "fulltime","not.work",..: 2 2 2 2 2 2 2 1 2 2 ...
##  $ hincome : int  15 13 45 23 19 7 15 7 15 23 ...
##  $ children: Factor w/ 2 levels "absent","present": 2 2 2 2 2 2 2 2 2 2 ...
##  $ region  : Factor w/ 5 levels "Atlantic","BC",..: 3 3 3 3 3 3 3 3 3 3 ...

hincome は整数、それ以外は、因子(factor)。なので、この三つの変数を使ってMCAをやってみる。

res.MCA <- MCA(Womenlf[,c(1,3,4)])

col.var = で 変数(var)を色分けする。

plot.MCA(res.MCA,invisible = "ind",col.var = c(rep(1,3),rep(3,2),rep(4,5)))

res.MCA <- MCA(Womenlf,quanti.sup = 2)

#explor::explor(res.MCA)
res <- explor::prepare_results(res.MCA)
explor::MCA_ind_plot(res, xax = 1, yax = 2,ind_sup = FALSE,
    lab_var = NULL, , ind_lab_min_contrib = 0,
    col_var = "partic", labels_size = 9,
    point_opacity = 0.5, opacity_var = NULL, point_size = 64,
    ellipses = TRUE, transitions = TRUE, labels_positions = NULL,
    xlim = c(-1.43, 2.14), ylim = c(-1.16, 2.41))
res <- explor::prepare_results(res.MCA)
explor::MCA_ind_plot(res, xax = 1, yax = 2,ind_sup = FALSE,
    lab_var = NULL, , ind_lab_min_contrib = 0,
    col_var = "children", labels_size = 9,
    point_opacity = 0.5, opacity_var = NULL, point_size = 64,
    ellipses = TRUE, transitions = TRUE, labels_positions = NULL,
    xlim = c(-1.39, 2.17), ylim = c(-1.55, 2.01))

クラスタリング

res.HCPC <- HCPC(res.MCA,nb.clust = 6)

plot.HCPC(res.HCPC)

#plot.HCPC(res.HCPC,choice = "tree")