Rに組み込まれているTitanicデータでクロス表分析を行う.

ただし,Titanicデータはクロス表化されたデータ形式なので,これをepitoolsパッケージのexpand.tableを使って,データをtidy化する.そのために,初回はinstall.packages("epitools")でパッケージをインストールしておく(すでに入っていれば必要ない).

library(epitools)
data<-expand.table(Titanic)
head(data)
##   Class  Sex   Age Survived
## 1   1st Male Child      Yes
## 2   1st Male Child      Yes
## 3   1st Male Child      Yes
## 4   1st Male Child      Yes
## 5   1st Male Child      Yes
## 6   1st Male Adult       No

等級×生存

等級を行として,生存を列としたクロス表を構成する.tableでクロス表をつくり,addmarginsで周辺度数を付け加える.

crossClassSurvived<-table(data$Class,data$Survived)
addmargins(crossClassSurvived)
##       
##          No  Yes  Sum
##   1st   122  203  325
##   2nd   167  118  285
##   3rd   528  178  706
##   Crew  673  212  885
##   Sum  1490  711 2201

行パーセントはprop.tableで計算する.

rowpClassSurvived<-prop.table(crossClassSurvived,margin = 1)
rowpClassSurvived<-cbind(rbind(rowpClassSurvived,
                               Sum=table(data$Survived)/sum(table(data$Survived))),
                  Sum=rep(1,nrow(rowpClassSurvived)+1))
rowpClassSurvived
##             No       Yes Sum
## 1st  0.3753846 0.6246154   1
## 2nd  0.5859649 0.4140351   1
## 3rd  0.7478754 0.2521246   1
## Crew 0.7604520 0.2395480   1
## Sum  0.6769650 0.3230350   1

帯グラフを作成する.

barplot(prop.table(table(data$Survived,data$Class),margin = 2),
        col=c("darkgray","lightgreen"),horiz = TRUE, legend.text = TRUE,
        args.legend = list(x ='bottomleft', bty='n'))

カイ二乗値はchisq.testで算出できる(デフォルトでは2x2クロス表の時,「イェーツの補正」を勝手にするので,correct = FALSEでオフにしておく.イェーツの補正は検定に関係する).

chisq.test(crossClassSurvived,correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  crossClassSurvived
## X-squared = 190.4, df = 3, p-value < 2.2e-16

Cramer’s Vを計算するための関数を定義して計算する.

CramerV<- function(x) {## x should be a contingency table
  chisq<-unname(chisq.test(x,correct = FALSE)$statistic)
  sqrt(chisq/(sum(x)*(min(ncol(x),nrow(x))-1)))
}
CramerV(crossClassSurvived)
## [1] 0.2941201

あるいは,vcdパッケージのassociates関数であれば,いくつかの関連指標をまとめて算出してくれる.

library(vcd)
## Loading required package: grid
## 
## Attaching package: 'vcd'
## The following object is masked from 'package:epitools':
## 
##     oddsratio
assocstats(crossClassSurvived)
##                    X^2 df P(> X^2)
## Likelihood Ratio 180.9  3        0
## Pearson          190.4  3        0
## 
## Phi-Coefficient   : NA 
## Contingency Coeff.: 0.282 
## Cramer's V        : 0.294

性別×生存

crossSexSurvived<-table(data$Sex,data$Survived)
addmargins(crossSexSurvived)
##         
##            No  Yes  Sum
##   Male   1364  367 1731
##   Female  126  344  470
##   Sum    1490  711 2201
rowpSexSurvived<-prop.table(crossSexSurvived,margin = 1)
rowpSexSurvived<-cbind(rbind(rowpSexSurvived,
                               Sum=table(data$Survived)/sum(table(data$Survived))),
                         Sum=rep(1,nrow(rowpSexSurvived)+1))
rowpSexSurvived
##               No       Yes Sum
## Male   0.7879838 0.2120162   1
## Female 0.2680851 0.7319149   1
## Sum    0.6769650 0.3230350   1
barplot(prop.table(table(data$Survived,data$Sex),margin = 2),
        col=c("darkgray","lightgreen"),horiz = TRUE, legend.text = TRUE,
        args.legend = list(x ='bottomleft', bty='n'))

### chi square test
chisq.test(crossSexSurvived,correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  crossSexSurvived
## X-squared = 456.87, df = 1, p-value < 2.2e-16
### Cramer's V
CramerV(crossSexSurvived)
## [1] 0.4556048

ファイ係数を計算するための関数を定義して計算する.

Phi<- function (x) {## x should be a 2x2 contingency table
  numer<-x[1,1]*x[2,2]-x[1,2]*x[2,1]
  m1<-log(sum(x[1,1:2]))
  m2<-log(sum(x[2,1:2]))
  m3<-log(sum(x[1:2,1]))
  m4<-log(sum(x[1:2,2]))
  denom<-sqrt(exp(m1+m2+m3+m4))
  numer/denom
}
Phi(crossSexSurvived)
## [1] 0.4556048

相対リスクとオッズ比も計算する.

RR<- function (x) {## x should be a 2x2 contingency table
  p1<-x[1,1]/sum(x[1,1:2])
  p2<-x[2,1]/sum(x[2,1:2])
  p1/p2
}
RR(crossSexSurvived)
## [1] 2.939305
OR<- function (x) {## x should be a 2x2 contingency table
  odds1<-x[1,1]/x[1,2]
  odds2<-x[2,1]/x[2,2]
  odds1/odds2
}
OR(crossSexSurvived)
## [1] 10.14697
### log odds ratio
log(OR(crossSexSurvived))
## [1] 2.317175

年齢×生存

data$Age<-factor(data$Age,levels=c("Adult","Child"))
crossAgeSurvived<-table(data$Age,data$Survived)
addmargins(crossAgeSurvived)
##        
##           No  Yes  Sum
##   Adult 1438  654 2092
##   Child   52   57  109
##   Sum   1490  711 2201
rowpAgeSurvived<-prop.table(crossAgeSurvived,margin = 1)
rowpAgeSurvived<-cbind(rbind(rowpAgeSurvived,
                             Sum=table(data$Survived)/sum(table(data$Survived))),
                       Sum=rep(1,nrow(rowpAgeSurvived)+1))
rowpAgeSurvived
##              No       Yes Sum
## Adult 0.6873805 0.3126195   1
## Child 0.4770642 0.5229358   1
## Sum   0.6769650 0.3230350   1
barplot(prop.table(table(data$Survived,data$Age),margin = 2),
        col=c("darkgray","lightgreen"),horiz = TRUE, legend.text = TRUE,
        args.legend = list(x ='bottomleft', bty='n'))

### chi square test
chisq.test(crossAgeSurvived,correct = FALSE)
## 
##  Pearson's Chi-squared test
## 
## data:  crossAgeSurvived
## X-squared = 20.956, df = 1, p-value = 4.701e-06
### Cramer's V
CramerV(crossAgeSurvived)
## [1] 0.09757511
### phi coefficient
Phi(crossAgeSurvived)
## [1] 0.09757511
### relative risk
RR(crossAgeSurvived)
## [1] 1.440855
### odds ratio
OR(crossAgeSurvived)
## [1] 2.410198
### log odds ratio
log(OR(crossAgeSurvived))
## [1] 0.8797087