Rに組み込まれているTitanic
データでクロス表分析を行う.
ただし,Titanic
データはクロス表化されたデータ形式なので,これをepitools
パッケージのexpand.table
を使って,データをtidy化する.そのために,初回はinstall.packages("epitools")
でパッケージをインストールしておく(すでに入っていれば必要ない).
library(epitools)
data<-expand.table(Titanic)
head(data)
## Class Sex Age Survived
## 1 1st Male Child Yes
## 2 1st Male Child Yes
## 3 1st Male Child Yes
## 4 1st Male Child Yes
## 5 1st Male Child Yes
## 6 1st Male Adult No
等級を行として,生存を列としたクロス表を構成する.table
でクロス表をつくり,addmargins
で周辺度数を付け加える.
crossClassSurvived<-table(data$Class,data$Survived)
addmargins(crossClassSurvived)
##
## No Yes Sum
## 1st 122 203 325
## 2nd 167 118 285
## 3rd 528 178 706
## Crew 673 212 885
## Sum 1490 711 2201
行パーセントはprop.table
で計算する.
rowpClassSurvived<-prop.table(crossClassSurvived,margin = 1)
rowpClassSurvived<-cbind(rbind(rowpClassSurvived,
Sum=table(data$Survived)/sum(table(data$Survived))),
Sum=rep(1,nrow(rowpClassSurvived)+1))
rowpClassSurvived
## No Yes Sum
## 1st 0.3753846 0.6246154 1
## 2nd 0.5859649 0.4140351 1
## 3rd 0.7478754 0.2521246 1
## Crew 0.7604520 0.2395480 1
## Sum 0.6769650 0.3230350 1
帯グラフを作成する.
barplot(prop.table(table(data$Survived,data$Class),margin = 2),
col=c("darkgray","lightgreen"),horiz = TRUE, legend.text = TRUE,
args.legend = list(x ='bottomleft', bty='n'))
カイ二乗値はchisq.test
で算出できる(デフォルトでは2x2クロス表の時,「イェーツの補正」を勝手にするので,correct = FALSE
でオフにしておく.イェーツの補正は検定に関係する).
chisq.test(crossClassSurvived,correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: crossClassSurvived
## X-squared = 190.4, df = 3, p-value < 2.2e-16
Cramer’s Vを計算するための関数を定義して計算する.
CramerV<- function(x) {## x should be a contingency table
chisq<-unname(chisq.test(x,correct = FALSE)$statistic)
sqrt(chisq/(sum(x)*(min(ncol(x),nrow(x))-1)))
}
CramerV(crossClassSurvived)
## [1] 0.2941201
あるいは,vcd
パッケージのassociates
関数であれば,いくつかの関連指標をまとめて算出してくれる.
library(vcd)
## Loading required package: grid
##
## Attaching package: 'vcd'
## The following object is masked from 'package:epitools':
##
## oddsratio
assocstats(crossClassSurvived)
## X^2 df P(> X^2)
## Likelihood Ratio 180.9 3 0
## Pearson 190.4 3 0
##
## Phi-Coefficient : NA
## Contingency Coeff.: 0.282
## Cramer's V : 0.294
crossSexSurvived<-table(data$Sex,data$Survived)
addmargins(crossSexSurvived)
##
## No Yes Sum
## Male 1364 367 1731
## Female 126 344 470
## Sum 1490 711 2201
rowpSexSurvived<-prop.table(crossSexSurvived,margin = 1)
rowpSexSurvived<-cbind(rbind(rowpSexSurvived,
Sum=table(data$Survived)/sum(table(data$Survived))),
Sum=rep(1,nrow(rowpSexSurvived)+1))
rowpSexSurvived
## No Yes Sum
## Male 0.7879838 0.2120162 1
## Female 0.2680851 0.7319149 1
## Sum 0.6769650 0.3230350 1
barplot(prop.table(table(data$Survived,data$Sex),margin = 2),
col=c("darkgray","lightgreen"),horiz = TRUE, legend.text = TRUE,
args.legend = list(x ='bottomleft', bty='n'))
### chi square test
chisq.test(crossSexSurvived,correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: crossSexSurvived
## X-squared = 456.87, df = 1, p-value < 2.2e-16
### Cramer's V
CramerV(crossSexSurvived)
## [1] 0.4556048
ファイ係数を計算するための関数を定義して計算する.
Phi<- function (x) {## x should be a 2x2 contingency table
numer<-x[1,1]*x[2,2]-x[1,2]*x[2,1]
m1<-log(sum(x[1,1:2]))
m2<-log(sum(x[2,1:2]))
m3<-log(sum(x[1:2,1]))
m4<-log(sum(x[1:2,2]))
denom<-sqrt(exp(m1+m2+m3+m4))
numer/denom
}
Phi(crossSexSurvived)
## [1] 0.4556048
相対リスクとオッズ比も計算する.
RR<- function (x) {## x should be a 2x2 contingency table
p1<-x[1,1]/sum(x[1,1:2])
p2<-x[2,1]/sum(x[2,1:2])
p1/p2
}
RR(crossSexSurvived)
## [1] 2.939305
OR<- function (x) {## x should be a 2x2 contingency table
odds1<-x[1,1]/x[1,2]
odds2<-x[2,1]/x[2,2]
odds1/odds2
}
OR(crossSexSurvived)
## [1] 10.14697
### log odds ratio
log(OR(crossSexSurvived))
## [1] 2.317175
data$Age<-factor(data$Age,levels=c("Adult","Child"))
crossAgeSurvived<-table(data$Age,data$Survived)
addmargins(crossAgeSurvived)
##
## No Yes Sum
## Adult 1438 654 2092
## Child 52 57 109
## Sum 1490 711 2201
rowpAgeSurvived<-prop.table(crossAgeSurvived,margin = 1)
rowpAgeSurvived<-cbind(rbind(rowpAgeSurvived,
Sum=table(data$Survived)/sum(table(data$Survived))),
Sum=rep(1,nrow(rowpAgeSurvived)+1))
rowpAgeSurvived
## No Yes Sum
## Adult 0.6873805 0.3126195 1
## Child 0.4770642 0.5229358 1
## Sum 0.6769650 0.3230350 1
barplot(prop.table(table(data$Survived,data$Age),margin = 2),
col=c("darkgray","lightgreen"),horiz = TRUE, legend.text = TRUE,
args.legend = list(x ='bottomleft', bty='n'))
### chi square test
chisq.test(crossAgeSurvived,correct = FALSE)
##
## Pearson's Chi-squared test
##
## data: crossAgeSurvived
## X-squared = 20.956, df = 1, p-value = 4.701e-06
### Cramer's V
CramerV(crossAgeSurvived)
## [1] 0.09757511
### phi coefficient
Phi(crossAgeSurvived)
## [1] 0.09757511
### relative risk
RR(crossAgeSurvived)
## [1] 1.440855
### odds ratio
OR(crossAgeSurvived)
## [1] 2.410198
### log odds ratio
log(OR(crossAgeSurvived))
## [1] 0.8797087