Constructing tables with R (Japanese ver)
## Settings for RMarkdown: http://yihui.name/knitr/options#chunk_options
## Chunk defaults: empty output prefix, warnings and messages hidden, code kept
## as written (tidy = FALSE) and echoed, figure size 10 x 8.
## opts_chunk is qualified with knitr:: so the call also works when knitr has
## not been attached with library(knitr).
knitr::opts_chunk$set(
  comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
  echo = TRUE, fig.width = 10, fig.height = 8
)
## Wider printed output; scipen = 10 biases number printing toward fixed
## (non-scientific) notation.
options(width = 116, scipen = 10)
# 使い方
# この記載をすべてコピーして手元のRのエディタに貼付ける。
# Windows版では左上 [ファイル] → [新しいスクリプト]でエディタ。
# Mac版では真っ白い紙の形のアイコンでエディタ。
# 実行する行、あるいは範囲を選択して
# WindowsであればCtrl+R、MacであればCommand+Returnで実行。
# Introductory Statistics with R 2nd ed. (Peter Dalgaard)のデータを使っています。
# 1版は和訳 Rによる医療統計学 (岡田 昌史) もあります。
# 表の作り方のデモンストレーション。
# 参考資料: http://www.statmethods.net/stats/frequencies.html
# Install the data package that accompanies "Introductory Statistics with R"
# (ISwR) automatically when it is not available yet.
# requireNamespace() returns FALSE (rather than raising an error) when the
# package cannot be loaded; negating it with ! triggers the installation.
# This avoids scanning every installed package with installed.packages().
if (!requireNamespace("ISwR", quietly = TRUE)) {
  # Argument name spelled out in full (no partial matching of `dep`).
  install.packages("ISwR", dependencies = TRUE)
}
# Attach the ISwR data package.
library(ISwR)
# Show the list of datasets shipped with ISwR.
data(package = "ISwR")
# The examples use cerebrovascular disease (stroke) data from the city of
# Tartu, Estonia.
# J. Korv, M. Roose, and A.E. Kaasik (1997). Stroke Registry of Tartu, Estonia, from 1991 through 1993. Cerebrovascular Disorders 7:154–162.
# http://content.karger.com/ProdukteDB/produkte.asp?Aktion=ShowAbstract&ArtikelNr=108182&Ausgabe=232923&ProduktNr=224153
data("stroke")
# The description of the dataset is available on its help page
# (?stroke is shorthand for the call below).
help("stroke")
# Look at the first 10 rows. head() peeks at the beginning of the data.
head(stroke, n = 10L)
sex died dstr age dgn coma diab minf han dead obsmonths
1 Male 1991-01-07 1991-01-02 76 INF No No Yes No TRUE 0.16340
2 Male <NA> 1991-01-03 58 INF No No No No FALSE 59.60784
3 Male 1991-06-02 1991-01-08 74 INF No No Yes Yes TRUE 4.73856
4 Female 1991-01-13 1991-01-11 77 ICH No Yes No Yes TRUE 0.06536
5 Female <NA> 1991-01-13 76 INF No Yes No Yes FALSE 59.28105
6 Male 1991-01-13 1991-01-13 48 ICH Yes No No Yes TRUE 0.10000
7 Female 1993-12-01 1991-01-14 81 INF No No No Yes TRUE 34.37908
8 Male 1991-12-12 1991-01-14 53 INF No No Yes Yes TRUE 10.84967
9 Female <NA> 1991-01-15 73 ID No No No Yes FALSE 59.21569
10 Female 1993-11-10 1991-01-15 69 INF No No No Yes TRUE 33.66013
# Inspect the data structure.
class(stroke) ## class() reports the kind of object: here a data.frame.
[1] "data.frame"
str(stroke) ## str() (STRucture) shows the internal structure: the name and type of every column.
'data.frame': 829 obs. of 11 variables:
$ sex : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 1 2 1 1 ...
$ died : Date, format: "1991-01-07" NA "1991-06-02" "1991-01-13" ...
$ dstr : Date, format: "1991-01-02" "1991-01-03" "1991-01-08" "1991-01-11" ...
$ age : int 76 58 74 77 76 48 81 53 73 69 ...
$ dgn : Factor w/ 4 levels "ICH","ID","INF",..: 3 3 3 1 3 1 3 3 2 3 ...
$ coma : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
$ diab : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 1 1 1 1 ...
$ minf : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 2 1 1 ...
$ han : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 2 2 2 2 ...
$ dead : logi TRUE FALSE TRUE TRUE FALSE TRUE ...
$ obsmonths: num 0.1634 59.6078 4.7386 0.0654 59.281 ...
### Factorがカテゴリー変数。Dateは日付。logiはTRUE or FALSE。intは整数。numは連続変数。
# Extract only the diagnosis column.
stroke$dgn ## Means: take the dgn component of the stroke data.
[1] INF INF INF ICH INF ICH INF INF ID INF ID INF INF INF INF INF INF ID ICH INF INF INF INF ICH INF INF ID
[28] ID INF INF INF INF ID INF INF INF ICH INF INF INF INF INF INF INF INF INF INF SAH ICH INF INF INF INF INF
[55] ICH ICH INF INF ID INF ID INF INF ID ICH INF INF INF INF INF INF ID ID INF INF INF INF INF ICH ICH SAH
[82] INF INF INF INF INF INF ID INF ID INF ID INF INF ID INF INF INF INF INF INF INF INF INF ID INF INF INF
[109] SAH INF INF INF INF INF INF INF INF INF SAH INF INF INF INF INF ICH INF ID ICH INF ICH INF SAH INF ID INF
[136] ICH INF ID INF INF SAH INF INF ID INF INF INF INF INF ID INF INF INF INF INF INF INF INF INF INF INF INF
[163] INF ICH SAH INF ICH INF ID INF INF INF INF INF INF INF INF SAH ID INF INF ID INF INF ID SAH ICH INF SAH
[190] INF ID ICH ID INF INF ID ID INF INF INF ID INF ID ID INF INF INF INF INF INF INF INF INF INF SAH INF
[217] INF INF ID INF SAH ID INF ID INF INF ID INF INF INF INF ID ID ICH INF INF INF INF INF INF ID ICH INF
[244] ICH ICH INF INF INF ID INF INF INF INF INF ID INF SAH ID INF SAH ICH INF INF INF INF INF INF INF INF INF
[271] ID ID INF ID ID ICH ID INF INF SAH ID SAH ICH INF ID SAH ID ID INF SAH SAH ID INF SAH ICH ID INF
[298] INF ICH INF ID INF INF ID INF ID INF INF INF INF INF ID ID INF ID ID ID INF INF INF ID INF INF INF
[325] INF INF INF SAH INF INF INF INF INF ID ID ID SAH INF INF INF INF ID INF INF INF INF INF INF INF ID ID
[352] INF INF INF INF INF ID INF INF INF ID ICH INF INF ID INF INF ICH INF INF ID ID INF INF ICH ICH ID INF
[379] INF INF ID INF ID ICH ID ID SAH INF ICH ID ID INF INF ICH ID INF INF ID INF INF INF ID ID ICH INF
[406] ICH INF ICH ICH INF ID INF INF INF ID ID INF INF ID SAH INF ID INF INF INF INF INF INF INF ID ID ID
[433] ID INF ID INF ID INF INF ICH ID ID ID ID SAH ID ID INF ID INF SAH SAH INF ID ID INF INF ID INF
[460] ID INF SAH INF INF INF INF ID ICH ID ID ID ID INF ID INF INF INF INF SAH INF INF INF INF INF INF SAH
[487] ID INF INF INF ICH INF INF ID INF SAH INF ID ID INF ID INF INF ID ID INF INF INF INF INF ID ID INF
[514] ID SAH INF INF INF INF ID ID INF INF INF INF INF SAH INF ID ICH INF ID INF ID INF INF INF INF SAH INF
[541] INF ID INF INF ID INF ID INF SAH ID INF ID INF SAH INF INF ID ID ICH INF ID INF ID INF ICH INF ID
[568] INF ID ID INF ICH INF ID ICH ICH ID INF INF INF INF INF INF INF ID ICH ICH INF ID INF INF INF INF INF
[595] INF INF ID INF ICH ICH INF SAH INF ID INF ICH INF ID ID ID SAH ID ID ID INF INF INF ID INF INF INF
[622] INF INF ID INF INF INF ID INF INF INF INF ID ID INF ID INF ID INF INF ID ICH ICH INF INF INF INF ICH
[649] ID ID ID INF INF ICH INF SAH INF INF INF INF ID INF INF INF INF ID INF ID ICH ICH ID INF INF INF ID
[676] ID ID SAH ID ICH ICH INF ICH INF INF ICH INF INF INF INF INF ID INF INF INF INF INF INF ICH INF ICH INF
[703] ID ICH INF INF ID INF INF INF INF INF INF ID ID INF ICH INF ID INF ID INF ID INF INF ICH INF ICH SAH
[730] INF INF INF SAH INF INF INF INF ID INF ID SAH INF INF ID INF INF INF ICH ICH ICH INF INF ID INF ID INF
[757] INF ID ID ID ID SAH INF ID ID ID INF INF INF INF ICH INF ID ICH INF INF INF INF ID INF INF ID ID
[784] ID INF INF ID ID INF INF INF ID INF ID ICH INF SAH INF ID SAH INF INF INF INF INF ICH ICH ICH INF ICH
[811] INF ID SAH INF INF INF INF INF ID INF INF INF INF INF INF INF INF INF INF
Levels: ICH ID INF SAH
# Alternatively:
stroke[, "dgn"] ## Means: take the dgn column of the stroke data.
[1] INF INF INF ICH INF ICH INF INF ID INF ID INF INF INF INF INF INF ID ICH INF INF INF INF ICH INF INF ID
[28] ID INF INF INF INF ID INF INF INF ICH INF INF INF INF INF INF INF INF INF INF SAH ICH INF INF INF INF INF
[55] ICH ICH INF INF ID INF ID INF INF ID ICH INF INF INF INF INF INF ID ID INF INF INF INF INF ICH ICH SAH
[82] INF INF INF INF INF INF ID INF ID INF ID INF INF ID INF INF INF INF INF INF INF INF INF ID INF INF INF
[109] SAH INF INF INF INF INF INF INF INF INF SAH INF INF INF INF INF ICH INF ID ICH INF ICH INF SAH INF ID INF
[136] ICH INF ID INF INF SAH INF INF ID INF INF INF INF INF ID INF INF INF INF INF INF INF INF INF INF INF INF
[163] INF ICH SAH INF ICH INF ID INF INF INF INF INF INF INF INF SAH ID INF INF ID INF INF ID SAH ICH INF SAH
[190] INF ID ICH ID INF INF ID ID INF INF INF ID INF ID ID INF INF INF INF INF INF INF INF INF INF SAH INF
[217] INF INF ID INF SAH ID INF ID INF INF ID INF INF INF INF ID ID ICH INF INF INF INF INF INF ID ICH INF
[244] ICH ICH INF INF INF ID INF INF INF INF INF ID INF SAH ID INF SAH ICH INF INF INF INF INF INF INF INF INF
[271] ID ID INF ID ID ICH ID INF INF SAH ID SAH ICH INF ID SAH ID ID INF SAH SAH ID INF SAH ICH ID INF
[298] INF ICH INF ID INF INF ID INF ID INF INF INF INF INF ID ID INF ID ID ID INF INF INF ID INF INF INF
[325] INF INF INF SAH INF INF INF INF INF ID ID ID SAH INF INF INF INF ID INF INF INF INF INF INF INF ID ID
[352] INF INF INF INF INF ID INF INF INF ID ICH INF INF ID INF INF ICH INF INF ID ID INF INF ICH ICH ID INF
[379] INF INF ID INF ID ICH ID ID SAH INF ICH ID ID INF INF ICH ID INF INF ID INF INF INF ID ID ICH INF
[406] ICH INF ICH ICH INF ID INF INF INF ID ID INF INF ID SAH INF ID INF INF INF INF INF INF INF ID ID ID
[433] ID INF ID INF ID INF INF ICH ID ID ID ID SAH ID ID INF ID INF SAH SAH INF ID ID INF INF ID INF
[460] ID INF SAH INF INF INF INF ID ICH ID ID ID ID INF ID INF INF INF INF SAH INF INF INF INF INF INF SAH
[487] ID INF INF INF ICH INF INF ID INF SAH INF ID ID INF ID INF INF ID ID INF INF INF INF INF ID ID INF
[514] ID SAH INF INF INF INF ID ID INF INF INF INF INF SAH INF ID ICH INF ID INF ID INF INF INF INF SAH INF
[541] INF ID INF INF ID INF ID INF SAH ID INF ID INF SAH INF INF ID ID ICH INF ID INF ID INF ICH INF ID
[568] INF ID ID INF ICH INF ID ICH ICH ID INF INF INF INF INF INF INF ID ICH ICH INF ID INF INF INF INF INF
[595] INF INF ID INF ICH ICH INF SAH INF ID INF ICH INF ID ID ID SAH ID ID ID INF INF INF ID INF INF INF
[622] INF INF ID INF INF INF ID INF INF INF INF ID ID INF ID INF ID INF INF ID ICH ICH INF INF INF INF ICH
[649] ID ID ID INF INF ICH INF SAH INF INF INF INF ID INF INF INF INF ID INF ID ICH ICH ID INF INF INF ID
[676] ID ID SAH ID ICH ICH INF ICH INF INF ICH INF INF INF INF INF ID INF INF INF INF INF INF ICH INF ICH INF
[703] ID ICH INF INF ID INF INF INF INF INF INF ID ID INF ICH INF ID INF ID INF ID INF INF ICH INF ICH SAH
[730] INF INF INF SAH INF INF INF INF ID INF ID SAH INF INF ID INF INF INF ICH ICH ICH INF INF ID INF ID INF
[757] INF ID ID ID ID SAH INF ID ID ID INF INF INF INF ICH INF ID ICH INF INF INF INF ID INF INF ID ID
[784] ID INF INF ID ID INF INF INF ID INF ID ICH INF SAH INF ID SAH INF INF INF INF INF ICH ICH ICH INF ICH
[811] INF ID SAH INF INF INF INF INF ID INF INF INF INF INF INF INF INF INF INF
Levels: ICH ID INF SAH
# Tabulate with table().
table(stroke$dgn) ## Number of cases per diagnosis. INF: infarction, ID: indeterminate.
ICH ID INF SAH
79 202 501 47
table(stroke$sex) ## Count females and males.
Female Male
510 319
addmargins(table(stroke$sex)) ## Wrap the table in addmargins() (ADD MARGINS) when a total is wanted.
Female Male Sum
510 319 829
# For a two-variable table, pass only the two required columns to table().
table(stroke[, c("sex", "dgn")])
dgn
sex ICH ID INF SAH
Female 48 140 295 27
Male 31 62 206 20
# The same two-variable table with row and column totals added.
addmargins(table(stroke[, c("sex", "dgn")]))
dgn
sex ICH ID INF SAH Sum
Female 48 140 295 27 510
Male 31 62 206 20 319
Sum 79 202 501 47 829
# With three variables the result looks like this; it is a little hard to read.
table(stroke[, c("sex", "dgn", "dead")])
, , dead = FALSE
dgn
sex ICH ID INF SAH
Female 15 37 123 14
Male 10 17 116 12
, , dead = TRUE
dgn
sex ICH ID INF SAH
Female 33 103 172 13
Male 21 45 90 8
# The three-variable table with totals added along every dimension.
addmargins(table(stroke[, c("sex", "dgn", "dead")]))
, , dead = FALSE
dgn
sex ICH ID INF SAH Sum
Female 15 37 123 14 189
Male 10 17 116 12 155
Sum 25 54 239 26 344
, , dead = TRUE
dgn
sex ICH ID INF SAH Sum
Female 33 103 172 13 321
Male 21 45 90 8 164
Sum 54 148 262 21 485
, , dead = Sum
dgn
sex ICH ID INF SAH Sum
Female 48 140 295 27 510
Male 31 62 206 20 319
Sum 79 202 501 47 829
# xtabs() (X = cross, TABleS) is used to tabulate two or more variables.
# It does the same job as table(), but some may find the formula easier to
# read as a statement of what is being done.
xtabs(~ sex + dgn, data = stroke)
dgn
sex ICH ID INF SAH
Female 48 140 295 27
Male 31 62 206 20
# Three-variable cross tabulation with the formula interface.
xtabs(~ sex + dgn + dead, data = stroke)
, , dead = FALSE
dgn
sex ICH ID INF SAH
Female 15 37 123 14
Male 10 17 116 12
, , dead = TRUE
dgn
sex ICH ID INF SAH
Female 33 103 172 13
Male 21 45 90 8
# For three variables, ftable() or structable() from the vcd package may be
# a better choice.
# The two differ in how the 2nd and 3rd variables are laid out.
# vcd is installed if it is missing.
ftable(stroke[, c("sex", "dgn", "dead")])
dead FALSE TRUE
sex dgn
Female ICH 15 33
ID 37 103
INF 123 172
SAH 14 13
Male ICH 10 21
ID 17 45
INF 116 90
SAH 12 8
# Install vcd only when it is not available yet, then attach it.
# requireNamespace() returns FALSE instead of failing when the package is
# missing, and avoids scanning every installed package.
if (!requireNamespace("vcd", quietly = TRUE)) {
  # Argument name spelled out in full (no partial matching of `dep`).
  install.packages("vcd", dependencies = TRUE)
}
library(vcd)
# structable() lays dgn out across the columns, with sex and dead in the rows.
structable(stroke[, c("sex", "dgn", "dead")])
dgn ICH ID INF SAH
sex dead
Female FALSE 15 37 123 14
TRUE 33 103 172 13
Male FALSE 10 17 116 12
TRUE 21 45 90 8
# A two-variable table without totals (i.e. not wrapped in addmargins()) can
# be passed to chisq.test() or fisher.test() to run a test.
## Chi-squared test
chisq.test(table(stroke[, c("sex", "dgn")]))
Pearson's Chi-squared test
data: table(stroke[, c("sex", "dgn")])
X-squared = 6.995, df = 3, p-value = 0.07205
# Same chi-squared test on the xtabs() version of the table. The argument
# order is kept so the call echoed in the test output stays the same.
chisq.test(xtabs(data = stroke, formula = ~ sex + dgn))
Pearson's Chi-squared test
data: xtabs(data = stroke, formula = ~sex + dgn)
X-squared = 6.995, df = 3, p-value = 0.07205
## Fisher's exact test
fisher.test(table(stroke[, c("sex", "dgn")]))
Fisher's Exact Test for Count Data
data: table(stroke[, c("sex", "dgn")])
p-value = 0.06762
alternative hypothesis: two.sided
# Same Fisher's exact test on the xtabs() version of the table. The argument
# order is kept so the call echoed in the test output stays the same.
fisher.test(xtabs(data = stroke, formula = ~ sex + dgn))
Fisher's Exact Test for Count Data
data: xtabs(data = stroke, formula = ~sex + dgn)
p-value = 0.06762
alternative hypothesis: two.sided
# The gmodels package provides a convenient(?) command called CrossTable().
# Install gmodels only when it is not available yet, then attach it.
# requireNamespace() returns FALSE instead of failing when the package is
# missing, and avoids scanning every installed package.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  # Argument name spelled out in full (no partial matching of `dep`).
  install.packages("gmodels", dependencies = TRUE)
}
library(gmodels)
# CrossTable() can be set to SPSS-like or SAS-like layouts. The default
# output is verbose, so the options below keep it simple.
# The chi-squared test and Fisher's exact test are run in the same call.
CrossTable(
  x = stroke$sex,
  y = stroke$dgn,
  digits = 1,
  prop.t = FALSE,
  prop.chisq = FALSE,
  format = "SPSS",
  chisq = TRUE,
  fisher = TRUE
)
Cell Contents
|-------------------------|
| Count |
| Row Percent |
| Column Percent |
|-------------------------|
Total Observations in Table: 829
| stroke$dgn
stroke$sex | ICH | ID | INF | SAH | Row Total |
-------------|-----------|-----------|-----------|-----------|-----------|
Female | 48 | 140 | 295 | 27 | 510 |
| 9.4% | 27.5% | 57.8% | 5.3% | 61.5% |
| 60.8% | 69.3% | 58.9% | 57.4% | |
-------------|-----------|-----------|-----------|-----------|-----------|
Male | 31 | 62 | 206 | 20 | 319 |
| 9.7% | 19.4% | 64.6% | 6.3% | 38.5% |
| 39.2% | 30.7% | 41.1% | 42.6% | |
-------------|-----------|-----------|-----------|-----------|-----------|
Column Total | 79 | 202 | 501 | 47 | 829 |
| 9.5% | 24.4% | 60.4% | 5.7% | |
-------------|-----------|-----------|-----------|-----------|-----------|
Statistics for All Table Factors
Pearson's Chi-squared test
------------------------------------------------------------
Chi^2 = 6.995 d.f. = 3 p = 0.07205
Fisher's Exact Test for Count Data
------------------------------------------------------------
Alternative hypothesis: two.sided
p = 0.06762
Minimum expected frequency: 18.09