Constructing tables with R (Japanese ver)

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
## opts_chunk is reached through the knitr:: namespace so that this call also
## works when knitr is installed but not attached (for example when the script
## is pasted into a plain R session instead of being knitted).
knitr::opts_chunk$set(
  comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
  echo = TRUE, fig.width = 10, fig.height = 8
)
## Widen printed output to 116 columns and bias number printing towards fixed
## (non-scientific) notation.
options(width = 116, scipen = 10)
# 使い方
# この記載をすべてコピーして手元のRのエディタに貼付ける。
# Windows版では左上 [ファイル] → [新しいスクリプト]でエディタ。
# Mac版では真っ白い紙の形のアイコンでエディタ。
# 実行する行、あるいは範囲を選択して
# WindowsであればCtrl+R、MacであればCommand+Returnで実行。

# Introductory Statistics with R 2nd ed. (Peter Dalgaard)のデータを使っています。
# 1版は和訳 Rによる医療統計学 (岡田 昌史) もあります。

# 表の作り方のデモンストレーション。
# 参考資料: http://www.statmethods.net/stats/frequencies.html


# Install ISwR, the data package that accompanies "Introductory Statistics
# with R", only when it is not already available.
# requireNamespace() returns FALSE (instead of raising an error) when the
# package cannot be loaded; the leading ! inverts that result, so
# install.packages() runs only for a missing package.
if (!requireNamespace("ISwR", quietly = TRUE)) {
  install.packages("ISwR", dependencies = TRUE)
}


# ISwRのデータ集パッケージを読み込みます。
library(ISwR)

# ISwRの中にあるデータの一覧を表示。
data(package="ISwR")

# EstoniaのTartuという都市の脳血管障害のデータを使用します。
# J. Korv, M. Roose, and A.E. Kaasik (1997). Stroke Registry of Tartu, Estonia, from 1991 through 1993. Cerebrovascular Disorders 7:154–162.
# http://content.karger.com/ProdukteDB/produkte.asp?Aktion=ShowAbstract&ArtikelNr=108182&Ausgabe=232923&ProduktNr=224153
data(stroke)

# データベースの説明は?strokeでみられます。
?stroke

# 先頭10行を観察してみる。head()はデータの先頭を覗き見するコマンド。
head(stroke, n=10)
      sex       died       dstr age dgn coma diab minf han  dead obsmonths
1    Male 1991-01-07 1991-01-02  76 INF   No   No  Yes  No  TRUE   0.16340
2    Male       <NA> 1991-01-03  58 INF   No   No   No  No FALSE  59.60784
3    Male 1991-06-02 1991-01-08  74 INF   No   No  Yes Yes  TRUE   4.73856
4  Female 1991-01-13 1991-01-11  77 ICH   No  Yes   No Yes  TRUE   0.06536
5  Female       <NA> 1991-01-13  76 INF   No  Yes   No Yes FALSE  59.28105
6    Male 1991-01-13 1991-01-13  48 ICH  Yes   No   No Yes  TRUE   0.10000
7  Female 1993-12-01 1991-01-14  81 INF   No   No   No Yes  TRUE  34.37908
8    Male 1991-12-12 1991-01-14  53 INF   No   No  Yes Yes  TRUE  10.84967
9  Female       <NA> 1991-01-15  73  ID   No   No   No Yes FALSE  59.21569
10 Female 1993-11-10 1991-01-15  69 INF   No   No   No Yes  TRUE  33.66013

# データ構造の観察。
class(stroke)  ## class()でデータの形式がわかる。data.frame
[1] "data.frame"
str(stroke)    ## STRucture()で中身の構造がわかる。各コラムの名前とデータ構造がでる。
'data.frame':   829 obs. of  11 variables:
 $ sex      : Factor w/ 2 levels "Female","Male": 2 2 2 1 1 2 1 2 1 1 ...
 $ died     : Date, format: "1991-01-07" NA "1991-06-02" "1991-01-13" ...
 $ dstr     : Date, format: "1991-01-02" "1991-01-03" "1991-01-08" "1991-01-11" ...
 $ age      : int  76 58 74 77 76 48 81 53 73 69 ...
 $ dgn      : Factor w/ 4 levels "ICH","ID","INF",..: 3 3 3 1 3 1 3 3 2 3 ...
 $ coma     : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 2 1 1 1 1 ...
 $ diab     : Factor w/ 2 levels "No","Yes": 1 1 1 2 2 1 1 1 1 1 ...
 $ minf     : Factor w/ 2 levels "No","Yes": 2 1 2 1 1 1 1 2 1 1 ...
 $ han      : Factor w/ 2 levels "No","Yes": 1 1 2 2 2 2 2 2 2 2 ...
 $ dead     : logi  TRUE FALSE TRUE TRUE FALSE TRUE ...
 $ obsmonths: num  0.1634 59.6078 4.7386 0.0654 59.281 ...
               ### Factorがカテゴリー変数。Dateは日付。logiはTRUE or FALSE。numは連続変数。

# 診断名の縦列だけ抜き出す。
stroke$dgn     ## strokeデータのdgn成分の抜き出しという意味。
  [1] INF INF INF ICH INF ICH INF INF ID  INF ID  INF INF INF INF INF INF ID  ICH INF INF INF INF ICH INF INF ID 
 [28] ID  INF INF INF INF ID  INF INF INF ICH INF INF INF INF INF INF INF INF INF INF SAH ICH INF INF INF INF INF
 [55] ICH ICH INF INF ID  INF ID  INF INF ID  ICH INF INF INF INF INF INF ID  ID  INF INF INF INF INF ICH ICH SAH
 [82] INF INF INF INF INF INF ID  INF ID  INF ID  INF INF ID  INF INF INF INF INF INF INF INF INF ID  INF INF INF
[109] SAH INF INF INF INF INF INF INF INF INF SAH INF INF INF INF INF ICH INF ID  ICH INF ICH INF SAH INF ID  INF
[136] ICH INF ID  INF INF SAH INF INF ID  INF INF INF INF INF ID  INF INF INF INF INF INF INF INF INF INF INF INF
[163] INF ICH SAH INF ICH INF ID  INF INF INF INF INF INF INF INF SAH ID  INF INF ID  INF INF ID  SAH ICH INF SAH
[190] INF ID  ICH ID  INF INF ID  ID  INF INF INF ID  INF ID  ID  INF INF INF INF INF INF INF INF INF INF SAH INF
[217] INF INF ID  INF SAH ID  INF ID  INF INF ID  INF INF INF INF ID  ID  ICH INF INF INF INF INF INF ID  ICH INF
[244] ICH ICH INF INF INF ID  INF INF INF INF INF ID  INF SAH ID  INF SAH ICH INF INF INF INF INF INF INF INF INF
[271] ID  ID  INF ID  ID  ICH ID  INF INF SAH ID  SAH ICH INF ID  SAH ID  ID  INF SAH SAH ID  INF SAH ICH ID  INF
[298] INF ICH INF ID  INF INF ID  INF ID  INF INF INF INF INF ID  ID  INF ID  ID  ID  INF INF INF ID  INF INF INF
[325] INF INF INF SAH INF INF INF INF INF ID  ID  ID  SAH INF INF INF INF ID  INF INF INF INF INF INF INF ID  ID 
[352] INF INF INF INF INF ID  INF INF INF ID  ICH INF INF ID  INF INF ICH INF INF ID  ID  INF INF ICH ICH ID  INF
[379] INF INF ID  INF ID  ICH ID  ID  SAH INF ICH ID  ID  INF INF ICH ID  INF INF ID  INF INF INF ID  ID  ICH INF
[406] ICH INF ICH ICH INF ID  INF INF INF ID  ID  INF INF ID  SAH INF ID  INF INF INF INF INF INF INF ID  ID  ID 
[433] ID  INF ID  INF ID  INF INF ICH ID  ID  ID  ID  SAH ID  ID  INF ID  INF SAH SAH INF ID  ID  INF INF ID  INF
[460] ID  INF SAH INF INF INF INF ID  ICH ID  ID  ID  ID  INF ID  INF INF INF INF SAH INF INF INF INF INF INF SAH
[487] ID  INF INF INF ICH INF INF ID  INF SAH INF ID  ID  INF ID  INF INF ID  ID  INF INF INF INF INF ID  ID  INF
[514] ID  SAH INF INF INF INF ID  ID  INF INF INF INF INF SAH INF ID  ICH INF ID  INF ID  INF INF INF INF SAH INF
[541] INF ID  INF INF ID  INF ID  INF SAH ID  INF ID  INF SAH INF INF ID  ID  ICH INF ID  INF ID  INF ICH INF ID 
[568] INF ID  ID  INF ICH INF ID  ICH ICH ID  INF INF INF INF INF INF INF ID  ICH ICH INF ID  INF INF INF INF INF
[595] INF INF ID  INF ICH ICH INF SAH INF ID  INF ICH INF ID  ID  ID  SAH ID  ID  ID  INF INF INF ID  INF INF INF
[622] INF INF ID  INF INF INF ID  INF INF INF INF ID  ID  INF ID  INF ID  INF INF ID  ICH ICH INF INF INF INF ICH
[649] ID  ID  ID  INF INF ICH INF SAH INF INF INF INF ID  INF INF INF INF ID  INF ID  ICH ICH ID  INF INF INF ID 
[676] ID  ID  SAH ID  ICH ICH INF ICH INF INF ICH INF INF INF INF INF ID  INF INF INF INF INF INF ICH INF ICH INF
[703] ID  ICH INF INF ID  INF INF INF INF INF INF ID  ID  INF ICH INF ID  INF ID  INF ID  INF INF ICH INF ICH SAH
[730] INF INF INF SAH INF INF INF INF ID  INF ID  SAH INF INF ID  INF INF INF ICH ICH ICH INF INF ID  INF ID  INF
[757] INF ID  ID  ID  ID  SAH INF ID  ID  ID  INF INF INF INF ICH INF ID  ICH INF INF INF INF ID  INF INF ID  ID 
[784] ID  INF INF ID  ID  INF INF INF ID  INF ID  ICH INF SAH INF ID  SAH INF INF INF INF INF ICH ICH ICH INF ICH
[811] INF ID  SAH INF INF INF INF INF ID  INF INF INF INF INF INF INF INF INF INF
Levels: ICH ID INF SAH
# あるいは
stroke[ ,"dgn"] ## strokeデータのdgn縦列を抜き出すという意味。
  [1] INF INF INF ICH INF ICH INF INF ID  INF ID  INF INF INF INF INF INF ID  ICH INF INF INF INF ICH INF INF ID 
 [28] ID  INF INF INF INF ID  INF INF INF ICH INF INF INF INF INF INF INF INF INF INF SAH ICH INF INF INF INF INF
 [55] ICH ICH INF INF ID  INF ID  INF INF ID  ICH INF INF INF INF INF INF ID  ID  INF INF INF INF INF ICH ICH SAH
 [82] INF INF INF INF INF INF ID  INF ID  INF ID  INF INF ID  INF INF INF INF INF INF INF INF INF ID  INF INF INF
[109] SAH INF INF INF INF INF INF INF INF INF SAH INF INF INF INF INF ICH INF ID  ICH INF ICH INF SAH INF ID  INF
[136] ICH INF ID  INF INF SAH INF INF ID  INF INF INF INF INF ID  INF INF INF INF INF INF INF INF INF INF INF INF
[163] INF ICH SAH INF ICH INF ID  INF INF INF INF INF INF INF INF SAH ID  INF INF ID  INF INF ID  SAH ICH INF SAH
[190] INF ID  ICH ID  INF INF ID  ID  INF INF INF ID  INF ID  ID  INF INF INF INF INF INF INF INF INF INF SAH INF
[217] INF INF ID  INF SAH ID  INF ID  INF INF ID  INF INF INF INF ID  ID  ICH INF INF INF INF INF INF ID  ICH INF
[244] ICH ICH INF INF INF ID  INF INF INF INF INF ID  INF SAH ID  INF SAH ICH INF INF INF INF INF INF INF INF INF
[271] ID  ID  INF ID  ID  ICH ID  INF INF SAH ID  SAH ICH INF ID  SAH ID  ID  INF SAH SAH ID  INF SAH ICH ID  INF
[298] INF ICH INF ID  INF INF ID  INF ID  INF INF INF INF INF ID  ID  INF ID  ID  ID  INF INF INF ID  INF INF INF
[325] INF INF INF SAH INF INF INF INF INF ID  ID  ID  SAH INF INF INF INF ID  INF INF INF INF INF INF INF ID  ID 
[352] INF INF INF INF INF ID  INF INF INF ID  ICH INF INF ID  INF INF ICH INF INF ID  ID  INF INF ICH ICH ID  INF
[379] INF INF ID  INF ID  ICH ID  ID  SAH INF ICH ID  ID  INF INF ICH ID  INF INF ID  INF INF INF ID  ID  ICH INF
[406] ICH INF ICH ICH INF ID  INF INF INF ID  ID  INF INF ID  SAH INF ID  INF INF INF INF INF INF INF ID  ID  ID 
[433] ID  INF ID  INF ID  INF INF ICH ID  ID  ID  ID  SAH ID  ID  INF ID  INF SAH SAH INF ID  ID  INF INF ID  INF
[460] ID  INF SAH INF INF INF INF ID  ICH ID  ID  ID  ID  INF ID  INF INF INF INF SAH INF INF INF INF INF INF SAH
[487] ID  INF INF INF ICH INF INF ID  INF SAH INF ID  ID  INF ID  INF INF ID  ID  INF INF INF INF INF ID  ID  INF
[514] ID  SAH INF INF INF INF ID  ID  INF INF INF INF INF SAH INF ID  ICH INF ID  INF ID  INF INF INF INF SAH INF
[541] INF ID  INF INF ID  INF ID  INF SAH ID  INF ID  INF SAH INF INF ID  ID  ICH INF ID  INF ID  INF ICH INF ID 
[568] INF ID  ID  INF ICH INF ID  ICH ICH ID  INF INF INF INF INF INF INF ID  ICH ICH INF ID  INF INF INF INF INF
[595] INF INF ID  INF ICH ICH INF SAH INF ID  INF ICH INF ID  ID  ID  SAH ID  ID  ID  INF INF INF ID  INF INF INF
[622] INF INF ID  INF INF INF ID  INF INF INF INF ID  ID  INF ID  INF ID  INF INF ID  ICH ICH INF INF INF INF ICH
[649] ID  ID  ID  INF INF ICH INF SAH INF INF INF INF ID  INF INF INF INF ID  INF ID  ICH ICH ID  INF INF INF ID 
[676] ID  ID  SAH ID  ICH ICH INF ICH INF INF ICH INF INF INF INF INF ID  INF INF INF INF INF INF ICH INF ICH INF
[703] ID  ICH INF INF ID  INF INF INF INF INF INF ID  ID  INF ICH INF ID  INF ID  INF ID  INF INF ICH INF ICH SAH
[730] INF INF INF SAH INF INF INF INF ID  INF ID  SAH INF INF ID  INF INF INF ICH ICH ICH INF INF ID  INF ID  INF
[757] INF ID  ID  ID  ID  SAH INF ID  ID  ID  INF INF INF INF ICH INF ID  ICH INF INF INF INF ID  INF INF ID  ID 
[784] ID  INF INF ID  ID  INF INF INF ID  INF ID  ICH INF SAH INF ID  SAH INF INF INF INF INF ICH ICH ICH INF ICH
[811] INF ID  SAH INF INF INF INF INF ID  INF INF INF INF INF INF INF INF INF INF
Levels: ICH ID INF SAH


# table()を使って集計してみる。
table(stroke$dgn)  ##それぞれの診断名の数がでる。INF:infarction, ID:indeterminateとな。

ICH  ID INF SAH 
 79 202 501  47 
table(stroke$sex)  ## 男女の数を集計。

Female   Male 
   510    319 
addmargins(table(stroke$sex))  ## 合計欄が欲しいときはADD MARGINS()でくくる。

Female   Male    Sum 
   510    319    829 

# 2変数の集計表にするには必要な2縦列のみtable()に与える。
table(stroke[ ,c("sex","dgn")])
        dgn
sex      ICH  ID INF SAH
  Female  48 140 295  27
  Male    31  62 206  20
addmargins(table(stroke[ ,c("sex","dgn")]))
        dgn
sex      ICH  ID INF SAH Sum
  Female  48 140 295  27 510
  Male    31  62 206  20 319
  Sum     79 202 501  47 829

# 3変数でやるとこんな感じになる。ちょっと見にくい。
table(stroke[ ,c("sex","dgn","dead")])
, , dead = FALSE

        dgn
sex      ICH  ID INF SAH
  Female  15  37 123  14
  Male    10  17 116  12

, , dead = TRUE

        dgn
sex      ICH  ID INF SAH
  Female  33 103 172  13
  Male    21  45  90   8

addmargins(table(stroke[ ,c("sex","dgn","dead")]))
, , dead = FALSE

        dgn
sex      ICH  ID INF SAH Sum
  Female  15  37 123  14 189
  Male    10  17 116  12 155
  Sum     25  54 239  26 344

, , dead = TRUE

        dgn
sex      ICH  ID INF SAH Sum
  Female  33 103 172  13 321
  Male    21  45  90   8 164
  Sum     54 148 262  21 485

, , dead = Sum

        dgn
sex      ICH  ID INF SAH Sum
  Female  48 140 295  27 510
  Male    31  62 206  20 319
  Sum     79 202 501  47 829



# xtabs()はX(cross) TABleSで2変数以上の集計をするときに使用。
# やることは同じですが、この方がなにをしているか読みやすいという向きもあるかもしれません。
xtabs(data=stroke, formula= ~ sex + dgn)
        dgn
sex      ICH  ID INF SAH
  Female  48 140 295  27
  Male    31  62 206  20
xtabs(data=stroke, formula= ~ sex + dgn + dead)
, , dead = FALSE

        dgn
sex      ICH  ID INF SAH
  Female  15  37 123  14
  Male    10  17 116  12

, , dead = TRUE

        dgn
sex      ICH  ID INF SAH
  Female  33 103 172  13
  Male    21  45  90   8



# 3変数の場合はftable()やvcdパッケージにあるstructable()がよいかも。
# この二つは2,3番目の変数のならべ方が違う。vcdはなければインストール。
ftable(stroke[ ,c("sex","dgn","dead")])
           dead FALSE TRUE
sex    dgn                
Female ICH         15   33
       ID          37  103
       INF        123  172
       SAH         14   13
Male   ICH         10   21
       ID          17   45
       INF        116   90
       SAH         12    8

# Install vcd (the package that provides structable()) only when it is
# missing, then attach it. requireNamespace() returns FALSE rather than
# raising an error when the package is unavailable.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd", dependencies = TRUE)
}
library(vcd)
structable(stroke[ ,c("sex","dgn","dead")])
             dgn ICH  ID INF SAH
sex    dead                     
Female FALSE      15  37 123  14
       TRUE       33 103 172  13
Male   FALSE      10  17 116  12
       TRUE       21  45  90   8


# 2変数の表は合計欄のない状態(addmargins()なし)でchisq.test(), fisher.test()に与えて検定できます。
## カイ二乗検定
chisq.test(table(stroke[ ,c("sex","dgn")]))

    Pearson's Chi-squared test

data:  table(stroke[, c("sex", "dgn")]) 
X-squared = 6.995, df = 3, p-value = 0.07205

chisq.test(xtabs(data=stroke, formula= ~ sex + dgn))

    Pearson's Chi-squared test

data:  xtabs(data = stroke, formula = ~sex + dgn) 
X-squared = 6.995, df = 3, p-value = 0.07205

## フィッシャー正確確率検定
fisher.test(table(stroke[ ,c("sex","dgn")]))

    Fisher's Exact Test for Count Data

data:  table(stroke[, c("sex", "dgn")]) 
p-value = 0.06762
alternative hypothesis: two.sided 

fisher.test(xtabs(data=stroke, formula= ~ sex + dgn))

    Fisher's Exact Test for Count Data

data:  xtabs(data = stroke, formula = ~sex + dgn) 
p-value = 0.06762
alternative hypothesis: two.sided 



# gmodelsパッケージにCrossTable()という便利な?コマンドがある。なければgmodelsインストール。
# Install gmodels (the package that provides CrossTable()) only when it is
# missing, then attach it. requireNamespace() returns FALSE rather than
# raising an error when the package is unavailable.
if (!requireNamespace("gmodels", quietly = TRUE)) {
  install.packages("gmodels", dependencies = TRUE)
}
library(gmodels)

# SPSS風とかSAS風とかに設定可能、初期設定だとくどいので、オプションでシンプルにしました。
# カイ二乗検定やフィッシャーの正確確率検定を同時に施行できます。
# Cross-tabulate sex (rows) against diagnosis (columns) in SPSS-style layout.
# Table-wide percentages and per-cell chi-square contributions are switched
# off to keep the cells compact; the chi-squared test and Fisher's exact test
# are both requested alongside the table.
CrossTable(
  x = stroke$sex,
  y = stroke$dgn,
  digits = 1,
  format = "SPSS",
  prop.t = FALSE,
  prop.chisq = FALSE,
  chisq = TRUE,
  fisher = TRUE
)

   Cell Contents
|-------------------------|
|                   Count |
|             Row Percent |
|          Column Percent |
|-------------------------|

Total Observations in Table:  829 

             | stroke$dgn 
  stroke$sex |      ICH  |       ID  |      INF  |      SAH  | Row Total | 
-------------|-----------|-----------|-----------|-----------|-----------|
      Female |       48  |      140  |      295  |       27  |      510  | 
             |      9.4% |     27.5% |     57.8% |      5.3% |     61.5% | 
             |     60.8% |     69.3% |     58.9% |     57.4% |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
        Male |       31  |       62  |      206  |       20  |      319  | 
             |      9.7% |     19.4% |     64.6% |      6.3% |     38.5% | 
             |     39.2% |     30.7% |     41.1% |     42.6% |           | 
-------------|-----------|-----------|-----------|-----------|-----------|
Column Total |       79  |      202  |      501  |       47  |      829  | 
             |      9.5% |     24.4% |     60.4% |      5.7% |           | 
-------------|-----------|-----------|-----------|-----------|-----------|


Statistics for All Table Factors


Pearson's Chi-squared test 
------------------------------------------------------------
Chi^2 =  6.995     d.f. =  3     p =  0.07205 



Fisher's Exact Test for Count Data
------------------------------------------------------------
Alternative hypothesis: two.sided
p =  0.06762 


       Minimum expected frequency: 18.09