動画
https://web.microsoftstream.com/video/34f3e16a-4e6a-4bb4-ab94-0ec99572b7df
参考資料
R tips 72一般化線形モデル http://cse.naro.affrc.go.jp/takezawa/r-tips/r/72.html

事前準備

以下のCodeを走らせてデータセットを作成する.
Dataset.0が作成されればOK

# ここから38行目まではサンプルデータセットの作成です。
set.seed(1111)
Dataset.0<-
  data.frame(
    Sex=factor(rnorm(n=800,mean=0.5,sd=1)>0,labels=c("Male","Female")),
    Age=50+rpois(n=800,lambda=25)+round(rnorm(n=800,mean=0,sd=5),0),
    Severity=rpois(n=800,lambda=3))
Dataset.0$Comorbidity=
  rpois(n=800,lambda=1)+round(Dataset.0$Age*rnorm(n=800,mean=0.03,sd=0.08)^2,0)
Dataset.0$UnknownConfounder=
  factor(
    with(Dataset.0,
         rnorm(n=800,mean=0)+
           I(Sex=="Male")*rnorm(n=800,mean=0.02)*0.01+
           I(Age-80)*rnorm(n=800,mean=0.002)*0.001+
           I(Severity>3)*rnorm(n=800,mean=0.05)*0.01)>1,
    labels=c("No","Yes"))
Dataset.0$Intervention<-
  factor(
    with(Dataset.0,
         rnorm(n=800,mean=0)+
           I(Sex=="Female")*rnorm(n=800,mean=0.2)+
           I(Age-80)*rnorm(n=800,mean=-0.1)*0.1+
           I(Severity-2)*rnorm(n=800,mean=0.4)+
           I(Comorbidity-1)*rnorm(n=800,mean=-1.2)*0.6+
           I(UnknownConfounder=="Yes")*rnorm(n=800,mean=0.2)*0.2)>0,
    labels=c("No","Yes"))
Dataset.0$Outcome<-
  factor(
    with(Dataset.0,
         rnorm(n=800,mean=0)*10+
         I(Sex=="Male")*rnorm(n=800,mean=0.01)+
           I(Age-75)*rnorm(n=800,mean=0.3)*5+
           I(Severity-1)*rnorm(n=800,mean=0.7)*6+
           I(Comorbidity-1)*rnorm(n=800,mean=0.9)*5+
           I(Intervention=="Yes")*rnorm(n=800,mean=-0.6)*4+
           I(UnknownConfounder=="Yes")*rnorm(n=800,mean=0.3)*0.7)>20,labels=c("No","Yes"))
Dataset.0$UnknownConfounder<-NULL

データセットの確認

head(Dataset.0) #head関数で確認
##      Sex Age Severity Comorbidity Intervention Outcome
## 1 Female  77        0           1          Yes      No
## 2 Female  64        0           3          Yes     Yes
## 3 Female  84        3           0          Yes     Yes
## 4 Female  67        3           0           No      No
## 5 Female  70        2           4           No      No
## 6   Male  87        3           0          Yes      No
colnames(Dataset.0) #カラムの名前を確認
## [1] "Sex"          "Age"          "Severity"     "Comorbidity" 
## [5] "Intervention" "Outcome"
names(Dataset.0) #変数のみ確認
## [1] "Sex"          "Age"          "Severity"     "Comorbidity" 
## [5] "Intervention" "Outcome"
summary(Dataset.0) #サマリーしてみる
##      Sex           Age            Severity      Comorbidity   Intervention
##  Male  :254   Min.   : 53.00   Min.   :0.000   Min.   :0.00   No :371     
##  Female:546   1st Qu.: 70.00   1st Qu.:2.000   1st Qu.:1.00   Yes:429     
##               Median : 75.00   Median :3.000   Median :1.00               
##               Mean   : 75.02   Mean   :3.027   Mean   :1.48               
##               3rd Qu.: 80.00   3rd Qu.:4.000   3rd Qu.:2.00               
##               Max.   :107.00   Max.   :9.000   Max.   :8.00               
##  Outcome  
##  No :511  
##  Yes:289  
##           
##           
##           
## 

データセットで介入群,非介入群でアウトカムの分布が違うかみてみる

summary(subset(Dataset.0, Intervention=="Yes")) #subsetで介入がYesの集団を作りサマリーする
##      Sex           Age           Severity      Comorbidity    Intervention
##  Male  :139   Min.   :53.00   Min.   :0.000   Min.   :0.000   No :  0     
##  Female:290   1st Qu.:70.00   1st Qu.:2.000   1st Qu.:0.000   Yes:429     
##               Median :75.00   Median :3.000   Median :1.000               
##               Mean   :75.07   Mean   :3.252   Mean   :1.142               
##               3rd Qu.:80.00   3rd Qu.:4.000   3rd Qu.:2.000               
##               Max.   :96.00   Max.   :9.000   Max.   :8.000               
##  Outcome  
##  No :289  
##  Yes:140  
##           
##           
##           
## 
summary(subset(Dataset.0, Intervention=="No")) #subsetで介入がNoの集団を作りサマリーする
##      Sex           Age            Severity      Comorbidity   
##  Male  :115   Min.   : 57.00   Min.   :0.000   Min.   :0.000  
##  Female:256   1st Qu.: 70.00   1st Qu.:2.000   1st Qu.:1.000  
##               Median : 75.00   Median :3.000   Median :2.000  
##               Mean   : 74.97   Mean   :2.768   Mean   :1.871  
##               3rd Qu.: 80.00   3rd Qu.:4.000   3rd Qu.:3.000  
##               Max.   :107.00   Max.   :9.000   Max.   :7.000  
##  Intervention Outcome  
##  No :371      No :222  
##  Yes:  0      Yes:149  
##                        
##                        
##                        
## 

==は右辺と左辺が同じですか?という条件式.
=は代入.
Yes, Noで見比べてみると重症度,合併症が群間で違いそう.

ではOutcomeで分けたデータセットを見てみよう.

summary(subset(Dataset.0, Outcome=="Yes")) 
##      Sex           Age            Severity      Comorbidity   
##  Male  : 89   Min.   : 53.00   Min.   :0.000   Min.   :0.000  
##  Female:200   1st Qu.: 71.00   1st Qu.:2.000   1st Qu.:1.000  
##               Median : 77.00   Median :3.000   Median :1.000  
##               Mean   : 76.33   Mean   :3.419   Mean   :1.668  
##               3rd Qu.: 82.00   3rd Qu.:4.000   3rd Qu.:2.000  
##               Max.   :107.00   Max.   :9.000   Max.   :7.000  
##  Intervention Outcome  
##  No :149      No :  0  
##  Yes:140      Yes:289  
##                        
##                        
##                        
## 
summary(subset(Dataset.0, Outcome=="No")) 
##      Sex           Age           Severity      Comorbidity    Intervention
##  Male  :165   Min.   :56.00   Min.   :0.000   Min.   :0.000   No :222     
##  Female:346   1st Qu.:70.00   1st Qu.:2.000   1st Qu.:1.000   Yes:289     
##               Median :74.00   Median :3.000   Median :1.000               
##               Mean   :74.28   Mean   :2.806   Mean   :1.374               
##               3rd Qu.:79.00   3rd Qu.:4.000   3rd Qu.:2.000               
##               Max.   :94.00   Max.   :8.000   Max.   :8.000               
##  Outcome  
##  No :511  
##  Yes:  0  
##           
##           
##           
## 

アウトカムからみると重症度が高いとアウトカムが起きていそう.

面倒なのでTableOneでみてしまおう.

library(tableone)
## Warning: package 'tableone' was built under R version 3.5.3
Table1<-CreateTableOne(vars = c("Sex","Age","Severity","Comorbidity","Outcome"), strata = "Intervention", data = Dataset.0)
Table1
##                          Stratified by Intervention
##                           No            Yes           p      test
##   n                         371           429                    
##   Sex = Female (%)          256 (69.0)    290 (67.6)   0.727     
##   Age (mean (SD))         74.97 (7.32)  75.07 (7.01)   0.837     
##   Severity (mean (SD))     2.77 (1.68)   3.25 (1.75)  <0.001     
##   Comorbidity (mean (SD))  1.87 (1.38)   1.14 (1.13)  <0.001     
##   Outcome = Yes (%)         149 (40.2)    140 (32.6)   0.033
print(Table1, contDigits = 1) #小数点1位まで表示
##                          Stratified by Intervention
##                           No           Yes          p      test
##   n                        371          429                    
##   Sex = Female (%)         256 (69.0)   290 (67.6)   0.727     
##   Age (mean (SD))         75.0 (7.3)   75.1 (7.0)    0.837     
##   Severity (mean (SD))     2.8 (1.7)    3.3 (1.8)   <0.001     
##   Comorbidity (mean (SD))  1.9 (1.4)    1.1 (1.1)   <0.001     
##   Outcome = Yes (%)        149 (40.2)   140 (32.6)   0.033

デフォルトでχ二乗検定をしている

別の表示の仕方としてxtableを使用する.

xtabs(~Outcome+Intervention, data = Dataset.0) #クロス集計を行うxtabs関数
##        Intervention
## Outcome  No Yes
##     No  222 289
##     Yes 149 140

~は回帰を示すtildeという記号.これの後に変数を+でつないでいく
いわゆるクロス集計ができた.

χ二乗検定をしてみよう

chisq.test(xtabs(~Outcome+Intervention, data=Dataset.0))
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  xtabs(~Outcome + Intervention, data = Dataset.0)
## X-squared = 4.5649, df = 1, p-value = 0.03263

どうやら単変量解析ではInterventionとOutcomeの関連がありそう.

Generalized linier model(GLM)

次は,これまでと同じことをGLMを使ってやってみる.
関数 glm() を用いることで一般化線形モデルを扱うことが出来る.

一般化線形モデルのクラスは,応答変数の分布が正規分布(gaussian),二項分布(binomial),ポアソン分布(poisson),逆正規分布(inverse.gaussian),ガンマ分布(Gamma),そして応答分布がはっきりしないときのための擬似尤度モデル(quasi)を備えており,引数 family で指定することが出来る.ファミリー名とリンク関数の表を以下に挙げる

引数 family リンク関数
binomial logit, probit, log, cloglog
gaussian identity, log, inverse
Gamma identity, inverse, log
inverse.gaussian 1/mu^2, identity, inverse, log
poisson identity, log, sqrt
quasi logit, probit, cloglog, identity, inverse, log, 1/mu^2, sqrt
glm(I(Outcome=="Yes")~Intervention, binomial,
    data = Dataset.0)
## 
## Call:  glm(formula = I(Outcome == "Yes") ~ Intervention, family = binomial, 
##     data = Dataset.0)
## 
## Coefficients:
##     (Intercept)  InterventionYes  
##         -0.3987          -0.3261  
## 
## Degrees of Freedom: 799 Total (i.e. Null);  798 Residual
## Null Deviance:       1047 
## Residual Deviance: 1042  AIC: 1046

アウトカムがYesであることをInterventionで回帰している.
binomialを指定してリンク関数をlogitを選択している.
これで単変量のロジスティクス回帰分析になっている.

実行した結果をサマリーして要約統計量を見る

summary(glm(I(Outcome=="Yes")~Intervention, binomial,
    data = Dataset.0))
## 
## Call:
## glm(formula = I(Outcome == "Yes") ~ Intervention, family = binomial, 
##     data = Dataset.0)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0134  -1.0134  -0.8889   1.3507   1.4965  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -0.3987     0.1059  -3.765 0.000167 ***
## InterventionYes  -0.3261     0.1477  -2.207 0.027290 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1046.6  on 799  degrees of freedom
## Residual deviance: 1041.7  on 798  degrees of freedom
## AIC: 1045.7
## 
## Number of Fisher Scoring iterations: 4

ORにしたいので,Estimate Stdをexp関数に入れる

exp(-0.3261)
## [1] 0.721733

##多変量解析
ここまで単変量でやってきたが,交絡を調整しないといけないので,glmの式に交絡因子を足していく.

glm(I(Outcome=="Yes")~Intervention+Age+Sex+Severity+Comorbidity, binomial,
    data = Dataset.0)
## 
## Call:  glm(formula = I(Outcome == "Yes") ~ Intervention + Age + Sex + 
##     Severity + Comorbidity, family = binomial, data = Dataset.0)
## 
## Coefficients:
##     (Intercept)  InterventionYes              Age        SexFemale  
##        -4.90852         -0.36576          0.04630          0.06277  
##        Severity      Comorbidity  
##         0.24879          0.15085  
## 
## Degrees of Freedom: 799 Total (i.e. Null);  794 Residual
## Null Deviance:       1047 
## Residual Deviance: 989.1     AIC: 1001
summary(glm(I(Outcome=="Yes")~Intervention+Age+Sex+Severity+Comorbidity, binomial,
    data = Dataset.0))
## 
## Call:
## glm(formula = I(Outcome == "Yes") ~ Intervention + Age + Sex + 
##     Severity + Comorbidity, family = binomial, data = Dataset.0)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7096  -0.9472  -0.7379   1.2383   2.0986  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -4.90852    0.87322  -5.621 1.90e-08 ***
## InterventionYes -0.36576    0.16093  -2.273   0.0230 *  
## Age              0.04630    0.01096   4.224 2.40e-05 ***
## SexFemale        0.06277    0.16450   0.382   0.7028    
## Severity         0.24879    0.04565   5.450 5.02e-08 ***
## Comorbidity      0.15085    0.06048   2.494   0.0126 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1046.61  on 799  degrees of freedom
## Residual deviance:  989.13  on 794  degrees of freedom
## AIC: 1001.1
## 
## Number of Fisher Scoring iterations: 4
E<-c(-0.36576, 0.04630, 0.06277,0.24879,0.15085) #ORをそれぞれ出す
exp(E)
## [1] 0.6936693 1.0473886 1.0647819 1.2824727 1.1628222

既知の交絡因子を調整した上でもInterventionはOutcomeの独立した予測因子であり,関連していた.
この時InterventionがOutcomeを改善した,のように因果を言うことはできない(未知,未測定の交絡因子があるため).

それはさておき,せっかく結果がでたので,まとめましょう.
tableoneにはShowReg関数(show reggression table)という全部ORとして示してくれる関数がある.
吉田先生ありがとうございます

ShowRegTable(glm(I(Outcome=="Yes")~Intervention+Age+Sex+Severity+Comorbidity, binomial,
    data = Dataset.0))
##                 exp(coef) [confint] p     
## (Intercept)     0.01 [0.00, 0.04]   <0.001
## InterventionYes 0.69 [0.51, 0.95]    0.023
## Age             1.05 [1.03, 1.07]   <0.001
## SexFemale       1.06 [0.77, 1.47]    0.703
## Severity        1.28 [1.17, 1.40]   <0.001
## Comorbidity     1.16 [1.03, 1.31]    0.013