動画
https://web.microsoftstream.com/video/34f3e16a-4e6a-4bb4-ab94-0ec99572b7df
参考資料
R tips 72一般化線形モデル http://cse.naro.affrc.go.jp/takezawa/r-tips/r/72.html
以下のCodeを走らせてデータセットを作成する.
Dataset.0が作成されればOK
# ここから38行目まではサンプルデータセットの作成です。
set.seed(1111)
Dataset.0<-
data.frame(
Sex=factor(rnorm(n=800,mean=0.5,sd=1)>0,labels=c("Male","Female")),
Age=50+rpois(n=800,lambda=25)+round(rnorm(n=800,mean=0,sd=5),0),
Severity=rpois(n=800,lambda=3))
Dataset.0$Comorbidity=
rpois(n=800,lambda=1)+round(Dataset.0$Age*rnorm(n=800,mean=0.03,sd=0.08)^2,0)
Dataset.0$UnknownConfounder=
factor(
with(Dataset.0,
rnorm(n=800,mean=0)+
I(Sex=="Male")*rnorm(n=800,mean=0.02)*0.01+
I(Age-80)*rnorm(n=800,mean=0.002)*0.001+
I(Severity>3)*rnorm(n=800,mean=0.05)*0.01)>1,
labels=c("No","Yes"))
Dataset.0$Intervention<-
factor(
with(Dataset.0,
rnorm(n=800,mean=0)+
I(Sex=="Female")*rnorm(n=800,mean=0.2)+
I(Age-80)*rnorm(n=800,mean=-0.1)*0.1+
I(Severity-2)*rnorm(n=800,mean=0.4)+
I(Comorbidity-1)*rnorm(n=800,mean=-1.2)*0.6+
I(UnknownConfounder=="Yes")*rnorm(n=800,mean=0.2)*0.2)>0,
labels=c("No","Yes"))
Dataset.0$Outcome<-
factor(
with(Dataset.0,
rnorm(n=800,mean=0)*10+
I(Sex=="Male")*rnorm(n=800,mean=0.01)+
I(Age-75)*rnorm(n=800,mean=0.3)*5+
I(Severity-1)*rnorm(n=800,mean=0.7)*6+
I(Comorbidity-1)*rnorm(n=800,mean=0.9)*5+
I(Intervention=="Yes")*rnorm(n=800,mean=-0.6)*4+
I(UnknownConfounder=="Yes")*rnorm(n=800,mean=0.3)*0.7)>20,labels=c("No","Yes"))
Dataset.0$UnknownConfounder<-NULL
head(Dataset.0) #head関数で確認
## Sex Age Severity Comorbidity Intervention Outcome
## 1 Female 77 0 1 Yes No
## 2 Female 64 0 3 Yes Yes
## 3 Female 84 3 0 Yes Yes
## 4 Female 67 3 0 No No
## 5 Female 70 2 4 No No
## 6 Male 87 3 0 Yes No
colnames(Dataset.0) #カラムの名前を確認
## [1] "Sex" "Age" "Severity" "Comorbidity"
## [5] "Intervention" "Outcome"
names(Dataset.0) #変数のみ確認
## [1] "Sex" "Age" "Severity" "Comorbidity"
## [5] "Intervention" "Outcome"
summary(Dataset.0) #サマリーしてみる
## Sex Age Severity Comorbidity Intervention
## Male :254 Min. : 53.00 Min. :0.000 Min. :0.00 No :371
## Female:546 1st Qu.: 70.00 1st Qu.:2.000 1st Qu.:1.00 Yes:429
## Median : 75.00 Median :3.000 Median :1.00
## Mean : 75.02 Mean :3.027 Mean :1.48
## 3rd Qu.: 80.00 3rd Qu.:4.000 3rd Qu.:2.00
## Max. :107.00 Max. :9.000 Max. :8.00
## Outcome
## No :511
## Yes:289
##
##
##
##
summary(subset(Dataset.0, Intervention=="Yes")) #subsetで介入がYesの集団を作りサマリーする
## Sex Age Severity Comorbidity Intervention
## Male :139 Min. :53.00 Min. :0.000 Min. :0.000 No : 0
## Female:290 1st Qu.:70.00 1st Qu.:2.000 1st Qu.:0.000 Yes:429
## Median :75.00 Median :3.000 Median :1.000
## Mean :75.07 Mean :3.252 Mean :1.142
## 3rd Qu.:80.00 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :96.00 Max. :9.000 Max. :8.000
## Outcome
## No :289
## Yes:140
##
##
##
##
summary(subset(Dataset.0, Intervention=="No")) #subsetで介入がNoの集団を作りサマリーする
## Sex Age Severity Comorbidity
## Male :115 Min. : 57.00 Min. :0.000 Min. :0.000
## Female:256 1st Qu.: 70.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 75.00 Median :3.000 Median :2.000
## Mean : 74.97 Mean :2.768 Mean :1.871
## 3rd Qu.: 80.00 3rd Qu.:4.000 3rd Qu.:3.000
## Max. :107.00 Max. :9.000 Max. :7.000
## Intervention Outcome
## No :371 No :222
## Yes: 0 Yes:149
##
##
##
##
==は右辺と左辺が同じですか?という条件式.
=は代入.
Yes, Noで見比べてみると重症度,合併症が群間で違いそう.
ではOutcomeで分けたデータセットを見てみよう.
summary(subset(Dataset.0, Outcome=="Yes"))
## Sex Age Severity Comorbidity
## Male : 89 Min. : 53.00 Min. :0.000 Min. :0.000
## Female:200 1st Qu.: 71.00 1st Qu.:2.000 1st Qu.:1.000
## Median : 77.00 Median :3.000 Median :1.000
## Mean : 76.33 Mean :3.419 Mean :1.668
## 3rd Qu.: 82.00 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :107.00 Max. :9.000 Max. :7.000
## Intervention Outcome
## No :149 No : 0
## Yes:140 Yes:289
##
##
##
##
summary(subset(Dataset.0, Outcome=="No"))
## Sex Age Severity Comorbidity Intervention
## Male :165 Min. :56.00 Min. :0.000 Min. :0.000 No :222
## Female:346 1st Qu.:70.00 1st Qu.:2.000 1st Qu.:1.000 Yes:289
## Median :74.00 Median :3.000 Median :1.000
## Mean :74.28 Mean :2.806 Mean :1.374
## 3rd Qu.:79.00 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :94.00 Max. :8.000 Max. :8.000
## Outcome
## No :511
## Yes: 0
##
##
##
##
アウトカムからみると重症度が高いとアウトカムが起きていそう.
面倒なのでTableOneでみてしまおう.
library(tableone)
## Warning: package 'tableone' was built under R version 3.5.3
Table1<-CreateTableOne(vars = c("Sex","Age","Severity","Comorbidity","Outcome"), strata = "Intervention", data = Dataset.0)
Table1
## Stratified by Intervention
## No Yes p test
## n 371 429
## Sex = Female (%) 256 (69.0) 290 (67.6) 0.727
## Age (mean (SD)) 74.97 (7.32) 75.07 (7.01) 0.837
## Severity (mean (SD)) 2.77 (1.68) 3.25 (1.75) <0.001
## Comorbidity (mean (SD)) 1.87 (1.38) 1.14 (1.13) <0.001
## Outcome = Yes (%) 149 (40.2) 140 (32.6) 0.033
print(Table1, contDigits = 1) #小数点1位まで表示
## Stratified by Intervention
## No Yes p test
## n 371 429
## Sex = Female (%) 256 (69.0) 290 (67.6) 0.727
## Age (mean (SD)) 75.0 (7.3) 75.1 (7.0) 0.837
## Severity (mean (SD)) 2.8 (1.7) 3.3 (1.8) <0.001
## Comorbidity (mean (SD)) 1.9 (1.4) 1.1 (1.1) <0.001
## Outcome = Yes (%) 149 (40.2) 140 (32.6) 0.033
デフォルトでχ二乗検定をしている
別の表示の仕方としてxtableを使用する.
xtabs(~Outcome+Intervention, data = Dataset.0) #クロス集計を行うxtabs関数
## Intervention
## Outcome No Yes
## No 222 289
## Yes 149 140
~は回帰を示すtildeという記号.これの後に変数を+でつないでいく
いわゆるクロス集計ができた.
χ二乗検定をしてみよう
chisq.test(xtabs(~Outcome+Intervention, data=Dataset.0))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: xtabs(~Outcome + Intervention, data = Dataset.0)
## X-squared = 4.5649, df = 1, p-value = 0.03263
どうやら単変量解析ではInterventionとOutcomeの関連がありそう.
次は,これまでと同じことをGLMを使ってやってみる.
関数 glm() を用いることで一般化線形モデルを扱うことが出来る.
一般化線形モデルのクラスは,応答変数の分布が正規分布(gaussian),二項分布(binomial),ポアソン分布(poisson),逆正規分布(inverse.gaussian),ガンマ分布(Gamma),そして応答分布がはっきりしないときのための擬似尤度モデル(quasi)を備えており,引数 family で指定することが出来る.ファミリー名とリンク関数の表を以下に挙げる
引数 family | リンク関数 |
---|---|
binomial | logit, probit, log, cloglog |
gaussian | identity, log, inverse |
Gamma | identity, inverse, log |
inverse.gaussian | 1/mu^2, identity, inverse, log |
poisson | identity, log, sqrt |
quasi | logit, probit, cloglog, identity, inverse, log, 1/mu^2, sqrt |
glm(I(Outcome=="Yes")~Intervention, binomial,
data = Dataset.0)
##
## Call: glm(formula = I(Outcome == "Yes") ~ Intervention, family = binomial,
## data = Dataset.0)
##
## Coefficients:
## (Intercept) InterventionYes
## -0.3987 -0.3261
##
## Degrees of Freedom: 799 Total (i.e. Null); 798 Residual
## Null Deviance: 1047
## Residual Deviance: 1042 AIC: 1046
アウトカムがYesであることをInterventionで回帰している.
binomialを指定してリンク関数をlogitを選択している.
これで単変量のロジスティクス回帰分析になっている.
実行した結果をサマリーして要約統計量を見る
summary(glm(I(Outcome=="Yes")~Intervention, binomial,
data = Dataset.0))
##
## Call:
## glm(formula = I(Outcome == "Yes") ~ Intervention, family = binomial,
## data = Dataset.0)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0134 -1.0134 -0.8889 1.3507 1.4965
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.3987 0.1059 -3.765 0.000167 ***
## InterventionYes -0.3261 0.1477 -2.207 0.027290 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1046.6 on 799 degrees of freedom
## Residual deviance: 1041.7 on 798 degrees of freedom
## AIC: 1045.7
##
## Number of Fisher Scoring iterations: 4
ORにしたいので,Estimate Stdをexp関数に入れる
exp(-0.3261)
## [1] 0.721733
##多変量解析
ここまで単変量でやってきたが,交絡を調整しないといけないので,glmの式に交絡因子を足していく.
glm(I(Outcome=="Yes")~Intervention+Age+Sex+Severity+Comorbidity, binomial,
data = Dataset.0)
##
## Call: glm(formula = I(Outcome == "Yes") ~ Intervention + Age + Sex +
## Severity + Comorbidity, family = binomial, data = Dataset.0)
##
## Coefficients:
## (Intercept) InterventionYes Age SexFemale
## -4.90852 -0.36576 0.04630 0.06277
## Severity Comorbidity
## 0.24879 0.15085
##
## Degrees of Freedom: 799 Total (i.e. Null); 794 Residual
## Null Deviance: 1047
## Residual Deviance: 989.1 AIC: 1001
summary(glm(I(Outcome=="Yes")~Intervention+Age+Sex+Severity+Comorbidity, binomial,
data = Dataset.0))
##
## Call:
## glm(formula = I(Outcome == "Yes") ~ Intervention + Age + Sex +
## Severity + Comorbidity, family = binomial, data = Dataset.0)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7096 -0.9472 -0.7379 1.2383 2.0986
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.90852 0.87322 -5.621 1.90e-08 ***
## InterventionYes -0.36576 0.16093 -2.273 0.0230 *
## Age 0.04630 0.01096 4.224 2.40e-05 ***
## SexFemale 0.06277 0.16450 0.382 0.7028
## Severity 0.24879 0.04565 5.450 5.02e-08 ***
## Comorbidity 0.15085 0.06048 2.494 0.0126 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1046.61 on 799 degrees of freedom
## Residual deviance: 989.13 on 794 degrees of freedom
## AIC: 1001.1
##
## Number of Fisher Scoring iterations: 4
E<-c(-0.36576, 0.04630, 0.06277,0.24879,0.15085) #ORをそれぞれ出す
exp(E)
## [1] 0.6936693 1.0473886 1.0647819 1.2824727 1.1628222
既知の交絡因子を調整した上でもInterventionはOutcomeの独立した予測因子であり,関連していた.
この時InterventionがOutcomeを改善した,のように因果を言うことはできない(未知,未測定の交絡因子があるため).
それはさておき,せっかく結果がでたので,まとめましょう.
tableoneにはShowReg関数(show reggression table)という全部ORとして示してくれる関数がある.
吉田先生ありがとうございます
ShowRegTable(glm(I(Outcome=="Yes")~Intervention+Age+Sex+Severity+Comorbidity, binomial,
data = Dataset.0))
## exp(coef) [confint] p
## (Intercept) 0.01 [0.00, 0.04] <0.001
## InterventionYes 0.69 [0.51, 0.95] 0.023
## Age 1.05 [1.03, 1.07] <0.001
## SexFemale 1.06 [0.77, 1.47] 0.703
## Severity 1.28 [1.17, 1.40] <0.001
## Comorbidity 1.16 [1.03, 1.31] 0.013