データは“R_Seminar_data.csv”を使用
library(tidyverse)
library(ggplot2)
df<-read_csv("R_Seminar_data.csv")
df %>% glimpse()
## Observations: 500
## Variables: 17
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
## $ Year <dbl> 2010, 2012, 2012, 2011, 2013, 2010, 2014, 2011, 2012,...
## $ Admday <chr> "2010/10/24", "2012/9/24", "2012/12/9", "2011/9/9", "...
## $ Discday <chr> "2010/11/5", "2012/10/3", "2012/12/14", "2011/9/19", ...
## $ New_Treatment <dbl> 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,...
## $ Age <dbl> 62, 82, 75, 78, 78, 68, 72, 71, 80, 72, 75, 63, 77, 6...
## $ Sex <dbl> 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1,...
## $ Height <dbl> 167, 156, 155, 153, 154, 157, 168, 154, 165, 152, 167...
## $ Weight <dbl> 75.8, 57.0, 61.2, 49.5, 52.5, 61.1, 64.3, 47.3, 61.9,...
## $ DM <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Stroke <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ MI <dbl> 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Severity <dbl> 2, 3, 2, 2, 8, 3, 1, 7, 11, 2, 6, 1, 2, 1, 1, 2, 1, 1...
## $ Death <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ BMI2 <dbl> 27.17917, 23.42209, 25.47347, 21.14571, 22.13695, 24....
## $ BMI2_cat <chr> "Overweight", "High_normal", "Overweight", "Low_norma...
## $ LOS2 <dbl> 13, 10, 6, 11, 15, 9, 19, 22, 15, 10, 7, 10, 13, 6, 6...
Observations: 500 500行ある
Variables: 21 21列ある
サマリーしてみる
df %>% summary()
## id Year Admday Discday
## Min. : 1.0 Min. :2010 Length:500 Length:500
## 1st Qu.:125.8 1st Qu.:2011 Class :character Class :character
## Median :250.5 Median :2012 Mode :character Mode :character
## Mean :250.5 Mean :2012
## 3rd Qu.:375.2 3rd Qu.:2013
## Max. :500.0 Max. :2014
## New_Treatment Age Sex Height Weight
## Min. :0.000 Min. :54.0 Min. :1.000 Min. :139.0 Min. :28.40
## 1st Qu.:0.000 1st Qu.:70.0 1st Qu.:1.000 1st Qu.:151.0 1st Qu.:49.70
## Median :0.000 Median :75.0 Median :2.000 Median :155.0 Median :55.00
## Mean :0.358 Mean :74.9 Mean :1.608 Mean :155.4 Mean :55.03
## 3rd Qu.:1.000 3rd Qu.:79.0 3rd Qu.:2.000 3rd Qu.:159.0 3rd Qu.:60.60
## Max. :1.000 Max. :91.0 Max. :2.000 Max. :172.0 Max. :75.80
## DM Stroke MI Severity
## Min. :0.000 Min. :0.000 Min. :0.00 Min. : 1.000
## 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.00 1st Qu.: 1.000
## Median :0.000 Median :0.000 Median :0.00 Median : 4.000
## Mean :0.146 Mean :0.064 Mean :0.11 Mean : 4.512
## 3rd Qu.:0.000 3rd Qu.:0.000 3rd Qu.:0.00 3rd Qu.: 7.000
## Max. :1.000 Max. :1.000 Max. :1.00 Max. :13.000
## Death BMI2 BMI2_cat LOS2
## Min. :0.000 Min. :14.70 Length:500 Min. : 4.00
## 1st Qu.:0.000 1st Qu.:21.22 Class :character 1st Qu.: 9.00
## Median :0.000 Median :22.85 Mode :character Median :12.00
## Mean :0.132 Mean :22.70 Mean :13.37
## 3rd Qu.:0.000 3rd Qu.:24.31 3rd Qu.:16.00
## Max. :1.000 Max. :28.14 Max. :57.00
summary()は全ての変数について数字型なら最小大値・四分位値・中央値・平均値が出る
df$Year %>% table() #指定したデータの中身の頻度を出す
## .
## 2010 2011 2012 2013 2014
## 51 113 165 137 34
集計が美しくない.
そんなときは詳しく集計する便利なパッケージを使用する
臨床研究の論文で患者背景を表1(table1)からネーミング
治療有無別に患者背景情報を一気にまとめて集計する事ができる
CreateTableOne(vars, strata, factorVars, data)
の構文で使用する
vars: 表示する変数をc()の中に指定する.変数は""でくくる
strata: 層別したい変数を入れる__変数は""でくくる__
factorVars: 因子として取り扱う変数を指定する__変数は""でくくる__
data: 使用するデータを指定する
詳細はhelpで確認すること
install.packages("tableone", repos="http://cran.rstudio.com/")
library(tableone)
table1<-CreateTableOne(vars=c("Age", "Sex", "Severity"), strata = "New_Treatment", factorVars = c("Sex"),data=df)
table1
## Stratified by New_Treatment
## 0 1 p test
## n 321 179
## Age (mean (SD)) 72.58 (5.15) 79.06 (5.00) <0.001
## Sex = 2 (%) 198 (61.7) 106 (59.2) 0.656
## Severity (mean (SD)) 3.38 (2.77) 6.54 (3.23) <0.001
ちなみにfactoreVarsを指定しないカテゴリ変数が平均やSDででてくる.
table2 <- df %>% CreateTableOne(vars=c("Age", "Sex", "Severity"), strata = "New_Treatment", data=df)
table2
## Stratified by New_Treatment
## 0 1 p test
## n 321 179
## Age (mean (SD)) 72.58 (5.15) 79.06 (5.00) <0.001
## Sex (mean (SD)) 1.62 (0.49) 1.59 (0.49) 0.589
## Severity (mean (SD)) 3.38 (2.77) 6.54 (3.23) <0.001
testExaxtでfisher.testを適用できる
table3 <- CreateTableOne(vars=c("Age", "Sex", "Severity"), strata = "New_Treatment", factorVars = c("Sex"),testExact = "sex", data=df)
table3
## Stratified by New_Treatment
## 0 1 p test
## n 321 179
## Age (mean (SD)) 72.58 (5.15) 79.06 (5.00) <0.001
## Sex = 2 (%) 198 (61.7) 106 (59.2) 0.656
## Severity (mean (SD)) 3.38 (2.77) 6.54 (3.23) <0.001
まとめた情報はCSVファイルにすることができるので,学会発表,論文報告のために簡単に表1を作成できる
Fileはプロジェクトフォルダ内に作成
table1 %>% print() %>% write.csv(file=“table1.csv”)
##2群間の比較検定
###連続変数の二軍比較
検定したい目的変数が連続変数(LOS: 在院日数など)である場合,分布を確認する
正規分布に近いかたち→t-test
サンプル数が少ない or 外れ値が目立つ→wilcoxon rank sum test
まずは分布の確認はヒストグラムで行う LOSがないのでLOSを作っておく.
library(lubridate)
library(tidyverse)
df %>% mutate(Admday = ymd(Admday), Discday = ymd(Discday)) -> df
#日付の引き算
df %>%mutate(LOS = as.integer(Discday - Admday)+1) -> df #regressionに使えないのでas.integerで整数に変えておく
summary(df$LOS)
ggplot(data=df,aes(x=LOS))+
geom_histogram()
正規分布に近い形と判断してt-test
t.test(目的変数~群訳変数, data)
の形で入れる
t.test(LOS~New_Treatment,data=df)
##
## Welch Two Sample t-test
##
## data: LOS by New_Treatment
## t = -4.9642, df = 303.78, p-value = 1.152e-06
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.836858 -2.090760
## sample estimates:
## mean in group 0 mean in group 1
## 12.13396 15.59777
wilcox.test(LOS~New_Treatment, data=df)
の形で入れる
wilcox.test(df$LOS~df$New_Treatment)
##
## Wilcoxon rank sum test with continuity correction
##
## data: df$LOS by df$New_Treatment
## W = 20304, p-value = 5.022e-08
## alternative hypothesis: true location shift is not equal to 0
検定したい目的変数がカテゴリである場合
フィッシャー正確検定
χ二乗検定
fisher.test(Data$Variable, Data$Variable)
fisher.test(data=df,Death, New_Treatment)では検定されない
fisher.test(df$Death, df$New_Treatment)
##
## Fisher's Exact Test for Count Data
##
## data: df$Death and df$New_Treatment
## p-value = 0.0388
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.2736981 0.9887565
## sample estimates:
## odds ratio
## 0.5326503
同様にData$Variableの型で入れる
chisq.test(df$Death, df$New_Treatment)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: df$Death and df$New_Treatment
## X-squared = 3.8588, df = 1, p-value = 0.04949
Tab1<-CreateTableOne(data = df, vars = c("DM","Stroke","MI","Severity"),strata = "New_Treatment")
Tab1 #factorVarsをいれないと全部連続量になる
## Stratified by New_Treatment
## 0 1 p test
## n 321 179
## DM (mean (SD)) 0.08 (0.27) 0.27 (0.44) <0.001
## Stroke (mean (SD)) 0.01 (0.11) 0.16 (0.36) <0.001
## MI (mean (SD)) 0.10 (0.30) 0.13 (0.34) 0.200
## Severity (mean (SD)) 3.38 (2.77) 6.54 (3.23) <0.001
Tabf1<-CreateTableOne(data = df, vars = c("DM","Stroke","MI","Severity"),strata = "New_Treatment", factorVars = c("DM","Stroke","MI"))
Tabf1
## Stratified by New_Treatment
## 0 1 p test
## n 321 179
## DM = 1 (%) 25 (7.8) 48 (26.8) <0.001
## Stroke = 1 (%) 4 (1.2) 28 (15.6) <0.001
## MI = 1 (%) 31 (9.7) 24 (13.4) 0.256
## Severity (mean (SD)) 3.38 (2.77) 6.54 (3.23) <0.001
#create BMI
df %>% mutate(BMI = Weight/Height/Height*10000) -> df # ->で最後に入れ込む
summary(df$BMI) # check the variale!
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.70 21.22 22.85 22.70 24.31 28.14
df %>% ggplot(aes(x=BMI))+geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
t.test(BMI~Sex, data=df)
##
## Welch Two Sample t-test
##
## data: BMI by Sex
## t = 0.30569, df = 405.45, p-value = 0.76
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.3527804 0.4826975
## sample estimates:
## mean in group 1 mean in group 2
## 22.73589 22.67093