読み込み
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(foreign)
library(rms)
## 要求されたパッケージ Hmisc をロード中です
##
## 次のパッケージを付け加えます: 'Hmisc'
##
## 以下のオブジェクトは 'package:dplyr' からマスクされています:
##
## src, summarize
##
## 以下のオブジェクトは 'package:base' からマスクされています:
##
## format.pval, units
library(tableone)
library(ggplot2)
library(gtsummary)
library(summarytools)
## Warning in system2("/usr/bin/otool", c("-L", shQuote(DSO)), stdout = TRUE):
## 命令 ''/usr/bin/otool' -L
## '/Library/Frameworks/R.framework/Resources/library/tcltk/libs//tcltk.so''
## の実行は状態 1 を持ちました
##
## 次のパッケージを付け加えます: 'summarytools'
##
## 以下のオブジェクトは 'package:Hmisc' からマスクされています:
##
## label, label<-
##
## 以下のオブジェクトは 'package:tibble' からマスクされています:
##
## view
library(skimr)
library(car)
## 要求されたパッケージ carData をロード中です
##
## 次のパッケージを付け加えます: 'car'
##
## 以下のオブジェクトは 'package:rms' からマスクされています:
##
## Predict, vif
##
## 以下のオブジェクトは 'package:dplyr' からマスクされています:
##
## recode
##
## 以下のオブジェクトは 'package:purrr' からマスクされています:
##
## some
データ取り込みとcolの確認 csv1
df1<-read.csv("GustoW1.csv"
,stringsAsFactors=F) %>% select(-1,-AGE10) %>% mutate(region="west")
colnames(df1)
## [1] "DAY30" "SEX" "AGE" "A65" "KILLIP" "SHO" "DIA" "HYP"
## [9] "HRT" "ANT" "PMI" "HEI" "WEI" "HTN" "SMK" "LIP"
## [17] "PAN" "FAM" "STE" "ST4" "TTR" "ESAMP" "GRPL" "GRPS"
## [25] "REGL" "HIG" "region"
データ取り込みとcolの確認 csv2
df2<-read.csv("sample2.csv"
,stringsAsFactors=F) %>% select(-1) %>% mutate(region="sample2")
colnames(df2)
## [1] "day30" "sex" "age" "a65" "killip" "sho" "dia" "hyp"
## [9] "hrt" "ant" "pmi" "hei" "wei" "htn" "smk" "lip"
## [17] "pan" "fam" "ste" "st4" "ttr" "esamp" "grpl" "grps"
## [25] "regl" "hig" "region"
データ取り込みとcolの確認 csv4
df4<-read.csv("sample4.csv"
,stringsAsFactors=F) %>% select(-1) %>% mutate(region="sample4")
colnames(df4)
## [1] "DAY30" "SEX" "AGE" "A65" "KILLIP" "SHO" "DIA" "HYP"
## [9] "HRT" "ANT" "PMI" "HEI" "WEI" "HTN" "SMK" "LIP"
## [17] "PAN" "FAM" "STE" "ST4" "TTR" "ESAMP" "GRPL" "GRPS"
## [25] "REGL" "HIG" "region"
データ取り込みとcolの確認 csv5
df5<-read.csv("sample5.csv"
,stringsAsFactors=F) %>% select(-1) %>% mutate(region="sample5")
colnames(df5)
## [1] "DAY30" "SEX" "AGE" "A65" "KILLIP" "SHO" "DIA" "HYP"
## [9] "HRT" "ANT" "PMI" "HEI" "WEI" "HTN" "SMK" "LIP"
## [17] "PAN" "FAM" "STE" "ST4" "TTR" "ESAMP" "GRPL" "GRPS"
## [25] "REGL" "HIG" "region"
setdiff(colnames(df1), colnames(df2))
## [1] "DAY30" "SEX" "AGE" "A65" "KILLIP" "SHO" "DIA" "HYP"
## [9] "HRT" "ANT" "PMI" "HEI" "WEI" "HTN" "SMK" "LIP"
## [17] "PAN" "FAM" "STE" "ST4" "TTR" "ESAMP" "GRPL" "GRPS"
## [25] "REGL" "HIG"
setdiff(colnames(df2), colnames(df1))
## [1] "day30" "sex" "age" "a65" "killip" "sho" "dia" "hyp"
## [9] "hrt" "ant" "pmi" "hei" "wei" "htn" "smk" "lip"
## [17] "pan" "fam" "ste" "st4" "ttr" "esamp" "grpl" "grps"
## [25] "regl" "hig"
データのマージ
merged_df <- rbind(df1, df4, df5)
データの確認
dfSummary(merged_df) %>% view()
## Switching method to 'browser'
## Output file written: /var/folders/n9/tf_wmwpn3gl2t7l1cz4tqk000000gn/T//RtmpYRQI89/fileedde11544a96.html
テーブル作成、連続、因子のベクトル化まとめ
col_continuous = c("AGE", "HEI", "WEI")
col_factors = c("DAY30", "SEX", "A65", "KILLIP", "SHO", "DIA", "HYP", "HRT", "ANT", "PMI", "HTN", "SMK", "STE", "LIP", "PAN", "FAM", "ST4", "TTR", "HIG")
# 順序に従って変数を並べ替え
ordered_vars <- c("DAY30", "AGE", "A65", "SEX", "KILLIP", "SHO", "DIA", "HYP", "HRT", "ANT", "PMI", "HEI", "WEI", "SMK", "HTN", "LIP", "PAN", "FAM", "STE", "ST4", "TTR")
テーブル作成
CreateTableOne(vars = ordered_vars, factorVars = col_factors, data = merged_df, strata = "region") -> tableone
print(tableone, missing = T, test = F, explain = F)
## Stratified by region
## sample4 sample5 west Missing
## n 785 429 2188
## DAY30 = 1 52 ( 6.6) 24 ( 5.6) 135 ( 6.2) 0.0
## AGE 61.82 (11.12) 59.89 (11.77) 60.47 (12.03) 0.0
## A65 = 1 327 (41.7) 157 (36.6) 839 (38.3) 0.0
## SEX = 1 203 (25.9) 114 (26.6) 544 (24.9) 0.0
## KILLIP 0.0
## 1 616 (78.5) 367 (85.5) 1944 (88.8)
## 2 148 (18.9) 55 (12.8) 212 ( 9.7)
## 3 20 ( 2.5) 3 ( 0.7) 19 ( 0.9)
## 4 1 ( 0.1) 4 ( 0.9) 13 ( 0.6)
## SHO = 1 21 ( 2.7) 7 ( 1.6) 32 ( 1.5) 0.0
## DIA = 1 85 (10.8) 56 (13.1) 312 (14.3) 0.0
## HYP = 1 40 ( 5.1) 47 (11.0) 211 ( 9.6) 0.0
## HRT = 1 209 (26.6) 151 (35.2) 730 (33.4) 0.0
## ANT = 1 279 (35.5) 154 (35.9) 815 (37.2) 0.0
## PMI = 1 140 (17.8) 65 (15.2) 375 (17.1) 0.0
## HEI 169.24 (8.95) 171.91 (10.47) 172.13 (10.09) 0.0
## WEI 75.24 (14.00) 82.69 (18.83) 82.89 (17.69) 0.0
## SMK 0.0
## 1 262 (33.4) 184 (42.9) 903 (41.3)
## 2 306 (39.0) 127 (29.6) 674 (30.8)
## 3 217 (27.6) 118 (27.5) 611 (27.9)
## HTN = 1 295 (37.6) 168 (39.2) 883 (40.4) 0.0
## LIP = 1 305 (38.9) 148 (34.5) 886 (40.5) 0.0
## PAN = 1 295 (37.6) 134 (31.2) 745 (34.0) 0.0
## FAM = 1 325 (41.4) 205 (47.8) 1041 (47.6) 0.0
## STE 0.0
## 0 14 ( 1.8) 7 ( 1.6) 31 ( 1.4)
## 1 15 ( 1.9) 20 ( 4.7) 98 ( 4.5)
## 2 55 ( 7.0) 61 (14.2) 287 (13.1)
## 3 256 (32.6) 130 (30.3) 680 (31.1)
## 4 120 (15.3) 57 (13.3) 313 (14.3)
## 5 122 (15.5) 50 (11.7) 270 (12.3)
## 6 96 (12.2) 39 ( 9.1) 238 (10.9)
## 7 71 ( 9.0) 43 (10.0) 183 ( 8.4)
## 8 32 ( 4.1) 19 ( 4.4) 71 ( 3.2)
## 9 3 ( 0.4) 3 ( 0.7) 12 ( 0.5)
## 10 1 ( 0.1) 0 ( 0.0) 3 ( 0.1)
## 11 0 ( 0.0) 0 ( 0.0) 2 ( 0.1)
## ST4 = 1 325 (41.4) 154 (35.9) 779 (35.6) 0.0
## TTR = 1 395 (50.3) 263 (61.3) 1332 (60.9) 0.0
授業で紹介されたtbl_summary使ってみた。
merged_df %>%
select(DAY30, AGE, A65, SEX, KILLIP, SHO, DIA, HYP, HRT, ANT, PMI, HEI, WEI, SMK, HTN, LIP, PAN, FAM, STE, ST4, TTR, region) %>% #文字にも””いらない。factor入力不要わける変数も入れる必要がある。連続変数へのデフォルトはmedian(IQR)、指定は不要
tbl_summary(by=region #byでグループ化
#, statistic = list(all_continuous() ~ "{mean} ({sd})") # mean(SD)で記載したい
#, statistic = list(all_categorical() ~ "{n} / {N} ({p}%)")) # n(%)で記載したい
)
| Characteristic | sample4, N = 7851 | sample5, N = 4291 | west, N = 2,1881 |
|---|---|---|---|
| DAY30 | 52 (6.6%) | 24 (5.6%) | 135 (6.2%) |
| AGE | 63 (53, 71) | 60 (51, 69) | 61 (51, 70) |
| A65 | 327 (42%) | 157 (37%) | 839 (38%) |
| SEX | 203 (26%) | 114 (27%) | 544 (25%) |
| KILLIP | |||
| 1 | 616 (78%) | 367 (86%) | 1,944 (89%) |
| 2 | 148 (19%) | 55 (13%) | 212 (9.7%) |
| 3 | 20 (2.5%) | 3 (0.7%) | 19 (0.9%) |
| 4 | 1 (0.1%) | 4 (0.9%) | 13 (0.6%) |
| SHO | 21 (2.7%) | 7 (1.6%) | 32 (1.5%) |
| DIA | 85 (11%) | 56 (13%) | 312 (14%) |
| HYP | 40 (5.1%) | 47 (11%) | 211 (9.6%) |
| HRT | 209 (27%) | 151 (35%) | 730 (33%) |
| ANT | 279 (36%) | 154 (36%) | 815 (37%) |
| PMI | 140 (18%) | 65 (15%) | 375 (17%) |
| HEI | 170 (163, 175) | 173 (165, 180) | 173 (165, 180) |
| WEI | 75 (66, 84) | 80 (70, 92) | 82 (71, 92) |
| SMK | |||
| 1 | 262 (33%) | 184 (43%) | 903 (41%) |
| 2 | 306 (39%) | 127 (30%) | 674 (31%) |
| 3 | 217 (28%) | 118 (28%) | 611 (28%) |
| HTN | 295 (38%) | 168 (39%) | 883 (40%) |
| LIP | 305 (39%) | 148 (34%) | 886 (40%) |
| PAN | 295 (38%) | 134 (31%) | 745 (34%) |
| FAM | 325 (41%) | 205 (48%) | 1,041 (48%) |
| STE | 4 (3, 6) | 3 (3, 5) | 3 (3, 5) |
| ST4 | 325 (41%) | 154 (36%) | 779 (36%) |
| TTR | 395 (50%) | 263 (61%) | 1,332 (61%) |
| 1 n (%); Median (IQR) | |||
duplicated_rows_merged_df <- duplicated(merged_df)
num_duplicated_rows_merged_df <- sum(duplicated_rows_merged_df)
# 重複行の数を表示
cat("Number of duplicated rows in merged_df:", num_duplicated_rows_merged_df, "\n")
## Number of duplicated rows in merged_df: 0
# 重複行がある場合、それらの行を表示(オプション)
if (num_duplicated_rows_merged_df > 0) {
cat("Duplicated rows in merged_df:\n")
print(merged_df[duplicated_rows_merge_df, ])
}
dfSummary(merged_df)
## Data Frame Summary
## merged_df
## Dimensions: 3402 x 27
## Duplicates: 0
##
## ---------------------------------------------------------------------------------------------------------------
## No Variable Stats / Values Freqs (% of Valid) Graph Valid Missing
## ---- ------------- ------------------------- ---------------------- ---------------------- ---------- ---------
## 1 DAY30 Min : 0 0 : 3191 (93.8%) IIIIIIIIIIIIIIIIII 3402 0
## [integer] Mean : 0.1 1 : 211 ( 6.2%) I (100.0%) (0.0%)
## Max : 1
##
## 2 SEX Min : 0 0 : 2541 (74.7%) IIIIIIIIIIIIII 3402 0
## [integer] Mean : 0.3 1 : 861 (25.3%) IIIII (100.0%) (0.0%)
## Max : 1
##
## 3 AGE Mean (sd) : 60.7 (11.8) 2100 distinct values . : : . 3402 0
## [numeric] min < med < max: : : : : : (100.0%) (0.0%)
## 23.9 < 61.1 < 89.5 : : : : :
## IQR (CV) : 18.5 (0.2) . : : : : : :
## . : : : : : : : .
##
## 4 A65 Min : 0 0 : 2079 (61.1%) IIIIIIIIIIII 3402 0
## [integer] Mean : 0.4 1 : 1323 (38.9%) IIIIIII (100.0%) (0.0%)
## Max : 1
##
## 5 KILLIP Mean (sd) : 1.2 (0.4) 1 : 2927 (86.0%) IIIIIIIIIIIIIIIII 3402 0
## [integer] min < med < max: 2 : 415 (12.2%) II (100.0%) (0.0%)
## 1 < 1 < 4 3 : 42 ( 1.2%)
## IQR (CV) : 0 (0.4) 4 : 18 ( 0.5%)
##
## 6 SHO Min : 0 0 : 3342 (98.2%) IIIIIIIIIIIIIIIIIII 3402 0
## [integer] Mean : 0 1 : 60 ( 1.8%) (100.0%) (0.0%)
## Max : 1
##
## 7 DIA Min : 0 0 : 2949 (86.7%) IIIIIIIIIIIIIIIII 3402 0
## [integer] Mean : 0.1 1 : 453 (13.3%) II (100.0%) (0.0%)
## Max : 1
##
## 8 HYP Min : 0 0 : 3104 (91.2%) IIIIIIIIIIIIIIIIII 3402 0
## [integer] Mean : 0.1 1 : 298 ( 8.8%) I (100.0%) (0.0%)
## Max : 1
##
## 9 HRT Min : 0 0 : 2312 (68.0%) IIIIIIIIIIIII 3402 0
## [integer] Mean : 0.3 1 : 1090 (32.0%) IIIIII (100.0%) (0.0%)
## Max : 1
##
## 10 ANT Min : 0 0 : 2154 (63.3%) IIIIIIIIIIII 3402 0
## [integer] Mean : 0.4 1 : 1248 (36.7%) IIIIIII (100.0%) (0.0%)
## Max : 1
##
## 11 PMI Min : 0 0 : 2822 (83.0%) IIIIIIIIIIIIIIII 3402 0
## [integer] Mean : 0.2 1 : 580 (17.0%) III (100.0%) (0.0%)
## Max : 1
##
## 12 HEI Mean (sd) : 171.4 (10) 269 distinct values : 3402 0
## [numeric] min < med < max: : : . (100.0%) (0.0%)
## 140.9 < 172.6 < 205.7 . : : :
## IQR (CV) : 13 (0.1) : : : : :
## : : : : : : .
##
## 13 WEI Mean (sd) : 81.1 (17.4) 463 distinct values : : 3402 0
## [numeric] min < med < max: : : (100.0%) (0.0%)
## 36 < 80 < 180 : :
## IQR (CV) : 20.3 (0.2) : : : :
## . : : : : . .
##
## 14 HTN Min : 0 0 : 2056 (60.4%) IIIIIIIIIIII 3402 0
## [integer] Mean : 0.4 1 : 1346 (39.6%) IIIIIII (100.0%) (0.0%)
## Max : 1
##
## 15 SMK Mean (sd) : 1.9 (0.8) 1 : 1349 (39.7%) IIIIIII 3402 0
## [integer] min < med < max: 2 : 1107 (32.5%) IIIIII (100.0%) (0.0%)
## 1 < 2 < 3 3 : 946 (27.8%) IIIII
## IQR (CV) : 2 (0.4)
##
## 16 LIP Min : 0 0 : 2063 (60.6%) IIIIIIIIIIII 3402 0
## [integer] Mean : 0.4 1 : 1339 (39.4%) IIIIIII (100.0%) (0.0%)
## Max : 1
##
## 17 PAN Min : 0 0 : 2228 (65.5%) IIIIIIIIIIIII 3402 0
## [integer] Mean : 0.3 1 : 1174 (34.5%) IIIIII (100.0%) (0.0%)
## Max : 1
##
## 18 FAM Min : 0 0 : 1831 (53.8%) IIIIIIIIII 3402 0
## [integer] Mean : 0.5 1 : 1571 (46.2%) IIIIIIIII (100.0%) (0.0%)
## Max : 1
##
## 19 STE Mean (sd) : 4.1 (1.9) 12 distinct values : 3402 0
## [integer] min < med < max: : (100.0%) (0.0%)
## 0 < 4 < 11 : .
## IQR (CV) : 2 (0.5) : : : : . .
## : : : : : : : .
##
## 20 ST4 Min : 0 0 : 2144 (63.0%) IIIIIIIIIIII 3402 0
## [integer] Mean : 0.4 1 : 1258 (37.0%) IIIIIII (100.0%) (0.0%)
## Max : 1
##
## 21 TTR Min : 0 0 : 1412 (41.5%) IIIIIIII 3402 0
## [integer] Mean : 0.6 1 : 1990 (58.5%) IIIIIIIIIII (100.0%) (0.0%)
## Max : 1
##
## 22 ESAMP 1 distinct value 1 : 3402 (100.0%) IIIIIIIIIIIIIIIIIIII 3402 0
## [integer] (100.0%) (0.0%)
##
## 23 GRPL Mean (sd) : 2.1 (1.1) 1 : 1083 (31.8%) IIIIII 3402 0
## [integer] min < med < max: 2 : 1534 (45.1%) IIIIIIIII (100.0%) (0.0%)
## 1 < 2 < 4 4 : 785 (23.1%) IIII
## IQR (CV) : 1 (0.5)
##
## 24 GRPS Mean (sd) : 5.3 (3) 1 : 400 (11.8%) II 3402 0
## [integer] min < med < max: 2 : 259 ( 7.6%) I (100.0%) (0.0%)
## 1 < 5 < 11 3 : 282 ( 8.3%) I
## IQR (CV) : 3 (0.6) 4 : 329 ( 9.7%) I
## 5 : 858 (25.2%) IIIII
## 6 : 489 (14.4%) II
## 9 : 327 ( 9.6%) I
## 10 : 195 ( 5.7%) I
## 11 : 263 ( 7.7%) I
##
## 25 REGL Min : 1 1 : 2617 (76.9%) IIIIIIIIIIIIIII 3402 0
## [integer] Mean : 1.2 2 : 785 (23.1%) IIII (100.0%) (0.0%)
## Max : 2
##
## 26 HIG Min : 0 0 : 1771 (52.1%) IIIIIIIIII 3402 0
## [integer] Mean : 0.5 1 : 1631 (47.9%) IIIIIIIII (100.0%) (0.0%)
## Max : 1
##
## 27 region 1. sample4 785 (23.1%) IIII 3402 0
## [character] 2. sample5 429 (12.6%) II (100.0%) (0.0%)
## 3. west 2188 (64.3%) IIIIIIIIIIII
## ---------------------------------------------------------------------------------------------------------------
こちらの方が見やすいがHTML形式
dfSummary(merged_df) %>% view()
## Switching method to 'browser'
## Output file written: /var/folders/n9/tf_wmwpn3gl2t7l1cz4tqk000000gn/T//RtmpYRQI89/fileedde6d85192b.html
連続量にggplotを全部書くとこれ
merged_df |> #全体
select(col_continuous) |>
pivot_longer(cols = col_continuous, names_to = "name", values_to = "value") |>
ggplot()+
geom_histogram(aes(x = value), color = "black")+
facet_wrap(~ name, scales = "free", ncol = 5) +
theme_bw()+
theme(text = element_text(size = 12))
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(col_continuous)
##
## # Now:
## data %>% select(all_of(col_continuous))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
for (col_name in colnames(merged_df)) {
if (!(col_name %in% col_continuous)) {
merged_df[[col_name]] <- as.factor(merged_df[[col_name]])
}
}
変数が本当にfactorになったか確認
str(merged_df)
## 'data.frame': 3402 obs. of 27 variables:
## $ DAY30 : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 1 ...
## $ SEX : Factor w/ 2 levels "0","1": 1 1 1 2 1 1 1 1 1 2 ...
## $ AGE : num 70.3 59.8 59 80.4 64.8 ...
## $ A65 : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 2 1 ...
## $ KILLIP: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 2 1 1 ...
## $ SHO : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ DIA : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 1 1 1 ...
## $ HYP : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ HRT : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 1 1 ...
## $ ANT : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 2 2 1 ...
## $ PMI : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ HEI : num 177 172 170 155 167 ...
## $ WEI : num 84 115 76 50 97.4 100 82 90.9 96.4 61 ...
## $ HTN : Factor w/ 2 levels "0","1": 2 2 2 1 1 2 1 2 2 2 ...
## $ SMK : Factor w/ 3 levels "1","2","3": 3 1 1 3 1 1 2 2 2 2 ...
## $ LIP : Factor w/ 2 levels "0","1": 2 1 1 1 1 2 1 1 1 2 ...
## $ PAN : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 1 1 ...
## $ FAM : Factor w/ 2 levels "0","1": 1 2 2 1 2 2 1 2 2 2 ...
## $ STE : Factor w/ 12 levels "0","1","2","3",..: 2 7 4 4 3 5 4 8 9 4 ...
## $ ST4 : Factor w/ 2 levels "0","1": 1 2 1 1 1 1 1 2 2 1 ...
## $ TTR : Factor w/ 2 levels "0","1": 2 1 1 1 2 1 2 2 2 2 ...
## $ ESAMP : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
## $ GRPL : Factor w/ 3 levels "1","2","4": 2 1 1 1 1 1 1 1 1 1 ...
## $ GRPS : Factor w/ 9 levels "1","2","3","4",..: 5 2 6 2 6 6 6 1 1 6 ...
## $ REGL : Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
## $ HIG : Factor w/ 2 levels "0","1": 2 2 1 1 1 2 1 2 2 1 ...
## $ region: Factor w/ 3 levels "sample4","sample5",..: 3 3 3 3 3 3 3 3 3 3 ...
White(人種), Enrolled in United States(アメリカ人), Cerebrovascular disease, Pror bypass surgery, Prior angiography, sBP(sBP低い=HYPで代用), dBP, Infarct locatioはanteriorのみ, time courseに関しては変数なくtable化できず
col_continuous = c("AGE", "HEI", "WEI")
col_factors = c("DAY30", "SEX", "A65", "KILLIP", "SHO", "DIA", "HYP", "HRT", "ANT", "PMI", "HTN", "SMK", "STE", "LIP", "PAN", "FAM", "ST4", "TTR", "HIG")
# 順序に従って変数を並べ替え
ordered_vars <- c( "AGE","SEX","HEI","WEI", "HTN","DIA","SMK","LIP","FAM","PMI","PAN","HYP","HRT","SHO", "ANT","KILLIP")
CreateTableOne(vars = ordered_vars, factorVars = col_factors, data = merged_df, strata = "DAY30", addOverall = TRUE) -> tableone
print(tableone, missing =F , test = F, explain = T) #近藤さんのアドバイスよりadd_overall使ってみる
## Stratified by DAY30
## Overall 0 1
## n 3402 3191 211
## AGE (mean (SD)) 60.71 (11.80) 60.02 (11.56) 71.09 (10.62)
## SEX = 1 (%) 861 (25.3) 778 (24.4) 83 (39.3)
## HEI (mean (SD)) 171.44 (9.96) 171.68 (9.84) 167.69 (11.05)
## WEI (mean (SD)) 81.10 (17.36) 81.61 (17.23) 73.44 (17.57)
## HTN = 1 (%) 1346 (39.6) 1248 (39.1) 98 (46.4)
## DIA = 1 (%) 453 (13.3) 415 (13.0) 38 (18.0)
## SMK (%)
## 1 1349 (39.7) 1297 (40.6) 52 (24.6)
## 2 1107 (32.5) 1027 (32.2) 80 (37.9)
## 3 946 (27.8) 867 (27.2) 79 (37.4)
## LIP = 1 (%) 1339 (39.4) 1262 (39.5) 77 (36.5)
## FAM = 1 (%) 1571 (46.2) 1487 (46.6) 84 (39.8)
## PMI = 1 (%) 580 (17.0) 517 (16.2) 63 (29.9)
## PAN = 1 (%) 1174 (34.5) 1077 (33.8) 97 (46.0)
## HYP = 1 (%) 298 ( 8.8) 256 ( 8.0) 42 (19.9)
## HRT = 1 (%) 1090 (32.0) 982 (30.8) 108 (51.2)
## SHO = 1 (%) 60 ( 1.8) 32 ( 1.0) 28 (13.3)
## ANT = 1 (%) 1248 (36.7) 1127 (35.3) 121 (57.3)
## KILLIP (%)
## 1 2927 (86.0) 2798 (87.7) 129 (61.1)
## 2 415 (12.2) 361 (11.3) 54 (25.6)
## 3 42 ( 1.2) 24 ( 0.8) 18 ( 8.5)
## 4 18 ( 0.5) 8 ( 0.3) 10 ( 4.7)
Ageでモデル作成
fit_age <- glm(DAY30 ~ AGE, data = merged_df, family = binomial(link = "logit"))
anova_res_age <- anova(fit_age, test = "Chisq")
anova_res_age
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: DAY30
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 3401 1581.9
## AGE 1 190.91 3400 1391.0 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
chi2統計量算出
chi2_age <- round(anova_res_age$Deviance[2], 4)
chi2_age
## [1] 190.9144
df算出
df_age <- anova_res_age$Df[2]
df_age
## [1] 1
オッズ比・信頼区間・P値(wald)算出の準備
coef_table_age <- summary(fit_age)$coefficients
coef_table_age
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.99162103 0.549439156 -16.36509 3.395276e-60
## AGE 0.09517668 0.007746973 12.28566 1.081576e-34
オッズ比
OR_age <- round(exp(coef_table_age[2, 1]), 4)
OR_age
## [1] 1.0999
信頼区間
lower_ci_age <- round(exp(coef_table_age[2, 1] - 1.96 * coef_table_age[2, 2]), 4)
upper_ci_age <- round(exp(coef_table_age[2, 1] + 1.96 * coef_table_age[2, 2]), 4)
lower_ci_age
## [1] 1.0833
upper_ci_age
## [1] 1.1167
wald P値
wald_p_value_age <- round(coef_table_age[2, 4], 4)
wald_p_value_age
## [1] 0
課題5準備の内容を一気にforループでまとめる
# 各変数とDAY30との関連を調べる
variables <- c("AGE", "SEX", "DIA", "HYP", "HRT", "ANT", "PMI", "HTN", "SMK", "LIP", "PAN", "FAM", "TTR", "STE", "WEI", "HEI")
results <- data.frame()
for (var in variables) {
# 単変量ロジスティック回帰
fit <- glm(formula(paste("DAY30 ~", var)), data = merged_df, family = binomial(link = "logit"))
# カイ2乗統計量、p値、自由度を計算
anova_res <- anova(fit, test = "Chisq")
chi2 <- round(anova_res$Deviance[2], 4)
df <- anova_res$Df[2]
# オッズ比と95%信頼区間を計算
coef_table <- summary(fit)$coefficients
OR <- round(exp(coef_table[2, 1]), 4)
lower_ci <- round(exp(coef_table[2, 1] - 1.96 * coef_table[2, 2]), 4)
upper_ci <- round(exp(coef_table[2, 1] + 1.96 * coef_table[2, 2]), 4)
# Waldのp値を計算
wald_p_value <- round(coef_table[2, 4], 4)
# 結果をデータフレームにまとめる
temp_df <- data.frame(Variable = var, Chi2 = chi2, Df = df, P_value = wald_p_value, Odds_Ratio = OR, Lower_CI = lower_ci, Upper_CI = upper_ci)
results <- rbind(results, temp_df)
}
# 結果を表示
print(results)
## Variable Chi2 Df P_value Odds_Ratio Lower_CI Upper_CI
## 1 AGE 190.9144 1 0.0000 1.0999 1.0833 1.1167
## 2 SEX 21.4178 1 0.0000 2.0112 1.5082 2.6818
## 3 DIA 3.9551 1 0.0393 1.4693 1.0190 2.1186
## 4 HYP 27.1392 1 0.0000 2.8493 1.9853 4.0891
## 5 HRT 35.3915 1 0.0000 2.3587 1.7823 3.1215
## 6 ANT 39.5905 1 0.0000 2.4622 1.8566 3.2654
## 7 PMI 22.5992 1 0.0000 2.2017 1.6156 3.0003
## 8 HTN 4.3860 1 0.0354 1.3502 1.0208 1.7860
## 9 SMK 23.3759 2 0.0003 1.9429 1.3574 2.7810
## 10 LIP 0.7810 1 0.3792 0.8783 0.6578 1.1728
## 11 PAN 12.5858 1 0.0003 1.6702 1.2616 2.2109
## 12 FAM 3.7029 1 0.0560 0.7579 0.5704 1.0072
## 13 TTR 9.9844 1 0.0020 1.6028 1.1881 2.1622
## 14 STE 33.0947 11 0.8912 0.9074 0.2255 3.6511
## 15 WEI 48.4413 1 0.0000 0.9689 0.9599 0.9780
## 16 HEI 31.0721 1 0.0000 0.9618 0.9488 0.9750
# ロジスティック回帰モデルの構築
logistic_model <- glm(DAY30 ~ AGE + SEX + DIA + HYP + HRT + ANT + PMI + HTN + SMK + LIP + PAN + FAM + TTR + STE + WEI + HEI, family=binomial, data=merged_df)
# VIFを計算
vif_values <- vif(logistic_model)
print(vif_values)
## GVIF Df GVIF^(1/(2*Df))
## AGE 1.403436 1 1.184667
## SEX 2.367537 1 1.538680
## DIA 1.093940 1 1.045916
## HYP 1.089308 1 1.043699
## HRT 1.089584 1 1.043831
## ANT 1.441947 1 1.200811
## PMI 1.201310 1 1.096043
## HTN 1.118969 1 1.057813
## SMK 1.328080 2 1.073510
## LIP 1.099194 1 1.048425
## PAN 1.207484 1 1.098856
## FAM 1.081687 1 1.040042
## TTR 1.018052 1 1.008986
## STE 1.694841 11 1.024271
## WEI 1.809314 1 1.345107
## HEI 2.791547 1 1.670792
相関係数を出してみる。ただ相関係数はfacでは回らない。num変換したdfを作成ておく。
merged_df_num <- merged_df
# 数値でない変数を数値に変換
numeric_columns <- c("DAY30", "SEX", "A65", "KILLIP", "SHO", "DIA", "HYP", "HRT", "ANT", "PMI", "HTN", "SMK", "LIP", "PAN", "FAM", "STE", "ST4", "TTR", "ESAMP", "GRPL", "GRPS", "REGL", "HIG", "region")
merged_df_num[, numeric_columns] <- lapply(merged_df_num[, numeric_columns], function(x) as.numeric(as.character(x)))
## Warning in FUN(X[[i]], ...): 強制変換により NA が生成されました
# 相関係数を計算
cor_matrix <- cor(merged_df_num[,c('AGE','SEX','DIA','HYP','HRT','ANT','PMI','HTN','SMK','LIP','PAN','FAM','TTR','STE','WEI','HEI')])
print(cor_matrix)
## AGE SEX DIA HYP HRT ANT
## AGE 1.00000000 0.240956598 0.037322830 0.015698354 -0.03512409 0.045916736
## SEX 0.24095660 1.000000000 0.046469205 0.063565347 0.05380418 -0.004000619
## DIA 0.03732283 0.046469205 1.000000000 0.019339382 0.09429913 0.040967111
## HYP 0.01569835 0.063565347 0.019339382 1.000000000 -0.07237066 -0.054624260
## HRT -0.03512409 0.053804184 0.094299128 -0.072370662 1.00000000 0.133504992
## ANT 0.04591674 -0.004000619 0.040967111 -0.054624260 0.13350499 1.000000000
## PMI 0.10224262 -0.035578399 0.068488708 -0.004991698 0.03713332 -0.025574945
## HTN 0.16212032 0.138737860 0.118140775 -0.027437318 0.04862023 -0.022165380
## SMK 0.35192754 0.100646744 0.059253601 -0.046948927 -0.03633378 0.078673404
## LIP -0.06968258 0.038910236 0.013641363 0.003638442 -0.01291370 -0.030215534
## PAN 0.10226935 0.039644019 0.063103963 -0.012766589 0.03690381 -0.013693939
## FAM -0.16645320 0.012749203 0.003141893 -0.011705654 0.01851360 -0.044422501
## TTR -0.01774476 -0.044788132 0.028125637 0.049979606 0.03887055 0.025972556
## STE 0.01680141 0.016356893 0.031046738 -0.032695313 0.10885496 0.510638970
## WEI -0.30268768 -0.387028843 0.118447011 -0.046279089 0.06061579 -0.025924615
## HEI -0.23843175 -0.682184760 -0.050941331 -0.031410087 -0.03419494 -0.006086802
## PMI HTN SMK LIP PAN FAM
## AGE 0.102242625 0.16212032 0.35192754 -0.0696825817 0.10226935 -0.166453202
## SEX -0.035578399 0.13873786 0.10064674 0.0389102360 0.03964402 0.012749203
## DIA 0.068488708 0.11814078 0.05925360 0.0136413633 0.06310396 0.003141893
## HYP -0.004991698 -0.02743732 -0.04694893 0.0036384424 -0.01276659 -0.011705654
## HRT 0.037133324 0.04862023 -0.03633378 -0.0129136961 0.03690381 0.018513600
## ANT -0.025574945 -0.02216538 0.07867340 -0.0302155337 -0.01369394 -0.044422501
## PMI 1.000000000 0.05838171 0.02856943 0.0859430422 0.33351784 0.045724419
## HTN 0.058381709 1.00000000 0.12014826 0.1356254107 0.11444384 0.058401276
## SMK 0.028569425 0.12014826 1.00000000 0.0300686924 0.06548009 -0.065943337
## LIP 0.085943042 0.13562541 0.03006869 1.0000000000 0.08976143 0.135974891
## PAN 0.333517843 0.11444384 0.06548009 0.0897614336 1.00000000 0.060601067
## FAM 0.045724419 0.05840128 -0.06594334 0.1359748914 0.06060107 1.000000000
## TTR 0.039228723 -0.00529810 0.00127339 -0.0003029414 0.01665045 0.010821685
## STE -0.045396756 0.01749708 0.03234702 -0.0385377051 -0.02558700 -0.041456472
## WEI 0.008278587 0.05311352 -0.08039774 0.0407625308 -0.01959279 0.049680874
## HEI -0.014284866 -0.10324884 -0.13239462 -0.0189568298 -0.05170059 0.012901563
## TTR STE WEI HEI
## AGE -0.0177447577 0.016801406 -0.302687676 -0.238431747
## SEX -0.0447881317 0.016356893 -0.387028843 -0.682184760
## DIA 0.0281256374 0.031046738 0.118447011 -0.050941331
## HYP 0.0499796061 -0.032695313 -0.046279089 -0.031410087
## HRT 0.0388705483 0.108854961 0.060615787 -0.034194943
## ANT 0.0259725555 0.510638970 -0.025924615 -0.006086802
## PMI 0.0392287235 -0.045396756 0.008278587 -0.014284866
## HTN -0.0052980997 0.017497080 0.053113520 -0.103248838
## SMK 0.0012733900 0.032347021 -0.080397739 -0.132394618
## LIP -0.0003029414 -0.038537705 0.040762531 -0.018956830
## PAN 0.0166504520 -0.025587004 -0.019592785 -0.051700592
## FAM 0.0108216849 -0.041456472 0.049680874 0.012901563
## TTR 1.0000000000 0.050510937 0.034448781 0.030388885
## STE 0.0505109370 1.000000000 -0.032954770 -0.007745443
## WEI 0.0344487805 -0.032954770 1.000000000 0.543993244
## HEI 0.0303888848 -0.007745443 0.543993244 1.000000000
GVIFは一般化VIFであり複数の自由度をもつ変数に対して適用可能。 GVIF^(1/(2Df))はGVIFの変数を自由度で調整した指標。 GVIF^(1/(2Df))で確認するのが実務的であり、5以上で多重共線性ありと考えられる。 しかし、本結果ではどの因子も5未満であるためすべての変数は投入可能と考えられる。
# datadistオプションを設定
options(datadist = "ddist")
# データを用意
ddist <- datadist(merged_df)
# 年齢と死亡のlog(odds)の関係を4つのKnotを持つRCSでモデル化 glm関数ではまわせなかった。
fit_age <- lrm(DAY30 ~ rcs(AGE, 4), data = merged_df, x = TRUE, y = TRUE)
AIC(fit_age)
## [1] 1392.673
plot(rms::Predict(fit_age, AGE))
# 体重と死亡のlog(odds)の関係を4つのKnotを持つRCSでモデル化 glm関数ではまわせなかった。
fit_wei <- lrm(DAY30 ~ rcs(WEI, 4), data = merged_df, x = TRUE, y = TRUE)
AIC(fit_wei)
## [1] 1531.48
plot(rms::Predict(fit_wei, WEI))
# 身長と死亡のlog(odds)の関係を4つのKnotを持つRCSでモデル化 glm関数ではまわせなかった。
fit_hei <- lrm(DAY30 ~ rcs(HEI, 4), data = merged_df, x = TRUE, y = TRUE)
AIC(fit_hei)
## [1] 1555.067
plot(rms::Predict(fit_hei, HEI))
lrmでやってみる
# 交互作用なしのモデル
fit1 <- lrm(DAY30 ~ AGE + SEX, data = merged_df)
# 交互作用ありのモデル
fit2 <- lrm(DAY30 ~ AGE * SEX, data = merged_df)
# 尤度比検定
lrm_result <- lrtest(fit1, fit2)
print(lrm_result)
##
## Model 1: DAY30 ~ AGE + SEX
## Model 2: DAY30 ~ AGE * SEX
##
## L.R. Chisq d.f. P
## 1.0128974 1.0000000 0.3142097
glmでもやってみる
# 交互作用なしのモデル
fit1_glm <- glm(DAY30 ~ AGE + SEX, data = merged_df, family = binomial())
# 交互作用ありのモデル
fit2_glm <- glm(DAY30 ~ AGE * SEX, data = merged_df, family = binomial())
# 尤度比検定
glm_result <- anova(fit1_glm, fit2_glm, test = "Chisq")
print(glm_result)
## Analysis of Deviance Table
##
## Model 1: DAY30 ~ AGE + SEX
## Model 2: DAY30 ~ AGE * SEX
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 3399 1389.3
## 2 3398 1388.3 1 1.0129 0.3142
以上の結果から交互作用のありモデルとなしモデルではp=0.31と有意差は認めず、年齢と性別の交互作用モデルは用いる必要はないと判断される。
年齢・性別の交互作用は不要。 年齢のスプラインは直線でありスプライン変数は不要。 体重は曲線でありスプライン変数が必要。 身長も線形でありスプライン変数は不要。 以上よりロジスティクモデルを作成する。
# 'WEI'のスプライン変数を考慮したモデル
fit_final <- lrm(DAY30 ~ AGE + SEX + DIA + HYP + HRT + ANT + PMI + HTN + SMK + LIP + PAN + FAM + TTR + STE + rcs(WEI, 4) + HEI, data = merged_df)
# 推定値(回帰係数、標準誤差、オッズ比、95%信頼区間、P値)の表示
summary(fit_final)
## Effects Response : DAY30
##
## Factor Low High Diff. Effect S.E. Lower 0.95 Upper 0.95
## AGE 51.479 69.949 18.471 1.731500 0.17689 1.3848e+00 2.0782e+00
## Odds Ratio 51.479 69.949 18.471 5.649100 NA 3.9940e+00 7.9900e+00
## WEI 70.000 90.300 20.300 -0.403240 0.22871 -8.5151e-01 4.5023e-02
## Odds Ratio 70.000 90.300 20.300 0.668150 NA 4.2677e-01 1.0461e+00
## HEI 165.000 178.000 13.000 -0.021745 0.16102 -3.3734e-01 2.9385e-01
## Odds Ratio 165.000 178.000 13.000 0.978490 NA 7.1367e-01 1.3416e+00
## SEX - 1:0 1.000 2.000 NA -0.194630 0.24798 -6.8067e-01 2.9140e-01
## Odds Ratio 1.000 2.000 NA 0.823140 NA 5.0628e-01 1.3383e+00
## DIA - 1:0 1.000 2.000 NA 0.212020 0.21387 -2.0715e-01 6.3120e-01
## Odds Ratio 1.000 2.000 NA 1.236200 NA 8.1290e-01 1.8799e+00
## HYP - 1:0 1.000 2.000 NA 1.374400 0.21275 9.5740e-01 1.7913e+00
## Odds Ratio 1.000 2.000 NA 3.952600 NA 2.6049e+00 5.9975e+00
## HRT - 1:0 1.000 2.000 NA 0.859220 0.16095 5.4377e-01 1.1747e+00
## Odds Ratio 1.000 2.000 NA 2.361300 NA 1.7225e+00 3.2371e+00
## ANT - 1:0 1.000 2.000 NA 0.538310 0.18513 1.7546e-01 9.0115e-01
## Odds Ratio 1.000 2.000 NA 1.713100 NA 1.1918e+00 2.4624e+00
## PMI - 1:0 1.000 2.000 NA 0.579230 0.18979 2.0725e-01 9.5122e-01
## Odds Ratio 1.000 2.000 NA 1.784700 NA 1.2303e+00 2.5889e+00
## HTN - 1:0 1.000 2.000 NA 0.038377 0.16344 -2.8197e-01 3.5872e-01
## Odds Ratio 1.000 2.000 NA 1.039100 NA 7.5430e-01 1.4315e+00
## SMK - 2:1 1.000 2.000 NA -0.018901 0.20997 -4.3043e-01 3.9263e-01
## Odds Ratio 1.000 2.000 NA 0.981280 NA 6.5023e-01 1.4809e+00
## SMK - 3:1 1.000 3.000 NA -0.076767 0.21634 -5.0079e-01 3.4726e-01
## Odds Ratio 1.000 3.000 NA 0.926110 NA 6.0605e-01 1.4152e+00
## LIP - 1:0 1.000 2.000 NA 0.091396 0.16704 -2.3599e-01 4.1878e-01
## Odds Ratio 1.000 2.000 NA 1.095700 NA 7.8979e-01 1.5201e+00
## PAN - 1:0 1.000 2.000 NA 0.169890 0.17071 -1.6471e-01 5.0448e-01
## Odds Ratio 1.000 2.000 NA 1.185200 NA 8.4814e-01 1.6561e+00
## FAM - 1:0 1.000 2.000 NA 0.016496 0.16314 -3.0325e-01 3.3625e-01
## Odds Ratio 1.000 2.000 NA 1.016600 NA 7.3841e-01 1.3997e+00
## TTR - 0:1 2.000 1.000 NA -0.440990 0.16533 -7.6504e-01 -1.1695e-01
## Odds Ratio 2.000 1.000 NA 0.643400 NA 4.6532e-01 8.8963e-01
## STE - 0:3 4.000 1.000 NA 0.393950 0.69796 -9.7403e-01 1.7619e+00
## Odds Ratio 4.000 1.000 NA 1.482800 NA 3.7756e-01 5.8237e+00
## STE - 1:3 4.000 2.000 NA 0.466030 0.44925 -4.1448e-01 1.3465e+00
## Odds Ratio 4.000 2.000 NA 1.593700 NA 6.6068e-01 3.8441e+00
## STE - 2:3 4.000 3.000 NA 0.046583 0.31327 -5.6742e-01 6.6059e-01
## Odds Ratio 4.000 3.000 NA 1.047700 NA 5.6698e-01 1.9359e+00
## STE - 4:3 4.000 5.000 NA 0.309310 0.27032 -2.2051e-01 8.3912e-01
## Odds Ratio 4.000 5.000 NA 1.362500 NA 8.0211e-01 2.3143e+00
## STE - 5:3 4.000 6.000 NA 0.381850 0.27012 -1.4757e-01 9.1128e-01
## Odds Ratio 4.000 6.000 NA 1.465000 NA 8.6280e-01 2.4875e+00
## STE - 6:3 4.000 7.000 NA 0.356020 0.29238 -2.1703e-01 9.2907e-01
## Odds Ratio 4.000 7.000 NA 1.427600 NA 8.0490e-01 2.5321e+00
## STE - 7:3 4.000 8.000 NA 1.054500 0.30224 4.6215e-01 1.6469e+00
## Odds Ratio 4.000 8.000 NA 2.870600 NA 1.5875e+00 5.1909e+00
## STE - 8:3 4.000 9.000 NA 0.354840 0.41955 -4.6747e-01 1.1772e+00
## Odds Ratio 4.000 9.000 NA 1.426000 NA 6.2659e-01 3.2451e+00
## STE - 9:3 4.000 10.000 NA 0.953430 0.80023 -6.1499e-01 2.5218e+00
## Odds Ratio 4.000 10.000 NA 2.594600 NA 5.4064e-01 1.2452e+01
## STE - 10:3 4.000 11.000 NA 1.127600 1.27000 -1.3614e+00 3.6167e+00
## Odds Ratio 4.000 11.000 NA 3.088300 NA 2.5629e-01 3.7214e+01
## STE - 11:3 4.000 12.000 NA -4.805700 55.68800 -1.1395e+02 1.0434e+02
## Odds Ratio 4.000 12.000 NA 0.008183 NA 3.2478e-50 2.0618e+45