# ライブラリーの読み込み
pacman::p_load(foreign,
tidyverse,
Hmisc,
rms,
mice,
naniar,
gtsummary,
corrr,
survival,
survminer)
1-1. foreign packageのread.spssコマンドを使いデータを読み込みます。
# データの読み込み
dat <- read.spss("SMARTs.sav",
use.value.labels = FALSE,
to.data.frame = TRUE)
1-2. SMARTに含まれる変数を確認しましょう。
# データの確認→全てdouble型
glimpse(dat)
## Rows: 3,873
## Columns: 29
## $ TEVENT <dbl> 3466, 3465, 3465, 2445, 3463, 1642, 991, 1970, 2623, 3131, 34…
## $ EVENT <dbl> 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ SEX <dbl> 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1…
## $ AGE <dbl> 71, 46, 59, 76, 57, 52, 66, 72, 75, 53, 40, 75, 63, 56, 66, 5…
## $ DIABETES <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ CEREBRAL <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0…
## $ CARDIAC <dbl> 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AAA <dbl> 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ PERIPH <dbl> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1…
## $ STENOSIS <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0…
## $ SYSTBP <dbl> 185, 135, 149, 140, 177, 164, 166, 131, 175, 174, 144, 156, 1…
## $ DIASTBP <dbl> 86, 82, 97, 74, 96, 73, 70, 76, 68, 89, 84, 79, 86, 79, 83, 7…
## $ SYSTH <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DIASTH <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ LENGTHO <dbl> 1.70, 1.72, 1.77, 1.82, 1.90, 1.73, 1.65, 1.60, 1.70, 1.83, 1…
## $ WEIGHTO <dbl> 69, 71, 82, 98, 107, 103, 66, 80, 80, 75, 62, 70, 95, 70, 74,…
## $ BMIO <dbl> 23.88, 24.00, 26.17, 29.59, 29.64, 34.41, 24.24, 31.25, 27.68…
## $ CHOLO <dbl> 6.0, 4.0, 4.9, 5.3, 5.2, 4.6, 5.8, 6.1, 4.4, 6.0, 4.6, 6.2, 5…
## $ HDLO <dbl> 0.94, 1.26, 1.28, 1.00, 0.81, 0.95, 0.87, 1.32, 0.55, 0.95, 1…
## $ LDLO <dbl> 4.03, 2.23, 3.17, 3.40, NA, 2.38, 3.33, 4.39, 2.26, 3.82, 2.5…
## $ TRIGO <dbl> 2.29, 1.13, 1.01, 2.01, 4.76, 2.82, 3.55, 0.87, 3.54, 2.74, 1…
## $ HOMOCO <dbl> NA, 8.7, NA, NA, NA, NA, NA, NA, NA, NA, 13.3, NA, NA, NA, NA…
## $ GLUTO <dbl> 6.3, 4.7, 6.1, 5.0, 6.1, 10.2, 5.4, 5.3, 5.9, 10.8, 5.0, 4.8,…
## $ CREATO <dbl> 95, 66, 93, 79, 91, 180, 81, 101, 157, 86, 56, 96, 89, 84, 84…
## $ IMTO <dbl> 0.82, 0.57, 0.83, 1.45, 1.07, 0.97, 1.15, 0.70, 1.48, 0.87, 0…
## $ albumin <dbl> 1, 1, 1, NA, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, …
## $ SMOKING <dbl> 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1…
## $ packyrs <dbl> 22.5, 0.0, 33.3, 49.0, 53.2, 54.6, 72.8, 28.6, 16.5, 29.7, 21…
## $ alcohol <dbl> 3, 1, 3, 3, 3, 1, 3, 2, 1, 3, 2, 1, 3, 3, 3, 3, 3, 2, 3, 1, 3…
# factor型にすべきものをする
# TEVENTは日数なので整数にする(必要あるか?)
dat <- dat %>%
mutate(across(.cols = c(SEX, DIABETES, CEREBRAL, CARDIAC,
AAA, PERIPH, STENOSIS, albumin, SMOKING, alcohol),
.fns = factor)) %>%
mutate(across(.cols = TEVENT,
.fns = round))
# 変数のまとめ
# Table 23.3と比べると、femaleが一人だけ多いが・・・
tbl_1 <- dat %>%
dplyr::select(-TEVENT, -EVENT) %>%
dplyr::select(SEX, AGE, SMOKING, packyrs, alcohol, BMIO, DIABETES,
SYSTH, SYSTBP, DIASTH, DIASTBP, CHOLO, HDLO, LDLO, TRIGO,
CEREBRAL, CARDIAC, PERIPH, AAA, HOMOCO, GLUTO, CREATO,
albumin, IMTO, STENOSIS) %>%
tbl_summary(missing_text = "Missing") %>%
add_n() %>%
modify_header(label ~ "**Variable**") %>%
modify_caption("**Table1. Baseline Characteristics**") %>%
bold_labels()
tbl_1
| Variable | N | N = 3,8731 |
|---|---|---|
| SEX | 3,873 | |
| 1 | 2,897 (75%) | |
| 2 | 976 (25%) | |
| AGE | 3,873 | 60 (52, 68) |
| SMOKING | 3,848 | |
| 1 | 693 (18%) | |
| 2 | 2,711 (70%) | |
| 3 | 444 (12%) | |
| Missing | 25 | |
| packyrs | 3,852 | 20 (6, 34) |
| Missing | 21 | |
| alcohol | 3,848 | |
| 1 | 751 (20%) | |
| 2 | 408 (11%) | |
| 3 | 2,689 (70%) | |
| Missing | 25 | |
| BMIO | 3,870 | 26.3 (24.1, 28.7) |
| Missing | 3 | |
| DIABETES | 3,833 | |
| 0 | 2,987 (78%) | |
| 1 | 846 (22%) | |
| Missing | 40 | |
| SYSTH | 2,375 | 140 (126, 156) |
| Missing | 1,498 | |
| SYSTBP | 2,650 | 139 (127, 154) |
| Missing | 1,223 | |
| DIASTH | 2,374 | 82 (75, 90) |
| Missing | 1,499 | |
| DIASTBP | 2,652 | 79 (73, 86) |
| Missing | 1,221 | |
| CHOLO | 3,855 | 5.10 (4.40, 5.90) |
| Missing | 18 | |
| HDLO | 3,843 | 1.17 (0.96, 1.42) |
| Missing | 30 | |
| LDLO | 3,657 | 3.06 (2.39, 3.83) |
| Missing | 216 | |
| TRIGO | 3,845 | 1.54 (1.12, 2.23) |
| Missing | 28 | |
| CEREBRAL | 3,873 | |
| 0 | 2,726 (70%) | |
| 1 | 1,147 (30%) | |
| CARDIAC | 3,873 | |
| 0 | 1,713 (44%) | |
| 1 | 2,160 (56%) | |
| PERIPH | 3,873 | |
| 0 | 2,933 (76%) | |
| 1 | 940 (24%) | |
| AAA | 3,873 | |
| 0 | 3,457 (89%) | |
| 1 | 416 (11%) | |
| HOMOCO | 3,410 | 12.8 (10.3, 15.7) |
| Missing | 463 | |
| GLUTO | 3,854 | 5.70 (5.30, 6.50) |
| Missing | 19 | |
| CREATO | 3,856 | 89 (78, 101) |
| Missing | 17 | |
| albumin | 3,666 | |
| 1 | 2,897 (79%) | |
| 2 | 655 (18%) | |
| 3 | 114 (3.1%) | |
| Missing | 207 | |
| IMTO | 3,775 | 0.88 (0.75, 1.07) |
| Missing | 98 | |
| STENOSIS | 3,780 | |
| 0 | 3,058 (81%) | |
| 1 | 722 (19%) | |
| Missing | 93 | |
| 1 n (%); Median (IQR) | ||
1-3. 連続変数間(特に血圧)の相関を確認しましょう。
# ここではNAはomitする
dat_cor <- dat %>%
dplyr::select(where(is.double)) %>%
dplyr::select(-TEVENT) %>%
drop_na() %>%
correlate() %>%
shave()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
dat_cor
## # A tibble: 18 × 19
## term EVENT AGE SYSTBP DIASTBP SYSTH DIASTH LENGTHO WEIGHTO
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EVENT NA NA NA NA NA NA NA NA
## 2 AGE 0.175 NA NA NA NA NA NA NA
## 3 SYSTBP 0.0802 0.322 NA NA NA NA NA NA
## 4 DIASTBP 0.000478 0.0306 0.665 NA NA NA NA NA
## 5 SYSTH 0.108 0.288 0.693 0.454 NA NA NA NA
## 6 DIASTH 0.0252 -0.0624 0.408 0.598 0.682 NA NA NA
## 7 LENGTHO -0.0102 -0.163 -0.118 0.0532 -0.133 -4.26e-4 NA NA
## 8 WEIGHTO -0.0259 -0.149 -0.0289 0.125 -0.0622 8.53e-2 0.539 NA
## 9 BMIO -0.0175 -0.0584 0.0473 0.108 0.0183 9.97e-2 -0.0807 0.791
## 10 CHOLO -0.0305 0.0253 0.0539 0.0591 0.0409 6.62e-2 -0.117 -0.104
## 11 HDLO -0.0997 0.0891 0.121 0.0615 0.0966 6.86e-2 -0.163 -0.286
## 12 LDLO -0.00669 0.0171 0.00509 0.0226 -0.00176 3.50e-2 -0.0715 -0.0600
## 13 TRIGO 0.0358 -0.0738 0.0214 0.0530 0.0283 2.86e-2 0.0274 0.176
## 14 HOMOCO 0.147 0.225 0.152 0.114 0.140 9.03e-2 -0.0169 -0.0442
## 15 GLUTO 0.0850 0.0987 0.164 0.0632 0.117 -1.45e-2 -0.0353 0.146
## 16 CREATO 0.194 0.0152 0.0514 0.0457 0.0666 5.81e-2 0.0168 -0.0738
## 17 IMTO 0.171 0.357 0.247 0.0422 0.294 3.61e-2 -0.00483 0.0565
## 18 packyrs 0.0579 0.0603 0.0194 0.0131 0.0251 -7.91e-3 0.0992 0.142
## # ℹ 10 more variables: BMIO <dbl>, CHOLO <dbl>, HDLO <dbl>, LDLO <dbl>,
## # TRIGO <dbl>, HOMOCO <dbl>, GLUTO <dbl>, CREATO <dbl>, IMTO <dbl>,
## # packyrs <dbl>
# やはり血圧同士の相関はつよめ。
plot_cor <- rplot(dat_cor)
plot_cor
1-4. 欠測パターンを確認しましょう。
# Hmiscパッケージ
na.patterns <- naclus(dat)
naplot1 <- plot(na.patterns, ylab = "Fraction of NAs in common")
naplot1
## NULL
naplot2 <- naplot(na.patterns, which = c("na per var"))
naplot2
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 924 197 1975 637 91 16 14 6 6 2 2 3
naplot3 <- naplot(na.patterns, which = c("na per obs"))
naplot3
##
## 0 1 2 3 4 5 6 7 8 9 10 11
## 924 197 1975 637 91 16 14 6 6 2 2 3
# naniarパッケージ
naplot4 <- gg_miss_var(dat, show_pct = TRUE)
naplot4
naplot5 <- vis_miss(dat)
naplot5
2-1. TEVENT(観察時間)の情報を用いて、ヒストグラム、総追跡人年、追跡期間の平均と中央値と求めましょう。
# TEVENTはの単位はday
# AGEのヒストグラムを作成
hist_TEVENT <- ggplot(data = dat) +
geom_histogram(aes(x = TEVENT)) +
theme_classic()
hist_TEVENT
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# 総追跡人年
total_person_year <- round(sum(dat$TEVENT)/365.25, 2)
cat("total person-year is ", total_person_year, "\n")
## total person-year is 14530.62
# 追跡期間の平均と中央値
summary(dat$TEVENT)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 555 1213 1370 2165 3466
2-2.
EVENTとTEVENTの情報から、カプランマイヤーカーブを描きましょう。
追跡期間を無視した累積罹患率とカプランマイヤー法による累積罹患率を比べてみましょう。
surv_fit_1 <- survfit(Surv(time = TEVENT, event = EVENT) ~ 1,
data = dat)
# 生命表(6m, 1y, 3y, 5y, 7yを抜粋)
summary(surv_fit_1, times = c(180, 360, 360*3, 360*5, 360*7))
## Call: survfit(formula = Surv(time = TEVENT, event = EVENT) ~ 1, data = dat)
##
## time n.risk n.event survival std.err lower 95% CI upper 95% CI
## 180 3531 86 0.977 0.00244 0.972 0.982
## 360 3187 58 0.961 0.00323 0.954 0.967
## 1080 2122 129 0.915 0.00502 0.905 0.925
## 1800 1314 99 0.862 0.00703 0.848 0.876
## 2520 594 61 0.808 0.00952 0.789 0.827
# KMカーブ
KMcurve_1 <- ggsurvplot(surv_fit_1, data = dat)
KMcurve_1
2-3. 次に性別毎のカプランマイヤーカーブを描きましょう。
surv_fit_2 <- survfit(Surv(time = TEVENT, event = EVENT) ~ SEX,
data = dat)
KMcurve_2 <- ggsurvplot(surv_fit_2, data = dat, pval = TRUE)
KMcurve_2
2-4. いくつかの変数を使ってCoxモデルを当てはめてみましょう。
# 次章で欠測値の処理について検討するので、ここではcomplete case analysisとした。
dat_comp <- dat %>%
dplyr::select(TEVENT, EVENT,
SEX, AGE, SMOKING, alcohol, BMIO) %>%
drop_na()
cph_1 <- cph(Surv(time = TEVENT, event = EVENT) ~ SEX + AGE + SMOKING + alcohol + BMIO,
data = dat_comp)
cph_1
## Cox Proportional Hazards Model
##
## cph(formula = Surv(time = TEVENT, event = EVENT) ~ SEX + AGE +
## SMOKING + alcohol + BMIO, data = dat_comp)
##
## Model Tests Discrimination
## Indexes
## Obs 3841 LR chi2 129.40 R2 0.040
## Events 453 d.f. 7 R2(7,3841)0.031
## Center 2.456 Pr(> chi2) 0.0000 R2(7,453)0.237
## Score chi2 121.74 Dxy 0.270
## Pr(> chi2) 0.0000
##
## Coef S.E. Wald Z Pr(>|Z|)
## SEX=2 -0.3958 0.1263 -3.13 0.0017
## AGE 0.0495 0.0052 9.48 <0.0001
## SMOKING=2 0.3067 0.1406 2.18 0.0292
## SMOKING=3 0.3527 0.2316 1.52 0.1278
## alcohol=2 -0.0861 0.1658 -0.52 0.6035
## alcohol=3 -0.3138 0.1181 -2.66 0.0079
## BMIO -0.0157 0.0136 -1.16 0.2465
2-5. Cox比例ハザード性が成り立っているか検討しましょう。
# 成り立っているとはいえない。とくにAGE, SMOKING
zph_1 <- cox.zph(cph_1)
zph_1
## chisq df p
## SEX 2.8910 1 0.0891
## AGE 6.6669 1 0.0098
## SMOKING 12.5043 2 0.0019
## alcohol 0.9210 2 0.6310
## BMIO 0.0904 1 0.7637
## GLOBAL 22.7550 7 0.0019
zph_plot <- ggcoxzph(zph_1)
zph_plot
3-1-1. 多重代入を行ってみましょう。mice package
dat_mice <- mice(dat,
m = 5,
maxit = 5,
print = FALSE)
3-1-2. HarrellによるaregImpute
dat_areg <- aregImpute( ~ I(TEVENT) + EVENT + SEX + I(AGE) + SYSTBP + DIASTBP + SYSTH + DIASTH +
DIABETES + CEREBRAL + CARDIAC + AAA + PERIPH + STENOSIS + I(LENGTHO) +
I(WEIGHTO) + I(BMIO) + I(CHOLO) + I(HDLO) + I(LDLO) + I(TRIGO) +
I(HOMOCO) + I(GLUTO) + I(CREATO) + I(IMTO) + albumin + SMOKING + I(packyrs) + alcohol,
n.impute = 5,
data = dat)
## Iteration 1
Iteration 2
Iteration 3
Iteration 4
Iteration 5
Iteration 6
Iteration 7
Iteration 8
3-2. 多重代入後の確認(plotを用いると便利)
plot_mice <- plot(dat_mice)
plot_mice
densityplot_mice <- densityplot(dat_mice)
densityplot_mice
3-3. 多重代入後の一つのデータを抜き出してCox比例ハザードモデルを実施しましょう。
dat_mice_1 <- complete(dat_mice, 1)
# cph実行は省略。
3-4. 多重代入後のデータでCox比例ハザードモデルを実施しましょう
formula_1 <- Surv(time = TEVENT, event = EVENT) ~
SEX + AGE + SYSTBP + DIASTBP + SYSTH + DIASTH + DIABETES + CEREBRAL + CARDIAC + AAA +
PERIPH + STENOSIS + LENGTHO + WEIGHTO + BMIO + CHOLO + HDLO + LDLO + TRIGO + HOMOCO +
GLUTO + CREATO + IMTO + albumin + SMOKING + packyrs + alcohol
# fit.mult.imputeにmiceを行ったデータを適用
cph_imp_1 <- fit.mult.impute(formula_1, cph, xtrans = dat_mice, data = dat)
##
## Variance Inflation Factors Due to Imputation:
##
## SEX=2 AGE SYSTBP DIASTBP SYSTH DIASTH DIABETES=1
## 1.02 1.04 1.20 1.46 1.61 1.71 1.00
## CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1 LENGTHO WEIGHTO
## 1.01 1.02 1.05 1.03 1.02 1.01 1.01
## BMIO CHOLO HDLO LDLO TRIGO HOMOCO GLUTO
## 1.01 1.01 1.00 1.01 1.01 1.44 1.01
## CREATO IMTO albumin=2 albumin=3 SMOKING=2 SMOKING=3 packyrs
## 1.02 1.03 1.08 1.12 1.07 1.02 1.01
## alcohol=2 alcohol=3
## 1.03 1.02
##
## Rate of Missing Information:
##
## SEX=2 AGE SYSTBP DIASTBP SYSTH DIASTH DIABETES=1
## 0.02 0.04 0.17 0.32 0.38 0.41 0.00
## CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1 LENGTHO WEIGHTO
## 0.01 0.02 0.04 0.03 0.02 0.01 0.01
## BMIO CHOLO HDLO LDLO TRIGO HOMOCO GLUTO
## 0.01 0.01 0.00 0.01 0.01 0.31 0.01
## CREATO IMTO albumin=2 albumin=3 SMOKING=2 SMOKING=3 packyrs
## 0.02 0.03 0.07 0.11 0.07 0.02 0.01
## alcohol=2 alcohol=3
## 0.03 0.02
##
## d.f. for t-distribution for Tests of Single Coefficients:
##
## SEX=2 AGE SYSTBP DIASTBP SYSTH DIASTH DIABETES=1
## 10277.46 2603.66 142.69 39.90 27.59 23.31 3880576.06
## CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1 LENGTHO WEIGHTO
## 22230.65 9446.46 2009.77 3967.98 15826.65 20801.17 28346.18
## BMIO CHOLO HDLO LDLO TRIGO HOMOCO GLUTO
## 31390.58 107566.49 386918.71 88716.39 100891.42 42.29 99491.40
## CREATO IMTO albumin=2 albumin=3 SMOKING=2 SMOKING=3 packyrs
## 7006.99 3539.72 777.63 346.14 868.62 10910.79 46944.98
## alcohol=2 alcohol=3
## 4691.75 10308.21
##
## The following fit components were averaged over the 5 model fits:
##
## linear.predictors means stats center
cph_imp_1
## Cox Proportional Hazards Model
##
## fit.mult.impute(formula = formula_1, fitter = cph, xtrans = dat_mice,
## data = dat)
##
## Model Tests Discrimination
## Indexes
## Obs 3873 LR chi2 278.63 R2 0.083
## Events 460 d.f. 30 R2(30,3873)0.062
## Center 5.8105 Pr(> chi2) 0.0000 R2(30,460)0.418
## Score chi2 350.19 Dxy 0.389
## Pr(> chi2) 0.0000
##
## Coef S.E. Wald Z Pr(>|Z|)
## SEX=2 -0.1785 0.1648 -1.08 0.2788
## AGE 0.0317 0.0063 5.01 <0.0001
## SYSTBP -0.0012 0.0046 -0.26 0.7961
## DIASTBP -0.0013 0.0092 -0.14 0.8911
## SYSTH 0.0002 0.0050 0.04 0.9644
## DIASTH 0.0093 0.0091 1.02 0.3085
## DIABETES=1 0.0313 0.1561 0.20 0.8412
## CEREBRAL=1 0.2867 0.1280 2.24 0.0251
## CARDIAC=1 0.4008 0.1124 3.56 0.0004
## AAA=1 0.6947 0.1274 5.45 <0.0001
## PERIPH=1 0.2876 0.1182 2.43 0.0150
## STENOSIS=1 0.2165 0.1219 1.78 0.0758
## LENGTHO 1.8160 3.9335 0.46 0.6443
## WEIGHTO -0.0312 0.0426 -0.73 0.4642
## BMIO 0.0600 0.1269 0.47 0.6362
## CHOLO 0.0459 0.5345 0.09 0.9315
## HDLO -0.4571 0.5347 -0.85 0.3926
## LDLO 0.0089 0.5377 0.02 0.9868
## TRIGO -0.0207 0.2222 -0.09 0.9256
## HOMOCO 0.0012 0.0050 0.24 0.8119
## GLUTO 0.0474 0.0291 1.63 0.1031
## CREATO 0.0021 0.0004 4.80 <0.0001
## IMTO 0.4572 0.1399 3.27 0.0011
## albumin=2 0.3503 0.1211 2.89 0.0038
## albumin=3 0.5787 0.2281 2.54 0.0112
## SMOKING=2 -0.0134 0.1670 -0.08 0.9361
## SMOKING=3 0.0324 0.2470 0.13 0.8955
## packyrs 0.0055 0.0026 2.13 0.0329
## alcohol=2 -0.2094 0.1715 -1.22 0.2220
## alcohol=3 -0.1422 0.1224 -1.16 0.2453
# fit.mult.imputeにaregImputeを行ったデータを適用
cph_imp_2 <- fit.mult.impute(formula_1, cph, xtrans = dat_areg, data = dat)
##
## Variance Inflation Factors Due to Imputation:
##
## SEX=2 AGE SYSTBP DIASTBP SYSTH DIASTH DIABETES=1
## 1.02 1.09 2.36 1.80 3.67 3.55 1.02
## CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1 LENGTHO WEIGHTO
## 1.01 1.04 1.06 1.00 1.05 1.03 1.02
## BMIO CHOLO HDLO LDLO TRIGO HOMOCO GLUTO
## 1.02 1.08 1.08 1.07 1.08 1.28 1.02
## CREATO IMTO albumin=2 albumin=3 SMOKING=2 SMOKING=3 packyrs
## 1.09 1.04 1.18 1.07 1.04 1.05 1.00
## alcohol=2 alcohol=3
## 1.03 1.00
##
## Rate of Missing Information:
##
## SEX=2 AGE SYSTBP DIASTBP SYSTH DIASTH DIABETES=1
## 0.02 0.08 0.58 0.44 0.73 0.72 0.02
## CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1 LENGTHO WEIGHTO
## 0.01 0.04 0.06 0.00 0.04 0.03 0.02
## BMIO CHOLO HDLO LDLO TRIGO HOMOCO GLUTO
## 0.02 0.07 0.08 0.07 0.07 0.22 0.02
## CREATO IMTO albumin=2 albumin=3 SMOKING=2 SMOKING=3 packyrs
## 0.08 0.04 0.16 0.06 0.04 0.05 0.00
## alcohol=2 alcohol=3
## 0.03 0.00
##
## d.f. for t-distribution for Tests of Single Coefficients:
##
## SEX=2 AGE SYSTBP DIASTBP SYSTH DIASTH DIABETES=1
## 14168.24 580.70 12.04 20.23 7.56 7.76 13690.44
## CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1 LENGTHO WEIGHTO
## 23747.04 3130.69 1106.61 359001.64 2090.14 5525.61 8239.13
## BMIO CHOLO HDLO LDLO TRIGO HOMOCO GLUTO
## 7411.75 792.24 682.72 829.15 781.82 84.71 8601.38
## CREATO IMTO albumin=2 albumin=3 SMOKING=2 SMOKING=3 packyrs
## 562.06 2919.05 164.33 1045.77 2892.08 1760.09 193434.20
## alcohol=2 alcohol=3
## 4306.30 466281.82
##
## The following fit components were averaged over the 5 model fits:
##
## linear.predictors means stats center
cph_imp_2
## Cox Proportional Hazards Model
##
## fit.mult.impute(formula = formula_1, fitter = cph, xtrans = dat_areg,
## data = dat)
##
## Model Tests Discrimination
## Indexes
## Obs 3873 LR chi2 284.82 R2 0.085
## Events 460 d.f. 30 R2(30,3873)0.064
## Center 5.6534 Pr(> chi2) 0.0000 R2(30,460)0.425
## Score chi2 356.89 Dxy 0.392
## Pr(> chi2) 0.0000
##
## Coef S.E. Wald Z Pr(>|Z|)
## SEX=2 -0.1796 0.1644 -1.09 0.2746
## AGE 0.0325 0.0065 5.02 <0.0001
## SYSTBP 0.0006 0.0064 0.09 0.9255
## DIASTBP -0.0030 0.0099 -0.31 0.7599
## SYSTH -0.0028 0.0076 -0.37 0.7130
## DIASTH 0.0135 0.0133 1.02 0.3079
## DIABETES=1 0.0467 0.1572 0.30 0.7665
## CEREBRAL=1 0.2816 0.1276 2.21 0.0274
## CARDIAC=1 0.4023 0.1131 3.56 0.0004
## AAA=1 0.6677 0.1288 5.18 <0.0001
## PERIPH=1 0.2933 0.1165 2.52 0.0118
## STENOSIS=1 0.2196 0.1228 1.79 0.0738
## LENGTHO 1.6942 3.9588 0.43 0.6687
## WEIGHTO -0.0298 0.0428 -0.70 0.4858
## BMIO 0.0537 0.1277 0.42 0.6741
## CHOLO 0.3411 0.7410 0.46 0.6453
## HDLO -0.7662 0.7511 -1.02 0.3077
## LDLO -0.2800 0.7436 -0.38 0.7065
## TRIGO -0.1447 0.3241 -0.45 0.6552
## HOMOCO 0.0036 0.0054 0.67 0.5026
## GLUTO 0.0469 0.0292 1.61 0.1077
## CREATO 0.0021 0.0005 4.42 <0.0001
## IMTO 0.4525 0.1373 3.29 0.0010
## albumin=2 0.3415 0.1270 2.69 0.0072
## albumin=3 0.5888 0.2194 2.68 0.0073
## SMOKING=2 -0.0072 0.1640 -0.04 0.9650
## SMOKING=3 0.0502 0.2490 0.20 0.8401
## packyrs 0.0055 0.0025 2.16 0.0305
## alcohol=2 -0.2058 0.1718 -1.20 0.2309
## alcohol=3 -0.1452 0.1214 -1.20 0.2317
# with→poolはcoxphだと行えるが、cphだと行えない様子。
cph_imp_with <- with(data = dat_mice,
coxph(Surv(time = TEVENT, event = EVENT) ~
SEX + AGE + SYSTBP + DIASTBP + SYSTH + DIASTH + DIABETES + CEREBRAL + CARDIAC + AAA +
PERIPH + STENOSIS + LENGTHO + WEIGHTO + BMIO + CHOLO + HDLO + LDLO + TRIGO + HOMOCO +
GLUTO + CREATO + IMTO + albumin + SMOKING + packyrs + alcohol))
pool_cph_imp_with <- pool(cph_imp_with)
result_cph_imp_with <- summary(pool_cph_imp_with)
result_cph_imp_with
## term estimate std.error statistic df p.value
## 1 SEX2 -0.1785130772 0.164829491 -1.08301662 403.10257 2.794484e-01
## 2 AGE 0.0316758285 0.006322550 5.00997704 355.12613 8.599656e-07
## 3 SYSTBP -0.0011946948 0.004623463 -0.25839827 101.88017 7.966208e-01
## 4 DIASTBP -0.0012535000 0.009153183 -0.13694690 35.10532 8.918552e-01
## 5 SYSTH 0.0002229135 0.005004650 0.04454128 24.98967 9.648270e-01
## 6 DIASTH 0.0093162172 0.009148239 1.01836177 21.32242 3.199226e-01
## 7 DIABETES1 0.0312857622 0.156136466 0.20037447 427.53229 8.412830e-01
## 8 CEREBRAL1 0.2867035668 0.127973133 2.24034186 414.39997 2.559801e-02
## 9 CARDIAC1 0.4007781463 0.112425964 3.56481840 401.38906 4.079601e-04
## 10 AAA1 0.6947424005 0.127420446 5.45236203 339.77355 9.578706e-08
## 11 PERIPH1 0.2875847730 0.118210733 2.43281440 375.24163 1.544850e-02
## 12 STENOSIS1 0.2164980328 0.121915550 1.77580327 410.29150 7.650646e-02
## 13 LENGTHO 1.8156045395 3.933507685 0.46157391 413.67691 6.446295e-01
## 14 WEIGHTO -0.0311736050 0.042592542 -0.73190290 416.70540 4.646392e-01
## 15 BMIO 0.0600028190 0.126873466 0.47293434 417.54718 6.365071e-01
## 16 CHOLO 0.0459389469 0.534514274 0.08594522 423.72721 9.315506e-01
## 17 HDLO -0.4571504928 0.534732795 -0.85491389 426.16717 3.930792e-01
## 18 LDLO 0.0088635918 0.537659365 0.01648552 423.11146 9.868548e-01
## 19 TRIGO -0.0207406658 0.222161637 -0.09335845 423.53260 9.256629e-01
## 20 HOMOCO 0.0012000101 0.005038448 0.23817060 36.99182 8.130632e-01
## 21 GLUTO 0.0474401625 0.029106882 1.62986069 423.48883 1.038745e-01
## 22 CREATO 0.0021176036 0.000441708 4.79412517 394.18263 2.319523e-06
## 23 IMTO 0.4572293474 0.139922469 3.26773357 370.34959 1.185458e-03
## 24 albumin2 0.3502919305 0.121077058 2.89313215 262.94904 4.133674e-03
## 25 albumin3 0.5787222251 0.228061553 2.53757030 181.55195 1.200353e-02
## 26 SMOKING2 -0.0133582869 0.166961810 -0.08000804 273.42636 9.362894e-01
## 27 SMOKING3 0.0324419660 0.247026790 0.13132975 404.26629 8.955798e-01
## 28 packyrs 0.0054693510 0.002563120 2.13386487 420.26706 3.343219e-02
## 29 alcohol2 -0.2094561025 0.171520719 -1.22117085 381.70995 2.227751e-01
## 30 alcohol3 -0.1421799317 0.122368223 -1.16190240 403.18414 2.459625e-01
3-5. 多重代入後のデータを用いて非線形性の確認をしましょう。
formula_2 <- Surv(time = TEVENT, event = EVENT) ~
SEX + rcs(AGE, 4) + SYSTBP + DIASTBP + SYSTH + DIASTH + DIABETES + CEREBRAL + CARDIAC + AAA +
PERIPH + STENOSIS + LENGTHO + WEIGHTO + BMIO + CHOLO + HDLO + LDLO + TRIGO + HOMOCO +
GLUTO + CREATO + IMTO + albumin + SMOKING + packyrs + alcohol
cph_imp_3 <- fit.mult.impute(formula_2, cph, xtrans = dat_mice, data = dat)
##
## Variance Inflation Factors Due to Imputation:
##
## SEX=2 AGE AGE' AGE'' SYSTBP DIASTBP SYSTH
## 1.02 1.00 1.00 1.00 1.24 1.50 1.69
## DIASTH DIABETES=1 CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1
## 1.82 1.00 1.01 1.03 1.04 1.03 1.02
## LENGTHO WEIGHTO BMIO CHOLO HDLO LDLO TRIGO
## 1.01 1.01 1.01 1.01 1.00 1.01 1.01
## HOMOCO GLUTO CREATO IMTO albumin=2 albumin=3 SMOKING=2
## 1.32 1.01 1.02 1.03 1.07 1.11 1.07
## SMOKING=3 packyrs alcohol=2 alcohol=3
## 1.02 1.02 1.03 1.02
##
## Rate of Missing Information:
##
## SEX=2 AGE AGE' AGE'' SYSTBP DIASTBP SYSTH
## 0.02 0.00 0.00 0.00 0.19 0.33 0.41
## DIASTH DIABETES=1 CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1
## 0.45 0.00 0.01 0.02 0.04 0.03 0.02
## LENGTHO WEIGHTO BMIO CHOLO HDLO LDLO TRIGO
## 0.01 0.01 0.01 0.01 0.00 0.01 0.01
## HOMOCO GLUTO CREATO IMTO albumin=2 albumin=3 SMOKING=2
## 0.24 0.01 0.02 0.03 0.07 0.10 0.07
## SMOKING=3 packyrs alcohol=2 alcohol=3
## 0.02 0.02 0.03 0.02
##
## d.f. for t-distribution for Tests of Single Coefficients:
##
## SEX=2 AGE AGE' AGE'' SYSTBP DIASTBP SYSTH
## 11117.08 2610001.35 2543050.56 2295837.39 108.31 36.05 24.17
## DIASTH DIABETES=1 CEREBRAL=1 CARDIAC=1 AAA=1 PERIPH=1 STENOSIS=1
## 19.81 2202027.90 30396.31 6692.65 2562.27 4308.69 11549.06
## LENGTHO WEIGHTO BMIO CHOLO HDLO LDLO TRIGO
## 21562.25 31804.41 34035.88 60961.14 281244.17 52178.73 58897.78
## HOMOCO GLUTO CREATO IMTO albumin=2 albumin=3 SMOKING=2
## 67.75 70810.64 13293.80 3511.36 905.31 417.34 859.31
## SMOKING=3 packyrs alcohol=2 alcohol=3
## 8656.44 14433.20 5329.33 13217.46
##
## The following fit components were averaged over the 5 model fits:
##
## linear.predictors means stats center
distribution <- datadist(dat)
options(datadist = "distribution")
# 55~60歳くらいからlogRRが増加していく形であることが分かる。
plot(Predict(cph_imp_3, AGE))
3-6. 多重代入後のデータを用いて判別能を評価しましょう。
# Somers' D(Dxy)から、C-indexを求める。
Cindex <- round(cph_imp_3$stat[13]*0.5 + 0.5, 3)
cat("apparent C-index is ", Cindex, "\n")
## apparent C-index is 0.7
3-7. 多重代入後のデータを用いて較正能を評価しましょう。
# 作成データセット数
m <- 5
# 4年後のリスクを予測/比較
u <- 365.25*4
# mice::completeデータをスタックする
dat_stack <- complete(dat_mice, 1)
for (i in 2:m){
dat_stack <- rbind(dat_stack, complete(dat_mice, i))
}
# 作成したデータセット分重みづけする
dat_stack$weight <- 1/m
# miceデータ(をスタックしたデータ)でcph
# クラスターを考慮した分散ではないので、信頼区間は狭く推定される
# ここでは点推定値だけ検討している
cph_stack <- cph(formula = formula_2,
data = dat_stack,
weights = weight,
x = TRUE,
y = TRUE)
# 予測
estim_surv <- survest(cph_stack,
newdata = dat_stack,
times = u)$surv
ext_val <- val.surv(cph_stack,
newdata = dat_stack,
u = u,
est.surv = estim_surv,
S = Surv(dat_stack$TEVENT, dat_stack$EVENT))
# プロット
plot(ext_val,
xlab = "Predicted probability at 4 years",
ylab = "Observed probability at 4 years",
las = 1)