# ライブラリーの読み込み
pacman::p_load(foreign, 
               tidyverse, 
               Hmisc, 
               rms,
               mice, 
               naniar, 
               gtsummary, 
               corrr, 
               survival, 
               survminer)


1-1. foreign packageのread.spssコマンドを使いデータを読み込みます。

# データの読み込み
dat <- read.spss("SMARTs.sav", 
                 use.value.labels = FALSE, 
                 to.data.frame = TRUE) 


1-2. SMARTに含まれる変数を確認しましょう。

# データの確認→全てdouble型
glimpse(dat)
## Rows: 3,873
## Columns: 29
## $ TEVENT   <dbl> 3466, 3465, 3465, 2445, 3463, 1642, 991, 1970, 2623, 3131, 34…
## $ EVENT    <dbl> 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ SEX      <dbl> 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 2, 1, 1, 1, 2, 2, 1, 1, 1, 1…
## $ AGE      <dbl> 71, 46, 59, 76, 57, 52, 66, 72, 75, 53, 40, 75, 63, 56, 66, 5…
## $ DIABETES <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1…
## $ CEREBRAL <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0…
## $ CARDIAC  <dbl> 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ AAA      <dbl> 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ PERIPH   <dbl> 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1…
## $ STENOSIS <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0…
## $ SYSTBP   <dbl> 185, 135, 149, 140, 177, 164, 166, 131, 175, 174, 144, 156, 1…
## $ DIASTBP  <dbl> 86, 82, 97, 74, 96, 73, 70, 76, 68, 89, 84, 79, 86, 79, 83, 7…
## $ SYSTH    <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ DIASTH   <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ LENGTHO  <dbl> 1.70, 1.72, 1.77, 1.82, 1.90, 1.73, 1.65, 1.60, 1.70, 1.83, 1…
## $ WEIGHTO  <dbl> 69, 71, 82, 98, 107, 103, 66, 80, 80, 75, 62, 70, 95, 70, 74,…
## $ BMIO     <dbl> 23.88, 24.00, 26.17, 29.59, 29.64, 34.41, 24.24, 31.25, 27.68…
## $ CHOLO    <dbl> 6.0, 4.0, 4.9, 5.3, 5.2, 4.6, 5.8, 6.1, 4.4, 6.0, 4.6, 6.2, 5…
## $ HDLO     <dbl> 0.94, 1.26, 1.28, 1.00, 0.81, 0.95, 0.87, 1.32, 0.55, 0.95, 1…
## $ LDLO     <dbl> 4.03, 2.23, 3.17, 3.40, NA, 2.38, 3.33, 4.39, 2.26, 3.82, 2.5…
## $ TRIGO    <dbl> 2.29, 1.13, 1.01, 2.01, 4.76, 2.82, 3.55, 0.87, 3.54, 2.74, 1…
## $ HOMOCO   <dbl> NA, 8.7, NA, NA, NA, NA, NA, NA, NA, NA, 13.3, NA, NA, NA, NA…
## $ GLUTO    <dbl> 6.3, 4.7, 6.1, 5.0, 6.1, 10.2, 5.4, 5.3, 5.9, 10.8, 5.0, 4.8,…
## $ CREATO   <dbl> 95, 66, 93, 79, 91, 180, 81, 101, 157, 86, 56, 96, 89, 84, 84…
## $ IMTO     <dbl> 0.82, 0.57, 0.83, 1.45, 1.07, 0.97, 1.15, 0.70, 1.48, 0.87, 0…
## $ albumin  <dbl> 1, 1, 1, NA, 1, 3, 1, 2, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, …
## $ SMOKING  <dbl> 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 1…
## $ packyrs  <dbl> 22.5, 0.0, 33.3, 49.0, 53.2, 54.6, 72.8, 28.6, 16.5, 29.7, 21…
## $ alcohol  <dbl> 3, 1, 3, 3, 3, 1, 3, 2, 1, 3, 2, 1, 3, 3, 3, 3, 3, 2, 3, 1, 3…
# factor型にすべきものをする
# TEVENTは日数なので整数にする(必要あるか?)
dat <- dat %>% 
  mutate(across(.cols = c(SEX, DIABETES, CEREBRAL, CARDIAC, 
                          AAA, PERIPH, STENOSIS, albumin, SMOKING, alcohol), 
                .fns = factor)) %>% 
  mutate(across(.cols = TEVENT, 
                .fns = round))
# 変数のまとめ
# Table 23.3と比べると、femaleが一人だけ多いが・・・
tbl_1 <- dat %>% 
  dplyr::select(-TEVENT, -EVENT) %>% 
  dplyr::select(SEX, AGE, SMOKING, packyrs, alcohol, BMIO, DIABETES, 
                SYSTH, SYSTBP, DIASTH, DIASTBP, CHOLO, HDLO, LDLO, TRIGO, 
                CEREBRAL, CARDIAC, PERIPH, AAA, HOMOCO, GLUTO, CREATO, 
                albumin, IMTO, STENOSIS) %>% 
  tbl_summary(missing_text = "Missing") %>% 
  add_n() %>% 
  modify_header(label ~ "**Variable**") %>%
  modify_caption("**Table1. Baseline Characteristics**") %>%
  bold_labels()

tbl_1
Table1. Baseline Characteristics
Variable N N = 3,8731
SEX 3,873
    1 2,897 (75%)
    2 976 (25%)
AGE 3,873 60 (52, 68)
SMOKING 3,848
    1 693 (18%)
    2 2,711 (70%)
    3 444 (12%)
    Missing 25
packyrs 3,852 20 (6, 34)
    Missing 21
alcohol 3,848
    1 751 (20%)
    2 408 (11%)
    3 2,689 (70%)
    Missing 25
BMIO 3,870 26.3 (24.1, 28.7)
    Missing 3
DIABETES 3,833
    0 2,987 (78%)
    1 846 (22%)
    Missing 40
SYSTH 2,375 140 (126, 156)
    Missing 1,498
SYSTBP 2,650 139 (127, 154)
    Missing 1,223
DIASTH 2,374 82 (75, 90)
    Missing 1,499
DIASTBP 2,652 79 (73, 86)
    Missing 1,221
CHOLO 3,855 5.10 (4.40, 5.90)
    Missing 18
HDLO 3,843 1.17 (0.96, 1.42)
    Missing 30
LDLO 3,657 3.06 (2.39, 3.83)
    Missing 216
TRIGO 3,845 1.54 (1.12, 2.23)
    Missing 28
CEREBRAL 3,873
    0 2,726 (70%)
    1 1,147 (30%)
CARDIAC 3,873
    0 1,713 (44%)
    1 2,160 (56%)
PERIPH 3,873
    0 2,933 (76%)
    1 940 (24%)
AAA 3,873
    0 3,457 (89%)
    1 416 (11%)
HOMOCO 3,410 12.8 (10.3, 15.7)
    Missing 463
GLUTO 3,854 5.70 (5.30, 6.50)
    Missing 19
CREATO 3,856 89 (78, 101)
    Missing 17
albumin 3,666
    1 2,897 (79%)
    2 655 (18%)
    3 114 (3.1%)
    Missing 207
IMTO 3,775 0.88 (0.75, 1.07)
    Missing 98
STENOSIS 3,780
    0 3,058 (81%)
    1 722 (19%)
    Missing 93
1 n (%); Median (IQR)


1-3. 連続変数間(特に血圧)の相関を確認しましょう。

# ここではNAはomitする
dat_cor <- dat %>% 
  dplyr::select(where(is.double)) %>% 
  dplyr::select(-TEVENT) %>% 
  drop_na() %>% 
  correlate() %>% 
  shave()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
dat_cor
## # A tibble: 18 × 19
##    term        EVENT     AGE   SYSTBP DIASTBP    SYSTH   DIASTH  LENGTHO WEIGHTO
##    <chr>       <dbl>   <dbl>    <dbl>   <dbl>    <dbl>    <dbl>    <dbl>   <dbl>
##  1 EVENT   NA        NA      NA       NA      NA       NA       NA       NA     
##  2 AGE      0.175    NA      NA       NA      NA       NA       NA       NA     
##  3 SYSTBP   0.0802    0.322  NA       NA      NA       NA       NA       NA     
##  4 DIASTBP  0.000478  0.0306  0.665   NA      NA       NA       NA       NA     
##  5 SYSTH    0.108     0.288   0.693    0.454  NA       NA       NA       NA     
##  6 DIASTH   0.0252   -0.0624  0.408    0.598   0.682   NA       NA       NA     
##  7 LENGTHO -0.0102   -0.163  -0.118    0.0532 -0.133   -4.26e-4 NA       NA     
##  8 WEIGHTO -0.0259   -0.149  -0.0289   0.125  -0.0622   8.53e-2  0.539   NA     
##  9 BMIO    -0.0175   -0.0584  0.0473   0.108   0.0183   9.97e-2 -0.0807   0.791 
## 10 CHOLO   -0.0305    0.0253  0.0539   0.0591  0.0409   6.62e-2 -0.117   -0.104 
## 11 HDLO    -0.0997    0.0891  0.121    0.0615  0.0966   6.86e-2 -0.163   -0.286 
## 12 LDLO    -0.00669   0.0171  0.00509  0.0226 -0.00176  3.50e-2 -0.0715  -0.0600
## 13 TRIGO    0.0358   -0.0738  0.0214   0.0530  0.0283   2.86e-2  0.0274   0.176 
## 14 HOMOCO   0.147     0.225   0.152    0.114   0.140    9.03e-2 -0.0169  -0.0442
## 15 GLUTO    0.0850    0.0987  0.164    0.0632  0.117   -1.45e-2 -0.0353   0.146 
## 16 CREATO   0.194     0.0152  0.0514   0.0457  0.0666   5.81e-2  0.0168  -0.0738
## 17 IMTO     0.171     0.357   0.247    0.0422  0.294    3.61e-2 -0.00483  0.0565
## 18 packyrs  0.0579    0.0603  0.0194   0.0131  0.0251  -7.91e-3  0.0992   0.142 
## # ℹ 10 more variables: BMIO <dbl>, CHOLO <dbl>, HDLO <dbl>, LDLO <dbl>,
## #   TRIGO <dbl>, HOMOCO <dbl>, GLUTO <dbl>, CREATO <dbl>, IMTO <dbl>,
## #   packyrs <dbl>
# やはり血圧同士の相関はつよめ。
plot_cor <- rplot(dat_cor)

plot_cor


1-4. 欠測パターンを確認しましょう。

# Hmiscパッケージ
na.patterns <- naclus(dat) 
naplot1 <- plot(na.patterns, ylab = "Fraction of NAs in common") 

naplot1
## NULL
naplot2 <- naplot(na.patterns, which = c("na per var")) 

naplot2
## 
##    0    1    2    3    4    5    6    7    8    9   10   11 
##  924  197 1975  637   91   16   14    6    6    2    2    3
naplot3 <- naplot(na.patterns, which = c("na per obs"))

naplot3
## 
##    0    1    2    3    4    5    6    7    8    9   10   11 
##  924  197 1975  637   91   16   14    6    6    2    2    3
# naniarパッケージ
naplot4 <- gg_miss_var(dat, show_pct = TRUE)

naplot4

naplot5 <- vis_miss(dat)

naplot5


2-1. TEVENT(観察時間)の情報を用いて、ヒストグラム、総追跡人年、追跡期間の平均と中央値と求めましょう。

# TEVENTはの単位はday
# AGEのヒストグラムを作成
hist_TEVENT <- ggplot(data = dat) + 
  geom_histogram(aes(x = TEVENT)) + 
  theme_classic()

hist_TEVENT
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 総追跡人年
total_person_year <- round(sum(dat$TEVENT)/365.25, 2)

cat("total person-year is ", total_person_year, "\n")
## total person-year is  14530.62
# 追跡期間の平均と中央値
summary(dat$TEVENT)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0     555    1213    1370    2165    3466


2-2. EVENTとTEVENTの情報から、カプランマイヤーカーブを描きましょう。
追跡期間を無視した累積罹患率とカプランマイヤー法による累積罹患率を比べてみましょう。

surv_fit_1 <- survfit(Surv(time = TEVENT, event = EVENT) ~ 1, 
                      data = dat)

# 生命表(6m, 1y, 3y, 5y, 7yを抜粋)
summary(surv_fit_1, times = c(180, 360, 360*3, 360*5, 360*7))
## Call: survfit(formula = Surv(time = TEVENT, event = EVENT) ~ 1, data = dat)
## 
##  time n.risk n.event survival std.err lower 95% CI upper 95% CI
##   180   3531      86    0.977 0.00244        0.972        0.982
##   360   3187      58    0.961 0.00323        0.954        0.967
##  1080   2122     129    0.915 0.00502        0.905        0.925
##  1800   1314      99    0.862 0.00703        0.848        0.876
##  2520    594      61    0.808 0.00952        0.789        0.827
# KMカーブ
KMcurve_1 <- ggsurvplot(surv_fit_1, data = dat) 

KMcurve_1


2-3. 次に性別毎のカプランマイヤーカーブを描きましょう。

surv_fit_2 <- survfit(Surv(time = TEVENT, event = EVENT) ~ SEX, 
                      data = dat)

KMcurve_2 <- ggsurvplot(surv_fit_2, data = dat, pval = TRUE) 

KMcurve_2


2-4. いくつかの変数を使ってCoxモデルを当てはめてみましょう。

# 次章で欠測値の処理について検討するので、ここではcomplete case analysisとした。
dat_comp <- dat %>% 
  dplyr::select(TEVENT, EVENT, 
                SEX, AGE, SMOKING, alcohol, BMIO) %>% 
  drop_na()

cph_1 <- cph(Surv(time = TEVENT, event = EVENT) ~ SEX + AGE + SMOKING + alcohol + BMIO, 
             data = dat_comp)

cph_1
## Cox Proportional Hazards Model
## 
## cph(formula = Surv(time = TEVENT, event = EVENT) ~ SEX + AGE + 
##     SMOKING + alcohol + BMIO, data = dat_comp)
## 
##                       Model Tests     Discrimination    
##                                              Indexes    
## Obs     3841    LR chi2    129.40     R2       0.040    
## Events   453    d.f.            7    R2(7,3841)0.031    
## Center 2.456    Pr(> chi2) 0.0000     R2(7,453)0.237    
##                 Score chi2 121.74     Dxy      0.270    
##                 Pr(> chi2) 0.0000                       
## 
##           Coef    S.E.   Wald Z Pr(>|Z|)
## SEX=2     -0.3958 0.1263 -3.13  0.0017  
## AGE        0.0495 0.0052  9.48  <0.0001 
## SMOKING=2  0.3067 0.1406  2.18  0.0292  
## SMOKING=3  0.3527 0.2316  1.52  0.1278  
## alcohol=2 -0.0861 0.1658 -0.52  0.6035  
## alcohol=3 -0.3138 0.1181 -2.66  0.0079  
## BMIO      -0.0157 0.0136 -1.16  0.2465


2-5. Cox比例ハザード性が成り立っているか検討しましょう。

# 成り立っているとはいえない。とくにAGE, SMOKING
zph_1 <- cox.zph(cph_1)

zph_1
##           chisq df      p
## SEX      2.8910  1 0.0891
## AGE      6.6669  1 0.0098
## SMOKING 12.5043  2 0.0019
## alcohol  0.9210  2 0.6310
## BMIO     0.0904  1 0.7637
## GLOBAL  22.7550  7 0.0019
zph_plot <- ggcoxzph(zph_1)

zph_plot


3-1-1. 多重代入を行ってみましょう。mice package

dat_mice <- mice(dat,
                 m = 5,
                 maxit = 5, 
                 print = FALSE)


3-1-2. HarrellによるaregImpute

dat_areg <- aregImpute( ~ I(TEVENT) + EVENT + SEX + I(AGE) + SYSTBP + DIASTBP + SYSTH + DIASTH + 
                          DIABETES + CEREBRAL + CARDIAC + AAA + PERIPH + STENOSIS + I(LENGTHO) + 
                          I(WEIGHTO) + I(BMIO) + I(CHOLO) + I(HDLO) + I(LDLO) + I(TRIGO) + 
                          I(HOMOCO) + I(GLUTO) + I(CREATO) + I(IMTO) + albumin + SMOKING + I(packyrs) + alcohol, 
                        n.impute = 5, 
                        data = dat) 
## Iteration 1 
Iteration 2 
Iteration 3 
Iteration 4 
Iteration 5 
Iteration 6 
Iteration 7 
Iteration 8 


3-2. 多重代入後の確認(plotを用いると便利)

plot_mice <- plot(dat_mice)

plot_mice

densityplot_mice <- densityplot(dat_mice)

densityplot_mice


3-3. 多重代入後の一つのデータを抜き出してCox比例ハザードモデルを実施しましょう。

dat_mice_1 <- complete(dat_mice, 1)
# cph実行は省略。


3-4. 多重代入後のデータでCox比例ハザードモデルを実施しましょう

formula_1 <- Surv(time = TEVENT, event = EVENT) ~ 
  SEX + AGE + SYSTBP + DIASTBP + SYSTH + DIASTH + DIABETES + CEREBRAL + CARDIAC + AAA + 
  PERIPH + STENOSIS + LENGTHO + WEIGHTO + BMIO + CHOLO + HDLO + LDLO + TRIGO + HOMOCO + 
  GLUTO + CREATO + IMTO + albumin + SMOKING + packyrs + alcohol

# fit.mult.imputeにmiceを行ったデータを適用
cph_imp_1 <- fit.mult.impute(formula_1, cph, xtrans = dat_mice, data = dat)
## 
## Variance Inflation Factors Due to Imputation:
## 
##      SEX=2        AGE     SYSTBP    DIASTBP      SYSTH     DIASTH DIABETES=1 
##       1.02       1.04       1.20       1.46       1.61       1.71       1.00 
## CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1    LENGTHO    WEIGHTO 
##       1.01       1.02       1.05       1.03       1.02       1.01       1.01 
##       BMIO      CHOLO       HDLO       LDLO      TRIGO     HOMOCO      GLUTO 
##       1.01       1.01       1.00       1.01       1.01       1.44       1.01 
##     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2  SMOKING=3    packyrs 
##       1.02       1.03       1.08       1.12       1.07       1.02       1.01 
##  alcohol=2  alcohol=3 
##       1.03       1.02 
## 
## Rate of Missing Information:
## 
##      SEX=2        AGE     SYSTBP    DIASTBP      SYSTH     DIASTH DIABETES=1 
##       0.02       0.04       0.17       0.32       0.38       0.41       0.00 
## CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1    LENGTHO    WEIGHTO 
##       0.01       0.02       0.04       0.03       0.02       0.01       0.01 
##       BMIO      CHOLO       HDLO       LDLO      TRIGO     HOMOCO      GLUTO 
##       0.01       0.01       0.00       0.01       0.01       0.31       0.01 
##     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2  SMOKING=3    packyrs 
##       0.02       0.03       0.07       0.11       0.07       0.02       0.01 
##  alcohol=2  alcohol=3 
##       0.03       0.02 
## 
## d.f. for t-distribution for Tests of Single Coefficients:
## 
##      SEX=2        AGE     SYSTBP    DIASTBP      SYSTH     DIASTH DIABETES=1 
##   10277.46    2603.66     142.69      39.90      27.59      23.31 3880576.06 
## CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1    LENGTHO    WEIGHTO 
##   22230.65    9446.46    2009.77    3967.98   15826.65   20801.17   28346.18 
##       BMIO      CHOLO       HDLO       LDLO      TRIGO     HOMOCO      GLUTO 
##   31390.58  107566.49  386918.71   88716.39  100891.42      42.29   99491.40 
##     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2  SMOKING=3    packyrs 
##    7006.99    3539.72     777.63     346.14     868.62   10910.79   46944.98 
##  alcohol=2  alcohol=3 
##    4691.75   10308.21 
## 
## The following fit components were averaged over the 5 model fits:
## 
##   linear.predictors means stats center
cph_imp_1
## Cox Proportional Hazards Model
## 
## fit.mult.impute(formula = formula_1, fitter = cph, xtrans = dat_mice, 
##     data = dat)
## 
##                        Model Tests      Discrimination    
##                                                Indexes    
## Obs      3873    LR chi2    278.63      R2       0.083    
## Events    460    d.f.           30    R2(30,3873)0.062    
## Center 5.8105    Pr(> chi2) 0.0000     R2(30,460)0.418    
##                  Score chi2 350.19      Dxy      0.389    
##                  Pr(> chi2) 0.0000                        
## 
##            Coef    S.E.   Wald Z Pr(>|Z|)
## SEX=2      -0.1785 0.1648 -1.08  0.2788  
## AGE         0.0317 0.0063  5.01  <0.0001 
## SYSTBP     -0.0012 0.0046 -0.26  0.7961  
## DIASTBP    -0.0013 0.0092 -0.14  0.8911  
## SYSTH       0.0002 0.0050  0.04  0.9644  
## DIASTH      0.0093 0.0091  1.02  0.3085  
## DIABETES=1  0.0313 0.1561  0.20  0.8412  
## CEREBRAL=1  0.2867 0.1280  2.24  0.0251  
## CARDIAC=1   0.4008 0.1124  3.56  0.0004  
## AAA=1       0.6947 0.1274  5.45  <0.0001 
## PERIPH=1    0.2876 0.1182  2.43  0.0150  
## STENOSIS=1  0.2165 0.1219  1.78  0.0758  
## LENGTHO     1.8160 3.9335  0.46  0.6443  
## WEIGHTO    -0.0312 0.0426 -0.73  0.4642  
## BMIO        0.0600 0.1269  0.47  0.6362  
## CHOLO       0.0459 0.5345  0.09  0.9315  
## HDLO       -0.4571 0.5347 -0.85  0.3926  
## LDLO        0.0089 0.5377  0.02  0.9868  
## TRIGO      -0.0207 0.2222 -0.09  0.9256  
## HOMOCO      0.0012 0.0050  0.24  0.8119  
## GLUTO       0.0474 0.0291  1.63  0.1031  
## CREATO      0.0021 0.0004  4.80  <0.0001 
## IMTO        0.4572 0.1399  3.27  0.0011  
## albumin=2   0.3503 0.1211  2.89  0.0038  
## albumin=3   0.5787 0.2281  2.54  0.0112  
## SMOKING=2  -0.0134 0.1670 -0.08  0.9361  
## SMOKING=3   0.0324 0.2470  0.13  0.8955  
## packyrs     0.0055 0.0026  2.13  0.0329  
## alcohol=2  -0.2094 0.1715 -1.22  0.2220  
## alcohol=3  -0.1422 0.1224 -1.16  0.2453
# fit.mult.imputeにaregImputeを行ったデータを適用
cph_imp_2 <- fit.mult.impute(formula_1, cph, xtrans = dat_areg, data = dat)
## 
## Variance Inflation Factors Due to Imputation:
## 
##      SEX=2        AGE     SYSTBP    DIASTBP      SYSTH     DIASTH DIABETES=1 
##       1.02       1.09       2.36       1.80       3.67       3.55       1.02 
## CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1    LENGTHO    WEIGHTO 
##       1.01       1.04       1.06       1.00       1.05       1.03       1.02 
##       BMIO      CHOLO       HDLO       LDLO      TRIGO     HOMOCO      GLUTO 
##       1.02       1.08       1.08       1.07       1.08       1.28       1.02 
##     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2  SMOKING=3    packyrs 
##       1.09       1.04       1.18       1.07       1.04       1.05       1.00 
##  alcohol=2  alcohol=3 
##       1.03       1.00 
## 
## Rate of Missing Information:
## 
##      SEX=2        AGE     SYSTBP    DIASTBP      SYSTH     DIASTH DIABETES=1 
##       0.02       0.08       0.58       0.44       0.73       0.72       0.02 
## CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1    LENGTHO    WEIGHTO 
##       0.01       0.04       0.06       0.00       0.04       0.03       0.02 
##       BMIO      CHOLO       HDLO       LDLO      TRIGO     HOMOCO      GLUTO 
##       0.02       0.07       0.08       0.07       0.07       0.22       0.02 
##     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2  SMOKING=3    packyrs 
##       0.08       0.04       0.16       0.06       0.04       0.05       0.00 
##  alcohol=2  alcohol=3 
##       0.03       0.00 
## 
## d.f. for t-distribution for Tests of Single Coefficients:
## 
##      SEX=2        AGE     SYSTBP    DIASTBP      SYSTH     DIASTH DIABETES=1 
##   14168.24     580.70      12.04      20.23       7.56       7.76   13690.44 
## CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1    LENGTHO    WEIGHTO 
##   23747.04    3130.69    1106.61  359001.64    2090.14    5525.61    8239.13 
##       BMIO      CHOLO       HDLO       LDLO      TRIGO     HOMOCO      GLUTO 
##    7411.75     792.24     682.72     829.15     781.82      84.71    8601.38 
##     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2  SMOKING=3    packyrs 
##     562.06    2919.05     164.33    1045.77    2892.08    1760.09  193434.20 
##  alcohol=2  alcohol=3 
##    4306.30  466281.82 
## 
## The following fit components were averaged over the 5 model fits:
## 
##   linear.predictors means stats center
cph_imp_2
## Cox Proportional Hazards Model
## 
## fit.mult.impute(formula = formula_1, fitter = cph, xtrans = dat_areg, 
##     data = dat)
## 
##                        Model Tests      Discrimination    
##                                                Indexes    
## Obs      3873    LR chi2    284.82      R2       0.085    
## Events    460    d.f.           30    R2(30,3873)0.064    
## Center 5.6534    Pr(> chi2) 0.0000     R2(30,460)0.425    
##                  Score chi2 356.89      Dxy      0.392    
##                  Pr(> chi2) 0.0000                        
## 
##            Coef    S.E.   Wald Z Pr(>|Z|)
## SEX=2      -0.1796 0.1644 -1.09  0.2746  
## AGE         0.0325 0.0065  5.02  <0.0001 
## SYSTBP      0.0006 0.0064  0.09  0.9255  
## DIASTBP    -0.0030 0.0099 -0.31  0.7599  
## SYSTH      -0.0028 0.0076 -0.37  0.7130  
## DIASTH      0.0135 0.0133  1.02  0.3079  
## DIABETES=1  0.0467 0.1572  0.30  0.7665  
## CEREBRAL=1  0.2816 0.1276  2.21  0.0274  
## CARDIAC=1   0.4023 0.1131  3.56  0.0004  
## AAA=1       0.6677 0.1288  5.18  <0.0001 
## PERIPH=1    0.2933 0.1165  2.52  0.0118  
## STENOSIS=1  0.2196 0.1228  1.79  0.0738  
## LENGTHO     1.6942 3.9588  0.43  0.6687  
## WEIGHTO    -0.0298 0.0428 -0.70  0.4858  
## BMIO        0.0537 0.1277  0.42  0.6741  
## CHOLO       0.3411 0.7410  0.46  0.6453  
## HDLO       -0.7662 0.7511 -1.02  0.3077  
## LDLO       -0.2800 0.7436 -0.38  0.7065  
## TRIGO      -0.1447 0.3241 -0.45  0.6552  
## HOMOCO      0.0036 0.0054  0.67  0.5026  
## GLUTO       0.0469 0.0292  1.61  0.1077  
## CREATO      0.0021 0.0005  4.42  <0.0001 
## IMTO        0.4525 0.1373  3.29  0.0010  
## albumin=2   0.3415 0.1270  2.69  0.0072  
## albumin=3   0.5888 0.2194  2.68  0.0073  
## SMOKING=2  -0.0072 0.1640 -0.04  0.9650  
## SMOKING=3   0.0502 0.2490  0.20  0.8401  
## packyrs     0.0055 0.0025  2.16  0.0305  
## alcohol=2  -0.2058 0.1718 -1.20  0.2309  
## alcohol=3  -0.1452 0.1214 -1.20  0.2317
# with→poolはcoxphだと行えるが、cphだと行えない様子。
cph_imp_with <- with(data = dat_mice, 
                     coxph(Surv(time = TEVENT, event = EVENT) ~ 
                             SEX + AGE + SYSTBP + DIASTBP + SYSTH + DIASTH + DIABETES + CEREBRAL + CARDIAC + AAA + 
                             PERIPH + STENOSIS + LENGTHO + WEIGHTO + BMIO + CHOLO + HDLO + LDLO + TRIGO + HOMOCO + 
                             GLUTO + CREATO + IMTO + albumin + SMOKING + packyrs + alcohol))

pool_cph_imp_with <- pool(cph_imp_with)
result_cph_imp_with <- summary(pool_cph_imp_with)

result_cph_imp_with
##         term      estimate   std.error   statistic        df      p.value
## 1       SEX2 -0.1785130772 0.164829491 -1.08301662 403.10257 2.794484e-01
## 2        AGE  0.0316758285 0.006322550  5.00997704 355.12613 8.599656e-07
## 3     SYSTBP -0.0011946948 0.004623463 -0.25839827 101.88017 7.966208e-01
## 4    DIASTBP -0.0012535000 0.009153183 -0.13694690  35.10532 8.918552e-01
## 5      SYSTH  0.0002229135 0.005004650  0.04454128  24.98967 9.648270e-01
## 6     DIASTH  0.0093162172 0.009148239  1.01836177  21.32242 3.199226e-01
## 7  DIABETES1  0.0312857622 0.156136466  0.20037447 427.53229 8.412830e-01
## 8  CEREBRAL1  0.2867035668 0.127973133  2.24034186 414.39997 2.559801e-02
## 9   CARDIAC1  0.4007781463 0.112425964  3.56481840 401.38906 4.079601e-04
## 10      AAA1  0.6947424005 0.127420446  5.45236203 339.77355 9.578706e-08
## 11   PERIPH1  0.2875847730 0.118210733  2.43281440 375.24163 1.544850e-02
## 12 STENOSIS1  0.2164980328 0.121915550  1.77580327 410.29150 7.650646e-02
## 13   LENGTHO  1.8156045395 3.933507685  0.46157391 413.67691 6.446295e-01
## 14   WEIGHTO -0.0311736050 0.042592542 -0.73190290 416.70540 4.646392e-01
## 15      BMIO  0.0600028190 0.126873466  0.47293434 417.54718 6.365071e-01
## 16     CHOLO  0.0459389469 0.534514274  0.08594522 423.72721 9.315506e-01
## 17      HDLO -0.4571504928 0.534732795 -0.85491389 426.16717 3.930792e-01
## 18      LDLO  0.0088635918 0.537659365  0.01648552 423.11146 9.868548e-01
## 19     TRIGO -0.0207406658 0.222161637 -0.09335845 423.53260 9.256629e-01
## 20    HOMOCO  0.0012000101 0.005038448  0.23817060  36.99182 8.130632e-01
## 21     GLUTO  0.0474401625 0.029106882  1.62986069 423.48883 1.038745e-01
## 22    CREATO  0.0021176036 0.000441708  4.79412517 394.18263 2.319523e-06
## 23      IMTO  0.4572293474 0.139922469  3.26773357 370.34959 1.185458e-03
## 24  albumin2  0.3502919305 0.121077058  2.89313215 262.94904 4.133674e-03
## 25  albumin3  0.5787222251 0.228061553  2.53757030 181.55195 1.200353e-02
## 26  SMOKING2 -0.0133582869 0.166961810 -0.08000804 273.42636 9.362894e-01
## 27  SMOKING3  0.0324419660 0.247026790  0.13132975 404.26629 8.955798e-01
## 28   packyrs  0.0054693510 0.002563120  2.13386487 420.26706 3.343219e-02
## 29  alcohol2 -0.2094561025 0.171520719 -1.22117085 381.70995 2.227751e-01
## 30  alcohol3 -0.1421799317 0.122368223 -1.16190240 403.18414 2.459625e-01


3-5. 多重代入後のデータを用いて非線形性の確認をしましょう。

formula_2 <- Surv(time = TEVENT, event = EVENT) ~ 
  SEX + rcs(AGE, 4) + SYSTBP + DIASTBP + SYSTH + DIASTH + DIABETES + CEREBRAL + CARDIAC + AAA + 
  PERIPH + STENOSIS + LENGTHO + WEIGHTO + BMIO + CHOLO + HDLO + LDLO + TRIGO + HOMOCO + 
  GLUTO + CREATO + IMTO + albumin + SMOKING + packyrs + alcohol

cph_imp_3 <- fit.mult.impute(formula_2, cph, xtrans = dat_mice, data = dat)
## 
## Variance Inflation Factors Due to Imputation:
## 
##      SEX=2        AGE       AGE'      AGE''     SYSTBP    DIASTBP      SYSTH 
##       1.02       1.00       1.00       1.00       1.24       1.50       1.69 
##     DIASTH DIABETES=1 CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1 
##       1.82       1.00       1.01       1.03       1.04       1.03       1.02 
##    LENGTHO    WEIGHTO       BMIO      CHOLO       HDLO       LDLO      TRIGO 
##       1.01       1.01       1.01       1.01       1.00       1.01       1.01 
##     HOMOCO      GLUTO     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2 
##       1.32       1.01       1.02       1.03       1.07       1.11       1.07 
##  SMOKING=3    packyrs  alcohol=2  alcohol=3 
##       1.02       1.02       1.03       1.02 
## 
## Rate of Missing Information:
## 
##      SEX=2        AGE       AGE'      AGE''     SYSTBP    DIASTBP      SYSTH 
##       0.02       0.00       0.00       0.00       0.19       0.33       0.41 
##     DIASTH DIABETES=1 CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1 
##       0.45       0.00       0.01       0.02       0.04       0.03       0.02 
##    LENGTHO    WEIGHTO       BMIO      CHOLO       HDLO       LDLO      TRIGO 
##       0.01       0.01       0.01       0.01       0.00       0.01       0.01 
##     HOMOCO      GLUTO     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2 
##       0.24       0.01       0.02       0.03       0.07       0.10       0.07 
##  SMOKING=3    packyrs  alcohol=2  alcohol=3 
##       0.02       0.02       0.03       0.02 
## 
## d.f. for t-distribution for Tests of Single Coefficients:
## 
##      SEX=2        AGE       AGE'      AGE''     SYSTBP    DIASTBP      SYSTH 
##   11117.08 2610001.35 2543050.56 2295837.39     108.31      36.05      24.17 
##     DIASTH DIABETES=1 CEREBRAL=1  CARDIAC=1      AAA=1   PERIPH=1 STENOSIS=1 
##      19.81 2202027.90   30396.31    6692.65    2562.27    4308.69   11549.06 
##    LENGTHO    WEIGHTO       BMIO      CHOLO       HDLO       LDLO      TRIGO 
##   21562.25   31804.41   34035.88   60961.14  281244.17   52178.73   58897.78 
##     HOMOCO      GLUTO     CREATO       IMTO  albumin=2  albumin=3  SMOKING=2 
##      67.75   70810.64   13293.80    3511.36     905.31     417.34     859.31 
##  SMOKING=3    packyrs  alcohol=2  alcohol=3 
##    8656.44   14433.20    5329.33   13217.46 
## 
## The following fit components were averaged over the 5 model fits:
## 
##   linear.predictors means stats center
distribution <- datadist(dat)
options(datadist = "distribution")

# 55~60歳くらいからlogRRが増加していく形であることが分かる。
plot(Predict(cph_imp_3, AGE)) 


3-6. 多重代入後のデータを用いて判別能を評価しましょう。

# Somers' D(Dxy)から、C-indexを求める。
Cindex <- round(cph_imp_3$stat[13]*0.5 + 0.5, 3)

cat("apparent C-index is ", Cindex, "\n")
## apparent C-index is  0.7


3-7. 多重代入後のデータを用いて較正能を評価しましょう。

# 作成データセット数
m <- 5

# 4年後のリスクを予測/比較
u <- 365.25*4

# mice::completeデータをスタックする
dat_stack <- complete(dat_mice, 1)

for (i in 2:m){
  dat_stack <- rbind(dat_stack, complete(dat_mice, i))
}

# 作成したデータセット分重みづけする
dat_stack$weight <- 1/m  

# miceデータ(をスタックしたデータ)でcph
# クラスターを考慮した分散ではないので、信頼区間は狭く推定される
# ここでは点推定値だけ検討している
cph_stack <- cph(formula = formula_2, 
                 data = dat_stack, 
                 weights = weight, 
                 x = TRUE, 
                 y = TRUE)

# 予測
estim_surv <- survest(cph_stack,
                      newdata = dat_stack,
                      times = u)$surv

ext_val <- val.surv(cph_stack,
                    newdata = dat_stack, 
                    u = u,
                    est.surv = estim_surv,
                    S = Surv(dat_stack$TEVENT, dat_stack$EVENT)) 
# プロット
plot(ext_val, 
     xlab = "Predicted probability at 4 years", 
     ylab = "Observed probability at 4 years", 
     las = 1)