#library
pacman::p_load(
tidyverse,data.table,tableone,tibble,stringi,readxl,dplyr,scales,ggsci,
skimr,Epi,janitor,summarytools,broom,easystats,car,rms)

#conflict解消
select<-dplyr::select 

library(pROC);library(glmnetUtils);library(glmnet);library(ISLR)
## Type 'citation("pROC")' for a citation.
## 
##  次のパッケージを付け加えます: 'pROC'
##  以下のオブジェクトは 'package:parameters' からマスクされています:
## 
##     ci
##  以下のオブジェクトは 'package:bayestestR' からマスクされています:
## 
##     auc, ci
##  以下のオブジェクトは 'package:stats' からマスクされています:
## 
##     cov, smooth, var
##  要求されたパッケージ Matrix をロード中です
## 
##  次のパッケージを付け加えます: 'Matrix'
##  以下のオブジェクトは 'package:tidyr' からマスクされています:
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
## 
##  次のパッケージを付け加えます: 'glmnet'
##  以下のオブジェクトは 'package:glmnetUtils' からマスクされています:
## 
##     cv.glmnet, glmnet
#factor var
fac_var<-c("RANDID","SEX", "CURSMOKE", "DIABETES", "BPMEDS","educ","PREVCHD", "PREVAP", "PREVMI", "PREVSTRK", "PREVHYP", "PERIOD", "DEATH", 
           "ANGINA", "HOSPMI", "MI_FCHD", "ANYCHD", "STROKE", "CVD", "HYPERTEN")

cont_var<-c("TOTCHOL", "AGE", "SYSBP", "DIABP", "CIGPDAY", "BMI", "HEARTRTE", "GLUCOSE")

full_var<-c(fac_var[-c(1,6,12:20)],cont_var)

var<-c("SEX", "CURSMOKE", "DIABETES", "BPMEDS", "PREVCHD", "PREVAP", "PREVMI", "PREVHYP", cont_var) #PREVSTRは除外している

inter_var<-c("SEX:PREVCHD", "SEX:PREVAP", "SEX:PREVMI", "SEX:PREVHYP", "SEX:DIABP", "CURSMOKE:HEARTRTE","DIABETES:TOTCHOL", "DIABETES:SYSBP",
             "BPMEDS:PREVCHD", "BPMEDS:TOTCHOL", "BPMEDS:AGE", "PREVCHD:GLUCOSE", "PREVHYP:BMI", "TOTCHOL:AGE","HEARTRTE:GLUCOSE")

nonlin_term<-c("BMI","HEARTRTE","GLUCOSE")

#data import
fr<-read_csv("frmgham2.csv") %>% 
  mutate(across(fac_var,as.factor))
## Rows: 11627 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (39): RANDID, SEX, TOTCHOL, AGE, SYSBP, DIABP, CURSMOKE, CIGPDAY, BMI, D...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(fac_var, as.factor)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(fac_var)
## 
##   # Now:
##   data %>% select(all_of(fac_var))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#欠測補完後データのimport
load("fr.missf.RData")
fr.missf<-fr.missf$ximp


#plot in baseline variable
fr_p1<-fr %>% 
  filter(PERIOD==1)

fr.missf_p1<-
  fr.missf %>% 
  filter(PERIOD==1)

full model formula

# full model formula
fullmodel<-
  paste0("STROKE~",
       str_c(var[!var %in% nonlin_term],collapse = "+"), #非線形以外のvarを+でつなぐ
       "+",
       str_c(paste0("rcs(",nonlin_term,",4)"),collapse = "+"), #非線形のvarを+でつなぐ
       "+",
       str_c(inter_var,collapse = "+")) #交互作用項を+でつなぐ

full model fitting results

# Complete case 
full_results<-
  glm(as.formula(fullmodel),data=fr_p1,
      family = binomial(link = "logit")) %>% 
  summary() %>% 
  .$coefficients %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  rename("var"="rowname") %>% 
  mutate(OR=exp(Estimate),
         Lower=exp(Estimate-1.96*`Std. Error`),
         Upper=exp(Estimate+1.96*`Std. Error`)) %>% 
  select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>% 
  #すべての変数を丸める
  mutate_if(is.numeric,~round(.,3))

full_results
##                           var      OR Lower    Upper     p
## 1                 (Intercept)   0.031 0.000   21.641 0.298
## 2                        SEX2   0.346 0.060    2.005 0.237
## 3                   CURSMOKE1   0.226 0.050    1.013 0.052
## 4                   DIABETES1  52.994 1.393 2015.673 0.032
## 5                     BPMEDS1 127.580 2.055 7920.611 0.021
## 6                    PREVCHD1   3.288 0.288   37.541 0.338
## 7                     PREVAP1   0.637 0.163    2.487 0.516
## 8                     PREVMI1   1.392 0.394    4.915 0.607
## 9                    PREVHYP1   5.301 1.053   26.683 0.043
## 10                    TOTCHOL   1.002 0.984    1.020 0.825
## 11                        AGE   1.085 1.003    1.174 0.043
## 12                      SYSBP   1.001 0.992    1.010 0.816
## 13                      DIABP   1.012 0.993    1.031 0.204
## 14                    CIGPDAY   1.003 0.988    1.018 0.677
## 15             rcs(BMI, 4)BMI   0.866 0.755    0.994 0.040
## 16            rcs(BMI, 4)BMI'   1.469 0.940    2.298 0.091
## 17           rcs(BMI, 4)BMI''   0.361 0.089    1.463 0.154
## 18   rcs(HEARTRTE, 4)HEARTRTE   1.013 0.967    1.062 0.585
## 19  rcs(HEARTRTE, 4)HEARTRTE'   0.787 0.658    0.940 0.008
## 20 rcs(HEARTRTE, 4)HEARTRTE''   1.856 1.179    2.923 0.008
## 21     rcs(GLUCOSE, 4)GLUCOSE   0.947 0.904    0.991 0.020
## 22    rcs(GLUCOSE, 4)GLUCOSE'   1.102 0.934    1.300 0.249
## 23   rcs(GLUCOSE, 4)GLUCOSE''   0.768 0.492    1.198 0.244
## 24              SEX2:PREVCHD1   0.720 0.020   26.534 0.858
## 25               SEX2:PREVAP1   2.493 0.082   75.953 0.600
## 26               SEX2:PREVMI1   2.786 0.106   73.500 0.539
## 27              SEX2:PREVHYP1   1.048 0.589    1.863 0.874
## 28                 SEX2:DIABP   1.011 0.989    1.033 0.329
## 29         CURSMOKE0:HEARTRTE   0.976 0.957    0.996 0.017
## 30          DIABETES1:TOTCHOL   0.992 0.981    1.003 0.144
## 31            DIABETES1:SYSBP   0.992 0.975    1.011 0.413
## 32           BPMEDS1:PREVCHD1   0.463 0.105    2.038 0.309
## 33            BPMEDS1:TOTCHOL   0.993 0.983    1.003 0.153
## 34                BPMEDS1:AGE   0.957 0.904    1.013 0.133
## 35           PREVCHD0:GLUCOSE   1.013 0.991    1.036 0.249
## 36               PREVHYP0:BMI   1.044 0.985    1.107 0.144
## 37                TOTCHOL:AGE   1.000 1.000    1.000 0.816
## 38           HEARTRTE:GLUCOSE   1.000 1.000    1.001 0.069
#imputation data
full_results_mi<-
  glm(as.formula(fullmodel),data=fr.missf_p1,
      family = binomial(link = "logit")) %>% 
  summary() %>% 
  .$coefficients %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  rename("var"="rowname") %>% 
  mutate(OR=exp(Estimate),
         Lower=exp(Estimate-1.96*`Std. Error`),
         Upper=exp(Estimate+1.96*`Std. Error`)) %>% 
  select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>% 
  #すべての変数を丸める
  mutate_if(is.numeric,~round(.,3))

full_results_mi
##                           var      OR Lower    Upper     p
## 1                 (Intercept)   0.005 0.000    2.849 0.102
## 2                        SEX2   0.303 0.058    1.583 0.157
## 3                   CURSMOKE1   0.290 0.071    1.188 0.085
## 4                   DIABETES1  55.743 1.632 1903.573 0.026
## 5                     BPMEDS1 198.320 4.319 9106.958 0.007
## 6                    PREVCHD1   3.015 0.328   27.704 0.329
## 7                     PREVAP1   0.689 0.186    2.558 0.578
## 8                     PREVMI1   1.057 0.331    3.368 0.926
## 9                    PREVHYP1   4.549 0.974   21.246 0.054
## 10                    TOTCHOL   1.007 0.990    1.024 0.447
## 11                        AGE   1.106 1.027    1.192 0.008
## 12                      SYSBP   1.001 0.993    1.010 0.748
## 13                      DIABP   1.013 0.995    1.031 0.159
## 14                    CIGPDAY   1.003 0.988    1.017 0.718
## 15             rcs(BMI, 4)BMI   0.917 0.803    1.048 0.204
## 16            rcs(BMI, 4)BMI'   1.256 0.817    1.930 0.299
## 17           rcs(BMI, 4)BMI''   0.552 0.144    2.112 0.385
## 18   rcs(HEARTRTE, 4)HEARTRTE   1.010 0.965    1.056 0.681
## 19  rcs(HEARTRTE, 4)HEARTRTE'   0.809 0.683    0.958 0.014
## 20 rcs(HEARTRTE, 4)HEARTRTE''   1.721 1.120    2.645 0.013
## 21     rcs(GLUCOSE, 4)GLUCOSE   0.941 0.901    0.982 0.006
## 22    rcs(GLUCOSE, 4)GLUCOSE'   1.134 0.990    1.300 0.070
## 23   rcs(GLUCOSE, 4)GLUCOSE''   0.672 0.439    1.028 0.067
## 24              SEX2:PREVCHD1   0.638 0.021   19.464 0.797
## 25               SEX2:PREVAP1   2.654 0.100   70.232 0.559
## 26               SEX2:PREVMI1   3.705 0.165   83.060 0.409
## 27              SEX2:PREVHYP1   1.073 0.623    1.845 0.800
## 28                 SEX2:DIABP   1.012 0.992    1.033 0.240
## 29         CURSMOKE0:HEARTRTE   0.979 0.962    0.998 0.026
## 30          DIABETES1:TOTCHOL   0.993 0.983    1.004 0.217
## 31            DIABETES1:SYSBP   0.989 0.972    1.006 0.192
## 32           BPMEDS1:PREVCHD1   0.434 0.108    1.743 0.239
## 33            BPMEDS1:TOTCHOL   0.991 0.982    1.001 0.070
## 34                BPMEDS1:AGE   0.957 0.907    1.009 0.107
## 35           PREVCHD0:GLUCOSE   1.010 0.991    1.030 0.299
## 36               PREVHYP0:BMI   1.038 0.982    1.098 0.187
## 37                TOTCHOL:AGE   1.000 1.000    1.000 0.428
## 38           HEARTRTE:GLUCOSE   1.000 1.000    1.001 0.045
full_results_mi %>% 
  ggplot(aes(y=var,x=OR))+
  geom_point()+
  geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
  geom_vline(xintercept=1,color="red")

ORが異常に幅広くなっている因子があるため、sparse data biasがかかっている可能性が高い →penalized regressionを行うか、モデル式の変更をすべき状態

Backwards stepwiseとElastic Netによるパラメータ推定を行う

Elastic Netによるパラメータ推定

下準備

# install.packages("glmnet")
# install.packages("ISLR")
# install.packages("glmnetUtils")
# library(glmnetUtils);library(glmnet);library(ISLR)

Ridge regression

#to detect optimal lambda
ridge.cv.res<-
  cv.glmnet(
  x=fr.missf_p1[,var] %>% data.matrix(),
  y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
  family="binomial",
  alpha=0)   #ridge

plot(ridge.cv.res) 

ridge.cv.res$lambda.min #min lambda
## [1] 0.009523244
# model coefficient with min lambda
glmnet(
  x=fr.missf_p1[,var] %>% data.matrix(),
  y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
  family="binomial",
  lambda=ridge.cv.res$lambda.min,
  alpha=0) %>% 
  .$beta
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                     s0
## SEX      -0.0813962421
## CURSMOKE  0.2468760397
## DIABETES  0.7155327927
## BPMEDS    0.5592037443
## PREVCHD   0.3002169372
## PREVAP   -0.2447784453
## PREVMI    0.0955726098
## PREVHYP   0.5155415106
## TOTCHOL  -0.0011747152
## AGE       0.0604815428
## SYSBP     0.0041215475
## DIABP     0.0149010624
## CIGPDAY   0.0040394071
## BMI       0.0159035694
## HEARTRTE -0.0102841712
## GLUCOSE  -0.0009616904

Lasso regression

#to detect optimal lambda
lasso.cv.res<-
  cv.glmnet(
  x=fr.missf_p1[,var] %>% data.matrix(),
  y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
  family="binomial",
  alpha=1)   #lasso

#log lambdaとdevianceの関連性plot
plot(lasso.cv.res)

#devianceが最小のときのlambda
lasso.cv.res$lambda.min #min lambda
## [1] 0.0021
#min lambdaのもとでのlasso regression coefficient;二通りのcodeがあるが、結果は同じ

coef(lasso.cv.res,s="lambda.min") #interceptの値も表示される
## 17 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -9.3732919829
## SEX         -0.0405473179
## CURSMOKE     0.2688147316
## DIABETES     0.6136939222
## BPMEDS       0.5142181504
## PREVCHD      0.0298690706
## PREVAP       .           
## PREVMI       0.1240355798
## PREVHYP      0.5511057332
## TOTCHOL     -0.0008671472
## AGE          0.0671689119
## SYSBP        0.0012846234
## DIABP        0.0178463734
## CIGPDAY      0.0025925936
## BMI          0.0112364738
## HEARTRTE    -0.0092393943
## GLUCOSE      .
glmnet(
  x=fr.missf_p1[,var] %>% data.matrix(),
  y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
  family="binomial",
  lambda=lasso.cv.res$lambda.min,
  alpha=1) %>% 
  .$beta 
## 16 x 1 sparse Matrix of class "dgCMatrix"
##                    s0
## SEX      -0.040476551
## CURSMOKE  0.268983024
## DIABETES  0.613718984
## BPMEDS    0.514118152
## PREVCHD   0.029927580
## PREVAP    .          
## PREVMI    0.123893960
## PREVHYP   0.551330129
## TOTCHOL  -0.000867187
## AGE       0.067179689
## SYSBP     0.001277099
## DIABP     0.017852439
## CIGPDAY   0.002589202
## BMI       0.011237120
## HEARTRTE -0.009239599
## GLUCOSE   .

Elastic Net

alphaを定める

Elastic Netでは、Ridge,Lassoの罰則項を割合alphaで混ぜ合わせる事ができる 当てはまりが最も良い時の最適なalpha求める

#cv.glmnetではなく、cv"a".glmnetを使う; to examine optimal alpha and lambda
elastic.cv.res<-
  cva.glmnet(
  x=fr.missf_p1[,var] %>% data.matrix(),
  y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
  family="binomial"
  ) 

plot(elastic.cv.res)

 #alphaをデフォルトでは0-1の間で11個に振り分け、それぞれでのdevianceとlambdaの関係性を求める

#各alphaのうち、最もDevianceが小さいときのalphaを探し、その中で最も小さいlambdaを探す
##alpha=1はlassoと同じ;alpha=0はRidgeと同じということになる

sapply(elastic.cv.res$modlist,"[[","cvm") %>% #各alphaにおけるcvmを取り出す
  sapply(min) %>% #各alphaの中で最小の値を取り出す
  min() #すべてのalphaの中で最小のcvmを取り出す
## [1] 0.5569369
#結果を踏まえると、alpha=1のとき最小の値を示しており、すなわちlassoが最適と考えられる

#alpha=1のときの最小のlamdaを求める
cv.glmnet(
  x=fr.missf_p1[,var] %>% data.matrix(),
  y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
  family="binomial",
  alpha=1
  ) %>% 
  .$lambda.min
## [1] 0.001318861
 #これは一個前のchankのlamda minと同じ値になる

最適な(α,λ)は(1,0.001913442)であり、lasso regressionが良いということになる

Lasso model (再掲)

lasso.cv.res<-
  cv.glmnet(
  x=fr.missf_p1[,var] %>% data.matrix(),
  y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
  family="binomial",
  alpha=1) 

coef(lasso.cv.res,s="lambda.min") 
## 17 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept) -9.472068721
## SEX         -0.045032608
## CURSMOKE     0.281816221
## DIABETES     0.633658495
## BPMEDS       0.525348184
## PREVCHD      0.033579323
## PREVAP       .          
## PREVMI       0.139586738
## PREVHYP      0.557336313
## TOTCHOL     -0.001026037
## AGE          0.068159990
## SYSBP        0.001215003
## DIABP        0.018234443
## CIGPDAY      0.002976881
## BMI          0.012476398
## HEARTRTE    -0.009930248
## GLUCOSE      .
cf<-coef(lasso.cv.res,s="lambda.min") %>% 
  as.matrix() %>% 
  as.data.frame() %>% 
  mutate(exp=exp(s1)) %>% 
  add_rownames(var="var") 
## Warning: `add_rownames()` was deprecated in dplyr 1.0.0.
## ℹ Please use `tibble::rownames_to_column()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#plot
cf %>% 
 ggplot(aes(y=var,x=exp))+
  geom_point()+
  geom_vline(xintercept=1,color="red")

  • Lasso regressionでは一部の変数の回帰係数が0にshrinkageしており、実質的に変数選択が行われたことがわかる
  • Shrinkageしたのは、PREVAP, GLUCOSEであり、これらは予測寄与が低いことを意味する

Lasso modelを使った予測性能評価

fr.missf_p1$lasso<-
  predict(lasso.cv.res,
        s="lambda.min",
        newx=fr.missf_p1[,var] %>% data.matrix(),
        type="response"
        ) 

fr.missf_p1<-
  fr.missf_p1 %>% 
  mutate(STROKEn=as.numeric(STROKE)-1)

ROC curve

#ROC curve
roc.lasso<-
  roc(STROKEn~lasso,
    data=fr.missf_p1,
    ci=T)
## Setting levels: control = 0, case = 1
## Warning in roc.default(response, predictors[, 1], ...): Deprecated use a matrix
## as predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
roc.lasso
## 
## Call:
## roc.formula(formula = STROKEn ~ lasso, data = fr.missf_p1, ci = T)
## 
## Data: lasso in 4019 controls (STROKEn 0) < 415 cases (STROKEn 1).
## Area under the curve: 0.7515
## 95% CI: 0.7277-0.7753 (DeLong)
roc.lasso %>% 
  ggroc(legacy.axes=T)+
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="grey", linetype="dashed")

- AUC=0.7515104と中等度の予測性能

Calibration plot

val.prob(
  p=fr.missf_p1$lasso,
  y=fr.missf_p1$STROKEn,
  g=10,
  cex=0.5
)

##           Dxy       C (ROC)            R2             D      D:Chi-sq 
##  5.030551e-01  7.515275e-01  1.484266e-01  7.095404e-02  3.156102e+02 
##           D:p             U      U:Chi-sq           U:p             Q 
##            NA -3.219051e-04  5.726729e-01  7.510099e-01  7.127594e-02 
##         Brier     Intercept         Slope          Emax           E90 
##  7.790970e-02  9.269480e-02  1.046995e+00  1.364165e-02  7.993511e-03 
##          Eavg           S:z           S:p 
##  5.026621e-03 -8.989852e-02  9.283679e-01
  • Calibration性能はかなり高い事がわかる

BootstrapによるLasso modelの内的妥当性検証

optimismの算出

B<-100 #number of resampling 
N<-dim(fr.missf_p1)[1] #dataのdimentionを返す→4434行x39列→そのうち一番目(4434)を取得しNとする
AUC.i1<-AUC.i2<-numeric(B) 

for(i in 1:B){  

  #bootstrap random sampling 
  bs.i<-sample(1:N,N,replace=TRUE) #重複を許して1~4434までの数を4434回サンプリングする
  
  #bootstrap samplingに基づき、dataから集団を再構成 →data.iとする
  fr.missf_p1.i<-fr.missf_p1[bs.i,] 
  
  #bootstrapしたデータセットごとで回帰モデル構築→各bootstrap dataからリスク予測確率の算出→data2.1のprob_1列に格納
  lasso.cv.res.i<-
  cv.glmnet(
  x=fr.missf_p1.i[,var] %>% data.matrix(),
  y=fr.missf_p1.i[,"STROKE"] %>% data.matrix(),
  family="binomial",
  alpha=1) 
  
  fr.missf_p1.i$lasso1<- #lasso1では、bootstrap dataから構築した新しいLasso回帰モデルを使い、そのbootstrap dataに対するリスク予測確率を格納
  predict(lasso.cv.res.i,
        s="lambda.min",
        newx=fr.missf_p1.i[,var] %>% data.matrix(),
        type="response"
        )
  
  #AUCの算出;Bootstrap回数(B=100)分のAUCを算出して結果を格納;各bootstrap dataに対するAUC
  AUC.i1[i]<-roc(STROKEn~lasso1,data=fr.missf_p1.i)$auc 
  
  #bootstrapで作成した各回帰モデルを使って、オリジナルデータ(fr.missf_p1)に対するリスク予測確率を算出→fr.missf_p1のlasso2列に格納;AUC算出
  fr.missf_p1$lasso2<-
    predict(lasso.cv.res.i,
        s="lambda.min",
        newx=fr.missf_p1[,var] %>% data.matrix(),
        type="response"
        )
  
  AUC.i2[i]<-roc(STROKEn~lasso2,data=fr.missf_p1)$auc #ここで得られるAUCはbootstrap dataを用いてoriginal dataに外挿したときのAUC
  
  #print(paste(i,"th bootstrap iteration is completed.",sep="")) 
}  

opt1<-AUC.i1-AUC.i2 
#(bootstrap samplingした各data setを使い、それぞれ導いた回帰モデルにおけるAUC)ー(bootstrap samplingした各data setを使い、original dataに外挿したときのAUC)
summary(opt1);hist(opt1)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.016380 -0.001658  0.005819  0.007805  0.017487  0.034368

optimismを用いたbias corrected AUCの算出

lam1<-mean(opt1) #estimate of the optimism 

cor.AUC<-roc.lasso$auc-lam1 #bias corrected AUC estimate

cor.AUC
## [1] 0.7437054
  • Lasso modelによる予測モデル構築を行い、Bootstrapによるbias corrected AUCを算出したところ、0.7437054であった

**疑問 - 今回のモデルでは、事前に考えていた非線形性や交互作用項がモデリングに組み込まれなかったが、実際にはそれらの柔軟なモデリングをLassoに適用することはできるのか?交互作用項に関しては、ペアの変数の掛け算項を列結合すれば良さそうだが、非線形性はどのように表現すればよいのかわからない。

Cross validationによる内的妥当性の検証

#k-fold CV 
k <-5 
AUC_CV <-data.frame(matrix(ncol=2,nrow=k)) 

#Randomly shuffle the data 
fr.missf_p1.r<-fr.missf_p1[sample(nrow(fr.missf_p1)),] 
 
#Create k equally size folds 
folds <- cut(seq(1,nrow(fr.missf_p1.r)),breaks=k,labels=FALSE) 
 
#Perform k fold cross validation 
for(i in 1:k){  

  #Segment data2r by fold using the which() function  
testIndexes <- which(folds==i,arr.ind=TRUE) 
testData <- fr.missf_p1.r[testIndexes, ] 
trainData <- fr.missf_p1.r[-testIndexes, ] 

fit1<-
  cv.glmnet(
  x=trainData[,var] %>% data.matrix(),
  y=trainData[,"STROKE"] %>% data.matrix(),
  family="binomial",
  alpha=1) 

testData$fitted<-
  predict(lasso.cv.res,
        s="lambda.min",
        newx=testData[,var] %>% data.matrix(),
        type="response"
        ) 

ROC <- roc(testData$STROKE, testData$fitted) 
AUC_CV[i,2] <- ROC$auc 
AUC_CV[i,1] <- i 

#print(paste(i,"th cross-validation iteration is completed.", sep="")) 
}  
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
names(AUC_CV)[2] <- "AUC_cv" 
names(AUC_CV)[1] <- "k" 
print(summary(AUC_CV$AUC_cv)) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7260  0.7425  0.7484  0.7502  0.7525  0.7817
  • CVによる内的妥当性検証ではAUCは0.7502146であり、Bootstrapによるbias corrected AUC=0.7437054と遜色ない。

Backward stepwiseによる変数選択とモデリング

ここでは、Lasso modelとの比較可能性を高めるために、2つの回帰モデルを考える 1. 交互作用項、非線形項を含まない線形の全変数モデル 2. 交互作用項、非線形項を含む全変数モデル(full model)

1. 交互作用項、非線形項を含まない線形の全変数モデルを考える(lin_model)

用いる変数はである

lin_form<-paste("STROKE~",paste(full_var,collapse="+"))

lin_model<-glm(as.formula(lin_form),
               data=fr.missf_p1,
               family=binomial(link="logit"))

VIFの検討

rms::vif(lin_model)
##      SEX2 CURSMOKE1 DIABETES1   BPMEDS1  PREVCHD1   PREVAP1   PREVMI1 PREVSTRK1 
##  1.245566  2.439088  1.694071  1.107002  9.598226  6.717491  2.379642  1.000000 
##  PREVHYP1   TOTCHOL       AGE     SYSBP     DIABP   CIGPDAY       BMI  HEARTRTE 
##  1.937207  1.067710  1.317387  3.700669  3.017425  2.568418  1.177581  1.100995 
##   GLUCOSE 
##  1.723184
  • PREVCHD、PREVAPのVIFが高く、多重共線性が疑われるため、PREVCHDを除外して再度VIFを算出
glm(STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + 
    PREVMI + PREVSTRK + PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + 
    CIGPDAY + BMI + HEARTRTE + GLUCOSE,
               data=fr.missf_p1,
               family=binomial(link="logit")) %>% 
  rms::vif()
##      SEX2 CURSMOKE1 DIABETES1   BPMEDS1   PREVAP1   PREVMI1 PREVSTRK1  PREVHYP1 
##  1.244364  2.439228  1.693826  1.106665  1.187531  1.164856  1.000000  1.935830 
##   TOTCHOL       AGE     SYSBP     DIABP   CIGPDAY       BMI  HEARTRTE   GLUCOSE 
##  1.067423  1.316767  3.695881  3.010815  2.568423  1.177247  1.100259  1.722415
  • 見事に各変数のVIFが低下したため、PREVCHDを除外したモデルを採用する(lin_model2)
lin_form2<-
  as.formula(STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + 
    PREVMI + PREVSTRK + PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + 
    CIGPDAY + BMI + HEARTRTE + GLUCOSE)
             
lin_model2<-glm(lin_form2,
               data=fr.missf_p1,
               family=binomial(link="logit"))  

Backwards stepwiseによる変数選択

step(lin_model2,
     direction="both") %>% 
  summary()
## Start:  AIC=2356.87
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI + 
##     PREVSTRK + PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + 
##     BMI + HEARTRTE + GLUCOSE
## 
##            Df Deviance    AIC
## - PREVAP    1   2322.9 2354.9
## - PREVMI    1   2322.9 2354.9
## - SYSBP     1   2322.9 2354.9
## - SEX       1   2323.1 2355.1
## - CIGPDAY   1   2323.4 2355.4
## - GLUCOSE   1   2323.5 2355.5
## - TOTCHOL   1   2323.6 2355.6
## - BMI       1   2324.6 2356.6
## <none>          2322.9 2356.9
## - BPMEDS    1   2326.4 2358.4
## - CURSMOKE  1   2326.5 2358.5
## - HEARTRTE  1   2327.9 2359.9
## - DIABETES  1   2329.9 2361.9
## - DIABP     1   2331.5 2363.5
## - PREVHYP   1   2333.0 2365.0
## - AGE       1   2412.6 2444.6
## - PREVSTRK  1   2438.7 2470.7
## 
## Step:  AIC=2354.87
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVSTRK + 
##     PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI + 
##     HEARTRTE + GLUCOSE
## 
##            Df Deviance    AIC
## - PREVMI    1   2322.9 2352.9
## - SYSBP     1   2322.9 2352.9
## - SEX       1   2323.1 2353.1
## - CIGPDAY   1   2323.4 2353.4
## - GLUCOSE   1   2323.5 2353.5
## - TOTCHOL   1   2323.6 2353.6
## - BMI       1   2324.6 2354.6
## <none>          2322.9 2354.9
## - BPMEDS    1   2326.4 2356.4
## - CURSMOKE  1   2326.5 2356.5
## + PREVAP    1   2322.9 2356.9
## - HEARTRTE  1   2328.0 2358.0
## - DIABETES  1   2329.9 2359.9
## - DIABP     1   2331.5 2361.5
## - PREVHYP   1   2333.0 2363.0
## - AGE       1   2413.8 2443.8
## - PREVSTRK  1   2438.8 2468.8
## 
## Step:  AIC=2352.88
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + 
##     TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI + HEARTRTE + 
##     GLUCOSE
## 
##            Df Deviance    AIC
## - SYSBP     1   2322.9 2350.9
## - SEX       1   2323.1 2351.1
## - CIGPDAY   1   2323.4 2351.4
## - GLUCOSE   1   2323.5 2351.5
## - TOTCHOL   1   2323.6 2351.6
## - BMI       1   2324.6 2352.6
## <none>          2322.9 2352.9
## - BPMEDS    1   2326.4 2354.4
## - CURSMOKE  1   2326.5 2354.5
## + PREVMI    1   2322.9 2354.9
## + PREVAP    1   2322.9 2354.9
## - HEARTRTE  1   2328.0 2356.0
## - DIABETES  1   2329.9 2357.9
## - DIABP     1   2331.5 2359.5
## - PREVHYP   1   2333.0 2361.0
## - AGE       1   2414.8 2442.8
## - PREVSTRK  1   2439.4 2467.4
## 
## Step:  AIC=2350.94
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + 
##     TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
## 
##            Df Deviance    AIC
## - SEX       1   2323.2 2349.2
## - CIGPDAY   1   2323.5 2349.5
## - GLUCOSE   1   2323.5 2349.5
## - TOTCHOL   1   2323.7 2349.7
## - BMI       1   2324.7 2350.7
## <none>          2322.9 2350.9
## - CURSMOKE  1   2326.6 2352.6
## - BPMEDS    1   2326.6 2352.6
## + SYSBP     1   2322.9 2352.9
## + PREVMI    1   2322.9 2352.9
## + PREVAP    1   2322.9 2352.9
## - HEARTRTE  1   2328.0 2354.0
## - DIABETES  1   2330.0 2356.0
## - PREVHYP   1   2334.9 2360.9
## - DIABP     1   2339.6 2365.6
## - AGE       1   2427.5 2453.5
## - PREVSTRK  1   2439.5 2465.5
## 
## Step:  AIC=2349.15
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + 
##     TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
## 
##            Df Deviance    AIC
## - GLUCOSE   1   2323.7 2347.7
## - CIGPDAY   1   2323.9 2347.9
## - TOTCHOL   1   2324.1 2348.1
## - BMI       1   2324.9 2348.9
## <none>          2323.2 2349.2
## - BPMEDS    1   2326.7 2350.7
## - CURSMOKE  1   2326.8 2350.8
## + SEX       1   2322.9 2350.9
## + SYSBP     1   2323.1 2351.1
## + PREVMI    1   2323.1 2351.1
## + PREVAP    1   2323.2 2351.2
## - HEARTRTE  1   2328.5 2352.5
## - DIABETES  1   2330.3 2354.3
## - PREVHYP   1   2335.1 2359.1
## - DIABP     1   2339.9 2363.9
## - AGE       1   2427.9 2451.9
## - PREVSTRK  1   2439.8 2463.8
## 
## Step:  AIC=2347.72
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + 
##     TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE
## 
##            Df Deviance    AIC
## - CIGPDAY   1   2324.5 2346.5
## - TOTCHOL   1   2324.6 2346.6
## - BMI       1   2325.4 2347.4
## <none>          2323.7 2347.7
## + GLUCOSE   1   2323.2 2349.2
## - BPMEDS    1   2327.2 2349.2
## - CURSMOKE  1   2327.3 2349.3
## + SEX       1   2323.5 2349.5
## + PREVMI    1   2323.7 2349.7
## + SYSBP     1   2323.7 2349.7
## + PREVAP    1   2323.7 2349.7
## - HEARTRTE  1   2329.4 2351.4
## - DIABETES  1   2331.7 2353.7
## - PREVHYP   1   2335.6 2357.6
## - DIABP     1   2340.7 2362.7
## - AGE       1   2428.0 2450.0
## - PREVSTRK  1   2440.1 2462.1
## 
## Step:  AIC=2346.51
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + 
##     TOTCHOL + AGE + DIABP + BMI + HEARTRTE
## 
##            Df Deviance    AIC
## - TOTCHOL   1   2325.4 2345.4
## - BMI       1   2326.2 2346.2
## <none>          2324.5 2346.5
## + CIGPDAY   1   2323.7 2347.7
## - BPMEDS    1   2327.9 2347.9
## + GLUCOSE   1   2323.9 2347.9
## + SEX       1   2324.1 2348.1
## + PREVMI    1   2324.5 2348.5
## + SYSBP     1   2324.5 2348.5
## + PREVAP    1   2324.5 2348.5
## - HEARTRTE  1   2330.0 2350.0
## - DIABETES  1   2332.4 2352.4
## - PREVHYP   1   2336.3 2356.3
## - CURSMOKE  1   2339.1 2359.1
## - DIABP     1   2341.6 2361.6
## - AGE       1   2428.0 2448.0
## - PREVSTRK  1   2440.6 2460.6
## 
## Step:  AIC=2345.44
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + 
##     AGE + DIABP + BMI + HEARTRTE
## 
##            Df Deviance    AIC
## - BMI       1   2327.1 2345.1
## <none>          2325.4 2345.4
## + TOTCHOL   1   2324.5 2346.5
## + CIGPDAY   1   2324.6 2346.6
## - BPMEDS    1   2328.7 2346.7
## + SEX       1   2324.8 2346.8
## + GLUCOSE   1   2324.8 2346.8
## + PREVMI    1   2325.4 2347.4
## + PREVAP    1   2325.4 2347.4
## + SYSBP     1   2325.4 2347.4
## - HEARTRTE  1   2331.4 2349.4
## - DIABETES  1   2333.3 2351.3
## - PREVHYP   1   2337.1 2355.1
## - CURSMOKE  1   2340.3 2358.3
## - DIABP     1   2342.4 2360.4
## - AGE       1   2428.2 2446.2
## - PREVSTRK  1   2442.3 2460.3
## 
## Step:  AIC=2345.13
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + 
##     AGE + DIABP + HEARTRTE
## 
##            Df Deviance    AIC
## <none>          2327.1 2345.1
## + BMI       1   2325.4 2345.4
## + TOTCHOL   1   2326.2 2346.2
## + CIGPDAY   1   2326.3 2346.3
## - BPMEDS    1   2330.3 2346.3
## + SEX       1   2326.5 2346.5
## + GLUCOSE   1   2326.6 2346.6
## + PREVMI    1   2327.1 2347.1
## + PREVAP    1   2327.1 2347.1
## + SYSBP     1   2327.1 2347.1
## - HEARTRTE  1   2333.0 2349.0
## - DIABETES  1   2335.8 2351.8
## - PREVHYP   1   2339.5 2355.5
## - CURSMOKE  1   2340.9 2356.9
## - DIABP     1   2347.3 2363.3
## - AGE       1   2429.3 2445.3
## - PREVSTRK  1   2443.8 2459.8
## 
## Call:
## glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + 
##     PREVHYP + AGE + DIABP + HEARTRTE, family = binomial(link = "logit"), 
##     data = fr.missf_p1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3340  -0.4512  -0.3184  -0.2316   2.9566  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -7.767033   0.667549 -11.635  < 2e-16 ***
## CURSMOKE1     0.427568   0.115490   3.702 0.000214 ***
## DIABETES1     0.750248   0.241140   3.111 0.001863 ** 
## BPMEDS1       0.407887   0.222002   1.837 0.066163 .  
## PREVSTRK1    17.571864 238.184327   0.074 0.941190    
## PREVHYP1      0.516553   0.146211   3.533 0.000411 ***
## AGE           0.071498   0.007278   9.823  < 2e-16 ***
## DIABP         0.023741   0.005264   4.510 6.49e-06 ***
## HEARTRTE     -0.011267   0.004715  -2.389 0.016872 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2756.0  on 4433  degrees of freedom
## Residual deviance: 2327.1  on 4425  degrees of freedom
## AIC: 2345.1
## 
## Number of Fisher Scoring iterations: 14
  • Backwardsの結果、glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP + AGE + DIABP + HEARTRTE, family = binomial(link = “logit”), data = fr.missf_p1) が最良のモデルとして選ばれた
stepwise_results<-
  glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + 
    PREVHYP + AGE + DIABP + HEARTRTE, family = binomial(link = "logit"), 
    data = fr.missf_p1) %>% 
  summary() %>% 
  .$coefficients %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  rename("var"="rowname") %>% 
  mutate(OR=exp(Estimate),
         Lower=exp(Estimate-1.96*`Std. Error`),
         Upper=exp(Estimate+1.96*`Std. Error`)) %>% 
  select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>% 
  #すべての変数を丸める
  mutate_if(is.numeric,~round(.,3))

#結果を表示
stepwise_results
##           var          OR Lower         Upper     p
## 1 (Intercept) 0.00000e+00 0.000  2.000000e-03 0.000
## 2   CURSMOKE1 1.53400e+00 1.223  1.923000e+00 0.000
## 3   DIABETES1 2.11800e+00 1.320  3.397000e+00 0.002
## 4     BPMEDS1 1.50400e+00 0.973  2.323000e+00 0.066
## 5   PREVSTRK1 4.27921e+07 0.000 2.387569e+210 0.941
## 6    PREVHYP1 1.67600e+00 1.259  2.233000e+00 0.000
## 7         AGE 1.07400e+00 1.059  1.090000e+00 0.000
## 8       DIABP 1.02400e+00 1.014  1.035000e+00 0.000
## 9    HEARTRTE 9.89000e-01 0.980  9.980000e-01 0.017
#plot
stepwise_results %>% 
  #filter(!var=="PREVSTRK1") %>%  #これだけ非常にCIが大きくグラフ化すると視認性が悪いので一時的に削除
  ggplot(aes(y=var,x=OR))+
  geom_point()+
  geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
  geom_vline(xintercept=1,color="red")

- 変数選択の結果残った変数のうち、PREVSTRKのORが非常に大きくなり、推定が不安定になっている - アウトカムとPREVSTROKEの関係を詳しく見てみると、分割表において、 4019, 383, 0, 32  であり、STROKE=0のとき、PREVSTRK=1に該当する患者が0であることがわかった→これがsparse biasの原因である

xtabs(~STROKE+PREVSTRK,
      data=fr.missf_p1)
##       PREVSTRK
## STROKE    0    1
##      0 4019    0
##      1  383   32

ここで、すべてのカテゴリカル変数に対して同様にアウトカムとの分割表を考えてみる

f<-
  function(x){
form<-as.formula(paste("~STROKE+",x))  
xtabs(form,data=fr.missf_p1)
  }

for(i in fac_var[-1]){
  print(f(i))
}
##       SEX
## STROKE    1    2
##      0 1751 2268
##      1  193  222
##       CURSMOKE
## STROKE    0    1
##      0 2037 1982
##      1  216  199
##       DIABETES
## STROKE    0    1
##      0 3926   93
##      1  387   28
##       BPMEDS
## STROKE    0    1
##      0 3917  102
##      1  373   42
##       educ
## STROKE    1    2    3    4
##      0 1659 1206  676  478
##      1  224  104   52   35
##       PREVCHD
## STROKE    0    1
##      0 3862  157
##      1  378   37
##       PREVAP
## STROKE    0    1
##      0 3899  120
##      1  388   27
##       PREVMI
## STROKE    0    1
##      0 3949   70
##      1  399   16
##       PREVSTRK
## STROKE    0    1
##      0 4019    0
##      1  383   32
##       PREVHYP
## STROKE    0    1
##      0 2840 1179
##      1  164  251
##       PERIOD
## STROKE    1    2    3
##      0 4019    0    0
##      1  415    0    0
##       DEATH
## STROKE    0    1
##      0 2746 1273
##      1  138  277
##       ANGINA
## STROKE    0    1
##      0 3394  625
##      1  315  100
##       HOSPMI
## STROKE    0    1
##      0 3636  383
##      1  344   71
##       MI_FCHD
## STROKE    0    1
##      0 3398  621
##      1  305  110
##       ANYCHD
## STROKE    0    1
##      0 2958 1061
##      1  236  179
## STROKE
##    0    1 
## 4019  415 
##       CVD
## STROKE    0    1
##      0 3277  742
##      1    0  415
##       HYPERTEN
## STROKE    0    1
##      0 1142 2877
##      1   40  375
  • カテゴリカル変数に対してアウトカムとの分割表を考えてみると、sparseな部分が可視化できるため、事前の変数選択を考える上で非常に有用であることがわかった
  • 今回は、PREVSTRKのみが完全にsparse(cell=0がある)であるため、この変数を除外して再度変数選択を行うこととする(本当はこれを解析前にできればよかった)

PREVSTROKEを除外したモデルにおいて、VIFから再度検討する

glm(
  STROKE~ SEX+CURSMOKE+DIABETES+BPMEDS+PREVCHD+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+BMI+HEARTRTE+GLUCOSE,
               data=fr.missf_p1,
               family=binomial(link="logit")) %>% 
  rms::vif()
##      SEX2 CURSMOKE1 DIABETES1   BPMEDS1  PREVCHD1   PREVAP1   PREVMI1  PREVHYP1 
##  1.240065  2.456395  1.705096  1.112819  8.979822  5.835302  2.735766  1.922086 
##   TOTCHOL       AGE     SYSBP     DIABP   CIGPDAY       BMI  HEARTRTE   GLUCOSE 
##  1.066603  1.317287  3.664247  2.980477  2.583968  1.182116  1.101576  1.730470
#ココでも同様にPREVCHDのVIFが高いため、除外して再度VIFを算出
glm(
  STROKE~ SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+BMI+HEARTRTE+GLUCOSE,
               data=fr.missf_p1,
               family=binomial(link="logit")) %>% 
  rms::vif()
##      SEX2 CURSMOKE1 DIABETES1   BPMEDS1   PREVAP1   PREVMI1  PREVHYP1   TOTCHOL 
##  1.240237  2.456599  1.705139  1.112127  1.179417  1.159379  1.919544  1.065887 
##       AGE     SYSBP     DIABP   CIGPDAY       BMI  HEARTRTE   GLUCOSE 
##  1.316273  3.658322  2.971481  2.583932  1.181654  1.100465  1.729751
#いい感じ→これを初期モデルとする

lin_model3<-
  glm(STROKE~ SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+BMI+HEARTRTE+GLUCOSE,
               data=fr.missf_p1,
               family=binomial(link="logit"))
step(lin_model3,
     direction="both") %>% 
  summary()
## Start:  AIC=2470.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI + 
##     PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI + 
##     HEARTRTE + GLUCOSE
## 
##            Df Deviance    AIC
## - PREVAP    1   2438.8 2468.8
## - SYSBP     1   2438.8 2468.8
## - SEX       1   2439.0 2469.0
## - CIGPDAY   1   2439.1 2469.1
## - GLUCOSE   1   2439.1 2469.1
## - PREVMI    1   2439.4 2469.4
## - TOTCHOL   1   2440.2 2470.2
## - BMI       1   2440.3 2470.3
## <none>          2438.7 2470.7
## - CURSMOKE  1   2442.3 2472.3
## - DIABETES  1   2445.2 2475.2
## - HEARTRTE  1   2445.4 2475.4
## - BPMEDS    1   2445.7 2475.7
## - DIABP     1   2446.6 2476.6
## - PREVHYP   1   2453.2 2483.2
## - AGE       1   2532.4 2562.4
## 
## Step:  AIC=2468.82
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + 
##     TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI + HEARTRTE + 
##     GLUCOSE
## 
##            Df Deviance    AIC
## - SYSBP     1   2438.9 2466.9
## - SEX       1   2439.1 2467.1
## - CIGPDAY   1   2439.1 2467.1
## - GLUCOSE   1   2439.1 2467.1
## - PREVMI    1   2439.4 2467.4
## - TOTCHOL   1   2440.3 2468.3
## - BMI       1   2440.4 2468.4
## <none>          2438.8 2468.8
## - CURSMOKE  1   2442.4 2470.4
## + PREVAP    1   2438.7 2470.7
## - DIABETES  1   2445.2 2473.2
## - HEARTRTE  1   2445.5 2473.5
## - BPMEDS    1   2445.7 2473.7
## - DIABP     1   2446.7 2474.7
## - PREVHYP   1   2453.3 2481.3
## - AGE       1   2533.2 2561.2
## 
## Step:  AIC=2466.91
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + 
##     TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
## 
##            Df Deviance    AIC
## - SEX       1   2439.1 2465.1
## - GLUCOSE   1   2439.2 2465.2
## - CIGPDAY   1   2439.2 2465.2
## - PREVMI    1   2439.5 2465.5
## - TOTCHOL   1   2440.4 2466.4
## - BMI       1   2440.5 2466.5
## <none>          2438.9 2466.9
## - CURSMOKE  1   2442.5 2468.5
## + SYSBP     1   2438.8 2468.8
## + PREVAP    1   2438.8 2468.8
## - DIABETES  1   2445.4 2471.4
## - HEARTRTE  1   2445.6 2471.6
## - BPMEDS    1   2446.0 2472.0
## - DIABP     1   2454.4 2480.4
## - PREVHYP   1   2456.1 2482.1
## - AGE       1   2546.7 2572.7
## 
## Step:  AIC=2465.12
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + TOTCHOL + 
##     AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
## 
##            Df Deviance    AIC
## - GLUCOSE   1   2439.4 2463.4
## - CIGPDAY   1   2439.6 2463.6
## - PREVMI    1   2439.8 2463.8
## - BMI       1   2440.7 2464.7
## - TOTCHOL   1   2440.8 2464.8
## <none>          2439.1 2465.1
## - CURSMOKE  1   2442.7 2466.7
## + SEX       1   2438.9 2466.9
## + PREVAP    1   2439.1 2467.1
## + SYSBP     1   2439.1 2467.1
## - DIABETES  1   2445.7 2469.7
## - BPMEDS    1   2446.1 2470.1
## - HEARTRTE  1   2446.2 2470.2
## - DIABP     1   2454.8 2478.8
## - PREVHYP   1   2456.2 2480.2
## - AGE       1   2546.9 2570.9
## 
## Step:  AIC=2463.41
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + TOTCHOL + 
##     AGE + DIABP + CIGPDAY + BMI + HEARTRTE
## 
##            Df Deviance    AIC
## - CIGPDAY   1   2439.9 2461.9
## - PREVMI    1   2440.1 2462.1
## - BMI       1   2441.0 2463.0
## - TOTCHOL   1   2441.1 2463.1
## <none>          2439.4 2463.4
## - CURSMOKE  1   2443.0 2465.0
## + GLUCOSE   1   2439.1 2465.1
## + SEX       1   2439.2 2465.2
## + PREVAP    1   2439.4 2465.4
## + SYSBP     1   2439.4 2465.4
## - BPMEDS    1   2446.3 2468.3
## - HEARTRTE  1   2446.7 2468.7
## - DIABETES  1   2447.4 2469.4
## - DIABP     1   2455.2 2477.2
## - PREVHYP   1   2456.5 2478.5
## - AGE       1   2547.0 2569.0
## 
## Step:  AIC=2461.93
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + TOTCHOL + 
##     AGE + DIABP + BMI + HEARTRTE
## 
##            Df Deviance    AIC
## - PREVMI    1   2440.6 2460.6
## - BMI       1   2441.5 2461.5
## - TOTCHOL   1   2441.6 2461.6
## <none>          2439.9 2461.9
## + CIGPDAY   1   2439.4 2463.4
## + SEX       1   2439.5 2463.5
## + GLUCOSE   1   2439.6 2463.6
## + PREVAP    1   2439.9 2463.9
## + SYSBP     1   2439.9 2463.9
## - BPMEDS    1   2446.7 2466.7
## - HEARTRTE  1   2447.1 2467.1
## - DIABETES  1   2447.9 2467.9
## - CURSMOKE  1   2453.0 2473.0
## - DIABP     1   2455.8 2475.8
## - PREVHYP   1   2456.9 2476.9
## - AGE       1   2547.0 2567.0
## 
## Step:  AIC=2460.57
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + TOTCHOL + AGE + 
##     DIABP + BMI + HEARTRTE
## 
##            Df Deviance    AIC
## - BMI       1   2442.1 2460.1
## - TOTCHOL   1   2442.3 2460.3
## <none>          2440.6 2460.6
## + PREVMI    1   2439.9 2461.9
## + CIGPDAY   1   2440.1 2462.1
## + SEX       1   2440.1 2462.1
## + GLUCOSE   1   2440.3 2462.3
## + SYSBP     1   2440.6 2462.6
## + PREVAP    1   2440.6 2462.6
## - BPMEDS    1   2447.4 2465.4
## - HEARTRTE  1   2447.8 2465.8
## - DIABETES  1   2448.8 2466.8
## - CURSMOKE  1   2454.1 2472.1
## - DIABP     1   2456.3 2474.3
## - PREVHYP   1   2457.8 2475.8
## - AGE       1   2550.6 2568.6
## 
## Step:  AIC=2460.12
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + TOTCHOL + AGE + 
##     DIABP + HEARTRTE
## 
##            Df Deviance    AIC
## - TOTCHOL   1   2443.8 2459.8
## <none>          2442.1 2460.1
## + BMI       1   2440.6 2460.6
## + PREVMI    1   2441.5 2461.5
## + CIGPDAY   1   2441.6 2461.6
## + SEX       1   2441.6 2461.6
## + GLUCOSE   1   2441.9 2461.9
## + SYSBP     1   2442.1 2462.1
## + PREVAP    1   2442.1 2462.1
## - BPMEDS    1   2449.0 2465.0
## - HEARTRTE  1   2449.2 2465.2
## - DIABETES  1   2451.1 2467.1
## - CURSMOKE  1   2454.6 2470.6
## - PREVHYP   1   2460.2 2476.2
## - DIABP     1   2461.0 2477.0
## - AGE       1   2551.7 2567.7
## 
## Step:  AIC=2459.81
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + 
##     HEARTRTE
## 
##            Df Deviance    AIC
## <none>          2443.8 2459.8
## + TOTCHOL   1   2442.1 2460.1
## + BMI       1   2442.3 2460.3
## + SEX       1   2443.0 2461.0
## + PREVMI    1   2443.2 2461.2
## + CIGPDAY   1   2443.2 2461.2
## + GLUCOSE   1   2443.6 2461.6
## + SYSBP     1   2443.8 2461.8
## + PREVAP    1   2443.8 2461.8
## - BPMEDS    1   2450.4 2464.4
## - HEARTRTE  1   2451.6 2465.6
## - DIABETES  1   2452.7 2466.7
## - CURSMOKE  1   2456.7 2470.7
## - PREVHYP   1   2461.6 2475.6
## - DIABP     1   2462.5 2476.5
## - AGE       1   2551.7 2565.7
## 
## Call:
## glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + 
##     AGE + DIABP + HEARTRTE, family = binomial(link = "logit"), 
##     data = fr.missf_p1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3955  -0.4649  -0.3255  -0.2367   2.9230  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.459839   0.645143 -11.563  < 2e-16 ***
## CURSMOKE1    0.401439   0.112012   3.584 0.000339 ***
## DIABETES1    0.739673   0.235375   3.143 0.001675 ** 
## BPMEDS1      0.547228   0.207449   2.638 0.008342 ** 
## PREVHYP1     0.597373   0.141426   4.224 2.40e-05 ***
## AGE          0.071244   0.007065  10.085  < 2e-16 ***
## DIABP        0.021968   0.005071   4.332 1.48e-05 ***
## HEARTRTE    -0.012629   0.004590  -2.751 0.005933 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2756.0  on 4433  degrees of freedom
## Residual deviance: 2443.8  on 4426  degrees of freedom
## AIC: 2459.8
## 
## Number of Fisher Scoring iterations: 6

-Backwardsで最良のモデルとして選ばれたのは glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE, family = binomial(link = “logit”), data = fr.missf_p1)

stepwise_results<-
  glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + 
    AGE + DIABP + HEARTRTE, family = binomial(link = "logit"), 
    data = fr.missf_p1) %>% 
  summary() %>% 
  .$coefficients %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  rename("var"="rowname") %>% 
  mutate(OR=exp(Estimate),
         Lower=exp(Estimate-1.96*`Std. Error`),
         Upper=exp(Estimate+1.96*`Std. Error`)) %>% 
  select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>% 
  #すべての変数を丸める
  mutate_if(is.numeric,~round(.,3))

#結果を表示
stepwise_results
##           var    OR Lower Upper     p
## 1 (Intercept) 0.001 0.000 0.002 0.000
## 2   CURSMOKE1 1.494 1.199 1.861 0.000
## 3   DIABETES1 2.095 1.321 3.323 0.002
## 4     BPMEDS1 1.728 1.151 2.596 0.008
## 5    PREVHYP1 1.817 1.377 2.398 0.000
## 6         AGE 1.074 1.059 1.089 0.000
## 7       DIABP 1.022 1.012 1.032 0.000
## 8    HEARTRTE 0.987 0.979 0.996 0.006
#plot
stepwise_results %>% 
  ggplot(aes(y=var,x=OR))+
  geom_point()+
  geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
  geom_vline(xintercept=1,color="red")

- ORが発散することなくとてもいい感じの推定ができたので、このモデルを 1. 交互作用項、非線形項を含まない線形の全変数モデル(lin_model)における最終モデル(lin_final_model)とする

lin_final_formula<-as.formula(STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE)

lin_final_model<-
  glm(formula = lin_final_formula, 
      family = binomial(link = "logit"), 
      #x=T,y=T,
      data = fr.missf_p1)

予測性能の評価

ROC AUC

# risk prediction fitting
fr.missf_p1$fitted.lin.fin<-
  predict(lin_final_model,
          type="response") 

# ROC AUC
roc.lin.final<-
  roc(STROKE~fitted.lin.fin,
    data=fr.missf_p1,
    ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.lin.final
## 
## Call:
## roc.formula(formula = STROKE ~ fitted.lin.fin, data = fr.missf_p1,     ci = T)
## 
## Data: fitted.lin.fin in 4019 controls (STROKE 0) < 415 cases (STROKE 1).
## Area under the curve: 0.7504
## 95% CI: 0.7265-0.7742 (DeLong)
roc.lin.final %>% 
  ggroc(legacy.axes=T)+
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="grey", linetype="dashed")

- ROCは0.7503518であり、中等度の予測性能と考えられる

#calibration plot
cal.lin.final<-
  val.prob(
  p=fr.missf_p1$fitted.lin.fin,
  y=fr.missf_p1$STROKEn,
  g=10,
  cex=0.5) 

- calibration plotからみても予測性能は高い

Bootstrapによるliner final modelの内的妥当性検証

optimismの算出

B<-100 #number of resampling 
N<-dim(fr.missf_p1)[1] #dataのdimentionを返す→4434行x39列→そのうち一番目(4434)を取得しNとする
AUC.i1<-AUC.i2<-numeric(B) 

for(i in 1:B){  

  #bootstrap random sampling 
  bs.i<-sample(1:N,N,replace=TRUE) #重複を許して1~4434までの数を4434回サンプリングする
  
  #bootstrap samplingに基づき、dataから集団を再構成 →data.iとする
  fr.missf_p1.i<-fr.missf_p1[bs.i,] 
  
  #bootstrapしたデータセットごとで回帰モデル構築→各bootstrap dataからリスク予測確率の算出→data2.1のprob_1列に格納
  lin.fin.cv.res.i<-
    glm(formula = lin_final_formula, 
      family = binomial(link = "logit"), 
      data = fr.missf_p1.i)
  
  fr.missf_p1.i$lin.fin.1<- #lin.fin.1では、bootstrap dataから構築した新しいロジスティク回帰モデルを使い、そのbootstrap dataに対するリスク予測確率を格納
    predict(lin.fin.cv.res.i,
            data=fr.missf_p1.i,
            type="response") 
 
  #AUCの算出;Bootstrap回数(B=100)分のAUCを算出して結果を格納;各bootstrap dataに対するAUC
  AUC.i1[i]<-roc(STROKE~lin.fin.1,data=fr.missf_p1.i)$auc 
  
  
  #bootstrapで作成した各回帰モデルを使って、オリジナルデータ(fr.missf_p1)に対するリスク予測確率を算出→fr.missf_p1のlin.fin.2列に格納;AUC算出
  fr.missf_p1$lin.fin.2<-
    predict(lin.fin.cv.res.i,
            newdata=fr.missf_p1, ##dataではなくnewdataにする必要がある!!!!
            type="response") 
  
  AUC.i2[i]<-roc(STROKE~lin.fin.2,data=fr.missf_p1)$auc #ここで得られるAUCはbootstrap dataを用いてoriginal dataに外挿したときのAUC
  
  #print(paste(i,"th bootstrap iteration is completed.",sep="")) 
}  

opt2<-AUC.i1-AUC.i2 
#(bootstrap samplingした各data setを使い、それぞれ導いた回帰モデルにおけるAUC)ー(bootstrap samplingした各data setを使い、original dataに外挿したときのAUC)
summary(opt2);hist(opt2)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.036175 -0.004380  0.005196  0.004162  0.013247  0.034522

optimismを用いたbias corrected AUCの算出

lam2<-mean(opt2) #estimate of the optimism 

lin.cor.AUC<-roc.lin.final$auc-lam2 #bias corrected AUC estimate

lin.cor.AUC
## [1] 0.7461894
  • Linerity final modelによる予測モデル構築を行い、Bootstrapによるbias corrected AUCを算出したところ、0.7461894であった
  • 当初、2個目のpredictのdata部分をnewdataにしていなかったためoptimismが非常に大きく出てしまっていた。回帰モデルの内部におけるdataとpredictにおける外挿したいdataが異なるときは、newdataに設定する必要がある

Cross validationによる内的妥当性の検証

#k-fold CV 
k <-5 
AUC_CV <-data.frame(matrix(ncol=2,nrow=k)) 

lin_final_formula<-as.formula(STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE)

lin_final_model<-
  glm(formula = lin_final_formula, 
      family = binomial(link = "logit"), 
      data = fr.missf_p1)

#Randomly shuffle the data 
fr.missf_p1.r<-fr.missf_p1[sample(nrow(fr.missf_p1)),] 
 
#Create k equally size folds 
folds <- cut(seq(1,nrow(fr.missf_p1.r)),breaks=k,labels=FALSE) 
 
#Perform k fold cross validation 
for(i in 1:k){  

  #Segment data2r by fold using the which() function  
testIndexes <- which(folds==i,arr.ind=TRUE) 
testData <- fr.missf_p1.r[testIndexes, ] 
trainData <- fr.missf_p1.r[-testIndexes, ] 
fit1<-glm(formula = lin_final_formula, family = binomial(link = "logit"), data = trainData)
testData$fitted<-predict(fit1,testData,type="response") 
ROC <- roc(testData$STROKE, testData$fitted) 
AUC_CV[i,2] <- ROC$auc 
AUC_CV[i,1] <- i 

#print(paste(i,"th cross-validation iteration is completed.", sep="")) 
}  
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
names(AUC_CV)[2] <- "AUC_cv" 
names(AUC_CV)[1] <- "k" 
print(summary(AUC_CV$AUC_cv)) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7163  0.7251  0.7339  0.7456  0.7728  0.7801
  • CVではAUCの平均値は0.7456469

2. 交互作用項、非線形項を含む線形の全変数モデルを考える(full_model)

用いる変数はである

  • full modelのうち、前述で多重共線性やsparseの問題から削除した変数(PREVSTRK,PREVCHD)を除いたモデルを考える
fullmodel
## [1] "STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVCHD+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+SEX:PREVCHD+SEX:PREVAP+SEX:PREVMI+SEX:PREVHYP+SEX:DIABP+CURSMOKE:HEARTRTE+DIABETES:TOTCHOL+DIABETES:SYSBP+BPMEDS:PREVCHD+BPMEDS:TOTCHOL+BPMEDS:AGE+PREVCHD:GLUCOSE+PREVHYP:BMI+TOTCHOL:AGE+HEARTRTE:GLUCOSE"
glm(as.formula(fullmodel),data=fr.missf_p1,
      family = binomial(link = "logit")) 
## 
## Call:  glm(formula = as.formula(fullmodel), family = binomial(link = "logit"), 
##     data = fr.missf_p1)
## 
## Coefficients:
##                (Intercept)                        SEX2  
##                 -5.2535723                  -1.1953373  
##                  CURSMOKE1                   DIABETES1  
##                 -1.2365073                   4.0207549  
##                    BPMEDS1                    PREVCHD1  
##                  5.2898809                   1.1035782  
##                    PREVAP1                     PREVMI1  
##                 -0.3720497                   0.0550895  
##                   PREVHYP1                     TOTCHOL  
##                  1.5148234                   0.0066005  
##                        AGE                       SYSBP  
##                  0.1010951                   0.0013414  
##                      DIABP                     CIGPDAY  
##                  0.0127927                   0.0026360  
##             rcs(BMI, 4)BMI             rcs(BMI, 4)BMI'  
##                 -0.0862948                   0.2278817  
##           rcs(BMI, 4)BMI''    rcs(HEARTRTE, 4)HEARTRTE  
##                 -0.5949000                   0.0094925  
##  rcs(HEARTRTE, 4)HEARTRTE'  rcs(HEARTRTE, 4)HEARTRTE''  
##                 -0.2123066                   0.5429251  
##     rcs(GLUCOSE, 4)GLUCOSE     rcs(GLUCOSE, 4)GLUCOSE'  
##                 -0.0611513                   0.1259855  
##   rcs(GLUCOSE, 4)GLUCOSE''               SEX2:PREVCHD1  
##                 -0.3977397                  -0.4496324  
##               SEX2:PREVAP1                SEX2:PREVMI1  
##                  0.9761805                   1.3096815  
##              SEX2:PREVHYP1                  SEX2:DIABP  
##                  0.0700994                   0.0122176  
##         CURSMOKE0:HEARTRTE          CURSMOKE1:HEARTRTE  
##                 -0.0207419                          NA  
##          DIABETES1:TOTCHOL             DIABETES1:SYSBP  
##                 -0.0066810                  -0.0112544  
##           BPMEDS1:PREVCHD1             BPMEDS1:TOTCHOL  
##                 -0.8348873                  -0.0085763  
##                BPMEDS1:AGE            PREVCHD0:GLUCOSE  
##                 -0.0439211                   0.0103560  
##           PREVCHD1:GLUCOSE                PREVHYP0:BMI  
##                         NA                   0.0374607  
##               PREVHYP1:BMI                 TOTCHOL:AGE  
##                         NA                  -0.0001260  
##           HEARTRTE:GLUCOSE  
##                  0.0003235  
## 
## Degrees of Freedom: 4433 Total (i.e. Null);  4396 Residual
## Null Deviance:       2756 
## Residual Deviance: 2395  AIC: 2471
fullmodel2<-
  STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+
  rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+
  SEX:PREVAP+SEX:PREVMI+SEX:PREVHYP+SEX:DIABP+CURSMOKE:HEARTRTE+DIABETES:TOTCHOL+DIABETES:SYSBP+BPMEDS:TOTCHOL+
  BPMEDS:AGE+PREVHYP:BMI+TOTCHOL:AGE+HEARTRTE:GLUCOSE

fullmodel3<-STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+
  rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+SEX:PREVHYP+SEX:PREVMI

VIFの評価

#dfが大きいのでGVIFを算出するcar::vifを用いる
glm(as.formula(fullmodel3),data=fr.missf_p1,
      family = binomial(link = "logit")) %>% 
  car::vif(type="predictor")
## Warning in vif.lm(., type = "predictor"): type = 'predictor' is available only for unweighted linear models;
##   type = 'terms' will be used
##                      GVIF Df GVIF^(1/(2*Df))
## SEX              2.570312  1        1.603219
## CURSMOKE         2.463038  1        1.569407
## DIABETES         1.751968  1        1.323619
## BPMEDS           1.134620  1        1.065186
## PREVAP           1.204167  1        1.097345
## PREVMI           1.687747  1        1.299133
## PREVHYP          2.854402  1        1.689498
## TOTCHOL          1.089812  1        1.043941
## AGE              1.329260  1        1.152935
## SYSBP            3.727491  1        1.930671
## DIABP            2.981403  1        1.726674
## CIGPDAY          2.594084  1        1.610616
## rcs(BMI, 4)      1.323785  3        1.047859
## rcs(HEARTRTE, 4) 1.150514  3        1.023643
## rcs(GLUCOSE, 4)  1.866674  3        1.109630
## SEX:PREVHYP      3.811174  1        1.952223
## SEX:PREVMI       1.557143  1        1.247855
  • fullmodel2では交互作用項が多すぎるためか計算不能になるので、臨床的な(感覚的に)視点から交互作用項を減らしたmodel3を考える
  • fullmodel3ではGVIFの問題は気にならないためこのまま進める

Backwards stepwiseによる変数選択

glm(as.formula(fullmodel3),data=fr.missf_p1,
      family = binomial(link = "logit")) %>% 
  step(direction="both") %>% 
  summary()
## Start:  AIC=2472.26
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI + 
##     PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + rcs(BMI, 
##     4) + rcs(HEARTRTE, 4) + rcs(GLUCOSE, 4) + SEX:PREVHYP + SEX:PREVMI
## 
##                    Df Deviance    AIC
## - rcs(BMI, 4)       3   2426.7 2468.7
## - PREVAP            1   2424.3 2470.3
## - SYSBP             1   2424.3 2470.3
## - CIGPDAY           1   2424.5 2470.5
## - rcs(GLUCOSE, 4)   3   2428.9 2470.9
## - SEX:PREVHYP       1   2425.2 2471.2
## - TOTCHOL           1   2425.4 2471.4
## <none>                  2424.3 2472.3
## - SEX:PREVMI        1   2426.5 2472.5
## - CURSMOKE          1   2428.0 2474.0
## - DIABETES          1   2429.0 2475.0
## - BPMEDS            1   2430.7 2476.7
## - rcs(HEARTRTE, 4)  3   2436.6 2478.6
## - DIABP             1   2433.4 2479.4
## - AGE               1   2515.0 2561.0
## 
## Step:  AIC=2468.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI + 
##     PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + rcs(HEARTRTE, 
##     4) + rcs(GLUCOSE, 4) + SEX:PREVHYP + SEX:PREVMI
## 
##                    Df Deviance    AIC
## - SYSBP             1   2426.7 2466.7
## - PREVAP            1   2426.7 2466.7
## - CIGPDAY           1   2427.0 2467.0
## - rcs(GLUCOSE, 4)   3   2431.6 2467.6
## - SEX:PREVHYP       1   2427.9 2467.9
## - TOTCHOL           1   2428.1 2468.1
## <none>                  2426.7 2468.7
## - SEX:PREVMI        1   2429.0 2469.0
## - CURSMOKE          1   2430.2 2470.2
## - DIABETES          1   2432.0 2472.0
## + rcs(BMI, 4)       3   2424.3 2472.3
## - BPMEDS            1   2433.3 2473.3
## - rcs(HEARTRTE, 4)  3   2438.9 2474.9
## - DIABP             1   2437.6 2477.6
## - AGE               1   2517.8 2557.8
## 
## Step:  AIC=2466.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI + 
##     PREVHYP + TOTCHOL + AGE + DIABP + CIGPDAY + rcs(HEARTRTE, 
##     4) + rcs(GLUCOSE, 4) + SEX:PREVHYP + SEX:PREVMI
## 
##                    Df Deviance    AIC
## - PREVAP            1   2426.7 2464.7
## - CIGPDAY           1   2427.0 2465.0
## - rcs(GLUCOSE, 4)   3   2431.6 2465.6
## - SEX:PREVHYP       1   2427.9 2465.9
## - TOTCHOL           1   2428.1 2466.1
## <none>                  2426.7 2466.7
## - SEX:PREVMI        1   2429.0 2467.0
## - CURSMOKE          1   2430.2 2468.2
## + SYSBP             1   2426.7 2468.7
## - DIABETES          1   2432.0 2470.0
## + rcs(BMI, 4)       3   2424.3 2470.3
## - BPMEDS            1   2433.3 2471.3
## - rcs(HEARTRTE, 4)  3   2438.9 2472.9
## - DIABP             1   2445.8 2483.8
## - AGE               1   2527.5 2565.5
## 
## Step:  AIC=2464.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + 
##     TOTCHOL + AGE + DIABP + CIGPDAY + rcs(HEARTRTE, 4) + rcs(GLUCOSE, 
##     4) + SEX:PREVHYP + SEX:PREVMI
## 
##                    Df Deviance    AIC
## - CIGPDAY           1   2427.0 2463.0
## - rcs(GLUCOSE, 4)   3   2431.6 2463.6
## - SEX:PREVHYP       1   2427.9 2463.9
## - TOTCHOL           1   2428.1 2464.1
## <none>                  2426.7 2464.7
## - SEX:PREVMI        1   2429.0 2465.0
## - CURSMOKE          1   2430.2 2466.2
## + PREVAP            1   2426.7 2466.7
## + SYSBP             1   2426.7 2466.7
## - DIABETES          1   2432.0 2468.0
## + rcs(BMI, 4)       3   2424.3 2468.3
## - BPMEDS            1   2433.4 2469.4
## - rcs(HEARTRTE, 4)  3   2438.9 2470.9
## - DIABP             1   2445.8 2481.8
## - AGE               1   2529.1 2565.1
## 
## Step:  AIC=2462.99
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + 
##     TOTCHOL + AGE + DIABP + rcs(HEARTRTE, 4) + rcs(GLUCOSE, 4) + 
##     SEX:PREVHYP + SEX:PREVMI
## 
##                    Df Deviance    AIC
## - rcs(GLUCOSE, 4)   3   2431.9 2461.9
## - SEX:PREVHYP       1   2428.2 2462.2
## - TOTCHOL           1   2428.3 2462.3
## <none>                  2427.0 2463.0
## - SEX:PREVMI        1   2429.3 2463.3
## + CIGPDAY           1   2426.7 2464.7
## + PREVAP            1   2427.0 2465.0
## + SYSBP             1   2427.0 2465.0
## - DIABETES          1   2432.2 2466.2
## + rcs(BMI, 4)       3   2424.5 2466.5
## - BPMEDS            1   2433.6 2467.6
## - rcs(HEARTRTE, 4)  3   2439.0 2469.0
## - CURSMOKE          1   2437.6 2471.6
## - DIABP             1   2446.1 2480.1
## - AGE               1   2529.2 2563.2
## 
## Step:  AIC=2461.88
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + 
##     TOTCHOL + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVHYP + 
##     SEX:PREVMI
## 
##                    Df Deviance    AIC
## - TOTCHOL           1   2433.4 2461.4
## - SEX:PREVHYP       1   2433.4 2461.4
## <none>                  2431.9 2461.9
## - SEX:PREVMI        1   2433.9 2461.9
## + rcs(GLUCOSE, 4)   3   2427.0 2463.0
## + CIGPDAY           1   2431.6 2463.6
## + PREVAP            1   2431.9 2463.9
## + SYSBP             1   2431.9 2463.9
## + rcs(BMI, 4)       3   2429.1 2465.1
## - BPMEDS            1   2438.7 2466.7
## - DIABETES          1   2438.9 2466.9
## - rcs(HEARTRTE, 4)  3   2443.6 2467.6
## - CURSMOKE          1   2442.6 2470.6
## - DIABP             1   2450.7 2478.7
## - AGE               1   2536.6 2564.6
## 
## Step:  AIC=2461.38
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + 
##     AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVHYP + SEX:PREVMI
## 
##                    Df Deviance    AIC
## - SEX:PREVHYP       1   2434.8 2460.8
## <none>                  2433.4 2461.4
## - SEX:PREVMI        1   2435.4 2461.4
## + TOTCHOL           1   2431.9 2461.9
## + rcs(GLUCOSE, 4)   3   2428.3 2462.3
## + CIGPDAY           1   2433.2 2463.2
## + PREVAP            1   2433.4 2463.4
## + SYSBP             1   2433.4 2463.4
## + rcs(BMI, 4)       3   2430.3 2464.3
## - BPMEDS            1   2439.9 2465.9
## - DIABETES          1   2440.3 2466.3
## - rcs(HEARTRTE, 4)  3   2445.5 2467.5
## - CURSMOKE          1   2444.2 2470.2
## - DIABP             1   2451.9 2477.9
## - AGE               1   2536.6 2562.6
## 
## Step:  AIC=2460.77
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + 
##     AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI
## 
##                    Df Deviance    AIC
## <none>                  2434.8 2460.8
## - SEX:PREVMI        1   2437.0 2461.0
## + SEX:PREVHYP       1   2433.4 2461.4
## + TOTCHOL           1   2433.4 2461.4
## + rcs(GLUCOSE, 4)   3   2429.4 2461.4
## + CIGPDAY           1   2434.5 2462.5
## + SYSBP             1   2434.8 2462.8
## + PREVAP            1   2434.8 2462.8
## + rcs(BMI, 4)       3   2431.4 2463.4
## - BPMEDS            1   2441.8 2465.8
## - DIABETES          1   2441.9 2465.9
## - rcs(HEARTRTE, 4)  3   2446.9 2466.9
## - CURSMOKE          1   2445.4 2469.4
## - PREVHYP           1   2451.8 2475.8
## - DIABP             1   2453.6 2477.6
## - AGE               1   2542.1 2566.1
## 
## Call:
## glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + 
##     PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"), 
##     data = fr.missf_p1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3035  -0.4660  -0.3275  -0.2321   2.9086  
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                -9.297986   1.317164  -7.059 1.68e-12 ***
## SEX2                       -0.118785   0.115042  -1.033  0.30182    
## CURSMOKE1                   0.376133   0.115624   3.253  0.00114 ** 
## DIABETES1                   0.672881   0.240467   2.798  0.00514 ** 
## BPMEDS1                     0.569736   0.209547   2.719  0.00655 ** 
## PREVMI1                    -0.052790   0.369570  -0.143  0.88641    
## PREVHYP1                    0.586447   0.141663   4.140 3.48e-05 ***
## AGE                         0.071503   0.007106  10.062  < 2e-16 ***
## DIABP                       0.022128   0.005091   4.347 1.38e-05 ***
## rcs(HEARTRTE, 4)HEARTRTE    0.018041   0.018255   0.988  0.32303    
## rcs(HEARTRTE, 4)HEARTRTE'  -0.184369   0.084305  -2.187  0.02875 *  
## rcs(HEARTRTE, 4)HEARTRTE''  0.488790   0.213925   2.285  0.02232 *  
## SEX2:PREVMI1                1.017895   0.670499   1.518  0.12899    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2756.0  on 4433  degrees of freedom
## Residual deviance: 2434.8  on 4421  degrees of freedom
## AIC: 2460.8
## 
## Number of Fisher Scoring iterations: 6
  • glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = “logit”), data = fr.missf_p1) がモデルとして選ばれた
stepwise_results<-
  glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + 
    PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"), 
    data = fr.missf_p1) %>% 
  summary() %>% 
  .$coefficients %>% 
  as.data.frame() %>% 
  rownames_to_column() %>% 
  rename("var"="rowname") %>% 
  mutate(OR=exp(Estimate),
         Lower=exp(Estimate-1.96*`Std. Error`),
         Upper=exp(Estimate+1.96*`Std. Error`)) %>% 
  select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>% 
  #すべての変数を丸める
  mutate_if(is.numeric,~round(.,3))

#結果を表示
stepwise_results
##                           var    OR Lower  Upper     p
## 1                 (Intercept) 0.000 0.000  0.001 0.000
## 2                        SEX2 0.888 0.709  1.113 0.302
## 3                   CURSMOKE1 1.457 1.161  1.827 0.001
## 4                   DIABETES1 1.960 1.223  3.140 0.005
## 5                     BPMEDS1 1.768 1.172  2.666 0.007
## 6                     PREVMI1 0.949 0.460  1.957 0.886
## 7                    PREVHYP1 1.798 1.362  2.373 0.000
## 8                         AGE 1.074 1.059  1.089 0.000
## 9                       DIABP 1.022 1.012  1.033 0.000
## 10   rcs(HEARTRTE, 4)HEARTRTE 1.018 0.982  1.055 0.323
## 11  rcs(HEARTRTE, 4)HEARTRTE' 0.832 0.705  0.981 0.029
## 12 rcs(HEARTRTE, 4)HEARTRTE'' 1.630 1.072  2.480 0.022
## 13               SEX2:PREVMI1 2.767 0.744 10.299 0.129
#plot
stepwise_results %>% 
  ggplot(aes(y=var,x=OR))+
  geom_point()+
  geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
  geom_vline(xintercept=1,color="red")

- 交互作用項のORが増大しており発散気味だが、このまま計算を進める

#複雑なロジスティク回帰モデル
step_final_model<-
  glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + 
    PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"), 
    data = fr.missf_p1)

予測性能の評価

ROC AUC

# risk prediction fitting
fr.missf_p1$fitted.step.fin<-
  predict(step_final_model,
          type="response") 

# ROC AUC
roc.step.final<-
  roc(STROKE~fitted.step.fin,
    data=fr.missf_p1,
    ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.step.final
## 
## Call:
## roc.formula(formula = STROKE ~ fitted.step.fin, data = fr.missf_p1,     ci = T)
## 
## Data: fitted.step.fin in 4019 controls (STROKE 0) < 415 cases (STROKE 1).
## Area under the curve: 0.7533
## 95% CI: 0.7292-0.7774 (DeLong)
roc.step.final %>% 
  ggroc(legacy.axes=T)+
  geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="grey", linetype="dashed")

  • 複雑なロジスティク回帰モデル(step_final_model)のROC_AUCは0.7532998と中等度の予測性能である

Calibration plot

#calibration plot
cal.step.final<-
  val.prob(
  p=fr.missf_p1$fitted.step.fin,
  y=fr.missf_p1$STROKEn,
  g=10,
  cex=0.5) 

  • calibration能は良好だが、liner modelやlassoのほうが当てはまりは視覚的に良さそうに見える

Bootstrapによるliner final modelの内的妥当性検証

optimismの算出

B<-100 #number of resampling 
N<-dim(fr.missf_p1)[1] #dataのdimentionを返す→4434行x39列→そのうち一番目(4434)を取得しNとする
AUC.i1<-AUC.i2<-numeric(B) 

for(i in 1:B){  

  #bootstrap random sampling 
  bs.i<-sample(1:N,N,replace=TRUE) #重複を許して1~4434までの数を4434回サンプリングする
  
  #bootstrap samplingに基づき、dataから集団を再構成 →data.iとする
  fr.missf_p1.i<-fr.missf_p1[bs.i,] 
  
  #bootstrapしたデータセットごとで回帰モデル構築→各bootstrap dataからリスク予測確率の算出→data2.1のprob_1列に格納
  step.fin.cv.res.i<-
    glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + 
    PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"), 
    data = fr.missf_p1.i)
  
  fr.missf_p1.i$step.fin.1<- #step.fin.1では、bootstrap dataから構築した新しいロジスティク回帰モデルを使い、そのbootstrap dataに対するリスク予測確率を格納
    predict(step.fin.cv.res.i,
            data=fr.missf_p1.i,
            type="response") 
 
  #AUCの算出;Bootstrap回数(B=100)分のAUCを算出して結果を格納;各bootstrap dataに対するAUC
  AUC.i1[i]<-roc(STROKE~step.fin.1,data=fr.missf_p1.i)$auc 
  
  
  #bootstrapで作成した各回帰モデルを使って、オリジナルデータ(fr.missf_p1)に対するリスク予測確率を算出→fr.missf_p1のlin.fin.2列に格納;AUC算出
  fr.missf_p1$step.fin.2<-
    predict(step.fin.cv.res.i,
            newdata=fr.missf_p1,
            type="response") 
  
  AUC.i2[i]<-roc(STROKE~step.fin.2,data=fr.missf_p1)$auc #ここで得られるAUCはbootstrap dataを用いてoriginal dataに外挿したときのAUC
  
  #print(paste(i,"th bootstrap iteration is completed.",sep="")) 
}  

opt3<-AUC.i1-AUC.i2 
#(bootstrap samplingした各data setを使い、それぞれ導いた回帰モデルにおけるAUC)ー(bootstrap samplingした各data setを使い、original dataに外挿したときのAUC)
summary(opt3);hist(opt3)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.035344 -0.001234  0.006441  0.006532  0.014623  0.034209

lam3<-mean(opt3) #estimate of the optimism 

step.cor.AUC<-roc.step.final$auc-lam3 #bias corrected AUC estimate

step.cor.AUC
## [1] 0.7467678
  • liner modelと同じく、複雑なロジスティク回帰モデルでもoptimismが大きく、非常にoverfittingが生じていることが示唆される

Cross validationによる内的妥当性の検証

#k-fold CV 
k <-5 
AUC_CV <-data.frame(matrix(ncol=2,nrow=k)) 

#Randomly shuffle the data 
fr.missf_p1.r<-fr.missf_p1[sample(nrow(fr.missf_p1)),] 
 
#Create k equally size folds 
folds <- cut(seq(1,nrow(fr.missf_p1.r)),breaks=k,labels=FALSE) 
 
#Perform k fold cross validation 
for(i in 1:k){  

  #Segment data2r by fold using the which() function  
testIndexes <- which(folds==i,arr.ind=TRUE) 
testData <- fr.missf_p1.r[testIndexes, ] 
trainData <- fr.missf_p1.r[-testIndexes, ] 
fit1<-
  glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + 
    PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"), 
    data = trainData)
testData$fitted<-predict(fit1,testData,type="response") 
ROC <- roc(testData$STROKE, testData$fitted) 
AUC_CV[i,2] <- ROC$auc 
AUC_CV[i,1] <- i 

#print(paste(i,"th cross-validation iteration is completed.", sep="")) 
}  
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
names(AUC_CV)[2] <- "AUC_cv" 
names(AUC_CV)[1] <- "k" 
print(summary(AUC_CV$AUC_cv)) 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.7174  0.7307  0.7408  0.7415  0.7496  0.7691
  • CVではAUCはオリジナルデータにおけるAUC: 0.7532998よりも多少小さく(0.7415385)なった

Bootstrapを別の関数を用いて再度検証

使うモデルはSimple model(lin_final_formula)

STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE

lin_final_formula<-as.formula(STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE)

lin_final_model.lrm<-
  lrm(lin_final_formula,
    x=T,
    y=T,
    data=fr.missf_p1)

boot<-validate(lin_final_model.lrm,
         bw=FALSE,
         B=100,
         method="boot")

boot
##           index.orig training    test optimism index.corrected   n
## Dxy           0.5007   0.5079  0.4974   0.0105          0.4902 100
## R2            0.1469   0.1522  0.1442   0.0081          0.1388 100
## Intercept     0.0000   0.0000 -0.0514   0.0514         -0.0514 100
## Slope         1.0000   1.0000  0.9702   0.0298          0.9702 100
## Emax          0.0000   0.0000  0.0164   0.0164          0.0164 100
## D             0.0702   0.0726  0.0688   0.0038          0.0664 100
## U            -0.0005  -0.0005  0.0002  -0.0006          0.0002 100
## Q             0.0706   0.0731  0.0687   0.0044          0.0662 100
## B             0.0780   0.0772  0.0783  -0.0011          0.0791 100
## g             1.0429   1.0581  1.0247   0.0334          1.0095 100
## gp            0.0837   0.0843  0.0827   0.0016          0.0820 100
corrected.AUC<-boot[1,5]*0.5 + 0.5 #corrected AUC=Somer's D/2 + 0.5
  • rmsによるBootstrapでは、corrected AUC=0.7450811であった。
  • こちらのコードのほうが非常にシンプルで、様々な結果が出てくる
  • bias corrected AUCはSomer’s D/2 + 0.5で算出する必要があるのが厄介ではあるが。

Complex modelでも同様に実施

formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI

step_final_model.lrm<-
  lrm(STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + 
    PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI,
    x=T,
    y=T,
    data=fr.missf_p1
    )

boot2<-validate(step_final_model.lrm,
         bw=FALSE,
         B=100,
         method="boot")
## 
## Divergence or singularity in 1 samples
boot2
##           index.orig training    test optimism index.corrected  n
## Dxy           0.5066   0.5138  0.4993   0.0145          0.4921 99
## R2            0.1510   0.1566  0.1449   0.0116          0.1393 99
## Intercept     0.0000   0.0000 -0.0761   0.0761         -0.0761 99
## Slope         1.0000   1.0000  0.9566   0.0434          0.9566 99
## Emax          0.0000   0.0000  0.0242   0.0242          0.0242 99
## D             0.0722   0.0747  0.0692   0.0055          0.0667 99
## U            -0.0005  -0.0005  0.0001  -0.0006          0.0001 99
## Q             0.0727   0.0752  0.0691   0.0061          0.0666 99
## B             0.0778   0.0769  0.0783  -0.0014          0.0792 99
## g             1.0623   1.0810  1.0313   0.0498          1.0125 99
## gp            0.0846   0.0852  0.0827   0.0025          0.0821 99
corrected.AUC2<-boot2[1,5]*0.5 + 0.5 #AUC=Somer's D/2 + 0.5


corrected.AUC2
## [1] 0.7460305