#library
pacman::p_load(
tidyverse,data.table,tableone,tibble,stringi,readxl,dplyr,scales,ggsci,
skimr,Epi,janitor,summarytools,broom,easystats,car,rms)
#conflict解消
select<-dplyr::select
library(pROC);library(glmnetUtils);library(glmnet);library(ISLR)
## Type 'citation("pROC")' for a citation.
##
## 次のパッケージを付け加えます: 'pROC'
## 以下のオブジェクトは 'package:parameters' からマスクされています:
##
## ci
## 以下のオブジェクトは 'package:bayestestR' からマスクされています:
##
## auc, ci
## 以下のオブジェクトは 'package:stats' からマスクされています:
##
## cov, smooth, var
## 要求されたパッケージ Matrix をロード中です
##
## 次のパッケージを付け加えます: 'Matrix'
## 以下のオブジェクトは 'package:tidyr' からマスクされています:
##
## expand, pack, unpack
## Loaded glmnet 4.1-8
##
## 次のパッケージを付け加えます: 'glmnet'
## 以下のオブジェクトは 'package:glmnetUtils' からマスクされています:
##
## cv.glmnet, glmnet
#factor var
fac_var<-c("RANDID","SEX", "CURSMOKE", "DIABETES", "BPMEDS","educ","PREVCHD", "PREVAP", "PREVMI", "PREVSTRK", "PREVHYP", "PERIOD", "DEATH",
"ANGINA", "HOSPMI", "MI_FCHD", "ANYCHD", "STROKE", "CVD", "HYPERTEN")
cont_var<-c("TOTCHOL", "AGE", "SYSBP", "DIABP", "CIGPDAY", "BMI", "HEARTRTE", "GLUCOSE")
full_var<-c(fac_var[-c(1,6,12:20)],cont_var)
var<-c("SEX", "CURSMOKE", "DIABETES", "BPMEDS", "PREVCHD", "PREVAP", "PREVMI", "PREVHYP", cont_var) #PREVSTRは除外している
inter_var<-c("SEX:PREVCHD", "SEX:PREVAP", "SEX:PREVMI", "SEX:PREVHYP", "SEX:DIABP", "CURSMOKE:HEARTRTE","DIABETES:TOTCHOL", "DIABETES:SYSBP",
"BPMEDS:PREVCHD", "BPMEDS:TOTCHOL", "BPMEDS:AGE", "PREVCHD:GLUCOSE", "PREVHYP:BMI", "TOTCHOL:AGE","HEARTRTE:GLUCOSE")
nonlin_term<-c("BMI","HEARTRTE","GLUCOSE")
#data import
fr<-read_csv("frmgham2.csv") %>%
mutate(across(fac_var,as.factor))
## Rows: 11627 Columns: 39
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (39): RANDID, SEX, TOTCHOL, AGE, SYSBP, DIABP, CURSMOKE, CIGPDAY, BMI, D...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(fac_var, as.factor)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(fac_var)
##
## # Now:
## data %>% select(all_of(fac_var))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
#欠測補完後データのimport
load("fr.missf.RData")
fr.missf<-fr.missf$ximp
#plot in baseline variable
fr_p1<-fr %>%
filter(PERIOD==1)
fr.missf_p1<-
fr.missf %>%
filter(PERIOD==1)
# full model formula
fullmodel<-
paste0("STROKE~",
str_c(var[!var %in% nonlin_term],collapse = "+"), #非線形以外のvarを+でつなぐ
"+",
str_c(paste0("rcs(",nonlin_term,",4)"),collapse = "+"), #非線形のvarを+でつなぐ
"+",
str_c(inter_var,collapse = "+")) #交互作用項を+でつなぐ
# Complete case
full_results<-
glm(as.formula(fullmodel),data=fr_p1,
family = binomial(link = "logit")) %>%
summary() %>%
.$coefficients %>%
as.data.frame() %>%
rownames_to_column() %>%
rename("var"="rowname") %>%
mutate(OR=exp(Estimate),
Lower=exp(Estimate-1.96*`Std. Error`),
Upper=exp(Estimate+1.96*`Std. Error`)) %>%
select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>%
#すべての変数を丸める
mutate_if(is.numeric,~round(.,3))
full_results
## var OR Lower Upper p
## 1 (Intercept) 0.031 0.000 21.641 0.298
## 2 SEX2 0.346 0.060 2.005 0.237
## 3 CURSMOKE1 0.226 0.050 1.013 0.052
## 4 DIABETES1 52.994 1.393 2015.673 0.032
## 5 BPMEDS1 127.580 2.055 7920.611 0.021
## 6 PREVCHD1 3.288 0.288 37.541 0.338
## 7 PREVAP1 0.637 0.163 2.487 0.516
## 8 PREVMI1 1.392 0.394 4.915 0.607
## 9 PREVHYP1 5.301 1.053 26.683 0.043
## 10 TOTCHOL 1.002 0.984 1.020 0.825
## 11 AGE 1.085 1.003 1.174 0.043
## 12 SYSBP 1.001 0.992 1.010 0.816
## 13 DIABP 1.012 0.993 1.031 0.204
## 14 CIGPDAY 1.003 0.988 1.018 0.677
## 15 rcs(BMI, 4)BMI 0.866 0.755 0.994 0.040
## 16 rcs(BMI, 4)BMI' 1.469 0.940 2.298 0.091
## 17 rcs(BMI, 4)BMI'' 0.361 0.089 1.463 0.154
## 18 rcs(HEARTRTE, 4)HEARTRTE 1.013 0.967 1.062 0.585
## 19 rcs(HEARTRTE, 4)HEARTRTE' 0.787 0.658 0.940 0.008
## 20 rcs(HEARTRTE, 4)HEARTRTE'' 1.856 1.179 2.923 0.008
## 21 rcs(GLUCOSE, 4)GLUCOSE 0.947 0.904 0.991 0.020
## 22 rcs(GLUCOSE, 4)GLUCOSE' 1.102 0.934 1.300 0.249
## 23 rcs(GLUCOSE, 4)GLUCOSE'' 0.768 0.492 1.198 0.244
## 24 SEX2:PREVCHD1 0.720 0.020 26.534 0.858
## 25 SEX2:PREVAP1 2.493 0.082 75.953 0.600
## 26 SEX2:PREVMI1 2.786 0.106 73.500 0.539
## 27 SEX2:PREVHYP1 1.048 0.589 1.863 0.874
## 28 SEX2:DIABP 1.011 0.989 1.033 0.329
## 29 CURSMOKE0:HEARTRTE 0.976 0.957 0.996 0.017
## 30 DIABETES1:TOTCHOL 0.992 0.981 1.003 0.144
## 31 DIABETES1:SYSBP 0.992 0.975 1.011 0.413
## 32 BPMEDS1:PREVCHD1 0.463 0.105 2.038 0.309
## 33 BPMEDS1:TOTCHOL 0.993 0.983 1.003 0.153
## 34 BPMEDS1:AGE 0.957 0.904 1.013 0.133
## 35 PREVCHD0:GLUCOSE 1.013 0.991 1.036 0.249
## 36 PREVHYP0:BMI 1.044 0.985 1.107 0.144
## 37 TOTCHOL:AGE 1.000 1.000 1.000 0.816
## 38 HEARTRTE:GLUCOSE 1.000 1.000 1.001 0.069
#imputation data
full_results_mi<-
glm(as.formula(fullmodel),data=fr.missf_p1,
family = binomial(link = "logit")) %>%
summary() %>%
.$coefficients %>%
as.data.frame() %>%
rownames_to_column() %>%
rename("var"="rowname") %>%
mutate(OR=exp(Estimate),
Lower=exp(Estimate-1.96*`Std. Error`),
Upper=exp(Estimate+1.96*`Std. Error`)) %>%
select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>%
#すべての変数を丸める
mutate_if(is.numeric,~round(.,3))
full_results_mi
## var OR Lower Upper p
## 1 (Intercept) 0.005 0.000 2.849 0.102
## 2 SEX2 0.303 0.058 1.583 0.157
## 3 CURSMOKE1 0.290 0.071 1.188 0.085
## 4 DIABETES1 55.743 1.632 1903.573 0.026
## 5 BPMEDS1 198.320 4.319 9106.958 0.007
## 6 PREVCHD1 3.015 0.328 27.704 0.329
## 7 PREVAP1 0.689 0.186 2.558 0.578
## 8 PREVMI1 1.057 0.331 3.368 0.926
## 9 PREVHYP1 4.549 0.974 21.246 0.054
## 10 TOTCHOL 1.007 0.990 1.024 0.447
## 11 AGE 1.106 1.027 1.192 0.008
## 12 SYSBP 1.001 0.993 1.010 0.748
## 13 DIABP 1.013 0.995 1.031 0.159
## 14 CIGPDAY 1.003 0.988 1.017 0.718
## 15 rcs(BMI, 4)BMI 0.917 0.803 1.048 0.204
## 16 rcs(BMI, 4)BMI' 1.256 0.817 1.930 0.299
## 17 rcs(BMI, 4)BMI'' 0.552 0.144 2.112 0.385
## 18 rcs(HEARTRTE, 4)HEARTRTE 1.010 0.965 1.056 0.681
## 19 rcs(HEARTRTE, 4)HEARTRTE' 0.809 0.683 0.958 0.014
## 20 rcs(HEARTRTE, 4)HEARTRTE'' 1.721 1.120 2.645 0.013
## 21 rcs(GLUCOSE, 4)GLUCOSE 0.941 0.901 0.982 0.006
## 22 rcs(GLUCOSE, 4)GLUCOSE' 1.134 0.990 1.300 0.070
## 23 rcs(GLUCOSE, 4)GLUCOSE'' 0.672 0.439 1.028 0.067
## 24 SEX2:PREVCHD1 0.638 0.021 19.464 0.797
## 25 SEX2:PREVAP1 2.654 0.100 70.232 0.559
## 26 SEX2:PREVMI1 3.705 0.165 83.060 0.409
## 27 SEX2:PREVHYP1 1.073 0.623 1.845 0.800
## 28 SEX2:DIABP 1.012 0.992 1.033 0.240
## 29 CURSMOKE0:HEARTRTE 0.979 0.962 0.998 0.026
## 30 DIABETES1:TOTCHOL 0.993 0.983 1.004 0.217
## 31 DIABETES1:SYSBP 0.989 0.972 1.006 0.192
## 32 BPMEDS1:PREVCHD1 0.434 0.108 1.743 0.239
## 33 BPMEDS1:TOTCHOL 0.991 0.982 1.001 0.070
## 34 BPMEDS1:AGE 0.957 0.907 1.009 0.107
## 35 PREVCHD0:GLUCOSE 1.010 0.991 1.030 0.299
## 36 PREVHYP0:BMI 1.038 0.982 1.098 0.187
## 37 TOTCHOL:AGE 1.000 1.000 1.000 0.428
## 38 HEARTRTE:GLUCOSE 1.000 1.000 1.001 0.045
full_results_mi %>%
ggplot(aes(y=var,x=OR))+
geom_point()+
geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
geom_vline(xintercept=1,color="red")
ORが異常に幅広くなっている因子があるため、sparse data biasがかかっている可能性が高い →penalized regressionを行うか、モデル式の変更をすべき状態
Backwards stepwiseとElastic Netによるパラメータ推定を行う
下準備
# install.packages("glmnet")
# install.packages("ISLR")
# install.packages("glmnetUtils")
# library(glmnetUtils);library(glmnet);library(ISLR)
#to detect optimal lambda
ridge.cv.res<-
cv.glmnet(
x=fr.missf_p1[,var] %>% data.matrix(),
y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
family="binomial",
alpha=0) #ridge
plot(ridge.cv.res)
ridge.cv.res$lambda.min #min lambda
## [1] 0.009523244
# model coefficient with min lambda
glmnet(
x=fr.missf_p1[,var] %>% data.matrix(),
y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
family="binomial",
lambda=ridge.cv.res$lambda.min,
alpha=0) %>%
.$beta
## 16 x 1 sparse Matrix of class "dgCMatrix"
## s0
## SEX -0.0813962421
## CURSMOKE 0.2468760397
## DIABETES 0.7155327927
## BPMEDS 0.5592037443
## PREVCHD 0.3002169372
## PREVAP -0.2447784453
## PREVMI 0.0955726098
## PREVHYP 0.5155415106
## TOTCHOL -0.0011747152
## AGE 0.0604815428
## SYSBP 0.0041215475
## DIABP 0.0149010624
## CIGPDAY 0.0040394071
## BMI 0.0159035694
## HEARTRTE -0.0102841712
## GLUCOSE -0.0009616904
#to detect optimal lambda
lasso.cv.res<-
cv.glmnet(
x=fr.missf_p1[,var] %>% data.matrix(),
y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
family="binomial",
alpha=1) #lasso
#log lambdaとdevianceの関連性plot
plot(lasso.cv.res)
#devianceが最小のときのlambda
lasso.cv.res$lambda.min #min lambda
## [1] 0.0021
#min lambdaのもとでのlasso regression coefficient;二通りのcodeがあるが、結果は同じ
coef(lasso.cv.res,s="lambda.min") #interceptの値も表示される
## 17 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -9.3732919829
## SEX -0.0405473179
## CURSMOKE 0.2688147316
## DIABETES 0.6136939222
## BPMEDS 0.5142181504
## PREVCHD 0.0298690706
## PREVAP .
## PREVMI 0.1240355798
## PREVHYP 0.5511057332
## TOTCHOL -0.0008671472
## AGE 0.0671689119
## SYSBP 0.0012846234
## DIABP 0.0178463734
## CIGPDAY 0.0025925936
## BMI 0.0112364738
## HEARTRTE -0.0092393943
## GLUCOSE .
glmnet(
x=fr.missf_p1[,var] %>% data.matrix(),
y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
family="binomial",
lambda=lasso.cv.res$lambda.min,
alpha=1) %>%
.$beta
## 16 x 1 sparse Matrix of class "dgCMatrix"
## s0
## SEX -0.040476551
## CURSMOKE 0.268983024
## DIABETES 0.613718984
## BPMEDS 0.514118152
## PREVCHD 0.029927580
## PREVAP .
## PREVMI 0.123893960
## PREVHYP 0.551330129
## TOTCHOL -0.000867187
## AGE 0.067179689
## SYSBP 0.001277099
## DIABP 0.017852439
## CIGPDAY 0.002589202
## BMI 0.011237120
## HEARTRTE -0.009239599
## GLUCOSE .
Elastic Netでは、Ridge,Lassoの罰則項を割合alphaで混ぜ合わせる事ができる 当てはまりが最も良い時の最適なalpha求める
#cv.glmnetではなく、cv"a".glmnetを使う; to examine optimal alpha and lambda
elastic.cv.res<-
cva.glmnet(
x=fr.missf_p1[,var] %>% data.matrix(),
y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
family="binomial"
)
plot(elastic.cv.res)
#alphaをデフォルトでは0-1の間で11個に振り分け、それぞれでのdevianceとlambdaの関係性を求める
#各alphaのうち、最もDevianceが小さいときのalphaを探し、その中で最も小さいlambdaを探す
##alpha=1はlassoと同じ;alpha=0はRidgeと同じということになる
sapply(elastic.cv.res$modlist,"[[","cvm") %>% #各alphaにおけるcvmを取り出す
sapply(min) %>% #各alphaの中で最小の値を取り出す
min() #すべてのalphaの中で最小のcvmを取り出す
## [1] 0.5569369
#結果を踏まえると、alpha=1のとき最小の値を示しており、すなわちlassoが最適と考えられる
#alpha=1のときの最小のlamdaを求める
cv.glmnet(
x=fr.missf_p1[,var] %>% data.matrix(),
y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
family="binomial",
alpha=1
) %>%
.$lambda.min
## [1] 0.001318861
#これは一個前のchankのlamda minと同じ値になる
最適な(α,λ)は(1,0.001913442)であり、lasso regressionが良いということになる
lasso.cv.res<-
cv.glmnet(
x=fr.missf_p1[,var] %>% data.matrix(),
y=fr.missf_p1[,"STROKE"] %>% data.matrix(),
family="binomial",
alpha=1)
coef(lasso.cv.res,s="lambda.min")
## 17 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -9.472068721
## SEX -0.045032608
## CURSMOKE 0.281816221
## DIABETES 0.633658495
## BPMEDS 0.525348184
## PREVCHD 0.033579323
## PREVAP .
## PREVMI 0.139586738
## PREVHYP 0.557336313
## TOTCHOL -0.001026037
## AGE 0.068159990
## SYSBP 0.001215003
## DIABP 0.018234443
## CIGPDAY 0.002976881
## BMI 0.012476398
## HEARTRTE -0.009930248
## GLUCOSE .
cf<-coef(lasso.cv.res,s="lambda.min") %>%
as.matrix() %>%
as.data.frame() %>%
mutate(exp=exp(s1)) %>%
add_rownames(var="var")
## Warning: `add_rownames()` was deprecated in dplyr 1.0.0.
## ℹ Please use `tibble::rownames_to_column()` instead.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#plot
cf %>%
ggplot(aes(y=var,x=exp))+
geom_point()+
geom_vline(xintercept=1,color="red")
fr.missf_p1$lasso<-
predict(lasso.cv.res,
s="lambda.min",
newx=fr.missf_p1[,var] %>% data.matrix(),
type="response"
)
fr.missf_p1<-
fr.missf_p1 %>%
mutate(STROKEn=as.numeric(STROKE)-1)
#ROC curve
roc.lasso<-
roc(STROKEn~lasso,
data=fr.missf_p1,
ci=T)
## Setting levels: control = 0, case = 1
## Warning in roc.default(response, predictors[, 1], ...): Deprecated use a matrix
## as predictor. Unexpected results may be produced, please pass a numeric vector.
## Setting direction: controls < cases
roc.lasso
##
## Call:
## roc.formula(formula = STROKEn ~ lasso, data = fr.missf_p1, ci = T)
##
## Data: lasso in 4019 controls (STROKEn 0) < 415 cases (STROKEn 1).
## Area under the curve: 0.7515
## 95% CI: 0.7277-0.7753 (DeLong)
roc.lasso %>%
ggroc(legacy.axes=T)+
geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="grey", linetype="dashed")
- AUC=0.7515104と中等度の予測性能
val.prob(
p=fr.missf_p1$lasso,
y=fr.missf_p1$STROKEn,
g=10,
cex=0.5
)
## Dxy C (ROC) R2 D D:Chi-sq
## 5.030551e-01 7.515275e-01 1.484266e-01 7.095404e-02 3.156102e+02
## D:p U U:Chi-sq U:p Q
## NA -3.219051e-04 5.726729e-01 7.510099e-01 7.127594e-02
## Brier Intercept Slope Emax E90
## 7.790970e-02 9.269480e-02 1.046995e+00 1.364165e-02 7.993511e-03
## Eavg S:z S:p
## 5.026621e-03 -8.989852e-02 9.283679e-01
B<-100 #number of resampling
N<-dim(fr.missf_p1)[1] #dataのdimentionを返す→4434行x39列→そのうち一番目(4434)を取得しNとする
AUC.i1<-AUC.i2<-numeric(B)
for(i in 1:B){
#bootstrap random sampling
bs.i<-sample(1:N,N,replace=TRUE) #重複を許して1~4434までの数を4434回サンプリングする
#bootstrap samplingに基づき、dataから集団を再構成 →data.iとする
fr.missf_p1.i<-fr.missf_p1[bs.i,]
#bootstrapしたデータセットごとで回帰モデル構築→各bootstrap dataからリスク予測確率の算出→data2.1のprob_1列に格納
lasso.cv.res.i<-
cv.glmnet(
x=fr.missf_p1.i[,var] %>% data.matrix(),
y=fr.missf_p1.i[,"STROKE"] %>% data.matrix(),
family="binomial",
alpha=1)
fr.missf_p1.i$lasso1<- #lasso1では、bootstrap dataから構築した新しいLasso回帰モデルを使い、そのbootstrap dataに対するリスク予測確率を格納
predict(lasso.cv.res.i,
s="lambda.min",
newx=fr.missf_p1.i[,var] %>% data.matrix(),
type="response"
)
#AUCの算出;Bootstrap回数(B=100)分のAUCを算出して結果を格納;各bootstrap dataに対するAUC
AUC.i1[i]<-roc(STROKEn~lasso1,data=fr.missf_p1.i)$auc
#bootstrapで作成した各回帰モデルを使って、オリジナルデータ(fr.missf_p1)に対するリスク予測確率を算出→fr.missf_p1のlasso2列に格納;AUC算出
fr.missf_p1$lasso2<-
predict(lasso.cv.res.i,
s="lambda.min",
newx=fr.missf_p1[,var] %>% data.matrix(),
type="response"
)
AUC.i2[i]<-roc(STROKEn~lasso2,data=fr.missf_p1)$auc #ここで得られるAUCはbootstrap dataを用いてoriginal dataに外挿したときのAUC
#print(paste(i,"th bootstrap iteration is completed.",sep=""))
}
opt1<-AUC.i1-AUC.i2
#(bootstrap samplingした各data setを使い、それぞれ導いた回帰モデルにおけるAUC)ー(bootstrap samplingした各data setを使い、original dataに外挿したときのAUC)
summary(opt1);hist(opt1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.016380 -0.001658 0.005819 0.007805 0.017487 0.034368
lam1<-mean(opt1) #estimate of the optimism
cor.AUC<-roc.lasso$auc-lam1 #bias corrected AUC estimate
cor.AUC
## [1] 0.7437054
**疑問 - 今回のモデルでは、事前に考えていた非線形性や交互作用項がモデリングに組み込まれなかったが、実際にはそれらの柔軟なモデリングをLassoに適用することはできるのか?交互作用項に関しては、ペアの変数の掛け算項を列結合すれば良さそうだが、非線形性はどのように表現すればよいのかわからない。
#k-fold CV
k <-5
AUC_CV <-data.frame(matrix(ncol=2,nrow=k))
#Randomly shuffle the data
fr.missf_p1.r<-fr.missf_p1[sample(nrow(fr.missf_p1)),]
#Create k equally size folds
folds <- cut(seq(1,nrow(fr.missf_p1.r)),breaks=k,labels=FALSE)
#Perform k fold cross validation
for(i in 1:k){
#Segment data2r by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- fr.missf_p1.r[testIndexes, ]
trainData <- fr.missf_p1.r[-testIndexes, ]
fit1<-
cv.glmnet(
x=trainData[,var] %>% data.matrix(),
y=trainData[,"STROKE"] %>% data.matrix(),
family="binomial",
alpha=1)
testData$fitted<-
predict(lasso.cv.res,
s="lambda.min",
newx=testData[,var] %>% data.matrix(),
type="response"
)
ROC <- roc(testData$STROKE, testData$fitted)
AUC_CV[i,2] <- ROC$auc
AUC_CV[i,1] <- i
#print(paste(i,"th cross-validation iteration is completed.", sep=""))
}
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Warning in roc.default(testData$STROKE, testData$fitted): Deprecated use a
## matrix as predictor. Unexpected results may be produced, please pass a numeric
## vector.
## Setting direction: controls < cases
names(AUC_CV)[2] <- "AUC_cv"
names(AUC_CV)[1] <- "k"
print(summary(AUC_CV$AUC_cv))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7260 0.7425 0.7484 0.7502 0.7525 0.7817
ここでは、Lasso modelとの比較可能性を高めるために、2つの回帰モデルを考える 1. 交互作用項、非線形項を含まない線形の全変数モデル 2. 交互作用項、非線形項を含む全変数モデル(full model)
用いる変数はである
lin_form<-paste("STROKE~",paste(full_var,collapse="+"))
lin_model<-glm(as.formula(lin_form),
data=fr.missf_p1,
family=binomial(link="logit"))
rms::vif(lin_model)
## SEX2 CURSMOKE1 DIABETES1 BPMEDS1 PREVCHD1 PREVAP1 PREVMI1 PREVSTRK1
## 1.245566 2.439088 1.694071 1.107002 9.598226 6.717491 2.379642 1.000000
## PREVHYP1 TOTCHOL AGE SYSBP DIABP CIGPDAY BMI HEARTRTE
## 1.937207 1.067710 1.317387 3.700669 3.017425 2.568418 1.177581 1.100995
## GLUCOSE
## 1.723184
glm(STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP +
PREVMI + PREVSTRK + PREVHYP + TOTCHOL + AGE + SYSBP + DIABP +
CIGPDAY + BMI + HEARTRTE + GLUCOSE,
data=fr.missf_p1,
family=binomial(link="logit")) %>%
rms::vif()
## SEX2 CURSMOKE1 DIABETES1 BPMEDS1 PREVAP1 PREVMI1 PREVSTRK1 PREVHYP1
## 1.244364 2.439228 1.693826 1.106665 1.187531 1.164856 1.000000 1.935830
## TOTCHOL AGE SYSBP DIABP CIGPDAY BMI HEARTRTE GLUCOSE
## 1.067423 1.316767 3.695881 3.010815 2.568423 1.177247 1.100259 1.722415
lin_form2<-
as.formula(STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP +
PREVMI + PREVSTRK + PREVHYP + TOTCHOL + AGE + SYSBP + DIABP +
CIGPDAY + BMI + HEARTRTE + GLUCOSE)
lin_model2<-glm(lin_form2,
data=fr.missf_p1,
family=binomial(link="logit"))
step(lin_model2,
direction="both") %>%
summary()
## Start: AIC=2356.87
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI +
## PREVSTRK + PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY +
## BMI + HEARTRTE + GLUCOSE
##
## Df Deviance AIC
## - PREVAP 1 2322.9 2354.9
## - PREVMI 1 2322.9 2354.9
## - SYSBP 1 2322.9 2354.9
## - SEX 1 2323.1 2355.1
## - CIGPDAY 1 2323.4 2355.4
## - GLUCOSE 1 2323.5 2355.5
## - TOTCHOL 1 2323.6 2355.6
## - BMI 1 2324.6 2356.6
## <none> 2322.9 2356.9
## - BPMEDS 1 2326.4 2358.4
## - CURSMOKE 1 2326.5 2358.5
## - HEARTRTE 1 2327.9 2359.9
## - DIABETES 1 2329.9 2361.9
## - DIABP 1 2331.5 2363.5
## - PREVHYP 1 2333.0 2365.0
## - AGE 1 2412.6 2444.6
## - PREVSTRK 1 2438.7 2470.7
##
## Step: AIC=2354.87
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVSTRK +
## PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI +
## HEARTRTE + GLUCOSE
##
## Df Deviance AIC
## - PREVMI 1 2322.9 2352.9
## - SYSBP 1 2322.9 2352.9
## - SEX 1 2323.1 2353.1
## - CIGPDAY 1 2323.4 2353.4
## - GLUCOSE 1 2323.5 2353.5
## - TOTCHOL 1 2323.6 2353.6
## - BMI 1 2324.6 2354.6
## <none> 2322.9 2354.9
## - BPMEDS 1 2326.4 2356.4
## - CURSMOKE 1 2326.5 2356.5
## + PREVAP 1 2322.9 2356.9
## - HEARTRTE 1 2328.0 2358.0
## - DIABETES 1 2329.9 2359.9
## - DIABP 1 2331.5 2361.5
## - PREVHYP 1 2333.0 2363.0
## - AGE 1 2413.8 2443.8
## - PREVSTRK 1 2438.8 2468.8
##
## Step: AIC=2352.88
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP +
## TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI + HEARTRTE +
## GLUCOSE
##
## Df Deviance AIC
## - SYSBP 1 2322.9 2350.9
## - SEX 1 2323.1 2351.1
## - CIGPDAY 1 2323.4 2351.4
## - GLUCOSE 1 2323.5 2351.5
## - TOTCHOL 1 2323.6 2351.6
## - BMI 1 2324.6 2352.6
## <none> 2322.9 2352.9
## - BPMEDS 1 2326.4 2354.4
## - CURSMOKE 1 2326.5 2354.5
## + PREVMI 1 2322.9 2354.9
## + PREVAP 1 2322.9 2354.9
## - HEARTRTE 1 2328.0 2356.0
## - DIABETES 1 2329.9 2357.9
## - DIABP 1 2331.5 2359.5
## - PREVHYP 1 2333.0 2361.0
## - AGE 1 2414.8 2442.8
## - PREVSTRK 1 2439.4 2467.4
##
## Step: AIC=2350.94
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP +
## TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
##
## Df Deviance AIC
## - SEX 1 2323.2 2349.2
## - CIGPDAY 1 2323.5 2349.5
## - GLUCOSE 1 2323.5 2349.5
## - TOTCHOL 1 2323.7 2349.7
## - BMI 1 2324.7 2350.7
## <none> 2322.9 2350.9
## - CURSMOKE 1 2326.6 2352.6
## - BPMEDS 1 2326.6 2352.6
## + SYSBP 1 2322.9 2352.9
## + PREVMI 1 2322.9 2352.9
## + PREVAP 1 2322.9 2352.9
## - HEARTRTE 1 2328.0 2354.0
## - DIABETES 1 2330.0 2356.0
## - PREVHYP 1 2334.9 2360.9
## - DIABP 1 2339.6 2365.6
## - AGE 1 2427.5 2453.5
## - PREVSTRK 1 2439.5 2465.5
##
## Step: AIC=2349.15
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP +
## TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
##
## Df Deviance AIC
## - GLUCOSE 1 2323.7 2347.7
## - CIGPDAY 1 2323.9 2347.9
## - TOTCHOL 1 2324.1 2348.1
## - BMI 1 2324.9 2348.9
## <none> 2323.2 2349.2
## - BPMEDS 1 2326.7 2350.7
## - CURSMOKE 1 2326.8 2350.8
## + SEX 1 2322.9 2350.9
## + SYSBP 1 2323.1 2351.1
## + PREVMI 1 2323.1 2351.1
## + PREVAP 1 2323.2 2351.2
## - HEARTRTE 1 2328.5 2352.5
## - DIABETES 1 2330.3 2354.3
## - PREVHYP 1 2335.1 2359.1
## - DIABP 1 2339.9 2363.9
## - AGE 1 2427.9 2451.9
## - PREVSTRK 1 2439.8 2463.8
##
## Step: AIC=2347.72
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP +
## TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE
##
## Df Deviance AIC
## - CIGPDAY 1 2324.5 2346.5
## - TOTCHOL 1 2324.6 2346.6
## - BMI 1 2325.4 2347.4
## <none> 2323.7 2347.7
## + GLUCOSE 1 2323.2 2349.2
## - BPMEDS 1 2327.2 2349.2
## - CURSMOKE 1 2327.3 2349.3
## + SEX 1 2323.5 2349.5
## + PREVMI 1 2323.7 2349.7
## + SYSBP 1 2323.7 2349.7
## + PREVAP 1 2323.7 2349.7
## - HEARTRTE 1 2329.4 2351.4
## - DIABETES 1 2331.7 2353.7
## - PREVHYP 1 2335.6 2357.6
## - DIABP 1 2340.7 2362.7
## - AGE 1 2428.0 2450.0
## - PREVSTRK 1 2440.1 2462.1
##
## Step: AIC=2346.51
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP +
## TOTCHOL + AGE + DIABP + BMI + HEARTRTE
##
## Df Deviance AIC
## - TOTCHOL 1 2325.4 2345.4
## - BMI 1 2326.2 2346.2
## <none> 2324.5 2346.5
## + CIGPDAY 1 2323.7 2347.7
## - BPMEDS 1 2327.9 2347.9
## + GLUCOSE 1 2323.9 2347.9
## + SEX 1 2324.1 2348.1
## + PREVMI 1 2324.5 2348.5
## + SYSBP 1 2324.5 2348.5
## + PREVAP 1 2324.5 2348.5
## - HEARTRTE 1 2330.0 2350.0
## - DIABETES 1 2332.4 2352.4
## - PREVHYP 1 2336.3 2356.3
## - CURSMOKE 1 2339.1 2359.1
## - DIABP 1 2341.6 2361.6
## - AGE 1 2428.0 2448.0
## - PREVSTRK 1 2440.6 2460.6
##
## Step: AIC=2345.44
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP +
## AGE + DIABP + BMI + HEARTRTE
##
## Df Deviance AIC
## - BMI 1 2327.1 2345.1
## <none> 2325.4 2345.4
## + TOTCHOL 1 2324.5 2346.5
## + CIGPDAY 1 2324.6 2346.6
## - BPMEDS 1 2328.7 2346.7
## + SEX 1 2324.8 2346.8
## + GLUCOSE 1 2324.8 2346.8
## + PREVMI 1 2325.4 2347.4
## + PREVAP 1 2325.4 2347.4
## + SYSBP 1 2325.4 2347.4
## - HEARTRTE 1 2331.4 2349.4
## - DIABETES 1 2333.3 2351.3
## - PREVHYP 1 2337.1 2355.1
## - CURSMOKE 1 2340.3 2358.3
## - DIABP 1 2342.4 2360.4
## - AGE 1 2428.2 2446.2
## - PREVSTRK 1 2442.3 2460.3
##
## Step: AIC=2345.13
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK + PREVHYP +
## AGE + DIABP + HEARTRTE
##
## Df Deviance AIC
## <none> 2327.1 2345.1
## + BMI 1 2325.4 2345.4
## + TOTCHOL 1 2326.2 2346.2
## + CIGPDAY 1 2326.3 2346.3
## - BPMEDS 1 2330.3 2346.3
## + SEX 1 2326.5 2346.5
## + GLUCOSE 1 2326.6 2346.6
## + PREVMI 1 2327.1 2347.1
## + PREVAP 1 2327.1 2347.1
## + SYSBP 1 2327.1 2347.1
## - HEARTRTE 1 2333.0 2349.0
## - DIABETES 1 2335.8 2351.8
## - PREVHYP 1 2339.5 2355.5
## - CURSMOKE 1 2340.9 2356.9
## - DIABP 1 2347.3 2363.3
## - AGE 1 2429.3 2445.3
## - PREVSTRK 1 2443.8 2459.8
##
## Call:
## glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK +
## PREVHYP + AGE + DIABP + HEARTRTE, family = binomial(link = "logit"),
## data = fr.missf_p1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3340 -0.4512 -0.3184 -0.2316 2.9566
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.767033 0.667549 -11.635 < 2e-16 ***
## CURSMOKE1 0.427568 0.115490 3.702 0.000214 ***
## DIABETES1 0.750248 0.241140 3.111 0.001863 **
## BPMEDS1 0.407887 0.222002 1.837 0.066163 .
## PREVSTRK1 17.571864 238.184327 0.074 0.941190
## PREVHYP1 0.516553 0.146211 3.533 0.000411 ***
## AGE 0.071498 0.007278 9.823 < 2e-16 ***
## DIABP 0.023741 0.005264 4.510 6.49e-06 ***
## HEARTRTE -0.011267 0.004715 -2.389 0.016872 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2756.0 on 4433 degrees of freedom
## Residual deviance: 2327.1 on 4425 degrees of freedom
## AIC: 2345.1
##
## Number of Fisher Scoring iterations: 14
stepwise_results<-
glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVSTRK +
PREVHYP + AGE + DIABP + HEARTRTE, family = binomial(link = "logit"),
data = fr.missf_p1) %>%
summary() %>%
.$coefficients %>%
as.data.frame() %>%
rownames_to_column() %>%
rename("var"="rowname") %>%
mutate(OR=exp(Estimate),
Lower=exp(Estimate-1.96*`Std. Error`),
Upper=exp(Estimate+1.96*`Std. Error`)) %>%
select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>%
#すべての変数を丸める
mutate_if(is.numeric,~round(.,3))
#結果を表示
stepwise_results
## var OR Lower Upper p
## 1 (Intercept) 0.00000e+00 0.000 2.000000e-03 0.000
## 2 CURSMOKE1 1.53400e+00 1.223 1.923000e+00 0.000
## 3 DIABETES1 2.11800e+00 1.320 3.397000e+00 0.002
## 4 BPMEDS1 1.50400e+00 0.973 2.323000e+00 0.066
## 5 PREVSTRK1 4.27921e+07 0.000 2.387569e+210 0.941
## 6 PREVHYP1 1.67600e+00 1.259 2.233000e+00 0.000
## 7 AGE 1.07400e+00 1.059 1.090000e+00 0.000
## 8 DIABP 1.02400e+00 1.014 1.035000e+00 0.000
## 9 HEARTRTE 9.89000e-01 0.980 9.980000e-01 0.017
#plot
stepwise_results %>%
#filter(!var=="PREVSTRK1") %>% #これだけ非常にCIが大きくグラフ化すると視認性が悪いので一時的に削除
ggplot(aes(y=var,x=OR))+
geom_point()+
geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
geom_vline(xintercept=1,color="red")
-
変数選択の結果残った変数のうち、PREVSTRKのORが非常に大きくなり、推定が不安定になっている
- アウトカムとPREVSTROKEの関係を詳しく見てみると、分割表において、 4019,
383, 0, 32
であり、STROKE=0のとき、PREVSTRK=1に該当する患者が0であることがわかった→これがsparse
biasの原因である
xtabs(~STROKE+PREVSTRK,
data=fr.missf_p1)
## PREVSTRK
## STROKE 0 1
## 0 4019 0
## 1 383 32
ここで、すべてのカテゴリカル変数に対して同様にアウトカムとの分割表を考えてみる
f<-
function(x){
form<-as.formula(paste("~STROKE+",x))
xtabs(form,data=fr.missf_p1)
}
for(i in fac_var[-1]){
print(f(i))
}
## SEX
## STROKE 1 2
## 0 1751 2268
## 1 193 222
## CURSMOKE
## STROKE 0 1
## 0 2037 1982
## 1 216 199
## DIABETES
## STROKE 0 1
## 0 3926 93
## 1 387 28
## BPMEDS
## STROKE 0 1
## 0 3917 102
## 1 373 42
## educ
## STROKE 1 2 3 4
## 0 1659 1206 676 478
## 1 224 104 52 35
## PREVCHD
## STROKE 0 1
## 0 3862 157
## 1 378 37
## PREVAP
## STROKE 0 1
## 0 3899 120
## 1 388 27
## PREVMI
## STROKE 0 1
## 0 3949 70
## 1 399 16
## PREVSTRK
## STROKE 0 1
## 0 4019 0
## 1 383 32
## PREVHYP
## STROKE 0 1
## 0 2840 1179
## 1 164 251
## PERIOD
## STROKE 1 2 3
## 0 4019 0 0
## 1 415 0 0
## DEATH
## STROKE 0 1
## 0 2746 1273
## 1 138 277
## ANGINA
## STROKE 0 1
## 0 3394 625
## 1 315 100
## HOSPMI
## STROKE 0 1
## 0 3636 383
## 1 344 71
## MI_FCHD
## STROKE 0 1
## 0 3398 621
## 1 305 110
## ANYCHD
## STROKE 0 1
## 0 2958 1061
## 1 236 179
## STROKE
## 0 1
## 4019 415
## CVD
## STROKE 0 1
## 0 3277 742
## 1 0 415
## HYPERTEN
## STROKE 0 1
## 0 1142 2877
## 1 40 375
PREVSTROKEを除外したモデルにおいて、VIFから再度検討する
glm(
STROKE~ SEX+CURSMOKE+DIABETES+BPMEDS+PREVCHD+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+BMI+HEARTRTE+GLUCOSE,
data=fr.missf_p1,
family=binomial(link="logit")) %>%
rms::vif()
## SEX2 CURSMOKE1 DIABETES1 BPMEDS1 PREVCHD1 PREVAP1 PREVMI1 PREVHYP1
## 1.240065 2.456395 1.705096 1.112819 8.979822 5.835302 2.735766 1.922086
## TOTCHOL AGE SYSBP DIABP CIGPDAY BMI HEARTRTE GLUCOSE
## 1.066603 1.317287 3.664247 2.980477 2.583968 1.182116 1.101576 1.730470
#ココでも同様にPREVCHDのVIFが高いため、除外して再度VIFを算出
glm(
STROKE~ SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+BMI+HEARTRTE+GLUCOSE,
data=fr.missf_p1,
family=binomial(link="logit")) %>%
rms::vif()
## SEX2 CURSMOKE1 DIABETES1 BPMEDS1 PREVAP1 PREVMI1 PREVHYP1 TOTCHOL
## 1.240237 2.456599 1.705139 1.112127 1.179417 1.159379 1.919544 1.065887
## AGE SYSBP DIABP CIGPDAY BMI HEARTRTE GLUCOSE
## 1.316273 3.658322 2.971481 2.583932 1.181654 1.100465 1.729751
#いい感じ→これを初期モデルとする
lin_model3<-
glm(STROKE~ SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+BMI+HEARTRTE+GLUCOSE,
data=fr.missf_p1,
family=binomial(link="logit"))
step(lin_model3,
direction="both") %>%
summary()
## Start: AIC=2470.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI +
## PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI +
## HEARTRTE + GLUCOSE
##
## Df Deviance AIC
## - PREVAP 1 2438.8 2468.8
## - SYSBP 1 2438.8 2468.8
## - SEX 1 2439.0 2469.0
## - CIGPDAY 1 2439.1 2469.1
## - GLUCOSE 1 2439.1 2469.1
## - PREVMI 1 2439.4 2469.4
## - TOTCHOL 1 2440.2 2470.2
## - BMI 1 2440.3 2470.3
## <none> 2438.7 2470.7
## - CURSMOKE 1 2442.3 2472.3
## - DIABETES 1 2445.2 2475.2
## - HEARTRTE 1 2445.4 2475.4
## - BPMEDS 1 2445.7 2475.7
## - DIABP 1 2446.6 2476.6
## - PREVHYP 1 2453.2 2483.2
## - AGE 1 2532.4 2562.4
##
## Step: AIC=2468.82
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP +
## TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + BMI + HEARTRTE +
## GLUCOSE
##
## Df Deviance AIC
## - SYSBP 1 2438.9 2466.9
## - SEX 1 2439.1 2467.1
## - CIGPDAY 1 2439.1 2467.1
## - GLUCOSE 1 2439.1 2467.1
## - PREVMI 1 2439.4 2467.4
## - TOTCHOL 1 2440.3 2468.3
## - BMI 1 2440.4 2468.4
## <none> 2438.8 2468.8
## - CURSMOKE 1 2442.4 2470.4
## + PREVAP 1 2438.7 2470.7
## - DIABETES 1 2445.2 2473.2
## - HEARTRTE 1 2445.5 2473.5
## - BPMEDS 1 2445.7 2473.7
## - DIABP 1 2446.7 2474.7
## - PREVHYP 1 2453.3 2481.3
## - AGE 1 2533.2 2561.2
##
## Step: AIC=2466.91
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP +
## TOTCHOL + AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
##
## Df Deviance AIC
## - SEX 1 2439.1 2465.1
## - GLUCOSE 1 2439.2 2465.2
## - CIGPDAY 1 2439.2 2465.2
## - PREVMI 1 2439.5 2465.5
## - TOTCHOL 1 2440.4 2466.4
## - BMI 1 2440.5 2466.5
## <none> 2438.9 2466.9
## - CURSMOKE 1 2442.5 2468.5
## + SYSBP 1 2438.8 2468.8
## + PREVAP 1 2438.8 2468.8
## - DIABETES 1 2445.4 2471.4
## - HEARTRTE 1 2445.6 2471.6
## - BPMEDS 1 2446.0 2472.0
## - DIABP 1 2454.4 2480.4
## - PREVHYP 1 2456.1 2482.1
## - AGE 1 2546.7 2572.7
##
## Step: AIC=2465.12
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + TOTCHOL +
## AGE + DIABP + CIGPDAY + BMI + HEARTRTE + GLUCOSE
##
## Df Deviance AIC
## - GLUCOSE 1 2439.4 2463.4
## - CIGPDAY 1 2439.6 2463.6
## - PREVMI 1 2439.8 2463.8
## - BMI 1 2440.7 2464.7
## - TOTCHOL 1 2440.8 2464.8
## <none> 2439.1 2465.1
## - CURSMOKE 1 2442.7 2466.7
## + SEX 1 2438.9 2466.9
## + PREVAP 1 2439.1 2467.1
## + SYSBP 1 2439.1 2467.1
## - DIABETES 1 2445.7 2469.7
## - BPMEDS 1 2446.1 2470.1
## - HEARTRTE 1 2446.2 2470.2
## - DIABP 1 2454.8 2478.8
## - PREVHYP 1 2456.2 2480.2
## - AGE 1 2546.9 2570.9
##
## Step: AIC=2463.41
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + TOTCHOL +
## AGE + DIABP + CIGPDAY + BMI + HEARTRTE
##
## Df Deviance AIC
## - CIGPDAY 1 2439.9 2461.9
## - PREVMI 1 2440.1 2462.1
## - BMI 1 2441.0 2463.0
## - TOTCHOL 1 2441.1 2463.1
## <none> 2439.4 2463.4
## - CURSMOKE 1 2443.0 2465.0
## + GLUCOSE 1 2439.1 2465.1
## + SEX 1 2439.2 2465.2
## + PREVAP 1 2439.4 2465.4
## + SYSBP 1 2439.4 2465.4
## - BPMEDS 1 2446.3 2468.3
## - HEARTRTE 1 2446.7 2468.7
## - DIABETES 1 2447.4 2469.4
## - DIABP 1 2455.2 2477.2
## - PREVHYP 1 2456.5 2478.5
## - AGE 1 2547.0 2569.0
##
## Step: AIC=2461.93
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + TOTCHOL +
## AGE + DIABP + BMI + HEARTRTE
##
## Df Deviance AIC
## - PREVMI 1 2440.6 2460.6
## - BMI 1 2441.5 2461.5
## - TOTCHOL 1 2441.6 2461.6
## <none> 2439.9 2461.9
## + CIGPDAY 1 2439.4 2463.4
## + SEX 1 2439.5 2463.5
## + GLUCOSE 1 2439.6 2463.6
## + PREVAP 1 2439.9 2463.9
## + SYSBP 1 2439.9 2463.9
## - BPMEDS 1 2446.7 2466.7
## - HEARTRTE 1 2447.1 2467.1
## - DIABETES 1 2447.9 2467.9
## - CURSMOKE 1 2453.0 2473.0
## - DIABP 1 2455.8 2475.8
## - PREVHYP 1 2456.9 2476.9
## - AGE 1 2547.0 2567.0
##
## Step: AIC=2460.57
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + TOTCHOL + AGE +
## DIABP + BMI + HEARTRTE
##
## Df Deviance AIC
## - BMI 1 2442.1 2460.1
## - TOTCHOL 1 2442.3 2460.3
## <none> 2440.6 2460.6
## + PREVMI 1 2439.9 2461.9
## + CIGPDAY 1 2440.1 2462.1
## + SEX 1 2440.1 2462.1
## + GLUCOSE 1 2440.3 2462.3
## + SYSBP 1 2440.6 2462.6
## + PREVAP 1 2440.6 2462.6
## - BPMEDS 1 2447.4 2465.4
## - HEARTRTE 1 2447.8 2465.8
## - DIABETES 1 2448.8 2466.8
## - CURSMOKE 1 2454.1 2472.1
## - DIABP 1 2456.3 2474.3
## - PREVHYP 1 2457.8 2475.8
## - AGE 1 2550.6 2568.6
##
## Step: AIC=2460.12
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + TOTCHOL + AGE +
## DIABP + HEARTRTE
##
## Df Deviance AIC
## - TOTCHOL 1 2443.8 2459.8
## <none> 2442.1 2460.1
## + BMI 1 2440.6 2460.6
## + PREVMI 1 2441.5 2461.5
## + CIGPDAY 1 2441.6 2461.6
## + SEX 1 2441.6 2461.6
## + GLUCOSE 1 2441.9 2461.9
## + SYSBP 1 2442.1 2462.1
## + PREVAP 1 2442.1 2462.1
## - BPMEDS 1 2449.0 2465.0
## - HEARTRTE 1 2449.2 2465.2
## - DIABETES 1 2451.1 2467.1
## - CURSMOKE 1 2454.6 2470.6
## - PREVHYP 1 2460.2 2476.2
## - DIABP 1 2461.0 2477.0
## - AGE 1 2551.7 2567.7
##
## Step: AIC=2459.81
## STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP +
## HEARTRTE
##
## Df Deviance AIC
## <none> 2443.8 2459.8
## + TOTCHOL 1 2442.1 2460.1
## + BMI 1 2442.3 2460.3
## + SEX 1 2443.0 2461.0
## + PREVMI 1 2443.2 2461.2
## + CIGPDAY 1 2443.2 2461.2
## + GLUCOSE 1 2443.6 2461.6
## + SYSBP 1 2443.8 2461.8
## + PREVAP 1 2443.8 2461.8
## - BPMEDS 1 2450.4 2464.4
## - HEARTRTE 1 2451.6 2465.6
## - DIABETES 1 2452.7 2466.7
## - CURSMOKE 1 2456.7 2470.7
## - PREVHYP 1 2461.6 2475.6
## - DIABP 1 2462.5 2476.5
## - AGE 1 2551.7 2565.7
##
## Call:
## glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP +
## AGE + DIABP + HEARTRTE, family = binomial(link = "logit"),
## data = fr.missf_p1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3955 -0.4649 -0.3255 -0.2367 2.9230
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.459839 0.645143 -11.563 < 2e-16 ***
## CURSMOKE1 0.401439 0.112012 3.584 0.000339 ***
## DIABETES1 0.739673 0.235375 3.143 0.001675 **
## BPMEDS1 0.547228 0.207449 2.638 0.008342 **
## PREVHYP1 0.597373 0.141426 4.224 2.40e-05 ***
## AGE 0.071244 0.007065 10.085 < 2e-16 ***
## DIABP 0.021968 0.005071 4.332 1.48e-05 ***
## HEARTRTE -0.012629 0.004590 -2.751 0.005933 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2756.0 on 4433 degrees of freedom
## Residual deviance: 2443.8 on 4426 degrees of freedom
## AIC: 2459.8
##
## Number of Fisher Scoring iterations: 6
-Backwardsで最良のモデルとして選ばれたのは glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE, family = binomial(link = “logit”), data = fr.missf_p1)
stepwise_results<-
glm(formula = STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP +
AGE + DIABP + HEARTRTE, family = binomial(link = "logit"),
data = fr.missf_p1) %>%
summary() %>%
.$coefficients %>%
as.data.frame() %>%
rownames_to_column() %>%
rename("var"="rowname") %>%
mutate(OR=exp(Estimate),
Lower=exp(Estimate-1.96*`Std. Error`),
Upper=exp(Estimate+1.96*`Std. Error`)) %>%
select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>%
#すべての変数を丸める
mutate_if(is.numeric,~round(.,3))
#結果を表示
stepwise_results
## var OR Lower Upper p
## 1 (Intercept) 0.001 0.000 0.002 0.000
## 2 CURSMOKE1 1.494 1.199 1.861 0.000
## 3 DIABETES1 2.095 1.321 3.323 0.002
## 4 BPMEDS1 1.728 1.151 2.596 0.008
## 5 PREVHYP1 1.817 1.377 2.398 0.000
## 6 AGE 1.074 1.059 1.089 0.000
## 7 DIABP 1.022 1.012 1.032 0.000
## 8 HEARTRTE 0.987 0.979 0.996 0.006
#plot
stepwise_results %>%
ggplot(aes(y=var,x=OR))+
geom_point()+
geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
geom_vline(xintercept=1,color="red")
- ORが発散することなくとてもいい感じの推定ができたので、このモデルを 1.
交互作用項、非線形項を含まない線形の全変数モデル(lin_model)における最終モデル(lin_final_model)とする
lin_final_formula<-as.formula(STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE)
lin_final_model<-
glm(formula = lin_final_formula,
family = binomial(link = "logit"),
#x=T,y=T,
data = fr.missf_p1)
# risk prediction fitting
fr.missf_p1$fitted.lin.fin<-
predict(lin_final_model,
type="response")
# ROC AUC
roc.lin.final<-
roc(STROKE~fitted.lin.fin,
data=fr.missf_p1,
ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.lin.final
##
## Call:
## roc.formula(formula = STROKE ~ fitted.lin.fin, data = fr.missf_p1, ci = T)
##
## Data: fitted.lin.fin in 4019 controls (STROKE 0) < 415 cases (STROKE 1).
## Area under the curve: 0.7504
## 95% CI: 0.7265-0.7742 (DeLong)
roc.lin.final %>%
ggroc(legacy.axes=T)+
geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="grey", linetype="dashed")
- ROCは0.7503518であり、中等度の予測性能と考えられる
#calibration plot
cal.lin.final<-
val.prob(
p=fr.missf_p1$fitted.lin.fin,
y=fr.missf_p1$STROKEn,
g=10,
cex=0.5)
- calibration plotからみても予測性能は高い
B<-100 #number of resampling
N<-dim(fr.missf_p1)[1] #dataのdimentionを返す→4434行x39列→そのうち一番目(4434)を取得しNとする
AUC.i1<-AUC.i2<-numeric(B)
for(i in 1:B){
#bootstrap random sampling
bs.i<-sample(1:N,N,replace=TRUE) #重複を許して1~4434までの数を4434回サンプリングする
#bootstrap samplingに基づき、dataから集団を再構成 →data.iとする
fr.missf_p1.i<-fr.missf_p1[bs.i,]
#bootstrapしたデータセットごとで回帰モデル構築→各bootstrap dataからリスク予測確率の算出→data2.1のprob_1列に格納
lin.fin.cv.res.i<-
glm(formula = lin_final_formula,
family = binomial(link = "logit"),
data = fr.missf_p1.i)
fr.missf_p1.i$lin.fin.1<- #lin.fin.1では、bootstrap dataから構築した新しいロジスティク回帰モデルを使い、そのbootstrap dataに対するリスク予測確率を格納
predict(lin.fin.cv.res.i,
data=fr.missf_p1.i,
type="response")
#AUCの算出;Bootstrap回数(B=100)分のAUCを算出して結果を格納;各bootstrap dataに対するAUC
AUC.i1[i]<-roc(STROKE~lin.fin.1,data=fr.missf_p1.i)$auc
#bootstrapで作成した各回帰モデルを使って、オリジナルデータ(fr.missf_p1)に対するリスク予測確率を算出→fr.missf_p1のlin.fin.2列に格納;AUC算出
fr.missf_p1$lin.fin.2<-
predict(lin.fin.cv.res.i,
newdata=fr.missf_p1, ##dataではなくnewdataにする必要がある!!!!
type="response")
AUC.i2[i]<-roc(STROKE~lin.fin.2,data=fr.missf_p1)$auc #ここで得られるAUCはbootstrap dataを用いてoriginal dataに外挿したときのAUC
#print(paste(i,"th bootstrap iteration is completed.",sep=""))
}
opt2<-AUC.i1-AUC.i2
#(bootstrap samplingした各data setを使い、それぞれ導いた回帰モデルにおけるAUC)ー(bootstrap samplingした各data setを使い、original dataに外挿したときのAUC)
summary(opt2);hist(opt2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.036175 -0.004380 0.005196 0.004162 0.013247 0.034522
lam2<-mean(opt2) #estimate of the optimism
lin.cor.AUC<-roc.lin.final$auc-lam2 #bias corrected AUC estimate
lin.cor.AUC
## [1] 0.7461894
#k-fold CV
k <-5
AUC_CV <-data.frame(matrix(ncol=2,nrow=k))
lin_final_formula<-as.formula(STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE)
lin_final_model<-
glm(formula = lin_final_formula,
family = binomial(link = "logit"),
data = fr.missf_p1)
#Randomly shuffle the data
fr.missf_p1.r<-fr.missf_p1[sample(nrow(fr.missf_p1)),]
#Create k equally size folds
folds <- cut(seq(1,nrow(fr.missf_p1.r)),breaks=k,labels=FALSE)
#Perform k fold cross validation
for(i in 1:k){
#Segment data2r by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- fr.missf_p1.r[testIndexes, ]
trainData <- fr.missf_p1.r[-testIndexes, ]
fit1<-glm(formula = lin_final_formula, family = binomial(link = "logit"), data = trainData)
testData$fitted<-predict(fit1,testData,type="response")
ROC <- roc(testData$STROKE, testData$fitted)
AUC_CV[i,2] <- ROC$auc
AUC_CV[i,1] <- i
#print(paste(i,"th cross-validation iteration is completed.", sep=""))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
names(AUC_CV)[2] <- "AUC_cv"
names(AUC_CV)[1] <- "k"
print(summary(AUC_CV$AUC_cv))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7163 0.7251 0.7339 0.7456 0.7728 0.7801
用いる変数はである
fullmodel
## [1] "STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVCHD+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+SEX:PREVCHD+SEX:PREVAP+SEX:PREVMI+SEX:PREVHYP+SEX:DIABP+CURSMOKE:HEARTRTE+DIABETES:TOTCHOL+DIABETES:SYSBP+BPMEDS:PREVCHD+BPMEDS:TOTCHOL+BPMEDS:AGE+PREVCHD:GLUCOSE+PREVHYP:BMI+TOTCHOL:AGE+HEARTRTE:GLUCOSE"
glm(as.formula(fullmodel),data=fr.missf_p1,
family = binomial(link = "logit"))
##
## Call: glm(formula = as.formula(fullmodel), family = binomial(link = "logit"),
## data = fr.missf_p1)
##
## Coefficients:
## (Intercept) SEX2
## -5.2535723 -1.1953373
## CURSMOKE1 DIABETES1
## -1.2365073 4.0207549
## BPMEDS1 PREVCHD1
## 5.2898809 1.1035782
## PREVAP1 PREVMI1
## -0.3720497 0.0550895
## PREVHYP1 TOTCHOL
## 1.5148234 0.0066005
## AGE SYSBP
## 0.1010951 0.0013414
## DIABP CIGPDAY
## 0.0127927 0.0026360
## rcs(BMI, 4)BMI rcs(BMI, 4)BMI'
## -0.0862948 0.2278817
## rcs(BMI, 4)BMI'' rcs(HEARTRTE, 4)HEARTRTE
## -0.5949000 0.0094925
## rcs(HEARTRTE, 4)HEARTRTE' rcs(HEARTRTE, 4)HEARTRTE''
## -0.2123066 0.5429251
## rcs(GLUCOSE, 4)GLUCOSE rcs(GLUCOSE, 4)GLUCOSE'
## -0.0611513 0.1259855
## rcs(GLUCOSE, 4)GLUCOSE'' SEX2:PREVCHD1
## -0.3977397 -0.4496324
## SEX2:PREVAP1 SEX2:PREVMI1
## 0.9761805 1.3096815
## SEX2:PREVHYP1 SEX2:DIABP
## 0.0700994 0.0122176
## CURSMOKE0:HEARTRTE CURSMOKE1:HEARTRTE
## -0.0207419 NA
## DIABETES1:TOTCHOL DIABETES1:SYSBP
## -0.0066810 -0.0112544
## BPMEDS1:PREVCHD1 BPMEDS1:TOTCHOL
## -0.8348873 -0.0085763
## BPMEDS1:AGE PREVCHD0:GLUCOSE
## -0.0439211 0.0103560
## PREVCHD1:GLUCOSE PREVHYP0:BMI
## NA 0.0374607
## PREVHYP1:BMI TOTCHOL:AGE
## NA -0.0001260
## HEARTRTE:GLUCOSE
## 0.0003235
##
## Degrees of Freedom: 4433 Total (i.e. Null); 4396 Residual
## Null Deviance: 2756
## Residual Deviance: 2395 AIC: 2471
fullmodel2<-
STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+
rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+
SEX:PREVAP+SEX:PREVMI+SEX:PREVHYP+SEX:DIABP+CURSMOKE:HEARTRTE+DIABETES:TOTCHOL+DIABETES:SYSBP+BPMEDS:TOTCHOL+
BPMEDS:AGE+PREVHYP:BMI+TOTCHOL:AGE+HEARTRTE:GLUCOSE
fullmodel3<-STROKE~SEX+CURSMOKE+DIABETES+BPMEDS+PREVAP+PREVMI+PREVHYP+TOTCHOL+AGE+SYSBP+DIABP+CIGPDAY+
rcs(BMI,4)+rcs(HEARTRTE,4)+rcs(GLUCOSE,4)+SEX:PREVHYP+SEX:PREVMI
#dfが大きいのでGVIFを算出するcar::vifを用いる
glm(as.formula(fullmodel3),data=fr.missf_p1,
family = binomial(link = "logit")) %>%
car::vif(type="predictor")
## Warning in vif.lm(., type = "predictor"): type = 'predictor' is available only for unweighted linear models;
## type = 'terms' will be used
## GVIF Df GVIF^(1/(2*Df))
## SEX 2.570312 1 1.603219
## CURSMOKE 2.463038 1 1.569407
## DIABETES 1.751968 1 1.323619
## BPMEDS 1.134620 1 1.065186
## PREVAP 1.204167 1 1.097345
## PREVMI 1.687747 1 1.299133
## PREVHYP 2.854402 1 1.689498
## TOTCHOL 1.089812 1 1.043941
## AGE 1.329260 1 1.152935
## SYSBP 3.727491 1 1.930671
## DIABP 2.981403 1 1.726674
## CIGPDAY 2.594084 1 1.610616
## rcs(BMI, 4) 1.323785 3 1.047859
## rcs(HEARTRTE, 4) 1.150514 3 1.023643
## rcs(GLUCOSE, 4) 1.866674 3 1.109630
## SEX:PREVHYP 3.811174 1 1.952223
## SEX:PREVMI 1.557143 1 1.247855
glm(as.formula(fullmodel3),data=fr.missf_p1,
family = binomial(link = "logit")) %>%
step(direction="both") %>%
summary()
## Start: AIC=2472.26
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI +
## PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + rcs(BMI,
## 4) + rcs(HEARTRTE, 4) + rcs(GLUCOSE, 4) + SEX:PREVHYP + SEX:PREVMI
##
## Df Deviance AIC
## - rcs(BMI, 4) 3 2426.7 2468.7
## - PREVAP 1 2424.3 2470.3
## - SYSBP 1 2424.3 2470.3
## - CIGPDAY 1 2424.5 2470.5
## - rcs(GLUCOSE, 4) 3 2428.9 2470.9
## - SEX:PREVHYP 1 2425.2 2471.2
## - TOTCHOL 1 2425.4 2471.4
## <none> 2424.3 2472.3
## - SEX:PREVMI 1 2426.5 2472.5
## - CURSMOKE 1 2428.0 2474.0
## - DIABETES 1 2429.0 2475.0
## - BPMEDS 1 2430.7 2476.7
## - rcs(HEARTRTE, 4) 3 2436.6 2478.6
## - DIABP 1 2433.4 2479.4
## - AGE 1 2515.0 2561.0
##
## Step: AIC=2468.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI +
## PREVHYP + TOTCHOL + AGE + SYSBP + DIABP + CIGPDAY + rcs(HEARTRTE,
## 4) + rcs(GLUCOSE, 4) + SEX:PREVHYP + SEX:PREVMI
##
## Df Deviance AIC
## - SYSBP 1 2426.7 2466.7
## - PREVAP 1 2426.7 2466.7
## - CIGPDAY 1 2427.0 2467.0
## - rcs(GLUCOSE, 4) 3 2431.6 2467.6
## - SEX:PREVHYP 1 2427.9 2467.9
## - TOTCHOL 1 2428.1 2468.1
## <none> 2426.7 2468.7
## - SEX:PREVMI 1 2429.0 2469.0
## - CURSMOKE 1 2430.2 2470.2
## - DIABETES 1 2432.0 2472.0
## + rcs(BMI, 4) 3 2424.3 2472.3
## - BPMEDS 1 2433.3 2473.3
## - rcs(HEARTRTE, 4) 3 2438.9 2474.9
## - DIABP 1 2437.6 2477.6
## - AGE 1 2517.8 2557.8
##
## Step: AIC=2466.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVAP + PREVMI +
## PREVHYP + TOTCHOL + AGE + DIABP + CIGPDAY + rcs(HEARTRTE,
## 4) + rcs(GLUCOSE, 4) + SEX:PREVHYP + SEX:PREVMI
##
## Df Deviance AIC
## - PREVAP 1 2426.7 2464.7
## - CIGPDAY 1 2427.0 2465.0
## - rcs(GLUCOSE, 4) 3 2431.6 2465.6
## - SEX:PREVHYP 1 2427.9 2465.9
## - TOTCHOL 1 2428.1 2466.1
## <none> 2426.7 2466.7
## - SEX:PREVMI 1 2429.0 2467.0
## - CURSMOKE 1 2430.2 2468.2
## + SYSBP 1 2426.7 2468.7
## - DIABETES 1 2432.0 2470.0
## + rcs(BMI, 4) 3 2424.3 2470.3
## - BPMEDS 1 2433.3 2471.3
## - rcs(HEARTRTE, 4) 3 2438.9 2472.9
## - DIABP 1 2445.8 2483.8
## - AGE 1 2527.5 2565.5
##
## Step: AIC=2464.74
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP +
## TOTCHOL + AGE + DIABP + CIGPDAY + rcs(HEARTRTE, 4) + rcs(GLUCOSE,
## 4) + SEX:PREVHYP + SEX:PREVMI
##
## Df Deviance AIC
## - CIGPDAY 1 2427.0 2463.0
## - rcs(GLUCOSE, 4) 3 2431.6 2463.6
## - SEX:PREVHYP 1 2427.9 2463.9
## - TOTCHOL 1 2428.1 2464.1
## <none> 2426.7 2464.7
## - SEX:PREVMI 1 2429.0 2465.0
## - CURSMOKE 1 2430.2 2466.2
## + PREVAP 1 2426.7 2466.7
## + SYSBP 1 2426.7 2466.7
## - DIABETES 1 2432.0 2468.0
## + rcs(BMI, 4) 3 2424.3 2468.3
## - BPMEDS 1 2433.4 2469.4
## - rcs(HEARTRTE, 4) 3 2438.9 2470.9
## - DIABP 1 2445.8 2481.8
## - AGE 1 2529.1 2565.1
##
## Step: AIC=2462.99
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP +
## TOTCHOL + AGE + DIABP + rcs(HEARTRTE, 4) + rcs(GLUCOSE, 4) +
## SEX:PREVHYP + SEX:PREVMI
##
## Df Deviance AIC
## - rcs(GLUCOSE, 4) 3 2431.9 2461.9
## - SEX:PREVHYP 1 2428.2 2462.2
## - TOTCHOL 1 2428.3 2462.3
## <none> 2427.0 2463.0
## - SEX:PREVMI 1 2429.3 2463.3
## + CIGPDAY 1 2426.7 2464.7
## + PREVAP 1 2427.0 2465.0
## + SYSBP 1 2427.0 2465.0
## - DIABETES 1 2432.2 2466.2
## + rcs(BMI, 4) 3 2424.5 2466.5
## - BPMEDS 1 2433.6 2467.6
## - rcs(HEARTRTE, 4) 3 2439.0 2469.0
## - CURSMOKE 1 2437.6 2471.6
## - DIABP 1 2446.1 2480.1
## - AGE 1 2529.2 2563.2
##
## Step: AIC=2461.88
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP +
## TOTCHOL + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVHYP +
## SEX:PREVMI
##
## Df Deviance AIC
## - TOTCHOL 1 2433.4 2461.4
## - SEX:PREVHYP 1 2433.4 2461.4
## <none> 2431.9 2461.9
## - SEX:PREVMI 1 2433.9 2461.9
## + rcs(GLUCOSE, 4) 3 2427.0 2463.0
## + CIGPDAY 1 2431.6 2463.6
## + PREVAP 1 2431.9 2463.9
## + SYSBP 1 2431.9 2463.9
## + rcs(BMI, 4) 3 2429.1 2465.1
## - BPMEDS 1 2438.7 2466.7
## - DIABETES 1 2438.9 2466.9
## - rcs(HEARTRTE, 4) 3 2443.6 2467.6
## - CURSMOKE 1 2442.6 2470.6
## - DIABP 1 2450.7 2478.7
## - AGE 1 2536.6 2564.6
##
## Step: AIC=2461.38
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP +
## AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVHYP + SEX:PREVMI
##
## Df Deviance AIC
## - SEX:PREVHYP 1 2434.8 2460.8
## <none> 2433.4 2461.4
## - SEX:PREVMI 1 2435.4 2461.4
## + TOTCHOL 1 2431.9 2461.9
## + rcs(GLUCOSE, 4) 3 2428.3 2462.3
## + CIGPDAY 1 2433.2 2463.2
## + PREVAP 1 2433.4 2463.4
## + SYSBP 1 2433.4 2463.4
## + rcs(BMI, 4) 3 2430.3 2464.3
## - BPMEDS 1 2439.9 2465.9
## - DIABETES 1 2440.3 2466.3
## - rcs(HEARTRTE, 4) 3 2445.5 2467.5
## - CURSMOKE 1 2444.2 2470.2
## - DIABP 1 2451.9 2477.9
## - AGE 1 2536.6 2562.6
##
## Step: AIC=2460.77
## STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP +
## AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI
##
## Df Deviance AIC
## <none> 2434.8 2460.8
## - SEX:PREVMI 1 2437.0 2461.0
## + SEX:PREVHYP 1 2433.4 2461.4
## + TOTCHOL 1 2433.4 2461.4
## + rcs(GLUCOSE, 4) 3 2429.4 2461.4
## + CIGPDAY 1 2434.5 2462.5
## + SYSBP 1 2434.8 2462.8
## + PREVAP 1 2434.8 2462.8
## + rcs(BMI, 4) 3 2431.4 2463.4
## - BPMEDS 1 2441.8 2465.8
## - DIABETES 1 2441.9 2465.9
## - rcs(HEARTRTE, 4) 3 2446.9 2466.9
## - CURSMOKE 1 2445.4 2469.4
## - PREVHYP 1 2451.8 2475.8
## - DIABP 1 2453.6 2477.6
## - AGE 1 2542.1 2566.1
##
## Call:
## glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI +
## PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"),
## data = fr.missf_p1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3035 -0.4660 -0.3275 -0.2321 2.9086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.297986 1.317164 -7.059 1.68e-12 ***
## SEX2 -0.118785 0.115042 -1.033 0.30182
## CURSMOKE1 0.376133 0.115624 3.253 0.00114 **
## DIABETES1 0.672881 0.240467 2.798 0.00514 **
## BPMEDS1 0.569736 0.209547 2.719 0.00655 **
## PREVMI1 -0.052790 0.369570 -0.143 0.88641
## PREVHYP1 0.586447 0.141663 4.140 3.48e-05 ***
## AGE 0.071503 0.007106 10.062 < 2e-16 ***
## DIABP 0.022128 0.005091 4.347 1.38e-05 ***
## rcs(HEARTRTE, 4)HEARTRTE 0.018041 0.018255 0.988 0.32303
## rcs(HEARTRTE, 4)HEARTRTE' -0.184369 0.084305 -2.187 0.02875 *
## rcs(HEARTRTE, 4)HEARTRTE'' 0.488790 0.213925 2.285 0.02232 *
## SEX2:PREVMI1 1.017895 0.670499 1.518 0.12899
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2756.0 on 4433 degrees of freedom
## Residual deviance: 2434.8 on 4421 degrees of freedom
## AIC: 2460.8
##
## Number of Fisher Scoring iterations: 6
stepwise_results<-
glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI +
PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"),
data = fr.missf_p1) %>%
summary() %>%
.$coefficients %>%
as.data.frame() %>%
rownames_to_column() %>%
rename("var"="rowname") %>%
mutate(OR=exp(Estimate),
Lower=exp(Estimate-1.96*`Std. Error`),
Upper=exp(Estimate+1.96*`Std. Error`)) %>%
select(var,OR,Lower,Upper,"p"="Pr(>|z|)") %>%
#すべての変数を丸める
mutate_if(is.numeric,~round(.,3))
#結果を表示
stepwise_results
## var OR Lower Upper p
## 1 (Intercept) 0.000 0.000 0.001 0.000
## 2 SEX2 0.888 0.709 1.113 0.302
## 3 CURSMOKE1 1.457 1.161 1.827 0.001
## 4 DIABETES1 1.960 1.223 3.140 0.005
## 5 BPMEDS1 1.768 1.172 2.666 0.007
## 6 PREVMI1 0.949 0.460 1.957 0.886
## 7 PREVHYP1 1.798 1.362 2.373 0.000
## 8 AGE 1.074 1.059 1.089 0.000
## 9 DIABP 1.022 1.012 1.033 0.000
## 10 rcs(HEARTRTE, 4)HEARTRTE 1.018 0.982 1.055 0.323
## 11 rcs(HEARTRTE, 4)HEARTRTE' 0.832 0.705 0.981 0.029
## 12 rcs(HEARTRTE, 4)HEARTRTE'' 1.630 1.072 2.480 0.022
## 13 SEX2:PREVMI1 2.767 0.744 10.299 0.129
#plot
stepwise_results %>%
ggplot(aes(y=var,x=OR))+
geom_point()+
geom_errorbar(aes(xmin=Lower,xmax=Upper),width=0.2)+
geom_vline(xintercept=1,color="red")
- 交互作用項のORが増大しており発散気味だが、このまま計算を進める
#複雑なロジスティク回帰モデル
step_final_model<-
glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI +
PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"),
data = fr.missf_p1)
# risk prediction fitting
fr.missf_p1$fitted.step.fin<-
predict(step_final_model,
type="response")
# ROC AUC
roc.step.final<-
roc(STROKE~fitted.step.fin,
data=fr.missf_p1,
ci=T)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
roc.step.final
##
## Call:
## roc.formula(formula = STROKE ~ fitted.step.fin, data = fr.missf_p1, ci = T)
##
## Data: fitted.step.fin in 4019 controls (STROKE 0) < 415 cases (STROKE 1).
## Area under the curve: 0.7533
## 95% CI: 0.7292-0.7774 (DeLong)
roc.step.final %>%
ggroc(legacy.axes=T)+
geom_segment(aes(x = 0, xend = 1, y = 0, yend = 1), color="grey", linetype="dashed")
#calibration plot
cal.step.final<-
val.prob(
p=fr.missf_p1$fitted.step.fin,
y=fr.missf_p1$STROKEn,
g=10,
cex=0.5)
B<-100 #number of resampling
N<-dim(fr.missf_p1)[1] #dataのdimentionを返す→4434行x39列→そのうち一番目(4434)を取得しNとする
AUC.i1<-AUC.i2<-numeric(B)
for(i in 1:B){
#bootstrap random sampling
bs.i<-sample(1:N,N,replace=TRUE) #重複を許して1~4434までの数を4434回サンプリングする
#bootstrap samplingに基づき、dataから集団を再構成 →data.iとする
fr.missf_p1.i<-fr.missf_p1[bs.i,]
#bootstrapしたデータセットごとで回帰モデル構築→各bootstrap dataからリスク予測確率の算出→data2.1のprob_1列に格納
step.fin.cv.res.i<-
glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI +
PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"),
data = fr.missf_p1.i)
fr.missf_p1.i$step.fin.1<- #step.fin.1では、bootstrap dataから構築した新しいロジスティク回帰モデルを使い、そのbootstrap dataに対するリスク予測確率を格納
predict(step.fin.cv.res.i,
data=fr.missf_p1.i,
type="response")
#AUCの算出;Bootstrap回数(B=100)分のAUCを算出して結果を格納;各bootstrap dataに対するAUC
AUC.i1[i]<-roc(STROKE~step.fin.1,data=fr.missf_p1.i)$auc
#bootstrapで作成した各回帰モデルを使って、オリジナルデータ(fr.missf_p1)に対するリスク予測確率を算出→fr.missf_p1のlin.fin.2列に格納;AUC算出
fr.missf_p1$step.fin.2<-
predict(step.fin.cv.res.i,
newdata=fr.missf_p1,
type="response")
AUC.i2[i]<-roc(STROKE~step.fin.2,data=fr.missf_p1)$auc #ここで得られるAUCはbootstrap dataを用いてoriginal dataに外挿したときのAUC
#print(paste(i,"th bootstrap iteration is completed.",sep=""))
}
opt3<-AUC.i1-AUC.i2
#(bootstrap samplingした各data setを使い、それぞれ導いた回帰モデルにおけるAUC)ー(bootstrap samplingした各data setを使い、original dataに外挿したときのAUC)
summary(opt3);hist(opt3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.035344 -0.001234 0.006441 0.006532 0.014623 0.034209
lam3<-mean(opt3) #estimate of the optimism
step.cor.AUC<-roc.step.final$auc-lam3 #bias corrected AUC estimate
step.cor.AUC
## [1] 0.7467678
#k-fold CV
k <-5
AUC_CV <-data.frame(matrix(ncol=2,nrow=k))
#Randomly shuffle the data
fr.missf_p1.r<-fr.missf_p1[sample(nrow(fr.missf_p1)),]
#Create k equally size folds
folds <- cut(seq(1,nrow(fr.missf_p1.r)),breaks=k,labels=FALSE)
#Perform k fold cross validation
for(i in 1:k){
#Segment data2r by fold using the which() function
testIndexes <- which(folds==i,arr.ind=TRUE)
testData <- fr.missf_p1.r[testIndexes, ]
trainData <- fr.missf_p1.r[-testIndexes, ]
fit1<-
glm(formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI +
PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI, family = binomial(link = "logit"),
data = trainData)
testData$fitted<-predict(fit1,testData,type="response")
ROC <- roc(testData$STROKE, testData$fitted)
AUC_CV[i,2] <- ROC$auc
AUC_CV[i,1] <- i
#print(paste(i,"th cross-validation iteration is completed.", sep=""))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
names(AUC_CV)[2] <- "AUC_cv"
names(AUC_CV)[1] <- "k"
print(summary(AUC_CV$AUC_cv))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.7174 0.7307 0.7408 0.7415 0.7496 0.7691
STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE
lin_final_formula<-as.formula(STROKE ~ CURSMOKE + DIABETES + BPMEDS + PREVHYP + AGE + DIABP + HEARTRTE)
lin_final_model.lrm<-
lrm(lin_final_formula,
x=T,
y=T,
data=fr.missf_p1)
boot<-validate(lin_final_model.lrm,
bw=FALSE,
B=100,
method="boot")
boot
## index.orig training test optimism index.corrected n
## Dxy 0.5007 0.5079 0.4974 0.0105 0.4902 100
## R2 0.1469 0.1522 0.1442 0.0081 0.1388 100
## Intercept 0.0000 0.0000 -0.0514 0.0514 -0.0514 100
## Slope 1.0000 1.0000 0.9702 0.0298 0.9702 100
## Emax 0.0000 0.0000 0.0164 0.0164 0.0164 100
## D 0.0702 0.0726 0.0688 0.0038 0.0664 100
## U -0.0005 -0.0005 0.0002 -0.0006 0.0002 100
## Q 0.0706 0.0731 0.0687 0.0044 0.0662 100
## B 0.0780 0.0772 0.0783 -0.0011 0.0791 100
## g 1.0429 1.0581 1.0247 0.0334 1.0095 100
## gp 0.0837 0.0843 0.0827 0.0016 0.0820 100
corrected.AUC<-boot[1,5]*0.5 + 0.5 #corrected AUC=Somer's D/2 + 0.5
formula = STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI + PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI
step_final_model.lrm<-
lrm(STROKE ~ SEX + CURSMOKE + DIABETES + BPMEDS + PREVMI +
PREVHYP + AGE + DIABP + rcs(HEARTRTE, 4) + SEX:PREVMI,
x=T,
y=T,
data=fr.missf_p1
)
boot2<-validate(step_final_model.lrm,
bw=FALSE,
B=100,
method="boot")
##
## Divergence or singularity in 1 samples
boot2
## index.orig training test optimism index.corrected n
## Dxy 0.5066 0.5138 0.4993 0.0145 0.4921 99
## R2 0.1510 0.1566 0.1449 0.0116 0.1393 99
## Intercept 0.0000 0.0000 -0.0761 0.0761 -0.0761 99
## Slope 1.0000 1.0000 0.9566 0.0434 0.9566 99
## Emax 0.0000 0.0000 0.0242 0.0242 0.0242 99
## D 0.0722 0.0747 0.0692 0.0055 0.0667 99
## U -0.0005 -0.0005 0.0001 -0.0006 0.0001 99
## Q 0.0727 0.0752 0.0691 0.0061 0.0666 99
## B 0.0778 0.0769 0.0783 -0.0014 0.0792 99
## g 1.0623 1.0810 1.0313 0.0498 1.0125 99
## gp 0.0846 0.0852 0.0827 0.0025 0.0821 99
corrected.AUC2<-boot2[1,5]*0.5 + 0.5 #AUC=Somer's D/2 + 0.5
corrected.AUC2
## [1] 0.7460305