1. Import dataset: anackd. This is baseline Su study dataset

dat = read.csv("C:/Users/Momo/Desktop/anackd.csv")

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
dat1 = dat %>% select(age, sex, sbp, dbp, pulse, tc, hdl, nonhdlc, tg, bs, 
                      rbc, hg, hct, tob, alc, dx_ht, dx_af, dx_ckd, dx_mi, dx_st, 
                      pyst, exer, kaidan, stress, BMI, BMIg, nonhdlc, diabetes, 
                      hypertension, metSg, eGFR, eGFRg, uprg, fast, dx_dm)

2. Create factors for categorical variables

dat1$sex    = as.factor(dat1$sex) # Sex: 1 is male, 2 is female
dat1$BMIg   = as.factor(dat1$BMIg)
dat1$eGFRg  = as.factor(dat1$eGFRg)
dat1$uprg   = as.factor(dat1$uprg) # Protein in urine: 0 is no protein in urine, 1 is trace, 2 is having protein in urine
dat1$tob    = as.factor(dat1$tob) # Smoke: 1 is current, 2 is quit, 3 is never smoke
dat1$alc    = as.factor(dat1$alc)
dat1$exer   = as.factor(dat1$exer)
dat1$kaidan = as.factor(dat1$kaidan)
dat1$dx_af  = as.factor(dat1$dx_af)
dat1$dx_ht  = as.factor(dat1$dx_ht)
dat1$dx_ckd = as.factor(dat1$dx_ckd)
dat1$dx_st  = as.factor(dat1$dx_st)
dat1$hypertension = as.factor(dat1$hypertension)
dat1$diabetes = as.factor(dat1$diabetes)
dat1$stress = as.factor(dat1$stress)
dat1$metSg  = as.factor(dat1$metSg)

# Create new variable: atrial fibrillation
dat1$af[dat1$dx_af == 0] = 0
dat1$af[dat1$dx_af == 1] = 1
dat1$af[dat1$dx_af == 2] = 1

# For stress
dat1$stressnew[dat1$stress == 1] = "No"
dat1$stressnew[dat1$stress == 2] = "Yes"
dat1$stressnew[dat1$stress == 9] = "Unknown"
dat1$stressnew = as.factor(dat1$stressnew)
dat1$stressnew = relevel(dat1$stressnew, ref = "No")

3. Table1: Characteristics of study participants who don’t have stroke at baseline

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~ age + sex + eGFR + eGFRg + uprg + dx_ckd + BMI + BMIg + tob + alc + 
          sbp + dbp + pulse + hypertension + dx_ht + bs + diabetes + 
          tc + hdl + nonhdlc + tg + metSg + hg + hct + rbc + dx_af + 
          dx_ckd + dx_st + pyst, data = dat1 )
Overall
(N=7342)
age
Mean (SD) 55.1 (13.1)
Median [Min, Max] 56.0 [30.0, 84.0]
sex
1 3357 (45.7%)
2 3985 (54.3%)
eGFR
Mean (SD) 77.9 (14.6)
Median [Min, Max] 79.0 [7.92, 179]
eGFRg
0 149 (2.0%)
1 754 (10.3%)
2 6439 (87.7%)
uprg
0 6025 (82.1%)
1 902 (12.3%)
2 415 (5.7%)
dx_ckd
0 7323 (99.7%)
1 8 (0.1%)
2 11 (0.1%)
BMI
Mean (SD) 22.5 (3.12)
Median [Min, Max] 22.3 [13.1, 40.5]
Missing 2 (0.0%)
BMIg
Normal 5282 (71.9%)
Obesity 112 (1.5%)
Overweight 1333 (18.2%)
Underweight 615 (8.4%)
tob
1 2124 (28.9%)
2 1153 (15.7%)
3 3941 (53.7%)
Missing 124 (1.7%)
alc
1 3776 (51.4%)
2 168 (2.3%)
3 3283 (44.7%)
Missing 115 (1.6%)
sbp
Mean (SD) 126 (21.5)
Median [Min, Max] 124 [78.0, 233]
dbp
Mean (SD) 77.6 (12.2)
Median [Min, Max] 77.0 [0, 137]
Missing 1 (0.0%)
pulse
Mean (SD) 69.5 (9.48)
Median [Min, Max] 68.0 [38.0, 128]
Missing 18 (0.2%)
hypertension
0 5072 (69.1%)
1 2270 (30.9%)
dx_ht
0 5970 (81.3%)
1 547 (7.5%)
2 825 (11.2%)
bs
Mean (SD) 98.3 (18.6)
Median [Min, Max] 95.0 [51.0, 439]
diabetes
0 6850 (93.3%)
1 492 (6.7%)
tc
Mean (SD) 207 (36.1)
Median [Min, Max] 206 [75.0, 483]
hdl
Mean (SD) 54.7 (14.3)
Median [Min, Max] 53.0 [10.0, 132]
Missing 28 (0.4%)
nonhdlc
Mean (SD) 153 (36.9)
Median [Min, Max] 150 [22.0, 403]
Missing 28 (0.4%)
tg
Mean (SD) 122 (90.5)
Median [Min, Max] 99.0 [15.0, 1540]
metSg
0 5756 (78.4%)
1 1586 (21.6%)
hg
Mean (SD) 13.9 (1.55)
Median [Min, Max] 13.9 [5.70, 20.6]
Missing 3 (0.0%)
hct
Mean (SD) 41.8 (4.25)
Median [Min, Max] 41.7 [12.4, 61.8]
Missing 3 (0.0%)
rbc
Mean (SD) 4.53 (0.434)
Median [Min, Max] 4.51 [2.61, 7.29]
Missing 4 (0.1%)
dx_af
0 7312 (99.6%)
1 17 (0.2%)
2 13 (0.2%)
dx_st
0 6910 (94.1%)
1 432 (5.9%)
pyst
Mean (SD) 15.1 (7.06)
Median [Min, Max] 16.7 [0.0862, 24.3]
table1(~ age + factor(sex) + eGFR + eGFRg + uprg + dx_ckd + BMI + BMIg + tob + alc + 
          sbp + dbp + pulse + hypertension + dx_ht + bs + diabetes + 
          tc + hdl + nonhdlc + tg + metSg + hg + hct + rbc + dx_af + 
          dx_ckd + pyst + stress|dx_st, data = dat1 )
0
(N=6910)
1
(N=432)
Overall
(N=7342)
age
Mean (SD) 54.5 (13.1) 64.3 (9.73) 55.1 (13.1)
Median [Min, Max] 55.0 [30.0, 84.0] 66.0 [32.0, 80.0] 56.0 [30.0, 84.0]
factor(sex)
1 3128 (45.3%) 229 (53.0%) 3357 (45.7%)
2 3782 (54.7%) 203 (47.0%) 3985 (54.3%)
eGFR
Mean (SD) 78.4 (14.5) 70.5 (13.6) 77.9 (14.6)
Median [Min, Max] 79.6 [7.92, 179] 72.7 [27.7, 105] 79.0 [7.92, 179]
eGFRg
0 130 (1.9%) 19 (4.4%) 149 (2.0%)
1 676 (9.8%) 78 (18.1%) 754 (10.3%)
2 6104 (88.3%) 335 (77.5%) 6439 (87.7%)
uprg
0 5712 (82.7%) 313 (72.5%) 6025 (82.1%)
1 828 (12.0%) 74 (17.1%) 902 (12.3%)
2 370 (5.4%) 45 (10.4%) 415 (5.7%)
dx_ckd
0 6892 (99.7%) 431 (99.8%) 7323 (99.7%)
1 8 (0.1%) 0 (0%) 8 (0.1%)
2 10 (0.1%) 1 (0.2%) 11 (0.1%)
BMI
Mean (SD) 22.5 (3.10) 23.2 (3.33) 22.5 (3.12)
Median [Min, Max] 22.3 [13.1, 40.5] 23.0 [13.9, 35.3] 22.3 [13.1, 40.5]
Missing 1 (0.0%) 1 (0.2%) 2 (0.0%)
BMIg
Normal 4987 (72.2%) 295 (68.3%) 5282 (71.9%)
Obesity 100 (1.4%) 12 (2.8%) 112 (1.5%)
Overweight 1237 (17.9%) 96 (22.2%) 1333 (18.2%)
Underweight 586 (8.5%) 29 (6.7%) 615 (8.4%)
tob
1 1985 (28.7%) 139 (32.2%) 2124 (28.9%)
2 1069 (15.5%) 84 (19.4%) 1153 (15.7%)
3 3746 (54.2%) 195 (45.1%) 3941 (53.7%)
Missing 110 (1.6%) 14 (3.2%) 124 (1.7%)
alc
1 3554 (51.4%) 222 (51.4%) 3776 (51.4%)
2 152 (2.2%) 16 (3.7%) 168 (2.3%)
3 3100 (44.9%) 183 (42.4%) 3283 (44.7%)
Missing 104 (1.5%) 11 (2.5%) 115 (1.6%)
sbp
Mean (SD) 126 (21.1) 139 (24.2) 126 (21.5)
Median [Min, Max] 123 [78.0, 233] 137 [86.0, 223] 124 [78.0, 233]
dbp
Mean (SD) 77.4 (12.0) 81.2 (13.3) 77.6 (12.2)
Median [Min, Max] 77.0 [0, 130] 81.0 [19.0, 137] 77.0 [0, 137]
Missing 1 (0.0%) 0 (0%) 1 (0.0%)
pulse
Mean (SD) 69.4 (9.38) 71.0 (10.9) 69.5 (9.48)
Median [Min, Max] 68.0 [38.0, 128] 68.0 [40.0, 120] 68.0 [38.0, 128]
Missing 16 (0.2%) 2 (0.5%) 18 (0.2%)
hypertension
0 4877 (70.6%) 195 (45.1%) 5072 (69.1%)
1 2033 (29.4%) 237 (54.9%) 2270 (30.9%)
dx_ht
0 5701 (82.5%) 269 (62.3%) 5970 (81.3%)
1 491 (7.1%) 56 (13.0%) 547 (7.5%)
2 718 (10.4%) 107 (24.8%) 825 (11.2%)
bs
Mean (SD) 97.9 (17.7) 105 (28.6) 98.3 (18.6)
Median [Min, Max] 95.0 [51.0, 439] 99.0 [76.0, 394] 95.0 [51.0, 439]
diabetes
0 6486 (93.9%) 364 (84.3%) 6850 (93.3%)
1 424 (6.1%) 68 (15.7%) 492 (6.7%)
tc
Mean (SD) 207 (36.1) 211 (36.2) 207 (36.1)
Median [Min, Max] 205 [75.0, 483] 211 [107, 331] 206 [75.0, 483]
hdl
Mean (SD) 54.8 (14.3) 52.5 (14.0) 54.7 (14.3)
Median [Min, Max] 53.0 [10.0, 132] 51.0 [26.0, 99.0] 53.0 [10.0, 132]
Missing 22 (0.3%) 6 (1.4%) 28 (0.4%)
nonhdlc
Mean (SD) 152 (36.9) 159 (36.9) 153 (36.9)
Median [Min, Max] 150 [22.0, 403] 156 [78.0, 283] 150 [22.0, 403]
Missing 22 (0.3%) 6 (1.4%) 28 (0.4%)
tg
Mean (SD) 121 (89.0) 139 (111) 122 (90.5)
Median [Min, Max] 98.0 [15.0, 1540] 112 [36.0, 1510] 99.0 [15.0, 1540]
metSg
0 5488 (79.4%) 268 (62.0%) 5756 (78.4%)
1 1422 (20.6%) 164 (38.0%) 1586 (21.6%)
hg
Mean (SD) 13.9 (1.55) 14.1 (1.47) 13.9 (1.55)
Median [Min, Max] 13.8 [5.70, 20.6] 14.2 [7.00, 19.0] 13.9 [5.70, 20.6]
Missing 3 (0.0%) 0 (0%) 3 (0.0%)
hct
Mean (SD) 41.7 (4.26) 42.3 (4.07) 41.8 (4.25)
Median [Min, Max] 41.6 [12.4, 61.8] 42.4 [21.5, 53.8] 41.7 [12.4, 61.8]
Missing 3 (0.0%) 0 (0%) 3 (0.0%)
rbc
Mean (SD) 4.53 (0.435) 4.54 (0.418) 4.53 (0.434)
Median [Min, Max] 4.51 [2.61, 7.29] 4.54 [3.29, 5.79] 4.51 [2.61, 7.29]
Missing 4 (0.1%) 0 (0%) 4 (0.1%)
dx_af
0 6888 (99.7%) 424 (98.1%) 7312 (99.6%)
1 13 (0.2%) 4 (0.9%) 17 (0.2%)
2 9 (0.1%) 4 (0.9%) 13 (0.2%)
pyst
Mean (SD) 15.4 (6.99) 9.89 (6.10) 15.1 (7.06)
Median [Min, Max] 17.2 [0.0862, 24.3] 10.0 [0.103, 23.3] 16.7 [0.0862, 24.3]
stress
1 2300 (33.3%) 156 (36.1%) 2456 (33.5%)
2 2878 (41.6%) 136 (31.5%) 3014 (41.1%)
9 1060 (15.3%) 73 (16.9%) 1133 (15.4%)
Missing 672 (9.7%) 67 (15.5%) 739 (10.1%)
# Examing the pattern of missing variables
library(rms)
## Warning: package 'rms' was built under R version 4.2.2
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 4.2.2
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 4.2.1
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
## 
##     label, label<-, units
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
na.patterns = naclus(dat1)
plot(na.patterns)

4. Logistic regression

4.1: Model 1: adjusted for eGFR + uprg + tob + alc + age + sex

# Change the reference;
dat1$tob=relevel(dat1$tob, ref = "3")
dat1$alc=relevel(dat1$alc, ref = "3")

logreg1 = glm(dx_st ~  eGFR + uprg + tob + alc + age + sex, family=binomial, data=dat1)
library(epiDisplay)
## Loading required package: foreign
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: nnet
## Registered S3 method overwritten by 'epiDisplay':
##   method       from
##   print.lrtest rms
## 
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:rms':
## 
##     lrtest
## The following object is masked from 'package:lattice':
## 
##     dotplot
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# summary(logreg1)
logistic.display(logreg1)
## 
## Logistic regression predicting dx_st : 1 vs 0 
##  
##                   crude OR(95%CI)         adj. OR(95%CI)         P(Wald's test)
## eGFR (cont. var.) 0.966 (0.9599,0.9722)   0.9907 (0.9824,0.999)  0.028         
##                                                                                
## uprg: ref.=0                                                                   
##    1              1.62 (1.24,2.12)        1.47 (1.11,1.95)       0.007         
##    2              2.32 (1.67,3.23)        1.68 (1.19,2.37)       0.003         
##                                                                                
## tob: ref.=3                                                                    
##    1              1.35 (1.08,1.69)        1.43 (1.07,1.92)       0.016         
##    2              1.51 (1.16,1.97)        1.01 (0.72,1.42)       0.943         
##                                                                                
## alc: ref.=3                                                                    
##    1              1.06 (0.87,1.3)         1.21 (0.96,1.54)       0.11          
##    2              1.8 (1.05,3.07)         1.19 (0.68,2.1)        0.541         
##                                                                                
## age (cont. var.)  1.07 (1.06,1.08)        1.06 (1.05,1.07)       < 0.001       
##                                                                                
## sex: 2 vs 1       0.7015 (0.5752,0.8555)  0.9989 (0.748,1.3341)  0.994         
##                                                                                
##                   P(LR-test)
## eGFR (cont. var.) 0.03      
##                             
## uprg: ref.=0      0.001     
##    1                        
##    2                        
##                             
## tob: ref.=3       0.016     
##    1                        
##    2                        
##                             
## alc: ref.=3       0.274     
##    1                        
##    2                        
##                             
## age (cont. var.)  < 0.001   
##                             
## sex: 2 vs 1       0.994     
##                             
## Log-likelihood = -1455.081
## No. of observations = 7194
## AIC value = 2930.1621
# Figure of OR (95% CI) of developing Stroke;
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggcoef(logreg1, exponentiate=T, exclude_intercept=T, vline_color = "red", 
       errorbar_color = "blue", errorbar_height = 0.10)

4.2: Model 2: Model 1 + hypertension + DM + MetSg + af

logreg2 = glm(dx_st ~  eGFR + uprg + tob + alc + age + sex + 
                 hypertension + diabetes + metSg + af, family=binomial, data=dat1)
logistic.display(logreg2)
## 
## Logistic regression predicting dx_st : 1 vs 0 
##  
##                      crude OR(95%CI)         adj. OR(95%CI)         
## eGFR (cont. var.)    0.966 (0.9599,0.9722)   0.9908 (0.9825,0.9991) 
##                                                                     
## uprg: ref.=0                                                        
##    1                 1.62 (1.24,2.12)        1.42 (1.07,1.88)       
##    2                 2.32 (1.67,3.23)        1.25 (0.87,1.79)       
##                                                                     
## tob: ref.=3                                                         
##    1                 1.35 (1.08,1.69)        1.5 (1.12,2.02)        
##    2                 1.5124 (1.1605,1.9711)  1.0037 (0.7116,1.4156) 
##                                                                     
## alc: ref.=3                                                         
##    1                 1.06 (0.87,1.3)         1.17 (0.92,1.49)       
##    2                 1.8 (1.05,3.07)         1.19 (0.67,2.11)       
##                                                                     
## age (cont. var.)     1.07 (1.06,1.08)        1.05 (1.04,1.07)       
##                                                                     
## sex: 2 vs 1          0.7015 (0.5752,0.8555)  1.0082 (0.7515,1.3526) 
##                                                                     
## hypertension: 1 vs 0 2.96 (2.42,3.61)        1.68 (1.35,2.1)        
##                                                                     
## diabetes: 1 vs 0     2.79 (2.1,3.71)         1.69 (1.24,2.3)        
##                                                                     
## metSg: 1 vs 0        2.41 (1.96,2.96)        1.45 (1.16,1.82)       
##                                                                     
## af: 1 vs 0           6.29 (2.77,14.29)       3.34 (1.41,7.9)        
##                                                                     
##                      P(Wald's test) P(LR-test)
## eGFR (cont. var.)    0.03           0.032     
##                                               
## uprg: ref.=0                        0.042     
##    1                 0.015                    
##    2                 0.228                    
##                                               
## tob: ref.=3                         0.005     
##    1                 0.007                    
##    2                 0.983                    
##                                               
## alc: ref.=3                         0.426     
##    1                 0.201                    
##    2                 0.552                    
##                                               
## age (cont. var.)     < 0.001        < 0.001   
##                                               
## sex: 2 vs 1          0.957          0.957     
##                                               
## hypertension: 1 vs 0 < 0.001        < 0.001   
##                                               
## diabetes: 1 vs 0     < 0.001        0.001     
##                                               
## metSg: 1 vs 0        0.001          0.001     
##                                               
## af: 1 vs 0           0.006          0.012     
##                                               
## Log-likelihood = -1422.9374
## No. of observations = 7194
## AIC value = 2873.8747
# Figure of OR (95% CI) of developing Stroke;
ggcoef(logreg2, exponentiate=T, exclude_intercept=T, vline_color = "red", 
       errorbar_color = "blue", errorbar_height = 0.10)

4.3: Full adjusted model: Model 3 + tc + tg + nonhdlc

logreg3 = glm(dx_st ~  eGFR + uprg + tob + alc + age + sex + hypertension + stressnew +
                 diabetes + metSg + af + tc + tg + nonhdlc, family=binomial, data=dat1)
logistic.display(logreg3)
## 
## Logistic regression predicting dx_st : 1 vs 0 
##  
##                      crude OR(95%CI)         adj. OR(95%CI)         
## eGFR (cont. var.)    0.967 (0.9603,0.9738)   0.9919 (0.9827,1.0012) 
##                                                                     
## uprg: ref.=0                                                        
##    1                 1.76 (1.32,2.36)        1.65 (1.21,2.24)       
##    2                 2.34 (1.64,3.33)        1.24 (0.84,1.82)       
##                                                                     
## tob: ref.=3                                                         
##    1                 1.39 (1.09,1.77)        1.45 (1.05,1.99)       
##    2                 1.42 (1.06,1.91)        0.88 (0.6,1.28)        
##                                                                     
## alc: ref.=3                                                         
##    1                 1.09 (0.87,1.35)        1.2 (0.92,1.56)        
##    2                 1.63 (0.84,3.17)        1.16 (0.57,2.36)       
##                                                                     
## age (cont. var.)     1.06 (1.05,1.07)        1.05 (1.04,1.07)       
##                                                                     
## sex: 2 vs 1          0.67 (0.54,0.84)        0.91 (0.65,1.25)       
##                                                                     
## hypertension: 1 vs 0 2.86 (2.31,3.55)        1.64 (1.29,2.09)       
##                                                                     
## stressnew: ref.=No                                                  
##    Unknown           1.04 (0.78,1.4)         1.41 (1.05,1.92)       
##    Yes               0.7 (0.55,0.89)         1.14 (0.89,1.48)       
##                                                                     
## diabetes: 1 vs 0     2.89 (2.13,3.93)        1.78 (1.27,2.48)       
##                                                                     
## metSg: 1 vs 0        2.43 (1.94,3.04)        1.44 (1.08,1.92)       
##                                                                     
## af: 1 vs 0           6.14 (2.58,14.62)       3.3 (1.32,8.24)        
##                                                                     
## tc (cont. var.)      1.0035 (1.0005,1.0064)  1.0001 (0.9909,1.0093) 
##                                                                     
## tg (cont. var.)      1.0015 (1.0007,1.0024)  1.0001 (0.9987,1.0014) 
##                                                                     
## nonhdlc (cont. var.) 1.0052 (1.0023,1.008)   1.0015 (0.992,1.011)   
##                                                                     
##                      P(Wald's test) P(LR-test)
## eGFR (cont. var.)    0.087          0.09      
##                                               
## uprg: ref.=0                        0.007     
##    1                 0.001                    
##    2                 0.276                    
##                                               
## tob: ref.=3                         0.005     
##    1                 0.024                    
##    2                 0.504                    
##                                               
## alc: ref.=3                         0.403     
##    1                 0.18                     
##    2                 0.672                    
##                                               
## age (cont. var.)     < 0.001        < 0.001   
##                                               
## sex: 2 vs 1          0.553          0.553     
##                                               
## hypertension: 1 vs 0 < 0.001        < 0.001   
##                                               
## stressnew: ref.=No                  0.085     
##    Unknown           0.025                    
##    Yes               0.299                    
##                                               
## diabetes: 1 vs 0     < 0.001        0.001     
##                                               
## metSg: 1 vs 0        0.013          0.013     
##                                               
## af: 1 vs 0           0.01           0.019     
##                                               
## tc (cont. var.)      0.987          0.987     
##                                               
## tg (cont. var.)      0.939          0.939     
##                                               
## nonhdlc (cont. var.) 0.763          0.762     
##                                               
## Log-likelihood = -1230.9367
## No. of observations = 6479
## AIC value = 2499.8735
# Figure of OR (95% CI) of developing Stroke;
ggcoef(logreg3, exponentiate=T, exclude_intercept=T, vline_color = "red", 
       errorbar_color = "blue", errorbar_height = 0.10)

# 4.4. Choose the best model by BMA

library(BMA)
## Warning: package 'BMA' was built under R version 4.2.2
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-0)
yvar = dat1[, "dx_st"]
# xvars = dat1[, c("eGFR", "eGFRg", "uprg", "age", "sex", "sbp", "dbp", "pulse", "tc", "hdl", "tg", "bs", "rbc", "hg", "hct", "tob", "alc", "dx_ht", "dx_af", "dx_ckd", "dx_mi", "pyst", "exer", "kaidan", "stress", "BMI", "BMIg", "nonhdlc", "diabetes", "hypertension", "metSg" )]

xvars = dat1[, c("eGFR", "uprg", "age", "sex", "sbp", "hdl", "bs", "tob", "af", "kaidan", "stress", "BMI", "BMIg", "diabetes", "hypertension", "metSg" )]

bma = bic.glm(xvars, yvar, glm.family = "binomial")
## Warning in bic.glm.data.frame(xvars, yvar, glm.family = "binomial"): There were
## 989 records deleted due to NA's
summary(bma)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial")
## 
## 
##   21  models were selected
##  Best  5  models (cumulative posterior probability =  0.5175 ): 
## 
##                   p!=0    EV         SD        model 1     model 2   
## Intercept         100    -8.5847795  0.818150  -8.704e+00  -9.104e+00
## eGFR                2.1  -0.0002321  0.001720       .           .    
## uprg                0.0                                              
##     .1                    0.0000000  0.000000       .           .    
##     .2                    0.0000000  0.000000       .           .    
## age               100.0   0.0536071  0.005729   5.254e-02   5.475e-02
## sex                10.5                                              
##    .2                    -0.0315872  0.099499       .           .    
## sbp                91.0   0.0123332  0.004705   1.393e-02   1.467e-02
## hdl                 0.0   0.0000000  0.000000       .           .    
## bs                 79.1   0.0069436  0.004037   9.170e-03   9.135e-03
## tob                42.6                                              
##    .1                     0.2227354  0.271884       .       5.229e-01
##    .2                     0.0190803  0.104096       .       4.289e-02
## af                  1.1   0.0106636  0.114698       .           .    
## kaidan              0.0                                              
##       .2                  0.0000000  0.000000       .           .    
##       .3                  0.0000000  0.000000       .           .    
##       .4                  0.0000000  0.000000       .           .    
##       .5                  0.0000000  0.000000       .           .    
## stress              0.0                                              
##       .2                  0.0000000  0.000000       .           .    
##       .9                  0.0000000  0.000000       .           .    
## BMI                13.8   0.0064245  0.017336       .           .    
## BMIg                0.0                                              
##     .Obesity              0.0000000  0.000000       .           .    
##     .Overweight           0.0000000  0.000000       .           .    
##     .Underweight          0.0000000  0.000000       .           .    
## diabetes           19.4                                              
##         .1                0.1228145  0.261449       .           .    
## hypertension        9.9                                              
##             .1            0.0554267  0.174021       .           .    
## metSg              28.1                                              
##      .1                   0.1000332  0.173838       .           .    
##                                                                      
## nVar                                              3           4      
## BIC                                            -5.315e+04  -5.315e+04
## post prob                                       0.174       0.130    
##                   model 3     model 4     model 5   
## Intercept         -8.434e+00  -7.855e+00  -8.501e+00
## eGFR                   .           .           .    
## uprg                                                
##     .1                 .           .           .    
##     .2                 .           .           .    
## age                5.236e-02   5.192e-02   5.197e-02
## sex                                                 
##    .2                  .           .      -2.947e-01
## sbp                1.216e-02   1.427e-02   1.402e-02
## hdl                    .           .           .    
## bs                 7.924e-03       .       8.807e-03
## tob                                                 
##    .1                  .           .           .    
##    .2                  .           .           .    
## af                     .           .           .    
## kaidan                                              
##       .2               .           .           .    
##       .3               .           .           .    
##       .4               .           .           .    
##       .5               .           .           .    
## stress                                              
##       .2               .           .           .    
##       .9               .           .           .    
## BMI                    .           .           .    
## BMIg                                                
##     .Obesity           .           .           .    
##     .Overweight        .           .           .    
##     .Underweight       .           .           .    
## diabetes                                            
##         .1             .       6.841e-01       .    
## hypertension                                        
##             .1         .           .           .    
## metSg                                               
##      .1            3.452e-01       .           .    
##                                                     
## nVar                 4           3           4      
## BIC               -5.315e+04  -5.315e+04  -5.315e+04
## post prob          0.083       0.069       0.062
imageplot.bma(bma)

Reduced MODEL
logreg = glm(dx_st ~ I(age/10) + I(sbp/10) + sex + BMI + tob + stressnew + 
                 I(eGFR/10) + I(bs/10) + af + diabetes, 
             family=binomial, data=dat1)
                
summary(logreg)
## 
## Call:
## glm(formula = dx_st ~ I(age/10) + I(sbp/10) + sex + BMI + tob + 
##     stressnew + I(eGFR/10) + I(bs/10) + af + diabetes, family = binomial, 
##     data = dat1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0578  -0.3737  -0.2590  -0.1736   3.2173  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -8.57312    0.82421 -10.402  < 2e-16 ***
## I(age/10)         0.48942    0.06247   7.835 4.70e-15 ***
## I(sbp/10)         0.13224    0.02602   5.083 3.72e-07 ***
## sex2             -0.11993    0.15453  -0.776  0.43770    
## BMI               0.04762    0.01761   2.704  0.00684 ** 
## tob1              0.44794    0.16146   2.774  0.00553 ** 
## tob2             -0.09933    0.19099  -0.520  0.60301    
## stressnewUnknown  0.36626    0.15432   2.373  0.01763 *  
## stressnewYes      0.13503    0.13000   1.039  0.29896    
## I(eGFR/10)       -0.11295    0.04672  -2.418  0.01562 *  
## I(bs/10)          0.06182    0.02741   2.256  0.02410 *  
## af                1.02236    0.48918   2.090  0.03662 *  
## diabetes1         0.36074    0.21303   1.693  0.09039 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2754.3  on 6497  degrees of freedom
## Residual deviance: 2473.7  on 6485  degrees of freedom
##   (844 observations deleted due to missingness)
## AIC: 2499.7
## 
## Number of Fisher Scoring iterations: 6
logistic.display(logreg)
##  
##                         OR lower95ci upper95ci     Pr(>|Z|)
## I(age/10)        1.6313713 1.4433767 1.8438515 4.699389e-15
## I(sbp/10)        1.1413773 1.0846362 1.2010868 3.719270e-07
## sex2             0.8869850 0.6552121 1.2007444 4.376967e-01
## BMI              1.0487713 1.0131940 1.0855978 6.843365e-03
## tob1             1.5650919 1.1405248 2.1477066 5.531462e-03
## tob2             0.9054432 0.6227129 1.3165416 6.030107e-01
## stressnewUnknown 1.4423373 1.0658765 1.9517617 1.762678e-02
## stressnewYes     1.1445656 0.8871250 1.4767146 2.989638e-01
## I(eGFR/10)       0.8931987 0.8150440 0.9788478 1.562420e-02
## I(bs/10)         1.0637735 1.0081343 1.1224833 2.409957e-02
## af               2.7797383 1.0656499 7.2509229 3.662257e-02
## diabetes1        1.4343901 0.9447820 2.1777243 9.039150e-02
Backward logistic regression ]
FINAL
logreg = glm(dx_st ~ I(age/10) + I(sbp/10) + sex + BMI + tob + stressnew + 
                 I(eGFR/10) + I(bs/10) + af + diabetes, family=binomial, data=dat1)
summary(logreg)
## 
## Call:
## glm(formula = dx_st ~ I(age/10) + I(sbp/10) + sex + BMI + tob + 
##     stressnew + I(eGFR/10) + I(bs/10) + af + diabetes, family = binomial, 
##     data = dat1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0578  -0.3737  -0.2590  -0.1736   3.2173  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -8.57312    0.82421 -10.402  < 2e-16 ***
## I(age/10)         0.48942    0.06247   7.835 4.70e-15 ***
## I(sbp/10)         0.13224    0.02602   5.083 3.72e-07 ***
## sex2             -0.11993    0.15453  -0.776  0.43770    
## BMI               0.04762    0.01761   2.704  0.00684 ** 
## tob1              0.44794    0.16146   2.774  0.00553 ** 
## tob2             -0.09933    0.19099  -0.520  0.60301    
## stressnewUnknown  0.36626    0.15432   2.373  0.01763 *  
## stressnewYes      0.13503    0.13000   1.039  0.29896    
## I(eGFR/10)       -0.11295    0.04672  -2.418  0.01562 *  
## I(bs/10)          0.06182    0.02741   2.256  0.02410 *  
## af                1.02236    0.48918   2.090  0.03662 *  
## diabetes1         0.36074    0.21303   1.693  0.09039 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2754.3  on 6497  degrees of freedom
## Residual deviance: 2473.7  on 6485  degrees of freedom
##   (844 observations deleted due to missingness)
## AIC: 2499.7
## 
## Number of Fisher Scoring iterations: 6
logistic.display(logreg)
##  
##                         OR lower95ci upper95ci     Pr(>|Z|)
## I(age/10)        1.6313713 1.4433767 1.8438515 4.699389e-15
## I(sbp/10)        1.1413773 1.0846362 1.2010868 3.719270e-07
## sex2             0.8869850 0.6552121 1.2007444 4.376967e-01
## BMI              1.0487713 1.0131940 1.0855978 6.843365e-03
## tob1             1.5650919 1.1405248 2.1477066 5.531462e-03
## tob2             0.9054432 0.6227129 1.3165416 6.030107e-01
## stressnewUnknown 1.4423373 1.0658765 1.9517617 1.762678e-02
## stressnewYes     1.1445656 0.8871250 1.4767146 2.989638e-01
## I(eGFR/10)       0.8931987 0.8150440 0.9788478 1.562420e-02
## I(bs/10)         1.0637735 1.0081343 1.1224833 2.409957e-02
## af               2.7797383 1.0656499 7.2509229 3.662257e-02
## diabetes1        1.4343901 0.9447820 2.1777243 9.039150e-02
ggcoef(logreg, exponentiate=T, exclude_intercept=T, vline_color = "red", 
       errorbar_color = "blue", errorbar_height = 0.10)

5. Evaluate the performance of model

The performace of the developed models was assessed in term of

5.1 Discrimination

library(Epi)
## Warning: package 'Epi' was built under R version 4.2.2
ROC(form = dx_st ~ I(age/10) + I(sbp/10) + sex + BMI + tob + stressnew + 
                 I(eGFR/10) + I(bs/10) + af + diabetes, data=dat1, plot = "ROC", lwd=2) 

# The performance of the final model demonstrated an acceptable discrimination with area under the curve (AUC) of 75 %. 

# varImp function (help to weight the predictors, Weighted predictors analysis)
library(caret)
## Warning: package 'caret' was built under R version 4.2.1
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
varImp(logreg, scale = F)
##                    Overall
## I(age/10)        7.8347066
## I(sbp/10)        5.0827920
## sex2             0.7760885
## BMI              2.7043719
## tob1             2.7743354
## tob2             0.5200759
## stressnewUnknown 2.3733663
## stressnewYes     1.0386581
## I(eGFR/10)       2.4175777
## I(bs/10)         2.2555381
## af               2.0899469
## diabetes1        1.6933361

5.2 Calibration and Validation by Bootstrap.

library(rms)
moboostrap = lrm(dx_st ~ age + sbp + sex + BMI + tob + stressnew + 
                 eGFR + bs + af + diabetes, data=dat1, x = TRUE, y = TRUE)

cal = calibrate(moboostrap, method = "boot", B=100, legend = T, digits = 3, subtitles = T) 
## 
## Divergence or singularity in 6 samples
plot(cal, xlab = "Predicted probability of Stroke", ylab = "Observation Proportion")

## 
## n=6498   Mean absolute error=0.006   Mean squared error=1e-04
## 0.9 Quantile of absolute error=0.011
validate = validate(moboostrap, B = 100)
## 
## Divergence or singularity in 3 samples
validate
##           index.orig training    test optimism index.corrected  n
## Dxy           0.5035   0.5104  0.4959   0.0146          0.4889 97
## R2            0.1223   0.1270  0.1178   0.0092          0.1131 97
## Intercept     0.0000   0.0000 -0.1062   0.1062         -0.1062 97
## Slope         1.0000   1.0000  0.9588   0.0412          0.9588 97
## Emax          0.0000   0.0000  0.0306   0.0306          0.0306 97
## D             0.0430   0.0448  0.0414   0.0035          0.0396 97
## U            -0.0003  -0.0003  0.0000  -0.0004          0.0000 97
## Q             0.0433   0.0451  0.0413   0.0038          0.0395 97
## B             0.0492   0.0492  0.0494  -0.0002          0.0494 97
## g             1.1274   1.1469  1.0952   0.0517          1.0757 97
## gp            0.0513   0.0522  0.0503   0.0019          0.0494 97

5.3 Cross-validation by split dataset into train and test

library(caret)

library(caret)
sample = createDataPartition(dat1$dx_st, p = 0.7, list = FALSE)
train = dat1[sample,]
test = dat1[-sample,]

modelval = train(dx_st ~ eGFR +  bs + age + sbp + bs + sex + diabetes + af, data = train, method = "glm", family = "binomial")

modelval
## Generalized Linear Model 
## 
## 5140 samples
##    7 predictor
##    2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 5140, 5140, 5140, 5140, 5140, 5140, ... 
## Resampling results:
## 
##   Accuracy   Kappa      
##   0.9422405  0.005552353
summary(modelval)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0314  -0.3857  -0.2694  -0.1817   3.0444  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.049528   0.745020  -9.462  < 2e-16 ***
## eGFR        -0.012643   0.004853  -2.605  0.00918 ** 
## bs           0.006713   0.002787   2.409  0.01601 *  
## age          0.042978   0.006657   6.456 1.07e-10 ***
## sbp          0.014845   0.002760   5.378 7.54e-08 ***
## sex2        -0.117234   0.122887  -0.954  0.34008    
## diabetes1    0.524312   0.221049   2.372  0.01770 *  
## af           0.920538   0.603700   1.525  0.12730    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2303.4  on 5139  degrees of freedom
## Residual deviance: 2069.8  on 5132  degrees of freedom
## AIC: 2085.8
## 
## Number of Fisher Scoring iterations: 6
# How to interpret the output

#6. Nomogram for Stroke prediction (Probability of Stroke)

library(rms)
ddist = datadist(dat1)
options(datadist = "ddist")
m1 = lrm(dx_st ~  age + eGFR + BMI + tob + sex + sbp + uprg +  
            af + diabetes, data = dat1)
p = nomogram(m1, fun = function(x)1/(1+exp(-x)), fun.at = c(0.001, 0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99, 0.999), funlabel = "Probability of Stroke")

plot(p, cex.axis = 0.6, lmgp = 0.2)