#1. Import dataset: anackd. This is baseline Suita study dataset

dat = read.csv("C:/Users/thien/OneDrive/Desktop/anackd.csv")

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.4.0     v purrr   0.3.4
## v tibble  3.1.8     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
dat1 = dat %>% select(age, sex, sbp, dbp, pulse, tc, hdl, nonhdlc, tg, bs, rbc, hg, hct, tob, alc, dx_ht, dx_af, dx_ckd, dx_mi, dx_st, pyst, exer, kaidan, stress, BMI, BMIg, nonhdlc, diabetes, hypertension, metSg, eGFR, eGFRg, uprg, fast, dx_dm)

#2. Table1: Characteristics of study participants who don’t have stroke at baseline

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
table1(~ age + sex + eGFR + eGFRg + uprg + dx_ckd + BMI + BMIg + tob + alc + sbp + dbp + pulse + hypertension + dx_ht + bs + diabetes + tc + hdl + nonhdlc + tg + metSg + hg + hct + rbc + dx_af + dx_ckd + dx_st + pyst, data = dat1 )
Overall
(N=7342)
age
Mean (SD) 55.1 (13.1)
Median [Min, Max] 56.0 [30.0, 84.0]
sex
Mean (SD) 1.54 (0.498)
Median [Min, Max] 2.00 [1.00, 2.00]
eGFR
Mean (SD) 77.9 (14.6)
Median [Min, Max] 79.0 [7.92, 179]
eGFRg
Mean (SD) 1.86 (0.404)
Median [Min, Max] 2.00 [0, 2.00]
uprg
Mean (SD) 0.236 (0.542)
Median [Min, Max] 0 [0, 2.00]
dx_ckd
Mean (SD) 0.00409 (0.0841)
Median [Min, Max] 0 [0, 2.00]
BMI
Mean (SD) 22.5 (3.12)
Median [Min, Max] 22.3 [13.1, 40.5]
Missing 2 (0.0%)
BMIg
Normal 5282 (71.9%)
Obesity 112 (1.5%)
Overweight 1333 (18.2%)
Underweight 615 (8.4%)
tob
Mean (SD) 2.25 (0.881)
Median [Min, Max] 3.00 [1.00, 3.00]
Missing 124 (1.7%)
alc
Mean (SD) 1.93 (0.986)
Median [Min, Max] 1.00 [1.00, 3.00]
Missing 115 (1.6%)
sbp
Mean (SD) 126 (21.5)
Median [Min, Max] 124 [78.0, 233]
dbp
Mean (SD) 77.6 (12.2)
Median [Min, Max] 77.0 [0, 137]
Missing 1 (0.0%)
pulse
Mean (SD) 69.5 (9.48)
Median [Min, Max] 68.0 [38.0, 128]
Missing 18 (0.2%)
hypertension
Mean (SD) 0.309 (0.462)
Median [Min, Max] 0 [0, 1.00]
dx_ht
Mean (SD) 0.299 (0.659)
Median [Min, Max] 0 [0, 2.00]
bs
Mean (SD) 98.3 (18.6)
Median [Min, Max] 95.0 [51.0, 439]
diabetes
Mean (SD) 0.0670 (0.250)
Median [Min, Max] 0 [0, 1.00]
tc
Mean (SD) 207 (36.1)
Median [Min, Max] 206 [75.0, 483]
hdl
Mean (SD) 54.7 (14.3)
Median [Min, Max] 53.0 [10.0, 132]
Missing 28 (0.4%)
nonhdlc
Mean (SD) 153 (36.9)
Median [Min, Max] 150 [22.0, 403]
Missing 28 (0.4%)
tg
Mean (SD) 122 (90.5)
Median [Min, Max] 99.0 [15.0, 1540]
metSg
Mean (SD) 0.216 (0.412)
Median [Min, Max] 0 [0, 1.00]
hg
Mean (SD) 13.9 (1.55)
Median [Min, Max] 13.9 [5.70, 20.6]
Missing 3 (0.0%)
hct
Mean (SD) 41.8 (4.25)
Median [Min, Max] 41.7 [12.4, 61.8]
Missing 3 (0.0%)
rbc
Mean (SD) 4.53 (0.434)
Median [Min, Max] 4.51 [2.61, 7.29]
Missing 4 (0.1%)
dx_af
Mean (SD) 0.00586 (0.0968)
Median [Min, Max] 0 [0, 2.00]
dx_st
Mean (SD) 0.0588 (0.235)
Median [Min, Max] 0 [0, 1.00]
pyst
Mean (SD) 15.1 (7.06)
Median [Min, Max] 16.7 [0.0862, 24.3]

#3. Create factors for categorical variables

dat1$sex=as.factor(dat1$sex) # Sex: 1 is male, 2 is female
dat1$BMIg=as.factor(dat1$BMIg)
dat1$eGFRg=as.factor(dat1$eGFRg)
dat1$uprg=as.factor(dat1$uprg) # Protein in urine: 0 is no protein in urine, 1 is trace, 2 is having protein in urine
dat1$tob=as.factor(dat1$tob) # Smoke: 1 is current, 2 is quit, 3 is never smoke
dat1$alc=as.factor(dat1$alc)
dat1$exer=as.factor(dat1$exer)
dat1$kaidan=as.factor(dat1$kaidan)
dat1$dx_af=as.factor(dat1$dx_af)
dat1$dx_ht=as.factor(dat1$dx_ht)
dat1$dx_ckd=as.factor(dat1$dx_ckd)
dat1$dx_st=as.factor(dat1$dx_st)
dat1$hypertension=as.factor(dat1$hypertension)
dat1$diabetes=as.factor(dat1$diabetes)
dat1$stress=as.factor(dat1$stress)
dat1$metSg=as.factor(dat1$metSg)

# Create new variable
dat1$af[dat1$dx_af == 0] = 0
dat1$af[dat1$dx_af == 1] = 1
dat1$af[dat1$dx_af == 2] = 1

#4. Logistic regression ##4.1: Model 1: adjusted for eGFR + uprg + tob + alc + age + sex

# Change the reference;
dat1$tob=relevel(dat1$tob, ref = "3")
dat1$alc=relevel(dat1$alc, ref = "3")

logreg1 = glm(dx_st ~  eGFR + uprg + tob + alc + age + sex, family=binomial, data=dat1)
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
logistic.display(logreg1)
## 
## Logistic regression predicting dx_st : 1 vs 0 
##  
##                   crude OR(95%CI)         adj. OR(95%CI)         P(Wald's test)
## eGFR (cont. var.) 0.966 (0.9599,0.9722)   0.9907 (0.9824,0.999)  0.028         
##                                                                                
## uprg: ref.=0                                                                   
##    1              1.62 (1.24,2.12)        1.47 (1.11,1.95)       0.007         
##    2              2.32 (1.67,3.23)        1.68 (1.19,2.37)       0.003         
##                                                                                
## tob: ref.=3                                                                    
##    1              1.35 (1.08,1.69)        1.43 (1.07,1.92)       0.016         
##    2              1.51 (1.16,1.97)        1.01 (0.72,1.42)       0.943         
##                                                                                
## alc: ref.=3                                                                    
##    1              1.06 (0.87,1.3)         1.21 (0.96,1.54)       0.11          
##    2              1.8 (1.05,3.07)         1.19 (0.68,2.1)        0.541         
##                                                                                
## age (cont. var.)  1.07 (1.06,1.08)        1.06 (1.05,1.07)       < 0.001       
##                                                                                
## sex: 2 vs 1       0.7015 (0.5752,0.8555)  0.9989 (0.748,1.3341)  0.994         
##                                                                                
##                   P(LR-test)
## eGFR (cont. var.) 0.03      
##                             
## uprg: ref.=0      0.001     
##    1                        
##    2                        
##                             
## tob: ref.=3       0.016     
##    1                        
##    2                        
##                             
## alc: ref.=3       0.274     
##    1                        
##    2                        
##                             
## age (cont. var.)  < 0.001   
##                             
## sex: 2 vs 1       0.994     
##                             
## Log-likelihood = -1455.081
## No. of observations = 7194
## AIC value = 2930.1621
# Figure of OR (95% CI) of developing Stroke;
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
ggcoef(logreg1, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10)

# Discrimination
library(Epi)
ROC(form = dx_st ~  eGFR + uprg + tob + alc + age + sex, data=dat1, plot = "ROC", lwd=2)

##4.2: Model 2: Model 1 + hypertension + DM + MetSg + af

logreg2 = glm(dx_st ~  eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + af, family=binomial, data=dat1)
logistic.display(logreg2)
## 
## Logistic regression predicting dx_st : 1 vs 0 
##  
##                      crude OR(95%CI)         adj. OR(95%CI)         
## eGFR (cont. var.)    0.966 (0.9599,0.9722)   0.9908 (0.9825,0.9991) 
##                                                                     
## uprg: ref.=0                                                        
##    1                 1.62 (1.24,2.12)        1.42 (1.07,1.88)       
##    2                 2.32 (1.67,3.23)        1.25 (0.87,1.79)       
##                                                                     
## tob: ref.=3                                                         
##    1                 1.35 (1.08,1.69)        1.5 (1.12,2.02)        
##    2                 1.5124 (1.1605,1.9711)  1.0037 (0.7116,1.4156) 
##                                                                     
## alc: ref.=3                                                         
##    1                 1.06 (0.87,1.3)         1.17 (0.92,1.49)       
##    2                 1.8 (1.05,3.07)         1.19 (0.67,2.11)       
##                                                                     
## age (cont. var.)     1.07 (1.06,1.08)        1.05 (1.04,1.07)       
##                                                                     
## sex: 2 vs 1          0.7015 (0.5752,0.8555)  1.0082 (0.7515,1.3526) 
##                                                                     
## hypertension: 1 vs 0 2.96 (2.42,3.61)        1.68 (1.35,2.1)        
##                                                                     
## diabetes: 1 vs 0     2.79 (2.1,3.71)         1.69 (1.24,2.3)        
##                                                                     
## metSg: 1 vs 0        2.41 (1.96,2.96)        1.45 (1.16,1.82)       
##                                                                     
## af: 1 vs 0           6.29 (2.77,14.29)       3.34 (1.41,7.9)        
##                                                                     
##                      P(Wald's test) P(LR-test)
## eGFR (cont. var.)    0.03           0.032     
##                                               
## uprg: ref.=0                        0.042     
##    1                 0.015                    
##    2                 0.228                    
##                                               
## tob: ref.=3                         0.005     
##    1                 0.007                    
##    2                 0.983                    
##                                               
## alc: ref.=3                         0.426     
##    1                 0.201                    
##    2                 0.552                    
##                                               
## age (cont. var.)     < 0.001        < 0.001   
##                                               
## sex: 2 vs 1          0.957          0.957     
##                                               
## hypertension: 1 vs 0 < 0.001        < 0.001   
##                                               
## diabetes: 1 vs 0     < 0.001        0.001     
##                                               
## metSg: 1 vs 0        0.001          0.001     
##                                               
## af: 1 vs 0           0.006          0.012     
##                                               
## Log-likelihood = -1422.9374
## No. of observations = 7194
## AIC value = 2873.8747
# Figure of OR (95% CI) of developing Stroke;
library(GGally)
ggcoef(logreg2, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10)

# Discrimination
library(Epi)
ROC(form = dx_st ~  eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + dx_af, data=dat1, plot = "ROC", lwd=2)

##4.3: Full adjusted model: Model 3 + tc + tg + nonhdlc

logreg3 = glm(dx_st ~  eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + af + tc + tg + nonhdlc, family=binomial, data=dat1)
logistic.display(logreg3)
## 
## Logistic regression predicting dx_st : 1 vs 0 
##  
##                      crude OR(95%CI)         adj. OR(95%CI)         
## eGFR (cont. var.)    0.9665 (0.9603,0.9727)  0.9917 (0.9833,1.0002) 
##                                                                     
## uprg: ref.=0                                                        
##    1                 1.65 (1.26,2.16)        1.45 (1.1,1.93)        
##    2                 2.3 (1.65,3.21)         1.24 (0.87,1.79)       
##                                                                     
## tob: ref.=3                                                         
##    1                 1.34 (1.07,1.68)        1.47 (1.09,1.98)       
##    2                 1.51 (1.16,1.97)        0.99 (0.7,1.39)        
##                                                                     
## alc: ref.=3                                                         
##    1                 1.08 (0.88,1.32)        1.2 (0.94,1.54)        
##    2                 1.83 (1.07,3.13)        1.21 (0.68,2.15)       
##                                                                     
## age (cont. var.)     1.07 (1.06,1.08)        1.05 (1.04,1.07)       
##                                                                     
## sex: 2 vs 1          0.6953 (0.5694,0.849)   0.9913 (0.7322,1.3422) 
##                                                                     
## hypertension: 1 vs 0 2.91 (2.38,3.56)        1.65 (1.32,2.07)       
##                                                                     
## diabetes: 1 vs 0     2.79 (2.09,3.71)        1.69 (1.24,2.31)       
##                                                                     
## metSg: 1 vs 0        2.4 (1.95,2.95)         1.4 (1.07,1.84)        
##                                                                     
## af: 1 vs 0           6.35 (2.8,14.42)        3.36 (1.42,7.98)       
##                                                                     
## tc (cont. var.)      1.0029 (1.0002,1.0056)  1.0002 (0.9917,1.0087) 
##                                                                     
## tg (cont. var.)      1.0016 (1.0008,1.0024)  1.0003 (0.999,1.0015)  
##                                                                     
## nonhdlc (cont. var.) 1.0044 (1.0018,1.0071)  1.0006 (0.9918,1.0094) 
##                                                                     
##                      P(Wald's test) P(LR-test)
## eGFR (cont. var.)    0.056          0.058     
##                                               
## uprg: ref.=0                        0.03      
##    1                 0.009                    
##    2                 0.236                    
##                                               
## tob: ref.=3                         0.009     
##    1                 0.012                    
##    2                 0.941                    
##                                               
## alc: ref.=3                         0.338     
##    1                 0.147                    
##    2                 0.52                     
##                                               
## age (cont. var.)     < 0.001        < 0.001   
##                                               
## sex: 2 vs 1          0.955          0.955     
##                                               
## hypertension: 1 vs 0 < 0.001        < 0.001   
##                                               
## diabetes: 1 vs 0     < 0.001        0.001     
##                                               
## metSg: 1 vs 0        0.014          0.014     
##                                               
## af: 1 vs 0           0.006          0.012     
##                                               
## tc (cont. var.)      0.968          0.968     
##                                               
## tg (cont. var.)      0.668          0.673     
##                                               
## nonhdlc (cont. var.) 0.898          0.897     
##                                               
## Log-likelihood = -1409.83
## No. of observations = 7167
## AIC value = 2853.6601
# Figure of OR (95% CI) of developing Stroke;
library(GGally)
ggcoef(logreg3, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10)

# Discrimination
library(Epi)
ROC(form = dx_st ~  eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + dx_af+ tc + tg + nonhdlc, data=dat1, plot = "ROC", lwd=2) 

# varImp function (help to weight the predictors)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
## 
##     dotplot
## 
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
## 
##     cluster
## The following object is masked from 'package:purrr':
## 
##     lift
varImp(logreg3, scale = F)
##                  Overall
## eGFR          1.91241257
## uprg1         2.59818086
## uprg2         1.18408477
## tob1          2.51293033
## tob2          0.07343710
## alc1          1.44868220
## alc2          0.64305562
## age           9.25606049
## sex2          0.05633946
## hypertension1 4.38893974
## diabetes1     3.33084822
## metSg1        2.46959797
## af            2.75037146
## tc            0.04038902
## tg            0.42894747
## nonhdlc       0.12876569

#5a. Choose the best model by BMA

library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
## 
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
## 
##     heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-1)
yvar = dat1[, "dx_st"]
# xvars = dat1[, c("eGFR", "eGFRg", "uprg", "age", "sex", "sbp", "dbp", "pulse", "tc", "hdl", "tg", "bs", "rbc", "hg", "hct", "tob", "alc", "dx_ht", "dx_af", "dx_ckd", "dx_mi", "pyst", "exer", "kaidan", "stress", "BMI", "BMIg", "nonhdlc", "diabetes", "hypertension", "metSg" )]

xvars = dat1[, c("eGFR", "uprg", "age", "sex", "sbp", "dbp", "tc", "hdl", "tg", "bs",  "tob", "alc",  "dx_af", "kaidan", "stress", "BMI", "BMIg", "diabetes", "hypertension", "metSg" )]

bma = bic.glm(xvars, yvar, glm.family = "binomial")
## Warning in bic.glm.data.frame(xvars, yvar, glm.family = "binomial"): There were
## 1008 records deleted due to NA's
summary(bma)
## 
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial")
## 
## 
##   23  models were selected
##  Best  5  models (cumulative posterior probability =  0.4803 ): 
## 
##                   p!=0    EV         SD        model 1     model 2   
## Intercept         100    -8.5990543  0.850140  -8.720e+00  -9.125e+00
## eGFR                4.4  -0.0004856  0.002479       .           .    
## uprg                0.8                                              
##     .1                    0.0043739  0.050496       .           .    
##     .2                    0.0022142  0.030252       .           .    
## age               100.0   0.0537372  0.005828   5.266e-02   5.488e-02
## sex                10.6                                              
##    .2                    -0.0325206  0.101594       .           .    
## sbp                89.4   0.0120521  0.004923   1.393e-02   1.469e-02
## dbp                 0.0   0.0000000  0.000000       .           .    
## tc                  0.0   0.0000000  0.000000       .           .    
## hdl                 0.0   0.0000000  0.000000       .           .    
## tg                  0.0   0.0000000  0.000000       .           .    
## bs                 81.0   0.0071450  0.003958   9.249e-03   9.207e-03
## tob                45.5                                              
##    .1                     0.2400116  0.277022       .       5.276e-01
##    .2                     0.0231337  0.108144       .       4.926e-02
## alc                 0.0                                              
##    .1                     0.0000000  0.000000       .           .    
##    .2                     0.0000000  0.000000       .           .    
## dx_af               0.0                                              
##      .1                   0.0000000  0.000000       .           .    
##      .2                   0.0000000  0.000000       .           .    
## kaidan              0.0                                              
##       .2                  0.0000000  0.000000       .           .    
##       .3                  0.0000000  0.000000       .           .    
##       .4                  0.0000000  0.000000       .           .    
##       .5                  0.0000000  0.000000       .           .    
## stress              0.0                                              
##       .2                  0.0000000  0.000000       .           .    
##       .9                  0.0000000  0.000000       .           .    
## BMI                16.2   0.0077124  0.018943       .           .    
## BMIg                0.0                                              
##     .Obesity              0.0000000  0.000000       .           .    
##     .Overweight           0.0000000  0.000000       .           .    
##     .Underweight          0.0000000  0.000000       .           .    
## diabetes           17.6                                              
##         .1                0.1109721  0.251063       .           .    
## hypertension       11.5                                              
##             .1            0.0654977  0.188280       .           .    
## metSg              29.7                                              
##      .1                   0.1069930  0.178951       .           .    
##                                                                      
## nVar                                              3           4      
## BIC                                            -5.298e+04  -5.298e+04
## post prob                                       0.151       0.124    
##                   model 3     model 4     model 5   
## Intercept         -8.447e+00  -1.013e+01  -8.862e+00
## eGFR                   .           .           .    
## uprg                                                
##     .1                 .           .           .    
##     .2                 .           .           .    
## age                5.248e-02   5.686e-02   5.476e-02
## sex                                                 
##    .2                  .           .           .    
## sbp                1.213e-02   1.320e-02   1.294e-02
## dbp                    .           .           .    
## tc                     .           .           .    
## hdl                    .           .           .    
## tg                     .           .           .    
## bs                 7.992e-03   8.860e-03   7.937e-03
## tob                                                 
##    .1                  .       5.387e-01   5.283e-01
##    .2                  .       5.085e-02   5.997e-02
## alc                                                 
##    .1                  .           .           .    
##    .2                  .           .           .    
## dx_af                                               
##      .1                .           .           .    
##      .2                .           .           .    
## kaidan                                              
##       .2               .           .           .    
##       .3               .           .           .    
##       .4               .           .           .    
##       .5               .           .           .    
## stress                                              
##       .2               .           .           .    
##       .9               .           .           .    
## BMI                    .       4.859e-02       .    
## BMIg                                                
##     .Obesity           .           .           .    
##     .Overweight        .           .           .    
##     .Underweight       .           .           .    
## diabetes                                            
##         .1             .           .           .    
## hypertension                                        
##             .1         .           .           .    
## metSg                                               
##      .1            3.505e-01       .       3.472e-01
##                                                     
## nVar                 4           5           5      
## BIC               -5.297e+04  -5.297e+04  -5.297e+04
## post prob          0.080       0.064       0.062
imageplot.bma(bma)

#5b. Boostrap method to validate the model

library(rms)
## Loading required package: Hmisc
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
## 
##     label, label<-, units
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "packedMatrix" of class "replValueSp"; definition not updated
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "packedMatrix" of class "mMatrix"; definition not updated
## Registered S3 method overwritten by 'rms':
##   method       from      
##   print.lrtest epiDisplay
## 
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
## 
##     lrtest
# Choose the best model: based on the predictors which one had p value < 0.05

moboostrap = lrm(dx_st ~  eGFR + uprg + tob + age +  hypertension + af, data=dat1, x = TRUE, y = TRUE)

validate = validate(moboostrap, B = 100)
## 
## Divergence or singularity in 11 samples
validate
##           index.orig training    test optimism index.corrected  n
## Dxy           0.4992   0.5009  0.4946   0.0063          0.4929 89
## R2            0.1178   0.1195  0.1151   0.0043          0.1135 89
## Intercept     0.0000   0.0000 -0.0479   0.0479         -0.0479 89
## Slope         1.0000   1.0000  0.9802   0.0198          0.9802 89
## Emax          0.0000   0.0000  0.0138   0.0138          0.0138 89
## D             0.0429   0.0435  0.0419   0.0016          0.0413 89
## U            -0.0003  -0.0003  0.0000  -0.0002          0.0000 89
## Q             0.0432   0.0438  0.0419   0.0019          0.0413 89
## B             0.0523   0.0522  0.0524  -0.0002          0.0525 89
## g             1.0846   1.0895  1.0655   0.0240          1.0605 89
## gp            0.0530   0.0532  0.0524   0.0009          0.0522 89
cal = calibrate(moboostrap, method = "boot", B=100, legend = T, digits = 3, subtitles = T) 
## 
## Divergence or singularity in 12 samples
plot(cal, xlab = "Predicted probability of Stroke", ylab = "Observation Proportion")

## 
## n=7218   Mean absolute error=0.006   Mean squared error=0.00013
## 0.9 Quantile of absolute error=0.01

#6. Nomogram for Stroke prediction (Probability of Stroke)

library(rms)
ddist = datadist(dat1)
options(datadist = "ddist")
m1 = lrm(dx_st ~ eGFR + uprg + tob + age + sex + hypertension + diabetes + af, data = dat1)
p = nomogram(m1, fun = function(x)1/(1+exp(-x)), fun.at = c(0.001, 0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99, 0.999), funlabel = "Probability of Stroke")

plot(p, cex.axis = 0.6, lmgp = 0.2)

# Explain nomogram:
# For example: if participant is male (2 points), and has eGFR = 120 (20 points), trace protein urine (12 points), age 70 (70 points), hypertension (20 points), no diabetes (0 points), atrial fibrilation (40 points) --> Total points = 2 + 20 + 12 + 70 + 20 + 0 + 40 = 164 --> The Probability of Stroke is about 20%

#7. Evaluate the model by split dataset into train and test

library(caret)
sample = createDataPartition(dat1$dx_st, p = 0.7, list = FALSE)
train = dat1[sample,]
test = dat1[-sample,]

mod = train(dx_st ~ eGFR + uprg +  bs + age + sex + hypertension + diabetes + metSg + af, data = train, method = "glm", family = "binomial")
summary(mod)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0404  -0.3896  -0.2619  -0.1679   3.0208  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -5.640092   0.719251  -7.842 4.45e-15 ***
## eGFR          -0.014908   0.004980  -2.993  0.00276 ** 
## uprg1          0.417660   0.168106   2.485  0.01297 *  
## uprg2          0.348534   0.208632   1.671  0.09481 .  
## bs             0.006464   0.002953   2.189  0.02861 *  
## age            0.050636   0.006631   7.636 2.24e-14 ***
## sex2          -0.256000   0.123684  -2.070  0.03847 *  
## hypertension1  0.576125   0.131590   4.378 1.20e-05 ***
## diabetes1      0.081483   0.243024   0.335  0.73741    
## metSg1         0.167545   0.138772   1.207  0.22730    
## af             0.712613   0.586410   1.215  0.22428    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2303.4  on 5139  degrees of freedom
## Residual deviance: 2045.7  on 5129  degrees of freedom
## AIC: 2067.7
## 
## Number of Fisher Scoring iterations: 6
# Validate
dx_st.pred = predict(mod, newdata = test, type = "raw")
confusionMatrix(data = dx_st.pred, reference = factor(test$dx_st)) # accuracy: 0.941
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2073  129
##          1    0    0
##                                           
##                Accuracy : 0.9414          
##                  95% CI : (0.9308, 0.9509)
##     No Information Rate : 0.9414          
##     P-Value [Acc > NIR] : 0.5234          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9414          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9414          
##          Detection Rate : 0.9414          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
# AUC
prob = predict(mod, newdata = test, type = "prob")
pred = data.frame(prob, test$dx_st)
head(pred)
##           X0         X1 test.dx_st
## 1  0.7340231 0.26597692          0
## 7  0.9204526 0.07954744          0
## 13 0.8772791 0.12272090          0
## 18 0.9402082 0.05979179          0
## 19 0.9600436 0.03995643          0
## 20 0.9548038 0.04519618          0
# library pROC
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
## 
##     ci
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc(pred$test.dx_st, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## 
## Call:
## roc.default(response = pred$test.dx_st, predictor = pred$X1)
## 
## Data: pred$X1 in 2073 controls (pred$test.dx_st 0) < 129 cases (pred$test.dx_st 1).
## Area under the curve: 0.7202
plot(smooth(roc(pred$test.dx_st, pred$X1)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

#8. Examing the pattern of missing variables

na.patterns = naclus(dat1)
plot(na.patterns)