#1. Import dataset: anackd. This is baseline Suita study dataset
dat = read.csv("C:/Users/thien/OneDrive/Desktop/anackd.csv")
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.8 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
dat1 = dat %>% select(age, sex, sbp, dbp, pulse, tc, hdl, nonhdlc, tg, bs, rbc, hg, hct, tob, alc, dx_ht, dx_af, dx_ckd, dx_mi, dx_st, pyst, exer, kaidan, stress, BMI, BMIg, nonhdlc, diabetes, hypertension, metSg, eGFR, eGFRg, uprg)
#2. Table1
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ age + sex + eGFR + eGFRg + uprg + dx_ckd + BMI + BMIg + tob + alc + sbp + dbp + pulse + hypertension + dx_ht + bs + diabetes + tc + hdl + nonhdlc + tg + metSg + hg + hct + rbc + dx_af + dx_ckd + dx_st + pyst, data = dat1 )
| Overall (N=7342) |
|
|---|---|
| age | |
| Mean (SD) | 55.1 (13.1) |
| Median [Min, Max] | 56.0 [30.0, 84.0] |
| sex | |
| Mean (SD) | 1.54 (0.498) |
| Median [Min, Max] | 2.00 [1.00, 2.00] |
| eGFR | |
| Mean (SD) | 77.9 (14.6) |
| Median [Min, Max] | 79.0 [7.92, 179] |
| eGFRg | |
| Mean (SD) | 1.86 (0.404) |
| Median [Min, Max] | 2.00 [0, 2.00] |
| uprg | |
| Mean (SD) | 0.236 (0.542) |
| Median [Min, Max] | 0 [0, 2.00] |
| dx_ckd | |
| Mean (SD) | 0.00409 (0.0841) |
| Median [Min, Max] | 0 [0, 2.00] |
| BMI | |
| Mean (SD) | 22.5 (3.12) |
| Median [Min, Max] | 22.3 [13.1, 40.5] |
| Missing | 2 (0.0%) |
| BMIg | |
| Normal | 5282 (71.9%) |
| Obesity | 112 (1.5%) |
| Overweight | 1333 (18.2%) |
| Underweight | 615 (8.4%) |
| tob | |
| Mean (SD) | 2.25 (0.881) |
| Median [Min, Max] | 3.00 [1.00, 3.00] |
| Missing | 124 (1.7%) |
| alc | |
| Mean (SD) | 1.93 (0.986) |
| Median [Min, Max] | 1.00 [1.00, 3.00] |
| Missing | 115 (1.6%) |
| sbp | |
| Mean (SD) | 126 (21.5) |
| Median [Min, Max] | 124 [78.0, 233] |
| dbp | |
| Mean (SD) | 77.6 (12.2) |
| Median [Min, Max] | 77.0 [0, 137] |
| Missing | 1 (0.0%) |
| pulse | |
| Mean (SD) | 69.5 (9.48) |
| Median [Min, Max] | 68.0 [38.0, 128] |
| Missing | 18 (0.2%) |
| hypertension | |
| Mean (SD) | 0.309 (0.462) |
| Median [Min, Max] | 0 [0, 1.00] |
| dx_ht | |
| Mean (SD) | 0.299 (0.659) |
| Median [Min, Max] | 0 [0, 2.00] |
| bs | |
| Mean (SD) | 98.3 (18.6) |
| Median [Min, Max] | 95.0 [51.0, 439] |
| diabetes | |
| Mean (SD) | 0.0670 (0.250) |
| Median [Min, Max] | 0 [0, 1.00] |
| tc | |
| Mean (SD) | 207 (36.1) |
| Median [Min, Max] | 206 [75.0, 483] |
| hdl | |
| Mean (SD) | 54.7 (14.3) |
| Median [Min, Max] | 53.0 [10.0, 132] |
| Missing | 28 (0.4%) |
| nonhdlc | |
| Mean (SD) | 153 (36.9) |
| Median [Min, Max] | 150 [22.0, 403] |
| Missing | 28 (0.4%) |
| tg | |
| Mean (SD) | 122 (90.5) |
| Median [Min, Max] | 99.0 [15.0, 1540] |
| metSg | |
| Mean (SD) | 0.216 (0.412) |
| Median [Min, Max] | 0 [0, 1.00] |
| hg | |
| Mean (SD) | 13.9 (1.55) |
| Median [Min, Max] | 13.9 [5.70, 20.6] |
| Missing | 3 (0.0%) |
| hct | |
| Mean (SD) | 41.8 (4.25) |
| Median [Min, Max] | 41.7 [12.4, 61.8] |
| Missing | 3 (0.0%) |
| rbc | |
| Mean (SD) | 4.53 (0.434) |
| Median [Min, Max] | 4.51 [2.61, 7.29] |
| Missing | 4 (0.1%) |
| dx_af | |
| Mean (SD) | 0.00586 (0.0968) |
| Median [Min, Max] | 0 [0, 2.00] |
| dx_st | |
| Mean (SD) | 0.0588 (0.235) |
| Median [Min, Max] | 0 [0, 1.00] |
| pyst | |
| Mean (SD) | 15.1 (7.06) |
| Median [Min, Max] | 16.7 [0.0862, 24.3] |
#3. Create factors
dat1$sex=as.factor(dat1$sex)
dat1$BMIg=as.factor(dat1$BMIg)
dat1$eGFRg=as.factor(dat1$eGFRg)
dat1$uprg=as.factor(dat1$uprg)
dat1$tob=as.factor(dat1$tob)
dat1$alc=as.factor(dat1$alc)
dat1$exer=as.factor(dat1$exer)
dat1$kaidan=as.factor(dat1$kaidan)
dat1$dx_af=as.factor(dat1$dx_af)
dat1$dx_ht=as.factor(dat1$dx_ht)
dat1$dx_ckd=as.factor(dat1$dx_ckd)
dat1$dx_st=as.factor(dat1$dx_st)
dat1$hypertension=as.factor(dat1$hypertension)
dat1$diabetes=as.factor(dat1$diabetes)
dat1$stress=as.factor(dat1$stress)
dat1$metSg=as.factor(dat1$metSg)
#4. Logistic regression ##4.1: Model 1: Age adjusted
# Change the reference;
dat1$tob=relevel(dat1$tob, ref = "3")
dat1$alc=relevel(dat1$alc, ref = "3")
logreg = glm(dx_st ~ eGFR + uprg + tob + alc + age + sex, family=binomial, data=dat1)
library(epiDisplay)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(logreg)
##
## Logistic regression predicting dx_st : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test)
## eGFR (cont. var.) 0.966 (0.9599,0.9722) 0.9907 (0.9824,0.999) 0.028
##
## uprg: ref.=0
## 1 1.62 (1.24,2.12) 1.47 (1.11,1.95) 0.007
## 2 2.32 (1.67,3.23) 1.68 (1.19,2.37) 0.003
##
## tob: ref.=3
## 1 1.35 (1.08,1.69) 1.43 (1.07,1.92) 0.016
## 2 1.51 (1.16,1.97) 1.01 (0.72,1.42) 0.943
##
## alc: ref.=3
## 1 1.06 (0.87,1.3) 1.21 (0.96,1.54) 0.11
## 2 1.8 (1.05,3.07) 1.19 (0.68,2.1) 0.541
##
## age (cont. var.) 1.07 (1.06,1.08) 1.06 (1.05,1.07) < 0.001
##
## sex: 2 vs 1 0.7015 (0.5752,0.8555) 0.9989 (0.748,1.3341) 0.994
##
## P(LR-test)
## eGFR (cont. var.) 0.03
##
## uprg: ref.=0 0.001
## 1
## 2
##
## tob: ref.=3 0.016
## 1
## 2
##
## alc: ref.=3 0.274
## 1
## 2
##
## age (cont. var.) < 0.001
##
## sex: 2 vs 1 0.994
##
## Log-likelihood = -1455.081
## No. of observations = 7194
## AIC value = 2930.1621
# Figure of OR (95% CI) of developing Stroke;
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggcoef(logreg, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10)
# Danh gia tinh phan dinh (Discrimination)
library(Epi)
ROC(form = dx_st ~ eGFR + uprg + tob + alc + age + sex, data=dat1, plot = "ROC", lwd=2)
# So sanh 2 mo hinh
#library(pROC)
# attach(dat1)
##4.2: Model 2: Model 1 + hypertension + DM + MetSg + dx_af
logreg = glm(dx_st ~ eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + dx_af, family=binomial, data=dat1)
logistic.display(logreg)
##
## Logistic regression predicting dx_st : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI)
## eGFR (cont. var.) 0.966 (0.9599,0.9722) 0.9908 (0.9825,0.9991)
##
## uprg: ref.=0
## 1 1.62 (1.24,2.12) 1.42 (1.07,1.88)
## 2 2.32 (1.67,3.23) 1.25 (0.87,1.79)
##
## tob: ref.=3
## 1 1.35 (1.08,1.69) 1.5 (1.12,2.02)
## 2 1.5124 (1.1605,1.9711) 1.001 (0.7097,1.412)
##
## alc: ref.=3
## 1 1.06 (0.87,1.3) 1.17 (0.92,1.5)
## 2 1.8 (1.05,3.07) 1.19 (0.67,2.11)
##
## age (cont. var.) 1.07 (1.06,1.08) 1.05 (1.04,1.07)
##
## sex: 2 vs 1 0.7015 (0.5752,0.8555) 1.0068 (0.7505,1.3506)
##
## hypertension: 1 vs 0 2.96 (2.42,3.61) 1.68 (1.35,2.1)
##
## diabetes: 1 vs 0 2.79 (2.1,3.71) 1.68 (1.24,2.29)
##
## metSg: 1 vs 0 2.41 (1.96,2.96) 1.45 (1.16,1.82)
##
## dx_af: ref.=0
## 1 5.51 (1.77,17.15) 2.75 (0.83,9.07)
## 2 7.34 (2.25,23.94) 4.17 (1.22,14.33)
##
## P(Wald's test) P(LR-test)
## eGFR (cont. var.) 0.03 0.032
##
## uprg: ref.=0 0.043
## 1 0.015
## 2 0.227
##
## tob: ref.=3 0.005
## 1 0.007
## 2 0.995
##
## alc: ref.=3 0.423
## 1 0.199
## 2 0.553
##
## age (cont. var.) < 0.001 < 0.001
##
## sex: 2 vs 1 0.964 0.964
##
## hypertension: 1 vs 0 < 0.001 < 0.001
##
## diabetes: 1 vs 0 < 0.001 0.001
##
## metSg: 1 vs 0 0.001 0.001
##
## dx_af: ref.=0 0.039
## 1 0.097
## 2 0.023
##
## Log-likelihood = -1422.8219
## No. of observations = 7194
## AIC value = 2875.6437
# Figure of OR (95% CI) of developing Stroke;
library(GGally)
ggcoef(logreg, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10)
# Danh gia tinh phan dinh (Discrimination)
library(Epi)
ROC(form = dx_st ~ eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + dx_af, data=dat1, plot = "ROC", lwd=2)
##4.3: Full adjusted model: Model 3 + tc + tg + nonhdlc
logreg = glm(dx_st ~ eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + dx_af + tc + tg + nonhdlc, family=binomial, data=dat1)
logistic.display(logreg)
##
## Logistic regression predicting dx_st : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI)
## eGFR (cont. var.) 0.9665 (0.9603,0.9727) 0.9917 (0.9833,1.0002)
##
## uprg: ref.=0
## 1 1.65 (1.26,2.16) 1.45 (1.09,1.92)
## 2 2.3 (1.65,3.21) 1.25 (0.87,1.79)
##
## tob: ref.=3
## 1 1.34 (1.07,1.68) 1.47 (1.09,1.98)
## 2 1.51 (1.16,1.97) 0.98 (0.7,1.39)
##
## alc: ref.=3
## 1 1.08 (0.88,1.32) 1.2 (0.94,1.54)
## 2 1.83 (1.07,3.13) 1.21 (0.68,2.15)
##
## age (cont. var.) 1.07 (1.06,1.08) 1.05 (1.04,1.07)
##
## sex: 2 vs 1 0.6953 (0.5694,0.849) 0.99 (0.7312,1.3404)
##
## hypertension: 1 vs 0 2.91 (2.38,3.56) 1.65 (1.32,2.07)
##
## diabetes: 1 vs 0 2.79 (2.09,3.71) 1.69 (1.24,2.31)
##
## metSg: 1 vs 0 2.4 (1.95,2.95) 1.41 (1.07,1.84)
##
## dx_af: ref.=0
## 1 5.56 (1.78,17.3) 2.78 (0.84,9.18)
## 2 7.41 (2.27,24.16) 4.19 (1.22,14.42)
##
## tc (cont. var.) 1.0029 (1.0002,1.0056) 1.0002 (0.9917,1.0087)
##
## tg (cont. var.) 1.0016 (1.0008,1.0024) 1.0003 (0.999,1.0015)
##
## nonhdlc (cont. var.) 1.0044 (1.0018,1.0071) 1.0006 (0.9918,1.0094)
##
## P(Wald's test) P(LR-test)
## eGFR (cont. var.) 0.055 0.058
##
## uprg: ref.=0 0.031
## 1 0.01
## 2 0.235
##
## tob: ref.=3 0.008
## 1 0.012
## 2 0.93
##
## alc: ref.=3 0.335
## 1 0.146
## 2 0.52
##
## age (cont. var.) < 0.001 < 0.001
##
## sex: 2 vs 1 0.948 0.948
##
## hypertension: 1 vs 0 < 0.001 < 0.001
##
## diabetes: 1 vs 0 < 0.001 0.001
##
## metSg: 1 vs 0 0.013 0.014
##
## dx_af: ref.=0 0.038
## 1 0.093
## 2 0.023
##
## tc (cont. var.) 0.968 0.968
##
## tg (cont. var.) 0.669 0.674
##
## nonhdlc (cont. var.) 0.898 0.898
##
## Log-likelihood = -1409.7188
## No. of observations = 7167
## AIC value = 2855.4375
# Figure of OR (95% CI) of developing Stroke;
library(GGally)
ggcoef(logreg, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10)
# Danh gia tinh phan dinh (Discrimination)
library(Epi)
ROC(form = dx_st ~ eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + dx_af+ tc + tg + nonhdlc, data=dat1, plot = "ROC", lwd=2)
# varImp
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:epiDisplay':
##
## dotplot
##
## Attaching package: 'caret'
## The following object is masked from 'package:survival':
##
## cluster
## The following object is masked from 'package:purrr':
##
## lift
varImp(logreg, scale = F)
## Overall
## eGFR 1.91610514
## uprg1 2.58862441
## uprg2 1.18688800
## tob1 2.51398305
## tob2 0.08761943
## alc1 1.45461673
## alc2 0.64285038
## age 9.25803170
## sex2 0.06479326
## hypertension1 4.39185834
## diabetes1 3.31652009
## metSg1 2.47396174
## dx_af1 1.67800756
## dx_af2 2.27308395
## tc 0.03992275
## tg 0.42715276
## nonhdlc 0.12788985
#5a. Choose the best model by BMA
library(BMA)
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-1)
yvar = dat1[, "dx_st"]
# xvars = dat1[, c("eGFR", "eGFRg", "uprg", "age", "sex", "sbp", "dbp", "pulse", "tc", "hdl", "tg", "bs", "rbc", "hg", "hct", "tob", "alc", "dx_ht", "dx_af", "dx_ckd", "dx_mi", "pyst", "exer", "kaidan", "stress", "BMI", "BMIg", "nonhdlc", "diabetes", "hypertension", "metSg" )]
xvars = dat1[, c("eGFR", "uprg", "age", "sex", "sbp", "dbp", "tc", "hdl", "tg", "bs", "tob", "alc", "dx_af", "kaidan", "stress", "BMI", "BMIg", "diabetes", "hypertension", "metSg" )]
bma = bic.glm(xvars, yvar, glm.family = "binomial")
## Warning in bic.glm.data.frame(xvars, yvar, glm.family = "binomial"): There were
## 1008 records deleted due to NA's
summary(bma)
##
## Call:
## bic.glm.data.frame(x = xvars, y = yvar, glm.family = "binomial")
##
##
## 23 models were selected
## Best 5 models (cumulative posterior probability = 0.4803 ):
##
## p!=0 EV SD model 1 model 2
## Intercept 100 -8.5990543 0.850140 -8.720e+00 -9.125e+00
## eGFR 4.4 -0.0004856 0.002479 . .
## uprg 0.8
## .1 0.0043739 0.050496 . .
## .2 0.0022142 0.030252 . .
## age 100.0 0.0537372 0.005828 5.266e-02 5.488e-02
## sex 10.6
## .2 -0.0325206 0.101594 . .
## sbp 89.4 0.0120521 0.004923 1.393e-02 1.469e-02
## dbp 0.0 0.0000000 0.000000 . .
## tc 0.0 0.0000000 0.000000 . .
## hdl 0.0 0.0000000 0.000000 . .
## tg 0.0 0.0000000 0.000000 . .
## bs 81.0 0.0071450 0.003958 9.249e-03 9.207e-03
## tob 45.5
## .1 0.2400116 0.277022 . 5.276e-01
## .2 0.0231337 0.108144 . 4.926e-02
## alc 0.0
## .1 0.0000000 0.000000 . .
## .2 0.0000000 0.000000 . .
## dx_af 0.0
## .1 0.0000000 0.000000 . .
## .2 0.0000000 0.000000 . .
## kaidan 0.0
## .2 0.0000000 0.000000 . .
## .3 0.0000000 0.000000 . .
## .4 0.0000000 0.000000 . .
## .5 0.0000000 0.000000 . .
## stress 0.0
## .2 0.0000000 0.000000 . .
## .9 0.0000000 0.000000 . .
## BMI 16.2 0.0077124 0.018943 . .
## BMIg 0.0
## .Obesity 0.0000000 0.000000 . .
## .Overweight 0.0000000 0.000000 . .
## .Underweight 0.0000000 0.000000 . .
## diabetes 17.6
## .1 0.1109721 0.251063 . .
## hypertension 11.5
## .1 0.0654977 0.188280 . .
## metSg 29.7
## .1 0.1069930 0.178951 . .
##
## nVar 3 4
## BIC -5.298e+04 -5.298e+04
## post prob 0.151 0.124
## model 3 model 4 model 5
## Intercept -8.447e+00 -1.013e+01 -8.862e+00
## eGFR . . .
## uprg
## .1 . . .
## .2 . . .
## age 5.248e-02 5.686e-02 5.476e-02
## sex
## .2 . . .
## sbp 1.213e-02 1.320e-02 1.294e-02
## dbp . . .
## tc . . .
## hdl . . .
## tg . . .
## bs 7.992e-03 8.860e-03 7.937e-03
## tob
## .1 . 5.387e-01 5.283e-01
## .2 . 5.085e-02 5.997e-02
## alc
## .1 . . .
## .2 . . .
## dx_af
## .1 . . .
## .2 . . .
## kaidan
## .2 . . .
## .3 . . .
## .4 . . .
## .5 . . .
## stress
## .2 . . .
## .9 . . .
## BMI . 4.859e-02 .
## BMIg
## .Obesity . . .
## .Overweight . . .
## .Underweight . . .
## diabetes
## .1 . . .
## hypertension
## .1 . . .
## metSg
## .1 3.505e-01 . 3.472e-01
##
## nVar 4 5 5
## BIC -5.297e+04 -5.297e+04 -5.297e+04
## post prob 0.080 0.064 0.062
imageplot.bma(bma)
#5b. Boostrap method
library(rms)
## Loading required package: Hmisc
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "packedMatrix" of class "replValueSp"; definition not updated
## Warning in .recacheSubclasses(def@className, def, env): undefined subclass
## "packedMatrix" of class "mMatrix"; definition not updated
## Registered S3 method overwritten by 'rms':
## method from
## print.lrtest epiDisplay
##
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
##
## lrtest
moboostrap = lrm(dx_st ~ eGFR + uprg + tob + alc + age + sex + hypertension + diabetes + metSg + dx_af+ tc + tg + nonhdlc, data=dat1, x = TRUE, y = TRUE)
validate = validate(moboostrap, B = 100)
##
## Divergence or singularity in 36 samples
validate
## index.orig training test optimism index.corrected n
## Dxy 0.5117 0.5171 0.5023 0.0148 0.4969 64
## R2 0.1279 0.1321 0.1218 0.0103 0.1176 64
## Intercept 0.0000 0.0000 -0.1115 0.1115 -0.1115 64
## Slope 1.0000 1.0000 0.9549 0.0451 0.9549 64
## Emax 0.0000 0.0000 0.0325 0.0325 0.0325 64
## D 0.0465 0.0481 0.0442 0.0039 0.0425 64
## U -0.0003 -0.0003 0.0001 -0.0004 0.0001 64
## Q 0.0467 0.0484 0.0441 0.0043 0.0425 64
## B 0.0515 0.0515 0.0517 -0.0002 0.0517 64
## g 1.1391 1.1572 1.1008 0.0564 1.0827 64
## gp 0.0547 0.0555 0.0534 0.0021 0.0525 64
cal = calibrate(moboostrap, method = "boot", B=100)
##
## Divergence or singularity in 38 samples
plot(cal)
##
## n=7167 Mean absolute error=0.006 Mean squared error=8e-05
## 0.9 Quantile of absolute error=0.01
#6. Nomogram
library(rms)
ddist = datadist(dat1)
options(datadist = "ddist")
m1 = lrm(dx_st ~ eGFR + tob + bs + age + sex + hypertension + diabetes + metSg + dx_af, data = dat1)
p = nomogram(m1, fun = function(x)1/(1+exp(-x)), fun.at = c(0.001, 0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99, 0.999), funlabel = "Probability of Stroke")
plot(p, cex.axis = 0.6, lmgp = 0.2)
#7. Evaluate the model
library(caret)
sample = createDataPartition(dat1$dx_st, p = 0.7, list = FALSE)
train = dat1[sample,]
test = dat1[-sample,]
mod = train(dx_st ~ eGFR + uprg + bs + age + sex + hypertension + diabetes + metSg + dx_af, data = train, method = "glm", family = "binomial")
summary(mod)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2085 -0.3871 -0.2676 -0.1786 3.0711
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.7266476 0.7307038 -7.837 4.61e-15 ***
## eGFR -0.0074491 0.0049191 -1.514 0.129945
## uprg1 0.4683790 0.1651929 2.835 0.004578 **
## uprg2 0.3590260 0.2129596 1.686 0.091817 .
## bs 0.0005894 0.0035346 0.167 0.867577
## age 0.0511767 0.0065602 7.801 6.14e-15 ***
## sex2 -0.1128202 0.1239637 -0.910 0.362766
## hypertension1 0.3990810 0.1319367 3.025 0.002488 **
## diabetes1 0.4412946 0.2318543 1.903 0.056998 .
## metSg1 0.4832281 0.1374555 3.516 0.000439 ***
## dx_af1 1.0267067 0.6104109 1.682 0.092570 .
## dx_af2 0.8443809 0.8649933 0.976 0.328980
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2303.4 on 5139 degrees of freedom
## Residual deviance: 2065.1 on 5128 degrees of freedom
## AIC: 2089.1
##
## Number of Fisher Scoring iterations: 6
# Validate
dx_st.pred = predict(mod, newdata = test, type = "raw")
confusionMatrix(data = dx_st.pred, reference = factor(test$dx_st))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2073 129
## 1 0 0
##
## Accuracy : 0.9414
## 95% CI : (0.9308, 0.9509)
## No Information Rate : 0.9414
## P-Value [Acc > NIR] : 0.5234
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.9414
## Neg Pred Value : NaN
## Prevalence : 0.9414
## Detection Rate : 0.9414
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 0
##
# AUC
prob = predict(mod, newdata = test, type = "prob")
pred = data.frame(prob, test$dx_st)
head(pred)
## X0 X1 test.dx_st
## 2 0.7429544 0.25704560 1
## 3 0.7519160 0.24808403 0
## 9 0.9186121 0.08138794 1
## 11 0.8619667 0.13803333 1
## 17 0.9506757 0.04932426 1
## 18 0.9418537 0.05814630 0
# library pROC
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following object is masked from 'package:epiDisplay':
##
## ci
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc(pred$test.dx_st, pred$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = pred$test.dx_st, predictor = pred$X1)
##
## Data: pred$X1 in 2073 controls (pred$test.dx_st 0) < 129 cases (pred$test.dx_st 1).
## Area under the curve: 0.7373
plot(smooth(roc(pred$test.dx_st, pred$X1)))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
#8. Examing the pattern of missing variables
na.patterns = naclus(dat1)
plot(na.patterns)