packagea loading
### echo=TRUE : 코드를 평가하고 실행결과를 포함한다
### eval=TRUE : 실행결과와 함게 코드를 출력한다
### message=FALSE : 메시지를 출력한다
### warning=TRUE : 경고메시지를 출력한다
### error=FALSE : 오류메시지를 출력한다
### tidy=FALSE : 깔끔한 방식으로 코드 형태를 변형한다
### 별(***) 나누기 선, 대쉬(---) 도 나누기 선
### 그림 삽입
### {r echo=FALSE, out.width='50%'}
### knitr::include_graphics('./pic.png')
library(tidyverse)
library(RNHANES)
library(ggplot2)
library(pROC)
Prepare the dataset
# Prepare the dataset
d0007 <- read.csv("d007.csv")
str(d0007) # 10149 obs. of 8 variables:
## 'data.frame': 10149 obs. of 8 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SEQN : int 41475 41476 41477 41478 41479 41480 41481 41482 41483 41484 ...
## $ wave : Factor w/ 1 level "2007-2008": 1 1 1 1 1 1 1 1 1 1 ...
## $ RIAGENDR: int 2 2 1 2 1 1 1 1 1 1 ...
## $ RIDAGEYR: int 62 6 71 1 52 6 21 64 66 0 ...
## $ vitD : num 58.8 80.9 81.8 NA 78.4 NA NA 61.9 53.3 NA ...
## $ Calcium : num 9.5 NA 10 NA 9 NA NA 9.1 8.9 NA ...
## $ Osteop : int 2 NA 2 NA 2 NA 2 2 2 NA ...
d0009 <- read.csv("d009.csv")
str(d0009) # 10537 obs. of 8 variables:
## 'data.frame': 10537 obs. of 8 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SEQN : int 51624 51625 51626 51627 51628 51629 51630 51631 51632 51633 ...
## $ wave : Factor w/ 1 level "2009-2010": 1 1 1 1 1 1 1 1 1 1 ...
## $ RIAGENDR: int 1 1 1 1 2 1 2 2 1 1 ...
## $ RIDAGEYR: int 34 4 16 10 60 26 49 1 10 80 ...
## $ vitD : num 75.7 59.9 32.8 45.1 49.2 67.2 70.1 NA 66 90.6 ...
## $ Calcium : num 9.4 NA 9.7 NA 9.5 9.3 10 NA NA 9.5 ...
## $ Osteop : int 2 NA NA NA 2 2 2 NA NA 2 ...
dat <- bind_rows(d0007, d0009)
str(dat) # 20686 obs. of 8 variables:
## 'data.frame': 20686 obs. of 8 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SEQN : int 41475 41476 41477 41478 41479 41480 41481 41482 41483 41484 ...
## $ wave : chr "2007-2008" "2007-2008" "2007-2008" "2007-2008" ...
## $ RIAGENDR: int 2 2 1 2 1 1 1 1 1 1 ...
## $ RIDAGEYR: int 62 6 71 1 52 6 21 64 66 0 ...
## $ vitD : num 58.8 80.9 81.8 NA 78.4 NA NA 61.9 53.3 NA ...
## $ Calcium : num 9.5 NA 10 NA 9 NA NA 9.1 8.9 NA ...
## $ Osteop : int 2 NA 2 NA 2 NA 2 2 2 NA ...
Create categories of Vitamin D
dat1 <- dat %>%
mutate(
vitD_group = case_when(
vitD < 30 ~ "Deficiency",
vitD >= 30 & vitD < 50 ~ "Inadequacy",
vitD >= 50 & vitD <= 125 ~ "Sufficiency"
)
)
head(dat1)
## X SEQN wave RIAGENDR RIDAGEYR vitD Calcium Osteop vitD_group
## 1 1 41475 2007-2008 2 62 58.8 9.5 2 Sufficiency
## 2 2 41476 2007-2008 2 6 80.9 NA NA Sufficiency
## 3 3 41477 2007-2008 1 71 81.8 10.0 2 Sufficiency
## 4 4 41478 2007-2008 2 1 NA NA NA <NA>
## 5 5 41479 2007-2008 1 52 78.4 9.0 2 Sufficiency
## 6 6 41480 2007-2008 1 6 NA NA NA <NA>
str(dat1)
## 'data.frame': 20686 obs. of 9 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ SEQN : int 41475 41476 41477 41478 41479 41480 41481 41482 41483 41484 ...
## $ wave : chr "2007-2008" "2007-2008" "2007-2008" "2007-2008" ...
## $ RIAGENDR : int 2 2 1 2 1 1 1 1 1 1 ...
## $ RIDAGEYR : int 62 6 71 1 52 6 21 64 66 0 ...
## $ vitD : num 58.8 80.9 81.8 NA 78.4 NA NA 61.9 53.3 NA ...
## $ Calcium : num 9.5 NA 10 NA 9 NA NA 9.1 8.9 NA ...
## $ Osteop : int 2 NA 2 NA 2 NA 2 2 2 NA ...
## $ vitD_group: chr "Sufficiency" "Sufficiency" "Sufficiency" NA ...
Exclude missings
dat2 <- dat1 %>%
filter(!is.na(vitD_group), !is.na(Calcium), !is.na(Osteop), Osteop!=9) %>%
mutate(Gender = recode_factor(RIAGENDR,
`1` = "Men",
`2` = "Women"),
Osteop = recode_factor(Osteop,
`1` = 1,
`2` = 0))
head(dat2)
## X SEQN wave RIAGENDR RIDAGEYR vitD Calcium Osteop vitD_group
## 1 1 41475 2007-2008 2 62 58.8 9.5 0 Sufficiency
## 2 3 41477 2007-2008 1 71 81.8 10.0 0 Sufficiency
## 3 5 41479 2007-2008 1 52 78.4 9.0 0 Sufficiency
## 4 8 41482 2007-2008 1 64 61.9 9.1 0 Sufficiency
## 5 9 41483 2007-2008 1 66 53.3 8.9 0 Sufficiency
## 6 11 41485 2007-2008 2 30 39.1 9.3 0 Inadequacy
## Gender
## 1 Women
## 2 Men
## 3 Men
## 4 Men
## 5 Men
## 6 Women
str(dat2)
## 'data.frame': 10065 obs. of 10 variables:
## $ X : int 1 3 5 8 9 11 12 15 16 19 ...
## $ SEQN : int 41475 41477 41479 41482 41483 41485 41486 41489 41490 41493 ...
## $ wave : chr "2007-2008" "2007-2008" "2007-2008" "2007-2008" ...
## $ RIAGENDR : int 2 1 1 1 1 2 2 2 2 2 ...
## $ RIDAGEYR : int 62 71 52 64 66 30 61 40 66 77 ...
## $ vitD : num 58.8 81.8 78.4 61.9 53.3 39.1 51.1 19.4 22.6 34.5 ...
## $ Calcium : num 9.5 10 9 9.1 8.9 9.3 9.9 9 9.4 8.6 ...
## $ Osteop : Factor w/ 2 levels "1","0": 2 2 2 2 2 2 2 2 2 2 ...
## $ vitD_group: chr "Sufficiency" "Sufficiency" "Sufficiency" "Sufficiency" ...
## $ Gender : Factor w/ 2 levels "Men","Women": 2 1 1 1 1 2 2 2 2 2 ...
Logit regression model
fit <- glm(Osteop ~ vitD_group + Calcium + Gender + RIDAGEYR,
data = dat2,
family = "binomial")
summary(fit)
##
## Call:
## glm(formula = Osteop ~ vitD_group + Calcium + Gender + RIDAGEYR,
## family = "binomial", data = dat2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4265 0.1009 0.1894 0.3315 1.0305
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.81969 1.08054 7.237 4.59e-13 ***
## vitD_groupInadequacy -0.17444 0.20124 -0.867 0.38603
## vitD_groupSufficiency -0.53068 0.18159 -2.922 0.00347 **
## Calcium 0.10330 0.11404 0.906 0.36506
## GenderWomen -2.08873 0.12298 -16.984 < 2e-16 ***
## RIDAGEYR -0.07127 0.00330 -21.599 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4591.4 on 10064 degrees of freedom
## Residual deviance: 3553.1 on 10059 degrees of freedom
## AIC: 3565.1
##
## Number of Fisher Scoring iterations: 7
results
# Transforms beta's to the odds ratio
round(exp(coef(fit)), 2)
## (Intercept) vitD_groupInadequacy vitD_groupSufficiency
## 2489.14 0.84 0.59
## Calcium GenderWomen RIDAGEYR
## 1.11 0.12 0.93
# Interpreting results
round(exp(confint(fit)), 2)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 298.55 20650.10
## vitD_groupInadequacy 0.56 1.24
## vitD_groupSufficiency 0.41 0.83
## Calcium 0.89 1.39
## GenderWomen 0.10 0.16
## RIDAGEYR 0.93 0.94
Assessing discrimination of the model with ROC curve
# model without vitamin D
fit1 <- glm(Osteop ~ Calcium + Gender + RIDAGEYR,
data = dat2,
family = "binomial")
# model with vitamin D
fit2 <- glm(Osteop ~ vitD_group + Calcium + Gender + RIDAGEYR,
data = dat2,
family = "binomial")
dat2$prob1=predict(fit1,type=c("response"))
dat2$prob2=predict(fit2,type=c("response"))
roc(Osteop ~ prob1, data = dat2)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
##
## Call:
## roc.formula(formula = Osteop ~ prob1, data = dat2)
##
## Data: prob1 in 608 controls (Osteop 1) < 9457 cases (Osteop 0).
## Area under the curve: 0.8496
roc(Osteop ~ prob2, data = dat2)
## Setting levels: control = 1, case = 0
## Setting direction: controls < cases
##
## Call:
## roc.formula(formula = Osteop ~ prob2, data = dat2)
##
## Data: prob2 in 608 controls (Osteop 1) < 9457 cases (Osteop 0).
## Area under the curve: 0.8508
# There is a slight improvement in discrimination with including vitamin D in the model from 0.8496 to 0.8508.