glm & dplyr & markdown

packagea loading

### echo=TRUE : 코드를 평가하고 실행결과를 포함한다
### eval=TRUE : 실행결과와 함게 코드를 출력한다
### message=FALSE : 메시지를 출력한다
### warning=TRUE : 경고메시지를 출력한다 
### error=FALSE : 오류메시지를 출력한다 
### tidy=FALSE : 깔끔한 방식으로 코드 형태를 변형한다 
### 별(***) 나누기 선, 대쉬(---) 도 나누기 선

### 그림 삽입
### {r echo=FALSE, out.width='50%'}
### knitr::include_graphics('./pic.png')

library(tidyverse)
library(RNHANES)
library(ggplot2)
library(pROC)

Prepare the dataset

# Prepare the dataset
d0007 <- read.csv("d007.csv")
str(d0007)            # 10149 obs. of  8 variables:

## 'data.frame':    10149 obs. of  8 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SEQN    : int  41475 41476 41477 41478 41479 41480 41481 41482 41483 41484 ...
##  $ wave    : Factor w/ 1 level "2007-2008": 1 1 1 1 1 1 1 1 1 1 ...
##  $ RIAGENDR: int  2 2 1 2 1 1 1 1 1 1 ...
##  $ RIDAGEYR: int  62 6 71 1 52 6 21 64 66 0 ...
##  $ vitD    : num  58.8 80.9 81.8 NA 78.4 NA NA 61.9 53.3 NA ...
##  $ Calcium : num  9.5 NA 10 NA 9 NA NA 9.1 8.9 NA ...
##  $ Osteop  : int  2 NA 2 NA 2 NA 2 2 2 NA ...

d0009 <- read.csv("d009.csv")
str(d0009)            # 10537 obs. of  8 variables:

## 'data.frame':    10537 obs. of  8 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SEQN    : int  51624 51625 51626 51627 51628 51629 51630 51631 51632 51633 ...
##  $ wave    : Factor w/ 1 level "2009-2010": 1 1 1 1 1 1 1 1 1 1 ...
##  $ RIAGENDR: int  1 1 1 1 2 1 2 2 1 1 ...
##  $ RIDAGEYR: int  34 4 16 10 60 26 49 1 10 80 ...
##  $ vitD    : num  75.7 59.9 32.8 45.1 49.2 67.2 70.1 NA 66 90.6 ...
##  $ Calcium : num  9.4 NA 9.7 NA 9.5 9.3 10 NA NA 9.5 ...
##  $ Osteop  : int  2 NA NA NA 2 2 2 NA NA 2 ...

dat <- bind_rows(d0007, d0009)
str(dat)              # 20686 obs. of  8 variables:

## 'data.frame':    20686 obs. of  8 variables:
##  $ X       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SEQN    : int  41475 41476 41477 41478 41479 41480 41481 41482 41483 41484 ...
##  $ wave    : chr  "2007-2008" "2007-2008" "2007-2008" "2007-2008" ...
##  $ RIAGENDR: int  2 2 1 2 1 1 1 1 1 1 ...
##  $ RIDAGEYR: int  62 6 71 1 52 6 21 64 66 0 ...
##  $ vitD    : num  58.8 80.9 81.8 NA 78.4 NA NA 61.9 53.3 NA ...
##  $ Calcium : num  9.5 NA 10 NA 9 NA NA 9.1 8.9 NA ...
##  $ Osteop  : int  2 NA 2 NA 2 NA 2 2 2 NA ...

Create categories of Vitamin D

dat1 <- dat %>%
  mutate(
    vitD_group = case_when(
      vitD < 30 ~ "Deficiency",
      vitD >= 30 & vitD < 50 ~ "Inadequacy",
      vitD >= 50 & vitD <= 125 ~ "Sufficiency"
    )
  )

head(dat1)

##   X  SEQN      wave RIAGENDR RIDAGEYR vitD Calcium Osteop  vitD_group
## 1 1 41475 2007-2008        2       62 58.8     9.5      2 Sufficiency
## 2 2 41476 2007-2008        2        6 80.9      NA     NA Sufficiency
## 3 3 41477 2007-2008        1       71 81.8    10.0      2 Sufficiency
## 4 4 41478 2007-2008        2        1   NA      NA     NA        <NA>
## 5 5 41479 2007-2008        1       52 78.4     9.0      2 Sufficiency
## 6 6 41480 2007-2008        1        6   NA      NA     NA        <NA>

str(dat1)

## 'data.frame':    20686 obs. of  9 variables:
##  $ X         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ SEQN      : int  41475 41476 41477 41478 41479 41480 41481 41482 41483 41484 ...
##  $ wave      : chr  "2007-2008" "2007-2008" "2007-2008" "2007-2008" ...
##  $ RIAGENDR  : int  2 2 1 2 1 1 1 1 1 1 ...
##  $ RIDAGEYR  : int  62 6 71 1 52 6 21 64 66 0 ...
##  $ vitD      : num  58.8 80.9 81.8 NA 78.4 NA NA 61.9 53.3 NA ...
##  $ Calcium   : num  9.5 NA 10 NA 9 NA NA 9.1 8.9 NA ...
##  $ Osteop    : int  2 NA 2 NA 2 NA 2 2 2 NA ...
##  $ vitD_group: chr  "Sufficiency" "Sufficiency" "Sufficiency" NA ...

Exclude missings

dat2 <- dat1 %>% 
  filter(!is.na(vitD_group), !is.na(Calcium), !is.na(Osteop), Osteop!=9) %>% 
  mutate(Gender = recode_factor(RIAGENDR, 
                                `1` = "Men", 
                                `2` = "Women"),
         Osteop = recode_factor(Osteop, 
                                `1` = 1, 
                                `2` = 0))

head(dat2)

##    X  SEQN      wave RIAGENDR RIDAGEYR vitD Calcium Osteop  vitD_group
## 1  1 41475 2007-2008        2       62 58.8     9.5      0 Sufficiency
## 2  3 41477 2007-2008        1       71 81.8    10.0      0 Sufficiency
## 3  5 41479 2007-2008        1       52 78.4     9.0      0 Sufficiency
## 4  8 41482 2007-2008        1       64 61.9     9.1      0 Sufficiency
## 5  9 41483 2007-2008        1       66 53.3     8.9      0 Sufficiency
## 6 11 41485 2007-2008        2       30 39.1     9.3      0  Inadequacy
##   Gender
## 1  Women
## 2    Men
## 3    Men
## 4    Men
## 5    Men
## 6  Women

str(dat2)

## 'data.frame':    10065 obs. of  10 variables:
##  $ X         : int  1 3 5 8 9 11 12 15 16 19 ...
##  $ SEQN      : int  41475 41477 41479 41482 41483 41485 41486 41489 41490 41493 ...
##  $ wave      : chr  "2007-2008" "2007-2008" "2007-2008" "2007-2008" ...
##  $ RIAGENDR  : int  2 1 1 1 1 2 2 2 2 2 ...
##  $ RIDAGEYR  : int  62 71 52 64 66 30 61 40 66 77 ...
##  $ vitD      : num  58.8 81.8 78.4 61.9 53.3 39.1 51.1 19.4 22.6 34.5 ...
##  $ Calcium   : num  9.5 10 9 9.1 8.9 9.3 9.9 9 9.4 8.6 ...
##  $ Osteop    : Factor w/ 2 levels "1","0": 2 2 2 2 2 2 2 2 2 2 ...
##  $ vitD_group: chr  "Sufficiency" "Sufficiency" "Sufficiency" "Sufficiency" ...
##  $ Gender    : Factor w/ 2 levels "Men","Women": 2 1 1 1 1 2 2 2 2 2 ...

Logit regression model

fit <- glm(Osteop ~ vitD_group + Calcium + Gender + RIDAGEYR, 
           data = dat2, 
           family = "binomial")
summary(fit)

## 
## Call:
## glm(formula = Osteop ~ vitD_group + Calcium + Gender + RIDAGEYR, 
##     family = "binomial", data = dat2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4265   0.1009   0.1894   0.3315   1.0305  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            7.81969    1.08054   7.237 4.59e-13 ***
## vitD_groupInadequacy  -0.17444    0.20124  -0.867  0.38603    
## vitD_groupSufficiency -0.53068    0.18159  -2.922  0.00347 ** 
## Calcium                0.10330    0.11404   0.906  0.36506    
## GenderWomen           -2.08873    0.12298 -16.984  < 2e-16 ***
## RIDAGEYR              -0.07127    0.00330 -21.599  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4591.4  on 10064  degrees of freedom
## Residual deviance: 3553.1  on 10059  degrees of freedom
## AIC: 3565.1
## 
## Number of Fisher Scoring iterations: 7

results

# Transforms beta's to the odds ratio
round(exp(coef(fit)), 2)

##           (Intercept)  vitD_groupInadequacy vitD_groupSufficiency 
##               2489.14                  0.84                  0.59 
##               Calcium           GenderWomen              RIDAGEYR 
##                  1.11                  0.12                  0.93

# Interpreting results
round(exp(confint(fit)), 2)

## Waiting for profiling to be done...

##                        2.5 %   97.5 %
## (Intercept)           298.55 20650.10
## vitD_groupInadequacy    0.56     1.24
## vitD_groupSufficiency   0.41     0.83
## Calcium                 0.89     1.39
## GenderWomen             0.10     0.16
## RIDAGEYR                0.93     0.94

Assessing discrimination of the model with ROC curve

# model without vitamin D
fit1 <- glm(Osteop ~ Calcium + Gender + RIDAGEYR, 
            data = dat2, 
            family = "binomial")
# model with vitamin D
fit2 <- glm(Osteop ~ vitD_group + Calcium + Gender + RIDAGEYR, 
            data = dat2, 
            family = "binomial")

dat2$prob1=predict(fit1,type=c("response"))
dat2$prob2=predict(fit2,type=c("response"))
roc(Osteop ~ prob1, data = dat2)

## Setting levels: control = 1, case = 0

## Setting direction: controls < cases

## 
## Call:
## roc.formula(formula = Osteop ~ prob1, data = dat2)
## 
## Data: prob1 in 608 controls (Osteop 1) < 9457 cases (Osteop 0).
## Area under the curve: 0.8496

roc(Osteop ~ prob2, data = dat2)

## Setting levels: control = 1, case = 0
## Setting direction: controls < cases

## 
## Call:
## roc.formula(formula = Osteop ~ prob2, data = dat2)
## 
## Data: prob2 in 608 controls (Osteop 1) < 9457 cases (Osteop 0).
## Area under the curve: 0.8508

# There is a slight improvement in discrimination with including vitamin D in the model from 0.8496 to 0.8508.

glm & dplyr & markdown

updragon

2019/6/13

packagea loading

Prepare the dataset

Create categories of Vitamin D

Exclude missings

Logit regression model

results

Assessing discrimination of the model with ROC curve