EPI204 Lab 4 in R (Zou’s relative risk regression)

References

Load data

## Load foreign package
library(foreign)

## Load dataset converted to Stata format
nhdat <- read.dta("./lab1.dta")

Variable manipulation

## Recode
nhdat <- within(nhdat, {
    race <- factor(race, 1:3, c("White","Black","Other"))
    recex <- factor(recex, 1:3, c("high","medium","low"))
    recex <- relevel(recex, "low")

    height[height %in% 999] <- NA
    bmi <- wt / (height/100)^2
})

## Restrict to complete cases
restricted <- nhdat[complete.cases(nhdat[,c("fvc","ageyrs","height","wt","sex","race")]),]
restricted <- restricted[5 <= restricted$ageyrs & restricted$ageyrs <= 18,]

Generalized linear models

Linear regression (identity link, gaussian family, variance independent of mean)

## identity link, gaussian family
glm.identity.gaus <- glm(fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex, data = restricted,
                         family = gaussian(link = "identity"))
summary(glm.identity.gaus)

Call:
glm(formula = fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex, 
    family = gaussian(link = "identity"), data = restricted)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-372.0   -26.0     0.0    26.7   443.9  

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -306.654     14.288  -21.46  < 2e-16 ***
ageyrs         2.908      0.564    5.15  2.7e-07 ***
height         3.087      0.134   23.04  < 2e-16 ***
wt             1.679      0.105   16.01  < 2e-16 ***
raceBlack    -46.104      2.456  -18.77  < 2e-16 ***
raceOther    -12.501      5.762   -2.17     0.03 *  
sex          -52.764      6.513   -8.10  7.8e-16 ***
ageyrs:sex     7.011      0.513   13.67  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 2374)

    Null deviance: 44766904  on 3017  degrees of freedom
Residual deviance:  7146345  on 3010  degrees of freedom
AIC: 32032

Number of Fisher Scoring iterations: 2

Logistic regression (logit link, binomial family, variance = p(1-p))

## Logit link, binomial family
restricted2 <- nhdat[complete.cases(nhdat[,c("ageyrs","height","wt","sex","race","stroke","recex","marry","smokever","school","choleste","sbp","dbp")]),]

glm.logit.binom.angina <- glm(angina ~ ageyrs + recex + smokever + sex + bmi + race, data = restricted2,
                              family = binomial(link = "logit"))
summary(glm.logit.binom.angina)

Call:
glm(formula = angina ~ ageyrs + recex + smokever + sex + bmi + 
    race, family = binomial(link = "logit"), data = restricted2)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.164  -0.750  -0.646  -0.499   2.147  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.46669    0.19417  -12.70  < 2e-16 ***
ageyrs       0.01531    0.00209    7.34  2.1e-13 ***
recexhigh   -0.67646    0.08739   -7.74  9.9e-15 ***
recexmedium -0.44597    0.05681   -7.85  4.1e-15 ***
smokever     0.15460    0.05781    2.67   0.0075 ** 
sex         -0.16363    0.05680   -2.88   0.0040 ** 
bmi          0.02407    0.00513    4.69  2.7e-06 ***
raceBlack    0.16006    0.08083    1.98   0.0477 *  
raceOther   -0.38240    0.23629   -1.62   0.1056    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 9115.4  on 8506  degrees of freedom
Residual deviance: 8887.8  on 8498  degrees of freedom
  (3 observations deleted due to missingness)
AIC: 8906

Number of Fisher Scoring iterations: 4

library(epicalc)
logistic.display(glm.logit.binom.angina)

Logistic regression predicting Dr Diagnosed Angina 

                                                     crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test) P(LR-test)
ageyrs (cont. var.)                                  1.02 (1.01,1.02)  1.02 (1.01,1.02)  < 0.001        < 0.001   

Recreational exercise 1=high, 2=mid, 3=low: ref.=low                                                    < 0.001   
   high                                              0.46 (0.39,0.54)  0.51 (0.43,0.6)   < 0.001                  
   medium                                            0.6 (0.53,0.66)   0.64 (0.57,0.72)  < 0.001                  

smokever: 1 vs 0                                     1.01 (0.91,1.12)  1.17 (1.04,1.31)  0.007          0.007     

sex: 1 vs 0                                          0.83 (0.75,0.92)  0.85 (0.76,0.95)  0.004          0.004     

NA (cont. var.)                                      1.03 (1.02,1.04)  1.02 (1.01,1.03)  < 0.001        < 0.001   

white, black, other: ref.=White                                                                         0.031     
   Black                                             1.23 (1.05,1.44)  1.17 (1,1.38)     0.048                    
   Other                                             0.65 (0.41,1.02)  0.68 (0.43,1.08)  0.106                    

Log-likelihood = -4443.9002
No. of observations = 8507
AIC value = 8905.8004

Logistic regression for common outcome

Odds ratios are not good effect measures when the outcome of interest is common. Odds are close to risks when the outcome is rare, but exaggerated (odds >> risk) when the outcome is common. Thus, odds ratio tends to give false impression of large effect when the outcome is common.

In the example below the odds ratio is much larger than the risk ratio when the outcome is common.

## Create a common outcome
restricted2$recex_dich <- restricted2$recex != "high"
table(restricted2$recex_dich)

FALSE  TRUE 
 1266  7244 

## Common outcome: Logit link, binomial family
glm.logit.binom <- glm(recex_dich ~ ageyrs + smokever + sex + race, data = restricted2,
                       family = binomial(link = "logit"))
summary(glm.logit.binom)

Call:
glm(formula = recex_dich ~ ageyrs + smokever + sex + race, family = binomial(link = "logit"), 
    data = restricted2)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.556   0.451   0.528   0.603   0.859  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept)  1.15294    0.13556    8.51  < 2e-16 ***
ageyrs       0.01524    0.00229    6.65  2.9e-11 ***
smokever     0.16423    0.06702    2.45   0.0143 *  
sex         -0.61757    0.06563   -9.41  < 2e-16 ***
raceBlack   -0.18497    0.09297   -1.99   0.0466 *  
raceOther    0.90542    0.33148    2.73   0.0063 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 7158.0  on 8509  degrees of freedom
Residual deviance: 7015.8  on 8504  degrees of freedom
AIC: 7028

Number of Fisher Scoring iterations: 5

## Define a function to obtain ratio
..glm.ratio <- function(GLM.RESULT, DIGITS = 2, P.DIGITS = 3, CONF.LEVEL = 0.95) {

    if (GLM.RESULT$family$family == "binomial") {
        LABEL <- "OR"
    } else if (GLM.RESULT$family$family == "poisson") {
        LABEL <- "RR"
    } else {
        stop("Not logistic or Poisson model")
    }

    ## Extract coefficients and confidence interval
    COEF      <- stats::coef(GLM.RESULT)
    CONFINT   <- suppressMessages(stats::confint(GLM.RESULT, level = CONF.LEVEL))
    TABLE     <- cbind(coef = COEF, CONFINT)

    ## Turn them into OR
    TABLE.EXP <- round(exp(TABLE), DIGITS)

    colnames(TABLE.EXP)[1] <- LABEL

    ## Extract p-value
    PVAL <- round(summary(GLM.RESULT)$coef[,4], P.DIGITS)

    TABLE.EXP <- cbind(TABLE.EXP, "P" = PVAL)    
    TABLE.EXP
}
## Odds (intercept) and odds ratios
..glm.ratio(glm.logit.binom, DIGITS = 5)
                OR  2.5 % 97.5 %     P
(Intercept) 3.1675 2.4317 4.1373 0.000
ageyrs      1.0153 1.0108 1.0199 0.000
smokever    1.1785 1.0332 1.3436 0.014
sex         0.5393 0.4740 0.6131 0.000
raceBlack   0.8311 0.6944 1.0000 0.047
raceOther   2.4730 1.3599 5.0563 0.006

Relative risk regression for common outcome

log-binomial model (did not work)

## Common outcome: log link, binomial family (Did not work)
glm.log.binom <- glm(recex_dich ~ ageyrs + smokever + sex + race, data = restricted2,
                     family = binomial(link = "log"))
Error: no valid set of coefficients has been found: please supply starting values
## summary(glm.log.binom)

Zou’s modified Poisson regression

## Common outcome: log link, poisson family, robust estimator (modified Poisson with robust estimator by Zou)
library(geepack)
geeglm.log.poisson <- geeglm(formula = recex_dich ~ ageyrs + smokever + sex + race,
                             data    = restricted2,
                             family  = poisson(link = "log"),
                             id      = seqno,
                             corstr  = "exchangeable")
summary(geeglm.log.poisson)

Call:
geeglm(formula = recex_dich ~ ageyrs + smokever + sex + race, 
    family = poisson(link = "log"), data = restricted2, id = seqno, 
    corstr = "exchangeable")

 Coefficients:
             Estimate   Std.err   Wald Pr(>|W|)    
(Intercept) -0.260374  0.022275 136.63  < 2e-16 ***
ageyrs       0.002323  0.000368  39.84  2.8e-10 ***
smokever     0.025359  0.009720   6.81   0.0091 ** 
sex         -0.091885  0.009899  86.15  < 2e-16 ***
raceBlack   -0.028859  0.015538   3.45   0.0633 .  
raceOther    0.097685  0.024432  15.99  6.4e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Estimated Scale Parameters:
            Estimate Std.err
(Intercept)    0.149 0.00393

Correlation: Structure = exchangeable  Link = identity 

Estimated Correlation Parameters:
      Estimate Std.err
alpha        0       0
Number of clusters:   8510   Maximum cluster size: 1 
exp(coef(geeglm.log.poisson))
(Intercept)      ageyrs    smokever         sex   raceBlack   raceOther 
      0.771       1.002       1.026       0.912       0.972       1.103 


## Fit by glm() then test using robust SE estimator
glm.log.poisson <- glm(recex_dich ~ ageyrs + smokever + sex + race, data = restricted2,
                       family = poisson(link = "log"))
## Load sandwich package for robust estimator
library(sandwich)
## Load lmtest package for coeftest
library(lmtest)
## Poisson model with SE estimated via robust variance estimator
coeftest(glm.log.poisson, vcov = sandwich)

z test of coefficients:

             Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.260374   0.022275  -11.69  < 2e-16 ***
ageyrs       0.002323   0.000368    6.31  2.8e-10 ***
smokever     0.025359   0.009720    2.61   0.0091 ** 
sex         -0.091885   0.009899   -9.28  < 2e-16 ***
raceBlack   -0.028859   0.015538   -1.86   0.0633 .  
raceOther    0.097685   0.024432    4.00  6.4e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Risk (intercept) and risk ratios
exp(coef(glm.log.poisson))
(Intercept)      ageyrs    smokever         sex   raceBlack   raceOther 
      0.771       1.002       1.026       0.912       0.972       1.103 

Both methods result in the same SE.

Comparison of logistic regression and relative risk regression

Common outcome (excercise)

## The intercepts correspond to the baseline risk and baseline odds
cbind(RiskRatio = exp(coef(glm.log.poisson)),
      OddsRatio = exp(coef(glm.logit.binom)))
            RiskRatio OddsRatio
(Intercept)     0.771     3.167
ageyrs          1.002     1.015
smokever        1.026     1.178
sex             0.912     0.539
raceBlack       0.972     0.831
raceOther       1.103     2.473

Rare outcome (angina)

glm.log.poisson.angina <- glm(angina ~ ageyrs + recex + smokever + sex + bmi + race, data = restricted2,
                              family = poisson(link = "log"))
coeftest(glm.log.poisson.angina, vcov = sandwich)

z test of coefficients:

            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -2.42348    0.14448  -16.77  < 2e-16 ***
ageyrs       0.01178    0.00160    7.35  2.0e-13 ***
recexhigh   -0.52732    0.07145   -7.38  1.6e-13 ***
recexmedium -0.33612    0.04339   -7.75  9.4e-15 ***
smokever     0.11735    0.04252    2.76   0.0058 ** 
sex         -0.12266    0.04233   -2.90   0.0038 ** 
bmi          0.01740    0.00372    4.67  2.9e-06 ***
raceBlack    0.11751    0.05871    2.00   0.0453 *  
raceOther   -0.30671    0.19568   -1.57   0.1170    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## The intercepts correspond to the baseline risk and baseline odds
cbind(RiskRatio = exp(coef(glm.log.poisson.angina)),
      OddsRatio = exp(coef(glm.logit.binom.angina)))
            RiskRatio OddsRatio
(Intercept)    0.0886    0.0849
ageyrs         1.0118    1.0154
recexhigh      0.5902    0.5084
recexmedium    0.7145    0.6402
smokever       1.1245    1.1672
sex            0.8846    0.8491
bmi            1.0176    1.0244
raceBlack      1.1247    1.1736
raceOther      0.7359    0.6822