## Load foreign package
library(foreign)
## Load dataset converted to Stata format
nhdat <- read.dta("./lab1.dta")
## Recode
nhdat <- within(nhdat, {
race <- factor(race, 1:3, c("White","Black","Other"))
recex <- factor(recex, 1:3, c("high","medium","low"))
recex <- relevel(recex, "low")
height[height %in% 999] <- NA
bmi <- wt / (height/100)^2
})
## Restrict to complete cases
restricted <- nhdat[complete.cases(nhdat[,c("fvc","ageyrs","height","wt","sex","race")]),]
restricted <- restricted[5 <= restricted$ageyrs & restricted$ageyrs <= 18,]
## identity link, gaussian family
glm.identity.gaus <- glm(fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex, data = restricted,
family = gaussian(link = "identity"))
summary(glm.identity.gaus)
Call:
glm(formula = fvc ~ ageyrs + height + wt + race + sex + ageyrs:sex,
family = gaussian(link = "identity"), data = restricted)
Deviance Residuals:
Min 1Q Median 3Q Max
-372.0 -26.0 0.0 26.7 443.9
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -306.654 14.288 -21.46 < 2e-16 ***
ageyrs 2.908 0.564 5.15 2.7e-07 ***
height 3.087 0.134 23.04 < 2e-16 ***
wt 1.679 0.105 16.01 < 2e-16 ***
raceBlack -46.104 2.456 -18.77 < 2e-16 ***
raceOther -12.501 5.762 -2.17 0.03 *
sex -52.764 6.513 -8.10 7.8e-16 ***
ageyrs:sex 7.011 0.513 13.67 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for gaussian family taken to be 2374)
Null deviance: 44766904 on 3017 degrees of freedom
Residual deviance: 7146345 on 3010 degrees of freedom
AIC: 32032
Number of Fisher Scoring iterations: 2
## Logit link, binomial family
restricted2 <- nhdat[complete.cases(nhdat[,c("ageyrs","height","wt","sex","race","stroke","recex","marry","smokever","school","choleste","sbp","dbp")]),]
glm.logit.binom.angina <- glm(angina ~ ageyrs + recex + smokever + sex + bmi + race, data = restricted2,
family = binomial(link = "logit"))
summary(glm.logit.binom.angina)
Call:
glm(formula = angina ~ ageyrs + recex + smokever + sex + bmi +
race, family = binomial(link = "logit"), data = restricted2)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.164 -0.750 -0.646 -0.499 2.147
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.46669 0.19417 -12.70 < 2e-16 ***
ageyrs 0.01531 0.00209 7.34 2.1e-13 ***
recexhigh -0.67646 0.08739 -7.74 9.9e-15 ***
recexmedium -0.44597 0.05681 -7.85 4.1e-15 ***
smokever 0.15460 0.05781 2.67 0.0075 **
sex -0.16363 0.05680 -2.88 0.0040 **
bmi 0.02407 0.00513 4.69 2.7e-06 ***
raceBlack 0.16006 0.08083 1.98 0.0477 *
raceOther -0.38240 0.23629 -1.62 0.1056
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9115.4 on 8506 degrees of freedom
Residual deviance: 8887.8 on 8498 degrees of freedom
(3 observations deleted due to missingness)
AIC: 8906
Number of Fisher Scoring iterations: 4
library(epicalc)
logistic.display(glm.logit.binom.angina)
Logistic regression predicting Dr Diagnosed Angina
crude OR(95%CI) adj. OR(95%CI) P(Wald's test) P(LR-test)
ageyrs (cont. var.) 1.02 (1.01,1.02) 1.02 (1.01,1.02) < 0.001 < 0.001
Recreational exercise 1=high, 2=mid, 3=low: ref.=low < 0.001
high 0.46 (0.39,0.54) 0.51 (0.43,0.6) < 0.001
medium 0.6 (0.53,0.66) 0.64 (0.57,0.72) < 0.001
smokever: 1 vs 0 1.01 (0.91,1.12) 1.17 (1.04,1.31) 0.007 0.007
sex: 1 vs 0 0.83 (0.75,0.92) 0.85 (0.76,0.95) 0.004 0.004
NA (cont. var.) 1.03 (1.02,1.04) 1.02 (1.01,1.03) < 0.001 < 0.001
white, black, other: ref.=White 0.031
Black 1.23 (1.05,1.44) 1.17 (1,1.38) 0.048
Other 0.65 (0.41,1.02) 0.68 (0.43,1.08) 0.106
Log-likelihood = -4443.9002
No. of observations = 8507
AIC value = 8905.8004
Odds ratios are not good effect measures when the outcome of interest is common. Odds are close to risks when the outcome is rare, but exaggerated (odds >> risk) when the outcome is common. Thus, odds ratio tends to give false impression of large effect when the outcome is common.
In the example below the odds ratio is much larger than the risk ratio when the outcome is common.
## Create a common outcome
restricted2$recex_dich <- restricted2$recex != "high"
table(restricted2$recex_dich)
FALSE TRUE
1266 7244
## Common outcome: Logit link, binomial family
glm.logit.binom <- glm(recex_dich ~ ageyrs + smokever + sex + race, data = restricted2,
family = binomial(link = "logit"))
summary(glm.logit.binom)
Call:
glm(formula = recex_dich ~ ageyrs + smokever + sex + race, family = binomial(link = "logit"),
data = restricted2)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.556 0.451 0.528 0.603 0.859
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 1.15294 0.13556 8.51 < 2e-16 ***
ageyrs 0.01524 0.00229 6.65 2.9e-11 ***
smokever 0.16423 0.06702 2.45 0.0143 *
sex -0.61757 0.06563 -9.41 < 2e-16 ***
raceBlack -0.18497 0.09297 -1.99 0.0466 *
raceOther 0.90542 0.33148 2.73 0.0063 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 7158.0 on 8509 degrees of freedom
Residual deviance: 7015.8 on 8504 degrees of freedom
AIC: 7028
Number of Fisher Scoring iterations: 5
## Define a function to obtain ratio
..glm.ratio <- function(GLM.RESULT, DIGITS = 2, P.DIGITS = 3, CONF.LEVEL = 0.95) {
if (GLM.RESULT$family$family == "binomial") {
LABEL <- "OR"
} else if (GLM.RESULT$family$family == "poisson") {
LABEL <- "RR"
} else {
stop("Not logistic or Poisson model")
}
## Extract coefficients and confidence interval
COEF <- stats::coef(GLM.RESULT)
CONFINT <- suppressMessages(stats::confint(GLM.RESULT, level = CONF.LEVEL))
TABLE <- cbind(coef = COEF, CONFINT)
## Turn them into OR
TABLE.EXP <- round(exp(TABLE), DIGITS)
colnames(TABLE.EXP)[1] <- LABEL
## Extract p-value
PVAL <- round(summary(GLM.RESULT)$coef[,4], P.DIGITS)
TABLE.EXP <- cbind(TABLE.EXP, "P" = PVAL)
TABLE.EXP
}
## Odds (intercept) and odds ratios
..glm.ratio(glm.logit.binom, DIGITS = 5)
OR 2.5 % 97.5 % P
(Intercept) 3.1675 2.4317 4.1373 0.000
ageyrs 1.0153 1.0108 1.0199 0.000
smokever 1.1785 1.0332 1.3436 0.014
sex 0.5393 0.4740 0.6131 0.000
raceBlack 0.8311 0.6944 1.0000 0.047
raceOther 2.4730 1.3599 5.0563 0.006
log-binomial model (did not work)
## Common outcome: log link, binomial family (Did not work)
glm.log.binom <- glm(recex_dich ~ ageyrs + smokever + sex + race, data = restricted2,
family = binomial(link = "log"))
Error: no valid set of coefficients has been found: please supply starting values
## summary(glm.log.binom)
Zou’s modified Poisson regression
## Common outcome: log link, poisson family, robust estimator (modified Poisson with robust estimator by Zou)
library(geepack)
geeglm.log.poisson <- geeglm(formula = recex_dich ~ ageyrs + smokever + sex + race,
data = restricted2,
family = poisson(link = "log"),
id = seqno,
corstr = "exchangeable")
summary(geeglm.log.poisson)
Call:
geeglm(formula = recex_dich ~ ageyrs + smokever + sex + race,
family = poisson(link = "log"), data = restricted2, id = seqno,
corstr = "exchangeable")
Coefficients:
Estimate Std.err Wald Pr(>|W|)
(Intercept) -0.260374 0.022275 136.63 < 2e-16 ***
ageyrs 0.002323 0.000368 39.84 2.8e-10 ***
smokever 0.025359 0.009720 6.81 0.0091 **
sex -0.091885 0.009899 86.15 < 2e-16 ***
raceBlack -0.028859 0.015538 3.45 0.0633 .
raceOther 0.097685 0.024432 15.99 6.4e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Estimated Scale Parameters:
Estimate Std.err
(Intercept) 0.149 0.00393
Correlation: Structure = exchangeable Link = identity
Estimated Correlation Parameters:
Estimate Std.err
alpha 0 0
Number of clusters: 8510 Maximum cluster size: 1
exp(coef(geeglm.log.poisson))
(Intercept) ageyrs smokever sex raceBlack raceOther
0.771 1.002 1.026 0.912 0.972 1.103
## Fit by glm() then test using robust SE estimator
glm.log.poisson <- glm(recex_dich ~ ageyrs + smokever + sex + race, data = restricted2,
family = poisson(link = "log"))
## Load sandwich package for robust estimator
library(sandwich)
## Load lmtest package for coeftest
library(lmtest)
## Poisson model with SE estimated via robust variance estimator
coeftest(glm.log.poisson, vcov = sandwich)
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.260374 0.022275 -11.69 < 2e-16 ***
ageyrs 0.002323 0.000368 6.31 2.8e-10 ***
smokever 0.025359 0.009720 2.61 0.0091 **
sex -0.091885 0.009899 -9.28 < 2e-16 ***
raceBlack -0.028859 0.015538 -1.86 0.0633 .
raceOther 0.097685 0.024432 4.00 6.4e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Risk (intercept) and risk ratios
exp(coef(glm.log.poisson))
(Intercept) ageyrs smokever sex raceBlack raceOther
0.771 1.002 1.026 0.912 0.972 1.103
Both methods result in the same SE.
Common outcome (excercise)
## The intercepts correspond to the baseline risk and baseline odds
cbind(RiskRatio = exp(coef(glm.log.poisson)),
OddsRatio = exp(coef(glm.logit.binom)))
RiskRatio OddsRatio
(Intercept) 0.771 3.167
ageyrs 1.002 1.015
smokever 1.026 1.178
sex 0.912 0.539
raceBlack 0.972 0.831
raceOther 1.103 2.473
Rare outcome (angina)
glm.log.poisson.angina <- glm(angina ~ ageyrs + recex + smokever + sex + bmi + race, data = restricted2,
family = poisson(link = "log"))
coeftest(glm.log.poisson.angina, vcov = sandwich)
z test of coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.42348 0.14448 -16.77 < 2e-16 ***
ageyrs 0.01178 0.00160 7.35 2.0e-13 ***
recexhigh -0.52732 0.07145 -7.38 1.6e-13 ***
recexmedium -0.33612 0.04339 -7.75 9.4e-15 ***
smokever 0.11735 0.04252 2.76 0.0058 **
sex -0.12266 0.04233 -2.90 0.0038 **
bmi 0.01740 0.00372 4.67 2.9e-06 ***
raceBlack 0.11751 0.05871 2.00 0.0453 *
raceOther -0.30671 0.19568 -1.57 0.1170
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## The intercepts correspond to the baseline risk and baseline odds
cbind(RiskRatio = exp(coef(glm.log.poisson.angina)),
OddsRatio = exp(coef(glm.logit.binom.angina)))
RiskRatio OddsRatio
(Intercept) 0.0886 0.0849
ageyrs 1.0118 1.0154
recexhigh 0.5902 0.5084
recexmedium 0.7145 0.6402
smokever 1.1245 1.1672
sex 0.8846 0.8491
bmi 1.0176 1.0244
raceBlack 1.1247 1.1736
raceOther 0.7359 0.6822