## Load SAS dataset
library(sas7bdat)
nh2fs.orig <- read.sas7bdat("./nh2fs2013.sas7bdat")
## Changed to lower case variable names
names(nh2fs.orig) <- tolower(names(nh2fs.orig))
## Restrict to complete cases only
variable.used <- c("death","pulse","ageyrs","height","wt","sex","race","smokever","booze")
nh2fs <- nh2fs.orig[,variable.used]
nh2fs <- nh2fs[complete.cases(nh2fs), ]
## Recode
nh2fs <- within(nh2fs, {
bmi <- wt / (height/100)^2
gender <- as.numeric(sex == 1)
alcohol <- as.numeric(booze > 0)
race <- factor(race)
})
nh2fs$bmicat <- cut(nh2fs$bmi,
breaks = c(-Inf, 25, 30, Inf),
labels = c("Normal","Overweight","Obese"),
right = F)
table(nh2fs$bmicat)
Normal Overweight Obese
3958 3104 1493
tab.death.by.bmicat <- xtabs( ~ bmicat + death, nh2fs)
prop.table(tab.death.by.bmicat, margin = 1)
death
bmicat 0 1
Normal 0.7625 0.2375
Overweight 0.7767 0.2233
Obese 0.7703 0.2297
res.q4 <- glm(formula = death ~ bmicat,
family = binomial(link = "logit"),
data = nh2fs)
summary(res.q4)
Call:
glm(formula = death ~ bmicat, family = binomial(link = "logit"),
data = nh2fs)
Deviance Residuals:
Min 1Q Median 3Q Max
-0.736 -0.736 -0.711 -0.711 1.732
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -1.1665 0.0374 -31.23 <2e-16 ***
bmicatOverweight -0.0803 0.0570 -1.41 0.16
bmicatObese -0.0433 0.0720 -0.60 0.55
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9247.2 on 8554 degrees of freedom
Residual deviance: 9245.1 on 8552 degrees of freedom
AIC: 9251
Number of Fisher Scoring iterations: 4
library(epicalc)
logistic.display(res.q4)
Logistic regression predicting death
OR(95%CI) P(Wald's test) P(LR-test)
bmicat: ref.=Normal 0.368
Overweight 0.92 (0.83,1.03) 0.159
Obese 0.96 (0.83,1.1) 0.547
Log-likelihood = -4622.5746
No. of observations = 8555
AIC value = 9251.1491
drop1(res.q4, test = "Chisq")
Single term deletions
Model:
death ~ bmicat
Df Deviance AIC LRT Pr(>Chi)
<none> 9245 9251
bmicat 2 9247 9249 2 0.37
res.q6 <- glm(formula = death ~ bmicat + ageyrs + gender + alcohol + race + smokever,
family = binomial(link = "logit"),
data = nh2fs)
summary(res.q6)
Call:
glm(formula = death ~ bmicat + ageyrs + gender + alcohol + race +
smokever, family = binomial(link = "logit"), data = nh2fs)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.716 -0.725 -0.339 -0.134 3.208
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -8.12548 0.22811 -35.62 < 2e-16 ***
bmicatOverweight -0.28272 0.06478 -4.36 0.000013 ***
bmicatObese -0.06393 0.08217 -0.78 0.4366
ageyrs 0.10892 0.00335 32.51 < 2e-16 ***
gender 0.62906 0.06323 9.95 < 2e-16 ***
alcohol -0.17485 0.06089 -2.87 0.0041 **
race2 0.08941 0.09480 0.94 0.3456
race3 -0.33426 0.24347 -1.37 0.1698
smokever 0.64943 0.06628 9.80 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 9247.2 on 8554 degrees of freedom
Residual deviance: 7272.5 on 8546 degrees of freedom
AIC: 7290
Number of Fisher Scoring iterations: 5
logistic.display(res.q6)
Logistic regression predicting death
crude OR(95%CI) adj. OR(95%CI) P(Wald's test) P(LR-test)
bmicat: ref.=Normal < 0.001
Overweight 0.92 (0.83,1.03) 0.75 (0.66,0.86) < 0.001
Obese 0.96 (0.83,1.1) 0.94 (0.8,1.1) 0.437
ageyrs (cont. var.) 1.11 (1.1,1.11) 1.12 (1.11,1.12) < 0.001 < 0.001
gender: 1 vs 0 1.89 (1.7,2.09) 1.88 (1.66,2.12) < 0.001 < 0.001
alcohol: 1 vs 0 0.73 (0.66,0.8) 0.84 (0.75,0.95) 0.004 0.004
race: ref.=1 0.227
2 0.99 (0.84,1.16) 1.09 (0.91,1.32) 0.346
3 0.67 (0.44,1.02) 0.72 (0.44,1.15) 0.17
smokever: 1 vs 0 1.59 (1.43,1.76) 1.91 (1.68,2.18) < 0.001 < 0.001
Log-likelihood = -3636.2396
No. of observations = 8555
AIC value = 7290.4792
drop1(res.q6, test = "Chisq")
Single term deletions
Model:
death ~ bmicat + ageyrs + gender + alcohol + race + smokever
Df Deviance AIC LRT Pr(>Chi)
<none> 7272 7290
bmicat 2 7292 7306 20 0.000048 ***
ageyrs 1 8962 8978 1690 < 2e-16 ***
gender 1 7373 7389 100 < 2e-16 ***
alcohol 1 7281 7297 8 0.0041 **
race 2 7275 7289 3 0.2270
smokever 1 7371 7387 98 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
nh2fs$bmi_ge25 <- as.numeric(nh2fs$bmi >= 25)
table(nh2fs$bmi_ge25)
0 1
3958 4597
res.q10 <- glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever + smokever:alcohol,
family = binomial(link = "logit"),
data = nh2fs)
summary(res.q10)
Call:
glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever +
smokever:alcohol, family = binomial(link = "logit"), data = nh2fs)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.717 -1.205 0.912 1.097 1.673
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.32509 0.10783 -3.01 0.00257 **
ageyrs 0.01054 0.00168 6.29 3.2e-10 ***
gender 0.34167 0.04705 7.26 3.8e-13 ***
alcohol -0.17420 0.07086 -2.46 0.01396 *
race2 0.43896 0.07275 6.03 1.6e-09 ***
race3 -0.64826 0.16880 -3.84 0.00012 ***
smokever -0.39462 0.06884 -5.73 9.9e-09 ***
alcohol:smokever 0.10035 0.09188 1.09 0.27477
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 11812 on 8554 degrees of freedom
Residual deviance: 11615 on 8547 degrees of freedom
AIC: 11631
Number of Fisher Scoring iterations: 4
logistic.display(res.q10)
Logistic regression predicting bmi_ge25
crude OR(95%CI) adj. OR(95%CI) P(Wald's test) P(LR-test)
ageyrs (cont. var.) 1.01 (1.01,1.02) 1.01 (1.01,1.01) < 0.001 < 0.001
gender: 1 vs 0 1.22 (1.12,1.33) 1.41 (1.28,1.54) < 0.001 < 0.001
alcohol: 1 vs 0 0.84 (0.77,0.91) 0.84 (0.73,0.97) 0.014 0.014
race: ref.=1 < 0.001
2 1.54 (1.34,1.77) 1.55 (1.34,1.79) < 0.001
3 0.55 (0.39,0.76) 0.52 (0.38,0.73) < 0.001
smokever: 1 vs 0 0.76 (0.69,0.82) 0.67 (0.59,0.77) < 0.001 < 0.001
alcohol:smokever - 1.11 (0.92,1.32) 0.275 0.275
Log-likelihood = -5807.3455
No. of observations = 8555
AIC value = 11630.691
res.q10b <- glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever,
family = binomial(link = "logit"),
data = nh2fs)
summary(res.q10b)
Call:
glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever,
family = binomial(link = "logit"), data = nh2fs)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.706 -1.206 0.915 1.095 1.680
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.35123 0.10511 -3.34 0.00083 ***
ageyrs 0.01056 0.00168 6.30 2.9e-10 ***
gender 0.33903 0.04700 7.21 5.4e-13 ***
alcohol -0.11582 0.04652 -2.49 0.01279 *
race2 0.44008 0.07274 6.05 1.4e-09 ***
race3 -0.65060 0.16873 -3.86 0.00012 ***
smokever -0.34078 0.04801 -7.10 1.3e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 11812 on 8554 degrees of freedom
Residual deviance: 11616 on 8548 degrees of freedom
AIC: 11630
Number of Fisher Scoring iterations: 4
AIC(res.q10b)
[1] 11630
rms::lrm
The higher levels are considered the event (equivalent of SAS PROC LOGISTIC with DESCENDING option).
library(rms)
res.q11b <- lrm(bmicat ~ ageyrs + gender + alcohol + race + smokever, data = nh2fs)
res.q11b
Logistic Regression Model
lrm(formula = bmicat ~ ageyrs + gender + alcohol + race + smokever,
data = nh2fs)
Model Likelihood Discrimination Rank Discrim.
Ratio Test Indexes Indexes
Obs 8555 LR chi2 204.73 R2 0.027 C 0.576
Normal 3958 d.f. 6 g 0.330 Dxy 0.152
Overweight 3104 Pr(> chi2) <0.0001 gr 1.390 gamma 0.153
Obese 1493 gp 0.080 tau-a 0.095
max |deriv| 2e-12 Brier 0.244
Coef S.E. Wald Z Pr(>|Z|)
y>=Overweight -0.1173 0.0997 -1.18 0.2393
y>=Obese -1.8523 0.1019 -18.17 <0.0001
ageyrs 0.0081 0.0016 5.17 <0.0001
gender 0.1639 0.0438 3.74 0.0002
alcohol -0.1946 0.0435 -4.48 <0.0001
race=2 0.5148 0.0658 7.82 <0.0001
race=3 -0.6263 0.1652 -3.79 0.0001
smokever -0.3113 0.0447 -6.96 <0.0001
AIC(res.q11b)
[1] 17419
The lower levels are considered the event (equivalent of SAS PROC LOGISTIC without DESCENDING option). The outcome has to be an ordered factor.
The “reverse = TRUE” can be used just like the DESCENDING option in SAS.
cumulative(link = "logit", parallel = TRUE, reverse = TRUE)
library(VGAM)
## Proportional odds assumption
res.q11b2 <- vglm(factor(bmicat, ordered = T) ~ ageyrs + gender + alcohol + race + smokever,
data = nh2fs, family = cumulative(link = "logit", parallel = TRUE))
summary(res.q11b2)
Call:
vglm(formula = factor(bmicat, ordered = T) ~ ageyrs + gender +
alcohol + race + smokever, family = cumulative(link = "logit",
parallel = TRUE), data = nh2fs)
Pearson Residuals:
Min 1Q Median 3Q Max
logit(P[Y<=1]) -1.8 -1.06 -0.38 1.00 1.74
logit(P[Y<=2]) -3.9 0.24 0.27 0.65 0.94
Coefficients:
Estimate Std. Error z value
(Intercept):1 0.1172 0.0987 1.2
(Intercept):2 1.8523 0.1010 18.3
ageyrs -0.0081 0.0016 -5.2
gender -0.1639 0.0438 -3.7
alcohol 0.1946 0.0433 4.5
race2 -0.5148 0.0650 -7.9
race3 0.6263 0.1639 3.8
smokever 0.3113 0.0445 7.0
Number of linear predictors: 2
Names of linear predictors: logit(P[Y<=1]), logit(P[Y<=2])
Dispersion Parameter for cumulative family: 1
Residual deviance: 17403 on 17102 degrees of freedom
Log-likelihood: -8702 on 17102 degrees of freedom
Number of iterations: 3
## No roportional odds assumption
res.q11b3 <- vglm(factor(bmicat, ordered = T) ~ ageyrs + gender + alcohol + race + smokever,
data = nh2fs, family = cumulative(link = "logit", parallel = FALSE))
summary(res.q11b3)
Call:
vglm(formula = factor(bmicat, ordered = T) ~ ageyrs + gender +
alcohol + race + smokever, family = cumulative(link = "logit",
parallel = FALSE), data = nh2fs)
Pearson Residuals:
Min 1Q Median 3Q Max
logit(P[Y<=1]) -2.1 -1.02 -0.36 1.00 1.8
logit(P[Y<=2]) -3.8 0.22 0.27 0.54 1.2
Coefficients:
Estimate Std. Error z value
(Intercept):1 0.3587 0.1049 3.42
(Intercept):2 1.1880 0.1359 8.74
ageyrs:1 -0.0107 0.0017 -6.37
ageyrs:2 -0.0004 0.0022 -0.18
gender:1 -0.3426 0.0469 -7.31
gender:2 0.3139 0.0625 5.03
alcohol:1 0.1128 0.0464 2.43
alcohol:2 0.3565 0.0602 5.92
race2:1 -0.4353 0.0724 -6.01
race2:2 -0.5692 0.0811 -7.02
race3:1 0.6534 0.1686 3.88
race3:2 0.6365 0.2706 2.35
smokever:1 0.3414 0.0479 7.13
smokever:2 0.2278 0.0615 3.71
Number of linear predictors: 2
Names of linear predictors: logit(P[Y<=1]), logit(P[Y<=2])
Dispersion Parameter for cumulative family: 1
Residual deviance: 17220 on 17096 degrees of freedom
Log-likelihood: -8610 on 17096 degrees of freedom
Number of iterations: 5
## Likelihood ratio test for models with or without proportional odds assumption
## Difference in -2 log likelihood
(diff.neg.2logLik <- -2*(logLik(res.q11b2) - logLik(res.q11b3)))
[1] 183
## Difference in degree of freedom (using residual degrees of freedom)
(diff.df <- df.residual(res.q11b2) - df.residual(res.q11b3))
[1] 6
## Threshold
qchisq(p = 0.05, df = 6, lower.tail = FALSE)
[1] 12.59
## p-value for comparing model with or without proportional odds assumption
pchisq(q = diff.neg.2logLik, df = diff.df, lower.tail = FALSE)
[1] 7.81e-37
The lower levels are considered the event for the intercept (equivalent of SAS PROC LOGISTIC without DESCENDING option), but the higher levels are considered the event for the slopes (equivalent of SAS PROC LOGISTIC with DESCENDING option).
library(MASS)
res.q11b4 <- polr(bmicat ~ ageyrs + gender + alcohol + race + smokever, data = nh2fs)
summary(res.q11b4)
Call:
polr(formula = bmicat ~ ageyrs + gender + alcohol + race + smokever,
data = nh2fs)
Coefficients:
Value Std. Error t value
ageyrs 0.00814 0.00157 5.17
gender 0.16395 0.04384 3.74
alcohol -0.19459 0.04345 -4.48
race2 0.51480 0.06583 7.82
race3 -0.62635 0.16515 -3.79
smokever -0.31127 0.04470 -6.96
Intercepts:
Value Std. Error t value
Normal|Overweight 0.117 0.100 1.177
Overweight|Obese 1.852 0.102 18.172
Residual Deviance: 17403.30
AIC: 17419.30
## effect plot
library(effects)
plot(allEffects(res.q11b4), style = "stacked")
The lower levels are considered the event for the intercept (equivalent of SAS PROC LOGISTIC without DESCENDING option), but the higher levels are considered the event for the slopes (equivalent of SAS PROC LOGISTIC with DESCENDING option).
library(ordinal)
res.q11b5 <- clm(bmicat ~ ageyrs + gender + alcohol + race + smokever, data = nh2fs)
summary(res.q11b5)
formula: bmicat ~ ageyrs + gender + alcohol + race + smokever
data: nh2fs
link threshold nobs logLik AIC niter max.grad cond.H
logit flexible 8555 -8701.65 17419.30 5(0) 1.23e-11 2.1e+05
Coefficients:
Estimate Std. Error z value Pr(>|z|)
ageyrs 0.00814 0.00157 5.17 2.3e-07 ***
gender 0.16395 0.04384 3.74 0.00018 ***
alcohol -0.19459 0.04345 -4.48 7.5e-06 ***
race2 0.51480 0.06583 7.82 5.3e-15 ***
race3 -0.62635 0.16515 -3.79 0.00015 ***
smokever -0.31127 0.04470 -6.96 3.3e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Threshold coefficients:
Estimate Std. Error z value
Normal|Overweight 0.1173 0.0997 1.18
Overweight|Obese 1.8523 0.1019 18.17