EPI 204 Homework 3 in R

References

Part I

Question 1: Prepare dataset

## Load SAS dataset
library(sas7bdat)
nh2fs.orig <- read.sas7bdat("./nh2fs2013.sas7bdat")

## Changed to lower case variable names
names(nh2fs.orig) <- tolower(names(nh2fs.orig))

## Restrict to complete cases only
variable.used <- c("death","pulse","ageyrs","height","wt","sex","race","smokever","booze")
nh2fs <- nh2fs.orig[,variable.used]
nh2fs <- nh2fs[complete.cases(nh2fs), ]

## Recode
nh2fs <- within(nh2fs, {

    bmi     <- wt / (height/100)^2
    gender  <- as.numeric(sex == 1)
    alcohol <- as.numeric(booze > 0)

    race    <- factor(race)
 })

Question 2:

nh2fs$bmicat <- cut(nh2fs$bmi,
                    breaks = c(-Inf, 25, 30, Inf),
                    labels = c("Normal","Overweight","Obese"),
                    right = F)

table(nh2fs$bmicat)

    Normal Overweight      Obese 
      3958       3104       1493 

Question 3:

tab.death.by.bmicat <- xtabs( ~ bmicat + death, nh2fs)

prop.table(tab.death.by.bmicat, margin = 1)
            death
bmicat            0      1
  Normal     0.7625 0.2375
  Overweight 0.7767 0.2233
  Obese      0.7703 0.2297

Question 4:

res.q4 <- glm(formula = death ~ bmicat,
              family  = binomial(link = "logit"),
              data    = nh2fs)
summary(res.q4)

Call:
glm(formula = death ~ bmicat, family = binomial(link = "logit"), 
    data = nh2fs)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-0.736  -0.736  -0.711  -0.711   1.732  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)       -1.1665     0.0374  -31.23   <2e-16 ***
bmicatOverweight  -0.0803     0.0570   -1.41     0.16    
bmicatObese       -0.0433     0.0720   -0.60     0.55    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 9247.2  on 8554  degrees of freedom
Residual deviance: 9245.1  on 8552  degrees of freedom
AIC: 9251

Number of Fisher Scoring iterations: 4

library(epicalc)
logistic.display(res.q4)

Logistic regression predicting death 

                    OR(95%CI)         P(Wald's test) P(LR-test)
bmicat: ref.=Normal                                  0.368     
   Overweight       0.92 (0.83,1.03)  0.159                    
   Obese            0.96 (0.83,1.1)   0.547                    

Log-likelihood = -4622.5746
No. of observations = 8555
AIC value = 9251.1491

Question 5:

drop1(res.q4, test = "Chisq")
Single term deletions

Model:
death ~ bmicat
       Df Deviance  AIC LRT Pr(>Chi)
<none>        9245 9251             
bmicat  2     9247 9249   2     0.37

Question 6:

res.q6 <- glm(formula = death ~ bmicat + ageyrs + gender + alcohol + race + smokever,
              family  = binomial(link = "logit"),
              data    = nh2fs)
summary(res.q6)

Call:
glm(formula = death ~ bmicat + ageyrs + gender + alcohol + race + 
    smokever, family = binomial(link = "logit"), data = nh2fs)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.716  -0.725  -0.339  -0.134   3.208  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -8.12548    0.22811  -35.62  < 2e-16 ***
bmicatOverweight -0.28272    0.06478   -4.36 0.000013 ***
bmicatObese      -0.06393    0.08217   -0.78   0.4366    
ageyrs            0.10892    0.00335   32.51  < 2e-16 ***
gender            0.62906    0.06323    9.95  < 2e-16 ***
alcohol          -0.17485    0.06089   -2.87   0.0041 ** 
race2             0.08941    0.09480    0.94   0.3456    
race3            -0.33426    0.24347   -1.37   0.1698    
smokever          0.64943    0.06628    9.80  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 9247.2  on 8554  degrees of freedom
Residual deviance: 7272.5  on 8546  degrees of freedom
AIC: 7290

Number of Fisher Scoring iterations: 5
logistic.display(res.q6)

Logistic regression predicting death 

                    crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test) P(LR-test)
bmicat: ref.=Normal                                                    < 0.001   
   Overweight       0.92 (0.83,1.03)  0.75 (0.66,0.86)  < 0.001                  
   Obese            0.96 (0.83,1.1)   0.94 (0.8,1.1)    0.437                    

ageyrs (cont. var.) 1.11 (1.1,1.11)   1.12 (1.11,1.12)  < 0.001        < 0.001   

gender: 1 vs 0      1.89 (1.7,2.09)   1.88 (1.66,2.12)  < 0.001        < 0.001   

alcohol: 1 vs 0     0.73 (0.66,0.8)   0.84 (0.75,0.95)  0.004          0.004     

race: ref.=1                                                           0.227     
   2                0.99 (0.84,1.16)  1.09 (0.91,1.32)  0.346                    
   3                0.67 (0.44,1.02)  0.72 (0.44,1.15)  0.17                     

smokever: 1 vs 0    1.59 (1.43,1.76)  1.91 (1.68,2.18)  < 0.001        < 0.001   

Log-likelihood = -3636.2396
No. of observations = 8555
AIC value = 7290.4792

Question 8:

drop1(res.q6, test = "Chisq")
Single term deletions

Model:
death ~ bmicat + ageyrs + gender + alcohol + race + smokever
         Df Deviance  AIC  LRT Pr(>Chi)    
<none>          7272 7290                  
bmicat    2     7292 7306   20 0.000048 ***
ageyrs    1     8962 8978 1690  < 2e-16 ***
gender    1     7373 7389  100  < 2e-16 ***
alcohol   1     7281 7297    8   0.0041 ** 
race      2     7275 7289    3   0.2270    
smokever  1     7371 7387   98  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Part II

Question 9:

nh2fs$bmi_ge25 <- as.numeric(nh2fs$bmi >= 25)
table(nh2fs$bmi_ge25)

   0    1 
3958 4597 

Question 10:

res.q10 <- glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever + smokever:alcohol,
               family  = binomial(link = "logit"),
               data    = nh2fs)
summary(res.q10)

Call:
glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever + 
    smokever:alcohol, family = binomial(link = "logit"), data = nh2fs)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.717  -1.205   0.912   1.097   1.673  

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)      -0.32509    0.10783   -3.01  0.00257 ** 
ageyrs            0.01054    0.00168    6.29  3.2e-10 ***
gender            0.34167    0.04705    7.26  3.8e-13 ***
alcohol          -0.17420    0.07086   -2.46  0.01396 *  
race2             0.43896    0.07275    6.03  1.6e-09 ***
race3            -0.64826    0.16880   -3.84  0.00012 ***
smokever         -0.39462    0.06884   -5.73  9.9e-09 ***
alcohol:smokever  0.10035    0.09188    1.09  0.27477    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 11812  on 8554  degrees of freedom
Residual deviance: 11615  on 8547  degrees of freedom
AIC: 11631

Number of Fisher Scoring iterations: 4
logistic.display(res.q10)

Logistic regression predicting bmi_ge25 

                    crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test) P(LR-test)
ageyrs (cont. var.) 1.01 (1.01,1.02)  1.01 (1.01,1.01)  < 0.001        < 0.001   

gender: 1 vs 0      1.22 (1.12,1.33)  1.41 (1.28,1.54)  < 0.001        < 0.001   

alcohol: 1 vs 0     0.84 (0.77,0.91)  0.84 (0.73,0.97)  0.014          0.014     

race: ref.=1                                                           < 0.001   
   2                1.54 (1.34,1.77)  1.55 (1.34,1.79)  < 0.001                  
   3                0.55 (0.39,0.76)  0.52 (0.38,0.73)  < 0.001                  

smokever: 1 vs 0    0.76 (0.69,0.82)  0.67 (0.59,0.77)  < 0.001        < 0.001   

alcohol:smokever    -                 1.11 (0.92,1.32)  0.275          0.275     

Log-likelihood = -5807.3455
No. of observations = 8555
AIC value = 11630.691

res.q10b <- glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever,
               family  = binomial(link = "logit"),
               data    = nh2fs)
summary(res.q10b)

Call:
glm(formula = bmi_ge25 ~ ageyrs + gender + alcohol + race + smokever, 
    family = binomial(link = "logit"), data = nh2fs)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-1.706  -1.206   0.915   1.095   1.680  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)    
(Intercept) -0.35123    0.10511   -3.34  0.00083 ***
ageyrs       0.01056    0.00168    6.30  2.9e-10 ***
gender       0.33903    0.04700    7.21  5.4e-13 ***
alcohol     -0.11582    0.04652   -2.49  0.01279 *  
race2        0.44008    0.07274    6.05  1.4e-09 ***
race3       -0.65060    0.16873   -3.86  0.00012 ***
smokever    -0.34078    0.04801   -7.10  1.3e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 11812  on 8554  degrees of freedom
Residual deviance: 11616  on 8548  degrees of freedom
AIC: 11630

Number of Fisher Scoring iterations: 4
AIC(res.q10b)
[1] 11630

Question 11:

rms::lrm

The higher levels are considered the event (equivalent of SAS PROC LOGISTIC with DESCENDING option).

library(rms)
res.q11b <- lrm(bmicat ~ ageyrs + gender + alcohol + race + smokever, data = nh2fs)
res.q11b

Logistic Regression Model

lrm(formula = bmicat ~ ageyrs + gender + alcohol + race + smokever, 
    data = nh2fs)

                      Model Likelihood     Discrimination    Rank Discrim.    
                         Ratio Test            Indexes          Indexes       
Obs          8555    LR chi2     204.73    R2       0.027    C       0.576    
 Normal      3958    d.f.             6    g        0.330    Dxy     0.152    
 Overweight  3104    Pr(> chi2) <0.0001    gr       1.390    gamma   0.153    
 Obese       1493                          gp       0.080    tau-a   0.095    
max |deriv| 2e-12                          Brier    0.244                     

              Coef    S.E.   Wald Z Pr(>|Z|)
y>=Overweight -0.1173 0.0997  -1.18 0.2393  
y>=Obese      -1.8523 0.1019 -18.17 <0.0001 
ageyrs         0.0081 0.0016   5.17 <0.0001 
gender         0.1639 0.0438   3.74 0.0002  
alcohol       -0.1946 0.0435  -4.48 <0.0001 
race=2         0.5148 0.0658   7.82 <0.0001 
race=3        -0.6263 0.1652  -3.79 0.0001  
smokever      -0.3113 0.0447  -6.96 <0.0001 
AIC(res.q11b)
[1] 17419

VGAM::vglm

The lower levels are considered the event (equivalent of SAS PROC LOGISTIC without DESCENDING option). The outcome has to be an ordered factor.

The “reverse = TRUE” can be used just like the DESCENDING option in SAS.

cumulative(link = "logit", parallel = TRUE, reverse = TRUE)
library(VGAM)

## Proportional odds assumption
res.q11b2 <- vglm(factor(bmicat, ordered = T) ~ ageyrs + gender + alcohol + race + smokever,
                  data = nh2fs, family = cumulative(link = "logit", parallel = TRUE))
summary(res.q11b2)

Call:
vglm(formula = factor(bmicat, ordered = T) ~ ageyrs + gender + 
    alcohol + race + smokever, family = cumulative(link = "logit", 
    parallel = TRUE), data = nh2fs)

Pearson Residuals:
                Min    1Q Median   3Q  Max
logit(P[Y<=1]) -1.8 -1.06  -0.38 1.00 1.74
logit(P[Y<=2]) -3.9  0.24   0.27 0.65 0.94

Coefficients:
              Estimate Std. Error z value
(Intercept):1   0.1172     0.0987     1.2
(Intercept):2   1.8523     0.1010    18.3
ageyrs         -0.0081     0.0016    -5.2
gender         -0.1639     0.0438    -3.7
alcohol         0.1946     0.0433     4.5
race2          -0.5148     0.0650    -7.9
race3           0.6263     0.1639     3.8
smokever        0.3113     0.0445     7.0

Number of linear predictors:  2 

Names of linear predictors: logit(P[Y<=1]), logit(P[Y<=2])

Dispersion Parameter for cumulative family:   1

Residual deviance: 17403 on 17102 degrees of freedom

Log-likelihood: -8702 on 17102 degrees of freedom

Number of iterations: 3 

## No roportional odds assumption
res.q11b3 <- vglm(factor(bmicat, ordered = T) ~ ageyrs + gender + alcohol + race + smokever,
                  data = nh2fs, family = cumulative(link = "logit", parallel = FALSE))
summary(res.q11b3)

Call:
vglm(formula = factor(bmicat, ordered = T) ~ ageyrs + gender + 
    alcohol + race + smokever, family = cumulative(link = "logit", 
    parallel = FALSE), data = nh2fs)

Pearson Residuals:
                Min    1Q Median   3Q Max
logit(P[Y<=1]) -2.1 -1.02  -0.36 1.00 1.8
logit(P[Y<=2]) -3.8  0.22   0.27 0.54 1.2

Coefficients:
              Estimate Std. Error z value
(Intercept):1   0.3587     0.1049    3.42
(Intercept):2   1.1880     0.1359    8.74
ageyrs:1       -0.0107     0.0017   -6.37
ageyrs:2       -0.0004     0.0022   -0.18
gender:1       -0.3426     0.0469   -7.31
gender:2        0.3139     0.0625    5.03
alcohol:1       0.1128     0.0464    2.43
alcohol:2       0.3565     0.0602    5.92
race2:1        -0.4353     0.0724   -6.01
race2:2        -0.5692     0.0811   -7.02
race3:1         0.6534     0.1686    3.88
race3:2         0.6365     0.2706    2.35
smokever:1      0.3414     0.0479    7.13
smokever:2      0.2278     0.0615    3.71

Number of linear predictors:  2 

Names of linear predictors: logit(P[Y<=1]), logit(P[Y<=2])

Dispersion Parameter for cumulative family:   1

Residual deviance: 17220 on 17096 degrees of freedom

Log-likelihood: -8610 on 17096 degrees of freedom

Number of iterations: 5 

## Likelihood ratio test for models with or without proportional odds assumption
## Difference in -2 log likelihood
(diff.neg.2logLik <- -2*(logLik(res.q11b2) - logLik(res.q11b3)))
[1] 183

## Difference in degree of freedom (using residual degrees of freedom)
(diff.df          <- df.residual(res.q11b2) - df.residual(res.q11b3))
[1] 6

## Threshold
qchisq(p = 0.05, df = 6, lower.tail = FALSE)
[1] 12.59

## p-value for comparing model with or without proportional odds assumption
pchisq(q = diff.neg.2logLik, df = diff.df, lower.tail = FALSE)
[1] 7.81e-37

MASS::polr

The lower levels are considered the event for the intercept (equivalent of SAS PROC LOGISTIC without DESCENDING option), but the higher levels are considered the event for the slopes (equivalent of SAS PROC LOGISTIC with DESCENDING option).

library(MASS)
res.q11b4 <- polr(bmicat ~ ageyrs + gender + alcohol + race + smokever, data = nh2fs)
summary(res.q11b4)
Call:
polr(formula = bmicat ~ ageyrs + gender + alcohol + race + smokever, 
    data = nh2fs)

Coefficients:
            Value Std. Error t value
ageyrs    0.00814    0.00157    5.17
gender    0.16395    0.04384    3.74
alcohol  -0.19459    0.04345   -4.48
race2     0.51480    0.06583    7.82
race3    -0.62635    0.16515   -3.79
smokever -0.31127    0.04470   -6.96

Intercepts:
                  Value  Std. Error t value
Normal|Overweight  0.117  0.100      1.177 
Overweight|Obese   1.852  0.102     18.172 

Residual Deviance: 17403.30 
AIC: 17419.30 

## effect plot
library(effects)
plot(allEffects(res.q11b4), style = "stacked")

plot of chunk unnamed-chunk-14

ordinal::clm

The lower levels are considered the event for the intercept (equivalent of SAS PROC LOGISTIC without DESCENDING option), but the higher levels are considered the event for the slopes (equivalent of SAS PROC LOGISTIC with DESCENDING option).

library(ordinal)
res.q11b5 <- clm(bmicat ~ ageyrs + gender + alcohol + race + smokever, data = nh2fs)
summary(res.q11b5)    
formula: bmicat ~ ageyrs + gender + alcohol + race + smokever
data:    nh2fs

 link  threshold nobs logLik   AIC      niter max.grad cond.H 
 logit flexible  8555 -8701.65 17419.30 5(0)  1.23e-11 2.1e+05

Coefficients:
         Estimate Std. Error z value Pr(>|z|)    
ageyrs    0.00814    0.00157    5.17  2.3e-07 ***
gender    0.16395    0.04384    3.74  0.00018 ***
alcohol  -0.19459    0.04345   -4.48  7.5e-06 ***
race2     0.51480    0.06583    7.82  5.3e-15 ***
race3    -0.62635    0.16515   -3.79  0.00015 ***
smokever -0.31127    0.04470   -6.96  3.3e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Threshold coefficients:
                  Estimate Std. Error z value
Normal|Overweight   0.1173     0.0997    1.18
Overweight|Obese    1.8523     0.1019   18.17