Biostatistics 213: Homework 5

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
## Chunk defaults: print output without a comment prefix, hide warnings and
## messages, leave the source formatting untouched, echo the code, and draw
## 5 x 5 figures.
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
    echo = TRUE, fig.width = 5, fig.height = 5)
## Console printing: wide lines, a penalty against scientific notation, and
## 5 significant digits.
options(width = 116, scipen = 5, digits = 5)

## Attach gdata, which supplies read.xls() for reading Excel workbooks.
library(gdata)

## Switch to the course directory.
## NOTE(review): the read below already uses a full path, so this setwd()
## presumably only matters to code outside this view -- confirm before removing.
setwd("~/statistics/bio213/")

## Read the birth weight spreadsheet into the data frame `lbw`.
lbw <- read.xls(xls = "~/statistics/bio213/lbw.xls")

## Same data available online: http://www.umass.edu/statdata/statdata/data/index.html
## Cases 10 and 39 need to be fixed to make them identical to Dr. Orav's dataset
## lbw2 <- read.xls("http://www.umass.edu/statdata/statdata/data/lowbwt.xls")
## lbw2[c(10,39),"BWT"] <- c(2655,3035)


## Turn the numeric race codes 1, 2, 3 into a labelled factor. "white" is the
## first level and therefore the reference category in the fits below.
lbw$race <- factor(
    lbw$race,
    levels = 1:3,
    labels = c("white", "black", "other")
)

## Intercept-only model, kept as the baseline for the overall model F-test.
model.bwt.by.intercept <- lm(bwt ~ 1, data = lbw)

## Main-effects model: birth weight on lwt, ht and race.
model.bwt.by.lwt.race.ht <- lm(bwt ~ lwt + ht + race, data = lbw)

## Sequential ANOVA table from stats::anova() (terms added in formula order).
anova1.model.bwt.by.lwt.race.ht <- anova(model.bwt.by.lwt.race.ht)

## Type III ANOVA table from car::Anova().
library(car)
anova3.model.bwt.by.lwt.race.ht <- Anova(model.bwt.by.lwt.race.ht, type = "III")

a. Use PROC REG to run a model that includes maternal weight, race (in 3 categories), and history of hypertension as predictors. Interpret everything on the printout.

## Print the regression summary (residual quantiles, coefficient table,
## R-squared and overall F-test) for the model of bwt on lwt, ht and race.
summary(model.bwt.by.lwt.race.ht)

Call:
lm(formula = bwt ~ lwt + ht + race, data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-2128.3  -454.2    28.3   465.4  1909.0 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2374.53     241.89    9.82   <2e-16 ***
lwt             5.74       1.77    3.25   0.0014 ** 
ht           -558.54     212.83   -2.62   0.0094 ** 
raceblack    -433.66     155.20   -2.79   0.0058 ** 
raceother    -226.54     112.23   -2.02   0.0450 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 692 on 184 degrees of freedom
Multiple R-squared: 0.119,  Adjusted R-squared: 0.0999 
F-statistic: 6.22 on 4 and 184 DF,  p-value: 0.000103 

b. Use PROC GLM to run a model that includes maternal weight, race (in 3 categories), and history of hypertension as predictors. Interpret everything on the printout, making sure you are aware of how the REG and GLM runs correspond and how they differ.

## Make "other" the reference level of race. The model objects created earlier
## were fitted before this change, so only the lm() refit inside the list
## below uses the new baseline.
## NOTE(review): presumably done to mirror the reference level used by
## PROC GLM -- confirm.
lbw$race <- relevel(lbw$race, ref = "other")

## Print, as one named list: the overall model F-test (intercept-only vs. full
## model), the sequential ANOVA table, the Type III ANOVA table, and the
## coefficient summary of the refitted model.
list(Model = anova(model.bwt.by.intercept, model.bwt.by.lwt.race.ht),
     Type1 = anova1.model.bwt.by.lwt.race.ht,
     ## This object comes from car::Anova(type = 3), so it is labelled Type3
     ## to agree with its own "Type III tests" heading.
     Type3 = anova3.model.bwt.by.lwt.race.ht,
     Regression = summary(lm(formula = bwt ~ lwt + ht + race, data = lbw)))
$Model
Analysis of Variance Table

Model 1: bwt ~ 1
Model 2: bwt ~ lwt + ht + race
  Res.Df      RSS Df Sum of Sq    F Pr(>F)    
1    188 99927264                             
2    184 88030384  4  11896880 6.22 0.0001 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

$Type1
Analysis of Variance Table

Response: bwt
           Df   Sum Sq Mean Sq F value Pr(>F)   
lwt         1  3473052 3473052    7.26 0.0077 **
ht          1  3827657 3827657    8.00 0.0052 **
race        2  4596172 2298086    4.80 0.0093 **
Residuals 184 88030384  478426                  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

$Type2
Anova Table (Type III tests)

Response: bwt
              Sum Sq  Df F value Pr(>F)    
(Intercept) 46103636   1   96.37 <2e-16 ***
lwt          5041540   1   10.54 0.0014 ** 
ht           3294939   1    6.89 0.0094 ** 
race         4596172   2    4.80 0.0093 ** 
Residuals   88030384 184                   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

$Regression

Call:
lm(formula = bwt ~ lwt + ht + race, data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-2128.3  -454.2    28.3   465.4  1909.0 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2147.99     226.18    9.50   <2e-16 ***
lwt             5.74       1.77    3.25   0.0014 ** 
ht           -558.54     212.83   -2.62   0.0094 ** 
racewhite     226.54     112.23    2.02   0.0450 *  
raceblack    -207.12     166.35   -1.25   0.2147    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 692 on 184 degrees of freedom
Multiple R-squared: 0.119,  Adjusted R-squared: 0.0999 
F-statistic: 6.22 on 4 and 184 DF,  p-value: 0.000103