Linear regression 1: Prepare data/specify model/read results

References

Dataset: low birthweight dataset

http://www.umass.edu/statdata/statdata/data/lowbwt.txt

SOURCE: Hosmer and Lemeshow (2000) Applied Logistic Regression: Second Edition. These data are copyrighted by John Wiley & Sons Inc. and must be acknowledged and used accordingly. Data were collected at Baystate Medical Center, Springfield, Massachusetts during 1986.

Load dataset

## Fix cases 10 and 39 to make them identical to Dr. Orav's dataset
lbw <- read.table("http://www.umass.edu/statdata/statdata/data/lowbwt.dat", head = T, skip = 4)
lbw[c(10,39),"BWT"] <- c(2655,3035)

## Change variable names to lower cases
names(lbw) <- tolower(names(lbw))
head(lbw)
  id low age lwt race smoke ptl ht ui ftv  bwt
1 85   0  19 182    2     0   0  0  1   0 2523
2 86   0  33 155    3     0   0  0  0   3 2551
3 87   0  20 105    1     1   0  0  0   1 2557
4 88   0  21 108    1     1   0  0  1   2 2594
5 89   0  18 107    1     1   0  0  1   0 2600
6 91   0  21 124    3     0   0  0  0   0 2622

See overview of data using gpairs package

library(gpairs)
gpairs(lbw)

plot of chunk unnamed-chunk-3

Recode data

## Recoding
lbw <- within(lbw, {

    ## Relabel race
    race.cat <- factor(race, levels = 1:3, labels = c("White","Black","Other"))

    ## Categorize ftv (frequency of visit)
    ftv.cat  <- cut(ftv, breaks = c(-Inf, 0, 2, Inf), labels = c("None","Normal","Many"))
    ftv.cat  <- relevel(ftv.cat, ref = "Normal")

    ## Dichotomize ptl
    preterm  <- factor(ptl >= 1, levels = c(FALSE,TRUE), labels = c("0","1+"))

    ## Categorize smoke ht ui
    smoke    <- factor(smoke, levels = 0:1, labels = c("No","Yes"))
    ht       <- factor(ht, levels = 0:1, labels = c("No","Yes"))
    ui       <- factor(ui, levels = 0:1, labels = c("No","Yes"))
})

Alternative to the “## Categorize smoke ht ui” part.

## Categorize smoke ht ui
lbw[,c("smoke","ht","ui")] <- 
    lapply(lbw[,c("smoke","ht","ui")],
           function(var) {
               var <- factor(var, levels = 0:1, labels = c("No","Yes"))
           })

Fit a model

lm.full <- lm(bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + preterm, data = lbw)
lm.null <- lm(bwt ~ 1, data = lbw)

See result object

lm.full

Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + 
    preterm, data = lbw)

Coefficients:
  (Intercept)            age            lwt       smokeYes          htYes          uiYes    ftv.catNone  
      2947.32          -2.91           4.22        -307.34        -568.63        -494.11         -56.50  
  ftv.catMany  race.catBlack  race.catOther      preterm1+  
      -185.86        -467.30        -322.81        -207.91  

See summary

summary(lm.full)

Call:
lm(formula = bwt ~ age + lwt + smoke + ht + ui + ftv.cat + race.cat + 
    preterm, data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-1896.7  -443.3    53.2   466.1  1654.4 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)    2947.32     320.48    9.20  < 2e-16 ***
age              -2.91       9.67   -0.30  0.76354    
lwt               4.22       1.72    2.46  0.01488 *  
smokeYes       -307.34     109.13   -2.82  0.00541 ** 
htYes          -568.63     200.88   -2.83  0.00518 ** 
uiYes          -494.11     137.23   -3.60  0.00041 ***
ftv.catNone     -56.50     105.36   -0.54  0.59245    
ftv.catMany    -185.86     203.19   -0.91  0.36158    
race.catBlack  -467.30     149.78   -3.12  0.00211 ** 
race.catOther  -322.81     117.40   -2.75  0.00658 ** 
preterm1+      -207.91     136.35   -1.52  0.12907    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 647 on 178 degrees of freedom
Multiple R-squared: 0.255,  Adjusted R-squared: 0.213 
F-statistic: 6.08 on 10 and 178 DF,  p-value: 0.0000000609 

Confidence intervals

confint(lm.full)
                  2.5 %   97.5 %
(Intercept)   2314.9019 3579.741
age            -22.0027   16.174
lwt              0.8344    7.612
smokeYes      -522.7049  -91.979
htYes         -965.0356 -172.218
uiYes         -764.9083 -223.303
ftv.catNone   -264.4122  151.414
ftv.catMany   -586.8257  215.111
race.catBlack -762.8701 -171.736
race.catOther -554.4737  -91.143
preterm1+     -476.9680   61.155

ANOVA table (type I sum of squares)

anova(lm.full)
Analysis of Variance Table

Response: bwt
           Df   Sum Sq Mean Sq F value   Pr(>F)    
age         1   812010  812010    1.94   0.1653    
lwt         1  2991583 2991583    7.15   0.0082 ** 
smoke       1  3196588 3196588    7.64   0.0063 ** 
ht          1  3554719 3554719    8.50   0.0040 ** 
ui          1  6866122 6866122   16.41 0.000076 ***
ftv.cat     2   845895  422947    1.01   0.3660    
race.cat    2  6212222 3106111    7.42   0.0008 ***
preterm     1   972851  972851    2.33   0.1291    
Residuals 178 74475274  418400                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

ANOVA table (type III sum of squares)

library(car)
Anova(lm.full, type = 3)
Anova Table (Type III tests)

Response: bwt
              Sum Sq  Df F value  Pr(>F)    
(Intercept) 35388198   1   84.58 < 2e-16 ***
age            37981   1    0.09 0.76354    
lwt          2530480   1    6.05 0.01488 *  
smoke        3318310   1    7.93 0.00541 ** 
ht           3352598   1    8.01 0.00518 ** 
ui           5424365   1   12.96 0.00041 ***
ftv.cat       383274   2    0.46 0.63328    
race.cat     5559887   2    6.64 0.00165 ** 
preterm       972851   1    2.33 0.12907    
Residuals   74475274 178                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Effect displays

Effect of a given predictor after other variables are typical values.

library(effects)
plot(allEffects(lm.full), ylim = c(2000,4000))

plot of chunk unnamed-chunk-12

More complex model for interaction plot

The model itself is not very meaningful. This is for demonstration purpose only.

lm.full.int <- lm(bwt ~ age*lwt + smoke + ht + ui + age*ftv.cat + race.cat*preterm, data = lbw)
Anova(lm.full.int, type = 3)
Anova Table (Type III tests)

Response: bwt
                   Sum Sq  Df F value  Pr(>F)    
(Intercept)        303077   1    0.76  0.3834    
age               1589252   1    4.00  0.0469 *  
lwt               1537542   1    3.87  0.0506 .  
smoke             3675193   1    9.26  0.0027 ** 
ht                2991978   1    7.54  0.0067 ** 
ui                6382035   1   16.08 0.00009 ***
ftv.cat           4446524   2    5.60  0.0044 ** 
race.cat          2954002   2    3.72  0.0261 *  
preterm             77945   1    0.20  0.6582    
age:lwt            750299   1    1.89  0.1709    
age:ftv.cat       4965938   2    6.26  0.0024 ** 
race.cat:preterm   335867   2    0.42  0.6556    
Residuals        68649992 173                    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Continuous x Continuous interaction

plot(effect("age:lwt", lm.full.int))

plot of chunk unnamed-chunk-14

plot(effect("age:lwt", lm.full.int, xlevels = list(lwt = c(100,150,200,250))), multiline = T)

plot of chunk unnamed-chunk-14

Continuous x Categorical interaction

plot(effect("age:ftv.cat", lm.full.int), multiline = TRUE)

plot of chunk unnamed-chunk-15

Categorical x Categorical interaction

Use x.var and z.var to override the default X-axis variable and grouping variable assignment. The default is the predictor with the largest number of levels or values for x.var, which is not useful in categorical x categorical interactions.

plot(effect(c("race.cat*preterm"), lm.full.int), multiline = TRUE)

plot of chunk unnamed-chunk-16

plot(effect(c("race.cat*preterm"), lm.full.int), x.var = "preterm", z.var = "race.cat", multiline = TRUE)

plot of chunk unnamed-chunk-16