BIO 213 Regression: Interaction between a continuous and categorical variables

## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE, 
    echo = T, fig.width = 10, fig.height = 8)
options(width = 125, scipen = 5, digits = 5)

setwd("~/statistics/bio213/")

Load data

library(gdata)
lbw <- read.xls("~/statistics/bio213/lbw.xls")
lbw$smoke <- factor(lbw$smoke, levels = 0:1, labels = c("non-smoker","smoker"))

## Same data available online: http://www.umass.edu/statdata/statdata/data/index.html
## Cases 10 and 39 needs fix to make them identical to Dr. Orav's dataset
## lbw2 <- read.xls("http://www.umass.edu/statdata/statdata/data/lowbwt.xls")
## lbw2[c(10,39),"BWT"] <- c(2655,3035)

Scatter plot

library(ggplot2)
plot.scatter <- ggplot(lbw, aes(x = lwt, y = bwt, color = smoke)) +
    geom_point() +
    xlim(0,300)
plot.scatter

plot of chunk unnamed-chunk-3

Fit models

## Intercept-only
model.null                  <- lm(bwt ~ 1,                       lbw)
## Smoking-only
model.smoke                 <- lm(bwt ~ smoke,                   lbw)
## Maternal weight-only
model.lwt                   <- lm(bwt ~ lwt,                     lbw)
## Both without interaction
model.smoke.lwt             <- lm(bwt ~ smoke + lwt,             lbw)
## Both with interaction
model.smoke.lwt.interaction <- lm(bwt ~ smoke + lwt + smoke:lwt, lbw)

Intercept-only: Everybody is predicted have the mean bwt.

plot.scatter +
    geom_abline(aes(intercept = coef(model.null)["(Intercept)"],
                    slope     = 0))

plot of chunk unnamed-chunk-5

summary(model.null)

Call:
lm(formula = bwt ~ 1, data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-2235.8  -530.8    32.2   530.2  2045.2 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)     2945         53    55.5   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 729 on 188 degrees of freedom

Smoking-only: Smokers have a lower bwt, but it is not affected by lwt.

df <- data.frame(a = c(coef(model.smoke)["(Intercept)"],
                     sum(coef(model.smoke)[c("(Intercept)","smokesmoker")])
                     ),
                 b = 0)
plot.scatter +
    geom_abline(aes(intercept = a,
                    slope     = b,
                    color     = c("non-smoker","smoker")
                    ),
                    data = df)

plot of chunk unnamed-chunk-6

summary(model.smoke)

Call:
lm(formula = bwt ~ smoke, data = lbw)

Residuals:
   Min     1Q Median     3Q    Max 
 -2064   -478     35    545   1935 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   3055.0       66.9   45.64   <2e-16 ***
smokesmoker   -281.4      107.0   -2.63   0.0092 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 718 on 187 degrees of freedom
Multiple R-squared: 0.0357, Adjusted R-squared: 0.0305 
F-statistic: 6.92 on 1 and 187 DF,  p-value: 0.00923 

Maternal weight-only: Lwt affects bwt, smoking does not.

plot.scatter +
    geom_abline(aes(intercept = coef(model.lwt)["(Intercept)"],
                    slope     = coef(model.lwt)["lwt"]))

plot of chunk unnamed-chunk-7

summary(model.lwt)

Call:
lm(formula = bwt ~ lwt, data = lbw)

Residuals:
   Min     1Q Median     3Q    Max 
 -2192   -504     -4    508   2076 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2367.77     228.41   10.37   <2e-16 ***
lwt             4.44       1.71    2.59     0.01 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 718 on 187 degrees of freedom
Multiple R-squared: 0.0348, Adjusted R-squared: 0.0296 
F-statistic: 6.73 on 1 and 187 DF,  p-value: 0.0102 

Both without interaction: Same slopes and different intercepts

df <- data.frame(a = c(coef(model.smoke.lwt)["(Intercept)"],
                     sum(coef(model.smoke.lwt)[c("(Intercept)","smokesmoker")])),
                 b = coef(model.smoke.lwt)["lwt"])

plot.scatter +
    geom_abline(aes(intercept = a,
                slope = b,
                color = c("non-smoker","smoker")), data = df)

plot of chunk unnamed-chunk-8

summary(model.smoke.lwt)

Call:
lm(formula = bwt ~ smoke + lwt, data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-2030.1  -447.2    26.6   514.4  1968.6 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  2498.12     230.82   10.82   <2e-16 ***
smokesmoker  -269.70     105.59   -2.55    0.011 *  
lwt             4.25       1.69    2.52    0.013 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 708 on 186 degrees of freedom
Multiple R-squared: 0.0675, Adjusted R-squared: 0.0574 
F-statistic: 6.73 on 2 and 186 DF,  p-value: 0.00151 

Both with interaction: Different slopes and different intercepts

The p-value for the smoke = smoker variable is testing the significance of the intercept difference, which is not very useful.

df <- data.frame(
    a = c(coef(model.smoke.lwt.interaction)["(Intercept)"],
        sum(coef(model.smoke.lwt.interaction)[c("(Intercept)","smokesmoker")])),

    b = c(coef(model.smoke.lwt.interaction)["lwt"],
        sum(coef(model.smoke.lwt.interaction)[c("lwt","smokesmoker:lwt")]))
    )

plot.scatter +
    geom_abline(aes(intercept = a,
                slope = b,
                color = c("non-smoker","smoker")), data = df)

plot of chunk unnamed-chunk-9

summary(model.smoke.lwt.interaction)

Call:
lm(formula = bwt ~ smoke + lwt + smoke:lwt, data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-2040.3  -456.2    28.6   531.7  1977.7 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)      2347.51     312.72    7.51  2.5e-12 ***
smokesmoker        43.90     451.16    0.10    0.923    
lwt                 5.40       2.34    2.31    0.022 *  
smokesmoker:lwt    -2.42       3.39   -0.71    0.476    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 709 on 185 degrees of freedom
Multiple R-squared: 0.07,   Adjusted R-squared: 0.055 
F-statistic: 4.64 on 3 and 185 DF,  p-value: 0.00372 

Centering to test difference between lines at different X-axis value

To test the distance between lines at a different point along the X-axis, center the X-axis variable (lwt) at a point of interest (here 120). This is actually shifting the X-Y coordinate, and then testing the new intercept difference.

last_plot() + geom_vline(xintercept = c(0, 120))

plot of chunk unnamed-chunk-10

summary(lm(formula = bwt ~ smoke + I(lwt - 120) + smoke:I(lwt - 120), data = lbw))

Call:
lm(formula = bwt ~ smoke + I(lwt - 120) + smoke:I(lwt - 120), 
    data = lbw)

Residuals:
    Min      1Q  Median      3Q     Max 
-2040.3  -456.2    28.6   531.7  1977.7 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)               2996.07      70.82   42.31   <2e-16 ***
smokesmoker               -246.82     110.46   -2.23    0.027 *  
I(lwt - 120)                 5.40       2.34    2.31    0.022 *  
smokesmoker:I(lwt - 120)    -2.42       3.39   -0.71    0.476    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 

Residual standard error: 709 on 185 degrees of freedom
Multiple R-squared: 0.07,   Adjusted R-squared: 0.055 
F-statistic: 4.64 on 3 and 185 DF,  p-value: 0.00372 

Now the smoke = smoker variable is significant, meaning the difference between these two lines at lwt = 120 is significant.