## Settings for RMarkdown http://yihui.name/knitr/options#chunk_options
opts_chunk$set(comment = "", warning = FALSE, message = FALSE, tidy = FALSE,
echo = T, fig.width = 10, fig.height = 8)
options(width = 125, scipen = 5, digits = 5)
setwd("~/statistics/bio213/")
library(gdata)
lbw <- read.xls("~/statistics/bio213/lbw.xls")
lbw$smoke <- factor(lbw$smoke, levels = 0:1, labels = c("non-smoker","smoker"))
## Same data available online: http://www.umass.edu/statdata/statdata/data/index.html
## Cases 10 and 39 needs fix to make them identical to Dr. Orav's dataset
## lbw2 <- read.xls("http://www.umass.edu/statdata/statdata/data/lowbwt.xls")
## lbw2[c(10,39),"BWT"] <- c(2655,3035)
library(ggplot2)
plot.scatter <- ggplot(lbw, aes(x = lwt, y = bwt, color = smoke)) +
geom_point() +
xlim(0,300)
plot.scatter
## Intercept-only
model.null <- lm(bwt ~ 1, lbw)
## Smoking-only
model.smoke <- lm(bwt ~ smoke, lbw)
## Maternal weight-only
model.lwt <- lm(bwt ~ lwt, lbw)
## Both without interaction
model.smoke.lwt <- lm(bwt ~ smoke + lwt, lbw)
## Both with interaction
model.smoke.lwt.interaction <- lm(bwt ~ smoke + lwt + smoke:lwt, lbw)
plot.scatter +
geom_abline(aes(intercept = coef(model.null)["(Intercept)"],
slope = 0))
summary(model.null)
Call:
lm(formula = bwt ~ 1, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2235.8 -530.8 32.2 530.2 2045.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2945 53 55.5 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 729 on 188 degrees of freedom
df <- data.frame(a = c(coef(model.smoke)["(Intercept)"],
sum(coef(model.smoke)[c("(Intercept)","smokesmoker")])
),
b = 0)
plot.scatter +
geom_abline(aes(intercept = a,
slope = b,
color = c("non-smoker","smoker")
),
data = df)
summary(model.smoke)
Call:
lm(formula = bwt ~ smoke, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2064 -478 35 545 1935
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3055.0 66.9 45.64 <2e-16 ***
smokesmoker -281.4 107.0 -2.63 0.0092 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 718 on 187 degrees of freedom
Multiple R-squared: 0.0357, Adjusted R-squared: 0.0305
F-statistic: 6.92 on 1 and 187 DF, p-value: 0.00923
plot.scatter +
geom_abline(aes(intercept = coef(model.lwt)["(Intercept)"],
slope = coef(model.lwt)["lwt"]))
summary(model.lwt)
Call:
lm(formula = bwt ~ lwt, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2192 -504 -4 508 2076
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2367.77 228.41 10.37 <2e-16 ***
lwt 4.44 1.71 2.59 0.01 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 718 on 187 degrees of freedom
Multiple R-squared: 0.0348, Adjusted R-squared: 0.0296
F-statistic: 6.73 on 1 and 187 DF, p-value: 0.0102
df <- data.frame(a = c(coef(model.smoke.lwt)["(Intercept)"],
sum(coef(model.smoke.lwt)[c("(Intercept)","smokesmoker")])),
b = coef(model.smoke.lwt)["lwt"])
plot.scatter +
geom_abline(aes(intercept = a,
slope = b,
color = c("non-smoker","smoker")), data = df)
summary(model.smoke.lwt)
Call:
lm(formula = bwt ~ smoke + lwt, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2030.1 -447.2 26.6 514.4 1968.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2498.12 230.82 10.82 <2e-16 ***
smokesmoker -269.70 105.59 -2.55 0.011 *
lwt 4.25 1.69 2.52 0.013 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 708 on 186 degrees of freedom
Multiple R-squared: 0.0675, Adjusted R-squared: 0.0574
F-statistic: 6.73 on 2 and 186 DF, p-value: 0.00151
The p-value for the smoke = smoker variable is testing the significance of the intercept difference, which is not very useful.
df <- data.frame(
a = c(coef(model.smoke.lwt.interaction)["(Intercept)"],
sum(coef(model.smoke.lwt.interaction)[c("(Intercept)","smokesmoker")])),
b = c(coef(model.smoke.lwt.interaction)["lwt"],
sum(coef(model.smoke.lwt.interaction)[c("lwt","smokesmoker:lwt")]))
)
plot.scatter +
geom_abline(aes(intercept = a,
slope = b,
color = c("non-smoker","smoker")), data = df)
summary(model.smoke.lwt.interaction)
Call:
lm(formula = bwt ~ smoke + lwt + smoke:lwt, data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2040.3 -456.2 28.6 531.7 1977.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2347.51 312.72 7.51 2.5e-12 ***
smokesmoker 43.90 451.16 0.10 0.923
lwt 5.40 2.34 2.31 0.022 *
smokesmoker:lwt -2.42 3.39 -0.71 0.476
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 709 on 185 degrees of freedom
Multiple R-squared: 0.07, Adjusted R-squared: 0.055
F-statistic: 4.64 on 3 and 185 DF, p-value: 0.00372
To test the distance between lines at a different point along the X-axis, center the X-axis variable (lwt) at a point of interest (here 120). This is actually shifting the X-Y coordinate, and then testing the new intercept difference.
last_plot() + geom_vline(xintercept = c(0, 120))
summary(lm(formula = bwt ~ smoke + I(lwt - 120) + smoke:I(lwt - 120), data = lbw))
Call:
lm(formula = bwt ~ smoke + I(lwt - 120) + smoke:I(lwt - 120),
data = lbw)
Residuals:
Min 1Q Median 3Q Max
-2040.3 -456.2 28.6 531.7 1977.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2996.07 70.82 42.31 <2e-16 ***
smokesmoker -246.82 110.46 -2.23 0.027 *
I(lwt - 120) 5.40 2.34 2.31 0.022 *
smokesmoker:I(lwt - 120) -2.42 3.39 -0.71 0.476
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 709 on 185 degrees of freedom
Multiple R-squared: 0.07, Adjusted R-squared: 0.055
F-statistic: 4.64 on 3 and 185 DF, p-value: 0.00372
Now the smoke = smoker variable is significant, meaning the difference between these two lines at lwt = 120 is significant.