Created on 23 June 2013
Revised on Sun Jun 23 15:44:28 2013
Load Galton Data
library(UsingR)
## Warning: package 'UsingR' was built under R version 3.0.1
## Loading required package: MASS
data(galton)
par(mfrow = c(1, 2))
hist(galton$child, col = "blue", breaks = 100)
hist(galton$parent, col = "blue", breaks = 100)
The distribution of child heights
par(mfrow = c(1, 1))
hist(galton$child, col = "blue", breaks = 100)
Only know the child - average height
hist(galton$child, col = "blue", breaks = 100)
meanChild <- mean(galton$child)
lines(rep(meanChild, 100), seq(0, 150, length = 100), col = "red", lwd = 5)
What if we plot child versus average parent
plot(galton$parent, galton$child, pch = 19, col = "blue")
Jittered plot
set.seed(1234)
plot(jitter(galton$parent, factor = 2), jitter(galton$child, factor = 2), pch = 19,
col = "blue")
Average parent = 65 inches tall
plot(galton$parent, galton$child, pch = 19, col = "blue")
near65 <- galton[abs(galton$parent - 65) < 1, ]
points(near65$parent, near65$child, pch = 19, col = "red")
lines(seq(64, 66, length = 100), rep(mean(near65$child), 100), col = "red",
lwd = 4)
Average parent = 71 inches tall
plot(galton$parent, galton$child, pch = 19, col = "blue")
near71 <- galton[abs(galton$parent - 71) < 1, ]
points(near71$parent, near71$child, pch = 19, col = "red")
lines(seq(70, 72, length = 100), rep(mean(near71$child), 100), col = "red",
lwd = 4)
Fitting a line
plot(galton$parent, galton$child, pch = 19, col = "blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)
Why not this line?
plot(galton$parent, galton$child, pch = 19, col = "blue")
lines(galton$parent, 26 + 0.646 * galton$parent)
Not all points are on the line
plot(galton$parent, galton$child, pch = 19, col = "blue")
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)
Plot what is leftover
oldpar = par(mfrow = c(1, 2))
plot(galton$parent, galton$child, pch = 19, col = "blue")
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)
plot(galton$parent, lm1$residuals, col = "blue", pch = 19)
abline(c(0, 0), col = "red", lwd = 3)
par(oldpar)
Fit a line to the Galton Data
library(UsingR)
data(galton)
plot(galton$parent, galton$child, pch = 19, col = "blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)
lm1
##
## Call:
## lm(formula = galton$child ~ galton$parent)
##
## Coefficients:
## (Intercept) galton$parent
## 23.942 0.646
Create a “population” of 1 million families
newGalton <- data.frame(parent = rep(NA, 1e+06), child = rep(NA, 1e+06))
newGalton$parent <- rnorm(1e+06, mean = mean(galton$parent), sd = sd(galton$parent))
newGalton$child <- lm1$coeff[1] + lm1$coeff[2] * newGalton$parent + rnorm(1e+06,
sd = sd(lm1$residuals))
smoothScatter(newGalton$parent, newGalton$child)
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
abline(lm1, col = "red", lwd = 3)
Let's take a sample
set.seed(134325)
sampleGalton1 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm1 <- lm(sampleGalton1$child ~ sampleGalton1$parent)
plot(sampleGalton1$parent, sampleGalton1$child, pch = 19, col = "blue")
lines(sampleGalton1$parent, sampleLm1$fitted, lwd = 3, lty = 2)
abline(lm1, col = "red", lwd = 3)
Let's take another sample
sampleGalton2 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm2 <- lm(sampleGalton2$child ~ sampleGalton2$parent)
plot(sampleGalton2$parent, sampleGalton2$child, pch = 19, col = "blue")
lines(sampleGalton2$parent, sampleLm2$fitted, lwd = 3, lty = 2)
abline(lm1, col = "red", lwd = 3)
Let's take another sample
sampleGalton3 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm3 <- lm(sampleGalton3$child ~ sampleGalton3$parent)
plot(sampleGalton3$parent, sampleGalton3$child, pch = 19, col = "blue")
lines(sampleGalton3$parent, sampleLm3$fitted, lwd = 3, lty = 2)
abline(lm1, col = "red", lwd = 3)
Many samples
sampleLm <- vector(100, mode = "list")
for (i in 1:100) {
sampleGalton <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm[[i]] <- lm(sampleGalton$child ~ sampleGalton$parent)
}
smoothScatter(newGalton$parent, newGalton$child)
for (i in 1:100) {
abline(sampleLm[[i]], lwd = 3, lty = 2)
}
abline(lm1, col = "red", lwd = 3)
Histogram of estimates
oldpar = par(mfrow = c(1, 2))
hist(sapply(sampleLm, function(x) {
coef(x)[1]
}), col = "blue", xlab = "Intercept", main = "")
hist(sapply(sampleLm, function(x) {
coef(x)[2]
}), col = "blue", xlab = "Slope", main = "")
par(oldpar)
Estimating the values in R
sampleGalton4 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm4 <- lm(sampleGalton4$child ~ sampleGalton4$parent)
summary(sampleLm4)
##
## Call:
## lm(formula = sampleGalton4$child ~ sampleGalton4$parent)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.391 -1.805 -0.434 1.912 5.966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.859 13.831 0.71 0.47941
## sampleGalton4$parent 0.851 0.202 4.22 0.00011 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.58 on 48 degrees of freedom
## Multiple R-squared: 0.27, Adjusted R-squared: 0.255
## F-statistic: 17.8 on 1 and 48 DF, p-value: 0.000109
hist(sapply(sampleLm, function(x) {
coef(x)[2]
}), col = "blue", xlab = "Slope", main = "", freq = F)
lines(seq(0, 5, length = 100), dnorm(seq(0, 5, length = 100), mean = coef(sampleLm4)[2],
sd = summary(sampleLm4)$coeff[2, 2]), lwd = 3, col = "red")
Why do we standardize?
oldpar = par(mfrow = c(1, 2))
hist(sapply(sampleLm, function(x) {
coef(x)[1]
}), col = "blue", xlab = "Intercept", main = "")
hist(sapply(sampleLm, function(x) {
coef(x)[2]
}), col = "blue", xlab = "Slope", main = "")
par(oldpar)
t(n−2) versus N(0, 1)
x <- seq(-5, 5, length = 100)
plot(x, dnorm(x), type = "l", lwd = 3)
lines(x, dt(x, df = 3), lwd = 3, col = "red")
lines(x, dt(x, df = 10), lwd = 3, col = "blue")
Confidence intervals
summary(sampleLm4)$coeff
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.8590 13.8309 0.7128 0.4794056
## sampleGalton4$parent 0.8515 0.2019 4.2172 0.0001089
confint(sampleLm4, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -17.9499 37.668
## sampleGalton4$parent 0.4455 1.257
par(mar = c(4, 4, 0, 2))
plot(1:10, type = "n", xlim = c(0, 1.5), ylim = c(0, 100), xlab = "Coefficient Values",
ylab = "Replication")
for (i in 1:100) {
ci <- confint(sampleLm[[i]])
color = "red"
if ((ci[2, 1] < lm1$coeff[2]) & (lm1$coeff[2] < ci[2, 2])) {
color = "grey"
}
segments(ci[2, 1], i, ci[2, 2], i, col = color, lwd = 3)
}
lines(rep(lm1$coeff[2], 100), seq(0, 100, length = 100), lwd = 3)
How you report the inference
sampleLm4$coeff
## (Intercept) sampleGalton4$parent
## 9.8590 0.8515
confint(sampleLm4, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -17.9499 37.668
## sampleGalton4$parent 0.4455 1.257
A one inch increase in parental height is associated with a 0.77 inch increase in child's height (95% CI: 0.42-1.12 inches).