<< Data Analysis >> Note part 3

Created on 23 June 2013
Revised on Sun Jun 23 15:44:28 2013

13 Basic least squares

Load Galton Data

library(UsingR)
## Warning: package 'UsingR' was built under R version 3.0.1
## Loading required package: MASS
data(galton)
par(mfrow = c(1, 2))
hist(galton$child, col = "blue", breaks = 100)
hist(galton$parent, col = "blue", breaks = 100)

plot of chunk unnamed-chunk-1

The distribution of child heights

par(mfrow = c(1, 1))
hist(galton$child, col = "blue", breaks = 100)

plot of chunk unnamed-chunk-2

Only know the child - average height

hist(galton$child, col = "blue", breaks = 100)
meanChild <- mean(galton$child)
lines(rep(meanChild, 100), seq(0, 150, length = 100), col = "red", lwd = 5)

plot of chunk unnamed-chunk-3

What if we plot child versus average parent

plot(galton$parent, galton$child, pch = 19, col = "blue")

plot of chunk unnamed-chunk-4

Jittered plot

set.seed(1234)
plot(jitter(galton$parent, factor = 2), jitter(galton$child, factor = 2), pch = 19, 
    col = "blue")

plot of chunk unnamed-chunk-5

Average parent = 65 inches tall

plot(galton$parent, galton$child, pch = 19, col = "blue")
near65 <- galton[abs(galton$parent - 65) < 1, ]
points(near65$parent, near65$child, pch = 19, col = "red")
lines(seq(64, 66, length = 100), rep(mean(near65$child), 100), col = "red", 
    lwd = 4)

plot of chunk unnamed-chunk-6

Average parent = 71 inches tall

plot(galton$parent, galton$child, pch = 19, col = "blue")
near71 <- galton[abs(galton$parent - 71) < 1, ]
points(near71$parent, near71$child, pch = 19, col = "red")
lines(seq(70, 72, length = 100), rep(mean(near71$child), 100), col = "red", 
    lwd = 4)

plot of chunk unnamed-chunk-7

Fitting a line

plot(galton$parent, galton$child, pch = 19, col = "blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)

plot of chunk unnamed-chunk-8

Why not this line?

plot(galton$parent, galton$child, pch = 19, col = "blue")
lines(galton$parent, 26 + 0.646 * galton$parent)

plot of chunk unnamed-chunk-9

Not all points are on the line

plot(galton$parent, galton$child, pch = 19, col = "blue")
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)

plot of chunk unnamed-chunk-10

Plot what is leftover

oldpar = par(mfrow = c(1, 2))
plot(galton$parent, galton$child, pch = 19, col = "blue")
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)
plot(galton$parent, lm1$residuals, col = "blue", pch = 19)
abline(c(0, 0), col = "red", lwd = 3)

plot of chunk unnamed-chunk-11

par(oldpar)

14 Inference basics

Fit a line to the Galton Data

library(UsingR)
data(galton)
plot(galton$parent, galton$child, pch = 19, col = "blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent, lm1$fitted, col = "red", lwd = 3)

plot of chunk unnamed-chunk-12

lm1
## 
## Call:
## lm(formula = galton$child ~ galton$parent)
## 
## Coefficients:
##   (Intercept)  galton$parent  
##        23.942          0.646

Create a “population” of 1 million families

newGalton <- data.frame(parent = rep(NA, 1e+06), child = rep(NA, 1e+06))
newGalton$parent <- rnorm(1e+06, mean = mean(galton$parent), sd = sd(galton$parent))
newGalton$child <- lm1$coeff[1] + lm1$coeff[2] * newGalton$parent + rnorm(1e+06, 
    sd = sd(lm1$residuals))
smoothScatter(newGalton$parent, newGalton$child)
## KernSmooth 2.23 loaded Copyright M. P. Wand 1997-2009
abline(lm1, col = "red", lwd = 3)

plot of chunk unnamed-chunk-13

Let's take a sample

set.seed(134325)
sampleGalton1 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm1 <- lm(sampleGalton1$child ~ sampleGalton1$parent)
plot(sampleGalton1$parent, sampleGalton1$child, pch = 19, col = "blue")
lines(sampleGalton1$parent, sampleLm1$fitted, lwd = 3, lty = 2)
abline(lm1, col = "red", lwd = 3)

plot of chunk unnamed-chunk-14

Let's take another sample

sampleGalton2 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm2 <- lm(sampleGalton2$child ~ sampleGalton2$parent)
plot(sampleGalton2$parent, sampleGalton2$child, pch = 19, col = "blue")
lines(sampleGalton2$parent, sampleLm2$fitted, lwd = 3, lty = 2)
abline(lm1, col = "red", lwd = 3)

plot of chunk unnamed-chunk-15

Let's take another sample

sampleGalton3 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm3 <- lm(sampleGalton3$child ~ sampleGalton3$parent)
plot(sampleGalton3$parent, sampleGalton3$child, pch = 19, col = "blue")
lines(sampleGalton3$parent, sampleLm3$fitted, lwd = 3, lty = 2)
abline(lm1, col = "red", lwd = 3)

plot of chunk unnamed-chunk-16

Many samples

sampleLm <- vector(100, mode = "list")

for (i in 1:100) {
    sampleGalton <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
    sampleLm[[i]] <- lm(sampleGalton$child ~ sampleGalton$parent)
}

smoothScatter(newGalton$parent, newGalton$child)

for (i in 1:100) {
    abline(sampleLm[[i]], lwd = 3, lty = 2)
}

abline(lm1, col = "red", lwd = 3)

plot of chunk unnamed-chunk-17

Histogram of estimates

oldpar = par(mfrow = c(1, 2))
hist(sapply(sampleLm, function(x) {
    coef(x)[1]
}), col = "blue", xlab = "Intercept", main = "")
hist(sapply(sampleLm, function(x) {
    coef(x)[2]
}), col = "blue", xlab = "Slope", main = "")

plot of chunk unnamed-chunk-18

par(oldpar)

Estimating the values in R

sampleGalton4 <- newGalton[sample(1:1e+06, size = 50, replace = F), ]
sampleLm4 <- lm(sampleGalton4$child ~ sampleGalton4$parent)
summary(sampleLm4)
## 
## Call:
## lm(formula = sampleGalton4$child ~ sampleGalton4$parent)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.391 -1.805 -0.434  1.912  5.966 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             9.859     13.831    0.71  0.47941    
## sampleGalton4$parent    0.851      0.202    4.22  0.00011 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.58 on 48 degrees of freedom
## Multiple R-squared:  0.27,   Adjusted R-squared:  0.255 
## F-statistic: 17.8 on 1 and 48 DF,  p-value: 0.000109
hist(sapply(sampleLm, function(x) {
    coef(x)[2]
}), col = "blue", xlab = "Slope", main = "", freq = F)
lines(seq(0, 5, length = 100), dnorm(seq(0, 5, length = 100), mean = coef(sampleLm4)[2], 
    sd = summary(sampleLm4)$coeff[2, 2]), lwd = 3, col = "red")

plot of chunk unnamed-chunk-20

Why do we standardize?

oldpar = par(mfrow = c(1, 2))
hist(sapply(sampleLm, function(x) {
    coef(x)[1]
}), col = "blue", xlab = "Intercept", main = "")
hist(sapply(sampleLm, function(x) {
    coef(x)[2]
}), col = "blue", xlab = "Slope", main = "")

plot of chunk unnamed-chunk-21

par(oldpar)

t(n−2) versus N(0, 1)

x <- seq(-5, 5, length = 100)
plot(x, dnorm(x), type = "l", lwd = 3)
lines(x, dt(x, df = 3), lwd = 3, col = "red")
lines(x, dt(x, df = 10), lwd = 3, col = "blue")

plot of chunk unnamed-chunk-22

Confidence intervals

summary(sampleLm4)$coeff
##                      Estimate Std. Error t value  Pr(>|t|)
## (Intercept)            9.8590    13.8309  0.7128 0.4794056
## sampleGalton4$parent   0.8515     0.2019  4.2172 0.0001089
confint(sampleLm4, level = 0.95)
##                         2.5 % 97.5 %
## (Intercept)          -17.9499 37.668
## sampleGalton4$parent   0.4455  1.257
par(mar = c(4, 4, 0, 2))
plot(1:10, type = "n", xlim = c(0, 1.5), ylim = c(0, 100), xlab = "Coefficient Values", 
    ylab = "Replication")
for (i in 1:100) {
    ci <- confint(sampleLm[[i]])
    color = "red"
    if ((ci[2, 1] < lm1$coeff[2]) & (lm1$coeff[2] < ci[2, 2])) {
        color = "grey"
    }
    segments(ci[2, 1], i, ci[2, 2], i, col = color, lwd = 3)
}
lines(rep(lm1$coeff[2], 100), seq(0, 100, length = 100), lwd = 3)

plot of chunk unnamed-chunk-24

How you report the inference

sampleLm4$coeff
##          (Intercept) sampleGalton4$parent 
##               9.8590               0.8515
confint(sampleLm4, level = 0.95)
##                         2.5 % 97.5 %
## (Intercept)          -17.9499 37.668
## sampleGalton4$parent   0.4455  1.257

A one inch increase in parental height is associated with a 0.77 inch increase in child's height (95% CI: 0.42-1.12 inches).