Goals of statistical modeling

Example: Average parent and child heights

Load Galton Data

library(UsingR)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: replacing previous import by 'ggplot2::arrow' when loading 'Hmisc'
## Warning: replacing previous import by 'ggplot2::unit' when loading 'Hmisc'
## Warning: replacing previous import by 'scales::alpha' when loading 'Hmisc'
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
## 
## 
## Attaching package: 'UsingR'
## 
## The following object is masked from 'package:survival':
## 
##     cancer
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(UsingR)
library(manipulate)
library(reshape)
## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:dplyr':
## 
##     rename
data(galton)
par(mfrow=c(1,2))
hist(galton$child,col="blue",breaks=100)
hist(galton$parent,col="blue",breaks=100)

##or the ggplot way 
 
long <- melt(galton)
## Using  as id variables
g <- ggplot(long, aes(x = value, fill = variable)) 
g <- g + geom_histogram(colour = "black", binwidth=1) 
g <- g + facet_grid(. ~ variable)
g

The distribution of child heights

hist(galton$child,col="blue",breaks=100)

myHist<- function(mu){
  hist(galton$child,col="blue",breaks=100)
  lines(c(mu,mu),c(0,150),col="red",lwd=5)
  mse<- mean((galton$child-mu)^2)
  text(63,150,paste("MSE",round(mse,2)))
}

##manipulate(myHist(mu),mu=slider(62,74,step=0.5))

Only know the child - average height

hist(galton$child,col="blue",breaks=100)
meanChild <- mean(galton$child)
lines(rep(meanChild,100),seq(0,150,length=100),col="red",lwd=5)

Experiment

Use R studio’s manipulate to see what value of \(\mu\) minimizes the

sum of the squared deviations.

myHist <- function(mu){
  mse <- mean((galton$child - mu)^2)
  g<-ggplot(galton,aes(x=child))+
    geom_histogram(fill="salmon",colour="black",binwidth=1)
  g <- g + geom_vline(xintercept = mu, size = 3)
  g <- g + ggtitle(paste("mu= ",mu,",MSE= ",round(mse,2),sep = ""))
  g
}
#manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))

myHist(mean(galton$child))

####old version minimise mu for childrens heights

myHist <- function(mu){ 
   hist(galton$child,col="blue",breaks=100) 
   lines(c(mu, mu), c(0, 150),col="red",lwd=5) 
   mse <- mean((galton$child - mu)^2) 
   text(63, 150, paste("mu = ", mu)) 
   text(63, 140, paste("MSE = ", round(mse, 2))) 
 } 
 #manipulate(myHist(mu), mu = slider(62, 74, step = 0.5)) 

Comparing childrens’ heights and their parents’ heights

What if we plot child versus average parent

plot(galton$parent,galton$child,pch=19,col="blue")

##better plot 

freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g+geom_point(colour="grey50",aes(size = freq+20,show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")                    
g

## Regression through the origin * Suppose that \(X_i\) are the parents’ heights. * Consider picking the slope \(\beta\) that minimizes \[\sum_{i=1}^n (Y_i - X_i \beta)^2\] * This is exactly using the origin as a pivot point picking the line that minimizes the sum of the squared vertical distances of the points to the line * Use R studio’s manipulate function to experiment * Subtract the means so that the origin is the mean of the parent and children’s heights


y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
myPlot <- function(beta){
    g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
    g <- g  + scale_size(range = c(2, 20), guide = "none" )
    g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
    g <- g + geom_point(aes(colour=freq, size = freq))
    g <- g + scale_colour_gradient(low = "lightblue", high="white")                     
    g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
    mse <- mean( (y - beta * x) ^2 )
    g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
    g
}
#manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))

General least squares for linear equations

Consider again the parent and child height data from Galton

freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")  
g

Results

Revisiting Galton’s data

Reversing the outcome/predictor relationship

beta1 <- cor(y, x) *  sd(x) / sd(y)
beta0 <- mean(x) - beta1 * mean(y)
rbind(c(beta0, beta1), coef(lm(x ~ y)))
##      (Intercept)         y
## [1,]    46.13535 0.3256475
## [2,]    46.13535 0.3256475

Revisiting Galton’s data

Normalizing variables results in the slope being the correlation

yn <- (y - mean(y))/sd(y)
xn <- (x - mean(x))/sd(x)
c(cor(y, x), cor(yn, xn), coef(lm(yn ~ xn))[2])
##                            xn 
## 0.4587624 0.4587624 0.4587624

Regression to the mean

g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")  
g <- g + geom_smooth(method="lm", formula=y~x)
g

Jittered plot

set.seed(1234)
plot(jitter(galton$parent,factor=2),jitter(galton$child,factor=2),pch=19,col="blue")

ggplot(galton, aes(x = parent, y = child)) + geom_point()

## Average parent = 65 inches tall

plot(galton$parent,galton$child,pch=19,col="blue")
near65 <- galton[abs(galton$parent - 65)<1, ]
points(near65$parent,near65$child,pch=19,col="red")
lines(seq(64,66,length=100),rep(mean(near65$child),100),col="red",lwd=4)

## Average parent = 71 inches tall

plot(galton$parent,galton$child,pch=19,col="blue")
near71 <- galton[abs(galton$parent - 71)<1, ]
points(near71$parent,near71$child,pch=19,col="red")
lines(seq(70,72,length=100),rep(mean(near71$child),100),col="red",lwd=4)

Fitting a line

plot(galton$parent,galton$child,pch=19,col="blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent,lm1$fitted,col="red",lwd=3)


Why not this line?

plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent, 26 + 0.646*galton$parent)

y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))

myPlot <- function(beta){
    g <- ggplot(filter(freqData,freq > 0),aes(x = parent, y = child))
    g <- g  + scale_size(range = c(2, 20), guide = "none" )
    g <- g +geom_point(colour="grey50",aes(size =freq+20,show_guide = FALSE))
    g <- g + geom_point(aes(colour=freq, size = freq))
    g <- g + scale_colour_gradient(low = "lightblue", high="white")                     
    g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
    mse <- mean( (y - beta * x) ^2 )
    g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
    g
}
#manipulate(myPlot(beta),beta = slider(0.6, 1.2,step = 0.02))

The equation for a line

If \(C_i\) is the height of child \(i\) and \(P_i\) is the height of the average parent, then we can imagine writing the equation for a line

\[C_i = b_0 + b_1 P_i\]


Not all points are on the line

plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent,lm1$fitted,col="red",lwd=3)

# load father.son data
data(father.son)
# normalize son's height
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
# normalize father's height
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
# calculate correlation
rho <- cor(x, y)
# plot the relationship between the two
g = ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g = g + geom_point(size = 3, alpha = .2, colour = "black")
g = g + geom_point(size = 2, alpha = .2, colour = "red")
g = g + xlim(-4, 4) + ylim(-4, 4)
# reference line for perfect correlation between
# variables (data points will like on diagonal line)
g = g + geom_abline(intercept = 0, slope = 1)
#g = g + geom_abline(position = "identity")
# if there is no correlation between the two variables,
# the data points will lie on horizontal/vertical lines
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
# plot the actual correlation for both
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
# add appropriate labels
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")
g = g + geom_text(x = 3.8, y = 1.6, label="x~y", angle = 25) +
geom_text(x = 3.2, y = 3.6, label="cor(y,x)=1", angle = 35) +
geom_text(x = 1.6, y = 3.8, label="y~x", angle = 60)
g

Regression to the mean


data(father.son)
y<-(father.son$sheight-mean(father.son$sheight))/sd(father.son$sheight)
x<-(father.son$fheight-mean(father.son$fheight))/sd(father.son$fheight)
rho <- cor(x, y)


g = ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g = g + geom_point(size = 6, colour = "black", alpha = 0.2)
g = g + geom_point(size = 4, colour = "salmon", alpha = 0.2)
g = g + xlim(-4, 4) + ylim(-4, 4)
g = g + geom_abline(intercept = 0, slope = 1)
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g

g = ggplot(data.frame(x, y), aes(x = x, y = y))
g = g + geom_point(size = 5, alpha = .2, colour = "black")
g = g + geom_point(size = 4, alpha = .2, colour = "red")
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
g = g + geom_abline(intercept = 0, slope = 1)
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")

g

Normalizing the data and setting plotting parameters

library(UsingR)
data(father.son)
y<-(father.son$sheight-mean(father.son$sheight))/sd(father.son$sheight)
x<-(father.son$fheight-mean(father.son$fheight))/sd(father.son$fheight)
rho <- cor(x, y)
myPlot <- function(x, y) {
  plot(x, y, 
       xlab = "Father's height, normalized",
       ylab = "Son's height, normalized",
       xlim = c(-3, 3), ylim = c(-3, 3),
       bg = "lightblue", col = "black", cex = 1.1, pch = 21, 
       frame = FALSE)
}

## Plot the data, code

myPlot(x, y)
abline(0, 1) # if there were perfect correlation
abline(0, rho, lwd = 2) # father predicts son
abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis
abline(h = 0); abline(v = 0) # reference lines for no relathionship


Allowing for variation

If \(C_i\) is the height of child \(i\) and \(P_i\) is the height of the average parent, then we can imagine writing the equation for a line

\[C_i = b_0 + b_1 P_i + e_i\]

\(e_i\) is everything we didn t measure (how much they eat, where they live, do they stretch in the morning…)


How do we pick best?

If \(C_i\) is the height of child \(i\) and \(P_i\) is the height of the average parent, pick the line that makes the child values \(C_i\) and our guesses

\[ \sum_{i=1}^{928}(C_i - \{b_0 + b_1 P_i\})^2 \]


Plot what is leftover

par(mfrow=c(1,2))
plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent,lm1$fitted,col="red",lwd=3)
plot(galton$parent,lm1$residuals,col="blue",pch=19)
abline(c(0,0),col="red",lwd=3)

## The least squares est. is the empirical mean
library(manipulate)
myHist <- function(mu){
    mse <- mean((galton$child - mu)^2)
    g <- ggplot(galton, aes(x = child)) +
geom_histogram(fill = "salmon", colour = "black", binwidth=1)
    g <- g + geom_vline(xintercept = mu, size = 3)
    g <- g + ggtitle(paste("mu = ", mu, ", MSE = ", round(mse, 2), sep = ""))
    g
}
#manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))

####The solution

lm(I(child - mean(child))~ I(parent - mean(parent)) - 1, data = galton)
## 
## Call:
## lm(formula = I(child - mean(child)) ~ I(parent - mean(parent)) - 
##     1, data = galton)
## 
## Coefficients:
## I(parent - mean(parent))  
##                   0.6463
freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")                    
lm1 <- lm(galton$child ~ galton$parent)
g <- g + geom_abline(intercept = coef(lm1)[1], slope = coef(lm1)[2], size = 3, colour = grey(.5))
g

data(father.son) 
 y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight) 
 x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight) 
 rho <- cor(x, y) 
 myPlot <- function(x, y) { 
   plot(x, y,  
        xlab = "Father's height, normalized", 
        ylab = "Son's height, normalized", 
        xlim = c(-3, 3), ylim = c(-3, 3), 
        bg  ="lightblue", col = "black", cex  =1.1, pch = 21,  
        frame = FALSE) 
 } 
myPlot(x, y) 
 abline(0, 1) # if there were perfect correlation 
 abline(0, rho, lwd =2) # father predicts son 
 abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis 
 abline(h = 0); abline(v = 0) # reference lines for no relathionship 

### Dalton’s Investigation on Regression to the Mean ## Basic regression model with additive Gaussian errors. ## Plot of the data

Getting a more interpretable intercept

fit2 <- lm(price ~ I(carat - mean(carat)), data = diamond)
coef(fit2)
##            (Intercept) I(carat - mean(carat)) 
##               500.0833              3721.0249

Thus $500.1 is the expected price for the average sized diamond of the data (0.2041667 carats).

Predicting the price of a diamond

newx <- c(0.16, 0.27, 0.34)
coef(fit)[1] + coef(fit)[2] * newx
## [1]  335.7381  745.0508 1005.5225
predict(fit, newdata = data.frame(carat = newx))
##         1         2         3 
##  335.7381  745.0508 1005.5225

Predicted values at the observed Xs (red) and at the new Xs (lines)