Goals of statistical modeling

Example: Average parent and child heights

Load Galton Data

library(UsingR)

## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: grid
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2

## Warning: replacing previous import by 'ggplot2::arrow' when loading 'Hmisc'

## Warning: replacing previous import by 'ggplot2::unit' when loading 'Hmisc'

## Warning: replacing previous import by 'scales::alpha' when loading 'Hmisc'

## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
## 
## 
## Attaching package: 'UsingR'
## 
## The following object is masked from 'package:survival':
## 
##     cancer

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:Hmisc':
## 
##     combine, src, summarize
## 
## The following object is masked from 'package:MASS':
## 
##     select
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(UsingR)
library(manipulate)
library(reshape)

## 
## Attaching package: 'reshape'
## 
## The following object is masked from 'package:dplyr':
## 
##     rename

data(galton)
par(mfrow=c(1,2))
hist(galton$child,col="blue",breaks=100)
hist(galton$parent,col="blue",breaks=100)

##or the ggplot way 
 
long <- melt(galton)

## Using  as id variables

g <- ggplot(long, aes(x = value, fill = variable)) 
g <- g + geom_histogram(colour = "black", binwidth=1) 
g <- g + facet_grid(. ~ variable)
g

The distribution of child heights

hist(galton$child,col="blue",breaks=100)

myHist<- function(mu){
  hist(galton$child,col="blue",breaks=100)
  lines(c(mu,mu),c(0,150),col="red",lwd=5)
  mse<- mean((galton$child-mu)^2)
  text(63,150,paste("MSE",round(mse,2)))
}

##manipulate(myHist(mu),mu=slider(62,74,step=0.5))

Only know the child - average height

hist(galton$child,col="blue",breaks=100)
meanChild <- mean(galton$child)
lines(rep(meanChild,100),seq(0,150,length=100),col="red",lwd=5)

Experiment

Use R studio’s manipulate to see what value of $\mu$ minimizes the

sum of the squared deviations.

myHist <- function(mu){
  mse <- mean((galton$child - mu)^2)
  g<-ggplot(galton,aes(x=child))+
    geom_histogram(fill="salmon",colour="black",binwidth=1)
  g <- g + geom_vline(xintercept = mu, size = 3)
  g <- g + ggtitle(paste("mu= ",mu,",MSE= ",round(mse,2),sep = ""))
  g
}
#manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))

myHist(mean(galton$child))

####old version minimise mu for childrens heights

myHist <- function(mu){ 
   hist(galton$child,col="blue",breaks=100) 
   lines(c(mu, mu), c(0, 150),col="red",lwd=5) 
   mse <- mean((galton$child - mu)^2) 
   text(63, 150, paste("mu = ", mu)) 
   text(63, 140, paste("MSE = ", round(mse, 2))) 
 } 
 #manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))

Comparing childrens’ heights and their parents’ heights

What if we plot child versus average parent

plot(galton$parent,galton$child,pch=19,col="blue")

##better plot 

freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g+geom_point(colour="grey50",aes(size = freq+20,show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")                    
g

## Regression through the origin * Suppose that $X_i$ are the parents’ heights. * Consider picking the slope $\beta$ that minimizes \[\sum_{i=1}^n (Y_i - X_i \beta)^2\] * This is exactly using the origin as a pivot point picking the line that minimizes the sum of the squared vertical distances of the points to the line * Use R studio’s manipulate function to experiment * Subtract the means so that the origin is the mean of the parent and children’s heights

y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
myPlot <- function(beta){
    g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
    g <- g  + scale_size(range = c(2, 20), guide = "none" )
    g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
    g <- g + geom_point(aes(colour=freq, size = freq))
    g <- g + scale_colour_gradient(low = "lightblue", high="white")                     
    g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
    mse <- mean( (y - beta * x) ^2 )
    g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
    g
}
#manipulate(myPlot(beta), beta = slider(0.6, 1.2, step = 0.02))

General least squares for linear equations

Consider again the parent and child height data from Galton

freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")  
g

Results

The least squares model fit to the line $Y = \beta_0 + \beta_1 X$ through the data pairs $(X_i, Y_i)$ with $Y_i$ as the outcome obtains the line $Y = \hat \beta_0 + \hat \beta_1 X$ where \[\hat \beta_1 = Cor(Y, X) \frac{Sd(Y)}{Sd(X)} ~~~ \hat \beta_0 = \bar Y - \hat \beta_1 \bar X\]
$\hat \beta_1$ has the units of $Y / X$, $\hat \beta_0$ has the units of $Y$.
The line passes through the point $(\bar X, \bar Y$)
The slope of the regression line with $X$ as the outcome and $Y$ as the predictor is $Cor(Y, X) Sd(X)/ Sd(Y)$.
The slope is the same one you would get if you centered the data, $(X_i - \bar X, Y_i - \bar Y)$, and did regression through the origin.
If you normalized the data, $\{ \frac{X_i - \bar X}{Sd(X)}, \frac{Y_i - \bar Y}{Sd(Y)}\}$, the slope is $Cor(Y, X)$.

Revisiting Galton’s data

Reversing the outcome/predictor relationship

beta1 <- cor(y, x) *  sd(x) / sd(y)
beta0 <- mean(x) - beta1 * mean(y)
rbind(c(beta0, beta1), coef(lm(x ~ y)))

##      (Intercept)         y
## [1,]    46.13535 0.3256475
## [2,]    46.13535 0.3256475

Revisiting Galton’s data

Normalizing variables results in the slope being the correlation

yn <- (y - mean(y))/sd(y)
xn <- (x - mean(x))/sd(x)
c(cor(y, x), cor(yn, xn), coef(lm(yn ~ xn))[2])

##                            xn 
## 0.4587624 0.4587624 0.4587624

Regression to the mean

g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")  
g <- g + geom_smooth(method="lm", formula=y~x)
g

Jittered plot

set.seed(1234)
plot(jitter(galton$parent,factor=2),jitter(galton$child,factor=2),pch=19,col="blue")

ggplot(galton, aes(x = parent, y = child)) + geom_point()

## Average parent = 65 inches tall

plot(galton$parent,galton$child,pch=19,col="blue")
near65 <- galton[abs(galton$parent - 65)<1, ]
points(near65$parent,near65$child,pch=19,col="red")
lines(seq(64,66,length=100),rep(mean(near65$child),100),col="red",lwd=4)

## Average parent = 71 inches tall

plot(galton$parent,galton$child,pch=19,col="blue")
near71 <- galton[abs(galton$parent - 71)<1, ]
points(near71$parent,near71$child,pch=19,col="red")
lines(seq(70,72,length=100),rep(mean(near71$child),100),col="red",lwd=4)

Fitting a line

plot(galton$parent,galton$child,pch=19,col="blue")
lm1 <- lm(galton$child ~ galton$parent)
lines(galton$parent,lm1$fitted,col="red",lwd=3)

Why not this line?

plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent, 26 + 0.646*galton$parent)

y <- galton$child - mean(galton$child)
x <- galton$parent - mean(galton$parent)
freqData <- as.data.frame(table(x, y))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))

myPlot <- function(beta){
    g <- ggplot(filter(freqData,freq > 0),aes(x = parent, y = child))
    g <- g  + scale_size(range = c(2, 20), guide = "none" )
    g <- g +geom_point(colour="grey50",aes(size =freq+20,show_guide = FALSE))
    g <- g + geom_point(aes(colour=freq, size = freq))
    g <- g + scale_colour_gradient(low = "lightblue", high="white")                     
    g <- g + geom_abline(intercept = 0, slope = beta, size = 3)
    mse <- mean( (y - beta * x) ^2 )
    g <- g + ggtitle(paste("beta = ", beta, "mse = ", round(mse, 3)))
    g
}
#manipulate(myPlot(beta),beta = slider(0.6, 1.2,step = 0.02))

The equation for a line

If $C_i$ is the height of child $i$ and $P_i$ is the height of the average parent, then we can imagine writing the equation for a line

\[C_i = b_0 + b_1 P_i\]

Not all points are on the line

plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent,lm1$fitted,col="red",lwd=3)

# load father.son data
data(father.son)
# normalize son's height
y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight)
# normalize father's height
x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight)
# calculate correlation
rho <- cor(x, y)
# plot the relationship between the two
g = ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g = g + geom_point(size = 3, alpha = .2, colour = "black")
g = g + geom_point(size = 2, alpha = .2, colour = "red")
g = g + xlim(-4, 4) + ylim(-4, 4)
# reference line for perfect correlation between
# variables (data points will like on diagonal line)
g = g + geom_abline(intercept = 0, slope = 1)
#g = g + geom_abline(position = "identity")
# if there is no correlation between the two variables,
# the data points will lie on horizontal/vertical lines
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
# plot the actual correlation for both
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
# add appropriate labels
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")
g = g + geom_text(x = 3.8, y = 1.6, label="x~y", angle = 25) +
geom_text(x = 3.2, y = 3.6, label="cor(y,x)=1", angle = 35) +
geom_text(x = 1.6, y = 3.8, label="y~x", angle = 60)
g

Regression to the mean

Suppose that we normalize $X$ (child’s height) and $Y$ (parent’s height) so that they both have mean 0 and variance 1.
Then, recall, our regression line passes through $(0, 0)$ (the mean of the X and Y).
If the slope of the regression line is $Cor(Y,X)$, regardless of which variable is the outcome (recall, both standard deviations are 1).
Notice if $X$ is the outcome and you create a plot where $X$ is the horizontal axis, the slope of the least squares line that you plot is $1/Cor(Y, X)$.

data(father.son)
y<-(father.son$sheight-mean(father.son$sheight))/sd(father.son$sheight)
x<-(father.son$fheight-mean(father.son$fheight))/sd(father.son$fheight)
rho <- cor(x, y)


g = ggplot(data.frame(x = x, y = y), aes(x = x, y = y))
g = g + geom_point(size = 6, colour = "black", alpha = 0.2)
g = g + geom_point(size = 4, colour = "salmon", alpha = 0.2)
g = g + xlim(-4, 4) + ylim(-4, 4)
g = g + geom_abline(intercept = 0, slope = 1)
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g

g = ggplot(data.frame(x, y), aes(x = x, y = y))
g = g + geom_point(size = 5, alpha = .2, colour = "black")
g = g + geom_point(size = 4, alpha = .2, colour = "red")
g = g + geom_vline(xintercept = 0)
g = g + geom_hline(yintercept = 0)
g = g + geom_abline(intercept = 0, slope = 1)
g = g + geom_abline(intercept = 0, slope = rho, size = 2)
g = g + geom_abline(intercept = 0, slope = 1 / rho, size = 2)
g = g + xlab("Father's height, normalized")
g = g + ylab("Son's height, normalized")

g

Normalizing the data and setting plotting parameters

library(UsingR)
data(father.son)
y<-(father.son$sheight-mean(father.son$sheight))/sd(father.son$sheight)
x<-(father.son$fheight-mean(father.son$fheight))/sd(father.son$fheight)
rho <- cor(x, y)
myPlot <- function(x, y) {
  plot(x, y, 
       xlab = "Father's height, normalized",
       ylab = "Son's height, normalized",
       xlim = c(-3, 3), ylim = c(-3, 3),
       bg = "lightblue", col = "black", cex = 1.1, pch = 21, 
       frame = FALSE)
}

## Plot the data, code

myPlot(x, y)
abline(0, 1) # if there were perfect correlation
abline(0, rho, lwd = 2) # father predicts son
abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis
abline(h = 0); abline(v = 0) # reference lines for no relathionship

Allowing for variation

If $C_i$ is the height of child $i$ and $P_i$ is the height of the average parent, then we can imagine writing the equation for a line

\[C_i = b_0 + b_1 P_i + e_i\]

$e_i$ is everything we didn t measure (how much they eat, where they live, do they stretch in the morning…)

How do we pick best?

If $C_i$ is the height of child $i$ and $P_i$ is the height of the average parent, pick the line that makes the child values $C_i$ and our guesses

\[ \sum_{i=1}^{928}(C_i - \{b_0 + b_1 P_i\})^2 \]

Plot what is leftover

par(mfrow=c(1,2))
plot(galton$parent,galton$child,pch=19,col="blue")
lines(galton$parent,lm1$fitted,col="red",lwd=3)
plot(galton$parent,lm1$residuals,col="blue",pch=19)
abline(c(0,0),col="red",lwd=3)

## The least squares est. is the empirical mean
library(manipulate)
myHist <- function(mu){
    mse <- mean((galton$child - mu)^2)
    g <- ggplot(galton, aes(x = child)) +
geom_histogram(fill = "salmon", colour = "black", binwidth=1)
    g <- g + geom_vline(xintercept = mu, size = 3)
    g <- g + ggtitle(paste("mu = ", mu, ", MSE = ", round(mse, 2), sep = ""))
    g
}
#manipulate(myHist(mu), mu = slider(62, 74, step = 0.5))

####The solution

lm(I(child - mean(child))~ I(parent - mean(parent)) - 1, data = galton)

## 
## Call:
## lm(formula = I(child - mean(child)) ~ I(parent - mean(parent)) - 
##     1, data = galton)
## 
## Coefficients:
## I(parent - mean(parent))  
##                   0.6463

freqData <- as.data.frame(table(galton$child, galton$parent))
names(freqData) <- c("child", "parent", "freq")
freqData$child <- as.numeric(as.character(freqData$child))
freqData$parent <- as.numeric(as.character(freqData$parent))
g <- ggplot(filter(freqData, freq > 0), aes(x = parent, y = child))
g <- g  + scale_size(range = c(2, 20), guide = "none" )
g <- g + geom_point(colour="grey50", aes(size = freq+20, show_guide = FALSE))
g <- g + geom_point(aes(colour=freq, size = freq))
g <- g + scale_colour_gradient(low = "lightblue", high="white")                    
lm1 <- lm(galton$child ~ galton$parent)
g <- g + geom_abline(intercept = coef(lm1)[1], slope = coef(lm1)[2], size = 3, colour = grey(.5))
g

data(father.son) 
 y <- (father.son$sheight - mean(father.son$sheight)) / sd(father.son$sheight) 
 x <- (father.son$fheight - mean(father.son$fheight)) / sd(father.son$fheight) 
 rho <- cor(x, y) 
 myPlot <- function(x, y) { 
   plot(x, y,  
        xlab = "Father's height, normalized", 
        ylab = "Son's height, normalized", 
        xlim = c(-3, 3), ylim = c(-3, 3), 
        bg  ="lightblue", col = "black", cex  =1.1, pch = 21,  
        frame = FALSE) 
 } 
myPlot(x, y) 
 abline(0, 1) # if there were perfect correlation 
 abline(0, rho, lwd =2) # father predicts son 
 abline(0, 1 / rho, lwd = 2) # son predicts father, son on vertical axis 
 abline(h = 0); abline(v = 0) # reference lines for no relathionship

### Dalton’s Investigation on Regression to the Mean ## Basic regression model with additive Gaussian errors. ## Plot of the data

Getting a more interpretable intercept

fit2 <- lm(price ~ I(carat - mean(carat)), data = diamond)
coef(fit2)

##            (Intercept) I(carat - mean(carat)) 
##               500.0833              3721.0249

Thus $500.1 is the expected price for the average sized diamond of the data (0.2041667 carats).

Predicting the price of a diamond

newx <- c(0.16, 0.27, 0.34)
coef(fit)[1] + coef(fit)[2] * newx

## [1]  335.7381  745.0508 1005.5225

predict(fit, newdata = data.frame(carat = newx))

##         1         2         3 
##  335.7381  745.0508 1005.5225

Predicted values at the observed Xs (red) and at the new Xs (lines)

Goals of statistical modeling

Example: Average parent and child heights

Load Galton Data

The distribution of child heights

Only know the child - average height

Experiment

Use R studio’s manipulate to see what value of \(\mu\) minimizes the

Comparing childrens’ heights and their parents’ heights

What if we plot child versus average parent

General least squares for linear equations

Results

Revisiting Galton’s data

Reversing the outcome/predictor relationship

Revisiting Galton’s data

Normalizing variables results in the slope being the correlation

Regression to the mean

Jittered plot

Fitting a line

Why not this line?

The equation for a line

Not all points are on the line

Regression to the mean

Normalizing the data and setting plotting parameters

Allowing for variation

How do we pick best?

Plot what is leftover

Getting a more interpretable intercept

Predicting the price of a diamond