This demonstration uses a small data set to define a context for repeated sampling from a “population” with known slope and intercept to develope the sampling distributions of regression estimates. The data relate to predicting outcome in therapy (Y) from scores on a personality test (X).

We treat the X values as fixed as in the standard regression model. For each X value, generate sample Y’s, according to the model, \[ Y = 14 + 1.2 * X + sigma * N(0,1) \] where

Load packages

library(purrr)
library(broom)
library(dplyr)
library(ggplot2) 
library(car)

Read the data

therapy <- read.table(header=TRUE, text ='
name   sex perstest therapy
John    M 26 32
Susan   F 24 40
Mary    F 22 44
Paul    M 33 44
Jenny   F 27 48
Rick    M 36 52
Cathy   F 30 56
Robert  M 38 56
Lisa    F 30 60
Tina    F 34 68
')

Treat the therapy data as defining a ‘true model’

true_mod <- lm(therapy ~ perstest, data=therapy)

get tidy coefficients

tmod <- tidy(true_mod)
tmod
# A tibble: 2 x 5
  term        estimate std.error statistic p.value
  <chr>          <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)    14.      17.2       0.814  0.439 
2 perstest        1.20     0.566     2.12   0.0667

Add lower/upper 95% intervals

upper <- tmod$estimate +  1.96 * tmod$std.error
lower <- tmod$estimate -  1.96 * tmod$std.error
tmod <- cbind(tmod, lower, upper)
tmod
         term estimate std.error statistic p.value     lower  upper
1 (Intercept)     14.0   17.2047    0.8137 0.43934 -19.72112 47.721
2    perstest      1.2    0.5657    2.1213 0.06669   0.09126  2.309

Set up the simulation parameters

x <- therapy$perstest
n <- length(x)

samples <- 1000

# "true" parameters
b0 <- coef(true_mod)[1]   # 14
b1 <- coef(true_mod)[2]   # 1.2
sigma <- summary(true_mod)$sigma  # sqrt(80)
set.seed(1234)       # reproducibility

Generate one sample from the bivariate distribution of (x, y)

For the simulation, it is useful to define some simple functions to do a calculation for a single sample.

Then, one can use replicate(samples, fun()) to repeat over many samples.

one_sample <- function(x, b0=14, b1=1.2, sigma=sqrt(80)) {
    n <- length(x)
    y <- b0 + b1 * x + rnorm(n, mean=0, sd=sigma)
    data.frame(x, y)
}
one_sample(x, b0, b1, sigma)
    x     y
1  26 34.40
2  24 45.28
3  22 50.10
4  33 32.62
5  27 50.24
6  36 61.73
7  30 44.86
8  38 54.71
9  30 44.95
10 34 46.84

Do a regression for one sample

one_reg <- function(x) {
    simdata <- one_sample(x)
    lm(y ~ x, data=simdata)
}
one_reg(x)

Call:
lm(formula = y ~ x, data = simdata)

Coefficients:
(Intercept)            x  
      -1.16         1.67  

Generate the coefficients for all samples

Note that replicate() binds the results as columns of a matrix, but I want them as rows, of a data frame

all_coefs <- function(x, samples=5) {
    res <- replicate(n=samples, coef(one_reg(x)))
    rownames(res) <- c("b0", "b1")
    as.data.frame(t(res))
}

Apply this for all samples

coefs <- all_coefs(x, samples=samples)

head(coefs)   # show first 6
      b0     b1
1 22.584 0.7982
2 20.127 0.7673
3 17.365 0.9060
4  1.213 1.5431
5 15.949 1.3189
6 20.712 0.9637

Plot the sampling distribution of coefs

Plot the density of b1 estimates together with the theoretical normal density with true parameters. Note the use of stat_function() for the normal density (dnorm())

ggplot(coefs,  aes(x = b1) ) +
    geom_density(color="blue", fill = "blue", alpha = .2, size=1.5) +
    geom_vline( xintercept = b1, color="red", size=1.5) + 
    geom_vline(xintercept = lower[2]) +
    geom_vline(xintercept = upper[2]) +
    annotate("text", x=b1, y=0.05, label=paste("beta[1] ==", b1), 
             size=8, parse=TRUE) +
    stat_function(fun = dnorm, args = list(mean = b1, sd = .565)) +
        xlab("Slope (b1)") 

Plot the joint distribution of b0, b1

ggplot(coefs, aes(x=b1, y=b0)) +
            geom_point(color="blue", size=0.9) +
            stat_ellipse(level = 0.68, size=1.2) + 
            stat_ellipse(level = 0.95, size=1.05, linetype=1) +
            geom_point(x=b1, y=b0, color="red", size=4) +
            theme_minimal()

The very strong negative correlation between slope & intercept is perhaps a bit surprising. Here it is in numbers:

cor(coefs)          
        b0      b1
b0  1.0000 -0.9873
b1 -0.9873  1.0000

Plot the slopes & their individual CIs (Fig 4.2)

Add se, CI info to the slope estimate in on sample

one_slope_info <- function(x) {
    simdata <- one_sample(x)
    reg <- lm(y ~ x, data=simdata)
    treg <- tidy(reg)
    b1 <- as.numeric(treg[2,"estimate"])
    se <- as.numeric(treg[2,"std.error"])
    conf <- b1 + c(-2, -.67, .67, 2) * se
    c(b1=b1, se=se, conf)
}
one_slope_info(x)
     b1      se                                 
 0.2750  0.3320 -0.3890  0.0526  0.4975  0.9390 

Do these for all samples

all_slope_info <- function(x, samples=5) {
    res <- replicate(n=samples, one_slope_info(x))
    rownames(res) <- c("b1", "se", "lo95", "lo50", "up50", "up95")
    as.data.frame(t(res))
}
slopes <- all_slope_info(x, samples=samples)
head(slopes)
       b1     se    lo95     lo50   up50   up95
1  0.4860 0.8664 -1.2467 -0.09448 1.0664 2.2187
2  1.5155 0.8138 -0.1121  0.97022 2.0607 3.1430
3  0.8219 0.5789 -0.3359  0.43403 1.2098 1.9797
4  0.9374 0.3625  0.2123  0.69450 1.1803 1.6625
5  0.4011 0.4327 -0.4643  0.11118 0.6909 1.2664
6 -0.2435 0.5364 -1.3162 -0.60290 0.1158 0.8292

Make the plot in the style of Figure 4.2

plot(c(-2, samples+2), range(slopes[,c("lo95", "up95")]), bty="l", 
        xlab="Simulation", ylab="Estimate, 50% & 95% confidence interval", 
        type="n")
abline(b1, 0, col="gray")
for (i in seq(1, samples, by=2)){
  lines(c(i,i), slopes[i,c("lo95", "up95")], lwd=.8, col=gray(.3, alpha=.5))
  lines(c(i,i), slopes[i,c("lo50", "up50")], lwd=2,  col="red")
}
points(1:samples, slopes[,"b1"], pch=20)

Plot individual regression lines to confirm the confidence band

Here I plot the lm regression line for the data with 68 & 95% confidence intervals. Overlaid here are the individual regression lines for the samples.

A crafty ggplot trick allows to use the fitted coefficients as a new data frame input to geom_abline

ggplot(data=therapy, aes(x=perstest, y=therapy)) +
    geom_smooth(method="lm", formula= y~x, level=0.68, alpha=.2) +
    geom_smooth(method="lm", formula= y~x, level=0.95, alpha=.3, size=2) +
    geom_abline(data=coefs, aes(intercept = b0, slope = b1), color=adjustcolor( "red", alpha.f = 0.1)) +
    geom_point(size=3, color="blue")  +
    theme_bw()

Other explorations

What happens if we use 2 confidence levels? Surprise!

ggplot(data=therapy, aes(x=perstest, y=therapy)) +
    geom_point(size=3, color="blue") + 
    geom_smooth(method="lm", formula= y~x, level=c(0.68, .95))

Show three confidence levels

ggplot(data=therapy, aes(x=perstest, y=therapy)) +
            geom_point(size=3, color="blue") + 
            geom_smooth(method="lm", formula= y~x, alpha=.2, level=0.68) +
            geom_smooth(method="lm", formula= y~x, alpha=.2, level=0.90) +
            geom_smooth(method="lm", formula= y~x, alpha=.2, level=0.95) +
            theme_bw()

Different way to do this

In the examples above, I essentially re-ran all of the simulations for each of the the functions used with replicate(). This is somewhat wasteful.

Another way is to generate all the model objects at once, store them in a list.
Can then use tidy functions to extract summaries, do plots, etc.

sims <- replicate(n=samples, one_reg(x), simplify=FALSE)

plot the residual standard errors

sims %>%
     map_dbl(~summary(.x)$sigma) %>%
     data.frame(sigma = .) %>%
     ggplot( aes(x = sigma) ) +
          geom_density(fill = "blue", alpha = .2) +
          geom_vline(xintercept = sqrt(80))

References

#' ---
#' title: "Generate sampling distribution of regression estimates b0, b1"
#' author: "Michael Friendly"
#' date: "`r format(Sys.Date())`"
#' output:
#'   html_document:
#'     theme: readable
#'     code_download: true
#' ---

#-----------------------------------------------------------
#' This demonstration uses a small data set to define a context for repeated
#' sampling from a "population" with known slope and intercept to develope the
#' sampling distributions of regression estimates. The data relate to predicting
#' outcome in therapy (Y) from scores on a personality test (X).
#'
#' We treat the X values as fixed as in the standard regression model.
#'  For each X value, generate sample Y's, according
#'  to the model,
#'  $$
#'           Y  =  14 + 1.2 * X + sigma * N(0,1)
#'  $$
#'  where
#'  
#' * $\beta_0 = 14$,  $\beta_1 = 1.2$ (coefficients in the therapy data) 
#' * sigma = sqrt(MSE) = sqrt(80)
#' * N(0,1)= a random normal deviate
#-----------------------------------------------------------

#+ setup, include=FALSE
knitr::opts_chunk$set(message=FALSE, error=FALSE, warning=FALSE, comment=NA)
options(digits=4)

#' ### Load packages
library(purrr)
library(broom)
library(dplyr)
library(ggplot2) 
library(car)

#' ### Read the data
therapy <- read.table(header=TRUE, text ='
name   sex perstest therapy
John    M 26 32
Susan   F 24 40
Mary    F 22 44
Paul    M 33 44
Jenny   F 27 48
Rick    M 36 52
Cathy   F 30 56
Robert  M 38 56
Lisa    F 30 60
Tina    F 34 68
')

#' ## Treat the therapy data as defining a 'true model'

true_mod <- lm(therapy ~ perstest, data=therapy)

#' ## get tidy coefficients
tmod <- tidy(true_mod)
tmod

#' Add lower/upper 95% intervals
upper <- tmod$estimate +  1.96 * tmod$std.error
lower <- tmod$estimate -  1.96 * tmod$std.error
tmod <- cbind(tmod, lower, upper)
tmod

#' ## Set up the simulation parameters
x <- therapy$perstest
n <- length(x)

samples <- 1000

# "true" parameters
b0 <- coef(true_mod)[1]   # 14
b1 <- coef(true_mod)[2]   # 1.2
sigma <- summary(true_mod)$sigma  # sqrt(80)
set.seed(1234)       # reproducibility

#' ### Generate one sample from the bivariate distribution of (x, y)
#'
#' For the simulation, it is useful to define some simple functions to do a
#' calculation for a single sample.
#' 
#' Then, one can use `replicate(samples, fun())` to repeat over many samples.

one_sample <- function(x, b0=14, b1=1.2, sigma=sqrt(80)) {
	n <- length(x)
	y <- b0 + b1 * x + rnorm(n, mean=0, sd=sigma)
	data.frame(x, y)
}
one_sample(x, b0, b1, sigma)

#' Do a regression for one sample
one_reg <- function(x) {
	simdata <- one_sample(x)
	lm(y ~ x, data=simdata)
}
one_reg(x)

#' ### Generate the coefficients for **all** samples
#' 
#' Note that `replicate()` binds the results as columns of a matrix, but I want them
#' as rows, of a data frame
all_coefs <- function(x, samples=5) {
	res <- replicate(n=samples, coef(one_reg(x)))
	rownames(res) <- c("b0", "b1")
	as.data.frame(t(res))
}

#' Apply this for all samples
coefs <- all_coefs(x, samples=samples)

head(coefs)   # show first 6

#' ## Plot the sampling distribution of coefs
#' 
#' Plot the density of b1 estimates together with the theoretical
#' normal density with true parameters. Note the use of `stat_function()`
#' for the normal density (`dnorm()`)
#' 
ggplot(coefs,  aes(x = b1) ) +
    geom_density(color="blue", fill = "blue", alpha = .2, size=1.5) +
    geom_vline( xintercept = b1, color="red", size=1.5) + 
    geom_vline(xintercept = lower[2]) +
    geom_vline(xintercept = upper[2]) +
    annotate("text", x=b1, y=0.05, label=paste("beta[1] ==", b1), 
             size=8, parse=TRUE) +
    stat_function(fun = dnorm, args = list(mean = b1, sd = .565)) +
        xlab("Slope (b1)") 
        

#' ## Plot the joint distribution of b0, b1
ggplot(coefs, aes(x=b1, y=b0)) +
			geom_point(color="blue", size=0.9) +
			stat_ellipse(level = 0.68, size=1.2) + 
			stat_ellipse(level = 0.95, size=1.05, linetype=1) +
			geom_point(x=b1, y=b0, color="red", size=4) +
			theme_minimal()

#' The very strong negative correlation between slope & intercept is perhaps
#' a bit surprising.  Here it is in numbers:
#' 
cor(coefs)			

#' ## Plot the slopes & their individual CIs (Fig 4.2)

#' Add se, CI info to the slope estimate in on sample

one_slope_info <- function(x) {
	simdata <- one_sample(x)
	reg <- lm(y ~ x, data=simdata)
	treg <- tidy(reg)
	b1 <- as.numeric(treg[2,"estimate"])
	se <- as.numeric(treg[2,"std.error"])
	conf <- b1 + c(-2, -.67, .67, 2) * se
	c(b1=b1, se=se, conf)
}
one_slope_info(x)

#' Do these for all samples
all_slope_info <- function(x, samples=5) {
	res <- replicate(n=samples, one_slope_info(x))
	rownames(res) <- c("b1", "se", "lo95", "lo50", "up50", "up95")
	as.data.frame(t(res))
}
slopes <- all_slope_info(x, samples=samples)
head(slopes)

#' Make the plot in the style of Figure 4.2
#'
#+ fig.width=7
plot(c(-2, samples+2), range(slopes[,c("lo95", "up95")]), bty="l", 
		xlab="Simulation", ylab="Estimate, 50% & 95% confidence interval", 
		type="n")
abline(b1, 0, col="gray")
for (i in seq(1, samples, by=2)){
  lines(c(i,i), slopes[i,c("lo95", "up95")], lwd=.8, col=gray(.3, alpha=.5))
  lines(c(i,i), slopes[i,c("lo50", "up50")], lwd=2,  col="red")
}
points(1:samples, slopes[,"b1"], pch=20)


#' ## Plot individual regression lines to confirm the confidence band

#' Here I plot the `lm` regression line for the data with 68 & 95% confidence intervals.
#' Overlaid here are the *individual* regression lines for the samples. 
#' 
#' A crafty `ggplot` trick
#' allows to use the fitted coefficients as a new data frame input to `geom_abline`


ggplot(data=therapy, aes(x=perstest, y=therapy)) +
	geom_smooth(method="lm", formula= y~x, level=0.68, alpha=.2) +
	geom_smooth(method="lm", formula= y~x, level=0.95, alpha=.3, size=2) +
	geom_abline(data=coefs, aes(intercept = b0, slope = b1), color=adjustcolor( "red", alpha.f = 0.1)) +
	geom_point(size=3, color="blue")  +
	theme_bw()


#' ### Other explorations
#' What happens if we use 2 confidence levels?  Surprise!
ggplot(data=therapy, aes(x=perstest, y=therapy)) +
	geom_point(size=3, color="blue") + 
	geom_smooth(method="lm", formula= y~x, level=c(0.68, .95))

#' Show three confidence levels
ggplot(data=therapy, aes(x=perstest, y=therapy)) +
			geom_point(size=3, color="blue") + 
			geom_smooth(method="lm", formula= y~x, alpha=.2, level=0.68) +
			geom_smooth(method="lm", formula= y~x, alpha=.2, level=0.90) +
			geom_smooth(method="lm", formula= y~x, alpha=.2, level=0.95) +
			theme_bw()
	


#' ## Different way to do this

#' In the examples above, I essentially re-ran all of the simulations for each of
#' the the functions used with `replicate()`. This is somewhat wasteful.
#'
#' Another way is to generate all the model objects at once, store them in a list.  
#' Can then use tidy functions to extract
#' summaries, do plots, etc.

sims <- replicate(n=samples, one_reg(x), simplify=FALSE)

#' ## plot the residual standard errors

sims %>%
     map_dbl(~summary(.x)$sigma) %>%
     data.frame(sigma = .) %>%
     ggplot( aes(x = sigma) ) +
          geom_density(fill = "blue", alpha = .2) +
          geom_vline(xintercept = sqrt(80))

#' ## References
#' 

#' * Some similar methods for simulation are described in 
#' [Simulate! Simulate! - Part 1: A linear model](https://aosmith.rbind.io/2018/01/09/simulate-simulate-part1/)
#' 
#' * Chapter 20, [Simulation](https://bookdown.org/rdpeng/rprogdatascience/simulation.html) from
#' Roger Peng, [R Programming for Data Science](https://bookdown.org/rdpeng/rprogdatascience/).
#' 