Setup

Today we will be simulating various simple OLS models to illustrate the theory behind OLS. Suppose the true model is: \[Y_i = 2 + 3X_i + u_i\]

Interpretation:

library(pacman)
p_load(tidyverse)
# set a seed to ensure our random numbers generated starts in the same spot
set.seed(12345)

One Sample

n <- 50 #sample size

X <- rnorm(n, mean = 10, sd = 2) #create x variable
u <- rnorm(n, mean = 0, sd = 5) #create error term

Y <- 2 + 3*X + u #regression equation

model <- lm(Y ~ X) #estimate regression
summary(model)
## 
## Call:
## lm(formula = Y ~ X)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.2986  -4.5364   0.9347   3.2703  10.8443 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.6001     3.9700   0.907    0.369    
## X             2.9956     0.3751   7.986 2.31e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.758 on 48 degrees of freedom
## Multiple R-squared:  0.5706, Adjusted R-squared:  0.5616 
## F-statistic: 63.78 on 1 and 48 DF,  p-value: 2.31e-10

Is your estimated slope exactly 3? Why not? Is it close?

df_out <- data.frame(X = X, Y = Y)

ggplot(df_out, aes(x = X, y = Y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "One Sample",
    x = "X",
    y = "Y"
  )

OLS is a random variable because the data are random.Under the classical model \(\hat{\beta_1} = \beta_1 +\) sampling error.

Many Samples

We will now simulate many samples of OLS estmiates to illustrate several properties of our OLS estimator \(\hat{\beta_1}\).

Unbiasedness

Recall, OLS is unbiased: \[E[\beta_1] = \hat{\beta_1}\] In other words, OLS is right on average, not in every sample. Let’s simulate many samples and obtain many estimates of \(\beta_1\).

reps <- 1000 #specify number of replications
n <- 50 #sample size

b1_hat <- numeric(reps) #create empty object for b1_hat values

for (i in 1:reps) { #loop through the number of replications
  
  X <- rnorm(n, 10, 2)
  u <- rnorm(n, 0, 5)
  Y <- 2 + 3*X + u
  
  model <- lm(Y ~ X)
  b1_hat[i] <- coef(model)[2] #store b1_hat values in object
}

#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)

Check the dataframe. Each row is a value of \(\hat{\beta_1}\).

head(b1_hat_df)
##     b1_hat
## 1 2.923745
## 2 2.838730
## 3 2.856780
## 4 3.715551
## 5 3.341234
## 6 3.291897

Now let’s look at the histogram of our estimates.

ggplot(b1_hat_df, aes(x = b1_hat)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 3, linewidth = 1) +
  labs(
    title = "Sampling Distribution of beta1_hat",
    x = "Estimated slope",
    y = "Count"
  ) +
  theme_minimal()

Larger N

n <- 500 # increase the sample size, N

b1_hat <- numeric(reps) #create empty object for b1_hat values

for (i in 1:reps) { 
  
  X <- rnorm(n, 10, 2)
  u <- rnorm(n, 0, 5)
  Y <- 2 + 3*X + u
  
  model <- lm(Y ~ X)
  b1_hat[i] <- coef(model)[2] 
}

#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)
ggplot(b1_hat_df, aes(x = b1_hat)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 3, linewidth = 1) +
  labs(
    title = "Sampling Distribution of beta1_hat",
    x = "Estimated slope",
    y = "Count"
  ) +
  theme_minimal()

More Noise

Noise in the error term affects precision, not unbiasedness.

n <- 500 

b1_hat <- numeric(reps) #create empty object for b1_hat values

for (i in 1:reps) { 
  
  X <- rnorm(n, 10, 2)
  u <- rnorm(n, 0, 15) # more noise, higher variance of u
  Y <- 2 + 3*X + u
  
  model <- lm(Y ~ X)
  b1_hat[i] <- coef(model)[2] 
}

#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)
ggplot(b1_hat_df, aes(x = b1_hat)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 3, linewidth = 1) +
  labs(
    title = "Sampling Distribution of beta1_hat",
    x = "Estimated slope",
    y = "Count"
  ) +
  theme_minimal()

Variation in X

What happens to our estimates when there is little variation in X?

n <- 500

b1_hat <- numeric(reps) #create empty object for b1_hat values

for (i in 1:reps) { 
  
  X <- rnorm(n, 10, 0.2) # set low variance for x
  u <- rnorm(n, 0, 5)
  Y <- 2 + 3*X + u
  
  model <- lm(Y ~ X)
  b1_hat[i] <- coef(model)[2] 
}

#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)
ggplot(b1_hat_df, aes(x = b1_hat)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 3, linewidth = 1) +
  labs(
    title = "Sampling Distribution of beta1_hat",
    x = "Estimated slope",
    y = "Count"
  ) +
  theme_minimal()