Today we will be simulating various simple OLS models to illustrate the theory behind OLS. Suppose the true model is: \[Y_i = 2 + 3X_i + u_i\]
Interpretation:
library(pacman)
p_load(tidyverse)
# set a seed to ensure our random numbers generated starts in the same spot
set.seed(12345)
n <- 50 #sample size
X <- rnorm(n, mean = 10, sd = 2) #create x variable
u <- rnorm(n, mean = 0, sd = 5) #create error term
Y <- 2 + 3*X + u #regression equation
model <- lm(Y ~ X) #estimate regression
summary(model)
##
## Call:
## lm(formula = Y ~ X)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.2986 -4.5364 0.9347 3.2703 10.8443
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.6001 3.9700 0.907 0.369
## X 2.9956 0.3751 7.986 2.31e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.758 on 48 degrees of freedom
## Multiple R-squared: 0.5706, Adjusted R-squared: 0.5616
## F-statistic: 63.78 on 1 and 48 DF, p-value: 2.31e-10
Is your estimated slope exactly 3? Why not? Is it close?
df_out <- data.frame(X = X, Y = Y)
ggplot(df_out, aes(x = X, y = Y)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "One Sample",
x = "X",
y = "Y"
)
OLS is a random variable because the data are random.Under the classical model \(\hat{\beta_1} = \beta_1 +\) sampling error.
We will now simulate many samples of OLS estmiates to illustrate several properties of our OLS estimator \(\hat{\beta_1}\).
Recall, OLS is unbiased: \[E[\beta_1] = \hat{\beta_1}\] In other words, OLS is right on average, not in every sample. Let’s simulate many samples and obtain many estimates of \(\beta_1\).
reps <- 1000 #specify number of replications
n <- 50 #sample size
b1_hat <- numeric(reps) #create empty object for b1_hat values
for (i in 1:reps) { #loop through the number of replications
X <- rnorm(n, 10, 2)
u <- rnorm(n, 0, 5)
Y <- 2 + 3*X + u
model <- lm(Y ~ X)
b1_hat[i] <- coef(model)[2] #store b1_hat values in object
}
#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)
Check the dataframe. Each row is a value of \(\hat{\beta_1}\).
head(b1_hat_df)
## b1_hat
## 1 2.923745
## 2 2.838730
## 3 2.856780
## 4 3.715551
## 5 3.341234
## 6 3.291897
Now let’s look at the histogram of our estimates.
ggplot(b1_hat_df, aes(x = b1_hat)) +
geom_histogram(bins = 30) +
geom_vline(xintercept = 3, linewidth = 1) +
labs(
title = "Sampling Distribution of beta1_hat",
x = "Estimated slope",
y = "Count"
) +
theme_minimal()
n <- 500 # increase the sample size, N
b1_hat <- numeric(reps) #create empty object for b1_hat values
for (i in 1:reps) {
X <- rnorm(n, 10, 2)
u <- rnorm(n, 0, 5)
Y <- 2 + 3*X + u
model <- lm(Y ~ X)
b1_hat[i] <- coef(model)[2]
}
#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)
ggplot(b1_hat_df, aes(x = b1_hat)) +
geom_histogram(bins = 30) +
geom_vline(xintercept = 3, linewidth = 1) +
labs(
title = "Sampling Distribution of beta1_hat",
x = "Estimated slope",
y = "Count"
) +
theme_minimal()
Noise in the error term affects precision, not unbiasedness.
n <- 500
b1_hat <- numeric(reps) #create empty object for b1_hat values
for (i in 1:reps) {
X <- rnorm(n, 10, 2)
u <- rnorm(n, 0, 15) # more noise, higher variance of u
Y <- 2 + 3*X + u
model <- lm(Y ~ X)
b1_hat[i] <- coef(model)[2]
}
#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)
ggplot(b1_hat_df, aes(x = b1_hat)) +
geom_histogram(bins = 30) +
geom_vline(xintercept = 3, linewidth = 1) +
labs(
title = "Sampling Distribution of beta1_hat",
x = "Estimated slope",
y = "Count"
) +
theme_minimal()
What happens to our estimates when there is little variation in X?
n <- 500
b1_hat <- numeric(reps) #create empty object for b1_hat values
for (i in 1:reps) {
X <- rnorm(n, 10, 0.2) # set low variance for x
u <- rnorm(n, 0, 5)
Y <- 2 + 3*X + u
model <- lm(Y ~ X)
b1_hat[i] <- coef(model)[2]
}
#convert to a dataframe for ggplot
b1_hat_df <- data.frame(b1_hat = b1_hat)
ggplot(b1_hat_df, aes(x = b1_hat)) +
geom_histogram(bins = 30) +
geom_vline(xintercept = 3, linewidth = 1) +
labs(
title = "Sampling Distribution of beta1_hat",
x = "Estimated slope",
y = "Count"
) +
theme_minimal()