Setup code
# devtools::install_github("dicook/nullabor")
library(nullabor)
library(tidyverse)
library(broom)
theme_set(theme_bw(base_size = 30) +
theme(axis.text = element_blank(),
strip.text = element_blank()))
knitr::opts_chunk$set(
warning = FALSE,
message = FALSE,
fig.width = 3,
fig.height = 4,
fig.path = "img/"
)
Consider a simple linear model for \(i = 1, ..., n\) \[y_i = \beta_0 + \beta_1 x_i + e_i, \qquad e_i\sim NID(0, \sigma^2).\] Let’s do \(n \in \{20, 100, 1000, 10000\}\).
The actual data generating model is
\[y_i \sim NID(1 + 2 x_i + 2 \sin(2x_i), ~1).\]
set.seed(1)
nobsv <- c(20, 100, 1000, 10000)
n_null <- 3
nobs <- 100
null_fig_width <- 8
dat1 <- map(nobsv, ~{
df <- tibble(id = 1:.x) %>%
mutate(x = runif(n(), 0, 10),
y = rnorm(n(), 1 + 2 * x + 2 * sin(2 * x), 1))
lm(y ~ 1, data = df) %>%
augment() %>%
mutate(x = df$x)
})
null1 <- map(seq_along(nobsv), function(i) {
map_dfr(1:n_null, ~{
fit <- lm(y ~ 1, dat1[[i]])
beta0_hat <- tidy(fit)$estimate[1]
sig_hat <- glance(fit)$sigma
tibble(id = 1:nobsv[i]) %>%
mutate(y = rnorm(n(), beta0_hat, sig_hat)) %>%
lm(y ~ 1, data = .) %>%
augment() %>%
mutate(sim = paste0("Sim ", .x),
x = dat1[[i]]$x)
})})
dat2 <- map(nobsv, ~{
df <- tibble(id = 1:.x) %>%
mutate(x = sample(c(0, 1), size = n(), replace = TRUE),
y = rnorm(n(), 1 + 2 * x + 2 * sin(2 * x), 1))
lm(y ~ 1, data = df) %>%
augment() %>%
mutate(x = df$x)
})
null2 <- map(seq_along(nobsv), function(i) {
map_dfr(1:n_null, ~{
fit <- lm(y ~ 1, dat2[[i]])
beta0_hat <- tidy(fit)$estimate[1]
sig_hat <- glance(fit)$sigma
tibble(id = 1:nobsv[i]) %>%
mutate(y = rnorm(n(), beta0_hat, sig_hat)) %>%
lm(y ~ 1, data = .) %>%
augment() %>%
mutate(sim = paste0("Sim ", .x),
x = dat2[[i]]$x)
})})
dat3 <- map(seq_along(nobsv), ~{
lm(y ~ x, data = dat1[[.x]]) %>%
augment()
})
null3 <- map(seq_along(nobsv), function(i) {
map_dfr(1:n_null, ~{
fit <- lm(y ~ x, dat1[[i]])
beta0_hat <- tidy(fit)$estimate[1]
beta1_hat <- tidy(fit)$estimate[2]
sig_hat <- glance(fit)$sigma
tibble(id = 1:nobsv[i]) %>%
mutate(x = dat1[[i]]$x,
y = rnorm(n(), beta0_hat + beta1_hat * x, sig_hat)) %>%
lm(y ~ x, data = .) %>%
augment() %>%
mutate(sim = paste0("Sim ", .x))
})})
walk(seq_along(nobsv), ~{
p <- dat1[[.x]] %>%
ggplot(aes(x, y)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle(paste0("n = ", nobsv[.x]))
print(p)
})
walk(seq_along(nobsv), ~{
p <- ggplot(null1[[.x]], aes(x, y)) +
geom_point() +
geom_smooth(method = "lm") +
ggtitle(paste0("n = ", nobsv[.x])) +
facet_grid(. ~ sim)
print(p)
})
walk(seq_along(nobsv), ~{
p <- ggplot(dat1[[.x]], aes(x, .resid)) +
geom_point() +
labs(y = "Residual") +
ggtitle(paste0("n = ", nobsv[.x])) +
geom_hline(yintercept = 0, color = "blue")
print(p)
})
walk(seq_along(nobsv), ~{
p <- ggplot(null1[[.x]], aes(x, .resid)) +
geom_point() +
geom_hline(yintercept = 0, color = "blue") +
labs(y = "Residual") +
ggtitle(paste0("n = ", nobsv[.x])) +
facet_grid(. ~ sim)
print(p)
})
walk(seq_along(nobsv), ~{
p <- ggplot(dat2[[.x]], aes(as.factor(x), .resid)) +
geom_boxplot() +
geom_hline(yintercept = 0, color = "blue") +
ggtitle(paste0("n = ", nobsv[.x])) +
labs(x = "x", y = "Residual")
print(p)
})
walk(seq_along(nobsv), ~{
p <- ggplot(null2[[.x]], aes(as.factor(x), .resid)) +
geom_boxplot() +
geom_hline(yintercept = 0, color = "blue") +
ggtitle(paste0("n = ", nobsv[.x])) +
labs(x = "x", y = "Residual") +
facet_grid(. ~ sim)
print(p)
})
walk(seq_along(nobsv), ~{
p <- ggplot(dat3[[.x]], aes(sample = .resid)) +
geom_qq() +
geom_qq_line(color = "blue") +
ggtitle(paste0("n = ", nobsv[.x]))
print(p)
})
walk(seq_along(nobsv), ~{
p <- ggplot(null3[[.x]], aes(sample = .resid)) +
geom_qq() +
geom_qq_line(color = "blue") +
ggtitle(paste0("n = ", nobsv[.x])) +
facet_grid(. ~ sim)
print(p)
})