We have \(X \sim N(0, 1)\), so the
true population mean is \(\mu = 0\).
Throughout the problem we use t.test() because in practice the population
standard deviation is not assumed known; it is also the built-in function
that returns the confidence interval and the \(p\)-value directly.
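For reference, here is a minimal look at the pieces of t.test()'s return value used below (this snippet is illustrative only, not part of the graded simulations):
x <- rnorm(5)                      # one small N(0, 1) sample
fit <- t.test(x, mu = 0, conf.level = 0.95)
fit$conf.int                       # the 95% confidence interval for the mean
fit$p.value                        # the p-value for H0: mu = 0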
We draw a sample of size \(n = 5\), construct a 95% CI for \(\mu\), and check whether the interval contains the true mean \(\mu = 0\). Repeating this 10,000 times, the proportion of intervals that contain \(\mu\) estimates the coverage probability, which theory says should be \(\approx 0.95\).
n <- 5
n_sim <- 10000
mu0 <- 0
contains <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- rnorm(n, mean = 0, sd = 1)              # sample of size 5 from N(0, 1)
  ci <- t.test(x, conf.level = 0.95)$conf.int  # 95% CI for the mean
  contains[i] <- (ci[1] <= mu0) && (mu0 <= ci[2])
}
coverage_1a <- mean(contains)
coverage_1a
## [1] 0.9474
Result: the estimated coverage probability is 0.9474, very close to the nominal 0.95.
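How close should we expect it to be? A rough gauge (our addition) is the Monte Carlo standard error of an estimated proportion, \(\sqrt{p(1-p)/n_{\text{sim}}}\):
sqrt(0.95 * 0.05 / 10000)   # ~ 0.0022, so 0.9474 is well within simulation noise of 0.95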
\(H_0: \mu = 0\) vs. \(H_a: \mu \ne 0\), with \(\alpha = 0.05\). Because \(H_0\) is true (the data really come from \(N(0,1)\)), the rejection rate should be approximately \(\alpha = 0.05\), i.e. the Type I error rate.
n <- 5
n_sim <- 10000
alpha <- 0.05
reject <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- rnorm(n, 0, 1)
  p <- t.test(x, mu = 0, alternative = "two.sided")$p.value
  reject[i] <- p < alpha
}
reject_1b <- mean(reject)
reject_1b
## [1] 0.0536
Result: rejection probability is 0.0536 \(\approx \alpha = 0.05\).
\(H_0: \mu = 0\) vs. \(H_a: \mu > 0\), with \(\alpha = 0.05\). Again, since \(H_0\) is true, the rejection rate should be \(\approx 0.05\).
n <- 5
n_sim <- 10000
alpha <- 0.05
reject <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- rnorm(n, 0, 1)
  p <- t.test(x, mu = 0, alternative = "greater")$p.value
  reject[i] <- p < alpha
}
reject_1c <- mean(reject)
reject_1c
## [1] 0.0547
Result: rejection probability is 0.0547 \(\approx 0.05\).
We now repeat the three checks with \(n = 100\):
n <- 100
n_sim <- 10000
alpha <- 0.05
mu0 <- 0
contains <- logical(n_sim)
reject_two <- logical(n_sim)
reject_gtr <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- rnorm(n, 0, 1)
  ci <- t.test(x, conf.level = 0.95)$conf.int
  contains[i] <- (ci[1] <= mu0) && (mu0 <= ci[2])
  reject_two[i] <- t.test(x, mu = 0, alternative = "two.sided")$p.value < alpha
  reject_gtr[i] <- t.test(x, mu = 0, alternative = "greater")$p.value < alpha
}
coverage_1d <- mean(contains)
reject_two_1d <- mean(reject_two)
reject_gtr_1d <- mean(reject_gtr)
data.frame(
  Quantity = c("Coverage prob. (95% CI)",
               "Reject rate (two-sided)",
               "Reject rate (one-sided, >)"),
  n_5 = c(coverage_1a, reject_1b, reject_1c),
  n_100 = c(coverage_1d, reject_two_1d, reject_gtr_1d)
)
## Quantity n_5 n_100
## 1 Coverage prob. (95% CI) 0.9474 0.9488
## 2 Reject rate (two-sided) 0.0536 0.0512
## 3 Reject rate (one-sided, >) 0.0547 0.0493
The estimates stay close to the theoretical values (0.95, 0.05, 0.05) at both sample sizes. Under normality the \(t\)-based interval and tests are exact, so their validity does not depend on \(n\).
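The exactness comes from the pivot \((\bar{X} - \mu)/(S/\sqrt{n})\) having an exact \(t_{n-1}\) distribution under normality. As a sketch (our addition), the interval can be rebuilt by hand from the \(t\) quantile and matches t.test() to machine precision:
x <- rnorm(5)
n <- length(x)
mean(x) + c(-1, 1) * qt(0.975, df = n - 1) * sd(x) / sqrt(n)  # hand-built 95% CI
t.test(x, conf.level = 0.95)$conf.int                         # same interval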
Now \(X \sim \mathrm{Unif}(-1, 1)\), so \(\mu = 0\) and \(\sigma^2 = (1-(-1))^2/12 = 1/3\). The data are not normal, so the \(t\) procedures are no longer exact: their nominal levels hold only approximately, with the approximation improving as \(n\) grows (CLT).
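Before running the simulations, a quick empirical sanity check of that variance (our addition):
var(runif(1e6, -1, 1))   # should be close to 1/3 ~ 0.3333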
First, coverage of the 95% CI with \(n = 5\):
n <- 5
n_sim <- 10000
mu0 <- 0
contains <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- runif(n, -1, 1)   # sample of size 5 from Unif(-1, 1)
  ci <- t.test(x, conf.level = 0.95)$conf.int
  contains[i] <- (ci[1] <= mu0) && (mu0 <= ci[2])
}
coverage_2a <- mean(contains)
coverage_2a
## [1] 0.9361
As before, we test \(H_0: \mu = 0\) vs. \(H_a: \mu \ne 0\) at \(\alpha = 0.05\); since \(H_0\) is true, the rejection rate estimates the Type I error rate.
n <- 5
n_sim <- 10000
alpha <- 0.05
reject <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- runif(n, -1, 1)
  reject[i] <- t.test(x, mu = 0, alternative = "two.sided")$p.value < alpha
}
reject_2b <- mean(reject)
reject_2b
## [1] 0.0696
Likewise for the one-sided alternative \(H_a: \mu > 0\):
n <- 5
n_sim <- 10000
alpha <- 0.05
reject <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- runif(n, -1, 1)
  reject[i] <- t.test(x, mu = 0, alternative = "greater")$p.value < alpha
}
reject_2c <- mean(reject)
reject_2c
## [1] 0.0553
Repeating all three checks with \(n = 100\):
n <- 100
n_sim <- 10000
alpha <- 0.05
mu0 <- 0
contains <- logical(n_sim)
reject_two <- logical(n_sim)
reject_gtr <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- runif(n, -1, 1)
  ci <- t.test(x, conf.level = 0.95)$conf.int
  contains[i] <- (ci[1] <= mu0) && (mu0 <= ci[2])
  reject_two[i] <- t.test(x, mu = 0, alternative = "two.sided")$p.value < alpha
  reject_gtr[i] <- t.test(x, mu = 0, alternative = "greater")$p.value < alpha
}
coverage_2d <- mean(contains)
reject_two_2d <- mean(reject_two)
reject_gtr_2d <- mean(reject_gtr)
data.frame(
  Quantity = c("Coverage prob. (95% CI)",
               "Reject rate (two-sided)",
               "Reject rate (one-sided, >)"),
  n_5 = c(coverage_2a, reject_2b, reject_2c),
  n_100 = c(coverage_2d, reject_two_2d, reject_gtr_2d)
)
## Quantity n_5 n_100
## 1 Coverage prob. (95% CI) 0.9361 0.9513
## 2 Reject rate (two-sided) 0.0696 0.0487
## 3 Reject rate (one-sided, >) 0.0553 0.0518
With \(n = 5\) the non-normal data noticeably distort the results: coverage drops to 0.9361 and the two-sided rejection rate inflates to 0.0696. With \(n = 100\) the CLT takes over and all three quantities return to their nominal values.
Now \(X \sim N(1, 4)\), i.e. \(\mu = 1\), \(\sigma = 2\). We still test \(H_0: \mu = 0\) vs. \(H_a: \mu > 0\), but \(H_0\) is now false (the true mean is 1, not 0), so the rejection probability estimates the power \(1 - \beta\) rather than a Type I error rate:
n <- 10
n_sim <- 10000
alpha <- 0.05
reject <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- rnorm(n, mean = 1, sd = 2)   # H0 (mu = 0) is false: the true mean is 1
  reject[i] <- t.test(x, mu = 0, alternative = "greater")$p.value < alpha
}
power_3a <- mean(reject)   # estimated power = P(reject H0 | H0 false)
beta_3a <- 1 - power_3a    # estimated Type II error rate
cat("beta =", round(beta_3a, 4), "\n")
## beta = 0.573
cat("power =", round(power_3a, 4), "\n")
## power = 0.427
We can confirm with R’s exact power.t.test():
power.t.test(n = 10, delta = 1, sd = 2,
             sig.level = 0.05, type = "one.sample",
             alternative = "one.sided")$power
## [1] 0.4272898
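The same value can be derived directly (a sketch we add for completeness): under \(\mu = 1\), the \(t\) statistic follows a noncentral \(t_{n-1}\) distribution with noncentrality parameter \(\sqrt{n}\,\mu/\sigma\), and the power is the probability that it exceeds the one-sided critical value.
n <- 10; delta <- 1; s <- 2; alpha <- 0.05
ncp <- sqrt(n) * delta / s                                # noncentrality parameter
1 - pt(qt(1 - alpha, df = n - 1), df = n - 1, ncp = ncp)  # should reproduce 0.4272898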
Now the same simulation with a stricter \(\alpha = 0.01\):
n <- 10
n_sim <- 10000
alpha <- 0.01
reject <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- rnorm(n, mean = 1, sd = 2)
  reject[i] <- t.test(x, mu = 0, alternative = "greater")$p.value < alpha
}
power_3b <- mean(reject)
beta_3b <- 1 - power_3b
cat("beta =", round(beta_3b, 4), "\n")
## beta = 0.8357
cat("power =", round(power_3b, 4), "\n")
## power = 0.1643
Making \(\alpha\) smaller (more stringent) reduces power and increases \(\beta\): there is a direct trade-off between the Type I and Type II error rates.
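The trade-off is easy to trace by sweeping \(\alpha\) through power.t.test() (our sketch, same settings as above):
alphas <- c(0.10, 0.05, 0.01, 0.001)
sapply(alphas, function(a)
  power.t.test(n = 10, delta = 1, sd = 2, sig.level = a,
               type = "one.sample", alternative = "one.sided")$power)
# power drops monotonically as alpha shrinks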
Finally, we increase the sample size to \(n = 1000\) and evaluate both significance levels on the same simulated samples:
n <- 1000
n_sim <- 10000
reject05 <- logical(n_sim)
reject01 <- logical(n_sim)
for (i in seq_len(n_sim)) {
  x <- rnorm(n, mean = 1, sd = 2)
  p <- t.test(x, mu = 0, alternative = "greater")$p.value
  reject05[i] <- p < 0.05
  reject01[i] <- p < 0.01
}
power_3c_05 <- mean(reject05); beta_3c_05 <- 1 - power_3c_05
power_3c_01 <- mean(reject01); beta_3c_01 <- 1 - power_3c_01
data.frame(
  alpha = c(0.05, 0.01),
  beta = c(beta_3c_05, beta_3c_01),
  power = c(power_3c_05, power_3c_01)
)
## alpha beta power
## 1 0.05 0 1
## 2 0.01 0 1
With \(n = 1000\), the power is essentially 1 at both significance levels: the test virtually always detects that the true mean exceeds 0.
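The exact calculation agrees (our check):
power.t.test(n = 1000, delta = 1, sd = 2, sig.level = 0.01,
             type = "one.sample", alternative = "one.sided")$power
# essentially 1 even at the stricter level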
Summary of all four settings:
data.frame(
  Setting = c("n=10, alpha=0.05",
              "n=10, alpha=0.01",
              "n=1000, alpha=0.05",
              "n=1000, alpha=0.01"),
  beta = c(beta_3a, beta_3b, beta_3c_05, beta_3c_01),
  power = c(power_3a, power_3b, power_3c_05, power_3c_01)
)
## Setting beta power
## 1 n=10, alpha=0.05 0.5730 0.4270
## 2 n=10, alpha=0.01 0.8357 0.1643
## 3 n=1000, alpha=0.05 0.0000 1.0000
## 4 n=1000, alpha=0.01 0.0000 1.0000
The experiment illustrates the three classical levers that control statistical power:
1. Sample size \(n\): larger samples shrink the standard error and make even small departures from \(H_0\) easy to detect.
2. Significance level \(\alpha\): a more stringent (smaller) \(\alpha\) lowers the Type I error rate but also lowers power.
3. Effect size: the larger the true departure from \(H_0\) relative to \(\sigma\), the easier it is to detect.

In short, the simulation confirms the theoretical relationship
\[ \text{Power} = 1 - \beta = f(n,\ \alpha,\ \text{effect size}), \]
which is monotonically increasing in \(n\) and effect size, and monotonically decreasing in stringency (i.e., as \(\alpha\) shrinks).
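As a closing illustration of the dependence on \(n\) (our addition), the exact power of the \(\alpha = 0.05\) one-sided test along a grid of sample sizes:
ns <- c(5, 10, 25, 50, 100)
sapply(ns, function(n)
  power.t.test(n = n, delta = 1, sd = 2, sig.level = 0.05,
               type = "one.sample", alternative = "one.sided")$power)
# power climbs steadily toward 1 as n grows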