This document simulates hypothesis tests for coin fairness using various statistical methods. We analyze Type I errors, false discovery rates (FDR), and the impact of multiple comparisons.
For each experiment, we flip a fair coin 10,000 times and reject the null hypothesis of fairness whenever the p-value falls below a significance threshold.
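Concretely, the two-tailed binomial p-value used throughout is \(p = \min\!\big(1,\; 2\,P(\mathrm{Bin}(n, 0.5) \le \min(h, n - h))\big)\) for \(h\) heads in \(n\) flips: we double the probability of the smaller tail and cap the result at 1.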
library(ggplot2)
binomial_test <- function(flips) {
  n_heads <- sum(flips)
  n <- length(flips)
  # Two-tailed test: double the smaller tail, capped at 1 so the
  # p-value cannot exceed 1 for a perfectly balanced outcome
  p_val <- min(1, 2 * pbinom(min(n_heads, n - n_heads), n, 0.5))
  return(p_val)
}
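As a quick sanity check (a minimal sketch, not in the original), the hand-rolled p-value should closely track R's built-in exact test. The two will not match exactly, because binom.test() defines the two-sided p-value by summing all outcomes no more likely than the observed one rather than by doubling a tail:

set.seed(1)  # illustrative seed
flips <- sample(c(0, 1), 1000, replace = TRUE)
binomial_test(flips)                                    # tail-doubling p-value
binom.test(sum(flips), length(flips), p = 0.5)$p.value  # exact two-sided p-value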
simulate_coin_test <- function(n_simulations = 50000, n_flips = 10000, alpha = 0.05) {
  set.seed(142)
  # One experiment = n_flips tosses of a fair coin, reduced to a p-value
  p_values <- replicate(n_simulations, {
    flips <- sample(c(0, 1), n_flips, replace = TRUE)
    binomial_test(flips)
  })
  df <- data.frame(p_values)
  ggplot(df, aes(x = p_values)) +
    geom_histogram(breaks = seq(0, 1, length.out = 11), fill = "blue", alpha = 0.7, color = "black") +
    geom_vline(xintercept = c(0.01, 0.05, 0.10), color = c("green", "red", "blue"), linetype = "dashed") +
    labs(title = "Histogram of p-values from Coin Fairness Test", x = "p-value", y = "Frequency")
}
simulate_coin_test()
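Because the null hypothesis is true in every simulation, the p-values should be roughly uniform on \([0, 1]\), so the fraction falling below any threshold \(\alpha\) should be close to \(\alpha\) itself. A minimal check of the empirical Type I error rate (re-running the simulation outside the plotting function; an addition not in the original):

set.seed(142)
p_vals <- replicate(50000, binomial_test(sample(c(0, 1), 10000, replace = TRUE)))
mean(p_vals < 0.05)  # empirical Type I error rate; expect a value near 0.05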
Now we simulate hypothesis tests where the p-value is checked every 1,000 flips and the experiment stops as soon as p-value < α. This optional stopping gives the test repeated chances to cross the threshold, so the overall Type I error rate is inflated well above α.
simulate_coin_test_stopping <- function(n_simulations = 10000, max_flips = 10000, check_interval = 1000, alpha = 0.05) {
  set.seed(142)
  p_values <- replicate(n_simulations, {
    flips <- c()
    p_value <- 1.0
    # Peek at the p-value after every check_interval flips; stop early on "significance"
    for (i in seq(check_interval, max_flips, by = check_interval)) {
      flips <- c(flips, sample(c(0, 1), check_interval, replace = TRUE))
      p_value <- binomial_test(flips)
      if (p_value < alpha) break
    }
    return(p_value)
  })
  df <- data.frame(p_values)
  ggplot(df, aes(x = p_values)) +
    geom_histogram(breaks = seq(0, 1, length.out = 11), fill = "blue", alpha = 0.7, color = "black") +
    geom_vline(xintercept = c(0.01, 0.05, 0.10), color = c("green", "red", "blue"), linetype = "dashed") +
    labs(title = "Histogram of p-values from Stopping Rule Test", x = "p-value", y = "Frequency")
}
simulate_coin_test_stopping()
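To quantify the inflation (a sketch reusing the same logic; not in the original), we can record how often any of the ten interim checks rejects:

set.seed(142)
rejected <- replicate(10000, {
  flips <- c()
  stopped <- FALSE
  for (i in 1:10) {
    flips <- c(flips, sample(c(0, 1), 1000, replace = TRUE))
    if (binomial_test(flips) < 0.05) { stopped <- TRUE; break }
  }
  stopped
})
mean(rejected)  # overall Type I error rate; noticeably above the nominal 0.05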
We now simulate batches of 20 coin fairness tests to show how multiple hypothesis testing leads to spurious rejections: even though every coin is fair, each test independently has probability α of rejecting.
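With \(m = 20\) independent tests at \(\alpha = 0.05\), the expected number of false rejections per batch is \(m\alpha = 1\), and the probability of at least one is \(1 - (1 - \alpha)^m = 1 - 0.95^{20} \approx 0.64\).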
simulate_multiple_coin_tests <- function(n_simulations = 10000, n_coins = 20, n_flips = 1000, alpha = 0.05) {
  set.seed(142)
  # Count how many of the n_coins fair coins are (falsely) rejected in each batch
  rejected_counts <- replicate(n_simulations, {
    sum(replicate(n_coins, binomial_test(sample(c(0, 1), n_flips, replace = TRUE)) < alpha))
  })
  df <- data.frame(rejected_counts)
  ggplot(df, aes(x = rejected_counts)) +
    geom_histogram(binwidth = 1, fill = "blue", alpha = 0.7, color = "black") +
    labs(title = "Histogram of Rejected Null Hypotheses", x = "Number of Rejections", y = "Frequency")
}
simulate_multiple_coin_tests()
We apply the Benjamini-Hochberg (BH) method to control the False Discovery Rate (FDR).
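The BH step-up procedure sorts the p-values \(p_{(1)} \le \dots \le p_{(m)}\), finds the largest \(k\) with \(p_{(k)} \le \frac{k}{m}\alpha\), and rejects the hypotheses corresponding to the \(k\) smallest p-values; under independence this keeps the expected FDR at or below \(\alpha\).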
library(stats)  # p.adjust() lives in the stats package, which is attached by default
simulate_fdr_test <- function(n_simulations = 1000, n_coins = 100, n_unfair = 10, n_flips = 1000, alpha = 0.05, unfair_prob = 0.6) {
  set.seed(142)
  fdr_results <- replicate(n_simulations, {
    # Randomly place n_unfair biased coins among n_coins total
    actual_unfair <- sample(c(rep(FALSE, n_coins - n_unfair), rep(TRUE, n_unfair)))
    p_values <- sapply(actual_unfair, function(unfair) {
      p <- if (unfair) unfair_prob else 0.5
      binomial_test(sample(c(0, 1), n_flips, replace = TRUE, prob = c(1 - p, p)))
    })
    # Realized FDR with BH-adjusted p-values
    rejected_fdr <- p.adjust(p_values, method = "BH") < alpha
    false_positives_fdr <- sum(rejected_fdr & !actual_unfair)
    total_rejected_fdr <- sum(rejected_fdr)
    fdr_fdr <- if (total_rejected_fdr > 0) false_positives_fdr / total_rejected_fdr else 0
    # Realized FDR with unadjusted p-values
    rejected_standard <- p_values < alpha
    false_positives_standard <- sum(rejected_standard & !actual_unfair)
    total_rejected_standard <- sum(rejected_standard)
    fdr_standard <- if (total_rejected_standard > 0) false_positives_standard / total_rejected_standard else 0
    return(c(fdr_fdr, fdr_standard))
  })
  df <- data.frame(FDR_Method = fdr_results[1, ], Standard_Method = fdr_results[2, ])
  ggplot() +
    geom_histogram(data = df, aes(x = FDR_Method), breaks = seq(0, 1, length.out = 11), fill = "green", alpha = 0.7, color = "black") +
    geom_histogram(data = df, aes(x = Standard_Method), breaks = seq(0, 1, length.out = 11), fill = "red", alpha = 0.5, color = "black") +
    labs(title = "False Discovery Rate (FDR) Comparison", x = "FDR", y = "Frequency") +
    theme_minimal()
}
simulate_fdr_test()
We demonstrate the built-in function for FDR correction, p.adjust().
p_values <- c(0.02, 0.15, 0.0005, 0.04, 0.10, 0.001, 0.25, 0.03, 0.07, 0.005)  # example p-values from 10 tests
adjusted_p_values <- p.adjust(p_values, method = "BH")
rejected <- adjusted_p_values < 0.05
print(data.frame(p_values, adjusted_p_values, rejected))
## p_values adjusted_p_values rejected
## 1 0.0200 0.05000000 FALSE
## 2 0.1500 0.16666667 FALSE
## 3 0.0005 0.00500000 TRUE
## 4 0.0400 0.06666667 FALSE
## 5 0.1000 0.12500000 FALSE
## 6 0.0010 0.00500000 TRUE
## 7 0.2500 0.25000000 FALSE
## 8 0.0300 0.06000000 FALSE
## 9 0.0700 0.10000000 FALSE
## 10 0.0050 0.01666667 TRUE
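To make the adjustment transparent, here is a hand-rolled BH computation (a sketch for illustration; p.adjust() remains the canonical interface). Each sorted p-value \(p_{(i)}\) is scaled by \(m/i\), monotonicity is enforced by a running minimum from the largest rank downward, and the result is capped at 1:

m <- length(p_values)
o <- order(p_values)                           # indices that sort the p-values
bh <- p_values[o] * m / seq_len(m)             # p_(i) * m / i
bh <- pmin(1, rev(cummin(rev(bh))))            # running min from the top, capped at 1
manual <- numeric(m); manual[o] <- bh          # map back to the original order
all.equal(manual, p.adjust(p_values, method = "BH"))  # TRUE

This also makes clear why 0.02 is not rejected: it adjusts to exactly 0.05, and the comparison against the threshold is strict.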
Rather than implementing the test by hand, we can now call a built-in function that computes the p-value directly.
Let’s say a random variable \(X\) follows a standard normal distribution, \(N(0, 1)\).
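The section does not specify the test, so as a sketch we assume a one-sample t-test of \(H_0\!: \mu = 0\) on samples of size 30 (both assumptions). With the null hypothesis true, the p-values are again approximately uniform:

set.seed(142)
p_null <- replicate(10000, t.test(rnorm(30), mu = 0)$p.value)  # H0 is true here
mean(p_null < 0.05)  # close to the nominal 0.05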
Now let’s simulate with \(X\) following a normal distribution \(N(1, 4)\), i.e., variance 4 and standard deviation 2.
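Under the same assumed setup, sampling from \(N(1, 4)\) makes \(H_0\!: \mu = 0\) false, so the p-values concentrate near zero and the rejection rate estimates the test's power:

set.seed(142)
p_alt <- replicate(10000, t.test(rnorm(30, mean = 1, sd = 2), mu = 0)$p.value)  # H0 is false
mean(p_alt < 0.05)  # statistical power; far above 0.05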