Inference_extra

#problem 1

set.seed(123)
#a
N <- 1000
p <- 0.5
R <- 5000

g_vals <- numeric(R)

for (r in 1:R) {
  x <- rbinom(N, 1, p)
  p_hat <- mean(x)
  g_vals[r] <- p_hat / (1 - p_hat)
}

hist(g_vals, probability = TRUE, breaks = 50,
     main = "Histogram of g(Xbar), N = 1000, p = 0.5",
     xlab = expression(g(bar(X))))

xgrid <- seq(min(g_vals), max(g_vals), length.out = 1000)
lines(xgrid, dnorm(xgrid, mean = 1, sd = sqrt(4/N)),
      col = "red", lwd = 2)

set.seed(123)
#b)
p <- 0.5
R <- 5000
N_vals <- c(10, 30, 50, 100, 500)

par(mfrow = c(3, 2))

for (N in N_vals) {
  g_vals <- rep(NA, R)
  
  for (r in 1:R) {
    x <- rbinom(N, 1, p)
    p_hat <- mean(x)
    
    if (p_hat < 1) {
      g_vals[r] <- p_hat / (1 - p_hat)
    }
  }
  
  hist(g_vals, probability = TRUE, breaks = 40,
       main = paste("N =", N),
       xlab = expression(g(bar(X))),
       col = "lightgray", border = "white")
  
  xgrid <- seq(min(g_vals, na.rm = TRUE),
               max(g_vals, na.rm = TRUE),
               length.out = 1000)
  
  lines(xgrid, dnorm(xgrid, mean = 1, sd = sqrt(4/N)),
        col = "red", lwd = 2)
}

##problem 2


``` r
set.seed(123)

N_vals <- c(10, 30, 100, 1000)
p_vals <- c(0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99)
R <- 100

results_delta <- data.frame()

for (N in N_vals) {
  for (p in p_vals) {
    for (r in 1:R) {
      
      x <- rbinom(N, 1, p)
      p_hat <- mean(x)
      
      fail_delta <- (p_hat == 1)
      
      if (fail_delta) {
        E_hat <- NA
        V_hat <- NA
      } else {
        E_hat <- p_hat / (1 - p_hat)
        V_hat <- p_hat / (N * (1 - p_hat)^3)
      }
      
   results_delta <- rbind(
    results_delta,
        data.frame(
        N = N,
          p = p,
           rep = r,
          p_hat = p_hat,
          E_delta = E_hat,
           V_delta = V_hat,
          fail_delta = fail_delta
        )
      )
    }
  }
}
aggregate(fail_delta ~ N + p, data = results_delta, mean)

##       N    p fail_delta
## 1    10 0.01       0.00
## 2    30 0.01       0.00
## 3   100 0.01       0.00
## 4  1000 0.01       0.00
## 5    10 0.10       0.00
## 6    30 0.10       0.00
## 7   100 0.10       0.00
## 8  1000 0.10       0.00
## 9    10 0.25       0.00
## 10   30 0.25       0.00
## 11  100 0.25       0.00
## 12 1000 0.25       0.00
## 13   10 0.50       0.00
## 14   30 0.50       0.00
## 15  100 0.50       0.00
## 16 1000 0.50       0.00
## 17   10 0.75       0.10
## 18   30 0.75       0.00
## 19  100 0.75       0.00
## 20 1000 0.75       0.00
## 21   10 0.90       0.37
## 22   30 0.90       0.04
## 23  100 0.90       0.00
## 24 1000 0.90       0.00
## 25   10 0.99       0.91
## 26   30 0.99       0.79
## 27  100 0.99       0.37
## 28 1000 0.99       0.00

#b
set.seed(123)

B <- 1000
results_boot <- data.frame()

for (N in N_vals) {
  for (p in p_vals) {
    for (r in 1:R) {
      
      x <- rbinom(N, 1, p)
      g_boot <- rep(NA, B)
      fail_boot_vec <- logical(B)
      
      for (b in 1:B) {
        x_boot <- sample(x, N, replace = TRUE)
        p_boot <- mean(x_boot)
        
        fail_boot_vec[b] <- (p_boot == 1)
        
        if (!fail_boot_vec[b]) {
          g_boot[b] <- p_boot / (1 - p_boot)
        }
      }
      
      results_boot <- rbind(
        results_boot,
        data.frame(
          N = N,
          p = p,
           rep = r,
          E_boot = mean(g_boot, na.rm = TRUE),
            V_boot = var(g_boot, na.rm = TRUE),
          n_boot_fail = sum(fail_boot_vec),
           prop_boot_fail = mean(fail_boot_vec),
           any_boot_fail = any(fail_boot_vec)
        )
       )
    }
  }
}
aggregate(prop_boot_fail ~ N + p, data = results_boot, mean)

##       N    p prop_boot_fail
## 1    10 0.01        0.00000
## 2    30 0.01        0.00000
## 3   100 0.01        0.00000
## 4  1000 0.01        0.00000
## 5    10 0.10        0.00000
## 6    30 0.10        0.00000
## 7   100 0.10        0.00000
## 8  1000 0.10        0.00000
## 9    10 0.25        0.00046
## 10   30 0.25        0.00000
## 11  100 0.25        0.00000
## 12 1000 0.25        0.00000
## 13   10 0.50        0.01182
## 14   30 0.50        0.00000
## 15  100 0.50        0.00000
## 16 1000 0.50        0.00000
## 17   10 0.75        0.15963
## 18   30 0.75        0.00208
## 19  100 0.75        0.00000
## 20 1000 0.75        0.00000
## 21   10 0.90        0.61375
## 22   30 0.90        0.16608
## 23  100 0.90        0.00099
## 24 1000 0.90        0.00000
## 25   10 0.99        0.96735
## 26   30 0.99        0.78867
## 27  100 0.99        0.51581
## 28 1000 0.99        0.00037

##problem 3


``` r
set.seed(123)

N_vals <- c(10, 30, 100, 1000)
p_vals <- c(0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99)
R <- 100

results_bayes <- data.frame()

for (N in N_vals) {
  for (p in p_vals) {
    for (r in 1:R) {
      
      #Simulating sample
      x <- rbinom(N, 1, p)
      S <- sum(x)
      
      #Posterior parameters
      a<- S + 0.5
      b <- N - S + 0.5
      
      # Posterior mean of z = p/(1-p)
      if (b > 1) {
        post_mean <- a / (b - 1)
      } else {
        post_mean <- NA
      }
      
        # Posterior variance of z
      if (b > 2) {
        post_var <- a * (a + b - 1) / ((b - 2) * (b - 1)^2)
      } else {
        post_var <- NA
      }
      
        # MAP of z
      if (S >= 1) {
        post_map <- (S - 0.5) / (N - S + 1.5)
      } else {
        post_map <- 0
      }
      
        # 95% credible interval using Beta quantiles for p
      p_lower <- qbeta(0.025, a, b)
      p_upper <- qbeta(0.975, a, b)
      
      z_lower <- p_lower / (1 - p_lower)
      z_upper <- p_upper / (1 - p_upper)
      
      results_bayes <- rbind(
        results_bayes,
        data.frame(
          N = N,
          p = p,
          rep = r,
          S = S,
          a = a,
          b = b,
            bayes_mean = post_mean,
          bayes_var = post_var,
          bayes_map = post_map,
            ci_lower = z_lower,
          ci_upper = z_upper,
          mean_exists = !is.na(post_mean),
          var_exists = !is.na(post_var)
        )
      )
    }
  }
}
aggregate(mean_exists ~ N + p, data = results_bayes, mean)

##       N    p mean_exists
## 1    10 0.01        1.00
## 2    30 0.01        1.00
## 3   100 0.01        1.00
## 4  1000 0.01        1.00
## 5    10 0.10        1.00
## 6    30 0.10        1.00
## 7   100 0.10        1.00
## 8  1000 0.10        1.00
## 9    10 0.25        1.00
## 10   30 0.25        1.00
## 11  100 0.25        1.00
## 12 1000 0.25        1.00
## 13   10 0.50        1.00
## 14   30 0.50        1.00
## 15  100 0.50        1.00
## 16 1000 0.50        1.00
## 17   10 0.75        0.90
## 18   30 0.75        1.00
## 19  100 0.75        1.00
## 20 1000 0.75        1.00
## 21   10 0.90        0.63
## 22   30 0.90        0.96
## 23  100 0.90        1.00
## 24 1000 0.90        1.00
## 25   10 0.99        0.09
## 26   30 0.99        0.21
## 27  100 0.99        0.63
## 28 1000 0.99        1.00

aggregate(var_exists ~ N + p, data = results_bayes, mean)

##       N    p var_exists
## 1    10 0.01       1.00
## 2    30 0.01       1.00
## 3   100 0.01       1.00
## 4  1000 0.01       1.00
## 5    10 0.10       1.00
## 6    30 0.10       1.00
## 7   100 0.10       1.00
## 8  1000 0.10       1.00
## 9    10 0.25       1.00
## 10   30 0.25       1.00
## 11  100 0.25       1.00
## 12 1000 0.25       1.00
## 13   10 0.50       1.00
## 14   30 0.50       1.00
## 15  100 0.50       1.00
## 16 1000 0.50       1.00
## 17   10 0.75       0.61
## 18   30 0.75       1.00
## 19  100 0.75       1.00
## 20 1000 0.75       1.00
## 21   10 0.90       0.25
## 22   30 0.90       0.77
## 23  100 0.90       1.00
## 24 1000 0.90       1.00
## 25   10 0.99       0.00
## 26   30 0.99       0.02
## 27  100 0.99       0.28
## 28 1000 0.99       1.00

aggregate(cbind(bayes_mean, bayes_var, bayes_map) ~ N + p,
          data = results_bayes, mean, na.rm = TRUE)

##       N    p   bayes_mean    bayes_var    bayes_map
## 1    10 0.01   0.06501548 8.634224e-03  0.004761905
## 2    30 0.01   0.02674205 1.005270e-03  0.005353436
## 3   100 0.01   0.01697502 1.793993e-04  0.007850033
## 4  1000 0.01   0.01113306 1.141590e-05  0.010101478
## 5    10 0.10   0.16973248 3.202819e-02  0.060311072
## 6    30 0.10   0.13417787 6.523420e-03  0.090211370
## 7   100 0.10   0.12059013 1.563154e-03  0.106961778
## 8  1000 0.10   0.11473495 1.429805e-04  0.113367257
## 9    10 0.25   0.46605315 1.542652e-01  0.242570385
## 10   30 0.25   0.38035879 2.812287e-02  0.304929589
## 11  100 0.25   0.35690816 6.880250e-03  0.334139369
## 12 1000 0.25   0.33516822 6.001054e-04  0.332942784
## 13   10 0.50   1.55637696 4.712709e+00  0.797369442
## 14   30 0.50   1.14081712 2.272749e-01  0.928156385
## 15  100 0.50   1.01861377 4.435602e-02  0.959018722
## 16 1000 0.50   1.00497695 4.064507e-03  0.998959723
## 17   10 0.75   3.30601093 2.476746e+01  1.463099742
## 18   30 0.75   3.90612164 1.287914e+01  2.704176829
## 19  100 0.75   3.21665365 6.662436e-01  2.919352618
## 20 1000 0.75   3.06307353 5.142503e-02  3.034240231
## 21   10 0.90   4.34095238 4.289379e+01  1.789841270
## 22   30 0.90  10.23484308 1.982078e+02  5.325113203
## 23  100 0.90  10.80738913 2.823281e+01  8.459935838
## 24 1000 0.90   9.01567294 9.386323e-01  8.827236883
## 25   30 0.99  19.00000000 7.600000e+02  7.857142857
## 26  100 0.99  59.95238095 7.102222e+03 26.482993197
## 27 1000 0.99 125.52629823 1.298774e+04 95.895275412

results_bayes$ci_length <- results_bayes$ci_upper - results_bayes$ci_lower

aggregate(ci_length ~ N + p, data = results_bayes, mean)

##       N    p    ci_length
## 1    10 0.01 3.101900e-01
## 2    30 0.01 1.073456e-01
## 3   100 0.01 4.768982e-02
## 4  1000 0.01 1.296497e-02
## 5    10 0.10 5.700470e-01
## 6    30 0.10 2.872319e-01
## 7   100 0.10 1.510704e-01
## 8  1000 0.10 4.676131e-02
## 9    10 0.25 1.279402e+00
## 10   30 0.25 6.150712e-01
## 11  100 0.25 3.185894e-01
## 12 1000 0.25 9.583893e-02
## 13   10 0.50 4.501711e+00
## 14   30 0.50 1.702021e+00
## 15  100 0.50 8.072527e-01
## 16 1000 0.50 2.494111e-01
## 17   10 0.75 2.119832e+03
## 18   30 0.75 7.833440e+00
## 19  100 0.75 3.046027e+00
## 20 1000 0.75 8.841045e-01
## 21   10 0.90 7.761526e+03
## 22   30 0.90 2.537608e+03
## 23  100 0.90 1.662124e+01
## 24 1000 0.90 3.753537e+00
## 25   10 0.99 1.900545e+04
## 26   30 0.99 4.871229e+04
## 27  100 0.99 7.589646e+04
## 28 1000 0.99 2.032750e+02

##problem 4

``` r
# Delta
delta_E <- results_delta[, c("N", "p", "rep", "E_delta")]
names(delta_E)[4] <- "estimate"
delta_E$method <- "Delta"
delta_E$type <- "Expectation"

# Bootstrap
boot_E <- results_boot[, c("N", "p", "rep", "E_boot")]
names(boot_E)[4] <- "estimate"
boot_E$method <- "Bootstrap"
boot_E$type <- "Expectation"

# Bayesian
bayes_E <- results_bayes[, c("N", "p", "rep", "bayes_mean")]
names(bayes_E)[4] <- "estimate"
bayes_E$method <- "Bayesian"
bayes_E$type <- "Expectation"

comparison_E <- rbind(delta_E, boot_E, bayes_E)
comparison_E <- comparison_E[is.finite(comparison_E$estimate), ]
# Delta
delta_V <- results_delta[, c("N", "p", "rep", "V_delta")]
names(delta_V)[4] <- "estimate"
delta_V$method <- "Delta"
delta_V$type <- "Variance"

# Bootstrap
boot_V <- results_boot[, c("N", "p", "rep", "V_boot")]
names(boot_V)[4] <- "estimate"
boot_V$method <- "Bootstrap"
boot_V$type <- "Variance"

# Bayesian
bayes_V <- results_bayes[, c("N", "p", "rep", "bayes_var")]
names(bayes_V)[4] <- "estimate"
bayes_V$method <- "Bayesian"
bayes_V$type <- "Variance"

comparison_V <- rbind(delta_V, boot_V, bayes_V)
comparison_V <- comparison_V[is.finite(comparison_V$estimate), ]

library(ggplot2)

ggplot(comparison_E, aes(x = method, y = estimate, fill = method)) +
  geom_boxplot(outlier.size = 0.8) +
  facet_grid(N ~ p, scales = "free_y") +
  labs(
    title = "Comparison of Expectation Estimates",
    subtitle = "Delta vs Bootstrap vs Bayesian",
    x = "Method",
    y = "Estimated Expectation"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(comparison_V, aes(x = method, y = estimate, fill = method)) +
  geom_boxplot(outlier.size = 0.8) +
  facet_grid(N ~ p, scales = "free_y") +
  labs(
    title = "Comparison of Variance Estimates",
    subtitle = "Delta vs Bootstrap vs Bayesian",
    x = "Method",
    y = "Estimated Variance"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(comparison_E, aes(x = method, y = estimate, fill = method)) +
  geom_boxplot(outlier.size = 0.8) +
  facet_grid(N ~ p, scales = "free_y") +
  scale_y_log10() +
  labs(
    title = "Comparison of Expectation Estimates (log scale)",
    x = "Method",
    y = "Estimated Expectation (log scale)"
  ) +
  theme_minimal()

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.

## Warning: Removed 506 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

ggplot(comparison_V, aes(x = method, y = estimate, fill = method)) +
  geom_boxplot(outlier.size = 0.8) +
  facet_grid(N ~ p, scales = "free_y") +
  scale_y_log10() +
  labs(
    title = "Comparison of Variance Estimates (log scale)",
    x = "Method",
    y = "Estimated Variance (log scale)"
  ) +
  theme_minimal()

## Warning in scale_y_log10(): log-10 transformation introduced infinite values.
## Removed 506 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Inference_extra_credit

Sabuj Ganguly

2026-03-30