1 CI for Mean

1.1 One Sample Mean - Z Interval

A (1−\alpha)100 \% confidence interval for \mu is:

\bar{x} \pm z_{\alpha / 2}\left(\frac{\sigma}{\sqrt{n}}\right)

z_interval <- function(data, conf_level = 0.95){
  
  # basic stats from data
  n <- length(data)
  xbar <- mean(data)
  sigma_hat <- sd(data)   # estimate sigma from data
  
  # critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha/2)
  
  # margin of error
  margin_error <- z_crit * (sigma_hat / sqrt(n))
  
  # interval
  lower <- xbar - margin_error
  upper <- xbar + margin_error
  
  return(list(
    n = n,
    mean = xbar,
    sd = sigma_hat,
    conf_level = conf_level,
    z_crit = z_crit,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
df1_url <- "https://github.com/novrisuhermi/dataset/raw/refs/heads/main/data_blood_lead_level.csv"
df1 <- read.csv(df1_url)
head(df1)
print(mean(df1$x))
[1] 29.28741
print(sd(df1$x))
[1] 6.716728
z_interval(df1$x, conf_level = 0.95)
$n
[1] 126

$mean
[1] 29.28741

$sd
[1] 6.716728

$conf_level
[1] 0.95

$z_crit
[1] 1.959964

$margin_error
[1] 1.172791

$lower
[1] 28.11461

$upper
[1] 30.4602
{
  mu <- mean(df1$x)
  sigma <- sd(df1$x)
  x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
  y_vals <- dnorm(x_vals, mean = mu, sd = sigma)

  hist(df1$x,
       breaks = 15,
       col    = "steelblue",
       border = "white",
       main   = "Distribution of Blood Lead Levels",
       xlab   = "Blood Lead Levels (micrograms / liter)",
       freq   = FALSE,
       xlim   = c(mu - 4 * sigma, mu + 4 * sigma))

  polygon(c(x_vals, rev(x_vals)),
          c(y_vals, rep(0, length(y_vals))),
          col = rgb(0, 0, 1, 0.2),
          border = NA)

  invisible(lines(x_vals, y_vals,
                  col = "blue",
                  lwd = 2))
}

1.2 One Sample Mean - t Interval

A (1−\alpha)100 \% confidence interval for \mu is:

\bar{x} \pm t_{\alpha / 2, n-1}\left(\frac{s}{\sqrt{n}}\right)

t_interval <- function(data, conf_level = 0.95){
  
  # basic stats from data
  n <- length(data)
  xbar <- mean(data)
  s <- sd(data)
  
  # degree of freedom
  df <- n - 1
  
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df = df)
  
  # margin of error
  margin_error <- t_crit * (s / sqrt(n))
  
  # interval
  lower <- xbar - margin_error
  upper <- xbar + margin_error
  
  return(list(
    n = n,
    mean = xbar,
    sd = s,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
x <- c(118, 115, 125, 110, 112, 130, 117, 112, 
       115, 120, 113, 118, 119, 122, 123, 126)
t_interval(x, conf_level = 0.95)
$n
[1] 16

$mean
[1] 118.4375

$sd
[1] 5.656486

$df
[1] 15

$conf_level
[1] 0.95

$t_crit
[1] 2.13145

$margin_error
[1] 3.014129

$lower
[1] 115.4234

$upper
[1] 121.4516
# Base R function
t.test(x)[c("estimate","stderr","conf.int")]
$estimate
mean of x 
 118.4375 

$stderr
[1] 1.414121

$conf.int
[1] 115.4234 121.4516
attr(,"conf.level")
[1] 0.95
{
  mu <- mean(x)
  sigma <- sd(x)
  x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
  y_vals <- dnorm(x_vals, mean = mu, sd = sigma)

  hist(x,
       breaks = 8,
       col    = "steelblue",
       border = "white",
       main   = "Distribution of Beef Consumption",
       xlab   = "Beef Consumption (pounds / year)",
       freq   = FALSE,
       xlim   = c(mu - 4 * sigma, mu + 4 * sigma))

  polygon(c(x_vals, rev(x_vals)),
          c(y_vals, rep(0, length(y_vals))),
          col = rgb(0, 0, 1, 0.2),
          border = NA)

  invisible(lines(x_vals, y_vals,
                  col = "blue",
                  lwd = 2))
}

1.3 Two Sample Mean - Pooled t Interval

A (1−\alpha)100 \% confidence interval for \mu_X - \mu_Y is:

(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, n+m-2} S_p \sqrt{\frac{1}{n}+\frac{1}{m}}

where the pooled sample variance is:

S_p^2=\frac{(n-1) S_X^2+(m-1) S_Y^2}{n+m-2}

is an unbiased estimator of the common \sigma^2.

t_interval_2sample_equal <- function(data1, data2, conf_level = 0.95){
  
  # sample size
  n1 <- length(data1); n2 <- length(data2)
  # statistic mean and sd
  xbar1 <- mean(data1); xbar2 <- mean(data2)
  s1 <- sd(data1); s2 <- sd(data2)
  # pooled variance
  sp2 <- ((n1 - 1)*s1^2 + (n2 - 1)*s2^2) / (n1 + n2 - 2)
  sp <- sqrt(sp2)
  # standard error
  SE <- sp * sqrt(1/n1 + 1/n2)
  # degree of freedom
  df <- n1 + n2 - 2
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df)
  # margin of error
  margin_error <- t_crit * SE
  # difference in mean
  diff_mean <- xbar1 - xbar2
  # interval
  lower <- diff_mean - margin_error
  upper <- diff_mean + margin_error
  
  return(list(
    n1 = n1, n2 = n2,
    mean1 = xbar1, mean2 = xbar2,
    sd1 = s1, sd2 = s2,
    pooled_sd = sp,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    SE = SE,
    margin_error = margin_error,
    diff_mean = diff_mean,
    lower = lower,
    upper = upper
  ))
}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_equal(x1, x2, conf_level = 0.95)
$n1
[1] 10

$n2
[1] 10

$mean1
[1] 10.26

$mean2
[1] 9.02

$sd1
[1] 2.513607

$sd2
[1] 1.896664

$pooled_sd
[1] 2.226607

$df
[1] 18

$conf_level
[1] 0.95

$t_crit
[1] 2.100922

$SE
[1] 0.9957688

$margin_error
[1] 2.092033

$diff_mean
[1] 1.24

$lower
[1] -0.8520327

$upper
[1] 3.332033
# Base R function
t.test(x1,x2,var.equal = TRUE)[c("estimate","stderr","conf.int")]
$estimate
mean of x mean of y 
    10.26      9.02 

$stderr
[1] 0.9957688

$conf.int
[1] -0.8520327  3.3320327
attr(,"conf.level")
[1] 0.95
boxplot(x1, x2,
        names = c("deinopis", "menneus"),
        col   = c(rgb(0, 0, 1, 0.4), rgb(1, 0, 0, 0.4)),
        border = c("blue", "red"),
        main  = "Distribution of Spider Prey Sizes",
        ylab  = "size (mm)",
        notch = FALSE)

1.4 Two Sample Mean - Welch’s t Interval

If the data are approximately normal and \sigma^2_X \neq \sigma^2_Y, a (1−\alpha)100 \% confidence interval for \mu_X - \mu_Y is:

(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, r} \sqrt{\frac{S_X^2}{n}+\frac{S_Y^2}{m}}

where the degree of freedom r are approximated by Welch-Satterthwaite:

r=\frac{\left(\frac{S_X^2}{n}+\frac{S_Y^2}{m}\right)^2}{\frac{\left(S_X^2 / n\right)^2}{n-1}+\frac{\left(S_Y^2 / m\right)^2}{m-1}}

t_interval_2sample_unequal <- function(data1, data2, conf_level = 0.95){
  
  # sample size
  n1 <- length(data1); n2 <- length(data2)
  # statistic mean & sd
  xbar1 <- mean(data1); xbar2 <- mean(data2)
  s1 <- sd(data1); s2 <- sd(data2)
  # standard error (Welch)
  SE <- sqrt((s1^2 / n1) + (s2^2 / n2))
  # Welch degrees of freedom
  df <- ((s1^2/n1 + s2^2/n2)^2) /
        (( (s1^2/n1)^2 / (n1 - 1) ) + ( (s2^2/n2)^2 / (n2 - 1) ))
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df)
  # margin of error
  margin_error <- t_crit * SE
  # difference in mean
  diff_mean <- xbar1 - xbar2
  # interval
  lower <- diff_mean - margin_error
  upper <- diff_mean + margin_error
  
  return(list(
    n1 = n1,
    n2 = n2,
    mean1 = xbar1,
    mean2 = xbar2,
    sd1 = s1,
    sd2 = s2,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    SE = SE,
    margin_error = margin_error,
    diff_mean = diff_mean,
    lower = lower,
    upper = upper
  ))
}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_unequal(x1, x2, conf_level = 0.95)
$n1
[1] 10

$n2
[1] 10

$mean1
[1] 10.26

$mean2
[1] 9.02

$sd1
[1] 2.513607

$sd2
[1] 1.896664

$df
[1] 16.73953

$conf_level
[1] 0.95

$t_crit
[1] 2.112319

$SE
[1] 0.9957688

$margin_error
[1] 2.103382

$diff_mean
[1] 1.24

$lower
[1] -0.8633815

$upper
[1] 3.343382
# Base R function
t.test(x1,x2,var.equal = FALSE)[c("estimate","stderr","conf.int")]
$estimate
mean of x mean of y 
    10.26      9.02 

$stderr
[1] 0.9957688

$conf.int
[1] -0.8633815  3.3433815
attr(,"conf.level")
[1] 0.95

1.5 Dependent Sample Mean - Paired t Interval

Define differences D_i = X_i-Y-i for each pair i.

Assume the D_i are normally distributed with mean \mu_D = \mu_X-\mu_Y.

The differences are independent (one per pair), so we reduce to a one-sample problem!

A (1−\alpha)100 \% confidence interval for \mu_D=\mu_X - \mu_Y is simply the one-sample t-interval applied to the differences:

\bar{d} \pm t_{\alpha / 2, n-1}\left(\frac{s_d}{\sqrt{n}}\right)

where \bar{d}=\frac{1}{n} \sum_{i=1}^n d_i

and

s_d=\sqrt{\frac{1}{n-1} \sum_{i=1}^n\left(d_i-\bar{d}\right)^2}

paired_t_interval <- function(data1, data2, conf_level = 0.95) {
  
  # Check whether the two samples have the same length
  if (length(data1) != length(data2)) {
    stop("data1 and data2 must have the same length for a paired-sample analysis.")
  }
  # Compute paired differences
  differences <- data1 - data2
  # Basic summary statistics of the differences
  n <- length(differences)
  mean_diff <- mean(differences)
  sd_diff <- sd(differences)
  df <- n - 1
  # Significance level
  alpha <- 1 - conf_level
  # Critical t value
  t_critical <- qt(1 - alpha / 2, df = df)
  # Standard error
  standard_error <- sd_diff / sqrt(n)
  # Margin of error
  margin_of_error <- t_critical * standard_error
  # Confidence interval
  lower_bound <- mean_diff - margin_of_error
  upper_bound <- mean_diff + margin_of_error
  
  # Return results
  return(list(
    n = n,
    mean_difference = mean_diff,
    sd_difference = sd_diff,
    degrees_of_freedom = df,
    confidence_level = conf_level,
    t_critical = t_critical,
    standard_error = standard_error,
    margin_of_error = margin_of_error,
    lower_bound = lower_bound,
    upper_bound = upper_bound
  ))
}
unaffect <- c(1.94, 1.44, 1.56, 1.58, 2.06,
              1.66, 1.75, 1.77, 1.78, 1.92,
              1.25, 1.93, 2.04, 1.62, 2.08)

affect <- c(1.27, 1.63, 1.47, 1.39, 1.93,
            1.26, 1.71, 1.67, 1.28, 1.85,
            1.02, 1.34, 2.02, 1.59, 1.97)

paired_t_interval(unaffect, affect, conf_level = 0.95)
$n
[1] 15

$mean_difference
[1] 0.1986667

$sd_difference
[1] 0.2382935

$degrees_of_freedom
[1] 14

$confidence_level
[1] 0.95

$t_critical
[1] 2.144787

$standard_error
[1] 0.06152713

$margin_of_error
[1] 0.1319626

$lower_bound
[1] 0.0667041

$upper_bound
[1] 0.3306292
# Base R function
t.test(unaffect,affect,paired = TRUE)[c("estimate","stderr","conf.int")]
$estimate
mean difference 
      0.1986667 

$stderr
[1] 0.06152713

$conf.int
[1] 0.0667041 0.3306292
attr(,"conf.level")
[1] 0.95
{
  diff <- unaffect - affect
  par(mfrow = c(1,3), mar = c(4,6,3,1))
  
  # 🔹 1. Before–After Plot
  plot(c(1,2), range(c(unaffect, affect)),
       type = "n", xaxt = "n",
       xlab = "", ylab = expression(paste("Volume (cm"^3, ")")),
       main = "Before vs After")
  
  axis(1, at = c(1,2), labels = c("Unaffect", "Affect"))
  
  for(i in 1:length(unaffect)){
    lines(c(1,2), c(unaffect[i], affect[i]),
          col = rgb(0.5,0.5,0.5,0.5))
  }
  
  points(rep(1, length(unaffect)), unaffect,
         col = "blue", pch = 19)
  
  points(rep(2, length(affect)), affect,
         col = "red", pch = 19)
  
  # 🔹 2. Boxplot Comparison
  boxplot(unaffect, affect,
          names = c("Unaffect", "Affect"),
          col = c(rgb(0,0,1,0.4), rgb(1,0,0,0.4)),
          border = c("blue","red"),
          main = "Boxplot Comparison")
  
  # 🔹 3. Boxplot Differences
  boxplot(diff,
          col = rgb(0,0,1,0.3),
          border = "blue",
          main = "Difference (Unaffect - Affect)")
  
  abline(h = 0, col = "red", lwd = 2)
  
  # Reset layout
  par(mfrow = c(1,1))
  
  invisible(NULL)
}

2 CI for Proportion

2.1 CI for One Sample Proportion

Let X_i \sim \operatorname{Bernoulli}(p), we have

\mathrm{E}\left(X_i\right)=p \text{ and } \operatorname{Var}\left(X_i\right)=p(1-p)

By the Central Limit Theorem, for large n:

Z=\frac{\bar{X}-\mu}{\sigma / \sqrt{n}}=\frac{\hat{p}-p}{\sqrt{\frac{p(1-p)}{n}}} \sim N(0,1) \text{ (approximately)}

For large random samples, an approximate (1−\alpha)100\% confidence interval for a population proportion p is:

\hat{p} \pm z_{\alpha / 2} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}

z_interval_proportion <- function(x, n, conf_level = 0.95) {
  
  # Estimate the proportion
  p_hat <- x / n
  
  # Critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha / 2)
  
  # Standard error
  SE <- sqrt(p_hat * (1 - p_hat) / n)
  
  # Margin of error
  margin_error <- z_crit * SE
  
  # Confidence interval
  lower <- p_hat - margin_error
  upper <- p_hat + margin_error
  
  return(list(
    x = x,
    n = n,
    p_hat = p_hat,
    conf_level = conf_level,
    z_crit = z_crit,
    standard_error = SE,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
z_interval_proportion(280, 418, conf_level = 0.95)
$x
[1] 280

$n
[1] 418

$p_hat
[1] 0.6698565

$conf_level
[1] 0.95

$z_crit
[1] 1.959964

$standard_error
[1] 0.02300139

$margin_error
[1] 0.0450819

$lower
[1] 0.6247746

$upper
[1] 0.7149384
# Base R function
prop.test(x = 280, n = 418, conf.level = 0.95)[c("estimate","conf.int")]
$estimate
        p 
0.6698565 

$conf.int
[1] 0.6221809 0.7143568
attr(,"conf.level")
[1] 0.95

2.2 CI for Two Sample Proportion

For large random samples, an approximate (1−\alpha)100\% confidence interval for a population proportion p_1-p_2 is:

\left(\hat{p}_1-\hat{p}_2\right) \pm z_{\alpha / 2} \sqrt{\frac{\hat{p}_1\left(1-\hat{p}_1\right)}{n_1}+\frac{\hat{p}_2\left(1-\hat{p}_2\right)}{n_2}}

z_interval_2prop <- function(x1, n1, x2, n2, conf_level = 0.95) {

  # Estimate the proportion
  p1_hat <- x1 / n1; p2_hat <- x2 / n2
  # Proportion difference
  diff <- p1_hat - p2_hat
  # Critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha / 2)
  # Standard error
  SE <- sqrt((p1_hat * (1 - p1_hat) / n1) +
             (p2_hat * (1 - p2_hat) / n2))
  # Margin of error
  margin_error <- z_crit * SE
  # Confidence interval
  lower <- diff - margin_error
  upper <- diff + margin_error
  
  return(list(
    x1 = x1,
    n1 = n1,
    p1_hat = p1_hat,
    x2 = x2,
    n2 = n2,
    p2_hat = p2_hat,
    diff = diff,
    conf_level = conf_level,
    z_crit = z_crit,
    standard_error = SE,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
z_interval_2prop(x1 = 840, n1 = 2100,
                 x2 = 323, n2 = 1900,
                 conf_level = 0.95)
$x1
[1] 840

$n1
[1] 2100

$p1_hat
[1] 0.4

$x2
[1] 323

$n2
[1] 1900

$p2_hat
[1] 0.17

$diff
[1] 0.23

$conf_level
[1] 0.95

$z_crit
[1] 1.959964

$standard_error
[1] 0.01373131

$margin_error
[1] 0.02691287

$lower
[1] 0.2030871

$upper
[1] 0.2569129
# Base R function
prop.test(x = c(840, 323),
          n = c(2100, 1900),
          conf.level = 0.95,
          correct = FALSE)[c("estimate","conf.int")]
$estimate
prop 1 prop 2 
  0.40   0.17 

$conf.int
[1] 0.2030871 0.2569129
attr(,"conf.level")
[1] 0.95

3 CI for Variance

3.1 CI for One Variance

If X_1, \ldots, X_n \sim N\left(\mu, \sigma^2\right)

and

a=\chi_{1-\alpha / 2, n-1}^2, b=\chi_{\alpha / 2, n-1}^2,

then a (1-\alpha)100\% CI for \sigma^2 is:

\frac{(n-1) s^2}{b} \leq \sigma^2 \leq \frac{(n-1) s^2}{a}

And a (1-\alpha)100\% CI for \sigma is obtained by taking square roots:

\frac{\sqrt{(n-1)} s}{\sqrt{b}} \leq \sigma \leq \frac{\sqrt{(n-1)} s}{\sqrt{a}}

ci_variance <- function(data, conf_level = 0.95){
  
  # basic stats
  n <- length(data)
  s2 <- var(data)
  df <- n - 1
  
  alpha <- 1 - conf_level
  
  # chi-square critical values
  chi_lower <- qchisq(alpha/2, df)
  chi_upper <- qchisq(1 - alpha/2, df)
  
  # CI for variance
  lower_var <- (df * s2) / chi_upper
  upper_var <- (df * s2) / chi_lower
  
  # CI for standard deviation
  lower_sd <- sqrt(lower_var)
  upper_sd <- sqrt(upper_var)
  
  return(list(
    n = n,
    sample_variance = s2,
    df = df,
    conf_level = conf_level,
    chi_lower = chi_lower,
    chi_upper = chi_upper,
    variance_CI = c(lower_var, upper_var),
    sd_CI = c(lower_sd, upper_sd)
  ))
}
x <- c(52.48, 52.15, 50.02, 51.85, 54.96,
       52.92, 52.08, 51.13, 47.79, 54.31)

ci_variance(data = x, conf_level = 0.95)
$n
[1] 10

$sample_variance
[1] 4.179743

$df
[1] 9

$conf_level
[1] 0.95

$chi_lower
[1] 2.700389

$chi_upper
[1] 19.02277

$variance_CI
[1]  1.977509 13.930468

$sd_CI
[1] 1.406239 3.732354
# Verify results
EnvStats::varTest(x, conf.level = 0.95)[c("estimate","conf.int")]
$estimate
variance 
4.179743 

$conf.int
      LCL       UCL 
 1.977509 13.930468 
attr(,"conf.level")
[1] 0.95

3.2 CI for Two Variances

If X_1, \ldots, X_n \sim N\left(\mu_X, \sigma_X^2\right) and Y_1, \ldots, Y_m \sim N\left(\mu_Y, \sigma_Y^2\right) be independent random samples.

Since \frac{(n-1) S_X^2}{\sigma_X^2} \sim \chi_{n-1}^2 and \frac{(m-1) S_Y^2}{\sigma_Y^2} \sim \chi_{m-1}^2 independently, by the definition of the F-distribution:

F=\frac{(m-1) S_Y^2 / \sigma_Y^2 /(m-1)}{(n-1) S_X^2 / \sigma_X^2 /(n-1)}=\frac{\sigma_X^2}{\sigma_Y^2} \cdot \frac{S_Y^2}{S_X^2} \sim F(m-1, n-1)

This pivotal quantity contains the ratio \sigma^2_X / \sigma^2_Y, exactly the parameter we want to estimate.

A (1-\alpha)100\% CI for the ratio of two population variances \sigma^2_X / \sigma^2_Y is:

\frac{1}{F_{\alpha / 2}(n-1, m-1)} \cdot \frac{S_X^2}{S_Y^2} \leq \frac{\sigma_X^2}{\sigma_Y^2} \leq F_{\alpha / 2}(m-1, n-1) \cdot \frac{S_X^2}{S_Y^2}

ci_variance_ratio <- function(data1, data2, conf_level = 0.95){
  
  # basic stats
  n1 <- length(data1); n2 <- length(data2)
  s1_sq <- var(data1); s2_sq <- var(data2)
  
  df1 <- n1 - 1
  df2 <- n2 - 1
  
  alpha <- 1 - conf_level
  
  # F critical values
  F_lower <- qf(alpha/2, df1, df2)
  F_upper <- qf(1 - alpha/2, df1, df2)
  
  # ratio of variances
  ratio <- s1_sq / s2_sq
  
  # confidence interval
  lower <- ratio / F_upper
  upper <- ratio / F_lower
  
  return(list(
    n1 = n1,
    n2 = n2,
    var1 = s1_sq,
    var2 = s2_sq,
    ratio = ratio,
    df1 = df1,
    df2 = df2,
    conf_level = conf_level,
    F_lower = F_lower,
    F_upper = F_upper,
    lower = lower,
    upper = upper
  ))
}
ci_variance_ratio(x1, x2, conf_level = 0.95)
$n1
[1] 10

$n2
[1] 10

$var1
[1] 6.318222

$var2
[1] 3.597333

$ratio
[1] 1.756363

$df1
[1] 9

$df2
[1] 9

$conf_level
[1] 0.95

$F_lower
[1] 0.2483859

$F_upper
[1] 4.025994

$lower
[1] 0.4362557

$upper
[1] 7.071106
# Verify results
var.test(x1, x2, conf.level = 0.95)[c("estimate", "conf.int")]
$estimate
ratio of variances 
          1.756363 

$conf.int
[1] 0.4362557 7.0711061
attr(,"conf.level")
[1] 0.95
{
# Density value
d1 <- density(x1)
d2 <- density(x2)

# Data range
x_range <- range(c(d1$x, d2$x))
y_range <- range(c(d1$y, d2$y))

# Empty plot
plot(d1,
     col = "blue",
     lwd = 2,
     main = "Prey Sizes Distribution",
     xlab = "Size (mm)",
     xlim = x_range,
     ylim = y_range,
     type = "n")  

# 🔵 Fill Group 1
polygon(d1,
        col = rgb(0, 0, 1, 0.3),
        border = NA)

# 🔴 Fill Group 2
polygon(d2,
        col = rgb(1, 0, 0, 0.3),
        border = NA)

# Line colors
lines(d1, col = "blue", lwd = 2)
lines(d2, col = "red",  lwd = 2)
legend("topright",
       legend = c("deinopis", "menneus"),
       col = c("blue", "red"),
       lwd = 2,
       bty = "n")
}

---
title: "Computational Statistics Week 4"

output:
  html_notebook:
    math_method: katex
    theme: yeti
    toc: true
    toc_float:
      toc_collapsed: true
    number_sections: true
    df_print: paged
---

# CI for Mean

## One Sample Mean - Z Interval

A $(1−\alpha)100 \%$ confidence interval for $\mu$ is:

$$
\bar{x} \pm z_{\alpha / 2}\left(\frac{\sigma}{\sqrt{n}}\right)
$$

```{r}
z_interval <- function(data, conf_level = 0.95){
  
  # basic stats from data
  n <- length(data)
  xbar <- mean(data)
  sigma_hat <- sd(data)   # estimate sigma from data
  
  # critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha/2)
  
  # margin of error
  margin_error <- z_crit * (sigma_hat / sqrt(n))
  
  # interval
  lower <- xbar - margin_error
  upper <- xbar + margin_error
  
  return(list(
    n = n,
    mean = xbar,
    sd = sigma_hat,
    conf_level = conf_level,
    z_crit = z_crit,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
df1_url <- "https://github.com/novrisuhermi/dataset/raw/refs/heads/main/data_blood_lead_level.csv"
df1 <- read.csv(df1_url)
head(df1)
```

```{r}
print(mean(df1$x))
print(sd(df1$x))
```

```{r}
z_interval(df1$x, conf_level = 0.95)
```

```{r}
{
  mu <- mean(df1$x)
  sigma <- sd(df1$x)
  x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
  y_vals <- dnorm(x_vals, mean = mu, sd = sigma)

  hist(df1$x,
       breaks = 15,
       col    = "steelblue",
       border = "white",
       main   = "Distribution of Blood Lead Levels",
       xlab   = "Blood Lead Levels (micrograms / liter)",
       freq   = FALSE,
       xlim   = c(mu - 4 * sigma, mu + 4 * sigma))

  polygon(c(x_vals, rev(x_vals)),
          c(y_vals, rep(0, length(y_vals))),
          col = rgb(0, 0, 1, 0.2),
          border = NA)

  invisible(lines(x_vals, y_vals,
                  col = "blue",
                  lwd = 2))
}
```

## One Sample Mean - t Interval

A $(1−\alpha)100 \%$ confidence interval for $\mu$ is:

$$
\bar{x} \pm t_{\alpha / 2, n-1}\left(\frac{s}{\sqrt{n}}\right)
$$

```{r}
t_interval <- function(data, conf_level = 0.95){
  
  # basic stats from data
  n <- length(data)
  xbar <- mean(data)
  s <- sd(data)
  
  # degree of freedom
  df <- n - 1
  
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df = df)
  
  # margin of error
  margin_error <- t_crit * (s / sqrt(n))
  
  # interval
  lower <- xbar - margin_error
  upper <- xbar + margin_error
  
  return(list(
    n = n,
    mean = xbar,
    sd = s,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
x <- c(118, 115, 125, 110, 112, 130, 117, 112, 
       115, 120, 113, 118, 119, 122, 123, 126)
t_interval(x, conf_level = 0.95)
```

```{r}
# Base R function
t.test(x)[c("estimate","stderr","conf.int")]
```

```{r}
{
  mu <- mean(x)
  sigma <- sd(x)
  x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
  y_vals <- dnorm(x_vals, mean = mu, sd = sigma)

  hist(x,
       breaks = 8,
       col    = "steelblue",
       border = "white",
       main   = "Distribution of Beef Consumption",
       xlab   = "Beef Consumption (pounds / year)",
       freq   = FALSE,
       xlim   = c(mu - 4 * sigma, mu + 4 * sigma))

  polygon(c(x_vals, rev(x_vals)),
          c(y_vals, rep(0, length(y_vals))),
          col = rgb(0, 0, 1, 0.2),
          border = NA)

  invisible(lines(x_vals, y_vals,
                  col = "blue",
                  lwd = 2))
}
```

## Two Sample Mean - Pooled t Interval

A $(1−\alpha)100 \%$ confidence interval for $\mu_X - \mu_Y$ is:

$$
(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, n+m-2} S_p \sqrt{\frac{1}{n}+\frac{1}{m}}
$$

where the pooled sample variance is:

$$
S_p^2=\frac{(n-1) S_X^2+(m-1) S_Y^2}{n+m-2}
$$

is an unbiased estimator of the common $\sigma^2$.

```{r}
t_interval_2sample_equal <- function(data1, data2, conf_level = 0.95){
  
  # sample size
  n1 <- length(data1); n2 <- length(data2)
  # statistic mean and sd
  xbar1 <- mean(data1); xbar2 <- mean(data2)
  s1 <- sd(data1); s2 <- sd(data2)
  # pooled variance
  sp2 <- ((n1 - 1)*s1^2 + (n2 - 1)*s2^2) / (n1 + n2 - 2)
  sp <- sqrt(sp2)
  # standard error
  SE <- sp * sqrt(1/n1 + 1/n2)
  # degree of freedom
  df <- n1 + n2 - 2
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df)
  # margin of error
  margin_error <- t_crit * SE
  # difference in mean
  diff_mean <- xbar1 - xbar2
  # interval
  lower <- diff_mean - margin_error
  upper <- diff_mean + margin_error
  
  return(list(
    n1 = n1, n2 = n2,
    mean1 = xbar1, mean2 = xbar2,
    sd1 = s1, sd2 = s2,
    pooled_sd = sp,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    SE = SE,
    margin_error = margin_error,
    diff_mean = diff_mean,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_equal(x1, x2, conf_level = 0.95)
```

```{r}
# Base R function
t.test(x1,x2,var.equal = TRUE)[c("estimate","stderr","conf.int")]
```

```{r}
boxplot(x1, x2,
        names = c("deinopis", "menneus"),
        col   = c(rgb(0, 0, 1, 0.4), rgb(1, 0, 0, 0.4)),
        border = c("blue", "red"),
        main  = "Distribution of Spider Prey Sizes",
        ylab  = "size (mm)",
        notch = FALSE)
```


## Two Sample Mean - Welch's t Interval

If the data are approximately normal and $\sigma^2_X \neq \sigma^2_Y$, a $(1−\alpha)100 \%$ confidence interval for $\mu_X - \mu_Y$ is:

$$
(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, r} \sqrt{\frac{S_X^2}{n}+\frac{S_Y^2}{m}}
$$

where the degree of freedom $r$ are approximated by Welch-Satterthwaite:

$$
r=\frac{\left(\frac{S_X^2}{n}+\frac{S_Y^2}{m}\right)^2}{\frac{\left(S_X^2 / n\right)^2}{n-1}+\frac{\left(S_Y^2 / m\right)^2}{m-1}}
$$

```{r}
t_interval_2sample_unequal <- function(data1, data2, conf_level = 0.95){
  
  # sample size
  n1 <- length(data1); n2 <- length(data2)
  # statistic mean & sd
  xbar1 <- mean(data1); xbar2 <- mean(data2)
  s1 <- sd(data1); s2 <- sd(data2)
  # standard error (Welch)
  SE <- sqrt((s1^2 / n1) + (s2^2 / n2))
  # Welch degrees of freedom
  df <- ((s1^2/n1 + s2^2/n2)^2) /
        (( (s1^2/n1)^2 / (n1 - 1) ) + ( (s2^2/n2)^2 / (n2 - 1) ))
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df)
  # margin of error
  margin_error <- t_crit * SE
  # difference in mean
  diff_mean <- xbar1 - xbar2
  # interval
  lower <- diff_mean - margin_error
  upper <- diff_mean + margin_error
  
  return(list(
    n1 = n1,
    n2 = n2,
    mean1 = xbar1,
    mean2 = xbar2,
    sd1 = s1,
    sd2 = s2,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    SE = SE,
    margin_error = margin_error,
    diff_mean = diff_mean,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_unequal(x1, x2, conf_level = 0.95)
```

```{r}
# Base R function
t.test(x1,x2,var.equal = FALSE)[c("estimate","stderr","conf.int")]
```

## Dependent Sample Mean - Paired t Interval

Define differences $D_i = X_i-Y-i$ for each pair $i$.

Assume the $D_i$ are normally distributed with mean $\mu_D = \mu_X-\mu_Y$.

The differences are independent (one per pair), so we reduce to a one-sample problem!

A $(1−\alpha)100 \%$ confidence interval for $\mu_D=\mu_X - \mu_Y$ is simply the one-sample t-interval applied to the differences:

$$
\bar{d} \pm t_{\alpha / 2, n-1}\left(\frac{s_d}{\sqrt{n}}\right)
$$

where $$
\bar{d}=\frac{1}{n} \sum_{i=1}^n d_i
$$

and

$$
s_d=\sqrt{\frac{1}{n-1} \sum_{i=1}^n\left(d_i-\bar{d}\right)^2}
$$

```{r}
paired_t_interval <- function(data1, data2, conf_level = 0.95) {
  
  # Check whether the two samples have the same length
  if (length(data1) != length(data2)) {
    stop("data1 and data2 must have the same length for a paired-sample analysis.")
  }
  # Compute paired differences
  differences <- data1 - data2
  # Basic summary statistics of the differences
  n <- length(differences)
  mean_diff <- mean(differences)
  sd_diff <- sd(differences)
  df <- n - 1
  # Significance level
  alpha <- 1 - conf_level
  # Critical t value
  t_critical <- qt(1 - alpha / 2, df = df)
  # Standard error
  standard_error <- sd_diff / sqrt(n)
  # Margin of error
  margin_of_error <- t_critical * standard_error
  # Confidence interval
  lower_bound <- mean_diff - margin_of_error
  upper_bound <- mean_diff + margin_of_error
  
  # Return results
  return(list(
    n = n,
    mean_difference = mean_diff,
    sd_difference = sd_diff,
    degrees_of_freedom = df,
    confidence_level = conf_level,
    t_critical = t_critical,
    standard_error = standard_error,
    margin_of_error = margin_of_error,
    lower_bound = lower_bound,
    upper_bound = upper_bound
  ))
}
```

```{r}
unaffect <- c(1.94, 1.44, 1.56, 1.58, 2.06,
              1.66, 1.75, 1.77, 1.78, 1.92,
              1.25, 1.93, 2.04, 1.62, 2.08)

affect <- c(1.27, 1.63, 1.47, 1.39, 1.93,
            1.26, 1.71, 1.67, 1.28, 1.85,
            1.02, 1.34, 2.02, 1.59, 1.97)

paired_t_interval(unaffect, affect, conf_level = 0.95)
```

```{r}
# Base R function
t.test(unaffect,affect,paired = TRUE)[c("estimate","stderr","conf.int")]
```



```{r}
{
  diff <- unaffect - affect
  par(mfrow = c(1,3), mar = c(4,6,3,1))
  
  # 🔹 1. Before–After Plot
  plot(c(1,2), range(c(unaffect, affect)),
       type = "n", xaxt = "n",
       xlab = "", ylab = expression(paste("Volume (cm"^3, ")")),
       main = "Before vs After")
  
  axis(1, at = c(1,2), labels = c("Unaffect", "Affect"))
  
  for(i in 1:length(unaffect)){
    lines(c(1,2), c(unaffect[i], affect[i]),
          col = rgb(0.5,0.5,0.5,0.5))
  }
  
  points(rep(1, length(unaffect)), unaffect,
         col = "blue", pch = 19)
  
  points(rep(2, length(affect)), affect,
         col = "red", pch = 19)
  
  # 🔹 2. Boxplot Comparison
  boxplot(unaffect, affect,
          names = c("Unaffect", "Affect"),
          col = c(rgb(0,0,1,0.4), rgb(1,0,0,0.4)),
          border = c("blue","red"),
          main = "Boxplot Comparison")
  
  # 🔹 3. Boxplot Differences
  boxplot(diff,
          col = rgb(0,0,1,0.3),
          border = "blue",
          main = "Difference (Unaffect - Affect)")
  
  abline(h = 0, col = "red", lwd = 2)
  
  # Reset layout
  par(mfrow = c(1,1))
  
  invisible(NULL)
}
```


# CI for Proportion

## CI for One Sample Proportion

Let $X_i \sim \operatorname{Bernoulli}(p)$, we have

$$
\mathrm{E}\left(X_i\right)=p \text{ and } \operatorname{Var}\left(X_i\right)=p(1-p)
$$

By the Central Limit Theorem, for large $n$:

$$
Z=\frac{\bar{X}-\mu}{\sigma / \sqrt{n}}=\frac{\hat{p}-p}{\sqrt{\frac{p(1-p)}{n}}} \sim N(0,1) \text{  (approximately)}
$$

For large random samples, an approximate $(1−\alpha)100\%$ confidence interval for a population proportion $p$ is:

$$
\hat{p} \pm z_{\alpha / 2} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}
$$

```{r}
z_interval_proportion <- function(x, n, conf_level = 0.95) {
  
  # Estimate the proportion
  p_hat <- x / n
  
  # Critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha / 2)
  
  # Standard error
  SE <- sqrt(p_hat * (1 - p_hat) / n)
  
  # Margin of error
  margin_error <- z_crit * SE
  
  # Confidence interval
  lower <- p_hat - margin_error
  upper <- p_hat + margin_error
  
  return(list(
    x = x,
    n = n,
    p_hat = p_hat,
    conf_level = conf_level,
    z_crit = z_crit,
    standard_error = SE,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
z_interval_proportion(280, 418, conf_level = 0.95)
```

```{r}
# Base R function
prop.test(x = 280, n = 418, conf.level = 0.95)[c("estimate","conf.int")]
```




## CI for Two Sample Proportion

For large random samples, an approximate $(1−\alpha)100\%$ confidence interval for a population proportion $p_1-p_2$ is:

$$
\left(\hat{p}_1-\hat{p}_2\right) \pm z_{\alpha / 2} \sqrt{\frac{\hat{p}_1\left(1-\hat{p}_1\right)}{n_1}+\frac{\hat{p}_2\left(1-\hat{p}_2\right)}{n_2}}
$$

```{r}
z_interval_2prop <- function(x1, n1, x2, n2, conf_level = 0.95) {

  # Estimate the proportion
  p1_hat <- x1 / n1; p2_hat <- x2 / n2
  # Proportion difference
  diff <- p1_hat - p2_hat
  # Critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha / 2)
  # Standard error
  SE <- sqrt((p1_hat * (1 - p1_hat) / n1) +
             (p2_hat * (1 - p2_hat) / n2))
  # Margin of error
  margin_error <- z_crit * SE
  # Confidence interval
  lower <- diff - margin_error
  upper <- diff + margin_error
  
  return(list(
    x1 = x1,
    n1 = n1,
    p1_hat = p1_hat,
    x2 = x2,
    n2 = n2,
    p2_hat = p2_hat,
    diff = diff,
    conf_level = conf_level,
    z_crit = z_crit,
    standard_error = SE,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
z_interval_2prop(x1 = 840, n1 = 2100,
                 x2 = 323, n2 = 1900,
                 conf_level = 0.95)
```

```{r}
# Base R function
prop.test(x = c(840, 323),
          n = c(2100, 1900),
          conf.level = 0.95,
          correct = FALSE)[c("estimate","conf.int")]
```

# CI for Variance

## CI for One Variance

If $$X_1, \ldots, X_n \sim N\left(\mu, \sigma^2\right)$$

and

$$
a=\chi_{1-\alpha / 2, n-1}^2, b=\chi_{\alpha / 2, n-1}^2,
$$

then a $(1-\alpha)100\%$ CI for $\sigma^2$ is:

$$
\frac{(n-1) s^2}{b} \leq \sigma^2 \leq \frac{(n-1) s^2}{a}
$$

And a $(1-\alpha)100\%$ CI for $\sigma$ is obtained by taking square roots:

$$
\frac{\sqrt{(n-1)} s}{\sqrt{b}} \leq \sigma \leq \frac{\sqrt{(n-1)} s}{\sqrt{a}}
$$

```{r}
ci_variance <- function(data, conf_level = 0.95){
  
  # basic stats
  n <- length(data)
  s2 <- var(data)
  df <- n - 1
  
  alpha <- 1 - conf_level
  
  # chi-square critical values
  chi_lower <- qchisq(alpha/2, df)
  chi_upper <- qchisq(1 - alpha/2, df)
  
  # CI for variance
  lower_var <- (df * s2) / chi_upper
  upper_var <- (df * s2) / chi_lower
  
  # CI for standard deviation
  lower_sd <- sqrt(lower_var)
  upper_sd <- sqrt(upper_var)
  
  return(list(
    n = n,
    sample_variance = s2,
    df = df,
    conf_level = conf_level,
    chi_lower = chi_lower,
    chi_upper = chi_upper,
    variance_CI = c(lower_var, upper_var),
    sd_CI = c(lower_sd, upper_sd)
  ))
}
```

```{r}
x <- c(52.48, 52.15, 50.02, 51.85, 54.96,
       52.92, 52.08, 51.13, 47.79, 54.31)

ci_variance(data = x, conf_level = 0.95)
```

```{r}
# Verify results
EnvStats::varTest(x, conf.level = 0.95)[c("estimate","conf.int")]
```

## CI for Two Variances

If $X_1, \ldots, X_n \sim N\left(\mu_X, \sigma_X^2\right)$ and $Y_1, \ldots, Y_m \sim N\left(\mu_Y, \sigma_Y^2\right)$ be independent random samples.

Since $\frac{(n-1) S_X^2}{\sigma_X^2} \sim \chi_{n-1}^2$ and $\frac{(m-1) S_Y^2}{\sigma_Y^2} \sim \chi_{m-1}^2$ independently, by the definition of the F-distribution:

$$
F=\frac{(m-1) S_Y^2 / \sigma_Y^2 /(m-1)}{(n-1) S_X^2 / \sigma_X^2 /(n-1)}=\frac{\sigma_X^2}{\sigma_Y^2} \cdot \frac{S_Y^2}{S_X^2} \sim F(m-1, n-1)
$$

This pivotal quantity contains the ratio $\sigma^2_X / \sigma^2_Y$, exactly the parameter we want to estimate.

A $(1-\alpha)100\%$ CI for the ratio of two population variances $\sigma^2_X / \sigma^2_Y$ is:

$$
\frac{1}{F_{\alpha / 2}(n-1, m-1)} \cdot \frac{S_X^2}{S_Y^2} \leq \frac{\sigma_X^2}{\sigma_Y^2} \leq F_{\alpha / 2}(m-1, n-1) \cdot \frac{S_X^2}{S_Y^2}
$$

```{r}
ci_variance_ratio <- function(data1, data2, conf_level = 0.95){
  
  # basic stats
  n1 <- length(data1); n2 <- length(data2)
  s1_sq <- var(data1); s2_sq <- var(data2)
  
  df1 <- n1 - 1
  df2 <- n2 - 1
  
  alpha <- 1 - conf_level
  
  # F critical values
  F_lower <- qf(alpha/2, df1, df2)
  F_upper <- qf(1 - alpha/2, df1, df2)
  
  # ratio of variances
  ratio <- s1_sq / s2_sq
  
  # confidence interval
  lower <- ratio / F_upper
  upper <- ratio / F_lower
  
  return(list(
    n1 = n1,
    n2 = n2,
    var1 = s1_sq,
    var2 = s2_sq,
    ratio = ratio,
    df1 = df1,
    df2 = df2,
    conf_level = conf_level,
    F_lower = F_lower,
    F_upper = F_upper,
    lower = lower,
    upper = upper
  ))
}
```


```{r}
ci_variance_ratio(x1, x2, conf_level = 0.95)
```

```{r}
# Verify results
var.test(x1, x2, conf.level = 0.95)[c("estimate", "conf.int")]
```

```{r}
{
# Density value
d1 <- density(x1)
d2 <- density(x2)

# Data range
x_range <- range(c(d1$x, d2$x))
y_range <- range(c(d1$y, d2$y))

# Empty plot
plot(d1,
     col = "blue",
     lwd = 2,
     main = "Prey Sizes Distribution",
     xlab = "Size (mm)",
     xlim = x_range,
     ylim = y_range,
     type = "n")  

# 🔵 Fill Group 1
polygon(d1,
        col = rgb(0, 0, 1, 0.3),
        border = NA)

# 🔴 Fill Group 2
polygon(d2,
        col = rgb(1, 0, 0, 0.3),
        border = NA)

# Line colors
lines(d1, col = "blue", lwd = 2)
lines(d2, col = "red",  lwd = 2)
legend("topright",
       legend = c("deinopis", "menneus"),
       col = c("blue", "red"),
       lwd = 2,
       bty = "n")
}
```


