CI for Mean
One Sample Mean - Z
Interval
A (1−\alpha)100 \% confidence
interval for \mu is:
\bar{x} \pm z_{\alpha / 2}\left(\frac{\sigma}{\sqrt{n}}\right)
z_interval <- function(data, conf_level = 0.95){
# basic stats from data
n <- length(data)
xbar <- mean(data)
sigma_hat <- sd(data) # estimate sigma from data
# critical value
alpha <- 1 - conf_level
z_crit <- qnorm(1 - alpha/2)
# margin of error
margin_error <- z_crit * (sigma_hat / sqrt(n))
# interval
lower <- xbar - margin_error
upper <- xbar + margin_error
return(list(
n = n,
mean = xbar,
sd = sigma_hat,
conf_level = conf_level,
z_crit = z_crit,
margin_error = margin_error,
lower = lower,
upper = upper
))
}
df1_url <- "https://github.com/novrisuhermi/dataset/raw/refs/heads/main/data_blood_lead_level.csv"
df1 <- read.csv(df1_url)
head(df1)
print(mean(df1$x))
[1] 29.28741
print(sd(df1$x))
[1] 6.716728
z_interval(df1$x, conf_level = 0.95)
$n
[1] 126
$mean
[1] 29.28741
$sd
[1] 6.716728
$conf_level
[1] 0.95
$z_crit
[1] 1.959964
$margin_error
[1] 1.172791
$lower
[1] 28.11461
$upper
[1] 30.4602
{
mu <- mean(df1$x)
sigma <- sd(df1$x)
x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
y_vals <- dnorm(x_vals, mean = mu, sd = sigma)
hist(df1$x,
breaks = 15,
col = "steelblue",
border = "white",
main = "Distribution of Blood Lead Levels",
xlab = "Blood Lead Levels (micrograms / liter)",
freq = FALSE,
xlim = c(mu - 4 * sigma, mu + 4 * sigma))
polygon(c(x_vals, rev(x_vals)),
c(y_vals, rep(0, length(y_vals))),
col = rgb(0, 0, 1, 0.2),
border = NA)
invisible(lines(x_vals, y_vals,
col = "blue",
lwd = 2))
}

One Sample Mean - t
Interval
A (1−\alpha)100 \% confidence
interval for \mu is:
\bar{x} \pm t_{\alpha / 2, n-1}\left(\frac{s}{\sqrt{n}}\right)
t_interval <- function(data, conf_level = 0.95){
# basic stats from data
n <- length(data)
xbar <- mean(data)
s <- sd(data)
# degree of freedom
df <- n - 1
# critical value
alpha <- 1 - conf_level
t_crit <- qt(1 - alpha/2, df = df)
# margin of error
margin_error <- t_crit * (s / sqrt(n))
# interval
lower <- xbar - margin_error
upper <- xbar + margin_error
return(list(
n = n,
mean = xbar,
sd = s,
df = df,
conf_level = conf_level,
t_crit = t_crit,
margin_error = margin_error,
lower = lower,
upper = upper
))
}
x <- c(118, 115, 125, 110, 112, 130, 117, 112,
115, 120, 113, 118, 119, 122, 123, 126)
t_interval(x, conf_level = 0.95)
$n
[1] 16
$mean
[1] 118.4375
$sd
[1] 5.656486
$df
[1] 15
$conf_level
[1] 0.95
$t_crit
[1] 2.13145
$margin_error
[1] 3.014129
$lower
[1] 115.4234
$upper
[1] 121.4516
# Base R function
t.test(x)[c("estimate","stderr","conf.int")]
$estimate
mean of x
118.4375
$stderr
[1] 1.414121
$conf.int
[1] 115.4234 121.4516
attr(,"conf.level")
[1] 0.95
{
mu <- mean(x)
sigma <- sd(x)
x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
y_vals <- dnorm(x_vals, mean = mu, sd = sigma)
hist(x,
breaks = 8,
col = "steelblue",
border = "white",
main = "Distribution of Beef Consumption",
xlab = "Beef Consumption (pounds / year)",
freq = FALSE,
xlim = c(mu - 4 * sigma, mu + 4 * sigma))
polygon(c(x_vals, rev(x_vals)),
c(y_vals, rep(0, length(y_vals))),
col = rgb(0, 0, 1, 0.2),
border = NA)
invisible(lines(x_vals, y_vals,
col = "blue",
lwd = 2))
}

Two Sample Mean -
Pooled t Interval
A (1−\alpha)100 \% confidence
interval for \mu_X - \mu_Y is:
(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, n+m-2} S_p
\sqrt{\frac{1}{n}+\frac{1}{m}}
where the pooled sample variance is:
S_p^2=\frac{(n-1) S_X^2+(m-1) S_Y^2}{n+m-2}
is an unbiased estimator of the common \sigma^2.
t_interval_2sample_equal <- function(data1, data2, conf_level = 0.95){
# sample size
n1 <- length(data1); n2 <- length(data2)
# statistic mean and sd
xbar1 <- mean(data1); xbar2 <- mean(data2)
s1 <- sd(data1); s2 <- sd(data2)
# pooled variance
sp2 <- ((n1 - 1)*s1^2 + (n2 - 1)*s2^2) / (n1 + n2 - 2)
sp <- sqrt(sp2)
# standard error
SE <- sp * sqrt(1/n1 + 1/n2)
# degree of freedom
df <- n1 + n2 - 2
# critical value
alpha <- 1 - conf_level
t_crit <- qt(1 - alpha/2, df)
# margin of error
margin_error <- t_crit * SE
# difference in mean
diff_mean <- xbar1 - xbar2
# interval
lower <- diff_mean - margin_error
upper <- diff_mean + margin_error
return(list(
n1 = n1, n2 = n2,
mean1 = xbar1, mean2 = xbar2,
sd1 = s1, sd2 = s2,
pooled_sd = sp,
df = df,
conf_level = conf_level,
t_crit = t_crit,
SE = SE,
margin_error = margin_error,
diff_mean = diff_mean,
lower = lower,
upper = upper
))
}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_equal(x1, x2, conf_level = 0.95)
$n1
[1] 10
$n2
[1] 10
$mean1
[1] 10.26
$mean2
[1] 9.02
$sd1
[1] 2.513607
$sd2
[1] 1.896664
$pooled_sd
[1] 2.226607
$df
[1] 18
$conf_level
[1] 0.95
$t_crit
[1] 2.100922
$SE
[1] 0.9957688
$margin_error
[1] 2.092033
$diff_mean
[1] 1.24
$lower
[1] -0.8520327
$upper
[1] 3.332033
# Base R function
t.test(x1,x2,var.equal = TRUE)[c("estimate","stderr","conf.int")]
$estimate
mean of x mean of y
10.26 9.02
$stderr
[1] 0.9957688
$conf.int
[1] -0.8520327 3.3320327
attr(,"conf.level")
[1] 0.95
boxplot(x1, x2,
names = c("deinopis", "menneus"),
col = c(rgb(0, 0, 1, 0.4), rgb(1, 0, 0, 0.4)),
border = c("blue", "red"),
main = "Distribution of Spider Prey Sizes",
ylab = "size (mm)",
notch = FALSE)

Two Sample Mean -
Welch’s t Interval
If the data are approximately normal and \sigma^2_X \neq \sigma^2_Y, a (1−\alpha)100 \% confidence interval for
\mu_X - \mu_Y is:
(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, r}
\sqrt{\frac{S_X^2}{n}+\frac{S_Y^2}{m}}
where the degree of freedom r are
approximated by Welch-Satterthwaite:
r=\frac{\left(\frac{S_X^2}{n}+\frac{S_Y^2}{m}\right)^2}{\frac{\left(S_X^2
/ n\right)^2}{n-1}+\frac{\left(S_Y^2 / m\right)^2}{m-1}}
t_interval_2sample_unequal <- function(data1, data2, conf_level = 0.95){
# sample size
n1 <- length(data1); n2 <- length(data2)
# statistic mean & sd
xbar1 <- mean(data1); xbar2 <- mean(data2)
s1 <- sd(data1); s2 <- sd(data2)
# standard error (Welch)
SE <- sqrt((s1^2 / n1) + (s2^2 / n2))
# Welch degrees of freedom
df <- ((s1^2/n1 + s2^2/n2)^2) /
(( (s1^2/n1)^2 / (n1 - 1) ) + ( (s2^2/n2)^2 / (n2 - 1) ))
# critical value
alpha <- 1 - conf_level
t_crit <- qt(1 - alpha/2, df)
# margin of error
margin_error <- t_crit * SE
# difference in mean
diff_mean <- xbar1 - xbar2
# interval
lower <- diff_mean - margin_error
upper <- diff_mean + margin_error
return(list(
n1 = n1,
n2 = n2,
mean1 = xbar1,
mean2 = xbar2,
sd1 = s1,
sd2 = s2,
df = df,
conf_level = conf_level,
t_crit = t_crit,
SE = SE,
margin_error = margin_error,
diff_mean = diff_mean,
lower = lower,
upper = upper
))
}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_unequal(x1, x2, conf_level = 0.95)
$n1
[1] 10
$n2
[1] 10
$mean1
[1] 10.26
$mean2
[1] 9.02
$sd1
[1] 2.513607
$sd2
[1] 1.896664
$df
[1] 16.73953
$conf_level
[1] 0.95
$t_crit
[1] 2.112319
$SE
[1] 0.9957688
$margin_error
[1] 2.103382
$diff_mean
[1] 1.24
$lower
[1] -0.8633815
$upper
[1] 3.343382
# Base R function
t.test(x1,x2,var.equal = FALSE)[c("estimate","stderr","conf.int")]
$estimate
mean of x mean of y
10.26 9.02
$stderr
[1] 0.9957688
$conf.int
[1] -0.8633815 3.3433815
attr(,"conf.level")
[1] 0.95
Dependent Sample Mean
- Paired t Interval
Define differences D_i = X_i-Y-i for
each pair i.
Assume the D_i are normally
distributed with mean \mu_D =
\mu_X-\mu_Y.
The differences are independent (one per pair), so we reduce to a
one-sample problem!
A (1−\alpha)100 \% confidence
interval for \mu_D=\mu_X - \mu_Y is
simply the one-sample t-interval applied to the differences:
\bar{d} \pm t_{\alpha / 2, n-1}\left(\frac{s_d}{\sqrt{n}}\right)
where
\bar{d}=\frac{1}{n} \sum_{i=1}^n d_i
and
s_d=\sqrt{\frac{1}{n-1} \sum_{i=1}^n\left(d_i-\bar{d}\right)^2}
paired_t_interval <- function(data1, data2, conf_level = 0.95) {
# Check whether the two samples have the same length
if (length(data1) != length(data2)) {
stop("data1 and data2 must have the same length for a paired-sample analysis.")
}
# Compute paired differences
differences <- data1 - data2
# Basic summary statistics of the differences
n <- length(differences)
mean_diff <- mean(differences)
sd_diff <- sd(differences)
df <- n - 1
# Significance level
alpha <- 1 - conf_level
# Critical t value
t_critical <- qt(1 - alpha / 2, df = df)
# Standard error
standard_error <- sd_diff / sqrt(n)
# Margin of error
margin_of_error <- t_critical * standard_error
# Confidence interval
lower_bound <- mean_diff - margin_of_error
upper_bound <- mean_diff + margin_of_error
# Return results
return(list(
n = n,
mean_difference = mean_diff,
sd_difference = sd_diff,
degrees_of_freedom = df,
confidence_level = conf_level,
t_critical = t_critical,
standard_error = standard_error,
margin_of_error = margin_of_error,
lower_bound = lower_bound,
upper_bound = upper_bound
))
}
unaffect <- c(1.94, 1.44, 1.56, 1.58, 2.06,
1.66, 1.75, 1.77, 1.78, 1.92,
1.25, 1.93, 2.04, 1.62, 2.08)
affect <- c(1.27, 1.63, 1.47, 1.39, 1.93,
1.26, 1.71, 1.67, 1.28, 1.85,
1.02, 1.34, 2.02, 1.59, 1.97)
paired_t_interval(unaffect, affect, conf_level = 0.95)
$n
[1] 15
$mean_difference
[1] 0.1986667
$sd_difference
[1] 0.2382935
$degrees_of_freedom
[1] 14
$confidence_level
[1] 0.95
$t_critical
[1] 2.144787
$standard_error
[1] 0.06152713
$margin_of_error
[1] 0.1319626
$lower_bound
[1] 0.0667041
$upper_bound
[1] 0.3306292
# Base R function
t.test(unaffect,affect,paired = TRUE)[c("estimate","stderr","conf.int")]
$estimate
mean difference
0.1986667
$stderr
[1] 0.06152713
$conf.int
[1] 0.0667041 0.3306292
attr(,"conf.level")
[1] 0.95
{
diff <- unaffect - affect
par(mfrow = c(1,3), mar = c(4,6,3,1))
# 🔹 1. Before–After Plot
plot(c(1,2), range(c(unaffect, affect)),
type = "n", xaxt = "n",
xlab = "", ylab = expression(paste("Volume (cm"^3, ")")),
main = "Before vs After")
axis(1, at = c(1,2), labels = c("Unaffect", "Affect"))
for(i in 1:length(unaffect)){
lines(c(1,2), c(unaffect[i], affect[i]),
col = rgb(0.5,0.5,0.5,0.5))
}
points(rep(1, length(unaffect)), unaffect,
col = "blue", pch = 19)
points(rep(2, length(affect)), affect,
col = "red", pch = 19)
# 🔹 2. Boxplot Comparison
boxplot(unaffect, affect,
names = c("Unaffect", "Affect"),
col = c(rgb(0,0,1,0.4), rgb(1,0,0,0.4)),
border = c("blue","red"),
main = "Boxplot Comparison")
# 🔹 3. Boxplot Differences
boxplot(diff,
col = rgb(0,0,1,0.3),
border = "blue",
main = "Difference (Unaffect - Affect)")
abline(h = 0, col = "red", lwd = 2)
# Reset layout
par(mfrow = c(1,1))
invisible(NULL)
}

CI for Proportion
CI for One Sample
Proportion
Let X_i \sim
\operatorname{Bernoulli}(p), we have
\mathrm{E}\left(X_i\right)=p \text{ and }
\operatorname{Var}\left(X_i\right)=p(1-p)
By the Central Limit Theorem, for large n:
Z=\frac{\bar{X}-\mu}{\sigma /
\sqrt{n}}=\frac{\hat{p}-p}{\sqrt{\frac{p(1-p)}{n}}} \sim N(0,1)
\text{ (approximately)}
For large random samples, an approximate (1−\alpha)100\% confidence interval for a
population proportion p is:
\hat{p} \pm z_{\alpha / 2} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}
z_interval_proportion <- function(x, n, conf_level = 0.95) {
# Estimate the proportion
p_hat <- x / n
# Critical value
alpha <- 1 - conf_level
z_crit <- qnorm(1 - alpha / 2)
# Standard error
SE <- sqrt(p_hat * (1 - p_hat) / n)
# Margin of error
margin_error <- z_crit * SE
# Confidence interval
lower <- p_hat - margin_error
upper <- p_hat + margin_error
return(list(
x = x,
n = n,
p_hat = p_hat,
conf_level = conf_level,
z_crit = z_crit,
standard_error = SE,
margin_error = margin_error,
lower = lower,
upper = upper
))
}
z_interval_proportion(280, 418, conf_level = 0.95)
$x
[1] 280
$n
[1] 418
$p_hat
[1] 0.6698565
$conf_level
[1] 0.95
$z_crit
[1] 1.959964
$standard_error
[1] 0.02300139
$margin_error
[1] 0.0450819
$lower
[1] 0.6247746
$upper
[1] 0.7149384
# Base R function
prop.test(x = 280, n = 418, conf.level = 0.95)[c("estimate","conf.int")]
$estimate
p
0.6698565
$conf.int
[1] 0.6221809 0.7143568
attr(,"conf.level")
[1] 0.95
CI for Two Sample
Proportion
For large random samples, an approximate (1−\alpha)100\% confidence interval for a
population proportion p_1-p_2 is:
\left(\hat{p}_1-\hat{p}_2\right) \pm z_{\alpha / 2}
\sqrt{\frac{\hat{p}_1\left(1-\hat{p}_1\right)}{n_1}+\frac{\hat{p}_2\left(1-\hat{p}_2\right)}{n_2}}
z_interval_2prop <- function(x1, n1, x2, n2, conf_level = 0.95) {
# Estimate the proportion
p1_hat <- x1 / n1; p2_hat <- x2 / n2
# Proportion difference
diff <- p1_hat - p2_hat
# Critical value
alpha <- 1 - conf_level
z_crit <- qnorm(1 - alpha / 2)
# Standard error
SE <- sqrt((p1_hat * (1 - p1_hat) / n1) +
(p2_hat * (1 - p2_hat) / n2))
# Margin of error
margin_error <- z_crit * SE
# Confidence interval
lower <- diff - margin_error
upper <- diff + margin_error
return(list(
x1 = x1,
n1 = n1,
p1_hat = p1_hat,
x2 = x2,
n2 = n2,
p2_hat = p2_hat,
diff = diff,
conf_level = conf_level,
z_crit = z_crit,
standard_error = SE,
margin_error = margin_error,
lower = lower,
upper = upper
))
}
z_interval_2prop(x1 = 840, n1 = 2100,
x2 = 323, n2 = 1900,
conf_level = 0.95)
$x1
[1] 840
$n1
[1] 2100
$p1_hat
[1] 0.4
$x2
[1] 323
$n2
[1] 1900
$p2_hat
[1] 0.17
$diff
[1] 0.23
$conf_level
[1] 0.95
$z_crit
[1] 1.959964
$standard_error
[1] 0.01373131
$margin_error
[1] 0.02691287
$lower
[1] 0.2030871
$upper
[1] 0.2569129
# Base R function
prop.test(x = c(840, 323),
n = c(2100, 1900),
conf.level = 0.95,
correct = FALSE)[c("estimate","conf.int")]
$estimate
prop 1 prop 2
0.40 0.17
$conf.int
[1] 0.2030871 0.2569129
attr(,"conf.level")
[1] 0.95
CI for Variance
CI for One
Variance
If X_1, \ldots, X_n \sim N\left(\mu,
\sigma^2\right)
and
a=\chi_{1-\alpha / 2, n-1}^2, b=\chi_{\alpha / 2, n-1}^2,
then a (1-\alpha)100\% CI for \sigma^2 is:
\frac{(n-1) s^2}{b} \leq \sigma^2 \leq \frac{(n-1) s^2}{a}
And a (1-\alpha)100\% CI for \sigma is obtained by taking square
roots:
\frac{\sqrt{(n-1)} s}{\sqrt{b}} \leq \sigma \leq \frac{\sqrt{(n-1)}
s}{\sqrt{a}}
ci_variance <- function(data, conf_level = 0.95){
# basic stats
n <- length(data)
s2 <- var(data)
df <- n - 1
alpha <- 1 - conf_level
# chi-square critical values
chi_lower <- qchisq(alpha/2, df)
chi_upper <- qchisq(1 - alpha/2, df)
# CI for variance
lower_var <- (df * s2) / chi_upper
upper_var <- (df * s2) / chi_lower
# CI for standard deviation
lower_sd <- sqrt(lower_var)
upper_sd <- sqrt(upper_var)
return(list(
n = n,
sample_variance = s2,
df = df,
conf_level = conf_level,
chi_lower = chi_lower,
chi_upper = chi_upper,
variance_CI = c(lower_var, upper_var),
sd_CI = c(lower_sd, upper_sd)
))
}
x <- c(52.48, 52.15, 50.02, 51.85, 54.96,
52.92, 52.08, 51.13, 47.79, 54.31)
ci_variance(data = x, conf_level = 0.95)
$n
[1] 10
$sample_variance
[1] 4.179743
$df
[1] 9
$conf_level
[1] 0.95
$chi_lower
[1] 2.700389
$chi_upper
[1] 19.02277
$variance_CI
[1] 1.977509 13.930468
$sd_CI
[1] 1.406239 3.732354
# Verify results
EnvStats::varTest(x, conf.level = 0.95)[c("estimate","conf.int")]
$estimate
variance
4.179743
$conf.int
LCL UCL
1.977509 13.930468
attr(,"conf.level")
[1] 0.95
CI for Two
Variances
If X_1, \ldots, X_n \sim N\left(\mu_X,
\sigma_X^2\right) and Y_1, \ldots, Y_m
\sim N\left(\mu_Y, \sigma_Y^2\right) be independent random
samples.
Since \frac{(n-1) S_X^2}{\sigma_X^2} \sim
\chi_{n-1}^2 and \frac{(m-1)
S_Y^2}{\sigma_Y^2} \sim \chi_{m-1}^2 independently, by the
definition of the F-distribution:
F=\frac{(m-1) S_Y^2 / \sigma_Y^2 /(m-1)}{(n-1) S_X^2 / \sigma_X^2
/(n-1)}=\frac{\sigma_X^2}{\sigma_Y^2} \cdot \frac{S_Y^2}{S_X^2} \sim
F(m-1, n-1)
This pivotal quantity contains the ratio \sigma^2_X / \sigma^2_Y, exactly the
parameter we want to estimate.
A (1-\alpha)100\% CI for the ratio
of two population variances \sigma^2_X /
\sigma^2_Y is:
\frac{1}{F_{\alpha / 2}(n-1, m-1)} \cdot \frac{S_X^2}{S_Y^2} \leq
\frac{\sigma_X^2}{\sigma_Y^2} \leq F_{\alpha / 2}(m-1, n-1) \cdot
\frac{S_X^2}{S_Y^2}
ci_variance_ratio <- function(data1, data2, conf_level = 0.95){
# basic stats
n1 <- length(data1); n2 <- length(data2)
s1_sq <- var(data1); s2_sq <- var(data2)
df1 <- n1 - 1
df2 <- n2 - 1
alpha <- 1 - conf_level
# F critical values
F_lower <- qf(alpha/2, df1, df2)
F_upper <- qf(1 - alpha/2, df1, df2)
# ratio of variances
ratio <- s1_sq / s2_sq
# confidence interval
lower <- ratio / F_upper
upper <- ratio / F_lower
return(list(
n1 = n1,
n2 = n2,
var1 = s1_sq,
var2 = s2_sq,
ratio = ratio,
df1 = df1,
df2 = df2,
conf_level = conf_level,
F_lower = F_lower,
F_upper = F_upper,
lower = lower,
upper = upper
))
}
ci_variance_ratio(x1, x2, conf_level = 0.95)
$n1
[1] 10
$n2
[1] 10
$var1
[1] 6.318222
$var2
[1] 3.597333
$ratio
[1] 1.756363
$df1
[1] 9
$df2
[1] 9
$conf_level
[1] 0.95
$F_lower
[1] 0.2483859
$F_upper
[1] 4.025994
$lower
[1] 0.4362557
$upper
[1] 7.071106
# Verify results
var.test(x1, x2, conf.level = 0.95)[c("estimate", "conf.int")]
$estimate
ratio of variances
1.756363
$conf.int
[1] 0.4362557 7.0711061
attr(,"conf.level")
[1] 0.95
{
# Density value
d1 <- density(x1)
d2 <- density(x2)
# Data range
x_range <- range(c(d1$x, d2$x))
y_range <- range(c(d1$y, d2$y))
# Empty plot
plot(d1,
col = "blue",
lwd = 2,
main = "Prey Sizes Distribution",
xlab = "Size (mm)",
xlim = x_range,
ylim = y_range,
type = "n")
# 🔵 Fill Group 1
polygon(d1,
col = rgb(0, 0, 1, 0.3),
border = NA)
# 🔴 Fill Group 2
polygon(d2,
col = rgb(1, 0, 0, 0.3),
border = NA)
# Line colors
lines(d1, col = "blue", lwd = 2)
lines(d2, col = "red", lwd = 2)
legend("topright",
legend = c("deinopis", "menneus"),
col = c("blue", "red"),
lwd = 2,
bty = "n")
}

---
title: "Computational Statistics Week 4"

output:
  html_notebook:
    math_method: katex
    theme: yeti
    toc: true
    toc_float:
      toc_collapsed: true
    number_sections: true
    df_print: paged
---

# CI for Mean

## One Sample Mean - Z Interval

A $(1−\alpha)100 \%$ confidence interval for $\mu$ is:

$$
\bar{x} \pm z_{\alpha / 2}\left(\frac{\sigma}{\sqrt{n}}\right)
$$

```{r}
z_interval <- function(data, conf_level = 0.95){
  
  # basic stats from data
  n <- length(data)
  xbar <- mean(data)
  sigma_hat <- sd(data)   # estimate sigma from data
  
  # critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha/2)
  
  # margin of error
  margin_error <- z_crit * (sigma_hat / sqrt(n))
  
  # interval
  lower <- xbar - margin_error
  upper <- xbar + margin_error
  
  return(list(
    n = n,
    mean = xbar,
    sd = sigma_hat,
    conf_level = conf_level,
    z_crit = z_crit,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
df1_url <- "https://github.com/novrisuhermi/dataset/raw/refs/heads/main/data_blood_lead_level.csv"
df1 <- read.csv(df1_url)
head(df1)
```

```{r}
print(mean(df1$x))
print(sd(df1$x))
```

```{r}
z_interval(df1$x, conf_level = 0.95)
```

```{r}
{
  mu <- mean(df1$x)
  sigma <- sd(df1$x)
  x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
  y_vals <- dnorm(x_vals, mean = mu, sd = sigma)

  hist(df1$x,
       breaks = 15,
       col    = "steelblue",
       border = "white",
       main   = "Distribution of Blood Lead Levels",
       xlab   = "Blood Lead Levels (micrograms / liter)",
       freq   = FALSE,
       xlim   = c(mu - 4 * sigma, mu + 4 * sigma))

  polygon(c(x_vals, rev(x_vals)),
          c(y_vals, rep(0, length(y_vals))),
          col = rgb(0, 0, 1, 0.2),
          border = NA)

  invisible(lines(x_vals, y_vals,
                  col = "blue",
                  lwd = 2))
}
```

## One Sample Mean - t Interval

A $(1−\alpha)100 \%$ confidence interval for $\mu$ is:

$$
\bar{x} \pm t_{\alpha / 2, n-1}\left(\frac{s}{\sqrt{n}}\right)
$$

```{r}
t_interval <- function(data, conf_level = 0.95){
  
  # basic stats from data
  n <- length(data)
  xbar <- mean(data)
  s <- sd(data)
  
  # degree of freedom
  df <- n - 1
  
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df = df)
  
  # margin of error
  margin_error <- t_crit * (s / sqrt(n))
  
  # interval
  lower <- xbar - margin_error
  upper <- xbar + margin_error
  
  return(list(
    n = n,
    mean = xbar,
    sd = s,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
x <- c(118, 115, 125, 110, 112, 130, 117, 112, 
       115, 120, 113, 118, 119, 122, 123, 126)
t_interval(x, conf_level = 0.95)
```

```{r}
# Base R function
t.test(x)[c("estimate","stderr","conf.int")]
```

```{r}
{
  mu <- mean(x)
  sigma <- sd(x)
  x_vals <- seq(mu - 4 * sigma, mu + 4 * sigma, length.out = 200)
  y_vals <- dnorm(x_vals, mean = mu, sd = sigma)

  hist(x,
       breaks = 8,
       col    = "steelblue",
       border = "white",
       main   = "Distribution of Beef Consumption",
       xlab   = "Beef Consumption (pounds / year)",
       freq   = FALSE,
       xlim   = c(mu - 4 * sigma, mu + 4 * sigma))

  polygon(c(x_vals, rev(x_vals)),
          c(y_vals, rep(0, length(y_vals))),
          col = rgb(0, 0, 1, 0.2),
          border = NA)

  invisible(lines(x_vals, y_vals,
                  col = "blue",
                  lwd = 2))
}
```

## Two Sample Mean - Pooled t Interval

A $(1−\alpha)100 \%$ confidence interval for $\mu_X - \mu_Y$ is:

$$
(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, n+m-2} S_p \sqrt{\frac{1}{n}+\frac{1}{m}}
$$

where the pooled sample variance is:

$$
S_p^2=\frac{(n-1) S_X^2+(m-1) S_Y^2}{n+m-2}
$$

is an unbiased estimator of the common $\sigma^2$.

```{r}
t_interval_2sample_equal <- function(data1, data2, conf_level = 0.95){
  
  # sample size
  n1 <- length(data1); n2 <- length(data2)
  # statistic mean and sd
  xbar1 <- mean(data1); xbar2 <- mean(data2)
  s1 <- sd(data1); s2 <- sd(data2)
  # pooled variance
  sp2 <- ((n1 - 1)*s1^2 + (n2 - 1)*s2^2) / (n1 + n2 - 2)
  sp <- sqrt(sp2)
  # standard error
  SE <- sp * sqrt(1/n1 + 1/n2)
  # degree of freedom
  df <- n1 + n2 - 2
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df)
  # margin of error
  margin_error <- t_crit * SE
  # difference in mean
  diff_mean <- xbar1 - xbar2
  # interval
  lower <- diff_mean - margin_error
  upper <- diff_mean + margin_error
  
  return(list(
    n1 = n1, n2 = n2,
    mean1 = xbar1, mean2 = xbar2,
    sd1 = s1, sd2 = s2,
    pooled_sd = sp,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    SE = SE,
    margin_error = margin_error,
    diff_mean = diff_mean,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_equal(x1, x2, conf_level = 0.95)
```

```{r}
# Base R function
t.test(x1,x2,var.equal = TRUE)[c("estimate","stderr","conf.int")]
```

```{r}
boxplot(x1, x2,
        names = c("deinopis", "menneus"),
        col   = c(rgb(0, 0, 1, 0.4), rgb(1, 0, 0, 0.4)),
        border = c("blue", "red"),
        main  = "Distribution of Spider Prey Sizes",
        ylab  = "size (mm)",
        notch = FALSE)
```


## Two Sample Mean - Welch's t Interval

If the data are approximately normal and $\sigma^2_X \neq \sigma^2_Y$, a $(1−\alpha)100 \%$ confidence interval for $\mu_X - \mu_Y$ is:

$$
(\bar{X}-\bar{Y}) \pm t_{\alpha / 2, r} \sqrt{\frac{S_X^2}{n}+\frac{S_Y^2}{m}}
$$

where the degree of freedom $r$ are approximated by Welch-Satterthwaite:

$$
r=\frac{\left(\frac{S_X^2}{n}+\frac{S_Y^2}{m}\right)^2}{\frac{\left(S_X^2 / n\right)^2}{n-1}+\frac{\left(S_Y^2 / m\right)^2}{m-1}}
$$

```{r}
t_interval_2sample_unequal <- function(data1, data2, conf_level = 0.95){
  
  # sample size
  n1 <- length(data1); n2 <- length(data2)
  # statistic mean & sd
  xbar1 <- mean(data1); xbar2 <- mean(data2)
  s1 <- sd(data1); s2 <- sd(data2)
  # standard error (Welch)
  SE <- sqrt((s1^2 / n1) + (s2^2 / n2))
  # Welch degrees of freedom
  df <- ((s1^2/n1 + s2^2/n2)^2) /
        (( (s1^2/n1)^2 / (n1 - 1) ) + ( (s2^2/n2)^2 / (n2 - 1) ))
  # critical value
  alpha <- 1 - conf_level
  t_crit <- qt(1 - alpha/2, df)
  # margin of error
  margin_error <- t_crit * SE
  # difference in mean
  diff_mean <- xbar1 - xbar2
  # interval
  lower <- diff_mean - margin_error
  upper <- diff_mean + margin_error
  
  return(list(
    n1 = n1,
    n2 = n2,
    mean1 = xbar1,
    mean2 = xbar2,
    sd1 = s1,
    sd2 = s2,
    df = df,
    conf_level = conf_level,
    t_crit = t_crit,
    SE = SE,
    margin_error = margin_error,
    diff_mean = diff_mean,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
x1 <- c(12.9, 10.2, 7.4, 7.0, 10.5, 11.9, 7.1, 9.9, 14.4, 11.3)
x2 <- c(10.2, 6.9, 10.9, 11.0, 10.1, 5.3, 7.5, 10.3, 9.2, 8.8)
t_interval_2sample_unequal(x1, x2, conf_level = 0.95)
```

```{r}
# Base R function
t.test(x1,x2,var.equal = FALSE)[c("estimate","stderr","conf.int")]
```

## Dependent Sample Mean - Paired t Interval

Define differences $D_i = X_i-Y-i$ for each pair $i$.

Assume the $D_i$ are normally distributed with mean $\mu_D = \mu_X-\mu_Y$.

The differences are independent (one per pair), so we reduce to a one-sample problem!

A $(1−\alpha)100 \%$ confidence interval for $\mu_D=\mu_X - \mu_Y$ is simply the one-sample t-interval applied to the differences:

$$
\bar{d} \pm t_{\alpha / 2, n-1}\left(\frac{s_d}{\sqrt{n}}\right)
$$

where $$
\bar{d}=\frac{1}{n} \sum_{i=1}^n d_i
$$

and

$$
s_d=\sqrt{\frac{1}{n-1} \sum_{i=1}^n\left(d_i-\bar{d}\right)^2}
$$

```{r}
paired_t_interval <- function(data1, data2, conf_level = 0.95) {
  
  # Check whether the two samples have the same length
  if (length(data1) != length(data2)) {
    stop("data1 and data2 must have the same length for a paired-sample analysis.")
  }
  # Compute paired differences
  differences <- data1 - data2
  # Basic summary statistics of the differences
  n <- length(differences)
  mean_diff <- mean(differences)
  sd_diff <- sd(differences)
  df <- n - 1
  # Significance level
  alpha <- 1 - conf_level
  # Critical t value
  t_critical <- qt(1 - alpha / 2, df = df)
  # Standard error
  standard_error <- sd_diff / sqrt(n)
  # Margin of error
  margin_of_error <- t_critical * standard_error
  # Confidence interval
  lower_bound <- mean_diff - margin_of_error
  upper_bound <- mean_diff + margin_of_error
  
  # Return results
  return(list(
    n = n,
    mean_difference = mean_diff,
    sd_difference = sd_diff,
    degrees_of_freedom = df,
    confidence_level = conf_level,
    t_critical = t_critical,
    standard_error = standard_error,
    margin_of_error = margin_of_error,
    lower_bound = lower_bound,
    upper_bound = upper_bound
  ))
}
```

```{r}
unaffect <- c(1.94, 1.44, 1.56, 1.58, 2.06,
              1.66, 1.75, 1.77, 1.78, 1.92,
              1.25, 1.93, 2.04, 1.62, 2.08)

affect <- c(1.27, 1.63, 1.47, 1.39, 1.93,
            1.26, 1.71, 1.67, 1.28, 1.85,
            1.02, 1.34, 2.02, 1.59, 1.97)

paired_t_interval(unaffect, affect, conf_level = 0.95)
```

```{r}
# Base R function
t.test(unaffect,affect,paired = TRUE)[c("estimate","stderr","conf.int")]
```



```{r}
{
  diff <- unaffect - affect
  par(mfrow = c(1,3), mar = c(4,6,3,1))
  
  # 🔹 1. Before–After Plot
  plot(c(1,2), range(c(unaffect, affect)),
       type = "n", xaxt = "n",
       xlab = "", ylab = expression(paste("Volume (cm"^3, ")")),
       main = "Before vs After")
  
  axis(1, at = c(1,2), labels = c("Unaffect", "Affect"))
  
  for(i in 1:length(unaffect)){
    lines(c(1,2), c(unaffect[i], affect[i]),
          col = rgb(0.5,0.5,0.5,0.5))
  }
  
  points(rep(1, length(unaffect)), unaffect,
         col = "blue", pch = 19)
  
  points(rep(2, length(affect)), affect,
         col = "red", pch = 19)
  
  # 🔹 2. Boxplot Comparison
  boxplot(unaffect, affect,
          names = c("Unaffect", "Affect"),
          col = c(rgb(0,0,1,0.4), rgb(1,0,0,0.4)),
          border = c("blue","red"),
          main = "Boxplot Comparison")
  
  # 🔹 3. Boxplot Differences
  boxplot(diff,
          col = rgb(0,0,1,0.3),
          border = "blue",
          main = "Difference (Unaffect - Affect)")
  
  abline(h = 0, col = "red", lwd = 2)
  
  # Reset layout
  par(mfrow = c(1,1))
  
  invisible(NULL)
}
```


# CI for Proportion

## CI for One Sample Proportion

Let $X_i \sim \operatorname{Bernoulli}(p)$, we have

$$
\mathrm{E}\left(X_i\right)=p \text{ and } \operatorname{Var}\left(X_i\right)=p(1-p)
$$

By the Central Limit Theorem, for large $n$:

$$
Z=\frac{\bar{X}-\mu}{\sigma / \sqrt{n}}=\frac{\hat{p}-p}{\sqrt{\frac{p(1-p)}{n}}} \sim N(0,1) \text{  (approximately)}
$$

For large random samples, an approximate $(1−\alpha)100\%$ confidence interval for a population proportion $p$ is:

$$
\hat{p} \pm z_{\alpha / 2} \sqrt{\frac{\hat{p}(1-\hat{p})}{n}}
$$

```{r}
z_interval_proportion <- function(x, n, conf_level = 0.95) {
  
  # Estimate the proportion
  p_hat <- x / n
  
  # Critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha / 2)
  
  # Standard error
  SE <- sqrt(p_hat * (1 - p_hat) / n)
  
  # Margin of error
  margin_error <- z_crit * SE
  
  # Confidence interval
  lower <- p_hat - margin_error
  upper <- p_hat + margin_error
  
  return(list(
    x = x,
    n = n,
    p_hat = p_hat,
    conf_level = conf_level,
    z_crit = z_crit,
    standard_error = SE,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
z_interval_proportion(280, 418, conf_level = 0.95)
```

```{r}
# Base R function
prop.test(x = 280, n = 418, conf.level = 0.95)[c("estimate","conf.int")]
```




## CI for Two Sample Proportion

For large random samples, an approximate $(1−\alpha)100\%$ confidence interval for a population proportion $p_1-p_2$ is:

$$
\left(\hat{p}_1-\hat{p}_2\right) \pm z_{\alpha / 2} \sqrt{\frac{\hat{p}_1\left(1-\hat{p}_1\right)}{n_1}+\frac{\hat{p}_2\left(1-\hat{p}_2\right)}{n_2}}
$$

```{r}
z_interval_2prop <- function(x1, n1, x2, n2, conf_level = 0.95) {

  # Estimate the proportion
  p1_hat <- x1 / n1; p2_hat <- x2 / n2
  # Proportion difference
  diff <- p1_hat - p2_hat
  # Critical value
  alpha <- 1 - conf_level
  z_crit <- qnorm(1 - alpha / 2)
  # Standard error
  SE <- sqrt((p1_hat * (1 - p1_hat) / n1) +
             (p2_hat * (1 - p2_hat) / n2))
  # Margin of error
  margin_error <- z_crit * SE
  # Confidence interval
  lower <- diff - margin_error
  upper <- diff + margin_error
  
  return(list(
    x1 = x1,
    n1 = n1,
    p1_hat = p1_hat,
    x2 = x2,
    n2 = n2,
    p2_hat = p2_hat,
    diff = diff,
    conf_level = conf_level,
    z_crit = z_crit,
    standard_error = SE,
    margin_error = margin_error,
    lower = lower,
    upper = upper
  ))
}
```

```{r}
z_interval_2prop(x1 = 840, n1 = 2100,
                 x2 = 323, n2 = 1900,
                 conf_level = 0.95)
```

```{r}
# Base R function
prop.test(x = c(840, 323),
          n = c(2100, 1900),
          conf.level = 0.95,
          correct = FALSE)[c("estimate","conf.int")]
```

# CI for Variance

## CI for One Variance

If $$X_1, \ldots, X_n \sim N\left(\mu, \sigma^2\right)$$

and

$$
a=\chi_{1-\alpha / 2, n-1}^2, b=\chi_{\alpha / 2, n-1}^2,
$$

then a $(1-\alpha)100\%$ CI for $\sigma^2$ is:

$$
\frac{(n-1) s^2}{b} \leq \sigma^2 \leq \frac{(n-1) s^2}{a}
$$

And a $(1-\alpha)100\%$ CI for $\sigma$ is obtained by taking square roots:

$$
\frac{\sqrt{(n-1)} s}{\sqrt{b}} \leq \sigma \leq \frac{\sqrt{(n-1)} s}{\sqrt{a}}
$$

```{r}
ci_variance <- function(data, conf_level = 0.95){
  
  # basic stats
  n <- length(data)
  s2 <- var(data)
  df <- n - 1
  
  alpha <- 1 - conf_level
  
  # chi-square critical values
  chi_lower <- qchisq(alpha/2, df)
  chi_upper <- qchisq(1 - alpha/2, df)
  
  # CI for variance
  lower_var <- (df * s2) / chi_upper
  upper_var <- (df * s2) / chi_lower
  
  # CI for standard deviation
  lower_sd <- sqrt(lower_var)
  upper_sd <- sqrt(upper_var)
  
  return(list(
    n = n,
    sample_variance = s2,
    df = df,
    conf_level = conf_level,
    chi_lower = chi_lower,
    chi_upper = chi_upper,
    variance_CI = c(lower_var, upper_var),
    sd_CI = c(lower_sd, upper_sd)
  ))
}
```

```{r}
x <- c(52.48, 52.15, 50.02, 51.85, 54.96,
       52.92, 52.08, 51.13, 47.79, 54.31)

ci_variance(data = x, conf_level = 0.95)
```

```{r}
# Verify results
EnvStats::varTest(x, conf.level = 0.95)[c("estimate","conf.int")]
```

## CI for Two Variances

If $X_1, \ldots, X_n \sim N\left(\mu_X, \sigma_X^2\right)$ and $Y_1, \ldots, Y_m \sim N\left(\mu_Y, \sigma_Y^2\right)$ be independent random samples.

Since $\frac{(n-1) S_X^2}{\sigma_X^2} \sim \chi_{n-1}^2$ and $\frac{(m-1) S_Y^2}{\sigma_Y^2} \sim \chi_{m-1}^2$ independently, by the definition of the F-distribution:

$$
F=\frac{(m-1) S_Y^2 / \sigma_Y^2 /(m-1)}{(n-1) S_X^2 / \sigma_X^2 /(n-1)}=\frac{\sigma_X^2}{\sigma_Y^2} \cdot \frac{S_Y^2}{S_X^2} \sim F(m-1, n-1)
$$

This pivotal quantity contains the ratio $\sigma^2_X / \sigma^2_Y$, exactly the parameter we want to estimate.

A $(1-\alpha)100\%$ CI for the ratio of two population variances $\sigma^2_X / \sigma^2_Y$ is:

$$
\frac{1}{F_{\alpha / 2}(n-1, m-1)} \cdot \frac{S_X^2}{S_Y^2} \leq \frac{\sigma_X^2}{\sigma_Y^2} \leq F_{\alpha / 2}(m-1, n-1) \cdot \frac{S_X^2}{S_Y^2}
$$

```{r}
ci_variance_ratio <- function(data1, data2, conf_level = 0.95){
  
  # basic stats
  n1 <- length(data1); n2 <- length(data2)
  s1_sq <- var(data1); s2_sq <- var(data2)
  
  df1 <- n1 - 1
  df2 <- n2 - 1
  
  alpha <- 1 - conf_level
  
  # F critical values
  F_lower <- qf(alpha/2, df1, df2)
  F_upper <- qf(1 - alpha/2, df1, df2)
  
  # ratio of variances
  ratio <- s1_sq / s2_sq
  
  # confidence interval
  lower <- ratio / F_upper
  upper <- ratio / F_lower
  
  return(list(
    n1 = n1,
    n2 = n2,
    var1 = s1_sq,
    var2 = s2_sq,
    ratio = ratio,
    df1 = df1,
    df2 = df2,
    conf_level = conf_level,
    F_lower = F_lower,
    F_upper = F_upper,
    lower = lower,
    upper = upper
  ))
}
```


```{r}
ci_variance_ratio(x1, x2, conf_level = 0.95)
```

```{r}
# Verify results
var.test(x1, x2, conf.level = 0.95)[c("estimate", "conf.int")]
```

```{r}
{
# Density value
d1 <- density(x1)
d2 <- density(x2)

# Data range
x_range <- range(c(d1$x, d2$x))
y_range <- range(c(d1$y, d2$y))

# Empty plot
plot(d1,
     col = "blue",
     lwd = 2,
     main = "Prey Sizes Distribution",
     xlab = "Size (mm)",
     xlim = x_range,
     ylim = y_range,
     type = "n")  

# 🔵 Fill Group 1
polygon(d1,
        col = rgb(0, 0, 1, 0.3),
        border = NA)

# 🔴 Fill Group 2
polygon(d2,
        col = rgb(1, 0, 0, 0.3),
        border = NA)

# Line colors
lines(d1, col = "blue", lwd = 2)
lines(d2, col = "red",  lwd = 2)
legend("topright",
       legend = c("deinopis", "menneus"),
       col = c("blue", "red"),
       lwd = 2,
       bty = "n")
}
```


