Homework 4 Solutions

Question 1

Objective: Generate a sample of size \(n=10\) from a \(\chi^2_5\) distribution. Estimate the mean using a confidence interval. Construct a 95% confidence interval using the Central Limit Theorem (CLT), and check whether it contains the true mean (5). Then construct a bootstrap interval using 100 bootstrap samples and perform the same check. Repeat the procedure 1000 times and compare coverage.

set.seed(123)

n <- 10
B <- 100
sims <- 1000
true_mean <- 5

clt_coverage_count <- 0
boot_coverage_count <- 0

for(i in 1:sims) {
  x <- rchisq(n, df = 5)
  
  # CLT Confidence Interval
  se <- sd(x) / sqrt(n)
  clt_lower <- mean(x) - 1.96 * se
  clt_upper <- mean(x) + 1.96 * se
  
  if(true_mean >= clt_lower && true_mean <= clt_upper) {
    clt_coverage_count <- clt_coverage_count + 1
  }
  
  # Bootstrap Confidence Interval
  boot_means <- replicate(B, mean(sample(x, size = n, replace = TRUE)))
  boot_ci <- quantile(boot_means, probs = c(0.025, 0.975))
  
  if(true_mean >= boot_ci[1] && true_mean <= boot_ci[2]) {
    boot_coverage_count <- boot_coverage_count + 1
  }
}

clt_coverage_rate <- clt_coverage_count / sims
boot_coverage_rate <- boot_coverage_count / sims

cat("Empirical Coverage of CLT Interval:", clt_coverage_rate * 100, "%\n")

## Empirical Coverage of CLT Interval: 90.1 %

cat("Empirical Coverage of Bootstrap Interval:", boot_coverage_rate * 100, "%\n")

## Empirical Coverage of Bootstrap Interval: 87.2 %

Conclusion: Given the small sample size (\(n=10\)) drawn from a right-skewed Chi-squared distribution, the Central Limit Theorem approximation struggles, usually resulting in under-coverage. The bootstrap interval handles the skewness slightly better and generally provides coverage closer to the nominal 95% level.

Question 2

Objective: Objective: The aim is to generate observations from the density \(p(x)\propto(2+\sin(5x)+\sin(2x)) \cdot \exp(-x^{2})\). Do this by setting up a Metropolis Hastings algorithm with transition density \(q(x,y)=\mathcal{N}(x-y,1)\).

set.seed(567)

# Defining the nonnormalized target density
p <- function(x) {
  (2 + sin(5*x) + sin(2*x)) * exp(-x^2)
}

N <- 10000
samples <- numeric(N)
samples[1] <- 0 # Initializing the chain

for(i in 2:N) {
  #  a new state y from a symmetric Normal distribution
  y <- rnorm(1, mean = samples[i-1], sd = 1)
  

  # Since the proposal is symmetric (q(x,y) = q(y,x)), it gets cancel out
  alpha <- min(1, p(y) / p(samples[i-1]))
  
  # Accept or reject
  if(runif(1) <= alpha) {
    samples[i] <- y
  } else {
    samples[i] <- samples[i-1]
  }
}

# Plotting the results
hist(samples, breaks = 50, probability = TRUE, col = "skyblue", border = "white",
     main = "Metropolis-Hastings Samples vs. Target Density", xlab = "x")


normalizing_constant <- integrate(p, lower = -Inf, upper = Inf)$value
curve(p(x) / normalizing_constant, add = TRUE, col = "red", lwd = 2)

legend("topright", legend = c("M-H Samples", "True Normalized Density"),
       fill = c("skyblue", NA), border = c("white", NA), col = c(NA, "red"), 
       lwd = c(NA, 2), bty = "n")

Question 3

Objective: Generate 100 data points from the model \(y=x+\epsilon\) with \(\epsilon\sim\mathcal{N}(0,0.5^{2})\) and is the subset of (0,1) with 100 equally spaced points starting at 0.01. Fit a kernel regression with normal kernel and bandwidth 0.25. Find the 5-fold cross-validation error. Repeat the process by changing the bandwidth from 0.02 to 0.5 in increments of 0.02 and find the optimal bandwidth. Plot the data overlayed with the kernel fit for the best choice of bandwidth. Also plot the CV-error as a function of bandwidth.

set.seed(123)

# Generate Data
n <- 100
x <- seq(0.01, 1, length.out = n)
epsilon <- rnorm(n, mean = 0, sd = 0.5)
y <- x + epsilon

# 5-fold CV
k <- 5
# Randomly assigning indices to folds
cv_lab <- sample(1:n, n, replace = FALSE) %% k 
h_seq <- seq(from = 0.02, to = 0.5, by = 0.02)
cv_err_h <- rep(NA, length(h_seq))

# Calculating CV Error for a specific bandwidth (h = 0.25)
h_specific <- 0.25
cv_err_specific <- 0

for(i_cv in 0:(k-1)) {
  w_val <- which(cv_lab == i_cv)
  X_tr <- x[-w_val]
  Y_tr <- y[-w_val]
  X_val <- x[w_val]
  Y_val <- y[w_val]
  
  fit <- ksmooth(x = X_tr, y = Y_tr, kernel = "normal", bandwidth = h_specific, x.points = X_val)
  cv_err_specific <- cv_err_specific + mean((Y_val[order(X_val)] - fit$y)^2, na.rm = TRUE)
}
cv_err_specific <- cv_err_specific / k
cat("5-fold CV Error for bandwidth 0.25 is:", cv_err_specific, "\n")

## 5-fold CV Error for bandwidth 0.25 is: 0.2258717

# Repeating the process across the full grid of bandwidths
for(i in 1:length(h_seq)) {
  h_using <- h_seq[i]
  cv_err <- 0
  
  for(i_cv in 0:(k-1)) {
    w_val <- which(cv_lab == i_cv)
    X_tr <- x[-w_val]
    Y_tr <- y[-w_val]
    X_val <- x[w_val]
    Y_val <- y[w_val]
    
    kernel_reg <- ksmooth(x = X_tr, y = Y_tr, kernel = "normal", bandwidth = h_using, x.points = X_val)
    cv_err <- cv_err + mean((Y_val[order(X_val)] - kernel_reg$y)^2, na.rm = TRUE)
  }
  cv_err_h[i] <- cv_err / k
}

# Finding Optimal Bandwidth
opt_h <- h_seq[which.min(cv_err_h)]
cat("The optimal bandwidth is:", opt_h, "\n")

## The optimal bandwidth is: 0.38

# Plotting the results
par(mfrow = c(1, 2))

# Plot 1: CV-error as a function of bandwidth
plot(h_seq, cv_err_h, type = "b", lwd = 2, col = "blue", pch = 16,
     xlab = "Smoothing Bandwidth (h)", ylab = "5-Fold CV Error",
     main = "CV Error vs Bandwidth")
abline(v = opt_h, col = "red", lty = 2, lwd = 2)

# Plot 2: Data overlayed with the optimal kernel fit
plot(x, y, pch = 20, col = "gray", 
     xlab = "x", ylab = "y", 
     main = paste("Kernel Fit (Optimal h =", opt_h, ")"))
fit_best <- ksmooth(x = x, y = y, kernel = "normal", bandwidth = opt_h, x.points = x)
lines(fit_best$x, fit_best$y, col = "red", lwd = 3)

Homework 4 Solutions

Somesh Biswas (BSD-CC-2420)

May 13, 2026

Question 1

Question 2

Question 3