Please submit your finished lab on Canvas as a knitted pdf or html document. The lab is due on Feb 18, 2024.

Section V: Student Tasks: (1) - (3)

Students will solve three major tasks in this lab. The first task is described below.

Task 1

Simulate R=1000 training datasets and for each iteration (or each simulated dataset), store the predicted test cases corresponding to inputs \(x_0=10,16\). For each simulated dataset, you must predict \(y_0\) using polynomial regression having degrees \(p=1,2,3,4,5\). You can easily solve this problem using a loop and calling on the two functions sim.training() and poly.predict().

Your final result will be three matrices. The first matrix mat.pred.1 is the collection of predicted test cases for each degree corresponding to input \(x_0=10\). Similarly, the matrix mat.pred.2 corresponds to \(x_0=16\). The first two matrices are dimension \((5 \times 1000)\). The third matrix y.test.mat is the collection of all test cases \(y_0\) for each simulated dataset. This matrix is dimension \((2 \times 1000)\). After completing this problem, display the first three columns of each matrix.

# Solution goes here -----------
set.seed(0) 

#sim.training()
sim.training <- function() {
  x <- runif(100, 0, 20)  
  y <- x^2 + rnorm(100)  
  return(list(x = x, y = y))
}

#poly.predict()
poly.predict <- function(x, y, degree, x0) {
  model <- lm(y ~ poly(x, degree, raw = TRUE))  
  predictions <- predict(model, newdata = data.frame(x = x0))  #predict
  return(predictions)
}

#initialzie
R <- 1000  
degrees <- 1:5  
x0_values <- c(10, 16)  

mat_pred_1 <- matrix(nrow = length(degrees), ncol = R)
mat_pred_2 <- matrix(nrow = length(degrees), ncol = R)
y_test_mat <- array(dim = c(length(degrees), 2, R))

#循环
for (i in 1:R) {
  data <- sim.training()  
  for (degree in degrees) {
    preds <- poly.predict(data$x, data$y, degree, x0_values)  
    mat_pred_1[degree, i] <- preds[1]  
    mat_pred_2[degree, i] <- preds[2]  
    y_test_mat[degree, , i] <- preds   
  }
}

#前三次
list(
  mat_pred_1 = mat_pred_1[, 1:3],
  mat_pred_2 = mat_pred_2[, 1:3],
  y_test_mat = y_test_mat[, , 1:3]
)
## $mat_pred_1
##           [,1]     [,2]     [,3]
## [1,] 128.82272 133.2706 130.9599
## [2,]  99.84459 100.1807 100.0964
## [3,]  99.84712 100.1912 100.0961
## [4,] 100.07018 100.2561 100.1353
## [5,] 100.08198 100.2549 100.1259
## 
## $mat_pred_2
##          [,1]     [,2]     [,3]
## [1,] 250.9843 261.8240 254.7054
## [2,] 255.8955 255.9032 256.0258
## [3,] 255.9188 255.6763 255.9873
## [4,] 255.7249 255.6167 255.9441
## [5,] 255.6879 255.6094 255.9949
## 
## $y_test_mat
## , , 1
## 
##           [,1]     [,2]
## [1,] 128.82272 250.9843
## [2,]  99.84459 255.8955
## [3,]  99.84712 255.9188
## [4,] 100.07018 255.7249
## [5,] 100.08198 255.6879
## 
## , , 2
## 
##          [,1]     [,2]
## [1,] 133.2706 261.8240
## [2,] 100.1807 255.9032
## [3,] 100.1912 255.6763
## [4,] 100.2561 255.6167
## [5,] 100.2549 255.6094
## 
## , , 3
## 
##          [,1]     [,2]
## [1,] 130.9599 254.7054
## [2,] 100.0964 256.0258
## [3,] 100.0961 255.9873
## [4,] 100.1353 255.9441
## [5,] 100.1259 255.9949

Task 2

The second task is to estimate three different quantities based on the simulation from Task (1). For each polynomial degree (\(p=1,2,3,4,5\)), use the matrices mat.pred.1, mat.pred.2 and y.test.mat to estimate:

After solving this problem, display the 6 vectors of interest.

Notes:

  1. When estimating the squared bias, students will use y.test.mat to estimate \(E\hat{f}(x_0)\) but will also call the function true.f() for computing \(f(x_0)\). Obviously in practice we never know the true relation \(f(x)\).

  2. To estimate \(MSE(x_0)\), you can slightly modify your loop from Task (1) or write a new program that computes \((y_0-\hat{f}(x_0))^2\) for all \(R=1000\) iterations. Then take the average of the resulting vectors.

# Solution goes here -----------
#假设f(x0)如SecIII
true.f <- function(x0) {
  return((x0 - 5) * (x0 - 12))
}

# 计算MSE, 方差和偏差的平方
calculate_task2 <- function(mat_pred, x0) {
  #的E[f̂(x0)]
  predicted_means <- apply(mat_pred, 1, mean)
  
  #MSE
  true_value <- true.f(x0)
  mse <- apply(mat_pred, 1, function(pred) mean((pred - true_value)^2))
  
  #VAR
  variance <- apply(mat_pred, 1, var)
  
  #squared^2
  bias_squared <- (predicted_means - true_value)^2
  
  return(list(MSE = mse, Variance = variance, Bias_squared = bias_squared))
}


results_x10 <- calculate_task2(mat_pred_1, 10)
results_x16 <- calculate_task2(mat_pred_2, 16)


results_list <- list(MSE_x10 = results_x10$MSE,
                     Variance_x10 = results_x10$Variance,
                     Bias_squared_x10 = results_x10$Bias_squared,
                     MSE_x16 = results_x16$MSE,
                     Variance_x16 = results_x16$Variance,
                     Bias_squared_x16 = results_x16$Bias_squared)

# 6个
print(results_list)
## $MSE_x10
## [1] 20508.91 12099.88 12099.88 12100.42 12100.73
## 
## $Variance_x10
## [1] 8.57971615 0.02176720 0.02202664 0.03596871 0.03651165
## 
## $Bias_squared_x10
## [1] 20500.34 12099.85 12099.86 12100.39 12100.70
## 
## $MSE_x16
## [1] 43756.04 44943.79 44941.56 44940.47 44940.13
## 
## $Variance_x16
## [1] 24.98840168  0.02141094  0.02958859  0.04487059  0.04854596
## 
## $Bias_squared_x16
## [1] 43731.08 44943.77 44941.53 44940.43 44940.08

Task 3

The third task requires students to construct a plots showing \(MSE(x_0)\), \(\text{Var}(\hat{f}(x_0))\) and \(\text{Bias}^2(\hat{f}(x_0))\) as a function of the polynomial degree. There should be two graphics corresponding to the two test cases \(x_0=10\) and \(x_0=16\).

# Solution goes here -----------
plot_metrics <- function(degrees, mse, variance, bias_sq, title) {
  plot(degrees, mse, type='o', col='blue', pch=16, ylab='Metrics', xlab='Degree of Polynomial', 
       main=paste('Metrics for', title), ylim=c(min(c(mse, variance, bias_sq)), max(c(mse, variance, bias_sq))))
  lines(degrees, variance, type='o', col='red', pch=17)
  lines(degrees, bias_sq, type='o', col='green', pch=18)
  legend('topright', legend=c('MSE', 'Variance', 'Bias^2'), col=c('blue', 'red', 'green'), pch=16:18)
}


degrees <- 1:5


mse_x10 <- results_x10$MSE
variance_x10 <- results_x10$Variance
bias_sq_x10 <- results_x10$Bias_squared

mse_x16 <- results_x16$MSE
variance_x16 <- results_x16$Variance
bias_sq_x16 <- results_x16$Bias_squared

#x0 = 10
plot_metrics(degrees, mse_x10, variance_x10, bias_sq_x10, 'x0 = 10')

#x0 = 16
plot_metrics(degrees, mse_x16, variance_x16, bias_sq_x16, 'x0 = 16')