Lab 1: DATA 609

Author

Heleine Fouda

Problem 1: Gradient Descent

#| label: load-libraries
library(MASS)  
set.seed(101317)  # For reproducibility
f <- function(x, y, a) {
  return(x^2 + a * y^2)
}

gradient <- function(x, y, a) {
  return(c(2 * x, 2 * a * y))
}
#| label: gradient-descent-function

gradient_descent <- function(a, alpha, max_iter = 100, tol = 1e-6) {
  x <- 1  # Initial x
  y <- 1  # Initial y
  history <- matrix(NA, nrow = max_iter, ncol = 2)  # iterates; rows stay NA after an early stop
  
  for (i in 1:max_iter) {
    grad <- gradient(x, y, a)
    x <- x - alpha * grad[1]
    y <- y - alpha * grad[2]
    
    history[i, ] <- c(x, y)
    
    if (sqrt(sum(grad^2)) < tol) {
      break
    }
  }
  
  return(list(final_x = x, final_y = y, history = history))
}
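
A quick sanity check of the routine (an illustrative call added here, not part of the original lab): with a = 1 and a moderate step size, the final iterate should sit essentially at the minimizer (0, 0).

check <- gradient_descent(a = 1, alpha = 0.1)
c(check$final_x, check$final_y)  # both should be ~0
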
#| label: learning-rate-experiment
# Sweep learning rates for a = 1 and a = 0.01, recording the final error
alpha_values <- seq(0.001, 1, length.out = 100)
errors_a1 <- numeric(length(alpha_values))
errors_a01 <- numeric(length(alpha_values))

for (i in seq_along(alpha_values)) {
  result_a1 <- gradient_descent(a = 1, alpha = alpha_values[i])
  result_a01 <- gradient_descent(a = 0.01, alpha = alpha_values[i])
  
  errors_a1[i] <- log10(sqrt(result_a1$final_x^2 + result_a1$final_y^2))
  errors_a01[i] <- log10(sqrt(result_a01$final_x^2 + result_a01$final_y^2))
}

#| label: plot-results
par(mfrow = c(1, 2))

plot(alpha_values, errors_a1, type = "l", col = "red", 
     xlab = "Learning Rate (alpha)", ylab = "Log10(Final Error)",
     main = "Final Error vs Learning Rate (a = 1)")

plot(alpha_values, errors_a01, type = "l", col = "blue", 
     xlab = "Learning Rate (alpha)", ylab = "Log10(Final Error)",
     main = "Final Error vs Learning Rate (a = 0.01)")

critical_alpha <- 0.1  # chosen from the a = 0.01 error curve: safely inside the stable region alpha < 1

result_critical <- gradient_descent(a = 0.01, alpha = critical_alpha)
x_vals <- seq(-1, 1, length.out = 100)
y_vals <- seq(-1, 1, length.out = 100)
z_vals <- outer(x_vals, y_vals, Vectorize(function(x, y) f(x, y, 0.01)))

par(mfrow = c(1, 1))  # reset the 1x2 layout before the standalone trajectory plot
contour(x_vals, y_vals, z_vals, nlevels = 10, xlab = "x", ylab = "y",
        main = "Gradient Descent Trajectory (a = 0.01)")
lines(result_critical$history[,1], result_critical$history[,2], col = "red", lwd = 2)
points(0, 0, col = "blue", pch = 19)

Problem 2: Solving Least Squares Problems

#| label: generate-data
# Random 20x10 matrix A and length-20 vector b with i.i.d. Gaussian entries
A <- matrix(rnorm(20 * 10), nrow = 20, ncol = 10)
b <- rnorm(20)

#| label: pseudoinverse-solution
# (a) Pseudoinverse solution: x = A^+ b
A_pinv <- ginv(A)  # Compute Moore-Penrose pseudoinverse
x_pinv <- A_pinv %*% b
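
As a cross-check (an added aside, assuming A has full column rank so the normal equations have a unique solution), the pseudoinverse solution should match solving A^T A x = A^T b directly:

x_ne <- solve(crossprod(A), crossprod(A, b))   # normal equations solution
all.equal(as.vector(x_ne), as.vector(x_pinv))  # expect TRUE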

#| label: builtin-least-squares
# (b) Built-in least squares solver; lm.fit() takes the design matrix directly
x_lm <- lm.fit(A, b)$coefficients

#| label: qr-solution
# (c) QR factorization: x = R^{-1} Q^T b
QR <- qr(A)
Q <- qr.Q(QR)  # Extract Q
R <- qr.R(QR)  # Extract R
x_qr <- solve(R, t(Q) %*% b)
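
Since R is upper triangular, the dedicated triangular solver backsolve() is the idiomatic alternative here; this equivalent one-liner is an optional aside:

x_qr_alt <- backsolve(R, crossprod(Q, b))  # exploits the triangular structure of R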

#| label: coerce-to-vectors
# Coerce all solutions to plain numeric vectors; lm.fit() returns a *named*
# vector, and all.equal() reports the name mismatch unless names are stripped.
x_pinv_vec <- as.vector(x_pinv)
x_qr_vec <- as.vector(x_qr)
x_lm_vec <- unname(x_lm)
#| label: verify-solutions
# (d) Verify the three solutions agree to numerical tolerance

equal_pinv_lm <- all.equal(x_pinv_vec, x_lm_vec)
equal_lm_qr <- all.equal(x_lm_vec, x_qr_vec)
# Residuals r = A x - b; for a least squares solution r is orthogonal to col(A)
residuals_pinv <- A %*% x_pinv - b
residuals_lm <- A %*% x_lm - b
residuals_qr <- A %*% x_qr - b
# Orthogonality check: r' (A x) should vanish up to machine precision
orthogonality_pinv <- t(residuals_pinv) %*% (A %*% x_pinv)
orthogonality_lm <- t(residuals_lm) %*% (A %*% x_lm)
orthogonality_qr <- t(residuals_qr) %*% (A %*% x_qr)
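
A slightly stronger check (an added aside): the normal equations A^T (A x - b) = 0 say the residual is orthogonal to every column of A, not only to the fitted vector A x:

max(abs(crossprod(A, residuals_qr)))  # should be near machine epsilon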

#| label: print-results
list(
  solutions_equal = list(equal_pinv_lm, equal_lm_qr),
  orthogonality_pinv = orthogonality_pinv,
  orthogonality_lm = orthogonality_lm,
  orthogonality_qr = orthogonality_qr
)
$solutions_equal
$solutions_equal[[1]]
[1] TRUE

$solutions_equal[[2]]
[1] TRUE


$orthogonality_pinv
             [,1]
[1,] 1.035501e-15

$orthogonality_lm
              [,1]
[1,] -6.896032e-16

$orthogonality_qr
              [,1]
[1,] -9.018485e-16

Problem 3: Iterative Solutions to Least Squares: Richardson Iteration

# Fresh random least squares problem: 20x10 Gaussian A, length-20 Gaussian b
A <- matrix(rnorm(20 * 10), nrow = 20, ncol = 10)
b <- rnorm(20)

#| label: exact-solution
# Exact least squares solution via the pseudoinverse
x_exact <- ginv(A) %*% b
x_exact
              [,1]
 [1,] -0.497875053
 [2,] -0.394771337
 [3,] -0.669152261
 [4,]  0.094448027
 [5,]  0.306222311
 [6,]  0.136908920
 [7,] -0.105711833
 [8,] -0.006290893
 [9,]  0.737834607
[10,]  0.237205856
# Step size mu = 1 / ||A||_F^2; since ||A||_F >= sigma_max(A), this is a
# conservative choice that guarantees the iteration below converges.
A_norm <- norm(A, type = "F")  # Frobenius norm of A
mu <- 1 / A_norm^2
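
A convergence sanity check (an added aside): Richardson iteration on the least squares problem converges when mu < 2 / sigma_max(A)^2, and ||A||_F >= sigma_max(A), so the Frobenius-based step is always on the safe side:

sigma_max <- max(svd(A)$d)  # largest singular value of A
mu < 2 / sigma_max^2        # expect TRUE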


#| label: initialize-iteration
# Initialize x^(0) = 0 and set iteration parameters
x_k <- rep(0, 10)  # Initial guess (vector of zeros)
max_iter <- 500    # Number of iterations
errors <- numeric(max_iter)  # Store errors ||x^(k) - x||
for (k in 1:max_iter) {
  x_k <- x_k - mu * t(A) %*% (A %*% x_k - b)  # Richardson update
  errors[k] <- norm(x_k - x_exact, type = "2")  # Compute error
}
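
For reference (an added aside, standard linear-iteration analysis): the error satisfies e[k+1] = (I - mu * A^T A) e[k], so along each right singular vector of A it shrinks by the factor |1 - mu * sigma_i^2| per step, and the log-error curve should settle into a straight line with slope log10(max_i |1 - mu * sigma_i^2|):

sig <- svd(A)$d
log10(max(abs(1 - mu * sig^2)))  # predicted asymptotic slope per iteration
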
#| label: plot-error-decay
plot(1:max_iter, log10(errors), type = "l", col = "blue", lwd = 2,
     xlab = "Iteration (k)", ylab = "log10(||x^(k) - x||)",
     main = "Convergence of Richardson Iteration")
grid()