Topic: Gradient Descent for Unconstrained Minimization

This document implements gradient descent for five optimization problems as specified, using a fixed step size, stopping when the gradient norm satisfies \(|\nabla f(x)| < \varepsilon\), or after a maximum number of iterations. For each problem, we provide the code, the final solution, the loss vs. iteration plot, and a 2D visualization of the gradient descent path.

Problem 1: Linear Regression Loss

Function:

\(f(\beta) = \frac{1}{2n} \sum_{i=1}^n (y_i - \mathbf{x}_i^\top \beta)^2\)

Details:

  • Generate \(n=100\) paired \((x, y)\) where \(x \sim N(1, 2)\), \(y \sim N(2 + 3x, 5)\).
  • Initial point: \(\beta_0 = [0, 0]\)
  • Step size: \(\eta = 0.01\)
  • Threshold: \(\varepsilon = 10^{-6}\).
n <- 100
x <- rnorm(n, mean = 1, sd = sqrt(2))
y <- rnorm(n, mean = 2 + 3*x, sd = sqrt(5))
data <- data.frame(x, y)

eta <- 0.01       # step size
epsilon <- 1e-6   # threshold
beta <- c(0, 0)   # initial point
max_iter <- 10000

# Gradient computation
gradient_comp <- function(x, y, beta, n) {
  beta0 <- beta[1]
  beta1 <- beta[2]
  predictions <- beta0 + beta1 * x
  errors <- y - predictions
  grad_beta0 <- -sum(errors) / n
  grad_beta1 <- -sum(errors * x) / n
  c(grad_beta0, grad_beta1)
}

# Loss computation
loss_comp <- function(x, y, beta, n) {
  predictions <- beta[1] + beta[2] * x
  sum((y - predictions)^2) / (2 * n)
}

iteration <- 0
loss_history <- numeric()
beta_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
beta_history[1, ] <- beta
converged <- FALSE

while (iteration < max_iter) {
  grad <- gradient_comp(x, y, beta, n)
  grad_norm <- sqrt(sum(grad^2))
  loss <- loss_comp(x, y, beta, n)
  loss_history <- c(loss_history, loss)
  
  if (grad_norm < epsilon) {
    converged <- TRUE
    break
  }
  
  beta <- beta - eta * grad
  iteration <- iteration + 1
  beta_history[iteration + 1, ] <- beta
}

# Trim beta_history
beta_history <- beta_history[1:(iteration + 1), ]

# Print results
cat("Solution: beta = [", beta[1], ", ", beta[2], "]\n")
## Solution: beta = [ 2.062891 ,  2.990398 ]
cat("Number of iterations:", iteration, "\n")
## Number of iterations: 1998
cat("Final loss:", loss_history[length(loss_history)], "\n")
## Final loss: 3.240448
cat("Converged:", converged, "\n")
## Converged: TRUE
# Plot loss vs iteration
plot(1:length(loss_history), loss_history, type = "l", 
     xlab = "Iteration", ylab = "Loss", 
     main = "Loss vs Iteration: Problem 1",
     col = "red", lwd = 2)
grid()

# Contour plot for gradient descent path
beta0_range <- seq(-2, 6, length.out = 100)
beta1_range <- seq(0, 5, length.out = 100)
loss_grid <- matrix(NA, nrow = length(beta0_range), ncol = length(beta1_range))
for (i in 1:length(beta0_range)) {
  for (j in 1:length(beta1_range)) {
    beta_temp <- c(beta0_range[i], beta1_range[j])
    loss_grid[i, j] <- loss_comp(x, y, beta_temp, n)
  }
}

contour(beta0_range, beta1_range, loss_grid, 
        xlab = expression(beta[0]), ylab = expression(beta[1]), 
        main = "Gradient Descent Path: Problem 1", nlevels = 50)
lines(beta_history[, 1], beta_history[, 2], col = "red", lwd = 2)
points(beta[1], beta[2], col = "green", pch = 19)

Problem 2: Logistic Regression Negative Log-Likelihood

Function:

\[f(\beta) = \sum_{i=1}^n \log(1 + \exp(-y_i \mathbf{x}i^\top \beta))\]

Gradient:

\[\nabla f(\beta) = -\sum_{i=1}^n y_i \mathbf{x}_i (1 - \sigma(y_i \mathbf{x}_i^\top \beta)), \ where\ \sigma(z) = \frac{1}{1 + e^{-z}}\]

Details:

  • Data: Provided in logistic_data.csv
  • Initial point: \(\beta_0 = [0, 0, 0]\)
  • Step size: \(\eta = 0.05\)
  • Threshold: \(\varepsilon = 10^{-5}\)
data <- read.csv("/Users/rajveervora/R Programs/Sem 2/Optimization/Assignment/logistic_data.csv")
x1 <- data$x1
x2 <- data$x2
y <- data$y
n <- length(y)

# Create matrix X (n x 3) with intercept
X <- matrix(c(rep(1, n), x1, x2), nrow = n, ncol = 3)

# Sigmoid function
sigmoid <- function(z) {
  1 / (1 + exp(-z))
}

# Negative log-likelihood
neg_loglik <- function(X, y, beta) {
  z <- X %*% beta
  sigma_z <- sigmoid(z)
  sigma_z <- pmax(pmin(sigma_z, 1 - 1e-10), 1e-10)
  sum(-y * log(sigma_z) - (1 - y) * log1p(-sigma_z))
}

# Gradient
gradient <- function(X, y, beta) {
  z <- X %*% beta
  sigma_z <- sigmoid(z)
  errors <- sigma_z - y
  grad <- t(errors) %*% X
  as.vector(grad)
}

# Gradient descent
beta <- c(0, 0, 0)
eta <- 0.05
max_iter <- 10000
loss_history <- numeric()
beta_history <- matrix(beta, nrow = 1, ncol = 3)

for (iter in 1:max_iter) {
  grad <- gradient(X, y, beta)
  loss <- neg_loglik(X, y, beta)
  loss_history <- c(loss_history, loss)
  
  beta_new <- beta - eta * grad
  beta_history <- rbind(beta_history, beta_new)
  
  if (iter > 1 && sqrt(sum((beta_new - beta)^2)) < 1e-5) break
  beta <- beta_new
}

# Print results
cat("Solution: beta = [", beta[1], ", ", beta[2], ", ", beta[3], "]\n")
## Solution: beta = [ -0.3751797 ,  1.238195 ,  -1.839831 ]
cat("Final loss:", loss_history[length(loss_history)], "\n")
## Final loss: 45.30311
cat("Number of iterations:", length(loss_history), "\n")
## Number of iterations: 37
# Plot loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Negative Log-Likelihood",
     main = "Loss vs Iteration: Problem 2")
grid()

# 3D Scatter Plot
library(rgl)
plot3d(beta_history[, 1], beta_history[, 2], beta_history[, 3], 
       type = "l", col = "red", lwd = 2,
       xlab = "beta0", ylab = "beta1", zlab = "beta2",
       main = "Gradient Descent Path in 3D")
points3d(beta_history[1, 1], beta_history[1, 2], beta_history[1, 3], 
         col = "blue", size = 8)
points3d(beta_history[nrow(beta_history), 1], beta_history[nrow(beta_history), 2], 
         beta_history[nrow(beta_history), 3], col = "green", size = 8)
# to see the plot
rglwidget()

Problem 3: Quadratic Convex Function

Function:

\[ f(x) = x^\top A x + b^\top x \ \ \ \ with\ \ A = \begin{bmatrix} 2 & 0 \\ 0 & 4 \end{bmatrix} b = \begin{bmatrix} -4 \\ -8 \end{bmatrix} \]

Details:

  • Initial point: \(x_0 = [1, 1]\)
  • Step size: \(\eta = 0.1\)
  • Threshold: \(\varepsilon = 10^{-6}\)
f <- function(x){
  2*(x[1]^2) + 4*(x[2]^2) - 4*x[1] - 8*x[2]
}

gradient_comp <- function(x){
  c(4 * x[1] - 4, 8 * x[2] - 8)
}

eta <- 0.1
eps <- 1e-6
max_iter <- 10000
iter <- 0
x <- c(0,0)

x_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
loss_history <- numeric(max_iter + 1)
x_history[1, ] <- x
loss_history[1] <- f(x)

while(iter < max_iter){
  grad <- gradient_comp(x)
  grad_norm <- sqrt(sum(grad^2))
  
  if(grad_norm < eps) break
  x <- x - eta * grad
  iter <- iter + 1
  
  x_history[iter + 1, ] <- x
  loss_history[iter + 1] <- f(x)
}

# Truncate history to actual number of iterations
x_history <- x_history[1:(iter + 1), ]
loss_history <- loss_history[1:(iter + 1)]

cat("Solution: x =", x, "\n")
## Solution: x = 0.9999998 1
cat("Number of iterations:", iter, "\n")
## Number of iterations: 30
cat("Gradient norm at solution:", grad_norm, "\n")
## Gradient norm at solution: 8.842957e-07
# Plot: loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Negative Log-Likelihood",
     main = "Loss vs Iteration: Problem 3")
grid()

# Plot: Gradient Descent Path with Contour
# Create a grid for the contour plot
x1 <- seq(-1, 2, length.out = 100)
x2 <- seq(-1, 2, length.out = 100)
z <- matrix(NA, nrow = length(x1), ncol = length(x2))
for (i in 1:length(x1)) {
  for (j in 1:length(x2)) {
    z[i, j] <- f(c(x1[i], x2[j]))
  }
}

# Plot contour and the gradient descent path
contour(x1, x2, z, xlab = "x1", ylab = "x2", main = "Gradient Descent Path")
points(x_history[, 1], x_history[, 2], col = "red", pch = 16)  # Points
lines(x_history[, 1], x_history[, 2], col = "red")  # Path

Problem 4: Normal Distribution MLE (Negative Log-Likelihood)

Function:

\[ f(\mu, \sigma) = \sum_{i=1}^n \log(\sigma) + \frac{(x_i - \mu)^2}{2 \sigma^2} \]

Details:

  • Data: Provided in normal_mle.csv
  • Initial point: \(\mu_0 = 0\), \(\sigma_0 = 1\)
  • Step size: \(\eta = 0.01\)
  • Threshold: \(\varepsilon = 10^{-5}\)
x <- read.csv("/Users/rajveervora/R Programs/Sem 2/Optimization/Assignment/normal_mle.csv")
x <- x$x
n <- length(x)

para <- c(mu = 0, sigma = 1)
eta <- 0.01
eps <- 1e-5
max_iter <- 10000
iter <- 0

f <- function(mu, sigma){
  n*log(sigma) + (sum(x-mu))^2 / (2 * sigma^2)
}

gradient_comp <- function(mu, sigma) {
  grad_mu <- -(1 / sigma^2) * sum(x - mu)
  grad_sigma <- (n / sigma) - (1 / sigma^3) * sum((x - mu)^2)
  return(c(grad_mu, grad_sigma))
}

para_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
loss_history <- numeric(max_iter + 1)
para_history[1, ] <- para
loss_history[1] <- f(para[1], para[2])


while(iter < max_iter){
  grad <- gradient_comp(para[1], para[2])
  grad_norm <- sqrt(sum(grad^2))
  
  if(grad_norm <= eps) break
  para <- para - eta * grad
  
  # ensuring variance remains positive
  if(para[2] <= 0) para[2] <- 1e-5
  iter <- iter + 1
  
  para_history[iter, ] <- para
  loss_history[iter] <- f(para[1], para[2])
}

cat("Solution: mu =", para[1], ", sigma =", para[2], "\n")
## Solution: mu = 5.013222 , sigma = 2.093897
cat("Number of iterations:", iter, "\n")
## Number of iterations: 473
cat("Gradient norm at solution:", grad_norm, "\n")
## Gradient norm at solution: 8.566335e-06
# Truncate history to actual number of iterations
para_history <- para_history[1:(iter + 1), ]
loss_history <- loss_history[1:(iter + 1)]


# plot: loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Negative Log-Likelihood",
     main = "Loss vs Iteration: Problem 4")
grid()

# plot: Gradient Descent Path with Contour
mu_range <- seq(4, 6, length.out = 100)
sigma_range <- seq(1, 3, length.out = 100)
z <- matrix(NA, nrow = length(mu_range), ncol = length(sigma_range))
for (i in 1:length(mu_range)) {
  for (j in 1:length(sigma_range)) {
    z[i, j] <- f(mu_range[i], sigma_range[j])
  }
}

# Plot the contour and the gradient descent path
contour(mu_range, sigma_range, z, xlab = "mu", ylab = "sigma", main = "Gradient Descent Path: Problem 4")
points(para_history[, 1], para_history[, 2], col = "red", pch = 16)  # Points
lines(para_history[, 1], para_history[, 2], col = "red")  # Path

Problem 5: Rosenbrock Function (Non-Convex)

Function:

\[f(x, y) = (1 - x)^2 + 100 (y - x^2)^2\]

Gradient:

\[\nabla f(x, y) = \begin{bmatrix} -2(1 - x) - 400 x (y - x^2) \\ 200 (y - x^2) \end{bmatrix} \]

Details:

  • Initial point: \([x_0, y_0] = [-1, 1]\)
  • Step size: \(\eta = 0.001\)
  • Threshold: \(\varepsilon = 10^{-6}\)
para <- c(-1, 1)
eta <- 0.001
eps <- 1e-6
max_iter <- 100000
iter <- 0

f <- function(x, y) {
  (1 - x)^2 + 100 * (y - x^2)^2
}

gradient_comp <- function(x, y) {
  grad_x <- -2 * (1 - x) - 400 * x * (y - x^2)
  grad_y <- 200 * (y - x^2)
  c(grad_x, grad_y)
}

para_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
loss_history <- numeric(max_iter + 1)
para_history[1, ] <- para
loss_history[1] <- f(para[1], para[2])

while (iter < max_iter) {
  grad <- gradient_comp(para[1], para[2])
  grad_norm <- sqrt(sum(grad^2))
  if (grad_norm < eps) break
  para <- para - eta * grad
  iter <- iter + 1
  para_history[iter + 1, ] <- para
  loss_history[iter + 1] <- f(para[1], para[2])
}

# Truncate history
para_history <- para_history[1:(iter + 1), ]
loss_history <- loss_history[1:(iter + 1)]

# Print results
cat("Solution: x =", para[1], ", y =", para[2], "\n")
## Solution: x = 0.9999989 , y = 0.9999978
cat("Number of iterations:", iter, "\n")
## Number of iterations: 32035
cat("Gradient norm at solution:", grad_norm, "\n")
## Gradient norm at solution: 9.998616e-07
cat("Function value at solution:", f(para[1], para[2]), "\n")
## Function value at solution: 1.251652e-12
# Plot loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Loss",
     main = "Loss vs Iteration: Problem 5")
grid()

# Contour plot
x_range <- seq(-1.5, 1.5, length.out = 100)
y_range <- seq(-0.5, 2, length.out = 100)
z <- matrix(NA, nrow = length(x_range), ncol = length(y_range))
for (i in 1:length(x_range)) {
  for (j in 1:length(y_range)) {
    z[i, j] <- f(x_range[i], y_range[j])
  }
}

contour(x_range, y_range, z, xlab = "x", ylab = "y", 
        main = "Gradient Descent Path: Problem 5", nlevels = 50)
points(para_history[, 1], para_history[, 2], col = "red", pch = 16)
lines(para_history[, 1], para_history[, 2], col = "red")
points(1, 1, col = "green", pch = 19)