This document implements gradient descent for five optimization problems as specified, using a fixed step size and stopping either when the gradient norm satisfies \(\|\nabla f(x)\| < \varepsilon\) or after a maximum number of iterations. For each problem, we provide the code, the final solution, the loss-vs-iteration plot, and a visualization of the gradient descent path (2D contours, or a 3D path for Problem 2).
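All five problems instantiate the same template: a fixed-step update \(x \leftarrow x - \eta \nabla f(x)\) with a gradient-norm stopping rule. The following minimal sketch shows the shared structure (`f` and `grad_f` are placeholder arguments for illustration, not functions from the assignment):
# Generic fixed-step gradient descent (illustrative template only)
gradient_descent <- function(f, grad_f, x0, eta = 0.01, eps = 1e-6, max_iter = 10000) {
  x <- x0
  for (iter in seq_len(max_iter)) {
    g <- grad_f(x)
    if (sqrt(sum(g^2)) < eps) break  # stop when ||grad f(x)|| < eps
    x <- x - eta * g                 # fixed-step update
  }
  list(x = x, iterations = iter, value = f(x))
}
# Example: minimizing f(x) = sum(x^2), whose gradient is 2x
# gradient_descent(function(x) sum(x^2), function(x) 2 * x, c(3, -4))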
\(f(\beta) = \frac{1}{2n} \sum_{i=1}^n (y_i - \mathbf{x}_i^\top \beta)^2\)
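Its gradient, which the code evaluates componentwise for the intercept and slope (taking \(\mathbf{x}_i = (1, x_i)^\top\)), is
\[\nabla f(\beta) = -\frac{1}{n} \sum_{i=1}^n (y_i - \mathbf{x}_i^\top \beta)\, \mathbf{x}_i\]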
n <- 100
x <- rnorm(n, mean = 1, sd = sqrt(2))
y <- rnorm(n, mean = 2 + 3*x, sd = sqrt(5))
data <- data.frame(x, y)
eta <- 0.01 # step size
epsilon <- 1e-6 # threshold
beta <- c(0, 0) # initial point
max_iter <- 10000
# Gradient computation
gradient_comp <- function(x, y, beta, n) {
  beta0 <- beta[1]
  beta1 <- beta[2]
  predictions <- beta0 + beta1 * x
  errors <- y - predictions
  grad_beta0 <- -sum(errors) / n
  grad_beta1 <- -sum(errors * x) / n
  c(grad_beta0, grad_beta1)
}
# Loss computation
loss_comp <- function(x, y, beta, n) {
  predictions <- beta[1] + beta[2] * x
  sum((y - predictions)^2) / (2 * n)
}
iteration <- 0
loss_history <- numeric()
beta_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
beta_history[1, ] <- beta
converged <- FALSE
while (iteration < max_iter) {
  grad <- gradient_comp(x, y, beta, n)
  grad_norm <- sqrt(sum(grad^2))
  loss <- loss_comp(x, y, beta, n)
  loss_history <- c(loss_history, loss)
  if (grad_norm < epsilon) {
    converged <- TRUE
    break
  }
  beta <- beta - eta * grad
  iteration <- iteration + 1
  beta_history[iteration + 1, ] <- beta
}
# Trim beta_history
beta_history <- beta_history[1:(iteration + 1), ]
# Print results
cat("Solution: beta = [", beta[1], ", ", beta[2], "]\n")
## Solution: beta = [ 2.062891 , 2.990398 ]
cat("Number of iterations:", iteration, "\n")
## Number of iterations: 1998
cat("Final loss:", loss_history[length(loss_history)], "\n")
## Final loss: 3.240448
cat("Converged:", converged, "\n")
## Converged: TRUE
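As a quick check (an addition, not part of the original output), the closed-form least-squares fit from lm() should agree with the gradient descent solution to within the tolerance:
# Closed-form OLS fit; coefficients should match beta above
coef(lm(y ~ x, data = data))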
# Plot loss vs iteration
plot(1:length(loss_history), loss_history, type = "l",
     xlab = "Iteration", ylab = "Loss",
     main = "Loss vs Iteration: Problem 1",
     col = "red", lwd = 2)
grid()
# Contour plot for gradient descent path
beta0_range <- seq(-2, 6, length.out = 100)
beta1_range <- seq(0, 5, length.out = 100)
loss_grid <- matrix(NA, nrow = length(beta0_range), ncol = length(beta1_range))
for (i in 1:length(beta0_range)) {
  for (j in 1:length(beta1_range)) {
    beta_temp <- c(beta0_range[i], beta1_range[j])
    loss_grid[i, j] <- loss_comp(x, y, beta_temp, n)
  }
}
contour(beta0_range, beta1_range, loss_grid,
        xlab = expression(beta[0]), ylab = expression(beta[1]),
        main = "Gradient Descent Path: Problem 1", nlevels = 50)
lines(beta_history[, 1], beta_history[, 2], col = "red", lwd = 2)
points(beta[1], beta[2], col = "green", pch = 19)
\[f(\beta) = \sum_{i=1}^n \log(1 + \exp(-y_i \mathbf{x}_i^\top \beta))\]
\[\nabla f(\beta) = -\sum_{i=1}^n y_i \mathbf{x}_i \left(1 - \sigma(y_i \mathbf{x}_i^\top \beta)\right), \quad \text{where } \sigma(z) = \frac{1}{1 + e^{-z}},\]
for labels \(y_i \in \{-1, +1\}\). The code below works with the equivalent \(\{0, 1\}\)-label form of this objective, the negative log-likelihood, whose gradient is \(X^\top(\sigma(X\beta) - y)\).
data <- read.csv("/Users/rajveervora/R Programs/Sem 2/Optimization/Assignment/logistic_data.csv")
x1 <- data$x1
x2 <- data$x2
y <- data$y
n <- length(y)
# Create matrix X (n x 3) with intercept
X <- matrix(c(rep(1, n), x1, x2), nrow = n, ncol = 3)
# Sigmoid function
sigmoid <- function(z) {
  1 / (1 + exp(-z))
}
# Negative log-likelihood
neg_loglik <- function(X, y, beta) {
  z <- X %*% beta
  sigma_z <- sigmoid(z)
  # Clip probabilities away from 0 and 1 to avoid log(0)
  sigma_z <- pmax(pmin(sigma_z, 1 - 1e-10), 1e-10)
  sum(-y * log(sigma_z) - (1 - y) * log1p(-sigma_z))
}
# Gradient
gradient <- function(X, y, beta) {
  z <- X %*% beta
  sigma_z <- sigmoid(z)
  errors <- sigma_z - y
  grad <- t(errors) %*% X
  as.vector(grad)
}
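As an optional sanity check (not part of the original assignment), the analytic gradient can be compared against a central finite-difference approximation of the loss; the helper fd_gradient below is introduced here purely for illustration:
# Optional check: finite differences should closely match gradient(X, y, beta)
fd_gradient <- function(beta, h = 1e-6) {
  sapply(seq_along(beta), function(k) {
    e <- replace(numeric(length(beta)), k, h)
    (neg_loglik(X, y, beta + e) - neg_loglik(X, y, beta - e)) / (2 * h)
  })
}
# e.g. max(abs(fd_gradient(c(0.1, -0.2, 0.3)) - gradient(X, y, c(0.1, -0.2, 0.3))))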
# Gradient descent
beta <- c(0, 0, 0)
eta <- 0.05
max_iter <- 10000
loss_history <- numeric()
beta_history <- matrix(beta, nrow = 1, ncol = 3)
for (iter in 1:max_iter) {
  grad <- gradient(X, y, beta)
  loss <- neg_loglik(X, y, beta)
  loss_history <- c(loss_history, loss)
  beta_new <- beta - eta * grad
  beta_history <- rbind(beta_history, beta_new)
  # Stop when the parameter update becomes negligible
  # (rather than on the gradient norm, as in the other problems)
  if (iter > 1 && sqrt(sum((beta_new - beta)^2)) < 1e-5) break
  beta <- beta_new
}
# Print results
cat("Solution: beta = [", beta[1], ", ", beta[2], ", ", beta[3], "]\n")
## Solution: beta = [ -0.3751797 , 1.238195 , -1.839831 ]
cat("Final loss:", loss_history[length(loss_history)], "\n")
## Final loss: 45.30311
cat("Number of iterations:", length(loss_history), "\n")
## Number of iterations: 37
# Plot loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Negative Log-Likelihood",
     main = "Loss vs Iteration: Problem 2")
grid()
# 3D Scatter Plot
library(rgl)
plot3d(beta_history[, 1], beta_history[, 2], beta_history[, 3],
       type = "l", col = "red", lwd = 2,
       xlab = "beta0", ylab = "beta1", zlab = "beta2",
       main = "Gradient Descent Path in 3D")
points3d(beta_history[1, 1], beta_history[1, 2], beta_history[1, 3],
         col = "blue", size = 8)
points3d(beta_history[nrow(beta_history), 1], beta_history[nrow(beta_history), 2],
         beta_history[nrow(beta_history), 3], col = "green", size = 8)
# Render the interactive rgl scene in the knitted output
rglwidget()
\[ f(x) = x^\top A x + b^\top x, \quad \text{with } A = \begin{bmatrix} 2 & 0 \\ 0 & 4 \end{bmatrix}, \quad b = \begin{bmatrix} -4 \\ -8 \end{bmatrix} \]
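Since \(A\) is symmetric, \(\nabla(x^\top A x) = 2Ax\), so
\[\nabla f(x) = 2Ax + b = \begin{bmatrix} 4x_1 - 4 \\ 8x_2 - 8 \end{bmatrix},\]
which vanishes at \(x^\star = (1, 1)^\top\). The Hessian \(2A\) has eigenvalues 4 and 8, so a fixed step converges whenever \(\eta < 2/8 = 0.25\); the choice \(\eta = 0.1\) below satisfies this bound.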
f <- function(x){
  2*(x[1]^2) + 4*(x[2]^2) - 4*x[1] - 8*x[2]
}
gradient_comp <- function(x){
  c(4 * x[1] - 4, 8 * x[2] - 8)
}
eta <- 0.1
eps <- 1e-6
max_iter <- 10000
iter <- 0
x <- c(0,0)
x_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
loss_history <- numeric(max_iter + 1)
x_history[1, ] <- x
loss_history[1] <- f(x)
while(iter < max_iter){
  grad <- gradient_comp(x)
  grad_norm <- sqrt(sum(grad^2))
  if(grad_norm < eps) break
  x <- x - eta * grad
  iter <- iter + 1
  x_history[iter + 1, ] <- x
  loss_history[iter + 1] <- f(x)
}
# Truncate history to actual number of iterations
x_history <- x_history[1:(iter + 1), ]
loss_history <- loss_history[1:(iter + 1)]
cat("Solution: x =", x, "\n")
## Solution: x = 0.9999998 1
cat("Number of iterations:", iter, "\n")
## Number of iterations: 30
cat("Gradient norm at solution:", grad_norm, "\n")
## Gradient norm at solution: 8.842957e-07
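Since the quadratic's minimizer solves \(2Ax + b = 0\), it can also be computed in closed form as a check (an addition, not in the original output):
# Closed-form minimizer of the quadratic: solve 2 A x = -b
A <- matrix(c(2, 0, 0, 4), nrow = 2)
b <- c(-4, -8)
solve(2 * A, -b) # should return (1, 1)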
# Plot: loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Loss",
     main = "Loss vs Iteration: Problem 3")
grid()
# Plot: Gradient Descent Path with Contour
# Create a grid for the contour plot
x1 <- seq(-1, 2, length.out = 100)
x2 <- seq(-1, 2, length.out = 100)
z <- matrix(NA, nrow = length(x1), ncol = length(x2))
for (i in 1:length(x1)) {
  for (j in 1:length(x2)) {
    z[i, j] <- f(c(x1[i], x2[j]))
  }
}
# Plot contour and the gradient descent path
contour(x1, x2, z, xlab = "x1", ylab = "x2", main = "Gradient Descent Path: Problem 3")
points(x_history[, 1], x_history[, 2], col = "red", pch = 16) # Points
lines(x_history[, 1], x_history[, 2], col = "red") # Path
\[ f(\mu, \sigma) = \sum_{i=1}^n \left[ \log(\sigma) + \frac{(x_i - \mu)^2}{2 \sigma^2} \right] \]
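Differentiating with respect to each parameter gives the gradient implemented below:
\[\frac{\partial f}{\partial \mu} = -\frac{1}{\sigma^2} \sum_{i=1}^n (x_i - \mu), \qquad \frac{\partial f}{\partial \sigma} = \frac{n}{\sigma} - \frac{1}{\sigma^3} \sum_{i=1}^n (x_i - \mu)^2\]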
x <- read.csv("/Users/rajveervora/R Programs/Sem 2/Optimization/Assignment/normal_mle.csv")
x <- x$x
n <- length(x)
para <- c(mu = 0, sigma = 1)
eta <- 0.01
eps <- 1e-5
max_iter <- 10000
iter <- 0
f <- function(mu, sigma){
  # Negative log-likelihood of N(mu, sigma^2), up to an additive constant
  n * log(sigma) + sum((x - mu)^2) / (2 * sigma^2)
}
gradient_comp <- function(mu, sigma) {
  grad_mu <- -(1 / sigma^2) * sum(x - mu)
  grad_sigma <- (n / sigma) - (1 / sigma^3) * sum((x - mu)^2)
  return(c(grad_mu, grad_sigma))
}
para_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
loss_history <- numeric(max_iter + 1)
para_history[1, ] <- para
loss_history[1] <- f(para[1], para[2])
while(iter < max_iter){
  grad <- gradient_comp(para[1], para[2])
  grad_norm <- sqrt(sum(grad^2))
  if(grad_norm <= eps) break
  para <- para - eta * grad
  # ensure sigma remains positive
  if(para[2] <= 0) para[2] <- 1e-5
  iter <- iter + 1
  para_history[iter + 1, ] <- para
  loss_history[iter + 1] <- f(para[1], para[2])
}
cat("Solution: mu =", para[1], ", sigma =", para[2], "\n")
## Solution: mu = 5.013222 , sigma = 2.093897
cat("Number of iterations:", iter, "\n")
## Number of iterations: 473
cat("Gradient norm at solution:", grad_norm, "\n")
## Gradient norm at solution: 8.566335e-06
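For this problem the MLE also has a closed form, the sample mean and the uncorrected standard deviation, so the result can be checked directly (an added verification, not in the original output):
# Closed-form MLE for comparison with the gradient descent solution
c(mu_hat = mean(x), sigma_hat = sqrt(mean((x - mean(x))^2)))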
# Truncate history to actual number of iterations
para_history <- para_history[1:(iter + 1), ]
loss_history <- loss_history[1:(iter + 1)]
# plot: loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Negative Log-Likelihood",
     main = "Loss vs Iteration: Problem 4")
grid()
# plot: Gradient Descent Path with Contour
mu_range <- seq(4, 6, length.out = 100)
sigma_range <- seq(1, 3, length.out = 100)
z <- matrix(NA, nrow = length(mu_range), ncol = length(sigma_range))
for (i in 1:length(mu_range)) {
  for (j in 1:length(sigma_range)) {
    z[i, j] <- f(mu_range[i], sigma_range[j])
  }
}
# Plot the contour and the gradient descent path
contour(mu_range, sigma_range, z, xlab = "mu", ylab = "sigma",
        main = "Gradient Descent Path: Problem 4")
points(para_history[, 1], para_history[, 2], col = "red", pch = 16) # Points
lines(para_history[, 1], para_history[, 2], col = "red") # Path
\[f(x, y) = (1 - x)^2 + 100 (y - x^2)^2\]
\[\nabla f(x, y) = \begin{bmatrix} -2(1 - x) - 400 x (y - x^2) \\ 200 (y - x^2) \end{bmatrix} \]
para <- c(-1, 1)
eta <- 0.001
eps <- 1e-6
max_iter <- 100000
iter <- 0
f <- function(x, y) {
  (1 - x)^2 + 100 * (y - x^2)^2
}
gradient_comp <- function(x, y) {
  grad_x <- -2 * (1 - x) - 400 * x * (y - x^2)
  grad_y <- 200 * (y - x^2)
  c(grad_x, grad_y)
}
para_history <- matrix(NA, nrow = max_iter + 1, ncol = 2)
loss_history <- numeric(max_iter + 1)
para_history[1, ] <- para
loss_history[1] <- f(para[1], para[2])
while (iter < max_iter) {
  grad <- gradient_comp(para[1], para[2])
  grad_norm <- sqrt(sum(grad^2))
  if (grad_norm < eps) break
  para <- para - eta * grad
  iter <- iter + 1
  para_history[iter + 1, ] <- para
  loss_history[iter + 1] <- f(para[1], para[2])
}
# Truncate history
para_history <- para_history[1:(iter + 1), ]
loss_history <- loss_history[1:(iter + 1)]
# Print results
cat("Solution: x =", para[1], ", y =", para[2], "\n")
## Solution: x = 0.9999989 , y = 0.9999978
cat("Number of iterations:", iter, "\n")
## Number of iterations: 32035
cat("Gradient norm at solution:", grad_norm, "\n")
## Gradient norm at solution: 9.998616e-07
cat("Function value at solution:", f(para[1], para[2]), "\n")
## Function value at solution: 1.251652e-12
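For reference (an added comparison, not part of the original output), base R's quasi-Newton optimizer reaches the same minimizer in far fewer iterations on this ill-conditioned valley:
# Compare with BFGS from stats::optim, reusing the objective and gradient
optim(c(-1, 1),
      fn = function(p) f(p[1], p[2]),
      gr = function(p) gradient_comp(p[1], p[2]),
      method = "BFGS")$par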
# Plot loss vs iteration
plot(loss_history, type = "l", col = "blue", lwd = 2,
     xlab = "Iteration", ylab = "Loss",
     main = "Loss vs Iteration: Problem 5")
grid()
# Contour plot
x_range <- seq(-1.5, 1.5, length.out = 100)
y_range <- seq(-0.5, 2, length.out = 100)
z <- matrix(NA, nrow = length(x_range), ncol = length(y_range))
for (i in 1:length(x_range)) {
  for (j in 1:length(y_range)) {
    z[i, j] <- f(x_range[i], y_range[j])
  }
}
contour(x_range, y_range, z, xlab = "x", ylab = "y",
        main = "Gradient Descent Path: Problem 5", nlevels = 50)
points(para_history[, 1], para_history[, 2], col = "red", pch = 16)
lines(para_history[, 1], para_history[, 2], col = "red")
points(1, 1, col = "green", pch = 19)