Objective function :
\[ g(w) = (\frac{1}{4}w_o - 2w_1)^2+(\frac{1}{4}w_o - \frac{1}{2}w_1)^2 \]
gradient :
\[ \nabla g(w) = \begin{bmatrix} \frac{\partial g}{\partial w_0} \\ \frac{\partial g}{\partial w_1} \end{bmatrix} = \begin{bmatrix} \frac14 w_0 - \frac54 w_1 \\ -\frac54 w_0 + \frac{17}{2} w_1 \end{bmatrix} \]
tolerance :
\[ ||g(w^{k})|| = \]
\[ w^{k} = w^{k-1} - \alpha \nabla g(w) \]
\[ w^{(k+1)} = w^{(k)} - \alpha \frac{\nabla g(w^{(k)})} {\|\nabla g(w^{(k)})\|} \]
\[ \nabla g(w) = \begin{bmatrix} \frac14 w_0 - \frac54 w_1 \\ -\frac54 w_0 + \frac{17}{2} w_1 \end{bmatrix} \]
\[ \left\|\nabla g(w)\right\| = \sqrt{ \left(\frac14 w_0 - \frac54 w_1\right)^2 + \left(-\frac54 w_0 + \frac{17}{2} w_1\right)^2 } \]
\[ \frac{\nabla g(w)} {\|\nabla g(w)\|} = \frac{ \begin{bmatrix} \frac14 w_0 - \frac54 w_1 \\ -\frac54 w_0 + \frac{17}{2} w_1 \end{bmatrix} }{ \sqrt{ \left(\frac14 w_0 - \frac54 w_1\right)^2 + \left(-\frac54 w_0 + \frac{17}{2} w_1\right)^2 } } \]
\[ w^{(k)} = w^{(k-1)} - \alpha \, \mathrm{sign}\!\left(\nabla g(w^{(k-1)})\right) \]
\[ w^{(k)} = w^{(k-1)} - a^{(k-1)} \odot \nabla g(w^{(k-1)}) \]
\[ a^{(k-1)} = \begin{bmatrix} \dfrac{\alpha}{ \left| \frac{\partial g}{\partial w_1}(w^{(k-1)}) \right| } \\[1em] \vdots \\[0.5em] \dfrac{\alpha}{ \left| \frac{\partial g}{\partial w_N}(w^{(k-1)}) \right| } \end{bmatrix} \]
gd <- function(g, g_prime, alpha_choice = 1,
max_its = 100, w0,
tol = 1e-5,
normalized = 0) {
w <- w0
w_history <- list(w)
g_history <- c(g(w))
for (k in 1:max_its) {
grad <- g_prime(w)
grad_length <- sqrt(sum(grad^2))
if (grad_length < tol) {
break
}
# normalization options
if (normalized == 1) {
# full normalized gradient descent
grad <- grad / grad_length
} else if (normalized == -1) {
# component-wise normalized gradient descent
grad <- sign(grad)
}
# step length
if (alpha_choice == "diminishing") {
alpha <- 1 / k
} else {
alpha <- alpha_choice
}
# update step
w <- w - alpha * grad
# store history
w_history[[k + 1]] <- w
g_history[k + 1] <- g(w)
}
return(list(
w_final = w,
g_final = g(w),
w_history = w_history,
g_history = g_history
))
}
g <- function(w) {
(1/4 * w[1] - 2 * w[2])^2 +
(1/4 * w[1] - 1/2 * w[2])^2
}
g_prime <- function(w) {
c(
1/4 * w[1] - 5/4 * w[2],
-5/4 * w[1] + 17/2 * w[2]
)
}
\[ w_0 = (-4,4) \]
w0 <- c(-4, 4)
run_standard <- gd(g, g_prime, alpha_choice = 0.1,
max_its = 50, w0 = w0, normalized = 0)
run_full <- gd(g, g_prime, alpha_choice = 0.1,
max_its = 50, w0 = w0, normalized = 1)
run_component <- gd(g, g_prime, alpha_choice = 0.1,
max_its = 50, w0 = w0, normalized = -1)
plot(run_standard$g_history, type = "l", col = "black", lwd = 2,
xlab = "Iteration",
ylab = "Cost g(w)",
main = "Gradient Descent Comparison",
ylim = range(c(run_standard$g_history,
run_full$g_history,
run_component$g_history)))
lines(run_full$g_history, col = "blue", lwd = 2)
lines(run_component$g_history, col = "red", lwd = 2)
legend("topright",
legend = c("Standard", "Full normalized", "Component-wise normalized"),
col = c("black", "blue", "red"),
lwd = 2,
cex = .2)
# create grid
w1_vals <- seq(-5, 5, length.out = 200)
w2_vals <- seq(-5, 5, length.out = 200)
# evaluate g(w) on grid
z <- outer(w1_vals, w2_vals,
Vectorize(function(w1, w2) g(c(w1, w2))))
# extract optimization paths
path_standard <- do.call(rbind, run_standard$w_history)
path_full <- do.call(rbind, run_full$w_history)
path_component <- do.call(rbind, run_component$w_history)
# contour plot
contour(w1_vals, w2_vals, z,
nlevels = 30,
xlab = expression(w[0]),
ylab = expression(w[1]),
main = "Gradient Descent Paths on Contours of g(w)")
# standard GD path
lines(path_standard[,1], path_standard[,2],
col = "black", lwd = 2)
points(path_standard[,1], path_standard[,2],
col = "black", pch = 16, cex = 0.5)
# full normalized GD path
lines(path_full[,1], path_full[,2],
col = "blue", lwd = 2)
points(path_full[,1], path_full[,2],
col = "blue", pch = 16, cex = 0.5)
# component-wise normalized GD path
lines(path_component[,1], path_component[,2],
col = "red", lwd = 2)
points(path_component[,1], path_component[,2],
col = "red", pch = 16, cex = 0.5)
legend("topright",
legend = c("Standard", "Full normalized", "Component-wise normalized"),
col = c("black", "blue", "red"),
lwd = 2,
pch = 10,
cex=.5)
Component-wise normalized gradient descent performed best because it normalized each partial derivative separately. This prevented one coordinate direction from dominating the update and allowed both \(w_0\) and \(w_1\)to keep making progress toward the true minimum \((0,0)'\). In contrast, standard and fully normalized gradient descent became stuck or failed to make sufficient progress within 50 iterations.
\[ C_1= \begin{bmatrix} 0.5 & 0 \\ 0 & 9 \end{bmatrix} \qquad g_1(w)=w^T C_1 w \]
\[ \nabla g_1(w) = 2C_1 w = \begin{bmatrix} 1 & 0 \\ 0 & 18 \end{bmatrix} \begin{bmatrix} w_0 \\ w_1 \end{bmatrix} = \begin{bmatrix} w_0 \\ 18w_1 \end{bmatrix} \]
\[ C_2= \begin{bmatrix} 0.1 & 0 \\ 0 & 9 \end{bmatrix} \qquad g_2(w)=w^T C_2 w \]
\[ \nabla g_2(w) = 2C_2 w = \begin{bmatrix} 0.2 & 0 \\ 0 & 18 \end{bmatrix} \begin{bmatrix} w_0 \\ w_1 \end{bmatrix} = \begin{bmatrix} 0.2w_0 \\ 18w_1 \end{bmatrix} \]
gd <- function(g, g_prime, alpha_choice = 1,
max_its = 100, w0,
tol = 1e-5,
normalized = 0,
momentum = FALSE,
beta = 0.7) {
w <- w0
w_history <- list(w)
g_history <- c(g(w))
# initial momentum direction
d <- rep(0, length(w0))
for (k in 1:max_its) {
grad <- g_prime(w)
grad_length <- sqrt(sum(grad^2))
if (grad_length < tol) {
break
}
# normalization options
if (normalized == 1) {
grad <- grad / grad_length
} else if (normalized == -1) {
grad <- sign(grad)
}
# step length
if (alpha_choice == "diminishing") {
alpha <- 1 / k
} else {
alpha <- alpha_choice
}
# update step
if (momentum == TRUE) {
d <- beta * d - (1 - beta) * grad
w <- w + alpha * d
} else {
w <- w - alpha * grad
}
w_history[[k + 1]] <- w
g_history[k + 1] <- g(w)
}
return(list(
w_final = w,
g_final = g(w),
w_history = w_history,
g_history = g_history
))
}
g1 <- function(w) {
0.5 * w[1]^2 + 9 * w[2]^2
}
g1_prime <- function(w) {
c(w[1], 18 * w[2])
}
g2 <- function(w) {
0.1 * w[1]^2 + 9 * w[2]^2
}
g2_prime <- function(w) {
c(0.2 * w[1], 18 * w[2])
}
plot_paths <- function(g, run_standard, run_momentum, title) {
# create grid
w1_vals <- seq(-12, 12, length.out = 200)
w2_vals <- seq(-4, 4, length.out = 200)
# evaluate g(w) on grid
z <- outer(w1_vals, w2_vals,
Vectorize(function(w1, w2) g(c(w1, w2))))
# extract paths
path_standard <- do.call(rbind, run_standard$w_history)
path_momentum <- do.call(rbind, run_momentum$w_history)
# contour plot
contour(w1_vals, w2_vals, z,
nlevels = 30,
xlab = expression(w[0]),
ylab = expression(w[1]),
main = title)
# standard GD path
lines(path_standard[,1], path_standard[,2],
col = "black", lwd = 2)
points(path_standard[,1], path_standard[,2],
col = "black", pch = 16, cex = 0.5)
# momentum GD path
lines(path_momentum[,1], path_momentum[,2],
col = "purple", lwd = 2)
points(path_momentum[,1], path_momentum[,2],
col = "purple", pch = 16, cex = 0.5)
legend("topright",
legend = c("Standard GD", "Momentum GD"),
col = c("black", "purple"),
lwd = 2,
pch = 16,
cex = 0.7)
}
w0 <- c(10, 1)
# C1 runs
run_c1_standard <- gd(g1, g1_prime,
alpha_choice = 0.1,
max_its = 25,
w0 = w0,
momentum = FALSE)
run_c1_momentum <- gd(g1, g1_prime,
alpha_choice = 0.1,
max_its = 25,
w0 = w0,
momentum = TRUE,
beta = 0.7)
# C2 runs
run_c2_standard <- gd(g2, g2_prime,
alpha_choice = 0.1,
max_its = 25,
w0 = w0,
momentum = FALSE)
run_c2_momentum <- gd(g2, g2_prime,
alpha_choice = 0.1,
max_its = 25,
w0 = w0,
momentum = TRUE,
beta = 0.7)
par(mfrow = c(1, 2))
plot_paths(g1, run_c1_standard, run_c1_momentum,
"C1: Standard vs Momentum GD")
plot_paths(g2, run_c2_standard, run_c2_momentum,
"C2: Standard vs Momentum GD")
par(mfrow = c(1, 1))
\[ g(w)=a+b^T w+w^T Cw \]
\[ \nabla g(w)=b+(C+C^T)w \]
\[ b= \begin{bmatrix} 1\\ 1 \end{bmatrix}, \qquad C= \begin{bmatrix} 7.5 & 2\\ 1 & 1.25 \end{bmatrix} \]
\[ C+C^T = \begin{bmatrix} 15 & 3\\ 3 & 2.5 \end{bmatrix} \]
\[ \nabla g(w) = \begin{bmatrix} 1\\ 1 \end{bmatrix} + \begin{bmatrix} 15 & 3\\ 3 & 2.5 \end{bmatrix} \begin{bmatrix} w_0\\ w_1 \end{bmatrix} \]
\[ \nabla g(w) = \begin{bmatrix} 1 + 15w_0 + 3w_1\\ 1 + 3w_0 + 2.5w_1 \end{bmatrix} \]
w0 <- c(-10, 1)
gd_hybrid <- function(g, g_prime,
alpha = 0.01,
beta = 0.7,
max_its = 100,
w0,
tol = 1e-5,
switch_iter = 20) {
w <- w0
d <- rep(0, length(w0))
w_history <- list(w)
g_history <- c(g(w))
for (k in 1:max_its) {
grad <- g_prime(w)
grad_length <- sqrt(sum(grad^2))
if (grad_length < tol) {
break
}
# first use component-wise normalized gradient
if (k <= switch_iter) {
direction <- sign(grad)
} else {
# then use momentum gradient descent
d <- beta * d - (1 - beta) * grad
direction <- -d
}
w <- w - alpha * direction
w_history[[k + 1]] <- w
g_history[k + 1] <- g(w)
}
return(list(
w_final = w,
g_final = g(w),
w_history = w_history,
g_history = g_history
))
}
The first phase uses component-wise normalized gradient descent to prevent very small partial derivatives from slowing progress, allowing movement through flatter regions of the function.
The second phase switches to momentum gradient descent to smooth oscillations and accelerate convergence toward the minimum.
Combining both methods attempts to capture the strengths of each approach: fast early exploration from component-wise updates and stable convergence from momentum acceleration.
Notice this is very similar to Adaptive Moment Estimation (Adam) however we hold a constant \(\beta\)
g <- function(w) {
a <- 2
b <- c(1, 1)
C <- matrix(c(7.5, 1,
2, 1.25), nrow = 2, byrow = FALSE)
as.numeric(a + t(b) %*% w + t(w) %*% C %*% w)
}
g_prime <- function(w) {
c(
1 + 15 * w[1] + 3 * w[2],
1 + 3 * w[1] + 2.5 * w[2]
)
}
w0 <- c(-10, 1)
alphas <- c(0.001, 0.005, 0.01, 0.02, 0.05)
betas <- c(0.2, 0.5, 0.7, 0.9)
results <- data.frame()
for (alpha in alphas) {
for (beta in betas) {
run <- gd_hybrid(g, g_prime,
alpha = alpha,
beta = beta,
max_its = 200,
w0 = w0,
switch_iter = 20)
results <- rbind(results, data.frame(
alpha = alpha,
beta = beta,
w0_final = run$w_final[1],
w1_final = run$w_final[2],
g_final = run$g_final
))
}
}
results[order(results$g_final), ]
## alpha beta w0_final w1_final g_final
## 19 0.050 0.7 0.0175432991 -0.4210510 1.798246
## 17 0.050 0.2 0.0175427478 -0.4210477 1.798246
## 18 0.050 0.5 0.0175426493 -0.4210473 1.798246
## 20 0.050 0.9 0.0171055863 -0.4212252 1.798247
## 16 0.020 0.9 0.0168689354 -0.4211308 1.798249
## 15 0.020 0.7 0.0169476275 -0.4184326 1.798252
## 14 0.020 0.5 0.0166684092 -0.4172057 1.798260
## 13 0.020 0.2 0.0164964211 -0.4164499 1.798266
## 12 0.010 0.9 0.0005559053 -0.3452758 1.803726
## 11 0.010 0.7 -0.0100248064 -0.2999094 1.812271
## 10 0.010 0.5 -0.0118531542 -0.2918752 1.814193
## 9 0.010 0.2 -0.0128549270 -0.2874731 1.815299
## 8 0.005 0.9 -0.1323541122 0.2338737 2.208410
## 7 0.005 0.7 -0.1383290119 0.2638897 2.246609
## 6 0.005 0.5 -0.1396038130 0.2694876 2.253968
## 5 0.005 0.2 -0.1403101924 0.2725829 2.258063
## 4 0.001 0.9 -0.9400657034 2.0707895 9.278827
## 3 0.001 0.7 -1.0600495232 2.0200869 10.064587
## 2 0.001 0.5 -1.0810710698 2.0108364 10.227883
## 1 0.001 0.2 -1.0925545812 2.0057401 10.320352
A <- matrix(c(15, 3,
3, 2.5), nrow = 2, byrow = TRUE)
w_star <- solve(A, c(-1, -1))
w_star
## [1] 0.01754386 -0.42105263
g(w_star)
## [1] 1.798246
Notice :
The best results were obtained using a larger learning rate ( \(\alpha\) =0.05) with momentum coefficients
between 0.2 and 0.7. These settings converged closest to the true
minimum and achieved the lowest objective value. Smaller learning rates
converged much more slowly and remained farther from the minimum after
the same number of iterations.
ADAM <- function(g, g_prime,
alpha = 0.01,
beta1 = 0.9,
beta2 = 0.999,
max_its = 100,
w0,
tol = 1e-5,
epsilon = 1e-8) {
w <- w0
w_history <- list(w)
g_history <- c(g(w))
# initialize first and second moments
grad <- g_prime(w)
d <- grad
h <- grad^2
for (k in 1:max_its) {
grad <- g_prime(w)
grad_length <- sqrt(sum(grad^2))
if (grad_length < tol) {
break
}
# exponential average of gradients
d <- beta1 * d + (1 - beta1) * grad
# exponential average of squared gradients
h <- beta2 * h + (1 - beta2) * (grad^2)
# ADAM update : inc. epsilon for div. 0
w <- w - alpha * d / (sqrt(h) + epsilon)
w_history[[k + 1]] <- w
g_history[k + 1] <- g(w)
}
return(list(
w_final = w,
g_final = g(w),
w_history = w_history,
g_history = g_history
))
}
alphas <- c(0.001, 0.005, 0.01, 0.02, 0.05)
beta1s <- c(0.2, 0.5, 0.7, 0.9)
beta2s <- c(0.9, 0.99, 0.999)
results <- data.frame()
for (alpha in alphas) {
for (beta1 in beta1s) {
for (beta2 in beta2s) {
run <- ADAM(g, g_prime,
alpha = alpha,
beta1 = beta1,
beta2 = beta2,
max_its = 200,
w0 = w0)
results <- rbind(results, data.frame(
alpha = alpha,
beta1 = beta1,
beta2 = beta2,
w0_final = run$w_final[1],
w1_final = run$w_final[2],
g_final = run$g_final
))
}
}
}
results[order(results$g_final), ]
## alpha beta1 beta2 w0_final w1_final g_final
## 58 0.050 0.9 0.900 -0.3401776 1.798281 6.533075
## 55 0.050 0.7 0.900 -0.9019101 1.365253 7.200045
## 52 0.050 0.5 0.900 -1.0086061 1.383850 8.211413
## 49 0.050 0.2 0.900 -1.0669909 1.405783 8.847724
## 59 0.050 0.9 0.990 -3.1845829 4.695942 62.274109
## 56 0.050 0.7 0.990 -3.3443437 4.583498 67.398106
## 53 0.050 0.5 0.990 -3.3755550 4.563031 68.463544
## 50 0.050 0.2 0.990 -3.3930142 4.551760 69.068428
## 60 0.050 0.9 0.999 -3.8754460 4.787791 88.544660
## 57 0.050 0.7 0.999 -3.9939010 4.656293 93.607740
## 54 0.050 0.5 0.999 -4.0169700 4.631560 94.634623
## 51 0.050 0.2 0.999 -4.0298623 4.617855 95.214190
## 46 0.020 0.9 0.900 -6.0018759 4.986414 212.450285
## 43 0.020 0.7 0.900 -6.0824919 4.797832 219.416315
## 40 0.020 0.5 0.900 -6.0984810 4.762373 220.820442
## 37 0.020 0.2 0.900 -6.1074526 4.742735 221.611448
## 47 0.020 0.9 0.990 -6.5020901 4.057439 258.067180
## 44 0.020 0.7 0.990 -6.5509718 3.979919 262.875861
## 41 0.020 0.5 0.990 -6.5608250 3.964511 263.852168
## 38 0.020 0.2 0.990 -6.5663772 3.955860 264.403339
## 48 0.020 0.9 0.999 -6.7418888 3.766249 281.478241
## 45 0.020 0.7 0.999 -6.7809854 3.706898 285.556235
## 42 0.020 0.5 0.999 -6.7888525 3.695072 286.381157
## 39 0.020 0.2 0.999 -6.7932833 3.688429 286.846387
## 34 0.010 0.9 0.900 -8.0001602 2.999419 416.276623
## 31 0.010 0.7 0.900 -8.0175664 2.966823 418.701984
## 28 0.010 0.5 0.900 -8.0211297 2.960209 419.198876
## 25 0.010 0.2 0.900 -8.0231454 2.956476 419.480026
## 35 0.010 0.9 0.990 -8.1219321 2.782895 433.277502
## 32 0.010 0.7 0.990 -8.1359457 2.759351 435.243234
## 29 0.010 0.5 0.990 -8.1388195 2.754549 435.646695
## 26 0.010 0.2 0.990 -8.1404461 2.751835 435.875103
## 36 0.010 0.9 0.999 -8.1968447 2.670018 443.639218
## 33 0.010 0.7 0.999 -8.2091767 2.650132 445.383116
## 30 0.010 0.5 0.999 -8.2117000 2.646080 445.740233
## 27 0.010 0.2 0.999 -8.2131273 2.643791 445.942272
## 22 0.005 0.9 0.900 -9.0000172 1.999946 553.503337
## 19 0.005 0.7 0.900 -9.0041024 1.992748 554.177677
## 16 0.005 0.5 0.900 -9.0049473 1.991264 554.317114
## 13 0.005 0.2 0.900 -9.0054266 1.990423 554.396204
## 23 0.005 0.9 0.990 -9.0299319 1.948187 558.434089
## 20 0.005 0.7 0.990 -9.0336251 1.941945 559.041714
## 17 0.005 0.5 0.990 -9.0343890 1.940657 559.167379
## 14 0.005 0.2 0.990 -9.0348223 1.939927 559.238662
## 24 0.005 0.9 0.999 -9.0506725 1.914829 561.815955
## 21 0.005 0.7 0.999 -9.0541249 1.909128 562.383349
## 18 0.005 0.5 0.999 -9.0548379 1.907953 562.500520
## 15 0.005 0.2 0.999 -9.0552422 1.907288 562.566956
## 10 0.001 0.9 0.900 -9.8000001 1.200000 680.220027
## 7 0.001 0.7 0.900 -9.8001561 1.199735 680.248962
## 4 0.001 0.5 0.900 -9.8001886 1.199680 680.254986
## 1 0.001 0.2 0.900 -9.8002070 1.199649 680.258409
## 11 0.001 0.9 0.990 -9.8011782 1.198006 680.438454
## 8 0.001 0.7 0.990 -9.8013312 1.197748 680.466804
## 5 0.001 0.5 0.990 -9.8013631 1.197694 680.472706
## 2 0.001 0.2 0.990 -9.8013812 1.197663 680.476060
## 12 0.001 0.9 0.999 -9.8020742 1.196509 680.604104
## 9 0.001 0.7 0.999 -9.8022251 1.196256 680.632033
## 6 0.001 0.5 0.999 -9.8022565 1.196203 680.637845
## 3 0.001 0.2 0.999 -9.8022743 1.196173 680.641148
Best result :
best_result <- results[which.min(results$g_final), ]
best_result
## alpha beta1 beta2 w0_final w1_final g_final
## 58 0.05 0.9 0.9 -0.3401776 1.798281 6.533075
gd_hybrid()# run both methods
run_hybrid <- gd_hybrid(g, g_prime,
alpha = 0.05,
beta = 0.7,
max_its = 200,
w0 = w0,
switch_iter = 20)
run_adam <- ADAM(g, g_prime,
alpha = 0.05,
beta1 = 0.9,
beta2 = 0.9,
max_its = 200,
w0 = w0)
# create grid
w1_vals <- seq(-12, 5, length.out = 200)
w2_vals <- seq(-2, 5, length.out = 200)
# evaluate g(w) on grid
z <- outer(w1_vals, w2_vals,
Vectorize(function(w1, w2) g(c(w1, w2))))
# extract paths
path_hybrid <- do.call(rbind, run_hybrid$w_history)
path_adam <- do.call(rbind, run_adam$w_history)
# contour plot
contour(w1_vals, w2_vals, z,
nlevels = 30,
xlab = expression(w[0]),
ylab = expression(w[1]),
main = "ADAM vs gd_hybrid on Contours of g(w)")
# gd_hybrid path
lines(path_hybrid[,1], path_hybrid[,2],
col = "red", lwd = 2)
points(path_hybrid[,1], path_hybrid[,2],
col = "red", pch = 16, cex = 0.5)
# ADAM path
lines(path_adam[,1], path_adam[,2],
col = "blue", lwd = 2)
points(path_adam[,1], path_adam[,2],
col = "blue", pch = 16, cex = 0.5)
# true minimum
points(w_star[1], w_star[2],
col = "darkgreen", pch = 8, cex = 1.5, lwd = 2)
legend("bottomleft",
legend = c("gd_hybrid", "ADAM", "True minimum"),
col = c("red", "blue", "darkgreen"),
lwd = c(2, 2, NA),
pch = c(16, 16, 8),
cex = 0.7)
# plot cost histories
plot(run_hybrid$g_history,
type = "l",
col = "red",
lwd = 2,
xlab = "Iteration",
ylab = "Cost g(w)",
main = "Cost vs Iteration: ADAM vs gd_hybrid",
ylim = range(c(run_hybrid$g_history,
run_adam$g_history)))
lines(run_adam$g_history,
col = "blue",
lwd = 2)
legend("topright",
legend = c("gd_hybrid", "ADAM"),
col = c("red", "blue"),
lwd = 2,
cex = 0.7)
Observations :
ADAM initially moves quickly toward the minimum, but with \(\alpha\)=0.05 it overshoots and oscillates around the minimizer. In contrast, gd_hybrid() approaches the minimum more smoothly and settles closer to \(w^{*}\). My algorithm dramatically drops towards min. For the first 20 steps, it uses component-wise normalized GD then it goes to momentum.
\[ w^{(k)} = w^{(k-1)} - \left( \nabla^2 g(w^{(k-1)}) \right)^{-1} \nabla g(w^{(k-1)}) \]
\[ g(w)=a+b^T w+w^T Cw \]
\[ \nabla g(w)=b+(C+C^T)w \]
\[ \nabla^2 g(w)=C+C^T \]
For this problem,
\[ b= \begin{bmatrix} 1\\ 1 \end{bmatrix}, \qquad C= \begin{bmatrix} 7.5 & 2\\ 1 & 1.25 \end{bmatrix} \]
Hence,
\[ C+C^T= \begin{bmatrix} 15 & 3\\ 3 & 2.5 \end{bmatrix} \]
inverse – using det. trick w/ \(\mathbb{R}^{2 \text{ by } 2}\):
\[ (\nabla^2 g(w))^{-1} = \frac{1}{15(2.5)-3(3)} \begin{bmatrix} 2.5 & -3\\ -3 & 15 \end{bmatrix} \]
\[ (\nabla^2 g(w))^{-1} = \frac{1}{28.5} \begin{bmatrix} 2.5 & -3\\ -3 & 15 \end{bmatrix} \]
C <- matrix(c(7.5, 2,
1, 1.25),
nrow = 2,
byrow = TRUE)
H <- C + t(C)
H
## [,1] [,2]
## [1,] 15 3.0
## [2,] 3 2.5
solve(H)
## [,1] [,2]
## [1,] 0.0877193 -0.1052632
## [2,] -0.1052632 0.5263158
Newton <- function(g, g_prime, hessian,
max_its = 100,
w0,
tol = 1e-5) {
w <- w0
w_history <- list(w)
g_history <- c(g(w))
for (k in 1:max_its) {
grad <- g_prime(w)
H <- hessian(w)
grad_length <- sqrt(sum(grad^2))
if (grad_length < tol) {
break
}
# regular Newton update
w <- w - solve(H, grad)
w_history[[k + 1]] <- w
g_history[k + 1] <- g(w)
}
return(list(
w_final = w,
g_final = g(w),
w_history = w_history,
g_history = g_history
))
}
hessian <- function(w) {
matrix(c(15, 3,
3, 2.5),
nrow = 2,
byrow = TRUE)
}
run_newton <- Newton(g, g_prime, hessian,
max_its = 10,
w0 = w0)
run_newton$w_final
## [1] 0.01754386 -0.42105263
run_newton$g_final
## [1] 1.798246
run_newton$w_history
## [[1]]
## [1] -10 1
##
## [[2]]
## [1] 0.01754386 -0.42105263
run_newton <- Newton(g, g_prime, hessian,
max_its = 10,
w0 = w0)
path_newton <- do.call(rbind, run_newton$w_history)
contour(w1_vals, w2_vals, z,
nlevels = 30,
xlab = expression(w[0]),
ylab = expression(w[1]),
main = "Newton's Method on Contours of g(w)")
lines(path_newton[,1], path_newton[,2],
col = "purple", lwd = 2)
points(path_newton[,1], path_newton[,2],
col = "purple", pch = 16, cex = 0.7)
points(w_star[1], w_star[2],
col = "darkgreen", pch = 8, cex = 1.5, lwd = 2)
legend("topright",
legend = c("Newton", "True minimum"),
col = c("purple", "darkgreen"),
lwd = c(2, NA),
pch = c(16, 8),
cex = 0.7)
plot(run_hybrid$g_history,
type = "l",
col = "red",
lwd = 2,
xlab = "Iteration",
ylab = "Cost g(w)",
main = "Cost vs Iteration: Newton vs ADAM vs gd_hybrid",
ylim = range(c(run_hybrid$g_history,
run_adam$g_history,
run_newton$g_history)))
lines(run_adam$g_history,
col = "blue",
lwd = 2)
lines(run_newton$g_history,
col = "purple",
lwd = 2)
legend("topright",
legend = c("gd_hybrid", "ADAM", "Newton"),
col = c("red", "blue", "purple"),
lwd = 2,
cex = 0.5)
sub_samp <- function(g, g_prime, hessian,
max_its = 100,
w0,
tol = 1e-5) {
w <- w0
w_history <- list(w)
g_history <- c(g(w))
for (k in 1:max_its) {
grad <- g_prime(w)
H <- hessian(w)
grad_length <- sqrt(sum(grad^2))
if (grad_length < tol) {
break
}
# subsampled Hessian:
# keep only diagonal entries
H_sub <- diag(diag(H))
# update step
w <- w - solve(H_sub, grad)
w_history[[k + 1]] <- w
g_history[k + 1] <- g(w)
}
return(list(
w_final = w,
g_final = g(w),
w_history = w_history,
g_history = g_history
))
}
run_sub <- sub_samp(g, g_prime, hessian,
max_its = 50,
w0 = w0)
run_sub$w_final
## [1] 0.01754382 -0.42105080
run_sub$g_final
## [1] 1.798246
notice unlike newtons() method, this took 24 iterations vs. 2 iterations
The subsampled Hessian method takes longer because it uses only the diagonal of the Hessian
The missing off-diagonal 3’s contain information about how \(w_0\) and \(w_1\) interact.
Notice the leading principle (top left corner) is positive and so is the det of the Hessian :
det(H) > 0
## [1] TRUE
mu <- c(2,4,-1,3,0)
mu <- matrix(mu)
Sigma <- matrix(c(4,-1,1/2,-1/2, 0, -1,3,1,-1,0,1/2, 1,6,1,-4,-1/2,-1,1,4,0,0,0,-4,0,2), nrow = 5, ncol = 5, byrow=T)
A <- matrix(c(-1, 1, 1, 1/2), ncol =2, byrow=T)
B <- matrix(c(1,1,1/2,-2, 1, -2), nrow = 2, byrow = T)
\[ X^{(1)}=(X_1 X_2)' \]
\[ X^{(2)}=(X_3 X_4 X_5)' \]
\[ \mathbb{E}(X^{(1)}) \]
mu[1:2] |> as.matrix()
## [,1]
## [1,] 2
## [2,] 4
A %*% mu[1:2] |> as.matrix()
## [,1]
## [1,] 2
## [2,] 4
Sigma[1:2, 1:2]
## [,1] [,2]
## [1,] 4 -1
## [2,] -1 3
A %*% Sigma[1:2, 1:2] %*% t(A)
## [,1] [,2]
## [1,] 9 -3.00
## [2,] -3 3.75
mu[3:length(mu)] |> as.matrix()
## [,1]
## [1,] -1
## [2,] 3
## [3,] 0
B %*% mu[3:length(mu)] |> as.matrix()
## [,1]
## [1,] 2
## [2,] 5
Sigma[3:length(mu), 3:length(mu)]
## [,1] [,2] [,3]
## [1,] 6 1 -4
## [2,] 1 4 0
## [3,] -4 0 2
B %*% Sigma[3:length(mu), 3:length(mu)] |> as.matrix() %*% t(B)
## [,1] [,2]
## [1,] 8.5 1
## [2,] 1.0 0
Sigma[1:2, 3:5]
## [,1] [,2] [,3]
## [1,] 0.5 -0.5 0
## [2,] 1.0 -1.0 0
A %*% Sigma[1:2, 3:5] %*% t(B)
## [,1] [,2]
## [1,] 0 -1.5
## [2,] 0 -3.0