Q1) Zero-Order : Random Search Algorithm

Brief layout :

Algorithm Weakness :

A) Build Function

Recall :

\[ W^{(k)}=W^{(k-1)}+\alpha d^{(k-1)} \]

where \(||d^{k-1}||=1\) (ie. normalized) and we just sample from 0 to \(2\pi\) (0 to 360 degrees).

Meaning,

\[ d^{k-1}=\begin{bmatrix}\cos(\theta) \\ \sin(\theta)\end{bmatrix} \]

\[ X \sim \text{Unif}(0,2\pi) \]

Helper function) Directions \(\theta\)

random_direction_radians <- 
  function(num_samp = 1000){
    radians <- runif(num_samp, min = 0, max = 2 * pi)
    radians
    }

# tst
random_direction_radians(3) 
## [1] 1.806903 4.953067 2.569678

Helper function) \(d^{k-1}\)

d_k_1 <- function(num_samp) {
  theta <- random_direction_radians(num_samp)
  
  cos_vals <- cos(theta)
  sin_vals <- sin(theta)
  
  values <- c(rbind(cos_vals, sin_vals))
  
  names(values) <- paste0(
    rep(c("cos_v", "sin_v"), times = num_samp),
    rep(seq_len(num_samp), each = 2)
  )
  
  values
}
d_k_1(3)
##     cos_v1     sin_v1     cos_v2     sin_v2     cos_v3     sin_v3 
##  0.7418151 -0.6706045  0.9308533 -0.3653931  0.9593123  0.2823471

Helper function) \(\alpha d^{k-1}\)

scale_direction <- function(num_samp, alpha_choice = 1){
  direction <- d_k_1(num_samp)
  if(alpha_choice=="diminishing"){
    k_val <- rep(seq_len(num_samp), each = 2) # for cos and sin (twice)
    dininishing_direction <- (1/k_val)*direction
    dininishing_direction
  }else{
    alpha_choice*direction
  }
}

set.seed(123)
list(v1=scale_direction(3), v2=scale_direction(3, 2), v3=scale_direction(3, "diminishing")) |> as.data.frame()
##                v1         v2         v3
## cos_v1 -0.2339190  1.4836302 -0.9844481
## sin_v1  0.9722561 -1.3412089 -0.1756756
## cos_v2  0.2383614  1.8617066  0.3900561
## sin_v2 -0.9711765 -0.7307862 -0.3128198
## cos_v3 -0.8408661  1.9186246 -0.3160772
## sin_v3  0.5412432  0.5646943 -0.1058598

Graph) \(\alpha d^{k-1}\)

  • 3 separate test cases
set.seed(123)
vec_to_matrix <- function(v) {
  matrix(v, ncol = 2, byrow = TRUE)
}

plot_vectors <- function(v, main_title = "Vectors") {
  mat <- vec_to_matrix(v)
  
  plot(
    0, 0,
    xlim = range(c(0, mat[, 1])),
    ylim = range(c(0, mat[, 2])),
    xlab = "x",
    ylab = "y",
    main = main_title,
    asp = 1
  )
  
  abline(h = 0, v = 0, col = "gray")
  
  arrows(
    x0 = 0,
    y0 = 0,
    x1 = mat[, 1],
    y1 = mat[, 2],
    length = 0.1
  )
}

v_list <- list(
  v1 = scale_direction(3),
  v2 = scale_direction(3, 2),
  v3 = scale_direction(3, "diminishing")
)

plot_vectors(v_list$v1, "Alpha = 1")

plot_vectors(v_list$v2, "Alpha = 2")

plot_vectors(v_list$v3, "Diminishing Step Size")

Generate Function Object) Proof of Concept

set.seed(1)

g_expr <- expression(100 * w1^2 + w2)
# Generate random inputs (as you wrote)
w1_vals <- rnorm(100, mean = 0, sd = 10)
w2_vals <- rnorm(100, mean = 0, sd = 10)

g_vals <- eval(g_expr, list(
  w1 = w1_vals,
  w2 = w2_vals
))

data.frame(g_vals) |> head(5)
##       g_vals
## 1  3918.2401
## 2   337.6699
## 3  6973.6426
## 4 25450.7887
## 5  1079.2079
cat("each is a G value") 
## each is a G value

Graph Function) Proof of Concept

# Create smooth curve (fix w2 at its mean)
w1_grid <- seq(-20, 20, length.out = 400)
w2_fixed <- mean(w2_vals)

g_curve <- eval(g_expr, list(
  w1 = w1_grid,
  w2 = w2_fixed
))

# Plot
plot(
  w1_grid,
  g_curve,
  type = "l",
  lwd = 2,
  xlab = "w1",
  ylab = "g(w1, w2)",
  main = "g(w1, w2) = 100 w1^2 + w2", 
  col = "red"
)

points(
  w1_vals,
  g_vals,
  pch = 19,
  cex = 1.5
)

  • notice we are viewing just the w1 direction not w2 aswell

Helper function) Function expression

function_expression <- function(g) {
  g_expr <- parse(text = g)
  g_expr
}

evaluate_expression <- function(g, data) {
  g_expr <- function_expression(g)
  
  w1_vals <- data[seq(1, length(data), by = 2)]
  w2_vals <- data[seq(2, length(data), by = 2)]
  
  eval(g_expr, list(
    w1 = w1_vals,
    w2 = w2_vals
  ))
}

Helper function) update_wk

Recall :

\[ W^k=W^{k-1}+\alpha d^{k-1} \]

Combined function) random_search(``g,alpha_choice,max_its,num_samples,w0``)

random_search <- function(
  g,
  alpha_choice = 1,
  max_its = 100,
  num_samples = 1,
  w0 = c(0, 0)
) {
  
  # Parse once (do NOT re-parse every loop)
  g_expr <- function_expression(g)
  
  # Storage
  weight_history <- matrix(NA, nrow = max_its + 1, ncol = 2)
  colnames(weight_history) <- c("w1", "w2")
  
  cost_history <- numeric(max_its + 1)
  
  # Initial
  w_current <- w0
  weight_history[1, ] <- w_current
  
  # evaluate cost function for w_i values
  cost_history[1] <- eval(
    g_expr,
    list(w1 = w_current[1], w2 = w_current[2])
  )
  
  # Main loop : iterate across k-steps
  for (k in seq_len(max_its)) {
    
    # Generate multiple candidate directions
    direction_vec <- scale_direction(num_samples, alpha_choice)
    direction_mat <- matrix(direction_vec, ncol = 2, byrow = TRUE)
    
    # Form candidate weights
    w_candidates <- sweep(direction_mat, 2, w_current, "+") # row wise operation
    
    # Evaluate objective at ALL candidates
    candidate_costs <- eval(
      g_expr,
      list(
        w1 = w_candidates[, 1],
        w2 = w_candidates[, 2]
      )
    )
    
    # Choose best direction (minimum)
    best_index <- which.min(candidate_costs)
    
    # Update using best candidate
    w_current <- w_candidates[best_index, ]
    
    # Store
    weight_history[k + 1, ] <- w_current
    cost_history[k + 1] <- candidate_costs[best_index]
  }
  
  list(
    weight_history = weight_history,
    cost_history = cost_history
  )
}
set.seed(123)

result <- random_search(
  g = "w1^2 + w2^2",
  alpha_choice = "diminishing",
  max_its = 100, # not inc. initial value
  num_samples = 5, # try 5 directions 
  w0 = c(4, 4)
)

tail(result$weight_history) # we are heading down which is good
##                 w1          w2
##  [96,] -0.11737643 -0.27862070
##  [97,] -0.06628516 -0.03389701
##  [98,]  0.11608902 -0.11599244
##  [99,] -0.10940962 -0.00805559
## [100,]  0.06148494  0.09584364
## [101,] -0.03588856 -0.13441361
tail(result$cost_history) # nearing 0 which is good 
## [1] 0.091406718 0.005542729 0.026930907 0.012035357 0.012966401 0.019355007

B) Plot cost_history

The function we are minimizing is – it contains multiple mininum:

Plot

https://www.desmos.com/3d/9kxi3uvf97

set.seed(123)

result <- random_search(
  g = "100*(w2 - w1^2)^2 + (w1 - w2)^2",
  alpha_choice = 1,
  max_its = 30, # not inc. initial value
  num_samples = 100, # try 100 directions 
  w0 = c(-3, -3)
)
plot(result$cost_history) # notice we get lower over each index 

tail(result$cost_history)
## [1] 0.05972796 0.01844171 0.13096306 1.65886500 0.73447902 0.71095444
w_hist <- result$weight_history

plot(
  w_hist[,1], w_hist[,2],
  type = "o",
  pch = 19,
  col = "red",
  xlab = "w1",
  ylab = "w2",
  main = "Weight History (Search Path)"
)

# mark start and end
points(w_hist[1,1], w_hist[1,2], col = "green", pch = 19, cex = 1.5) # start
points(w_hist[nrow(w_hist),1], w_hist[nrow(w_hist),2], col = "blue", pch = 19, cex = 1.5) # end

legend("topleft",
       legend = c("Path", "Start", "End"),
       col = c("red", "green", "blue"),
       pch = 19, 
       cex = .7)

g_func <- function(w1, w2) {
  100*(w2 - w1^2)^2 + (w1 - w2)^2
}

x <- seq(-5, 5, length.out = 100)
y <- seq(-5, 5, length.out = 100)
z <- outer(x, y, g_func)

contour(x, y, z,
        xlab = "w1", ylab = "w2",
        main = "Search Path on Cost Surface")

lines(w_hist[,1], w_hist[,2], col = "red", lwd = 2)
points(w_hist[,1], w_hist[,2], col = "red", pch = 19)

C) "diminishing"

set.seed(123)

result <- random_search(
  g = "100*(w2 - w1^2)^2 + (w1 - w2)^2",
  alpha_choice = "diminishing",
  max_its = 30, # not inc. initial value
  num_samples = 100, # try 100 directions 
  w0 = c(-3, -3)
)
plot(result$cost_history) # notice we get lower over each index 

tail(result$cost_history) # much lower value with diminishing
## [1] 7.245882e-06 2.768931e-04 5.799276e-04 7.535580e-04 2.382325e-04
## [6] 5.145663e-06

D) Discuss Results

\[ f(w_1,w_2)=100(w_2-w_1^2)^2+(w_1-w_2)^2 \]

Since both terms are squares, \[ 100(w_2-w_1^2)^2 \ge 0 \qquad \text{and} \qquad (w_1-w_2)^2 \ge 0 \] so \[ f(w_1,w_2)\ge 0 \]

The minimum occurs when both squared terms are zero: \[ w_2-w_1^2=0 \qquad \text{and} \qquad w_1-w_2=0 \]

Thus, \[ w_2=w_1^2 \qquad \text{and} \qquad w_2=w_1 \]

Equating them gives \[ w_1^2=w_1 \] so \[ w_1(w_1-1)=0 \]

Hence \[ w_1=0 \quad \text{or} \quad w_1=1 \]

Using \(w_2=w_1\), we get the minimizers \[ (w_1,w_2)=(0,0)\quad \text{and}\quad (1,1) \]

At both points, \[ f(0,0)=0, \qquad f(1,1)=0 \]

Therefore, the true global minimum value is \[ 0 \] attained at \[ (0,0)\quad \text{and}\quad (1,1) \]

In other words, two inputs may result in the same global min. So depending upon our initial value \(w_0\) we are subject to get different results. Either way however, using unit-length ( alpha_choice = 1 ), we perform worse than diminishing. We see that a better strategy is diminishing step length for this cost function.

Q2) Zero-Order Coordinate Search/Descent

General Framework :

\[ \vec{w}^{(k)}=w^{(k-1)}+\alpha \vec{e}_i \]

Brief layout :

Algorithm Weakness :

Algorithm Strength :

A) Build Function

Helper function) Unit Vectors ( \(\vec{u}_i\) ) – u_i()`

u_i <- function() {
  rbind(
    c(1, 0),
    c(-1, 0),
    c(0, 1),
    c(0, -1)
  )
}

Helper function) stop_condition()

stop_condition <- function(k, max_its, cost_history, tol = 1e-5) {
  
  # stop if max iterations reached
  if (k >= max_its) {
    return(TRUE)
  }
  
  # need at least 2 values to compare improvement
  if (k > 1) {
    improvement <- abs(cost_history[k] - cost_history[k - 1])
    if (improvement < tol) {
      return(TRUE)
    }
  }
  
  return(FALSE)
}

Helper function) diminishing_func()

diminishing_func <- function(directions, alpha_choice = 1, k) {
  if (alpha_choice == "diminishing") {
    return((1 / k) * directions)
  } else {
    return(alpha_choice * directions)
  }
}

Combined function) coordinate_search(g,alpha_choice,max_its,tol,w0)

coordinate_search <- function(
  g,
  alpha_choice = 1,
  max_its = 100,
  tol = 1e-5,
  w0 = c(0, 0)
) {
  
  g_expr <- function_expression(g)
  
  # storage
  weight_history <- matrix(NA, nrow = max_its + 1, ncol = 2)
  colnames(weight_history) <- c("w1", "w2")
  
  cost_history <- numeric(max_its + 1)
  
  # initialize
  w_current <- w0
  weight_history[1, ] <- w_current
  
  cost_history[1] <- eval(
    g_expr,
    list(w1 = w_current[1], w2 = w_current[2])
  )
  
  k <- 1
  
  repeat {
    
    # basis directions (±e1, ±e2)
    directions <- u_i()
    
    # apply step size (handles diminishing)
    direction_mat <- diminishing_func(directions, alpha_choice, k)
    
    # form candidates
    w_candidates <- sweep(direction_mat, 2, w_current, "+")
    
    # evaluate
    candidate_costs <- eval(
      g_expr,
      list(
        w1 = w_candidates[, 1],
        w2 = w_candidates[, 2]
      )
    )
    
    # pick best
    best_index <- which.min(candidate_costs)
    
    # update
    w_current <- w_candidates[best_index, ]
    
    # store
    k <- k + 1
    weight_history[k, ] <- w_current
    cost_history[k] <- candidate_costs[best_index]
    
    # stop condition
    if (stop_condition(k, max_its, cost_history, tol)) {
      break
    }
  }
  
  list(
    weight_history = weight_history[1:k, ],
    cost_history = cost_history[1:k]
  )
}

Personal Test Case )

result <- coordinate_search(
  g = "w1^2 + w2^2",
  alpha_choice = "diminishing",
  max_its = 1000, # not inc. initial value
  w0 = c(4, 4)
)

tail(result$weight_history) # we are heading down which is good
##                w1        w2
##  [995,] 0.2602721 0.2602721
##  [996,] 0.2592671 0.2602721
##  [997,] 0.2592671 0.2592681
##  [998,] 0.2592671 0.2582651
##  [999,] 0.2582651 0.2582651
## [1000,] 0.2582651 0.2572641
tail(result$cost_history) # nearing 0 which is good 
## [1] 0.1354831 0.1349610 0.1344394 0.1339203 0.1334017 0.1328856
plot(result$cost_history) 

B) Plot cost_history

The function we are minimizing is :

\[ x'Ax+c \text{ For :} A=I \\ \implies \\ x'x+c \]

PLT2

Link : https://www.desmos.com/3d/9kxi3uvf97

result <- coordinate_search(
  g = "w1^2 + w2^2 + 2",
  alpha_choice = 1, # unit vector
  max_its = 1000, # not inc. initial value
  w0 = c(4, 4)
)

tail(result$weight_history) # we are heading down which is good
##         w1 w2
##  [995,]  0  0
##  [996,]  1  0
##  [997,]  0  0
##  [998,]  1  0
##  [999,]  0  0
## [1000,]  1  0
tail(result$cost_history) # not great, looks like we missed 
## [1] 2 3 2 3 2 3
plot(result$cost_history) 

Cost against weight

plot(result$weight_history[,1], result$cost_history, main = "wt1")

plot(result$weight_history[,2], result$cost_history, main = "wt2")

  • Steep decline – good

  • we can see across time we are getting closer to the min

  • Notice however that this is a convex function so in the long, run we are guaranteed this

w_hist <- result$weight_history

plot(
  w_hist[,1], w_hist[,2],
  type = "o",
  pch = 19,
  col = "red",
  xlab = "w1",
  ylab = "w2",
  main = "Weight History (Search Path)"
)

# mark start and end
points(w_hist[1,1], w_hist[1,2], col = "green", pch = 19, cex = 1.5) # start
points(w_hist[nrow(w_hist),1], w_hist[nrow(w_hist),2], col = "blue", pch = 19, cex = 1.5) # end

legend("bottomright",
       legend = c("Path", "Start", "End"),
       col = c("red", "green", "blue"),
       pch = 19, 
       cex = .7)

  • this is silly
g_func <- function(w1, w2) {
  w1^2 + w2^2 + 2
}

x <- seq(-5, 5, length.out = 100)
y <- seq(-5, 5, length.out = 100)
z <- outer(x, y, g_func)

contour(x, y, z,
        xlab = "w1", ylab = "w2",
        main = "Search Path on Cost Surface")

lines(w_hist[,1], w_hist[,2], col = "red", lwd = 2)
points(w_hist[,1], w_hist[,2], col = "red", pch = 19)

C) "diminishing"

result <- coordinate_search(
  g = "w1^2 + w2^2 + 2",
  alpha_choice = "diminishing", # unit vector
  max_its = 1000, # not inc. initial value
  w0 = c(4, 4)
)

tail(result$weight_history) # we are heading down which is good
##                w1        w2
##  [995,] 0.2602721 0.2602721
##  [996,] 0.2592671 0.2602721
##  [997,] 0.2592671 0.2592681
##  [998,] 0.2592671 0.2582651
##  [999,] 0.2582651 0.2582651
## [1000,] 0.2582651 0.2572641
tail(result$cost_history) # much better -- but not best
## [1] 2.135483 2.134961 2.134439 2.133920 2.133402 2.132886
plot(result$cost_history) # beatiful

w_hist <- result$weight_history
plot(
  w_hist[,1], w_hist[,2],
  type = "o",
  pch = 19,
  col = "red",
  xlab = "w1",
  ylab = "w2",
  main = "Weight History (Search Path)"
)

# mark start and end
points(w_hist[1,1], w_hist[1,2], col = "green", pch = 19, cex = 1.5) # start
points(w_hist[nrow(w_hist),1], w_hist[nrow(w_hist),2], col = "blue", pch = 19, cex = 1.5) # end

legend("bottomright",
       legend = c("Path", "Start", "End"),
       col = c("red", "green", "blue"),
       pch = 19, 
       cex = .7)

g_func <- function(w1, w2) {
  w1^2 + w2^2 + 2
}

x <- seq(-5, 5, length.out = 100)
y <- seq(-5, 5, length.out = 100)
z <- outer(x, y, g_func)

contour(x, y, z,
        xlab = "w1", ylab = "w2",
        main = "Search Path on Cost Surface")

lines(w_hist[,1], w_hist[,2], col = "red", lwd = 2)
points(w_hist[,1], w_hist[,2], col = "red", pch = 19)

  • this is much better

d) Find Min of func

Mathematical Sol)

Okay so, imagine we didnt have the graph – how might we determine

We are provided :

\[ f(w_1, w_2)=c_1(w_1^2 + w_2^2)+c_2w_1w_2 \]

Where \(c_1 \ne c_2\)

Notice that our function is of the form :

\[ f(w_1,w_2)=x'Ax \]

(ie. quadratic form)

Recall :

  • Below I have provided the matrix A along with my work to get it

v1 <- c(.26,  -0.24)
v2 <- c( -0.24, .26)

A <- matrix(c(v1,v2), nrow=2, byrow=T)
cat("A Matrix =\n")
## A Matrix =
A
##       [,1]  [,2]
## [1,]  0.26 -0.24
## [2,] -0.24  0.26
cat("\n")
cat("eigen(A)$values\n")
## eigen(A)$values
eigen(A)$values
## [1] 0.50 0.02
  • notice both are positive eigens so its strictly convex

And recall \(\nabla f =\vec{0}\) means we can find gradient.

And recall \(\nabla (x'Ax) = (A+A')\vec{x}\) except recall \(A\) is symmetric so \(A=A'\) ; so :

\[ \nabla f=\nabla f=(x'Ax)=2A\vec{x} \]

B <- 2*A
B
##       [,1]  [,2]
## [1,]  0.52 -0.48
## [2,] -0.48  0.52

and we want to know : \(\nabla f =\vec{0}\) – so :

\[ B\vec{x}=\vec{0} \]

Which is the definition of the kernal, so in other words \(\text{Ker}(B)\)

\[ \text{Solve } Ax = 0, \quad A = \begin{bmatrix} 0.26 & -0.24 \\ -0.24 & 0.26 \end{bmatrix}. \]

Multiply by \(100\) to clear decimals: \[ \begin{bmatrix} 26 & -24 \\ -24 & 26 \end{bmatrix} \begin{bmatrix} w_1 \\ w_2 \end{bmatrix} = \begin{bmatrix} 0 \\ 0 \end{bmatrix}. \]

Row reduce: \[ \begin{bmatrix} 26 & -24 \\ -24 & 26 \end{bmatrix} \xrightarrow{R_2 \leftarrow R_2 + R_1} \begin{bmatrix} 26 & -24 \\ 2 & 2 \end{bmatrix} \xrightarrow{R_2 \leftarrow \tfrac{1}{2}R_2} \begin{bmatrix} 26 & -24 \\ 1 & 1 \end{bmatrix} \xrightarrow{R_1 \leftarrow R_1 - 26R_2} \begin{bmatrix} 0 & -50 \\ 1 & 1 \end{bmatrix} \]

Thus, \[ -50w_2 = 0 \Rightarrow w_2 = 0, \quad w_1 + w_2 = 0 \Rightarrow w_1 = 0. \]

Hence, \[ \ker(A) = \left\{ \begin{bmatrix}0 \\ 0\end{bmatrix} \right\}, \] so the unique global minimizer is \((0,0)\).

Verify Result :

library(MASS)
Null(B)
##     
## [1,]
## [2,]

or, we can just be lazy and check if our matrix is invertible and if so then we know the kernal space is just the “trivial solution” and therefore the min point is just (0,0)

det(B)
## [1] 0.04
  • inverse exists so we’re chilling – The min point is just (0,0)

Programmatic Sol)

  • Notice since its a strictly convex func, we know there is a global min and we dont have to worry about other stationary points. This is good.

Generate Plot Func)

analyze_results <- function(
  results,
  g_func = NULL,
  orientation = "bottomright",
  cex = 0.8
) {
  
  # Extract histories
  w_hist <- results$weight_history
  cost_hist <- results$cost_history
  
  # Print tail tables
  cat("---- Tail of weight history ----\n")
  print(tail(w_hist))
  
  cat("\n---- Tail of cost history ----\n")
  print(tail(cost_hist))
  
  # Plot 1: cost history
  plot(
    cost_hist,
    type = "p",
    pch = 19,
    col = "blue",
    xlab = "Iteration",
    ylab = "Cost",
    main = "Cost History"
  )
  
  # Plot 2: w1 vs cost
  plot(
    w_hist[,1], cost_hist,
    pch = 19,
    col = "purple",
    xlab = "w1",
    ylab = "Cost",
    main = "w1 vs Cost"
  )
  
  # Plot 3: w2 vs cost
  plot(
    w_hist[,2], cost_hist,
    pch = 19,
    col = "darkgreen",
    xlab = "w2",
    ylab = "Cost",
    main = "w2 vs Cost"
  )
  
  # Plot 4: contour + search path
  if (!is.null(g_func)) {
    
    # Convert character input to function
    if (is.character(g_func)) {
      expr <- parse(text = g_func)
      g_func <- function(w1, w2) {
        eval(expr, list(w1 = w1, w2 = w2))
      }
    }
    
    x <- seq(min(w_hist[, 1]) - 1, max(w_hist[, 1]) + 1, length.out = 100)
    y <- seq(min(w_hist[, 2]) - 1, max(w_hist[, 2]) + 1, length.out = 100)
    
    z <- outer(x, y, Vectorize(g_func))
    
    contour(
      x, y, z,
      xlab = "w1",
      ylab = "w2",
      main = "Search Path on Cost Surface"
    )
    
    # Overlay path
    lines(w_hist[, 1], w_hist[, 2], col = "red", lwd = 2)
    points(w_hist[, 1], w_hist[, 2], col = "red", pch = 19)
    
    # Start and end
    points(w_hist[1, 1], w_hist[1, 2], col = "green", pch = 19, cex = 1.5)
    points(w_hist[nrow(w_hist), 1], w_hist[nrow(w_hist), 2], col = "blue", pch = 19, cex = 1.5)
    
    legend(
      orientation,
      legend = c("Path", "Start", "End"),
      col = c("red", "green", "blue"),
      pch = 19,
      cex = cex
    )
  }
}

random_search() – unit length

set.seed(123)
result <- random_search(
  g = "0.26*(w1^2+w2^2)-.48*w1*w2",
  alpha_choice = 1,
  max_its = 1000, # not inc. initial value
  num_samples = 100, # try 100 directions 
  w0 = c(4, 4)
)

g <-  "0.26*(w1^2+w2^2)-.48*w1*w2"

plot(result$cost_history)

analyze_results(result, g_func="0.26*(w1^2+w2^2)-.48*w1*w2")
## ---- Tail of weight history ----
##                 w1         w2
##  [996,]  0.4667857  0.4554810
##  [997,] -0.2513049 -0.2404686
##  [998,]  0.4569310  0.4655072
##  [999,] -0.2521168 -0.2396532
## [1000,]  0.4366925  0.4852894
## [1001,] -0.2226319 -0.2665693
## 
## ---- Tail of cost history ----
## [1] 0.008537707 0.002447769 0.008527310 0.002457213 0.009090920 0.002875802

random_search() – diminishing length

set.seed(123)
result <- random_search(
  g = "0.26*(w1^2+w2^2)-.48*w1*w2",
  alpha_choice = "diminishing",
  max_its = 1000, # not inc. initial value
  num_samples = 100, # try 100 directions 
  w0 = c(4, 4)
)

g <-  "0.26*(w1^2+w2^2)-.48*w1*w2"

plot(result$cost_history)

analyze_results(result, g_func="0.26*(w1^2+w2^2)-.48*w1*w2")
## ---- Tail of weight history ----
##                   w1           w2
##  [996,] -0.005304488 -0.002771495
##  [997,]  0.005274850  0.003591949
##  [998,] -0.003747409 -0.002044737
##  [999,]  0.004297576  0.007728131
## [1000,] -0.002052746 -0.002510637
## [1001,]  0.006562357  0.002566765
## 
## ---- Tail of cost history ----
## [1] 2.256228e-06 1.494240e-06 1.060262e-06 4.388353e-06 2.606606e-07
## [6] 4.824597e-06

coordinate_search() – unit length

result <- coordinate_search(
  g = "0.26*(w1^2+w2^2)-.48*w1*w2",
  alpha_choice = 1,
  max_its = 1000,
  w0 = c(4, 4)
)
g <-  "0.26*(w1^2+w2^2)-.48*w1*w2"

plot(result$cost_history)

analyze_results(result, g_func="0.26*(w1^2+w2^2)-.48*w1*w2")
## ---- Tail of weight history ----
##         w1 w2
##  [995,]  0  0
##  [996,]  1  0
##  [997,]  0  0
##  [998,]  1  0
##  [999,]  0  0
## [1000,]  1  0
## 
## ---- Tail of cost history ----
## [1] 0.00 0.26 0.00 0.26 0.00 0.26

  • we end on a incorrect sol

coordinate_search() – diminishing length

result <- coordinate_search(
  g = "0.26*(w1^2+w2^2)-.48*w1*w2",
  alpha_choice = "diminishing",
  max_its = 1000,
  w0 = c(4, 4)
)

plot(result$cost_history)

tail(result$cost_history)
## [1] 0.002709663 0.002699462 0.002688787 0.002678646 0.002668034 0.002657953
analyze_results(result, g_func="0.26*(w1^2+w2^2)-.48*w1*w2")
## ---- Tail of weight history ----
##                w1        w2
##  [995,] 0.2602721 0.2602721
##  [996,] 0.2592671 0.2602721
##  [997,] 0.2592671 0.2592681
##  [998,] 0.2592671 0.2582651
##  [999,] 0.2582651 0.2582651
## [1000,] 0.2582651 0.2572641
## 
## ---- Tail of cost history ----
## [1] 0.002709663 0.002699462 0.002688787 0.002678646 0.002668034 0.002657953

Discussion)

  • The closest one to the min was diminishing, random_search with 100 directions. This is very overkill. Irregardless of algorithm, the best performance was that of “diminishing” – this stops the zig-zag problem. Depending on computational resources due to dimensionality of input function youd choose a different method. Since we are only in \(\mathbb{R}^{N X2}\) , i would go with random search. Seems to perform better but unfortunantly suffer from irregular performance – ie lots of variation. So for more stability perhaps coordinate search instead with “diminishing” alpha length.

Q3)

Brief layout :

Q4)

Recall :

b)

Therefore

\[ h(\eta)\le \lambda h(x_1)+(1-\lambda)h(x_2) \]

So its also convex