Part a: Functions

#mat2vec function
#input: symmetric p x p matrix
#output: the lower-triangular elements (by row, up to and including the
#diagonal), stacked into a p(p+1)/2 x 1 column-vector matrix
mat2vec <- function(mat) {
    row.list = list()
    for(i in 1:nrow(mat)) {
        row.list[[i]] = mat[i, 1:i] #row i up to the diagonal
    }
    return(as.matrix(unlist(row.list)))
}

#Example: entries encode their own row/column position for easy checking
#(play is not symmetric, but mat2vec only reads the lower triangle)
play = matrix(c(11, 12, 13, 21, 22, 23, 31, 32, 33), nrow=3, byrow=TRUE)
a = mat2vec(play) #should stack 11, 21, 22, 31, 32, 33

play2 = matrix(c(11, 12, 13, 14, 21, 22, 23, 24, 31, 32, 33, 34, 41, 42, 43, 44), nrow=4, byrow=TRUE)
b = mat2vec(play2)

#------------------------------------------------
##------------------------------------------------
#vec2mat function
#input: p(p+1)/2 x 1 column vector (in matrix form) of stacked lower-triangular elements
#output: the corresponding symmetric p x p matrix
vec2mat <- function(vec) {
    p = (-1 + sqrt(1 + 8*nrow(vec)))/2 #solve p(p+1)/2 = nrow(vec) for p
    row.list = list()
    mat = matrix(0,p,p)
    start = 1
    end = 1
    for(i in 1:nrow(mat)) {
        take = vec[start:end,]
        zeros = rep(0,length = (p - length(take)))
        row.list[[i]] = c(take, zeros)
        start = end + 1
        end = start + i 
    }
    sig.half = do.call(rbind, row.list) #lower-triangular half
    sig = sig.half + t(sig.half) - diag(diag(sig.half)) #reflect without doubling the diagonal
    return(sig) 
}

a2 = vec2mat(a)
b2 = vec2mat(b)
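
As a quick sanity check (an addition of mine, not part of the assignment output), the two functions should invert each other on a genuinely symmetric matrix:

#round-trip check on a small symmetric matrix S
S = matrix(c(2, 0.3, 0.3, 1), nrow = 2)
all.equal(vec2mat(mat2vec(S)), S) #expect TRUE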

#------------------------------------------------
##------------------------------------------------

#diff.sig function
#input: p x p matrix A
#output: column vector applying the -1/2 (diagonal) and -1 (off-diagonal)
#scalings to the stacked lower-triangular elements of A
diff.sig <- function(A) {
    row.list = list()
    row.list[[1]] = (-1/2)*A[1,1]
    for(i in 2:nrow(A)) {
        current.row = A[i,]
        row.list[[i]] = c(-1*current.row[1:(i-1)], (-1/2)*current.row[i])
    }
    return(as.matrix(unlist(row.list)))
}

diff.play = diff.sig(play) #spot check on the example matrix

#grad.sig: gradient of the log-likelihood wrt the unique elements of sigma
grad.sig <- function(n, siginv, C) {
    A = n*siginv - (siginv %*% (C %*% siginv))
    diff.sig(A) #apply the -1/2 and -1 scalings to get dl/dsigma_ii and dl/dsigma_ij
}
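
The scalings inside diff.sig follow from differentiating the log-likelihood with respect to the unique elements of \(\Sigma\). With \(A = n\Sigma^{-1} - \Sigma^{-1}C\Sigma^{-1}\) (the matrix formed in grad.sig),

\[
\frac{\partial \ell}{\partial \sigma_{ii}} = -\frac{1}{2}A_{ii},
\qquad
\frac{\partial \ell}{\partial \sigma_{ij}} = -A_{ij} \quad (i \neq j),
\]

where the off-diagonal factor is doubled because \(\sigma_{ij}\) appears twice in the symmetric matrix \(\Sigma\).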

#------------------------------------------------------
sqrtm <- function (A) {
  # Obtain the symmetric square root of a symmetric PSD matrix A
  a = eigen(A)
  sqm = a$vectors %*% diag(sqrt(a$values)) %*% t(a$vectors)
  (sqm + t(sqm))/2 # symmetrize against round-off and return explicitly
}
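
A quick check of sqrtm (again illustrative only, with a made-up test matrix): multiplying the symmetric square root by itself should recover the input.

A.test = matrix(c(2, 1, 1, 2), nrow = 2)
all.equal(sqrtm(A.test) %*% sqrtm(A.test), A.test) #expect TRUE for symmetric PSD input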


like.grad <- function (x,mu,sig) {
  # computes the likelihood and the gradient for multivariate normal
  # x is the n by p data matrix
  # mu is the mean (px1 matrix vector)
  # sig is the covariance (pxp matrix)
  a = dim(x)
  n = a[1]
  p = a[2]
  #check that sigma is feasible: all eigenvalues must be strictly positive
  e.values = eigen(sig)$values
  if (any(e.values <= 0)) { #not positive definite: likelihood and gradient undefined
    l = NA
    grad.ms = NA
    grad.norm = NA
  } else {
    siginv = solve(sig)
  
    C = matrix(0,p,p); # initializing sum of (xi-mu)(xi-mu)^T (pxp matrix)
    sxm = matrix(0,p,1) # initializing sum of xi-mu (px1 vector)
    for (i in 1:n){
        xm = x[i,] - mu
        sxm = sxm + xm
        C = C + xm %*% t(xm)
    }
    
    gradm = siginv %*% sxm #gradient wrt mu
    grads = grad.sig(n=n, siginv = siginv, C = C) #gradient wrt the unique elements of sigma
    grad.ms = matrix(c(gradm, grads), ncol=1) #stacked gradient; returned below
    
    # --- Note that trace(siginv %*% C) = sum(siginv*C) 
    log_det_sig <- log(det(sig))
    l = -(n*p*log(2*pi)+n*log_det_sig + sum(siginv * C ))/2
    grad.norm = norm(grad.ms, type='2')
   }
  list(l = l, grad.mu.sig = grad.ms, grad.norm = grad.norm)
}
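
If the mvtnorm package happens to be installed, the log-likelihood piece of like.grad can be cross-checked against dmvnorm. This is an optional sketch; x.test, mu.test, and sig.test are names I introduce just for the check.

#optional cross-check of the log-likelihood against mvtnorm::dmvnorm
if (requireNamespace("mvtnorm", quietly = TRUE)) {
    set.seed(1)
    x.test = matrix(rnorm(20), nrow = 10) #10 observations, p = 2
    mu.test = matrix(c(0, 0), ncol = 1)
    sig.test = diag(2)
    l1 = like.grad(x.test, mu.test, sig.test)$l
    l2 = sum(mvtnorm::dmvnorm(x.test, mean = c(mu.test), sigma = sig.test, log = TRUE))
    print(all.equal(l1, l2)) #expect TRUE
}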

#hessian.M function
#input: data x (n x p), mean mu (p x 1 matrix), covariance sigma (p x p)
#output: Hessian of the log-likelihood wrt (mu, unique elements of sigma)
hessian.M = function(x, mu, sigma){
  n = dim(x)[1]
  p = dim(x)[2]
  siginv = solve(sigma) #inverse of sigma
  d2mu2 = -n*siginv #mu-mu block of the Hessian
  shess = matrix(0, .5*p*(p+1), .5*p*(p+1)) #sigma-sigma block
  C = matrix(0, p, p)
  for (i in 1:n){
    xm = x[i,] -mu
    C=C + xm %*% t(xm)
  }
  B = (n/2)*siginv - siginv %*% C %*% siginv #building block for the sigma-sigma entries
  rwct=0
  clct=0
  for(i in 1:p){
    for(j in 1:i){
      rwct=rwct + 1 
      clct = 0 
      for(k in 1:p){
        for(l in 1:k){
          clct = clct + 1
          if(i==j && k==l){ #diagonal vs diagonal
            shess[rwct,clct] = B[l,i]*siginv[j,k]
          } else if(i!=j && k==l){ #off-diagonal vs diagonal
            shess[rwct,clct] = B[l,i]*siginv[j,k] + B[k,j]*siginv[i,l]
          } else if(i==j && k!=l){ #diagonal vs off-diagonal
            shess[rwct,clct] = B[l,i]*siginv[j,k] + B[k,i]*siginv[j,l]
          } else { #off-diagonal vs off-diagonal
            shess[rwct,clct] = B[l,i]*siginv[j,k] + B[k,j]*siginv[i,l] +
              B[k,i]*siginv[j,l] + B[j,l]*siginv[i,k]
          }
        }
      }
    }
  }
  # deal with mu and sigma
  musig=matrix(0,p,.5*p*(p+1)) 
  sxm = matrix(0,p,1)
  for (i in 1:n){
    xm = x[i,] -mu
    sxm = sxm + xm
  }
  D = -siginv %*% sxm #building block for the mu-sigma cross block
  rwct=0
  clct=0
  for(i in 1:p){
    rwct=rwct + 1 
    clct = 0 
    for(k in 1:p){
      for(l in 1:k){
        clct = clct + 1
        if(k==l){
          musig[rwct,clct] = D[l,]*siginv[i,k]
        } else { #k != l
          musig[rwct,clct] = D[l,]*siginv[i,k] + D[k,]*siginv[i,l]
        }
      }
    }
  }
  hess=cbind(rbind(d2mu2,t(musig)),rbind(musig,shess))
  hess=.5*(hess+t(hess))
  return(hess)
}
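
Because an analytic Hessian is easy to get wrong, a central-difference check against the gradient from like.grad is a useful sketch. num.hessian is a helper name I introduce here; it is not part of the required solution.

#finite-difference Hessian: numerically differentiate the analytic gradient
num.hessian <- function(x, mu, sig, h = 1e-5) {
    p = ncol(x)
    theta = c(mu, mat2vec(sig)) #stacked parameter vector
    q = length(theta)
    H = matrix(0, q, q)
    for (k in 1:q) {
        tp = theta; tp[k] = tp[k] + h #perturb parameter k up
        tm = theta; tm[k] = tm[k] - h #and down
        gp = like.grad(x, matrix(tp[1:p], ncol = 1),
                       vec2mat(matrix(tp[-(1:p)], ncol = 1)))$grad.mu.sig
        gm = like.grad(x, matrix(tm[1:p], ncol = 1),
                       vec2mat(matrix(tm[-(1:p)], ncol = 1)))$grad.mu.sig
        H[, k] = (gp - gm) / (2 * h) #central difference
    }
    (H + t(H)) / 2 #symmetrize
}
#e.g. max(abs(num.hessian(datan, mu0, sig0) - hessian.M(datan, mu0, sig0)))
#should be near zero for any feasible (mu0, sig0)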



steep.ascent <- function(mu, sig, datan, maxit=500, tolerr=10^-6, tolgrad=10^-5) {
    header = paste0("Iteration", "        Halving", "        log-likelihood", "             ||gradient||")
    print(header)
    
    
    n = nrow(datan) # number of observations
    p = ncol(datan)
    
    for(it in 1:maxit) {
        
        theta.n = matrix(c(mu, mat2vec(sig)), ncol=1)
        a = like.grad(x=datan,mu=mu, sig=sig)
        
        if (it %in% c(1, 2, 499, 500, 611, 612)) { #print selected iterations only, to keep output short
            print("-----------------------------------------------------------------")
            print(sprintf('%2.0f                 --          %.1e               %.1e', it,  a$l, a$grad.norm))
        }
        
        
        #move in the direction. update theta.n1
        dir = a$grad.mu.sig
        theta.next = theta.n + dir
        mu1 = theta.next[1:p,]
        sig1 = theta.next[(p+1):nrow(theta.next), ]     
        
        #calculate likelihood at this new point. see if you improved
        a.temp = like.grad(x=datan, mu=matrix(mu1,ncol=1), sig=vec2mat(matrix(sig1,ncol=1))) 
        if (is.na(a.temp$l)) {
            temp.like = -Inf
        } else {
            temp.like = a.temp$l
        }   
        halve = 0
        
        if (it %in% c(1, 2, 499, 500, 611, 612)) {
            print(sprintf('%2.0f              %2.0f           %.1e                  %.1e', it, halve, a.temp$l, a.temp$grad.norm))
        }
        

        while(temp.like < a$l & halve <= 20) {
            halve = halve + 1
            theta.next = theta.n + dir/(2^halve)
            mu1 = theta.next[1:p,]
            sig1 = theta.next[(p+1):nrow(theta.next), ]
            
            a.temp = like.grad(x=datan, mu=matrix(mu1, ncol=1), sig=vec2mat(matrix(sig1,ncol=1)))
            
            if (it %in% c(1, 2, 499, 500, 611, 612)) {
                print(sprintf('%2.0f              %2.0f           %.1e               %.1e', it, halve, a.temp$l, a.temp$grad.norm))
            }
            
            if (is.na(a.temp$l)) {
                temp.like = -Inf
            } else {
                temp.like = a.temp$l
            }       
        } #end while loop
        theta.n1 = theta.next
        
        if (halve >= 21) {
            print('Step-halve failed')
        }
        
        #update variables for the next iteration (these also become the
        #returned estimates, so do this before checking convergence)
        mu = matrix(theta.n1[1:p, ], ncol=1) #need in matrix vector form
        sig = vec2mat(matrix(theta.n1[(p+1):nrow(theta.n1), ], ncol=1)) #need in matrix form

        #convergence criteria: modified relative error and gradient norm
        c1 = pmax(abs(theta.n1), 1) #elementwise guard against tiny denominators
        c2 = abs(theta.n1 - theta.n)
        mre = max(c2/c1)
        grad.norm = a.temp$grad.norm

        if (mre < tolerr & grad.norm < tolgrad) {
            print("We reached convergence in steep ascent")
            break
        }
    }
    return(list(l = a.temp$l, mu=mu, sig = sig, iteration = it))
}

Part b: Generate Data

gen <- function(n, p, mu, sig, seed) {
    #---- Generate data from a p-variate normal with mean mu and covariance sigma
    # mu should be a p by 1 vector; sigma should be a positive definite p by p matrix
    # Seed can be optionally set for the random number generator
    set.seed(seed)
    # generate data from normal mu sigma
    x = matrix(rnorm(n * p), n, p)
    datan = x %*% sqrtm(sig) + matrix(mu, n, p, byrow = TRUE)
    datan
}

MU = matrix(c(-1, 1, 2), ncol = 1)
SIG = matrix(c(1, 0.7, 0.7, 0.7, 1, 0.7, 0.7, 0.7, 1), byrow = TRUE, nrow = 3)
data = gen(200, 3, MU, SIG, seed = 2021)
print(data[1:3, ])
##            [,1]       [,2]     [,3]
## [1,] -1.2429394 0.97212678 1.454665
## [2,] -0.8819003 0.07977828 1.945934
## [3,] -0.7940660 0.55001519 2.313527
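
Because sqrtm(sig) is symmetric, the rows of the generated matrix have covariance \(\Sigma^{1/2}\Sigma^{1/2} = \Sigma\). A rough sanity check (output omitted here) is to compare the sample moments with the targets:

#sample moments should be close to MU and SIG at n = 200
round(colMeans(data), 2)
round(cov(data), 2)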

Part c: Steepest Ascent

mu.init = matrix(c(0, 0, 0), ncol = 1)
sig.init = diag(3)
y = steep.ascent(mu = mu.init, sig = sig.init, datan = data, maxit = 700)
## [1] "Iteration        Halving        log-likelihood             ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1                 --          -1.5e+03               9.1e+02"
## [1] " 1               0           NA                  NA"
## [1] " 1               1           NA               NA"
## [1] " 1               2           NA               NA"
## [1] " 1               3           NA               NA"
## [1] " 1               4           NA               NA"
## [1] " 1               5           NA               NA"
## [1] " 1               6           NA               NA"
## [1] " 1               7           NA               NA"
## [1] " 1               8           NA               NA"
## [1] " 1               9           -9.4e+02               2.0e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 2                 --          -9.4e+02               2.0e+02"
## [1] " 2               0           NA                  NA"
## [1] " 2               1           NA               NA"
## [1] " 2               2           NA               NA"
## [1] " 2               3           NA               NA"
## [1] " 2               4           NA               NA"
## [1] " 2               5           NA               NA"
## [1] " 2               6           NA               NA"
## [1] " 2               7           -1.1e+03               1.7e+03"
## [1] " 2               8           -8.8e+02               2.1e+02"
## [1] "-----------------------------------------------------------------"
## [1] "499                 --          -7.1e+02               1.4e-04"
## [1] "499               0           -7.1e+02                  2.1e-01"
## [1] "499               1           -7.1e+02               1.0e-01"
## [1] "499               2           -7.1e+02               5.2e-02"
## [1] "499               3           -7.1e+02               2.6e-02"
## [1] "499               4           -7.1e+02               1.3e-02"
## [1] "499               5           -7.1e+02               6.5e-03"
## [1] "499               6           -7.1e+02               3.2e-03"
## [1] "499               7           -7.1e+02               1.5e-03"
## [1] "499               8           -7.1e+02               7.2e-04"
## [1] "499               9           -7.1e+02               3.1e-04"
## [1] "499              10           -7.1e+02               1.3e-04"
## [1] "-----------------------------------------------------------------"
## [1] "500                 --          -7.1e+02               1.3e-04"
## [1] "500               0           -7.1e+02                  1.9e-01"
## [1] "500               1           -7.1e+02               9.5e-02"
## [1] "500               2           -7.1e+02               4.8e-02"
## [1] "500               3           -7.1e+02               2.4e-02"
## [1] "500               4           -7.1e+02               1.2e-02"
## [1] "500               5           -7.1e+02               5.9e-03"
## [1] "500               6           -7.1e+02               2.9e-03"
## [1] "500               7           -7.1e+02               1.4e-03"
## [1] "500               8           -7.1e+02               6.5e-04"
## [1] "500               9           -7.1e+02               2.9e-04"
## [1] "500              10           -7.1e+02               1.2e-04"
## [1] "-----------------------------------------------------------------"
## [1] "611                 --          -7.1e+02               1.8e-05"
## [1] "611               0           -7.1e+02                  3.4e-02"
## [1] "611               1           -7.1e+02               1.7e-02"
## [1] "611               2           -7.1e+02               8.4e-03"
## [1] "611               3           -7.1e+02               4.2e-03"
## [1] "611               4           -7.1e+02               2.1e-03"
## [1] "611               5           -7.1e+02               1.0e-03"
## [1] "611               6           -7.1e+02               5.1e-04"
## [1] "611               7           -7.1e+02               2.4e-04"
## [1] "611               8           -7.1e+02               1.1e-04"
## [1] "611               9           -7.1e+02               4.9e-05"
## [1] "611              10           -7.1e+02               1.6e-05"
## [1] "-----------------------------------------------------------------"
## [1] "612                 --          -7.1e+02               1.6e-05"
## [1] "612               0           -7.1e+02                  3.0e-02"
## [1] "612               1           -7.1e+02               1.5e-02"
## [1] "612               2           -7.1e+02               7.6e-03"
## [1] "612               3           -7.1e+02               3.8e-03"
## [1] "612               4           -7.1e+02               1.9e-03"
## [1] "612               5           -7.1e+02               9.4e-04"
## [1] "612               6           -7.1e+02               4.6e-04"
## [1] "612               7           -7.1e+02               2.2e-04"
## [1] "612               8           -7.1e+02               1.0e-04"
## [1] "612               9           -7.1e+02               4.4e-05"
## [1] "612              10           -7.1e+02               1.5e-05"
## [1] "612              11           -7.1e+02               5.2e-06"
y
## $l
## [1] -711.983
## 
## $mu
##            [,1]
## [1,] -1.0697892
## [2,]  0.9509498
## [3,]  2.0187316
## 
## $sig
##           [,1]      [,2]      [,3]
## [1,] 1.0461949 0.7133607 0.7354662
## [2,] 0.7133607 1.0132409 0.7789205
## [3,] 0.7354662 0.7789205 1.1133201
## 
## $iteration
## [1] 612

I believe my algorithm produces the correct estimates for \(\mu\) and \(\Sigma\); they can be seen in the list elements above. The NA rows in the trace are trial steps at which the updated \(\Sigma\) was not positive definite, so the likelihood is undefined there and the step-halving loop shrinks the step until it is. However, with tolgrad set at \(10^{-5}\), steepest ascent does not converge within 500 iterations: a first run with maxit = 500 exhausted all 500 iterations without meeting the convergence criteria. After raising maxit to 700, steepest ascent converges in 612 iterations.

Part d: Newton's Method

Newton <- function(mu, sig, datan, maxit=500, tolerr=10^-7, tolgrad=10^-7) {
  header = paste0("Iteration", "        Halving", "        log-likelihood", "             ||gradient||")
  print(header)
  
  
  n = nrow(datan) # number of observations
  p = ncol(datan)
  
  for(it in 1:maxit) {
    
    theta.n = matrix(c(mu, mat2vec(sig)), ncol=1)
    a = like.grad(x=datan,mu=mu, sig=sig)
    

    print("-----------------------------------------------------------------")
    print(sprintf('%2.0f                 --          %.1e               %.1e', it,  a$l, a$grad.norm))
  
    
    
    #move in the Newton direction. update theta.n1
    hess = hessian.M(datan, mu, sig)
    hess.inverse = solve(hess)
    dir = -1 * hess.inverse %*% a$grad.mu.sig #Newton step: -H^{-1} * gradient
    theta.next = theta.n + dir
    mu1 = theta.next[1:p,]
    sig1 = theta.next[(p+1):nrow(theta.next), ]     
    
    #calculate likelihood at this new point. see if you improved
    a.temp = like.grad(x=datan, mu=matrix(mu1,ncol=1), sig=vec2mat(matrix(sig1,ncol=1))) 
    if (is.na(a.temp$l)) {
      temp.like = -Inf
    } else {
      temp.like = a.temp$l
    }   
    halve = 0
    
    
    print(sprintf('%2.0f              %2.0f           %.1e                  %.1e', it, halve, a.temp$l, a.temp$grad.norm))

    
    
    while(temp.like < a$l & halve <= 20) {
      halve = halve + 1
      theta.next = theta.n + dir/(2^halve)
      mu1 = theta.next[1:p,]
      sig1 = theta.next[(p+1):nrow(theta.next), ]
      
      a.temp = like.grad(x=datan, mu=matrix(mu1, ncol=1), sig=vec2mat(matrix(sig1,ncol=1)))
      
  
      print(sprintf('%2.0f              %2.0f           %.1e               %.1e', it, halve, a.temp$l, a.temp$grad.norm))

      
      if (is.na(a.temp$l)) {
        temp.like = -Inf
      } else {
        temp.like = a.temp$l
      }     
    } #end while loop
    theta.n1 = theta.next
    
    if (halve >= 21) {
      print('Step-halve failed')
    }
    
    #update variables for the next iteration (these also become the
    #returned estimates, so do this before checking convergence)
    mu = matrix(theta.n1[1:p, ], ncol=1) #need in matrix vector form
    sig = vec2mat(matrix(theta.n1[(p+1):nrow(theta.n1), ], ncol=1)) #need in matrix form

    #convergence criteria: modified relative error and gradient norm
    c1 = pmax(abs(theta.n1), 1) #elementwise guard against tiny denominators
    c2 = abs(theta.n1 - theta.n)
    mre = max(c2/c1)
    grad.norm = a.temp$grad.norm

    if (mre < tolerr & grad.norm < tolgrad) {
      print("We reached convergence in Newton's method")
      break
    }
  }
  return(list(l = a.temp$l, mu=mu, sig = sig, iteration = it))
}


mu.init = matrix(c(-1.5, 1.5, 2.3), ncol=1)
sig.init = matrix(c(1, 0.5, 0.5, 0.5, 1, 0.5, 0.5, 0.5, 1), byrow=TRUE, nrow=3)
y2 = Newton(mu = mu.init, sig = sig.init, datan = data)
## [1] "Iteration        Halving        log-likelihood             ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1                 --          -8.4e+02               3.6e+02"
## [1] " 1               0           NA                  NA"
## [1] " 1               1           NA               NA"
## [1] " 1               2           NA               NA"
## [1] " 1               3           -8.4e+02               3.0e+03"
## [1] " 1               4           -7.8e+02               3.5e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 2                 --          -7.8e+02               3.5e+02"
## [1] " 2               0           -2.6e+03                  1.2e+05"
## [1] " 2               1           -7.3e+02               3.7e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 3                 --          -7.3e+02               3.7e+02"
## [1] " 3               0           -7.2e+02                  1.2e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 4                 --          -7.2e+02               1.2e+02"
## [1] " 4               0           -7.1e+02                  3.2e+01"
## [1] "-----------------------------------------------------------------"
## [1] " 5                 --          -7.1e+02               3.2e+01"
## [1] " 5               0           -7.1e+02                  4.1e+00"
## [1] "-----------------------------------------------------------------"
## [1] " 6                 --          -7.1e+02               4.1e+00"
## [1] " 6               0           -7.1e+02                  8.9e-02"
## [1] "-----------------------------------------------------------------"
## [1] " 7                 --          -7.1e+02               8.9e-02"
## [1] " 7               0           -7.1e+02                  4.5e-05"
## [1] "-----------------------------------------------------------------"
## [1] " 8                 --          -7.1e+02               4.5e-05"
## [1] " 8               0           -7.1e+02                  1.0e-11"
y2
## $l
## [1] -711.983
## 
## $mu
##            [,1]
## [1,] -1.0697892
## [2,]  0.9509498
## [3,]  2.0187316
## 
## $sig
##           [,1]      [,2]      [,3]
## [1,] 1.0461948 0.7133607 0.7354661
## [2,] 0.7133607 1.0132407 0.7789204
## [3,] 0.7354661 0.7789204 1.1133200
## 
## $iteration
## [1] 8

Newton's method is dramatically faster than steepest ascent: starting from a point near the truth, it converges in only 8 iterations (versus 612 for steepest ascent) and reaches the same estimates and log-likelihood, as shown in the R console printout above.
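
As a final sanity check (a sketch of mine, not required by the assignment), both routines can be compared with the closed-form MLE, \(\hat{\mu} = \bar{x}\) and \(\hat{\Sigma} = \frac{1}{n}\sum_{i=1}^{n}(x_i - \bar{x})(x_i - \bar{x})^{T}\); the iterative estimates above should agree with these to several decimal places.

#closed-form MLE for comparison with the iterative estimates
n = nrow(data)
mu.mle = colMeans(data) #sample mean
sig.mle = cov(data) * (n - 1) / n #rescale the unbiased covariance to the MLE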