#mat2vec function
#input: symmetric matrix
#output: the entries of each row up to and including the diagonal,
#        stacked into a single column vector (a one-column matrix)
mat2vec <- function(mat) {
  row.list = list()
  for (i in 1:nrow(mat)) {
    row.list[[i]] = mat[i, 1:i]
  }
  return(as.matrix(unlist(row.list)))
}
#Example (play and play2 are not symmetric; their entries encode their own
#row-column positions so you can see exactly which elements are kept):
play = matrix(c(11, 12, 13, 21, 22, 23, 31, 32, 33), nrow = 3, byrow = TRUE)
a = mat2vec(play)
play2 = matrix(c(11, 12, 13, 14, 21, 22, 23, 24, 31, 32, 33, 34, 41, 42, 43, 44), nrow = 4, byrow = TRUE)
b = mat2vec(play2)
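#quick hand check: reading the rows of play and play2 up to the diagonal,
#a should be the column vector (11, 21, 22, 31, 32, 33) and
#b should be (11, 21, 22, 31, 32, 33, 41, 42, 43, 44)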
#------------------------------------------------
#vec2mat function
#input: column vector (in matrix form) holding a lower triangle by row
#output: the corresponding symmetric p x p matrix
vec2mat <- function(vec) {
  #recover p by solving p(p+1)/2 = nrow(vec)
  p = (-1 + sqrt(1 + 8 * nrow(vec))) / 2
  row.list = list()
  start = 1
  end = 1
  for (i in 1:p) {
    take = vec[start:end, ]
    zeros = rep(0, length = p - length(take))
    row.list[[i]] = c(take, zeros)
    start = end + 1
    end = start + i
  }
  sig.half = do.call(rbind, row.list)
  #reflect the lower triangle across the diagonal; the diagonal is counted
  #twice by the sum, so subtract it once
  sig = sig.half + t(sig.half) - diag(diag(sig.half))
  return(sig)
}
a2 = vec2mat(a)
b2 = vec2mat(b)
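#round-trip check: for a genuinely symmetric matrix, vec2mat() should invert
#mat2vec() exactly (S below is my own small example)
S = matrix(c(2, 0.5, 0.5, 1), nrow = 2)
all.equal(vec2mat(mat2vec(S)), S) #expect TRUE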
#------------------------------------------------
#diff.sig: stack the lower triangle of A into a column vector, scaling the
#off-diagonal entries by -1 and the diagonal entries by -1/2
diff.sig <- function(A) {
  row.list = list()
  row.list[[1]] = (-1/2) * A[1, 1]
  for (i in 2:nrow(A)) {
    current.row = A[i, ]
    row.list[[i]] = c(-1 * current.row[1:(i-1)], (-1/2) * current.row[i])
  }
  return(as.matrix(unlist(row.list)))
}
y = diff.sig(play)
grad.sig <- function(n, siginv, C) {
  #A = n*Sigma^{-1} - Sigma^{-1} C Sigma^{-1}; diff.sig() rescales its lower
  #triangle into the gradient with respect to the individual sigma_ii and sigma_ij's
  A = n * siginv - (siginv %*% (C %*% siginv))
  diff.sig(A)
}
#------------------------------------------------------
sqrtm <- function(A) {
  # Obtain the matrix square root of a symmetric matrix A via its eigendecomposition
  a = eigen(A)
  sqm = a$vectors %*% diag(sqrt(a$values)) %*% t(a$vectors)
  (sqm + t(sqm)) / 2 # symmetrize to remove floating-point asymmetry
}
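#quick check: the square root times itself should reproduce the original
#matrix (A.test is my own small positive definite example)
A.test = matrix(c(2, 1, 1, 2), nrow = 2)
all.equal(sqrtm(A.test) %*% sqrtm(A.test), A.test) #expect TRUE up to rounding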
like.grad <- function(x, mu, sig) {
  # computes the log-likelihood and the gradient for the multivariate normal
  # x is the n by p data matrix
  # mu is the mean (p x 1 matrix vector)
  # sig is the covariance (p x p matrix)
  a = dim(x)
  n = a[1]
  p = a[2]
  #check that sigma is feasible: all eigenvalues of sigma must be positive
  e.values = eigen(sig)$values
  if (any(e.values <= 0)) {
    l = NA
    grad.ms = NA
    grad.norm = NA
  } else {
    siginv = solve(sig)
    C = matrix(0, p, p)   # running sum of (xi-mu)(xi-mu)^T (p x p matrix)
    sxm = matrix(0, p, 1) # running sum of xi-mu (p x 1 vector)
    for (i in 1:n) {
      xm = x[i, ] - mu
      sxm = sxm + xm
      C = C + xm %*% t(xm)
    }
    gradm = siginv %*% sxm #gradient wrt mu
    grads = grad.sig(n = n, siginv = siginv, C = C)
    grad.ms = matrix(c(gradm, grads), ncol = 1) #stacked gradient; returned below
    # --- Note that trace(siginv %*% C) = sum(siginv * C)
    log_det_sig = log(det(sig))
    l = -(n * p * log(2 * pi) + n * log_det_sig + sum(siginv * C)) / 2
    grad.norm = norm(grad.ms, type = '2')
  }
  list(l = l, grad.mu.sig = grad.ms, grad.norm = grad.norm)
}
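#sketch of a finite-difference check (fd.grad() is my own helper, not part of
#the algorithms below): the analytic gradient from like.grad() should agree
#with central differences of the log-likelihood taken in the
#(mu, lower-triangle-of-sigma) parametrization
fd.grad <- function(x, mu, sig, h = 1e-6) {
  p = ncol(x)
  theta = c(mu, mat2vec(sig))
  ll = function(th) {
    like.grad(x, matrix(th[1:p], ncol = 1),
              vec2mat(matrix(th[-(1:p)], ncol = 1)))$l
  }
  sapply(seq_along(theta), function(k) {
    e = replace(numeric(length(theta)), k, h)
    (ll(theta + e) - ll(theta - e)) / (2 * h)
  })
}
#example on a small synthetic data set; the two columns should agree
#to several decimal places
set.seed(1)
x.test = matrix(rnorm(20), nrow = 10)
cbind(analytic = as.numeric(like.grad(x.test, matrix(c(0, 0), ncol = 1), diag(2))$grad.mu.sig),
      numeric  = fd.grad(x.test, matrix(c(0, 0), ncol = 1), diag(2)))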
hessian.M = function(x, mu, sigma) {
  n = dim(x)[1]
  p = dim(x)[2]
  siginv = solve(sigma) #inverse of sigma
  #mu-mu block
  d2mu2 = -n * siginv
  #sigma-sigma block (shess), indexed over the unique lower-triangle entries
  shess = matrix(0, .5 * p * (p + 1), .5 * p * (p + 1))
  C = matrix(0, p, p)
  for (i in 1:n) {
    xm = x[i, ] - mu
    C = C + xm %*% t(xm)
  }
  B = (n / 2) * siginv - siginv %*% C %*% siginv
  rwct = 0
  for (i in 1:p) {
    for (j in 1:i) {
      rwct = rwct + 1
      clct = 0
      for (k in 1:p) {
        for (l in 1:k) {
          clct = clct + 1
          if (i == j && k == l) {
            shess[rwct, clct] = B[l, i] * siginv[j, k]
          } else if (i != j && k == l) {
            shess[rwct, clct] = B[l, i] * siginv[j, k] + B[k, j] * siginv[i, l]
          } else if (i == j && k != l) {
            shess[rwct, clct] = B[l, i] * siginv[j, k] + B[k, i] * siginv[j, l]
          } else {
            shess[rwct, clct] = B[l, i] * siginv[j, k] + B[k, j] * siginv[i, l] +
              B[k, i] * siginv[j, l] + B[j, l] * siginv[i, k]
          }
        }
      }
    }
  }
  #mu-sigma block
  musig = matrix(0, p, .5 * p * (p + 1))
  sxm = matrix(0, p, 1)
  for (i in 1:n) {
    xm = x[i, ] - mu
    sxm = sxm + xm
  }
  D = -siginv %*% sxm
  rwct = 0
  for (i in 1:p) {
    rwct = rwct + 1
    clct = 0
    for (k in 1:p) {
      for (l in 1:k) {
        clct = clct + 1
        if (k == l) {
          musig[rwct, clct] = D[l, ] * siginv[i, k]
        } else {
          musig[rwct, clct] = D[l, ] * siginv[i, k] + D[k, ] * siginv[i, l]
        }
      }
    }
  }
  #assemble the full Hessian from the blocks and symmetrize
  hess = cbind(rbind(d2mu2, t(musig)), rbind(musig, shess))
  hess = .5 * (hess + t(hess))
  return(hess)
}
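#analogous finite-difference sketch for the Hessian (fd.hessian() is my own
#helper): central differences of the analytic gradient, column by column,
#should approximately reproduce hessian.M()
fd.hessian <- function(x, mu, sig, h = 1e-5) {
  p = ncol(x)
  theta = c(mu, mat2vec(sig))
  g = function(th) {
    like.grad(x, matrix(th[1:p], ncol = 1),
              vec2mat(matrix(th[-(1:p)], ncol = 1)))$grad.mu.sig
  }
  H = sapply(seq_along(theta), function(k) {
    e = replace(numeric(length(theta)), k, h)
    as.numeric(g(theta + e) - g(theta - e)) / (2 * h)
  })
  (H + t(H)) / 2 #symmetrize, as hessian.M() does
}
#e.g. max(abs(hessian.M(x, mu, sig) - fd.hessian(x, mu, sig))) should be small
#at any feasible (mu, sig)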
steep.ascent <- function(mu, sig, datan, maxit=500, tolerr=10^-6, tolgrad=10^-5) {
  header = paste0("Iteration", " Halving", " log-likelihood", " ||gradient||")
  print(header)
  n = nrow(datan) # number of observations
  p = ncol(datan)
  show.it = c(1, 2, 499, 500, 611, 612) #print only these iterations to keep the log short
  for (it in 1:maxit) {
    theta.n = matrix(c(mu, mat2vec(sig)), ncol = 1)
    a = like.grad(x = datan, mu = mu, sig = sig)
    if (it %in% show.it) {
      print("-----------------------------------------------------------------")
      print(sprintf('%2.0f -- %.1e %.1e', it, a$l, a$grad.norm))
    }
    #move in the gradient direction; update theta.next
    dir = a$grad.mu.sig
    theta.next = theta.n + dir
    mu1 = theta.next[1:p, ]
    sig1 = theta.next[(p+1):nrow(theta.next), ]
    #calculate the likelihood at this new point to see if it improved
    a.temp = like.grad(x = datan, mu = matrix(mu1, ncol = 1), sig = vec2mat(matrix(sig1, ncol = 1)))
    if (is.na(a.temp$l)) {
      temp.like = -Inf #infeasible sigma: force step-halving
    } else {
      temp.like = a.temp$l
    }
    halve = 0
    if (it %in% show.it) {
      print(sprintf('%2.0f %2.0f %.1e %.1e', it, halve, a.temp$l, a.temp$grad.norm))
    }
    while (temp.like < a$l & halve <= 20) {
      halve = halve + 1
      theta.next = theta.n + dir / (2^halve)
      mu1 = theta.next[1:p, ]
      sig1 = theta.next[(p+1):nrow(theta.next), ]
      a.temp = like.grad(x = datan, mu = matrix(mu1, ncol = 1), sig = vec2mat(matrix(sig1, ncol = 1)))
      if (it %in% show.it) {
        print(sprintf('%2.0f %2.0f %.1e %.1e', it, halve, a.temp$l, a.temp$grad.norm))
      }
      if (is.na(a.temp$l)) {
        temp.like = -Inf
      } else {
        temp.like = a.temp$l
      }
    } #end while loop
    theta.n1 = theta.next
    if (halve >= 21) {
      print('Step-halve failed')
    }
    #convergence criteria: modified relative error and gradient norm
    c1 = abs(max(1, theta.n1))
    c2 = abs(theta.n1 - theta.n)
    mre = max(c2 / c1)
    grad.norm = a.temp$grad.norm
    if (mre < tolerr & grad.norm < tolgrad) {
      print("We reached convergence in steep ascent")
      break
    }
    #update variables for the next iteration
    mu1 = theta.n1[1:p, ]
    sig1 = theta.n1[(p+1):nrow(theta.n1), ]
    mu = matrix(mu1, ncol = 1)            #need mu in matrix-vector form
    sig = vec2mat(matrix(sig1, ncol = 1)) #need sig in matrix form
  }
  return(list(l = a.temp$l, mu = mu, sig = sig, iteration = it))
}
gen <- function(n, p, mu, sig, seed) {
  #---- Generate data from a p-variate normal with mean mu and covariance sig
  # mu should be a p by 1 vector; sig should be a positive definite p by p matrix
  # seed is passed to the random number generator for reproducibility
  set.seed(seed)
  # standard normal draws have identity covariance; multiplying by the matrix
  # square root of sig gives covariance sig, then shift each row by mu
  x = matrix(rnorm(n * p), n, p)
  datan = x %*% sqrtm(sig) + matrix(mu, n, p, byrow = TRUE)
  datan
}
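#moment check: with a large n, the sample moments of the generated data should
#approximate the requested mu and sig (my own small example)
check = gen(n = 1e5, p = 2, mu = c(0, 1), sig = matrix(c(1, 0.4, 0.4, 1), nrow = 2), seed = 1)
colMeans(check) #approximately (0, 1)
cov(check)      #approximately the requested sig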
MU = matrix(c(-1, 1, 2), ncol = 1)
SIG = matrix(c(1, 0.7, 0.7, 0.7, 1, 0.7, 0.7, 0.7, 1), byrow = TRUE, nrow = 3)
data = gen(200, 3, MU, SIG, seed = 2021)
print(data[1:3, ])
## [,1] [,2] [,3]
## [1,] -1.2429394 0.97212678 1.454665
## [2,] -0.8819003 0.07977828 1.945934
## [3,] -0.7940660 0.55001519 2.313527
mu.init = matrix(c(0, 0, 0), ncol = 1)
sig.init = diag(3)
y = steep.ascent(mu = mu.init, sig = sig.init, datan = data, maxit = 700)
## [1] "Iteration Halving log-likelihood ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1 -- -1.5e+03 9.1e+02"
## [1] " 1 0 NA NA"
## [1] " 1 1 NA NA"
## [1] " 1 2 NA NA"
## [1] " 1 3 NA NA"
## [1] " 1 4 NA NA"
## [1] " 1 5 NA NA"
## [1] " 1 6 NA NA"
## [1] " 1 7 NA NA"
## [1] " 1 8 NA NA"
## [1] " 1 9 -9.4e+02 2.0e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 2 -- -9.4e+02 2.0e+02"
## [1] " 2 0 NA NA"
## [1] " 2 1 NA NA"
## [1] " 2 2 NA NA"
## [1] " 2 3 NA NA"
## [1] " 2 4 NA NA"
## [1] " 2 5 NA NA"
## [1] " 2 6 NA NA"
## [1] " 2 7 -1.1e+03 1.7e+03"
## [1] " 2 8 -8.8e+02 2.1e+02"
## [1] "-----------------------------------------------------------------"
## [1] "499 -- -7.1e+02 1.4e-04"
## [1] "499 0 -7.1e+02 2.1e-01"
## [1] "499 1 -7.1e+02 1.0e-01"
## [1] "499 2 -7.1e+02 5.2e-02"
## [1] "499 3 -7.1e+02 2.6e-02"
## [1] "499 4 -7.1e+02 1.3e-02"
## [1] "499 5 -7.1e+02 6.5e-03"
## [1] "499 6 -7.1e+02 3.2e-03"
## [1] "499 7 -7.1e+02 1.5e-03"
## [1] "499 8 -7.1e+02 7.2e-04"
## [1] "499 9 -7.1e+02 3.1e-04"
## [1] "499 10 -7.1e+02 1.3e-04"
## [1] "-----------------------------------------------------------------"
## [1] "500 -- -7.1e+02 1.3e-04"
## [1] "500 0 -7.1e+02 1.9e-01"
## [1] "500 1 -7.1e+02 9.5e-02"
## [1] "500 2 -7.1e+02 4.8e-02"
## [1] "500 3 -7.1e+02 2.4e-02"
## [1] "500 4 -7.1e+02 1.2e-02"
## [1] "500 5 -7.1e+02 5.9e-03"
## [1] "500 6 -7.1e+02 2.9e-03"
## [1] "500 7 -7.1e+02 1.4e-03"
## [1] "500 8 -7.1e+02 6.5e-04"
## [1] "500 9 -7.1e+02 2.9e-04"
## [1] "500 10 -7.1e+02 1.2e-04"
## [1] "-----------------------------------------------------------------"
## [1] "611 -- -7.1e+02 1.8e-05"
## [1] "611 0 -7.1e+02 3.4e-02"
## [1] "611 1 -7.1e+02 1.7e-02"
## [1] "611 2 -7.1e+02 8.4e-03"
## [1] "611 3 -7.1e+02 4.2e-03"
## [1] "611 4 -7.1e+02 2.1e-03"
## [1] "611 5 -7.1e+02 1.0e-03"
## [1] "611 6 -7.1e+02 5.1e-04"
## [1] "611 7 -7.1e+02 2.4e-04"
## [1] "611 8 -7.1e+02 1.1e-04"
## [1] "611 9 -7.1e+02 4.9e-05"
## [1] "611 10 -7.1e+02 1.6e-05"
## [1] "-----------------------------------------------------------------"
## [1] "612 -- -7.1e+02 1.6e-05"
## [1] "612 0 -7.1e+02 3.0e-02"
## [1] "612 1 -7.1e+02 1.5e-02"
## [1] "612 2 -7.1e+02 7.6e-03"
## [1] "612 3 -7.1e+02 3.8e-03"
## [1] "612 4 -7.1e+02 1.9e-03"
## [1] "612 5 -7.1e+02 9.4e-04"
## [1] "612 6 -7.1e+02 4.6e-04"
## [1] "612 7 -7.1e+02 2.2e-04"
## [1] "612 8 -7.1e+02 1.0e-04"
## [1] "612 9 -7.1e+02 4.4e-05"
## [1] "612 10 -7.1e+02 1.5e-05"
## [1] "612 11 -7.1e+02 5.2e-06"
y
## $l
## [1] -711.983
##
## $mu
## [,1]
## [1,] -1.0697892
## [2,] 0.9509498
## [3,] 2.0187316
##
## $sig
## [,1] [,2] [,3]
## [1,] 1.0461949 0.7133607 0.7354662
## [2,] 0.7133607 1.0132409 0.7789205
## [3,] 0.7354662 0.7789205 1.1133201
##
## $iteration
## [1] 612
I believe my algorithm reaches the correct estimates for \(\mu\) and \(\Sigma\); they can be seen in the list elements above. However, with tolgrad set at \(10^{-5}\), steep ascent does not converge within 500 iterations: the first run used all 500 iterations without meeting the convergence criteria. When I raised the maximum to 700 iterations, it turned out that steep ascent converges in 612 iterations.
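As a sanity check, the multivariate normal log-likelihood has a closed-form maximizer: the sample mean and the \(1/n\)-scaled scatter matrix. The iterative estimates above should match these directly (this check is my addition, not part of the algorithm):

mu.closed = colMeans(data)
sig.closed = crossprod(sweep(data, 2, mu.closed)) / nrow(data)
mu.closed  #should match y$mu up to the tolerances
sig.closed #should match y$sig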
Newton <- function(mu, sig, datan, maxit=500, tolerr=10^-7, tolgrad=10^-7) {
  header = paste0("Iteration", " Halving", " log-likelihood", " ||gradient||")
  print(header)
  n = nrow(datan) # number of observations
  p = ncol(datan)
  for (it in 1:maxit) {
    theta.n = matrix(c(mu, mat2vec(sig)), ncol = 1)
    a = like.grad(x = datan, mu = mu, sig = sig)
    print("-----------------------------------------------------------------")
    print(sprintf('%2.0f -- %.1e %.1e', it, a$l, a$grad.norm))
    #Newton direction: negative inverse Hessian times the gradient
    hess = hessian.M(datan, mu, sig)
    hess.inverse = solve(hess)
    dir = -1 * hess.inverse %*% a$grad.mu.sig
    theta.next = theta.n + dir
    mu1 = theta.next[1:p, ]
    sig1 = theta.next[(p+1):nrow(theta.next), ]
    #calculate the likelihood at this new point to see if it improved
    a.temp = like.grad(x = datan, mu = matrix(mu1, ncol = 1), sig = vec2mat(matrix(sig1, ncol = 1)))
    if (is.na(a.temp$l)) {
      temp.like = -Inf
    } else {
      temp.like = a.temp$l
    }
    halve = 0
    print(sprintf('%2.0f %2.0f %.1e %.1e', it, halve, a.temp$l, a.temp$grad.norm))
    while (temp.like < a$l & halve <= 20) {
      halve = halve + 1
      theta.next = theta.n + dir / (2^halve)
      mu1 = theta.next[1:p, ]
      sig1 = theta.next[(p+1):nrow(theta.next), ]
      a.temp = like.grad(x = datan, mu = matrix(mu1, ncol = 1), sig = vec2mat(matrix(sig1, ncol = 1)))
      print(sprintf('%2.0f %2.0f %.1e %.1e', it, halve, a.temp$l, a.temp$grad.norm))
      if (is.na(a.temp$l)) {
        temp.like = -Inf
      } else {
        temp.like = a.temp$l
      }
    } #end while loop
    theta.n1 = theta.next
    if (halve >= 21) {
      print('Step-halve failed')
    }
    #convergence criteria: modified relative error and gradient norm
    c1 = abs(max(1, theta.n1))
    c2 = abs(theta.n1 - theta.n)
    mre = max(c2 / c1)
    grad.norm = a.temp$grad.norm
    if (mre < tolerr & grad.norm < tolgrad) {
      print("We reached convergence in Newton's method")
      break
    }
    #update variables for the next iteration
    mu1 = theta.n1[1:p, ]
    sig1 = theta.n1[(p+1):nrow(theta.n1), ]
    mu = matrix(mu1, ncol = 1)            #need mu in matrix-vector form
    sig = vec2mat(matrix(sig1, ncol = 1)) #need sig in matrix form
  }
  return(list(l = a.temp$l, mu = mu, sig = sig, iteration = it))
}
mu.init = matrix(c(-1.5, 1.5, 2.3), ncol=1)
sig.init = matrix(c(1, 0.5, 0.5, 0.5, 1, 0.5, 0.5, 0.5, 1), byrow=TRUE, nrow=3)
y2 = Newton(mu = mu.init, sig = sig.init, datan = data)
## [1] "Iteration Halving log-likelihood ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1 -- -8.4e+02 3.6e+02"
## [1] " 1 0 NA NA"
## [1] " 1 1 NA NA"
## [1] " 1 2 NA NA"
## [1] " 1 3 -8.4e+02 3.0e+03"
## [1] " 1 4 -7.8e+02 3.5e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 2 -- -7.8e+02 3.5e+02"
## [1] " 2 0 -2.6e+03 1.2e+05"
## [1] " 2 1 -7.3e+02 3.7e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 3 -- -7.3e+02 3.7e+02"
## [1] " 3 0 -7.2e+02 1.2e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 4 -- -7.2e+02 1.2e+02"
## [1] " 4 0 -7.1e+02 3.2e+01"
## [1] "-----------------------------------------------------------------"
## [1] " 5 -- -7.1e+02 3.2e+01"
## [1] " 5 0 -7.1e+02 4.1e+00"
## [1] "-----------------------------------------------------------------"
## [1] " 6 -- -7.1e+02 4.1e+00"
## [1] " 6 0 -7.1e+02 8.9e-02"
## [1] "-----------------------------------------------------------------"
## [1] " 7 -- -7.1e+02 8.9e-02"
## [1] " 7 0 -7.1e+02 4.5e-05"
## [1] "-----------------------------------------------------------------"
## [1] " 8 -- -7.1e+02 4.5e-05"
## [1] " 8 0 -7.1e+02 1.0e-11"
y2
## $l
## [1] -711.983
##
## $mu
## [,1]
## [1,] -1.0697892
## [2,] 0.9509498
## [3,] 2.0187316
##
## $sig
## [,1] [,2] [,3]
## [1,] 1.0461948 0.7133607 0.7354661
## [2,] 0.7133607 1.0132407 0.7789204
## [3,] 0.7354661 0.7789204 1.1133200
##
## $iteration
## [1] 8
Newton's method is dramatically faster than steep ascent: the estimates, shown in the R console printout above, match those from steep ascent, but convergence took only 8 iterations instead of 612. The gradient norms at successive iterations (3.2e+01, 4.1e+00, 8.9e-02, 4.5e-05, 1.0e-11) roughly square from one step to the next, which is the quadratic convergence expected of Newton's method near the maximum.
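As a final cross-check, the two optimizers should agree on the maximizer; the entrywise gaps between their estimates should be negligible:

max(abs(y$mu - y2$mu))   #expect a tiny value
max(abs(y$sig - y2$sig)) #expect on the order of 1e-7 or smaller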