sqrtm <- function(A) {
# Obtain matrix square root of a matrix A
a = eigen(A)
sqm = a$vectors %*% diag(sqrt(a$values)) %*% t(a$vectors)
sqm = (sqm + t(sqm))/2
}
gen <- function(n, p, mu, sig, seed) {
#---- Generate data from a p-variate normal with mean mu and covariance sigma
# mu should be a p by 1 vector sigma should be a positive definite p by p matrix
# Seed can be optionally set for the random number generator
set.seed(seed)
# generate data from normal mu sigma
x = matrix(rnorm(n * p), n, p)
datan = x %*% sqrtm(sig) + matrix(mu, n, p, byrow = TRUE)
datan
}
MU = matrix(c(-1, 1, 2), ncol = 1)
SIG = matrix(c(1, 0.7, 0.7, 0.7, 1, 0.7, 0.7, 0.7, 1), byrow = TRUE, nrow = 3)
x.data = gen(200, 3, MU, SIG, seed = 2020)
print(x.data[1:3, ])
## [,1] [,2] [,3]
## [1,] -1.5604822 -0.1665401 0.1677568
## [2,] -0.3413109 1.9598429 2.6717930
## [3,] -1.7510037 0.6332410 2.4235447
#Takes a matrix and returns a column matrix with the lower triangular entries (+ diagonal)
mat2vec <- function(mat) {
row.list = list()
for(i in 1:nrow(mat)) {
current.row = mat[i,]
for (j in 1:i) {
row.list[[i]] = c(current.row[1:j])
}
}
return(as.matrix(unlist(row.list)))
}
mat2vec.R <- function(mat) {
row.list = list()
for(i in 1:nrow(mat)) {
current.row = mat[i,]
for (j in 1:i) {
row.list[[i]] = c(current.row[1:j])
}
}
return(unlist(row.list))
}
vec2mat <- function(vec) {
p = (-1 + sqrt(1+8*nrow(vec)))/2
row.list = list()
mat = matrix(0,p,p)
start = 1
end = 1
for(i in 1:nrow(mat)) {
take = vec[start:end,]
zeros = rep(0,length = (p - length(take)))
row.list[[i]] = c(take, zeros)
start = end + 1
end = start + i
}
sig.half = do.call(rbind, row.list)
sig = sig.half + t(sig.half) - diag((diag(sig.half)))
return(sig)
}
#log likelihood function(theta) to be used in optim()
fn <- function(par, data) {
x = data
n = nrow(data)
p = ncol(data)
mu = matrix(par[1:3], ncol=1)
sig = vec2mat(matrix(par[4:9],ncol=1))
e.values = eigen(sig)$values
if(any(e.values < 0)) {
l = -Inf
} else {
siginv = solve(sig)
C = matrix(0,p,p); # initializing sum of (xi-mu)(xi-mu)^T (pxp matrix)
sxm = matrix(0,p,1) # initializing sum of xi-mu (px1 vector)
for (i in 1:n){
xm = x[i,] - mu
sxm = sxm + xm
C = C + xm %*% t(xm)
}
log_det_sig <- log(det(sig))
l = -(n*p*log(2*pi)+n*log_det_sig + sum(siginv * C ))/2
}
l
}
diff.sig <- function(A) {
row.list = list()
row.list[[1]] = (-1/2)*A[1,1]
for(i in 2:nrow(A)) {
current.row = A[i,]
for (j in 1:i) {
row.list[[i]] = c(-1*current.row[1:(j-1)], (-1/2)*current.row[j])
}
}
return(as.matrix(unlist(row.list)))
}
grad.sig <- function(n, siginv, C) {
A = n*siginv - (siginv %*% (C %*% siginv))
diff.sig(A) #individual sigma_ii and sigma_ij's
}
#gradient function(theta) to be used in optim()
grr <- function(par, data) {
x = data
n = nrow(data)
p = ncol(data)
mu = matrix(par[1:3], ncol=1)
sig = vec2mat(matrix(par[4:9],ncol=1))
siginv = solve(sig)
C = matrix(0,p,p); # initializing sum of (xi-mu)(xi-mu)^T (pxp matrix)
sxm = matrix(0,p,1) # initializing sum of xi-mu (px1 vector)
for (i in 1:n){
xm = x[i,] - mu
sxm = sxm + xm
C = C + xm %*% t(xm)
}
gradm = siginv %*% sxm #gradient wrt mu
grads = grad.sig(n=n, siginv = siginv, C = C)
#grad.ms = matrix(c(gradm, grads), ncol=1)
grad.ms = c(gradm, grads)
grad.ms
}
MU = matrix(c(-1,1,2), ncol=1)
SIG = matrix(c(1,0.7,0.7,0.7,1,0.7,0.7,0.7,1),byrow=TRUE, nrow=3)
x.data = gen(200, 3, MU, SIG, seed=2020)
SIGMA.INIT = diag(3)
MU.INIT = matrix(c(0,0,0), ncol=1)
theta.init = c(MU.INIT, mat2vec.R(SIGMA.INIT))
optim(par = theta.init, fn = fn, gr = grr, data = x.data, method = "BFGS", control=list(trace=3, fnscale=-1, abstol=10^-5))
## initial value 1423.081083
## iter 10 value 760.641810
## iter 20 value 724.611964
## iter 30 value 724.573770
## final value 724.571984
## converged
## $par
## [1] -1.0496597 0.9159954 1.9117942 1.1572594 0.7312970 1.0766724 0.6978262
## [8] 0.6498122 0.8874578
##
## $value
## [1] -724.572
##
## $counts
## function gradient
## 111 31
##
## $convergence
## [1] 0
##
## $message
## NULL
Therefore, we can see from the optim() output, the estimate for \(\mu\) is the first three entries of the list variable par. In particular we have:
\[ \mu_1 = -1.0496\\ \mu_2 = 0.91599\\ \mu_3 = 1.91179 \]
The estimate for \(\Sigma\) is the last 6 entries. In particular we have
\[ \sigma_{11} = 1.157\\ \sigma_{21} = 0.731\\ \sigma_{22} = 1.076\\ \sigma_{31} = 0.698\\ \sigma_{32} = 0.649\\ \sigma_{33} = 0.887 \]
Since \(\Sigma\) is symmetric, we have the full matrix from those 6 entries.
Suppose the ith patient is dead at time \(t_i\). This means their data is “uncensored” and that we have observed the real time \(t_i\). Thus, the contribution to the likelihood function is \(f(t_i)\).
Now suppose the ith patient is still alive at time \(t_i\). This means their data is “censored” and that all we know is they exceeded time \(t_i\). Thus, the contribution to the likelihood function is \(1-F(t_i) = S(t_i)\).
Letting \(w_i\) equal 1 if time \(t_i\) is uncensored (ith patient is dead at time \(t_i\)) and equal 0 if time \(t_i\) is censored (ith patient is still alive at time \(t_i\)). Then, we can compactly the contribution of the ith patient to the likelihood function as \(L_i = f(t_i)^{w_i}(S(t_i))^{1-w_i}\).
We are now ready to compute the log-likelihood. \[\begin{align} \log(\prod_{i=1}^{n}L_i) &=\sum_{i=1}^{n}w_ilog(f(t_i)) + (1-w_i)log(S(t_i))\\ &=\sum_{i=1}^{n}w_ilog(\lambda_i\exp(log(\frac{\mu_i}{\Lambda_i}) - \mu_i)) + (1-w_i)log(\exp(-\mu_i))\\ &=\sum_{i=1}^{n}w_i(log(\lambda_i) + log(\frac{\mu_i}{\Lambda_i}) - \mu_i) + \sum_{i=1}^{n}(1-w_i)(-\mu_i)\\ &=\sum_{i=1}^{n}(w_ilog(\mu_i) - \mu_i) + \sum_{i=1}^{n} w_ilog(\frac{\lambda_i}{\Lambda_i}) \end{align}\]
We need to compute the gradient and Hessian of the log-likelihood function.
With the reparameterization the log-likelihood is: \[l(\alpha, \beta_0,\beta_1) = \sum_{i=1}^{n} w_ilog(t_i^{\alpha}\exp(\beta_0+\delta_i\beta_1)) - t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) + \sum_{i=1}^{n} w_ilog(\frac{\alpha}{t_i}) \]
Therefore, the partials are the following. \[ \frac{\partial \ell}{\partial \alpha} = \sum_{i=1}^{n} w_ilog(t_i) -t_i^{\alpha}log(t_i)\exp(\beta_0 + \delta_i\beta_1) + \sum_{i=1}^{n}\frac{w_i}{\alpha} \]
\[ \frac{\partial \ell}{\partial \beta_0} = \sum_{i=1}^{n} w_i - t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]
\[ \frac{\partial \ell}{\partial \beta_1} = \sum_{i=1}^{n} w_i\delta_i - t_i^{\alpha}\delta_i\exp(\beta_0 + \delta_i\beta_1) \]
We must now calculate the second partials and mixed partials for the Hessian.
\[ \frac{\partial^2 \ell}{\partial \alpha^2} = -\sum_{i=1}^{n} log^2(t_i)t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) - \sum_{i=1}^{n}\frac{w_i}{\alpha^2} \]
\[ \frac{\partial^2 \ell}{\partial \beta_0^2} = -\sum_{i=1}^{n} t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]
\[ \frac{\partial^2 \ell}{\partial \beta_1^2} = -\sum_{i=1}^{n}\delta_it_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \] \[ \frac{\partial^2 \ell}{\partial \alpha \beta_0} = -\sum_{i=1}^{n}log(t_i)t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]
\[ \frac{\partial^2 \ell}{\partial \alpha \beta_1} =-\sum_{i=1}^{n}\delta_ilog(t_i)t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]
\[ \frac{\partial^2 \ell}{\partial \beta_0 \beta_1} =-\sum_{i=1}^{n}\delta_it_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \] ## Part B(ii): Coding a Newton Algorithm
# observations and data struture
t.obs = c(6, 6, 6, 6, 7, 9, 10, 10, 11, 13, 16, 17, 19, 20, 22, 23, 25, 32, 32, 34,
35, 1, 1, 2, 2, 3, 4, 4, 5, 5, 8, 8, 8, 8, 11, 11, 12, 12, 15, 17, 22, 23)
W = c(c(0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0), rep(1, 21))
Delta = c(rep(1, 21), rep(0, 21))
data = data.frame(t.obs, W, Delta)
# compute partials
dl.da <- function(datan, alpha, beta.0, beta.1) {
sum1 = 0
sum2 = 0
x = datan$t.obs
w = datan$W
delta = datan$Delta
for (i in 1:nrow(datan)) {
val = w[i] * log(x[i]) - (x[i]^alpha) * log(x[i]) * exp(beta.0 + delta[i] *
beta.1)
sum1 = sum1 + val
}
for (i in 1:nrow(datan)) {
val = w[i]/alpha
sum2 = sum2 + val
}
return(sum1 + sum2)
}
dl.b0 <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = w[i] - (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
return(sum1)
}
dl.b1 <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = w[i] * delta[i] - delta[i] * (x[i]^alpha) * exp(beta.0 + delta[i] *
beta.1)
sum1 = sum1 + val
}
return(sum1)
}
#-------------------------------------------
dl.aa <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
sum2 = 0
for (i in 1:nrow(datan)) {
val = 1 * (log(x[i])^2) * (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
sum1 = -1 * sum1
for (i in 1:nrow(datan)) {
val = w[i]/(alpha^2)
sum2 = sum2 + val
}
sum2 = -1 * sum2
return(sum1 + sum2)
}
dl.b02 <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
sum1 = -1 * sum1
return(sum1)
}
dl.b1.b1 <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = delta[i] * (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
sum1 = -1 * sum1
return(sum1)
}
dl.b0.a <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = (x[i]^alpha) * log(x[i]) * exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
sum1 = -1 * sum1
return(sum1)
}
dl.b1.a <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = delta[i] * (x[i]^alpha) * log(x[i]) * exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
sum1 = -1 * sum1
return(sum1)
}
dl.b0.b1 <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = delta[i] * (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
sum1 = -1 * sum1
return(sum1)
}
# likelihood
like <- function(datan, alpha, beta.0, beta.1) {
x = datan$t.obs
w = datan$W
delta = datan$Delta
sum1 = 0
for (i in 1:nrow(datan)) {
val = w[i] * log((x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)) - (x[i]^alpha) *
exp(beta.0 + delta[i] * beta.1)
sum1 = sum1 + val
}
sum2 = 0
for (i in 1:nrow(datan)) {
val = w[i] * log(alpha/x[i])
sum2 = sum2 + val
}
return(sum1 + sum2)
}
# Gradient Function returns list with likelihood l and gradient grad
like.grad <- function(datan, alpha, beta.0, beta.1) {
if (alpha <= 0) {
# infeasible alpha
l = NA
grad = NA
} else {
l = like(datan, alpha, beta.0, beta.1)
entry.1 = dl.da(datan, alpha, beta.0, beta.1)
entry.2 = dl.b0(datan, alpha, beta.0, beta.1)
entry.3 = dl.b1(datan, alpha, beta.0, beta.1)
grad = matrix(c(entry.1, entry.2, entry.3), ncol = 1)
grad.norm = norm(grad, type = "2")
}
return(list(l = l, grad = grad, grad.norm = grad.norm))
}
# Hessian function
hessian <- function(datan, alpha, beta.0, beta.1) {
if (alpha <= 0) {
h = NA
} else {
h = matrix(0, nrow = 3, ncol = 3)
h[1, 1] = dl.aa(datan, alpha, beta.0, beta.1)
h[2, 2] = dl.b02(datan, alpha, beta.0, beta.1)
h[3, 3] = dl.b1.b1(datan, alpha, beta.0, beta.1)
h[2, 1] = dl.b0.a(datan, alpha, beta.0, beta.1)
h[1, 2] = h[2, 1]
h[3, 1] = dl.b1.a(datan, alpha, beta.0, beta.1)
h[1, 3] = h[3, 1]
h[2, 3] = dl.b0.b1(datan, alpha, beta.0, beta.1)
h[3, 2] = h[2, 3]
}
return(h)
}
Newton <- function(datan, alpha, beta.0, beta.1, maxit = 30, tolerr = 10^-7, tolgrad = 10^-7) {
header = paste0("Iteration", " Halving", " log-likelihood", " ||gradient||")
print(header)
for (it in 1:maxit) {
theta.n = matrix(c(alpha, beta.0, beta.1), ncol = 1)
a = like.grad(datan, alpha = alpha, beta.0 = beta.0, beta.1 = beta.1)
if (it == 1 | it == 2 | it == 32 | it == 33) {
print("-----------------------------------------------------------------")
print(sprintf("%2.0f -- %.1e %.1e",
it, a$l, a$grad.norm))
}
# move in the direction
hess = hessian(datan = datan, alpha = alpha, beta.0 = beta.0, beta.1 = beta.1)
hess.inverse = solve(hess)
dir = -1 * hess.inverse %*% a$grad
theta.next = theta.n + dir
alpha.next = theta.next[1, ]
beta.0.next = theta.next[2, ]
beta.1.next = theta.next[3, ]
# calculate likelihood at this new point.see if you improved
a.temp = like.grad(datan = datan, alpha = alpha.next, beta.0 = beta.0.next,
beta.1 = beta.1.next)
if (is.na(a.temp$l)) {
temp.like = -Inf
} else {
temp.like = a.temp$l
}
halve = 0
if (it == 1 | it == 2 | it == 32 | it == 33) {
print(sprintf("%2.0f %2.0f %.1e %.1e",
it, halve, a.temp$l, a.temp$grad.norm))
}
while (temp.like < a$l & halve <= 20) {
halve = halve + 1
theta.next = theta.n + dir/(2^halve)
alpha.next = theta.next[1, ]
beta.0.next = theta.next[2, ]
beta.1.next = theta.next[3, ]
a.temp = like.grad(datan = datan, alpha = alpha.next, beta.0 = beta.0.next,
beta.1 = beta.1.next)
if (it == 1 | it == 2 | it == 32 | it == 33) {
print(sprintf("%2.0f %2.0f %.1e %.1e",
it, halve, a.temp$l, a.temp$grad.norm))
}
if (is.na(a.temp$l)) {
temp.like = -Inf
} else {
temp.like = a.temp$l
}
} #end while loop
theta.n1 = theta.next
if (halve >= 21) {
print("Step-halve failed")
}
# convergence criteria
c1 = abs(max(1, theta.n1))
c2 = abs(theta.n1 - theta.n)
mre = max(c2/c1)
grad.norm = a.temp$grad.norm
if (mre < tolerr & grad.norm < tolgrad) {
print(paste("We reached convergence in newton at iteration: ", it))
break
}
# update variable for next iteration
alpha = theta.n1[1, ]
beta.0 = theta.n1[2, ]
beta.1 = theta.n1[3, ]
} #end for loop
return(list(l = a.temp$l, alpha = alpha, beta.0 = beta.0, beta.1 = beta.1, iteration = it))
}
a.init = 2
b.0.init = 10
b.1.init = 11
y = Newton(data, a.init, b.0.init, b.1.init, maxit = 500)
## [1] "Iteration Halving log-likelihood ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1 -- -1.1e+13 3.8e+13"
## [1] " 1 0 -3.9e+12 1.4e+13"
## [1] "-----------------------------------------------------------------"
## [1] " 2 -- -3.9e+12 1.4e+13"
## [1] " 2 0 -1.5e+12 5.1e+12"
## [1] "-----------------------------------------------------------------"
## [1] "32 -- -1.1e+02 9.7e-05"
## [1] "32 0 -1.1e+02 1.6e-10"
## [1] "-----------------------------------------------------------------"
## [1] "33 -- -1.1e+02 1.6e-10"
## [1] "33 0 -1.1e+02 7.5e-15"
## [1] "We reached convergence in newton at iteration: 33"
y
## $l
## [1] -106.5795
##
## $alpha
## [1] 1.365758
##
## $beta.0
## [1] -3.070704
##
## $beta.1
## [1] -1.730872
##
## $iteration
## [1] 33
To calculate the standard errors of our estimates we would ideally want to compute the Fisher Information matrix \(I(\theta)\) and then compute the inverse \(I^{-1}(\theta)\). The diagonal entries of this matrix would be the variance of \(\alpha\), \(\beta_0\), and \(\beta_1\). However, this calculation is difficult. We would like to compute \(I(\theta) = E(-\nabla^{2}\ell)\). But our Hessian \(\nabla^{2}\ell\) has nonlinear functions of our random variables \(T_i\) and so its expectation could end up being difficult and/or impossible to compute theoretically.
Therefore, we will appeal to a theorem in class that states the following. For large n we have:
\[
SE(\hat{\theta_i}) \approx \sqrt{[-\nabla^{2}\ell(\hat{\theta})^{-1}]_{ii}}
\]
y2 = hessian(data, y$alpha, y$beta.0, y$beta.1)
y2.inv = solve(-1*y2)
se.vec = rep(0,3)
for(i in 1:3) {
se.vec[i] = sqrt(y2.inv[i,i])
}
se.vec
## [1] 0.2011650 0.5580702 0.4130819
Therefore, we can conclude that \(SE(\hat{\alpha}) \approx 0.201\), \(SE(\hat{\beta_0}) \approx 0.558\), and \(SE(\hat{\beta_1}) \approx 0.413\).
We will use the formula \(Corr(X,Y) = \frac{Cov(X,Y)}{SD(X)SD(Y)}\).
corr.a.b0 = y2.inv[1, 2]/(se.vec[1] * se.vec[2])
corr.a.b1 = y2.inv[1, 3]/(se.vec[1] * se.vec[3])
corr.b0.b1 = y2.inv[2, 3]/(se.vec[2] * se.vec[3])
corr.a.b0
## [1] -0.9203812
corr.a.b1
## [1] -0.2641529
corr.b0.b1
## [1] 0.03655683
Above we can see the pairwise correlations. It appears that \(\alpha\) and \(\beta_0\) are highly negatively correlated.