Excercise J-2.2 Continued (Optim Function)

Generate Data. We first generate our trivariate normal data.

sqrtm <- function(A) {
    # Obtain matrix square root of a matrix A
    a = eigen(A)
    sqm = a$vectors %*% diag(sqrt(a$values)) %*% t(a$vectors)
    sqm = (sqm + t(sqm))/2
}

gen <- function(n, p, mu, sig, seed) {
    #---- Generate data from a p-variate normal with mean mu and covariance sigma
    # mu should be a p by 1 vector sigma should be a positive definite p by p matrix
    # Seed can be optionally set for the random number generator
    set.seed(seed)
    # generate data from normal mu sigma
    x = matrix(rnorm(n * p), n, p)
    datan = x %*% sqrtm(sig) + matrix(mu, n, p, byrow = TRUE)
    datan
}

MU = matrix(c(-1, 1, 2), ncol = 1)
SIG = matrix(c(1, 0.7, 0.7, 0.7, 1, 0.7, 0.7, 0.7, 1), byrow = TRUE, nrow = 3)
x.data = gen(200, 3, MU, SIG, seed = 2020)
print(x.data[1:3, ])
##            [,1]       [,2]      [,3]
## [1,] -1.5604822 -0.1665401 0.1677568
## [2,] -0.3413109  1.9598429 2.6717930
## [3,] -1.7510037  0.6332410 2.4235447

Helper Functions

#Takes a matrix and returns a column matrix with the lower triangular entries (+ diagonal)
mat2vec <- function(mat) {
    row.list = list()
    for(i in 1:nrow(mat)) {
        current.row = mat[i,]
        for (j in 1:i) {
            row.list[[i]] = c(current.row[1:j])
        }
    }
    return(as.matrix(unlist(row.list)))
}

mat2vec.R <- function(mat) {
    row.list = list()
    for(i in 1:nrow(mat)) {
        current.row = mat[i,]
        for (j in 1:i) {
            row.list[[i]] = c(current.row[1:j])
        }
    }
    return(unlist(row.list))
}

vec2mat <- function(vec) {
    p = (-1 + sqrt(1+8*nrow(vec)))/2
    row.list = list()
    mat = matrix(0,p,p)
    start = 1
    end = 1
    for(i in 1:nrow(mat)) {
        take = vec[start:end,]
        zeros = rep(0,length = (p - length(take)))
        row.list[[i]] = c(take, zeros)
        start = end + 1
        end = start + i 
    }
    sig.half = do.call(rbind, row.list)
    sig = sig.half + t(sig.half) - diag((diag(sig.half)))                
    return(sig) 
}

#log likelihood function(theta) to be used in optim()
fn <- function(par, data) {
  x = data
  n = nrow(data)
  p = ncol(data)
  mu = matrix(par[1:3], ncol=1)
  sig = vec2mat(matrix(par[4:9],ncol=1))
  
  e.values = eigen(sig)$values
  if(any(e.values < 0)) {
    l = -Inf
  } else {
    siginv = solve(sig)
    
    C = matrix(0,p,p); # initializing sum of (xi-mu)(xi-mu)^T (pxp matrix)
    sxm = matrix(0,p,1) # initializing sum of xi-mu (px1 vector)
    for (i in 1:n){
      xm = x[i,] - mu
      sxm = sxm + xm
      C = C + xm %*% t(xm)
    }
    log_det_sig <- log(det(sig))
    l = -(n*p*log(2*pi)+n*log_det_sig + sum(siginv * C ))/2
  }
  l
}

diff.sig <- function(A) {
    row.list = list()
    row.list[[1]] = (-1/2)*A[1,1]
    for(i in 2:nrow(A)) {
        current.row = A[i,]
        for (j in 1:i) {
            row.list[[i]] = c(-1*current.row[1:(j-1)], (-1/2)*current.row[j])
        }
    }
    return(as.matrix(unlist(row.list)))
}

grad.sig <- function(n, siginv, C) {
    A = n*siginv - (siginv %*% (C %*% siginv))
    diff.sig(A) #individual sigma_ii and sigma_ij's
}

#gradient function(theta) to be used in optim()
grr <- function(par, data) {
  x = data
  n = nrow(data) 
  p = ncol(data) 
  mu = matrix(par[1:3], ncol=1)
  sig = vec2mat(matrix(par[4:9],ncol=1))
  siginv = solve(sig)
  
  C = matrix(0,p,p); # initializing sum of (xi-mu)(xi-mu)^T (pxp matrix)
  sxm = matrix(0,p,1) # initializing sum of xi-mu (px1 vector)
  for (i in 1:n){
    xm = x[i,] - mu
    sxm = sxm + xm
    C = C + xm %*% t(xm)
  }
  gradm = siginv %*% sxm #gradient wrt mu
  grads = grad.sig(n=n, siginv = siginv, C = C)
  #grad.ms = matrix(c(gradm, grads), ncol=1)
    grad.ms = c(gradm, grads)
    grad.ms
}

Using optim(): Sigma = I and mu = (0,0,0)^T

MU = matrix(c(-1,1,2), ncol=1)
SIG = matrix(c(1,0.7,0.7,0.7,1,0.7,0.7,0.7,1),byrow=TRUE, nrow=3)
x.data = gen(200, 3, MU, SIG, seed=2020)

SIGMA.INIT =  diag(3)
MU.INIT = matrix(c(0,0,0), ncol=1)

theta.init = c(MU.INIT, mat2vec.R(SIGMA.INIT))

optim(par = theta.init, fn = fn, gr = grr, data = x.data, method = "BFGS", control=list(trace=3, fnscale=-1, abstol=10^-5))
## initial  value 1423.081083 
## iter  10 value 760.641810
## iter  20 value 724.611964
## iter  30 value 724.573770
## final  value 724.571984 
## converged
## $par
## [1] -1.0496597  0.9159954  1.9117942  1.1572594  0.7312970  1.0766724  0.6978262
## [8]  0.6498122  0.8874578
## 
## $value
## [1] -724.572
## 
## $counts
## function gradient 
##      111       31 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Therefore, we can see from the optim() output, the estimate for \(\mu\) is the first three entries of the list variable par. In particular we have:

\[ \mu_1 = -1.0496\\ \mu_2 = 0.91599\\ \mu_3 = 1.91179 \]

The estimate for \(\Sigma\) is the last 6 entries. In particular we have

\[ \sigma_{11} = 1.157\\ \sigma_{21} = 0.731\\ \sigma_{22} = 1.076\\ \sigma_{31} = 0.698\\ \sigma_{32} = 0.649\\ \sigma_{33} = 0.887 \]

Since \(\Sigma\) is symmetric, we have the full matrix from those 6 entries.

Excercise GH-2.3

Part A: Prove the log-likelihood takes the form given in text

Suppose the ith patient is dead at time \(t_i\). This means their data is “uncensored” and that we have observed the real time \(t_i\). Thus, the contribution to the likelihood function is \(f(t_i)\).

Now suppose the ith patient is still alive at time \(t_i\). This means their data is “censored” and that all we know is they exceeded time \(t_i\). Thus, the contribution to the likelihood function is \(1-F(t_i) = S(t_i)\).

Letting \(w_i\) equal 1 if time \(t_i\) is uncensored (ith patient is dead at time \(t_i\)) and equal 0 if time \(t_i\) is censored (ith patient is still alive at time \(t_i\)). Then, we can compactly the contribution of the ith patient to the likelihood function as \(L_i = f(t_i)^{w_i}(S(t_i))^{1-w_i}\).

We are now ready to compute the log-likelihood. \[\begin{align} \log(\prod_{i=1}^{n}L_i) &=\sum_{i=1}^{n}w_ilog(f(t_i)) + (1-w_i)log(S(t_i))\\ &=\sum_{i=1}^{n}w_ilog(\lambda_i\exp(log(\frac{\mu_i}{\Lambda_i}) - \mu_i)) + (1-w_i)log(\exp(-\mu_i))\\ &=\sum_{i=1}^{n}w_i(log(\lambda_i) + log(\frac{\mu_i}{\Lambda_i}) - \mu_i) + \sum_{i=1}^{n}(1-w_i)(-\mu_i)\\ &=\sum_{i=1}^{n}(w_ilog(\mu_i) - \mu_i) + \sum_{i=1}^{n} w_ilog(\frac{\lambda_i}{\Lambda_i}) \end{align}\]

Part B(i): Gradients and Hessian derivatives

We need to compute the gradient and Hessian of the log-likelihood function.

With the reparameterization the log-likelihood is: \[l(\alpha, \beta_0,\beta_1) = \sum_{i=1}^{n} w_ilog(t_i^{\alpha}\exp(\beta_0+\delta_i\beta_1)) - t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) + \sum_{i=1}^{n} w_ilog(\frac{\alpha}{t_i}) \]

Therefore, the partials are the following. \[ \frac{\partial \ell}{\partial \alpha} = \sum_{i=1}^{n} w_ilog(t_i) -t_i^{\alpha}log(t_i)\exp(\beta_0 + \delta_i\beta_1) + \sum_{i=1}^{n}\frac{w_i}{\alpha} \]

\[ \frac{\partial \ell}{\partial \beta_0} = \sum_{i=1}^{n} w_i - t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]

\[ \frac{\partial \ell}{\partial \beta_1} = \sum_{i=1}^{n} w_i\delta_i - t_i^{\alpha}\delta_i\exp(\beta_0 + \delta_i\beta_1) \]

We must now calculate the second partials and mixed partials for the Hessian.

\[ \frac{\partial^2 \ell}{\partial \alpha^2} = -\sum_{i=1}^{n} log^2(t_i)t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) - \sum_{i=1}^{n}\frac{w_i}{\alpha^2} \]

\[ \frac{\partial^2 \ell}{\partial \beta_0^2} = -\sum_{i=1}^{n} t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]

\[ \frac{\partial^2 \ell}{\partial \beta_1^2} = -\sum_{i=1}^{n}\delta_it_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \] \[ \frac{\partial^2 \ell}{\partial \alpha \beta_0} = -\sum_{i=1}^{n}log(t_i)t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]

\[ \frac{\partial^2 \ell}{\partial \alpha \beta_1} =-\sum_{i=1}^{n}\delta_ilog(t_i)t_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \]

\[ \frac{\partial^2 \ell}{\partial \beta_0 \beta_1} =-\sum_{i=1}^{n}\delta_it_i^{\alpha}\exp(\beta_0 + \delta_i\beta_1) \] ## Part B(ii): Coding a Newton Algorithm

# observations and data struture
t.obs = c(6, 6, 6, 6, 7, 9, 10, 10, 11, 13, 16, 17, 19, 20, 22, 23, 25, 32, 32, 34, 
    35, 1, 1, 2, 2, 3, 4, 4, 5, 5, 8, 8, 8, 8, 11, 11, 12, 12, 15, 17, 22, 23)

W = c(c(0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0), rep(1, 21))

Delta = c(rep(1, 21), rep(0, 21))

data = data.frame(t.obs, W, Delta)

# compute partials
dl.da <- function(datan, alpha, beta.0, beta.1) {
    sum1 = 0
    sum2 = 0
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    
    for (i in 1:nrow(datan)) {
        val = w[i] * log(x[i]) - (x[i]^alpha) * log(x[i]) * exp(beta.0 + delta[i] * 
            beta.1)
        sum1 = sum1 + val
    }
    
    for (i in 1:nrow(datan)) {
        val = w[i]/alpha
        sum2 = sum2 + val
    }
    return(sum1 + sum2)
}

dl.b0 <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = w[i] - (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    return(sum1)
}

dl.b1 <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = w[i] * delta[i] - delta[i] * (x[i]^alpha) * exp(beta.0 + delta[i] * 
            beta.1)
        sum1 = sum1 + val
    }
    return(sum1)
}
#-------------------------------------------

dl.aa <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    sum2 = 0
    for (i in 1:nrow(datan)) {
        val = 1 * (log(x[i])^2) * (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    sum1 = -1 * sum1
    for (i in 1:nrow(datan)) {
        val = w[i]/(alpha^2)
        sum2 = sum2 + val
    }
    sum2 = -1 * sum2
    return(sum1 + sum2)
}

dl.b02 <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    sum1 = -1 * sum1
    return(sum1)
}

dl.b1.b1 <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = delta[i] * (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    sum1 = -1 * sum1
    return(sum1)
}

dl.b0.a <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = (x[i]^alpha) * log(x[i]) * exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    sum1 = -1 * sum1
    return(sum1)
}

dl.b1.a <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = delta[i] * (x[i]^alpha) * log(x[i]) * exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    sum1 = -1 * sum1
    return(sum1)
}

dl.b0.b1 <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = delta[i] * (x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    sum1 = -1 * sum1
    return(sum1)
}

# likelihood
like <- function(datan, alpha, beta.0, beta.1) {
    x = datan$t.obs
    w = datan$W
    delta = datan$Delta
    sum1 = 0
    for (i in 1:nrow(datan)) {
        val = w[i] * log((x[i]^alpha) * exp(beta.0 + delta[i] * beta.1)) - (x[i]^alpha) * 
            exp(beta.0 + delta[i] * beta.1)
        sum1 = sum1 + val
    }
    sum2 = 0
    for (i in 1:nrow(datan)) {
        val = w[i] * log(alpha/x[i])
        sum2 = sum2 + val
    }
    return(sum1 + sum2)
}

# Gradient Function returns list with likelihood l and gradient grad
like.grad <- function(datan, alpha, beta.0, beta.1) {
    
    if (alpha <= 0) {
        # infeasible alpha
        l = NA
        grad = NA
    } else {
        l = like(datan, alpha, beta.0, beta.1)
        entry.1 = dl.da(datan, alpha, beta.0, beta.1)
        entry.2 = dl.b0(datan, alpha, beta.0, beta.1)
        entry.3 = dl.b1(datan, alpha, beta.0, beta.1)
        grad = matrix(c(entry.1, entry.2, entry.3), ncol = 1)
        grad.norm = norm(grad, type = "2")
    }
    return(list(l = l, grad = grad, grad.norm = grad.norm))
}

# Hessian function
hessian <- function(datan, alpha, beta.0, beta.1) {
    if (alpha <= 0) {
        h = NA
    } else {
        h = matrix(0, nrow = 3, ncol = 3)
        h[1, 1] = dl.aa(datan, alpha, beta.0, beta.1)
        h[2, 2] = dl.b02(datan, alpha, beta.0, beta.1)
        h[3, 3] = dl.b1.b1(datan, alpha, beta.0, beta.1)
        h[2, 1] = dl.b0.a(datan, alpha, beta.0, beta.1)
        h[1, 2] = h[2, 1]
        h[3, 1] = dl.b1.a(datan, alpha, beta.0, beta.1)
        h[1, 3] = h[3, 1]
        h[2, 3] = dl.b0.b1(datan, alpha, beta.0, beta.1)
        h[3, 2] = h[2, 3]
    }
    return(h)
}

Newton <- function(datan, alpha, beta.0, beta.1, maxit = 30, tolerr = 10^-7, tolgrad = 10^-7) {
    header = paste0("Iteration", "        Halving", "        log-likelihood", "             ||gradient||")
    print(header)
    
    for (it in 1:maxit) {
        
        theta.n = matrix(c(alpha, beta.0, beta.1), ncol = 1)
        a = like.grad(datan, alpha = alpha, beta.0 = beta.0, beta.1 = beta.1)
        
        if (it == 1 | it == 2 | it == 32 | it == 33) {
            print("-----------------------------------------------------------------")
            print(sprintf("%2.0f                 --          %.1e               %.1e", 
                it, a$l, a$grad.norm))
        }
        
        # move in the direction
        hess = hessian(datan = datan, alpha = alpha, beta.0 = beta.0, beta.1 = beta.1)
        hess.inverse = solve(hess)
        dir = -1 * hess.inverse %*% a$grad
        theta.next = theta.n + dir
        alpha.next = theta.next[1, ]
        beta.0.next = theta.next[2, ]
        beta.1.next = theta.next[3, ]
        
        # calculate likelihood at this new point.see if you improved
        a.temp = like.grad(datan = datan, alpha = alpha.next, beta.0 = beta.0.next, 
            beta.1 = beta.1.next)
        
        
        if (is.na(a.temp$l)) {
            temp.like = -Inf
        } else {
            temp.like = a.temp$l
        }
        halve = 0
        if (it == 1 | it == 2 | it == 32 | it == 33) {
            print(sprintf("%2.0f              %2.0f           %.1e                  %.1e", 
                it, halve, a.temp$l, a.temp$grad.norm))
        }
        
        
        while (temp.like < a$l & halve <= 20) {
            halve = halve + 1
            theta.next = theta.n + dir/(2^halve)
            alpha.next = theta.next[1, ]
            beta.0.next = theta.next[2, ]
            beta.1.next = theta.next[3, ]
            
            a.temp = like.grad(datan = datan, alpha = alpha.next, beta.0 = beta.0.next, 
                beta.1 = beta.1.next)
            if (it == 1 | it == 2 | it == 32 | it == 33) {
                print(sprintf("%2.0f              %2.0f           %.1e               %.1e", 
                  it, halve, a.temp$l, a.temp$grad.norm))
            }
            
            
            if (is.na(a.temp$l)) {
                temp.like = -Inf
            } else {
                temp.like = a.temp$l
            }
        }  #end while loop
        theta.n1 = theta.next
        
        if (halve >= 21) {
            print("Step-halve failed")
        }
        # convergence criteria
        c1 = abs(max(1, theta.n1))
        c2 = abs(theta.n1 - theta.n)
        mre = max(c2/c1)
        grad.norm = a.temp$grad.norm
        
        if (mre < tolerr & grad.norm < tolgrad) {
            print(paste("We reached convergence in newton at iteration: ", it))
            break
        }
        
        # update variable for next iteration
        alpha = theta.n1[1, ]
        beta.0 = theta.n1[2, ]
        beta.1 = theta.n1[3, ]
        
    }  #end for loop
    return(list(l = a.temp$l, alpha = alpha, beta.0 = beta.0, beta.1 = beta.1, iteration = it))
}

a.init = 2
b.0.init = 10
b.1.init = 11
y = Newton(data, a.init, b.0.init, b.1.init, maxit = 500)
## [1] "Iteration        Halving        log-likelihood             ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1                 --          -1.1e+13               3.8e+13"
## [1] " 1               0           -3.9e+12                  1.4e+13"
## [1] "-----------------------------------------------------------------"
## [1] " 2                 --          -3.9e+12               1.4e+13"
## [1] " 2               0           -1.5e+12                  5.1e+12"
## [1] "-----------------------------------------------------------------"
## [1] "32                 --          -1.1e+02               9.7e-05"
## [1] "32               0           -1.1e+02                  1.6e-10"
## [1] "-----------------------------------------------------------------"
## [1] "33                 --          -1.1e+02               1.6e-10"
## [1] "33               0           -1.1e+02                  7.5e-15"
## [1] "We reached convergence in newton at iteration:  33"
y
## $l
## [1] -106.5795
## 
## $alpha
## [1] 1.365758
## 
## $beta.0
## [1] -3.070704
## 
## $beta.1
## [1] -1.730872
## 
## $iteration
## [1] 33

Part D: Standard Errors of Estimates

To calculate the standard errors of our estimates we would ideally want to compute the Fisher Information matrix \(I(\theta)\) and then compute the inverse \(I^{-1}(\theta)\). The diagonal entries of this matrix would be the variance of \(\alpha\), \(\beta_0\), and \(\beta_1\). However, this calculation is difficult. We would like to compute \(I(\theta) = E(-\nabla^{2}\ell)\). But our Hessian \(\nabla^{2}\ell\) has nonlinear functions of our random variables \(T_i\) and so its expectation could end up being difficult and/or impossible to compute theoretically.

Therefore, we will appeal to a theorem in class that states the following. For large n we have:
\[ SE(\hat{\theta_i}) \approx \sqrt{[-\nabla^{2}\ell(\hat{\theta})^{-1}]_{ii}} \]

y2 = hessian(data, y$alpha, y$beta.0, y$beta.1)
y2.inv = solve(-1*y2)

se.vec = rep(0,3)
for(i in 1:3) {
  se.vec[i] = sqrt(y2.inv[i,i])
}

se.vec
## [1] 0.2011650 0.5580702 0.4130819

Therefore, we can conclude that \(SE(\hat{\alpha}) \approx 0.201\), \(SE(\hat{\beta_0}) \approx 0.558\), and \(SE(\hat{\beta_1}) \approx 0.413\).

Part D(ii): Correlations

We will use the formula \(Corr(X,Y) = \frac{Cov(X,Y)}{SD(X)SD(Y)}\).

corr.a.b0 = y2.inv[1, 2]/(se.vec[1] * se.vec[2])

corr.a.b1 = y2.inv[1, 3]/(se.vec[1] * se.vec[3])

corr.b0.b1 = y2.inv[2, 3]/(se.vec[2] * se.vec[3])

corr.a.b0
## [1] -0.9203812
corr.a.b1
## [1] -0.2641529
corr.b0.b1
## [1] 0.03655683

Above we can see the pairwise correlations. It appears that \(\alpha\) and \(\beta_0\) are highly negatively correlated.