Exercise J-2.2

### Generating Data 
# Symmetric square root of a positive semi-definite matrix via its
# eigendecomposition; symmetrizing at the end removes rounding asymmetry.
sqrtm <- function(A){
  a = eigen(A)
  sqm = a$vectors %*% diag(sqrt(a$values)) %*% t(a$vectors)
  sqm = (sqm + t(sqm))/2
  return(sqm)
}

# Generate n draws from N_p(mu, sig): standard normals are transformed by the
# symmetric square root of sig and shifted by mu.
gen <- function(n, p, mu, sig, seed){
  set.seed(seed)
  x = matrix(rnorm(n*p), n, p)
  datan = x %*% sqrtm(sig) + matrix(mu, n, p, byrow = TRUE)
  return(datan)
}

n <- 200
p <- 3
mu1 <- matrix(c(-1,1,2), nrow=3, ncol=1)
sig1 <- matrix(c(1,0.7,0.7,0.7,1,0.7,0.7,0.7,1), nrow=3, ncol=3)

data_hw3 <- gen(n=200, p=3, mu=mu1, sig=sig1, seed=2022)
head(data_hw3, 3)
##            [,1]       [,2]      [,3]
## [1,]  0.1947111  1.9078866  3.153741
## [2,] -2.8172598 -0.2811456 -0.352587
## [3,] -2.4052730 -1.0723173  1.237333
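As a quick sanity check (a sketch, not part of the exercise), the square root should reproduce sig1 and the sample moments of the generated data should be close to mu1 and sig1:

# Sanity check: sqrtm(sig1) squared should reproduce sig1, and the sample
# moments should be close to the population values used above.
max(abs(sqrtm(sig1) %*% sqrtm(sig1) - sig1))  # ~ 0
colMeans(data_hw3)                            # close to (-1, 1, 2)
cov(data_hw3)                                 # close to sig1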
### Matrix to Vector (Sigma) 
# Stack the lower triangle of A row-wise into a column vector, with the
# chain-rule factors for the gradient with respect to the distinct elements
# of Sigma: -1/2 on diagonal entries and -1 on off-diagonal entries.
sig_fun <- function(A){
  p = nrow(A)
  V = numeric(p*(p+1)/2)
  cnt = 0
  for(i in 1:p){
    for (j in 1:i){
      cnt = cnt + 1
      if(i == j){
        V[cnt] = (-1/2)*A[i,j]
      } else {
        V[cnt] = -A[i,j]
      }
    }
  }
  V = matrix(V, ncol=1)
  return(V)
}

### Vector To Matrix 
VtoM <- function(V){
  p = (-1 + sqrt(1+8*nrow(V)))/2
  M = matrix(0,p,p)
  cnt = 0
  for(i in 1:nrow(M)){
    for (j in 1:i){
      cnt = cnt + 1
      M[i,j] = V[cnt,1]
      M[j,i] = M[i,j]  
    }
  }  
 return(M) 
}
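As a small usage check (a sketch), the row-wise lower-triangle vector of the 3 x 3 identity should map back to the identity matrix:

# Example: (1, 0, 1, 0, 0, 1) is the row-wise lower triangle of I_3
VtoM(matrix(c(1, 0, 1, 0, 0, 1), ncol = 1))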

### Log-likelihood function
my_fun <- function (parvec, x) {
  a = dim(x)
  n = a[1]
  p = a[2]
  mu = matrix(parvec[1:p], ncol=1)
  sig = VtoM(matrix(parvec[(p+1):(p+p*(p+1)/2)], ncol=1))
  if (any(eigen(sig)$values <= 0)) {
    # Return NA outside the positive-definite region for Sigma
    l = NA
  } else {
    siginv = solve(sig)
    # C = sum_i (x_i - mu)(x_i - mu)^T
    C = matrix(0,p,p)
    for (i in 1:n){
      xm = x[i,] - mu
      C = C + xm %*% t(xm)
    }
    log_det_sig <- log(det(sig))
    # l = -(n*p*log(2*pi) + n*log|Sigma| + tr(Sigma^{-1} C)) / 2
    l = -(n*p*log(2*pi) + n*log_det_sig + sum(siginv*C))/2
  }
  return(l)
}
# Check: my_fun(parvec=matrix(c(0,0,0,1,0,1,0,0,1), ncol=1), x=data_hw3)
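If the mvtnorm package is available (an assumption, not required by the assignment), the value can be cross-checked against the sum of multivariate normal log-densities:

# Optional cross-check (assumes mvtnorm is installed): both values should agree.
library(mvtnorm)
sum(dmvnorm(data_hw3, mean = rep(0, 3), sigma = diag(3), log = TRUE))
my_fun(parvec = matrix(c(0,0,0,1,0,1,0,0,1), ncol = 1), x = data_hw3)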

### Gradient function
my_gr <- function (parvec, x) {
  a = dim(x)
  n = a[1]
  p = a[2]
  mu = matrix(parvec[1:p], ncol=1)
  sig = VtoM(matrix(parvec[(p+1):(p+p*(p+1)/2)], ncol=1))
  if (any(eigen(sig)$values <= 0)) { 
    gradms = NA
  } else {
    siginv = solve(sig)
    C = matrix(0,p,p)
    sxm = matrix(0,p,1)
    for (i in 1:n){
      xm = x[i,] - mu
      sxm = sxm + xm
      C = C + xm %*% t(xm)
    }
    # Gradient w.r.t. mu: Sigma^{-1} * sum_i (x_i - mu)
    gradm = siginv %*% sxm
    # Gradient w.r.t. the distinct elements of Sigma, scaled by sig_fun()
    A = n*siginv - (siginv %*% (C %*% siginv))
    grads = sig_fun(A)
    gradms = matrix(c(gradm, grads), ncol=1) 
  }
  return(gradms)
}
# Check: my_gr(parvec=matrix(c(0,0,0,1,0,1,0,0,1), ncol=1), x=data_hw3)
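Similarly, the analytic gradient can be compared with a numerical one (a sketch assuming the numDeriv package is installed):

# Optional cross-check of the analytic gradient against a numerical gradient.
library(numDeriv)
p0 <- c(0, 0, 0, 1, 0, 1, 0, 0, 1)
cbind(analytic  = as.vector(my_gr(p0, x = data_hw3)),
      numerical = grad(function(v) my_fun(v, x = data_hw3), p0))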

mu0 <- matrix(c(0, 0, 0), 3, 1)
sig0 <- matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow=3, ncol=3)

optim(par=matrix(c(0,0,0,1,0,1,0,0,1), ncol=1), fn=my_fun, gr=my_gr,  
x=data_hw3, method="BFGS", control=list(abstol=10^(-5), trace=TRUE, fnscale=-1))
## initial  value 1449.380448 
## iter  10 value 723.132199
## iter  20 value 692.631928
## iter  30 value 692.590246
## final  value 692.587107 
## converged
## $par
##             [,1]
##  [1,] -1.0402718
##  [2,]  0.9439508
##  [3,]  1.9896595
##  [4,]  1.0807572
##  [5,]  0.7093068
##  [6,]  0.9189932
##  [7,]  0.7800262
##  [8,]  0.6881343
##  [9,]  1.0484688
## 
## $value
## [1] -692.5871
## 
## $counts
## function gradient 
##      115       31 
## 
## $convergence
## [1] 0
## 
## $message
## NULL

Therefore, \[\hat{\mu}_1=-1.0402718,\quad \hat{\mu}_2=0.9439508,\quad \hat{\mu}_3=1.9896595\] \[\hat{\sigma}_{11}=1.0807572,\quad \hat{\sigma}_{21}=0.7093068,\quad \hat{\sigma}_{22}=0.9189932,\quad \hat{\sigma}_{31}=0.7800262,\quad \hat{\sigma}_{32}=0.6881343,\quad \hat{\sigma}_{33}=1.0484688.\]
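As a check, the closed-form Gaussian MLEs (the sample mean and the 1/n covariance matrix) should reproduce these estimates up to optimizer tolerance:

# Closed-form MLEs for comparison: sample mean and the (1/n) covariance.
colMeans(data_hw3)
cov(data_hw3) * (n - 1) / n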

Exercise GH-2.3

(a)

The contribution of observation \(i\) is
\[L_i=[f(t_i)]^{w_i}[1-F(t_i)]^{1-w_i},\]
so, writing \(\mu_i=\Lambda(t_i)\exp(X_i^T\beta)\) (hence \(\exp(X_i^T\beta)=\mu_i/\Lambda(t_i)\)),
\[\log\Big(\prod_{i = 1}^{n}L_i\Big)=\sum_{i = 1}^{n}\log L_i=\sum_{i = 1}^{n}\log\Big\{[f(t_i)]^{w_i}[1-F(t_i)]^{1-w_i}\Big\}=\sum_{i = 1}^{n}\big\{w_i\log f(t_i)+(1-w_i)\log(1-F(t_i))\big\}\]

\[=\sum_{i = 1}^{n}\Big[w_i\log\big(\lambda(t_i)\exp\{X_i^T\beta-\Lambda(t_i)\exp(X_i^T\beta)\}\big)+(1-w_i)\log\big(\exp\{-\Lambda(t_i)\exp(X_i^T\beta)\}\big)\Big]\]
\[=\sum_{i = 1}^{n}\Big[w_i\log\Big(\lambda(t_i)\exp\Big(\log\frac{\mu_i}{\Lambda(t_i)}-\mu_i\Big)\Big)+(1-w_i)\log\big(\exp(-\mu_i)\big)\Big]\]
\[=\sum_{i = 1}^{n}\Big[w_i\Big\{\log\lambda(t_i)+\log\frac{\mu_i}{\Lambda(t_i)}-\mu_i\Big\}+(1-w_i)(-\mu_i)\Big]\]
\[=\sum_{i = 1}^{n}\Big[w_i\log\lambda(t_i)+w_i\log\frac{\mu_i}{\Lambda(t_i)}-w_i\mu_i-\mu_i+w_i\mu_i\Big]\]
\[=\sum_{i = 1}^{n}\big(w_i\log\mu_i-\mu_i\big)+\sum_{i = 1}^{n}w_i\log\frac{\lambda(t_i)}{\Lambda(t_i)}.\]

(b)

# Data (GH-2.3): Time = observed times, W = event indicator (1 = observed,
# 0 = censored), Delta = group covariate used in exp(beta0 + Delta*beta1)
data_b <- data.frame(Time=c(6,6,6,6,7,9,10,10,11,13,16,17,19,20,22,23,25,32,32,
          34,35,1,1,2,2,3,4,4,5,5,8,8,8,8,11,11,12,12,15,17,22,23),
          W=c(0,1,1,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1
              ,1,1,1,1,1,1,1,1,1),Delta=c(rep(1,21),rep(0,21)))
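In part (b) the baseline hazard is Weibull, \(\lambda(t)=\alpha t^{\alpha-1}\) and \(\Lambda(t)=t^{\alpha}\), and the only covariate is the group indicator \(\delta_i\) (Delta above), so \(\mu_i=t_i^{\alpha}\exp(\beta_0+\beta_1\delta_i)\) and the log-likelihood from part (a) becomes
\[l(\alpha,\beta_0,\beta_1)=\sum_{i = 1}^{n}\big(w_i\log\mu_i-\mu_i\big)+\sum_{i = 1}^{n}w_i\log\frac{\alpha t_i^{\alpha-1}}{t_i^{\alpha}},\]
which is what Llike_b below computes; the functions that follow are its first and second partial derivatives.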

# Partial derivatives (score components)
dlda_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta 
  sum1 = 0
  sum2 = 0
  for (i in 1:nrow(data)) {
   dlda_v1 = w[i]*log(x[i])-(x[i]^(alpha))*log(x[i])*exp(beta0+del[i]*beta1)
   sum1 = sum1 + dlda_v1    
  }
  for (i in 1:nrow(data)) {
   dlda_v2 = w[i]/alpha
   sum2 = sum2 + dlda_v2    
  }
 return(sum1+sum2)  
}

dldb0_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta
  sum = 0
  for (i in 1:nrow(data)) {
   dldb0v = w[i]-(x[i]^(alpha))*exp(beta0+del[i]*beta1)
   sum = sum + dldb0v   
  }
 return(sum)  
}

dldb1_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta
  sum = 0
  for (i in 1:nrow(data)) {
   dldb1v = (w[i]*del[i])-(x[i]^alpha)*del[i]*exp(beta0+del[i]*beta1)
   sum = sum + dldb1v   
  }
  return(sum)   
}
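For reference, writing \(\eta_i=\exp(\beta_0+\beta_1\delta_i)\), the three score components computed above by dlda_fun, dldb0_fun, and dldb1_fun are
\[\frac{\partial l}{\partial\alpha}=\sum_{i = 1}^{n}\Big[\frac{w_i}{\alpha}+w_i\log t_i-t_i^{\alpha}\eta_i\log t_i\Big],\qquad
\frac{\partial l}{\partial\beta_0}=\sum_{i = 1}^{n}\big[w_i-t_i^{\alpha}\eta_i\big],\qquad
\frac{\partial l}{\partial\beta_1}=\sum_{i = 1}^{n}\delta_i\big[w_i-t_i^{\alpha}\eta_i\big].\]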

# Second partial derivatives (Hessian entries)
dl2da2_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta 
  sum1 = 0
  sum2 = 0
  for (i in 1:nrow(data)) {
    dl2da2v_1 = -(log(x[i])^2*(x[i]^alpha)*exp(beta0+del[i]*beta1))
    sum1 = sum1 + dl2da2v_1    
  }
  for (i in 1:nrow(data)) {
    dl2da2v_2 = -(w[i]/(alpha^2))
    sum2 = sum2 + dl2da2v_2    
  }
  return(sum1+sum2)  
}

dl2db02_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta 
  sum = 0
  for (i in 1:nrow(data)) {
    dl2db02v = -(x[i]^alpha)*exp(beta0+del[i]*beta1)
    sum = sum + dl2db02v    
  }
  return(sum)  
}

dl2db12_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta 
  sum = 0
  for (i in 1:nrow(data)) {
    dl2db12v = -(x[i]^alpha)*(del[i]^2)*exp(beta0+del[i]*beta1)
    sum = sum + dl2db12v    
  }
  return(sum)  
}

dl2dadb0_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta 
  sum = 0
  for (i in 1:nrow(data)) {
    dl2dadb0v = -log(x[i])*(x[i]^alpha)*exp(beta0+del[i]*beta1)
    sum = sum + dl2dadb0v    
  }
  return(sum)  
}

dl2dadb1_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta 
  sum = 0
  for (i in 1:nrow(data)) {
    dl2dadb1v = -del[i]*log(x[i])*(x[i]^alpha)*exp(beta0+del[i]*beta1)
    sum = sum + dl2dadb1v    
  }
  return(sum)  
}

dl2db0db1_fun <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta 
  sum = 0
  for (i in 1:nrow(data)) {
    dl2db0db1v = -del[i]*(x[i]^alpha)*exp(beta0+del[i]*beta1)
    sum = sum + dl2db0db1v    
  }
  return(sum)  
}

# Log-likelihood from part (a), with lambda(t) = alpha*t^(alpha-1) and
# Lambda(t) = t^alpha, so mu_i = t_i^alpha * exp(beta0 + Delta_i*beta1)
Llike_b <- function(data, alpha, beta0, beta1){
  x = data$Time
  w = data$W
  del = data$Delta
  l = 0 
  for (i in 1:nrow(data)){
  lv = w[i]*log((x[i]^alpha)*exp(beta0+del[i]*beta1))-
      (x[i]^alpha)*exp(beta0+del[i]*beta1)+
      w[i]*log(alpha*(x[i]^(alpha-1))/(x[i]^alpha))
  l = l + lv 
  }
 return(l)  
}  
# Test: Llike_b(data=data_b, alpha=1, beta0=0, beta1=0)

# Gradient
Grad_b <- function(data, alpha, beta0, beta1){  
  if (alpha <= 0){
    # Outside the valid region (alpha must be positive) return NAs
    l = NA
    gradab = NA
    gradnorm = NA
  } else {
    l = Llike_b(data, alpha, beta0, beta1) 
    dlda = dlda_fun(data, alpha, beta0, beta1)
    dldb0 = dldb0_fun(data, alpha, beta0, beta1)
    dldb1 = dldb1_fun(data, alpha, beta0, beta1)
    gradab = matrix(c(dlda, dldb0, dldb1), ncol=1)
    gradnorm = norm(gradab, type="2")          
  }
  return(list(l=l, gradab=gradab, gradnorm=gradnorm))        
}

# Test: Grad_b(data=data_b, alpha=1, beta0=0, beta1=0)

# Hessian    
Hess_b <- function(data, alpha, beta0, beta1){
  h = matrix(0, nrow=3, ncol=3)
  if (alpha <= 0){
    h = NA
  } else {
    h[1,1] = dl2da2_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[2,2] = dl2db02_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[3,3] = dl2db12_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[1,2] = h[2,1] = dl2dadb0_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[1,3] = h[3,1] = dl2dadb1_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[2,3] = h[3,2] = dl2db0db1_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
  }
  return(h)  
}

# Test: Hess_b(data=data_b, alpha=1, beta0=0, beta1=0)
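Before running Newton's method, the analytic derivatives can be spot-checked numerically (a sketch assuming the numDeriv package is installed):

# Numerical spot-check of the analytic gradient and Hessian at (1, 0, 0).
library(numDeriv)
f0 <- function(th) Llike_b(data_b, alpha = th[1], beta0 = th[2], beta1 = th[3])
max(abs(grad(f0, c(1, 0, 0)) - Grad_b(data_b, 1, 0, 0)$gradab))  # ~ 0
max(abs(hessian(f0, c(1, 0, 0)) - Hess_b(data_b, 1, 0, 0)))      # ~ 0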

# Newton Optimization
Newton_fun <- function (data, alpha, beta0, beta1, tolerr, tolgrad, maxit){
  header = paste0("Iteration", "   Halving", "         log-likelihood", 
                  "        ||gradient||")
  print(header)
  for(it in 1:maxit) {
    theta = matrix(c(alpha, beta0, beta1), ncol=1)
    a <- Grad_b(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    print("-----------------------------------------------------------------")
    print(sprintf('%2.0f           -             %.4f             %.1e', 
                  it,  a$l, a$gradnorm))
    b <- Hess_b(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    hessinv  = solve(b)
    # Full Newton step: theta_new = theta - H^{-1} * gradient
    dir <- -1*hessinv %*% a$gradab 
    thetan = theta + dir
    alphan = thetan[1,]
    beta0n = thetan[2,]
    beta1n = thetan[3,]
    atmp = Grad_b(data=data, alpha=alphan, beta0=beta0n, beta1=beta1n)
    if (is.na(atmp$l)) {
      atmp_new = -Inf
    } else {
      atmp_new = atmp$l
    }   
    halve = 0
    print(sprintf('%2.0f          %2.0f             %.4f             %.1e', 
                  it, halve, atmp$l, atmp$gradnorm))    
    # Step halving: shrink the step until the log-likelihood no longer decreases
    while(atmp_new < a$l & halve <= 20) {
      halve = halve + 1
      thetan = theta + dir/(2^halve)
      alphan = thetan[1,]
      beta0n = thetan[2,]
      beta1n = thetan[3,]
      atmp = Grad_b(data=data, alpha=alphan, beta0=beta0n, beta1=beta1n)
      print(sprintf('%2.0f          %2.0f             %.4f            %.1e', 
                    it, halve, atmp$l, atmp$gradnorm))
      if (is.na(atmp$l)) {
        atmp_new = -Inf
      } else {
        atmp_new = atmp$l
      }       
    } 
    thetan1 = thetan
    if (halve >= 20) { break }
    # Convergence: maximum relative change in theta and gradient norm both small
    Mre = max(abs(thetan1 - theta)/pmax(1, abs(thetan1)))
    gradnorm = atmp$gradnorm
    if (Mre < tolerr && gradnorm < tolgrad){ break }
    alpha = thetan1[1, ]
    beta0 = thetan1[2, ]
    beta1 = thetan1[3, ]
  }
  return(list(alpha=alpha, beta0=beta0, beta1=beta1))
}

alpha_ini <- 1
beta0_ini <- 0
beta1_ini <- 0
tolerr <- 10^(-6)
tolgrad <- 10^(-5)
result.d <- Newton_fun(data=data_b, alpha=alpha_ini, beta0=beta0_ini, 
                       beta1=beta1_ini, tolerr=tolerr, tolgrad=tolgrad, 
                       maxit=500)
## [1] "Iteration   Halving         log-likelihood        ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1           -             -541.0000             1.6e+03"
## [1] " 1           0             -237.4147             5.5e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 2           -             -237.4147             5.5e+02"
## [1] " 2           0             -139.4431             1.8e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 3           -             -139.4431             1.8e+02"
## [1] " 3           0             -112.5964             5.4e+01"
## [1] "-----------------------------------------------------------------"
## [1] " 4           -             -112.5964             5.4e+01"
## [1] " 4           0             -107.1589             1.2e+01"
## [1] "-----------------------------------------------------------------"
## [1] " 5           -             -107.1589             1.2e+01"
## [1] " 5           0             -106.5914             1.6e+00"
## [1] "-----------------------------------------------------------------"
## [1] " 6           -             -106.5914             1.6e+00"
## [1] " 6           0             -106.5795             3.9e-02"
## [1] "-----------------------------------------------------------------"
## [1] " 7           -             -106.5795             3.9e-02"
## [1] " 7           0             -106.5795             2.5e-05"
## [1] "-----------------------------------------------------------------"
## [1] " 8           -             -106.5795             2.5e-05"
## [1] " 8           0             -106.5795             1.0e-11"
result.d
## $alpha
## [1] 1.365758
## 
## $beta0
## [1] -3.070704
## 
## $beta1
## [1] -1.730871
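As a cross-check (a sketch, not required by the exercise), maximizing Llike_b directly with a general-purpose optimizer should give essentially the same estimates:

# Cross-check of the Newton solution; the lower bound keeps alpha positive.
optim(par = c(alpha = 1, beta0 = 0, beta1 = 0),
      fn = function(th) Llike_b(data_b, alpha = th[1], beta0 = th[2], beta1 = th[3]),
      method = "L-BFGS-B", lower = c(1e-6, -Inf, -Inf),
      control = list(fnscale = -1))$par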

(d)

h <- Hess_b(data=data_b, alpha=1.365758, beta0=-3.070704, beta1=-1.730871)
OIM <- sqrt(-solve(h))
## Warning in sqrt(-solve(h)): NaNs produced
# std.error of alpha
OIM[1,1]
## [1] 0.201165
# std.error of beta0
OIM[2,2]
## [1] 0.5580701
# std.error of beta1
OIM[3,3] 
## [1] 0.4130815
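The NaN warning comes from taking square roots of the negative off-diagonal entries of \(-H^{-1}\); the standard errors use only the diagonal, so a warning-free equivalent is:

# Warning-free equivalent: take square roots of the diagonal only.
sqrt(diag(-solve(h)))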

Since \(\mathrm{std.error}(\hat{\theta}_i) \approx \sqrt{\big[-H^{-1}(\hat{\theta})\big]_{ii}}\),
\[\mathrm{std.error}(\hat{\alpha})= 0.201165,\qquad \mathrm{std.error}(\hat{\beta}_0)= 0.5580701,\qquad \mathrm{std.error}(\hat{\beta}_1)= 0.4130815.\]

h <- Hess_b(data=data_b, alpha=1.365758, beta0=-3.070704, beta1=-1.730871)
OIM <- sqrt(-solve(h))
## Warning in sqrt(-solve(h)): NaNs produced
h1 <- -solve(h)
# other way: cov2cor(-solve(h))

# Correlation of alpha & beta0
h1[1,2]/(OIM[1,1]*OIM[2,2])
## [1] -0.9203813
# Correlation of alpha & beta1
h1[1,3]/(OIM[1,1]*OIM[3,3])
## [1] -0.2641532
# Correlation of beta0 & beta1
h1[2,3]/(OIM[2,2]*OIM[3,3])
## [1] 0.03655716

Since \(\mathrm{Corr}(x,y)=\dfrac{\mathrm{Cov}(x,y)}{\sqrt{\mathrm{Var}(x)}\,\sqrt{\mathrm{Var}(y)}}\),
\[\mathrm{Corr}(\alpha, \beta_0)= \rho(\alpha, \beta_0) = -0.9203813,\qquad \mathrm{Corr}(\alpha, \beta_1)= \rho(\alpha, \beta_1) = -0.2641532,\qquad \mathrm{Corr}(\beta_0, \beta_1)= \rho(\beta_0, \beta_1) = 0.03655716.\]

As can be seen from the results above, \(\alpha\) and \(\beta_0\) are strongly negatively correlated, \(\alpha\) and \(\beta_1\) are moderately negatively correlated, and \(\beta_0\) and \(\beta_1\) are only weakly positively correlated.