### Generating Data
sqrtm <- function(A){
  # Symmetric square root via the spectral decomposition A = V D V^T
  a = eigen(A)
  sqm = a$vectors %*% diag(sqrt(a$values)) %*% t(a$vectors)
  # Symmetrize to clean up floating-point asymmetry
  sqm = (sqm + t(sqm))/2
  return(sqm)
}
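A quick sanity check: squaring the result should recover the input (here using the sig1 defined just below).
# Check: all.equal(sqrtm(sig1) %*% sqrtm(sig1), sig1) should return TRUE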
gen <- function(n, p, mu, sig, seed){
  # Draw n observations from N_p(mu, sig): z ~ N(0, I), then scale and shift
  set.seed(seed)
  x = matrix(rnorm(n*p), n, p)
  datan = x %*% sqrtm(sig) + matrix(mu, n, p, byrow = TRUE)
  datan
}
n <- 200
p <- 3
mu1 <- matrix(c(-1,1,2), nrow=3, ncol=1)
sig1 <- matrix(c(1,0.7,0.7,0.7,1,0.7,0.7,0.7,1), nrow=3, ncol=3)
data_hw3 <- gen(n=n, p=p, mu=mu1, sig=sig1, seed=2022)
head(data_hw3, 3)
## [,1] [,2] [,3]
## [1,] 0.1947111 1.9078866 3.153741
## [2,] -2.8172598 -0.2811456 -0.352587
## [3,] -2.4052730 -1.0723173 1.237333
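As a further check on the generator, the sample moments should be close to the targets.
# Check: colMeans(data_hw3) should be near c(-1, 1, 2), and cov(data_hw3) near sig1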
### Matrix to Vector (Sigma)
sig_fun <- function(A){
  # Pack the row-wise lower triangle of A into a column vector, scaling
  # diagonal entries by -1/2 and off-diagonals by -1 (the weights needed
  # for the Sigma-block of the gradient below)
  p = nrow(A)
  V = numeric(p*(p+1)/2)
  cnt = 0
  for (i in 1:p){
    for (j in 1:i){
      cnt = cnt + 1
      V[cnt] = if (i == j) (-1/2)*A[i,j] else -A[i,j]
    }
  }
  V = matrix(V, ncol=1)
  return(V)
}
### Vector To Matrix
VtoM <- function(V){
  # Unpack a row-wise lower-triangle vector (s11, s21, s22, s31, s32, s33, ...)
  # into a symmetric p x p matrix; p solves p(p+1)/2 = length(V)
  p = (-1 + sqrt(1 + 8*nrow(V)))/2
  M = matrix(0, p, p)
  cnt = 0
  for (i in 1:p){
    for (j in 1:i){
      cnt = cnt + 1
      M[i,j] = V[cnt,1]
      M[j,i] = M[i,j]
    }
  }
  return(M)
}
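Packing and unpacking should round-trip; for instance, feeding VtoM the row-wise lower triangle of sig1 (note sig_fun is not its inverse, since it rescales entries for the gradient).
# Check: VtoM(matrix(c(1, 0.7, 1, 0.7, 0.7, 1), ncol=1)) should reproduce sig1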
### Log-likelihood function
my_fun <- function(parvec, x){
  # Multivariate normal log-likelihood; parvec = (mu, lower triangle of Sigma)
  a = dim(x)
  n = a[1]
  p = a[2]
  mu = matrix(parvec[1:p], ncol=1)
  sig = VtoM(matrix(parvec[(p+1):(p+p*(p+1)/2)], ncol=1))
  if (any(eigen(sig)$values <= 0)) {
    l = NA  # Sigma must be positive definite
  } else {
    siginv = solve(sig)
    C = matrix(0, p, p)
    for (i in 1:n){
      xm = x[i,] - mu
      C = C + xm %*% t(xm)
    }
    log_det_sig <- log(det(sig))
    # sum(siginv*C) = trace(siginv %*% C), since both matrices are symmetric
    l = -(n*p*log(2*pi) + n*log_det_sig + sum(siginv*C))/2
  }
  return(l)
}
# Check: my_fun(parvec=matrix(c(0,0,0,1,0,1,0,0,1), ncol=1), x=data_hw3)
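The value can also be cross-checked against an off-the-shelf density (a sketch, assuming the mvtnorm package is installed):
# Check: sum(mvtnorm::dmvnorm(data_hw3, mean=rep(0,3), sigma=diag(3), log=TRUE))
# should match my_fun at the same parameters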
### Gradient function
my_gr <- function(parvec, x){
  # Analytic gradient of my_fun with respect to (mu, lower triangle of Sigma)
  a = dim(x)
  n = a[1]
  p = a[2]
  mu = matrix(parvec[1:p], ncol=1)
  sig = VtoM(matrix(parvec[(p+1):(p+p*(p+1)/2)], ncol=1))
  if (any(eigen(sig)$values <= 0)) {
    gradms = NA
  } else {
    siginv = solve(sig)
    C = matrix(0, p, p)
    sxm = matrix(0, p, 1)
    for (i in 1:n){
      xm = x[i,] - mu
      sxm = sxm + xm
      C = C + xm %*% t(xm)
    }
    gradm = siginv %*% sxm                          # dl/dmu
    A = n*siginv - (siginv %*% (C %*% siginv))
    grads = sig_fun(A)                              # dl/d(lower triangle of Sigma)
    gradms = matrix(c(gradm, grads), ncol=1)
  }
  return(gradms)
}
# Check: my_gr(parvec=matrix(c(0,0,0,1,0,1,0,0,1), ncol=1), x=data_hw3)
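The analytic gradient can be verified by numerical differentiation (a sketch, assuming the numDeriv package is installed):
# Check: numDeriv::grad(function(p) my_fun(p, x=data_hw3), c(0,0,0,1,0,1,0,0,1))
# should agree with my_gr at the same point up to numerical error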
# Starting values: mu = (0,0,0) and Sigma = I_3, packed as
# (mu_1, mu_2, mu_3, s11, s21, s22, s31, s32, s33)
mu0 <- matrix(c(0, 0, 0), 3, 1)
sig0 <- matrix(c(1, 0, 0, 0, 1, 0, 0, 0, 1), nrow=3, ncol=3)
# fnscale = -1 makes optim maximize the log-likelihood
optim(par=matrix(c(0,0,0,1,0,1,0,0,1), ncol=1), fn=my_fun, gr=my_gr,
      x=data_hw3, method="BFGS", control=list(abstol=10^(-5), trace=TRUE, fnscale=-1))
## initial value 1449.380448
## iter 10 value 723.132199
## iter 20 value 692.631928
## iter 30 value 692.590246
## final value 692.587107
## converged
## $par
## [,1]
## [1,] -1.0402718
## [2,] 0.9439508
## [3,] 1.9896595
## [4,] 1.0807572
## [5,] 0.7093068
## [6,] 0.9189932
## [7,] 0.7800262
## [8,] 0.6881343
## [9,] 1.0484688
##
## $value
## [1] -692.5871
##
## $counts
## function gradient
## 115 31
##
## $convergence
## [1] 0
##
## $message
## NULL
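The last six entries of $par can be unpacked with VtoM to read the fitted covariance back as a matrix; it should also be close to the closed-form MLE (n-1)/n * cov(data_hw3).
# Check: VtoM(matrix(c(1.0807572, 0.7093068, 0.9189932,
#                      0.7800262, 0.6881343, 1.0484688), ncol=1))
# reconstructs the estimated Sigma; compare with cov(data_hw3)*(n-1)/n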
Therefore,
\[\hat{\mu}_1=-1.0402718,\quad\hat{\mu}_2=0.9439508,\quad\hat{\mu}_3=1.9896595,\]
\[\hat{\sigma}_{11}=1.0807572,\quad\hat{\sigma}_{21}=0.7093068,\quad\hat{\sigma}_{22}=0.9189932,\quad\hat{\sigma}_{31}=0.7800262,\quad\hat{\sigma}_{32}=0.6881343,\quad\hat{\sigma}_{33}=1.0484688.\]
For censored survival data, let \(t_i\) be the observed time and \(w_i\) the event indicator (\(w_i=1\) if the failure is observed, \(w_i=0\) if censored). Each observation contributes
\[L_i=[f(t_i)]^{w_i}\,[1-F(t_i)]^{1-w_i},\]
so the log-likelihood is
\[\log\prod_{i=1}^{n}L_i=\sum_{i=1}^{n}\log L_i=\sum_{i=1}^{n}\bigl\{w_i\log f(t_i)+(1-w_i)\log\bigl(1-F(t_i)\bigr)\bigr\}.\]
Under the proportional hazards model the hazard is \(\lambda(t_i)\exp(X_i^T\beta)\) and the cumulative hazard is \(\Lambda(t_i)\exp(X_i^T\beta)\). Writing \(\mu_i=\Lambda(t_i)\exp(X_i^T\beta)\), so that \(X_i^T\beta=\log\{\mu_i/\Lambda(t_i)\}\), we have \(f(t_i)=\lambda(t_i)\exp(X_i^T\beta)\exp(-\mu_i)\) and \(1-F(t_i)=\exp(-\mu_i)\). Hence the log-likelihood equals
\[\sum_{i=1}^{n}\Bigl[w_i\log\bigl(\lambda(t_i)e^{X_i^T\beta}e^{-\mu_i}\bigr)+(1-w_i)\log e^{-\mu_i}\Bigr]
=\sum_{i=1}^{n}\Bigl[w_i\Bigl\{\log\lambda(t_i)+\log\frac{\mu_i}{\Lambda(t_i)}-\mu_i\Bigr\}-(1-w_i)\mu_i\Bigr]\]
\[=\sum_{i=1}^{n}\bigl(w_i\log\mu_i-\mu_i\bigr)+\sum_{i=1}^{n}w_i\log\frac{\lambda(t_i)}{\Lambda(t_i)}.\]
For the Weibull baseline used below, \(\lambda(t)=\alpha t^{\alpha-1}\) and \(\Lambda(t)=t^{\alpha}\), so \(\mu_i=t_i^{\alpha}e^{\beta_0+\delta_i\beta_1}\) and \(\lambda(t_i)/\Lambda(t_i)=\alpha/t_i\); this is exactly the form implemented in Llike_b below.
# Data (GH-2.3)
# Time: survival/censoring time; W: event indicator (1 = observed, 0 = censored);
# Delta: group covariate (1 = first group of 21, 0 = second group of 21)
data_b <- data.frame(Time=c(6,6,6,6,7,9,10,10,11,13,16,17,19,20,22,23,25,32,32,
                            34,35,1,1,2,2,3,4,4,5,5,8,8,8,8,11,11,12,12,15,17,22,23),
                     W=c(0,1,1,1,1,0,0,1,0,1,1,0,0,0,1,1,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,
                         1,1,1,1,1,1,1,1,1),
                     Delta=c(rep(1,21),rep(0,21)))
# Partial Derivatives
# Throughout, mu_i = t_i^alpha * exp(beta0 + delta_i*beta1)
# dl/dalpha = sum_i { w_i*log(t_i) + w_i/alpha - log(t_i)*mu_i }
dlda_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum1 = 0
sum2 = 0
for (i in 1:nrow(data)) {
dlda_v1 = w[i]*log(x[i])-(x[i]^(alpha))*log(x[i])*exp(beta0+del[i]*beta1)
sum1 = sum1 + dlda_v1
}
for (i in 1:nrow(data)) {
dlda_v2 = w[i]/alpha
sum2 = sum2 + dlda_v2
}
return(sum1+sum2)
}
# dl/dbeta0 = sum_i { w_i - mu_i }
dldb0_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum = 0
for (i in 1:nrow(data)) {
dldb0v = w[i]-(x[i]^(alpha))*exp(beta0+del[i]*beta1)
sum = sum + dldb0v
}
return(sum)
}
# dl/dbeta1 = sum_i delta_i*{ w_i - mu_i }
dldb1_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum = 0
for (i in 1:nrow(data)) {
dldb1v = (w[i]*del[i])-(x[i]^alpha)*del[i]*exp(beta0+del[i]*beta1)
sum = sum + dldb1v
}
return(sum)
}
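The element-wise loops above can also be written as vectorized sums. For example, a one-line equivalent of dldb0_fun (a sketch; the name dldb0_vec is only illustrative):
dldb0_vec <- function(data, alpha, beta0, beta1){
  # Same quantity as dldb0_fun: sum_i { w_i - t_i^alpha * exp(beta0 + delta_i*beta1) }
  with(data, sum(W - Time^alpha * exp(beta0 + Delta*beta1)))
}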
# Second Partial Derivatives
# d2l/dalpha2 = -sum_i { (log t_i)^2 * mu_i + w_i/alpha^2 }
dl2da2_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum1 = 0
sum2 = 0
for (i in 1:nrow(data)) {
dl2da2v_1 = -(log(x[i])^2*(x[i]^alpha)*exp(beta0+del[i]*beta1))
sum1 = sum1 + dl2da2v_1
}
for (i in 1:nrow(data)) {
dl2da2v_2 = -(w[i]/(alpha^2))
sum2 = sum2 + dl2da2v_2
}
return(sum1+sum2)
}
# d2l/dbeta0^2 = -sum_i mu_i
dl2db02_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum = 0
for (i in 1:nrow(data)) {
dl2db02v = -(x[i]^alpha)*exp(beta0+del[i]*beta1)
sum = sum + dl2db02v
}
return(sum)
}
# d2l/dbeta1^2 = -sum_i delta_i^2 * mu_i (= -sum_i delta_i*mu_i here, as delta_i is 0/1)
dl2db12_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum = 0
for (i in 1:nrow(data)) {
dl2db12v = -(x[i]^alpha)*(del[i]^2)*exp(beta0+del[i]*beta1)
sum = sum + dl2db12v
}
return(sum)
}
# d2l/dalpha dbeta0 = -sum_i log(t_i)*mu_i
dl2dadb0_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum = 0
for (i in 1:nrow(data)) {
dl2dadb0v = -log(x[i])*(x[i]^alpha)*exp(beta0+del[i]*beta1)
sum = sum + dl2dadb0v
}
return(sum)
}
# d2l/dalpha dbeta1 = -sum_i delta_i*log(t_i)*mu_i
dl2dadb1_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum = 0
for (i in 1:nrow(data)) {
dl2dadb1v = -del[i]*log(x[i])*(x[i]^alpha)*exp(beta0+del[i]*beta1)
sum = sum + dl2dadb1v
}
return(sum)
}
# d2l/dbeta0 dbeta1 = -sum_i delta_i*mu_i
dl2db0db1_fun <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
sum = 0
for (i in 1:nrow(data)) {
dl2db0db1v = -del[i]*(x[i]^alpha)*exp(beta0+del[i]*beta1)
sum = sum + dl2db0db1v
}
return(sum)
}
# Log-likelihood: l = sum_i { w_i*log(mu_i) - mu_i } + sum_i w_i*log(alpha/t_i)
Llike_b <- function(data, alpha, beta0, beta1){
x = data$Time
w = data$W
del = data$Delta
l = 0
for (i in 1:nrow(data)){
lv = w[i]*log((x[i]^alpha)*exp(beta0+del[i]*beta1))-
(x[i]^alpha)*exp(beta0+del[i]*beta1)+
w[i]*log(alpha*(x[i]^(alpha-1))/(x[i]^alpha))
l = l + lv
}
return(l)
}
# Test: Llike_b(data=data_b, alpha=1, beta0=0, beta1=0)
# Gradient
Grad_b <- function(data, alpha, beta0, beta1){
  if (alpha <= 0){
    # Outside the parameter space: flag everything with NA
    l = NA
    gradab = NA
    gradnorm = NA
  } else {
    l = Llike_b(data, alpha, beta0, beta1)
    dlda = dlda_fun(data, alpha, beta0, beta1)
    dldb0 = dldb0_fun(data, alpha, beta0, beta1)
    dldb1 = dldb1_fun(data, alpha, beta0, beta1)
    gradab = matrix(c(dlda, dldb0, dldb1), ncol=1)
    gradnorm = norm(gradab, type="2")
  }
  return(list(l=l, gradab=gradab, gradnorm=gradnorm))
}
# Test: Grad_b(data=data_b, alpha=1, beta0=0, beta1=0)
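The analytic score can be verified numerically (a sketch, assuming the numDeriv package is installed):
# Check: numDeriv::grad(function(th) Llike_b(data_b, th[1], th[2], th[3]), c(1,0,0))
# should match Grad_b(data=data_b, alpha=1, beta0=0, beta1=0)$gradab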
# Hessian
Hess_b <- function(data, alpha, beta0, beta1){
  h = matrix(0, nrow=3, ncol=3)
  if (alpha <= 0){
    h = NA
  } else {
    h[1,1] = dl2da2_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[2,2] = dl2db02_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[3,3] = dl2db12_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[1,2] = h[2,1] = dl2dadb0_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[1,3] = h[3,1] = dl2dadb1_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    h[2,3] = h[3,2] = dl2db0db1_fun(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
  }
  return(h)
}
# Test: Hess_b(data=data_b, alpha=1, beta0=0, beta1=0)
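Likewise for the Hessian (a sketch, again assuming numDeriv):
# Check: numDeriv::hessian(function(th) Llike_b(data_b, th[1], th[2], th[3]), c(1,0,0))
# should match Hess_b(data=data_b, alpha=1, beta0=0, beta1=0)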
# Newton Optimization
Newton_fun <- function(data, alpha, beta0, beta1, tolerr, tolgrad, maxit){
  header = paste0("Iteration", " Halving", " log-likelihood",
                  " ||gradient||")
  print(header)
  for (it in 1:maxit){
    theta = matrix(c(alpha, beta0, beta1), ncol=1)
    a <- Grad_b(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    print("-----------------------------------------------------------------")
    print(sprintf('%2.0f - %.4f %.1e',
                  it, a$l, a$gradnorm))
    b <- Hess_b(data=data, alpha=alpha, beta0=beta0, beta1=beta1)
    hessinv = solve(b)
    dir <- -1*hessinv %*% a$gradab   # Newton direction
    thetan = theta + dir
    alphan = thetan[1,]
    beta0n = thetan[2,]
    beta1n = thetan[3,]
    atmp = Grad_b(data=data, alpha=alphan, beta0=beta0n, beta1=beta1n)
    if (is.na(atmp$l)) {
      atmp_new = -Inf
    } else {
      atmp_new = atmp$l
    }
    halve = 0
    print(sprintf('%2.0f %2.0f %.4f %.1e',
                  it, halve, atmp$l, atmp$gradnorm))
    # Step-halving: shrink the step until the log-likelihood improves
    while (atmp_new < a$l & halve <= 20){
      halve = halve + 1
      thetan = theta + dir/(2^halve)
      alphan = thetan[1,]
      beta0n = thetan[2,]
      beta1n = thetan[3,]
      atmp = Grad_b(data=data, alpha=alphan, beta0=beta0n, beta1=beta1n)
      print(sprintf('%2.0f %2.0f %.4f %.1e',
                    it, halve, atmp$l, atmp$gradnorm))
      if (is.na(atmp$l)) {
        atmp_new = -Inf
      } else {
        atmp_new = atmp$l
      }
    }
    if (halve >= 20) { break }   # step-halving failed; keep the last accepted iterate
    # Accept the update, then test convergence on the accepted iterate
    alpha = thetan[1,]
    beta0 = thetan[2,]
    beta1 = thetan[3,]
    Mre = max(abs(thetan - theta)/pmax(1, abs(thetan)))  # modified relative error
    gradnorm = atmp$gradnorm
    if (Mre < tolerr && gradnorm < tolgrad){ break }
  }
  return(list(alpha=alpha, beta0=beta0, beta1=beta1))
}
alpha_ini <- 1
beta0_ini <- 0
beta1_ini <- 0
tolerr <- 10^(-6)
tolgrad <- 10^(-5)
result.d <- Newton_fun(data=data_b, alpha=alpha_ini, beta0=beta0_ini,
                       beta1=beta1_ini, tolerr=tolerr, tolgrad=tolgrad,
                       maxit=500)
## [1] "Iteration Halving log-likelihood ||gradient||"
## [1] "-----------------------------------------------------------------"
## [1] " 1 - -541.0000 1.6e+03"
## [1] " 1 0 -237.4147 5.5e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 2 - -237.4147 5.5e+02"
## [1] " 2 0 -139.4431 1.8e+02"
## [1] "-----------------------------------------------------------------"
## [1] " 3 - -139.4431 1.8e+02"
## [1] " 3 0 -112.5964 5.4e+01"
## [1] "-----------------------------------------------------------------"
## [1] " 4 - -112.5964 5.4e+01"
## [1] " 4 0 -107.1589 1.2e+01"
## [1] "-----------------------------------------------------------------"
## [1] " 5 - -107.1589 1.2e+01"
## [1] " 5 0 -106.5914 1.6e+00"
## [1] "-----------------------------------------------------------------"
## [1] " 6 - -106.5914 1.6e+00"
## [1] " 6 0 -106.5795 3.9e-02"
## [1] "-----------------------------------------------------------------"
## [1] " 7 - -106.5795 3.9e-02"
## [1] " 7 0 -106.5795 2.5e-05"
## [1] "-----------------------------------------------------------------"
## [1] " 8 - -106.5795 2.5e-05"
## [1] " 8 0 -106.5795 1.0e-11"
result.d
## $alpha
## [1] 1.365758
##
## $beta0
## [1] -3.070704
##
## $beta1
## [1] -1.730871
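As a convergence check, the score norm at the returned estimates should be essentially zero, matching the final trace line above:
# Check: Grad_b(data=data_b, alpha=result.d$alpha, beta0=result.d$beta0,
#               beta1=result.d$beta1)$gradnorm is about 1e-11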
h <- Hess_b(data=data_b, alpha=1.365758, beta0=-3.070704, beta1=-1.730871)
# Standard errors: square root of the diagonal of the inverse observed information
# (taking sqrt of the whole matrix would produce NaN warnings for the
# negative off-diagonal entries)
OIM <- sqrt(diag(-solve(h)))
# std.error of alpha
OIM[1]
## [1] 0.201165
# std.error of beta0
OIM[2]
## [1] 0.5580701
# std.error of beta1
OIM[3]
## [1] 0.4130815
Since
\[\mathrm{std.error}(\hat{\theta}_i)\approx\sqrt{\bigl[-H^{-1}(\hat{\theta})\bigr]_{ii}},\]
we obtain
\[\mathrm{std.error}(\hat{\alpha})=0.201165,\quad\mathrm{std.error}(\hat{\beta}_0)=0.5580701,\quad\mathrm{std.error}(\hat{\beta}_1)=0.4130815.\]
# Estimated covariance matrix of the estimates: inverse observed information
h1 <- -solve(h)
# (equivalently, cov2cor(-solve(h)) gives the full correlation matrix at once)
# Correlation of alpha & beta0
h1[1,2]/(OIM[1]*OIM[2])
## [1] -0.9203813
# Correlation of alpha & beta1
h1[1,3]/(OIM[1]*OIM[3])
## [1] -0.2641532
# Correlation of beta0 & beta1
h1[2,3]/(OIM[2]*OIM[3])
## [1] 0.03655716
Since
\[\mathrm{Corr}(X,Y)=\frac{\mathrm{Cov}(X,Y)}{\sqrt{\mathrm{Var}(X)}\sqrt{\mathrm{Var}(Y)}},\]
we obtain
\[\rho(\hat{\alpha},\hat{\beta}_0)=-0.9203813,\quad\rho(\hat{\alpha},\hat{\beta}_1)=-0.2641532,\quad\rho(\hat{\beta}_0,\hat{\beta}_1)=0.03655716.\]
As the results above show, \(\hat{\alpha}\) and \(\hat{\beta}_0\) are strongly negatively correlated, \(\hat{\alpha}\) and \(\hat{\beta}_1\) are moderately negatively correlated, and \(\hat{\beta}_0\) and \(\hat{\beta}_1\) are only weakly positively correlated.