MLE응용

Part 1) Challenger 우주왕복선 O-ring 고장확률예측
변수
\(y_i\) : 고장난 O-ring 개수(erosion/blowby)
\(n_i\) : O-rings 개수 (=6)
\(x_i\) : 온도(Temp/F) \(i=1,...25\)

  1. 위의 3개의 link function 에 대한 AIC 값을 각각 구하고 이를 이용하여 최적 예측 모형을 구하시오.
    (AIC = \(-2 l (\hat {\theta} + 2p; l(\theta) = lnL(\theta); p =\) # of free parameters in the model; AIC 가 가장 작은 모형선택 )
#Setup
options(scipen=100)
options(crayon.enabled = FALSE) 
setwd("D:\\0_grad\\data")

library(tidyverse)
library(optimx)
library(readxl)
library(knitr)
library(kableExtra)
oring <- read_xlsx("O-ring-Leukemia-data.xlsx", col_names=T, na="*", sheet=1) 
names(oring)<-c('flight','orings','damage','temp')
n <- 6
data <- na.omit(oring)
attach(data)
y <- damage
x <- temp
p <- 2
# logit.glm <- glm(cbind(damage, 6-damage) ~ temp, data=data, family=binomial(link="logit"))
# probit.glm <- glm(cbind(damage, 6-damage) ~ temp, data=data, family=binomial(link="probit"))
# gompit.glm <- glm(cbind(damage, 6-damage) ~ temp, data=data, family=binomial(link="cloglog"))

#logit
logit.function <- function(variable){
  alpha <- variable[1]
  beta <- variable[2]
  z <- alpha + beta * x
  logit <- 1/(1+exp(-z))
  sum(log(logit^(y) * (1-logit)^(n-y)))
}
logit.initial.value <- optim(par=c(3,0),logit.function,method="BFGS",hessian=T,control=list(fnscale=-1))$par
logit.optim <- optimx(par=c(logit.initial.value), logit.function, 
                      control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
logit.result <- logit.optim[which.min(logit.optim[1:5,]$value),]
logit.l.theta <- logit.result$value
logit.AIC <- -2 * logit.l.theta + 2 * p 


#Probit
probit.function <- function(variable){
  alpha <- variable[1]
  beta <- variable[2]
  z <- alpha + beta * x
  probit <- pnorm(z)
#  return(sum(y*log(probit)+(n-y)*log(1-probit)))
  sum(log(probit^(y) * (1-probit)^(n-y)))
}
probit.initial.value <- optim(par=c(1,0),probit.function,method="BFGS",hessian=T,control=list(fnscale=-1))$par
probit.optim <- optimx(par=c(probit.initial.value),probit.function, 
                       control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
probit.result <- probit.optim[1,]
probit.l.theta <- probit.result$value
probit.AIC <- -2 * probit.l.theta + 2 * p 

#Gompit
gompit.function <- function(variable){
  alpha <- variable[1]
  beta <- variable[2]
  z <- alpha + beta * x
  gompit <- 1-exp(-exp(z))
  return(sum(y*log(gompit)+(n-y)*log(1-gompit)))
}

gompit.initial.value <- optim(par=c(1,0),gompit.function,method="BFGS",hessian=T,control=list(fnscale=-1))$par
gompit.optim <- optimx(par=c(gompit.initial.value),gompit.function, 
                       control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
gompit.result <- gompit.optim[1,]
gompit.l.theta <- gompit.result$value
gompit.AIC <- -2 * gompit.l.theta + 2 * p 

a.result <- data.frame(rbind(c(-2*logit.l.theta, 2*p, logit.AIC), 
                     c(-2*probit.l.theta, 2*p, probit.AIC), 
                     c(-2*gompit.l.theta, 2*p, gompit.AIC)))

rownames(a.result) <- c("logit", "probit", "gompit")
colnames(a.result) <- c("-2l(theta)", "2p", "AIC")

kable(a.result) %>%   kable_styling(full_width = F, position = "left")
-2l(theta) 2p AIC
logit 60.39653 4 64.39653
probit 60.62158 4 64.62158
gompit 60.32436 4 64.32436


gompit 모형 선택


b) 위에서 선택된 모형에서 \(H_0 : \beta = 0\) v.s. \(H_1 : \beta \neq 0\) 두 모형에 대해 AIC를 구하고 적절한 예측모형을 선택하시오

gompit.h0.function <- function(variable){
  alpha <- variable[1]
  beta <- 0
  z <- alpha + beta * x
  gompit <- 1-exp(-exp(z))
  return(sum(y*log(gompit)+(n-y)*log(1-gompit)))
}

p.h0 <- 1
gompit.h0.initial.value <- optim(par=0,gompit.h0.function,hessian=T,control=list(fnscale=-1))$par
gompit.h0.optim <- optimx(par=0,gompit.h0.function, 
                       control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
gompit.h0.result <- gompit.h0.optim[1,]
gompit.h0.l.theta <- gompit.h0.result$value
gompit.h0.AIC <- -2 * gompit.h0.l.theta + 2 * p.h0

b.result <- data.frame(rbind(c(-2*gompit.h0.l.theta, 2*p.h0, gompit.h0.AIC), 
                c(-2*gompit.l.theta, 2*p, gompit.AIC)))

rownames(b.result) <- c("H0", "H1")
colnames(b.result) <- c("-2l(theta)", "2p", "AIC")
kable(b.result) %>%   kable_styling(full_width = F, position = "left")
-2l(theta) 2p AIC
H0 66.54037 2 68.54037
H1 60.32436 4 64.32436


c) 위에서 선택된 모형에서 추정된 모수들의 추정오차를 구하고 이를 이용하여 각 모수의 95% 신뢰구간을 구하시오

gompit <- optim(par=c(gompit.initial.value),gompit.function,method="BFGS",hessian=T,control=list(fnscale=-1))
Hessian <- gompit$hessian
Inv_H <- solve(Hessian)
se1 <- sqrt(-Inv_H[1,1])
se2 <- sqrt(-Inv_H[2,2])
c.result <- data.frame(rbind(c(gompit$par[1] - qnorm(.975)*se1, gompit$par[1] + qnorm(.975)*se1), 
                             c(gompit$par[2] - qnorm(.975)*se2, gompit$par[2] + qnorm(.975)*se2)))

rownames(c.result) <- c("alpha", "beta")
colnames(c.result) <- c("lower", "upper")
kable(c.result) %>%   kable_styling(full_width = F, position = "left")
lower upper
alpha -0.7255749 10.2521764
beta -0.1969486 -0.0261142


d) 위 결과를 이용하여 \(x^*=31F\) 일 때 O-ring 고장확률 \(p(x^*|\hat \theta) = G(\hat \alpha +\hat \beta x^*)\) 및 고장 개수 ;
\(E(y|x^*) = n_i p(\hat \alpha + \hat \beta x^*)\)계산하고 그 의미를 설명하시오.

temp <- 31
z <- gompit$par[1] + gompit$par[2] * temp
p <- 1 - exp(-exp(z))
E <- n * p

d.result <- cbind(p, E)
colnames(d.result) <- c("고장확률", "고장개수")
rownames(d.result) <- c("")
kable(c.result) %>%   kable_styling(full_width = F, position = "left")
lower upper
alpha -0.7255749 10.2521764
beta -0.1969486 -0.0261142


온도가 내려갈 수록 고장날 확률은 높아진다.

Part 2) Leukemia(백혈병)환자 생존수명 예측
변수 :
\(y_i\) : 백혈병 환자 생존시간 (weeks)
\(x_i\) : WBC ( 백혈구수치)
a) \((lny_i, lnx_i)\)의 산점도를 그리고 \(lny_i\)에 대한 단순선형회귀모형을 가정할 때
모수 \((\alpha, \beta)\)의 최소제곱추정량 LSE \((\hat \alpha, \hat \beta)\)에 대한 잔차 \({e_i}\)를 구하여
잔차의 Normal Q-Q Plot 및 Gompertz Q-Q plot을 그리고 적절한 오차의 분포를 찾으시오.

#setting
leuk <- read_xlsx("O-ring-Leukemia-data.xlsx", col_names=T, na="*", sheet=2)
names(leuk) <- c("week", "WBC")

#산점도
ggplot(leuk, aes(log(WBC), log(week))) +
  geom_point() + theme_bw()

#잔차의 Q-Q Plot
lm.fit <- lm(log(week) ~ log(WBC), data=leuk)
leuk <- leuk %>% mutate(residual = lm.fit$residuals, rank = rank(residual), pr = rank / (nrow(leuk)+1), 
                        normal = qnorm(pr), gompertz=log(-log(1-pr)))
#normal
ggplot(leuk, aes(normal, residual)) + 
  geom_point(size=2) + geom_smooth(method="lm", se=F) +
  labs(title = "Normal Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(residual ~ normal, leuk))$r.squared,4)*100, "%", sep = ""),x=NULL, y=NULL) + theme_bw()

#gompertz
ggplot(leuk, aes(gompertz, residual)) + 
  geom_point(size=2) + geom_smooth(method="lm", se=F) +
  labs(title = "Gompertz Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(residual ~ gompertz, leuk))$r.squared,4)*100, "%", sep = ""),x=NULL, y=NULL) +   theme_bw()


잔차는 Normal 을 따른는것으로 볼수 있다.


b) (log-normal 모형) \(lny_i\)에 대한 정규 선형 회귀모형 \(lny_i = \alpha + \beta lnx_i + \sigma \epsilon_i\) , \(\epsilon_i\)~ \(N(0,1)\) 을 가정할 때
모수 \(\theta\) = (\(\alpha , \beta , \sigma\))\(^{'}\)의 MLE \(\hat \theta = (\hat \alpha, \hat \beta, \hat \sigma)^{'}\)및 log-normal 모형의 AIC값을 구하시오.

attach(leuk)
normal.function <- function(variable){
  alpha <- variable[1]
  beta  <- variable[2]
  sigma <- variable[3]
  mean <- alpha + log(WBC)*beta 
  sum(log(dnorm(log(week), mean, sigma)))
  }

summary.lm.fit <- summary(lm.fit)
normal.initail.value <- optim(par=c(summary.lm.fit$coefficient[1], summary.lm.fit$coefficient[2], summary.lm.fit$sigma), 
                              normal.function, hessian=T, control=list(fnscale=-1))
normal.optim <- optimx(par=c(normal.initail.value$par), normal.function, 
                       control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
normal.result <- normal.optim[which.min(normal.optim[1:5,]$value),]
normal.l.theta <- normal.result$value
normal.AIC <- -2 * normal.l.theta + 2 * length(normal.initail.value$par)

#result
b.result <- data.frame(cbind(normal.result$p1, normal.result$p2, normal.result$p3, normal.AIC))
colnames(b.result) <- c("alpha", "beta", "sigma", "AIC")
rownames(b.result) <- c("")
kable(b.result) %>%   kable_styling(full_width = F, position = "left")
alpha beta sigma AIC
5.425238 -0.8177172 1.157962 59.23033


c) (Weibull 모형 - log-Gompertz 모형) \(y_i\)가 Weibull 모형을 따를 때
모수 \(\theta\) = (\(\alpha , \beta , \sigma\))\(^{'}\)의 MLE \(\hat \theta = (\hat \alpha, \hat \beta, \hat \sigma)^{'}\)및 log-Gompertz 모형의 AIC값을 구하시오.

gompertz.function <- function(variable){
  alpha <- variable[1]
  beta  <- variable[2]
  sigma <- variable[3]
  z  <- (log(week) - alpha - beta * log(WBC))/sigma #residual
  gomertz <- exp(z-exp(z))
  sum(log(gomertz)-log(sigma))
}

gompertz.initail.value <- optim(par=c(summary.lm.fit$coefficient[1], summary.lm.fit$coefficient[2], summary.lm.fit$sigma), 
                                gompertz.function, hessian=T, control=list(fnscale=-1))
gompertz.optim <- optimx(par=c(gompertz.initail.value$par), gompertz.function, 
                       control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
gompertz.result <- gompertz.optim[1,]
gompertz.l.theta <- gompertz.result$value
gompertz.AIC <- -2 * gompertz.l.theta + 2 * length(gompertz.initail.value$par)

c.result <- data.frame(cbind(gompertz.result$p1, gompertz.result$p2, gompertz.result$p3, gompertz.AIC))
colnames(c.result) <- c("alpha", "beta", "sigma", "AIC")
rownames(c.result) <- c("")
kable(c.result) %>%   kable_styling(full_width = F, position = "left")
alpha beta sigma AIC
5.146173 -0.4769812 0.9784919 59.44568


d) c) 에서 추정된 모수들의 추정오차를 구하고 이를 이용하여 각 모수의 95% 신뢰구간을 구하시오.

hessian <- gompertz.initail.value$hessian
inv_hessian <- solve(hessian)
se1 <- sqrt(-inv_hessian[1,1])
se2 <- sqrt(-inv_hessian[2,2])
se3 <- sqrt(-inv_hessian[3,3])
mle.variable <- gompertz.result[1:3]

d.result <- data.frame(rbind(c(se1, mle.variable[1] - qnorm(.975)*se1, mle.variable[1] + qnorm(.975)*se1), 
                             c(se2, mle.variable[2] - qnorm(.975)*se2, mle.variable[2] + qnorm(.975)*se2),
                             c(se3, mle.variable[3] - qnorm(.975)*se2, mle.variable[3] + qnorm(.975)*se2)))
colnames(d.result) <- c("SE", "Lower", "Upper")
rownames(d.result) <- c("alpha", "beta", "sigma")
kable(d.result) %>%   kable_styling(full_width = F, position = "left")
SE Lower Upper
alpha 0.504128082797674 4.15810029987412 6.13424607163145
beta 0.181381352561592 -0.832482133993207 -0.121480297017444
sigma 0.19728523094148 0.62299100134024 1.333992838316


e) 위에서 구한 두 모형의 AIC 값을 비교하여 어느 모형이 더 적절한지 판정하시오.

e.result <- data.frame(rbind(c(-2 * normal.l.theta, 2*length(normal.initail.value$par), normal.AIC), 
                             c(-2 * gompertz.l.theta, 2*length(gompertz.initail.value$par), gompertz.AIC)))
rownames(e.result) <- c("log-normal", "log-Gompertz")
colnames(e.result) <- c("-2l(theta)", "2p", "AIC")
kable(e.result) %>%   kable_styling(full_width = F, position = "left")
-2l(theta) 2p AIC
log-normal 53.23033 6 59.23033
log-Gompertz 53.44568 6 59.44568


f) 위에서 추정된 두 모형을 이용하여 \(x=20\)인 백혈병 환자의 1년, 2년, 5년 생존확률을 각각 구하여 서로 비교해보고 장단점을 설명하시오.

x <- 20 
t <- c(52, 104, 260)
normal.mu.hat <- as.numeric(normal.result[1] + normal.result[2]*log(x))
normal.sigma.hat <- as.numeric(normal.result[3])
normal.pred <- 1-pnorm((log(t)-normal.mu.hat)/normal.sigma.hat)

gompertz.mu.hat <- as.numeric(gompertz.result[1] + gompertz.result[2]*log(x))
gompertz.sigma.hat <- as.numeric(gompertz.result[3])
gompertz.pred <- 1-(1-exp(-exp((log(t)-gompertz.mu.hat)/gompertz.sigma.hat)))

f.result <- data.frame(cbind(normal.pred, gompertz.pred))
rownames(f.result) <- c("52week", "104week", "260week")
colnames(f.result) <- c("Normal", "Gompertz")
kable(f.result) %>%   kable_styling(full_width = F, position = "left")
Normal Gompertz
52week 0.1997336 0.2807943
104week 0.0747689 0.0758297
260week 0.0127922 0.0013887


Part 3) Q-Q plot 을 이용한 Weibull 분포 모수추정
a) Weibull 논문의 Table1, Table 3 자료에 대하여 histogram 을 그려보고 적절한 EVD(Extreme Value Distribution)를 제시하시오.

#data setting
table1 <- read_xlsx("steel.xlsx", col_names=T, na="*", sheet=1) 
table3 <- read_xlsx("steel.xlsx", col_names=T, na="*", sheet=2)
table1$n <- diff(c(0, table1$observed.value))
table3$n <- diff(c(0, table3$observed.value))

#histogram
ggplot(table1, aes(x, n)) +
  geom_histogram(stat="identity") + theme_bw() + labs(x=NULL, y=NULL)

ggplot(table3, aes(x, n)) +
  geom_histogram(stat="identity") + theme_bw() + labs(x=NULL, y=NULL)


b) 각 자료에 대해 Group화된 자료에 대한 적절한 Q-Q plot을 그려 3-parameter Weibull 분포의 모수 \((a, b, \alpha)\)를 추정하시오.
\(F(x) = 1-exp[-\{(x-a)/b \}^{\alpha}\), \(x>a\), \(b, \alpha>0\)
\(f(x) = (1/b) \{(x-a)/b \}^{\alpha-1} exp [-\{(x-a)/b\}^{\alpha}]\), \(x>a\)

#table1
table1 <- table1 %>% mutate(pr = observed.value/(389+1))
r.sq <- NULL
for (a in 1:31) {                        
  r.sq[a] <- summary(lm(log(x-a) ~ log(-log(1-pr)), data=table1))$r.squared 
  }

a <- seq(29, 31.9, by=0.01)
r.sq <- NULL
for (i in 1:length(a)) {                        
  r.sq[i] <- summary(lm(log(x-a[i]) ~ log(-log(1-pr)), data=table1))$r.squared 
}
table1.a <- a[which.max(r.sq)]

table1 <- table1 %>% mutate(new.y = log(x-table1.a), 
                            new.x = log(-log(1-pr)))

ggplot(table1, aes(new.x, new.y)) + 
  geom_point(size=2) + geom_smooth(method="lm", se=F) +
  labs(title = "Table1 Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(new.y ~ new.x, table1))$r.squared,4)*100, "%", sep = ""),x="", y="rain") +
  theme_bw()

table1.lm.fit <- lm(log(x-table1.a) ~ log(-log(1-pr)), data=table1)
table1.mu <- table1.lm.fit$coefficients[1]
table1.sigma <- table1.lm.fit$coefficients[2]
table1.alpha <- 1/table1.sigma
table1.b <- exp(table1.mu)
table1.b.result <- cbind(table1.a, table1.b, table1.alpha)
rownames(table1.b.result) <- c("")
kable(table1.b.result) %>%   kable_styling(full_width = F, position = "left")
table1.a table1.b table1.alpha
30.4 5.949356 2.814474
#table3
table3 <- table3 %>% mutate(pr = observed.value /(3000+1))
a <- seq(0, 0.9, by=0.01)
r.sq <- NULL
for (i in 1:length(a)) {                        
  r.sq[i] <- summary(lm(log(x-a[i]) ~ log(-log(1-pr)), data=table3))$r.squared 
}
table3.a <- a[which.max(r.sq)]
table3 <- table3 %>% mutate(new.y = log(x-table3.a), 
                            new.x = log(-log(1-pr)))
ggplot(table3, aes(new.x, new.y)) + 
  geom_point(size=2) + geom_smooth(method="lm", se=F) +
  labs(title = "Table3 Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(new.y ~ new.x, table3))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + 
  theme_bw()

table3.lm.fit <- lm(log(x-table3.a) ~ log(-log(1-pr)), data=table3)
table3.mu <- table3.lm.fit$coefficients[1]
table3.sigma <- table3.lm.fit$coefficients[2]
table3.alpha <- 1/table3.sigma
table3.b <- exp(table3.mu)
table3.b.result <- cbind(table3.a, table3.b, table3.alpha)
rownames(table3.b.result) <- c("")
kable(table3.b.result) %>%   kable_styling(full_width = F, position = "left")
table3.a table3.b table3.alpha
0.37 3.990694 1.522365


  1. 아래 그룹화 된 자료의 MLE 방법을 이용하여 모수를 추정하고 b),c) 방법의 장단점을 설명하시오
#MLE Function
mle.function <- function(data, par) {
  a <- par[1]
  b <- par[2]
  alpha <- par[3]
  x <- data$x
  Fx <- 1- exp(-((x-a)/b)^alpha)
  pi <- diff(c(0, Fx))
  return(sum(data$n*log(pi)))
}

#table1
table1.optim <- optim(par=c(table1.a, table1.b, table1.alpha), data=table1, mle.function,  hessian=T, control=list(fnscale=-1))
table1.result <- t(data.frame(table1.optim$par))
colnames(table1.result) <- c("a", "b", "alpha")
rownames(table1.result) <- c("")
kable(table1.result) %>%   kable_styling(full_width = F, position = "left")
a b alpha
30.30591 5.996021 2.923084
#table3
table3.optim <- optim(par=c(table3.a,table3.b,table3.alpha), mle.function, data=table3, hessian=T, control=list(fnscale=-1))
table3.result <- t(data.frame(table3.optim$par))
colnames(table3.result) <- c("a", "b", "alpha")
rownames(table3.result) <- c("")
kable(table3.result) %>%   kable_styling(full_width = F, position = "left")
a b alpha
0.3313826 4.03279 1.55971


d) c) 에서 추정된 모수들의 추정오차를 구하고 이를 이용하여각 모수의 95% 신뢰구간을 구하시오.

#table1
table1.hessian <- table1.optim$hessian
table1.inv.h <- solve(table1.hessian)
table1.se1 <- sqrt(-table1.inv.h[1,1])
table1.se2 <- sqrt(-table1.inv.h[2,2])
table1.se3 <- sqrt(-table1.inv.h[3,3])
table1.result <- data.frame(rbind(c(table1.b.result[1], table1.se1, table1.b.result[1] - qnorm(.975)*table1.se1, table1.b.result[1] + qnorm(.975)*table1.se1), 
                                  c(table1.b.result[2], table1.se2, table1.b.result[2] - qnorm(.975)*table1.se2, table1.b.result[2] + qnorm(.975)*table1.se2), 
                                  c(table1.b.result[3], table1.se3, table1.b.result[3] - qnorm(.975)*table1.se3, table1.b.result[3] + qnorm(.975)*table1.se3)))
colnames(table1.result) <- c("estimator", "SE", "Lower", "Upper")
rownames(table1.result) <- c("a", "b", "alpha")
kable(table1.result) %>%   kable_styling(full_width = F, position = "left")
estimator SE Lower Upper
a 30.400000 0.5322226 29.356863 31.443137
b 5.949356 0.5799122 4.812749 7.085963
alpha 2.814474 0.3287433 2.170148 3.458798
#table3
table3.hessian <- table3.optim$hessian
table3.inv.h <- solve(table3.hessian)
table3.se1 <- sqrt(-table3.inv.h[1,1])
table3.se2 <- sqrt(-table3.inv.h[2,2])
table3.se3 <- sqrt(-table3.inv.h[3,3])
table3.result <- data.frame(rbind(c(table3.b.result[1], table3.se1, table3.b.result[1] - qnorm(.975)*table3.se1, table3.b.result[1] + qnorm(.975)*table3.se1), 
                                  c(table3.b.result[2], table3.se2, table3.b.result[2] - qnorm(.975)*table3.se2, table3.b.result[2] + qnorm(.975)*table3.se2), 
                                  c(table3.b.result[3], table3.se3, table3.b.result[3] - qnorm(.975)*table3.se3, table3.b.result[3] + qnorm(.975)*table3.se3)))
colnames(table3.result) <- c("estimator", "SE", "Lower", "Upper")
rownames(table3.result) <- c("a", "b", "alpha")
kable(table3.result) %>%   kable_styling(full_width = F, position = "left")
estimator SE Lower Upper
a 0.370000 0.0618628 0.2487511 0.4912489
b 3.990694 0.0930034 3.8084110 4.1729776
alpha 1.522365 0.0403153 1.4433487 1.6013816