MLE응용2


#### Part 2. Locomotive Control Reliability Data
(type I censored)
\(x_i\) : failure times of i-th item (miles)
\(c_i\) : censoring times (=136,000 miles)
\(y_i = min(x_i, c_i)\)
\(\delta_i = I(x_i <c_i)\), \(i = 1, ..., n\)
Objective: 중도절단된(censored) 기관차 부품의 수명자료 (reliabilty data)를 이용하여 부 품의 수명분포를 추정하여 부품의 신뢰도(reliability)를 예측.

  1. 완전(complete) 자료 \(y_i\), \(lny_i\), \(i=1, ..., m\) 에 대한 histogram 을 그리시오.
rm(list=ls())
options(scipen=100)
options(crayon.enabled = FALSE) 
setwd("D:\\0_grad\\data")
library(tidyverse)
library(readxl)
library(optimx)
library(knitr)
library(kableExtra)
loco <- read_xls("Locomotive-FDB-datasets.xls", sheet = 1, col_names = c("y", "delta"), skip=1)
attach(loco)

#y
loco %>% filter(delta == 1) %>% 
  ggplot(aes(y)) + geom_histogram(breaks = seq(20000,140000,10000), color="white") +
  theme_bw() +
  labs("title" = "Histogram of Complete Data (y)")

#lny
loco %>% filter(delta == 1) %>% 
  ggplot(aes(log(y))) + geom_histogram(breaks = seq(10, 12, .2), color="white") +
  theme_bw() +
  labs("title" = "Histogram of Complete Data (lny)", x="lny")


  1. 위 자료에 대한 적절한 Q-Q plot을 그려보고 가능성 있는 모형을 찾고 해당모수를 추정하시오.
loco <- loco %>%  filter(delta == 1) %>% 
  mutate(m= sum(delta), rank=rank(y), pr = rank/(m+1), 
         gompertz = log(-log(1-pr)), 
         normal = qnorm(pr), 
         gumbel = -log(-log(pr)))

#2-parameter Weibull
qq1 <-  ggplot(loco, aes(gompertz, log(y))) + 
  geom_point(size = 2) +  geom_smooth(method = "lm", se = F) +
  labs(title = "2-Parameter Weibull Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(log(y) ~ gompertz, loco))$r.squared,4)*100, "%", sep = "")) + 
  theme_bw()  

#3-parameter Weibull
a <- seq(-50000, 22400, 100)
r.sq <- NULL
for (i in 1:length(a)) {
  r.sq[i] <- summary(lm(log(loco$y - a[i])~gompertz, loco))$r.squared
}
a <- a[which.max(r.sq)]
qq2 <-  ggplot(loco, aes(gompertz, log(y-a))) + 
  geom_point(size = 2) +  geom_smooth(method = "lm", se = F) +
  labs(title = "3-Parameter Weibull Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(log(y-a) ~ gompertz, loco))$r.squared,4)*100, "%", sep = ""), 
       y=paste0("ln(y-a), a=", a)) + 
  theme_bw()  

#log-nomal 
qq3 <-  ggplot(loco, aes(normal, log(y))) + 
  geom_point(size = 2) +  geom_smooth(method = "lm", se = F) +
  labs(title = "Log-normal Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(log(y) ~ normal, loco))$r.squared,4)*100, "%", sep = "")) + 
  theme_bw() 

#Gompertz
qq4 <-  ggplot(loco, aes(gompertz, y)) + 
  geom_point(size = 2) +  geom_smooth(method = "lm", se = F) +
  labs(title = "Gompertz Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(y ~ gompertz, loco))$r.squared,4)*100, "%", sep = "")) + 
  theme_bw() 

#Gumbel
qq5 <-  ggplot(loco, aes(gumbel, y)) + 
  geom_point(size = 2) +  geom_smooth(method = "lm", se = F) +
  labs(title = "Gumbel Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(y ~ gumbel, loco))$r.squared,4)*100, "%", sep = "")) + 
  theme_bw() 

#normal
qq6 <-  ggplot(loco, aes(normal, y)) + 
  geom_point(size = 2) +  geom_smooth(method = "lm", se = F) +
  labs(title = "Normal Q-Q Plot", 
       subtitle = paste("R.sq = ", round(summary(lm(y ~ normal, loco))$r.squared,4)*100, "%", sep = "")) + 
  theme_bw() 

library(gridExtra)
grid.arrange(qq1, qq2, qq3, qq4, qq5, qq6, nrow=2)


c) 다음 각 모형에 대하여 해당 모수의 MLE 및 AIC 값을 구하시오.

#2-parameter Weibull 
weibull.lm.fit.2p <- lm(log(y) ~ gompertz, loco)
weibull.2p.parameter <- c(0, weibull.lm.fit.2p$coefficients[1], weibull.lm.fit.2p$coefficients[2])

#log-likihood : sum( delta * log( pdf ) ) + sum( (1-delta) * log( 1-CDF ) )
weibull.2p.mle.fn <- function(variable){
  mu <- variable[1]
  sigma <- variable[2]
  u <- ( log(y) - mu ) / sigma
  F <- 1 - exp( - exp(u) )
  f <- exp( u - exp(u))/(sigma*y)
  - sum(delta * log(f) + (1-delta)*log(1-F))
}
weibull.optim <- optimx(par=c(weibull.2p.parameter[2], weibull.2p.parameter[3]), weibull.2p.mle.fn, 
                        control=list(all.methods=TRUE,trace=0))

weibull.2p <- c(0, optim(par=c(weibull.2p.parameter[2], weibull.2p.parameter[3]), weibull.2p.mle.fn,
                       method="BFGS",hessian=T)$par, 
                2*weibull.optim$value[1], 
                2*2)

#3-parameter Weibull
weibull.lm.fit.3p <- lm(log(y-a) ~ gompertz, loco)
weibull.3p.parameter <- c(a, weibull.lm.fit.3p$coefficients)
weibull.3p.mle.fn <- function(variable){
  a <- variable[1]
  mu <- variable[2]
  sigma <- variable[3]
  u <- ( log(y-a) - mu ) / sigma
  F <- 1 - exp( - exp(u) )
  f <- exp( u - exp(u))/(sigma*(y-a))
  - sum(delta * log(f) + (1-delta)*log(1-F))
}

weibull.optim.3p <- optimx(par=weibull.3p.parameter, weibull.3p.mle.fn, 
                           control=list(all.methods=TRUE))

weibull.3p <- c(optim(par=weibull.3p.parameter, weibull.3p.mle.fn,
                               method="BFGS",hessian=T)$par,
                2*weibull.optim.3p$value[1], 2*3)

#log-normal
lognormal.lm.fit <- lm( log(y) ~ normal, loco )
log.normal.parameter <- c(lognormal.lm.fit$coefficients)

lognormal.mle.fn <- function(variable){
  mu <- variable[1]
  sigma <- variable[2]
  u <- ( log(y) - mu ) / sigma
  F <- pnorm(u)
  f <- dnorm(u)/(sigma*y)
  - sum(delta * log(f) + (1-delta)*log(1-F))
}

log.normal.optim <- optimx(par=log.normal.parameter, lognormal.mle.fn, 
                    control=list(all.methods=TRUE,trace=0))
log.normal <- c(0, optim(par=log.normal.parameter, lognormal.mle.fn,
                                   method="BFGS",hessian=T)$par,
                2*log.normal.optim$value[1], 2*2)

#Gompertz
gompertz.lm.fit <- lm( y ~ gompertz, loco)
gompertz.parameter <- c(gompertz.lm.fit$coefficients)

gompertz.mle.fn <- function(variable){
  mu <- variable[1]
  sigma <- variable[2]
  u <- ( y - mu ) / sigma
  F <- 1 - exp( - exp(u) )
  f <- exp( u - exp(u))/sigma
  - sum(delta * log(f) + (1-delta)*log(1-F))
}
gompertz.optim <- optimx(par=gompertz.parameter, gompertz.mle.fn, 
                         control=list(all.methods=TRUE,trace=0))
gompertz <- c(0, optim(par=gompertz.parameter, gompertz.mle.fn,
                                 method="BFGS",hessian=T)$par, 
              2*gompertz.optim$value[1], 2*2)

#Gumbel
gumbel.lm.fit <- lm(y ~ gumbel, loco) 
gumbel.parameter <- c(gumbel.lm.fit$coefficients)
gumbel.mle.fn <- function(variable){
  mu <- variable[1]
  sigma <- variable[2]
  u <- ( y - mu ) / sigma
  F <- exp( - exp(-u) )
  f <- exp(-u - exp(-u))/sigma
  - sum(delta * log(f) + (1-delta)*log(1-F))
}
gumbel.optim <- optimx(par=gumbel.parameter, gumbel.mle.fn, 
                         control=list(all.methods=TRUE,trace=0))
gumbel <- c(0, optim(par=gumbel.parameter, gumbel.mle.fn,
                                 method="BFGS",hessian=T)$par, 
            2*gumbel.optim$value[1],
            2*2)

#Normal
normal.lm.fit <- lm( y ~ normal, loco)
normal.parameter <- c(normal.lm.fit$coefficients)
normal.mle.fn <- function(variable){
  mu <- variable[1]
  sigma <- variable[2]
  u <- ( y - mu ) / sigma
  F <- pnorm(u)
  f <- dnorm(u)/sigma
  - sum(delta * log(f) + (1-delta)*log(1-F))
}
normal.optim <- optimx(par=normal.parameter, normal.mle.fn, 
                       control=list(all.methods=TRUE,trace=0))
normal <- c(0, optim(par=normal.parameter, normal.mle.fn,
                               method="BFGS",hessian=T)$par, 
            2*normal.optim$value[1], 
            2*2)

result.c <- data.frame(rbind(weibull.2p, weibull.3p, log.normal, gompertz, gumbel, normal))
colnames(result.c) <- c("a", "mu", "sigma", "-2l(theta)", "2p")
result.c <- result.c %>% mutate(AIC=result.c[,4]+result.c[,5])
rownames(result.c) <- c("Weibull.2p", "Weibull.3p", "log.normal", "Gompertz", "Gumbel", "Normal")

kable(result.c) %>%   kable_styling(full_width = F, position = "left")
a mu sigma -2l(theta) 2p AIC
Weibull.2p 0.00 12.11942 0.4289763 985.9389 4 989.9389
Weibull.3p -33499.97 12.25359 0.3091112 987.8194 6 993.8194
log.normal 0.00 12.02468 0.7055005 985.3610 4 989.3610
Gompertz 0.00 122327.18661 48810.3093730 1046.5939 4 1050.5939
Gumbel 0.00 87124.05546 59738.9134097 1017.7191 4 1021.7191
Normal 0.00 114060.32832 66068.2076455 1017.5809 4 1021.5809


AIC가 작은 Weibull 2parameter 모형과 log normal 모형 선택
d) 위에서 선택된 가장 적절한 수명분포 2개를 이용하여 기관차 부품의 수명이 15만, 20만, 30만 mile 이상일 확률을 구하시오.

t <- c(150000, 200000, 300000)
model1 <- c(mu = weibull.2p[2], sigma = weibull.2p[3])
model2 <- c(mu = log.normal[2], sigma = log.normal[3])
result.d <- data.frame(t=t, weibull =exp(-exp((log(t)-model1[1])/model1[2])), 
            log.normal = 1-pnorm((log(t)-model2[1] )/model2[2]))
kable(result.d) %>%   kable_styling(full_width = F, position = "left")
t weibull log.normal
150000 0.5348041 0.5598795
200000 0.2941008 0.3985483
300000 0.0428822 0.2027535