MLE2

MLE응용2

#### Part 1. (Smoking and Mortality)
a) 각 연령구간별 중간나이 (\(x\))를 이용하여 nonsmoker 및 cigarette only 그룹의 사망확률을 각각 구하여
(\(_t\hat{q_x}, x\)) 및 \((ln[-ln(1- _t\hat{q_{x}}, x)\)의 산점도를 겹쳐서 그려 보시오.
또한 산점도의 의미를 설명하고 각 그룹별 Gompertz 법칙이 성립하는지 검토하시오.

options(scipen=100)
options(crayon.enabled = FALSE) 
setwd("D:\\0_grad\\data")

library(tidyverse)
library(readxl)
library(optimx)
library(knitr)
library(kableExtra)

smoke <- read_xlsx("smoke.xlsx", col_names=T, na="*", sheet=1, skip=1) 
names(smoke) <- c("age", "non.pop", "non.death", "cig.pop", "cig.death")
smoke <- smoke %>% mutate(non.death.rate = non.death/non.pop, 
                          cig.death.rate = cig.death/cig.pop)
x <- c(median(40:44), median(45:49), median(50:54), median(55:59), median(60:64), median(65:69), median(70:74), median(75:80), 80)
smoke <- cbind(smoke, x)

산점도

#(tqx, x)
ggplot(smoke) + 
  geom_point(aes(x, non.death.rate, color="non-smoker")) +
  geom_point(aes(x, cig.death.rate, color="smoker")) +
  scale_color_manual(name="Smoke", values=c("blue", "red")) +
  theme_bw() +
  labs(x="Age", y="Death.Rate")

#(ln(-ln(1-tqx)), x)
ggplot(smoke) + 
  geom_point(aes(x, log(-log(1-non.death.rate)), color="non-smoker")) +
  geom_point(aes(x, log(-log(1-cig.death.rate)), color="smoker")) +
  scale_color_manual(name="Smoke", values=c("blue", "red")) +
  theme_bw() +
  labs(x="Age", y="Death.Rate")

b) 사망률에 대한 Gompertz 법칙을 가정하고 각 경우 (a,b) 값을 아래 두 방법으로 구하고 차이 및 장단점을 설명하시오.

#방법 1
smoke <- smoke %>% mutate(non.z = log(-log(1-non.death.rate)), 
                          cig.z = log(-log(1-cig.death.rate)))
non.lm.fit <- lm(non.z ~ x, data=smoke)
cig.lm.fit <- lm(cig.z ~ x, data=smoke)

method1.result <- rbind(non.lm.fit$coefficients, cig.lm.fit$coefficients)
rownames(method1.result) <- c("non-smoker", "smoker")
colnames(method1.result) <- c("a*", "b")
kable(method1.result) %>%   kable_styling(full_width = F, position = "left")

	a*	b
non-smoker	-6.425917	0.0722823
smoker	-6.396859	0.0782870

#방법2 - Optim/ MLE
#Gompit
gompit.function <- function(variable){
  alpha <- variable[1]
  beta <- variable[2]
  z <- alpha + beta * x
  gompit <- 1-exp(-exp(z))
  return(sum(y*log(gompit)+(n-y)*log(1-gompit)))
}
n <- smoke$non.pop
y <- smoke$non.death
non.variable <- optim(par=c(non.lm.fit$coefficients[1],non.lm.fit$coefficients[2]),gompit.function,hessian=T,control=list(fnscale=-1))$par

n <- smoke$cig.pop
y <- smoke$cig.death
cig.variable <- optim(par=c(non.lm.fit$coefficients[1],non.lm.fit$coefficients[2]),gompit.function,hessian=T,control=list(fnscale=-1))$par

method2.result <- rbind(non.variable, cig.variable)
rownames(method2.result) <- c("non-smoker", "smoker")
colnames(method2.result) <- c("a*", "b")
kable(method2.result) %>%   kable_styling(full_width = F, position = "left")

	a*	b
non-smoker	-6.690550	0.0758840
smoker	-6.259466	0.0767435

#방법2 - GLM
non.glm.fit <- glm(cbind(non.death, non.pop-non.death) ~ x, family = binomial(link="cloglog"), data=smoke)
cig.glm.fit <- glm(cbind(cig.death, cig.pop-cig.death) ~ x, family = binomial(link="cloglog"), data=smoke)
method2.result.GLM <- rbind(non.glm.fit$coefficients, cig.glm.fit$coefficients)
rownames(method2.result.GLM) <- c("non-smoker", "smoker")
colnames(method2.result.GLM) <- c("a*", "b")
kable(method2.result.GLM) %>%   kable_styling(full_width = F, position = "left")

	a*	b
non-smoker	-6.691208	0.0758921
smoker	-6.260516	0.0767619

c) b 에서 구한 (a,b) 값을 이용하여 각 그룹별 생명표를 구하시오.

l.x.t <- function(a.star, b, x=20, t=seq(0, 70, by=5)){
  a <- a.star - log((exp(b*6)-1)/b)
  tpx <-   exp(-exp(a+b*x)*(exp(b*t)-1)/b)
  l.x.t <- 10^5 * tpx
  return(list(l.x.t, tpx))
}

non.l.x.t <- l.x.t(a.star=non.variable[1], b=non.variable[2])[1]
cig.l.x.t <- l.x.t(a.star=cig.variable[1], b=cig.variable[2])[1]

non.ex <- NULL
cig.ex <- NULL

for (i in 1:length(seq(20, 90, by=5))) {
  non.ex[i] <- 1/2 + sum(unlist(l.x.t(a.star= non.variable[1], b=non.variable[2], 
                        x = 20 + 5*(i-1), t=1:100)[2]))
  cig.ex[i] <- 1/2 + sum(unlist(l.x.t(a.star= cig.variable[1], b=cig.variable[2],
                               x = 20 + 5*(i-1), t=1:100)[2]))
}

life.table <- data.frame(x=seq(20, 90, 5), non.l.x.t, non.ex, cig.l.x.t, cig.ex)
life.table <- life.table %>% mutate(d.ex = non.ex - cig.ex, 
                                    "24*dex/ex" = 24*d.ex/non.ex)

colnames(life.table) <- c("x", "non.lx", "non.ex", "cig.lx", "cig.ex", "dex", "24*dex/ex")
kable(life.table) %>%   kable_styling(full_width = F, position = "left")

x	non.lx	non.ex	cig.lx	cig.ex	dex	24*dex/ex
20	100000.00	53.964009	100000.000	47.971086	5.992923	2.665298
25	99547.45	49.197287	99292.740	43.293906	5.903382	2.879857
30	98889.76	44.506869	98263.721	38.719540	5.787329	3.120774
35	97936.40	39.914349	96772.679	34.275334	5.639016	3.390670
40	96559.63	35.445735	94625.107	29.993221	5.452514	3.691850
45	94582.30	31.131503	91559.059	25.909202	5.222301	4.025994
50	91765.15	27.006198	87237.926	22.062093	4.944105	4.393751
55	87798.30	23.107378	81262.254	18.491423	4.615955	4.794266
60	82307.36	19.473766	73224.864	15.234437	4.239329	5.224664
65	74894.36	16.142553	62845.282	12.322433	3.820120	5.679578
70	65244.73	13.146003	50215.296	9.776899	3.369104	6.150805
75	53333.50	10.507741	36126.445	7.606210	2.901532	6.627186
80	39724.47	8.239397	22280.443	5.803729	2.435668	7.094699
85	25827.30	6.338371	10960.870	4.347983	1.990388	7.536528
90	13766.41	4.787450	3869.619	3.205114	1.582336	7.932420

비흡연자와 흡연자의 \((l_x, l^*_x)\) 및 \((e_x, e^*_x)\)시계열 도표를 서로 겹쳐서 그려 보고 흡연이 잔여수명에 미치는 영향을 설명하시오.

#생존자수 
ggplot(life.table) +
  geom_point(aes(x, non.lx, color="non-smoker")) + geom_line(aes(x, non.lx, color="non-smoker")) +
  geom_point(aes(x, cig.lx, color="smoker")) + geom_line(aes(x, cig.lx, color="smoker")) +
  theme_bw() +
  scale_color_manual(name="Smoke", values=c("blue", "red")) +
  labs(title = "x세 생존자 수", x="Age(x)", y="생존자수")

#잔여수명 
ggplot(life.table) +
  geom_point(aes(x, non.ex, color="non-smoker")) + geom_line(aes(x, non.ex, color="non-smoker")) +
  geom_point(aes(x, cig.ex, color="smoker")) + geom_line(aes(x, cig.ex, color="smoker")) +
  theme_bw() +
  scale_color_manual(name="Smoke", values=c("blue", "red")) +
  labs(title = "평균 잔여수명", x="Age(x)", y="평균 잔여수명")

생존자수의 경우, 저연령군에서는 큰 차이가 없으나, 고연령군에서는 같은나이 일 때 비흡연자에 비해 흡연자의 생존자수가 적음을 알 수 있다.
이 때 두 그룹간 차이는 연령이 높아질수록 커진다.
잔여 수명의 경우, 모든 연령에서 비흡연자의 잔여수명이 높음을 볼 수 있고,
사람들의 연령이 증가할 수록 차이는 줄어든다.

MLE2

YJ

MLE응용2