Part1) KOSPI 일별수익률의 VAR(Value at Risk) 값 계산
#Setup
library(tidyverse)
library(jmuOutlier)
library(knitr)
library(kableExtra)
options(scipen = 100)
setwd("D:\\0_grad\\data")
kospi <- read.csv("kospi_day.csv", skip=3)
names(kospi) <- c("date", "P")
kospi$P <- as.numeric(gsub(",", "", kospi$P))
kospi$date <- as.Date(kospi$date)
kospi <- kospi %>% mutate(rt=log(P/lag(P)))
Histogram & Q-Q Plot
#histogram
ggplot(kospi) + geom_histogram(aes(rt)) +
theme_bw() + labs(title="Histogram of rt")
#Q-Q plot
kospi <- kospi %>% mutate(rank=rank(rt), pr=rank/(nrow(kospi)+1)) %>% mutate(norm=qnorm(pr))
ggplot(kospi, aes(norm, rt)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Normal Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rt ~ norm, kospi))$r.squared,4)*100, "%", sep = ""),
x="", y="rt") + theme_bw()
직선이 normal Q-Q plot에 꽤 잘 맞으므로 정규성 가정이 성립한다.
#Q-Q plot
kospi <- kospi %>% mutate(logistic=log(pr/(1-pr)), laplace=qlaplace(pr), t.1=qt(pr,1), t.2=qt(pr,2), t.3=qt(pr,3))
#Logistic
ggplot(kospi, aes(logistic, rt)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Logistic Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rt ~ logistic, kospi))$r.squared,4)*100, "%", sep = ""),
x="", y="rt") + theme_bw()
#Laplace
ggplot(kospi, aes(laplace, rt)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Laplace Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rt ~ laplace, kospi))$r.squared,4)*100, "%", sep = ""),
x="", y="rt") + theme_bw()
#t1
ggplot(kospi, aes(t.1, rt)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "t.1 Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rt ~ t.1, kospi))$r.squared,4)*100, "%", sep = ""),
x="", y="rt") + theme_bw()
#t2
ggplot(kospi, aes(t.2, rt)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "t.2 Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rt ~ t.2, kospi))$r.squared,4)*100, "%", sep = ""),
x="", y="rt") + theme_bw()
#t3
ggplot(kospi, aes(t.3, rt)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "t.3 Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rt ~ t.3, kospi))$r.squared,4)*100, "%", sep = ""),
x="", y="rt") + theme_bw()
자유도가 3인 t분포가 가장 적절한 것으로 보인다.
lm.kospi <- lm(rt~t.3,data=kospi)
mu.kospi <- lm.kospi$coefficients[1]
sigma.kospi <- lm.kospi$coefficients[2]
kable(data.frame("mu"=mu.kospi, "sigma"=sigma.kospi))%>%
kable_styling(full_width = F,
position = "left")
mu | sigma | |
---|---|---|
(Intercept) | 0.0005674 | 0.0083474 |
alpha <- c(0.01, 0.004, 0.001, 0.0004)
Var.alpha <- -(mu.kospi+sigma.kospi*qt(alpha,3)) #VaR= -mean(R) - sqrt(sigma)*qnorm(c)
kable(cbind(alpha,Var.alpha)) %>%
kable_styling(full_width = F,
position = "left")
alpha | Var.alpha |
---|---|
0.0100 | 0.0373357 |
0.0040 | 0.0522058 |
0.0010 | 0.0846975 |
0.0004 | 0.1157598 |
c) 분포의 왼쪽 꼬리부분을 GPD(Generalized Pareto Distribution)를 이용해 적합 시키려한다. \(Loss=-r_t >0.04\)인 경우 exess 값을 \(X=Loss-0.04\)로 정의할 때 \(X\)의Q-Q plot 을 그려보고 적절한 GPD를 찾아보시오.
GPD : \(G_{\gamma , \sigma} (x) = 1-(1+ \gamma x /\sigma )^{-1/\gamma}, x>0, \sigma>0, \gamma\)
(GPD) 모수 \((\gamma, \sigma)\) 추정 방법
방법 1.
\(ln(1+x_{(r)} /a)=\gamma [-ln(1-p_r )+e_r, p_r=r/(m+1), a=\sigma/\gamma\)에서 a값이 0.01, 0.05, 0.1 등일 때, Q-Q plot을 그려보아 가장 직선에 가까운 a값과 해당 직선의 기울기 \(\gamma\) 값을 추정후 \(\sigma\) 추정
setwd("D:\\0_grad\\data")
kospi <- read.csv("kospi_day.csv", skip=3)
names(kospi) <- c("date", "P")
kospi$P <- as.numeric(gsub(",", "", kospi$P))
kospi$date <- as.Date(kospi$date)
kospi <- kospi %>% mutate(rt=log(P/lag(P))) %>% mutate(Loss=-rt, X=Loss-0.04) %>% filter(X>0)
kospi <- kospi %>% mutate(rank=rank(X), pr=rank/(nrow(kospi)+1), exp=qexp(pr))
method1.function <- function(a){
method1.lm <- lm(log(1+X/a)~exp, data=kospi)
slope <- method1.lm$coefficients[2]
r2 <- summary(method1.lm)$r.squared
return(c(slope, r2))
}
interval <- 0.01
a.matrix <- data.frame("a"=seq(interval, 0.5, by=interval), "slope"=NA, "r2"=NA)
for (i in 1:nrow(a.matrix)){
a.matrix[i,2:3] <- method1.function(a.matrix[i,1])
}
final.a <- a.matrix[which.max(a.matrix$r2),1]
final.gamma <- a.matrix[which.max(a.matrix$r2),2]
final.sigma <- final.a*final.gamma
ggplot(kospi, aes(exp, log(1+X/final.a))) + geom_point(size=2) +
geom_smooth(method="lm", se=T) +
labs(title = "Q-Q Plot",
subtitle = paste("R.sq = ", round(a.matrix[which.max(a.matrix$r2),3]*100,2), "%",
" a = ", final.a,
" gamma = ", round(final.gamma,2),
" sigma = ", round(final.sigma,2), sep=""),
x="", y="log(1+X/a)") + theme_bw()
방법2.
\(x_{r} = \sigma \cdot G^{-1}_{\gamma} (p_r) + e_{\gamma}\), \(G_{\gamma}^{-1}(p)=((1-p)^{-\gamma} -1)/\gamma\) 에서 \(\gamma\)값을 변화시키면서 Q-Q plot 을 그린 후 가장 직선에 가까운 값 \(\gamma\)을 선택하여 해당 직선의 기울기로 \(\sigma\)를 추정
method2.function <- function(gamma){
kospi <- kospi %>% mutate(g.inverse=((1-pr)^(-gamma)-1)/gamma)
method2.lm <- lm(X ~ g.inverse, data=kospi)
slope <- method2.lm$coefficients[2]
r2 <- summary(method2.lm)$r.squared
return(c(slope, r2))
}
interval <- 0.01
gamma.matrix <- data.frame("gamma"=seq(interval, 1, by=interval), "slope"=NA, "r2"=NA)
for (i in 1:nrow(gamma.matrix)){
gamma.matrix[i,2:3] <- method2.function(gamma.matrix[i,1])
}
final.gamma <- gamma.matrix[which.max(gamma.matrix$r2),1]
final.sigma <- gamma.matrix[which.max(gamma.matrix$r2),2]
kospi <- kospi %>% mutate(g.inverse=((1-pr)^(-final.gamma)-1)/final.gamma)
ggplot(kospi, aes(g.inverse, X)) + geom_point(size=2) +
geom_smooth(method="lm", se=T) +
labs(title = "Q-Q Plot",
subtitle = paste("R.sq = ", round(a.matrix[which.max(gamma.matrix$r2),3]*100,2), "%",
" gamma = ", round(final.gamma,2),
" sigma = ", round(final.sigma,2), sep=""),
x="", y="X") + theme_bw()
방법3.
MLE \((\gamma, \sigma)\)
library(optimx)
method3.function <- function(variable){
sum(log((1/variable[2])*(1+variable[1]*kospi$X/variable[2])^(-1/variable[1]-1))) #GPD미분 후 sum(log)
}
optimx.result <- optimx(par=c(final.gamma, final.sigma), method3.function, control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
optim.result <-optim(par=c(final.gamma,final.sigma),method3.function,method="CG",hessian=T,control=list(fnscale=-1))
kable(data.frame("gamma"=optim.result$par[1], "sigma"=optim.result$par[2])) %>%
kable_styling(full_width = F,
position = "left")
gamma | sigma |
---|---|
0.1065556 | 0.0175199 |
d)위 GPD를 이용하여 \(r_t\)의 \(VAR_{\alpha}\) 값을 구하시오.
\(\alpha = 0.01, 0.004, 0.001, 0.0004\)
alpha <- c(0.01,0.004,0.001,0.0004)
alpha0 <- mean(kospi$X>0)
alpha1 <- alpha/alpha0
VAR.alpha <- 0.04+final.sigma *(alpha1^(-final.gamma)-1)/final.gamma
kable(cbind(alpha,VAR.alpha)) %>%
kable_styling(full_width = F, position = "left")
alpha | VAR.alpha |
---|---|
0.0100 | 0.1617317 |
0.0040 | 0.1950570 |
0.0010 | 0.2529836 |
0.0004 | 0.2969149 |
#datasetiing
setwd("D:\\0_grad\\data")
age <- read.csv("age.csv")
age <- age[,-1]
names(age) <- c("gender", "x", "dx")
age$x <- as.numeric(gsub("세", "", age$x))
ggplot(age, aes(x, dx, colour=gender, fill=gender)) +
geom_histogram(stat="identity", position="dodge") +
theme_bw() +
labs(title="연령별 사망자수",
x="age", y="")
age.male <- age %>% filter(age$gender=="남자"&age$x>0&age$x<90) %>% mutate(new.dx=cumsum(dx), px=lag(new.dx)/sum(dx), gompertz=log(-log(1-px)))
#Q-Q plot
ggplot(age.male, aes(gompertz, x)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Male.Gompertz Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(x ~ gompertz,age.male,na.action=na.exclude))$r.squared,4)*100, "%", sep = ""),x="", y="age") + theme_bw()
직선에 가까운것을 알수 있다.
male.lm <- lm(x~gompertz, age.male, na.action=na.exclude)
male.mu <- male.lm$coefficients[1]
male.sigma <- male.lm$coefficients[2]
male.mu.sigma <- data.frame("mu"=male.mu, "sigma"=male.sigma)
rownames(male.mu.sigma) <- c("")
kable(male.mu.sigma) %>% kable_styling(full_width = F, position = "left")
mu | sigma | |
---|---|---|
73.07829 | 10.6856 |
Female
age.female <- age %>% filter(age$gender=="여자"&age$x>0&age$x<90) %>% mutate(new.dx=cumsum(dx), px=lag(new.dx)/sum(dx), gompertz=log(-log(1-px)))
#Q-Q Plot
ggplot(age.female, aes(gompertz, x)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Female.Gompertz Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(x ~ gompertz, age.female, na.action=na.exclude))$r.squared,4)*100, "%", sep = ""), x="", y="age") + theme_bw()
female.lm <- lm(x~gompertz, age.female, na.action=na.exclude)
female.mu <- female.lm$coefficients[1]
female.sigma <- female.lm$coefficients[2]
female.mu.sigma <- data.frame("mu"=female.mu, "sigma"=female.sigma)
rownames(female.mu.sigma) <- c("")
kable(female.mu.sigma) %>% kable_styling(full_width = F, position = "left")
mu | sigma | |
---|---|---|
80.42886 | 11.71953 |
c)연령별 사망확률을 정의하고 연령별 사망률을 정의할 때, 남녀별 \(\mu_x, ln\mu_x\)의 그래프를 겹쳐서 그려보고 차이를 설명하고 사망률에 대한
Gompertz법칙 \(\mu_x=exp(a+bx), ln\mu_x = a_bx, x>20\) 이 성립하는지 검토하고 추정된 모수(a,b)값의 의미를 설명하시오
age.male <- age.male %>% mutate(lx=sum(dx)-lag(new.dx))
age.male$lx[1] <- sum(age.male$dx)
age.male <- age.male %>% mutate(mu=dx/(lx-dx/2), lnmu=log(mu))
age.female <- age.female %>% mutate(lx=sum(dx)-lag(new.dx))
age.female$lx[1] <- sum(age.female$dx)
age.female <- age.female %>% mutate(mu=dx/(lx-dx/2), lnmu=log(mu))
age.total <- rbind(age.male, age.female)
#mu
ggplot(age.total, aes(x, mu, group=gender, fill=gender)) + geom_histogram(stat="identity", position="dodge") +
theme_bw() + labs(title="남녀 연령별 사망률", x="age", y="mortality rate")
#lnmu
ggplot(age.total, aes(x, lnmu, group=gender, fill=gender)) + geom_histogram(stat="identity", position="dodge") +
theme_bw() + labs(title="남녀 연령별 로그 사망률", x="age", y="log mortality rate")
#Gompertz 추정 모수
male.lm <- lm(lnmu~x, data=age.male[age.male$x>20,])
male.a <- male.lm$coefficients[1]
male.b <- male.lm$coefficients[2]
female.lm <- lm(lnmu~x, data=age.female[age.female$x>20,])
female.a <- female.lm$coefficients[1]
female.b <- female.lm$coefficients[2]
a.b <- data.frame("a"= c(male.a, female.a), "b"=c(male.b, female.b))
rownames(a.b) <- c("Male", "Female")
kable(a.b) %>% kable_styling(full_width = F, position = "left")
a | b | |
---|---|---|
Male | -8.845884 | 0.0881708 |
Female | -9.340800 | 0.0876224 |
a) 연중최대일강수량 \({x_t}\)자료의 시계열도표 및 histogram을 그려보시오.
#data setting
setwd("D:\\0_grad\\data")
seoul_1971_1990 <- read.csv("seoul_1971_1990.csv", skip=7)
seoul_1991_2015 <- read.csv("seoul_1991_2015.csv", skip=7)
mount_1971_1990 <- read.csv("mount_1971_1990.csv", skip=7)
rain <- na.omit(rbind(seoul_1971_1990, seoul_1991_2015, mount_1971_1990))
rain <- rain[,c(-1, -3:-4, -6)]
names(rain) <- c("no", "year", "mm")
rain <- arrange(rain, year, desc(mm))
rain <- rain[-which(duplicated(rain$year)),-1] #중복값은 제거하고 하나를 남김, row번호가 빠른 행을 남기고, 뒤에 데이터 삭제
#시계열도표
ggplot(rain, aes(x=year, y=mm)) + geom_point() + geom_line() + theme_bw() +
labs(title="연중최대일강수량 시계열도표" , x="Year", y="최대강수량")
#Histogram
ggplot(rain, aes(x=year, y=mm)) + geom_histogram(stat="identity") + theme_bw() +
labs(title="연중최대일강수량 히스토그램", x="Year", y="최대강수량")
b) Gumbel 최대 극값 분포 \(G_0 ((x-\mu)/\sigma)\)의 Q-Q plot을 그려보고 해당 모수 \((\mu, \sigma)\)를 추정하시오.
rain <- rain %>% mutate(rank = rank(mm), pr=rank/(nrow(rain)+1), gumbel=-log(-log(pr)))
ggplot(rain, aes(gumbel, mm)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Gumbel Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(mm ~ gumbel, rain))$r.squared,4)*100, "%", sep = ""),
x="", y="rain") + theme_bw()
#모수추정
rain.lm <- lm(mm ~ gumbel, rain)
rain.mu <- rain.lm$coefficients[1]
rain.sigma <- rain.lm$coefficients[2]
rain.mu.sigma <- data.frame("mu"=rain.mu, "sigma"=rain.sigma)
rownames(rain.mu.sigma) <- c("")
kable(rain.mu.sigma) %>% kable_styling(full_width = F, position = "left")
mu | sigma | |
---|---|---|
126.6453 | 60.50695 |
c) 위 결과를 이용하여 T=50, 100, 200일 때, T-년 최대 강수량(\(\mu_T\))을 구하시오.
t <- c(50, 100, 200)
p.t <- 1-1/t
g.inverse <- -log(-log(p.t))
mu.t <- rain.mu + rain.sigma * g.inverse
#T-년 최대강수량
kable(cbind(t, mu.t)) %>% kable_styling(full_width = F, position = "left")
t | mu.t |
---|---|
50 | 362.7397 |
100 | 404.9863 |
200 | 447.0788 |
d) 두 변수 \((lny_t , x_t)\)의 산점도를 그려보고 회귀모형 :
\(lny_t = a+b \cdot x_t + e_t\) 을 적합시켜 모수 (a,b)를 추정하고 식의 의미를 설명하시오.
#data setting
setwd("D:\\0_grad\\data")
flood <- read.csv("floor.csv", na.strings = TRUE, fileEncoding = "euc-kr", header=TRUE)
names(flood) <- c("year", "total", "location")
year.flood <- flood[flood$year<2007,] %>% select("year", "total") %>% group_by(year) %>% summarise(flood=sum(total))
new.data.set <- year.flood %>% left_join(rain, by="year") %>% mutate(lny=log(flood))
#산점도
ggplot(new.data.set, aes(mm, lny)) + geom_point() + theme_bw() +
labs(title="강수량과 홍수피해액간 산점도", x="rain", y="log(flood cost)")
#모수추정
new.lm <- lm(lny ~ mm, data=new.data.set)
new.a <- new.lm$coefficients[1]
new.b <- new.lm$coefficients[2]
new.a.b <- data.frame("a"=new.a, "b"=new.b)
rownames(new.a.b) <- c("")
kable(new.a.b) %>% kable_styling(full_width = F, position = "left")
a | b | |
---|---|---|
7.379017 | 0.0325794 |
e) 연도별 홍수피해액 \(y_t\) 및 \(lny_t\)의 시계열 도표 및 histogram을 그려보시오.또 Frechet 분포의 Q-Q plot 을 그려 \(y_t\)의 분포를 찾고 해당 모수를 추정하시오.
#yt 시계열도표
yt.time <- ggplot(new.data.set, aes(year, flood))+ geom_point() + geom_line() + theme_bw() +
labs(title="yt Time series", x="Year", y="flood")
#yt 히스토그램
yt.histo <- ggplot(new.data.set, aes(year, flood))+ geom_histogram(stat="identity") + theme_bw() +
labs(title="yt Histogram", x="Year", y="flood")
#lnyt 시계열도표
lnyt.time <- ggplot(new.data.set, aes(year, lny))+ geom_point() + geom_line() + theme_bw() +
labs(title="yt Time series", x="Year", y="log(flood)")
#yt 히스토그램
lnyt.histo <- ggplot(new.data.set, aes(year, lny))+ geom_histogram(stat="identity") + theme_bw() +
labs(title="yt Histogram", x="Year", y="log(flood)")
library(gridExtra)
grid.arrange(yt.time, yt.histo, ncol=2)
grid.arrange(lnyt.time, lnyt.histo, ncol=2)
#Q-Q plot
new.data.set <- new.data.set %>% mutate(rank=rank(lny), pr=rank/(nrow(new.data.set)+1), frechet=-log(-log(pr)))
ggplot(new.data.set, aes(frechet, lny)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Frechet Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(lny ~ frechet, new.data.set))$r.squared,4)*100, "%", sep = ""),x="", y="") + theme_bw()
#모수추정
lm.fit <- lm(lny ~ frechet, data=new.data.set)
mu.hat <- lm.fit$coefficients[1]
sigma.hat <- lm.fit$coefficients[2]
new.a.b <- data.frame("mu"=mu.hat, "sigma"=sigma.hat)
rownames(new.a.b) <- c("")
kable(new.a.b) %>% kable_styling(full_width = F, position = "left")
mu | sigma | |
---|---|---|
11.42023 | 2.359451 |
아래 첨부한 과거 조선시대 및 일제시대에 측정한 서울지역 연중최대일강수량 \({x_t}\) 자료를 이용하여 최근 45년간 (1971-2015) 자료로 추정한 분포 및 이에 근거한 T-년 최대강우량 \((u_T)\) : T=50, 100, 200 값이 과연 타당한지 검토하려한다.
a) 조선시대 자료 (1777-1907)를 이용하여 Q-Q plot 으로 적절한 분포를 찾고 추정 분포를 이용하여 T=50, 100, 200 년 일때, T-년 최대강수량 \((u_T)\)을 구하시오.
library(gridExtra)
setwd("D:\\0_grad\\data")
rain <- read.csv("rain_1777_2015.csv", na.strings = "*")
names(rain) <- c("year", "gubun", "rainfull.cm")
rain$rainfull.cm <- as.numeric(rain$rainfull.cm)
rain.ggplot <- rain[complete.cases(rain),] %>% filter(gubun==1) %>% select(year, rainfull.cm)
rain.ggplot <- rain.ggplot %>%
mutate(lnx=log(rainfull.cm), rank=rank(rainfull.cm), pr=rank/(nrow(rain.ggplot)+1), normal=qnorm(pr),
exponential = -log(1-pr), logistic=log(pr/(1-pr)),gompertz=log(-log(1-pr)), gumbel=-log(-log(pr)))
nomal <- ggplot(rain.ggplot, aes(normal, rainfull.cm)) +
geom_point(size=2) + geom_smooth(method="lm", se=F) +
labs(title = "Normal Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rainfull.cm ~ normal, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") +
theme_bw()
log.nomal <- ggplot(rain.ggplot, aes(normal, lnx)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "log-Normal Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(lnx ~ normal, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
exponential <- ggplot(rain.ggplot, aes(exponential, rainfull.cm)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Exponential Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rainfull.cm ~ exponential, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
pareto <- ggplot(rain.ggplot, aes(exponential, lnx)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Pareto Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(lnx ~ exponential, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
gompertz <- ggplot(rain.ggplot, aes(gompertz, rainfull.cm)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Gompertz Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rainfull.cm ~ gompertz, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
Weibull <- ggplot(rain.ggplot, aes(gompertz, lnx)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Weibull Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(lnx ~ gompertz, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
gumbel <- ggplot(rain.ggplot, aes(gumbel, rainfull.cm)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Gumbel Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rainfull.cm ~ gumbel, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
Frechet <- ggplot(rain.ggplot, aes(gumbel, lnx)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Frechet Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(lnx ~ gumbel, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
logistic <- ggplot(rain.ggplot, aes(logistic, rainfull.cm)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Logistic Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(rainfull.cm ~ logistic, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
log.logistic <- ggplot(rain.ggplot, aes(logistic, lnx)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "log-Logistic Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(lnx ~ logistic, rain.ggplot))$r.squared,4)*100, "%", sep = ""),x="", y="rain") + theme_bw()
grid.arrange(nomal, log.nomal, exponential, pareto, ncol=2)
grid.arrange(gompertz, Weibull, gumbel, Frechet, logistic, log.logistic, ncol=2)
log-logistic이 가장 적절한 분포로 보인다.
lm.fit <- lm(lnx~ logistic, rain.ggplot)
mu.hat <- lm.fit$coefficients[1]
sigma.hat <- lm.fit$coefficients[2]
t <- c(50, 100, 200)
pr <- 1- 1/t
logistic <- log(pr/(1-pr))
#최대 강수량 (ut)
ut <- exp(mu.hat + sigma.hat*logistic)
result <- data.frame(rbind(t, ut))
colnames(result) <- c("T1", "T2", "T3")
kable(result) %>% kable_styling(full_width = F, position = "left")
T1 | T2 | T3 | |
---|---|---|---|
t | 50.00000 | 100.00000 | 200.00000 |
ut | 27.69309 | 32.59512 | 38.31943 |
b) 위에서 구한 결과 (분포 및 T-년 최대강우량)와 최근자료(1971-2015)를 이용해 Part3) a,b,c 에서 얻은 결과와 차이점 있는지 검토하고 차이점이 있는 경우 적절한 이유를 제시하시오.
#최근자료(1971-2015)로부터 얻은 최대강우량 ~ Gumbel
#조선시대(1777-1907)에 측정한 최대강우량 ~ log-logistic
result <- rbind(t, mu.t/10, ut)
colnames(result) <- c("T1", "T2", "T3")
rownames(result) <- c("T", "Part3", "Part4")
kable(result) %>% kable_styling(full_width = F, position = "left")
T1 | T2 | T3 | |
---|---|---|---|
T | 50.00000 | 100.00000 | 200.00000 |
Part3 | 36.27397 | 40.49863 | 44.70788 |
Part4 | 27.69309 | 32.59512 | 38.31943 |
단위를 맞춰주기 위해 Part3의 결과를 10으로 나눠주었고, 그 결과 최근자료로부터 계산한 T-년 최대 강우량이
조선시대 자료로부터 계산한 최대 강우량과 차이가 있음을 확인하였다.
지구 온난화 등으로 조선시대 강우량에 비해 최근의 강우량이 증가한것으로 보인다.
rain <- rain[complete.cases(rain),]
#시계열 도표
ggplot(rain, aes(year, rainfull.cm)) + geom_point() + geom_line() +
theme_bw() +labs(x="Year", y="최대강수량")
#선형모형 적합
lm.fit <- lm(rainfull.cm ~ year, rain)
alpha.hat <- lm.fit$coefficients[1]
beta.hat <- lm.fit$coefficients[2]
result <- as_tibble(cbind(alpha.hat, beta.hat))
kable(result) %>% kable_styling(full_width = F, position = "left")
alpha.hat | beta.hat |
---|---|
-20.87183 | 0.0180233 |
d) 잔차 \(e_t\)에 대한 Q-Q plot 으로 적절한 분포를 찾으시오
\(e_t = \mu + \sigma u_t\) ~ Gumble(\(\mu\), \(\sigma\))
rain <- rain %>% mutate(et = rainfull.cm - alpha.hat - beta.hat * year,
rank = rank(et), pr=rank/(nrow(rain)+1), gumbel=-log(-log(pr)))
ggplot(rain, aes(gumbel, et)) + geom_point(size=2) +
geom_smooth(method="lm", se=F) +
labs(title = "Gumbel Q-Q Plot",
subtitle = paste("R.sq = ", round(summary(lm(et ~ gumbel, rain))$r.squared,4)*100, "%", sep = ""),
x="", y="rain") + theme_bw()
#모수추정
rain.lm <- lm(et ~ gumbel, rain)
mu.hat <- rain.lm$coefficients[1]
sigma.hat <- rain.lm$coefficients[2]
rain.mu.sigma <- data.frame("mu"=mu.hat, "sigma"=sigma.hat)
rownames(rain.mu.sigma) <- c("")
kable(rain.mu.sigma) %>% kable_styling(full_width = F, position = "left")
mu | sigma | |
---|---|---|
-2.713883 | 4.775207 |
mu.2016 <- alpha.hat + mu.hat + beta.hat * 2016
u <- (30-mu.2016)/sigma.hat
prob <- 1 - exp(-exp(-u))
result <- data.frame(prob*100)
rownames(result) <- c("")
colnames(result) <- c("Probability (%)")
kable(result) %>% kable_styling(full_width = F, position = "left")
Probability (%) | |
---|---|
2.662225 |
#확률 추정
mle.function <- function(variable, t=2016) {
alpha <- variable[1]
beta <- variable[2]
sigma <- variable[3]
ut <- (rain$rainfull.cm - alpha - beta * t)/sigma
return(sum(log(exp(-ut-exp(-ut)))- log(sigma)))
}
library(optimx)
optimx(par=c(alpha.hat, beta.hat, sigma.hat), mle.function, control=list(all.methods=TRUE,maximize=TRUE,save.failures=TRUE,trace=0))
## Maximizing -- use negfn and neggr
## X.Intercept. year gumbel value fevals gevals
## BFGS -14.64166 0.01255301 4.287773 -7.166340e+02 145 11
## CG -20.87183 0.01570748 4.775208 -7.186825e+02 47 4
## Nelder-Mead -20.14394 0.01528986 4.285680 -7.166324e+02 104 NA
## L-BFGS-B NA NA NA -8.988466e+307 NA NA
## nlm -20.87183 0.01573840 4.775208 -7.186620e+02 NA NA
## nlminb -20.87182 0.01565079 4.286163 -7.166324e+02 21 45
## spg NA NA NA -8.988466e+307 NA NA
## ucminf NA NA NA -8.988466e+307 NA NA
## Rcgmin -20.87183 0.01565079 4.286102 -7.166324e+02 1002 115
## Rvmmin -20.87183 0.01565080 4.286162 -7.166324e+02 36 11
## newuoa NA NA NA -8.988466e+307 NA NA
## bobyqa NA NA NA -8.988466e+307 NA NA
## nmkb NA NA NA -8.988466e+307 NA NA
## hjkb NA NA NA -8.988466e+307 NA NA
## niter convcode kkt1 kkt2 xtime
## BFGS NA 0 FALSE FALSE 0.02
## CG NA 0 FALSE FALSE 0.00
## Nelder-Mead NA 0 FALSE FALSE 0.00
## L-BFGS-B NA 9999 NA NA 0.02
## nlm 6 0 FALSE FALSE 0.00
## nlminb 11 0 TRUE FALSE 0.00
## spg NA 9999 NA NA 0.00
## ucminf NA 9999 NA NA 0.00
## Rcgmin NA 1 TRUE FALSE 0.07
## Rvmmin NA 0 TRUE FALSE 0.00
## newuoa NA 9999 NA NA 0.00
## bobyqa NA 9999 NA NA 0.00
## nmkb NA 9999 NA NA 0.00
## hjkb NA 9999 NA NA 0.00
mle.abs <- optim(par=c(alpha.hat, beta.hat, sigma.hat), mle.function, method="BFGS", hessian=T, control=list(fnscale=-1))
mle.alpha <- mle.abs$par[1]
mle.beta <- mle.abs$par[2]
mle.sigma <- mle.abs$par[3]
mle.mu.2016 <- mle.alpha + mle.beta * 2016
mle.u <- (30-mle.mu.2016)/mle.sigma
mle.prob <- 1 - exp(-exp(-mle.u))
result <- data.frame(mle.prob*100)
rownames(result) <- c("")
colnames(result) <- c("Probability (%)")
kable(result) %>% kable_styling(full_width = F, position = "left")
Probability (%) | |
---|---|
1.094597 |
#e)의 추정값과 비교
options(scipen=100)
result <- round(rbind(c(alpha.hat, beta.hat, sigma.hat, prob), c(mle.alpha, mle.beta, mle.sigma, mle.prob)),4)
colnames(result) <- c("alpha", "beta", "sigma", "probability")
rownames(result) <- c("e) regression", "f) mle")
kable(result) %>% kable_styling(full_width = F, position = "left")
alpha | beta | sigma | probability | |
---|---|---|---|---|
|
-20.8718 | 0.0180 | 4.7752 | 0.0266 |
|
-14.6417 | 0.0126 | 4.2878 | 0.0109 |