반응 변수(target variable)
\(y=1\) (정상) / \(y=0\) (부도)
#Setup
options(scipen=100)
options(crayon.enabled = FALSE)
setwd("D:\\0_grad\\data")
library(tidyverse)
library(readxl)
library(knitr)
library(optimx)
library(kableExtra)
bankruptcy <- read_xls("Haskins-Default-100.xls", sheet=3)
bankruptcy <- bankruptcy[,-c(1,3)]
AICHp <- data.frame(p=24:0,AIC=0)
variable <- list()
a.bank <- bankruptcy
for (i in 1:24) {
variable[[i]] <- colnames(a.bank)
H <- glm (Y~., data=a.bank, family=binomial(link=logit))
AICHp[i,2] <- H$aic
k <- which.max(abs(summary(H)$coef[-1,4]))
a.bank <- a.bank[,-(k[[1]]+1)]
}
AICHp[25,2] <- glm(Y~1, data=bankruptcy, family=binomial(link=logit))$aic
ggplot(AICHp, aes(p, AIC)) + geom_point() +
geom_line() + theme_bw()
#AIC를 최소화하는 모형의 변수
variable[[which.min(AICHp$AIC)]]
## [1] "Y" "R5" "R6" "R16" "R18" "R22"
b) 최적 예측 모형 \(H_p^*\)을 선택하고 이를 이용하여 현재의 기업재무변수를 근거로 2년 후 기업의 생존확률을 구하는 예측방정식을 구하시오.
summary( glm(Y ~ R5 + R6 + R16 + R18 + R22, data=bankruptcy, family=binomial(link=logit)))
##
## Call:
## glm(formula = Y ~ R5 + R6 + R16 + R18 + R22, family = binomial(link = logit),
## data = bankruptcy)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7189 -0.4923 0.0000 0.2657 2.2575
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1831 0.5561 -3.926 0.00008643 ***
## R5 26.6133 8.3544 3.186 0.001445 **
## R6 -29.5910 8.1330 -3.638 0.000274 ***
## R16 -41.2054 14.7451 -2.795 0.005198 **
## R18 31.2559 6.9454 4.500 0.00000679 ***
## R22 16.7776 12.1240 1.384 0.166409
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.629 on 99 degrees of freedom
## Residual deviance: 60.115 on 94 degrees of freedom
## AIC: 72.115
##
## Number of Fisher Scoring iterations: 8
c) 최적 예측 모형 \(H_p&*\)에 포함되지 않는 \(\bar p = 24 - p^*\)개 변수들에 해당하는 회귀계수 벡터를 \(\bar \beta\)로 나타낼 때,
\(H_0 : \bar \beta = 0\) v.s. \(H_1 :\) not \(H_0\)을 LRT를 이용하여 검정하시오.
library(lmtest)
H0 <- glm(Y ~ R5 + R6 + R16 + R18 + R22, data=bankruptcy, family=binomial(link=logit))
H1 <- glm(Y ~., data=bankruptcy, family=binomial(link=logit))
lrtest(H0, H1)
## Likelihood ratio test
##
## Model 1: Y ~ R5 + R6 + R16 + R18 + R22
## Model 2: Y ~ R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8 + R9 + R10 + R11 +
## R12 + R13 + R14 + R15 + R16 + R17 + R18 + R19 + R20 + R21 +
## R22 + R23 + R24
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 -30.057
## 2 25 -22.328 19 15.458 0.6931
d) 최종 선택된 예측 모형(\(H_p^*\))의 재무변수 산점도 matrix를 그려보고 그 의미를 간략히 서술하시오.
또한 두 그룹의 예측 점수의 Box-plot 및 kernel density plot 을 서로 겹쳐서 그려보고 그 의미를 간략히 설명하시오.
library(GGally)
ggpairs(bankruptcy[,c(6,7,17,19,23)], aes(colour=as.factor(bankruptcy$Y), shape=as.factor(bankruptcy$Y), alpha=0.4)) + theme_bw()
myglm <- glm(Y ~ R5 + R6 + R16 + R18 + R22, data=bankruptcy, family=binomial(link=logit))
pred <- predict(myglm, bankruptcy[,c(6,7,17,19,23)] )
prob <- 1/(1+exp(-pred))
plot.data <- data.frame(y=bankruptcy[,1], Prob=prob, Logit=pred)
ggplot(plot.data) + geom_boxplot(aes(x=as.factor(Y), y=Logit)) + theme_bw() +
labs(x="Bankruptcy 여부")
ggplot(plot.data) + geom_boxplot(aes(x=as.factor(Y), y=Prob)) + theme_bw() +
labs(x="Bankruptcy 여부")
ggplot(plot.data) + geom_density(aes(x=Logit, group=as.factor(Y), color=as.factor(Y))) + theme_bw()
ggplot(plot.data) + geom_density(aes(x=Prob, group=as.factor(Y), color=as.factor(Y))) + theme_bw()
e) 최적 예측 모형의 MLE의 분산 공분산 행렬을 구하고 이를 이용하여 90% 신뢰구간을 구하시오.
attach(bankruptcy)
e.function <- function(beta){
z <- beta[1] + beta[2] * R5 + beta[3] * R6 + beta[4] * R16 + beta[5] * R18 + beta[6] * R22
G_z <- 1/ (1+exp(-z))
return(sum(log(G_z^Y * (1-G_z)^(1-Y))))
}
library(numDeriv)
e.optim.result <- optimx(par=c(myglm$coefficients), e.function, control=list(maximize=T, save.failures=T, trace=0))
## Maximizing -- use negfn and neggr
e.mle <- as.numeric(e.optim.result[1,c(1:6)])
J <- -hessian(e.function, x=e.mle)
I_inverse <- solve(J)
kable(J) %>% kable_styling(full_width = F, position = "left")
| 9.4857565 | 0.5257478 | 0.3949430 | 0.7453223 | 1.1408409 | 0.6070306 |
| 0.5257478 | 0.1833801 | 0.1431241 | 0.1190391 | 0.1148668 | 0.1073383 |
| 0.3949430 | 0.1431241 | 0.1633985 | 0.0800820 | 0.1254098 | 0.0743107 |
| 0.7453223 | 0.1190391 | 0.0800820 | 0.1409359 | 0.1482171 | 0.1195704 |
| 1.1408409 | 0.1148668 | 0.1254098 | 0.1482171 | 0.2469001 | 0.1201876 |
| 0.6070306 | 0.1073383 | 0.0743107 | 0.1195704 | 0.1201876 | 0.1096454 |
se <- sqrt(diag(I_inverse))
CI <- matrix(NA, nrow = length(e.mle), ncol = 2)
for (i in 1:length(e.mle)) {
CI[i,] <- c(e.mle[i] - qnorm(.95)*se[i], e.mle[i] + qnorm(.95)*se[i])
}
colnames(CI) <- c("Lwr", "Upr")
rownames(CI) <- c("Beta0", "Beta1", "Beta2", "Beta3", "Beta4","Beta5")
kable(CI) %>% kable_styling(full_width = F, position = "left")
| Lwr | Upr | |
|---|---|---|
| Beta0 | -3.097803 | -1.268371 |
| Beta1 | 12.871507 | 40.355142 |
| Beta2 | -42.968722 | -16.213278 |
| Beta3 | -65.458775 | -16.952059 |
| Beta4 | 19.831186 | 42.680646 |
| Beta5 | -3.164559 | 36.719770 |
#other method
e.optim.result <- optim(par=c(myglm$coefficients), e.function, method="BFGS", hessian=T)
J <- - e.optim.result$hessian
I_inverse <- solve(J)
f) 추정된 최적모형을 이용하여 각 기업의 2년 후 생존확률이 0.5이상이면 정상(1), 0.5이하이면 부도(0)로 예측할 경우
오류 발생 빈도와 총오진율을 구하고 이를 full model 완전모형 \(H_p; p=24\)의 오진율과 비교해 보시오.
predict <- ifelse(prob>=.5, 1, 0)
f.result <- table(bankruptcy$Y, predict)
kable(f.result) %>% kable_styling(full_width = F, position = "left")
| 0 | 1 | |
|---|---|---|
| 0 | 45 | 5 |
| 1 | 7 | 43 |
total.missfication <- (5+7) / (45+5+7+43) *100
paste0("Total missfication Error = ", total.missfication, "%")
## [1] "Total missfication Error = 12%"
#Full model
total.glm <- glm (Y~., data=bankruptcy, family=binomial(link=logit))
total.prob <- predict(total.glm, type="response")
total.predict <- ifelse(total.prob>=.5, 1, 0)
f.total.result <- table(bankruptcy$Y, total.predict)
kable(f.total.result) %>% kable_styling(full_width = F, position = "left")
| 0 | 1 | |
|---|---|---|
| 0 | 46 | 4 |
| 1 | 8 | 42 |
missfication.error <- (4+8)/(46+4+8+42) * 100
paste0("Total missfication Error = ", missfication.error, "%")
## [1] "Total missfication Error = 12%"