The materials below are based on discussions with Bolun Zhou and Nan Li; this work would not have been possible without them.
| Model | In-sample MR | Out-of-sample MR |
|---|---|---|
| Logistic regression (GLM, backward selection) | 0.506 | 0.521 |
| Classification tree | 0.429 | 0.454 |
| Generalized Additive Model (GAM) | 0.512 | 0.523 |
| Neural network | 0.145 | 0.135 |

MR is the misclassification rate; the GLM, GAM, and neural network are classified at the cost-based cut-off of 1/36, while the tree uses a 35:1 loss matrix. The neural network's low rates are deceptive: as its confusion tables show, it predicts every firm as healthy, and it incurs by far the highest asymmetric cost.
# packages used below
library(dplyr)       # select()
library(rpart)       # classification trees
library(rpart.plot)  # prp()
library(mgcv)        # gam()
library(nnet)        # nnet()

set.seed(13003366)
bankrupt <- read.csv("C:/Users/Zhaohu/Box/UC_SP_19_BANA7046_group_study/HW3/bankruptcy.csv",
                     header = TRUE, sep = ",")
str(bankrupt)
## 'data.frame': 5436 obs. of 13 variables:
## $ FYEAR: int 1999 1999 1999 1994 1999 1999 1999 1999 1987 1999 ...
## $ DLRSN: int 0 0 0 1 0 0 0 0 1 0 ...
## $ CUSIP: Factor w/ 5436 levels "00036020","00036110",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ R1 : num 0.307 0.761 -0.514 -0.466 2.023 ...
## $ R2 : num 0.887 0.592 0.338 0.371 0.215 ...
## $ R3 : num 1.648 0.453 0.299 0.496 0.183 ...
## $ R4 : num -0.1992 -0.3699 -0.0291 -0.3734 6.6954 ...
## $ R5 : num 1.093 0.186 -0.433 -0.267 -1.148 ...
## $ R6 : num -0.3133 0.0396 0.83 0.9778 -1.5059 ...
## $ R7 : num -0.197 0.327 -0.708 -0.611 2.876 ...
## $ R8 : num 1.207 0.428 0.476 0.457 0.287 ...
## $ R9 : num 0.282 1.107 2.179 0.152 -0.986 ...
## $ R10 : num 0.1589 0.7934 2.4846 0.0478 0.7911 ...
# drop the identifier columns, then split 75/25 into training and test sets
bankrupt <- select(bankrupt, -c(CUSIP, FYEAR))
index <- sample(nrow(bankrupt), nrow(bankrupt) * 0.75)
bank.train <- bankrupt[index, ]
bank.test <- bankrupt[-index, ]
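Before modeling, it is worth confirming the class balance of the split (a quick check; in this draw about 14.5% of training firms are bankrupt, matching the root node of the tree below):

prop.table(table(bank.train$DLRSN))  # share of healthy (0) vs. bankrupt (1) firms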
# asymmetric misclassification cost: a missed bankruptcy (false negative)
# costs 35 times as much as a false alarm (false positive)
cost <- function(r, pi){
  weight1 <- 35  # cost of predicting 0 when the truth is 1
  weight0 <- 1   # cost of predicting 1 when the truth is 0
  c1 <- (r == 1) & (pi == 0)  # logical vector: TRUE if actual 1 but predicted 0
  c0 <- (r == 0) & (pi == 1)  # logical vector: TRUE if actual 0 but predicted 1
  return(mean(weight1 * c1 + weight0 * c0))
}
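As a sanity check on `cost()`, a classifier that never predicts bankruptcy should pay 35 units per missed default, i.e. 35 times the positive rate; this is exactly the cost the neural network ends up with below. A minimal sketch (the `all_zero` baseline is hypothetical, not part of the original analysis):

all_zero <- rep(0, nrow(bank.train))  # hypothetical never-bankrupt baseline
cost(bank.train$DLRSN, all_zero)      # = 35 * mean(bank.train$DLRSN == 1)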
# best model: backward elimination from the full logistic regression
bank.glm <- glm(DLRSN ~ ., family = binomial, data = bank.train)
bank.glm.back <- step(bank.glm, direction = "backward")
## Start: AIC=2313.74
## DLRSN ~ R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## - R5 1 2292.2 2312.2
## <none> 2291.7 2313.7
## - R4 1 2294.4 2314.4
## - R1 1 2299.2 2319.2
## - R9 1 2299.5 2319.5
## - R8 1 2306.6 2326.6
## - R6 1 2307.2 2327.2
## - R3 1 2309.2 2329.2
## - R7 1 2315.8 2335.8
## - R2 1 2338.0 2358.0
## - R10 1 2526.7 2546.7
##
## Step: AIC=2312.17
## DLRSN ~ R1 + R2 + R3 + R4 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## <none> 2292.2 2312.2
## - R4 1 2294.5 2312.5
## - R1 1 2299.7 2317.7
## - R9 1 2305.4 2323.4
## - R8 1 2307.0 2325.0
## - R6 1 2309.0 2327.0
## - R3 1 2309.6 2327.6
## - R7 1 2316.6 2334.6
## - R2 1 2338.0 2356.0
## - R10 1 2609.2 2627.2
summary(bank.glm.back)
##
## Call:
## glm(formula = DLRSN ~ R1 + R2 + R3 + R4 + R6 + R7 + R8 + R9 +
## R10, family = binomial, data = bank.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2562 -0.4631 -0.2389 -0.1051 3.2518
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.54841 0.08125 -31.366 < 2e-16 ***
## R1 0.20554 0.07551 2.722 0.006489 **
## R2 0.52829 0.08102 6.520 7.02e-11 ***
## R3 -0.42049 0.10104 -4.161 3.16e-05 ***
## R4 -0.17237 0.12296 -1.402 0.160978
## R6 0.22802 0.05583 4.084 4.43e-05 ***
## R7 -0.50970 0.10933 -4.662 3.13e-06 ***
## R8 -0.32562 0.08535 -3.815 0.000136 ***
## R9 0.33710 0.09379 3.594 0.000325 ***
## R10 -1.52747 0.09224 -16.559 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3378.2 on 4076 degrees of freedom
## Residual deviance: 2292.2 on 4067 degrees of freedom
## AIC: 2312.2
##
## Number of Fisher Scoring iterations: 7
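The coefficients are on the log-odds scale; exponentiating turns them into odds ratios (a one-line sketch, output omitted):

exp(coef(bank.glm.back))  # e.g. exp(-1.53) ≈ 0.22: a one-unit rise in R10 cuts the odds of distress by ~78%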
# in-sample prediction
pred.glm.back.train <- predict(bank.glm.back, type = "response")
pcut1 <- 1/36
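# note: 1/36 = weight0 / (weight0 + weight1), the expected-cost-minimizing
# cut-off for a 35:1 false-negative to false-positive cost ratio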
class.glm.train <- (pred.glm.back.train > pcut1) * 1
MR_train <- mean(bank.train$DLRSN != class.glm.train)
FPR_train <- sum(bank.train$DLRSN == 0 & class.glm.train == 1) / sum(bank.train$DLRSN == 0)
FNR_train <- sum(bank.train$DLRSN == 1 & class.glm.train == 0) / sum(bank.train$DLRSN == 1)
MR_train
## [1] 0.5064999
# out-of-sample prediction
pred.glm.back.test <- predict(bank.glm.back, newdata = bank.test, type = "response")
class.glm.test <- (pred.glm.back.test > pcut1) * 1
MR_test <- mean(bank.test$DLRSN != class.glm.test)
FPR_test <- sum(bank.test$DLRSN == 0 & class.glm.test == 1) / sum(bank.test$DLRSN == 0)
FNR_test <- sum(bank.test$DLRSN == 1 & class.glm.test == 0) / sum(bank.test$DLRSN == 1)
MR_test
## [1] 0.5209713
# asymmetric cost on the test and training samples
bank.test.pred.glm <- as.numeric(predict(bank.glm.back, bank.test, type = "response") > pcut1)
bank.train.pred.glm <- as.numeric(predict(bank.glm.back, type = "response") > pcut1)
cost1 <- cost(bank.test$DLRSN, bank.test.pred.glm)
cost1
## [1] 0.6961001
cost2 <- cost(bank.train$DLRSN, bank.train.pred.glm)
cost2
## [1] 0.6649497
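For a like-for-like comparison with the confusion tables reported for the tree and the neural network below, the same table can be built for the selected GLM (a sketch using the objects above; output omitted):

table(bank.test$DLRSN, bank.test.pred.glm, dnn = c("Truth", "Predicted"))  # test-set confusion matrix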
# classification tree with default (symmetric) loss, and one with the 35:1 loss matrix
bank.rpart <- rpart(formula = DLRSN ~ ., data = bank.train, method = "class")
# loss matrix: rows = truth, columns = prediction, so a false negative costs 35
bank.rpart1 <- rpart(formula = DLRSN ~ ., data = bank.train, method = "class",
                     parms = list(loss = matrix(c(0, 35, 1, 0), nrow = 2)))
bank.rpart1
## n= 4077
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 4077 3485 1 (0.854795193 0.145204807)
## 2) R10>=0.5515403 1527 385 0 (0.992796333 0.007203667)
## 4) R4>=-0.4143737 1493 280 0 (0.994641661 0.005358339)
## 8) R10>=0.8677265 1159 70 0 (0.998274374 0.001725626) *
## 9) R10< 0.8677265 334 210 0 (0.982035928 0.017964072)
## 18) R1>=0.08178313 167 0 0 (1.000000000 0.000000000) *
## 19) R1< 0.08178313 167 161 1 (0.964071856 0.035928144)
## 38) R8< 0.5106475 121 35 0 (0.991735537 0.008264463) *
## 39) R8>=0.5106475 46 41 1 (0.891304348 0.108695652)
## 78) R3>=0.5425691 36 0 0 (1.000000000 0.000000000) *
## 79) R3< 0.5425691 10 5 1 (0.500000000 0.500000000) *
## 5) R4< -0.4143737 34 31 1 (0.911764706 0.088235294) *
## 3) R10< 0.5515403 2550 1969 1 (0.772156863 0.227843137)
## 6) R6< 0.0655765 1277 1156 1 (0.905246672 0.094753328)
## 12) R10>=-0.7581692 994 937 1 (0.942655936 0.057344064)
## 24) R2< -1.340476 255 140 0 (0.984313725 0.015686275)
## 48) R4< 0.5185751 127 0 0 (1.000000000 0.000000000) *
## 49) R4>=0.5185751 128 124 1 (0.968750000 0.031250000)
## 98) R1>=0.662725 72 0 0 (1.000000000 0.000000000) *
## 99) R1< 0.662725 56 52 1 (0.928571429 0.071428571) *
## 25) R2>=-1.340476 739 686 1 (0.928281461 0.071718539)
## 50) R9>=0.5943266 61 0 0 (1.000000000 0.000000000) *
## 51) R9< 0.5943266 678 625 1 (0.921828909 0.078171091) *
## 13) R10< -0.7581692 283 219 1 (0.773851590 0.226148410) *
## 7) R6>=0.0655765 1273 813 1 (0.638648861 0.361351139) *
prp(bank.rpart1, extra = 1)  # plot the cost-sensitive tree
# in-sample prediction
bank.train.pred.tree <- predict(bank.rpart1, bank.train, type="class")
table(bank.train$DLRSN, bank.train.pred.tree, dnn = c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 1740 1745
## 1 3 589
# out-of-sample prediction
bank.test.pred.tree <- predict(bank.rpart1, bank.test, type="class")
table(bank.test$DLRSN, bank.test.pred.tree, dnn = c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 563 612
## 1 5 179
# misclassification rate
mean(bank.train$DLRSN != bank.train.pred.tree)
## [1] 0.4287466
mean(bank.test$DLRSN != bank.test.pred.tree)
## [1] 0.4540103
# cost on the training sample
cost1 <- cost(bank.train$DLRSN, bank.train.pred.tree)
cost1
## [1] 0.453765
# cost on the test sample
cost2 <- cost(bank.test$DLRSN, bank.test.pred.tree)
cost2
## [1] 0.5791023
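The cost-sensitive tree above is grown with rpart's default complexity parameter. A natural refinement, sketched here with an illustrative (untuned) cp value, is to inspect the cross-validated error and prune:

printcp(bank.rpart1)                               # cross-validated error by complexity parameter
bank.rpart.prune <- prune(bank.rpart1, cp = 0.01)  # cp = 0.01 is a placeholder, not a tuned value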
# GAM: the selected GLM's predictors minus R4, with a smooth term for R1
bank.gam <- gam(formula = DLRSN ~ s(R1) + R2 + R3 + R6 + R7 +
                  R8 + R9 + R10, family = binomial, data = bank.train)
summary(bank.gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## DLRSN ~ s(R1) + R2 + R3 + R6 + R7 + R8 + R9 + R10
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.58237 0.08436 -30.611 < 2e-16 ***
## R2 0.56438 0.08118 6.952 3.60e-12 ***
## R3 -0.42691 0.10172 -4.197 2.70e-05 ***
## R6 0.29977 0.05937 5.050 4.43e-07 ***
## R7 -0.52972 0.15474 -3.423 0.000619 ***
## R8 -0.29051 0.08637 -3.363 0.000770 ***
## R9 0.38644 0.09012 4.288 1.80e-05 ***
## R10 -1.55902 0.08522 -18.294 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(R1) 6.953 8.012 24.99 0.0016 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.313 Deviance explained = 32.7%
## UBRE = -0.43481 Scale est. = 1 n = 4077
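With edf ≈ 7, the smooth for R1 is clearly nonlinear. mgcv's plot method draws the fitted curve with a confidence band (a sketch; plot not shown):

plot(bank.gam, shade = TRUE)  # fitted smooth s(R1) with shaded confidence band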
pred.gam.train <- as.numeric(predict(bank.gam, bank.train, type = "response") > pcut1)
pred.gam.test <- as.numeric(predict(bank.gam, newdata = bank.test, type = "response") > pcut1)
mean(bank.train$DLRSN != pred.gam.train)
## [1] 0.5123866
mean(bank.test$DLRSN != pred.gam.test)
## [1] 0.5231788
cost1 <- cost(bank.train$DLRSN, pred.gam.train)
cost1
## [1] 0.6624969
cost2 <- cost(bank.test$DLRSN, pred.gam.test)
cost2
## [1] 0.6983076
# feed-forward neural network with a single hidden unit
bank.nnet <- nnet(DLRSN ~ ., data = bank.train, size = 1, maxit = 500)
## # weights: 13
## initial value 1311.396434
## final value 592.000000
## converged
prob.nnet.train <- predict(bank.nnet, bank.train)
pred.nnet.train <- as.numeric(prob.nnet.train > pcut1)
table(bank.train$DLRSN, pred.nnet.train, dnn = c("Observed","Predicted"))
## Predicted
## Observed 0
## 0 3485
## 1 592
prob.nnet.test <- predict(bank.nnet, newdata = bank.test)
pred.nnet.test <- as.numeric(prob.nnet.test > pcut1)
table(bank.test$DLRSN, pred.nnet.test, dnn = c("Observed","Predicted"))
## Predicted
## Observed 0
## 0 1175
## 1 184
mean(bank.train$DLRSN != pred.nnet.train)
## [1] 0.1452048
mean(bank.test$DLRSN != pred.nnet.test)
## [1] 0.1353937
cost1 <- cost(bank.train$DLRSN, pred.nnet.train)
cost1
## [1] 5.082168
cost2 <- cost(bank.test$DLRSN, pred.nnet.test)
cost2
## [1] 4.738779
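Both confusion tables above contain a single "Predicted 0" column: the network converged to the trivial never-bankrupt classifier, so its misclassification rate is just the base rate while its asymmetric cost (35 times that rate) is by far the worst of the four models. One common remedy, sketched below with illustrative, untuned settings, is to standardize the inputs and add weight decay before refitting:

# illustrative refit, not part of the original analysis:
# scaled inputs, a few more hidden units, and weight decay
bank.train.sc <- bank.train
bank.train.sc[, -1] <- scale(bank.train.sc[, -1])  # DLRSN is column 1
bank.nnet2 <- nnet(DLRSN ~ ., data = bank.train.sc, size = 3, decay = 0.1, maxit = 500)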