1 Acknowledgement

The materials below are based on discussions with Bolun Zhou and Nan Li. This work would not have been possible otherwise.

2 Summary

In-sample and out-of-sample misclassification rates for the four models. The GLM, GAM, and neural network classify at the cost-minimizing cut-off pcut = 1/36 (Section 3.3); the classification tree builds the 35:1 costs into its loss matrix.

Model                              In-sample  Out-of-sample
GLM                                0.506      0.521
Classification tree                0.429      0.454
Generalized additive model (GAM)   0.512      0.523
Neural network                     0.145      0.135

Note that the neural network's low misclassification rate is misleading: it converged to predicting the majority class for every observation (Section 7), so under the 35:1 cost weighting its cost (5.08 in-sample, 4.74 out-of-sample) is by far the worst of the four models.


3 Data preparation

3.1 Load the bankruptcy data set

# packages used in this report: dplyr (select), rpart and rpart.plot (classification
# tree and its plot), mgcv (GAM), and nnet (neural network)
library(dplyr)
library(rpart)
library(rpart.plot)
library(mgcv)
library(nnet)

set.seed(13003366)
bankrupt <- read.csv("C:/Users/Zhaohu/Box/UC_SP_19_BANA7046_group_study/HW3/bankruptcy.csv", header = TRUE, sep = ",")
str(bankrupt)
## 'data.frame':    5436 obs. of  13 variables:
##  $ FYEAR: int  1999 1999 1999 1994 1999 1999 1999 1999 1987 1999 ...
##  $ DLRSN: int  0 0 0 1 0 0 0 0 1 0 ...
##  $ CUSIP: Factor w/ 5436 levels "00036020","00036110",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ R1   : num  0.307 0.761 -0.514 -0.466 2.023 ...
##  $ R2   : num  0.887 0.592 0.338 0.371 0.215 ...
##  $ R3   : num  1.648 0.453 0.299 0.496 0.183 ...
##  $ R4   : num  -0.1992 -0.3699 -0.0291 -0.3734 6.6954 ...
##  $ R5   : num  1.093 0.186 -0.433 -0.267 -1.148 ...
##  $ R6   : num  -0.3133 0.0396 0.83 0.9778 -1.5059 ...
##  $ R7   : num  -0.197 0.327 -0.708 -0.611 2.876 ...
##  $ R8   : num  1.207 0.428 0.476 0.457 0.287 ...
##  $ R9   : num  0.282 1.107 2.179 0.152 -0.986 ...
##  $ R10  : num  0.1589 0.7934 2.4846 0.0478 0.7911 ...
bankrupt <- select(bankrupt, -c(CUSIP, FYEAR))  # drop the identifier (CUSIP) and fiscal-year columns

3.2 Training data set containing 75% of the original data

# random 75/25 split into training and test sets
index <- sample(nrow(bankrupt), nrow(bankrupt) * 0.75)
bank.train <- bankrupt[index, ]
bank.test <- bankrupt[-index, ]
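As a quick sanity check (output not shown in the original report), both subsets can be verified to keep a default rate near the full-sample 14.5%:

# default (DLRSN = 1) rate in each subset; both should be near 0.145
prop.table(table(bank.train$DLRSN))
prop.table(table(bank.test$DLRSN))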

3.3 Cost function

cost <- function(r, pi){
  weight1 <- 35  # cost of a false negative (actual 1, predicted 0)
  weight0 <- 1   # cost of a false positive (actual 0, predicted 1)
  c1 <- (r == 1) & (pi == 0)  # logical vector: TRUE if actual 1 but predicted 0
  c0 <- (r == 0) & (pi == 1)  # logical vector: TRUE if actual 0 but predicted 1
  return(mean(weight1 * c1 + weight0 * c0))
}
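This 35:1 weighting also fixes the probability cut-off used for the GLM, GAM, and neural network below: predicting 1 has a lower expected cost than predicting 0 exactly when the predicted probability exceeds weight0/(weight0 + weight1). A minimal sketch of the arithmetic:

# cost-minimizing cut-off: predict 1 when p * weight1 > (1 - p) * weight0,
# i.e. when p > weight0 / (weight0 + weight1)
weight1 <- 35
weight0 <- 1
pcut <- weight0 / (weight0 + weight1)
pcut
## [1] 0.02777778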


4 GLM (logistic regression model)

# backward stepwise selection by AIC to choose the model
bank.glm <- glm(DLRSN ~ ., family = binomial, data = bank.train)
bank.glm.back <- step(bank.glm, direction = "backward")
## Start:  AIC=2313.74
## DLRSN ~ R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8 + R9 + R10
## 
##        Df Deviance    AIC
## - R5    1   2292.2 2312.2
## <none>      2291.7 2313.7
## - R4    1   2294.4 2314.4
## - R1    1   2299.2 2319.2
## - R9    1   2299.5 2319.5
## - R8    1   2306.6 2326.6
## - R6    1   2307.2 2327.2
## - R3    1   2309.2 2329.2
## - R7    1   2315.8 2335.8
## - R2    1   2338.0 2358.0
## - R10   1   2526.7 2546.7
## 
## Step:  AIC=2312.17
## DLRSN ~ R1 + R2 + R3 + R4 + R6 + R7 + R8 + R9 + R10
## 
##        Df Deviance    AIC
## <none>      2292.2 2312.2
## - R4    1   2294.5 2312.5
## - R1    1   2299.7 2317.7
## - R9    1   2305.4 2323.4
## - R8    1   2307.0 2325.0
## - R6    1   2309.0 2327.0
## - R3    1   2309.6 2327.6
## - R7    1   2316.6 2334.6
## - R2    1   2338.0 2356.0
## - R10   1   2609.2 2627.2
summary(bank.glm.back)
## 
## Call:
## glm(formula = DLRSN ~ R1 + R2 + R3 + R4 + R6 + R7 + R8 + R9 + 
##     R10, family = binomial, data = bank.train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2562  -0.4631  -0.2389  -0.1051   3.2518  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.54841    0.08125 -31.366  < 2e-16 ***
## R1           0.20554    0.07551   2.722 0.006489 ** 
## R2           0.52829    0.08102   6.520 7.02e-11 ***
## R3          -0.42049    0.10104  -4.161 3.16e-05 ***
## R4          -0.17237    0.12296  -1.402 0.160978    
## R6           0.22802    0.05583   4.084 4.43e-05 ***
## R7          -0.50970    0.10933  -4.662 3.13e-06 ***
## R8          -0.32562    0.08535  -3.815 0.000136 ***
## R9           0.33710    0.09379   3.594 0.000325 ***
## R10         -1.52747    0.09224 -16.559  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3378.2  on 4076  degrees of freedom
## Residual deviance: 2292.2  on 4067  degrees of freedom
## AIC: 2312.2
## 
## Number of Fisher Scoring iterations: 7
# in-sample prediction at the cost-minimizing cut-off
pred.glm.back.train <- predict(bank.glm.back, type = "response")
pcut1 <- 1/36  # = weight0 / (weight0 + weight1); see Section 3.3
class.glm.train <- (pred.glm.back.train > pcut1) * 1
MR_train <- mean(bank.train$DLRSN != class.glm.train)
FPR_train <- sum(bank.train$DLRSN == 0 & class.glm.train == 1) / sum(bank.train$DLRSN == 0)
FNR_train <- sum(bank.train$DLRSN == 1 & class.glm.train == 0) / sum(bank.train$DLRSN == 1)
MR_train
## [1] 0.5064999
# out-of-sample prediction
pred.glm.back.test <- predict(bank.glm.back, newdata = bank.test, type="response")
class.glm.test<- (pred.glm.back.test>pcut1)*1
MR_test<- mean(bank.test$DLRSN!=class.glm.test)
FPR_test<- sum(bank.test$DLRSN==0 & class.glm.test==1)/sum(bank.test$DLRSN==0)
FNR_test<- sum(bank.test$DLRSN==1 & class.glm.test==0)/sum(bank.test$DLRSN==1)
MR_test
## [1] 0.5209713
# cost for the test and training samples
bank.test.pred.glm <- as.numeric(predict(bank.glm.back, bank.test, type = "response") > pcut1)
bank.train.pred.glm <- as.numeric(predict(bank.glm.back, type = "response") > pcut1)
cost1=cost(bank.test$DLRSN,bank.test.pred.glm)
cost1
## [1] 0.6961001
cost2=cost(bank.train$DLRSN, bank.train.pred.glm)
cost2
## [1] 0.6649497
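Since cost() already has the two-argument form that boot::cv.glm expects, a cross-validated version of this cost can also be computed. This is a sketch, not part of the original analysis: cv.glm passes fitted probabilities rather than 0/1 classes, so the wrapper costcv (an illustrative helper) applies the 1/36 cut-off first, and K = 10 folds is an arbitrary choice.

library(boot)
# wrap cost() so the fitted probabilities are classified at pcut1 first
costcv <- function(r, p) cost(r, as.numeric(p > 1/36))
cv.result <- cv.glm(data = bank.train, glmfit = bank.glm.back, cost = costcv, K = 10)
cv.result$delta[1]  # cross-validated asymmetric cost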


5 Classification Tree

# default tree with symmetric loss (for reference) and a tree with the 35:1 loss
# matrix; in parms, the loss matrix has rows = true class and columns = predicted
# class, so a false negative (true 1, predicted 0) costs 35 and a false positive costs 1
bank.rpart <- rpart(formula = DLRSN ~ ., data = bank.train, method = "class")
bank.rpart1 <- rpart(formula = DLRSN ~ ., data = bank.train, method = "class",
                     parms = list(loss = matrix(c(0, 35, 1, 0), nrow = 2)))
bank.rpart1
## n= 4077 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 4077 3485 1 (0.854795193 0.145204807)  
##    2) R10>=0.5515403 1527  385 0 (0.992796333 0.007203667)  
##      4) R4>=-0.4143737 1493  280 0 (0.994641661 0.005358339)  
##        8) R10>=0.8677265 1159   70 0 (0.998274374 0.001725626) *
##        9) R10< 0.8677265 334  210 0 (0.982035928 0.017964072)  
##         18) R1>=0.08178313 167    0 0 (1.000000000 0.000000000) *
##         19) R1< 0.08178313 167  161 1 (0.964071856 0.035928144)  
##           38) R8< 0.5106475 121   35 0 (0.991735537 0.008264463) *
##           39) R8>=0.5106475 46   41 1 (0.891304348 0.108695652)  
##             78) R3>=0.5425691 36    0 0 (1.000000000 0.000000000) *
##             79) R3< 0.5425691 10    5 1 (0.500000000 0.500000000) *
##      5) R4< -0.4143737 34   31 1 (0.911764706 0.088235294) *
##    3) R10< 0.5515403 2550 1969 1 (0.772156863 0.227843137)  
##      6) R6< 0.0655765 1277 1156 1 (0.905246672 0.094753328)  
##       12) R10>=-0.7581692 994  937 1 (0.942655936 0.057344064)  
##         24) R2< -1.340476 255  140 0 (0.984313725 0.015686275)  
##           48) R4< 0.5185751 127    0 0 (1.000000000 0.000000000) *
##           49) R4>=0.5185751 128  124 1 (0.968750000 0.031250000)  
##             98) R1>=0.662725 72    0 0 (1.000000000 0.000000000) *
##             99) R1< 0.662725 56   52 1 (0.928571429 0.071428571) *
##         25) R2>=-1.340476 739  686 1 (0.928281461 0.071718539)  
##           50) R9>=0.5943266 61    0 0 (1.000000000 0.000000000) *
##           51) R9< 0.5943266 678  625 1 (0.921828909 0.078171091) *
##       13) R10< -0.7581692 283  219 1 (0.773851590 0.226148410) *
##      7) R6>=0.0655765 1273  813 1 (0.638648861 0.361351139) *
prp(bank.rpart1, extra = 1)  # plot the cost-sensitive tree; extra = 1 adds class counts to each node

# in-sample prediction
bank.train.pred.tree <- predict(bank.rpart1, bank.train, type="class")
table(bank.train$DLRSN, bank.train.pred.tree, dnn = c("Truth","Predicted"))
##      Predicted
## Truth    0    1
##     0 1740 1745
##     1    3  589
# out-of-sample prediction
bank.test.pred.tree <- predict(bank.rpart1, bank.test, type="class")
table(bank.test$DLRSN, bank.test.pred.tree, dnn = c("Truth","Predicted"))
##      Predicted
## Truth   0   1
##     0 563 612
##     1   5 179
# misclassification rate
mean(ifelse(bank.train$DLRSN != bank.train.pred.tree, 1, 0))
## [1] 0.4287466
mean(ifelse(bank.test$DLRSN != bank.test.pred.tree, 1, 0))
## [1] 0.4540103
# cost for the training sample
cost1 = cost(bank.train$DLRSN, bank.train.pred.tree)
cost1
## [1] 0.453765
# cost for the testing sample
cost2 = cost(bank.test$DLRSN, bank.test.pred.tree)
cost2
## [1] 0.5791023
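The tree above is grown with rpart's default complexity settings. A natural follow-up, sketched here and not performed in the original analysis, is to inspect the cross-validated error across complexity parameters and prune:

# rpart cross-validates during fitting; inspect the cp table and prune back
# to the cp with the smallest cross-validated error (xerror)
printcp(bank.rpart1)
plotcp(bank.rpart1)
best.cp <- bank.rpart1$cptable[which.min(bank.rpart1$cptable[, "xerror"]), "CP"]
bank.rpart.pruned <- prune(bank.rpart1, cp = best.cp)
prp(bank.rpart.pruned, extra = 1)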


6 GAM (generalized additive model)

# GAM with a smooth term for R1; R4 and R5 are omitted (R5 was dropped by the
# backward selection above, and R4 was insignificant in the selected GLM)
bank.gam <- gam(formula = DLRSN ~ s(R1) + R2 + R3 + R6 + R7 +
                  R8 + R9 + R10, family = binomial, data = bank.train)
summary(bank.gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## DLRSN ~ s(R1) + R2 + R3 + R6 + R7 + R8 + R9 + R10
## 
## Parametric coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.58237    0.08436 -30.611  < 2e-16 ***
## R2           0.56438    0.08118   6.952 3.60e-12 ***
## R3          -0.42691    0.10172  -4.197 2.70e-05 ***
## R6           0.29977    0.05937   5.050 4.43e-07 ***
## R7          -0.52972    0.15474  -3.423 0.000619 ***
## R8          -0.29051    0.08637  -3.363 0.000770 ***
## R9           0.38644    0.09012   4.288 1.80e-05 ***
## R10         -1.55902    0.08522 -18.294  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##         edf Ref.df Chi.sq p-value   
## s(R1) 6.953  8.012  24.99  0.0016 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.313   Deviance explained = 32.7%
## UBRE = -0.43481  Scale est. = 1         n = 4077
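The smooth term s(R1) uses about 7 effective degrees of freedom and is significant, which indicates a clearly nonlinear effect of R1. mgcv can plot the fitted smooth directly; a minimal sketch:

# fitted smooth for R1 with a shaded confidence band
plot(bank.gam, shade = TRUE, seWithMean = TRUE)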

6.1 Prediction on the training and test sets

pred.gam.train <- as.numeric(predict(bank.gam, bank.train, type = "response") > pcut1)
pred.gam.test <- as.numeric(predict(bank.gam, newdata = bank.test, type = "response") > pcut1)

6.2 Misclassification rate

mean(ifelse(bank.train$DLRSN != pred.gam.train, 1, 0))
## [1] 0.5123866
mean(ifelse(bank.test$DLRSN != pred.gam.test, 1, 0))
## [1] 0.5231788

6.3 Cost for training and testing

cost1=cost(bank.train$DLRSN, pred.gam.train)
cost1
## [1] 0.6624969
cost2=cost(bank.test$DLRSN, pred.gam.test)
cost2
## [1] 0.6983076


7 Neural network

# one hidden unit; with a numeric 0/1 response, nnet fits a least-squares
# criterion with a logistic output unit
bank.nnet <- nnet(DLRSN ~ ., data = bank.train, size = 1, maxit = 500)
## # weights:  13
## initial  value 1311.396434 
## final  value 592.000000 
## converged

7.1 In-sample error

prob.nnet.train = predict(bank.nnet, bank.train)
pred.nnet.train = as.numeric(prob.nnet.train > pcut1)
table(bank.train$DLRSN, pred.nnet.train, dnn = c("Observed","Predicted"))
##         Predicted
## Observed    0
##        0 3485
##        1  592

7.2 Out-of-sample error

prob.nnet.test = predict(bank.nnet, newdata = bank.test)
pred.nnet.test = as.numeric(prob.nnet.test > pcut1)
table(bank.test$DLRSN, pred.nnet.test, dnn = c("Observed","Predicted"))
##         Predicted
## Observed    0
##        0 1175
##        1  184

7.3 Misclassification rate

mean(ifelse(bank.train$DLRSN != pred.nnet.train, 1, 0))
## [1] 0.1452048
mean(ifelse(bank.test$DLRSN != pred.nnet.test, 1, 0))
## [1] 0.1353937

7.4 Cost for training and testing

cost1=cost(bank.train$DLRSN, pred.nnet.train)
cost1
## [1] 5.082168
cost2=cost(bank.test$DLRSN, pred.nnet.test)
cost2
## [1] 4.738779
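Both confusion matrices above have a single Predicted column, and the final objective value of 592 equals the number of class-1 cases in the training set: the network collapsed to predicting 0 for every firm, which is why its misclassification rate looks deceptively low while its weighted cost is the worst of the four models. A common remedy, sketched here with illustrative and untuned settings (size = 3, decay = 0.1) and not part of the original analysis, is to fit a classification network with a factor response and weight decay:

# a factor response makes nnet fit the entropy (classification) criterion;
# weight decay regularizes and often prevents the degenerate constant fit
bank.nnet2 <- nnet(as.factor(DLRSN) ~ ., data = bank.train,
                   size = 3, decay = 0.1, maxit = 500)
prob.nnet2.test <- predict(bank.nnet2, newdata = bank.test, type = "raw")
pred.nnet2.test <- as.numeric(prob.nnet2.test > pcut1)
cost(bank.test$DLRSN, pred.nnet2.test)  # compare against the 4.74 above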
