Problem
Predict whether a company will default based on its financial performance using logistic, stepwise - forward and backward regression.
Approach
We will follow these steps to classify companies for default cases:
Data Preparation and Summary
Data contains 5436 observations and 13 variables. There are no missing values in the data. For the purpose of this analysis we are ignoring the FYEAR and CUSIP columns as they are not relevant for our analysis.
library(ggplot2)
library(dplyr)
library(tidyr)
library(corrplot)
#### Input and summary of data ###
Bankruptcy <- read.csv("Bankruptcy.csv")
Bank_clean <- Bankruptcy[, -c(1, 3)]
Sample data: first six rows
head(Bankruptcy)
## FYEAR DLRSN CUSIP R1 R2 R3 R4
## 1 1999 0 00036020 0.3071395 0.8870057 1.6476808 -0.19915760
## 2 1999 0 00036110 0.7607365 0.5924934 0.4530028 -0.36989014
## 3 1999 0 00037520 -0.5135961 0.3376148 0.2990145 -0.02907996
## 4 1994 1 00078110 -0.4661293 0.3707467 0.4960667 -0.37342897
## 5 1999 0 00079X10 2.0234223 0.2148764 0.1825948 6.69536040
## 6 1999 0 00086T10 0.9074985 0.3868797 0.4778914 -0.34715872
## R5 R6 R7 R8 R9 R10
## 1 1.0929640 -0.31328867 -0.19679332 1.2067628 0.2824709 0.15889625
## 2 0.1861539 0.03961907 0.32749732 0.4284181 1.1069652 0.79344276
## 3 -0.4326047 0.82999324 -0.70778613 0.4761533 2.1791755 2.48458450
## 4 -0.2674240 0.97779888 -0.61097507 0.4568098 0.1519511 0.04778851
## 5 -1.1483381 -1.50588927 2.87647687 0.2873749 -0.9864421 0.79107673
## 6 1.4079893 -0.48390230 0.07025944 0.5278110 0.5024659 -0.16464802
Structure of the data
str(Bankruptcy)
## 'data.frame': 5436 obs. of 13 variables:
## $ FYEAR: int 1999 1999 1999 1994 1999 1999 1999 1999 1987 1999 ...
## $ DLRSN: int 0 0 0 1 0 0 0 0 1 0 ...
## $ CUSIP: Factor w/ 5436 levels "00036020","00036110",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ R1 : num 0.307 0.761 -0.514 -0.466 2.023 ...
## $ R2 : num 0.887 0.592 0.338 0.371 0.215 ...
## $ R3 : num 1.648 0.453 0.299 0.496 0.183 ...
## $ R4 : num -0.1992 -0.3699 -0.0291 -0.3734 6.6954 ...
## $ R5 : num 1.093 0.186 -0.433 -0.267 -1.148 ...
## $ R6 : num -0.3133 0.0396 0.83 0.9778 -1.5059 ...
## $ R7 : num -0.197 0.327 -0.708 -0.611 2.876 ...
## $ R8 : num 1.207 0.428 0.476 0.457 0.287 ...
## $ R9 : num 0.282 1.107 2.179 0.152 -0.986 ...
## $ R10 : num 0.1589 0.7934 2.4846 0.0478 0.7911 ...
Distribution of bankruptcy - 0’s and 1’s in the data (1 means bankrupt)
table(Bankruptcy$DLRSN)
##
## 0 1
## 4660 776
Dimension of data
dim(Bankruptcy)
## [1] 5436 13
Distribution of 0’s and 1’s in the data over the years
table(Bankruptcy$FYEAR, Bankruptcy$DLRSN)
##
## 0 1
## 1980 0 24
## 1981 0 36
## 1982 0 20
## 1983 0 41
## 1984 0 55
## 1985 0 48
## 1986 0 47
## 1987 0 44
## 1988 0 49
## 1989 0 73
## 1990 0 77
## 1991 0 47
## 1992 0 35
## 1993 0 33
## 1994 0 31
## 1995 0 40
## 1996 0 51
## 1997 0 24
## 1998 0 1
## 1999 4660 0
Univariate Analysis
Long <- Bank_clean %>% gather()
ggplot(Long, aes(value)) + facet_wrap(~ key) + geom_histogram()
Long <- Bank_clean %>% gather()
ggplot(Long, aes(x = key, y = value)) + geom_boxplot()
Bivariate Analysis
Correlation <- cor(Bank_clean)
corrplot(Correlation)
Response Variable DLRSN is highly correlated with variable R10 followed by R8, R1 and R3.
We will now employ different methods to find the best model.
Logistic Regression:Summary
### Sampling ###
set.seed(12957846)
index <- sample(nrow(Bank_clean), 0.75*nrow(Bank_clean))
Bankruptcy_train <- Bank_clean[index,]
Bankruptcy_test <- Bank_clean[-index,]
### Model Building ###
# Logistic #
model_logistic <- glm(DLRSN ~., data = Bankruptcy_train, family = "binomial")
summary(model_logistic)
##
## Call:
## glm(formula = DLRSN ~ ., family = "binomial", data = Bankruptcy_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2267 -0.4653 -0.2484 -0.1154 3.3268
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.47076 0.07762 -31.833 < 2e-16 ***
## R1 0.23062 0.07690 2.999 0.002709 **
## R2 0.54190 0.07988 6.784 1.17e-11 ***
## R3 -0.35785 0.10055 -3.559 0.000372 ***
## R4 -0.09394 0.10360 -0.907 0.364528
## R5 0.02784 0.05359 0.520 0.603349
## R6 0.23985 0.05685 4.219 2.46e-05 ***
## R7 -0.49011 0.10819 -4.530 5.90e-06 ***
## R8 -0.35951 0.08607 -4.177 2.96e-05 ***
## R9 0.28565 0.10672 2.677 0.007433 **
## R10 -1.49359 0.09925 -15.048 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3364.0 on 4076 degrees of freedom
## Residual deviance: 2308.7 on 4066 degrees of freedom
## AIC: 2330.7
##
## Number of Fisher Scoring iterations: 7
Logistic model deviance = 2308.66
Logistic model AIC = 2330.66
Logistic model BIC = 2400.11
Stepwise - Forward, backward, both and backward with bic
# Variabel Selection #
NUll_model <- glm(DLRSN ~ 1, data = Bankruptcy_train, family = "binomial")
Full_model <- glm(DLRSN ~ ., data = Bankruptcy_train, family = "binomial")
Backward_selection <- step(Full_model, direction = "backward")
## Start: AIC=2330.66
## DLRSN ~ R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## - R5 1 2308.9 2328.9
## - R4 1 2309.6 2329.6
## <none> 2308.7 2330.7
## - R9 1 2315.9 2335.9
## - R1 1 2317.8 2337.8
## - R3 1 2321.3 2341.3
## - R6 1 2326.6 2346.6
## - R8 1 2327.0 2347.0
## - R7 1 2331.4 2351.4
## - R2 1 2358.3 2378.3
## - R10 1 2559.7 2579.7
##
## Step: AIC=2328.93
## DLRSN ~ R1 + R2 + R3 + R4 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## - R4 1 2309.7 2327.7
## <none> 2308.9 2328.9
## - R1 1 2318.2 2336.2
## - R9 1 2320.5 2338.5
## - R3 1 2321.6 2339.6
## - R8 1 2327.3 2345.3
## - R6 1 2328.1 2346.1
## - R7 1 2331.9 2349.9
## - R2 1 2358.3 2376.3
## - R10 1 2642.8 2660.8
##
## Step: AIC=2327.69
## DLRSN ~ R1 + R2 + R3 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## <none> 2309.7 2327.7
## - R1 1 2319.4 2335.4
## - R3 1 2322.3 2338.3
## - R9 1 2325.3 2341.3
## - R8 1 2327.7 2343.7
## - R6 1 2330.2 2346.2
## - R7 1 2337.9 2353.9
## - R2 1 2361.2 2377.2
## - R10 1 2763.3 2779.3
Forward_selection <- step(NUll_model, direction = "forward", scope =
list(lower = NUll_model, upper = Full_model))
## Start: AIC=3365.99
## DLRSN ~ 1
##
## Df Deviance AIC
## + R10 1 2535.2 2539.2
## + R6 1 3053.5 3057.5
## + R8 1 3066.3 3070.3
## + R3 1 3097.0 3101.0
## + R1 1 3160.0 3164.0
## + R9 1 3193.3 3197.3
## + R7 1 3195.6 3199.6
## + R4 1 3196.9 3200.9
## + R2 1 3208.8 3212.8
## + R5 1 3251.3 3255.3
## <none> 3364.0 3366.0
##
## Step: AIC=2539.2
## DLRSN ~ R10
##
## Df Deviance AIC
## + R7 1 2470.4 2476.4
## + R6 1 2481.1 2487.1
## + R9 1 2487.6 2493.6
## + R8 1 2488.8 2494.8
## + R4 1 2489.3 2495.3
## + R3 1 2507.7 2513.7
## + R1 1 2508.4 2514.4
## + R5 1 2510.9 2516.9
## + R2 1 2533.0 2539.0
## <none> 2535.2 2539.2
##
## Step: AIC=2476.38
## DLRSN ~ R10 + R7
##
## Df Deviance AIC
## + R9 1 2437.8 2445.8
## + R8 1 2438.7 2446.7
## + R3 1 2450.2 2458.2
## + R4 1 2456.1 2464.1
## + R5 1 2456.1 2464.1
## + R6 1 2456.2 2464.2
## + R2 1 2460.0 2468.0
## <none> 2470.4 2476.4
## + R1 1 2469.5 2477.5
##
## Step: AIC=2445.77
## DLRSN ~ R10 + R7 + R9
##
## Df Deviance AIC
## + R8 1 2376.2 2386.2
## + R3 1 2379.0 2389.0
## + R6 1 2422.8 2432.8
## <none> 2437.8 2445.8
## + R4 1 2436.5 2446.5
## + R5 1 2436.6 2446.6
## + R2 1 2437.3 2447.3
## + R1 1 2437.4 2447.4
##
## Step: AIC=2386.21
## DLRSN ~ R10 + R7 + R9 + R8
##
## Df Deviance AIC
## + R2 1 2342.8 2354.8
## + R3 1 2370.4 2382.4
## + R4 1 2372.4 2384.4
## + R6 1 2373.5 2385.5
## <none> 2376.2 2386.2
## + R1 1 2375.1 2387.1
## + R5 1 2376.2 2388.2
##
## Step: AIC=2354.84
## DLRSN ~ R10 + R7 + R9 + R8 + R2
##
## Df Deviance AIC
## + R3 1 2330.3 2344.3
## + R6 1 2332.0 2346.0
## + R4 1 2340.8 2354.8
## <none> 2342.8 2354.8
## + R5 1 2341.9 2355.9
## + R1 1 2342.7 2356.7
##
## Step: AIC=2344.34
## DLRSN ~ R10 + R7 + R9 + R8 + R2 + R3
##
## Df Deviance AIC
## + R6 1 2319.4 2335.4
## + R4 1 2328.3 2344.3
## <none> 2330.3 2344.3
## + R5 1 2329.4 2345.4
## + R1 1 2330.2 2346.2
##
## Step: AIC=2335.44
## DLRSN ~ R10 + R7 + R9 + R8 + R2 + R3 + R6
##
## Df Deviance AIC
## + R1 1 2309.7 2327.7
## <none> 2319.4 2335.4
## + R4 1 2318.2 2336.2
## + R5 1 2319.2 2337.2
##
## Step: AIC=2327.69
## DLRSN ~ R10 + R7 + R9 + R8 + R2 + R3 + R6 + R1
##
## Df Deviance AIC
## <none> 2309.7 2327.7
## + R4 1 2308.9 2328.9
## + R5 1 2309.6 2329.6
Both_selection <- step(NUll_model, direction = "both", scope =
list(lower = NUll_model, upper = Full_model))
## Start: AIC=3365.99
## DLRSN ~ 1
##
## Df Deviance AIC
## + R10 1 2535.2 2539.2
## + R6 1 3053.5 3057.5
## + R8 1 3066.3 3070.3
## + R3 1 3097.0 3101.0
## + R1 1 3160.0 3164.0
## + R9 1 3193.3 3197.3
## + R7 1 3195.6 3199.6
## + R4 1 3196.9 3200.9
## + R2 1 3208.8 3212.8
## + R5 1 3251.3 3255.3
## <none> 3364.0 3366.0
##
## Step: AIC=2539.2
## DLRSN ~ R10
##
## Df Deviance AIC
## + R7 1 2470.4 2476.4
## + R6 1 2481.0 2487.0
## + R9 1 2487.6 2493.6
## + R8 1 2488.8 2494.8
## + R4 1 2489.3 2495.3
## + R3 1 2507.7 2513.7
## + R1 1 2508.4 2514.4
## + R5 1 2511.0 2517.0
## + R2 1 2533.0 2539.0
## <none> 2535.2 2539.2
## - R10 1 3364.0 3366.0
##
## Step: AIC=2476.38
## DLRSN ~ R10 + R7
##
## Df Deviance AIC
## + R9 1 2437.8 2445.8
## + R8 1 2438.7 2446.7
## + R3 1 2450.2 2458.2
## + R4 1 2456.1 2464.1
## + R5 1 2456.1 2464.1
## + R6 1 2456.2 2464.2
## + R2 1 2460.0 2468.0
## <none> 2470.4 2476.4
## + R1 1 2469.5 2477.5
## - R7 1 2535.2 2539.2
## - R10 1 3195.6 3199.6
##
## Step: AIC=2445.77
## DLRSN ~ R10 + R7 + R9
##
## Df Deviance AIC
## + R8 1 2376.2 2386.2
## + R3 1 2379.0 2389.0
## + R6 1 2422.8 2432.8
## <none> 2437.8 2445.8
## + R4 1 2436.5 2446.5
## + R5 1 2436.6 2446.6
## + R2 1 2437.3 2447.3
## + R1 1 2437.4 2447.4
## - R9 1 2470.4 2476.4
## - R7 1 2487.6 2493.6
## - R10 1 2993.5 2999.5
##
## Step: AIC=2386.21
## DLRSN ~ R10 + R7 + R9 + R8
##
## Df Deviance AIC
## + R2 1 2342.8 2354.8
## + R3 1 2370.4 2382.4
## + R4 1 2372.4 2384.4
## + R6 1 2373.5 2385.5
## <none> 2376.2 2386.2
## + R1 1 2375.1 2387.1
## + R5 1 2376.2 2388.2
## - R7 1 2401.7 2409.7
## - R8 1 2437.8 2445.8
## - R9 1 2438.7 2446.7
## - R10 1 2905.1 2913.1
##
## Step: AIC=2354.84
## DLRSN ~ R10 + R7 + R9 + R8 + R2
##
## Df Deviance AIC
## + R3 1 2330.3 2344.3
## + R6 1 2332.0 2346.0
## + R4 1 2340.8 2354.8
## <none> 2342.8 2354.8
## + R5 1 2341.9 2355.9
## + R1 1 2342.7 2356.7
## - R9 1 2369.2 2379.2
## - R2 1 2376.2 2386.2
## - R7 1 2381.4 2391.4
## - R8 1 2437.3 2447.3
## - R10 1 2879.6 2889.6
##
## Step: AIC=2344.34
## DLRSN ~ R10 + R7 + R9 + R8 + R2 + R3
##
## Df Deviance AIC
## + R6 1 2319.4 2335.4
## + R4 1 2328.3 2344.3
## <none> 2330.3 2344.3
## + R5 1 2329.4 2345.4
## + R1 1 2330.2 2346.2
## - R3 1 2342.8 2354.8
## - R8 1 2349.5 2361.5
## - R9 1 2364.2 2376.2
## - R2 1 2370.4 2382.4
## - R7 1 2370.5 2382.5
## - R10 1 2872.8 2884.8
##
## Step: AIC=2335.44
## DLRSN ~ R10 + R7 + R9 + R8 + R2 + R3 + R6
##
## Df Deviance AIC
## + R1 1 2309.7 2327.7
## <none> 2319.4 2335.4
## + R4 1 2318.2 2336.2
## + R5 1 2319.2 2337.2
## - R6 1 2330.3 2344.3
## - R3 1 2332.0 2346.0
## - R8 1 2336.5 2350.5
## - R7 1 2338.0 2352.0
## - R9 1 2345.9 2359.9
## - R2 1 2368.1 2382.1
## - R10 1 2798.2 2812.2
##
## Step: AIC=2327.69
## DLRSN ~ R10 + R7 + R9 + R8 + R2 + R3 + R6 + R1
##
## Df Deviance AIC
## <none> 2309.7 2327.7
## + R4 1 2308.9 2328.9
## + R5 1 2309.6 2329.6
## - R1 1 2319.4 2335.4
## - R3 1 2322.3 2338.3
## - R9 1 2325.3 2341.3
## - R8 1 2327.7 2343.7
## - R6 1 2330.2 2346.2
## - R7 1 2337.9 2353.9
## - R2 1 2361.2 2377.2
## - R10 1 2763.3 2779.3
Backward_bic_selection <- step(Full_model, k = log(nrow(Bankruptcy_train)))
## Start: AIC=2400.11
## DLRSN ~ R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## - R5 1 2308.9 2392.1
## - R4 1 2309.6 2392.7
## - R9 1 2315.9 2399.1
## <none> 2308.7 2400.1
## - R1 1 2317.8 2400.9
## - R3 1 2321.3 2404.4
## - R6 1 2326.6 2409.7
## - R8 1 2327.0 2410.1
## - R7 1 2331.4 2414.5
## - R2 1 2358.3 2441.4
## - R10 1 2559.7 2642.8
##
## Step: AIC=2392.06
## DLRSN ~ R1 + R2 + R3 + R4 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## - R4 1 2309.7 2384.5
## <none> 2308.9 2392.1
## - R1 1 2318.2 2393.0
## - R9 1 2320.5 2395.4
## - R3 1 2321.6 2396.4
## - R8 1 2327.3 2402.1
## - R6 1 2328.1 2402.9
## - R7 1 2331.9 2406.7
## - R2 1 2358.3 2433.1
## - R10 1 2642.8 2717.6
##
## Step: AIC=2384.51
## DLRSN ~ R1 + R2 + R3 + R6 + R7 + R8 + R9 + R10
##
## Df Deviance AIC
## <none> 2309.7 2384.5
## - R1 1 2319.4 2385.9
## - R3 1 2322.3 2388.8
## - R9 1 2325.3 2391.8
## - R8 1 2327.7 2394.2
## - R6 1 2330.2 2396.7
## - R7 1 2337.9 2404.4
## - R2 1 2361.2 2427.7
## - R10 1 2763.3 2829.8
AIC - Backward Selection = 2327.69
AIC - Forward Selection = 2327.69
AIC - Both Selection = 2327.69
AIC - Backward BIC Selection = 2327.69
BIC - Backward Selection = 2384.51
BIC - Forward Selection = 2384.51
BIC - Both Selection = 2384.51
BIC - Backward BIC Selection = 2384.51
In-sample Prediction
# ROC Curve #
# In-sample prediction
pred_Bankruptcy_train <- predict(Forward_selection, type = "response")
library(ROCR)
pred <- prediction(pred_Bankruptcy_train, Bankruptcy_train$DLRSN)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE)
AUC for in-sample prediction = 0.87
Dimension for train data: 4077, 11
Dimension for test data: 1359, 11
Optimal Cut-off
# Optimal Cut-off
costfunc = function(obs, pred.p, pcut){
weight1 = 35 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs == 1) & (pred.p < pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs == 0) & (pred.p >= pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
}
p.seq = seq(0.01, 1, 0.01)
cost = rep(0, length(p.seq))
for (i in 1:length(p.seq)) {
cost[i] = costfunc(obs = Bankruptcy_train$DLRSN,
pred.p = pred_Bankruptcy_train, pcut = p.seq[i])
} # end of the loop
plot(p.seq, cost)
optimal.pcut = p.seq[which(cost == min(cost))]
Optimal cut-off = 0.02
Contingency table for train data set
class_Bankruptcy_train_opt <- (pred_Bankruptcy_train > optimal.pcut)*1
table(Bankruptcy_train$DLRSN, class_Bankruptcy_train_opt,
dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 1105 2384
## 1 11 577
In_AMR <- (35*11+1*577)/4077
Asymmetric Misclassification rate for train data = 0.24
Out-of-sample Prediction
pred_Bankruptcy_test <- predict(Forward_selection,
newdata = Bankruptcy_test, type = "response")
pred_test <- prediction(pred_Bankruptcy_test, Bankruptcy_test$DLRSN)
perf_test <- performance(pred_test, "tpr", "fpr")
plot(perf_test, colorize = TRUE)
AUC for out-of-sample prediction = 0.9
class_Bankruptcy_test_opt <- (pred_Bankruptcy_test > optimal.pcut)*1
table(Bankruptcy_test$DLRSN, class_Bankruptcy_test_opt,
dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 360 811
## 1 2 186
Out_AMR <- (35*2 + 1*186)/1359
Asymmetric Misclassification rate on test data = 0.19
Final Model
We developed the following models and calculated their AIC and BIC to find the best model. Stepwise – forward, backward and backward with BIC gave us the same result. We chose Stepwise – Backward with BIC as our final model.
AUC and cost function were calculated for in sample and out of sample data. Following results were obtained:
Training Data
plot(perf, colorize = TRUE)
AUC for in-sample prediction = 0.87
Contingency table on train data
class_Bankruptcy_train_opt <- (pred_Bankruptcy_train > optimal.pcut)*1
table(Bankruptcy_train$DLRSN, class_Bankruptcy_train_opt,
dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 1105 2384
## 1 11 577
Asymmetric Misclassification rate for train data = 0.24
Test Data
plot(perf_test, colorize = TRUE)
AUC for out-of-sample prediction = 0.9
Contingency table on test data
table(Bankruptcy_test$DLRSN, class_Bankruptcy_test_opt,
dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 360 811
## 1 2 186
Asymmetric Misclassification rate on test data = 0.19
AUC for out of sample was found to be more. Cost function of out-of-sample was found to be less than in sample data.