This analysis aims to predict whether a business will fall into financial distress, a common proxy for bankruptcy risk. The dataset is available here: https://www.kaggle.com/shebrahimi/financial-distress/data
#load the required packages
library(caret)        #model training, tuning and evaluation
library(mlbench)      #benchmark datasets (loaded but not used below)
library(DMwR)         #SMOTE resampling (since archived on CRAN; install from the archive if needed)
library(e1071)        #support functions required by caret
library(randomForest) #random forest implementation
library(dplyr)        #data manipulation
library(plotly)       #interactive charts
df <- read.csv("Financial Distress.csv", stringsAsFactors=FALSE)
#check the dimensions of the dataset
dim(df)
## [1] 3672 86
#take a look at column data types
str(df)
## 'data.frame': 3672 obs. of 86 variables:
## $ Company : int 1 1 1 1 2 2 2 2 2 2 ...
## $ Time : int 1 2 3 4 1 2 3 4 5 6 ...
## $ Financial.Distress: num 0.0106 -0.456 -0.3254 -0.5666 1.3573 ...
## $ x1 : num 1.28 1.27 1.05 1.11 1.06 ...
## $ x2 : num 0.02293 0.00645 -0.05938 -0.01523 0.10702 ...
## $ x3 : num 0.875 0.821 0.922 0.859 0.815 ...
## $ x4 : num 1.216 1.005 0.729 0.81 0.836 ...
## $ x5 : num 0.0609 -0.0141 0.0205 0.076 0.2 ...
## $ x6 : num 0.1883 0.181 0.0449 0.091 0.0478 ...
## $ x7 : num 0.525 0.623 0.433 0.675 0.742 ...
## $ x8 : num 0.01885 0.00642 -0.08142 -0.01881 0.12803 ...
## $ x9 : num 0.183 0.036 -0.765 -0.108 0.577 ...
## $ x10 : num 0.00645 0.0018 -0.05432 -0.06532 0.09408 ...
## $ x11 : num 0.858 0.852 0.893 0.896 0.815 ...
## $ x12 : num 2.006 -0.486 0.412 0.995 3.015 ...
## $ x13 : num 0.1255 0.1793 0.0776 0.1411 0.1854 ...
## $ x14 : num 6.97 4.58 11.89 6.09 4.39 ...
## $ x15 : num 4.65 3.75 2.49 1.64 1.62 ...
## $ x16 : num 0.0501 -0.014 0.0281 0.0939 0.2392 ...
## $ x17 : num 2.2 2.46 1.4 2.06 3.03 ...
## $ x18 : num 0.01826 0.02756 0.0126 0.0116 0.00681 ...
## $ x19 : num 0.025 0.0288 0.0681 0.0944 0.0793 ...
## $ x20 : num 0.02726 0.0411 0.01485 0.01442 0.00888 ...
## $ x21 : num 1.417 1.18 0.817 0.904 1.025 ...
## $ x22 : num 9.56 7.3 7.12 7.98 4.75 ...
## $ x23 : num 0.1487 0.056 0.0652 0.1252 0.266 ...
## $ x24 : num 0.67 0.67 0.848 0.805 0.768 ...
## $ x25 : num 214.8 38.2 -498.4 -75.9 1423.1 ...
## $ x26 : num 12.6 12.9 13.2 13.3 11.6 ...
## $ x27 : num 6.46 5.55 16.25 8.89 17.49 ...
## $ x28 : num 0.0438 0.2655 0.4166 0.0838 0.6208 ...
## $ x29 : num 0.2046 0.1502 0.0741 0.0541 0.0469 ...
## $ x30 : num 0.352 0.418 0.367 0.544 0.57 ...
## $ x31 : num 8.32 9.53 9.35 7.09 9.49 ...
## $ x32 : num 0.289 0.416 0.504 0.671 0.681 ...
## $ x33 : num 0.766 0.817 0.92 0.937 0.942 ...
## $ x34 : num 2.58 2.6 1.49 2.35 4.13 ...
## $ x35 : num 77.4 95.9 144.7 219.8 222.7 ...
## $ x36 : num 0.02672 0.00758 -0.06648 -0.017 0.13123 ...
## $ x37 : num 1.631 0.838 0.956 0.383 0.253 ...
## $ x38 : num 0.01502 0.02743 0.01727 0.01433 0.00815 ...
## $ x39 : num 0.00548 0.04543 0.02806 0.20337 0.35301 ...
## $ x40 : num 0.127 0.138 0.102 0.101 0.176 ...
## $ x41 : num 9.7 5.6 9.4 5.74 4.51 ...
## $ x42 : num -0.736 -0.644 -14.032 0.722 -0.113 ...
## $ x43 : num 0.986 1.302 0.757 1.391 1.053 ...
## $ x44 : num 0.1802 0.0469 -0.5798 -0.1501 0.6077 ...
## $ x45 : num 1.501 1.01 0.578 0.645 0.258 ...
## $ x46 : num 0.02622 0.00786 -0.06437 -0.01773 0.13138 ...
## $ x47 : num 7.05 4.6 11.99 6.11 4.42 ...
## $ x48 : num 1175 1062 651 703 2465 ...
## $ x49 : num 5.34 3.74 10.93 5.7 4.14 ...
## $ x50 : num 0.851 0.944 0.935 0.875 0.734 ...
## $ x51 : num 12.8 12.9 12.9 13.1 11.4 ...
## $ x52 : num 0.061737 -0.000565 0.041625 0.1084 0.25031 ...
## $ x53 : num 0.1809 0.0563 0.0476 0.1013 0.2224 ...
## $ x54 : num 210 250 281 414 315 ...
## $ x55 : num -0.5826 -0.4748 -1 0.565 -0.0601 ...
## $ x56 : num 0.471 0.386 0.488 0.344 0.202 ...
## $ x57 : num 0.1099 0.3693 0.0533 0.0734 1.2291 ...
## $ x58 : num 0.00 0.00 3.79e-03 3.66e-05 -2.49e-03 ...
## $ x59 : num 0.00 0.00 5.19e-03 4.53e-05 -2.98e-03 ...
## $ x60 : num 0.22 0 0 0 0.227 ...
## $ x61 : num 7.12 7.42 3.64 5.14 7.12 ...
## $ x62 : num 15.38 7.11 7.02 9.91 15.38 ...
## $ x63 : num 3.27 14.32 1.15 2.04 3.27 ...
## $ x64 : num 17.87 18.77 9.9 -1.49 17.87 ...
## $ x65 : num 34.69 124.76 6.45 -21.91 34.69 ...
## $ x66 : num 30.1 26.1 30.2 34.3 30.1 ...
## $ x67 : num 12.8 11.8 10.3 11.5 12.8 11.8 10.3 11.5 11.3 10.5 ...
## $ x68 : num 7991 8323 8747 9042 7991 ...
## $ x69 : num 364.95 0.19 11.95 -18.75 364.95 ...
## $ x70 : num 15.8 15.6 15.2 10.4 15.8 15.6 15.2 10.4 11.9 18.4 ...
## $ x71 : num 61.5 24.6 20.7 47.4 61.5 ...
## $ x72 : num 4 0 0 4 4 0 0 4 4 2 ...
## $ x73 : num 36 36 35 33 36 36 35 33 31 29 ...
## $ x74 : num 85.4 107.1 120.9 54.8 85.4 ...
## $ x75 : num 27.1 31.3 36.1 39.8 27.1 ...
## $ x76 : num 26.1 30.2 35.3 38.4 26.1 ...
## $ x77 : num 16 17 17 17.2 16 ...
## $ x78 : num 16 16 15 16 16 16 15 16 14 12 ...
## $ x79 : num 0.2 0.4 -0.2 5.6 0.2 0.4 -0.2 5.6 2.1 -6.4 ...
## $ x80 : int 22 22 22 22 29 29 29 29 29 29 ...
## $ x81 : num 0.0604 0.0106 -0.456 -0.3254 1.251 ...
## $ x82 : int 30 31 32 33 7 8 9 10 11 12 ...
## $ x83 : int 49 50 51 52 27 28 29 30 31 32 ...
#check for NA values
table(is.na(df))
##
## FALSE
## 315792
All 315,792 cells (3,672 rows × 86 columns) are accounted for, so there are no missing values to handle.
#recode the target: if "Financial.Distress" is greater than -0.50 the company is healthy (0); otherwise it is financially distressed (1)
df$Financial.Distress <- ifelse(df$Financial.Distress > -0.5, 0, 1)
df$Financial.Distress <- as.factor(df$Financial.Distress)
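A quick tabulation of the new factor shows the raw count in each class:
#count observations in each class
table(df$Financial.Distress)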
Since this is a classification problem, I will visualize the class balance with an interactive plotly donut chart:
percs <- df %>%
  group_by(Financial.Distress) %>%
  summarise(n = n()) %>%
  mutate(perc = n / nrow(df) * 100) %>%
  mutate(perc = round(perc, digits = 1))
#add readable labels for the chart
percs$health <- ifelse(percs$Financial.Distress == 0, "Healthy", "Distressed")
percs %>%
  plot_ly(labels = ~health, values = ~perc) %>%
  add_pie(hole = 0.6) %>%
  layout(title = "Class balance in target variable", showlegend = TRUE,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
We can see that the classes are highly imbalanced, with businesses in financial distress accounting for only 3.7 percent of observations. Let's first see how the model performs without oversampling the minority class in the training data:
#70/30 train/test split; seeding first so the split is reproducible
set.seed(7)
n <- nrow(df)
trainIndex <- sample(1:n, size = round(0.7*n), replace=FALSE)
#keep the target (column 3) plus predictors x1-x79; drop the Company and Time identifiers
train <- df[trainIndex, 3:82]
test <- df[-trainIndex, 3:82]
#train a random forest with 5-fold cross-validation
trainControl <- trainControl(method="cv", number=5)
fit.rf <- train(Financial.Distress ~ ., data = train, method="rf", trControl=trainControl)
p <- predict(fit.rf, test)
confusionMatrix(p, test$Financial.Distress)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1060 22
## 1 9 11
##
## Accuracy : 0.9719
## 95% CI : (0.9603, 0.9808)
## No Information Rate : 0.9701
## P-Value [Acc > NIR] : 0.40552
##
## Kappa : 0.4016
## Mcnemar's Test P-Value : 0.03114
##
## Sensitivity : 0.9916
## Specificity : 0.3333
## Pos Pred Value : 0.9797
## Neg Pred Value : 0.5500
## Prevalence : 0.9701
## Detection Rate : 0.9619
## Detection Prevalence : 0.9819
## Balanced Accuracy : 0.6625
##
## 'Positive' Class : 0
##
At 97.19 percent, the accuracy is deceptively high: the confusion matrix shows that the model identifies nearly all healthy companies but catches only a third of the distressed ones (11 of 33).
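Note that caret treats the first factor level, 0 (healthy), as the positive class by default, so the sensitivity above refers to healthy companies. To read the same matrix from the distressed-company perspective, the positive class can be set explicitly:
#report the same metrics with the distressed class (1) treated as positive
confusionMatrix(p, test$Financial.Distress, positive = "1")
With this framing, sensitivity for the distressed class is 11/33, or about 0.33.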
Now I will rebalance the training set using the SMOTE function in the DMwR package. Setting perc.over = 200 creates two synthetic distressed cases for every real one (tripling the minority class), and perc.under = 100 samples one healthy case for every synthetic case created, leaving the resampled training set with roughly three distressed cases for every two healthy ones:
#draw a new 70/30 split (note that the test set therefore differs from the run above)
n <- nrow(df)
trainIndex <- sample(1:n, size = round(0.7*n), replace=FALSE)
train <- df[trainIndex, 3:82]
test <- df[-trainIndex, 3:82]
#rebalance the training data only; the test set keeps its natural class ratio
trainSplit <- SMOTE(Financial.Distress ~ ., train, perc.over = 200, perc.under = 100)
set.seed(7)
trainControl <- trainControl(method="cv", number=5)
fit.rf <- train(Financial.Distress ~ ., data = trainSplit, method="rf", trControl=trainControl)
p <- predict(fit.rf, test)
confusionMatrix(p, test$Financial.Distress)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 914 7
## 1 143 38
##
## Accuracy : 0.8639
## 95% CI : (0.8422, 0.8836)
## No Information Rate : 0.9592
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2898
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8647
## Specificity : 0.8444
## Pos Pred Value : 0.9924
## Neg Pred Value : 0.2099
## Prevalence : 0.9592
## Detection Rate : 0.8294
## Detection Prevalence : 0.8358
## Balanced Accuracy : 0.8546
##
## 'Positive' Class : 0
##
Here we can see that although the overall accuracy has dropped to 86.39 percent, the model now catches 38 of the 45 distressed companies in the test set (about 84 percent), at the cost of 143 healthy companies flagged as false alarms. Depending on the relative cost of missing a distressed business versus investigating a healthy one, this is likely a far more useful model.
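One caveat: because SMOTE is applied to the whole training set before cross-validation, synthetic cases derived from a given company can land in both the analysis and assessment folds, which tends to make the cross-validated estimates optimistic. A sketch of an alternative, assuming a version of caret whose trainControl supports the sampling option, which applies the resampling inside each fold instead:
#sketch: let caret apply SMOTE within each CV fold rather than up front
trainControl <- trainControl(method="cv", number=5, sampling="smote")
fit.rf.cv <- train(Financial.Distress ~ ., data = train, method="rf", trControl=trainControl)
The held-out test set evaluation is unaffected either way; the difference shows up in the resampled estimates used during tuning.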