# Session-wide options for the analysis.
# NOTE(review): warn = -1 silences every warning for the rest of the session,
# which can hide real problems; consider scoping suppression to single calls.
# scipen = 999 biases printing toward fixed rather than scientific notation.
options(warn = -1, scipen = 999)
suppressWarnings(suppressPackageStartupMessages({
library(tidymodels)
library(tidyverse)
library(dplyr)
library(rpart)
library(Amelia)
library(corrr)
library(corrplot)
library(DMwR)
library(ROSE)
library(caret)
library(skimr)
library(DataExplorer)
library(themis)
library(vip)
}))
# Read the raw CSV and rename its first column to `Bankrupt`.
company_data <- read.csv(
  "~/Loan Prediction Approval/Bankrupcy Company Prediction/company_bankrupcy.csv"
)
names(company_data)[1] <- "Bankrupt"

# Recode the 1/0 indicator as a labelled factor. "Bankrupt" is listed first,
# so it becomes the first factor level.
company_data[["Bankrupt"]] <- factor(
  company_data[["Bankrupt"]],
  levels = c(1, 0),
  labels = c("Bankrupt", "No Bankrupt")
)

# Drop the Net.Income.Flag column, then print a per-column overview.
company_data[["Net.Income.Flag"]] <- NULL
skim(company_data)
| Name | company_data |
| Number of rows | 6819 |
| Number of columns | 95 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 94 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Bankrupt | 0 | 1 | FALSE | 2 | No : 6599, Ban: 220 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| ROA.C..before.interest.and.depreciation.before.interest | 0 | 1 | 0.51 | 0.06 | 0 | 0.48 | 0.50 | 0.54 | 1 | ▁▁▇▁▁ |
| ROA.A..before.interest.and…after.tax | 0 | 1 | 0.56 | 0.07 | 0 | 0.54 | 0.56 | 0.59 | 1 | ▁▁▇▂▁ |
| ROA.B..before.interest.and.depreciation.after.tax | 0 | 1 | 0.55 | 0.06 | 0 | 0.53 | 0.55 | 0.58 | 1 | ▁▁▇▂▁ |
| Operating.Gross.Margin | 0 | 1 | 0.61 | 0.02 | 0 | 0.60 | 0.61 | 0.61 | 1 | ▁▁▂▇▁ |
| Realized.Sales.Gross.Margin | 0 | 1 | 0.61 | 0.02 | 0 | 0.60 | 0.61 | 0.61 | 1 | ▁▁▂▇▁ |
| Operating.Profit.Rate | 0 | 1 | 1.00 | 0.01 | 0 | 1.00 | 1.00 | 1.00 | 1 | ▁▁▁▁▇ |
| Pre.tax.net.Interest.Rate | 0 | 1 | 0.80 | 0.01 | 0 | 0.80 | 0.80 | 0.80 | 1 | ▁▁▁▇▁ |
| After.tax.net.Interest.Rate | 0 | 1 | 0.81 | 0.01 | 0 | 0.81 | 0.81 | 0.81 | 1 | ▁▁▁▁▇ |
| Non.industry.income.and.expenditure.revenue | 0 | 1 | 0.30 | 0.01 | 0 | 0.30 | 0.30 | 0.30 | 1 | ▁▇▁▁▁ |
| Continuous.interest.rate..after.tax. | 0 | 1 | 0.78 | 0.01 | 0 | 0.78 | 0.78 | 0.78 | 1 | ▁▁▁▇▁ |
| Operating.Expense.Rate | 0 | 1 | 1995347312.80 | 3237683890.52 | 0 | 0.00 | 0.00 | 4145000000.00 | 9990000000 | ▇▁▁▁▁ |
| Research.and.development.expense.rate | 0 | 1 | 1950427306.06 | 2598291554.00 | 0 | 0.00 | 509000000.00 | 3450000000.00 | 9980000000 | ▇▂▁▁▁ |
| Cash.flow.rate | 0 | 1 | 0.47 | 0.02 | 0 | 0.46 | 0.47 | 0.47 | 1 | ▁▁▇▁▁ |
| Interest.bearing.debt.interest.rate | 0 | 1 | 16448012.91 | 108275033.53 | 0 | 0.00 | 0.00 | 0.00 | 990000000 | ▇▁▁▁▁ |
| Tax.rate..A. | 0 | 1 | 0.12 | 0.14 | 0 | 0.00 | 0.07 | 0.21 | 1 | ▇▂▁▁▁ |
| Net.Value.Per.Share..B. | 0 | 1 | 0.19 | 0.03 | 0 | 0.17 | 0.18 | 0.20 | 1 | ▇▂▁▁▁ |
| Net.Value.Per.Share..A. | 0 | 1 | 0.19 | 0.03 | 0 | 0.17 | 0.18 | 0.20 | 1 | ▇▂▁▁▁ |
| Net.Value.Per.Share..C. | 0 | 1 | 0.19 | 0.03 | 0 | 0.17 | 0.18 | 0.20 | 1 | ▇▂▁▁▁ |
| Persistent.EPS.in.the.Last.Four.Seasons | 0 | 1 | 0.23 | 0.03 | 0 | 0.21 | 0.22 | 0.24 | 1 | ▁▇▁▁▁ |
| Cash.Flow.Per.Share | 0 | 1 | 0.32 | 0.02 | 0 | 0.32 | 0.32 | 0.33 | 1 | ▁▇▁▁▁ |
| Revenue.Per.Share..Yuan… | 0 | 1 | 1328640.60 | 51707089.77 | 0 | 0.02 | 0.03 | 0.05 | 3020000000 | ▇▁▁▁▁ |
| Operating.Profit.Per.Share..Yuan… | 0 | 1 | 0.11 | 0.03 | 0 | 0.10 | 0.10 | 0.12 | 1 | ▇▁▁▁▁ |
| Per.Share.Net.profit.before.tax..Yuan… | 0 | 1 | 0.18 | 0.03 | 0 | 0.17 | 0.18 | 0.19 | 1 | ▇▂▁▁▁ |
| Realized.Sales.Gross.Profit.Growth.Rate | 0 | 1 | 0.02 | 0.01 | 0 | 0.02 | 0.02 | 0.02 | 1 | ▇▁▁▁▁ |
| Operating.Profit.Growth.Rate | 0 | 1 | 0.85 | 0.01 | 0 | 0.85 | 0.85 | 0.85 | 1 | ▁▁▁▁▇ |
| After.tax.Net.Profit.Growth.Rate | 0 | 1 | 0.69 | 0.01 | 0 | 0.69 | 0.69 | 0.69 | 1 | ▁▁▁▇▁ |
| Regular.Net.Profit.Growth.Rate | 0 | 1 | 0.69 | 0.01 | 0 | 0.69 | 0.69 | 0.69 | 1 | ▁▁▁▇▁ |
| Continuous.Net.Profit.Growth.Rate | 0 | 1 | 0.22 | 0.01 | 0 | 0.22 | 0.22 | 0.22 | 1 | ▁▇▁▁▁ |
| Total.Asset.Growth.Rate | 0 | 1 | 5508096595.25 | 2897717771.17 | 0 | 4860000000.00 | 6400000000.00 | 7390000000.00 | 9990000000 | ▃▁▃▇▂ |
| Net.Value.Growth.Rate | 0 | 1 | 1566212.06 | 114159389.52 | 0 | 0.00 | 0.00 | 0.00 | 9330000000 | ▇▁▁▁▁ |
| Total.Asset.Return.Growth.Rate.Ratio | 0 | 1 | 0.26 | 0.01 | 0 | 0.26 | 0.26 | 0.26 | 1 | ▁▇▁▁▁ |
| Cash.Reinvestment.. | 0 | 1 | 0.38 | 0.02 | 0 | 0.37 | 0.38 | 0.39 | 1 | ▁▇▁▁▁ |
| Current.Ratio | 0 | 1 | 403284.95 | 33302155.83 | 0 | 0.01 | 0.01 | 0.02 | 2750000000 | ▇▁▁▁▁ |
| Quick.Ratio | 0 | 1 | 8376594.82 | 244684748.45 | 0 | 0.00 | 0.01 | 0.01 | 9230000000 | ▇▁▁▁▁ |
| Interest.Expense.Ratio | 0 | 1 | 0.63 | 0.01 | 0 | 0.63 | 0.63 | 0.63 | 1 | ▁▁▁▇▁ |
| Total.debt.Total.net.worth | 0 | 1 | 4416336.71 | 168406905.28 | 0 | 0.00 | 0.01 | 0.01 | 9940000000 | ▇▁▁▁▁ |
| Debt.ratio.. | 0 | 1 | 0.11 | 0.05 | 0 | 0.07 | 0.11 | 0.15 | 1 | ▇▁▁▁▁ |
| Net.worth.Assets | 0 | 1 | 0.89 | 0.05 | 0 | 0.85 | 0.89 | 0.93 | 1 | ▁▁▁▁▇ |
| Long.term.fund.suitability.ratio..A. | 0 | 1 | 0.01 | 0.03 | 0 | 0.01 | 0.01 | 0.01 | 1 | ▇▁▁▁▁ |
| Borrowing.dependency | 0 | 1 | 0.37 | 0.02 | 0 | 0.37 | 0.37 | 0.38 | 1 | ▁▇▁▁▁ |
| Contingent.liabilities.Net.worth | 0 | 1 | 0.01 | 0.01 | 0 | 0.01 | 0.01 | 0.01 | 1 | ▇▁▁▁▁ |
| Operating.profit.Paid.in.capital | 0 | 1 | 0.11 | 0.03 | 0 | 0.10 | 0.10 | 0.12 | 1 | ▇▁▁▁▁ |
| Net.profit.before.tax.Paid.in.capital | 0 | 1 | 0.18 | 0.03 | 0 | 0.17 | 0.18 | 0.19 | 1 | ▇▂▁▁▁ |
| Inventory.and.accounts.receivable.Net.value | 0 | 1 | 0.40 | 0.01 | 0 | 0.40 | 0.40 | 0.40 | 1 | ▁▇▇▁▁ |
| Total.Asset.Turnover | 0 | 1 | 0.14 | 0.10 | 0 | 0.08 | 0.12 | 0.18 | 1 | ▇▂▁▁▁ |
| Accounts.Receivable.Turnover | 0 | 1 | 12789705.24 | 278259836.98 | 0 | 0.00 | 0.00 | 0.00 | 9740000000 | ▇▁▁▁▁ |
| Average.Collection.Days | 0 | 1 | 9826220.86 | 256358895.71 | 0 | 0.00 | 0.01 | 0.01 | 9730000000 | ▇▁▁▁▁ |
| Inventory.Turnover.Rate..times. | 0 | 1 | 2149106056.61 | 3247967014.05 | 0 | 0.00 | 0.00 | 4620000000.00 | 9990000000 | ▇▁▁▁▁ |
| Fixed.Assets.Turnover.Frequency | 0 | 1 | 1008595981.82 | 2477557316.92 | 0 | 0.00 | 0.00 | 0.00 | 9990000000 | ▇▁▁▁▁ |
| Net.Worth.Turnover.Rate..times. | 0 | 1 | 0.04 | 0.04 | 0 | 0.02 | 0.03 | 0.04 | 1 | ▇▁▁▁▁ |
| Revenue.per.person | 0 | 1 | 2325854.27 | 136632654.39 | 0 | 0.01 | 0.02 | 0.04 | 8810000000 | ▇▁▁▁▁ |
| Operating.profit.per.person | 0 | 1 | 0.40 | 0.03 | 0 | 0.39 | 0.40 | 0.40 | 1 | ▁▇▃▁▁ |
| Allocation.rate.per.person | 0 | 1 | 11255785.32 | 294506294.12 | 0 | 0.00 | 0.01 | 0.02 | 9570000000 | ▇▁▁▁▁ |
| Working.Capital.to.Total.Assets | 0 | 1 | 0.81 | 0.06 | 0 | 0.77 | 0.81 | 0.85 | 1 | ▁▁▁▆▇ |
| Quick.Assets.Total.Assets | 0 | 1 | 0.40 | 0.20 | 0 | 0.24 | 0.39 | 0.54 | 1 | ▅▇▇▃▁ |
| Current.Assets.Total.Assets | 0 | 1 | 0.52 | 0.22 | 0 | 0.35 | 0.51 | 0.69 | 1 | ▂▆▇▆▃ |
| Cash.Total.Assets | 0 | 1 | 0.12 | 0.14 | 0 | 0.03 | 0.07 | 0.16 | 1 | ▇▁▁▁▁ |
| Quick.Assets.Current.Liability | 0 | 1 | 3592902.20 | 171620908.61 | 0 | 0.01 | 0.01 | 0.01 | 8820000000 | ▇▁▁▁▁ |
| Cash.Current.Liability | 0 | 1 | 37159994.15 | 510350903.16 | 0 | 0.00 | 0.00 | 0.01 | 9650000000 | ▇▁▁▁▁ |
| Current.Liability.to.Assets | 0 | 1 | 0.09 | 0.05 | 0 | 0.05 | 0.08 | 0.12 | 1 | ▇▁▁▁▁ |
| Operating.Funds.to.Liability | 0 | 1 | 0.35 | 0.04 | 0 | 0.34 | 0.35 | 0.36 | 1 | ▁▇▁▁▁ |
| Inventory.Working.Capital | 0 | 1 | 0.28 | 0.01 | 0 | 0.28 | 0.28 | 0.28 | 1 | ▁▇▁▁▁ |
| Inventory.Current.Liability | 0 | 1 | 55806804.53 | 582051554.62 | 0 | 0.00 | 0.01 | 0.01 | 9910000000 | ▇▁▁▁▁ |
| Current.Liabilities.Liability | 0 | 1 | 0.76 | 0.21 | 0 | 0.63 | 0.81 | 0.94 | 1 | ▁▁▂▅▇ |
| Working.Capital.Equity | 0 | 1 | 0.74 | 0.01 | 0 | 0.73 | 0.74 | 0.74 | 1 | ▁▁▁▇▁ |
| Current.Liabilities.Equity | 0 | 1 | 0.33 | 0.01 | 0 | 0.33 | 0.33 | 0.33 | 1 | ▁▇▁▁▁ |
| Long.term.Liability.to.Current.Assets | 0 | 1 | 54160038.14 | 570270621.96 | 0 | 0.00 | 0.00 | 0.01 | 9540000000 | ▇▁▁▁▁ |
| Retained.Earnings.to.Total.Assets | 0 | 1 | 0.93 | 0.03 | 0 | 0.93 | 0.94 | 0.94 | 1 | ▁▁▁▁▇ |
| Total.income.Total.expense | 0 | 1 | 0.00 | 0.01 | 0 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| Total.expense.Assets | 0 | 1 | 0.03 | 0.03 | 0 | 0.01 | 0.02 | 0.04 | 1 | ▇▁▁▁▁ |
| Current.Asset.Turnover.Rate | 0 | 1 | 1195855763.31 | 2821161238.26 | 0 | 0.00 | 0.00 | 0.00 | 10000000000 | ▇▁▁▁▁ |
| Quick.Asset.Turnover.Rate | 0 | 1 | 2163735272.03 | 3374944402.17 | 0 | 0.00 | 0.00 | 4900000000.00 | 10000000000 | ▇▁▁▁▁ |
| Working.capitcal.Turnover.Rate | 0 | 1 | 0.59 | 0.01 | 0 | 0.59 | 0.59 | 0.59 | 1 | ▁▁▇▁▁ |
| Cash.Turnover.Rate | 0 | 1 | 2471976967.44 | 2938623226.68 | 0 | 0.00 | 1080000000.00 | 4510000000.00 | 10000000000 | ▇▂▂▁▁ |
| Cash.Flow.to.Sales | 0 | 1 | 0.67 | 0.01 | 0 | 0.67 | 0.67 | 0.67 | 1 | ▁▁▁▇▁ |
| Fixed.Assets.to.Assets | 0 | 1 | 1220120.50 | 100754158.71 | 0 | 0.09 | 0.20 | 0.37 | 8320000000 | ▇▁▁▁▁ |
| Current.Liability.to.Liability | 0 | 1 | 0.76 | 0.21 | 0 | 0.63 | 0.81 | 0.94 | 1 | ▁▁▂▅▇ |
| Current.Liability.to.Equity | 0 | 1 | 0.33 | 0.01 | 0 | 0.33 | 0.33 | 0.33 | 1 | ▁▇▁▁▁ |
| Equity.to.Long.term.Liability | 0 | 1 | 0.12 | 0.02 | 0 | 0.11 | 0.11 | 0.12 | 1 | ▇▁▁▁▁ |
| Cash.Flow.to.Total.Assets | 0 | 1 | 0.65 | 0.05 | 0 | 0.63 | 0.65 | 0.66 | 1 | ▁▁▁▇▁ |
| Cash.Flow.to.Liability | 0 | 1 | 0.46 | 0.03 | 0 | 0.46 | 0.46 | 0.46 | 1 | ▁▁▇▁▁ |
| CFO.to.Assets | 0 | 1 | 0.59 | 0.06 | 0 | 0.57 | 0.59 | 0.62 | 1 | ▁▁▇▆▁ |
| Cash.Flow.to.Equity | 0 | 1 | 0.32 | 0.01 | 0 | 0.31 | 0.31 | 0.32 | 1 | ▁▇▁▁▁ |
| Current.Liability.to.Current.Assets | 0 | 1 | 0.03 | 0.03 | 0 | 0.02 | 0.03 | 0.04 | 1 | ▇▁▁▁▁ |
| Liability.Assets.Flag | 0 | 1 | 0.00 | 0.03 | 0 | 0.00 | 0.00 | 0.00 | 1 | ▇▁▁▁▁ |
| Net.Income.to.Total.Assets | 0 | 1 | 0.81 | 0.04 | 0 | 0.80 | 0.81 | 0.83 | 1 | ▁▁▁▃▇ |
| Total.assets.to.GNP.price | 0 | 1 | 18629417.81 | 376450059.75 | 0 | 0.00 | 0.00 | 0.01 | 9820000000 | ▇▁▁▁▁ |
| No.credit.Interval | 0 | 1 | 0.62 | 0.01 | 0 | 0.62 | 0.62 | 0.62 | 1 | ▁▁▁▇▁ |
| Gross.Profit.to.Sales | 0 | 1 | 0.61 | 0.02 | 0 | 0.60 | 0.61 | 0.61 | 1 | ▁▁▂▇▁ |
| Net.Income.to.Stockholder.s.Equity | 0 | 1 | 0.84 | 0.01 | 0 | 0.84 | 0.84 | 0.84 | 1 | ▁▁▁▁▇ |
| Liability.to.Equity | 0 | 1 | 0.28 | 0.01 | 0 | 0.28 | 0.28 | 0.28 | 1 | ▁▇▁▁▁ |
| Degree.of.Financial.Leverage..DFL. | 0 | 1 | 0.03 | 0.02 | 0 | 0.03 | 0.03 | 0.03 | 1 | ▇▁▁▁▁ |
| Interest.Coverage.Ratio..Interest.expense.to.EBIT. | 0 | 1 | 0.57 | 0.01 | 0 | 0.57 | 0.57 | 0.57 | 1 | ▁▁▇▁▁ |
| Equity.to.Liability | 0 | 1 | 0.05 | 0.05 | 0 | 0.02 | 0.03 | 0.05 | 1 | ▇▁▁▁▁ |
# Print summary statistics for every column of the dataset
# (quartiles and mean for numeric columns, level counts for the factor).
company_data %>% summary()
## Bankrupt ROA.C..before.interest.and.depreciation.before.interest
## Bankrupt : 220 Min. :0.0000
## No Bankrupt:6599 1st Qu.:0.4765
## Median :0.5027
## Mean :0.5052
## 3rd Qu.:0.5356
## Max. :1.0000
## ROA.A..before.interest.and...after.tax
## Min. :0.0000
## 1st Qu.:0.5355
## Median :0.5598
## Mean :0.5586
## 3rd Qu.:0.5892
## Max. :1.0000
## ROA.B..before.interest.and.depreciation.after.tax Operating.Gross.Margin
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5273 1st Qu.:0.6004
## Median :0.5523 Median :0.6060
## Mean :0.5536 Mean :0.6079
## 3rd Qu.:0.5841 3rd Qu.:0.6139
## Max. :1.0000 Max. :1.0000
## Realized.Sales.Gross.Margin Operating.Profit.Rate Pre.tax.net.Interest.Rate
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6004 1st Qu.:0.9990 1st Qu.:0.7974
## Median :0.6060 Median :0.9990 Median :0.7975
## Mean :0.6079 Mean :0.9988 Mean :0.7972
## 3rd Qu.:0.6138 3rd Qu.:0.9991 3rd Qu.:0.7976
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## After.tax.net.Interest.Rate Non.industry.income.and.expenditure.revenue
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.8093 1st Qu.:0.3035
## Median :0.8094 Median :0.3035
## Mean :0.8091 Mean :0.3036
## 3rd Qu.:0.8095 3rd Qu.:0.3036
## Max. :1.0000 Max. :1.0000
## Continuous.interest.rate..after.tax. Operating.Expense.Rate
## Min. :0.0000 Min. : 0
## 1st Qu.:0.7816 1st Qu.: 0
## Median :0.7816 Median : 0
## Mean :0.7814 Mean :1995347313
## 3rd Qu.:0.7817 3rd Qu.:4145000000
## Max. :1.0000 Max. :9990000000
## Research.and.development.expense.rate Cash.flow.rate
## Min. : 0 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.4616
## Median : 509000000 Median :0.4651
## Mean :1950427306 Mean :0.4674
## 3rd Qu.:3450000000 3rd Qu.:0.4710
## Max. :9980000000 Max. :1.0000
## Interest.bearing.debt.interest.rate Tax.rate..A. Net.Value.Per.Share..B.
## Min. : 0 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.00000 1st Qu.:0.1736
## Median : 0 Median :0.07349 Median :0.1844
## Mean : 16448013 Mean :0.11500 Mean :0.1907
## 3rd Qu.: 0 3rd Qu.:0.20584 3rd Qu.:0.1996
## Max. :990000000 Max. :1.00000 Max. :1.0000
## Net.Value.Per.Share..A. Net.Value.Per.Share..C.
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1736 1st Qu.:0.1737
## Median :0.1844 Median :0.1844
## Mean :0.1906 Mean :0.1907
## 3rd Qu.:0.1996 3rd Qu.:0.1996
## Max. :1.0000 Max. :1.0000
## Persistent.EPS.in.the.Last.Four.Seasons Cash.Flow.Per.Share
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2147 1st Qu.:0.3177
## Median :0.2245 Median :0.3225
## Mean :0.2288 Mean :0.3235
## 3rd Qu.:0.2388 3rd Qu.:0.3286
## Max. :1.0000 Max. :1.0000
## Revenue.Per.Share..Yuan... Operating.Profit.Per.Share..Yuan...
## Min. : 0 Min. :0.00000
## 1st Qu.: 0 1st Qu.:0.09608
## Median : 0 Median :0.10423
## Mean : 1328641 Mean :0.10909
## 3rd Qu.: 0 3rd Qu.:0.11615
## Max. :3020000000 Max. :1.00000
## Per.Share.Net.profit.before.tax..Yuan...
## Min. :0.0000
## 1st Qu.:0.1704
## Median :0.1797
## Mean :0.1844
## 3rd Qu.:0.1935
## Max. :1.0000
## Realized.Sales.Gross.Profit.Growth.Rate Operating.Profit.Growth.Rate
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.02206 1st Qu.:0.8480
## Median :0.02210 Median :0.8480
## Mean :0.02241 Mean :0.8480
## 3rd Qu.:0.02215 3rd Qu.:0.8481
## Max. :1.00000 Max. :1.0000
## After.tax.Net.Profit.Growth.Rate Regular.Net.Profit.Growth.Rate
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6893 1st Qu.:0.6893
## Median :0.6894 Median :0.6894
## Mean :0.6891 Mean :0.6892
## 3rd Qu.:0.6896 3rd Qu.:0.6896
## Max. :1.0000 Max. :1.0000
## Continuous.Net.Profit.Growth.Rate Total.Asset.Growth.Rate
## Min. :0.0000 Min. : 0
## 1st Qu.:0.2176 1st Qu.:4860000000
## Median :0.2176 Median :6400000000
## Mean :0.2176 Mean :5508096595
## 3rd Qu.:0.2176 3rd Qu.:7390000000
## Max. :1.0000 Max. :9990000000
## Net.Value.Growth.Rate Total.Asset.Return.Growth.Rate.Ratio Cash.Reinvestment..
## Min. : 0 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.2638 1st Qu.:0.3747
## Median : 0 Median :0.2640 Median :0.3804
## Mean : 1566212 Mean :0.2642 Mean :0.3797
## 3rd Qu.: 0 3rd Qu.:0.2644 3rd Qu.:0.3867
## Max. :9330000000 Max. :1.0000 Max. :1.0000
## Current.Ratio Quick.Ratio Interest.Expense.Ratio
## Min. : 0 Min. : 0 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0 1st Qu.:0.6306
## Median : 0 Median : 0 Median :0.6307
## Mean : 403285 Mean : 8376595 Mean :0.6310
## 3rd Qu.: 0 3rd Qu.: 0 3rd Qu.:0.6311
## Max. :2750000000 Max. :9230000000 Max. :1.0000
## Total.debt.Total.net.worth Debt.ratio.. Net.worth.Assets
## Min. : 0 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.07289 1st Qu.:0.8512
## Median : 0 Median :0.11141 Median :0.8886
## Mean : 4416337 Mean :0.11318 Mean :0.8868
## 3rd Qu.: 0 3rd Qu.:0.14880 3rd Qu.:0.9271
## Max. :9940000000 Max. :1.00000 Max. :1.0000
## Long.term.fund.suitability.ratio..A. Borrowing.dependency
## Min. :0.000000 Min. :0.0000
## 1st Qu.:0.005244 1st Qu.:0.3702
## Median :0.005665 Median :0.3726
## Mean :0.008783 Mean :0.3747
## 3rd Qu.:0.006847 3rd Qu.:0.3763
## Max. :1.000000 Max. :1.0000
## Contingent.liabilities.Net.worth Operating.profit.Paid.in.capital
## Min. :0.000000 Min. :0.0000
## 1st Qu.:0.005366 1st Qu.:0.0961
## Median :0.005366 Median :0.1041
## Mean :0.005968 Mean :0.1090
## 3rd Qu.:0.005764 3rd Qu.:0.1159
## Max. :1.000000 Max. :1.0000
## Net.profit.before.tax.Paid.in.capital
## Min. :0.0000
## 1st Qu.:0.1694
## Median :0.1785
## Mean :0.1827
## 3rd Qu.:0.1916
## Max. :1.0000
## Inventory.and.accounts.receivable.Net.value Total.Asset.Turnover
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.3974 1st Qu.:0.07646
## Median :0.4001 Median :0.11844
## Mean :0.4025 Mean :0.14161
## 3rd Qu.:0.4046 3rd Qu.:0.17691
## Max. :1.0000 Max. :1.00000
## Accounts.Receivable.Turnover Average.Collection.Days
## Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0
## Mean : 12789705 Mean : 9826221
## 3rd Qu.: 0 3rd Qu.: 0
## Max. :9740000000 Max. :9730000000
## Inventory.Turnover.Rate..times. Fixed.Assets.Turnover.Frequency
## Min. : 0 Min. : 0
## 1st Qu.: 0 1st Qu.: 0
## Median : 0 Median : 0
## Mean :2149106057 Mean :1008595982
## 3rd Qu.:4620000000 3rd Qu.: 0
## Max. :9990000000 Max. :9990000000
## Net.Worth.Turnover.Rate..times. Revenue.per.person
## Min. :0.00000 Min. : 0
## 1st Qu.:0.02177 1st Qu.: 0
## Median :0.02952 Median : 0
## Mean :0.03860 Mean : 2325854
## 3rd Qu.:0.04290 3rd Qu.: 0
## Max. :1.00000 Max. :8810000000
## Operating.profit.per.person Allocation.rate.per.person
## Min. :0.0000 Min. : 0
## 1st Qu.:0.3924 1st Qu.: 0
## Median :0.3959 Median : 0
## Mean :0.4007 Mean : 11255785
## 3rd Qu.:0.4019 3rd Qu.: 0
## Max. :1.0000 Max. :9570000000
## Working.Capital.to.Total.Assets Quick.Assets.Total.Assets
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.7743 1st Qu.:0.2420
## Median :0.8103 Median :0.3865
## Mean :0.8141 Mean :0.4001
## 3rd Qu.:0.8504 3rd Qu.:0.5406
## Max. :1.0000 Max. :1.0000
## Current.Assets.Total.Assets Cash.Total.Assets Quick.Assets.Current.Liability
## Min. :0.0000 Min. :0.00000 Min. : 0
## 1st Qu.:0.3528 1st Qu.:0.03354 1st Qu.: 0
## Median :0.5148 Median :0.07489 Median : 0
## Mean :0.5223 Mean :0.12409 Mean : 3592902
## 3rd Qu.:0.6891 3rd Qu.:0.16107 3rd Qu.: 0
## Max. :1.0000 Max. :1.00000 Max. :8820000000
## Cash.Current.Liability Current.Liability.to.Assets
## Min. : 0 Min. :0.00000
## 1st Qu.: 0 1st Qu.:0.05330
## Median : 0 Median :0.08270
## Mean : 37159994 Mean :0.09067
## 3rd Qu.: 0 3rd Qu.:0.11952
## Max. :9650000000 Max. :1.00000
## Operating.Funds.to.Liability Inventory.Working.Capital
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3410 1st Qu.:0.2770
## Median :0.3486 Median :0.2772
## Mean :0.3538 Mean :0.2774
## 3rd Qu.:0.3609 3rd Qu.:0.2774
## Max. :1.0000 Max. :1.0000
## Inventory.Current.Liability Current.Liabilities.Liability
## Min. : 0 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.6270
## Median : 0 Median :0.8069
## Mean : 55806805 Mean :0.7616
## 3rd Qu.: 0 3rd Qu.:0.9420
## Max. :9910000000 Max. :1.0000
## Working.Capital.Equity Current.Liabilities.Equity
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.7336 1st Qu.:0.3281
## Median :0.7360 Median :0.3297
## Mean :0.7358 Mean :0.3314
## 3rd Qu.:0.7386 3rd Qu.:0.3323
## Max. :1.0000 Max. :1.0000
## Long.term.Liability.to.Current.Assets Retained.Earnings.to.Total.Assets
## Min. : 0 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.9311
## Median : 0 Median :0.9377
## Mean : 54160038 Mean :0.9347
## 3rd Qu.: 0 3rd Qu.:0.9448
## Max. :9540000000 Max. :1.0000
## Total.income.Total.expense Total.expense.Assets Current.Asset.Turnover.Rate
## Min. :0.000000 Min. :0.00000 Min. : 0
## 1st Qu.:0.002236 1st Qu.:0.01457 1st Qu.: 0
## Median :0.002336 Median :0.02267 Median : 0
## Mean :0.002549 Mean :0.02918 Mean : 1195855763
## 3rd Qu.:0.002492 3rd Qu.:0.03593 3rd Qu.: 0
## Max. :1.000000 Max. :1.00000 Max. :10000000000
## Quick.Asset.Turnover.Rate Working.capitcal.Turnover.Rate Cash.Turnover.Rate
## Min. : 0 Min. :0.0000 Min. : 0
## 1st Qu.: 0 1st Qu.:0.5939 1st Qu.: 0
## Median : 0 Median :0.5940 Median : 1080000000
## Mean : 2163735272 Mean :0.5940 Mean : 2471976967
## 3rd Qu.: 4900000000 3rd Qu.:0.5940 3rd Qu.: 4510000000
## Max. :10000000000 Max. :1.0000 Max. :10000000000
## Cash.Flow.to.Sales Fixed.Assets.to.Assets Current.Liability.to.Liability
## Min. :0.0000 Min. : 0 Min. :0.0000
## 1st Qu.:0.6716 1st Qu.: 0 1st Qu.:0.6270
## Median :0.6716 Median : 0 Median :0.8069
## Mean :0.6715 Mean : 1220121 Mean :0.7616
## 3rd Qu.:0.6716 3rd Qu.: 0 3rd Qu.:0.9420
## Max. :1.0000 Max. :8320000000 Max. :1.0000
## Current.Liability.to.Equity Equity.to.Long.term.Liability
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3281 1st Qu.:0.1109
## Median :0.3297 Median :0.1123
## Mean :0.3314 Mean :0.1156
## 3rd Qu.:0.3323 3rd Qu.:0.1171
## Max. :1.0000 Max. :1.0000
## Cash.Flow.to.Total.Assets Cash.Flow.to.Liability CFO.to.Assets
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6333 1st Qu.:0.4571 1st Qu.:0.5660
## Median :0.6454 Median :0.4598 Median :0.5933
## Mean :0.6497 Mean :0.4618 Mean :0.5934
## 3rd Qu.:0.6631 3rd Qu.:0.4642 3rd Qu.:0.6248
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## Cash.Flow.to.Equity Current.Liability.to.Current.Assets Liability.Assets.Flag
## Min. :0.0000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.3130 1st Qu.:0.01803 1st Qu.:0.000000
## Median :0.3150 Median :0.02760 Median :0.000000
## Mean :0.3156 Mean :0.03151 Mean :0.001173
## 3rd Qu.:0.3177 3rd Qu.:0.03837 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.00000 Max. :1.000000
## Net.Income.to.Total.Assets Total.assets.to.GNP.price No.credit.Interval
## Min. :0.0000 Min. : 0 Min. :0.0000
## 1st Qu.:0.7967 1st Qu.: 0 1st Qu.:0.6236
## Median :0.8106 Median : 0 Median :0.6239
## Mean :0.8078 Mean : 18629418 Mean :0.6239
## 3rd Qu.:0.8265 3rd Qu.: 0 3rd Qu.:0.6242
## Max. :1.0000 Max. :9820000000 Max. :1.0000
## Gross.Profit.to.Sales Net.Income.to.Stockholder.s.Equity Liability.to.Equity
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6004 1st Qu.:0.8401 1st Qu.:0.2769
## Median :0.6060 Median :0.8412 Median :0.2788
## Mean :0.6079 Mean :0.8404 Mean :0.2804
## 3rd Qu.:0.6139 3rd Qu.:0.8424 3rd Qu.:0.2814
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## Degree.of.Financial.Leverage..DFL.
## Min. :0.00000
## 1st Qu.:0.02679
## Median :0.02681
## Mean :0.02754
## 3rd Qu.:0.02691
## Max. :1.00000
## Interest.Coverage.Ratio..Interest.expense.to.EBIT. Equity.to.Liability
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.5652 1st Qu.:0.02448
## Median :0.5653 Median :0.03380
## Mean :0.5654 Mean :0.04758
## 3rd Qu.:0.5657 3rd Qu.:0.05284
## Max. :1.0000 Max. :1.00000
# Density plots of the continuous columns, five panels per row.
# A complete theme such as theme_classic() belongs in `ggtheme`;
# `theme_config` is meant for a plain list of theme() settings.
plot_density(
  data = company_data,
  ncol = 5,
  title = "Distribution of predictors",
  ggtheme = theme_classic()
)
# Bar chart of the outcome classes with the count printed on each bar.
# after_stat(count) replaces the deprecated `..count..` notation, and the
# title wording is corrected ("went bankrupt").
ggplot(company_data, aes(Bankrupt)) +
  geom_bar(aes(fill = Bankrupt)) +
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = 1) +
  labs(y = "Number of companies") +
  ggtitle(label = "Distribution of companies that went bankrupt")
# Pairwise correlations among the numeric columns. shave() blanks one
# triangle of the correlation matrix before plotting, and the x-axis labels
# are rotated so the long column names stay readable.
company_data %>%
  select(where(is.numeric)) %>%
  correlate() %>%
  shave() %>%
  rplot(print_cor = FALSE) +
  theme(axis.text.x = element_text(angle = 90, hjust = .1))
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
The dataset has 6,819 observations and 95 variables: 94 predictors, all correctly identified as numeric, and one factor variable, which is the dependent variable. The dataset has no missing values, but some variables contain extreme outliers; for example, the maximum inventory turnover rate is 9,990,000,000, the maximum value of the average collection days is 9,730,000,000, and the maximum value of the current ratio is 2,750,000,000. The dataset also has other problems, such as class imbalance (only about 3.2% of all companies, 220 of 6,819, went bankrupt), high dimensionality, and multicollinearity. With that in mind, a random forest is an appropriate method for this kind of problem; in addition, this method performs a form of automatic feature selection.
# Seeded 80/20 train/test split, stratified on the outcome.
set.seed(1996)
company_split <- initial_split(company_data, prop = 0.80, strata = Bankrupt)
company_train <- training(company_split)
company_test <- testing(company_split)

# Preprocessing recipe built step by step: a correlation filter on the
# numeric predictors (threshold 0.80), then down-sampling based on Bankrupt.
company_recipe <- recipe(formula = Bankrupt ~ ., data = company_train)
company_recipe <- step_corr(
  company_recipe,
  all_numeric_predictors(),
  threshold = 0.80
)
company_recipe <- step_downsample(company_recipe, Bankrupt)

# Estimate the recipe on the training data and print its summary.
tree_prep <- prep(company_recipe)
tree_prep
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 94
##
## ── Training information
## Training data contained 5455 data points and no incomplete rows.
##
## ── Operations
## • Correlation filter on: ROA.A..before.interest.and...after.tax, ... | Trained
## • Down-sampling based on: Bankrupt | Trained
Since one of the goals of this project is to identify the most important predictors of company bankruptcy, highly correlated features are removed from the dataset. In addition, the class imbalance is handled by downsampling the majority class.
# Random forest specification with 350 trees; mtry and min_n are left as
# tune() placeholders to be filled in by the tuning runs.
tune_spec <- rand_forest(mtry = tune(), min_n = tune(), trees = 350)
tune_spec <- set_mode(tune_spec, "classification")
# Permutation importance is requested from ranger for the importance plot.
tune_spec <- set_engine(tune_spec, "ranger", importance = "permutation")
tune_spec
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 350
## min_n = tune()
##
## Engine-Specific Arguments:
## importance = permutation
##
## Computational engine: ranger
# Bundle the preprocessing recipe and the tunable model into one workflow.
tune_workflow <- workflow()
tune_workflow <- add_recipe(tune_workflow, company_recipe)
tune_workflow <- add_model(tune_workflow, tune_spec)
tune_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_corr()
## • step_downsample()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 350
## min_n = tune()
##
## Engine-Specific Arguments:
## importance = permutation
##
## Computational engine: ranger
# 10-fold cross-validation on the training set. The folds are stratified on
# the outcome, matching the stratified initial split, so each fold keeps
# roughly the same share of the rare "Bankrupt" class.
set.seed(2019)
threes_folds <- vfold_cv(company_train, v = 10, strata = Bankrupt)

# Register a parallel backend for the tuning runs.
doParallel::registerDoParallel()

# First pass: let tune_grid() generate 15 candidate (mtry, min_n) pairs.
set.seed(1996)
tune_res <- tune_grid(
  tune_workflow,
  resamples = threes_folds,
  grid = 15
)
## i Creating pre-processing data to finalize unknown parameter: mtry
# Mean ROC AUC of each candidate, shown against min_n and mtry in separate
# panels (each with its own x scale).
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(
    c(min_n, mtry),
    names_to = "parameter",
    values_to = "value"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(
    title = "area under the curve for differents values of min_n and mtry",
    y = "mean roc_auc"
  )
### Tuning hyperparameters
# Second pass: a regular grid with 5 levels per parameter, covering
# mtry in [2, 20] and min_n in [2, 10], evaluated on the same folds.
rf_grid <- grid_regular(
  mtry(range = c(2, 20)),
  min_n(range = c(2, 10)),
  levels = 5
)

set.seed(897)
regular_res <- tune_grid(tune_workflow, resamples = threes_folds, grid = rf_grid)
regular_res
## # Tuning results
## # 10-fold cross-validation
## # A tibble: 10 × 4
## splits id .metrics .notes
## <list> <chr> <list> <list>
## 1 <split [4909/546]> Fold01 <tibble [50 × 6]> <tibble [0 × 3]>
## 2 <split [4909/546]> Fold02 <tibble [50 × 6]> <tibble [0 × 3]>
## 3 <split [4909/546]> Fold03 <tibble [50 × 6]> <tibble [0 × 3]>
## 4 <split [4909/546]> Fold04 <tibble [50 × 6]> <tibble [0 × 3]>
## 5 <split [4909/546]> Fold05 <tibble [50 × 6]> <tibble [0 × 3]>
## 6 <split [4910/545]> Fold06 <tibble [50 × 6]> <tibble [0 × 3]>
## 7 <split [4910/545]> Fold07 <tibble [50 × 6]> <tibble [0 × 3]>
## 8 <split [4910/545]> Fold08 <tibble [50 × 6]> <tibble [0 × 3]>
## 9 <split [4910/545]> Fold09 <tibble [50 × 6]> <tibble [0 × 3]>
## 10 <split [4910/545]> Fold10 <tibble [50 × 6]> <tibble [0 × 3]>
# Mean ROC AUC across the regular grid: one line per min_n value, mtry on
# the x-axis.
regular_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(mtry, mean, color = min_n)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  labs(
    title = "Area Under the Curve for different values of min_n and mtry ",
    y = "mean roc_auc"
  )

# Spread of the per-fold ROC AUC (unsummarized metrics) over all grid
# candidates: minimum, median and maximum for each fold.
regular_res %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "roc_auc") %>%
  group_by(id) %>%
  summarize(
    min_roc_auc = min(.estimate),
    median_roc_auc = median(.estimate),
    max_roc_auc = max(.estimate)
  )
## # A tibble: 10 × 4
## id min_roc_auc median_roc_auc max_roc_auc
## <chr> <dbl> <dbl> <dbl>
## 1 Fold01 0.922 0.927 0.934
## 2 Fold02 0.935 0.940 0.944
## 3 Fold03 0.962 0.964 0.970
## 4 Fold04 0.913 0.919 0.923
## 5 Fold05 0.928 0.935 0.941
## 6 Fold06 0.937 0.941 0.947
## 7 Fold07 0.898 0.907 0.911
## 8 Fold08 0.897 0.906 0.911
## 9 Fold09 0.933 0.941 0.947
## 10 Fold10 0.923 0.928 0.933
The results are fairly consistent across folds, which suggests that the model is not overfitting.
# List the five grid candidates with the best mean ROC AUC.
show_best(regular_res, metric = "roc_auc", n = 5)
## # A tibble: 5 × 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 20 10 roc_auc binary 0.932 10 0.00539 Preprocessor1_Model25
## 2 20 6 roc_auc binary 0.932 10 0.00574 Preprocessor1_Model15
## 3 11 2 roc_auc binary 0.932 10 0.00558 Preprocessor1_Model03
## 4 20 2 roc_auc binary 0.932 10 0.00577 Preprocessor1_Model05
## 5 2 4 roc_auc binary 0.932 10 0.00614 Preprocessor1_Model06
# Select the hyperparameter combination with the best mean ROC AUC.
# `metric` is passed by name: given positionally it lands in `...`, which
# recent versions of tune no longer accept.
best_auc <- select_best(regular_res, metric = "roc_auc")
best_auc
## # A tibble: 1 × 3
## mtry min_n .config
## <int> <int> <chr>
## 1 20 10 Preprocessor1_Model25
# Plug the selected mtry / min_n values into the tunable specification.
final_rf <- finalize_model(tune_spec, best_auc)

# Variable importance: fit the finalized model on the preprocessed training
# data and plot the 15 most important predictors.
# bake(new_data = NULL) returns the training set as processed by the prepped
# recipe and replaces the superseded juice(). The ranger engine with
# permutation importance is already part of the specification, so it is not
# set a second time.
final_rf %>%
  fit(Bankrupt ~ ., data = bake(tree_prep, new_data = NULL)) %>%
  vip(geom = "point", n = 15)

# Final workflow: the same recipe combined with the finalized model.
final_workflow <- workflow() %>%
  add_recipe(company_recipe) %>%
  add_model(final_rf)
final_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
##
## • step_corr()
## • step_downsample()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = 20
## trees = 350
## min_n = 10
##
## Engine-Specific Arguments:
## importance = permutation
##
## Computational engine: ranger
# Fit the final workflow on the training portion of the split and evaluate
# it on the held-out test portion, then report the test-set metrics.
company_last_fit <- last_fit(final_workflow, company_split)
collect_metrics(company_last_fit)
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.861 Preprocessor1_Model1
## 2 roc_auc binary 0.958 Preprocessor1_Model1
# ROC curve for the test-set predictions, using the predicted probability of
# the "Bankrupt" class.
company_last_fit %>%
  collect_predictions() %>%
  roc_curve(truth = Bankrupt, .pred_Bankrupt) %>%
  autoplot()

# Confusion matrix of predicted class versus true class on the test set.
conf_mat(
  collect_predictions(company_last_fit),
  truth = Bankrupt,
  estimate = .pred_class
)
## Truth
## Prediction Bankrupt No Bankrupt
## Bankrupt 35 186
## No Bankrupt 3 1140