# Initial setup
# Session-wide settings for the report:
#   warn   = -1  -> every warning is silently discarded for the rest of the run
#   scipen = 999 -> printing strongly prefers fixed over scientific notation
options(warn = -1, scipen = 999)

Loading libraries

suppressWarnings(suppressPackageStartupMessages({
library(tidymodels)
library(tidyverse)
library(dplyr)
library(rpart)
library(Amelia)
library(corrr)
library(corrplot)
library(DMwR)
library(ROSE)
library(caret)
library(skimr)
library(DataExplorer)
library(themis)
library(vip)
}))

Loading the data

# Read the raw CSV from the user's home directory
company_data <- read.csv("~/Loan Prediction Approval/Bankrupcy Company Prediction/company_bankrupcy.csv")

# The first column holds the outcome; give it a clean name
names(company_data)[1] <- "Bankrupt"

# Recode the 1/0 outcome as a factor. Listing 1 first makes "Bankrupt" the first
# level (presumably so it is treated as the event of interest downstream -- confirm).
company_data$Bankrupt <- factor(
  company_data$Bankrupt,
  levels = c(1, 0),
  labels = c("Bankrupt", "No Bankrupt")
)

# Drop Net.Income.Flag (presumably uninformative for modelling -- confirm)
company_data[["Net.Income.Flag"]] <- NULL

Exploratory Data Analysis

Looking for missing values and structure of the dataset

# Per-column overview: variable type, missing-value count, completeness rate,
# and distribution statistics (mean, sd, percentiles, inline histogram)
skim(company_data)
Data summary
Name company_data
Number of rows 6819
Number of columns 95
_______________________
Column type frequency:
factor 1
numeric 94
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Bankrupt 0 1 FALSE 2 No : 6599, Ban: 220

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
ROA.C..before.interest.and.depreciation.before.interest 0 1 0.51 0.06 0 0.48 0.50 0.54 1 ▁▁▇▁▁
ROA.A..before.interest.and…after.tax 0 1 0.56 0.07 0 0.54 0.56 0.59 1 ▁▁▇▂▁
ROA.B..before.interest.and.depreciation.after.tax 0 1 0.55 0.06 0 0.53 0.55 0.58 1 ▁▁▇▂▁
Operating.Gross.Margin 0 1 0.61 0.02 0 0.60 0.61 0.61 1 ▁▁▂▇▁
Realized.Sales.Gross.Margin 0 1 0.61 0.02 0 0.60 0.61 0.61 1 ▁▁▂▇▁
Operating.Profit.Rate 0 1 1.00 0.01 0 1.00 1.00 1.00 1 ▁▁▁▁▇
Pre.tax.net.Interest.Rate 0 1 0.80 0.01 0 0.80 0.80 0.80 1 ▁▁▁▇▁
After.tax.net.Interest.Rate 0 1 0.81 0.01 0 0.81 0.81 0.81 1 ▁▁▁▁▇
Non.industry.income.and.expenditure.revenue 0 1 0.30 0.01 0 0.30 0.30 0.30 1 ▁▇▁▁▁
Continuous.interest.rate..after.tax. 0 1 0.78 0.01 0 0.78 0.78 0.78 1 ▁▁▁▇▁
Operating.Expense.Rate 0 1 1995347312.80 3237683890.52 0 0.00 0.00 4145000000.00 9990000000 ▇▁▁▁▁
Research.and.development.expense.rate 0 1 1950427306.06 2598291554.00 0 0.00 509000000.00 3450000000.00 9980000000 ▇▂▁▁▁
Cash.flow.rate 0 1 0.47 0.02 0 0.46 0.47 0.47 1 ▁▁▇▁▁
Interest.bearing.debt.interest.rate 0 1 16448012.91 108275033.53 0 0.00 0.00 0.00 990000000 ▇▁▁▁▁
Tax.rate..A. 0 1 0.12 0.14 0 0.00 0.07 0.21 1 ▇▂▁▁▁
Net.Value.Per.Share..B. 0 1 0.19 0.03 0 0.17 0.18 0.20 1 ▇▂▁▁▁
Net.Value.Per.Share..A. 0 1 0.19 0.03 0 0.17 0.18 0.20 1 ▇▂▁▁▁
Net.Value.Per.Share..C. 0 1 0.19 0.03 0 0.17 0.18 0.20 1 ▇▂▁▁▁
Persistent.EPS.in.the.Last.Four.Seasons 0 1 0.23 0.03 0 0.21 0.22 0.24 1 ▁▇▁▁▁
Cash.Flow.Per.Share 0 1 0.32 0.02 0 0.32 0.32 0.33 1 ▁▇▁▁▁
Revenue.Per.Share..Yuan… 0 1 1328640.60 51707089.77 0 0.02 0.03 0.05 3020000000 ▇▁▁▁▁
Operating.Profit.Per.Share..Yuan… 0 1 0.11 0.03 0 0.10 0.10 0.12 1 ▇▁▁▁▁
Per.Share.Net.profit.before.tax..Yuan… 0 1 0.18 0.03 0 0.17 0.18 0.19 1 ▇▂▁▁▁
Realized.Sales.Gross.Profit.Growth.Rate 0 1 0.02 0.01 0 0.02 0.02 0.02 1 ▇▁▁▁▁
Operating.Profit.Growth.Rate 0 1 0.85 0.01 0 0.85 0.85 0.85 1 ▁▁▁▁▇
After.tax.Net.Profit.Growth.Rate 0 1 0.69 0.01 0 0.69 0.69 0.69 1 ▁▁▁▇▁
Regular.Net.Profit.Growth.Rate 0 1 0.69 0.01 0 0.69 0.69 0.69 1 ▁▁▁▇▁
Continuous.Net.Profit.Growth.Rate 0 1 0.22 0.01 0 0.22 0.22 0.22 1 ▁▇▁▁▁
Total.Asset.Growth.Rate 0 1 5508096595.25 2897717771.17 0 4860000000.00 6400000000.00 7390000000.00 9990000000 ▃▁▃▇▂
Net.Value.Growth.Rate 0 1 1566212.06 114159389.52 0 0.00 0.00 0.00 9330000000 ▇▁▁▁▁
Total.Asset.Return.Growth.Rate.Ratio 0 1 0.26 0.01 0 0.26 0.26 0.26 1 ▁▇▁▁▁
Cash.Reinvestment.. 0 1 0.38 0.02 0 0.37 0.38 0.39 1 ▁▇▁▁▁
Current.Ratio 0 1 403284.95 33302155.83 0 0.01 0.01 0.02 2750000000 ▇▁▁▁▁
Quick.Ratio 0 1 8376594.82 244684748.45 0 0.00 0.01 0.01 9230000000 ▇▁▁▁▁
Interest.Expense.Ratio 0 1 0.63 0.01 0 0.63 0.63 0.63 1 ▁▁▁▇▁
Total.debt.Total.net.worth 0 1 4416336.71 168406905.28 0 0.00 0.01 0.01 9940000000 ▇▁▁▁▁
Debt.ratio.. 0 1 0.11 0.05 0 0.07 0.11 0.15 1 ▇▁▁▁▁
Net.worth.Assets 0 1 0.89 0.05 0 0.85 0.89 0.93 1 ▁▁▁▁▇
Long.term.fund.suitability.ratio..A. 0 1 0.01 0.03 0 0.01 0.01 0.01 1 ▇▁▁▁▁
Borrowing.dependency 0 1 0.37 0.02 0 0.37 0.37 0.38 1 ▁▇▁▁▁
Contingent.liabilities.Net.worth 0 1 0.01 0.01 0 0.01 0.01 0.01 1 ▇▁▁▁▁
Operating.profit.Paid.in.capital 0 1 0.11 0.03 0 0.10 0.10 0.12 1 ▇▁▁▁▁
Net.profit.before.tax.Paid.in.capital 0 1 0.18 0.03 0 0.17 0.18 0.19 1 ▇▂▁▁▁
Inventory.and.accounts.receivable.Net.value 0 1 0.40 0.01 0 0.40 0.40 0.40 1 ▁▇▇▁▁
Total.Asset.Turnover 0 1 0.14 0.10 0 0.08 0.12 0.18 1 ▇▂▁▁▁
Accounts.Receivable.Turnover 0 1 12789705.24 278259836.98 0 0.00 0.00 0.00 9740000000 ▇▁▁▁▁
Average.Collection.Days 0 1 9826220.86 256358895.71 0 0.00 0.01 0.01 9730000000 ▇▁▁▁▁
Inventory.Turnover.Rate..times. 0 1 2149106056.61 3247967014.05 0 0.00 0.00 4620000000.00 9990000000 ▇▁▁▁▁
Fixed.Assets.Turnover.Frequency 0 1 1008595981.82 2477557316.92 0 0.00 0.00 0.00 9990000000 ▇▁▁▁▁
Net.Worth.Turnover.Rate..times. 0 1 0.04 0.04 0 0.02 0.03 0.04 1 ▇▁▁▁▁
Revenue.per.person 0 1 2325854.27 136632654.39 0 0.01 0.02 0.04 8810000000 ▇▁▁▁▁
Operating.profit.per.person 0 1 0.40 0.03 0 0.39 0.40 0.40 1 ▁▇▃▁▁
Allocation.rate.per.person 0 1 11255785.32 294506294.12 0 0.00 0.01 0.02 9570000000 ▇▁▁▁▁
Working.Capital.to.Total.Assets 0 1 0.81 0.06 0 0.77 0.81 0.85 1 ▁▁▁▆▇
Quick.Assets.Total.Assets 0 1 0.40 0.20 0 0.24 0.39 0.54 1 ▅▇▇▃▁
Current.Assets.Total.Assets 0 1 0.52 0.22 0 0.35 0.51 0.69 1 ▂▆▇▆▃
Cash.Total.Assets 0 1 0.12 0.14 0 0.03 0.07 0.16 1 ▇▁▁▁▁
Quick.Assets.Current.Liability 0 1 3592902.20 171620908.61 0 0.01 0.01 0.01 8820000000 ▇▁▁▁▁
Cash.Current.Liability 0 1 37159994.15 510350903.16 0 0.00 0.00 0.01 9650000000 ▇▁▁▁▁
Current.Liability.to.Assets 0 1 0.09 0.05 0 0.05 0.08 0.12 1 ▇▁▁▁▁
Operating.Funds.to.Liability 0 1 0.35 0.04 0 0.34 0.35 0.36 1 ▁▇▁▁▁
Inventory.Working.Capital 0 1 0.28 0.01 0 0.28 0.28 0.28 1 ▁▇▁▁▁
Inventory.Current.Liability 0 1 55806804.53 582051554.62 0 0.00 0.01 0.01 9910000000 ▇▁▁▁▁
Current.Liabilities.Liability 0 1 0.76 0.21 0 0.63 0.81 0.94 1 ▁▁▂▅▇
Working.Capital.Equity 0 1 0.74 0.01 0 0.73 0.74 0.74 1 ▁▁▁▇▁
Current.Liabilities.Equity 0 1 0.33 0.01 0 0.33 0.33 0.33 1 ▁▇▁▁▁
Long.term.Liability.to.Current.Assets 0 1 54160038.14 570270621.96 0 0.00 0.00 0.01 9540000000 ▇▁▁▁▁
Retained.Earnings.to.Total.Assets 0 1 0.93 0.03 0 0.93 0.94 0.94 1 ▁▁▁▁▇
Total.income.Total.expense 0 1 0.00 0.01 0 0.00 0.00 0.00 1 ▇▁▁▁▁
Total.expense.Assets 0 1 0.03 0.03 0 0.01 0.02 0.04 1 ▇▁▁▁▁
Current.Asset.Turnover.Rate 0 1 1195855763.31 2821161238.26 0 0.00 0.00 0.00 10000000000 ▇▁▁▁▁
Quick.Asset.Turnover.Rate 0 1 2163735272.03 3374944402.17 0 0.00 0.00 4900000000.00 10000000000 ▇▁▁▁▁
Working.capitcal.Turnover.Rate 0 1 0.59 0.01 0 0.59 0.59 0.59 1 ▁▁▇▁▁
Cash.Turnover.Rate 0 1 2471976967.44 2938623226.68 0 0.00 1080000000.00 4510000000.00 10000000000 ▇▂▂▁▁
Cash.Flow.to.Sales 0 1 0.67 0.01 0 0.67 0.67 0.67 1 ▁▁▁▇▁
Fixed.Assets.to.Assets 0 1 1220120.50 100754158.71 0 0.09 0.20 0.37 8320000000 ▇▁▁▁▁
Current.Liability.to.Liability 0 1 0.76 0.21 0 0.63 0.81 0.94 1 ▁▁▂▅▇
Current.Liability.to.Equity 0 1 0.33 0.01 0 0.33 0.33 0.33 1 ▁▇▁▁▁
Equity.to.Long.term.Liability 0 1 0.12 0.02 0 0.11 0.11 0.12 1 ▇▁▁▁▁
Cash.Flow.to.Total.Assets 0 1 0.65 0.05 0 0.63 0.65 0.66 1 ▁▁▁▇▁
Cash.Flow.to.Liability 0 1 0.46 0.03 0 0.46 0.46 0.46 1 ▁▁▇▁▁
CFO.to.Assets 0 1 0.59 0.06 0 0.57 0.59 0.62 1 ▁▁▇▆▁
Cash.Flow.to.Equity 0 1 0.32 0.01 0 0.31 0.31 0.32 1 ▁▇▁▁▁
Current.Liability.to.Current.Assets 0 1 0.03 0.03 0 0.02 0.03 0.04 1 ▇▁▁▁▁
Liability.Assets.Flag 0 1 0.00 0.03 0 0.00 0.00 0.00 1 ▇▁▁▁▁
Net.Income.to.Total.Assets 0 1 0.81 0.04 0 0.80 0.81 0.83 1 ▁▁▁▃▇
Total.assets.to.GNP.price 0 1 18629417.81 376450059.75 0 0.00 0.00 0.01 9820000000 ▇▁▁▁▁
No.credit.Interval 0 1 0.62 0.01 0 0.62 0.62 0.62 1 ▁▁▁▇▁
Gross.Profit.to.Sales 0 1 0.61 0.02 0 0.60 0.61 0.61 1 ▁▁▂▇▁
Net.Income.to.Stockholder.s.Equity 0 1 0.84 0.01 0 0.84 0.84 0.84 1 ▁▁▁▁▇
Liability.to.Equity 0 1 0.28 0.01 0 0.28 0.28 0.28 1 ▁▇▁▁▁
Degree.of.Financial.Leverage..DFL. 0 1 0.03 0.02 0 0.03 0.03 0.03 1 ▇▁▁▁▁
Interest.Coverage.Ratio..Interest.expense.to.EBIT. 0 1 0.57 0.01 0 0.57 0.57 0.57 1 ▁▁▇▁▁
Equity.to.Liability 0 1 0.05 0.05 0 0.02 0.03 0.05 1 ▇▁▁▁▁

Calculating basic statistics for the dataset

# Basic statistics for every column: class counts for the Bankrupt factor and
# min / quartiles / mean / max for each numeric predictor
summary(company_data)
##         Bankrupt    ROA.C..before.interest.and.depreciation.before.interest
##  Bankrupt   : 220   Min.   :0.0000                                         
##  No Bankrupt:6599   1st Qu.:0.4765                                         
##                     Median :0.5027                                         
##                     Mean   :0.5052                                         
##                     3rd Qu.:0.5356                                         
##                     Max.   :1.0000                                         
##  ROA.A..before.interest.and...after.tax
##  Min.   :0.0000                        
##  1st Qu.:0.5355                        
##  Median :0.5598                        
##  Mean   :0.5586                        
##  3rd Qu.:0.5892                        
##  Max.   :1.0000                        
##  ROA.B..before.interest.and.depreciation.after.tax Operating.Gross.Margin
##  Min.   :0.0000                                    Min.   :0.0000        
##  1st Qu.:0.5273                                    1st Qu.:0.6004        
##  Median :0.5523                                    Median :0.6060        
##  Mean   :0.5536                                    Mean   :0.6079        
##  3rd Qu.:0.5841                                    3rd Qu.:0.6139        
##  Max.   :1.0000                                    Max.   :1.0000        
##  Realized.Sales.Gross.Margin Operating.Profit.Rate Pre.tax.net.Interest.Rate
##  Min.   :0.0000              Min.   :0.0000        Min.   :0.0000           
##  1st Qu.:0.6004              1st Qu.:0.9990        1st Qu.:0.7974           
##  Median :0.6060              Median :0.9990        Median :0.7975           
##  Mean   :0.6079              Mean   :0.9988        Mean   :0.7972           
##  3rd Qu.:0.6138              3rd Qu.:0.9991        3rd Qu.:0.7976           
##  Max.   :1.0000              Max.   :1.0000        Max.   :1.0000           
##  After.tax.net.Interest.Rate Non.industry.income.and.expenditure.revenue
##  Min.   :0.0000              Min.   :0.0000                             
##  1st Qu.:0.8093              1st Qu.:0.3035                             
##  Median :0.8094              Median :0.3035                             
##  Mean   :0.8091              Mean   :0.3036                             
##  3rd Qu.:0.8095              3rd Qu.:0.3036                             
##  Max.   :1.0000              Max.   :1.0000                             
##  Continuous.interest.rate..after.tax. Operating.Expense.Rate
##  Min.   :0.0000                       Min.   :         0    
##  1st Qu.:0.7816                       1st Qu.:         0    
##  Median :0.7816                       Median :         0    
##  Mean   :0.7814                       Mean   :1995347313    
##  3rd Qu.:0.7817                       3rd Qu.:4145000000    
##  Max.   :1.0000                       Max.   :9990000000    
##  Research.and.development.expense.rate Cash.flow.rate  
##  Min.   :         0                    Min.   :0.0000  
##  1st Qu.:         0                    1st Qu.:0.4616  
##  Median : 509000000                    Median :0.4651  
##  Mean   :1950427306                    Mean   :0.4674  
##  3rd Qu.:3450000000                    3rd Qu.:0.4710  
##  Max.   :9980000000                    Max.   :1.0000  
##  Interest.bearing.debt.interest.rate  Tax.rate..A.     Net.Value.Per.Share..B.
##  Min.   :        0                   Min.   :0.00000   Min.   :0.0000         
##  1st Qu.:        0                   1st Qu.:0.00000   1st Qu.:0.1736         
##  Median :        0                   Median :0.07349   Median :0.1844         
##  Mean   : 16448013                   Mean   :0.11500   Mean   :0.1907         
##  3rd Qu.:        0                   3rd Qu.:0.20584   3rd Qu.:0.1996         
##  Max.   :990000000                   Max.   :1.00000   Max.   :1.0000         
##  Net.Value.Per.Share..A. Net.Value.Per.Share..C.
##  Min.   :0.0000          Min.   :0.0000         
##  1st Qu.:0.1736          1st Qu.:0.1737         
##  Median :0.1844          Median :0.1844         
##  Mean   :0.1906          Mean   :0.1907         
##  3rd Qu.:0.1996          3rd Qu.:0.1996         
##  Max.   :1.0000          Max.   :1.0000         
##  Persistent.EPS.in.the.Last.Four.Seasons Cash.Flow.Per.Share
##  Min.   :0.0000                          Min.   :0.0000     
##  1st Qu.:0.2147                          1st Qu.:0.3177     
##  Median :0.2245                          Median :0.3225     
##  Mean   :0.2288                          Mean   :0.3235     
##  3rd Qu.:0.2388                          3rd Qu.:0.3286     
##  Max.   :1.0000                          Max.   :1.0000     
##  Revenue.Per.Share..Yuan... Operating.Profit.Per.Share..Yuan...
##  Min.   :         0         Min.   :0.00000                    
##  1st Qu.:         0         1st Qu.:0.09608                    
##  Median :         0         Median :0.10423                    
##  Mean   :   1328641         Mean   :0.10909                    
##  3rd Qu.:         0         3rd Qu.:0.11615                    
##  Max.   :3020000000         Max.   :1.00000                    
##  Per.Share.Net.profit.before.tax..Yuan...
##  Min.   :0.0000                          
##  1st Qu.:0.1704                          
##  Median :0.1797                          
##  Mean   :0.1844                          
##  3rd Qu.:0.1935                          
##  Max.   :1.0000                          
##  Realized.Sales.Gross.Profit.Growth.Rate Operating.Profit.Growth.Rate
##  Min.   :0.00000                         Min.   :0.0000              
##  1st Qu.:0.02206                         1st Qu.:0.8480              
##  Median :0.02210                         Median :0.8480              
##  Mean   :0.02241                         Mean   :0.8480              
##  3rd Qu.:0.02215                         3rd Qu.:0.8481              
##  Max.   :1.00000                         Max.   :1.0000              
##  After.tax.Net.Profit.Growth.Rate Regular.Net.Profit.Growth.Rate
##  Min.   :0.0000                   Min.   :0.0000                
##  1st Qu.:0.6893                   1st Qu.:0.6893                
##  Median :0.6894                   Median :0.6894                
##  Mean   :0.6891                   Mean   :0.6892                
##  3rd Qu.:0.6896                   3rd Qu.:0.6896                
##  Max.   :1.0000                   Max.   :1.0000                
##  Continuous.Net.Profit.Growth.Rate Total.Asset.Growth.Rate
##  Min.   :0.0000                    Min.   :         0     
##  1st Qu.:0.2176                    1st Qu.:4860000000     
##  Median :0.2176                    Median :6400000000     
##  Mean   :0.2176                    Mean   :5508096595     
##  3rd Qu.:0.2176                    3rd Qu.:7390000000     
##  Max.   :1.0000                    Max.   :9990000000     
##  Net.Value.Growth.Rate Total.Asset.Return.Growth.Rate.Ratio Cash.Reinvestment..
##  Min.   :         0    Min.   :0.0000                       Min.   :0.0000     
##  1st Qu.:         0    1st Qu.:0.2638                       1st Qu.:0.3747     
##  Median :         0    Median :0.2640                       Median :0.3804     
##  Mean   :   1566212    Mean   :0.2642                       Mean   :0.3797     
##  3rd Qu.:         0    3rd Qu.:0.2644                       3rd Qu.:0.3867     
##  Max.   :9330000000    Max.   :1.0000                       Max.   :1.0000     
##  Current.Ratio         Quick.Ratio         Interest.Expense.Ratio
##  Min.   :         0   Min.   :         0   Min.   :0.0000        
##  1st Qu.:         0   1st Qu.:         0   1st Qu.:0.6306        
##  Median :         0   Median :         0   Median :0.6307        
##  Mean   :    403285   Mean   :   8376595   Mean   :0.6310        
##  3rd Qu.:         0   3rd Qu.:         0   3rd Qu.:0.6311        
##  Max.   :2750000000   Max.   :9230000000   Max.   :1.0000        
##  Total.debt.Total.net.worth  Debt.ratio..     Net.worth.Assets
##  Min.   :         0         Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:         0         1st Qu.:0.07289   1st Qu.:0.8512  
##  Median :         0         Median :0.11141   Median :0.8886  
##  Mean   :   4416337         Mean   :0.11318   Mean   :0.8868  
##  3rd Qu.:         0         3rd Qu.:0.14880   3rd Qu.:0.9271  
##  Max.   :9940000000         Max.   :1.00000   Max.   :1.0000  
##  Long.term.fund.suitability.ratio..A. Borrowing.dependency
##  Min.   :0.000000                     Min.   :0.0000      
##  1st Qu.:0.005244                     1st Qu.:0.3702      
##  Median :0.005665                     Median :0.3726      
##  Mean   :0.008783                     Mean   :0.3747      
##  3rd Qu.:0.006847                     3rd Qu.:0.3763      
##  Max.   :1.000000                     Max.   :1.0000      
##  Contingent.liabilities.Net.worth Operating.profit.Paid.in.capital
##  Min.   :0.000000                 Min.   :0.0000                  
##  1st Qu.:0.005366                 1st Qu.:0.0961                  
##  Median :0.005366                 Median :0.1041                  
##  Mean   :0.005968                 Mean   :0.1090                  
##  3rd Qu.:0.005764                 3rd Qu.:0.1159                  
##  Max.   :1.000000                 Max.   :1.0000                  
##  Net.profit.before.tax.Paid.in.capital
##  Min.   :0.0000                       
##  1st Qu.:0.1694                       
##  Median :0.1785                       
##  Mean   :0.1827                       
##  3rd Qu.:0.1916                       
##  Max.   :1.0000                       
##  Inventory.and.accounts.receivable.Net.value Total.Asset.Turnover
##  Min.   :0.0000                              Min.   :0.00000     
##  1st Qu.:0.3974                              1st Qu.:0.07646     
##  Median :0.4001                              Median :0.11844     
##  Mean   :0.4025                              Mean   :0.14161     
##  3rd Qu.:0.4046                              3rd Qu.:0.17691     
##  Max.   :1.0000                              Max.   :1.00000     
##  Accounts.Receivable.Turnover Average.Collection.Days
##  Min.   :         0           Min.   :         0     
##  1st Qu.:         0           1st Qu.:         0     
##  Median :         0           Median :         0     
##  Mean   :  12789705           Mean   :   9826221     
##  3rd Qu.:         0           3rd Qu.:         0     
##  Max.   :9740000000           Max.   :9730000000     
##  Inventory.Turnover.Rate..times. Fixed.Assets.Turnover.Frequency
##  Min.   :         0              Min.   :         0             
##  1st Qu.:         0              1st Qu.:         0             
##  Median :         0              Median :         0             
##  Mean   :2149106057              Mean   :1008595982             
##  3rd Qu.:4620000000              3rd Qu.:         0             
##  Max.   :9990000000              Max.   :9990000000             
##  Net.Worth.Turnover.Rate..times. Revenue.per.person  
##  Min.   :0.00000                 Min.   :         0  
##  1st Qu.:0.02177                 1st Qu.:         0  
##  Median :0.02952                 Median :         0  
##  Mean   :0.03860                 Mean   :   2325854  
##  3rd Qu.:0.04290                 3rd Qu.:         0  
##  Max.   :1.00000                 Max.   :8810000000  
##  Operating.profit.per.person Allocation.rate.per.person
##  Min.   :0.0000              Min.   :         0        
##  1st Qu.:0.3924              1st Qu.:         0        
##  Median :0.3959              Median :         0        
##  Mean   :0.4007              Mean   :  11255785        
##  3rd Qu.:0.4019              3rd Qu.:         0        
##  Max.   :1.0000              Max.   :9570000000        
##  Working.Capital.to.Total.Assets Quick.Assets.Total.Assets
##  Min.   :0.0000                  Min.   :0.0000           
##  1st Qu.:0.7743                  1st Qu.:0.2420           
##  Median :0.8103                  Median :0.3865           
##  Mean   :0.8141                  Mean   :0.4001           
##  3rd Qu.:0.8504                  3rd Qu.:0.5406           
##  Max.   :1.0000                  Max.   :1.0000           
##  Current.Assets.Total.Assets Cash.Total.Assets Quick.Assets.Current.Liability
##  Min.   :0.0000              Min.   :0.00000   Min.   :         0            
##  1st Qu.:0.3528              1st Qu.:0.03354   1st Qu.:         0            
##  Median :0.5148              Median :0.07489   Median :         0            
##  Mean   :0.5223              Mean   :0.12409   Mean   :   3592902            
##  3rd Qu.:0.6891              3rd Qu.:0.16107   3rd Qu.:         0            
##  Max.   :1.0000              Max.   :1.00000   Max.   :8820000000            
##  Cash.Current.Liability Current.Liability.to.Assets
##  Min.   :         0     Min.   :0.00000            
##  1st Qu.:         0     1st Qu.:0.05330            
##  Median :         0     Median :0.08270            
##  Mean   :  37159994     Mean   :0.09067            
##  3rd Qu.:         0     3rd Qu.:0.11952            
##  Max.   :9650000000     Max.   :1.00000            
##  Operating.Funds.to.Liability Inventory.Working.Capital
##  Min.   :0.0000               Min.   :0.0000           
##  1st Qu.:0.3410               1st Qu.:0.2770           
##  Median :0.3486               Median :0.2772           
##  Mean   :0.3538               Mean   :0.2774           
##  3rd Qu.:0.3609               3rd Qu.:0.2774           
##  Max.   :1.0000               Max.   :1.0000           
##  Inventory.Current.Liability Current.Liabilities.Liability
##  Min.   :         0          Min.   :0.0000               
##  1st Qu.:         0          1st Qu.:0.6270               
##  Median :         0          Median :0.8069               
##  Mean   :  55806805          Mean   :0.7616               
##  3rd Qu.:         0          3rd Qu.:0.9420               
##  Max.   :9910000000          Max.   :1.0000               
##  Working.Capital.Equity Current.Liabilities.Equity
##  Min.   :0.0000         Min.   :0.0000            
##  1st Qu.:0.7336         1st Qu.:0.3281            
##  Median :0.7360         Median :0.3297            
##  Mean   :0.7358         Mean   :0.3314            
##  3rd Qu.:0.7386         3rd Qu.:0.3323            
##  Max.   :1.0000         Max.   :1.0000            
##  Long.term.Liability.to.Current.Assets Retained.Earnings.to.Total.Assets
##  Min.   :         0                    Min.   :0.0000                   
##  1st Qu.:         0                    1st Qu.:0.9311                   
##  Median :         0                    Median :0.9377                   
##  Mean   :  54160038                    Mean   :0.9347                   
##  3rd Qu.:         0                    3rd Qu.:0.9448                   
##  Max.   :9540000000                    Max.   :1.0000                   
##  Total.income.Total.expense Total.expense.Assets Current.Asset.Turnover.Rate
##  Min.   :0.000000           Min.   :0.00000      Min.   :          0        
##  1st Qu.:0.002236           1st Qu.:0.01457      1st Qu.:          0        
##  Median :0.002336           Median :0.02267      Median :          0        
##  Mean   :0.002549           Mean   :0.02918      Mean   : 1195855763        
##  3rd Qu.:0.002492           3rd Qu.:0.03593      3rd Qu.:          0        
##  Max.   :1.000000           Max.   :1.00000      Max.   :10000000000        
##  Quick.Asset.Turnover.Rate Working.capitcal.Turnover.Rate Cash.Turnover.Rate   
##  Min.   :          0       Min.   :0.0000                 Min.   :          0  
##  1st Qu.:          0       1st Qu.:0.5939                 1st Qu.:          0  
##  Median :          0       Median :0.5940                 Median : 1080000000  
##  Mean   : 2163735272       Mean   :0.5940                 Mean   : 2471976967  
##  3rd Qu.: 4900000000       3rd Qu.:0.5940                 3rd Qu.: 4510000000  
##  Max.   :10000000000       Max.   :1.0000                 Max.   :10000000000  
##  Cash.Flow.to.Sales Fixed.Assets.to.Assets Current.Liability.to.Liability
##  Min.   :0.0000     Min.   :         0     Min.   :0.0000                
##  1st Qu.:0.6716     1st Qu.:         0     1st Qu.:0.6270                
##  Median :0.6716     Median :         0     Median :0.8069                
##  Mean   :0.6715     Mean   :   1220121     Mean   :0.7616                
##  3rd Qu.:0.6716     3rd Qu.:         0     3rd Qu.:0.9420                
##  Max.   :1.0000     Max.   :8320000000     Max.   :1.0000                
##  Current.Liability.to.Equity Equity.to.Long.term.Liability
##  Min.   :0.0000              Min.   :0.0000               
##  1st Qu.:0.3281              1st Qu.:0.1109               
##  Median :0.3297              Median :0.1123               
##  Mean   :0.3314              Mean   :0.1156               
##  3rd Qu.:0.3323              3rd Qu.:0.1171               
##  Max.   :1.0000              Max.   :1.0000               
##  Cash.Flow.to.Total.Assets Cash.Flow.to.Liability CFO.to.Assets   
##  Min.   :0.0000            Min.   :0.0000         Min.   :0.0000  
##  1st Qu.:0.6333            1st Qu.:0.4571         1st Qu.:0.5660  
##  Median :0.6454            Median :0.4598         Median :0.5933  
##  Mean   :0.6497            Mean   :0.4618         Mean   :0.5934  
##  3rd Qu.:0.6631            3rd Qu.:0.4642         3rd Qu.:0.6248  
##  Max.   :1.0000            Max.   :1.0000         Max.   :1.0000  
##  Cash.Flow.to.Equity Current.Liability.to.Current.Assets Liability.Assets.Flag
##  Min.   :0.0000      Min.   :0.00000                     Min.   :0.000000     
##  1st Qu.:0.3130      1st Qu.:0.01803                     1st Qu.:0.000000     
##  Median :0.3150      Median :0.02760                     Median :0.000000     
##  Mean   :0.3156      Mean   :0.03151                     Mean   :0.001173     
##  3rd Qu.:0.3177      3rd Qu.:0.03837                     3rd Qu.:0.000000     
##  Max.   :1.0000      Max.   :1.00000                     Max.   :1.000000     
##  Net.Income.to.Total.Assets Total.assets.to.GNP.price No.credit.Interval
##  Min.   :0.0000             Min.   :         0        Min.   :0.0000    
##  1st Qu.:0.7967             1st Qu.:         0        1st Qu.:0.6236    
##  Median :0.8106             Median :         0        Median :0.6239    
##  Mean   :0.8078             Mean   :  18629418        Mean   :0.6239    
##  3rd Qu.:0.8265             3rd Qu.:         0        3rd Qu.:0.6242    
##  Max.   :1.0000             Max.   :9820000000        Max.   :1.0000    
##  Gross.Profit.to.Sales Net.Income.to.Stockholder.s.Equity Liability.to.Equity
##  Min.   :0.0000        Min.   :0.0000                     Min.   :0.0000     
##  1st Qu.:0.6004        1st Qu.:0.8401                     1st Qu.:0.2769     
##  Median :0.6060        Median :0.8412                     Median :0.2788     
##  Mean   :0.6079        Mean   :0.8404                     Mean   :0.2804     
##  3rd Qu.:0.6139        3rd Qu.:0.8424                     3rd Qu.:0.2814     
##  Max.   :1.0000        Max.   :1.0000                     Max.   :1.0000     
##  Degree.of.Financial.Leverage..DFL.
##  Min.   :0.00000                   
##  1st Qu.:0.02679                   
##  Median :0.02681                   
##  Mean   :0.02754                   
##  3rd Qu.:0.02691                   
##  Max.   :1.00000                   
##  Interest.Coverage.Ratio..Interest.expense.to.EBIT. Equity.to.Liability
##  Min.   :0.0000                                     Min.   :0.00000    
##  1st Qu.:0.5652                                     1st Qu.:0.02448    
##  Median :0.5653                                     Median :0.03380    
##  Mean   :0.5654                                     Mean   :0.04758    
##  3rd Qu.:0.5657                                     3rd Qu.:0.05284    
##  Max.   :1.0000                                     Max.   :1.00000

Distribution of numeric variables

# Density plot of every continuous column, five panels per row.
# A complete ggplot2 theme must be supplied through `ggtheme`; `theme_config`
# is reserved for a list of individual theme() arguments.
plot_density(
  data = company_data,
  ncol = 5,
  title = "Distribution of predictors",
  ggtheme = theme_classic()
)

Distribution of the dependent variable

# Bar chart of the outcome classes, each bar annotated with its company count
ggplot(company_data, aes(x = Bankrupt)) +
  geom_bar(aes(fill = Bankrupt)) +
  # after_stat(count) replaces the `..count..` notation deprecated in ggplot2 3.4.0
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = 1) +
  labs(y = "Number of companies") +
  ggtitle(label = "Distribution of companies that went bankrupt")

Correlation between numeric variables

# Pairwise correlations among the numeric columns, drawn as a lower-triangle
# dot plot with the x-axis labels rotated so the long names stay legible
company_data %>%
  select(where(is.numeric)) %>%
  correlate() %>%
  shave() %>%
  rplot(print_cor = FALSE) +
  theme(axis.text.x = element_text(angle = 90, hjust = 0.1))
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'

The dataset has 6,819 observations and 95 variables: 94 predictors, all correctly identified as numeric, and one factor variable, which is the dependent variable. The dataset has no missing values, but some variables contain extreme outliers; for example, the maximum inventory turnover rate (the number of times a company sells its inventory in one year) is 9990000000, the maximum value of the average collection days is 9730000000, and the maximum value of the current ratio is 2750000000. The dataset also has other problems: class imbalance, because only 3.2% of all companies (220 of 6,819) went bankrupt, as well as high dimensionality and multicollinearity. With that in mind, a random forest is an appropriate choice for this kind of problem; in addition, this method performs automatic feature selection.

Training, test and validation splits

# Reproducible 80/20 train/test partition, stratified on the outcome so both
# partitions keep a similar share of bankrupt companies
set.seed(1996)
company_split <- initial_split(
  company_data,
  prop = 0.80,
  strata = Bankrupt
)
company_train <- training(company_split)
company_test <- testing(company_split)

Data preprocessing

# Preprocessing recipe, estimated on the training set only:
#   1. filter out numeric predictors involved in pairwise correlations above 0.80
#   2. down-sample on Bankrupt to balance the outcome classes
company_recipe <- recipe(Bankrupt ~ ., data = company_train) %>%
  step_corr(all_numeric_predictors(), threshold = 0.80) %>%
  step_downsample(Bankrupt)

# Estimate the recipe and print a summary of the trained steps
tree_prep <- prep(company_recipe)
tree_prep
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 94
## 
## ── Training information
## Training data contained 5455 data points and no incomplete rows.
## 
## ── Operations
## • Correlation filter on: ROA.A..before.interest.and...after.tax, ... | Trained
## • Down-sampling based on: Bankrupt | Trained

Since one of the goals of this project is to identify the important predictors of company bankruptcy, highly correlated features (pairwise correlation above 0.80) are removed from the dataset. In addition, the class imbalance is handled by downsampling.

Model specification

# Random forest classifier with 350 trees; mtry and min_n are left as tuning
# placeholders. Permutation importance is requested from the ranger engine.
tune_spec <- rand_forest(trees = 350, mtry = tune(), min_n = tune()) %>%
  set_engine("ranger", importance = "permutation") %>%
  set_mode("classification")

tune_spec
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 350
##   min_n = tune()
## 
## Engine-Specific Arguments:
##   importance = permutation
## 
## Computational engine: ranger
# Bundle the preprocessing recipe and the tunable model into a single workflow
tune_workflow <- workflow() %>%
  add_model(tune_spec) %>%
  add_recipe(company_recipe)

tune_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_corr()
## • step_downsample()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 350
##   min_n = tune()
## 
## Engine-Specific Arguments:
##   importance = permutation
## 
## Computational engine: ranger

Training hyperparameters

# 10-fold cross-validation on the training set.
# The folds are stratified on the outcome, consistent with the stratified
# train/test split: only about 3% of the companies are bankrupt, so
# unstratified folds could contain very uneven numbers of positive cases and
# give noisier per-fold metrics.
set.seed(2019)
threes_folds <- vfold_cv(company_train, v = 10, strata = Bankrupt)

# Register a parallel backend, then run a first tuning pass over 15 candidate
# (mtry, min_n) combinations generated by tune_grid(), evaluated on the
# cross-validation folds.
doParallel::registerDoParallel()
set.seed(1996)
tune_res <- tune_workflow %>%
  tune_grid(resamples = threes_folds, grid = 15)
## i Creating pre-processing data to finalize unknown parameter: mtry
# Plot the mean cross-validated ROC AUC against each tuned hyperparameter,
# one facet per parameter, to see which value ranges look promising.
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(
    cols = min_n:mtry,
    names_to = "parameter",
    values_to = "value"
  ) %>%
  ggplot(aes(x = value, y = mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(
    title = "area under the curve for differents values of min_n and mtry",
    y = "mean roc_auc"
  )

### Tuning hyperparameters

# Regular grid for the second tuning pass: 5 levels each of mtry (2 to 20)
# and min_n (2 to 10), i.e. 25 candidate combinations.
rf_grid <- grid_regular(
  mtry(range = c(2, 20)),
  min_n(range = c(2, 10)),
  levels = 5
)

# Evaluate every combination in rf_grid on the same cross-validation folds.
set.seed(897)
regular_res <- tune_workflow %>%
  tune_grid(resamples = threes_folds, grid = rf_grid)

regular_res
## # Tuning results
## # 10-fold cross-validation 
## # A tibble: 10 × 4
##    splits             id     .metrics          .notes          
##    <list>             <chr>  <list>            <list>          
##  1 <split [4909/546]> Fold01 <tibble [50 × 6]> <tibble [0 × 3]>
##  2 <split [4909/546]> Fold02 <tibble [50 × 6]> <tibble [0 × 3]>
##  3 <split [4909/546]> Fold03 <tibble [50 × 6]> <tibble [0 × 3]>
##  4 <split [4909/546]> Fold04 <tibble [50 × 6]> <tibble [0 × 3]>
##  5 <split [4909/546]> Fold05 <tibble [50 × 6]> <tibble [0 × 3]>
##  6 <split [4910/545]> Fold06 <tibble [50 × 6]> <tibble [0 × 3]>
##  7 <split [4910/545]> Fold07 <tibble [50 × 6]> <tibble [0 × 3]>
##  8 <split [4910/545]> Fold08 <tibble [50 × 6]> <tibble [0 × 3]>
##  9 <split [4910/545]> Fold09 <tibble [50 × 6]> <tibble [0 × 3]>
## 10 <split [4910/545]> Fold10 <tibble [50 × 6]> <tibble [0 × 3]>

Exploring tuning results

# Mean cross-validated ROC AUC as a function of mtry, one line per min_n value.
regular_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(x = mtry, y = mean, color = min_n)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  labs(
    title = "Area Under the Curve for different values of min_n and mtry ",
    y = "mean roc_auc"
  )

# Per-fold spread of ROC AUC over all candidate models: the unsummarized
# metrics are grouped by fold id and reduced to their min, median and max.
regular_res %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "roc_auc") %>%
  group_by(id) %>%
  summarise(
    min_roc_auc = min(.estimate),
    median_roc_auc = median(.estimate),
    max_roc_auc = max(.estimate)
  )
## # A tibble: 10 × 4
##    id     min_roc_auc median_roc_auc max_roc_auc
##    <chr>        <dbl>          <dbl>       <dbl>
##  1 Fold01       0.922          0.927       0.934
##  2 Fold02       0.935          0.940       0.944
##  3 Fold03       0.962          0.964       0.970
##  4 Fold04       0.913          0.919       0.923
##  5 Fold05       0.928          0.935       0.941
##  6 Fold06       0.937          0.941       0.947
##  7 Fold07       0.898          0.907       0.911
##  8 Fold08       0.897          0.906       0.911
##  9 Fold09       0.933          0.941       0.947
## 10 Fold10       0.923          0.928       0.933

The ROC AUC values are fairly consistent across folds (roughly 0.90 to 0.97), which suggests that model performance is stable across resamples and shows no obvious sign of overfitting.

Viewing the best performing models

# Show the five candidate models with the highest mean ROC AUC.
show_best(regular_res, metric = "roc_auc", n = 5)
## # A tibble: 5 × 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    20    10 roc_auc binary     0.932    10 0.00539 Preprocessor1_Model25
## 2    20     6 roc_auc binary     0.932    10 0.00574 Preprocessor1_Model15
## 3    11     2 roc_auc binary     0.932    10 0.00558 Preprocessor1_Model03
## 4    20     2 roc_auc binary     0.932    10 0.00577 Preprocessor1_Model05
## 5     2     4 roc_auc binary     0.932    10 0.00614 Preprocessor1_Model06

Selecting the best model

# Pick the hyperparameter combination with the highest mean ROC AUC.
# The metric is passed by name, as in the show_best() call: recent tune
# releases deprecate supplying it positionally, and the named form works in
# older releases too.
best_auc <- select_best(regular_res, metric = "roc_auc")
best_auc
## # A tibble: 1 × 3
##    mtry min_n .config              
##   <int> <int> <chr>                
## 1    20    10 Preprocessor1_Model25
# Replace the tune() placeholders in the model specification with the
# selected hyperparameter values.
final_rf <- tune_spec %>%
  finalize_model(best_auc)

Variable importance

# Fit the finalized model on the preprocessed training data and plot the 15
# most important predictors by permutation importance.
# set_engine() restates the engine settings already declared on tune_spec.
# The number of features is passed as `num_features` rather than `n`, which
# only worked through partial argument matching.
final_rf %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(Bankrupt ~ ., data = juice(tree_prep)) %>%
  vip(geom = "point", num_features = 15)

Last fit

# Workflow combining the preprocessing recipe with the finalized model.
final_workflow <- workflow() %>%
  add_model(final_rf) %>%
  add_recipe(company_recipe)

final_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 2 Recipe Steps
## 
## • step_corr()
## • step_downsample()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
## 
## Main Arguments:
##   mtry = 20
##   trees = 350
##   min_n = 10
## 
## Engine-Specific Arguments:
##   importance = permutation
## 
## Computational engine: ranger
# Fit the final workflow on the training portion of the split and evaluate it
# on the held-out test portion.
company_last_fit <- last_fit(final_workflow, company_split)

Viewing performance metrics for the last fit

# Test-set performance metrics of the final model.
collect_metrics(company_last_fit)
## # A tibble: 2 × 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.861 Preprocessor1_Model1
## 2 roc_auc  binary         0.958 Preprocessor1_Model1
# ROC curve of the test-set predictions, using the predicted probability of
# the "Bankrupt" class.
company_last_fit %>%
  collect_predictions() %>%
  roc_curve(truth = Bankrupt, .pred_Bankrupt) %>%
  autoplot()

# Confusion matrix of predicted versus true classes on the test set.
company_last_fit %>%
  collect_predictions() %>%
  conf_mat(truth = Bankrupt, estimate = .pred_class)
##              Truth
## Prediction    Bankrupt No Bankrupt
##   Bankrupt          35         186
##   No Bankrupt        3        1140