The data were collected from the Taiwan Economic Journal for the years 1999 to 2009. Company bankruptcy was defined based on the business regulations of the Taiwan Stock Exchange.
The dataset is available on Kaggle: https://www.kaggle.com/fedesoriano/company-bankruptcy-prediction
library(ROCR)
library(ggplot2)
library(caTools)
library(class)
library(pROC)
library(imbalance)
library(rpart)
library(pillar)
library(caret)
library(dplyr)
library(tibble)
library(car)
library(olsrr)
library(performanceEstimation)
library(randomForest)
library(RRF)
library(xgboost)
# Read the bankruptcy dataset from CSV, then list every column with its type
# and first few values.
bank <- read.csv(file = "bankrupt dataset-classification.csv")
bank %>% glimpse()
## Rows: 6,819
## Columns: 96
## $ Bankrupt <int> 1, 1, 1, 1, 1,~
## $ ROA.C..before.interest.and.depreciation.before.interest <dbl> 0.3705943, 0.4~
## $ ROA.A..before.interest.and...after.tax <dbl> 0.4243894, 0.5~
## $ ROA.B..before.interest.and.depreciation.after.tax <dbl> 0.4057498, 0.5~
## $ Operating.Gross.Margin <dbl> 0.6014572, 0.6~
## $ Realized.Sales.Gross.Margin <dbl> 0.6014572, 0.6~
## $ Operating.Profit.Rate <dbl> 0.9989692, 0.9~
## $ Pre.tax.net.Interest.Rate <dbl> 0.7968871, 0.7~
## $ After.tax.net.Interest.Rate <dbl> 0.8088094, 0.8~
## $ Non.industry.income.and.expenditure.revenue <dbl> 0.3026464, 0.3~
## $ Continuous.interest.rate..after.tax. <dbl> 0.7809848, 0.7~
## $ Operating.Expense.Rate <dbl> 1.25697e-04, 2~
## $ Research.and.development.expense.rate <dbl> 0.00e+00, 0.00~
## $ Cash.flow.rate <dbl> 0.4581431, 0.4~
## $ Interest.bearing.debt.interest.rate <dbl> 0.000725073, 0~
## $ Tax.rate..A. <dbl> 0.000000000, 0~
## $ Net.Value.Per.Share..B. <dbl> 0.1479499, 0.1~
## $ Net.Value.Per.Share..A. <dbl> 0.1479499, 0.1~
## $ Net.Value.Per.Share..C. <dbl> 0.1479499, 0.1~
## $ Persistent.EPS.in.the.Last.Four.Seasons <dbl> 0.1691406, 0.2~
## $ Cash.Flow.Per.Share <dbl> 0.3116644, 0.3~
## $ Revenue.Per.Share..Yuan.Â.. <dbl> 0.017559780, 0~
## $ Operating.Profit.Per.Share..Yuan.Â.. <dbl> 0.09592053, 0.~
## $ Per.Share.Net.profit.before.tax..Yuan.Â.. <dbl> 0.1387362, 0.1~
## $ Realized.Sales.Gross.Profit.Growth.Rate <dbl> 0.02210228, 0.~
## $ Operating.Profit.Growth.Rate <dbl> 0.8481950, 0.8~
## $ After.tax.Net.Profit.Growth.Rate <dbl> 0.6889795, 0.6~
## $ Regular.Net.Profit.Growth.Rate <dbl> 0.6889795, 0.6~
## $ Continuous.Net.Profit.Growth.Rate <dbl> 0.2175354, 0.2~
## $ Total.Asset.Growth.Rate <dbl> 4.98e+09, 6.11~
## $ Net.Value.Growth.Rate <dbl> 0.000326977, 0~
## $ Total.Asset.Return.Growth.Rate.Ratio <dbl> 0.2631000, 0.2~
## $ Cash.Reinvestment.. <dbl> 0.3637253, 0.3~
## $ Current.Ratio <dbl> 0.002258963, 0~
## $ Quick.Ratio <dbl> 0.001207755, 0~
## $ Interest.Expense.Ratio <dbl> 0.6299513, 0.6~
## $ Total.debt.Total.net.worth <dbl> 0.021265924, 0~
## $ Debt.ratio.. <dbl> 0.20757626, 0.~
## $ Net.worth.Assets <dbl> 0.7924237, 0.8~
## $ Long.term.fund.suitability.ratio..A. <dbl> 0.005024455, 0~
## $ Borrowing.dependency <dbl> 0.3902844, 0.3~
## $ Contingent.liabilities.Net.worth <dbl> 0.006478502, 0~
## $ Operating.profit.Paid.in.capital <dbl> 0.09588483, 0.~
## $ Net.profit.before.tax.Paid.in.capital <dbl> 0.1377573, 0.1~
## $ Inventory.and.accounts.receivable.Net.value <dbl> 0.3980357, 0.3~
## $ Total.Asset.Turnover <dbl> 0.08695652, 0.~
## $ Accounts.Receivable.Turnover <dbl> 0.001813884, 0~
## $ Average.Collection.Days <dbl> 0.003487364, 0~
## $ Inventory.Turnover.Rate..times. <dbl> 1.82093e-04, 9~
## $ Fixed.Assets.Turnover.Frequency <dbl> 1.16501e-04, 7~
## $ Net.Worth.Turnover.Rate..times. <dbl> 0.03290323, 0.~
## $ Revenue.per.person <dbl> 0.034164182, 0~
## $ Operating.profit.per.person <dbl> 0.3929129, 0.3~
## $ Allocation.rate.per.person <dbl> 0.037135302, 0~
## $ Working.Capital.to.Total.Assets <dbl> 0.6727753, 0.7~
## $ Quick.Assets.Total.Assets <dbl> 0.16667296, 0.~
## $ Current.Assets.Total.Assets <dbl> 0.1906430, 0.1~
## $ Cash.Total.Assets <dbl> 0.004094406, 0~
## $ Quick.Assets.Current.Liability <dbl> 0.001996771, 0~
## $ Cash.Current.Liability <dbl> 1.473360e-04, ~
## $ Current.Liability.to.Assets <dbl> 0.14730845, 0.~
## $ Operating.Funds.to.Liability <dbl> 0.3340152, 0.3~
## $ Inventory.Working.Capital <dbl> 0.2769202, 0.2~
## $ Inventory.Current.Liability <dbl> 0.001035990, 0~
## $ Current.Liabilities.Liability <dbl> 0.6762692, 0.3~
## $ Working.Capital.Equity <dbl> 0.7212746, 0.7~
## $ Current.Liabilities.Equity <dbl> 0.3390770, 0.3~
## $ Long.term.Liability.to.Current.Assets <dbl> 0.025592368, 0~
## $ Retained.Earnings.to.Total.Assets <dbl> 0.9032248, 0.9~
## $ Total.income.Total.expense <dbl> 0.002021613, 0~
## $ Total.expense.Assets <dbl> 0.064855708, 0~
## $ Current.Asset.Turnover.Rate <dbl> 7.010000e+08, ~
## $ Quick.Asset.Turnover.Rate <dbl> 6.550000e+09, ~
## $ Working.capitcal.Turnover.Rate <dbl> 0.5938305, 0.5~
## $ Cash.Turnover.Rate <dbl> 4.58000e+08, 2~
## $ Cash.Flow.to.Sales <dbl> 0.6715677, 0.6~
## $ Fixed.Assets.to.Assets <dbl> 0.4242058, 0.4~
## $ Current.Liability.to.Liability <dbl> 0.6762692, 0.3~
## $ Current.Liability.to.Equity <dbl> 0.3390770, 0.3~
## $ Equity.to.Long.term.Liability <dbl> 0.1265495, 0.1~
## $ Cash.Flow.to.Total.Assets <dbl> 0.6375554, 0.6~
## $ Cash.Flow.to.Liability <dbl> 0.4586091, 0.4~
## $ CFO.to.Assets <dbl> 0.5203819, 0.5~
## $ Cash.Flow.to.Equity <dbl> 0.3129049, 0.3~
## $ Current.Liability.to.Current.Assets <dbl> 0.11825048, 0.~
## $ Liability.Assets.Flag <int> 0, 0, 0, 0, 0,~
## $ Net.Income.to.Total.Assets <dbl> 0.7168453, 0.7~
## $ Total.assets.to.GNP.price <dbl> 0.009219440, 0~
## $ No.credit.Interval <dbl> 0.6228790, 0.6~
## $ Gross.Profit.to.Sales <dbl> 0.6014533, 0.6~
## $ Net.Income.to.Stockholder.s.Equity <dbl> 0.8278902, 0.8~
## $ Liability.to.Equity <dbl> 0.2902019, 0.2~
## $ Degree.of.Financial.Leverage..DFL. <dbl> 0.02660063, 0.~
## $ Interest.Coverage.Ratio..Interest.expense.to.EBIT. <dbl> 0.5640501, 0.5~
## $ Net.Income.Flag <int> 1, 1, 1, 1, 1,~
## $ Equity.to.Liability <dbl> 0.01646874, 0.~
So far, it is clear that, apart from three integer-coded categorical features (including the dependent variable Bankrupt), all other variables are numeric.
# Total number of NA cells in the data frame, counted over the whole frame
# at once.
print(paste0("Number of missing values in the dataset : ", sum(is.na(bank))))
## [1] "Number of missing values in the dataset : 0"
# Number of rows in each class of the target column.
table(bank[["Bankrupt"]])
##
## 0 1
## 6599 220
# Share of rows in each class of the target column (counts / total).
prop.table(table(bank[["Bankrupt"]]))
##
## 0 1
## 0.9677372 0.0322628
# Bar chart of the target's class balance, with the row count printed above
# each bar.
ggplot(data = bank, aes(x = Bankrupt, fill = factor(Bankrupt))) +
  geom_bar(stat = "count") +
  ggtitle("Distribution of firms with respect to Bankruptcy") +
  theme_bw() +
  # One axis break per distinct class value instead of one per data row.
  scale_x_continuous(breaks = sort(unique(bank$Bankrupt))) +
  # after_stat(count) replaces the deprecated ..count.. notation.
  geom_text(aes(label = after_stat(count)), stat = "count", vjust = -0.4) +
  labs(fill = "Bankrupt?")
The dataset is highly imbalanced: the positive class (Bankrupt = 1) accounts for a mere 3.23% of all observations. This is not an ideal situation for further analysis, so the classes need to be balanced before the models are fitted.
# Bar chart of Liability.Assets.Flag (legend suppressed), followed by the
# raw counts per flag value.
ggplot(data = bank, aes(x = Liability.Assets.Flag)) +
  geom_bar(aes(fill = factor(Liability.Assets.Flag))) +
  theme_bw() +
  # One axis break per distinct flag value instead of one per data row.
  scale_x_continuous(breaks = sort(unique(bank$Liability.Assets.Flag))) +
  guides(fill = "none")
table(bank$Liability.Assets.Flag)
##
## 0 1
## 6811 8
The Liability Assets Flag depicts the status of a firm and takes up the value 1 if the total liability exceeds total assets and 0 otherwise. The bargraph above clearly shows that most of the companies possess more assets than their liabilities.
# Liability.Assets.Flag counts split by bankruptcy status (side-by-side bars).
ggplot(data = bank, aes(x = Liability.Assets.Flag)) +
  geom_bar(aes(fill = factor(Bankrupt)), position = "dodge") +
  theme_bw() +
  # One axis break per distinct flag value instead of one per data row.
  scale_x_continuous(breaks = sort(unique(bank$Liability.Assets.Flag))) +
  labs(fill = "Bankrupt?")
Even though it looks like some organizations go bankrupt while having more assets, the proportion is relatively very small.
# Bar chart of Net.Income.Flag (legend suppressed), followed by the raw
# counts per flag value.
ggplot(data = bank, aes(x = Net.Income.Flag)) +
  geom_bar(aes(fill = factor(Net.Income.Flag))) +
  theme_bw() +
  # One axis break per distinct flag value instead of one per data row.
  scale_x_continuous(breaks = sort(unique(bank$Net.Income.Flag))) +
  guides(fill = "none")
table(bank$Net.Income.Flag)
##
## 1
## 6819
It is clear from the graph that all of the organizations in the data are suffering from loss for the last two years.
# Net.Income.Flag counts split by bankruptcy status (side-by-side bars).
ggplot(data = bank, aes(x = Net.Income.Flag)) +
  geom_bar(aes(fill = factor(Bankrupt)), position = "dodge") +
  theme_bw() +
  # One axis break per distinct flag value instead of one per data row.
  scale_x_continuous(breaks = sort(unique(bank$Net.Income.Flag))) +
  labs(fill = "Bankrupt?")
Now, here is an interesting thing to notice, that although all of the firms were suffering from loss, most of them managed to secure themselves from going bankrupt.
The data contain two kinds of variables, continuous numeric ratios and integer-coded flags, so it is convenient to split the dataset by type before analysing the relationships among the variables, if any exist.
# Store the two indicator columns as factors.
bank[["Liability.Assets.Flag"]] <- as.factor(bank[["Liability.Assets.Flag"]])
bank[["Net.Income.Flag"]] <- as.factor(bank[["Net.Income.Flag"]])

# Split the columns: target + flags on one side, every other column on the
# other.
bank.fac <- bank[, c('Bankrupt', 'Liability.Assets.Flag', 'Net.Income.Flag')]
myvars <- names(bank) %in% c('Bankrupt', 'Liability.Assets.Flag', 'Net.Income.Flag')
bank.num <- bank[, !myvars, drop = FALSE]

# Pairwise correlations among the remaining columns, rounded to two decimals,
# then the names of the columns findCorrelation() flags at the 0.70 cutoff.
cor.matrix <- round(as.matrix(cor(bank.num)), digits = 2)
bank.numcor <- findCorrelation(cor.matrix, names = TRUE, cutoff = 0.70)
bank.numcor
## [1] "ROA.A..before.interest.and...after.tax"
## [2] "ROA.C..before.interest.and.depreciation.before.interest"
## [3] "ROA.B..before.interest.and.depreciation.after.tax"
## [4] "Net.Income.to.Total.Assets"
## [5] "Persistent.EPS.in.the.Last.Four.Seasons"
## [6] "Per.Share.Net.profit.before.tax..Yuan.Â.."
## [7] "Net.profit.before.tax.Paid.in.capital"
## [8] "Operating.Profit.Per.Share..Yuan.Â.."
## [9] "Net.Value.Per.Share..C."
## [10] "Net.Value.Per.Share..A."
## [11] "Working.Capital.to.Total.Assets"
## [12] "Debt.ratio.."
## [13] "Net.worth.Assets"
## [14] "CFO.to.Assets"
## [15] "Operating.Funds.to.Liability"
## [16] "Borrowing.dependency"
## [17] "Current.Liabilities.Equity"
## [18] "Current.Liability.to.Equity"
## [19] "Liability.to.Equity"
## [20] "Operating.Gross.Margin"
## [21] "Gross.Profit.to.Sales"
## [22] "Quick.Assets.Total.Assets"
## [23] "Total.Asset.Turnover"
## [24] "Working.Capital.Equity"
## [25] "Cash.Flow.to.Total.Assets"
## [26] "Current.Liabilities.Liability"
## [27] "After.tax.net.Interest.Rate"
## [28] "Pre.tax.net.Interest.Rate"
## [29] "Operating.Profit.Rate"
## [30] "After.tax.Net.Profit.Growth.Rate"
## [31] "Non.industry.income.and.expenditure.revenue"
## [32] "Cash.Flow.to.Sales"
findCorrelation flags 32 variables using the 0.70 cutoff on the pairwise correlation matrix, which indicates substantial multicollinearity among the numeric predictors.
# Correlation of each column of bank.num with the target, rounded to two
# decimals and printed as a one-column matrix.
cor1 <- round(cor(bank.num, bank$Bankrupt), digits = 2)
print(cor1)
## [,1]
## ROA.C..before.interest.and.depreciation.before.interest -0.26
## ROA.A..before.interest.and...after.tax -0.28
## ROA.B..before.interest.and.depreciation.after.tax -0.27
## Operating.Gross.Margin -0.10
## Realized.Sales.Gross.Margin -0.10
## Operating.Profit.Rate 0.00
## Pre.tax.net.Interest.Rate -0.01
## After.tax.net.Interest.Rate -0.01
## Non.industry.income.and.expenditure.revenue -0.02
## Continuous.interest.rate..after.tax. -0.01
## Operating.Expense.Rate -0.01
## Research.and.development.expense.rate -0.02
## Cash.flow.rate -0.07
## Interest.bearing.debt.interest.rate -0.02
## Tax.rate..A. -0.11
## Net.Value.Per.Share..B. -0.17
## Net.Value.Per.Share..A. -0.17
## Net.Value.Per.Share..C. -0.16
## Persistent.EPS.in.the.Last.Four.Seasons -0.22
## Cash.Flow.Per.Share -0.08
## Revenue.Per.Share..Yuan.Â.. 0.00
## Operating.Profit.Per.Share..Yuan.Â.. -0.14
## Per.Share.Net.profit.before.tax..Yuan.Â.. -0.20
## Realized.Sales.Gross.Profit.Growth.Rate 0.00
## Operating.Profit.Growth.Rate -0.02
## After.tax.Net.Profit.Growth.Rate -0.04
## Regular.Net.Profit.Growth.Rate -0.04
## Continuous.Net.Profit.Growth.Rate -0.01
## Total.Asset.Growth.Rate -0.04
## Net.Value.Growth.Rate 0.07
## Total.Asset.Return.Growth.Rate.Ratio -0.02
## Cash.Reinvestment.. -0.05
## Current.Ratio 0.00
## Quick.Ratio 0.03
## Interest.Expense.Ratio 0.00
## Total.debt.Total.net.worth 0.01
## Debt.ratio.. 0.25
## Net.worth.Assets -0.25
## Long.term.fund.suitability.ratio..A. 0.02
## Borrowing.dependency 0.18
## Contingent.liabilities.Net.worth 0.07
## Operating.profit.Paid.in.capital -0.14
## Net.profit.before.tax.Paid.in.capital -0.21
## Inventory.and.accounts.receivable.Net.value 0.08
## Total.Asset.Turnover -0.07
## Accounts.Receivable.Turnover 0.00
## Average.Collection.Days -0.01
## Inventory.Turnover.Rate..times. 0.00
## Fixed.Assets.Turnover.Frequency 0.07
## Net.Worth.Turnover.Rate..times. 0.02
## Revenue.per.person 0.04
## Operating.profit.per.person -0.09
## Allocation.rate.per.person 0.00
## Working.Capital.to.Total.Assets -0.19
## Quick.Assets.Total.Assets -0.09
## Current.Assets.Total.Assets -0.04
## Cash.Total.Assets -0.10
## Quick.Assets.Current.Liability 0.00
## Cash.Current.Liability 0.08
## Current.Liability.to.Assets 0.19
## Operating.Funds.to.Liability -0.08
## Inventory.Working.Capital 0.00
## Inventory.Current.Liability 0.00
## Current.Liabilities.Liability -0.02
## Working.Capital.Equity -0.15
## Current.Liabilities.Equity 0.15
## Long.term.Liability.to.Current.Assets 0.00
## Retained.Earnings.to.Total.Assets -0.22
## Total.income.Total.expense -0.01
## Total.expense.Assets 0.14
## Current.Asset.Turnover.Rate 0.01
## Quick.Asset.Turnover.Rate 0.03
## Working.capitcal.Turnover.Rate 0.00
## Cash.Turnover.Rate -0.02
## Cash.Flow.to.Sales 0.00
## Fixed.Assets.to.Assets 0.07
## Current.Liability.to.Liability -0.02
## Current.Liability.to.Equity 0.15
## Equity.to.Long.term.Liability 0.14
## Cash.Flow.to.Total.Assets -0.07
## Cash.Flow.to.Liability -0.04
## CFO.to.Assets -0.12
## Cash.Flow.to.Equity -0.06
## Current.Liability.to.Current.Assets 0.17
## Net.Income.to.Total.Assets -0.32
## Total.assets.to.GNP.price 0.04
## No.credit.Interval -0.01
## Gross.Profit.to.Sales -0.10
## Net.Income.to.Stockholder.s.Equity -0.18
## Liability.to.Equity 0.17
## Degree.of.Financial.Leverage..DFL. 0.01
## Interest.Coverage.Ratio..Interest.expense.to.EBIT. -0.01
## Equity.to.Liability -0.08
# Report the largest and smallest correlation with the target.
print(
  paste0(
    "Maximum correlation coefficient : ", range(cor1)[2],
    " and Minimum correlation coefficient : ", range(cor1)[1]
  )
)
## [1] "Maximum correlation coefficient : 0.25 and Minimum correlation coefficient : -0.32"
The numeric variables are only weakly correlated with the target variable: as the correlation matrix shows, every coefficient lies between -0.32 and 0.25.
# Look for exact linear dependencies among the columns of bank.num; the
# result lists each dependent set and the column indexes suggested for removal.
bank.linear <- findLinearCombos(x = bank.num)
print(bank.linear)
## $linearCombos
## $linearCombos[[1]]
## [1] 60 4 6 7 9 37 38 54 56 88
##
## $linearCombos[[2]]
## [1] 77 64
##
## $linearCombos[[3]]
## [1] 78 66
##
##
## $remove
## [1] 60 77 78
Clearly, there exist some linear relations among the variables and as the function suggests to remove 3 variables with indexes 60, 77 and 78, it will be dealt with below.
# Per-column variance diagnostics, shown with zero-variance columns first,
# then near-zero-variance columns, then by decreasing frequency ratio.
bank.nzv <- nearZeroVar(bank, saveMetrics = TRUE)
bank.nzv %>%
  rownames_to_column(var = "variable") %>%
  arrange(desc(zeroVar), desc(nzv), desc(freqRatio))
## variable freqRatio
## 1 Net.Income.Flag 0.000000
## 2 Liability.Assets.Flag 851.375000
## 3 Bankrupt 29.995455
## 4 Long.term.Liability.to.Current.Assets 1284.500000
## 5 Equity.to.Long.term.Liability 1284.500000
## 6 Contingent.liabilities.Net.worth 594.000000
## 7 Tax.rate..A. 366.857143
## 8 Current.Liabilities.Liability 193.000000
## 9 Current.Liability.to.Liability 193.000000
## 10 Degree.of.Financial.Leverage..DFL. 116.400000
## 11 Inventory.Current.Liability 113.500000
## 12 Borrowing.dependency 76.866667
## 13 Research.and.development.expense.rate 64.727273
## 14 Inventory.Working.Capital 57.000000
## 15 Interest.bearing.debt.interest.rate 29.700000
## 16 Interest.Coverage.Ratio..Interest.expense.to.EBIT. 19.612903
## 17 Inventory.Turnover.Rate..times. 13.312500
## 18 Interest.Expense.Ratio 11.303571
## 19 Realized.Sales.Gross.Profit.Growth.Rate 6.400000
## 20 Fixed.Assets.to.Assets 6.000000
## 21 Allocation.rate.per.person 3.000000
## 22 Long.term.fund.suitability.ratio..A. 2.000000
## 23 Cash.Flow.to.Total.Assets 2.000000
## 24 Gross.Profit.to.Sales 2.000000
## 25 Inventory.and.accounts.receivable.Net.value 1.500000
## 26 Retained.Earnings.to.Total.Assets 1.500000
## 27 Regular.Net.Profit.Growth.Rate 1.400000
## 28 Average.Collection.Days 1.400000
## 29 Current.Liabilities.Equity 1.333333
## 30 Current.Liability.to.Equity 1.333333
## 31 Quick.Ratio 1.250000
## 32 Quick.Asset.Turnover.Rate 1.222222
## 33 Continuous.Net.Profit.Growth.Rate 1.200000
## 34 Revenue.per.person 1.200000
## 35 No.credit.Interval 1.200000
## 36 Persistent.EPS.in.the.Last.Four.Seasons 1.185185
## 37 Operating.profit.per.person 1.181818
## 38 Operating.profit.Paid.in.capital 1.142857
## 39 Net.profit.before.tax.Paid.in.capital 1.142857
## 40 Current.Asset.Turnover.Rate 1.142857
## 41 Revenue.Per.Share..Yuan.Â.. 1.125000
## 42 Net.Worth.Turnover.Rate..times. 1.125000
## 43 Pre.tax.net.Interest.Rate 1.111111
## 44 Per.Share.Net.profit.before.tax..Yuan.Â.. 1.103448
## 45 Continuous.interest.rate..after.tax. 1.100000
## 46 ROA.C..before.interest.and.depreciation.before.interest 1.090909
## 47 Operating.Profit.Rate 1.090909
## 48 Cash.Turnover.Rate 1.090909
## 49 Cash.Flow.to.Sales 1.088235
## 50 Working.capitcal.Turnover.Rate 1.083333
## 51 Net.Value.Per.Share..B. 1.076923
## 52 Net.Value.Per.Share..A. 1.076923
## 53 Net.Value.Per.Share..C. 1.076923
## 54 Fixed.Assets.Turnover.Frequency 1.052632
## 55 Operating.Profit.Per.Share..Yuan.Â.. 1.035714
## 56 Total.Asset.Turnover 1.027027
## 57 ROA.A..before.interest.and...after.tax 1.000000
## 58 ROA.B..before.interest.and.depreciation.after.tax 1.000000
## 59 Operating.Gross.Margin 1.000000
## 60 Realized.Sales.Gross.Margin 1.000000
## 61 After.tax.net.Interest.Rate 1.000000
## 62 Non.industry.income.and.expenditure.revenue 1.000000
## 63 Operating.Expense.Rate 1.000000
## 64 Cash.flow.rate 1.000000
## 65 Cash.Flow.Per.Share 1.000000
## 66 Operating.Profit.Growth.Rate 1.000000
## 67 After.tax.Net.Profit.Growth.Rate 1.000000
## 68 Total.Asset.Growth.Rate 1.000000
## 69 Net.Value.Growth.Rate 1.000000
## 70 Total.Asset.Return.Growth.Rate.Ratio 1.000000
## 71 Cash.Reinvestment.. 1.000000
## 72 Current.Ratio 1.000000
## 73 Total.debt.Total.net.worth 1.000000
## 74 Debt.ratio.. 1.000000
## 75 Net.worth.Assets 1.000000
## 76 Accounts.Receivable.Turnover 1.000000
## 77 Working.Capital.to.Total.Assets 1.000000
## 78 Quick.Assets.Total.Assets 1.000000
## 79 Current.Assets.Total.Assets 1.000000
## 80 Cash.Total.Assets 1.000000
## 81 Quick.Assets.Current.Liability 1.000000
## 82 Cash.Current.Liability 1.000000
## 83 Current.Liability.to.Assets 1.000000
## 84 Operating.Funds.to.Liability 1.000000
## 85 Working.Capital.Equity 1.000000
## 86 Total.income.Total.expense 1.000000
## 87 Total.expense.Assets 1.000000
## 88 Cash.Flow.to.Liability 1.000000
## 89 CFO.to.Assets 1.000000
## 90 Cash.Flow.to.Equity 1.000000
## 91 Current.Liability.to.Current.Assets 1.000000
## 92 Net.Income.to.Total.Assets 1.000000
## 93 Total.assets.to.GNP.price 1.000000
## 94 Net.Income.to.Stockholder.s.Equity 1.000000
## 95 Liability.to.Equity 1.000000
## 96 Equity.to.Liability 1.000000
## percentUnique zeroVar nzv
## 1 0.01466491 TRUE TRUE
## 2 0.02932981 FALSE TRUE
## 3 0.02932981 FALSE TRUE
## 4 62.28185951 FALSE FALSE
## 5 62.25252970 FALSE FALSE
## 6 27.20340226 FALSE FALSE
## 7 36.48628831 FALSE FALSE
## 8 97.18433788 FALSE FALSE
## 9 97.18433788 FALSE FALSE
## 10 89.54392140 FALSE FALSE
## 11 96.59774160 FALSE FALSE
## 12 63.61636604 FALSE FALSE
## 13 22.52529696 FALSE FALSE
## 14 93.51811116 FALSE FALSE
## 15 15.83809943 FALSE FALSE
## 16 73.14855551 FALSE FALSE
## 17 35.15178179 FALSE FALSE
## 18 55.63865669 FALSE FALSE
## 19 81.87417510 FALSE FALSE
## 20 99.92667547 FALSE FALSE
## 21 99.25208975 FALSE FALSE
## 22 95.65918756 FALSE FALSE
## 23 99.98533509 FALSE FALSE
## 24 98.91479689 FALSE FALSE
## 25 77.56269248 FALSE FALSE
## 26 98.84147236 FALSE FALSE
## 27 76.46282446 FALSE FALSE
## 28 79.93840739 FALSE FALSE
## 29 96.24578384 FALSE FALSE
## 30 96.24578384 FALSE FALSE
## 31 89.36794251 FALSE FALSE
## 32 77.97330987 FALSE FALSE
## 33 78.38392726 FALSE FALSE
## 34 83.10602728 FALSE FALSE
## 35 83.31133597 FALSE FALSE
## 36 19.91494354 FALSE FALSE
## 37 44.33201349 FALSE FALSE
## 38 64.86288312 FALSE FALSE
## 39 70.17157941 FALSE FALSE
## 40 90.92242264 FALSE FALSE
## 41 55.82930048 FALSE FALSE
## 42 10.86669600 FALSE FALSE
## 43 55.56533216 FALSE FALSE
## 44 22.31998827 FALSE FALSE
## 45 53.04296818 FALSE FALSE
## 46 48.87813462 FALSE FALSE
## 47 49.50872562 FALSE FALSE
## 48 58.86493621 FALSE FALSE
## 49 23.96245784 FALSE FALSE
## 50 36.07567092 FALSE FALSE
## 51 33.40665787 FALSE FALSE
## 52 33.50931222 FALSE FALSE
## 53 33.49464731 FALSE FALSE
## 54 35.94368676 FALSE FALSE
## 55 18.12582490 FALSE FALSE
## 56 5.58732952 FALSE FALSE
## 57 46.20912157 FALSE FALSE
## 58 46.34110573 FALSE FALSE
## 59 55.44801291 FALSE FALSE
## 60 55.55066725 FALSE FALSE
## 61 52.85232439 FALSE FALSE
## 62 37.41017745 FALSE FALSE
## 63 43.49611380 FALSE FALSE
## 64 81.49288752 FALSE FALSE
## 65 22.65728113 FALSE FALSE
## 66 56.07860390 FALSE FALSE
## 67 76.41882974 FALSE FALSE
## 68 25.67825194 FALSE FALSE
## 69 66.02141076 FALSE FALSE
## 70 42.57222467 FALSE FALSE
## 71 52.77899985 FALSE FALSE
## 72 89.92520897 FALSE FALSE
## 73 80.92095615 FALSE FALSE
## 74 61.70992814 FALSE FALSE
## 75 61.70992814 FALSE FALSE
## 76 23.36119666 FALSE FALSE
## 77 99.89734565 FALSE FALSE
## 78 100.00000000 FALSE FALSE
## 79 100.00000000 FALSE FALSE
## 80 100.00000000 FALSE FALSE
## 81 99.97067019 FALSE FALSE
## 82 99.86801584 FALSE FALSE
## 83 100.00000000 FALSE FALSE
## 84 99.85335093 FALSE FALSE
## 85 97.56562546 FALSE FALSE
## 86 99.56005279 FALSE FALSE
## 87 100.00000000 FALSE FALSE
## 88 99.85335093 FALSE FALSE
## 89 100.00000000 FALSE FALSE
## 90 99.75069658 FALSE FALSE
## 91 100.00000000 FALSE FALSE
## 92 99.53072298 FALSE FALSE
## 93 99.95600528 FALSE FALSE
## 94 94.67663880 FALSE FALSE
## 95 96.52441707 FALSE FALSE
## 96 100.00000000 FALSE FALSE
Both the dummy variables ‘Net.Income.Flag’ and ‘Liability.Assets.Flag’ have zero or near-zero variance.
# Names of the columns flagged as zero or near-zero variance.
nearZeroVar(x = bank, names = TRUE)
## [1] "Bankrupt" "Liability.Assets.Flag" "Net.Income.Flag"
# Build the modelling frame from `bank` without the two (near-)constant flag
# columns. `new_bank` is not created anywhere earlier in the visible script,
# so it has to be defined here rather than assigned into.
new_bank <- bank[, !names(bank) %in% c("Liability.Assets.Flag", "Net.Income.Flag"),
                 drop = FALSE]
Removing mutually correlated variables as identified by the ‘findCorrelation’ function
# Drop the predictors flagged by findCorrelation() from `new_bank` itself.
# The logical mask is built from names(new_bank), so it has to index
# new_bank: applying it to `bank` can misalign the mask with the columns and
# discards any columns already removed from new_bank.
initialvar <- names(new_bank)
new_bank <- new_bank[, !initialvar %in% bank.numcor, drop = FALSE]
Now, before moving to the final stage of building models on the data, let’s use VIF and Backward elimination approaches to see if any more variables need to be removed
# Ordinary least squares fit of the target on every remaining column of
# new_bank, followed by the coefficient table.
new.ols <- lm(formula = Bankrupt ~ ., data = new_bank)
summary(new.ols)
##
## Call:
## lm(formula = Bankrupt ~ ., data = new_bank)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68293 -0.04881 -0.01812 0.00999 1.04608
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value
## (Intercept) -1.894e+00 4.600e-01 -4.117
## Realized.Sales.Gross.Margin 4.446e-01 3.751e+00 0.119
## Continuous.interest.rate..after.tax. 3.322e-01 1.591e-01 2.089
## Operating.Expense.Rate 9.930e-13 6.676e-13 1.487
## Research.and.development.expense.rate 5.997e-13 7.694e-13 0.779
## Cash.flow.rate -2.993e-01 2.827e-01 -1.059
## Interest.bearing.debt.interest.rate 9.515e-12 1.833e-11 0.519
## Tax.rate..A. -4.939e-02 1.493e-02 -3.309
## Net.Value.Per.Share..B. -1.177e-01 8.440e-02 -1.395
## Cash.Flow.Per.Share -5.583e-01 1.932e-01 -2.890
## Revenue.Per.Share..Yuan.Â.. -9.191e-11 4.473e-11 -2.055
## Realized.Sales.Gross.Profit.Growth.Rate 1.796e-02 1.645e-01 0.109
## Operating.Profit.Growth.Rate -1.017e-01 2.477e-01 -0.411
## Regular.Net.Profit.Growth.Rate 1.999e-01 1.880e-01 1.063
## Continuous.Net.Profit.Growth.Rate 4.861e-02 1.968e-01 0.247
## Total.Asset.Growth.Rate -8.686e-13 7.012e-13 -1.239
## Net.Value.Growth.Rate 8.522e-11 1.771e-11 4.813
## Total.Asset.Return.Growth.Rate.Ratio -1.427e-01 2.285e-01 -0.625
## Cash.Reinvestment.. 3.613e-01 1.478e-01 2.444
## Current.Ratio -2.047e-10 6.533e-11 -3.134
## Quick.Ratio 6.930e-12 8.147e-12 0.851
## Interest.Expense.Ratio -7.150e-02 1.748e-01 -0.409
## Total.debt.Total.net.worth 2.531e-12 1.297e-11 0.195
## Long.term.fund.suitability.ratio..A. 7.124e-02 8.361e-02 0.852
## Contingent.liabilities.Net.worth -3.254e+00 3.730e-01 -8.725
## Operating.profit.Paid.in.capital 6.027e-01 1.168e-01 5.160
## Inventory.and.accounts.receivable.Net.value -3.884e+00 3.962e-01 -9.804
## Accounts.Receivable.Turnover -4.590e-12 7.753e-12 -0.592
## Average.Collection.Days -6.249e-12 8.670e-12 -0.721
## Inventory.Turnover.Rate..times. -7.513e-13 6.284e-13 -1.196
## Fixed.Assets.Turnover.Frequency 2.163e-12 8.909e-13 2.428
## Net.Worth.Turnover.Rate..times. -2.384e-01 7.947e-02 -3.000
## Revenue.per.person 4.616e-11 1.536e-11 3.006
## Operating.profit.per.person -1.525e-01 7.237e-02 -2.107
## Allocation.rate.per.person -5.384e-12 7.123e-12 -0.756
## Current.Assets.Total.Assets 1.267e-01 1.547e-02 8.186
## Cash.Total.Assets -1.014e-01 2.064e-02 -4.912
## Quick.Assets.Current.Liability 2.814e-12 1.247e-11 0.226
## Cash.Current.Liability 1.903e-11 3.927e-12 4.848
## Operating.Funds.to.Liability 4.830e-01 1.342e-01 3.600
## Inventory.Working.Capital 1.477e-02 1.941e-01 0.076
## Current.Liabilities.Equity -6.252e-01 6.003e-01 -1.041
## Long.term.Liability.to.Current.Assets 2.983e-12 3.450e-12 0.865
## Retained.Earnings.to.Total.Assets 8.318e-01 1.392e-01 5.977
## Total.income.Total.expense -9.163e-02 1.857e-01 -0.493
## Total.expense.Assets 1.009e-01 1.163e-01 0.867
## Current.Asset.Turnover.Rate 1.053e-13 8.224e-13 0.128
## Quick.Asset.Turnover.Rate 4.201e-13 6.691e-13 0.628
## Working.capitcal.Turnover.Rate 5.818e-01 2.318e-01 2.510
## Fixed.Assets.to.Assets 1.038e-10 2.137e-11 4.856
## Current.Liability.to.Equity NA NA NA
## Cash.Flow.to.Total.Assets 1.649e-01 8.344e-02 1.977
## Cash.Flow.to.Liability -1.442e-01 1.052e-01 -1.371
## Cash.Flow.to.Equity -6.154e-01 2.287e-01 -2.691
## Current.Liability.to.Current.Assets 7.609e-01 8.883e-02 8.566
## Net.Income.to.Total.Assets -1.876e+00 1.053e-01 -17.820
## No.credit.Interval 4.502e-02 1.602e-01 0.281
## Gross.Profit.to.Sales -3.028e-01 3.749e+00 -0.081
## Net.Income.to.Stockholder.s.Equity 2.095e+00 3.275e-01 6.397
## Liability.to.Equity 7.513e+00 7.745e-01 9.700
## Pr(>|t|)
## (Intercept) 3.88e-05 ***
## Realized.Sales.Gross.Margin 0.905639
## Continuous.interest.rate..after.tax. 0.036776 *
## Operating.Expense.Rate 0.136939
## Research.and.development.expense.rate 0.435723
## Cash.flow.rate 0.289822
## Interest.bearing.debt.interest.rate 0.603754
## Tax.rate..A. 0.000942 ***
## Net.Value.Per.Share..B. 0.163211
## Cash.Flow.Per.Share 0.003867 **
## Revenue.Per.Share..Yuan.Â.. 0.039921 *
## Realized.Sales.Gross.Profit.Growth.Rate 0.913072
## Operating.Profit.Growth.Rate 0.681342
## Regular.Net.Profit.Growth.Rate 0.287658
## Continuous.Net.Profit.Growth.Rate 0.804907
## Total.Asset.Growth.Rate 0.215512
## Net.Value.Growth.Rate 1.52e-06 ***
## Total.Asset.Return.Growth.Rate.Ratio 0.532261
## Cash.Reinvestment.. 0.014560 *
## Current.Ratio 0.001733 **
## Quick.Ratio 0.395004
## Interest.Expense.Ratio 0.682492
## Total.debt.Total.net.worth 0.845319
## Long.term.fund.suitability.ratio..A. 0.394213
## Contingent.liabilities.Net.worth < 2e-16 ***
## Operating.profit.Paid.in.capital 2.54e-07 ***
## Inventory.and.accounts.receivable.Net.value < 2e-16 ***
## Accounts.Receivable.Turnover 0.553878
## Average.Collection.Days 0.471126
## Inventory.Turnover.Rate..times. 0.231871
## Fixed.Assets.Turnover.Frequency 0.015229 *
## Net.Worth.Turnover.Rate..times. 0.002709 **
## Revenue.per.person 0.002661 **
## Operating.profit.per.person 0.035180 *
## Allocation.rate.per.person 0.449736
## Current.Assets.Total.Assets 3.20e-16 ***
## Cash.Total.Assets 9.20e-07 ***
## Quick.Assets.Current.Liability 0.821483
## Cash.Current.Liability 1.28e-06 ***
## Operating.Funds.to.Liability 0.000321 ***
## Inventory.Working.Capital 0.939331
## Current.Liabilities.Equity 0.297760
## Long.term.Liability.to.Current.Assets 0.387247
## Retained.Earnings.to.Total.Assets 2.38e-09 ***
## Total.income.Total.expense 0.621746
## Total.expense.Assets 0.386001
## Current.Asset.Turnover.Rate 0.898165
## Quick.Asset.Turnover.Rate 0.530138
## Working.capitcal.Turnover.Rate 0.012108 *
## Fixed.Assets.to.Assets 1.22e-06 ***
## Current.Liability.to.Equity NA
## Cash.Flow.to.Total.Assets 0.048135 *
## Cash.Flow.to.Liability 0.170555
## Cash.Flow.to.Equity 0.007134 **
## Current.Liability.to.Current.Assets < 2e-16 ***
## Net.Income.to.Total.Assets < 2e-16 ***
## No.credit.Interval 0.778697
## Gross.Profit.to.Sales 0.935627
## Net.Income.to.Stockholder.s.Equity 1.69e-10 ***
## Liability.to.Equity < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1618 on 6760 degrees of freedom
## Multiple R-squared: 0.1691, Adjusted R-squared: 0.162
## F-statistic: 23.72 on 58 and 6760 DF, p-value: < 2.2e-16
As clearly visible from the summary of above model, one variable named “Current.Liability.to.Equity” produces NA. It might be due to perfect collinearity, and so let’s investigate further.
# Show which model terms are exact linear combinations of other terms.
alias(object = new.ols)
## Model :
## Bankrupt ~ Realized.Sales.Gross.Margin + Continuous.interest.rate..after.tax. +
## Operating.Expense.Rate + Research.and.development.expense.rate +
## Cash.flow.rate + Interest.bearing.debt.interest.rate + Tax.rate..A. +
## Net.Value.Per.Share..B. + Cash.Flow.Per.Share + Revenue.Per.Share..Yuan.Â.. +
## Realized.Sales.Gross.Profit.Growth.Rate + Operating.Profit.Growth.Rate +
## Regular.Net.Profit.Growth.Rate + Continuous.Net.Profit.Growth.Rate +
## Total.Asset.Growth.Rate + Net.Value.Growth.Rate + Total.Asset.Return.Growth.Rate.Ratio +
## Cash.Reinvestment.. + Current.Ratio + Quick.Ratio + Interest.Expense.Ratio +
## Total.debt.Total.net.worth + Long.term.fund.suitability.ratio..A. +
## Contingent.liabilities.Net.worth + Operating.profit.Paid.in.capital +
## Inventory.and.accounts.receivable.Net.value + Accounts.Receivable.Turnover +
## Average.Collection.Days + Inventory.Turnover.Rate..times. +
## Fixed.Assets.Turnover.Frequency + Net.Worth.Turnover.Rate..times. +
## Revenue.per.person + Operating.profit.per.person + Allocation.rate.per.person +
## Current.Assets.Total.Assets + Cash.Total.Assets + Quick.Assets.Current.Liability +
## Cash.Current.Liability + Operating.Funds.to.Liability + Inventory.Working.Capital +
## Current.Liabilities.Equity + Long.term.Liability.to.Current.Assets +
## Retained.Earnings.to.Total.Assets + Total.income.Total.expense +
## Total.expense.Assets + Current.Asset.Turnover.Rate + Quick.Asset.Turnover.Rate +
## Working.capitcal.Turnover.Rate + Fixed.Assets.to.Assets +
## Current.Liability.to.Equity + Cash.Flow.to.Total.Assets +
## Cash.Flow.to.Liability + Cash.Flow.to.Equity + Current.Liability.to.Current.Assets +
## Net.Income.to.Total.Assets + No.credit.Interval + Gross.Profit.to.Sales +
## Net.Income.to.Stockholder.s.Equity + Liability.to.Equity
##
## Complete :
## (Intercept) Realized.Sales.Gross.Margin
## Current.Liability.to.Equity 0 0
## Continuous.interest.rate..after.tax.
## Current.Liability.to.Equity 0
## Operating.Expense.Rate
## Current.Liability.to.Equity 0
## Research.and.development.expense.rate
## Current.Liability.to.Equity 0
## Cash.flow.rate Interest.bearing.debt.interest.rate
## Current.Liability.to.Equity 0 0
## Tax.rate..A. Net.Value.Per.Share..B.
## Current.Liability.to.Equity 0 0
## Cash.Flow.Per.Share Revenue.Per.Share..Yuan.Â..
## Current.Liability.to.Equity 0 0
## Realized.Sales.Gross.Profit.Growth.Rate
## Current.Liability.to.Equity 0
## Operating.Profit.Growth.Rate
## Current.Liability.to.Equity 0
## Regular.Net.Profit.Growth.Rate
## Current.Liability.to.Equity 0
## Continuous.Net.Profit.Growth.Rate
## Current.Liability.to.Equity 0
## Total.Asset.Growth.Rate Net.Value.Growth.Rate
## Current.Liability.to.Equity 0 0
## Total.Asset.Return.Growth.Rate.Ratio
## Current.Liability.to.Equity 0
## Cash.Reinvestment.. Current.Ratio Quick.Ratio
## Current.Liability.to.Equity 0 0 0
## Interest.Expense.Ratio Total.debt.Total.net.worth
## Current.Liability.to.Equity 0 0
## Long.term.fund.suitability.ratio..A.
## Current.Liability.to.Equity 0
## Contingent.liabilities.Net.worth
## Current.Liability.to.Equity 0
## Operating.profit.Paid.in.capital
## Current.Liability.to.Equity 0
## Inventory.and.accounts.receivable.Net.value
## Current.Liability.to.Equity 0
## Accounts.Receivable.Turnover
## Current.Liability.to.Equity 0
## Average.Collection.Days
## Current.Liability.to.Equity 0
## Inventory.Turnover.Rate..times.
## Current.Liability.to.Equity 0
## Fixed.Assets.Turnover.Frequency
## Current.Liability.to.Equity 0
## Net.Worth.Turnover.Rate..times. Revenue.per.person
## Current.Liability.to.Equity 0 0
## Operating.profit.per.person
## Current.Liability.to.Equity 0
## Allocation.rate.per.person
## Current.Liability.to.Equity 0
## Current.Assets.Total.Assets Cash.Total.Assets
## Current.Liability.to.Equity 0 0
## Quick.Assets.Current.Liability
## Current.Liability.to.Equity 0
## Cash.Current.Liability Operating.Funds.to.Liability
## Current.Liability.to.Equity 0 0
## Inventory.Working.Capital
## Current.Liability.to.Equity 0
## Current.Liabilities.Equity
## Current.Liability.to.Equity 1
## Long.term.Liability.to.Current.Assets
## Current.Liability.to.Equity 0
## Retained.Earnings.to.Total.Assets
## Current.Liability.to.Equity 0
## Total.income.Total.expense Total.expense.Assets
## Current.Liability.to.Equity 0 0
## Current.Asset.Turnover.Rate
## Current.Liability.to.Equity 0
## Quick.Asset.Turnover.Rate
## Current.Liability.to.Equity 0
## Working.capitcal.Turnover.Rate
## Current.Liability.to.Equity 0
## Fixed.Assets.to.Assets Cash.Flow.to.Total.Assets
## Current.Liability.to.Equity 0 0
## Cash.Flow.to.Liability Cash.Flow.to.Equity
## Current.Liability.to.Equity 0 0
## Current.Liability.to.Current.Assets
## Current.Liability.to.Equity 0
## Net.Income.to.Total.Assets No.credit.Interval
## Current.Liability.to.Equity 0 0
## Gross.Profit.to.Sales
## Current.Liability.to.Equity 0
## Net.Income.to.Stockholder.s.Equity
## Current.Liability.to.Equity 0
## Liability.to.Equity
## Current.Liability.to.Equity 0
As expected, “Current.Liability.to.Equity” has perfect collinearity with another predictor named “Current.Liabilities.Equity”.
# Remove the aliased predictor reported above, refit the linear model on all
# remaining columns, and print its coefficient table.
new_bank[["Current.Liability.to.Equity"]] <- NULL
new.ols <- lm(formula = Bankrupt ~ ., data = new_bank)
summary(new.ols)
##
## Call:
## lm(formula = Bankrupt ~ ., data = new_bank)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.68293 -0.04881 -0.01812 0.00999 1.04608
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) -1.894e+00 4.600e-01 -4.117
## Realized.Sales.Gross.Margin 4.446e-01 3.751e+00 0.119
## Continuous.interest.rate..after.tax. 3.322e-01 1.591e-01 2.089
## Operating.Expense.Rate 9.930e-13 6.676e-13 1.487
## Research.and.development.expense.rate 5.997e-13 7.694e-13 0.779
## Cash.flow.rate -2.993e-01 2.827e-01 -1.059
## Interest.bearing.debt.interest.rate 9.515e-12 1.833e-11 0.519
## Tax.rate..A. -4.939e-02 1.493e-02 -3.309
## Net.Value.Per.Share..B. -1.177e-01 8.440e-02 -1.395
## Cash.Flow.Per.Share -5.583e-01 1.932e-01 -2.890
## Revenue.Per.Share..Yuan.Â.. -9.191e-11 4.473e-11 -2.055
## Realized.Sales.Gross.Profit.Growth.Rate 1.796e-02 1.645e-01 0.109
## Operating.Profit.Growth.Rate -1.017e-01 2.477e-01 -0.411
## Regular.Net.Profit.Growth.Rate 1.999e-01 1.880e-01 1.063
## Continuous.Net.Profit.Growth.Rate 4.861e-02 1.968e-01 0.247
## Total.Asset.Growth.Rate -8.686e-13 7.012e-13 -1.239
## Net.Value.Growth.Rate 8.522e-11 1.771e-11 4.813
## Total.Asset.Return.Growth.Rate.Ratio -1.427e-01 2.285e-01 -0.625
## Cash.Reinvestment.. 3.613e-01 1.478e-01 2.444
## Current.Ratio -2.047e-10 6.533e-11 -3.134
## Quick.Ratio 6.930e-12 8.147e-12 0.851
## Interest.Expense.Ratio -7.150e-02 1.748e-01 -0.409
## Total.debt.Total.net.worth 2.531e-12 1.297e-11 0.195
## Long.term.fund.suitability.ratio..A. 7.124e-02 8.361e-02 0.852
## Contingent.liabilities.Net.worth -3.254e+00 3.730e-01 -8.725
## Operating.profit.Paid.in.capital 6.027e-01 1.168e-01 5.160
## Inventory.and.accounts.receivable.Net.value -3.884e+00 3.962e-01 -9.804
## Accounts.Receivable.Turnover -4.590e-12 7.753e-12 -0.592
## Average.Collection.Days -6.249e-12 8.670e-12 -0.721
## Inventory.Turnover.Rate..times. -7.513e-13 6.284e-13 -1.196
## Fixed.Assets.Turnover.Frequency 2.163e-12 8.909e-13 2.428
## Net.Worth.Turnover.Rate..times. -2.384e-01 7.947e-02 -3.000
## Revenue.per.person 4.616e-11 1.536e-11 3.006
## Operating.profit.per.person -1.525e-01 7.237e-02 -2.107
## Allocation.rate.per.person -5.384e-12 7.123e-12 -0.756
## Current.Assets.Total.Assets 1.267e-01 1.547e-02 8.186
## Cash.Total.Assets -1.014e-01 2.064e-02 -4.912
## Quick.Assets.Current.Liability 2.814e-12 1.247e-11 0.226
## Cash.Current.Liability 1.903e-11 3.927e-12 4.848
## Operating.Funds.to.Liability 4.830e-01 1.342e-01 3.600
## Inventory.Working.Capital 1.477e-02 1.941e-01 0.076
## Current.Liabilities.Equity -6.252e-01 6.003e-01 -1.041
## Long.term.Liability.to.Current.Assets 2.983e-12 3.450e-12 0.865
## Retained.Earnings.to.Total.Assets 8.318e-01 1.392e-01 5.977
## Total.income.Total.expense -9.163e-02 1.857e-01 -0.493
## Total.expense.Assets 1.009e-01 1.163e-01 0.867
## Current.Asset.Turnover.Rate 1.053e-13 8.224e-13 0.128
## Quick.Asset.Turnover.Rate 4.201e-13 6.691e-13 0.628
## Working.capitcal.Turnover.Rate 5.818e-01 2.318e-01 2.510
## Fixed.Assets.to.Assets 1.038e-10 2.137e-11 4.856
## Cash.Flow.to.Total.Assets 1.649e-01 8.344e-02 1.977
## Cash.Flow.to.Liability -1.442e-01 1.052e-01 -1.371
## Cash.Flow.to.Equity -6.154e-01 2.287e-01 -2.691
## Current.Liability.to.Current.Assets 7.609e-01 8.883e-02 8.566
## Net.Income.to.Total.Assets -1.876e+00 1.053e-01 -17.820
## No.credit.Interval 4.502e-02 1.602e-01 0.281
## Gross.Profit.to.Sales -3.028e-01 3.749e+00 -0.081
## Net.Income.to.Stockholder.s.Equity 2.095e+00 3.275e-01 6.397
## Liability.to.Equity 7.513e+00 7.745e-01 9.700
## Pr(>|t|)
## (Intercept) 3.88e-05 ***
## Realized.Sales.Gross.Margin 0.905639
## Continuous.interest.rate..after.tax. 0.036776 *
## Operating.Expense.Rate 0.136939
## Research.and.development.expense.rate 0.435723
## Cash.flow.rate 0.289822
## Interest.bearing.debt.interest.rate 0.603754
## Tax.rate..A. 0.000942 ***
## Net.Value.Per.Share..B. 0.163211
## Cash.Flow.Per.Share 0.003867 **
## Revenue.Per.Share..Yuan.Â.. 0.039921 *
## Realized.Sales.Gross.Profit.Growth.Rate 0.913072
## Operating.Profit.Growth.Rate 0.681342
## Regular.Net.Profit.Growth.Rate 0.287658
## Continuous.Net.Profit.Growth.Rate 0.804907
## Total.Asset.Growth.Rate 0.215512
## Net.Value.Growth.Rate 1.52e-06 ***
## Total.Asset.Return.Growth.Rate.Ratio 0.532261
## Cash.Reinvestment.. 0.014560 *
## Current.Ratio 0.001733 **
## Quick.Ratio 0.395004
## Interest.Expense.Ratio 0.682492
## Total.debt.Total.net.worth 0.845319
## Long.term.fund.suitability.ratio..A. 0.394213
## Contingent.liabilities.Net.worth < 2e-16 ***
## Operating.profit.Paid.in.capital 2.54e-07 ***
## Inventory.and.accounts.receivable.Net.value < 2e-16 ***
## Accounts.Receivable.Turnover 0.553878
## Average.Collection.Days 0.471126
## Inventory.Turnover.Rate..times. 0.231871
## Fixed.Assets.Turnover.Frequency 0.015229 *
## Net.Worth.Turnover.Rate..times. 0.002709 **
## Revenue.per.person 0.002661 **
## Operating.profit.per.person 0.035180 *
## Allocation.rate.per.person 0.449736
## Current.Assets.Total.Assets 3.20e-16 ***
## Cash.Total.Assets 9.20e-07 ***
## Quick.Assets.Current.Liability 0.821483
## Cash.Current.Liability 1.28e-06 ***
## Operating.Funds.to.Liability 0.000321 ***
## Inventory.Working.Capital 0.939331
## Current.Liabilities.Equity 0.297760
## Long.term.Liability.to.Current.Assets 0.387247
## Retained.Earnings.to.Total.Assets 2.38e-09 ***
## Total.income.Total.expense 0.621746
## Total.expense.Assets 0.386001
## Current.Asset.Turnover.Rate 0.898165
## Quick.Asset.Turnover.Rate 0.530138
## Working.capitcal.Turnover.Rate 0.012108 *
## Fixed.Assets.to.Assets 1.22e-06 ***
## Cash.Flow.to.Total.Assets 0.048135 *
## Cash.Flow.to.Liability 0.170555
## Cash.Flow.to.Equity 0.007134 **
## Current.Liability.to.Current.Assets < 2e-16 ***
## Net.Income.to.Total.Assets < 2e-16 ***
## No.credit.Interval 0.778697
## Gross.Profit.to.Sales 0.935627
## Net.Income.to.Stockholder.s.Equity 1.69e-10 ***
## Liability.to.Equity < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1618 on 6760 degrees of freedom
## Multiple R-squared: 0.1691, Adjusted R-squared: 0.162
## F-statistic: 23.72 on 58 and 6760 DF, p-value: < 2.2e-16
We use the vif function from the ‘car’ library to compute the variance inflation factor (VIF) of each predictor and check for multicollinearity.
# Variance inflation factor of every predictor in the refitted linear model,
# printed from the largest value to the smallest.
newbank.vif <- vif(new.ols)
sort(newbank.vif, decreasing = TRUE)
## Gross.Profit.to.Sales
## 1050.339938
## Realized.Sales.Gross.Margin
## 1048.722589
## Liability.to.Equity
## 32.697215
## Current.Liabilities.Equity
## 17.083650
## Inventory.and.accounts.receivable.Net.value
## 7.260690
## Cash.flow.rate
## 6.044349
## Net.Income.to.Stockholder.s.Equity
## 5.895380
## Operating.Funds.to.Liability
## 5.793829
## Contingent.liabilities.Net.worth
## 5.384077
## Net.Income.to.Total.Assets
## 4.695117
## Cash.Flow.to.Total.Assets
## 4.071111
## Retained.Earnings.to.Total.Assets
## 3.297798
## Cash.Flow.Per.Share
## 3.016564
## Current.Assets.Total.Assets
## 2.967037
## Operating.profit.Paid.in.capital
## 2.743480
## Total.expense.Assets
## 2.599203
## Cash.Flow.to.Liability
## 2.584443
## Cash.Reinvestment..
## 2.448957
## Cash.Flow.to.Equity
## 2.288732
## Net.Worth.Turnover.Rate..times.
## 2.214076
## Cash.Total.Assets
## 2.152678
## Net.Value.Per.Share..B.
## 2.069288
## Current.Liability.to.Current.Assets
## 1.956178
## Operating.Profit.Growth.Rate
## 1.848010
## Regular.Net.Profit.Growth.Rate
## 1.781233
## Operating.profit.per.person
## 1.460973
## Long.term.fund.suitability.ratio..A.
## 1.443441
## Current.Asset.Turnover.Rate
## 1.402420
## Revenue.Per.Share..Yuan.Â..
## 1.393572
## Quick.Asset.Turnover.Rate
## 1.328583
## Total.income.Total.expense
## 1.313925
## Average.Collection.Days
## 1.287190
## Fixed.Assets.Turnover.Frequency
## 1.269348
## Total.Asset.Return.Growth.Rate.Ratio
## 1.262459
## Total.debt.Total.net.worth
## 1.243152
## Current.Ratio
## 1.233319
## Operating.Expense.Rate
## 1.217133
## Accounts.Receivable.Turnover
## 1.212595
## Fixed.Assets.to.Assets
## 1.207439
## Quick.Assets.Current.Liability
## 1.193620
## Revenue.per.person
## 1.147427
## Allocation.rate.per.person
## 1.146451
## Working.capitcal.Turnover.Rate
## 1.124160
## Tax.rate..A.
## 1.116114
## Inventory.Turnover.Rate..times.
## 1.085207
## Total.Asset.Growth.Rate
## 1.075727
## Inventory.Working.Capital
## 1.075432
## Net.Value.Growth.Rate
## 1.064683
## Continuous.interest.rate..after.tax.
## 1.059551
## Cash.Current.Liability
## 1.046339
## Research.and.development.expense.rate
## 1.041222
## Quick.Ratio
## 1.035425
## Realized.Sales.Gross.Profit.Growth.Rate
## 1.029278
## Interest.bearing.debt.interest.rate
## 1.026540
## Continuous.Net.Profit.Growth.Rate
## 1.021719
## No.credit.Interval
## 1.010067
## Long.term.Liability.to.Current.Assets
## 1.008517
## Interest.Expense.Ratio
## 1.005330
As a general rule of thumb, a feature with a VIF value of more than 5 is a good candidate to drop. So, let’s drop the 9 variables which have a VIF value of more than 5.
# The nine predictors whose VIF is above 5 in the table printed above.
high.vif.vars <- c(
  'Gross.Profit.to.Sales', 'Realized.Sales.Gross.Margin',
  'Liability.to.Equity', 'Current.Liabilities.Equity',
  'Inventory.and.accounts.receivable.Net.value', 'Cash.flow.rate',
  'Net.Income.to.Stockholder.s.Equity', 'Operating.Funds.to.Liability',
  'Contingent.liabilities.Net.worth'
)
new_bank[, high.vif.vars] <- list(NULL)

# Backward elimination on the linear model of the remaining predictors, using
# 0.05 as the removal threshold (prem) and with the progress log switched off.
newbank.back <- ols_step_backward_p(lm(Bankrupt ~ ., new_bank), prem = 0.05,
                                    progress = FALSE)
summary(newbank.back$model)
##
## Call:
## lm(formula = paste(response, "~", paste(preds, collapse = " + ")),
## data = l)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.75894 -0.04857 -0.02096 0.00740 0.99270
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.619e-02 2.055e-01 0.079 0.93723
## Continuous.interest.rate..after.tax. 3.705e-01 1.606e-01 2.308 0.02104 *
## Tax.rate..A. -5.909e-02 1.496e-02 -3.949 7.92e-05 ***
## Net.Value.Per.Share..B. -2.407e-01 8.195e-02 -2.937 0.00332 **
## Net.Value.Growth.Rate 4.396e-11 1.763e-11 2.494 0.01265 *
## Cash.Reinvestment.. 2.214e-01 1.038e-01 2.133 0.03300 *
## Current.Ratio -2.559e-10 6.523e-11 -3.923 8.84e-05 ***
## Operating.profit.Paid.in.capital 5.734e-01 1.055e-01 5.435 5.68e-08 ***
## Accounts.Receivable.Turnover -1.522e-11 7.219e-12 -2.108 0.03507 *
## Fixed.Assets.Turnover.Frequency 2.267e-12 8.809e-13 2.574 0.01008 *
## Revenue.per.person 4.669e-11 1.460e-11 3.197 0.00140 **
## Current.Assets.Total.Assets 4.752e-02 1.187e-02 4.002 6.34e-05 ***
## Cash.Total.Assets -4.000e-02 1.828e-02 -2.188 0.02867 *
## Quick.Assets.Current.Liability -3.989e-11 1.198e-11 -3.331 0.00087 ***
## Cash.Current.Liability 1.909e-11 3.918e-12 4.873 1.13e-06 ***
## Retained.Earnings.to.Total.Assets 8.539e-01 1.343e-01 6.359 2.17e-10 ***
## Working.capitcal.Turnover.Rate 5.466e-01 2.300e-01 2.376 0.01751 *
## Fixed.Assets.to.Assets 1.070e-10 1.977e-11 5.412 6.44e-08 ***
## Cash.Flow.to.Total.Assets 1.498e-01 5.893e-02 2.542 0.01106 *
## Cash.Flow.to.Equity -6.177e-01 1.909e-01 -3.236 0.00122 **
## Current.Liability.to.Current.Assets 8.655e-01 8.289e-02 10.441 < 2e-16 ***
## Net.Income.to.Total.Assets -1.786e+00 9.274e-02 -19.256 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1641 on 6797 degrees of freedom
## Multiple R-squared: 0.1405, Adjusted R-squared: 0.1378
## F-statistic: 52.9 on 21 and 6797 DF, p-value: < 2.2e-16
# List the predictors that backward elimination dropped.
print(newbank.back$removed)
## [1] "Realized.Sales.Gross.Profit.Growth.Rate"
## [2] "Interest.bearing.debt.interest.rate"
## [3] "Total.Asset.Return.Growth.Rate.Ratio"
## [4] "Quick.Asset.Turnover.Rate"
## [5] "Inventory.Working.Capital"
## [6] "Interest.Expense.Ratio"
## [7] "Continuous.Net.Profit.Growth.Rate"
## [8] "Current.Asset.Turnover.Rate"
## [9] "Long.term.fund.suitability.ratio..A."
## [10] "No.credit.Interval"
## [11] "Research.and.development.expense.rate"
## [12] "Cash.Flow.to.Liability"
## [13] "Allocation.rate.per.person"
## [14] "Total.income.Total.expense"
## [15] "Quick.Ratio"
## [16] "Operating.Profit.Growth.Rate"
## [17] "Long.term.Liability.to.Current.Assets"
## [18] "Regular.Net.Profit.Growth.Rate"
## [19] "Cash.Flow.Per.Share"
## [20] "Total.debt.Total.net.worth"
## [21] "Operating.Expense.Rate"
## [22] "Total.expense.Assets"
## [23] "Average.Collection.Days"
## [24] "Total.Asset.Growth.Rate"
## [25] "Inventory.Turnover.Rate..times."
## [26] "Revenue.Per.Share..Yuan.Â.."
## [27] "Net.Worth.Turnover.Rate..times."
## [28] "Operating.profit.per.person"
# Keep only the columns that backward elimination did not remove.
initialvar1 <- names(new_bank)
was.removed <- initialvar1 %in% newbank.back$removed
new_bank <- new_bank[!was.removed]
# Fix the RNG seed so that the random selection below is reproducible.
set.seed(1234)
We are using the “sample.split” function from the “caTools” library together with base R’s “subset” function to split the data randomly into a training set (75%) and a test set (25%).
# Draw a logical mask marking 75% of the rows as training rows, then cut the
# data into the training rows (mask TRUE) and the test rows (mask FALSE).
# NOTE(review): the VIF screening and backward elimination above were fitted on
# all rows of new_bank, i.e. before this split was made.
split <- sample.split(new_bank$Bankrupt, SplitRatio = 0.75)
train.set <- new_bank[split, , drop = FALSE]
test.set <- new_bank[!split, , drop = FALSE]
# Class counts in the training set.
table(train.set[["Bankrupt"]])
##
## 0 1
## 4949 165
# Class shares in the training set.
prop.table(table(train.set[["Bankrupt"]]))
##
## 0 1
## 0.96773563 0.03226437
# Class counts in the test set.
table(test.set[["Bankrupt"]])
##
## 0 1
## 1650 55
# Class shares in the test set.
prop.table(table(test.set[["Bankrupt"]]))
##
## 0 1
## 0.96774194 0.03225806
As seen in the plots earlier, the dataset is very imbalanced, with only around 3.2% of the observations being Bankrupt. To deal with this issue, we balance the minority class (Bankrupt = 1) against the majority class in the training set using the ‘SMOTE’ technique. We set the ratio to 0.8, which means we oversample the minority class until it reaches 80% of the size of the majority class.
# Oversample the training set with SMOTE, using 'Bankrupt' as the class column
# and a target ratio of 0.8, then print the resulting class counts.
smoted.train.set <- oversample(
  train.set,
  method = 'SMOTE',
  ratio = 0.8,
  classAttr = 'Bankrupt'
)
table(smoted.train.set[["Bankrupt"]])
##
## 0 1
## 4949 3960
# Class shares in the oversampled training set.
prop.table(table(smoted.train.set[["Bankrupt"]]))
##
## 0 1
## 0.5555057 0.4444943
# Columns that go into the models: the response followed by the predictors.
names(smoted.train.set)
## [1] "Bankrupt"
## [2] "Continuous.interest.rate..after.tax."
## [3] "Tax.rate..A."
## [4] "Net.Value.Per.Share..B."
## [5] "Net.Value.Growth.Rate"
## [6] "Cash.Reinvestment.."
## [7] "Current.Ratio"
## [8] "Operating.profit.Paid.in.capital"
## [9] "Accounts.Receivable.Turnover"
## [10] "Fixed.Assets.Turnover.Frequency"
## [11] "Revenue.per.person"
## [12] "Current.Assets.Total.Assets"
## [13] "Cash.Total.Assets"
## [14] "Quick.Assets.Current.Liability"
## [15] "Cash.Current.Liability"
## [16] "Retained.Earnings.to.Total.Assets"
## [17] "Working.capitcal.Turnover.Rate"
## [18] "Fixed.Assets.to.Assets"
## [19] "Cash.Flow.to.Total.Assets"
## [20] "Cash.Flow.to.Equity"
## [21] "Current.Liability.to.Current.Assets"
## [22] "Net.Income.to.Total.Assets"
# Logistic regression of Bankrupt on every other column of the oversampled
# training set; the iteration cap is raised to 100 via glm.control.
bank.lr <- glm(
  formula = Bankrupt ~ .,
  data = smoted.train.set,
  family = binomial,
  control = glm.control(maxit = 100)
)
summary(bank.lr)
##
## Call:
## glm(formula = Bankrupt ~ ., family = binomial, data = smoted.train.set,
## control = glm.control(maxit = 100))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.3516 -0.4450 -0.0551 0.4975 2.5338
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.247e+01 5.721e+00 -2.180 0.02926 *
## Continuous.interest.rate..after.tax. -8.003e+00 1.416e+01 -0.565 0.57199
## Tax.rate..A. -9.095e-01 2.904e-01 -3.132 0.00173 **
## Net.Value.Per.Share..B. -3.662e+01 3.273e+00 -11.188 < 2e-16 ***
## Net.Value.Growth.Rate -2.535e-08 9.385e-06 -0.003 0.99784
## Cash.Reinvestment.. -2.539e+00 1.938e+00 -1.311 0.19002
## Current.Ratio -1.580e-08 3.989e-06 -0.004 0.99684
## Operating.profit.Paid.in.capital -2.473e+00 4.100e+00 -0.603 0.54644
## Accounts.Receivable.Turnover -4.751e-08 1.084e-06 -0.044 0.96504
## Fixed.Assets.Turnover.Frequency 1.340e-10 1.261e-11 10.634 < 2e-16 ***
## Revenue.per.person -2.215e-09 1.193e-06 -0.002 0.99852
## Current.Assets.Total.Assets 3.419e+00 2.076e-01 16.470 < 2e-16 ***
## Cash.Total.Assets -1.061e+01 6.485e-01 -16.361 < 2e-16 ***
## Quick.Assets.Current.Liability -2.280e-09 8.863e-07 -0.003 0.99795
## Cash.Current.Liability 1.097e-10 4.023e-11 2.727 0.00639 **
## Retained.Earnings.to.Total.Assets 3.756e+01 3.875e+00 9.694 < 2e-16 ***
## Working.capitcal.Turnover.Rate 4.473e+01 1.563e+01 2.863 0.00420 **
## Fixed.Assets.to.Assets 5.407e-08 1.103e-06 0.049 0.96090
## Cash.Flow.to.Total.Assets 1.862e+00 1.189e+00 1.565 0.11751
## Cash.Flow.to.Equity -3.023e+00 3.209e+00 -0.942 0.34618
## Current.Liability.to.Current.Assets 3.314e+01 2.052e+00 16.152 < 2e-16 ***
## Net.Income.to.Total.Assets -4.851e+01 2.539e+00 -19.105 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 12240 on 8908 degrees of freedom
## Residual deviance: 5838 on 8887 degrees of freedom
## AIC: 5882
##
## Number of Fisher Scoring iterations: 18
The summary of this model shows that only around half of the predictors are statistically significant for the dependent variable ‘Bankrupt’ at the 5% level of significance.
# Predicted bankruptcy probabilities for the test set; the first column of
# test.set is dropped before it is passed as newdata.
prob_pred <- predict(
  bank.lr,
  newdata = test.set[-1],
  type = 'response'
)
We set the classification threshold at 0.4: whenever the predicted probability of a firm being Bankrupt is more than 40% (>0.4), the model should predict it as Bankrupt.
# Turn the predicted probabilities into 0/1 labels at the chosen cut-off.
bankrupt.cutoff <- 0.4
y_pred <- ifelse(prob_pred > bankrupt.cutoff, 1, 0)
The AUC (area under the receiver operating characteristic curve) tells us how well our model separates the two classes, summarising its overall performance across all thresholds.
The rule of thumb is: the higher the AUC, the better our model is at predicting.
# Plot the ROC curve for a set of predicted scores and return the AUC.
#
# Arguments:
#   prob_pred  predicted scores, passed to ROCR::prediction() as predictions
#   test_y     true class labels, passed to ROCR::prediction() as labels
#
# Side effect: draws the colourised ROC curve (tpr against fpr) on the
# current graphics device.
#
# Returns: the AUC as a length-one character string.
calc_auc <- function(prob_pred, test_y) {
  predict_log <- prediction(prob_pred, test_y)

  # ROC curve: true positive rate against false positive rate.
  roc_curve <- performance(predict_log, "tpr", "fpr")
  plot(roc_curve, colorize = TRUE)

  # Area under that curve, returned as text exactly as before.
  auc <- performance(predict_log, "auc")
  as.character(auc@y.values[[1]])
}
# Confusion matrix of the thresholded logistic-regression predictions against
# the true test labels.
# NOTE(review): no `positive` level is given, and the printed output reports
# 'Positive' Class : 0, so Sensitivity/Specificity below describe class 0.
confusionMatrix(
  data = as.factor(y_pred),
  reference = as.factor(test.set$Bankrupt)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1370 16
## 1 280 39
##
## Accuracy : 0.8264
## 95% CI : (0.8076, 0.8441)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1625
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8303
## Specificity : 0.7091
## Pos Pred Value : 0.9885
## Neg Pred Value : 0.1223
## Prevalence : 0.9677
## Detection Rate : 0.8035
## Detection Prevalence : 0.8129
## Balanced Accuracy : 0.7697
##
## 'Positive' Class : 0
##
# Draw the ROC curve of the logistic regression on the test set and print its AUC.
print(calc_auc(prob_pred = prob_pred, test_y = test.set$Bankrupt))
## [1] "0.859790633608784"
The value of the AUC is 0.86, which is really good. Let’s train other kinds of models and see how well they perform. Logistic Regression will remain the benchmark against which the other models are compared.
# Classification tree fitted on the oversampled training set.
tree.model <- rpart(
  formula = Bankrupt ~ .,
  data = smoted.train.set,
  method = 'class'
)

# Predicted classes on the test set, wrapped in a ROCR prediction object.
# NOTE(review): type = 'class' yields hard labels, so the AUC below is computed
# from class labels rather than from predicted probabilities.
tree.predict <- predict(tree.model, newdata = test.set, type = 'class')
tree.predict.log <- prediction(
  as.numeric(tree.predict),
  as.numeric(test.set$Bankrupt)
)

# The same for the (oversampled) training set.
tree.predict.training <- predict(
  tree.model,
  newdata = smoted.train.set,
  type = 'class'
)
tree.predict.log.training <- prediction(
  as.numeric(tree.predict.training),
  as.numeric(smoted.train.set$Bankrupt)
)

# AUC on the test set.
tree.auc <- performance(tree.predict.log, "auc")
print(tree.auc@y.values[[1]])
## [1] 0.8163636
# AUC on the oversampled training set (overwrites the test-set value in tree.auc).
tree.auc <- performance(tree.predict.log.training, measure = "auc")
print(tree.auc@y.values[[1]])
## [1] 0.8802379
As the AUC value for the training set (0.88) is higher than that of the test set (0.82), the model might be over-fitting the training data.
To check this, let’s use the cross-validation technique and compare the end results with the original results of the Decision tree classifier.
# Work on copies of both data sets in which the 0/1 response is recoded as
# 'No'/'Yes' (presumably because classProbs = TRUE needs class labels that are
# valid R names -- confirm against the caret documentation).
smoted.train.set1 <- smoted.train.set
smoted.train.set1[["Bankrupt"]] <-
  ifelse(smoted.train.set1[["Bankrupt"]] == 0, 'No', 'Yes')

test.set1 <- test.set
test.set1[["Bankrupt"]] <-
  ifelse(test.set1[["Bankrupt"]] == 0, 'No', 'Yes')

# 10-fold cross-validation that keeps class probabilities and summarises each
# fold with the two-class summary (which provides the "ROC" metric used below).
tc <- trainControl(
  method = "cv",
  number = 10,
  summaryFunction = twoClassSummary,
  classProbs = TRUE
)

# Candidate complexity parameters: 0, 0.001, ..., 0.03.
cp.grid <- expand.grid(cp = seq(from = 0, to = 0.03, by = 0.001))

# Tune the rpart tree over the grid, choosing cp by the "ROC" metric.
set.seed(123456789)
tree.cv <- train(
  Bankrupt ~ .,
  data = smoted.train.set1,
  method = "rpart",
  tuneGrid = cp.grid,
  trControl = tc,
  metric = "ROC"
)

# Class predictions of the tuned tree on the training and test sets.
pred.train.treecv <- predict(tree.cv, newdata = smoted.train.set1)
pred.test.treecv <- predict(tree.cv, newdata = test.set1)

# Training-set confusion matrix, with "Yes" (bankrupt) as the positive class.
confusionMatrix(
  data = pred.train.treecv,
  reference = as.factor(smoted.train.set1$Bankrupt),
  positive = "Yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 4753 204
## Yes 196 3756
##
## Accuracy : 0.9551
## 95% CI : (0.9506, 0.9593)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9091
##
## Mcnemar's Test P-Value : 0.7263
##
## Sensitivity : 0.9485
## Specificity : 0.9604
## Pos Pred Value : 0.9504
## Neg Pred Value : 0.9588
## Prevalence : 0.4445
## Detection Rate : 0.4216
## Detection Prevalence : 0.4436
## Balanced Accuracy : 0.9544
##
## 'Positive' Class : Yes
##
# Test-set confusion matrix of the tuned tree, with "Yes" (bankrupt) as the
# positive class.
confusionMatrix(
  data = pred.test.treecv,
  reference = as.factor(test.set1$Bankrupt),
  positive = "Yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1530 23
## Yes 120 32
##
## Accuracy : 0.9161
## 95% CI : (0.902, 0.9289)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2748
##
## Mcnemar's Test P-Value : 9.914e-16
##
## Sensitivity : 0.58182
## Specificity : 0.92727
## Pos Pred Value : 0.21053
## Neg Pred Value : 0.98519
## Prevalence : 0.03226
## Detection Rate : 0.01877
## Detection Prevalence : 0.08915
## Balanced Accuracy : 0.75455
##
## 'Positive' Class : Yes
##
# AUC of the tuned tree on the test set, computed from its predicted class
# labels (converted to numeric) against the true 'No'/'Yes' labels.
predict_tree <- prediction(
  predictions = as.numeric(pred.test.treecv),
  labels = as.factor(test.set1$Bankrupt)
)
tree.cv.auc <- performance(predict_tree, measure = 'auc')
print(tree.cv.auc@y.values[[1]])
## [1] 0.7545455
# ROC curve (true positive rate against false positive rate) of the tuned tree
# on the test set.
tree.plot <- performance(predict_tree, 'tpr', 'fpr')
plot(tree.plot, main = "Decision tree AUC", colorize = TRUE)
# Random forests of increasing size: for each forest size, fit on the
# oversampled training set and print the training and test confusion matrices.

# The response must be a factor for classification; convert once, up front
# (the conversion does not depend on the forest size).
smoted.train.set$Bankrupt <- as.factor(as.character(smoted.train.set$Bankrupt))
test.set$Bankrupt <- as.factor(as.character(test.set$Bankrupt))

set.seed(1234)
for (n.trees in c(3, 5, 7, 10, 13, 15, 17, 20, 30, 45, 60)) {
  # importance = TRUE is kept so the fitted forests match the original run.
  forest.classifier <- randomForest(Bankrupt ~ .,
                                    data = smoted.train.set,
                                    ntree = n.trees,
                                    importance = TRUE)

  # Predicting Bankruptcy using the model
  forest.y_pred <- predict(forest.classifier, newdata = test.set)
  forest.y_pred.train <- predict(forest.classifier, newdata = smoted.train.set)

  # NOTE(review): no `positive` level is given; the printed output reports
  # 'Positive' Class : 0.
  print(paste(n.trees, 'ConfusionMatrix train'))
  print(confusionMatrix(forest.y_pred.train, smoted.train.set$Bankrupt))
  print(paste(n.trees, 'confusionMatrix Test'))
  print(confusionMatrix(forest.y_pred, test.set$Bankrupt))
}
## [1] "3 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4897 28
## 1 52 3932
##
## Accuracy : 0.991
## 95% CI : (0.9888, 0.9929)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9818
##
## Mcnemar's Test P-Value : 0.01013
##
## Sensitivity : 0.9895
## Specificity : 0.9929
## Pos Pred Value : 0.9943
## Neg Pred Value : 0.9869
## Prevalence : 0.5555
## Detection Rate : 0.5497
## Detection Prevalence : 0.5528
## Balanced Accuracy : 0.9912
##
## 'Positive' Class : 0
##
## [1] "3 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1542 30
## 1 108 25
##
## Accuracy : 0.9191
## 95% CI : (0.9051, 0.9316)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2309
##
## Mcnemar's Test P-Value : 5.576e-11
##
## Sensitivity : 0.9345
## Specificity : 0.4545
## Pos Pred Value : 0.9809
## Neg Pred Value : 0.1880
## Prevalence : 0.9677
## Detection Rate : 0.9044
## Detection Prevalence : 0.9220
## Balanced Accuracy : 0.6945
##
## 'Positive' Class : 0
##
## [1] "5 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4920 4
## 1 29 3956
##
## Accuracy : 0.9963
## 95% CI : (0.9948, 0.9974)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9925
##
## Mcnemar's Test P-Value : 2.943e-05
##
## Sensitivity : 0.9941
## Specificity : 0.9990
## Pos Pred Value : 0.9992
## Neg Pred Value : 0.9927
## Prevalence : 0.5555
## Detection Rate : 0.5523
## Detection Prevalence : 0.5527
## Balanced Accuracy : 0.9966
##
## 'Positive' Class : 0
##
## [1] "5 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1569 29
## 1 81 26
##
## Accuracy : 0.9355
## 95% CI : (0.9228, 0.9467)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2908
##
## Mcnemar's Test P-Value : 1.158e-06
##
## Sensitivity : 0.9509
## Specificity : 0.4727
## Pos Pred Value : 0.9819
## Neg Pred Value : 0.2430
## Prevalence : 0.9677
## Detection Rate : 0.9202
## Detection Prevalence : 0.9372
## Balanced Accuracy : 0.7118
##
## 'Positive' Class : 0
##
## [1] "7 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4932 4
## 1 17 3956
##
## Accuracy : 0.9976
## 95% CI : (0.9964, 0.9985)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9952
##
## Mcnemar's Test P-Value : 0.008829
##
## Sensitivity : 0.9966
## Specificity : 0.9990
## Pos Pred Value : 0.9992
## Neg Pred Value : 0.9957
## Prevalence : 0.5555
## Detection Rate : 0.5536
## Detection Prevalence : 0.5540
## Balanced Accuracy : 0.9978
##
## 'Positive' Class : 0
##
## [1] "7 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1565 32
## 1 85 23
##
## Accuracy : 0.9314
## 95% CI : (0.9183, 0.9429)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2502
##
## Mcnemar's Test P-Value : 1.529e-06
##
## Sensitivity : 0.9485
## Specificity : 0.4182
## Pos Pred Value : 0.9800
## Neg Pred Value : 0.2130
## Prevalence : 0.9677
## Detection Rate : 0.9179
## Detection Prevalence : 0.9367
## Balanced Accuracy : 0.6833
##
## 'Positive' Class : 0
##
## [1] "10 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4941 3
## 1 8 3957
##
## Accuracy : 0.9988
## 95% CI : (0.9978, 0.9994)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9975
##
## Mcnemar's Test P-Value : 0.2278
##
## Sensitivity : 0.9984
## Specificity : 0.9992
## Pos Pred Value : 0.9994
## Neg Pred Value : 0.9980
## Prevalence : 0.5555
## Detection Rate : 0.5546
## Detection Prevalence : 0.5549
## Balanced Accuracy : 0.9988
##
## 'Positive' Class : 0
##
## [1] "10 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1566 32
## 1 84 23
##
## Accuracy : 0.932
## 95% CI : (0.919, 0.9435)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2521
##
## Mcnemar's Test P-Value : 2.188e-06
##
## Sensitivity : 0.9491
## Specificity : 0.4182
## Pos Pred Value : 0.9800
## Neg Pred Value : 0.2150
## Prevalence : 0.9677
## Detection Rate : 0.9185
## Detection Prevalence : 0.9372
## Balanced Accuracy : 0.6836
##
## 'Positive' Class : 0
##
## [1] "13 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4944 1
## 1 5 3959
##
## Accuracy : 0.9993
## 95% CI : (0.9985, 0.9998)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9986
##
## Mcnemar's Test P-Value : 0.2207
##
## Sensitivity : 0.9990
## Specificity : 0.9997
## Pos Pred Value : 0.9998
## Neg Pred Value : 0.9987
## Prevalence : 0.5555
## Detection Rate : 0.5549
## Detection Prevalence : 0.5551
## Balanced Accuracy : 0.9994
##
## 'Positive' Class : 0
##
## [1] "13 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1568 30
## 1 82 25
##
## Accuracy : 0.9343
## 95% CI : (0.9215, 0.9456)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2779
##
## Mcnemar's Test P-Value : 1.442e-06
##
## Sensitivity : 0.9503
## Specificity : 0.4545
## Pos Pred Value : 0.9812
## Neg Pred Value : 0.2336
## Prevalence : 0.9677
## Detection Rate : 0.9196
## Detection Prevalence : 0.9372
## Balanced Accuracy : 0.7024
##
## 'Positive' Class : 0
##
## [1] "15 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4943 0
## 1 6 3960
##
## Accuracy : 0.9993
## 95% CI : (0.9985, 0.9998)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.9986
##
## Mcnemar's Test P-Value : 0.04123
##
## Sensitivity : 0.9988
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9985
## Prevalence : 0.5555
## Detection Rate : 0.5548
## Detection Prevalence : 0.5548
## Balanced Accuracy : 0.9994
##
## 'Positive' Class : 0
##
## [1] "15 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1569 31
## 1 81 24
##
## Accuracy : 0.9343
## 95% CI : (0.9215, 0.9456)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2691
##
## Mcnemar's Test P-Value : 3.656e-06
##
## Sensitivity : 0.9509
## Specificity : 0.4364
## Pos Pred Value : 0.9806
## Neg Pred Value : 0.2286
## Prevalence : 0.9677
## Detection Rate : 0.9202
## Detection Prevalence : 0.9384
## Balanced Accuracy : 0.6936
##
## 'Positive' Class : 0
##
## [1] "17 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4947 1
## 1 2 3959
##
## Accuracy : 0.9997
## 95% CI : (0.999, 0.9999)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9993
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9996
## Specificity : 0.9997
## Pos Pred Value : 0.9998
## Neg Pred Value : 0.9995
## Prevalence : 0.5555
## Detection Rate : 0.5553
## Detection Prevalence : 0.5554
## Balanced Accuracy : 0.9997
##
## 'Positive' Class : 0
##
## [1] "17 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1584 28
## 1 66 27
##
## Accuracy : 0.9449
## 95% CI : (0.933, 0.9552)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 0.9999996
##
## Kappa : 0.338
##
## Mcnemar's Test P-Value : 0.0001355
##
## Sensitivity : 0.9600
## Specificity : 0.4909
## Pos Pred Value : 0.9826
## Neg Pred Value : 0.2903
## Prevalence : 0.9677
## Detection Rate : 0.9290
## Detection Prevalence : 0.9455
## Balanced Accuracy : 0.7255
##
## 'Positive' Class : 0
##
## [1] "20 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4948 0
## 1 1 3960
##
## Accuracy : 0.9999
## 95% CI : (0.9994, 1)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9998
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9998
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9997
## Prevalence : 0.5555
## Detection Rate : 0.5554
## Detection Prevalence : 0.5554
## Balanced Accuracy : 0.9999
##
## 'Positive' Class : 0
##
## [1] "20 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1583 30
## 1 67 25
##
## Accuracy : 0.9431
## 95% CI : (0.931, 0.9536)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 0.9999999
##
## Kappa : 0.3124
##
## Mcnemar's Test P-Value : 0.0002569
##
## Sensitivity : 0.9594
## Specificity : 0.4545
## Pos Pred Value : 0.9814
## Neg Pred Value : 0.2717
## Prevalence : 0.9677
## Detection Rate : 0.9284
## Detection Prevalence : 0.9460
## Balanced Accuracy : 0.7070
##
## 'Positive' Class : 0
##
## [1] "30 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4948 0
## 1 1 3960
##
## Accuracy : 0.9999
## 95% CI : (0.9994, 1)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9998
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9998
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9997
## Prevalence : 0.5555
## Detection Rate : 0.5554
## Detection Prevalence : 0.5554
## Balanced Accuracy : 0.9999
##
## 'Positive' Class : 0
##
## [1] "30 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1584 28
## 1 66 27
##
## Accuracy : 0.9449
## 95% CI : (0.933, 0.9552)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 0.9999996
##
## Kappa : 0.338
##
## Mcnemar's Test P-Value : 0.0001355
##
## Sensitivity : 0.9600
## Specificity : 0.4909
## Pos Pred Value : 0.9826
## Neg Pred Value : 0.2903
## Prevalence : 0.9677
## Detection Rate : 0.9290
## Detection Prevalence : 0.9455
## Balanced Accuracy : 0.7255
##
## 'Positive' Class : 0
##
## [1] "45 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4949 0
## 1 0 3960
##
## Accuracy : 1
## 95% CI : (0.9996, 1)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5555
## Detection Rate : 0.5555
## Detection Prevalence : 0.5555
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
## [1] "45 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1580 27
## 1 70 28
##
## Accuracy : 0.9431
## 95% CI : (0.931, 0.9536)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3387
##
## Mcnemar's Test P-Value : 2.004e-05
##
## Sensitivity : 0.9576
## Specificity : 0.5091
## Pos Pred Value : 0.9832
## Neg Pred Value : 0.2857
## Prevalence : 0.9677
## Detection Rate : 0.9267
## Detection Prevalence : 0.9425
## Balanced Accuracy : 0.7333
##
## 'Positive' Class : 0
##
## [1] "60 ConfusionMatrix train"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4949 0
## 1 0 3960
##
## Accuracy : 1
## 95% CI : (0.9996, 1)
## No Information Rate : 0.5555
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.5555
## Detection Rate : 0.5555
## Detection Prevalence : 0.5555
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : 0
##
## [1] "60 confusionMatrix Test"
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1574 26
## 1 76 29
##
## Accuracy : 0.9402
## 95% CI : (0.9278, 0.951)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.3343
##
## Mcnemar's Test P-Value : 1.224e-06
##
## Sensitivity : 0.9539
## Specificity : 0.5273
## Pos Pred Value : 0.9838
## Neg Pred Value : 0.2762
## Prevalence : 0.9677
## Detection Rate : 0.9232
## Detection Prevalence : 0.9384
## Balanced Accuracy : 0.7406
##
## 'Positive' Class : 0
##
Comparing the confusion matrices above, 17 trees is the best choice for ntree (test accuracy 0.9449, balanced accuracy 0.7255).
We therefore proceed with a 17-tree random forest model.
# Fit the final 17-tree random forest on the SMOTE-balanced training set and
# evaluate it (AUC) on both the training set and the held-out test set.

# Coerce the response to a factor *before* fitting, so that randomForest()
# builds a classifier and the train/test labels share the same representation.
smoted.train.set$Bankrupt <- as.factor(as.character(smoted.train.set$Bankrupt))
test.set$Bankrupt <- as.factor(as.character(test.set$Bankrupt))

# Fixed seed so the bootstrap samples / feature draws are reproducible.
set.seed(123456789)
forest.classifier <- randomForest(Bankrupt ~ .,
                                  data = smoted.train.set,
                                  ntree = 17,
                                  importance = TRUE)

# Hard class predictions (default type = "response").
forest.y_pred <- predict(forest.classifier, newdata = test.set)
forest.y_pred.train <- predict(forest.classifier, newdata = smoted.train.set)

# NOTE: despite their names these hold class labels, not probabilities:
# predict.randomForest() accepts type = "class" only as an alias for
# "response". They are kept unchanged for any later code that uses them.
forest.y_prob <- predict(forest.classifier, newdata = test.set, type = "class")
forest.y_prob.train <- predict(forest.classifier, newdata = smoted.train.set,
                               type = "class")

# Predicted probability of class "1". A ROC curve / AUC needs a continuous
# score; feeding hard 0/1 labels yields a single operating point and an "AUC"
# that is merely the balanced accuracy.
forest.y_score <- predict(forest.classifier, newdata = test.set,
                          type = "prob")[, "1"]
forest.y_score.train <- predict(forest.classifier, newdata = smoted.train.set,
                                type = "prob")[, "1"]

# ROCR prediction objects: scores vs. true labels (factor codes 1/2, the
# higher code being treated as the positive class).
forest.predict_log <- prediction(forest.y_score,
                                 as.numeric(test.set$Bankrupt))
forest.predict_log.train <- prediction(forest.y_score.train,
                                       as.numeric(smoted.train.set$Bankrupt))

# The outputs recorded below were produced by the earlier hard-label version;
# re-knit the document to refresh them.
forest.auc <- performance(forest.predict_log, "auc")
print(forest.auc@y.values[[1]])
## [1] 0.720303
forest.auc.train <- performance(forest.predict_log.train, "auc")
print(forest.auc.train@y.values[[1]])
## [1] 0.9996969
The AUC on the training set is 0.9997, compared with only 0.7203 on the test set, which clearly indicates that the model is overfitting. This needs to be addressed; the cross-validation technique used for that purpose is discussed later on.
# ROC curve (true-positive rate against false-positive rate) for the random
# forest on the test set, coloured by the score cutoff.
roc_curve <- performance(forest.predict_log, measure = "tpr", x.measure = "fpr")
# TRUE rather than T: T is an ordinary variable that can be reassigned.
plot(roc_curve, colorize = TRUE)
# Cross-validated feature selection for the random forest (rfcv): estimate the
# CV error as the number of predictors is reduced step by step.
smoted.train.set$Bankrupt <- as.factor(smoted.train.set$Bankrupt)

# Fixed seed so the CV folds, and hence the error curves, are reproducible.
set.seed(123456789)

# Single run, plotted with a log-scaled x axis.
# [-1] drops the first column -- presumably the response Bankrupt; confirm.
result <- rfcv(trainx = smoted.train.set[-1],
               trainy = smoted.train.set$Bankrupt)
with(result, plot(n.var, error.cv, log = "x", type = "o", lwd = 2))

# Five independent repetitions to see the run-to-run variability. `result` is
# reused and from here on holds a list of five rfcv() results. The response is
# already a factor (see above), so no further coercion is needed.
result <- replicate(5,
                    rfcv(trainx = smoted.train.set[-1],
                         trainy = smoted.train.set$Bankrupt),
                    simplify = FALSE)

# One column of CV errors per repetition, one row per number of variables.
# vapply() instead of sapply() so the result is guaranteed to be a numeric
# matrix of the expected height.
error.cv <- vapply(result,
                   function(run) run[["error.cv"]],
                   numeric(length(result[[1]]$n.var)))
print(result[[1]]$error.cv)
## 21 10 5 3 1
## 0.03232686 0.03749018 0.06128634 0.11381749 0.46941295

# Thick line: mean CV error across repetitions; thin lines: individual runs.
matplot(result[[1]]$n.var, cbind(rowMeans(error.cv), error.cv),
        type = "l", lwd = c(2, rep(1, ncol(error.cv))), col = 1, lty = 1,
        log = "x", xlab = "Number of variables", ylab = "CV Error")
Here we see in result$error.cv that with 10 features the CV error is still below 0.05 (0.037), with no significant change between 21 and 10 features (0.032 vs. 0.037), whereas it rises above 0.05 once only 5 features are used. So, we will consider the top 10 features from the list of important features.
# Tune the number of boosting rounds of an xgboost tree model through caret,
# using 3-fold cross-validation on the SMOTE-balanced training set.
ctrl_cv3 <- trainControl(method = "cv", number = 3)

# Tuning grid: only nrounds varies (20, 30, ..., 80); every other
# hyper-parameter is pinned to a single value.
parameters_xgb <- expand.grid(
  nrounds = seq(from = 20, to = 80, by = 10),
  max_depth = 8,
  eta = 0.25,
  gamma = 1,
  colsample_bytree = 0.2,
  min_child_weight = 150,
  subsample = 0.8
)

# Seed set immediately before training so the resampling is repeatable.
set.seed(123456789)
bank.xg <- train(
  Bankrupt ~ .,
  data = smoted.train.set,
  method = "xgbTree",
  trControl = ctrl_cv3,
  tuneGrid = parameters_xgb
)

# Show the resampling results for every grid row and the selected model.
bank.xg
## eXtreme Gradient Boosting
##
## 8909 samples
## 21 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (3 fold)
## Summary of sample sizes: 5939, 5939, 5940
## Resampling results across tuning parameters:
##
## nrounds Accuracy Kappa
## 20 0.8738351 0.7456884
## 30 0.8763045 0.7502896
## 40 0.8809060 0.7597071
## 50 0.8836002 0.7651155
## 60 0.8858451 0.7696559
## 70 0.8886513 0.7753544
## 80 0.8908967 0.7798752
##
## Tuning parameter 'max_depth' was held constant at a value of 8
## Tuning
## parameter 'min_child_weight' was held constant at a value of 150
##
## Tuning parameter 'subsample' was held constant at a value of 0.8
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were nrounds = 80, max_depth = 8, eta
## = 0.25, gamma = 1, colsample_bytree = 0.2, min_child_weight = 150
## and subsample = 0.8.
# Fit a gradient-boosted tree classifier directly with xgboost.
#
# * The label must be numeric 0/1. Bankrupt is a factor, and passing a factor
#   hands xgboost its internal codes (1/2); going through as.character()
#   recovers the real 0/1 values.
# * objective = "binary:logistic" makes this a classification fit that
#   predicts probabilities. Without it the default squared-error objective is
#   used, i.e. a regression on the label (hence the train-rmse log that was
#   recorded below by the previous version; re-knit to refresh it).
# * [-1] drops the first column -- presumably the response Bankrupt; confirm.
# NOTE(review): only nrounds is set here, so all other hyper-parameters are
# xgboost defaults rather than the values tuned in bank.xg -- confirm intended.
xgb <- xgboost(data = data.matrix(smoted.train.set[-1]),
               label = as.numeric(as.character(smoted.train.set$Bankrupt)),
               objective = "binary:logistic",
               nrounds = 25,
               verbose = 2)
## [1] train-rmse:0.768377
## [2] train-rmse:0.564043
## [3] train-rmse:0.425362
## [4] train-rmse:0.332692
## [5] train-rmse:0.271945
## [6] train-rmse:0.234583
## [7] train-rmse:0.208649
## [8] train-rmse:0.191692
## [9] train-rmse:0.180239
## [10] train-rmse:0.169834
## [11] train-rmse:0.163711
## [12] train-rmse:0.157790
## [13] train-rmse:0.155134
## [14] train-rmse:0.152572
## [15] train-rmse:0.149336
## [16] train-rmse:0.146894
## [17] train-rmse:0.140304
## [18] train-rmse:0.137398
## [19] train-rmse:0.133466
## [20] train-rmse:0.131739
## [21] train-rmse:0.128605
## [22] train-rmse:0.124493
## [23] train-rmse:0.123792
## [24] train-rmse:0.123232
## [25] train-rmse:0.122305
# Score the xgboost model on the test and training sets and compute the AUC.
# predict() on an xgb.Booster has no `type` argument (the original
# type = 'response' / 'class' values were silently ignored), so it is omitted;
# the call returns the model's numeric score for every row.
# [-1] drops the first column -- presumably the response Bankrupt; confirm.
xgb.y_pred <- predict(xgb, newdata = data.matrix(test.set[-1]))
xgb.pred.train <- predict(xgb, newdata = data.matrix(smoted.train.set[-1]))

# Kept under their original names for any later code that uses them; they
# contain exactly the same scores as the two vectors above.
xgb.y_prob <- xgb.y_pred
xgb.y_prob.train <- xgb.pred.train

# ROCR prediction objects: scores vs. true labels.
xgb.predict_log <- prediction(as.numeric(xgb.y_pred),
                              as.numeric(test.set$Bankrupt))
xgb.predict_log.train <- prediction(as.numeric(xgb.pred.train),
                                    as.numeric(smoted.train.set$Bankrupt))

xgb.auc <- performance(xgb.predict_log, "auc")
print(xgb.auc@y.values[[1]])
## [1] 0.8800661
xgb.auc.train <- performance(xgb.predict_log.train, "auc")
print(xgb.auc.train@y.values[[1]])
## [1] 0.9995406

# ROC curve (TPR against FPR) on the test set, coloured by the score cutoff.
# TRUE rather than T: T is an ordinary variable that can be reassigned.
roc_curve <- performance(xgb.predict_log, measure = "tpr", x.measure = "fpr")
plot(roc_curve, colorize = TRUE)