# ---- session setup ----
# NOTE: the former `rm(list = ls())` was removed on purpose. It only deletes
# objects from the global environment (attached packages and options stay as
# they are), so it never produced a clean session, and it wipes the workspace
# of anyone who sources this script. Restart R for a genuinely fresh session.
set.seed(5535)
# ---- packages ----
pkgs <- c(
  "tidyverse", # data wrangling + ggplot2
  "caret", # split, CV utilities, confusionMatrix
  "pROC", # ROC/AUC
  "class", # KNN
  "ranger" # Random Forest (fast)
)
# A package counts as installed when system.file() can locate its directory.
# This avoids scanning the whole library with installed.packages(), which its
# own help page advises against for "is package X installed?" checks.
is_installed <- vapply(
  pkgs,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)
to_install <- pkgs[!is_installed]
if (length(to_install) > 0) install.packages(to_install)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(class)
library(ranger)
# ---- input file ----
# Location of the raw CSV (resolved against the current working directory).
data_path <- "data.csv"
# check.names = FALSE keeps the headers exactly as written in the file
# (e.g. "Bankrupt?") instead of converting them to syntactic R names.
dat <- read.csv(file = data_path, check.names = FALSE)
# First sanity check: number of rows and columns that were read.
dim(dat)
## [1] 6819 96
# Show up to the first 20 column names. head() stops at the last column,
# whereas `names(dat)[1:20]` pads the result with NA on narrower inputs.
head(names(dat), 20)
## [1] "Bankrupt?"
## [2] "ROA(C) before interest and depreciation before interest"
## [3] "ROA(A) before interest and % after tax"
## [4] "ROA(B) before interest and depreciation after tax"
## [5] "Operating Gross Margin"
## [6] "Realized Sales Gross Margin"
## [7] "Operating Profit Rate"
## [8] "Pre-tax net Interest Rate"
## [9] "After-tax net Interest Rate"
## [10] "Non-industry income and expenditure/revenue"
## [11] "Continuous interest rate (after tax)"
## [12] "Operating Expense Rate"
## [13] "Research and development expense rate"
## [14] "Cash flow rate"
## [15] "Interest-bearing debt interest rate"
## [16] "Tax rate (A)"
## [17] "Net Value Per Share (B)"
## [18] "Net Value Per Share (A)"
## [19] "Net Value Per Share (C)"
## [20] "Persistent EPS in the Last Four Seasons"
# Compact structure dump: one line per column with its type and first values.
str(dat)
## 'data.frame': 6819 obs. of 96 variables:
## $ Bankrupt? : int 1 1 1 1 1 1 0 0 0 0 ...
## $ ROA(C) before interest and depreciation before interest: num 0.371 0.464 0.426 0.4 0.465 ...
## $ ROA(A) before interest and % after tax : num 0.424 0.538 0.499 0.451 0.538 ...
## $ ROA(B) before interest and depreciation after tax : num 0.406 0.517 0.472 0.458 0.522 ...
## $ Operating Gross Margin : num 0.601 0.61 0.601 0.584 0.599 ...
## $ Realized Sales Gross Margin : num 0.601 0.61 0.601 0.584 0.599 ...
## $ Operating Profit Rate : num 0.999 0.999 0.999 0.999 0.999 ...
## $ Pre-tax net Interest Rate : num 0.797 0.797 0.796 0.797 0.797 ...
## $ After-tax net Interest Rate : num 0.809 0.809 0.808 0.809 0.809 ...
## $ Non-industry income and expenditure/revenue : num 0.303 0.304 0.302 0.303 0.303 ...
## $ Continuous interest rate (after tax) : num 0.781 0.782 0.78 0.781 0.782 ...
## $ Operating Expense Rate : num 1.26e-04 2.90e-04 2.36e-04 1.08e-04 7.89e+09 ...
## $ Research and development expense rate : num 0.00 0.00 2.55e+07 0.00 0.00 0.00 7.30e+08 5.09e+07 0.00 0.00 ...
## $ Cash flow rate : num 0.458 0.462 0.459 0.466 0.463 ...
## $ Interest-bearing debt interest rate : num 0.000725 0.000647 0.00079 0.000449 0.000686 ...
## $ Tax rate (A) : num 0 0 0 0 0 ...
## $ Net Value Per Share (B) : num 0.148 0.182 0.178 0.154 0.168 ...
## $ Net Value Per Share (A) : num 0.148 0.182 0.178 0.154 0.168 ...
## $ Net Value Per Share (C) : num 0.148 0.182 0.194 0.154 0.168 ...
## $ Persistent EPS in the Last Four Seasons : num 0.169 0.209 0.181 0.194 0.213 ...
## $ Cash Flow Per Share : num 0.312 0.318 0.307 0.322 0.319 ...
## $ Revenue Per Share (Yuan ¥) : num 0.01756 0.02114 0.00594 0.01437 0.02969 ...
## $ Operating Profit Per Share (Yuan ¥) : num 0.0959 0.0937 0.0923 0.0778 0.0969 ...
## $ Per Share Net profit before tax (Yuan ¥) : num 0.139 0.17 0.143 0.149 0.168 ...
## $ Realized Sales Gross Profit Growth Rate : num 0.0221 0.0221 0.0228 0.022 0.0221 ...
## $ Operating Profit Growth Rate : num 0.848 0.848 0.848 0.848 0.848 ...
## $ After-tax Net Profit Growth Rate : num 0.689 0.69 0.689 0.689 0.69 ...
## $ Regular Net Profit Growth Rate : num 0.689 0.69 0.689 0.689 0.69 ...
## $ Continuous Net Profit Growth Rate : num 0.218 0.218 0.218 0.218 0.218 ...
## $ Total Asset Growth Rate : num 4.98e+09 6.11e+09 7.28e+09 4.88e+09 5.51e+09 6.08e+08 5.72e+09 6.63e+09 6.89e+09 5.55e+09 ...
## $ Net Value Growth Rate : num 0.000327 0.000443 0.000396 0.000382 0.000439 ...
## $ Total Asset Return Growth Rate Ratio : num 0.263 0.265 0.264 0.263 0.265 ...
## $ Cash Reinvestment % : num 0.364 0.377 0.369 0.384 0.38 ...
## $ Current Ratio : num 0.00226 0.00602 0.01154 0.00419 0.00602 ...
## $ Quick Ratio : num 0.00121 0.00404 0.00535 0.0029 0.00373 ...
## $ Interest Expense Ratio : num 0.63 0.635 0.63 0.63 0.636 ...
## $ Total debt/Total net worth : num 0.02127 0.0125 0.02125 0.00957 0.00515 ...
## $ Debt ratio % : num 0.208 0.171 0.208 0.151 0.107 ...
## $ Net worth/Assets : num 0.792 0.829 0.792 0.849 0.893 ...
## $ Long-term fund suitability ratio (A) : num 0.00502 0.00506 0.0051 0.00505 0.0053 ...
## $ Borrowing dependency : num 0.39 0.377 0.379 0.38 0.375 ...
## $ Contingent liabilities/Net worth : num 0.00648 0.00584 0.00656 0.00537 0.00662 ...
## $ Operating profit/Paid-in capital : num 0.0959 0.0937 0.0923 0.0777 0.0969 ...
## $ Net profit before tax/Paid-in capital : num 0.138 0.169 0.148 0.148 0.167 ...
## $ Inventory and accounts receivable/Net value : num 0.398 0.398 0.407 0.398 0.4 ...
## $ Total Asset Turnover : num 0.087 0.0645 0.015 0.09 0.1754 ...
## $ Accounts Receivable Turnover : num 0.00181 0.00129 0.0015 0.00197 0.00145 ...
## $ Average Collection Days : num 0.00349 0.00492 0.00423 0.00321 0.00437 ...
## $ Inventory Turnover Rate (times) : num 1.82e-04 9.36e+09 6.50e+07 7.13e+09 1.63e-04 ...
## $ Fixed Assets Turnover Frequency : num 1.17e-04 7.19e+08 2.65e+09 9.15e+09 2.94e-04 ...
## $ Net Worth Turnover Rate (times) : num 0.0329 0.0255 0.0134 0.0281 0.0402 ...
## $ Revenue per person : num 0.03416 0.00689 0.029 0.01546 0.05811 ...
## $ Operating profit per person : num 0.393 0.392 0.382 0.378 0.394 ...
## $ Allocation rate per person : num 0.0371 0.0123 0.141 0.0213 0.024 ...
## $ Working Capital to Total Assets : num 0.673 0.751 0.83 0.726 0.752 ...
## $ Quick Assets/Total Assets : num 0.167 0.127 0.34 0.162 0.26 ...
## $ Current Assets/Total Assets : num 0.191 0.182 0.603 0.226 0.358 ...
## $ Cash/Total Assets : num 0.004094 0.014948 0.000991 0.018851 0.014161 ...
## $ Quick Assets/Current Liability : num 0.002 0.00414 0.0063 0.00296 0.00427 ...
## $ Cash/Current Liability : num 1.47e-04 1.38e-03 5.34e+09 1.01e-03 6.80e-04 ...
## $ Current Liability to Assets : num 0.1473 0.057 0.0982 0.0987 0.1102 ...
## $ Operating Funds to Liability : num 0.334 0.341 0.337 0.349 0.345 ...
## $ Inventory/Working Capital : num 0.277 0.29 0.277 0.277 0.288 ...
## $ Inventory/Current Liability : num 0.00104 0.00521 0.01388 0.00354 0.00487 ...
## $ Current Liabilities/Liability : num 0.676 0.309 0.446 0.616 0.975 ...
## $ Working Capital/Equity : num 0.721 0.732 0.743 0.73 0.732 ...
## $ Current Liabilities/Equity : num 0.339 0.33 0.335 0.332 0.331 ...
## $ Long-term Liability to Current Assets : num 0.02559 0.02395 0.00372 0.02217 0 ...
## $ Retained Earnings to Total Assets : num 0.903 0.931 0.91 0.907 0.914 ...
## $ Total income/Total expense : num 0.00202 0.00223 0.00206 0.00183 0.00222 ...
## $ Total expense/Assets : num 0.0649 0.0255 0.0214 0.0242 0.0264 ...
## $ Current Asset Turnover Rate : num 7.01e+08 1.07e-04 1.79e-03 8.14e+09 6.68e+09 ...
## $ Quick Asset Turnover Rate : num 6.55e+09 7.70e+09 1.02e-03 6.05e+09 5.05e+09 ...
## $ Working capitcal Turnover Rate : num 0.594 0.594 0.595 0.594 0.594 ...
## $ Cash Turnover Rate : num 4.58e+08 2.49e+09 7.61e+08 2.03e+09 8.24e+08 ...
## $ Cash Flow to Sales : num 0.672 0.672 0.672 0.672 0.672 ...
## $ Fixed Assets to Assets : num 0.424 0.469 0.276 0.559 0.31 ...
## $ Current Liability to Liability : num 0.676 0.309 0.446 0.616 0.975 ...
## $ Current Liability to Equity : num 0.339 0.33 0.335 0.332 0.331 ...
## $ Equity to Long-term Liability : num 0.127 0.121 0.118 0.121 0.111 ...
## $ Cash Flow to Total Assets : num 0.638 0.641 0.643 0.579 0.622 ...
## $ Cash Flow to Liability : num 0.459 0.459 0.459 0.449 0.454 ...
## $ CFO to Assets : num 0.52 0.567 0.538 0.604 0.578 ...
## $ Cash Flow to Equity : num 0.313 0.314 0.315 0.302 0.312 ...
## $ Current Liability to Current Assets : num 0.1183 0.0478 0.0253 0.0672 0.0477 ...
## $ Liability-Assets Flag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Net Income to Total Assets : num 0.717 0.795 0.775 0.74 0.795 ...
## $ Total assets to GNP price : num 0.00922 0.00832 0.04 0.00325 0.00388 ...
## $ No-credit Interval : num 0.623 0.624 0.624 0.623 0.624 ...
## $ Gross Profit to Sales : num 0.601 0.61 0.601 0.584 0.599 ...
## $ Net Income to Stockholder's Equity : num 0.828 0.84 0.837 0.835 0.84 ...
## $ Liability to Equity : num 0.29 0.284 0.29 0.282 0.279 ...
## $ Degree of Financial Leverage (DFL) : num 0.0266 0.2646 0.0266 0.0267 0.0248 ...
## $ Interest Coverage Ratio (Interest expense to EBIT) : num 0.564 0.57 0.564 0.565 0.576 ...
## $ Net Income Flag : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Equity to Liability : num 0.0165 0.0208 0.0165 0.024 0.0355 ...
# Per-column summary statistics (min, quartiles, mean, max) for a quick scan of
# the value ranges.
summary(dat)
## Bankrupt? ROA(C) before interest and depreciation before interest
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.4765
## Median :0.00000 Median :0.5027
## Mean :0.03226 Mean :0.5052
## 3rd Qu.:0.00000 3rd Qu.:0.5356
## Max. :1.00000 Max. :1.0000
## ROA(A) before interest and % after tax
## Min. :0.0000
## 1st Qu.:0.5355
## Median :0.5598
## Mean :0.5586
## 3rd Qu.:0.5892
## Max. :1.0000
## ROA(B) before interest and depreciation after tax Operating Gross Margin
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5273 1st Qu.:0.6004
## Median :0.5523 Median :0.6060
## Mean :0.5536 Mean :0.6079
## 3rd Qu.:0.5841 3rd Qu.:0.6139
## Max. :1.0000 Max. :1.0000
## Realized Sales Gross Margin Operating Profit Rate Pre-tax net Interest Rate
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6004 1st Qu.:0.9990 1st Qu.:0.7974
## Median :0.6060 Median :0.9990 Median :0.7975
## Mean :0.6079 Mean :0.9988 Mean :0.7972
## 3rd Qu.:0.6138 3rd Qu.:0.9991 3rd Qu.:0.7976
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## After-tax net Interest Rate Non-industry income and expenditure/revenue
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.8093 1st Qu.:0.3035
## Median :0.8094 Median :0.3035
## Mean :0.8091 Mean :0.3036
## 3rd Qu.:0.8095 3rd Qu.:0.3036
## Max. :1.0000 Max. :1.0000
## Continuous interest rate (after tax) Operating Expense Rate
## Min. :0.0000 Min. :0.000e+00
## 1st Qu.:0.7816 1st Qu.:0.000e+00
## Median :0.7816 Median :0.000e+00
## Mean :0.7814 Mean :1.995e+09
## 3rd Qu.:0.7817 3rd Qu.:4.145e+09
## Max. :1.0000 Max. :9.990e+09
## Research and development expense rate Cash flow rate
## Min. :0.00e+00 Min. :0.0000
## 1st Qu.:0.00e+00 1st Qu.:0.4616
## Median :5.09e+08 Median :0.4651
## Mean :1.95e+09 Mean :0.4674
## 3rd Qu.:3.45e+09 3rd Qu.:0.4710
## Max. :9.98e+09 Max. :1.0000
## Interest-bearing debt interest rate Tax rate (A) Net Value Per Share (B)
## Min. : 0 Min. :0.00000 Min. :0.0000
## 1st Qu.: 0 1st Qu.:0.00000 1st Qu.:0.1736
## Median : 0 Median :0.07349 Median :0.1844
## Mean : 16448013 Mean :0.11500 Mean :0.1907
## 3rd Qu.: 0 3rd Qu.:0.20584 3rd Qu.:0.1996
## Max. :990000000 Max. :1.00000 Max. :1.0000
## Net Value Per Share (A) Net Value Per Share (C)
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1736 1st Qu.:0.1737
## Median :0.1844 Median :0.1844
## Mean :0.1906 Mean :0.1907
## 3rd Qu.:0.1996 3rd Qu.:0.1996
## Max. :1.0000 Max. :1.0000
## Persistent EPS in the Last Four Seasons Cash Flow Per Share
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2147 1st Qu.:0.3177
## Median :0.2245 Median :0.3225
## Mean :0.2288 Mean :0.3235
## 3rd Qu.:0.2388 3rd Qu.:0.3286
## Max. :1.0000 Max. :1.0000
## Revenue Per Share (Yuan ¥) Operating Profit Per Share (Yuan ¥)
## Min. :0.000e+00 Min. :0.00000
## 1st Qu.:0.000e+00 1st Qu.:0.09608
## Median :0.000e+00 Median :0.10423
## Mean :1.329e+06 Mean :0.10909
## 3rd Qu.:0.000e+00 3rd Qu.:0.11616
## Max. :3.020e+09 Max. :1.00000
## Per Share Net profit before tax (Yuan ¥)
## Min. :0.0000
## 1st Qu.:0.1704
## Median :0.1797
## Mean :0.1844
## 3rd Qu.:0.1935
## Max. :1.0000
## Realized Sales Gross Profit Growth Rate Operating Profit Growth Rate
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.02206 1st Qu.:0.8480
## Median :0.02210 Median :0.8480
## Mean :0.02241 Mean :0.8480
## 3rd Qu.:0.02215 3rd Qu.:0.8481
## Max. :1.00000 Max. :1.0000
## After-tax Net Profit Growth Rate Regular Net Profit Growth Rate
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6893 1st Qu.:0.6893
## Median :0.6894 Median :0.6894
## Mean :0.6891 Mean :0.6892
## 3rd Qu.:0.6896 3rd Qu.:0.6896
## Max. :1.0000 Max. :1.0000
## Continuous Net Profit Growth Rate Total Asset Growth Rate
## Min. :0.0000 Min. :0.000e+00
## 1st Qu.:0.2176 1st Qu.:4.860e+09
## Median :0.2176 Median :6.400e+09
## Mean :0.2176 Mean :5.508e+09
## 3rd Qu.:0.2176 3rd Qu.:7.390e+09
## Max. :1.0000 Max. :9.990e+09
## Net Value Growth Rate Total Asset Return Growth Rate Ratio Cash Reinvestment %
## Min. :0.000e+00 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000e+00 1st Qu.:0.2638 1st Qu.:0.3747
## Median :0.000e+00 Median :0.2640 Median :0.3804
## Mean :1.566e+06 Mean :0.2642 Mean :0.3797
## 3rd Qu.:0.000e+00 3rd Qu.:0.2644 3rd Qu.:0.3867
## Max. :9.330e+09 Max. :1.0000 Max. :1.0000
## Current Ratio Quick Ratio Interest Expense Ratio
## Min. :0.000e+00 Min. :0.000e+00 Min. :0.0000
## 1st Qu.:0.000e+00 1st Qu.:0.000e+00 1st Qu.:0.6306
## Median :0.000e+00 Median :0.000e+00 Median :0.6307
## Mean :4.033e+05 Mean :8.377e+06 Mean :0.6310
## 3rd Qu.:0.000e+00 3rd Qu.:0.000e+00 3rd Qu.:0.6311
## Max. :2.750e+09 Max. :9.230e+09 Max. :1.0000
## Total debt/Total net worth Debt ratio % Net worth/Assets
## Min. :0.000e+00 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.000e+00 1st Qu.:0.07289 1st Qu.:0.8512
## Median :0.000e+00 Median :0.11141 Median :0.8886
## Mean :4.416e+06 Mean :0.11318 Mean :0.8868
## 3rd Qu.:0.000e+00 3rd Qu.:0.14880 3rd Qu.:0.9271
## Max. :9.940e+09 Max. :1.00000 Max. :1.0000
## Long-term fund suitability ratio (A) Borrowing dependency
## Min. :0.000000 Min. :0.0000
## 1st Qu.:0.005244 1st Qu.:0.3702
## Median :0.005665 Median :0.3726
## Mean :0.008783 Mean :0.3747
## 3rd Qu.:0.006847 3rd Qu.:0.3763
## Max. :1.000000 Max. :1.0000
## Contingent liabilities/Net worth Operating profit/Paid-in capital
## Min. :0.000000 Min. :0.0000
## 1st Qu.:0.005366 1st Qu.:0.0961
## Median :0.005366 Median :0.1041
## Mean :0.005968 Mean :0.1090
## 3rd Qu.:0.005764 3rd Qu.:0.1159
## Max. :1.000000 Max. :1.0000
## Net profit before tax/Paid-in capital
## Min. :0.0000
## 1st Qu.:0.1694
## Median :0.1785
## Mean :0.1827
## 3rd Qu.:0.1916
## Max. :1.0000
## Inventory and accounts receivable/Net value Total Asset Turnover
## Min. :0.0000 Min. :0.00000
## 1st Qu.:0.3974 1st Qu.:0.07646
## Median :0.4001 Median :0.11844
## Mean :0.4025 Mean :0.14161
## 3rd Qu.:0.4046 3rd Qu.:0.17691
## Max. :1.0000 Max. :1.00000
## Accounts Receivable Turnover Average Collection Days
## Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.:0.000e+00 1st Qu.:0.000e+00
## Median :0.000e+00 Median :0.000e+00
## Mean :1.279e+07 Mean :9.826e+06
## 3rd Qu.:0.000e+00 3rd Qu.:0.000e+00
## Max. :9.740e+09 Max. :9.730e+09
## Inventory Turnover Rate (times) Fixed Assets Turnover Frequency
## Min. :0.000e+00 Min. :0.000e+00
## 1st Qu.:0.000e+00 1st Qu.:0.000e+00
## Median :0.000e+00 Median :0.000e+00
## Mean :2.149e+09 Mean :1.009e+09
## 3rd Qu.:4.620e+09 3rd Qu.:0.000e+00
## Max. :9.990e+09 Max. :9.990e+09
## Net Worth Turnover Rate (times) Revenue per person
## Min. :0.00000 Min. :0.000e+00
## 1st Qu.:0.02177 1st Qu.:0.000e+00
## Median :0.02952 Median :0.000e+00
## Mean :0.03860 Mean :2.326e+06
## 3rd Qu.:0.04290 3rd Qu.:0.000e+00
## Max. :1.00000 Max. :8.810e+09
## Operating profit per person Allocation rate per person
## Min. :0.0000 Min. :0.000e+00
## 1st Qu.:0.3924 1st Qu.:0.000e+00
## Median :0.3959 Median :0.000e+00
## Mean :0.4007 Mean :1.126e+07
## 3rd Qu.:0.4019 3rd Qu.:0.000e+00
## Max. :1.0000 Max. :9.570e+09
## Working Capital to Total Assets Quick Assets/Total Assets
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.7743 1st Qu.:0.2420
## Median :0.8103 Median :0.3865
## Mean :0.8141 Mean :0.4001
## 3rd Qu.:0.8504 3rd Qu.:0.5406
## Max. :1.0000 Max. :1.0000
## Current Assets/Total Assets Cash/Total Assets Quick Assets/Current Liability
## Min. :0.0000 Min. :0.00000 Min. :0.000e+00
## 1st Qu.:0.3528 1st Qu.:0.03354 1st Qu.:0.000e+00
## Median :0.5148 Median :0.07489 Median :0.000e+00
## Mean :0.5223 Mean :0.12409 Mean :3.593e+06
## 3rd Qu.:0.6891 3rd Qu.:0.16107 3rd Qu.:0.000e+00
## Max. :1.0000 Max. :1.00000 Max. :8.820e+09
## Cash/Current Liability Current Liability to Assets
## Min. :0.000e+00 Min. :0.00000
## 1st Qu.:0.000e+00 1st Qu.:0.05330
## Median :0.000e+00 Median :0.08270
## Mean :3.716e+07 Mean :0.09067
## 3rd Qu.:0.000e+00 3rd Qu.:0.11952
## Max. :9.650e+09 Max. :1.00000
## Operating Funds to Liability Inventory/Working Capital
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3410 1st Qu.:0.2770
## Median :0.3486 Median :0.2772
## Mean :0.3538 Mean :0.2774
## 3rd Qu.:0.3609 3rd Qu.:0.2774
## Max. :1.0000 Max. :1.0000
## Inventory/Current Liability Current Liabilities/Liability
## Min. :0.000e+00 Min. :0.0000
## 1st Qu.:0.000e+00 1st Qu.:0.6270
## Median :0.000e+00 Median :0.8069
## Mean :5.581e+07 Mean :0.7616
## 3rd Qu.:0.000e+00 3rd Qu.:0.9420
## Max. :9.910e+09 Max. :1.0000
## Working Capital/Equity Current Liabilities/Equity
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.7336 1st Qu.:0.3281
## Median :0.7360 Median :0.3297
## Mean :0.7358 Mean :0.3314
## 3rd Qu.:0.7386 3rd Qu.:0.3323
## Max. :1.0000 Max. :1.0000
## Long-term Liability to Current Assets Retained Earnings to Total Assets
## Min. :0.000e+00 Min. :0.0000
## 1st Qu.:0.000e+00 1st Qu.:0.9311
## Median :0.000e+00 Median :0.9377
## Mean :5.416e+07 Mean :0.9347
## 3rd Qu.:0.000e+00 3rd Qu.:0.9448
## Max. :9.540e+09 Max. :1.0000
## Total income/Total expense Total expense/Assets Current Asset Turnover Rate
## Min. :0.000000 Min. :0.00000 Min. :0.000e+00
## 1st Qu.:0.002236 1st Qu.:0.01457 1st Qu.:0.000e+00
## Median :0.002336 Median :0.02267 Median :0.000e+00
## Mean :0.002549 Mean :0.02918 Mean :1.196e+09
## 3rd Qu.:0.002492 3rd Qu.:0.03593 3rd Qu.:0.000e+00
## Max. :1.000000 Max. :1.00000 Max. :1.000e+10
## Quick Asset Turnover Rate Working capitcal Turnover Rate Cash Turnover Rate
## Min. :0.000e+00 Min. :0.0000 Min. :0.000e+00
## 1st Qu.:0.000e+00 1st Qu.:0.5939 1st Qu.:0.000e+00
## Median :0.000e+00 Median :0.5940 Median :1.080e+09
## Mean :2.164e+09 Mean :0.5940 Mean :2.472e+09
## 3rd Qu.:4.900e+09 3rd Qu.:0.5940 3rd Qu.:4.510e+09
## Max. :1.000e+10 Max. :1.0000 Max. :1.000e+10
## Cash Flow to Sales Fixed Assets to Assets Current Liability to Liability
## Min. :0.0000 Min. :0.00e+00 Min. :0.0000
## 1st Qu.:0.6716 1st Qu.:0.00e+00 1st Qu.:0.6270
## Median :0.6716 Median :0.00e+00 Median :0.8069
## Mean :0.6715 Mean :1.22e+06 Mean :0.7616
## 3rd Qu.:0.6716 3rd Qu.:0.00e+00 3rd Qu.:0.9420
## Max. :1.0000 Max. :8.32e+09 Max. :1.0000
## Current Liability to Equity Equity to Long-term Liability
## Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3281 1st Qu.:0.1109
## Median :0.3297 Median :0.1123
## Mean :0.3314 Mean :0.1156
## 3rd Qu.:0.3323 3rd Qu.:0.1171
## Max. :1.0000 Max. :1.0000
## Cash Flow to Total Assets Cash Flow to Liability CFO to Assets
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6333 1st Qu.:0.4571 1st Qu.:0.5660
## Median :0.6454 Median :0.4598 Median :0.5933
## Mean :0.6497 Mean :0.4618 Mean :0.5934
## 3rd Qu.:0.6631 3rd Qu.:0.4642 3rd Qu.:0.6248
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## Cash Flow to Equity Current Liability to Current Assets Liability-Assets Flag
## Min. :0.0000 Min. :0.00000 Min. :0.000000
## 1st Qu.:0.3130 1st Qu.:0.01803 1st Qu.:0.000000
## Median :0.3150 Median :0.02760 Median :0.000000
## Mean :0.3156 Mean :0.03151 Mean :0.001173
## 3rd Qu.:0.3177 3rd Qu.:0.03837 3rd Qu.:0.000000
## Max. :1.0000 Max. :1.00000 Max. :1.000000
## Net Income to Total Assets Total assets to GNP price No-credit Interval
## Min. :0.0000 Min. :0.000e+00 Min. :0.0000
## 1st Qu.:0.7967 1st Qu.:0.000e+00 1st Qu.:0.6236
## Median :0.8106 Median :0.000e+00 Median :0.6239
## Mean :0.8078 Mean :1.863e+07 Mean :0.6239
## 3rd Qu.:0.8265 3rd Qu.:0.000e+00 3rd Qu.:0.6242
## Max. :1.0000 Max. :9.820e+09 Max. :1.0000
## Gross Profit to Sales Net Income to Stockholder's Equity Liability to Equity
## Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.6004 1st Qu.:0.8401 1st Qu.:0.2769
## Median :0.6060 Median :0.8412 Median :0.2788
## Mean :0.6079 Mean :0.8404 Mean :0.2804
## 3rd Qu.:0.6139 3rd Qu.:0.8424 3rd Qu.:0.2814
## Max. :1.0000 Max. :1.0000 Max. :1.0000
## Degree of Financial Leverage (DFL)
## Min. :0.00000
## 1st Qu.:0.02679
## Median :0.02681
## Mean :0.02754
## 3rd Qu.:0.02691
## Max. :1.00000
## Interest Coverage Ratio (Interest expense to EBIT) Net Income Flag
## Min. :0.0000 Min. :1
## 1st Qu.:0.5652 1st Qu.:1
## Median :0.5653 Median :1
## Mean :0.5654 Mean :1
## 3rd Qu.:0.5657 3rd Qu.:1
## Max. :1.0000 Max. :1
## Equity to Liability
## Min. :0.00000
## 1st Qu.:0.02448
## Median :0.03380
## Mean :0.04758
## 3rd Qu.:0.05284
## Max. :1.00000
# ---- label detection ----
# Column names commonly used for the outcome, in order of preference.
candidate_labels <- c("Bankrupt?", "Bankrupt", "bankrupt", "Class", "class", "Target", "target", "Y", "y")
# intersect() keeps the order of its first argument, so matches stay ranked
# by preference.
label_name <- intersect(candidate_labels, names(dat))
if (length(label_name) == 0) {
  # No known name matched: list the binary columns as likely candidates.
  # vapply() always returns a logical vector; sapply() would return list()
  # for a data frame without columns and break the subsetting below.
  is_two_level <- vapply(dat, function(x) length(unique(x)) == 2, logical(1))
  two_level_cols <- names(dat)[is_two_level]
  message("No standard label name found. Two-level columns are:\n",
          paste(two_level_cols, collapse = ", "))
  stop("Please set label_name manually (likely one of the two-level columns).")
}
# If multiple matched, use the first
label_name <- label_name[1]
label_name
## [1] "Bankrupt?"
# ---- coerce label to factor ----
# The recoding below assumes a 0/1 outcome. factor() silently maps every value
# that is not listed in `levels` to NA, so stop early if the column holds
# anything else (including missing values) rather than modelling on NA labels.
if (!all(dat[[label_name]] %in% c(0, 1))) {
  stop("Label column '", label_name, "' must contain only 0/1 values; ",
       "adapt `levels`/`labels` in the factor() call for a different coding.")
}
dat[[label_name]] <- factor(dat[[label_name]], levels = c(0, 1), labels = c("No", "Yes"))
# Absolute class counts ...
table(dat[[label_name]])
##
## No Yes
## 6599 220
# ... and the corresponding class shares.
prop.table(table(dat[[label_name]]))
##
## No Yes
## 0.9677372 0.0322628
# Bar chart of the outcome counts, to make the class imbalance visible.
ggplot(data = dat, mapping = aes(x = .data[[label_name]])) +
  geom_bar() +
  labs(
    title = paste("Class Balance:", label_name),
    x = label_name,
    y = "Count"
  )
# 70/30 train/test split on the outcome; list = FALSE returns the selected
# row positions as a one-column matrix.
idx_train <- createDataPartition(y = dat[[label_name]], p = 0.7, list = FALSE)
train_df <- dat[idx_train, , drop = FALSE]
test_df <- dat[-idx_train, , drop = FALSE]
# Class shares in the training partition ...
prop.table(table(train_df[[label_name]]))
##
## No Yes
## 0.96774194 0.03225806
# ... and in the test partition.
prop.table(table(test_df[[label_name]]))
##
## No Yes
## 0.96772616 0.03227384
# Predictor tables: every column except the label.
X_train <- select(train_df, -all_of(label_name))
X_test <- select(test_df, -all_of(label_name))
# Outcome vectors for the two partitions.
y_train <- train_df[[label_name]]
y_test <- test_df[[label_name]]
# Count missing cells in each predictor table.
sum(is.na(X_train))
## [1] 0
sum(is.na(X_test))
## [1] 0
# ---- median imputation ----
# The imputation model is learned from the training predictors and then
# applied unchanged to both partitions.
pp_impute <- preProcess(X_train, method = "medianImpute")
X_train_imp <- predict(pp_impute, newdata = X_train)
X_test_imp <- predict(pp_impute, newdata = X_test)
# ---- drop zero / near-zero variance predictors ----
# nearZeroVar() returns the indices of the columns to remove (zero-variance
# columns included); they are chosen on the training data only.
nzv <- nearZeroVar(X_train_imp)
# Keeping the complement of `nzv` leaves the tables unchanged when no column
# was flagged.
X_train_imp <- X_train_imp[, setdiff(seq_along(X_train_imp), nzv), drop = FALSE]
X_test_imp <- X_test_imp[, setdiff(seq_along(X_test_imp), nzv), drop = FALSE]
# ---- standardise (center / scale) ----
# Means and standard deviations come from the training predictors.
pp_scale <- preProcess(X_train_imp, method = c("center", "scale"))
X_train_sc <- predict(pp_scale, newdata = X_train_imp)
X_test_sc <- predict(pp_scale, newdata = X_test_imp)
# ---- PCA ----
# The inputs are already standardised, so prcomp() must not rescale them.
pca_fit <- prcomp(X_train_sc, center = FALSE, scale. = FALSE)
# Share of total variance carried by each component, and its running total.
pve <- prop.table(pca_fit$sdev^2)
cum_pve <- cumsum(pve)
# Smallest number of components whose cumulative share reaches 80%.
k_pca <- match(TRUE, cum_pve >= 0.80)
k_pca
## [1] 32
# One row per principal component: individual and cumulative variance share.
pve_df <- data.frame(
  PC = seq_along(pve),
  PVE = pve,
  CumPVE = cum_pve
)
# Scree plot: variance share of each component.
ggplot(data = pve_df, mapping = aes(x = PC, y = PVE)) +
  geom_line() +
  labs(
    title = "Scree Plot (PVE by Principal Component)",
    x = "PC",
    y = "Proportion of Variance Explained"
  )
# Cumulative curve; the dashed line marks the 80% level.
ggplot(data = pve_df, mapping = aes(x = PC, y = CumPVE)) +
  geom_line() +
  geom_hline(yintercept = 0.80, linetype = "dashed") +
  labs(
    title = "Cumulative PVE",
    x = "PC",
    y = "Cumulative Proportion"
  )
# ---- 5.2 PCA scatter plot (PC1 vs PC2) ----
# Scores on the first two components, coloured by outcome.
# drop = FALSE keeps the matrix shape (and the PC1/PC2 column names) in every
# case.
pc_train_2 <- as.data.frame(pca_fit$x[, 1:2, drop = FALSE])
pc_train_2$y <- y_train
ggplot(pc_train_2, aes(x = PC1, y = PC2, color = y)) +
  geom_point(alpha = 0.6) +
  labs(title = "PCA Scatter (Train): PC1 vs PC2", color = label_name)
# ---- project data onto top PCs ----
# seq_len() + drop = FALSE keep a proper n x k matrix with its "PC" column
# names even when k_pca is 1; `[, 1:k_pca]` would collapse to a bare vector.
top_pcs <- seq_len(k_pca)
train_scores <- as.data.frame(pca_fit$x[, top_pcs, drop = FALSE])
# Test rows are projected with the rotation learned on the training set.
test_scores <- as.data.frame(
  predict(pca_fit, newdata = X_test_sc)[, top_pcs, drop = FALSE]
)
train_mat <- as.matrix(train_scores)
test_mat <- as.matrix(test_scores)
# ---- choose K for KNN ----
# K is tuned by 5-fold cross-validation inside the TRAINING set. Ranking the
# candidates by their AUC on the test set (what this chunk did before) leaks
# test information into model selection and makes the test AUC reported for
# the chosen K optimistic.
K_grid <- c(1, 3, 5, 7, 9, 15, 25, 35, 51)
# Each list element holds the training-row positions of one held-out fold.
cv_folds <- createFolds(y_train, k = 5, list = TRUE, returnTrain = FALSE)

# Mean held-out AUC across the folds for one candidate neighbourhood size K.
cv_auc_for_k <- function(K) {
  fold_auc <- vapply(cv_folds, function(held_out) {
    votes <- knn(
      train = train_mat[-held_out, , drop = FALSE],
      test = train_mat[held_out, , drop = FALSE],
      cl = y_train[-held_out],
      k = K,
      prob = TRUE
    )
    # attr "prob" is the vote share of the WINNING class; convert to P(Yes).
    win_share <- attr(votes, "prob")
    prob_yes <- ifelse(votes == "Yes", win_share, 1 - win_share)
    # direction = "<" pins "higher score means Yes". The default "auto" picks
    # the direction per fold and can therefore never score below 0.5.
    fold_roc <- roc(
      response = y_train[held_out],
      predictor = prob_yes,
      levels = c("No", "Yes"),
      direction = "<",
      quiet = TRUE
    )
    as.numeric(auc(fold_roc))
  }, numeric(1))
  mean(fold_auc)
}

auc_knn <- vapply(K_grid, cv_auc_for_k, numeric(1))
# AUC column = mean cross-validated AUC, best candidate first.
knn_tune <- data.frame(K = K_grid, AUC = auc_knn) %>% arrange(desc(AUC))
# NOTE: console output recorded below this point came from the earlier
# test-set tuning run. Fold creation also changes the random-number stream, so
# re-run the script to refresh all printed results.
knn_tune
## K AUC
## 1 35 0.9257430
## 2 51 0.9253832
## 3 25 0.9190592
## 4 15 0.8707106
## 5 9 0.8363039
## 6 7 0.8254705
## 7 5 0.7708477
## 8 3 0.7237471
## 9 1 0.6320647
# knn_tune is sorted by AUC in descending order, so row 1 is the best K.
bestK <- knn_tune[["K"]][1]
bestK
## [1] 35
# Final KNN vote on the test scores with the selected K.
pred_knn <- knn(
  train = train_mat,
  test = test_mat,
  cl = y_train,
  k = bestK,
  prob = TRUE
)
# attr "prob" holds the vote share of the winning class; turn it into the
# share of "Yes" votes for every test row.
prob_yes_knn <- local({
  win_share <- attr(pred_knn, "prob")
  ifelse(pred_knn == "Yes", win_share, 1 - win_share)
})
# confusion matrix
confusionMatrix(data = pred_knn, reference = y_test, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1979 65
## Yes 0 1
##
## Accuracy : 0.9682
## 95% CI : (0.9597, 0.9754)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 0.4828
##
## Kappa : 0.0289
##
## Mcnemar's Test P-Value : 2.051e-15
##
## Sensitivity : 0.015152
## Specificity : 1.000000
## Pos Pred Value : 1.000000
## Neg Pred Value : 0.968200
## Prevalence : 0.032274
## Detection Rate : 0.000489
## Detection Prevalence : 0.000489
## Balanced Accuracy : 0.507576
##
## 'Positive' Class : Yes
##
# ROC curve of the KNN "Yes" vote share on the test set; the AUC goes into the
# plot title.
roc_knn <- roc(
  response = y_test,
  predictor = prob_yes_knn,
  levels = c("No", "Yes"),
  quiet = TRUE
)
plot(
  roc_knn,
  main = paste0("ROC: PCA + KNN (K=", bestK, "), AUC=", round(auc(roc_knn), 3))
)
# Model frames: outcome `y` next to the standardised predictors.
train_rf <- data.frame(y = y_train, X_train_sc)
test_rf <- data.frame(y = y_test, X_test_sc)
# Probability forest on all predictors; mtry is the floor of the square root
# of the predictor count.
rf_fit <- ranger(
  formula = y ~ .,
  data = train_rf,
  num.trees = 500,
  mtry = floor(sqrt(ncol(X_train_sc))),
  importance = "impurity",
  probability = TRUE
)
# Predicted probability of the "Yes" class for each test row.
rf_prob <- local({
  rf_out <- predict(rf_fit, data = test_rf)
  rf_out$predictions[, "Yes"]
})
# Hard labels at the 0.5 cut-off.
rf_pred <- factor(rf_prob >= 0.5, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
confusionMatrix(data = rf_pred, reference = y_test, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1976 53
## Yes 3 13
##
## Accuracy : 0.9726
## 95% CI : (0.9646, 0.9792)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 0.1155
##
## Kappa : 0.3084
##
## Mcnemar's Test P-Value : 5.835e-11
##
## Sensitivity : 0.196970
## Specificity : 0.998484
## Pos Pred Value : 0.812500
## Neg Pred Value : 0.973879
## Prevalence : 0.032274
## Detection Rate : 0.006357
## Detection Prevalence : 0.007824
## Balanced Accuracy : 0.597727
##
## 'Positive' Class : Yes
##
# Test-set ROC curve for the all-features forest.
roc_rf <- roc(
  response = y_test,
  predictor = rf_prob,
  levels = c("No", "Yes"),
  quiet = TRUE
)
plot(
  roc_rf,
  main = paste0("ROC: Random Forest, AUC=", round(auc(roc_rf), 3))
)
# Impurity importances, largest first; keep the 20 strongest for plotting.
imp <- sort(rf_fit$variable.importance, decreasing = TRUE)
imp_df <- head(
  data.frame(feature = names(imp), importance = unname(imp)),
  n = 20
)
# Horizontal bars, ordered by importance.
ggplot(data = imp_df, mapping = aes(x = reorder(feature, importance), y = importance)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Feature Importances (Random Forest)",
    x = "",
    y = "Importance"
  )
# ---- univariate screening with FDR control (train set only) ----
# One Wilcoxon rank-sum test per predictor, comparing the two outcome classes.
# vapply() guarantees a named numeric vector; sapply() would return list() if
# there were no predictors and break p.adjust() below.
pvals <- vapply(
  names(X_train_sc),
  function(v) wilcox.test(X_train_sc[[v]] ~ y_train)$p.value,
  numeric(1)
)
# Benjamini-Hochberg adjustment; keep predictors whose adjusted p-value is at
# or below the target level q.
q <- 0.10
p_adj <- p.adjust(pvals, method = "BH")
sig_features <- names(p_adj)[p_adj <= q]
length(sig_features)
## [1] 87
head(sig_features, 30)
## [1] "ROA(C) before interest and depreciation before interest"
## [2] "ROA(A) before interest and % after tax"
## [3] "ROA(B) before interest and depreciation after tax"
## [4] "Operating Gross Margin"
## [5] "Realized Sales Gross Margin"
## [6] "Operating Profit Rate"
## [7] "Pre-tax net Interest Rate"
## [8] "After-tax net Interest Rate"
## [9] "Non-industry income and expenditure/revenue"
## [10] "Continuous interest rate (after tax)"
## [11] "Research and development expense rate"
## [12] "Cash flow rate"
## [13] "Interest-bearing debt interest rate"
## [14] "Tax rate (A)"
## [15] "Net Value Per Share (B)"
## [16] "Net Value Per Share (A)"
## [17] "Net Value Per Share (C)"
## [18] "Persistent EPS in the Last Four Seasons"
## [19] "Cash Flow Per Share"
## [20] "Revenue Per Share (Yuan ¥)"
## [21] "Operating Profit Per Share (Yuan ¥)"
## [22] "Per Share Net profit before tax (Yuan ¥)"
## [23] "Realized Sales Gross Profit Growth Rate"
## [24] "Operating Profit Growth Rate"
## [25] "After-tax Net Profit Growth Rate"
## [26] "Regular Net Profit Growth Rate"
## [27] "Continuous Net Profit Growth Rate"
## [28] "Total Asset Growth Rate"
## [29] "Net Value Growth Rate"
## [30] "Total Asset Return Growth Rate Ratio"
# Raw and BH-adjusted p-values side by side, one row per screened predictor.
pv_df <- data.frame(
  feature = names(pvals),
  pval = unname(pvals),
  padj = unname(p_adj)
)
# Distribution of the raw p-values.
ggplot(data = pv_df, mapping = aes(x = pval)) +
  geom_histogram(bins = 40) +
  labs(
    title = "Histogram of Raw p-values (Train)",
    x = "p-value",
    y = "Count"
  )
# Distribution after the BH adjustment.
ggplot(data = pv_df, mapping = aes(x = padj)) +
  geom_histogram(bins = 40) +
  labs(
    title = "Histogram of BH-adjusted p-values (Train)",
    x = "BH adjusted p-value",
    y = "Count"
  )
# Model frames restricted to the predictors that passed the FDR screen.
train_rf_fdr <- data.frame(y = y_train, X_train_sc[, sig_features, drop = FALSE])
test_rf_fdr <- data.frame(y = y_test, X_test_sc[, sig_features, drop = FALSE])
# Same forest settings as the all-features model; mtry is the floor of the
# square root of the number of selected predictors, never below 1.
rf_fit_fdr <- ranger(
  formula = y ~ .,
  data = train_rf_fdr,
  num.trees = 500,
  mtry = max(1, floor(sqrt(length(sig_features)))),
  importance = "impurity",
  probability = TRUE
)
# Predicted probability of the "Yes" class for each test row.
rf_prob_fdr <- local({
  rf_out <- predict(rf_fit_fdr, data = test_rf_fdr)
  rf_out$predictions[, "Yes"]
})
# Hard labels at the 0.5 cut-off.
rf_pred_fdr <- factor(rf_prob_fdr >= 0.5, levels = c(FALSE, TRUE), labels = c("No", "Yes"))
confusionMatrix(data = rf_pred_fdr, reference = y_test, positive = "Yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1975 51
## Yes 4 15
##
## Accuracy : 0.9731
## 95% CI : (0.9651, 0.9797)
## No Information Rate : 0.9677
## P-Value [Acc > NIR] : 0.09179
##
## Kappa : 0.3435
##
## Mcnemar's Test P-Value : 5.552e-10
##
## Sensitivity : 0.227273
## Specificity : 0.997979
## Pos Pred Value : 0.789474
## Neg Pred Value : 0.974827
## Prevalence : 0.032274
## Detection Rate : 0.007335
## Detection Prevalence : 0.009291
## Balanced Accuracy : 0.612626
##
## 'Positive' Class : Yes
##
# Test-set ROC curve for the forest trained on the FDR-selected predictors.
roc_rf_fdr <- roc(y_test, rf_prob_fdr, levels = c("No", "Yes"), quiet = TRUE)
auc(roc_rf_fdr)
## Area under the curve: 0.9498
# Overlay both forests. Each curve gets its own colour and line type, and the
# legend repeats them, so the two models can actually be told apart.
roc_cols <- c("steelblue", "firebrick")
roc_ltys <- c(1, 2)
plot(roc_rf, col = roc_cols[1], lty = roc_ltys[1],
     main = "ROC Comparison: RF(All) vs RF(FDR-selected)")
lines(roc_rf_fdr, col = roc_cols[2], lty = roc_ltys[2])
legend("bottomright",
       legend = c(paste0("RF All AUC=", round(auc(roc_rf), 3)),
                  paste0("RF FDR AUC=", round(auc(roc_rf_fdr), 3))),
       col = roc_cols, lty = roc_ltys, bty = "n")
# ---- logistic regression, all standardised predictors ----
train_glm <- data.frame(y = y_train, X_train_sc)
test_glm <- data.frame(y = y_test, X_test_sc)
glm_fit <- glm(formula = y ~ ., family = binomial, data = train_glm)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# type = "response" returns fitted probabilities rather than log-odds.
glm_prob <- predict(glm_fit, newdata = test_glm, type = "response")
roc_glm <- roc(
  response = y_test,
  predictor = glm_prob,
  levels = c("No", "Yes"),
  quiet = TRUE
)
plot(
  roc_glm,
  main = paste0("ROC: Logistic Regression (All), AUC=", round(auc(roc_glm), 3))
)
# ---- logistic regression, FDR-selected predictors ----
# Reuses the model frames that were built for the FDR random forest.
glm_fit_fdr <- glm(formula = y ~ ., family = binomial, data = train_rf_fdr)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_prob_fdr <- predict(glm_fit_fdr, newdata = test_rf_fdr, type = "response")
roc_glm_fdr <- roc(
  response = y_test,
  predictor = glm_prob_fdr,
  levels = c("No", "Yes"),
  quiet = TRUE
)
plot(
  roc_glm_fdr,
  main = paste0("ROC: Logistic Regression (FDR), AUC=", round(auc(roc_glm_fdr), 3))
)
# Test-set AUC of every fitted model, collected into one comparison table.
auc_summary <- local({
  # Shared label suffix for the two models trained on FDR-selected predictors.
  fdr_tag <- paste0("(FDR q=", q, ", |S|=", length(sig_features), ")")
  roc_objects <- list(roc_knn, roc_rf, roc_rf_fdr, roc_glm, roc_glm_fdr)
  data.frame(
    Model = c(
      paste0("PCA+KNN (K=", bestK, ", PCs=", k_pca, ")"),
      "Random Forest (All features)",
      paste("Random Forest", fdr_tag),
      "Logistic (All features)",
      paste("Logistic", fdr_tag)
    ),
    AUC = vapply(roc_objects, function(r) as.numeric(auc(r)), numeric(1))
  )
})
# Best model first.
arrange(auc_summary, desc(AUC))
## Model AUC
## 1 Random Forest (FDR q=0.1, |S|=87) 0.9498293
## 2 Random Forest (All features) 0.9483057
## 3 PCA+KNN (K=35, PCs=32) 0.9257430
## 4 Logistic (FDR q=0.1, |S|=87) 0.8902415
## 5 Logistic (All features) 0.8728237