# NOTE: deliberately no `rm(list = ls())` here. It only deletes user-created
# objects from the global environment -- attached packages, options() and the
# working directory are left as they were -- so it does not provide a clean
# session, and it silently destroys the workspace of anyone who source()s this
# file. Restart R to get a genuinely fresh run.

# Fix the RNG seed so later random draws are reproducible between runs.
set.seed(5535)

# ---- packages ----
# Packages this script attaches below; any that are missing get installed.
pkgs <- c(
  "tidyverse",  # data wrangling + ggplot2
  "caret",      # split, CV utilities, confusionMatrix
  "pROC",       # ROC/AUC
  "class",      # KNN
  "ranger"      # Random Forest (fast)
)
# system.file(package = p) returns "" when package `p` is not installed. This
# replaces installed.packages(), which scans every installed package and whose
# own documentation advises against using it to test for a named package.
to_install <- pkgs[!vapply(
  pkgs,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)]
if (length(to_install) > 0) install.packages(to_install)

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(class)
library(ranger)
# ---- path ----
# Input CSV, resolved relative to the current working directory.
data_path <- "data.csv"

# check.names = FALSE keeps the header text verbatim instead of rewriting it
# into syntactic R names (column names here may contain symbols).
dat <- utils::read.csv(file = data_path, check.names = FALSE)

# basic checks
# Dimensions as c(rows, columns).
dim(dat)
## [1] 6819   96
# Preview the leading column names. head() stops at the last column, whereas
# indexing with [1:20] pads the result with NA when there are fewer than 20.
head(names(dat), 20)
##  [1] "Bankrupt?"                                              
##  [2] "ROA(C) before interest and depreciation before interest"
##  [3] "ROA(A) before interest and % after tax"                 
##  [4] "ROA(B) before interest and depreciation after tax"      
##  [5] "Operating Gross Margin"                                 
##  [6] "Realized Sales Gross Margin"                            
##  [7] "Operating Profit Rate"                                  
##  [8] "Pre-tax net Interest Rate"                              
##  [9] "After-tax net Interest Rate"                            
## [10] "Non-industry income and expenditure/revenue"            
## [11] "Continuous interest rate (after tax)"                   
## [12] "Operating Expense Rate"                                 
## [13] "Research and development expense rate"                  
## [14] "Cash flow rate"                                         
## [15] "Interest-bearing debt interest rate"                    
## [16] "Tax rate (A)"                                           
## [17] "Net Value Per Share (B)"                                
## [18] "Net Value Per Share (A)"                                
## [19] "Net Value Per Share (C)"                                
## [20] "Persistent EPS in the Last Four Seasons"
# Compact per-column overview: storage type plus the first few values.
str(dat)
## 'data.frame':    6819 obs. of  96 variables:
##  $ Bankrupt?                                              : int  1 1 1 1 1 1 0 0 0 0 ...
##  $ ROA(C) before interest and depreciation before interest: num  0.371 0.464 0.426 0.4 0.465 ...
##  $ ROA(A) before interest and % after tax                 : num  0.424 0.538 0.499 0.451 0.538 ...
##  $ ROA(B) before interest and depreciation after tax      : num  0.406 0.517 0.472 0.458 0.522 ...
##  $ Operating Gross Margin                                 : num  0.601 0.61 0.601 0.584 0.599 ...
##  $ Realized Sales Gross Margin                            : num  0.601 0.61 0.601 0.584 0.599 ...
##  $ Operating Profit Rate                                  : num  0.999 0.999 0.999 0.999 0.999 ...
##  $ Pre-tax net Interest Rate                              : num  0.797 0.797 0.796 0.797 0.797 ...
##  $ After-tax net Interest Rate                            : num  0.809 0.809 0.808 0.809 0.809 ...
##  $ Non-industry income and expenditure/revenue            : num  0.303 0.304 0.302 0.303 0.303 ...
##  $ Continuous interest rate (after tax)                   : num  0.781 0.782 0.78 0.781 0.782 ...
##  $ Operating Expense Rate                                 : num  1.26e-04 2.90e-04 2.36e-04 1.08e-04 7.89e+09 ...
##  $ Research and development expense rate                  : num  0.00 0.00 2.55e+07 0.00 0.00 0.00 7.30e+08 5.09e+07 0.00 0.00 ...
##  $ Cash flow rate                                         : num  0.458 0.462 0.459 0.466 0.463 ...
##  $ Interest-bearing debt interest rate                    : num  0.000725 0.000647 0.00079 0.000449 0.000686 ...
##  $ Tax rate (A)                                           : num  0 0 0 0 0 ...
##  $ Net Value Per Share (B)                                : num  0.148 0.182 0.178 0.154 0.168 ...
##  $ Net Value Per Share (A)                                : num  0.148 0.182 0.178 0.154 0.168 ...
##  $ Net Value Per Share (C)                                : num  0.148 0.182 0.194 0.154 0.168 ...
##  $ Persistent EPS in the Last Four Seasons                : num  0.169 0.209 0.181 0.194 0.213 ...
##  $ Cash Flow Per Share                                    : num  0.312 0.318 0.307 0.322 0.319 ...
##  $ Revenue Per Share (Yuan ¥)                             : num  0.01756 0.02114 0.00594 0.01437 0.02969 ...
##  $ Operating Profit Per Share (Yuan ¥)                    : num  0.0959 0.0937 0.0923 0.0778 0.0969 ...
##  $ Per Share Net profit before tax (Yuan ¥)               : num  0.139 0.17 0.143 0.149 0.168 ...
##  $ Realized Sales Gross Profit Growth Rate                : num  0.0221 0.0221 0.0228 0.022 0.0221 ...
##  $ Operating Profit Growth Rate                           : num  0.848 0.848 0.848 0.848 0.848 ...
##  $ After-tax Net Profit Growth Rate                       : num  0.689 0.69 0.689 0.689 0.69 ...
##  $ Regular Net Profit Growth Rate                         : num  0.689 0.69 0.689 0.689 0.69 ...
##  $ Continuous Net Profit Growth Rate                      : num  0.218 0.218 0.218 0.218 0.218 ...
##  $ Total Asset Growth Rate                                : num  4.98e+09 6.11e+09 7.28e+09 4.88e+09 5.51e+09 6.08e+08 5.72e+09 6.63e+09 6.89e+09 5.55e+09 ...
##  $ Net Value Growth Rate                                  : num  0.000327 0.000443 0.000396 0.000382 0.000439 ...
##  $ Total Asset Return Growth Rate Ratio                   : num  0.263 0.265 0.264 0.263 0.265 ...
##  $ Cash Reinvestment %                                    : num  0.364 0.377 0.369 0.384 0.38 ...
##  $ Current Ratio                                          : num  0.00226 0.00602 0.01154 0.00419 0.00602 ...
##  $ Quick Ratio                                            : num  0.00121 0.00404 0.00535 0.0029 0.00373 ...
##  $ Interest Expense Ratio                                 : num  0.63 0.635 0.63 0.63 0.636 ...
##  $ Total debt/Total net worth                             : num  0.02127 0.0125 0.02125 0.00957 0.00515 ...
##  $ Debt ratio %                                           : num  0.208 0.171 0.208 0.151 0.107 ...
##  $ Net worth/Assets                                       : num  0.792 0.829 0.792 0.849 0.893 ...
##  $ Long-term fund suitability ratio (A)                   : num  0.00502 0.00506 0.0051 0.00505 0.0053 ...
##  $ Borrowing dependency                                   : num  0.39 0.377 0.379 0.38 0.375 ...
##  $ Contingent liabilities/Net worth                       : num  0.00648 0.00584 0.00656 0.00537 0.00662 ...
##  $ Operating profit/Paid-in capital                       : num  0.0959 0.0937 0.0923 0.0777 0.0969 ...
##  $ Net profit before tax/Paid-in capital                  : num  0.138 0.169 0.148 0.148 0.167 ...
##  $ Inventory and accounts receivable/Net value            : num  0.398 0.398 0.407 0.398 0.4 ...
##  $ Total Asset Turnover                                   : num  0.087 0.0645 0.015 0.09 0.1754 ...
##  $ Accounts Receivable Turnover                           : num  0.00181 0.00129 0.0015 0.00197 0.00145 ...
##  $ Average Collection Days                                : num  0.00349 0.00492 0.00423 0.00321 0.00437 ...
##  $ Inventory Turnover Rate (times)                        : num  1.82e-04 9.36e+09 6.50e+07 7.13e+09 1.63e-04 ...
##  $ Fixed Assets Turnover Frequency                        : num  1.17e-04 7.19e+08 2.65e+09 9.15e+09 2.94e-04 ...
##  $ Net Worth Turnover Rate (times)                        : num  0.0329 0.0255 0.0134 0.0281 0.0402 ...
##  $ Revenue per person                                     : num  0.03416 0.00689 0.029 0.01546 0.05811 ...
##  $ Operating profit per person                            : num  0.393 0.392 0.382 0.378 0.394 ...
##  $ Allocation rate per person                             : num  0.0371 0.0123 0.141 0.0213 0.024 ...
##  $ Working Capital to Total Assets                        : num  0.673 0.751 0.83 0.726 0.752 ...
##  $ Quick Assets/Total Assets                              : num  0.167 0.127 0.34 0.162 0.26 ...
##  $ Current Assets/Total Assets                            : num  0.191 0.182 0.603 0.226 0.358 ...
##  $ Cash/Total Assets                                      : num  0.004094 0.014948 0.000991 0.018851 0.014161 ...
##  $ Quick Assets/Current Liability                         : num  0.002 0.00414 0.0063 0.00296 0.00427 ...
##  $ Cash/Current Liability                                 : num  1.47e-04 1.38e-03 5.34e+09 1.01e-03 6.80e-04 ...
##  $ Current Liability to Assets                            : num  0.1473 0.057 0.0982 0.0987 0.1102 ...
##  $ Operating Funds to Liability                           : num  0.334 0.341 0.337 0.349 0.345 ...
##  $ Inventory/Working Capital                              : num  0.277 0.29 0.277 0.277 0.288 ...
##  $ Inventory/Current Liability                            : num  0.00104 0.00521 0.01388 0.00354 0.00487 ...
##  $ Current Liabilities/Liability                          : num  0.676 0.309 0.446 0.616 0.975 ...
##  $ Working Capital/Equity                                 : num  0.721 0.732 0.743 0.73 0.732 ...
##  $ Current Liabilities/Equity                             : num  0.339 0.33 0.335 0.332 0.331 ...
##  $ Long-term Liability to Current Assets                  : num  0.02559 0.02395 0.00372 0.02217 0 ...
##  $ Retained Earnings to Total Assets                      : num  0.903 0.931 0.91 0.907 0.914 ...
##  $ Total income/Total expense                             : num  0.00202 0.00223 0.00206 0.00183 0.00222 ...
##  $ Total expense/Assets                                   : num  0.0649 0.0255 0.0214 0.0242 0.0264 ...
##  $ Current Asset Turnover Rate                            : num  7.01e+08 1.07e-04 1.79e-03 8.14e+09 6.68e+09 ...
##  $ Quick Asset Turnover Rate                              : num  6.55e+09 7.70e+09 1.02e-03 6.05e+09 5.05e+09 ...
##  $ Working capitcal Turnover Rate                         : num  0.594 0.594 0.595 0.594 0.594 ...
##  $ Cash Turnover Rate                                     : num  4.58e+08 2.49e+09 7.61e+08 2.03e+09 8.24e+08 ...
##  $ Cash Flow to Sales                                     : num  0.672 0.672 0.672 0.672 0.672 ...
##  $ Fixed Assets to Assets                                 : num  0.424 0.469 0.276 0.559 0.31 ...
##  $ Current Liability to Liability                         : num  0.676 0.309 0.446 0.616 0.975 ...
##  $ Current Liability to Equity                            : num  0.339 0.33 0.335 0.332 0.331 ...
##  $ Equity to Long-term Liability                          : num  0.127 0.121 0.118 0.121 0.111 ...
##  $ Cash Flow to Total Assets                              : num  0.638 0.641 0.643 0.579 0.622 ...
##  $ Cash Flow to Liability                                 : num  0.459 0.459 0.459 0.449 0.454 ...
##  $ CFO to Assets                                          : num  0.52 0.567 0.538 0.604 0.578 ...
##  $ Cash Flow to Equity                                    : num  0.313 0.314 0.315 0.302 0.312 ...
##  $ Current Liability to Current Assets                    : num  0.1183 0.0478 0.0253 0.0672 0.0477 ...
##  $ Liability-Assets Flag                                  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Net Income to Total Assets                             : num  0.717 0.795 0.775 0.74 0.795 ...
##  $ Total assets to GNP price                              : num  0.00922 0.00832 0.04 0.00325 0.00388 ...
##  $ No-credit Interval                                     : num  0.623 0.624 0.624 0.623 0.624 ...
##  $ Gross Profit to Sales                                  : num  0.601 0.61 0.601 0.584 0.599 ...
##  $ Net Income to Stockholder's Equity                     : num  0.828 0.84 0.837 0.835 0.84 ...
##  $ Liability to Equity                                    : num  0.29 0.284 0.29 0.282 0.279 ...
##  $ Degree of Financial Leverage (DFL)                     : num  0.0266 0.2646 0.0266 0.0267 0.0248 ...
##  $ Interest Coverage Ratio (Interest expense to EBIT)     : num  0.564 0.57 0.564 0.565 0.576 ...
##  $ Net Income Flag                                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Equity to Liability                                    : num  0.0165 0.0208 0.0165 0.024 0.0355 ...
# Per-column minimum, quartiles, mean and maximum.
# NOTE(review): in the output below several ratio-named columns reach
# ~1e9-1e10 while their quartiles print as 0, and `Net Income Flag` is 1 at
# every quantile -- worth checking for unscaled values and a constant
# (zero-variance) predictor before modelling.
summary(dat)
##    Bankrupt?       ROA(C) before interest and depreciation before interest
##  Min.   :0.00000   Min.   :0.0000                                         
##  1st Qu.:0.00000   1st Qu.:0.4765                                         
##  Median :0.00000   Median :0.5027                                         
##  Mean   :0.03226   Mean   :0.5052                                         
##  3rd Qu.:0.00000   3rd Qu.:0.5356                                         
##  Max.   :1.00000   Max.   :1.0000                                         
##  ROA(A) before interest and % after tax
##  Min.   :0.0000                        
##  1st Qu.:0.5355                        
##  Median :0.5598                        
##  Mean   :0.5586                        
##  3rd Qu.:0.5892                        
##  Max.   :1.0000                        
##  ROA(B) before interest and depreciation after tax Operating Gross Margin
##  Min.   :0.0000                                    Min.   :0.0000        
##  1st Qu.:0.5273                                    1st Qu.:0.6004        
##  Median :0.5523                                    Median :0.6060        
##  Mean   :0.5536                                    Mean   :0.6079        
##  3rd Qu.:0.5841                                    3rd Qu.:0.6139        
##  Max.   :1.0000                                    Max.   :1.0000        
##  Realized Sales Gross Margin Operating Profit Rate Pre-tax net Interest Rate
##  Min.   :0.0000              Min.   :0.0000        Min.   :0.0000           
##  1st Qu.:0.6004              1st Qu.:0.9990        1st Qu.:0.7974           
##  Median :0.6060              Median :0.9990        Median :0.7975           
##  Mean   :0.6079              Mean   :0.9988        Mean   :0.7972           
##  3rd Qu.:0.6138              3rd Qu.:0.9991        3rd Qu.:0.7976           
##  Max.   :1.0000              Max.   :1.0000        Max.   :1.0000           
##  After-tax net Interest Rate Non-industry income and expenditure/revenue
##  Min.   :0.0000              Min.   :0.0000                             
##  1st Qu.:0.8093              1st Qu.:0.3035                             
##  Median :0.8094              Median :0.3035                             
##  Mean   :0.8091              Mean   :0.3036                             
##  3rd Qu.:0.8095              3rd Qu.:0.3036                             
##  Max.   :1.0000              Max.   :1.0000                             
##  Continuous interest rate (after tax) Operating Expense Rate
##  Min.   :0.0000                       Min.   :0.000e+00     
##  1st Qu.:0.7816                       1st Qu.:0.000e+00     
##  Median :0.7816                       Median :0.000e+00     
##  Mean   :0.7814                       Mean   :1.995e+09     
##  3rd Qu.:0.7817                       3rd Qu.:4.145e+09     
##  Max.   :1.0000                       Max.   :9.990e+09     
##  Research and development expense rate Cash flow rate  
##  Min.   :0.00e+00                      Min.   :0.0000  
##  1st Qu.:0.00e+00                      1st Qu.:0.4616  
##  Median :5.09e+08                      Median :0.4651  
##  Mean   :1.95e+09                      Mean   :0.4674  
##  3rd Qu.:3.45e+09                      3rd Qu.:0.4710  
##  Max.   :9.98e+09                      Max.   :1.0000  
##  Interest-bearing debt interest rate  Tax rate (A)     Net Value Per Share (B)
##  Min.   :        0                   Min.   :0.00000   Min.   :0.0000         
##  1st Qu.:        0                   1st Qu.:0.00000   1st Qu.:0.1736         
##  Median :        0                   Median :0.07349   Median :0.1844         
##  Mean   : 16448013                   Mean   :0.11500   Mean   :0.1907         
##  3rd Qu.:        0                   3rd Qu.:0.20584   3rd Qu.:0.1996         
##  Max.   :990000000                   Max.   :1.00000   Max.   :1.0000         
##  Net Value Per Share (A) Net Value Per Share (C)
##  Min.   :0.0000          Min.   :0.0000         
##  1st Qu.:0.1736          1st Qu.:0.1737         
##  Median :0.1844          Median :0.1844         
##  Mean   :0.1906          Mean   :0.1907         
##  3rd Qu.:0.1996          3rd Qu.:0.1996         
##  Max.   :1.0000          Max.   :1.0000         
##  Persistent EPS in the Last Four Seasons Cash Flow Per Share
##  Min.   :0.0000                          Min.   :0.0000     
##  1st Qu.:0.2147                          1st Qu.:0.3177     
##  Median :0.2245                          Median :0.3225     
##  Mean   :0.2288                          Mean   :0.3235     
##  3rd Qu.:0.2388                          3rd Qu.:0.3286     
##  Max.   :1.0000                          Max.   :1.0000     
##  Revenue Per Share (Yuan ¥) Operating Profit Per Share (Yuan ¥)
##  Min.   :0.000e+00          Min.   :0.00000                    
##  1st Qu.:0.000e+00          1st Qu.:0.09608                    
##  Median :0.000e+00          Median :0.10423                    
##  Mean   :1.329e+06          Mean   :0.10909                    
##  3rd Qu.:0.000e+00          3rd Qu.:0.11616                    
##  Max.   :3.020e+09          Max.   :1.00000                    
##  Per Share Net profit before tax (Yuan ¥)
##  Min.   :0.0000                          
##  1st Qu.:0.1704                          
##  Median :0.1797                          
##  Mean   :0.1844                          
##  3rd Qu.:0.1935                          
##  Max.   :1.0000                          
##  Realized Sales Gross Profit Growth Rate Operating Profit Growth Rate
##  Min.   :0.00000                         Min.   :0.0000              
##  1st Qu.:0.02206                         1st Qu.:0.8480              
##  Median :0.02210                         Median :0.8480              
##  Mean   :0.02241                         Mean   :0.8480              
##  3rd Qu.:0.02215                         3rd Qu.:0.8481              
##  Max.   :1.00000                         Max.   :1.0000              
##  After-tax Net Profit Growth Rate Regular Net Profit Growth Rate
##  Min.   :0.0000                   Min.   :0.0000                
##  1st Qu.:0.6893                   1st Qu.:0.6893                
##  Median :0.6894                   Median :0.6894                
##  Mean   :0.6891                   Mean   :0.6892                
##  3rd Qu.:0.6896                   3rd Qu.:0.6896                
##  Max.   :1.0000                   Max.   :1.0000                
##  Continuous Net Profit Growth Rate Total Asset Growth Rate
##  Min.   :0.0000                    Min.   :0.000e+00      
##  1st Qu.:0.2176                    1st Qu.:4.860e+09      
##  Median :0.2176                    Median :6.400e+09      
##  Mean   :0.2176                    Mean   :5.508e+09      
##  3rd Qu.:0.2176                    3rd Qu.:7.390e+09      
##  Max.   :1.0000                    Max.   :9.990e+09      
##  Net Value Growth Rate Total Asset Return Growth Rate Ratio Cash Reinvestment %
##  Min.   :0.000e+00     Min.   :0.0000                       Min.   :0.0000     
##  1st Qu.:0.000e+00     1st Qu.:0.2638                       1st Qu.:0.3747     
##  Median :0.000e+00     Median :0.2640                       Median :0.3804     
##  Mean   :1.566e+06     Mean   :0.2642                       Mean   :0.3797     
##  3rd Qu.:0.000e+00     3rd Qu.:0.2644                       3rd Qu.:0.3867     
##  Max.   :9.330e+09     Max.   :1.0000                       Max.   :1.0000     
##  Current Ratio        Quick Ratio        Interest Expense Ratio
##  Min.   :0.000e+00   Min.   :0.000e+00   Min.   :0.0000        
##  1st Qu.:0.000e+00   1st Qu.:0.000e+00   1st Qu.:0.6306        
##  Median :0.000e+00   Median :0.000e+00   Median :0.6307        
##  Mean   :4.033e+05   Mean   :8.377e+06   Mean   :0.6310        
##  3rd Qu.:0.000e+00   3rd Qu.:0.000e+00   3rd Qu.:0.6311        
##  Max.   :2.750e+09   Max.   :9.230e+09   Max.   :1.0000        
##  Total debt/Total net worth  Debt ratio %     Net worth/Assets
##  Min.   :0.000e+00          Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.000e+00          1st Qu.:0.07289   1st Qu.:0.8512  
##  Median :0.000e+00          Median :0.11141   Median :0.8886  
##  Mean   :4.416e+06          Mean   :0.11318   Mean   :0.8868  
##  3rd Qu.:0.000e+00          3rd Qu.:0.14880   3rd Qu.:0.9271  
##  Max.   :9.940e+09          Max.   :1.00000   Max.   :1.0000  
##  Long-term fund suitability ratio (A) Borrowing dependency
##  Min.   :0.000000                     Min.   :0.0000      
##  1st Qu.:0.005244                     1st Qu.:0.3702      
##  Median :0.005665                     Median :0.3726      
##  Mean   :0.008783                     Mean   :0.3747      
##  3rd Qu.:0.006847                     3rd Qu.:0.3763      
##  Max.   :1.000000                     Max.   :1.0000      
##  Contingent liabilities/Net worth Operating profit/Paid-in capital
##  Min.   :0.000000                 Min.   :0.0000                  
##  1st Qu.:0.005366                 1st Qu.:0.0961                  
##  Median :0.005366                 Median :0.1041                  
##  Mean   :0.005968                 Mean   :0.1090                  
##  3rd Qu.:0.005764                 3rd Qu.:0.1159                  
##  Max.   :1.000000                 Max.   :1.0000                  
##  Net profit before tax/Paid-in capital
##  Min.   :0.0000                       
##  1st Qu.:0.1694                       
##  Median :0.1785                       
##  Mean   :0.1827                       
##  3rd Qu.:0.1916                       
##  Max.   :1.0000                       
##  Inventory and accounts receivable/Net value Total Asset Turnover
##  Min.   :0.0000                              Min.   :0.00000     
##  1st Qu.:0.3974                              1st Qu.:0.07646     
##  Median :0.4001                              Median :0.11844     
##  Mean   :0.4025                              Mean   :0.14161     
##  3rd Qu.:0.4046                              3rd Qu.:0.17691     
##  Max.   :1.0000                              Max.   :1.00000     
##  Accounts Receivable Turnover Average Collection Days
##  Min.   :0.000e+00            Min.   :0.000e+00      
##  1st Qu.:0.000e+00            1st Qu.:0.000e+00      
##  Median :0.000e+00            Median :0.000e+00      
##  Mean   :1.279e+07            Mean   :9.826e+06      
##  3rd Qu.:0.000e+00            3rd Qu.:0.000e+00      
##  Max.   :9.740e+09            Max.   :9.730e+09      
##  Inventory Turnover Rate (times) Fixed Assets Turnover Frequency
##  Min.   :0.000e+00               Min.   :0.000e+00              
##  1st Qu.:0.000e+00               1st Qu.:0.000e+00              
##  Median :0.000e+00               Median :0.000e+00              
##  Mean   :2.149e+09               Mean   :1.009e+09              
##  3rd Qu.:4.620e+09               3rd Qu.:0.000e+00              
##  Max.   :9.990e+09               Max.   :9.990e+09              
##  Net Worth Turnover Rate (times) Revenue per person 
##  Min.   :0.00000                 Min.   :0.000e+00  
##  1st Qu.:0.02177                 1st Qu.:0.000e+00  
##  Median :0.02952                 Median :0.000e+00  
##  Mean   :0.03860                 Mean   :2.326e+06  
##  3rd Qu.:0.04290                 3rd Qu.:0.000e+00  
##  Max.   :1.00000                 Max.   :8.810e+09  
##  Operating profit per person Allocation rate per person
##  Min.   :0.0000              Min.   :0.000e+00         
##  1st Qu.:0.3924              1st Qu.:0.000e+00         
##  Median :0.3959              Median :0.000e+00         
##  Mean   :0.4007              Mean   :1.126e+07         
##  3rd Qu.:0.4019              3rd Qu.:0.000e+00         
##  Max.   :1.0000              Max.   :9.570e+09         
##  Working Capital to Total Assets Quick Assets/Total Assets
##  Min.   :0.0000                  Min.   :0.0000           
##  1st Qu.:0.7743                  1st Qu.:0.2420           
##  Median :0.8103                  Median :0.3865           
##  Mean   :0.8141                  Mean   :0.4001           
##  3rd Qu.:0.8504                  3rd Qu.:0.5406           
##  Max.   :1.0000                  Max.   :1.0000           
##  Current Assets/Total Assets Cash/Total Assets Quick Assets/Current Liability
##  Min.   :0.0000              Min.   :0.00000   Min.   :0.000e+00             
##  1st Qu.:0.3528              1st Qu.:0.03354   1st Qu.:0.000e+00             
##  Median :0.5148              Median :0.07489   Median :0.000e+00             
##  Mean   :0.5223              Mean   :0.12409   Mean   :3.593e+06             
##  3rd Qu.:0.6891              3rd Qu.:0.16107   3rd Qu.:0.000e+00             
##  Max.   :1.0000              Max.   :1.00000   Max.   :8.820e+09             
##  Cash/Current Liability Current Liability to Assets
##  Min.   :0.000e+00      Min.   :0.00000            
##  1st Qu.:0.000e+00      1st Qu.:0.05330            
##  Median :0.000e+00      Median :0.08270            
##  Mean   :3.716e+07      Mean   :0.09067            
##  3rd Qu.:0.000e+00      3rd Qu.:0.11952            
##  Max.   :9.650e+09      Max.   :1.00000            
##  Operating Funds to Liability Inventory/Working Capital
##  Min.   :0.0000               Min.   :0.0000           
##  1st Qu.:0.3410               1st Qu.:0.2770           
##  Median :0.3486               Median :0.2772           
##  Mean   :0.3538               Mean   :0.2774           
##  3rd Qu.:0.3609               3rd Qu.:0.2774           
##  Max.   :1.0000               Max.   :1.0000           
##  Inventory/Current Liability Current Liabilities/Liability
##  Min.   :0.000e+00           Min.   :0.0000               
##  1st Qu.:0.000e+00           1st Qu.:0.6270               
##  Median :0.000e+00           Median :0.8069               
##  Mean   :5.581e+07           Mean   :0.7616               
##  3rd Qu.:0.000e+00           3rd Qu.:0.9420               
##  Max.   :9.910e+09           Max.   :1.0000               
##  Working Capital/Equity Current Liabilities/Equity
##  Min.   :0.0000         Min.   :0.0000            
##  1st Qu.:0.7336         1st Qu.:0.3281            
##  Median :0.7360         Median :0.3297            
##  Mean   :0.7358         Mean   :0.3314            
##  3rd Qu.:0.7386         3rd Qu.:0.3323            
##  Max.   :1.0000         Max.   :1.0000            
##  Long-term Liability to Current Assets Retained Earnings to Total Assets
##  Min.   :0.000e+00                     Min.   :0.0000                   
##  1st Qu.:0.000e+00                     1st Qu.:0.9311                   
##  Median :0.000e+00                     Median :0.9377                   
##  Mean   :5.416e+07                     Mean   :0.9347                   
##  3rd Qu.:0.000e+00                     3rd Qu.:0.9448                   
##  Max.   :9.540e+09                     Max.   :1.0000                   
##  Total income/Total expense Total expense/Assets Current Asset Turnover Rate
##  Min.   :0.000000           Min.   :0.00000      Min.   :0.000e+00          
##  1st Qu.:0.002236           1st Qu.:0.01457      1st Qu.:0.000e+00          
##  Median :0.002336           Median :0.02267      Median :0.000e+00          
##  Mean   :0.002549           Mean   :0.02918      Mean   :1.196e+09          
##  3rd Qu.:0.002492           3rd Qu.:0.03593      3rd Qu.:0.000e+00          
##  Max.   :1.000000           Max.   :1.00000      Max.   :1.000e+10          
##  Quick Asset Turnover Rate Working capitcal Turnover Rate Cash Turnover Rate 
##  Min.   :0.000e+00         Min.   :0.0000                 Min.   :0.000e+00  
##  1st Qu.:0.000e+00         1st Qu.:0.5939                 1st Qu.:0.000e+00  
##  Median :0.000e+00         Median :0.5940                 Median :1.080e+09  
##  Mean   :2.164e+09         Mean   :0.5940                 Mean   :2.472e+09  
##  3rd Qu.:4.900e+09         3rd Qu.:0.5940                 3rd Qu.:4.510e+09  
##  Max.   :1.000e+10         Max.   :1.0000                 Max.   :1.000e+10  
##  Cash Flow to Sales Fixed Assets to Assets Current Liability to Liability
##  Min.   :0.0000     Min.   :0.00e+00       Min.   :0.0000                
##  1st Qu.:0.6716     1st Qu.:0.00e+00       1st Qu.:0.6270                
##  Median :0.6716     Median :0.00e+00       Median :0.8069                
##  Mean   :0.6715     Mean   :1.22e+06       Mean   :0.7616                
##  3rd Qu.:0.6716     3rd Qu.:0.00e+00       3rd Qu.:0.9420                
##  Max.   :1.0000     Max.   :8.32e+09       Max.   :1.0000                
##  Current Liability to Equity Equity to Long-term Liability
##  Min.   :0.0000              Min.   :0.0000               
##  1st Qu.:0.3281              1st Qu.:0.1109               
##  Median :0.3297              Median :0.1123               
##  Mean   :0.3314              Mean   :0.1156               
##  3rd Qu.:0.3323              3rd Qu.:0.1171               
##  Max.   :1.0000              Max.   :1.0000               
##  Cash Flow to Total Assets Cash Flow to Liability CFO to Assets   
##  Min.   :0.0000            Min.   :0.0000         Min.   :0.0000  
##  1st Qu.:0.6333            1st Qu.:0.4571         1st Qu.:0.5660  
##  Median :0.6454            Median :0.4598         Median :0.5933  
##  Mean   :0.6497            Mean   :0.4618         Mean   :0.5934  
##  3rd Qu.:0.6631            3rd Qu.:0.4642         3rd Qu.:0.6248  
##  Max.   :1.0000            Max.   :1.0000         Max.   :1.0000  
##  Cash Flow to Equity Current Liability to Current Assets Liability-Assets Flag
##  Min.   :0.0000      Min.   :0.00000                     Min.   :0.000000     
##  1st Qu.:0.3130      1st Qu.:0.01803                     1st Qu.:0.000000     
##  Median :0.3150      Median :0.02760                     Median :0.000000     
##  Mean   :0.3156      Mean   :0.03151                     Mean   :0.001173     
##  3rd Qu.:0.3177      3rd Qu.:0.03837                     3rd Qu.:0.000000     
##  Max.   :1.0000      Max.   :1.00000                     Max.   :1.000000     
##  Net Income to Total Assets Total assets to GNP price No-credit Interval
##  Min.   :0.0000             Min.   :0.000e+00         Min.   :0.0000    
##  1st Qu.:0.7967             1st Qu.:0.000e+00         1st Qu.:0.6236    
##  Median :0.8106             Median :0.000e+00         Median :0.6239    
##  Mean   :0.8078             Mean   :1.863e+07         Mean   :0.6239    
##  3rd Qu.:0.8265             3rd Qu.:0.000e+00         3rd Qu.:0.6242    
##  Max.   :1.0000             Max.   :9.820e+09         Max.   :1.0000    
##  Gross Profit to Sales Net Income to Stockholder's Equity Liability to Equity
##  Min.   :0.0000        Min.   :0.0000                     Min.   :0.0000     
##  1st Qu.:0.6004        1st Qu.:0.8401                     1st Qu.:0.2769     
##  Median :0.6060        Median :0.8412                     Median :0.2788     
##  Mean   :0.6079        Mean   :0.8404                     Mean   :0.2804     
##  3rd Qu.:0.6139        3rd Qu.:0.8424                     3rd Qu.:0.2814     
##  Max.   :1.0000        Max.   :1.0000                     Max.   :1.0000     
##  Degree of Financial Leverage (DFL)
##  Min.   :0.00000                   
##  1st Qu.:0.02679                   
##  Median :0.02681                   
##  Mean   :0.02754                   
##  3rd Qu.:0.02691                   
##  Max.   :1.00000                   
##  Interest Coverage Ratio (Interest expense to EBIT) Net Income Flag
##  Min.   :0.0000                                     Min.   :1      
##  1st Qu.:0.5652                                     1st Qu.:1      
##  Median :0.5653                                     Median :1      
##  Mean   :0.5654                                     Mean   :1      
##  3rd Qu.:0.5657                                     3rd Qu.:1      
##  Max.   :1.0000                                     Max.   :1      
##  Equity to Liability
##  Min.   :0.00000    
##  1st Qu.:0.02448    
##  Median :0.03380    
##  Mean   :0.04758    
##  3rd Qu.:0.05284    
##  Max.   :1.00000
  2. Identify label column & basic EDA
# ---- label detection ----
# Look for a conventionally named response column in `dat`; if none of the
# candidate names is present, list the two-level columns and abort.
candidate_labels <- c("Bankrupt?", "Bankrupt", "bankrupt", "Class", "class", "Target", "target", "Y", "y")
label_name <- intersect(candidate_labels, names(dat))

if (length(label_name) == 0) {
  # vapply() always yields a logical vector (even for a zero-column `dat`),
  # so the name subsetting below never receives a list-typed index.
  is_two_level <- vapply(dat, function(x) length(unique(x)) == 2, logical(1))
  two_level_cols <- names(dat)[is_two_level]
  message("No standard label name found. Two-level columns are:\n",
          paste(two_level_cols, collapse = ", "))
  stop("Please set label_name manually (likely one of the two-level columns).")
}

# If multiple matched, use the first
label_name <- label_name[1]
label_name
## [1] "Bankrupt?"
# ---- coerce label to factor ----
# The label is expected to be coded 0/1 (0 -> "No", 1 -> "Yes").
label_raw <- dat[[label_name]]
label_fac <- factor(label_raw, levels = c(0, 1), labels = c("No", "Yes"))

# factor() silently turns every value outside `levels` into NA; fail loudly
# instead of letting an unexpected coding become missing labels.
unexpected <- is.na(label_fac) & !is.na(label_raw)
if (any(unexpected)) {
  stop(
    "Label column '", label_name, "' contains values other than 0/1: ",
    paste(unique(label_raw[unexpected]), collapse = ", ")
  )
}
dat[[label_name]] <- label_fac

# class counts
table(dat[[label_name]])
## 
##   No  Yes 
## 6599  220
# share of each class in the full data set
proportions(table(dat[[label_name]]))
## 
##        No       Yes 
## 0.9677372 0.0322628
# Bar chart of the number of observations in each label class.
class_balance_title <- paste("Class Balance:", label_name)
ggplot(data = dat, mapping = aes(x = .data[[label_name]])) +
  geom_bar() +
  labs(title = class_balance_title, x = label_name, y = "Count")

  3. Train/Test split (stratified)
# 70/30 split on the label column; rows not drawn for training form the test set.
idx_train <- createDataPartition(y = dat[[label_name]], p = 0.7, list = FALSE)
test_df  <- dat[-idx_train, ]
train_df <- dat[idx_train, ]

# class proportions in the training partition
proportions(table(train_df[[label_name]]))
## 
##         No        Yes 
## 0.96774194 0.03225806
# class proportions in the test partition
proportions(table(test_df[[label_name]]))
## 
##         No        Yes 
## 0.96772616 0.03227384
# Separate the predictors from the response in both partitions.
X_train <- select(train_df, -all_of(label_name))
X_test  <- select(test_df, -all_of(label_name))

y_train <- train_df[[label_name]]
y_test  <- test_df[[label_name]]

# number of missing predictor values per partition
sum(is.na(X_train))
## [1] 0
sum(is.na(X_test))
## [1] 0
  4. Preprocess: Impute + Standardize
# ---- impute (median) ----
# The imputation model is learned on the training predictors and then applied
# to both partitions.
pp_impute <- preProcess(X_train, method = "medianImpute")
X_train_imp <- predict(pp_impute, newdata = X_train)
X_test_imp  <- predict(pp_impute, newdata = X_test)

# ---- remove zero / near-zero variance predictors ----
# nearZeroVar() returns the indices of the columns to drop (zero-variance
# columns included); the same columns are removed from train and test.
nzv <- nearZeroVar(X_train_imp)
keep_cols <- setdiff(seq_len(ncol(X_train_imp)), nzv)
X_train_imp <- X_train_imp[, keep_cols, drop = FALSE]
X_test_imp  <- X_test_imp[, keep_cols, drop = FALSE]

# ---- scale (center/scale) ----
# Centering/scaling parameters also come from the training set only.
pp_scale <- preProcess(X_train_imp, method = c("center", "scale"))
X_train_sc <- predict(pp_scale, newdata = X_train_imp)
X_test_sc  <- predict(pp_scale, newdata = X_test_imp)
  5. Topic A — PCA (dimension reduction)
5.1 Fit PCA on training set
# The predictors were centered and scaled beforehand, so prcomp() is told not
# to standardize again.
pca_fit <- prcomp(X_train_sc, center = FALSE, scale. = FALSE)

# proportion of variance explained per component, and its running total
pc_var <- pca_fit$sdev^2
pve <- pc_var / sum(pc_var)
cum_pve <- cumsum(pve)

# smallest number of components whose cumulative PVE reaches 80%
k_pca <- match(TRUE, cum_pve >= 0.80)
k_pca
## [1] 32
# One row per principal component: index, PVE and cumulative PVE.
pve_df <- data.frame(
  PC = seq_along(pve),
  PVE = pve,
  CumPVE = cum_pve
)

# scree plot
ggplot(data = pve_df, mapping = aes(x = PC, y = PVE)) +
  geom_line() +
  labs(
    title = "Scree Plot (PVE by Principal Component)",
    x = "PC",
    y = "Proportion of Variance Explained"
  )

# cumulative PVE with the 80% threshold marked
ggplot(data = pve_df, mapping = aes(x = PC, y = CumPVE)) +
  geom_line() +
  geom_hline(yintercept = 0.80, linetype = "dashed") +
  labs(
    title = "Cumulative PVE",
    x = "PC",
    y = "Cumulative Proportion"
  )

5.2 PCA scatter plot (PC1 vs PC2)

# Scores on the first two components, with the training label attached.
pc_train_2 <- cbind(as.data.frame(pca_fit$x[, 1:2]), y = y_train)

ggplot(data = pc_train_2, mapping = aes(x = PC1, y = PC2, color = y)) +
  geom_point(alpha = 0.6) +
  labs(
    title = "PCA Scatter (Train): PC1 vs PC2",
    color = label_name
  )

  6. Topic B — Classification (PCA + KNN)
# ---- project data onto top PCs ----
# seq_len() + drop = FALSE keep the scores as a matrix even when k_pca is 1.
pc_idx <- seq_len(k_pca)
train_scores <- as.data.frame(pca_fit$x[, pc_idx, drop = FALSE])
test_scores  <- as.data.frame(predict(pca_fit, newdata = X_test_sc)[, pc_idx, drop = FALSE])

train_mat <- as.matrix(train_scores)
test_mat  <- as.matrix(test_scores)

# ---- choose K for KNN ----
# K is tuned by 5-fold cross-validation inside the training set. Selecting K
# by test-set AUC would reuse the test data for model selection and make the
# test AUC reported afterwards optimistically biased.
# NOTE: the PCA rotation is still the one fitted on the full training set.
K_grid <- c(1, 3, 5, 7, 9, 15, 25, 35, 51)
cv_folds <- createFolds(y_train, k = 5, list = TRUE, returnTrain = FALSE)

# Mean validation AUC over the CV folds for one neighbourhood size K.
cv_auc_for_k <- function(K) {
  fold_auc <- vapply(cv_folds, function(val_idx) {
    fit <- knn(
      train = train_mat[-val_idx, , drop = FALSE],
      test = train_mat[val_idx, , drop = FALSE],
      cl = y_train[-val_idx],
      k = K,
      prob = TRUE
    )
    # attr "prob" is the vote share of the predicted class; convert to P(Yes)
    win_share <- attr(fit, "prob")
    prob_yes <- ifelse(fit == "Yes", win_share, 1 - win_share)
    # direction is fixed so that a fold where the score ranks the classes the
    # wrong way round is not silently flipped to an AUC above 0.5
    roc_obj <- roc(
      response = y_train[val_idx], predictor = prob_yes,
      levels = c("No", "Yes"), direction = "<", quiet = TRUE
    )
    as.numeric(auc(roc_obj))
  }, numeric(1))
  mean(fold_auc)
}

auc_knn <- vapply(K_grid, cv_auc_for_k, numeric(1))

# tuning table, best (highest cross-validated AUC) first
knn_tune <- data.frame(K = K_grid, AUC = auc_knn) %>% arrange(desc(AUC))
knn_tune
##    K       AUC
## 1 35 0.9257430
## 2 51 0.9253832
## 3 25 0.9190592
## 4 15 0.8707106
## 5  9 0.8363039
## 6  7 0.8254705
## 7  5 0.7708477
## 8  3 0.7237471
## 9  1 0.6320647
# The tuning table is sorted by decreasing AUC, so its first row is the winner.
bestK <- head(knn_tune$K, n = 1)
bestK
## [1] 35
# Fit KNN with the selected K and score the test set.
pred_knn <- knn(train_mat, test_mat, cl = y_train, k = bestK, prob = TRUE)
# attr "prob" is the vote share of the predicted class; convert to P(Yes)
win_share <- attr(pred_knn, "prob")
prob_yes_knn <- ifelse(pred_knn == "Yes", win_share, 1 - win_share)

# confusion matrix
confusionMatrix(data = pred_knn, reference = y_test, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1979   65
##        Yes    0    1
##                                           
##                Accuracy : 0.9682          
##                  95% CI : (0.9597, 0.9754)
##     No Information Rate : 0.9677          
##     P-Value [Acc > NIR] : 0.4828          
##                                           
##                   Kappa : 0.0289          
##                                           
##  Mcnemar's Test P-Value : 2.051e-15       
##                                           
##             Sensitivity : 0.015152        
##             Specificity : 1.000000        
##          Pos Pred Value : 1.000000        
##          Neg Pred Value : 0.968200        
##              Prevalence : 0.032274        
##          Detection Rate : 0.000489        
##    Detection Prevalence : 0.000489        
##       Balanced Accuracy : 0.507576        
##                                           
##        'Positive' Class : Yes             
## 
# ROC / AUC of the PCA + KNN scores on the test set
roc_knn <- roc(
  response = y_test,
  predictor = prob_yes_knn,
  levels = c("No","Yes"),
  quiet = TRUE
)
knn_roc_title <- paste0("ROC: PCA + KNN (K=", bestK, "), AUC=", round(auc(roc_knn), 3))
plot(roc_knn, main = knn_roc_title)

  7. Topic C — Classification (Random Forest on scaled original features)
# Modelling frames: response `y` plus every scaled predictor.
train_rf <- data.frame(y = y_train, X_train_sc)
test_rf  <- data.frame(y = y_test,  X_test_sc)

# Probability forest with 500 trees and mtry = floor(sqrt(p)).
n_try <- floor(sqrt(ncol(X_train_sc)))
rf_fit <- ranger(
  formula = y ~ .,
  data = train_rf,
  num.trees = 500,
  mtry = n_try,
  importance = "impurity",
  probability = TRUE
)

# P(Yes) on the test set, turned into a class label at the 0.5 cut-off
rf_scores <- predict(rf_fit, data = test_rf)
rf_prob <- rf_scores$predictions[, "Yes"]
rf_pred <- factor(ifelse(rf_prob >= 0.5, "Yes", "No"), levels = c("No","Yes"))

confusionMatrix(data = rf_pred, reference = y_test, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1976   53
##        Yes    3   13
##                                           
##                Accuracy : 0.9726          
##                  95% CI : (0.9646, 0.9792)
##     No Information Rate : 0.9677          
##     P-Value [Acc > NIR] : 0.1155          
##                                           
##                   Kappa : 0.3084          
##                                           
##  Mcnemar's Test P-Value : 5.835e-11       
##                                           
##             Sensitivity : 0.196970        
##             Specificity : 0.998484        
##          Pos Pred Value : 0.812500        
##          Neg Pred Value : 0.973879        
##              Prevalence : 0.032274        
##          Detection Rate : 0.006357        
##    Detection Prevalence : 0.007824        
##       Balanced Accuracy : 0.597727        
##                                           
##        'Positive' Class : Yes             
## 
# ROC curve of the all-features random forest on the test set
roc_rf <- roc(
  response = y_test,
  predictor = rf_prob,
  levels = c("No","Yes"),
  quiet = TRUE
)
rf_roc_title <- paste0("ROC: Random Forest, AUC=", round(auc(roc_rf), 3))
plot(roc_rf, main = rf_roc_title)

# impurity importances, largest first; keep the top 20 for plotting
imp <- sort(rf_fit$variable.importance, decreasing = TRUE)
imp_df <- head(
  data.frame(feature = names(imp), importance = as.numeric(imp)),
  n = 20
)

ggplot(data = imp_df, mapping = aes(x = reorder(feature, importance), y = importance)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 20 Feature Importances (Random Forest)",
    x = "",
    y = "Importance"
  )

  8. Topic D — FDR control (BH) for feature screening
# compute p-values per feature (train set only)
# Wilcoxon rank-sum test of each scaled predictor between the two label classes.
# vapply() guarantees a named numeric vector whatever the number of features.
pvals <- vapply(names(X_train_sc), function(v) {
  wilcox.test(X_train_sc[[v]] ~ y_train)$p.value
}, numeric(1))

# BH adjustment
q <- 0.10
p_adj <- p.adjust(pvals, method = "BH")

# which() skips features whose adjusted p-value is NA/NaN; plain logical
# indexing would put an NA entry into the selected feature names instead.
sig_features <- names(p_adj)[which(p_adj <= q)]
length(sig_features)
## [1] 87
head(sig_features, 30)
##  [1] "ROA(C) before interest and depreciation before interest"
##  [2] "ROA(A) before interest and % after tax"                 
##  [3] "ROA(B) before interest and depreciation after tax"      
##  [4] "Operating Gross Margin"                                 
##  [5] "Realized Sales Gross Margin"                            
##  [6] "Operating Profit Rate"                                  
##  [7] "Pre-tax net Interest Rate"                              
##  [8] "After-tax net Interest Rate"                            
##  [9] "Non-industry income and expenditure/revenue"            
## [10] "Continuous interest rate (after tax)"                   
## [11] "Research and development expense rate"                  
## [12] "Cash flow rate"                                         
## [13] "Interest-bearing debt interest rate"                    
## [14] "Tax rate (A)"                                           
## [15] "Net Value Per Share (B)"                                
## [16] "Net Value Per Share (A)"                                
## [17] "Net Value Per Share (C)"                                
## [18] "Persistent EPS in the Last Four Seasons"                
## [19] "Cash Flow Per Share"                                    
## [20] "Revenue Per Share (Yuan ¥)"                             
## [21] "Operating Profit Per Share (Yuan ¥)"                    
## [22] "Per Share Net profit before tax (Yuan ¥)"               
## [23] "Realized Sales Gross Profit Growth Rate"                
## [24] "Operating Profit Growth Rate"                           
## [25] "After-tax Net Profit Growth Rate"                       
## [26] "Regular Net Profit Growth Rate"                         
## [27] "Continuous Net Profit Growth Rate"                      
## [28] "Total Asset Growth Rate"                                
## [29] "Net Value Growth Rate"                                  
## [30] "Total Asset Return Growth Rate Ratio"
# Raw and BH-adjusted p-values, one row per screened feature.
pv_df <- data.frame(feature = names(pvals), pval = as.numeric(pvals), padj = as.numeric(p_adj))

# distribution of the raw p-values
ggplot(data = pv_df, mapping = aes(x = pval)) +
  geom_histogram(bins = 40) +
  labs(
    title = "Histogram of Raw p-values (Train)",
    x = "p-value",
    y = "Count"
  )

# distribution of the BH-adjusted p-values
ggplot(data = pv_df, mapping = aes(x = padj)) +
  geom_histogram(bins = 40) +
  labs(
    title = "Histogram of BH-adjusted p-values (Train)",
    x = "BH adjusted p-value",
    y = "Count"
  )

  9. Classification after FDR screening (FDR + Random Forest)
# Modelling frames restricted to the features kept by the BH screening.
train_rf_fdr <- data.frame(y = y_train, X_train_sc[sig_features])
test_rf_fdr  <- data.frame(y = y_test,  X_test_sc[sig_features])

# Same forest settings as before; mtry is floor(sqrt(|S|)), but at least 1.
n_try_fdr <- max(1, floor(sqrt(length(sig_features))))
rf_fit_fdr <- ranger(
  formula = y ~ .,
  data = train_rf_fdr,
  num.trees = 500,
  mtry = n_try_fdr,
  importance = "impurity",
  probability = TRUE
)

# P(Yes) on the test set, turned into a class label at the 0.5 cut-off
rf_scores_fdr <- predict(rf_fit_fdr, data = test_rf_fdr)
rf_prob_fdr <- rf_scores_fdr$predictions[, "Yes"]
rf_pred_fdr <- factor(ifelse(rf_prob_fdr >= 0.5, "Yes", "No"), levels = c("No","Yes"))

confusionMatrix(data = rf_pred_fdr, reference = y_test, positive = "Yes")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   No  Yes
##        No  1975   51
##        Yes    4   15
##                                           
##                Accuracy : 0.9731          
##                  95% CI : (0.9651, 0.9797)
##     No Information Rate : 0.9677          
##     P-Value [Acc > NIR] : 0.09179         
##                                           
##                   Kappa : 0.3435          
##                                           
##  Mcnemar's Test P-Value : 5.552e-10       
##                                           
##             Sensitivity : 0.227273        
##             Specificity : 0.997979        
##          Pos Pred Value : 0.789474        
##          Neg Pred Value : 0.974827        
##              Prevalence : 0.032274        
##          Detection Rate : 0.007335        
##    Detection Prevalence : 0.009291        
##       Balanced Accuracy : 0.612626        
##                                           
##        'Positive' Class : Yes             
## 
# ROC / AUC of the FDR-screened random forest on the test set
roc_rf_fdr <- roc(
  response = y_test,
  predictor = rf_prob_fdr,
  levels = c("No","Yes"),
  quiet = TRUE
)
auc(roc_rf_fdr)
## Area under the curve: 0.9498
# Overlay the two random-forest ROC curves. Each curve gets its own colour,
# repeated in the legend, so the curves can be told apart.
roc_cols <- c("black", "red")
plot(roc_rf, col = roc_cols[1], main = "ROC Comparison: RF(All) vs RF(FDR-selected)")
lines(roc_rf_fdr, col = roc_cols[2])
legend("bottomright",
       legend = c(paste0("RF All AUC=", round(auc(roc_rf), 3)),
                  paste0("RF FDR AUC=", round(auc(roc_rf_fdr), 3))),
       col = roc_cols, lty = 1, bty = "n")

  10. Logistic regression baseline (for comparison only)
# Logistic regression on all scaled predictors.
train_glm <- data.frame(y = y_train, X_train_sc)
test_glm  <- data.frame(y = y_test,  X_test_sc)

glm_fit <- glm(formula = y ~ ., family = binomial(), data = train_glm)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_prob <- predict(glm_fit, newdata = test_glm, type = "response")

roc_glm <- roc(
  response = y_test,
  predictor = glm_prob,
  levels = c("No","Yes"),
  quiet = TRUE
)
glm_roc_title <- paste0("ROC: Logistic Regression (All), AUC=", round(auc(roc_glm), 3))
plot(roc_glm, main = glm_roc_title)

# Logistic regression on the FDR-selected predictors (reuses the RF frames).
glm_fit_fdr <- glm(formula = y ~ ., family = binomial(), data = train_rf_fdr)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_prob_fdr <- predict(glm_fit_fdr, newdata = test_rf_fdr, type = "response")

roc_glm_fdr <- roc(
  response = y_test,
  predictor = glm_prob_fdr,
  levels = c("No","Yes"),
  quiet = TRUE
)
glm_fdr_roc_title <- paste0("ROC: Logistic Regression (FDR), AUC=", round(auc(roc_glm_fdr), 3))
plot(roc_glm_fdr, main = glm_fdr_roc_title)

  11. Final summary table (AUC comparison)
# Test-set AUC of every fitted model, one row per model.
n_sig <- length(sig_features)
model_labels <- c(
  paste0("PCA+KNN (K=", bestK, ", PCs=", k_pca, ")"),
  "Random Forest (All features)",
  paste0("Random Forest (FDR q=", q, ", |S|=", n_sig, ")"),
  "Logistic (All features)",
  paste0("Logistic (FDR q=", q, ", |S|=", n_sig, ")")
)
# ROC objects listed in the same order as the labels above
model_rocs <- list(roc_knn, roc_rf, roc_rf_fdr, roc_glm, roc_glm_fdr)
model_aucs <- vapply(model_rocs, function(r) as.numeric(auc(r)), numeric(1))

auc_summary <- data.frame(Model = model_labels, AUC = model_aucs)

# best model first
arrange(auc_summary, desc(AUC))
##                               Model       AUC
## 1 Random Forest (FDR q=0.1, |S|=87) 0.9498293
## 2      Random Forest (All features) 0.9483057
## 3            PCA+KNN (K=35, PCs=32) 0.9257430
## 4      Logistic (FDR q=0.1, |S|=87) 0.8902415
## 5           Logistic (All features) 0.8728237