Project Overview

This project predicts corporate bankruptcy using five years of financial ratio data (64 attributes per company). We apply three models — Logistic Regression, Random Forest, and Classification Trees (standard and asymmetric cost) — and evaluate each using confusion matrices, ROC curves, and cross-validation.

Dataset: Polish Companies Bankruptcy Data (ARFF format, 1–5 year files)
Tools: R, RStudio — packages: farff, glmnet, ROCR, randomForest, rpart, rpart.plot, boot, dplyr


Part 1: Data Loading & Setup

Install Packages

# Install only the packages that are not yet available, so re-running the
# script does not reinstall all eight packages (and hit the network) each time.
required_pkgs <- c("farff", "glmnet", "ROCR", "randomForest",
                   "rpart", "rpart.plot", "boot", "dplyr")
is_installed <- vapply(required_pkgs, requireNamespace, logical(1),
                       quietly = TRUE)
if (any(!is_installed)) {
  install.packages(required_pkgs[!is_installed])
}

Load Libraries

library(farff)
library(glmnet)
library(ROCR)
library(dplyr)
library(randomForest)
library(rpart)
library(rpart.plot)
library(boot)

Load & Combine All 5 Years

Note: The chunk below reads the five ARFF files (1year.arff, 2year.arff, 3year.arff, 4year.arff, 5year.arff) from a hard-coded absolute folder path. Before running, update that path to the folder where you saved the files, and adjust the filenames if yours differ.

# Folder that holds the five ARFF files. This is the only place the location
# is written down; change this single value to point at your own copy.
data_dir <- "/Users/a/Desktop/BUS 193/polish+companies+bankruptcy+data (1)"

# Read the file "<k>year.arff" from data_dir and tag every row with a numeric
# `year` column equal to k.
read_year <- function(k) {
  yearly <- readARFF(file.path(data_dir, paste0(k, "year.arff")))
  yearly$year <- k
  yearly
}

data_1yr <- read_year(1)
data_2yr <- read_year(2)
data_3yr <- read_year(3)
data_4yr <- read_year(4)
data_5yr <- read_year(5)

# Stack the five yearly tables into one dataset and keep a copy on disk
bankruptcy <- rbind(data_1yr, data_2yr, data_3yr, data_4yr, data_5yr)
save(bankruptcy, file = "bankruptcy_data.RData")

Clean Data

# Make sure the outcome is a factor so classifiers treat it as a label
bankruptcy$class <- as.factor(bankruptcy$class)

# Fill the NAs of a numeric column with that column's median; any
# non-numeric column (e.g. the class factor) is returned untouched.
fill_with_median <- function(column) {
  if (!is.numeric(column)) {
    return(column)
  }
  column[is.na(column)] <- median(column, na.rm = TRUE)
  column
}

# Median-impute every numeric column so that no rows have to be dropped
bankruptcy_clean <- as.data.frame(lapply(bankruptcy, fill_with_median))

# Re-assert the factor type of the outcome on the imputed copy. The imputation
# step passes non-numeric columns through unchanged, so this is a safeguard.
bankruptcy_clean$class <- as.factor(bankruptcy_clean$class)

Part 2: Logistic Regression

Exploratory Data Analysis

# Show the type and the first few values of every column in the combined data
str(bankruptcy)
## 'data.frame':    43405 obs. of  66 variables:
##  $ Attr1 : num  0.2006 0.2091 0.2487 0.0815 0.1873 ...
##  $ Attr2 : num  0.38 0.5 0.696 0.307 0.613 ...
##  $ Attr3 : num  0.396 0.472 0.267 0.459 0.23 ...
##  $ Attr4 : num  2.05 1.94 1.55 2.49 1.41 ...
##  $ Attr5 : num  32.35 14.79 -1.15 51.95 -7.31 ...
##  $ Attr6 : num  0.388 0 0 0.15 0.187 ...
##  $ Attr7 : num  0.2498 0.2583 0.3091 0.0927 0.1873 ...
##  $ Attr8 : num  1.331 0.996 0.437 1.866 0.631 ...
##  $ Attr9 : num  1.14 1.7 1.31 1.06 1.16 ...
##  $ Attr10: num  0.505 0.498 0.304 0.574 0.387 ...
##  $ Attr11: num  0.2498 0.2611 0.3126 0.0927 0.1873 ...
##  $ Attr12: num  0.66 0.517 0.642 0.302 0.331 ...
##  $ Attr13: num  0.1666 0.1583 0.2444 0.0943 0.1218 ...
##  $ Attr14: num  0.2498 0.2583 0.3091 0.0927 0.1873 ...
##  $ Attr15: num  497 678 794 917 1133 ...
##  $ Attr16: num  0.734 0.538 0.46 0.398 0.322 ...
##  $ Attr17: num  2.63 2 1.44 3.25 1.63 ...
##  $ Attr18: num  0.2498 0.2583 0.3091 0.0927 0.1873 ...
##  $ Attr19: num  0.1494 0.152 0.2361 0.0714 0.1155 ...
##  $ Attr20: num  43.4 88 73.1 79.8 57 ...
##  $ Attr21: num  1.25 1.43 1.43 1.51 NA ...
##  $ Attr22: num  0.214 0.248 0.303 0.116 0.198 ...
##  $ Attr23: num  0.12 0.123 0.19 0.0628 0.1155 ...
##  $ Attr24: num  0.477 NA NA 0.172 0.187 ...
##  $ Attr25: num  0.505 0.395 0.289 0.574 0.387 ...
##  $ Attr26: num  0.604 0.44 0.373 0.362 0.322 ...
##  $ Attr27: num  1.458 88.444 86.011 0.941 1.414 ...
##  $ Attr28: num  1.76 16.95 1.06 1.96 1.12 ...
##  $ Attr29: num  5.94 3.69 4.37 4.65 4.14 ...
##  $ Attr30: num  0.118 0.27 0.419 0.143 0.279 ...
##  $ Attr31: num  0.1494 0.152 0.2382 0.0714 0.1155 ...
##  $ Attr32: num  94.1 122.2 176.9 91.4 147 ...
##  $ Attr33: num  3.88 2.99 2.06 3.99 2.48 ...
##  $ Attr34: num  0.564 2.988 1.427 0.376 0.323 ...
##  $ Attr35: num  0.214 0.206 0.316 0.116 0.198 ...
##  $ Attr36: num  1.74 1.7 1.31 1.36 1.63 ...
##  $ Attr37: num  593.3 NA 2.3 NA 11.2 ...
##  $ Attr38: num  0.506 0.498 0.515 0.574 0.435 ...
##  $ Attr39: num  0.128 0.121 0.241 0.089 0.122 ...
##  $ Attr40: num  0.663 0.0864 0.322 0.4014 0.293 ...
##  $ Attr41: num  0.0514 0.0644 0.074 0.0696 0.0967 ...
##  $ Attr42: num  0.128 0.146 0.231 0.089 0.122 ...
##  $ Attr43: num  114 199 166 181 142 ...
##  $ Attr44: num  71 111.5 92.4 101 84.6 ...
##  $ Attr45: num  1.01 0.51 0.948 0.287 0.739 ...
##  $ Attr46: num  1.522 1.125 1.01 1.57 0.958 ...
##  $ Attr47: num  49.4 100.1 96.4 84.3 65.9 ...
##  $ Attr48: num  0.1853 0.2373 0.2918 0.0859 0.1881 ...
##  $ Attr49: num  0.1109 0.1396 0.2229 0.0662 0.116 ...
##  $ Attr50: num  2.04 1.94 1.08 2.49 1.3 ...
##  $ Attr51: num  0.379 0.5 0.482 0.307 0.565 ...
##  $ Attr52: num  0.258 0.335 0.485 0.25 0.403 ...
##  $ Attr53: num  2.24 17.87 1.21 2.45 1.88 ...
##  $ Attr54: num  2.25 17.87 2.05 2.45 2.12 ...
##  $ Attr55: num  348690 2305 6333 20545 3187 ...
##  $ Attr56: num  0.122 0.121 0.241 0.054 0.135 ...
##  $ Attr57: num  0.397 0.42 0.818 0.142 0.484 ...
##  $ Attr58: num  0.878 0.853 0.766 0.946 0.865 ...
##  $ Attr59: num  0.00192 0 0.69484 0 0.12444 ...
##  $ Attr60: num  8.42 4.15 4.99 4.57 6.4 ...
##  $ Attr61: num  5.14 3.27 3.95 3.61 4.32 ...
##  $ Attr62: num  82.7 107.3 134.3 86.4 127.2 ...
##  $ Attr63: num  4.42 3.4 2.72 4.22 2.87 ...
##  $ Attr64: num  7.43 60.99 5.21 5.55 7.9 ...
##  $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ year  : num  1 1 1 1 1 1 1 1 1 1 ...
# Per-column five-number summary, mean and NA count (class: level counts)
summary(bankruptcy)
##      Attr1               Attr2               Attr3               Attr4         
##  Min.   :-463.8900   Min.   :-430.8700   Min.   :-479.9600   Min.   :   -0.40  
##  1st Qu.:   0.0034   1st Qu.:   0.2690   1st Qu.:   0.0215   1st Qu.:    1.05  
##  Median :   0.0497   Median :   0.4719   Median :   0.1966   Median :    1.57  
##  Mean   :   0.0352   Mean   :   0.5902   Mean   :   0.1144   Mean   :    6.31  
##  3rd Qu.:   0.1296   3rd Qu.:   0.6883   3rd Qu.:   0.4034   3rd Qu.:    2.79  
##  Max.   :  94.2800   Max.   : 480.9600   Max.   :  28.3360   Max.   :53433.00  
##  NA's   :8           NA's   :8           NA's   :8           NA's   :134       
##      Attr5               Attr6               Attr7               Attr8         
##  Min.   :-11903000   Min.   :-508.4100   Min.   :-517.4800   Min.   : -141.41  
##  1st Qu.:      -49   1st Qu.:   0.0000   1st Qu.:   0.0058   1st Qu.:    0.43  
##  Median :       -1   Median :   0.0000   Median :   0.0596   Median :    1.07  
##  Mean   :     -385   Mean   :  -0.0561   Mean   :   0.0935   Mean   :   12.64  
##  3rd Qu.:       51   3rd Qu.:   0.0894   3rd Qu.:   0.1509   3rd Qu.:    2.62  
##  Max.   :  1250100   Max.   : 543.2500   Max.   : 649.2300   Max.   :53432.00  
##  NA's   :89          NA's   :8           NA's   :8           NA's   :94        
##      Attr9              Attr10              Attr11              Attr12         
##  Min.   :  -3.496   Min.   :-479.9100   Min.   :-463.8900   Min.   :-6331.800  
##  1st Qu.:   1.018   1st Qu.:   0.2955   1st Qu.:   0.0154   1st Qu.:    0.015  
##  Median :   1.195   Median :   0.5060   Median :   0.0753   Median :    0.172  
##  Mean   :   2.652   Mean   :   0.6269   Mean   :   0.1311   Mean   :    1.132  
##  3rd Qu.:   2.062   3rd Qu.:   0.7091   3rd Qu.:   0.1668   3rd Qu.:    0.587  
##  Max.   :9742.300   Max.   :1099.5000   Max.   : 681.5400   Max.   : 8259.400  
##  NA's   :9          NA's   :8           NA's   :44          NA's   :134        
##      Attr13              Attr14              Attr15             Attr16         
##  Min.   :-1460.600   Min.   :-517.4800   Min.   :-9632400   Min.   :-6331.800  
##  1st Qu.:    0.024   1st Qu.:   0.0058   1st Qu.:     223   1st Qu.:    0.073  
##  Median :    0.068   Median :   0.0597   Median :     846   Median :    0.246  
##  Mean   :    0.810   Mean   :   0.0935   Mean   :    1992   Mean   :    1.411  
##  3rd Qu.:    0.135   3rd Qu.:   0.1509   3rd Qu.:    2227   3rd Qu.:    0.665  
##  Max.   :13315.000   Max.   : 649.2300   Max.   :10236000   Max.   : 8259.400  
##  NA's   :127         NA's   :8           NA's   :36         NA's   :95         
##      Attr17             Attr18              Attr19              Attr20       
##  Min.   :   -0.41   Min.   :-517.4800   Min.   :-1578.700   Min.   :    -29  
##  1st Qu.:    1.45   1st Qu.:   0.0058   1st Qu.:    0.004   1st Qu.:     15  
##  Median :    2.12   Median :   0.0597   Median :    0.036   Median :     35  
##  Mean   :   13.80   Mean   :   0.0986   Mean   :    0.156   Mean   :    243  
##  3rd Qu.:    3.70   3rd Qu.:   0.1509   3rd Qu.:    0.091   3rd Qu.:     64  
##  Max.   :53433.00   Max.   : 649.2300   Max.   : 9230.500   Max.   :7809200  
##  NA's   :94         NA's   :8           NA's   :128         NA's   :127      
##      Attr21              Attr22              Attr23         
##  Min.   :-1325.000   Min.   :-431.5900   Min.   :-1578.700  
##  1st Qu.:    0.908   1st Qu.:   0.0000   1st Qu.:    0.002  
##  Median :    1.045   Median :   0.0623   Median :    0.030  
##  Mean   :    3.885   Mean   :   0.1139   Mean   :    0.139  
##  3rd Qu.:    1.204   3rd Qu.:   0.1500   3rd Qu.:    0.078  
##  Max.   :29907.000   Max.   : 681.5400   Max.   : 9230.500  
##  NA's   :5854        NA's   :8           NA's   :127        
##      Attr24              Attr25              Attr26              Attr27       
##  Min.   :-463.8900   Min.   :-500.9300   Min.   :-6331.800   Min.   :-259010  
##  1st Qu.:   0.0213   1st Qu.:   0.1501   1st Qu.:    0.067   1st Qu.:      0  
##  Median :   0.1551   Median :   0.3845   Median :    0.222   Median :      1  
##  Mean   :   0.2700   Mean   :   0.3928   Mean   :    1.264   Mean   :   1108  
##  3rd Qu.:   0.3556   3rd Qu.:   0.6107   3rd Qu.:    0.599   3rd Qu.:      5  
##  Max.   : 831.6600   Max.   :1353.3000   Max.   : 8262.300   Max.   :4208800  
##  NA's   :922         NA's   :8           NA's   :95          NA's   :2764     
##      Attr28              Attr29            Attr30              Attr31         
##  Min.   :-3829.900   Min.   :-0.8861   Min.   : -6351.70   Min.   :-1495.600  
##  1st Qu.:    0.038   1st Qu.: 3.4951   1st Qu.:     0.08   1st Qu.:    0.007  
##  Median :    0.465   Median : 4.0140   Median :     0.22   Median :    0.043  
##  Mean   :    6.003   Mean   : 4.0050   Mean   :     7.37   Mean   :    0.177  
##  3rd Qu.:    1.497   3rd Qu.: 4.5202   3rd Qu.:     0.41   3rd Qu.:    0.102  
##  Max.   :21701.000   Max.   : 9.6983   Max.   :152860.00   Max.   : 9244.300  
##  NA's   :812         NA's   :8         NA's   :127         NA's   :127        
##      Attr32             Attr33              Attr34              Attr35         
##  Min.   :   -9296   Min.   :  -19.197   Min.   :-1696.000   Min.   :-431.5900  
##  1st Qu.:      46   1st Qu.:    2.820   1st Qu.:    0.306   1st Qu.:   0.0060  
##  Median :      78   Median :    4.625   Median :    1.967   Median :   0.0607  
##  Mean   :    1163   Mean   :    8.636   Mean   :    5.411   Mean   :   0.1119  
##  3rd Qu.:     128   3rd Qu.:    7.803   3rd Qu.:    4.551   3rd Qu.:   0.1501  
##  Max.   :17364000   Max.   :21944.000   Max.   :21944.000   Max.   : 626.9200  
##  NA's   :368        NA's   :134         NA's   :94          NA's   :8          
##      Attr36             Attr37             Attr38              Attr39         
##  Min.   :  -0.001   Min.   :  -525.5   Min.   :-479.9100   Min.   :-7522.000  
##  1st Qu.:   1.101   1st Qu.:     1.1   1st Qu.:   0.4198   1st Qu.:    0.004  
##  Median :   1.643   Median :     3.1   Median :   0.6122   Median :    0.037  
##  Mean   :   2.911   Mean   :   105.1   Mean   :   0.7244   Mean   :   -0.289  
##  3rd Qu.:   2.421   3rd Qu.:    11.4   3rd Qu.:   0.7718   3rd Qu.:    0.092  
##  Max.   :9742.300   Max.   :398920.0   Max.   :1099.5000   Max.   : 2156.500  
##  NA's   :8          NA's   :18984      NA's   :8           NA's   :127        
##      Attr40             Attr41              Attr42               Attr43        
##  Min.   :-101.270   Min.   : -1234.40   Min.   :-1395.8000   Min.   : -115870  
##  1st Qu.:   0.053   1st Qu.:     0.03   1st Qu.:    0.0000   1st Qu.:      67  
##  Median :   0.177   Median :     0.09   Median :    0.0380   Median :      99  
##  Mean   :   2.147   Mean   :     7.72   Mean   :   -0.1425   Mean   :    1074  
##  3rd Qu.:   0.652   3rd Qu.:     0.21   3rd Qu.:    0.0921   3rd Qu.:     141  
##  Max.   :8007.100   Max.   :288770.00   Max.   : 2156.8000   Max.   :30393000  
##  NA's   :134        NA's   :754         NA's   :127          NA's   :127       
##      Attr44             Attr45              Attr46             Attr47       
##  Min.   : -115870   Min.   :-256230.0   Min.   : -101.26   Min.   :    -96  
##  1st Qu.:      35   1st Qu.:      0.0   1st Qu.:    0.61   1st Qu.:     16  
##  Median :      55   Median :      0.3   Median :    1.03   Median :     38  
##  Mean   :     831   Mean   :     14.8   Mean   :    5.43   Mean   :    358  
##  3rd Qu.:      81   3rd Qu.:      1.0   3rd Qu.:    1.91   3rd Qu.:     70  
##  Max.   :22584000   Max.   : 366030.0   Max.   :53433.00   Max.   :6084200  
##  NA's   :127        NA's   :2147        NA's   :135        NA's   :297      
##      Attr48              Attr49              Attr50             Attr51        
##  Min.   :-542.5600   Min.   :-9001.000   Min.   :   -0.05   Min.   : -0.1866  
##  1st Qu.:  -0.0382   1st Qu.:   -0.027   1st Qu.:    0.77   1st Qu.:  0.1901  
##  Median :   0.0184   Median :    0.011   Median :    1.22   Median :  0.3410  
##  Mean   :   0.0286   Mean   :   -0.483   Mean   :    5.84   Mean   :  0.4835  
##  3rd Qu.:   0.1073   3rd Qu.:    0.062   3rd Qu.:    2.21   3rd Qu.:  0.5347  
##  Max.   : 623.8500   Max.   :  178.890   Max.   :53433.00   Max.   :480.9600  
##  NA's   :9           NA's   :127         NA's   :94         NA's   :8         
##      Attr52             Attr53              Attr54              Attr55        
##  Min.   :  -25.47   Min.   : -3828.90   Min.   : -3828.90   Min.   :-1805200  
##  1st Qu.:    0.13   1st Qu.:     0.69   1st Qu.:     0.96   1st Qu.:      28  
##  Median :    0.21   Median :     1.21   Median :     1.38   Median :    1088  
##  Mean   :    6.48   Mean   :    23.77   Mean   :    24.65   Mean   :    7672  
##  3rd Qu.:    0.35   3rd Qu.:     2.22   3rd Qu.:     2.37   3rd Qu.:    4993  
##  Max.   :88433.00   Max.   :180440.00   Max.   :180440.00   Max.   : 6123700  
##  NA's   :301        NA's   :812         NA's   :812         NA's   :1         
##      Attr56               Attr57               Attr58         
##  Min.   :-1108300.0   Min.   :-1667.3000   Min.   :   -198.7  
##  1st Qu.:       0.0   1st Qu.:    0.0146   1st Qu.:      0.9  
##  Median :       0.1   Median :    0.1197   Median :      1.0  
##  Mean   :     -26.2   Mean   :   -0.0105   Mean   :     30.0  
##  3rd Qu.:       0.1   3rd Qu.:    0.2846   3rd Qu.:      1.0  
##  Max.   :     293.1   Max.   :  552.6400   Max.   :1108300.0  
##  NA's   :127          NA's   :7            NA's   :84         
##      Attr59              Attr60            Attr61              Attr62        
##  Min.   : -327.970   Min.   :    -12   Min.   :   -12.66   Min.   :-2336500  
##  1st Qu.:    0.000   1st Qu.:      6   1st Qu.:     4.51   1st Qu.:      42  
##  Median :    0.006   Median :     10   Median :     6.64   Median :      71  
##  Mean   :    1.333   Mean   :    448   Mean   :    17.03   Mean   :    1502  
##  3rd Qu.:    0.236   3rd Qu.:     20   3rd Qu.:    10.39   3rd Qu.:     117  
##  Max.   :23853.000   Max.   :4818700   Max.   :108000.00   Max.   :25016000  
##  NA's   :7           NA's   :2152      NA's   :102         NA's   :127       
##      Attr63              Attr64          class          year     
##  Min.   :   -1.543   Min.   :-10677.00   0:41314   Min.   :1.00  
##  1st Qu.:    3.098   1st Qu.:     2.18   1: 2091   1st Qu.:2.00  
##  Median :    5.088   Median :     4.28             Median :3.00  
##  Mean   :    9.343   Mean   :    72.79             Mean   :2.94  
##  3rd Qu.:    8.599   3rd Qu.:     9.78             3rd Qu.:4.00  
##  Max.   :23454.000   Max.   :294770.00             Max.   :5.00  
##  NA's   :134         NA's   :812
# Class distribution: number of rows per level of the outcome (0 vs 1)
table(bankruptcy$class)
## 
##     0     1 
## 41314  2091
# Same distribution expressed as proportions of all rows
prop.table(table(bankruptcy$class))
## 
##          0          1 
## 0.95182583 0.04817417
# Check missing values per column (count of NA entries in each variable)
colSums(is.na(bankruptcy))
##  Attr1  Attr2  Attr3  Attr4  Attr5  Attr6  Attr7  Attr8  Attr9 Attr10 Attr11 
##      8      8      8    134     89      8      8     94      9      8     44 
## Attr12 Attr13 Attr14 Attr15 Attr16 Attr17 Attr18 Attr19 Attr20 Attr21 Attr22 
##    134    127      8     36     95     94      8    128    127   5854      8 
## Attr23 Attr24 Attr25 Attr26 Attr27 Attr28 Attr29 Attr30 Attr31 Attr32 Attr33 
##    127    922      8     95   2764    812      8    127    127    368    134 
## Attr34 Attr35 Attr36 Attr37 Attr38 Attr39 Attr40 Attr41 Attr42 Attr43 Attr44 
##     94      8      8  18984      8    127    134    754    127    127    127 
## Attr45 Attr46 Attr47 Attr48 Attr49 Attr50 Attr51 Attr52 Attr53 Attr54 Attr55 
##   2147    135    297      9    127     94      8    301    812    812      1 
## Attr56 Attr57 Attr58 Attr59 Attr60 Attr61 Attr62 Attr63 Attr64  class   year 
##    127      7     84      7   2152    102    127    134    812      0      0
# Remove rows with missing values
# NOTE(review): na.omit() drops every row that has an NA in any column. Given
# the per-column NA counts printed above (Attr37 alone is missing in 18984 of
# the 43405 rows), this discards more than half of the data; the outputs
# further down show 15973 training + 3994 test rows. The Random Forest part
# uses the median-imputed `bankruptcy_clean` instead, so the two parts are fit
# on different samples.
bankruptcy <- na.omit(bankruptcy)

Boxplots of Financial Attributes

# One boxplot panel per group of attribute columns. Values are shown as
# log(|x| + 1) so the extreme ratios do not flatten the boxes.
attr_groups <- list(
  "Attr1-Attr10"  = 1:10,
  "Attr11-Attr20" = 11:20,
  "Attr21-Attr30" = 21:30,
  "Attr31-Attr40" = 31:40,
  "Attr41-Attr50" = 41:50,
  "Attr51-Attr64" = 51:64
)

for (panel_title in names(attr_groups)) {
  panel_cols <- attr_groups[[panel_title]]
  boxplot(log(abs(bankruptcy[, panel_cols]) + 1), las = 2, main = panel_title)
}

Prepare Data for Logistic Regression

# Recode the outcome from a factor with levels "0"/"1" to numeric 0/1
# (the cost functions further down compare it against 0 and 1 directly)
bankruptcy[["class"]] <- as.numeric(as.character(bankruptcy[["class"]]))

# Median-impute the 64 attribute columns. Rows containing NAs were already
# removed by na.omit() above, so this only acts as a safeguard.
attr_idx <- seq_len(64)
bankruptcy[attr_idx] <- lapply(bankruptcy[attr_idx], function(values) {
  values[is.na(values)] <- median(values, na.rm = TRUE)
  values
})

# Reproducible 80/20 split: `index` holds the row numbers of the training set
set.seed(49)
n_rows           <- nrow(bankruptcy)
index            <- sample(n_rows, n_rows * 0.80)
bankruptcy_train <- bankruptcy[index, ]
bankruptcy_test  <- bankruptcy[-index, ]

Full Logistic Regression Model

# Full logistic regression: class against every other column of the training
# set (Attr1..Attr64 and year), with the IRLS iteration cap set to 100.
# NOTE(review): the summary printed below reports 100 Fisher scoring
# iterations (i.e. the cap was reached), coefficient estimates around 1e14,
# and a residual deviance (31502.2) larger than the null deviance (3298.2).
# These are signs that the fit did not converge, so the coefficients,
# p-values and AIC/BIC of this model should not be interpreted as they stand.
bankruptcy_glm0 <- glm(class ~ ., 
                       family  = binomial, 
                       data    = bankruptcy_train,
                       control = glm.control(maxit = 100))
summary(bankruptcy_glm0)
## 
## Call:
## glm(formula = class ~ ., family = binomial, data = bankruptcy_train, 
##     control = glm.control(maxit = 100))
## 
## Coefficients:
##               Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)  3.961e+14  8.702e+06   45516279   <2e-16 ***
## Attr1        3.602e+14  1.612e+07   22338489   <2e-16 ***
## Attr2        4.144e+13  1.238e+07    3347781   <2e-16 ***
## Attr3        1.644e+14  4.166e+06   39464824   <2e-16 ***
## Attr4        3.129e+13  4.352e+05   71890313   <2e-16 ***
## Attr5        1.322e+08  5.555e+00   23792074   <2e-16 ***
## Attr6        3.541e+12  3.364e+05   10526649   <2e-16 ***
## Attr7       -1.480e+18  1.862e+10  -79492135   <2e-16 ***
## Attr8        2.985e+14  1.638e+06  182183757   <2e-16 ***
## Attr9       -5.343e+13  7.426e+05  -71958209   <2e-16 ***
## Attr10       1.467e+14  1.261e+07   11633462   <2e-16 ***
## Attr11      -3.665e+14  1.497e+07  -24475937   <2e-16 ***
## Attr12      -1.067e+13  6.039e+05  -17675483   <2e-16 ***
## Attr13       2.513e+13  3.097e+05   81125813   <2e-16 ***
## Attr14       1.484e+18  1.868e+10   79475330   <2e-16 ***
## Attr15      -4.081e+08  6.487e+00  -62910285   <2e-16 ***
## Attr16       4.307e+14  9.530e+06   45192077   <2e-16 ***
## Attr17      -3.192e+14  1.571e+06 -203190629   <2e-16 ***
## Attr18      -4.622e+15  6.104e+07  -75714140   <2e-16 ***
## Attr19      -1.025e+15  2.063e+07  -49659863   <2e-16 ***
## Attr20       2.325e+15  8.390e+07   27713202   <2e-16 ***
## Attr21      -1.280e+11  2.097e+03  -61041538   <2e-16 ***
## Attr22      -2.173e+15  1.565e+07 -138833991   <2e-16 ***
## Attr23       6.774e+14  1.821e+07   37195182   <2e-16 ***
## Attr24       2.182e+12  2.084e+05   10471750   <2e-16 ***
## Attr25      -6.724e+12  2.869e+06   -2343385   <2e-16 ***
## Attr26      -5.219e+14  9.821e+06  -53138050   <2e-16 ***
## Attr27      -1.345e+09  2.464e+01  -54584205   <2e-16 ***
## Attr28       4.356e+13  5.157e+05   84480026   <2e-16 ***
## Attr29       3.199e+13  9.605e+05   33305532   <2e-16 ***
## Attr30      -2.259e+13  4.513e+05  -50049226   <2e-16 ***
## Attr31       4.421e+14  1.237e+07   35743001   <2e-16 ***
## Attr32       1.116e+11  1.272e+04    8777522   <2e-16 ***
## Attr33      -7.110e+12  5.783e+05  -12294231   <2e-16 ***
## Attr34       4.772e+13  2.981e+05  160085098   <2e-16 ***
## Attr35       1.186e+15  8.410e+06  140994299   <2e-16 ***
## Attr36       5.288e+13  7.886e+05   67059457   <2e-16 ***
## Attr37      -3.868e+09  2.773e+02  -13949664   <2e-16 ***
## Attr38      -3.141e+14  1.116e+07  -28145563   <2e-16 ***
## Attr39      -1.593e+14  2.793e+06  -57046291   <2e-16 ***
## Attr40       1.941e+14  8.162e+05  237772357   <2e-16 ***
## Attr41      -7.683e+09  2.325e+02  -33045757   <2e-16 ***
## Attr42      -8.993e+14  8.840e+06 -101722899   <2e-16 ***
## Attr43      -2.325e+15  8.390e+07  -27715865   <2e-16 ***
## Attr44       2.325e+15  8.389e+07   27715687   <2e-16 ***
## Attr45      -2.743e+10  1.699e+03  -16142540   <2e-16 ***
## Attr46      -2.125e+14  7.062e+05 -300849354   <2e-16 ***
## Attr47       3.459e+10  7.253e+02   47688619   <2e-16 ***
## Attr48       1.182e+15  6.373e+06  185451361   <2e-16 ***
## Attr49       7.594e+14  5.069e+06  149805143   <2e-16 ***
## Attr50      -1.453e+14  9.251e+05 -157066264   <2e-16 ***
## Attr51      -1.036e+14  1.137e+07   -9115614   <2e-16 ***
## Attr52      -6.038e+13  4.656e+06  -12967592   <2e-16 ***
## Attr53       1.543e+12  3.075e+04   50164331   <2e-16 ***
## Attr54      -4.435e+13  5.195e+05  -85370131   <2e-16 ***
## Attr55      -4.033e+07  8.921e+00   -4520430   <2e-16 ***
## Attr56      -2.645e+13  2.660e+06   -9943304   <2e-16 ***
## Attr57       5.112e+10  7.421e+04     688851   <2e-16 ***
## Attr58      -1.157e+13  2.638e+06   -4385567   <2e-16 ***
## Attr59      -2.262e+11  1.686e+04  -13419976   <2e-16 ***
## Attr60      -7.441e+10  8.720e+02  -85325844   <2e-16 ***
## Attr61      -5.039e+12  2.453e+04 -205411267   <2e-16 ***
## Attr62      -6.083e+10  1.782e+03  -34141323   <2e-16 ***
## Attr63      -1.642e+13  5.675e+05  -28933516   <2e-16 ***
## Attr64      -3.461e+11  5.475e+03  -63213678   <2e-16 ***
## year         2.528e+13  4.119e+05   61381950   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  3298.2  on 15972  degrees of freedom
## Residual deviance: 31502.2  on 15907  degrees of freedom
## AIC: 31634
## 
## Number of Fisher Scoring iterations: 100
# Akaike information criterion of the full model
AIC(bankruptcy_glm0)
## [1] 31634.15
# Bayesian information criterion of the full model
BIC(bankruptcy_glm0)
## [1] 32140.94

Stepwise Backward Variable Selection

# Backward elimination by AIC, starting from the full model.
# trace = 0 actually suppresses the step-by-step log, so only the summary of
# the selected model is printed; direction = "backward" spells out what step()
# already does when no scope is supplied.
# NOTE(review): the starting model's printed summary shows it hit the
# 100-iteration cap, so the AIC comparisons driving this search are unreliable.
bankruptcy_glm_back <- step(bankruptcy_glm0, direction = "backward", trace = 0)
summary(bankruptcy_glm_back)
## 
## Call:
## glm(formula = class ~ Attr1 + Attr2 + Attr3 + Attr4 + Attr5 + 
##     Attr6 + Attr7 + Attr8 + Attr9 + Attr10 + Attr11 + Attr12 + 
##     Attr13 + Attr14 + Attr15 + Attr16 + Attr17 + Attr18 + Attr19 + 
##     Attr20 + Attr21 + Attr22 + Attr23 + Attr24 + Attr25 + Attr26 + 
##     Attr27 + Attr29 + Attr30 + Attr31 + Attr32 + Attr33 + Attr34 + 
##     Attr35 + Attr36 + Attr37 + Attr38 + Attr39 + Attr40 + Attr41 + 
##     Attr42 + Attr43 + Attr44 + Attr45 + Attr46 + Attr47 + Attr48 + 
##     Attr49 + Attr50 + Attr51 + Attr52 + Attr53 + Attr55 + Attr56 + 
##     Attr57 + Attr58 + Attr59 + Attr60 + Attr61 + Attr62 + Attr63 + 
##     Attr64 + year, family = binomial, data = bankruptcy_train, 
##     control = glm.control(maxit = 100))
## 
## Coefficients:
##               Estimate Std. Error    z value Pr(>|z|)    
## (Intercept)  1.479e+15  8.688e+06  170212115   <2e-16 ***
## Attr1       -6.162e+14  1.607e+07  -38357193   <2e-16 ***
## Attr2       -5.816e+14  1.229e+07  -47323642   <2e-16 ***
## Attr3        3.778e+14  4.149e+06   91044276   <2e-16 ***
## Attr4        4.484e+13  4.277e+05  104827972   <2e-16 ***
## Attr5        3.469e+08  5.555e+00   62448644   <2e-16 ***
## Attr6        2.359e+11  3.364e+05     701243   <2e-16 ***
## Attr7       -1.049e+18  1.836e+10  -57143235   <2e-16 ***
## Attr8        7.645e+14  1.637e+06  467063378   <2e-16 ***
## Attr9       -8.774e+13  7.425e+05 -118172362   <2e-16 ***
## Attr10      -4.280e+14  1.251e+07  -34205423   <2e-16 ***
## Attr11       4.471e+14  1.497e+07   29865690   <2e-16 ***
## Attr12       3.092e+13  6.038e+05   51213535   <2e-16 ***
## Attr13       1.571e+13  3.023e+05   51951637   <2e-16 ***
## Attr14       1.052e+18  1.842e+10   57118759   <2e-16 ***
## Attr15       1.575e+08  6.487e+00   24282657   <2e-16 ***
## Attr16      -3.297e+14  9.522e+06  -34619612   <2e-16 ***
## Attr17      -7.982e+14  1.570e+06 -508484781   <2e-16 ***
## Attr18      -2.870e+15  5.974e+07  -48038741   <2e-16 ***
## Attr19      -8.943e+14  2.062e+07  -43374650   <2e-16 ***
## Attr20       1.116e+15  8.365e+07   13343981   <2e-16 ***
## Attr21      -1.168e+11  2.097e+03  -55719101   <2e-16 ***
## Attr22      -8.178e+14  1.564e+07  -52279705   <2e-16 ***
## Attr23       8.682e+14  1.819e+07   47716105   <2e-16 ***
## Attr24       5.433e+12  2.084e+05   26071942   <2e-16 ***
## Attr25       1.678e+13  2.869e+06    5849725   <2e-16 ***
## Attr26       2.972e+14  9.815e+06   30277386   <2e-16 ***
## Attr27      -1.145e+09  2.464e+01  -46453419   <2e-16 ***
## Attr29      -1.109e+13  9.604e+05  -11551950   <2e-16 ***
## Attr30      -1.604e+13  4.504e+05  -35610454   <2e-16 ***
## Attr31       1.804e+14  1.234e+07   14616982   <2e-16 ***
## Attr32       5.329e+11  1.271e+04   41942329   <2e-16 ***
## Attr33       2.811e+13  5.779e+05   48638822   <2e-16 ***
## Attr34       6.718e+13  2.978e+05  225623532   <2e-16 ***
## Attr35      -1.761e+14  8.397e+06  -20974791   <2e-16 ***
## Attr36       8.026e+13  7.882e+05  101825993   <2e-16 ***
## Attr37      -1.361e+09  2.773e+02   -4906531   <2e-16 ***
## Attr38      -4.791e+13  1.104e+07   -4341086   <2e-16 ***
## Attr39      -1.733e+14  2.793e+06  -62036082   <2e-16 ***
## Attr40       1.904e+14  8.131e+05  234219198   <2e-16 ***
## Attr41      -1.812e+10  2.325e+02  -77927466   <2e-16 ***
## Attr42      -9.572e+14  8.827e+06 -108436523   <2e-16 ***
## Attr43      -1.117e+15  8.365e+07  -13347700   <2e-16 ***
## Attr44       1.116e+15  8.365e+07   13346328   <2e-16 ***
## Attr45       1.049e+09  1.699e+03     617437   <2e-16 ***
## Attr46      -2.214e+14  7.044e+05 -314355030   <2e-16 ***
## Attr47       5.211e+10  7.252e+02   71864807   <2e-16 ***
## Attr48       9.142e+14  6.373e+06  143457679   <2e-16 ***
## Attr49       7.802e+14  5.069e+06  153918862   <2e-16 ***
## Attr50      -1.717e+14  9.250e+05 -185601739   <2e-16 ***
## Attr51       9.594e+12  1.126e+07     852027   <2e-16 ***
## Attr52      -2.257e+14  4.653e+06  -48518608   <2e-16 ***
## Attr53      -8.558e+11  2.551e+04  -33545505   <2e-16 ***
## Attr55       4.996e+07  8.921e+00    5601063   <2e-16 ***
## Attr56      -6.748e+13  2.659e+06  -25377827   <2e-16 ***
## Attr57      -2.397e+12  7.421e+04  -32295493   <2e-16 ***
## Attr58      -5.126e+13  2.636e+06  -19442686   <2e-16 ***
## Attr59      -7.410e+11  1.686e+04  -43958843   <2e-16 ***
## Attr60      -5.071e+10  8.720e+02  -58155907   <2e-16 ***
## Attr61      -4.954e+12  2.453e+04 -201954799   <2e-16 ***
## Attr62      -3.061e+10  1.779e+03  -17209185   <2e-16 ***
## Attr63      -6.463e+13  5.673e+05 -113932261   <2e-16 ***
## Attr64      -7.357e+10  3.680e+03  -19991259   <2e-16 ***
## year         2.721e+13  4.118e+05   66070885   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance:  3298.2  on 15972  degrees of freedom
## Residual deviance: 25302.6  on 15909  degrees of freedom
## AIC: 25431
## 
## Number of Fisher Scoring iterations: 100
# Residual deviance of the model chosen by the stepwise search
bankruptcy_glm_back$deviance
## [1] 25302.64
# Akaike information criterion of the selected model
AIC(bankruptcy_glm_back)
## [1] 25430.64
# Bayesian information criterion of the selected model
BIC(bankruptcy_glm_back)
## [1] 25922.08

In-Sample Performance — Training ROC

# Fitted probabilities of the stepwise-selected model on its own training rows
pred_glm0_train <- predict(bankruptcy_glm_back, type = "response")

# ROCR objects: scores paired with the true labels, then the TPR/FPR curve
pred <- prediction(predictions = pred_glm0_train,
                   labels      = bankruptcy_train$class)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")

plot(perf, colorize = TRUE, main = "Training ROC Curve")

# Area under the training ROC curve
train_auc_perf <- performance(pred, measure = "auc")
cat("Training AUC:", unlist(train_auc_perf@y.values))
## Training AUC: 0.5642232
# Distribution of the linear predictor (log-odds) on the training rows
hist(predict(bankruptcy_glm_back), main = "Log-Odds — Training")

# Same fitted values on the probability scale
pred_resp <- predict(bankruptcy_glm_back, type = "response")
hist(pred_resp, main = "Predicted Probabilities — Training")

# Training confusion matrix: a row is predicted 1 when its fitted probability
# is strictly greater than the 0.45 cutoff, otherwise 0
table(bankruptcy_train$class,
      (pred_resp > 0.45) * 1,
      dnn = c("Truth", "Predicted"))
##      Predicted
## Truth     0     1
##     0 15577    55
##     1   296    45

Out-of-Sample Performance — Test ROC

# Predicted probabilities of the selected model on the held-out test rows
pred_glm0_test <- predict(bankruptcy_glm_back, newdata = bankruptcy_test,
                          type = "response")

# ROCR objects for the test set: scores with true labels, then TPR vs FPR
pred_test <- prediction(predictions = pred_glm0_test,
                        labels      = bankruptcy_test$class)
perf_test <- performance(pred_test, measure = "tpr", x.measure = "fpr")

plot(perf_test, colorize = TRUE, main = "Test ROC Curve")

# Area under the test ROC curve
test_auc_perf <- performance(pred_test, measure = "auc")
cat("Test AUC:", unlist(test_auc_perf@y.values))
## Test AUC: 0.5357713
# Distribution of the predicted probabilities on the test rows
hist(pred_glm0_test, main = "Predicted Probabilities — Test")

# Test confusion matrix at the same 0.45 cutoff used for the training rows
# (predicted 1 only when the probability is strictly above the cutoff)
table(bankruptcy_test$class,
      (pred_glm0_test > 0.45) * 1,
      dnn = c("Truth", "Predicted"))
##      Predicted
## Truth    0    1
##     0 3882   21
##     1   84    7
# Sanity check: the number of test labels ...
length(bankruptcy_test$class)
## [1] 3994
# ... must equal the number of test predictions
length(pred_glm0_test)
## [1] 3994

Cross-Validation

# Full model on the entire (train + test) dataset. cv.glm() needs a fitted glm
# object plus the data it was fitted on, and re-fits it on each fold.
# Defined before both cv.glm() calls so the symmetric and the asymmetric
# cross-validation use the same fit.
# NOTE(review): unlike bankruptcy_glm0 this call does not raise maxit; given
# the convergence problems visible in the earlier summaries, check for
# convergence warnings here as well.
bankruptcy_glm1 <- glm(class ~ ., family = binomial, data = bankruptcy)

# --- Symmetric Cost (equal penalty for false positives and false negatives) ---
# Misclassification rate at a fixed probability cutoff of 0.45.
#
# r  : observed outcomes coded 0 (non-bankrupt) / 1 (bankrupt)
# pi : predicted probabilities of class 1, same length as r
#      (the name shadows the base constant `pi` only inside this function; it
#      is kept so existing callers are unaffected)
#
# Returns the share of observations that are misclassified when an observation
# is predicted 1 if and only if pi > 0.45, the same rule the confusion tables
# use. A bankrupt case sitting exactly on the cutoff is therefore predicted 0
# and must count as an error, hence `<=` in the second branch.
Sym_cost <- function(r, pi){
  pcut <- 0.45
  false_positive <- (r == 0) & (pi > pcut)
  false_negative <- (r == 1) & (pi <= pcut)
  mean(false_positive | false_negative)
}

# 5-fold cross-validation of the full-data model, scored with Sym_cost
cv_result_sym <- cv.glm(bankruptcy, bankruptcy_glm1, cost = Sym_cost, K = 5)

# Report the second element of delta (the adjusted CV estimate)
cat("Symmetric CV Error:", cv_result_sym[["delta"]][2])
## Symmetric CV Error: 0.02735501
# --- Asymmetric Cost (missed bankruptcies penalized 10x more than false alarms) ---
# Average weighted misclassification cost.
#
# obs    : observed outcomes coded 0 / 1
# pred.p : predicted probabilities of class 1
#
# A missed bankruptcy costs 10, a false alarm costs 1. The cutoff is derived
# from those weights as 1 / (1 + 10 / 1) = 1/11; an observation is treated as
# predicted 1 when pred.p >= cutoff.
costfunc <- function(obs, pred.p) {
  w_miss  <- 10
  w_alarm <- 1
  cutoff  <- 1 / (1 + w_miss / w_alarm)

  missed_bankruptcy <- (obs == 1) & (pred.p < cutoff)
  false_alarm       <- (obs == 0) & (pred.p >= cutoff)

  mean(w_miss * missed_bankruptcy + w_alarm * false_alarm)
}

# 5-fold cross-validation of the same full-data model, scored with costfunc
cv_result <- cv.glm(bankruptcy, bankruptcy_glm1, cost = costfunc, K = 5)

# Report the second element of delta (the adjusted CV estimate)
cat("Asymmetric CV Error:", cv_result[["delta"]][2])
## Asymmetric CV Error: 0.2114973

Part 3: Random Forest

Train/Test Split

# Reproducible 80/20 split of the median-imputed data. The `year` column is
# dropped from both parts, so the forest sees only Attr1..Attr64 and class.
set.seed(49)
n_clean    <- nrow(bankruptcy_clean)
index      <- sample(1:n_clean, 0.8 * n_clean)
train_data <- select(bankruptcy_clean[index, ], -year)
test_data  <- select(bankruptcy_clean[-index, ], -year)

Train Model

# Grow a 500-tree classification forest on the training partition.
# importance = TRUE is required for the importance() and varImpPlot() calls
# later in this part.
rf_model <- randomForest(
  class ~ .,
  data       = train_data,
  ntree      = 500,
  importance = TRUE
)
# Printing the fit shows the out-of-bag error rate and confusion matrix
print(rf_model)
## 
## Call:
##  randomForest(formula = class ~ ., data = train_data, ntree = 500,      importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 8
## 
##         OOB estimate of  error rate: 3.69%
## Confusion matrix:
##       0   1 class.error
## 0 32943  92 0.002784925
## 1  1188 501 0.703374778

Predictions & Confusion Matrices

# Class predictions from the fitted forest for both partitions.
# Scoring train_data re-uses rows the trees were grown on, so the training
# matrix below is a resubstitution result; the out-of-bag matrix printed with
# the model is the less optimistic view of the same rows.
predictions_ts <- predict(rf_model, newdata = test_data)
predictions_tr <- predict(rf_model, newdata = train_data)

cat("--- Test Confusion Matrix ---\n")
## --- Test Confusion Matrix ---
confusion_matrix <- table(predictions_ts, test_data[["class"]],
                          dnn = c("Predicted", "Actual"))
print(confusion_matrix)
##          Actual
## Predicted    0    1
##         0 8250  289
##         1   29  113
cat("\n--- Training Confusion Matrix ---\n")
## 
## --- Training Confusion Matrix ---
confusion_matrix_tr <- table(predictions_tr, train_data[["class"]],
                             dnn = c("Predicted", "Actual"))
print(confusion_matrix_tr)
##          Actual
## Predicted     0     1
##         0 33035     1
##         1     0  1688

Variable Importance

# Print the per-variable importance table stored in the forest (kept because
# the model was trained with importance = TRUE). Columns `0` and `1` are the
# per-class measures, followed by MeanDecreaseAccuracy and MeanDecreaseGini.
importance(rf_model)
##                0            1 MeanDecreaseAccuracy MeanDecreaseGini
## Attr1  16.606896  -4.62829772            16.549753         34.22019
## Attr2  12.825919 -14.28133995            12.697868         31.56926
## Attr3  14.906236 -10.05502409            14.745010         39.46503
## Attr4  14.328891  -9.48999565            14.090354         40.31501
## Attr5  18.412297  11.16428281            19.337372         61.65713
## Attr6  30.907568  20.02666632            36.015391         59.55725
## Attr7  14.502785  -9.59044994            14.446049         33.64986
## Attr8  13.530473 -13.02644495            13.408588         32.53687
## Attr9  21.222955  16.96220909            22.630786         69.50581
## Attr10 16.744612 -13.65277180            16.665943         33.80095
## Attr11 16.323718  -8.13665256            16.268027         38.05544
## Attr12 13.274865  -1.26906753            13.257259         34.25865
## Attr13 17.564946   5.69835444            17.932058         48.94109
## Attr14 15.658742  -5.83654038            15.609562         34.00251
## Attr15 12.161186   1.41609149            12.364717         43.25968
## Attr16 16.635736   0.53070782            16.763292         45.34406
## Attr17 15.032969 -14.76587783            14.873581         30.52931
## Attr18 15.399354 -10.08756496            15.305691         32.39138
## Attr19 13.949108  -4.12034193            13.924831         35.90499
## Attr20 21.772861   1.12177017            22.025468         43.96938
## Attr21  9.662612   1.01458962             9.442617         35.88365
## Attr22 18.254718   1.63706467            18.577494         52.92922
## Attr23 17.954142   0.05072284            17.732231         34.98786
## Attr24 38.965614  20.72044746            42.735709         87.12716
## Attr25 18.996455  -7.86459799            19.417101         45.43018
## Attr26 23.743066   4.64224574            24.010787         49.79202
## Attr27 68.354361  38.16800110            71.903366        217.57040
## Attr28 14.375249  -5.08016948            14.403208         36.55971
## Attr29 34.680379   6.45343627            35.142111         64.87872
## Attr30 26.671636  -9.48619142            26.523057         38.73579
## Attr31 15.477427  -4.99683778            15.504789         39.36529
## Attr32 17.040603  -9.68158824            16.962842         35.73729
## Attr33 21.061455 -15.02460726            20.801407         40.09735
## Attr34 33.317377  31.03749156            37.016551        142.96988
## Attr35 18.767108  11.44385340            19.471397         65.66435
## Attr36 23.523612 -10.61351108            23.211712         47.18448
## Attr37 19.576186  -3.17660310            19.013180         35.02208
## Attr38 16.081179  -1.09994717            16.256995         40.92263
## Attr39 18.975125  12.72958628            19.925325         63.92841
## Attr40 21.139368   5.03596213            22.175491         56.31590
## Attr41 13.069486  10.06966343            13.808547         54.27079
## Attr42 16.110562   3.45169819            16.698404         48.92070
## Attr43 21.529408  -9.99308313            21.128624         35.18745
## Attr44 22.253959  -7.05187648            22.336775         54.23203
## Attr45 17.492236   4.18572008            17.937643         37.99157
## Attr46 32.263106  26.30473964            33.137302        109.48304
## Attr47 26.227297   3.93457020            26.663260         43.29199
## Attr48 17.978762 -10.04985732            18.024265         42.36592
## Attr49 16.401903  -8.38436544            16.415412         39.03707
## Attr50 19.974318 -12.43553381            19.849692         37.68821
## Attr51 16.448519  -4.04504482            16.470906         35.31536
## Attr52 20.403383 -15.48726000            20.063324         35.19586
## Attr53 14.252096  -9.28249791            14.165008         33.12886
## Attr54 16.872355  -4.55754412            16.947069         37.25879
## Attr55 20.884555   1.70206657            21.943277         56.35552
## Attr56 34.576317 -15.80732813            35.161157         85.20735
## Attr57 13.917590   0.48146631            14.211672         36.95150
## Attr58 53.127287 -24.15673765            53.198881        102.85798
## Attr59  7.952038  -2.03503861             7.984726         25.37383
## Attr60 18.242923  -1.49299770            18.492969         46.40723
## Attr61 21.554066  -8.10032568            21.481571         55.58762
## Attr62 17.668996 -11.98109822            17.435031         32.44292
## Attr63 16.078487 -12.06401836            15.851214         32.97737
## Attr64 16.507808  -1.43887801            16.539998         36.90226
# Plot the importance measures from the table above, one panel per measure
varImpPlot(rf_model, main = "Random Forest — Variable Importance")


Part 4: Classification Tree

Train/Test Split

# Reproducible 80/20 split of the cleaned data. The seed, input and code are
# the same as the Part 3 split, so this recreates the identical partition.
# seq_len() stays correct for an empty data frame (1:0 would yield c(1, 0)),
# and floor() makes the whole-number sample size explicit instead of relying
# on sample() truncating a fractional size.
set.seed(49)
n_rows     <- nrow(bankruptcy_clean)
index      <- sample(seq_len(n_rows), floor(0.8 * n_rows))
# The year indicator is dropped from both partitions before modelling
train_data <- bankruptcy_clean[index, ] %>% select(-year)
test_data  <- bankruptcy_clean[-index, ] %>% select(-year)

# Sanity-check the split: rows and columns of each partition
# (64 attributes plus the class label = 65 columns)
dim(train_data)
## [1] 34724    65
dim(test_data)
## [1] 8681   65
# Per-column summaries of the training partition, including the class counts
summary(train_data)
##      Attr1               Attr2               Attr3               Attr4         
##  Min.   :-463.8900   Min.   :-430.8700   Min.   :-479.9600   Min.   :   0.000  
##  1st Qu.:   0.0036   1st Qu.:   0.2689   1st Qu.:   0.0215   1st Qu.:   1.051  
##  Median :   0.0503   Median :   0.4710   Median :   0.1965   Median :   1.570  
##  Mean   :   0.0338   Mean   :   0.6064   Mean   :   0.0983   Mean   :   4.616  
##  3rd Qu.:   0.1301   3rd Qu.:   0.6890   3rd Qu.:   0.4038   3rd Qu.:   2.787  
##  Max.   :  87.4590   Max.   : 480.9600   Max.   :  22.7690   Max.   :8199.100  
##      Attr5               Attr6               Attr7          
##  Min.   :-11903000   Min.   :-508.4100   Min.   :-517.4800  
##  1st Qu.:      -49   1st Qu.:   0.0000   1st Qu.:   0.0060  
##  Median :       -1   Median :   0.0000   Median :   0.0604  
##  Mean   :     -466   Mean   :  -0.0753   Mean   :   0.0823  
##  3rd Qu.:       51   3rd Qu.:   0.0882   3rd Qu.:   0.1515  
##  Max.   :  1250100   Max.   : 543.2500   Max.   : 649.2300  
##      Attr8               Attr9              Attr10              Attr11         
##  Min.   : -141.410   Min.   :  -3.496   Min.   :-479.9100   Min.   :-463.8900  
##  1st Qu.:    0.431   1st Qu.:   1.019   1st Qu.:   0.2957   1st Qu.:   0.0159  
##  Median :    1.070   Median :   1.198   Median :   0.5066   Median :   0.0755  
##  Mean   :   10.180   Mean   :   2.589   Mean   :   0.5520   Mean   :   0.1235  
##  3rd Qu.:    2.608   3rd Qu.:   2.065   3rd Qu.:   0.7086   3rd Qu.:   0.1676  
##  Max.   :29636.000   Max.   :9742.300   Max.   :1084.7000   Max.   : 681.5400  
##      Attr12              Attr13              Attr14              Attr15        
##  Min.   :-6331.800   Min.   :-1460.600   Min.   :-517.4800   Min.   :-9632400  
##  1st Qu.:    0.016   1st Qu.:    0.024   1st Qu.:   0.0060   1st Qu.:     222  
##  Median :    0.173   Median :    0.069   Median :   0.0604   Median :     846  
##  Mean   :    0.987   Mean   :    0.835   Mean   :   0.0823   Mean   :    2172  
##  3rd Qu.:    0.590   3rd Qu.:    0.134   3rd Qu.:   0.1515   3rd Qu.:    2214  
##  Max.   : 8259.400   Max.   :13315.000   Max.   : 649.2300   Max.   :10236000  
##      Attr16              Attr17              Attr18         
##  Min.   :-6331.800   Min.   :   -0.413   Min.   :-517.4800  
##  1st Qu.:    0.074   1st Qu.:    1.451   1st Qu.:   0.0060  
##  Median :    0.246   Median :    2.117   Median :   0.0604  
##  Mean   :    1.269   Mean   :   11.353   Mean   :   0.0855  
##  3rd Qu.:    0.665   3rd Qu.:    3.697   3rd Qu.:   0.1515  
##  Max.   : 8259.400   Max.   :29642.000   Max.   : 649.2300  
##      Attr19              Attr20            Attr21              Attr22         
##  Min.   :-1578.700   Min.   :      0   Min.   :-1325.000   Min.   :-431.5900  
##  1st Qu.:    0.004   1st Qu.:     15   1st Qu.:    0.935   1st Qu.:   0.0000  
##  Median :    0.036   Median :     35   Median :    1.045   Median :   0.0625  
##  Mean   :    0.243   Mean   :    289   Mean   :    3.935   Mean   :   0.1057  
##  3rd Qu.:    0.091   3rd Qu.:     63   3rd Qu.:    1.173   3rd Qu.:   0.1505  
##  Max.   : 9230.500   Max.   :7809200   Max.   :29907.000   Max.   : 681.5400  
##      Attr23              Attr24              Attr25         
##  Min.   :-1578.700   Min.   :-463.8900   Min.   :-500.9300  
##  1st Qu.:    0.002   1st Qu.:   0.0245   1st Qu.:   0.1491  
##  Median :    0.030   Median :   0.1551   Median :   0.3844  
##  Mean   :    0.220   Mean   :   0.2526   Mean   :   0.3326  
##  3rd Qu.:    0.078   3rd Qu.:   0.3508   3rd Qu.:   0.6089  
##  Max.   : 9230.500   Max.   : 649.2300   Max.   :1353.3000  
##      Attr26              Attr27            Attr28              Attr29       
##  Min.   :-6331.800   Min.   :-259010   Min.   :-3829.900   Min.   :-0.8861  
##  1st Qu.:    0.067   1st Qu.:      0   1st Qu.:    0.045   1st Qu.: 3.4986  
##  Median :    0.222   Median :      1   Median :    0.465   Median : 4.0163  
##  Mean   :    1.109   Mean   :   1073   Mean   :    6.051   Mean   : 4.0063  
##  3rd Qu.:    0.599   3rd Qu.:      4   3rd Qu.:    1.456   3rd Qu.: 4.5219  
##  Max.   : 8262.300   Max.   :4208800   Max.   :21701.000   Max.   : 9.6983  
##      Attr30              Attr31              Attr32             Attr33         
##  Min.   : -6351.70   Min.   :-1495.600   Min.   :   -9296   Min.   :  -19.197  
##  1st Qu.:     0.08   1st Qu.:    0.007   1st Qu.:      47   1st Qu.:    2.825  
##  Median :     0.22   Median :    0.043   Median :      78   Median :    4.625  
##  Mean   :     7.68   Mean   :    0.262   Mean   :    1010   Mean   :    8.872  
##  3rd Qu.:     0.40   3rd Qu.:    0.101   3rd Qu.:     127   3rd Qu.:    7.796  
##  Max.   :152860.00   Max.   : 9244.300   Max.   :17364000   Max.   :21944.000  
##      Attr34              Attr35              Attr36             Attr37        
##  Min.   : -306.710   Min.   :-431.5900   Min.   :  -0.001   Min.   :  -525.5  
##  1st Qu.:    0.310   1st Qu.:   0.0062   1st Qu.:   1.104   1st Qu.:     2.5  
##  Median :    1.969   Median :   0.0611   Median :   1.648   Median :     3.1  
##  Mean   :    5.597   Mean   :   0.1063   Mean   :   2.843   Mean   :    62.9  
##  3rd Qu.:    4.549   3rd Qu.:   0.1506   3rd Qu.:   2.417   3rd Qu.:     3.9  
##  Max.   :21944.000   Max.   : 626.9200   Max.   :9742.300   Max.   :398920.0  
##      Attr38              Attr39              Attr40             Attr41        
##  Min.   :-479.9100   Min.   :-7522.000   Min.   : -18.718   Min.   :-667.730  
##  1st Qu.:   0.4202   1st Qu.:    0.004   1st Qu.:   0.053   1st Qu.:   0.028  
##  Median :   0.6125   Median :    0.037   Median :   0.177   Median :   0.086  
##  Mean   :   0.6509   Mean   :   -0.330   Mean   :   2.168   Mean   :   1.029  
##  3rd Qu.:   0.7722   3rd Qu.:    0.092   3rd Qu.:   0.651   3rd Qu.:   0.201  
##  Max.   :1084.7000   Max.   : 2156.500   Max.   :8007.100   Max.   :5043.300  
##      Attr42               Attr43             Attr44             Attr45         
##  Min.   :-1395.8000   Min.   :  -48532   Min.   :  -48532   Min.   :-256230.0  
##  1st Qu.:    0.0000   1st Qu.:      67   1st Qu.:      35   1st Qu.:      0.0  
##  Median :    0.0381   Median :      99   Median :      55   Median :      0.3  
##  Mean   :   -0.1461   Mean   :    1282   Mean   :     994   Mean   :     15.3  
##  3rd Qu.:    0.0921   3rd Qu.:     140   3rd Qu.:      81   3rd Qu.:      0.9  
##  Max.   : 2156.8000   Max.   :30393000   Max.   :22584000   Max.   : 366030.0  
##      Attr46             Attr47            Attr48              Attr49         
##  Min.   : -13.554   Min.   :    -64   Min.   :-542.5600   Min.   :-9001.000  
##  1st Qu.:   0.611   1st Qu.:     16   1st Qu.:  -0.0377   1st Qu.:   -0.026  
##  Median :   1.027   Median :     38   Median :   0.0189   Median :    0.011  
##  Mean   :   3.741   Mean   :    292   Mean   :   0.0230   Mean   :   -0.556  
##  3rd Qu.:   1.907   3rd Qu.:     70   3rd Qu.:   0.1080   3rd Qu.:    0.062  
##  Max.   :8199.100   Max.   :6084200   Max.   : 623.8500   Max.   :  178.890  
##      Attr50             Attr51             Attr52             Attr53         
##  Min.   :  -0.012   Min.   :  0.0000   Min.   :  -25.47   Min.   : -3828.90  
##  1st Qu.:   0.776   1st Qu.:  0.1906   1st Qu.:    0.13   1st Qu.:     0.70  
##  Median :   1.222   Median :  0.3405   Median :    0.21   Median :     1.21  
##  Mean   :   3.675   Mean   :  0.4990   Mean   :    4.93   Mean   :    22.80  
##  3rd Qu.:   2.200   3rd Qu.:  0.5341   3rd Qu.:    0.35   3rd Qu.:     2.18  
##  Max.   :8199.100   Max.   :480.9600   Max.   :88433.00   Max.   :180440.00  
##      Attr54              Attr55             Attr56          
##  Min.   : -3828.90   Min.   :-1805200   Min.   :-1108300.0  
##  1st Qu.:     0.96   1st Qu.:      30   1st Qu.:       0.0  
##  Median :     1.38   Median :    1102   Median :       0.1  
##  Mean   :    23.78   Mean   :    7761   Mean   :     -32.7  
##  3rd Qu.:     2.33   3rd Qu.:    5023   3rd Qu.:       0.1  
##  Max.   :180440.00   Max.   : 6123700   Max.   :     293.1  
##      Attr57               Attr58              Attr59              Attr60       
##  Min.   :-1667.3000   Min.   :   -198.7   Min.   : -327.970   Min.   :      0  
##  1st Qu.:    0.0151   1st Qu.:      0.9   1st Qu.:    0.000   1st Qu.:      6  
##  Median :    0.1204   Median :      1.0   Median :    0.006   Median :     10  
##  Mean   :   -0.0142   Mean   :     37.2   Mean   :    1.505   Mean   :    487  
##  3rd Qu.:    0.2843   3rd Qu.:      1.0   3rd Qu.:    0.235   3rd Qu.:     19  
##  Max.   :  552.6400   Max.   :1108300.0   Max.   :23853.000   Max.   :4818700  
##      Attr61              Attr62             Attr63              Attr64         
##  Min.   :   -12.66   Min.   :-2336500   Min.   :   -0.368   Min.   :-10677.00  
##  1st Qu.:     4.50   1st Qu.:      42   1st Qu.:    3.104   1st Qu.:     2.21  
##  Median :     6.64   Median :      71   Median :    5.088   Median :     4.28  
##  Mean   :    17.62   Mean   :    1499   Mean   :    9.630   Mean   :    71.63  
##  3rd Qu.:    10.36   3rd Qu.:     117   3rd Qu.:    8.595   3rd Qu.:     9.57  
##  Max.   :108000.00   Max.   :25016000   Max.   :23454.000   Max.   :294770.00  
##  class    
##  0:33035  
##  1: 1689  
##           
##           
##           
## 

Standard Classification Tree

# Default (equal-cost) classification tree grown on the training partition
bankruptcy_rpart <- rpart(class ~ ., train_data, method = "class")
prp(bankruptcy_rpart, main = "Standard Classification Tree", extra = 1)

# Held-out performance: predicted classes for the test partition
pred <- predict(bankruptcy_rpart, newdata = test_data, type = "class")
cat("--- Test Confusion Matrix ---\n")
## --- Test Confusion Matrix ---
table(True = test_data$class, Predict = pred)
##     Predict
## True    0    1
##    0 8253   26
##    1  270  132
# Accuracy, then its complement (the misclassification rate)
mean(pred == test_data$class)
## [1] 0.9659025
misclass_rate_test <- mean(pred != test_data$class)
misclass_rate_test
## [1] 0.03409745
# In-sample performance: the same tree scored on the rows it was grown on
pred0 <- predict(bankruptcy_rpart, newdata = train_data, type = "class")
cat("--- Train Confusion Matrix ---\n")
## --- Train Confusion Matrix ---
table(True = train_data$class, Predict = pred0)
##     Predict
## True     0     1
##    0 32973    62
##    1  1052   637
mean(pred0 == train_data$class)
## [1] 0.9679184
misclass_rate_train <- mean(pred0 != train_data$class)
misclass_rate_train
## [1] 0.03208156
# Variable importance recorded by rpart for this tree
imp <- bankruptcy_rpart$variable.importance
barplot(imp, main = "Variable Importance — Standard Tree", las = 2, col = "lightblue")

Asymmetric Cost Tree — Training Data

# Classification tree grown with an asymmetric loss matrix.
# matrix(c(0, 5, 1, 0), nrow = 2) is filled column-wise, giving rows (0, 1)
# and (5, 0): the off-diagonal losses are 1 and 5, with the 5 sitting in the
# row of the second class level ("1").
bankruptcy_rpartAs <- rpart(
  class ~ .,
  train_data,
  method = "class",
  parms  = list(loss = matrix(c(0, 5, 1, 0), nrow = 2))
)
prp(bankruptcy_rpartAs, main = "Asymmetric Tree — Train", extra = 1)

# In-sample class predictions and the resulting confusion matrix
predAs <- predict(bankruptcy_rpartAs, newdata = train_data, type = "class")
table(True = train_data$class, Predict = predAs)
##     Predict
## True     0     1
##    0 32398   637
##    1   878   811
# Accuracy, then the misclassification rate
mean(predAs == train_data$class)
## [1] 0.9563702
misclass_rate_train <- mean(predAs != train_data$class)
misclass_rate_train
## [1] 0.04362977
# Average weighted misclassification cost (a missed 1 costs 5, a false alarm 1)
# at a fixed probability cut-off of 0.35.
#   r    : observed classes (0/1, numeric or factor)
#   phat : predicted probability of class 1. This can be a plain vector or the
#          class-probability matrix returned by predict(type = "prob"). For a
#          matrix, the column named "1" is used (the last column if no such
#          name exists).
# Returns a single number: the mean of the per-observation weighted costs.
# Without the column selection, a two-column matrix would be compared
# element-wise against a recycled `r`, scoring the class-0 probabilities as if
# they were class-1 probabilities.
# A probability at or above the cut-off counts as a predicted 1, the same
# boundary rule as costfunc().
cost <- function(r, phat) {
  weight1 <- 5
  weight0 <- 1
  pcut    <- 0.35

  if (!is.null(dim(phat))) {
    pos_col <- if ("1" %in% colnames(phat)) "1" else ncol(phat)
    phat    <- phat[, pos_col]
  }
  # Refuse mismatched lengths rather than recycle silently
  stopifnot(length(phat) == length(r))

  c1 <- (r == 1) & (phat < pcut)    # actual 1, predicted 0
  c0 <- (r == 0) & (phat >= pcut)   # actual 0, predicted 1
  mean(weight1 * c1 + weight0 * c0)
}
# Weighted cost on the training partition. predict(type = "prob") returns one
# column per class level, so only the class-"1" column is handed to cost();
# passing the whole matrix would recycle the labels across both columns.
cost(train_data$class,
     predict(bankruptcy_rpartAs, train_data, type = "prob")[, "1"])
# NOTE: the value 0.5853876 recorded for this call in earlier runs was computed
# from the full two-column matrix and is not a valid cost; re-run to refresh.
imp <- bankruptcy_rpartAs$variable.importance
barplot(imp, las = 2, main = "Variable Importance — Asymmetric Tree (Train)", col = "lightblue")

Asymmetric Cost Tree — Test Data

# Held-out evaluation of the asymmetric-cost tree.
# The tree fitted on train_data (bankruptcy_rpartAs) is scored on test_data,
# mirroring how the standard tree is evaluated. Growing a second tree on
# test_data and predicting those same rows would measure in-sample fit only.
# The former object name is kept as an alias so later code that refers to
# bankruptcy_rpartAt keeps working.
bankruptcy_rpartAt <- bankruptcy_rpartAs
# The tree diagram and variable importances belong to the training fit and are
# already plotted in the training section, so they are not repeated here.

predAt <- predict(bankruptcy_rpartAt, test_data, type = "class")
table(test_data$class, predAt, dnn = c("True", "Predict"))
mean(predAt == test_data$class)
misclass_rate_test <- mean(predAt != test_data$class); misclass_rate_test
# Only the class-"1" probability column is passed to cost(); the full
# two-column matrix would recycle the labels across both columns.
cost(test_data$class,
     predict(bankruptcy_rpartAt, test_data, type = "prob")[, "1"])
# NOTE: figures recorded in earlier runs (confusion matrix, accuracy 0.953116,
# cost 0.5877779) came from a tree fitted on test_data and must be regenerated.


Results Summary

| Model | Variant | Data | Key Output |
|---|---|---|---|
| Logistic Regression | Stepwise backward | Train/Test | ROC curve + AUC |
| Logistic Regression | 5-fold CV | Full dataset | Symmetric & asymmetric CV error |
| Random Forest | 500 trees | Train/Test | Confusion matrix + variable importance |
| Classification Tree | Standard | Train/Test | Accuracy + misclassification rate |
| Classification Tree | Asymmetric (5:1 loss) | Train | Accuracy + asymmetric cost |
| Classification Tree | Asymmetric (5:1 loss) | Test | Accuracy + asymmetric cost |