This project predicts corporate bankruptcy using five years of financial ratio data (64 attributes per company). We apply three models — Logistic Regression, Random Forest, and Classification Trees (standard and asymmetric cost) — and evaluate each using confusion matrices, ROC curves, and cross-validation.
Dataset: Polish Companies Bankruptcy Data (ARFF
format, 1–5 year files)
Tools: R, RStudio — packages: farff,
glmnet, ROCR, randomForest,
rpart, rpart.plot, boot,
dplyr
# Install only the packages that are not already available, so re-running or
# knitting this document does not re-download all eight packages each time.
required_pkgs <- c("farff", "glmnet", "ROCR", "randomForest",
                   "rpart", "rpart.plot", "boot", "dplyr")
pkg_available <- vapply(required_pkgs, requireNamespace, logical(1),
                        quietly = TRUE)
missing_pkgs <- required_pkgs[!pkg_available]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(farff)
library(glmnet)
library(ROCR)
library(dplyr)
library(randomForest)
library(rpart)
library(rpart.plot)
library(boot)
Note: The chunk below reads the five ARFF files — 1year.arff, 2year.arff, 3year.arff, 4year.arff, and 5year.arff — from an absolute folder path.
Update that folder path to wherever your copies are stored, and update the filenames in the chunk if yours are named differently.
# Load the five yearly ARFF files, tag each with its file number (1-5), and
# stack them into one data frame.
# The folder is named once here so relocating the data is a one-line change.
data_dir <- "/Users/a/Desktop/BUS 193/polish+companies+bankruptcy+data (1)"
arff_paths <- file.path(data_dir, paste0(1:5, "year.arff"))
# Fail early with a readable message rather than an error from inside readARFF
missing_paths <- arff_paths[!file.exists(arff_paths)]
if (length(missing_paths) > 0) {
  stop("ARFF file(s) not found: ", paste(missing_paths, collapse = ", "),
       call. = FALSE)
}
data_1yr <- readARFF(arff_paths[1])
data_2yr <- readARFF(arff_paths[2])
data_3yr <- readARFF(arff_paths[3])
data_4yr <- readARFF(arff_paths[4])
data_5yr <- readARFF(arff_paths[5])
# Add year indicator
data_1yr$year <- 1
data_2yr$year <- 2
data_3yr$year <- 3
data_4yr$year <- 4
data_5yr$year <- 5
# Combine into one dataset
bankruptcy <- rbind(data_1yr, data_2yr, data_3yr, data_4yr, data_5yr)
# Snapshot the combined raw data; class conversion and NA handling come later
save(bankruptcy, file = "bankruptcy_data.RData")
# Treat the bankruptcy label as a categorical outcome
bankruptcy$class <- as.factor(bankruptcy$class)
# Build bankruptcy_clean: every numeric column has its NAs replaced by that
# column's median (computed over the non-missing values); non-numeric columns
# are passed through untouched, so no rows are dropped.
impute_median <- function(col) {
  if (!is.numeric(col)) {
    return(col)
  }
  col[is.na(col)] <- median(col, na.rm = TRUE)
  col
}
bankruptcy_clean <- as.data.frame(lapply(bankruptcy, impute_median))
# Re-assert the factor type of class. The factor column is not numeric, so it
# already passes through the imputation step unchanged; this is a safeguard.
bankruptcy_clean$class <- as.factor(bankruptcy_clean$class)
str(bankruptcy)
## 'data.frame': 43405 obs. of 66 variables:
## $ Attr1 : num 0.2006 0.2091 0.2487 0.0815 0.1873 ...
## $ Attr2 : num 0.38 0.5 0.696 0.307 0.613 ...
## $ Attr3 : num 0.396 0.472 0.267 0.459 0.23 ...
## $ Attr4 : num 2.05 1.94 1.55 2.49 1.41 ...
## $ Attr5 : num 32.35 14.79 -1.15 51.95 -7.31 ...
## $ Attr6 : num 0.388 0 0 0.15 0.187 ...
## $ Attr7 : num 0.2498 0.2583 0.3091 0.0927 0.1873 ...
## $ Attr8 : num 1.331 0.996 0.437 1.866 0.631 ...
## $ Attr9 : num 1.14 1.7 1.31 1.06 1.16 ...
## $ Attr10: num 0.505 0.498 0.304 0.574 0.387 ...
## $ Attr11: num 0.2498 0.2611 0.3126 0.0927 0.1873 ...
## $ Attr12: num 0.66 0.517 0.642 0.302 0.331 ...
## $ Attr13: num 0.1666 0.1583 0.2444 0.0943 0.1218 ...
## $ Attr14: num 0.2498 0.2583 0.3091 0.0927 0.1873 ...
## $ Attr15: num 497 678 794 917 1133 ...
## $ Attr16: num 0.734 0.538 0.46 0.398 0.322 ...
## $ Attr17: num 2.63 2 1.44 3.25 1.63 ...
## $ Attr18: num 0.2498 0.2583 0.3091 0.0927 0.1873 ...
## $ Attr19: num 0.1494 0.152 0.2361 0.0714 0.1155 ...
## $ Attr20: num 43.4 88 73.1 79.8 57 ...
## $ Attr21: num 1.25 1.43 1.43 1.51 NA ...
## $ Attr22: num 0.214 0.248 0.303 0.116 0.198 ...
## $ Attr23: num 0.12 0.123 0.19 0.0628 0.1155 ...
## $ Attr24: num 0.477 NA NA 0.172 0.187 ...
## $ Attr25: num 0.505 0.395 0.289 0.574 0.387 ...
## $ Attr26: num 0.604 0.44 0.373 0.362 0.322 ...
## $ Attr27: num 1.458 88.444 86.011 0.941 1.414 ...
## $ Attr28: num 1.76 16.95 1.06 1.96 1.12 ...
## $ Attr29: num 5.94 3.69 4.37 4.65 4.14 ...
## $ Attr30: num 0.118 0.27 0.419 0.143 0.279 ...
## $ Attr31: num 0.1494 0.152 0.2382 0.0714 0.1155 ...
## $ Attr32: num 94.1 122.2 176.9 91.4 147 ...
## $ Attr33: num 3.88 2.99 2.06 3.99 2.48 ...
## $ Attr34: num 0.564 2.988 1.427 0.376 0.323 ...
## $ Attr35: num 0.214 0.206 0.316 0.116 0.198 ...
## $ Attr36: num 1.74 1.7 1.31 1.36 1.63 ...
## $ Attr37: num 593.3 NA 2.3 NA 11.2 ...
## $ Attr38: num 0.506 0.498 0.515 0.574 0.435 ...
## $ Attr39: num 0.128 0.121 0.241 0.089 0.122 ...
## $ Attr40: num 0.663 0.0864 0.322 0.4014 0.293 ...
## $ Attr41: num 0.0514 0.0644 0.074 0.0696 0.0967 ...
## $ Attr42: num 0.128 0.146 0.231 0.089 0.122 ...
## $ Attr43: num 114 199 166 181 142 ...
## $ Attr44: num 71 111.5 92.4 101 84.6 ...
## $ Attr45: num 1.01 0.51 0.948 0.287 0.739 ...
## $ Attr46: num 1.522 1.125 1.01 1.57 0.958 ...
## $ Attr47: num 49.4 100.1 96.4 84.3 65.9 ...
## $ Attr48: num 0.1853 0.2373 0.2918 0.0859 0.1881 ...
## $ Attr49: num 0.1109 0.1396 0.2229 0.0662 0.116 ...
## $ Attr50: num 2.04 1.94 1.08 2.49 1.3 ...
## $ Attr51: num 0.379 0.5 0.482 0.307 0.565 ...
## $ Attr52: num 0.258 0.335 0.485 0.25 0.403 ...
## $ Attr53: num 2.24 17.87 1.21 2.45 1.88 ...
## $ Attr54: num 2.25 17.87 2.05 2.45 2.12 ...
## $ Attr55: num 348690 2305 6333 20545 3187 ...
## $ Attr56: num 0.122 0.121 0.241 0.054 0.135 ...
## $ Attr57: num 0.397 0.42 0.818 0.142 0.484 ...
## $ Attr58: num 0.878 0.853 0.766 0.946 0.865 ...
## $ Attr59: num 0.00192 0 0.69484 0 0.12444 ...
## $ Attr60: num 8.42 4.15 4.99 4.57 6.4 ...
## $ Attr61: num 5.14 3.27 3.95 3.61 4.32 ...
## $ Attr62: num 82.7 107.3 134.3 86.4 127.2 ...
## $ Attr63: num 4.42 3.4 2.72 4.22 2.87 ...
## $ Attr64: num 7.43 60.99 5.21 5.55 7.9 ...
## $ class : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
## $ year : num 1 1 1 1 1 1 1 1 1 1 ...
summary(bankruptcy)
## Attr1 Attr2 Attr3 Attr4
## Min. :-463.8900 Min. :-430.8700 Min. :-479.9600 Min. : -0.40
## 1st Qu.: 0.0034 1st Qu.: 0.2690 1st Qu.: 0.0215 1st Qu.: 1.05
## Median : 0.0497 Median : 0.4719 Median : 0.1966 Median : 1.57
## Mean : 0.0352 Mean : 0.5902 Mean : 0.1144 Mean : 6.31
## 3rd Qu.: 0.1296 3rd Qu.: 0.6883 3rd Qu.: 0.4034 3rd Qu.: 2.79
## Max. : 94.2800 Max. : 480.9600 Max. : 28.3360 Max. :53433.00
## NA's :8 NA's :8 NA's :8 NA's :134
## Attr5 Attr6 Attr7 Attr8
## Min. :-11903000 Min. :-508.4100 Min. :-517.4800 Min. : -141.41
## 1st Qu.: -49 1st Qu.: 0.0000 1st Qu.: 0.0058 1st Qu.: 0.43
## Median : -1 Median : 0.0000 Median : 0.0596 Median : 1.07
## Mean : -385 Mean : -0.0561 Mean : 0.0935 Mean : 12.64
## 3rd Qu.: 51 3rd Qu.: 0.0894 3rd Qu.: 0.1509 3rd Qu.: 2.62
## Max. : 1250100 Max. : 543.2500 Max. : 649.2300 Max. :53432.00
## NA's :89 NA's :8 NA's :8 NA's :94
## Attr9 Attr10 Attr11 Attr12
## Min. : -3.496 Min. :-479.9100 Min. :-463.8900 Min. :-6331.800
## 1st Qu.: 1.018 1st Qu.: 0.2955 1st Qu.: 0.0154 1st Qu.: 0.015
## Median : 1.195 Median : 0.5060 Median : 0.0753 Median : 0.172
## Mean : 2.652 Mean : 0.6269 Mean : 0.1311 Mean : 1.132
## 3rd Qu.: 2.062 3rd Qu.: 0.7091 3rd Qu.: 0.1668 3rd Qu.: 0.587
## Max. :9742.300 Max. :1099.5000 Max. : 681.5400 Max. : 8259.400
## NA's :9 NA's :8 NA's :44 NA's :134
## Attr13 Attr14 Attr15 Attr16
## Min. :-1460.600 Min. :-517.4800 Min. :-9632400 Min. :-6331.800
## 1st Qu.: 0.024 1st Qu.: 0.0058 1st Qu.: 223 1st Qu.: 0.073
## Median : 0.068 Median : 0.0597 Median : 846 Median : 0.246
## Mean : 0.810 Mean : 0.0935 Mean : 1992 Mean : 1.411
## 3rd Qu.: 0.135 3rd Qu.: 0.1509 3rd Qu.: 2227 3rd Qu.: 0.665
## Max. :13315.000 Max. : 649.2300 Max. :10236000 Max. : 8259.400
## NA's :127 NA's :8 NA's :36 NA's :95
## Attr17 Attr18 Attr19 Attr20
## Min. : -0.41 Min. :-517.4800 Min. :-1578.700 Min. : -29
## 1st Qu.: 1.45 1st Qu.: 0.0058 1st Qu.: 0.004 1st Qu.: 15
## Median : 2.12 Median : 0.0597 Median : 0.036 Median : 35
## Mean : 13.80 Mean : 0.0986 Mean : 0.156 Mean : 243
## 3rd Qu.: 3.70 3rd Qu.: 0.1509 3rd Qu.: 0.091 3rd Qu.: 64
## Max. :53433.00 Max. : 649.2300 Max. : 9230.500 Max. :7809200
## NA's :94 NA's :8 NA's :128 NA's :127
## Attr21 Attr22 Attr23
## Min. :-1325.000 Min. :-431.5900 Min. :-1578.700
## 1st Qu.: 0.908 1st Qu.: 0.0000 1st Qu.: 0.002
## Median : 1.045 Median : 0.0623 Median : 0.030
## Mean : 3.885 Mean : 0.1139 Mean : 0.139
## 3rd Qu.: 1.204 3rd Qu.: 0.1500 3rd Qu.: 0.078
## Max. :29907.000 Max. : 681.5400 Max. : 9230.500
## NA's :5854 NA's :8 NA's :127
## Attr24 Attr25 Attr26 Attr27
## Min. :-463.8900 Min. :-500.9300 Min. :-6331.800 Min. :-259010
## 1st Qu.: 0.0213 1st Qu.: 0.1501 1st Qu.: 0.067 1st Qu.: 0
## Median : 0.1551 Median : 0.3845 Median : 0.222 Median : 1
## Mean : 0.2700 Mean : 0.3928 Mean : 1.264 Mean : 1108
## 3rd Qu.: 0.3556 3rd Qu.: 0.6107 3rd Qu.: 0.599 3rd Qu.: 5
## Max. : 831.6600 Max. :1353.3000 Max. : 8262.300 Max. :4208800
## NA's :922 NA's :8 NA's :95 NA's :2764
## Attr28 Attr29 Attr30 Attr31
## Min. :-3829.900 Min. :-0.8861 Min. : -6351.70 Min. :-1495.600
## 1st Qu.: 0.038 1st Qu.: 3.4951 1st Qu.: 0.08 1st Qu.: 0.007
## Median : 0.465 Median : 4.0140 Median : 0.22 Median : 0.043
## Mean : 6.003 Mean : 4.0050 Mean : 7.37 Mean : 0.177
## 3rd Qu.: 1.497 3rd Qu.: 4.5202 3rd Qu.: 0.41 3rd Qu.: 0.102
## Max. :21701.000 Max. : 9.6983 Max. :152860.00 Max. : 9244.300
## NA's :812 NA's :8 NA's :127 NA's :127
## Attr32 Attr33 Attr34 Attr35
## Min. : -9296 Min. : -19.197 Min. :-1696.000 Min. :-431.5900
## 1st Qu.: 46 1st Qu.: 2.820 1st Qu.: 0.306 1st Qu.: 0.0060
## Median : 78 Median : 4.625 Median : 1.967 Median : 0.0607
## Mean : 1163 Mean : 8.636 Mean : 5.411 Mean : 0.1119
## 3rd Qu.: 128 3rd Qu.: 7.803 3rd Qu.: 4.551 3rd Qu.: 0.1501
## Max. :17364000 Max. :21944.000 Max. :21944.000 Max. : 626.9200
## NA's :368 NA's :134 NA's :94 NA's :8
## Attr36 Attr37 Attr38 Attr39
## Min. : -0.001 Min. : -525.5 Min. :-479.9100 Min. :-7522.000
## 1st Qu.: 1.101 1st Qu.: 1.1 1st Qu.: 0.4198 1st Qu.: 0.004
## Median : 1.643 Median : 3.1 Median : 0.6122 Median : 0.037
## Mean : 2.911 Mean : 105.1 Mean : 0.7244 Mean : -0.289
## 3rd Qu.: 2.421 3rd Qu.: 11.4 3rd Qu.: 0.7718 3rd Qu.: 0.092
## Max. :9742.300 Max. :398920.0 Max. :1099.5000 Max. : 2156.500
## NA's :8 NA's :18984 NA's :8 NA's :127
## Attr40 Attr41 Attr42 Attr43
## Min. :-101.270 Min. : -1234.40 Min. :-1395.8000 Min. : -115870
## 1st Qu.: 0.053 1st Qu.: 0.03 1st Qu.: 0.0000 1st Qu.: 67
## Median : 0.177 Median : 0.09 Median : 0.0380 Median : 99
## Mean : 2.147 Mean : 7.72 Mean : -0.1425 Mean : 1074
## 3rd Qu.: 0.652 3rd Qu.: 0.21 3rd Qu.: 0.0921 3rd Qu.: 141
## Max. :8007.100 Max. :288770.00 Max. : 2156.8000 Max. :30393000
## NA's :134 NA's :754 NA's :127 NA's :127
## Attr44 Attr45 Attr46 Attr47
## Min. : -115870 Min. :-256230.0 Min. : -101.26 Min. : -96
## 1st Qu.: 35 1st Qu.: 0.0 1st Qu.: 0.61 1st Qu.: 16
## Median : 55 Median : 0.3 Median : 1.03 Median : 38
## Mean : 831 Mean : 14.8 Mean : 5.43 Mean : 358
## 3rd Qu.: 81 3rd Qu.: 1.0 3rd Qu.: 1.91 3rd Qu.: 70
## Max. :22584000 Max. : 366030.0 Max. :53433.00 Max. :6084200
## NA's :127 NA's :2147 NA's :135 NA's :297
## Attr48 Attr49 Attr50 Attr51
## Min. :-542.5600 Min. :-9001.000 Min. : -0.05 Min. : -0.1866
## 1st Qu.: -0.0382 1st Qu.: -0.027 1st Qu.: 0.77 1st Qu.: 0.1901
## Median : 0.0184 Median : 0.011 Median : 1.22 Median : 0.3410
## Mean : 0.0286 Mean : -0.483 Mean : 5.84 Mean : 0.4835
## 3rd Qu.: 0.1073 3rd Qu.: 0.062 3rd Qu.: 2.21 3rd Qu.: 0.5347
## Max. : 623.8500 Max. : 178.890 Max. :53433.00 Max. :480.9600
## NA's :9 NA's :127 NA's :94 NA's :8
## Attr52 Attr53 Attr54 Attr55
## Min. : -25.47 Min. : -3828.90 Min. : -3828.90 Min. :-1805200
## 1st Qu.: 0.13 1st Qu.: 0.69 1st Qu.: 0.96 1st Qu.: 28
## Median : 0.21 Median : 1.21 Median : 1.38 Median : 1088
## Mean : 6.48 Mean : 23.77 Mean : 24.65 Mean : 7672
## 3rd Qu.: 0.35 3rd Qu.: 2.22 3rd Qu.: 2.37 3rd Qu.: 4993
## Max. :88433.00 Max. :180440.00 Max. :180440.00 Max. : 6123700
## NA's :301 NA's :812 NA's :812 NA's :1
## Attr56 Attr57 Attr58
## Min. :-1108300.0 Min. :-1667.3000 Min. : -198.7
## 1st Qu.: 0.0 1st Qu.: 0.0146 1st Qu.: 0.9
## Median : 0.1 Median : 0.1197 Median : 1.0
## Mean : -26.2 Mean : -0.0105 Mean : 30.0
## 3rd Qu.: 0.1 3rd Qu.: 0.2846 3rd Qu.: 1.0
## Max. : 293.1 Max. : 552.6400 Max. :1108300.0
## NA's :127 NA's :7 NA's :84
## Attr59 Attr60 Attr61 Attr62
## Min. : -327.970 Min. : -12 Min. : -12.66 Min. :-2336500
## 1st Qu.: 0.000 1st Qu.: 6 1st Qu.: 4.51 1st Qu.: 42
## Median : 0.006 Median : 10 Median : 6.64 Median : 71
## Mean : 1.333 Mean : 448 Mean : 17.03 Mean : 1502
## 3rd Qu.: 0.236 3rd Qu.: 20 3rd Qu.: 10.39 3rd Qu.: 117
## Max. :23853.000 Max. :4818700 Max. :108000.00 Max. :25016000
## NA's :7 NA's :2152 NA's :102 NA's :127
## Attr63 Attr64 class year
## Min. : -1.543 Min. :-10677.00 0:41314 Min. :1.00
## 1st Qu.: 3.098 1st Qu.: 2.18 1: 2091 1st Qu.:2.00
## Median : 5.088 Median : 4.28 Median :3.00
## Mean : 9.343 Mean : 72.79 Mean :2.94
## 3rd Qu.: 8.599 3rd Qu.: 9.78 3rd Qu.:4.00
## Max. :23454.000 Max. :294770.00 Max. :5.00
## NA's :134 NA's :812
# Class distribution
table(bankruptcy$class)
##
## 0 1
## 41314 2091
prop.table(table(bankruptcy$class))
##
## 0 1
## 0.95182583 0.04817417
# Check missing values per column
colSums(is.na(bankruptcy))
## Attr1 Attr2 Attr3 Attr4 Attr5 Attr6 Attr7 Attr8 Attr9 Attr10 Attr11
## 8 8 8 134 89 8 8 94 9 8 44
## Attr12 Attr13 Attr14 Attr15 Attr16 Attr17 Attr18 Attr19 Attr20 Attr21 Attr22
## 134 127 8 36 95 94 8 128 127 5854 8
## Attr23 Attr24 Attr25 Attr26 Attr27 Attr28 Attr29 Attr30 Attr31 Attr32 Attr33
## 127 922 8 95 2764 812 8 127 127 368 134
## Attr34 Attr35 Attr36 Attr37 Attr38 Attr39 Attr40 Attr41 Attr42 Attr43 Attr44
## 94 8 8 18984 8 127 134 754 127 127 127
## Attr45 Attr46 Attr47 Attr48 Attr49 Attr50 Attr51 Attr52 Attr53 Attr54 Attr55
## 2147 135 297 9 127 94 8 301 812 812 1
## Attr56 Attr57 Attr58 Attr59 Attr60 Attr61 Attr62 Attr63 Attr64 class year
## 127 7 84 7 2152 102 127 134 812 0 0
# Remove rows with missing values
# NOTE(review): this drops every row that has an NA in any column. Attr37 alone
# is missing in 18984 of the 43405 rows (see the colSums output), and the model
# output further down implies 19967 rows remain (15973 train + 3994 test).
# bankruptcy_clean keeps all 43405 rows via median imputation instead, so the
# logistic regression and the random forest are fitted on different samples.
# Confirm that this is intended.
bankruptcy <- na.omit(bankruptcy)
# Box plots of log(|x| + 1) for the 64 attributes, drawn in six panels so the
# heavy-tailed ratios stay readable; las = 2 turns the axis labels sideways.
attr_groups <- list(1:10, 11:20, 21:30, 31:40, 41:50, 51:64)
for (cols in attr_groups) {
  panel_title <- sprintf("Attr%d-Attr%d", min(cols), max(cols))
  boxplot(log(abs(bankruptcy[, cols]) + 1), las = 2, main = panel_title)
}
# Recode the factor label back to numeric 0/1 for the binomial glm below
bankruptcy$class <- as.numeric(as.character(bankruptcy$class))
# Median-fill any NA still present in the first 64 columns (Attr1..Attr64)
for (attr_name in names(bankruptcy)[1:64]) {
  na_rows <- is.na(bankruptcy[[attr_name]])
  bankruptcy[[attr_name]][na_rows] <- median(bankruptcy[[attr_name]], na.rm = TRUE)
}
# Random 80/20 train-test partition; the seed makes the split repeatable
set.seed(49)
n_rows <- nrow(bankruptcy)
index <- sample(n_rows, n_rows * 0.80)
bankruptcy_test <- bankruptcy[-index, ]
bankruptcy_train <- bankruptcy[index, ]
# Full logistic regression of class on every other column of the training set
# (Attr1..Attr64 plus year), allowing up to 100 fitting iterations.
bankruptcy_glm0 <- glm(class ~ .,
                       family = binomial,
                       data = bankruptcy_train,
                       control = glm.control(maxit = 100))
# Surface non-convergence explicitly: a fit that exhausts maxit, with huge
# coefficients and a residual deviance above the null deviance, is not a
# usable model, and every statistic derived from it inherits the problem.
if (!isTRUE(bankruptcy_glm0$converged)) {
  warning("bankruptcy_glm0 did not converge within maxit iterations; ",
          "coefficients, deviance and AIC/BIC are unreliable. Consider ",
          "scaling or trimming the predictors, or a penalized fit.",
          call. = FALSE)
}
summary(bankruptcy_glm0)
##
## Call:
## glm(formula = class ~ ., family = binomial, data = bankruptcy_train,
## control = glm.control(maxit = 100))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.961e+14 8.702e+06 45516279 <2e-16 ***
## Attr1 3.602e+14 1.612e+07 22338489 <2e-16 ***
## Attr2 4.144e+13 1.238e+07 3347781 <2e-16 ***
## Attr3 1.644e+14 4.166e+06 39464824 <2e-16 ***
## Attr4 3.129e+13 4.352e+05 71890313 <2e-16 ***
## Attr5 1.322e+08 5.555e+00 23792074 <2e-16 ***
## Attr6 3.541e+12 3.364e+05 10526649 <2e-16 ***
## Attr7 -1.480e+18 1.862e+10 -79492135 <2e-16 ***
## Attr8 2.985e+14 1.638e+06 182183757 <2e-16 ***
## Attr9 -5.343e+13 7.426e+05 -71958209 <2e-16 ***
## Attr10 1.467e+14 1.261e+07 11633462 <2e-16 ***
## Attr11 -3.665e+14 1.497e+07 -24475937 <2e-16 ***
## Attr12 -1.067e+13 6.039e+05 -17675483 <2e-16 ***
## Attr13 2.513e+13 3.097e+05 81125813 <2e-16 ***
## Attr14 1.484e+18 1.868e+10 79475330 <2e-16 ***
## Attr15 -4.081e+08 6.487e+00 -62910285 <2e-16 ***
## Attr16 4.307e+14 9.530e+06 45192077 <2e-16 ***
## Attr17 -3.192e+14 1.571e+06 -203190629 <2e-16 ***
## Attr18 -4.622e+15 6.104e+07 -75714140 <2e-16 ***
## Attr19 -1.025e+15 2.063e+07 -49659863 <2e-16 ***
## Attr20 2.325e+15 8.390e+07 27713202 <2e-16 ***
## Attr21 -1.280e+11 2.097e+03 -61041538 <2e-16 ***
## Attr22 -2.173e+15 1.565e+07 -138833991 <2e-16 ***
## Attr23 6.774e+14 1.821e+07 37195182 <2e-16 ***
## Attr24 2.182e+12 2.084e+05 10471750 <2e-16 ***
## Attr25 -6.724e+12 2.869e+06 -2343385 <2e-16 ***
## Attr26 -5.219e+14 9.821e+06 -53138050 <2e-16 ***
## Attr27 -1.345e+09 2.464e+01 -54584205 <2e-16 ***
## Attr28 4.356e+13 5.157e+05 84480026 <2e-16 ***
## Attr29 3.199e+13 9.605e+05 33305532 <2e-16 ***
## Attr30 -2.259e+13 4.513e+05 -50049226 <2e-16 ***
## Attr31 4.421e+14 1.237e+07 35743001 <2e-16 ***
## Attr32 1.116e+11 1.272e+04 8777522 <2e-16 ***
## Attr33 -7.110e+12 5.783e+05 -12294231 <2e-16 ***
## Attr34 4.772e+13 2.981e+05 160085098 <2e-16 ***
## Attr35 1.186e+15 8.410e+06 140994299 <2e-16 ***
## Attr36 5.288e+13 7.886e+05 67059457 <2e-16 ***
## Attr37 -3.868e+09 2.773e+02 -13949664 <2e-16 ***
## Attr38 -3.141e+14 1.116e+07 -28145563 <2e-16 ***
## Attr39 -1.593e+14 2.793e+06 -57046291 <2e-16 ***
## Attr40 1.941e+14 8.162e+05 237772357 <2e-16 ***
## Attr41 -7.683e+09 2.325e+02 -33045757 <2e-16 ***
## Attr42 -8.993e+14 8.840e+06 -101722899 <2e-16 ***
## Attr43 -2.325e+15 8.390e+07 -27715865 <2e-16 ***
## Attr44 2.325e+15 8.389e+07 27715687 <2e-16 ***
## Attr45 -2.743e+10 1.699e+03 -16142540 <2e-16 ***
## Attr46 -2.125e+14 7.062e+05 -300849354 <2e-16 ***
## Attr47 3.459e+10 7.253e+02 47688619 <2e-16 ***
## Attr48 1.182e+15 6.373e+06 185451361 <2e-16 ***
## Attr49 7.594e+14 5.069e+06 149805143 <2e-16 ***
## Attr50 -1.453e+14 9.251e+05 -157066264 <2e-16 ***
## Attr51 -1.036e+14 1.137e+07 -9115614 <2e-16 ***
## Attr52 -6.038e+13 4.656e+06 -12967592 <2e-16 ***
## Attr53 1.543e+12 3.075e+04 50164331 <2e-16 ***
## Attr54 -4.435e+13 5.195e+05 -85370131 <2e-16 ***
## Attr55 -4.033e+07 8.921e+00 -4520430 <2e-16 ***
## Attr56 -2.645e+13 2.660e+06 -9943304 <2e-16 ***
## Attr57 5.112e+10 7.421e+04 688851 <2e-16 ***
## Attr58 -1.157e+13 2.638e+06 -4385567 <2e-16 ***
## Attr59 -2.262e+11 1.686e+04 -13419976 <2e-16 ***
## Attr60 -7.441e+10 8.720e+02 -85325844 <2e-16 ***
## Attr61 -5.039e+12 2.453e+04 -205411267 <2e-16 ***
## Attr62 -6.083e+10 1.782e+03 -34141323 <2e-16 ***
## Attr63 -1.642e+13 5.675e+05 -28933516 <2e-16 ***
## Attr64 -3.461e+11 5.475e+03 -63213678 <2e-16 ***
## year 2.528e+13 4.119e+05 61381950 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3298.2 on 15972 degrees of freedom
## Residual deviance: 31502.2 on 15907 degrees of freedom
## AIC: 31634
##
## Number of Fisher Scoring iterations: 100
AIC(bankruptcy_glm0)
## [1] 31634.15
BIC(bankruptcy_glm0)
## [1] 32140.94
# Backward stepwise selection by AIC, starting from the full model.
# trace = 0 is what actually silences the per-step log; without it step()
# prints every candidate model. direction is spelled out rather than left to
# the default. The selected model is summarised below.
bankruptcy_glm_back <- step(bankruptcy_glm0, direction = "backward", trace = 0)
summary(bankruptcy_glm_back)
##
## Call:
## glm(formula = class ~ Attr1 + Attr2 + Attr3 + Attr4 + Attr5 +
## Attr6 + Attr7 + Attr8 + Attr9 + Attr10 + Attr11 + Attr12 +
## Attr13 + Attr14 + Attr15 + Attr16 + Attr17 + Attr18 + Attr19 +
## Attr20 + Attr21 + Attr22 + Attr23 + Attr24 + Attr25 + Attr26 +
## Attr27 + Attr29 + Attr30 + Attr31 + Attr32 + Attr33 + Attr34 +
## Attr35 + Attr36 + Attr37 + Attr38 + Attr39 + Attr40 + Attr41 +
## Attr42 + Attr43 + Attr44 + Attr45 + Attr46 + Attr47 + Attr48 +
## Attr49 + Attr50 + Attr51 + Attr52 + Attr53 + Attr55 + Attr56 +
## Attr57 + Attr58 + Attr59 + Attr60 + Attr61 + Attr62 + Attr63 +
## Attr64 + year, family = binomial, data = bankruptcy_train,
## control = glm.control(maxit = 100))
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.479e+15 8.688e+06 170212115 <2e-16 ***
## Attr1 -6.162e+14 1.607e+07 -38357193 <2e-16 ***
## Attr2 -5.816e+14 1.229e+07 -47323642 <2e-16 ***
## Attr3 3.778e+14 4.149e+06 91044276 <2e-16 ***
## Attr4 4.484e+13 4.277e+05 104827972 <2e-16 ***
## Attr5 3.469e+08 5.555e+00 62448644 <2e-16 ***
## Attr6 2.359e+11 3.364e+05 701243 <2e-16 ***
## Attr7 -1.049e+18 1.836e+10 -57143235 <2e-16 ***
## Attr8 7.645e+14 1.637e+06 467063378 <2e-16 ***
## Attr9 -8.774e+13 7.425e+05 -118172362 <2e-16 ***
## Attr10 -4.280e+14 1.251e+07 -34205423 <2e-16 ***
## Attr11 4.471e+14 1.497e+07 29865690 <2e-16 ***
## Attr12 3.092e+13 6.038e+05 51213535 <2e-16 ***
## Attr13 1.571e+13 3.023e+05 51951637 <2e-16 ***
## Attr14 1.052e+18 1.842e+10 57118759 <2e-16 ***
## Attr15 1.575e+08 6.487e+00 24282657 <2e-16 ***
## Attr16 -3.297e+14 9.522e+06 -34619612 <2e-16 ***
## Attr17 -7.982e+14 1.570e+06 -508484781 <2e-16 ***
## Attr18 -2.870e+15 5.974e+07 -48038741 <2e-16 ***
## Attr19 -8.943e+14 2.062e+07 -43374650 <2e-16 ***
## Attr20 1.116e+15 8.365e+07 13343981 <2e-16 ***
## Attr21 -1.168e+11 2.097e+03 -55719101 <2e-16 ***
## Attr22 -8.178e+14 1.564e+07 -52279705 <2e-16 ***
## Attr23 8.682e+14 1.819e+07 47716105 <2e-16 ***
## Attr24 5.433e+12 2.084e+05 26071942 <2e-16 ***
## Attr25 1.678e+13 2.869e+06 5849725 <2e-16 ***
## Attr26 2.972e+14 9.815e+06 30277386 <2e-16 ***
## Attr27 -1.145e+09 2.464e+01 -46453419 <2e-16 ***
## Attr29 -1.109e+13 9.604e+05 -11551950 <2e-16 ***
## Attr30 -1.604e+13 4.504e+05 -35610454 <2e-16 ***
## Attr31 1.804e+14 1.234e+07 14616982 <2e-16 ***
## Attr32 5.329e+11 1.271e+04 41942329 <2e-16 ***
## Attr33 2.811e+13 5.779e+05 48638822 <2e-16 ***
## Attr34 6.718e+13 2.978e+05 225623532 <2e-16 ***
## Attr35 -1.761e+14 8.397e+06 -20974791 <2e-16 ***
## Attr36 8.026e+13 7.882e+05 101825993 <2e-16 ***
## Attr37 -1.361e+09 2.773e+02 -4906531 <2e-16 ***
## Attr38 -4.791e+13 1.104e+07 -4341086 <2e-16 ***
## Attr39 -1.733e+14 2.793e+06 -62036082 <2e-16 ***
## Attr40 1.904e+14 8.131e+05 234219198 <2e-16 ***
## Attr41 -1.812e+10 2.325e+02 -77927466 <2e-16 ***
## Attr42 -9.572e+14 8.827e+06 -108436523 <2e-16 ***
## Attr43 -1.117e+15 8.365e+07 -13347700 <2e-16 ***
## Attr44 1.116e+15 8.365e+07 13346328 <2e-16 ***
## Attr45 1.049e+09 1.699e+03 617437 <2e-16 ***
## Attr46 -2.214e+14 7.044e+05 -314355030 <2e-16 ***
## Attr47 5.211e+10 7.252e+02 71864807 <2e-16 ***
## Attr48 9.142e+14 6.373e+06 143457679 <2e-16 ***
## Attr49 7.802e+14 5.069e+06 153918862 <2e-16 ***
## Attr50 -1.717e+14 9.250e+05 -185601739 <2e-16 ***
## Attr51 9.594e+12 1.126e+07 852027 <2e-16 ***
## Attr52 -2.257e+14 4.653e+06 -48518608 <2e-16 ***
## Attr53 -8.558e+11 2.551e+04 -33545505 <2e-16 ***
## Attr55 4.996e+07 8.921e+00 5601063 <2e-16 ***
## Attr56 -6.748e+13 2.659e+06 -25377827 <2e-16 ***
## Attr57 -2.397e+12 7.421e+04 -32295493 <2e-16 ***
## Attr58 -5.126e+13 2.636e+06 -19442686 <2e-16 ***
## Attr59 -7.410e+11 1.686e+04 -43958843 <2e-16 ***
## Attr60 -5.071e+10 8.720e+02 -58155907 <2e-16 ***
## Attr61 -4.954e+12 2.453e+04 -201954799 <2e-16 ***
## Attr62 -3.061e+10 1.779e+03 -17209185 <2e-16 ***
## Attr63 -6.463e+13 5.673e+05 -113932261 <2e-16 ***
## Attr64 -7.357e+10 3.680e+03 -19991259 <2e-16 ***
## year 2.721e+13 4.118e+05 66070885 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3298.2 on 15972 degrees of freedom
## Residual deviance: 25302.6 on 15909 degrees of freedom
## AIC: 25431
##
## Number of Fisher Scoring iterations: 100
bankruptcy_glm_back$deviance
## [1] 25302.64
AIC(bankruptcy_glm_back)
## [1] 25430.64
BIC(bankruptcy_glm_back)
## [1] 25922.08
# In-sample predicted probabilities from the stepwise model, scored against
# the training labels and drawn as an ROC curve
pred_glm0_train <- predict(bankruptcy_glm_back, type = "response")
pred <- prediction(pred_glm0_train, bankruptcy_train$class)
perf <- performance(pred, measure = "tpr", x.measure = "fpr")
plot(perf, colorize = TRUE, main = "Training ROC Curve")
# Training AUC
auc_train <- performance(pred, measure = "auc")@y.values
cat("Training AUC:", unlist(auc_train))
## Training AUC: 0.5642232
# Training-set fitted probabilities from the stepwise model
pred_resp <- predict(bankruptcy_glm_back, type = "response")
# Histograms of the linear predictor (log-odds) and of the probabilities
hist(predict(bankruptcy_glm_back), main = "Log-Odds — Training")
hist(pred_resp, main = "Predicted Probabilities — Training")
# Confusion table: a row is predicted 1 when its probability exceeds 0.45
pred_class_train <- as.numeric(pred_resp > 0.45)
table(bankruptcy_train$class, pred_class_train, dnn = c("Truth", "Predicted"))
## Predicted
## Truth 0 1
## 0 15577 55
## 1 296 45
# Predicted probabilities for the held-out rows, scored against the test
# labels and drawn as an ROC curve
pred_glm0_test <- predict(bankruptcy_glm_back, newdata = bankruptcy_test, type = "response")
pred_test <- prediction(pred_glm0_test, bankruptcy_test$class)
perf_test <- performance(pred_test, measure = "tpr", x.measure = "fpr")
plot(perf_test, colorize = TRUE, main = "Test ROC Curve")
# Test AUC
auc_test <- performance(pred_test, measure = "auc")@y.values
cat("Test AUC:", unlist(auc_test))
## Test AUC: 0.5357713
# Distribution of the held-out predicted probabilities
hist(pred_glm0_test, main = "Predicted Probabilities — Test")
# Confusion table on the test rows, using the same 0.45 cutoff as for training
pred_class_test <- as.numeric(pred_glm0_test > 0.45)
table(bankruptcy_test$class, pred_class_test, dnn = c("Truth", "Predicted"))
## Predicted
## Truth 0 1
## 0 3882 21
## 1 84 7
length(bankruptcy_test$class)
## [1] 3994
length(pred_glm0_test)
## [1] 3994
# Full model on entire dataset — required for cv.glm
# FIX: defined BEFORE cv calls so both symmetric and asymmetric CV can use it
# Unlike bankruptcy_glm0, this fit passes no glm.control(maxit = ...), so it
# runs with glm's default iteration limit.
# NOTE(review): bankruptcy_glm0 ran to its 100-iteration limit on the training
# subset; check bankruptcy_glm1$converged before relying on the CV errors.
bankruptcy_glm1 <- glm(class ~ ., family = binomial, data = bankruptcy)
# --- Symmetric Cost (equal penalty for false positives and false negatives) ---
# Misclassification rate at a probability cutoff, for use as the `cost`
# argument of cv.glm().
#   r    : observed responses coded 0/1
#   pi   : predicted probabilities, same length as r
#   pcut : cutoff; an observation is predicted 1 only when pi > pcut
#          (default 0.45, the threshold used for the confusion tables)
# Returns the share of observations whose predicted class differs from r.
# A probability exactly equal to pcut is predicted 0, so it is an error when
# r == 1; a strict `pi < pcut` test would silently count that case as correct.
Sym_cost <- function(r, pi, pcut = 0.45) {
  false_pos <- (r == 0) & (pi > pcut)
  false_neg <- (r == 1) & (pi <= pcut)
  mean(false_pos | false_neg)
}
# 5-fold cross-validated misclassification rate of the full logistic model.
# cv.glm assigns rows to folds at random, so the seed is fixed here to make
# the reported error reproducible from run to run.
set.seed(49)
cv_result_sym <- cv.glm(data = bankruptcy,
                        glmfit = bankruptcy_glm1,
                        cost = Sym_cost,
                        K = 5)
# delta[2] is the second (adjusted) of the two estimates cv.glm returns
cat("Symmetric CV Error:", cv_result_sym$delta[2])
## Symmetric CV Error: 0.02735501
# --- Asymmetric Cost (missed bankruptcies penalized 10x more than false alarms) ---
# Asymmetric misclassification cost for cv.glm().
#   obs    : observed responses coded 0/1
#   pred.p : predicted probabilities, same length as obs
# A missed bankruptcy (obs 1, probability below the cutoff) costs 10; a false
# alarm (obs 0, probability at or above the cutoff) costs 1. The cutoff is
# derived from the two weights as 1 / (1 + weight1 / weight0), i.e. 1/11.
# Returns the mean cost per observation.
costfunc <- function(obs, pred.p) {
  weight1 <- 10
  weight0 <- 1
  pcut <- 1 / (1 + weight1 / weight0)
  missed <- (obs == 1) & (pred.p < pcut)
  false_alarm <- (obs == 0) & (pred.p >= pcut)
  mean(weight1 * missed + weight0 * false_alarm)
}
# 5-fold cross-validated asymmetric cost of the full logistic model.
# cv.glm assigns rows to folds at random, so the seed is fixed here to make
# the reported cost reproducible from run to run.
set.seed(49)
cv_result <- cv.glm(data = bankruptcy,
                    glmfit = bankruptcy_glm1,
                    cost = costfunc,
                    K = 5)
# delta[2] is the second (adjusted) of the two estimates cv.glm returns
cat("Asymmetric CV Error:", cv_result$delta[2])
## Asymmetric CV Error: 0.2114973
# 80/20 split of the median-imputed data for the random forest.
# year is removed from both halves, so the forest sees only Attr1..Attr64
# as predictors of class.
set.seed(49)
n_clean <- nrow(bankruptcy_clean)
index <- sample(seq_len(n_clean), 0.8 * n_clean)
train_data <- select(bankruptcy_clean[index, ], -year)
test_data <- select(bankruptcy_clean[-index, ], -year)
# Fit a 500-tree random forest classifier of class on every other column of
# train_data (year was already removed when the split was made).
# importance = TRUE keeps the per-variable importance measures
# (MeanDecreaseAccuracy, MeanDecreaseGini) that importance() reports later.
rf_model <- randomForest(class ~ .,
data = train_data,
ntree = 500,
importance = TRUE)
# Printing the model shows the call, the OOB error rate and the OOB
# confusion matrix
print(rf_model)
##
## Call:
## randomForest(formula = class ~ ., data = train_data, ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 8
##
## OOB estimate of error rate: 3.69%
## Confusion matrix:
## 0 1 class.error
## 0 32943 92 0.002784925
## 1 1188 501 0.703374778
# Predicted classes from the forest: first for the rows it was trained on
# (passed back in as new data), then for the held-out test rows
predictions_tr <- predict(rf_model, newdata = train_data)
predictions_ts <- predict(rf_model, newdata = test_data)
cat("--- Test Confusion Matrix ---\n")
## --- Test Confusion Matrix ---
# Cross-tabulate predicted against actual class on the test rows
confusion_matrix <- table(predictions_ts, test_data$class,
                          dnn = c("Predicted", "Actual"))
print(confusion_matrix)
## Actual
## Predicted 0 1
## 0 8250 289
## 1 29 113
cat("\n--- Training Confusion Matrix ---\n")
##
## --- Training Confusion Matrix ---
# Cross-tabulate predicted against actual class on the training rows
confusion_matrix_tr <- table(predictions_tr, train_data$class,
                             dnn = c("Predicted", "Actual"))
print(confusion_matrix_tr)
## Actual
## Predicted 0 1
## 0 33035 1
## 1 0 1688
importance(rf_model)
## 0 1 MeanDecreaseAccuracy MeanDecreaseGini
## Attr1 16.606896 -4.62829772 16.549753 34.22019
## Attr2 12.825919 -14.28133995 12.697868 31.56926
## Attr3 14.906236 -10.05502409 14.745010 39.46503
## Attr4 14.328891 -9.48999565 14.090354 40.31501
## Attr5 18.412297 11.16428281 19.337372 61.65713
## Attr6 30.907568 20.02666632 36.015391 59.55725
## Attr7 14.502785 -9.59044994 14.446049 33.64986
## Attr8 13.530473 -13.02644495 13.408588 32.53687
## Attr9 21.222955 16.96220909 22.630786 69.50581
## Attr10 16.744612 -13.65277180 16.665943 33.80095
## Attr11 16.323718 -8.13665256 16.268027 38.05544
## Attr12 13.274865 -1.26906753 13.257259 34.25865
## Attr13 17.564946 5.69835444 17.932058 48.94109
## Attr14 15.658742 -5.83654038 15.609562 34.00251
## Attr15 12.161186 1.41609149 12.364717 43.25968
## Attr16 16.635736 0.53070782 16.763292 45.34406
## Attr17 15.032969 -14.76587783 14.873581 30.52931
## Attr18 15.399354 -10.08756496 15.305691 32.39138
## Attr19 13.949108 -4.12034193 13.924831 35.90499
## Attr20 21.772861 1.12177017 22.025468 43.96938
## Attr21 9.662612 1.01458962 9.442617 35.88365
## Attr22 18.254718 1.63706467 18.577494 52.92922
## Attr23 17.954142 0.05072284 17.732231 34.98786
## Attr24 38.965614 20.72044746 42.735709 87.12716
## Attr25 18.996455 -7.86459799 19.417101 45.43018
## Attr26 23.743066 4.64224574 24.010787 49.79202
## Attr27 68.354361 38.16800110 71.903366 217.57040
## Attr28 14.375249 -5.08016948 14.403208 36.55971
## Attr29 34.680379 6.45343627 35.142111 64.87872
## Attr30 26.671636 -9.48619142 26.523057 38.73579
## Attr31 15.477427 -4.99683778 15.504789 39.36529
## Attr32 17.040603 -9.68158824 16.962842 35.73729
## Attr33 21.061455 -15.02460726 20.801407 40.09735
## Attr34 33.317377 31.03749156 37.016551 142.96988
## Attr35 18.767108 11.44385340 19.471397 65.66435
## Attr36 23.523612 -10.61351108 23.211712 47.18448
## Attr37 19.576186 -3.17660310 19.013180 35.02208
## Attr38 16.081179 -1.09994717 16.256995 40.92263
## Attr39 18.975125 12.72958628 19.925325 63.92841
## Attr40 21.139368 5.03596213 22.175491 56.31590
## Attr41 13.069486 10.06966343 13.808547 54.27079
## Attr42 16.110562 3.45169819 16.698404 48.92070
## Attr43 21.529408 -9.99308313 21.128624 35.18745
## Attr44 22.253959 -7.05187648 22.336775 54.23203
## Attr45 17.492236 4.18572008 17.937643 37.99157
## Attr46 32.263106 26.30473964 33.137302 109.48304
## Attr47 26.227297 3.93457020 26.663260 43.29199
## Attr48 17.978762 -10.04985732 18.024265 42.36592
## Attr49 16.401903 -8.38436544 16.415412 39.03707
## Attr50 19.974318 -12.43553381 19.849692 37.68821
## Attr51 16.448519 -4.04504482 16.470906 35.31536
## Attr52 20.403383 -15.48726000 20.063324 35.19586
## Attr53 14.252096 -9.28249791 14.165008 33.12886
## Attr54 16.872355 -4.55754412 16.947069 37.25879
## Attr55 20.884555 1.70206657 21.943277 56.35552
## Attr56 34.576317 -15.80732813 35.161157 85.20735
## Attr57 13.917590 0.48146631 14.211672 36.95150
## Attr58 53.127287 -24.15673765 53.198881 102.85798
## Attr59 7.952038 -2.03503861 7.984726 25.37383
## Attr60 18.242923 -1.49299770 18.492969 46.40723
## Attr61 21.554066 -8.10032568 21.481571 55.58762
## Attr62 17.668996 -11.98109822 17.435031 32.44292
## Attr63 16.078487 -12.06401836 15.851214 32.97737
## Attr64 16.507808 -1.43887801 16.539998 36.90226
varImpPlot(rf_model, main = "Random Forest — Variable Importance")
# Re-create the 80/20 split of the median-imputed data (same seed and same
# expression as the random-forest split, so the same rows are selected);
# year is dropped from both halves.
set.seed(49)
n_clean <- nrow(bankruptcy_clean)
index <- sample(seq_len(n_clean), 0.8 * n_clean)
train_data <- select(bankruptcy_clean[index, ], -year)
test_data <- select(bankruptcy_clean[-index, ], -year)
dim(train_data)
## [1] 34724 65
dim(test_data)
## [1] 8681 65
summary(train_data)
## Attr1 Attr2 Attr3 Attr4
## Min. :-463.8900 Min. :-430.8700 Min. :-479.9600 Min. : 0.000
## 1st Qu.: 0.0036 1st Qu.: 0.2689 1st Qu.: 0.0215 1st Qu.: 1.051
## Median : 0.0503 Median : 0.4710 Median : 0.1965 Median : 1.570
## Mean : 0.0338 Mean : 0.6064 Mean : 0.0983 Mean : 4.616
## 3rd Qu.: 0.1301 3rd Qu.: 0.6890 3rd Qu.: 0.4038 3rd Qu.: 2.787
## Max. : 87.4590 Max. : 480.9600 Max. : 22.7690 Max. :8199.100
## Attr5 Attr6 Attr7
## Min. :-11903000 Min. :-508.4100 Min. :-517.4800
## 1st Qu.: -49 1st Qu.: 0.0000 1st Qu.: 0.0060
## Median : -1 Median : 0.0000 Median : 0.0604
## Mean : -466 Mean : -0.0753 Mean : 0.0823
## 3rd Qu.: 51 3rd Qu.: 0.0882 3rd Qu.: 0.1515
## Max. : 1250100 Max. : 543.2500 Max. : 649.2300
## Attr8 Attr9 Attr10 Attr11
## Min. : -141.410 Min. : -3.496 Min. :-479.9100 Min. :-463.8900
## 1st Qu.: 0.431 1st Qu.: 1.019 1st Qu.: 0.2957 1st Qu.: 0.0159
## Median : 1.070 Median : 1.198 Median : 0.5066 Median : 0.0755
## Mean : 10.180 Mean : 2.589 Mean : 0.5520 Mean : 0.1235
## 3rd Qu.: 2.608 3rd Qu.: 2.065 3rd Qu.: 0.7086 3rd Qu.: 0.1676
## Max. :29636.000 Max. :9742.300 Max. :1084.7000 Max. : 681.5400
## Attr12 Attr13 Attr14 Attr15
## Min. :-6331.800 Min. :-1460.600 Min. :-517.4800 Min. :-9632400
## 1st Qu.: 0.016 1st Qu.: 0.024 1st Qu.: 0.0060 1st Qu.: 222
## Median : 0.173 Median : 0.069 Median : 0.0604 Median : 846
## Mean : 0.987 Mean : 0.835 Mean : 0.0823 Mean : 2172
## 3rd Qu.: 0.590 3rd Qu.: 0.134 3rd Qu.: 0.1515 3rd Qu.: 2214
## Max. : 8259.400 Max. :13315.000 Max. : 649.2300 Max. :10236000
## Attr16 Attr17 Attr18
## Min. :-6331.800 Min. : -0.413 Min. :-517.4800
## 1st Qu.: 0.074 1st Qu.: 1.451 1st Qu.: 0.0060
## Median : 0.246 Median : 2.117 Median : 0.0604
## Mean : 1.269 Mean : 11.353 Mean : 0.0855
## 3rd Qu.: 0.665 3rd Qu.: 3.697 3rd Qu.: 0.1515
## Max. : 8259.400 Max. :29642.000 Max. : 649.2300
## Attr19 Attr20 Attr21 Attr22
## Min. :-1578.700 Min. : 0 Min. :-1325.000 Min. :-431.5900
## 1st Qu.: 0.004 1st Qu.: 15 1st Qu.: 0.935 1st Qu.: 0.0000
## Median : 0.036 Median : 35 Median : 1.045 Median : 0.0625
## Mean : 0.243 Mean : 289 Mean : 3.935 Mean : 0.1057
## 3rd Qu.: 0.091 3rd Qu.: 63 3rd Qu.: 1.173 3rd Qu.: 0.1505
## Max. : 9230.500 Max. :7809200 Max. :29907.000 Max. : 681.5400
## Attr23 Attr24 Attr25
## Min. :-1578.700 Min. :-463.8900 Min. :-500.9300
## 1st Qu.: 0.002 1st Qu.: 0.0245 1st Qu.: 0.1491
## Median : 0.030 Median : 0.1551 Median : 0.3844
## Mean : 0.220 Mean : 0.2526 Mean : 0.3326
## 3rd Qu.: 0.078 3rd Qu.: 0.3508 3rd Qu.: 0.6089
## Max. : 9230.500 Max. : 649.2300 Max. :1353.3000
## Attr26 Attr27 Attr28 Attr29
## Min. :-6331.800 Min. :-259010 Min. :-3829.900 Min. :-0.8861
## 1st Qu.: 0.067 1st Qu.: 0 1st Qu.: 0.045 1st Qu.: 3.4986
## Median : 0.222 Median : 1 Median : 0.465 Median : 4.0163
## Mean : 1.109 Mean : 1073 Mean : 6.051 Mean : 4.0063
## 3rd Qu.: 0.599 3rd Qu.: 4 3rd Qu.: 1.456 3rd Qu.: 4.5219
## Max. : 8262.300 Max. :4208800 Max. :21701.000 Max. : 9.6983
## Attr30 Attr31 Attr32 Attr33
## Min. : -6351.70 Min. :-1495.600 Min. : -9296 Min. : -19.197
## 1st Qu.: 0.08 1st Qu.: 0.007 1st Qu.: 47 1st Qu.: 2.825
## Median : 0.22 Median : 0.043 Median : 78 Median : 4.625
## Mean : 7.68 Mean : 0.262 Mean : 1010 Mean : 8.872
## 3rd Qu.: 0.40 3rd Qu.: 0.101 3rd Qu.: 127 3rd Qu.: 7.796
## Max. :152860.00 Max. : 9244.300 Max. :17364000 Max. :21944.000
## Attr34 Attr35 Attr36 Attr37
## Min. : -306.710 Min. :-431.5900 Min. : -0.001 Min. : -525.5
## 1st Qu.: 0.310 1st Qu.: 0.0062 1st Qu.: 1.104 1st Qu.: 2.5
## Median : 1.969 Median : 0.0611 Median : 1.648 Median : 3.1
## Mean : 5.597 Mean : 0.1063 Mean : 2.843 Mean : 62.9
## 3rd Qu.: 4.549 3rd Qu.: 0.1506 3rd Qu.: 2.417 3rd Qu.: 3.9
## Max. :21944.000 Max. : 626.9200 Max. :9742.300 Max. :398920.0
## Attr38 Attr39 Attr40 Attr41
## Min. :-479.9100 Min. :-7522.000 Min. : -18.718 Min. :-667.730
## 1st Qu.: 0.4202 1st Qu.: 0.004 1st Qu.: 0.053 1st Qu.: 0.028
## Median : 0.6125 Median : 0.037 Median : 0.177 Median : 0.086
## Mean : 0.6509 Mean : -0.330 Mean : 2.168 Mean : 1.029
## 3rd Qu.: 0.7722 3rd Qu.: 0.092 3rd Qu.: 0.651 3rd Qu.: 0.201
## Max. :1084.7000 Max. : 2156.500 Max. :8007.100 Max. :5043.300
## Attr42 Attr43 Attr44 Attr45
## Min. :-1395.8000 Min. : -48532 Min. : -48532 Min. :-256230.0
## 1st Qu.: 0.0000 1st Qu.: 67 1st Qu.: 35 1st Qu.: 0.0
## Median : 0.0381 Median : 99 Median : 55 Median : 0.3
## Mean : -0.1461 Mean : 1282 Mean : 994 Mean : 15.3
## 3rd Qu.: 0.0921 3rd Qu.: 140 3rd Qu.: 81 3rd Qu.: 0.9
## Max. : 2156.8000 Max. :30393000 Max. :22584000 Max. : 366030.0
## Attr46 Attr47 Attr48 Attr49
## Min. : -13.554 Min. : -64 Min. :-542.5600 Min. :-9001.000
## 1st Qu.: 0.611 1st Qu.: 16 1st Qu.: -0.0377 1st Qu.: -0.026
## Median : 1.027 Median : 38 Median : 0.0189 Median : 0.011
## Mean : 3.741 Mean : 292 Mean : 0.0230 Mean : -0.556
## 3rd Qu.: 1.907 3rd Qu.: 70 3rd Qu.: 0.1080 3rd Qu.: 0.062
## Max. :8199.100 Max. :6084200 Max. : 623.8500 Max. : 178.890
## Attr50 Attr51 Attr52 Attr53
## Min. : -0.012 Min. : 0.0000 Min. : -25.47 Min. : -3828.90
## 1st Qu.: 0.776 1st Qu.: 0.1906 1st Qu.: 0.13 1st Qu.: 0.70
## Median : 1.222 Median : 0.3405 Median : 0.21 Median : 1.21
## Mean : 3.675 Mean : 0.4990 Mean : 4.93 Mean : 22.80
## 3rd Qu.: 2.200 3rd Qu.: 0.5341 3rd Qu.: 0.35 3rd Qu.: 2.18
## Max. :8199.100 Max. :480.9600 Max. :88433.00 Max. :180440.00
## Attr54 Attr55 Attr56
## Min. : -3828.90 Min. :-1805200 Min. :-1108300.0
## 1st Qu.: 0.96 1st Qu.: 30 1st Qu.: 0.0
## Median : 1.38 Median : 1102 Median : 0.1
## Mean : 23.78 Mean : 7761 Mean : -32.7
## 3rd Qu.: 2.33 3rd Qu.: 5023 3rd Qu.: 0.1
## Max. :180440.00 Max. : 6123700 Max. : 293.1
## Attr57 Attr58 Attr59 Attr60
## Min. :-1667.3000 Min. : -198.7 Min. : -327.970 Min. : 0
## 1st Qu.: 0.0151 1st Qu.: 0.9 1st Qu.: 0.000 1st Qu.: 6
## Median : 0.1204 Median : 1.0 Median : 0.006 Median : 10
## Mean : -0.0142 Mean : 37.2 Mean : 1.505 Mean : 487
## 3rd Qu.: 0.2843 3rd Qu.: 1.0 3rd Qu.: 0.235 3rd Qu.: 19
## Max. : 552.6400 Max. :1108300.0 Max. :23853.000 Max. :4818700
## Attr61 Attr62 Attr63 Attr64
## Min. : -12.66 Min. :-2336500 Min. : -0.368 Min. :-10677.00
## 1st Qu.: 4.50 1st Qu.: 42 1st Qu.: 3.104 1st Qu.: 2.21
## Median : 6.64 Median : 71 Median : 5.088 Median : 4.28
## Mean : 17.62 Mean : 1499 Mean : 9.630 Mean : 71.63
## 3rd Qu.: 10.36 3rd Qu.: 117 3rd Qu.: 8.595 3rd Qu.: 9.57
## Max. :108000.00 Max. :25016000 Max. :23454.000 Max. :294770.00
## class
## 0:33035
## 1: 1689
##
##
##
##
# Standard (equal-cost) classification tree: `class` modelled on every other
# column of the training set.
bankruptcy_rpart <- rpart(
  class ~ .,
  data = train_data,
  method = "class"
)
# Draw the fitted tree.
prp(bankruptcy_rpart, main = "Standard Classification Tree", extra = 1)
# Held-out evaluation of the standard tree (fitted on train_data).
pred <- predict(bankruptcy_rpart, newdata = test_data, type = "class")
cat("--- Test Confusion Matrix ---\n")
## --- Test Confusion Matrix ---
# Rows: actual class. Columns: predicted class.
table(True = test_data$class, Predict = pred)
## Predict
## True 0 1
## 0 8253 26
## 1 270 132
# Accuracy: share of test rows whose predicted class equals the actual class.
mean(test_data$class == pred)
## [1] 0.9659025
# Misclassification rate (complement of accuracy): stored, then printed.
misclass_rate_test <- mean(test_data$class != pred)
misclass_rate_test
## [1] 0.03409745
# In-sample evaluation of the standard tree on the rows it was fitted on.
pred0 <- predict(bankruptcy_rpart, newdata = train_data, type = "class")
cat("--- Train Confusion Matrix ---\n")
## --- Train Confusion Matrix ---
# Rows: actual class. Columns: predicted class.
table(True = train_data$class, Predict = pred0)
## Predict
## True 0 1
## 0 32973 62
## 1 1052 637
# Accuracy on the training rows.
mean(train_data$class == pred0)
## [1] 0.9679184
# Misclassification rate on the training rows: stored, then printed.
misclass_rate_train <- mean(train_data$class != pred0)
misclass_rate_train
## [1] 0.03208156
# Bar chart of the variable-importance vector stored in the standard tree;
# las = 2 turns the axis labels perpendicular to the axis.
imp <- bankruptcy_rpart[["variable.importance"]]
barplot(
  height = imp,
  main = "Variable Importance — Standard Tree",
  col = "lightblue",
  las = 2
)
# Cost-sensitive tree fitted on the training set. The loss matrix is
#        [,1] [,2]
#   [1,]   0    1
#   [2,]   5    0
# Per rpart's convention (rows = actual class, columns = predicted class),
# classifying an actual "1" as "0" is penalised 5x as heavily as the reverse.
bankruptcy_rpartAs <- rpart(
  class ~ .,
  data = train_data,
  method = "class",
  parms = list(loss = rbind(c(0, 1), c(5, 0)))
)
prp(bankruptcy_rpartAs, main = "Asymmetric Tree — Train", extra = 1)
# In-sample evaluation of the asymmetric-loss tree on the training rows.
predAs <- predict(bankruptcy_rpartAs, newdata = train_data, type = "class")
# Rows: actual class. Columns: predicted class.
table(True = train_data$class, Predict = predAs)
## Predict
## True 0 1
## 0 32398 637
## 1 878 811
# Accuracy on the training rows.
mean(train_data$class == predAs)
## [1] 0.9563702
# Misclassification rate: stored, then printed. This reassigns
# `misclass_rate_train`, replacing the value stored for the standard tree.
misclass_rate_train <- mean(train_data$class != predAs)
misclass_rate_train
## [1] 0.04362977
# Average asymmetric misclassification cost at a fixed probability cut-off.
#
# Arguments:
#   r     Actual classes coded 0/1 (numeric, or a factor labelled "0"/"1").
#   phat  Predicted probability of class 1. Either a plain vector with one
#         value per observation, or a class-probability matrix / data frame
#         with one column per class (as returned by
#         predict(<rpart>, type = "prob")); in that case the column named
#         "1" is used, falling back to the last column if no such name exists.
#
# Returns:
#   Mean per-observation cost, where "actual 1, predicted 0" costs 5 and
#   "actual 0, predicted 1" costs 1. Observations with phat exactly equal to
#   the cut-off (0.35) contribute no cost.
cost <- function(r, phat) {
  weight1 <- 5
  weight0 <- 1
  pcut <- 0.35
  # Without this reduction a two-column probability matrix is compared
  # element-wise with `r` recycled down both columns: the mean is then taken
  # over 2 * n cells and correct class-0 predictions are counted as errors.
  if (is.matrix(phat) || is.data.frame(phat)) {
    pos_col <- if ("1" %in% colnames(phat)) "1" else ncol(phat)
    phat <- phat[, pos_col]
  }
  # Fail loudly instead of silently recycling mismatched inputs.
  stopifnot(length(r) == length(phat))
  c1 <- (r == 1) & (phat < pcut) # actual 1, predicted 0
  c0 <- (r == 0) & (phat > pcut) # actual 0, predicted 1
  mean(weight1 * c1 + weight0 * c0)
}
# Asymmetric cost of the loss-weighted tree on the training rows.
# predict(type = "prob") returns one probability column per class level, while
# cost() compares a single probability with its cut-off, so only the column
# for class "1" is passed. Passing the whole matrix recycles the actual
# classes across both columns and inflates the result.
cost(
  train_data$class,
  predict(bankruptcy_rpartAs, train_data, type = "prob")[, "1"]
)
# NOTE(review): the value printed below was produced when the full two-column
# matrix was passed; re-run this chunk to refresh it.
## [1] 0.5853876
# Bar chart of the variable-importance vector stored in the asymmetric-loss
# tree; las = 2 turns the axis labels perpendicular to the axis.
imp <- bankruptcy_rpartAs[["variable.importance"]]
barplot(
  height = imp,
  main = "Variable Importance — Asymmetric Tree (Train)",
  col = "lightblue",
  las = 2
)
# Out-of-sample evaluation of the asymmetric (5:1 loss) tree.
# The model must be fitted on the training rows only: fitting it on test_data
# and then scoring those same rows reports in-sample performance under a
# "test" label. Fitting on train_data matches how the standard tree is
# evaluated (fit on train_data, predict on test_data).
bankruptcy_rpartAt <- rpart(formula = class ~ ., data = train_data,
                            method = "class",
                            parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
# The tree drawn here is the train-fitted model that is scored on the test rows.
prp(bankruptcy_rpartAt, extra = 1, main = "Asymmetric Tree — Test")
predAt <- predict(bankruptcy_rpartAt, test_data, type = "class")
# NOTE(review): every printed result in this section was produced by a tree
# fitted on test_data (and, for the cost, with the full probability matrix);
# re-run the chunk to refresh the numbers.
# Rows: actual class. Columns: predicted class.
table(test_data$class, predAt, dnn = c("True", "Predict"))
## Predict
## True 0 1
## 0 8086 193
## 1 214 188
# Accuracy on the held-out rows.
mean(predAt == test_data$class)
## [1] 0.953116
# Misclassification rate on the held-out rows; this reassigns
# `misclass_rate_test`.
misclass_rate_test <- mean(predAt != test_data$class); misclass_rate_test
## [1] 0.046884
# cost() needs the probability of class "1" only, not the two-column matrix.
cost(test_data$class,
     predict(bankruptcy_rpartAt, test_data, type = "prob")[, "1"])
## [1] 0.5877779
# Variable importance of the train-fitted model evaluated above.
imp <- bankruptcy_rpartAt$variable.importance
barplot(imp, las = 2, main = "Variable Importance — Asymmetric Tree (Test)", col = "lightblue")
| Model | Variant | Data | Key Output |
|---|---|---|---|
| Logistic Regression | Stepwise backward | Train/Test | ROC curve + AUC |
| Logistic Regression | 5-fold CV | Full dataset | Symmetric & asymmetric CV error |
| Random Forest | 500 trees | Train/Test | Confusion matrix + variable importance |
| Classification Tree | Standard | Train/Test | Accuracy + misclassification rate |
| Classification Tree | Asymmetric (5:1 loss) | Train | Accuracy + asymmetric cost |
| Classification Tree | Asymmetric (5:1 loss) | Test | Accuracy + asymmetric cost |