The following RMD contains the appendix code for CUNY SPS DATA 621 Fall 2025 final project.
library(tidyverse)
library(ggplot2)
library(dplyr)
library(caret)
library(pROC)
# Download provided data set
bank_raw <- read_csv("https://raw.githubusercontent.com/evanskaylie/DATA621/refs/heads/main/american_bankruptcy.csv")
# Save the data
bank_data <- bank_raw
# Rename columns
colnames(bank_data) <- c("company_name",
"status_label",
"year",
"current_assets",
"cost_of_goods_sold",
"depreciation",
"EBITDA",
"inventory",
"net_income",
"total_receivables",
"market_value",
"net_sales",
"total_assets",
"total_long_term_debt",
"EBIT",
"gross_profit",
"current_liabilities",
"retained_earnings",
"total_revenue",
"total_liabilities",
"operating_expenses"
)
# Take a look
head(bank_data)
## # A tibble: 6 × 21
## company_name status_label year current_assets cost_of_goods_sold depreciation
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 C_1 alive 1999 511. 833. 18.4
## 2 C_1 alive 2000 486. 714. 18.6
## 3 C_1 alive 2001 437. 526. 22.5
## 4 C_1 alive 2002 396. 497. 27.2
## 5 C_1 alive 2003 432. 523. 26.7
## 6 C_1 alive 2004 475. 598. 28.0
## # ℹ 15 more variables: EBITDA <dbl>, inventory <dbl>, net_income <dbl>,
## # total_receivables <dbl>, market_value <dbl>, net_sales <dbl>,
## # total_assets <dbl>, total_long_term_debt <dbl>, EBIT <dbl>,
## # gross_profit <dbl>, current_liabilities <dbl>, retained_earnings <dbl>,
## # total_revenue <dbl>, total_liabilities <dbl>, operating_expenses <dbl>
# Crop
bank_latest <- bank_data |>
group_by(company_name) |>
filter(year == max(year)) |>
ungroup()
# Status label
bank_latest$status_label <- ifelse(bank_latest$status_label == "alive", 1, 0)
# See changes
head(bank_latest)
## # A tibble: 6 × 21
## company_name status_label year current_assets cost_of_goods_sold depreciation
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 C_1 1 2017 943. 1413. 40.5
## 2 C_2 1 2010 1108. 677. 61.5
## 3 C_3 1 2008 12.7 19.3 1.69
## 4 C_4 1 2007 582. 268. 46.3
## 5 C_5 1 1999 29.0 79.6 2.02
## 6 C_6 0 2010 6838 18138 995
## # ℹ 15 more variables: EBITDA <dbl>, inventory <dbl>, net_income <dbl>,
## # total_receivables <dbl>, market_value <dbl>, net_sales <dbl>,
## # total_assets <dbl>, total_long_term_debt <dbl>, EBIT <dbl>,
## # gross_profit <dbl>, current_liabilities <dbl>, retained_earnings <dbl>,
## # total_revenue <dbl>, total_liabilities <dbl>, operating_expenses <dbl>
glimpse(bank_latest)
## Rows: 8,971
## Columns: 21
## $ company_name <chr> "C_1", "C_2", "C_3", "C_4", "C_5", "C_6", "C_7", …
## $ status_label <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1…
## $ year <dbl> 2017, 2010, 2008, 2007, 1999, 2010, 2018, 2006, 2…
## $ current_assets <dbl> 942.700, 1107.700, 12.686, 581.502, 28.957, 6838.…
## $ cost_of_goods_sold <dbl> 1413.200, 677.200, 19.334, 267.810, 79.567, 18138…
## $ depreciation <dbl> 40.500, 61.500, 1.686, 46.338, 2.024, 995.000, 13…
## $ EBITDA <dbl> 126.500, 121.400, -2.721, 40.816, 3.873, 1303.000…
## $ inventory <dbl> 547.900, 106.400, 6.950, 125.963, 10.947, 594.000…
## $ net_income <dbl> 15.600, 62.000, -11.049, -13.581, -0.138, -471.00…
## $ total_receivables <dbl> 203.000, 252.500, 4.282, 130.246, 15.890, 738.000…
## $ market_value <dbl> 1551.4580, 1231.5240, 5.0201, 882.4491, 6.1972, 2…
## $ net_sales <dbl> 1748.300, 1156.600, 34.280, 722.425, 107.310, 221…
## $ total_assets <dbl> 1524.700, 1474.500, 21.401, 1288.165, 42.210, 250…
## $ total_long_term_debt <dbl> 177.200, 650.800, 0.023, 300.000, 0.591, 9253.000…
## $ EBIT <dbl> 86.000, 59.900, -4.407, -5.522, 1.849, 308.000, 1…
## $ gross_profit <dbl> 335.100, 479.400, 14.946, 454.615, 27.743, 4032.0…
## $ current_liabilities <dbl> 333.300, 288.700, 16.483, 198.375, 39.835, 8780.0…
## $ retained_earnings <dbl> 701.200, -918.400, -21.447, -95.949, -33.199, -83…
## $ total_revenue <dbl> 1748.300, 1156.600, 34.280, 722.425, 107.310, 221…
## $ total_liabilities <dbl> 588.400, 1040.100, 17.224, 557.038, 52.453, 29033…
## $ operating_expenses <dbl> 1621.800, 1035.200, 37.001, 681.609, 103.437, 208…
# Separate majority and minority classes
alive_df <- bank_latest |> filter(status_label == 1)
bankrupt_df <- bank_latest |> filter(status_label == 0)
# Count bankrupt cases
n_bankrupt <- nrow(bankrupt_df)
# Randomly sample alive firms to match bankrupt count
alive_sampled <- alive_df |>
sample_n(n_bankrupt)
# Combine into balanced dataset
bank_balanced <- bind_rows(bankrupt_df, alive_sampled)
# Check class balance
table(bank_balanced$status_label)
##
## 0 1
## 609 609
set.seed(64) # for reproducibility
# Train test split
train_index <- createDataPartition(bank_balanced$status_label, p = 0.8, list = FALSE)
train_df <- bank_balanced[train_index, ]
test_df <- bank_balanced[-train_index, ]
# Align factor levels for company_name
test_df$company_name <- factor(test_df$company_name, levels = levels(train_df$company_name))
# Fit logistic regression (excluding company_name as an ID)
model_logit <- glm(
status_label ~ . - company_name,
data = train_df,
family = binomial(link = "logit")
)
# See model results
summary(model_logit)
##
## Call:
## glm(formula = status_label ~ . - company_name, family = binomial(link = "logit"),
## data = train_df)
##
## Coefficients: (4 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.213e+01 2.379e+01 -2.611 0.009023 **
## year 3.090e-02 1.184e-02 2.609 0.009076 **
## current_assets 2.220e-03 6.080e-04 3.651 0.000261 ***
## cost_of_goods_sold 1.213e-03 4.337e-04 2.797 0.005156 **
## depreciation -2.803e-04 1.144e-03 -0.245 0.806460
## EBITDA 2.627e-03 9.228e-04 2.847 0.004420 **
## inventory -2.498e-03 1.165e-03 -2.144 0.032016 *
## net_income -1.082e-03 5.128e-04 -2.111 0.034769 *
## total_receivables 6.075e-04 1.185e-03 0.512 0.608303
## market_value 6.877e-04 1.690e-04 4.069 4.73e-05 ***
## net_sales -1.034e-03 4.533e-04 -2.281 0.022540 *
## total_assets -4.040e-05 2.329e-04 -0.173 0.862297
## total_long_term_debt -5.587e-04 2.380e-04 -2.348 0.018871 *
## EBIT NA NA NA NA
## gross_profit NA NA NA NA
## current_liabilities -2.189e-03 4.518e-04 -4.845 1.27e-06 ***
## retained_earnings 1.916e-04 9.519e-05 2.013 0.044130 *
## total_revenue NA NA NA NA
## total_liabilities -1.410e-04 2.576e-04 -0.547 0.584041
## operating_expenses NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1353.0 on 975 degrees of freedom
## Residual deviance: 1182.9 on 960 degrees of freedom
## AIC: 1214.9
##
## Number of Fisher Scoring iterations: 8
# Predict probabilities of being alive
pred_logit_prob <- predict(model_logit, newdata = test_df, type = "response")
# Convert probabilities to class labels using 0.5 threshold
pred_logit_class <- ifelse(pred_logit_prob >= 0.5, 1, 0)
# Inspect first few predictions
head(pred_logit_prob)
## 1 2 3 4 5 6
## 0.6387623 0.4525814 0.4534655 0.4343652 0.3825950 0.5033385
head(pred_logit_class)
## 1 2 3 4 5 6
## 1 0 0 0 0 1
# Select only numeric predictors (exclude company_name)
numeric_vars <- bank_balanced |>
select(-company_name)
# Compute correlations with status_label
cor_values <- cor(numeric_vars)[, "status_label"]
# Sort by absolute correlation strength
cor_sorted <- sort(abs(cor_values), decreasing = TRUE)
cor_sorted
## status_label net_income market_value
## 1.000000000 0.181567962 0.156595456
## retained_earnings EBIT year
## 0.154498604 0.146084226 0.119184839
## EBITDA current_assets gross_profit
## 0.118948815 0.100372160 0.099980597
## total_receivables total_assets net_sales
## 0.098208024 0.075179960 0.069830294
## total_revenue inventory operating_expenses
## 0.069830294 0.063355820 0.058154084
## cost_of_goods_sold total_long_term_debt depreciation
## 0.053518002 0.046461257 0.029330147
## total_liabilities current_liabilities
## 0.029312594 0.004442214
# Predicted ratio viz
prob_df <- data.frame(
prob_alive = pred_logit_prob,
actual_status = factor(test_df$status_label, levels = c(0, 1),
labels = c("Bankrupt", "Alive"))
)
ggplot(prob_df, aes(x = prob_alive, fill = actual_status)) +
geom_histogram(bins = 30, alpha = 0.6, position = "identity") +
labs(
title = "Distribution of Predicted Survival Probabilities",
x = "Predicted Probability of Survival",
y = "Count",
fill = "Actual Status"
) +
theme_minimal()
# ROC
roc_obj <- roc(test_df$status_label, pred_logit_prob)
plot(
roc_obj,
main = "ROC Curve for Logistic Regression Model",
col = "salmon",
lwd = 2
)
auc(roc_obj)
## Area under the curve: 0.7601
# coefficient plot
coef_df <- summary(model_logit)$coefficients |>
as.data.frame() |>
tibble::rownames_to_column("variable") |>
dplyr::filter(variable != "(Intercept)") |>
dplyr::arrange(abs(Estimate))
ggplot(coef_df, aes(x = reorder(variable, Estimate), y = Estimate)) +
geom_col(fill = "salmon") +
coord_flip() +
labs(
title = "Logistic Regression Coefficients",
x = "Predictor",
y = "Coefficient Estimate (Log-Odds)"
) +
theme_minimal()
# Odds ratios plot
odds_df <- coef_df |>
mutate(
odds_ratio = exp(Estimate),
lower_ci = exp(Estimate - 1.96 * `Std. Error`),
upper_ci = exp(Estimate + 1.96 * `Std. Error`)
)
ggplot(odds_df, aes(x = reorder(variable, odds_ratio), y = odds_ratio)) +
geom_point() +
geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2) +
coord_flip() +
geom_hline(yintercept = 1, linetype = "dashed") +
labs(
title = "Odds Ratios with 95% Confidence Intervals",
x = "Predictor",
y = "Odds Ratio"
) +
theme_minimal()
# Top correlation features
cor_df <- data.frame(
variable = names(cor_values),
correlation = cor_values
) |>
dplyr::filter(variable != "status_label") |>
dplyr::mutate(abs_corr = abs(correlation)) |>
dplyr::arrange(desc(abs_corr)) |>
head(10)
ggplot(cor_df, aes(x = reorder(variable, correlation), y = correlation)) +
geom_col(fill = "salmon") +
coord_flip() +
labs(
title = "Top 10 Correlated Variables with Survival Status",
x = "Variable",
y = "Correlation with Status Label"
) +
theme_minimal()
# Confusion matrix
conf_mat <- confusionMatrix(
factor(pred_logit_class, levels = c(0,1)),
factor(test_df$status_label, levels = c(0,1))
)
conf_df <- as.data.frame(conf_mat$table)
ggplot(conf_df, aes(Prediction, Reference, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), color = "white", size = 5) +
labs(
title = "Confusion Matrix",
x = "Predicted Class",
y = "Actual Class"
) +
scale_fill_gradient(low = "lightblue", high = "salmon") +
theme_minimal()