Introduction

The following RMD contains the appendix code for CUNY SPS DATA 621 Fall 2025 final project.

Load Libraries

library(tidyverse)
library(ggplot2)
library(dplyr)
library(caret)
library(pROC)

Data Prep

Load data

# Download provided data set
bank_raw <- read_csv("https://raw.githubusercontent.com/evanskaylie/DATA621/refs/heads/main/american_bankruptcy.csv")

# Save the data 
bank_data <- bank_raw

# Rename columns
colnames(bank_data) <- c("company_name",
                         "status_label",
                         "year",
                         "current_assets",
                         "cost_of_goods_sold",
                         "depreciation",
                         "EBITDA",
                         "inventory",
                         "net_income",
                         "total_receivables",
                         "market_value",
                         "net_sales",
                         "total_assets",
                         "total_long_term_debt",
                         "EBIT",
                         "gross_profit",
                         "current_liabilities",
                         "retained_earnings",
                         "total_revenue",
                         "total_liabilities",
                         "operating_expenses"
                         )


# Take a look
head(bank_data)

## # A tibble: 6 × 21
##   company_name status_label  year current_assets cost_of_goods_sold depreciation
##   <chr>        <chr>        <dbl>          <dbl>              <dbl>        <dbl>
## 1 C_1          alive         1999           511.               833.         18.4
## 2 C_1          alive         2000           486.               714.         18.6
## 3 C_1          alive         2001           437.               526.         22.5
## 4 C_1          alive         2002           396.               497.         27.2
## 5 C_1          alive         2003           432.               523.         26.7
## 6 C_1          alive         2004           475.               598.         28.0
## # ℹ 15 more variables: EBITDA <dbl>, inventory <dbl>, net_income <dbl>,
## #   total_receivables <dbl>, market_value <dbl>, net_sales <dbl>,
## #   total_assets <dbl>, total_long_term_debt <dbl>, EBIT <dbl>,
## #   gross_profit <dbl>, current_liabilities <dbl>, retained_earnings <dbl>,
## #   total_revenue <dbl>, total_liabilities <dbl>, operating_expenses <dbl>

Crop messy data

# Crop
bank_latest <- bank_data |> 
  group_by(company_name) |> 
  filter(year == max(year)) |> 
  ungroup()

# Status label
bank_latest$status_label <- ifelse(bank_latest$status_label == "alive", 1, 0)

# See changes
head(bank_latest)

## # A tibble: 6 × 21
##   company_name status_label  year current_assets cost_of_goods_sold depreciation
##   <chr>               <dbl> <dbl>          <dbl>              <dbl>        <dbl>
## 1 C_1                     1  2017          943.              1413.         40.5 
## 2 C_2                     1  2010         1108.               677.         61.5 
## 3 C_3                     1  2008           12.7               19.3         1.69
## 4 C_4                     1  2007          582.               268.         46.3 
## 5 C_5                     1  1999           29.0               79.6         2.02
## 6 C_6                     0  2010         6838              18138         995   
## # ℹ 15 more variables: EBITDA <dbl>, inventory <dbl>, net_income <dbl>,
## #   total_receivables <dbl>, market_value <dbl>, net_sales <dbl>,
## #   total_assets <dbl>, total_long_term_debt <dbl>, EBIT <dbl>,
## #   gross_profit <dbl>, current_liabilities <dbl>, retained_earnings <dbl>,
## #   total_revenue <dbl>, total_liabilities <dbl>, operating_expenses <dbl>

glimpse(bank_latest)

## Rows: 8,971
## Columns: 21
## $ company_name         <chr> "C_1", "C_2", "C_3", "C_4", "C_5", "C_6", "C_7", …
## $ status_label         <dbl> 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1…
## $ year                 <dbl> 2017, 2010, 2008, 2007, 1999, 2010, 2018, 2006, 2…
## $ current_assets       <dbl> 942.700, 1107.700, 12.686, 581.502, 28.957, 6838.…
## $ cost_of_goods_sold   <dbl> 1413.200, 677.200, 19.334, 267.810, 79.567, 18138…
## $ depreciation         <dbl> 40.500, 61.500, 1.686, 46.338, 2.024, 995.000, 13…
## $ EBITDA               <dbl> 126.500, 121.400, -2.721, 40.816, 3.873, 1303.000…
## $ inventory            <dbl> 547.900, 106.400, 6.950, 125.963, 10.947, 594.000…
## $ net_income           <dbl> 15.600, 62.000, -11.049, -13.581, -0.138, -471.00…
## $ total_receivables    <dbl> 203.000, 252.500, 4.282, 130.246, 15.890, 738.000…
## $ market_value         <dbl> 1551.4580, 1231.5240, 5.0201, 882.4491, 6.1972, 2…
## $ net_sales            <dbl> 1748.300, 1156.600, 34.280, 722.425, 107.310, 221…
## $ total_assets         <dbl> 1524.700, 1474.500, 21.401, 1288.165, 42.210, 250…
## $ total_long_term_debt <dbl> 177.200, 650.800, 0.023, 300.000, 0.591, 9253.000…
## $ EBIT                 <dbl> 86.000, 59.900, -4.407, -5.522, 1.849, 308.000, 1…
## $ gross_profit         <dbl> 335.100, 479.400, 14.946, 454.615, 27.743, 4032.0…
## $ current_liabilities  <dbl> 333.300, 288.700, 16.483, 198.375, 39.835, 8780.0…
## $ retained_earnings    <dbl> 701.200, -918.400, -21.447, -95.949, -33.199, -83…
## $ total_revenue        <dbl> 1748.300, 1156.600, 34.280, 722.425, 107.310, 221…
## $ total_liabilities    <dbl> 588.400, 1040.100, 17.224, 557.038, 52.453, 29033…
## $ operating_expenses   <dbl> 1621.800, 1035.200, 37.001, 681.609, 103.437, 208…

Balance data

# Separate majority and minority classes
alive_df     <- bank_latest |> filter(status_label == 1)
bankrupt_df  <- bank_latest |> filter(status_label == 0)

# Count bankrupt cases
n_bankrupt <- nrow(bankrupt_df)

# Randomly sample alive firms to match bankrupt count
alive_sampled <- alive_df |> 
  sample_n(n_bankrupt)

# Combine into balanced dataset
bank_balanced <- bind_rows(bankrupt_df, alive_sampled)

# Check class balance
table(bank_balanced$status_label)

## 
##   0   1 
## 609 609

Train test split

set.seed(64)  # for reproducibility

# Train test split
train_index <- createDataPartition(bank_balanced$status_label, p = 0.8, list = FALSE)

train_df <- bank_balanced[train_index, ]
test_df  <- bank_balanced[-train_index, ]

# Align factor levels for company_name
test_df$company_name <- factor(test_df$company_name, levels = levels(train_df$company_name))

Logistic Regression

Fit model

# Fit logistic regression (excluding company_name as an ID)
model_logit <- glm(
  status_label ~ . - company_name,
  data = train_df,
  family = binomial(link = "logit")
)

# See model results
summary(model_logit)

## 
## Call:
## glm(formula = status_label ~ . - company_name, family = binomial(link = "logit"), 
##     data = train_df)
## 
## Coefficients: (4 not defined because of singularities)
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -6.213e+01  2.379e+01  -2.611 0.009023 ** 
## year                  3.090e-02  1.184e-02   2.609 0.009076 ** 
## current_assets        2.220e-03  6.080e-04   3.651 0.000261 ***
## cost_of_goods_sold    1.213e-03  4.337e-04   2.797 0.005156 ** 
## depreciation         -2.803e-04  1.144e-03  -0.245 0.806460    
## EBITDA                2.627e-03  9.228e-04   2.847 0.004420 ** 
## inventory            -2.498e-03  1.165e-03  -2.144 0.032016 *  
## net_income           -1.082e-03  5.128e-04  -2.111 0.034769 *  
## total_receivables     6.075e-04  1.185e-03   0.512 0.608303    
## market_value          6.877e-04  1.690e-04   4.069 4.73e-05 ***
## net_sales            -1.034e-03  4.533e-04  -2.281 0.022540 *  
## total_assets         -4.040e-05  2.329e-04  -0.173 0.862297    
## total_long_term_debt -5.587e-04  2.380e-04  -2.348 0.018871 *  
## EBIT                         NA         NA      NA       NA    
## gross_profit                 NA         NA      NA       NA    
## current_liabilities  -2.189e-03  4.518e-04  -4.845 1.27e-06 ***
## retained_earnings     1.916e-04  9.519e-05   2.013 0.044130 *  
## total_revenue                NA         NA      NA       NA    
## total_liabilities    -1.410e-04  2.576e-04  -0.547 0.584041    
## operating_expenses           NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1353.0  on 975  degrees of freedom
## Residual deviance: 1182.9  on 960  degrees of freedom
## AIC: 1214.9
## 
## Number of Fisher Scoring iterations: 8

Predict on model

# Predict probabilities of being alive
pred_logit_prob <- predict(model_logit, newdata = test_df, type = "response")

# Convert probabilities to class labels using 0.5 threshold
pred_logit_class <- ifelse(pred_logit_prob >= 0.5, 1, 0)

# Inspect first few predictions
head(pred_logit_prob)

##         1         2         3         4         5         6 
## 0.6387623 0.4525814 0.4534655 0.4343652 0.3825950 0.5033385

head(pred_logit_class)

## 1 2 3 4 5 6 
## 1 0 0 0 0 1

Correlation strength

# Select only numeric predictors (exclude company_name)
numeric_vars <- bank_balanced |> 
  select(-company_name)

# Compute correlations with status_label
cor_values <- cor(numeric_vars)[, "status_label"]

# Sort by absolute correlation strength
cor_sorted <- sort(abs(cor_values), decreasing = TRUE)

cor_sorted

##         status_label           net_income         market_value 
##          1.000000000          0.181567962          0.156595456 
##    retained_earnings                 EBIT                 year 
##          0.154498604          0.146084226          0.119184839 
##               EBITDA       current_assets         gross_profit 
##          0.118948815          0.100372160          0.099980597 
##    total_receivables         total_assets            net_sales 
##          0.098208024          0.075179960          0.069830294 
##        total_revenue            inventory   operating_expenses 
##          0.069830294          0.063355820          0.058154084 
##   cost_of_goods_sold total_long_term_debt         depreciation 
##          0.053518002          0.046461257          0.029330147 
##    total_liabilities  current_liabilities 
##          0.029312594          0.004442214

Potential visulizations for report

# Predicted ratio viz
prob_df <- data.frame(
  prob_alive = pred_logit_prob,
  actual_status = factor(test_df$status_label, levels = c(0, 1),
                         labels = c("Bankrupt", "Alive"))
)

ggplot(prob_df, aes(x = prob_alive, fill = actual_status)) +
  geom_histogram(bins = 30, alpha = 0.6, position = "identity") +
  labs(
    title = "Distribution of Predicted Survival Probabilities",
    x = "Predicted Probability of Survival",
    y = "Count",
    fill = "Actual Status"
  ) +
  theme_minimal()

# ROC
roc_obj <- roc(test_df$status_label, pred_logit_prob)

plot(
  roc_obj,
  main = "ROC Curve for Logistic Regression Model",
  col = "salmon",
  lwd = 2
)

auc(roc_obj)

## Area under the curve: 0.7601

# coefficient plot
coef_df <- summary(model_logit)$coefficients |>
  as.data.frame() |>
  tibble::rownames_to_column("variable") |>
  dplyr::filter(variable != "(Intercept)") |>
  dplyr::arrange(abs(Estimate)) 

ggplot(coef_df, aes(x = reorder(variable, Estimate), y = Estimate)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  labs(
    title = "Logistic Regression Coefficients",
    x = "Predictor",
    y = "Coefficient Estimate (Log-Odds)"
  ) +
  theme_minimal()

# Odds ratios plot
odds_df <- coef_df |>
  mutate(
    odds_ratio = exp(Estimate),
    lower_ci = exp(Estimate - 1.96 * `Std. Error`),
    upper_ci = exp(Estimate + 1.96 * `Std. Error`)
  )

ggplot(odds_df, aes(x = reorder(variable, odds_ratio), y = odds_ratio)) +
  geom_point() +
  geom_errorbar(aes(ymin = lower_ci, ymax = upper_ci), width = 0.2) +
  coord_flip() +
  geom_hline(yintercept = 1, linetype = "dashed") +
  labs(
    title = "Odds Ratios with 95% Confidence Intervals",
    x = "Predictor",
    y = "Odds Ratio"
  ) +
  theme_minimal()

# Top correlation features
cor_df <- data.frame(
  variable = names(cor_values),
  correlation = cor_values
) |>
  dplyr::filter(variable != "status_label") |>
  dplyr::mutate(abs_corr = abs(correlation)) |>
  dplyr::arrange(desc(abs_corr)) |>
  head(10)

ggplot(cor_df, aes(x = reorder(variable, correlation), y = correlation)) +
  geom_col(fill = "salmon") +
  coord_flip() +
  labs(
    title = "Top 10 Correlated Variables with Survival Status",
    x = "Variable",
    y = "Correlation with Status Label"
  ) +
  theme_minimal()

# Confusion matrix
conf_mat <- confusionMatrix(
  factor(pred_logit_class, levels = c(0,1)),
  factor(test_df$status_label, levels = c(0,1))
)

conf_df <- as.data.frame(conf_mat$table)

ggplot(conf_df, aes(Prediction, Reference, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = Freq), color = "white", size = 5) +
  labs(
    title = "Confusion Matrix",
    x = "Predicted Class",
    y = "Actual Class"
  ) +
  scale_fill_gradient(low = "lightblue", high = "salmon") +
  theme_minimal()

FinalProject

Kaylie Evans

2025-12-12

Introduction

Load Libraries

Data Prep

Load data

Crop messy data

Balance data

Train test split

Logistic Regression

Fit model

Predict on model

Correlation strength

Potential visulizations for report