Research question: How is trade credit insurance associated with liquidity and payment default risk among small and medium-sized enterprises in Slovenia?

#Loading packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(broom)
library(knitr)
library(writexl)
## Warning: package 'writexl' was built under R version 4.3.3
#Importing dataset
mydata <- read_excel("Master_Data_Final.xlsx")

#Removing variable "Number of employees" and "N_days" because I don't need them in analysis
mydata <- mydata %>%
  select(-N_Employees)

mydata <- mydata %>%
  select(-N_Days)

head(mydata)
## # A tibble: 6 × 10
##   Company_ID Firm_Size NACE_Aggregated          Insured Cash_Ratio Current_Ratio
##   <chr>      <chr>     <chr>                      <dbl>      <dbl>         <dbl>
## 1 Company_1  Micro     Construction                   1    1.99            3.41 
## 2 Company_2  Small     Capital goods / transpo…       0    0.0157          0.845
## 3 Company_3  Micro     Construction                   1    1.13            1.56 
## 4 Company_4  Small     Capital goods / transpo…       1    0.552           1.61 
## 5 Company_5  Small     Process / consumer manu…       1    0.771           1.96 
## 6 Company_6  Small     Capital goods / transpo…       0    0.00201         1.56 
## # ℹ 4 more variables: Quick_Ratio <dbl>, `NWC/TA` <dbl>, DSO <dbl>,
## #   Accounts_Blocked <dbl>

Explanation of data set: - unit of observation: a firm (SME) - number of observations: 395 - number of variables: 10 - source: AJPES

# Rename NWC/TA to avoid problems with slash in R 
if ("NWC/TA" %in% names(mydata)) {
  mydata <- mydata %>%
    rename(NWC_TA = `NWC/TA`)
}

# Factoring variables
mydata$Insured <- factor(mydata$Insured,
  levels = c(1, 0),
  labels = c("Insured", "Non-insured")
)
mydata$Accounts_Blocked <- factor(mydata$Accounts_Blocked,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
mydata$Firm_Size <- factor(mydata$Firm_Size,
  levels = c("Micro", "Small", "Medium")
)
mydata$NACE_Aggregated <- trimws(mydata$NACE_Aggregated)

mydata$NACE_Aggregated <- factor(mydata$NACE_Aggregated,
  levels = c(
    "Capital goods / transport-related manufacturing",
    "Construction",
    "Commercial & services",
    "Process / consumer manufacturing & agro"
  )
)
# Create a numeric blocked-account variable for logistic regression
mydata$Accounts_Blocked_Num <- ifelse(mydata$Accounts_Blocked == "Yes", 1, 0)

str(mydata)
## tibble [395 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Company_ID          : chr [1:395] "Company_1" "Company_2" "Company_3" "Company_4" ...
##  $ Firm_Size           : Factor w/ 3 levels "Micro","Small",..: 1 2 1 2 2 2 1 2 1 3 ...
##  $ NACE_Aggregated     : Factor w/ 4 levels "Capital goods / transport-related manufacturing",..: 2 1 2 1 4 1 3 2 3 1 ...
##  $ Insured             : Factor w/ 2 levels "Insured","Non-insured": 1 2 1 1 1 2 2 1 2 1 ...
##  $ Cash_Ratio          : num [1:395] 1.988 0.0157 1.1258 0.5519 0.7714 ...
##  $ Current_Ratio       : num [1:395] 3.407 0.845 1.557 1.606 1.959 ...
##  $ Quick_Ratio         : num [1:395] 3.36 0.466 1.505 1.601 1.794 ...
##  $ NWC_TA              : num [1:395] 0.6159 -0.0612 0.3535 0.2822 0.0683 ...
##  $ DSO                 : num [1:395] 60.87 66.8 7.04 56.57 9.94 ...
##  $ Accounts_Blocked    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 2 1 ...
##  $ Accounts_Blocked_Num: num [1:395] 0 0 0 0 0 0 0 1 1 0 ...
summary(mydata)
##   Company_ID         Firm_Size  
##  Length:395         Micro :122  
##  Class :character   Small :151  
##  Mode  :character   Medium:122  
##                                 
##                                 
##                                 
##                                         NACE_Aggregated        Insured   
##  Capital goods / transport-related manufacturing:141    Insured    :238  
##  Construction                                   : 66    Non-insured:157  
##  Commercial & services                          : 75                     
##  Process / consumer manufacturing & agro        :113                     
##                                                                          
##                                                                          
##    Cash_Ratio      Current_Ratio      Quick_Ratio          NWC_TA        
##  Min.   :0.00000   Min.   : 0.2167   Min.   : 0.1046   Min.   :-0.77667  
##  1st Qu.:0.04125   1st Qu.: 1.2533   1st Qu.: 0.7762   1st Qu.: 0.09617  
##  Median :0.19210   Median : 1.7907   Median : 1.2600   Median : 0.24025  
##  Mean   :0.49119   Mean   : 2.4675   Mean   : 1.7496   Mean   : 0.24336  
##  3rd Qu.:0.54763   3rd Qu.: 2.8313   3rd Qu.: 2.0242   3rd Qu.: 0.41994  
##  Max.   :4.81808   Max.   :14.7800   Max.   :11.0700   Max.   : 0.88180  
##       DSO          Accounts_Blocked Accounts_Blocked_Num
##  Min.   :  5.091   No :345          Min.   :0.0000      
##  1st Qu.: 44.563   Yes: 50          1st Qu.:0.0000      
##  Median : 62.735                    Median :0.0000      
##  Mean   : 74.219                    Mean   :0.1266      
##  3rd Qu.: 85.061                    3rd Qu.:0.0000      
##  Max.   :578.932                    Max.   :1.0000
# Descriptive statistics by insurance status

#Cash ratio
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(Cash_Ratio, na.rm = TRUE), Median = median(Cash_Ratio, na.rm = TRUE), SD = sd(Cash_Ratio, na.rm = TRUE), Min = min(Cash_Ratio, na.rm = TRUE), Max = max(Cash_Ratio, na.rm = TRUE) )
## # A tibble: 2 × 7
##   Insured         N  Mean Median    SD   Min   Max
##   <fct>       <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Insured       238 0.510  0.198 0.812     0  4.81
## 2 Non-insured   157 0.463  0.179 0.769     0  4.82
# Current ratio
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(Current_Ratio, na.rm = TRUE), Median = median(Current_Ratio, na.rm = TRUE), SD = sd(Current_Ratio, na.rm = TRUE), Min = min(Current_Ratio, na.rm = TRUE), Max = max(Current_Ratio, na.rm = TRUE) )
## # A tibble: 2 × 7
##   Insured         N  Mean Median    SD   Min   Max
##   <fct>       <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Insured       238  2.64   1.97  2.11 0.217  14.8
## 2 Non-insured   157  2.20   1.56  2.00 0.217  11.3
# Quick ratio
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(Quick_Ratio, na.rm = TRUE), Median = median(Quick_Ratio, na.rm = TRUE), SD = sd(Quick_Ratio, na.rm = TRUE), Min = min(Quick_Ratio, na.rm = TRUE), Max = max(Quick_Ratio, na.rm = TRUE) )
## # A tibble: 2 × 7
##   Insured         N  Mean Median    SD   Min   Max
##   <fct>       <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Insured       238  1.80   1.31  1.65 0.113 11.1 
## 2 Non-insured   157  1.68   1.18  1.61 0.105  9.29
# NWC/TA
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(NWC_TA, na.rm = TRUE), Median = median(NWC_TA, na.rm = TRUE), SD = sd(NWC_TA, na.rm = TRUE), Min = min(NWC_TA, na.rm = TRUE), Max = max(NWC_TA, na.rm = TRUE) )
## # A tibble: 2 × 7
##   Insured         N  Mean Median    SD    Min   Max
##   <fct>       <int> <dbl>  <dbl> <dbl>  <dbl> <dbl>
## 1 Insured       238 0.290  0.282 0.233 -0.478 0.882
## 2 Non-insured   157 0.173  0.180 0.294 -0.777 0.833
# DSO
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(DSO, na.rm = TRUE), Median = median(DSO, na.rm = TRUE), SD = sd(DSO, na.rm = TRUE), Min = min(DSO, na.rm = TRUE), Max = max(DSO, na.rm = TRUE) )
## # A tibble: 2 × 7
##   Insured         N  Mean Median    SD   Min   Max
##   <fct>       <int> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Insured       238  67.1   57.7  52.6  6.33  530.
## 2 Non-insured   157  85.0   69.1  77.2  5.09  579.
# Blocked-account incidence by insurance status
table(mydata$Insured, mydata$Accounts_Blocked) 
##              
##                No Yes
##   Insured     224  14
##   Non-insured 121  36
prop.table(table(mydata$Insured, mydata$Accounts_Blocked), margin = 1) * 100
##              
##                      No       Yes
##   Insured     94.117647  5.882353
##   Non-insured 77.070064 22.929936
# Distribution of main explanatory variable
table(mydata$Insured) 
## 
##     Insured Non-insured 
##         238         157
prop.table(table(mydata$Insured)) * 100
## 
##     Insured Non-insured 
##    60.25316    39.74684
# Distribution of control variable: firm size
table(mydata$Firm_Size) 
## 
##  Micro  Small Medium 
##    122    151    122
prop.table(table(mydata$Firm_Size)) * 100
## 
##    Micro    Small   Medium 
## 30.88608 38.22785 30.88608
# Distribution of control variable: industry sector
table(mydata$NACE_Aggregated) 
## 
## Capital goods / transport-related manufacturing 
##                                             141 
##                                    Construction 
##                                              66 
##                           Commercial & services 
##                                              75 
##         Process / consumer manufacturing & agro 
##                                             113
prop.table(table(mydata$NACE_Aggregated)) * 100
## 
## Capital goods / transport-related manufacturing 
##                                        35.69620 
##                                    Construction 
##                                        16.70886 
##                           Commercial & services 
##                                        18.98734 
##         Process / consumer manufacturing & agro 
##                                        28.60759
# Welch t-tests for numerical dependent variables

t_cash <- t.test(Cash_Ratio ~ Insured, data = mydata)
t_current <- t.test(Current_Ratio ~ Insured, data = mydata)
t_quick <- t.test(Quick_Ratio ~ Insured, data = mydata)
t_nwc <- t.test(NWC_TA ~ Insured, data = mydata)
t_dso <- t.test(DSO ~ Insured, data = mydata)

# Show individual outputs
t_cash
## 
##  Welch Two Sample t-test
## 
## data:  Cash_Ratio by Insured
## t = 0.58659, df = 346.64, p-value = 0.5579
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
##  -0.1116033  0.2064634
## sample estimates:
##     mean in group Insured mean in group Non-insured 
##                 0.5100473                 0.4626173
t_current
## 
##  Welch Two Sample t-test
## 
## data:  Current_Ratio by Insured
## t = 2.1006, df = 346.35, p-value = 0.0364
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
##  0.02810182 0.85461302
## sample estimates:
##     mean in group Insured mean in group Non-insured 
##                  2.642948                  2.201591
t_quick
## 
##  Welch Two Sample t-test
## 
## data:  Quick_Ratio by Insured
## t = 0.70032, df = 339.89, p-value = 0.4842
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
##  -0.2117328  0.4458616
## sample estimates:
##     mean in group Insured mean in group Non-insured 
##                  1.796163                  1.679098
t_nwc
## 
##  Welch Two Sample t-test
## 
## data:  NWC_TA by Insured
## t = 4.1917, df = 280.12, p-value = 3.716e-05
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
##  0.06197289 0.17171330
## sample estimates:
##     mean in group Insured mean in group Non-insured 
##                 0.2898005                 0.1729574
t_dso
## 
##  Welch Two Sample t-test
## 
## data:  DSO by Insured
## t = -2.5414, df = 250.89, p-value = 0.01165
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
##  -31.764319  -4.027193
## sample estimates:
##     mean in group Insured mean in group Non-insured 
##                  67.10628                  85.00203
# Chi-square test: Insurance status vs blocked-account incidence

# Create clean variables for this test only
chi_data <- mydata %>%
  mutate(
    Insurance_status = case_when(
      as.character(Insured) == "1" ~ "Insured",
      as.character(Insured) == "0" ~ "Non-insured",
      as.character(Insured) == "Insured" ~ "Insured",
      as.character(Insured) == "Non-insured" ~ "Non-insured",
      TRUE ~ NA_character_
    ),
    Blocked_account = case_when(
      as.character(Accounts_Blocked) == "1" ~ "Yes",
      as.character(Accounts_Blocked) == "0" ~ "No",
      as.character(Accounts_Blocked) == "Yes" ~ "Yes",
      as.character(Accounts_Blocked) == "No" ~ "No",
      TRUE ~ NA_character_
    )
  ) %>%
  mutate(
    Insurance_status = factor(
      Insurance_status,
      levels = c("Insured", "Non-insured")
    ),
    Blocked_account = factor(
      Blocked_account,
      levels = c("No", "Yes")
    )
  )

# Observed frequencies
blocked_table <- table(chi_data$Insurance_status, chi_data$Blocked_account)

blocked_table
##              
##                No Yes
##   Insured     224  14
##   Non-insured 121  36
# Pearson chi-square test
chi_test <- chisq.test(blocked_table, correct = FALSE)

chi_test
## 
##  Pearson's Chi-squared test
## 
## data:  blocked_table
## X-squared = 24.866, df = 1, p-value = 6.145e-07
# Expected frequencies
chi_test$expected
##              
##                     No      Yes
##   Insured     207.8734 30.12658
##   Non-insured 137.1266 19.87342
# Standardized residuals
chi_test$stdres
##              
##                      No       Yes
##   Insured      4.986607 -4.986607
##   Non-insured -4.986607  4.986607
# Odds ratio: insured relative to non-insured
odds_ratio <- (blocked_table["Insured", "Yes"] / blocked_table["Insured", "No"]) /
  (blocked_table["Non-insured", "Yes"] / blocked_table["Non-insured", "No"])

odds_ratio
## [1] 0.2100694
# Regression models with two control variables
# Reference category for insurance status: Non-insured

# Set reference categories
mydata$Insured <- factor(
  mydata$Insured,
  levels = c("Non-insured", "Insured")
)

mydata$Firm_Size <- factor(
  mydata$Firm_Size,
  levels = c("Micro", "Small", "Medium")
)

mydata$NACE_Aggregated <- factor(
  mydata$NACE_Aggregated,
  levels = c(
    "Capital goods / transport-related manufacturing",
    "Construction",
    "Commercial & services",
    "Process / consumer manufacturing & agro"
  )
)
# OLS regression models for numerical dependent variables
model_cash <- lm(Cash_Ratio ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)

model_current <- lm(Current_Ratio ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)

model_quick <- lm(Quick_Ratio ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)

model_nwc <- lm(NWC_TA ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)

model_dso <- lm(DSO ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)

# Regression output for liquidity indicators
summary(model_cash)
## 
## Call:
## lm(formula = Cash_Ratio ~ Insured + Firm_Size + NACE_Aggregated, 
##     data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8930 -0.4143 -0.2210  0.0679  4.0046 
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                             0.57811    0.09454
## InsuredInsured                                          0.09953    0.08417
## Firm_SizeSmall                                         -0.17688    0.09757
## Firm_SizeMedium                                        -0.33912    0.10424
## NACE_AggregatedConstruction                            -0.08649    0.11625
## NACE_AggregatedCommercial & services                    0.23535    0.11354
## NACE_AggregatedProcess / consumer manufacturing & agro -0.01663    0.10242
##                                                        t value Pr(>|t|)    
## (Intercept)                                              6.115 2.36e-09 ***
## InsuredInsured                                           1.183  0.23770    
## Firm_SizeSmall                                          -1.813  0.07062 .  
## Firm_SizeMedium                                         -3.253  0.00124 ** 
## NACE_AggregatedConstruction                             -0.744  0.45733    
## NACE_AggregatedCommercial & services                     2.073  0.03885 *  
## NACE_AggregatedProcess / consumer manufacturing & agro  -0.162  0.87110    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7789 on 388 degrees of freedom
## Multiple R-squared:  0.05398,    Adjusted R-squared:  0.03935 
## F-statistic:  3.69 on 6 and 388 DF,  p-value: 0.001407
summary(model_current)
## 
## Call:
## lm(formula = Current_Ratio ~ Insured + Firm_Size + NACE_Aggregated, 
##     data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8299 -1.1399 -0.4652  0.4653 12.2582 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                             2.361738   0.245995
## InsuredInsured                                          0.520555   0.218993
## Firm_SizeSmall                                         -0.504605   0.253867
## Firm_SizeMedium                                        -0.701590   0.271233
## NACE_AggregatedConstruction                             0.001287   0.302469
## NACE_AggregatedCommercial & services                    0.844179   0.295423
## NACE_AggregatedProcess / consumer manufacturing & agro  0.144105   0.266496
##                                                        t value Pr(>|t|)    
## (Intercept)                                              9.601   <2e-16 ***
## InsuredInsured                                           2.377   0.0179 *  
## Firm_SizeSmall                                          -1.988   0.0476 *  
## Firm_SizeMedium                                         -2.587   0.0101 *  
## NACE_AggregatedConstruction                              0.004   0.9966    
## NACE_AggregatedCommercial & services                     2.858   0.0045 ** 
## NACE_AggregatedProcess / consumer manufacturing & agro   0.541   0.5890    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.027 on 388 degrees of freedom
## Multiple R-squared:  0.06054,    Adjusted R-squared:  0.04601 
## F-statistic: 4.167 on 6 and 388 DF,  p-value: 0.0004495
summary(model_quick)
## 
## Call:
## lm(formula = Quick_Ratio ~ Insured + Firm_Size + NACE_Aggregated, 
##     data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0914 -0.9495 -0.4249  0.3409  9.3516 
## 
## Coefficients:
##                                                         Estimate Std. Error
## (Intercept)                                             1.917846   0.193667
## InsuredInsured                                          0.236503   0.172409
## Firm_SizeSmall                                         -0.444743   0.199865
## Firm_SizeMedium                                        -0.779917   0.213536
## NACE_AggregatedConstruction                            -0.005552   0.238128
## NACE_AggregatedCommercial & services                    0.519285   0.232581
## NACE_AggregatedProcess / consumer manufacturing & agro  0.008801   0.209808
##                                                        t value Pr(>|t|)    
## (Intercept)                                              9.903  < 2e-16 ***
## InsuredInsured                                           1.372 0.170933    
## Firm_SizeSmall                                          -2.225 0.026641 *  
## Firm_SizeMedium                                         -3.652 0.000295 ***
## NACE_AggregatedConstruction                             -0.023 0.981410    
## NACE_AggregatedCommercial & services                     2.233 0.026138 *  
## NACE_AggregatedProcess / consumer manufacturing & agro   0.042 0.966561    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.596 on 388 degrees of freedom
## Multiple R-squared:  0.06015,    Adjusted R-squared:  0.04561 
## F-statistic: 4.138 on 6 and 388 DF,  p-value: 0.0004816
summary(model_nwc)
## 
## Call:
## lm(formula = NWC_TA ~ Insured + Firm_Size + NACE_Aggregated, 
##     data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.95986 -0.12895  0.01395  0.16485  0.59836 
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                             0.20566    0.03069
## InsuredInsured                                          0.14082    0.02732
## Firm_SizeSmall                                         -0.06589    0.03167
## Firm_SizeMedium                                        -0.11114    0.03383
## NACE_AggregatedConstruction                             0.02516    0.03773
## NACE_AggregatedCommercial & services                    0.07745    0.03685
## NACE_AggregatedProcess / consumer manufacturing & agro -0.02287    0.03324
##                                                        t value Pr(>|t|)    
## (Intercept)                                              6.702 7.26e-11 ***
## InsuredInsured                                           5.155 4.05e-07 ***
## Firm_SizeSmall                                          -2.081  0.03812 *  
## Firm_SizeMedium                                         -3.285  0.00111 ** 
## NACE_AggregatedConstruction                              0.667  0.50524    
## NACE_AggregatedCommercial & services                     2.102  0.03622 *  
## NACE_AggregatedProcess / consumer manufacturing & agro  -0.688  0.49182    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2528 on 388 degrees of freedom
## Multiple R-squared:  0.1007, Adjusted R-squared:  0.08675 
## F-statistic: 7.238 on 6 and 388 DF,  p-value: 2.445e-07
summary(model_dso)
## 
## Call:
## lm(formula = DSO ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -96.72 -27.92  -9.83  14.15 476.81 
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                             105.067      7.548
## InsuredInsured                                          -15.372      6.720
## Firm_SizeSmall                                          -31.835      7.790
## Firm_SizeMedium                                         -28.263      8.323
## NACE_AggregatedConstruction                             -12.290      9.281
## NACE_AggregatedCommercial & services                     -2.947      9.065
## NACE_AggregatedProcess / consumer manufacturing & agro    6.736      8.177
##                                                        t value Pr(>|t|)    
## (Intercept)                                             13.919  < 2e-16 ***
## InsuredInsured                                          -2.288 0.022703 *  
## Firm_SizeSmall                                          -4.087 5.32e-05 ***
## Firm_SizeMedium                                         -3.396 0.000755 ***
## NACE_AggregatedConstruction                             -1.324 0.186224    
## NACE_AggregatedCommercial & services                    -0.325 0.745266    
## NACE_AggregatedProcess / consumer manufacturing & agro   0.824 0.410611    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.19 on 388 degrees of freedom
## Multiple R-squared:  0.07169,    Adjusted R-squared:  0.05734 
## F-statistic: 4.994 on 6 and 388 DF,  p-value: 6.05e-05
# Tidy coefficient outputs for easier table preparation
tidy(model_cash)
## # A tibble: 7 × 5
##   term                                      estimate std.error statistic p.value
##   <chr>                                        <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)                                 0.578     0.0945     6.11  2.36e-9
## 2 InsuredInsured                              0.0995    0.0842     1.18  2.38e-1
## 3 Firm_SizeSmall                             -0.177     0.0976    -1.81  7.06e-2
## 4 Firm_SizeMedium                            -0.339     0.104     -3.25  1.24e-3
## 5 NACE_AggregatedConstruction                -0.0865    0.116     -0.744 4.57e-1
## 6 NACE_AggregatedCommercial & services        0.235     0.114      2.07  3.89e-2
## 7 NACE_AggregatedProcess / consumer manufa…  -0.0166    0.102     -0.162 8.71e-1
tidy(model_current)
## # A tibble: 7 × 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                               2.36        0.246   9.60    1.01e-19
## 2 InsuredInsured                            0.521       0.219   2.38    1.79e- 2
## 3 Firm_SizeSmall                           -0.505       0.254  -1.99    4.76e- 2
## 4 Firm_SizeMedium                          -0.702       0.271  -2.59    1.01e- 2
## 5 NACE_AggregatedConstruction               0.00129     0.302   0.00426 9.97e- 1
## 6 NACE_AggregatedCommercial & services      0.844       0.295   2.86    4.50e- 3
## 7 NACE_AggregatedProcess / consumer manuf…  0.144       0.266   0.541   5.89e- 1
tidy(model_quick)
## # A tibble: 7 × 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                               1.92        0.194    9.90   9.23e-21
## 2 InsuredInsured                            0.237       0.172    1.37   1.71e- 1
## 3 Firm_SizeSmall                           -0.445       0.200   -2.23   2.66e- 2
## 4 Firm_SizeMedium                          -0.780       0.214   -3.65   2.95e- 4
## 5 NACE_AggregatedConstruction              -0.00555     0.238   -0.0233 9.81e- 1
## 6 NACE_AggregatedCommercial & services      0.519       0.233    2.23   2.61e- 2
## 7 NACE_AggregatedProcess / consumer manuf…  0.00880     0.210    0.0419 9.67e- 1
tidy(model_nwc)
## # A tibble: 7 × 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                                0.206     0.0307     6.70  7.26e-11
## 2 InsuredInsured                             0.141     0.0273     5.15  4.05e- 7
## 3 Firm_SizeSmall                            -0.0659    0.0317    -2.08  3.81e- 2
## 4 Firm_SizeMedium                           -0.111     0.0338    -3.28  1.11e- 3
## 5 NACE_AggregatedConstruction                0.0252    0.0377     0.667 5.05e- 1
## 6 NACE_AggregatedCommercial & services       0.0775    0.0369     2.10  3.62e- 2
## 7 NACE_AggregatedProcess / consumer manuf…  -0.0229    0.0332    -0.688 4.92e- 1
tidy(model_dso)
## # A tibble: 7 × 5
##   term                                     estimate std.error statistic  p.value
##   <chr>                                       <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                                105.        7.55    13.9   5.24e-36
## 2 InsuredInsured                             -15.4       6.72    -2.29  2.27e- 2
## 3 Firm_SizeSmall                             -31.8       7.79    -4.09  5.32e- 5
## 4 Firm_SizeMedium                            -28.3       8.32    -3.40  7.55e- 4
## 5 NACE_AggregatedConstruction                -12.3       9.28    -1.32  1.86e- 1
## 6 NACE_AggregatedCommercial & services        -2.95      9.07    -0.325 7.45e- 1
## 7 NACE_AggregatedProcess / consumer manuf…     6.74      8.18     0.824 4.11e- 1
# Model fit statistics for OLS models
glance(model_cash)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1    0.0540        0.0393 0.779      3.69 0.00141     6  -458.  933.  964.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_current)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1    0.0605        0.0460  2.03      4.17 0.000450     6  -836. 1688. 1720.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_quick)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic  p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>    <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1    0.0601        0.0456  1.60      4.14 0.000482     6  -741. 1499. 1531.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_nwc)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic     p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>       <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1     0.101        0.0867 0.253      7.24 0.000000245     6  -13.8  43.6  75.4
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_dso)
## # A tibble: 1 × 12
##   r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1    0.0717        0.0573  62.2      4.99 0.0000605     6 -2188. 4393. 4425.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Logistic regression model for blocked-account incidence
model_blocked <- glm(
  Accounts_Blocked_Num ~ Insured + Firm_Size + NACE_Aggregated,
  data = mydata,
  family = binomial
)

summary(model_blocked)
## 
## Call:
## glm(formula = Accounts_Blocked_Num ~ Insured + Firm_Size + NACE_Aggregated, 
##     family = binomial, data = mydata)
## 
## Coefficients:
##                                                        Estimate Std. Error
## (Intercept)                                             -0.6405     0.3115
## InsuredInsured                                          -1.3631     0.3533
## Firm_SizeSmall                                          -1.4729     0.4212
## Firm_SizeMedium                                         -0.7023     0.3973
## NACE_AggregatedConstruction                              0.3659     0.4283
## NACE_AggregatedCommercial & services                    -0.0744     0.4354
## NACE_AggregatedProcess / consumer manufacturing & agro  -0.3982     0.4910
##                                                        z value Pr(>|z|)    
## (Intercept)                                             -2.056 0.039776 *  
## InsuredInsured                                          -3.859 0.000114 ***
## Firm_SizeSmall                                          -3.497 0.000471 ***
## Firm_SizeMedium                                         -1.767 0.077154 .  
## NACE_AggregatedConstruction                              0.854 0.392870    
## NACE_AggregatedCommercial & services                    -0.171 0.864312    
## NACE_AggregatedProcess / consumer manufacturing & agro  -0.811 0.417378    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 300.07  on 394  degrees of freedom
## Residual deviance: 258.62  on 388  degrees of freedom
## AIC: 272.62
## 
## Number of Fisher Scoring iterations: 5
# Odds ratios for logistic regression
exp(coef(model_blocked))
##                                            (Intercept) 
##                                              0.5270421 
##                                         InsuredInsured 
##                                              0.2558557 
##                                         Firm_SizeSmall 
##                                              0.2292632 
##                                        Firm_SizeMedium 
##                                              0.4954485 
##                            NACE_AggregatedConstruction 
##                                              1.4418728 
##                   NACE_AggregatedCommercial & services 
##                                              0.9282988 
## NACE_AggregatedProcess / consumer manufacturing & agro 
##                                              0.6715185
# Odds ratios with 95% confidence intervals
exp(cbind(
  Odds_Ratio = coef(model_blocked),
  confint.default(model_blocked)
))
##                                                        Odds_Ratio     2.5 %
## (Intercept)                                             0.5270421 0.2862147
## InsuredInsured                                          0.2558557 0.1280262
## Firm_SizeSmall                                          0.2292632 0.1004109
## Firm_SizeMedium                                         0.4954485 0.2273918
## NACE_AggregatedConstruction                             1.4418728 0.6228230
## NACE_AggregatedCommercial & services                    0.9282988 0.3954475
## NACE_AggregatedProcess / consumer manufacturing & agro  0.6715185 0.2565038
##                                                           97.5 %
## (Intercept)                                            0.9705069
## InsuredInsured                                         0.5113183
## Firm_SizeSmall                                         0.5234654
## Firm_SizeMedium                                        1.0794988
## NACE_AggregatedConstruction                            3.3380226
## NACE_AggregatedCommercial & services                   2.1791478
## NACE_AggregatedProcess / consumer manufacturing & agro 1.7580132
# McFadden R-squared for logistic regression
mcfadden_r2 <- 1 - (model_blocked$deviance / model_blocked$null.deviance)

mcfadden_r2
## [1] 0.1381284
# Predicted probabilities from the logistic regression model
mydata$predicted_prob_blocked <- predict(model_blocked, type = "response")

# Check first few predicted probabilities
head(mydata$predicted_prob_blocked)
##          1          2          3          4          5          6 
## 0.16278186 0.10780511 0.16278186 0.02998829 0.02033803 0.10780511
# ROC curve and AUC
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc_blocked <- roc(
  response = mydata$Accounts_Blocked_Num,
  predictor = mydata$predicted_prob_blocked,
  levels = c(0, 1),
  direction = "<"
)

# Area under the curve
auc_blocked <- auc(roc_blocked)

# Display AUC rounded to three decimals
round(auc_blocked, 3)
## [1] 0.756
# Plot ROC curve
plot(
  roc_blocked,
  main = "ROC curve for blocked-account incidence model"
)

# Optimal threshold based on Youden index
optimal_coords <- coords(
  roc_blocked,
  x = "best",
  best.method = "youden",
  ret = c("threshold", "sensitivity", "specificity"),
  transpose = FALSE
)
optimal_coords
##   threshold sensitivity specificity
## 1 0.1559851        0.62   0.8144928
# Save optimal threshold
threshold_best <- optimal_coords$threshold

threshold_best
## [1] 0.1559851
# Predicted class using optimal threshold
mydata$predicted_class_best <- ifelse(
  mydata$predicted_prob_blocked >= threshold_best,
  1,
  0
)
# Confusion matrix using optimal threshold
conf_matrix_best <- table(
  Actual = factor(mydata$Accounts_Blocked_Num, levels = c(0, 1)),
  Predicted = factor(mydata$predicted_class_best, levels = c(0, 1))
)

conf_matrix_best
##       Predicted
## Actual   0   1
##      0 281  64
##      1  19  31
# Extract values from confusion matrix
TN <- conf_matrix_best["0", "0"]
FP <- conf_matrix_best["0", "1"]
FN <- conf_matrix_best["1", "0"]
TP <- conf_matrix_best["1", "1"]

# Accuracy, sensitivity and specificity
accuracy_best <- (TP + TN) / sum(conf_matrix_best)
sensitivity_best <- TP / (TP + FN)
specificity_best <- TN / (TN + FP)

# Display rounded results
round(accuracy_best, 3)
## [1] 0.79
round(sensitivity_best, 3)
## [1] 0.62
round(specificity_best, 3)
## [1] 0.814
# Logistic regression model performance table
logit_performance <- data.frame(
  Measure = c(
    "McFadden R-squared",
    "AUC",
    "Optimal threshold",
    "Accuracy",
    "Sensitivity",
    "Specificity"
  ),
  Value = round(c(
    mcfadden_r2,
    as.numeric(auc_blocked),
    threshold_best,
    accuracy_best,
    sensitivity_best,
    specificity_best
  ), 3)
)

logit_performance
##              Measure Value
## 1 McFadden R-squared 0.138
## 2                AUC 0.756
## 3  Optimal threshold 0.156
## 4           Accuracy 0.790
## 5        Sensitivity 0.620
## 6        Specificity 0.814