#Loading packages
library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
library(broom)
library(knitr)
library(writexl)
## Warning: package 'writexl' was built under R version 4.3.3
#Importing dataset
mydata <- read_excel("Master_Data_Final.xlsx")
#Removing variable "Number of employees" and "N_days" because I don't need them in analysis
mydata <- mydata %>%
select(-N_Employees)
mydata <- mydata %>%
select(-N_Days)
head(mydata)
## # A tibble: 6 × 10
## Company_ID Firm_Size NACE_Aggregated Insured Cash_Ratio Current_Ratio
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Company_1 Micro Construction 1 1.99 3.41
## 2 Company_2 Small Capital goods / transpo… 0 0.0157 0.845
## 3 Company_3 Micro Construction 1 1.13 1.56
## 4 Company_4 Small Capital goods / transpo… 1 0.552 1.61
## 5 Company_5 Small Process / consumer manu… 1 0.771 1.96
## 6 Company_6 Small Capital goods / transpo… 0 0.00201 1.56
## # ℹ 4 more variables: Quick_Ratio <dbl>, `NWC/TA` <dbl>, DSO <dbl>,
## # Accounts_Blocked <dbl>
Explanation of data set: - unit of observation: a firm (SME) - number of observations: 395 - number of variables: 10 - source: AJPES
# Rename NWC/TA to avoid problems with slash in R
if ("NWC/TA" %in% names(mydata)) {
mydata <- mydata %>%
rename(NWC_TA = `NWC/TA`)
}
# Factoring variables
mydata$Insured <- factor(mydata$Insured,
levels = c(1, 0),
labels = c("Insured", "Non-insured")
)
mydata$Accounts_Blocked <- factor(mydata$Accounts_Blocked,
levels = c(0, 1),
labels = c("No", "Yes")
)
mydata$Firm_Size <- factor(mydata$Firm_Size,
levels = c("Micro", "Small", "Medium")
)
mydata$NACE_Aggregated <- trimws(mydata$NACE_Aggregated)
mydata$NACE_Aggregated <- factor(mydata$NACE_Aggregated,
levels = c(
"Capital goods / transport-related manufacturing",
"Construction",
"Commercial & services",
"Process / consumer manufacturing & agro"
)
)
# Create a numeric blocked-account variable for logistic regression
mydata$Accounts_Blocked_Num <- ifelse(mydata$Accounts_Blocked == "Yes", 1, 0)
str(mydata)
## tibble [395 × 11] (S3: tbl_df/tbl/data.frame)
## $ Company_ID : chr [1:395] "Company_1" "Company_2" "Company_3" "Company_4" ...
## $ Firm_Size : Factor w/ 3 levels "Micro","Small",..: 1 2 1 2 2 2 1 2 1 3 ...
## $ NACE_Aggregated : Factor w/ 4 levels "Capital goods / transport-related manufacturing",..: 2 1 2 1 4 1 3 2 3 1 ...
## $ Insured : Factor w/ 2 levels "Insured","Non-insured": 1 2 1 1 1 2 2 1 2 1 ...
## $ Cash_Ratio : num [1:395] 1.988 0.0157 1.1258 0.5519 0.7714 ...
## $ Current_Ratio : num [1:395] 3.407 0.845 1.557 1.606 1.959 ...
## $ Quick_Ratio : num [1:395] 3.36 0.466 1.505 1.601 1.794 ...
## $ NWC_TA : num [1:395] 0.6159 -0.0612 0.3535 0.2822 0.0683 ...
## $ DSO : num [1:395] 60.87 66.8 7.04 56.57 9.94 ...
## $ Accounts_Blocked : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 2 1 ...
## $ Accounts_Blocked_Num: num [1:395] 0 0 0 0 0 0 0 1 1 0 ...
summary(mydata)
## Company_ID Firm_Size
## Length:395 Micro :122
## Class :character Small :151
## Mode :character Medium:122
##
##
##
## NACE_Aggregated Insured
## Capital goods / transport-related manufacturing:141 Insured :238
## Construction : 66 Non-insured:157
## Commercial & services : 75
## Process / consumer manufacturing & agro :113
##
##
## Cash_Ratio Current_Ratio Quick_Ratio NWC_TA
## Min. :0.00000 Min. : 0.2167 Min. : 0.1046 Min. :-0.77667
## 1st Qu.:0.04125 1st Qu.: 1.2533 1st Qu.: 0.7762 1st Qu.: 0.09617
## Median :0.19210 Median : 1.7907 Median : 1.2600 Median : 0.24025
## Mean :0.49119 Mean : 2.4675 Mean : 1.7496 Mean : 0.24336
## 3rd Qu.:0.54763 3rd Qu.: 2.8313 3rd Qu.: 2.0242 3rd Qu.: 0.41994
## Max. :4.81808 Max. :14.7800 Max. :11.0700 Max. : 0.88180
## DSO Accounts_Blocked Accounts_Blocked_Num
## Min. : 5.091 No :345 Min. :0.0000
## 1st Qu.: 44.563 Yes: 50 1st Qu.:0.0000
## Median : 62.735 Median :0.0000
## Mean : 74.219 Mean :0.1266
## 3rd Qu.: 85.061 3rd Qu.:0.0000
## Max. :578.932 Max. :1.0000
# Descriptive statistics by insurance status
#Cash ratio
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(Cash_Ratio, na.rm = TRUE), Median = median(Cash_Ratio, na.rm = TRUE), SD = sd(Cash_Ratio, na.rm = TRUE), Min = min(Cash_Ratio, na.rm = TRUE), Max = max(Cash_Ratio, na.rm = TRUE) )
## # A tibble: 2 × 7
## Insured N Mean Median SD Min Max
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Insured 238 0.510 0.198 0.812 0 4.81
## 2 Non-insured 157 0.463 0.179 0.769 0 4.82
# Current ratio
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(Current_Ratio, na.rm = TRUE), Median = median(Current_Ratio, na.rm = TRUE), SD = sd(Current_Ratio, na.rm = TRUE), Min = min(Current_Ratio, na.rm = TRUE), Max = max(Current_Ratio, na.rm = TRUE) )
## # A tibble: 2 × 7
## Insured N Mean Median SD Min Max
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Insured 238 2.64 1.97 2.11 0.217 14.8
## 2 Non-insured 157 2.20 1.56 2.00 0.217 11.3
# Quick ratio
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(Quick_Ratio, na.rm = TRUE), Median = median(Quick_Ratio, na.rm = TRUE), SD = sd(Quick_Ratio, na.rm = TRUE), Min = min(Quick_Ratio, na.rm = TRUE), Max = max(Quick_Ratio, na.rm = TRUE) )
## # A tibble: 2 × 7
## Insured N Mean Median SD Min Max
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Insured 238 1.80 1.31 1.65 0.113 11.1
## 2 Non-insured 157 1.68 1.18 1.61 0.105 9.29
# NWC/TA
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(NWC_TA, na.rm = TRUE), Median = median(NWC_TA, na.rm = TRUE), SD = sd(NWC_TA, na.rm = TRUE), Min = min(NWC_TA, na.rm = TRUE), Max = max(NWC_TA, na.rm = TRUE) )
## # A tibble: 2 × 7
## Insured N Mean Median SD Min Max
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Insured 238 0.290 0.282 0.233 -0.478 0.882
## 2 Non-insured 157 0.173 0.180 0.294 -0.777 0.833
# DSO
mydata %>% group_by(Insured) %>% summarise( N = n(), Mean = mean(DSO, na.rm = TRUE), Median = median(DSO, na.rm = TRUE), SD = sd(DSO, na.rm = TRUE), Min = min(DSO, na.rm = TRUE), Max = max(DSO, na.rm = TRUE) )
## # A tibble: 2 × 7
## Insured N Mean Median SD Min Max
## <fct> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Insured 238 67.1 57.7 52.6 6.33 530.
## 2 Non-insured 157 85.0 69.1 77.2 5.09 579.
# Blocked-account incidence by insurance status
table(mydata$Insured, mydata$Accounts_Blocked)
##
## No Yes
## Insured 224 14
## Non-insured 121 36
prop.table(table(mydata$Insured, mydata$Accounts_Blocked), margin = 1) * 100
##
## No Yes
## Insured 94.117647 5.882353
## Non-insured 77.070064 22.929936
# Distribution of main explanatory variable
table(mydata$Insured)
##
## Insured Non-insured
## 238 157
prop.table(table(mydata$Insured)) * 100
##
## Insured Non-insured
## 60.25316 39.74684
# Distribution of control variable: firm size
table(mydata$Firm_Size)
##
## Micro Small Medium
## 122 151 122
prop.table(table(mydata$Firm_Size)) * 100
##
## Micro Small Medium
## 30.88608 38.22785 30.88608
# Distribution of control variable: industry sector
table(mydata$NACE_Aggregated)
##
## Capital goods / transport-related manufacturing
## 141
## Construction
## 66
## Commercial & services
## 75
## Process / consumer manufacturing & agro
## 113
prop.table(table(mydata$NACE_Aggregated)) * 100
##
## Capital goods / transport-related manufacturing
## 35.69620
## Construction
## 16.70886
## Commercial & services
## 18.98734
## Process / consumer manufacturing & agro
## 28.60759
# Welch t-tests for numerical dependent variables
t_cash <- t.test(Cash_Ratio ~ Insured, data = mydata)
t_current <- t.test(Current_Ratio ~ Insured, data = mydata)
t_quick <- t.test(Quick_Ratio ~ Insured, data = mydata)
t_nwc <- t.test(NWC_TA ~ Insured, data = mydata)
t_dso <- t.test(DSO ~ Insured, data = mydata)
# Show individual outputs
t_cash
##
## Welch Two Sample t-test
##
## data: Cash_Ratio by Insured
## t = 0.58659, df = 346.64, p-value = 0.5579
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
## -0.1116033 0.2064634
## sample estimates:
## mean in group Insured mean in group Non-insured
## 0.5100473 0.4626173
t_current
##
## Welch Two Sample t-test
##
## data: Current_Ratio by Insured
## t = 2.1006, df = 346.35, p-value = 0.0364
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
## 0.02810182 0.85461302
## sample estimates:
## mean in group Insured mean in group Non-insured
## 2.642948 2.201591
t_quick
##
## Welch Two Sample t-test
##
## data: Quick_Ratio by Insured
## t = 0.70032, df = 339.89, p-value = 0.4842
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
## -0.2117328 0.4458616
## sample estimates:
## mean in group Insured mean in group Non-insured
## 1.796163 1.679098
t_nwc
##
## Welch Two Sample t-test
##
## data: NWC_TA by Insured
## t = 4.1917, df = 280.12, p-value = 3.716e-05
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
## 0.06197289 0.17171330
## sample estimates:
## mean in group Insured mean in group Non-insured
## 0.2898005 0.1729574
t_dso
##
## Welch Two Sample t-test
##
## data: DSO by Insured
## t = -2.5414, df = 250.89, p-value = 0.01165
## alternative hypothesis: true difference in means between group Insured and group Non-insured is not equal to 0
## 95 percent confidence interval:
## -31.764319 -4.027193
## sample estimates:
## mean in group Insured mean in group Non-insured
## 67.10628 85.00203
# Chi-square test: Insurance status vs blocked-account incidence
# Create clean variables for this test only
chi_data <- mydata %>%
mutate(
Insurance_status = case_when(
as.character(Insured) == "1" ~ "Insured",
as.character(Insured) == "0" ~ "Non-insured",
as.character(Insured) == "Insured" ~ "Insured",
as.character(Insured) == "Non-insured" ~ "Non-insured",
TRUE ~ NA_character_
),
Blocked_account = case_when(
as.character(Accounts_Blocked) == "1" ~ "Yes",
as.character(Accounts_Blocked) == "0" ~ "No",
as.character(Accounts_Blocked) == "Yes" ~ "Yes",
as.character(Accounts_Blocked) == "No" ~ "No",
TRUE ~ NA_character_
)
) %>%
mutate(
Insurance_status = factor(
Insurance_status,
levels = c("Insured", "Non-insured")
),
Blocked_account = factor(
Blocked_account,
levels = c("No", "Yes")
)
)
# Observed frequencies
blocked_table <- table(chi_data$Insurance_status, chi_data$Blocked_account)
blocked_table
##
## No Yes
## Insured 224 14
## Non-insured 121 36
# Pearson chi-square test
chi_test <- chisq.test(blocked_table, correct = FALSE)
chi_test
##
## Pearson's Chi-squared test
##
## data: blocked_table
## X-squared = 24.866, df = 1, p-value = 6.145e-07
# Expected frequencies
chi_test$expected
##
## No Yes
## Insured 207.8734 30.12658
## Non-insured 137.1266 19.87342
# Standardized residuals
chi_test$stdres
##
## No Yes
## Insured 4.986607 -4.986607
## Non-insured -4.986607 4.986607
# Odds ratio: insured relative to non-insured
odds_ratio <- (blocked_table["Insured", "Yes"] / blocked_table["Insured", "No"]) /
(blocked_table["Non-insured", "Yes"] / blocked_table["Non-insured", "No"])
odds_ratio
## [1] 0.2100694
# Regression models with two control variables
# Reference category for insurance status: Non-insured
# Set reference categories
mydata$Insured <- factor(
mydata$Insured,
levels = c("Non-insured", "Insured")
)
mydata$Firm_Size <- factor(
mydata$Firm_Size,
levels = c("Micro", "Small", "Medium")
)
mydata$NACE_Aggregated <- factor(
mydata$NACE_Aggregated,
levels = c(
"Capital goods / transport-related manufacturing",
"Construction",
"Commercial & services",
"Process / consumer manufacturing & agro"
)
)
# OLS regression models for numerical dependent variables
model_cash <- lm(Cash_Ratio ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)
model_current <- lm(Current_Ratio ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)
model_quick <- lm(Quick_Ratio ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)
model_nwc <- lm(NWC_TA ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)
model_dso <- lm(DSO ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)
# Regression output for liquidity indicators
summary(model_cash)
##
## Call:
## lm(formula = Cash_Ratio ~ Insured + Firm_Size + NACE_Aggregated,
## data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8930 -0.4143 -0.2210 0.0679 4.0046
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 0.57811 0.09454
## InsuredInsured 0.09953 0.08417
## Firm_SizeSmall -0.17688 0.09757
## Firm_SizeMedium -0.33912 0.10424
## NACE_AggregatedConstruction -0.08649 0.11625
## NACE_AggregatedCommercial & services 0.23535 0.11354
## NACE_AggregatedProcess / consumer manufacturing & agro -0.01663 0.10242
## t value Pr(>|t|)
## (Intercept) 6.115 2.36e-09 ***
## InsuredInsured 1.183 0.23770
## Firm_SizeSmall -1.813 0.07062 .
## Firm_SizeMedium -3.253 0.00124 **
## NACE_AggregatedConstruction -0.744 0.45733
## NACE_AggregatedCommercial & services 2.073 0.03885 *
## NACE_AggregatedProcess / consumer manufacturing & agro -0.162 0.87110
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7789 on 388 degrees of freedom
## Multiple R-squared: 0.05398, Adjusted R-squared: 0.03935
## F-statistic: 3.69 on 6 and 388 DF, p-value: 0.001407
summary(model_current)
##
## Call:
## lm(formula = Current_Ratio ~ Insured + Firm_Size + NACE_Aggregated,
## data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8299 -1.1399 -0.4652 0.4653 12.2582
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 2.361738 0.245995
## InsuredInsured 0.520555 0.218993
## Firm_SizeSmall -0.504605 0.253867
## Firm_SizeMedium -0.701590 0.271233
## NACE_AggregatedConstruction 0.001287 0.302469
## NACE_AggregatedCommercial & services 0.844179 0.295423
## NACE_AggregatedProcess / consumer manufacturing & agro 0.144105 0.266496
## t value Pr(>|t|)
## (Intercept) 9.601 <2e-16 ***
## InsuredInsured 2.377 0.0179 *
## Firm_SizeSmall -1.988 0.0476 *
## Firm_SizeMedium -2.587 0.0101 *
## NACE_AggregatedConstruction 0.004 0.9966
## NACE_AggregatedCommercial & services 2.858 0.0045 **
## NACE_AggregatedProcess / consumer manufacturing & agro 0.541 0.5890
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.027 on 388 degrees of freedom
## Multiple R-squared: 0.06054, Adjusted R-squared: 0.04601
## F-statistic: 4.167 on 6 and 388 DF, p-value: 0.0004495
summary(model_quick)
##
## Call:
## lm(formula = Quick_Ratio ~ Insured + Firm_Size + NACE_Aggregated,
## data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0914 -0.9495 -0.4249 0.3409 9.3516
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 1.917846 0.193667
## InsuredInsured 0.236503 0.172409
## Firm_SizeSmall -0.444743 0.199865
## Firm_SizeMedium -0.779917 0.213536
## NACE_AggregatedConstruction -0.005552 0.238128
## NACE_AggregatedCommercial & services 0.519285 0.232581
## NACE_AggregatedProcess / consumer manufacturing & agro 0.008801 0.209808
## t value Pr(>|t|)
## (Intercept) 9.903 < 2e-16 ***
## InsuredInsured 1.372 0.170933
## Firm_SizeSmall -2.225 0.026641 *
## Firm_SizeMedium -3.652 0.000295 ***
## NACE_AggregatedConstruction -0.023 0.981410
## NACE_AggregatedCommercial & services 2.233 0.026138 *
## NACE_AggregatedProcess / consumer manufacturing & agro 0.042 0.966561
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.596 on 388 degrees of freedom
## Multiple R-squared: 0.06015, Adjusted R-squared: 0.04561
## F-statistic: 4.138 on 6 and 388 DF, p-value: 0.0004816
summary(model_nwc)
##
## Call:
## lm(formula = NWC_TA ~ Insured + Firm_Size + NACE_Aggregated,
## data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95986 -0.12895 0.01395 0.16485 0.59836
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 0.20566 0.03069
## InsuredInsured 0.14082 0.02732
## Firm_SizeSmall -0.06589 0.03167
## Firm_SizeMedium -0.11114 0.03383
## NACE_AggregatedConstruction 0.02516 0.03773
## NACE_AggregatedCommercial & services 0.07745 0.03685
## NACE_AggregatedProcess / consumer manufacturing & agro -0.02287 0.03324
## t value Pr(>|t|)
## (Intercept) 6.702 7.26e-11 ***
## InsuredInsured 5.155 4.05e-07 ***
## Firm_SizeSmall -2.081 0.03812 *
## Firm_SizeMedium -3.285 0.00111 **
## NACE_AggregatedConstruction 0.667 0.50524
## NACE_AggregatedCommercial & services 2.102 0.03622 *
## NACE_AggregatedProcess / consumer manufacturing & agro -0.688 0.49182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2528 on 388 degrees of freedom
## Multiple R-squared: 0.1007, Adjusted R-squared: 0.08675
## F-statistic: 7.238 on 6 and 388 DF, p-value: 2.445e-07
summary(model_dso)
##
## Call:
## lm(formula = DSO ~ Insured + Firm_Size + NACE_Aggregated, data = mydata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -96.72 -27.92 -9.83 14.15 476.81
##
## Coefficients:
## Estimate Std. Error
## (Intercept) 105.067 7.548
## InsuredInsured -15.372 6.720
## Firm_SizeSmall -31.835 7.790
## Firm_SizeMedium -28.263 8.323
## NACE_AggregatedConstruction -12.290 9.281
## NACE_AggregatedCommercial & services -2.947 9.065
## NACE_AggregatedProcess / consumer manufacturing & agro 6.736 8.177
## t value Pr(>|t|)
## (Intercept) 13.919 < 2e-16 ***
## InsuredInsured -2.288 0.022703 *
## Firm_SizeSmall -4.087 5.32e-05 ***
## Firm_SizeMedium -3.396 0.000755 ***
## NACE_AggregatedConstruction -1.324 0.186224
## NACE_AggregatedCommercial & services -0.325 0.745266
## NACE_AggregatedProcess / consumer manufacturing & agro 0.824 0.410611
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.19 on 388 degrees of freedom
## Multiple R-squared: 0.07169, Adjusted R-squared: 0.05734
## F-statistic: 4.994 on 6 and 388 DF, p-value: 6.05e-05
# Tidy coefficient outputs for easier table preparation
tidy(model_cash)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.578 0.0945 6.11 2.36e-9
## 2 InsuredInsured 0.0995 0.0842 1.18 2.38e-1
## 3 Firm_SizeSmall -0.177 0.0976 -1.81 7.06e-2
## 4 Firm_SizeMedium -0.339 0.104 -3.25 1.24e-3
## 5 NACE_AggregatedConstruction -0.0865 0.116 -0.744 4.57e-1
## 6 NACE_AggregatedCommercial & services 0.235 0.114 2.07 3.89e-2
## 7 NACE_AggregatedProcess / consumer manufa… -0.0166 0.102 -0.162 8.71e-1
tidy(model_current)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 2.36 0.246 9.60 1.01e-19
## 2 InsuredInsured 0.521 0.219 2.38 1.79e- 2
## 3 Firm_SizeSmall -0.505 0.254 -1.99 4.76e- 2
## 4 Firm_SizeMedium -0.702 0.271 -2.59 1.01e- 2
## 5 NACE_AggregatedConstruction 0.00129 0.302 0.00426 9.97e- 1
## 6 NACE_AggregatedCommercial & services 0.844 0.295 2.86 4.50e- 3
## 7 NACE_AggregatedProcess / consumer manuf… 0.144 0.266 0.541 5.89e- 1
tidy(model_quick)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.92 0.194 9.90 9.23e-21
## 2 InsuredInsured 0.237 0.172 1.37 1.71e- 1
## 3 Firm_SizeSmall -0.445 0.200 -2.23 2.66e- 2
## 4 Firm_SizeMedium -0.780 0.214 -3.65 2.95e- 4
## 5 NACE_AggregatedConstruction -0.00555 0.238 -0.0233 9.81e- 1
## 6 NACE_AggregatedCommercial & services 0.519 0.233 2.23 2.61e- 2
## 7 NACE_AggregatedProcess / consumer manuf… 0.00880 0.210 0.0419 9.67e- 1
tidy(model_nwc)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 0.206 0.0307 6.70 7.26e-11
## 2 InsuredInsured 0.141 0.0273 5.15 4.05e- 7
## 3 Firm_SizeSmall -0.0659 0.0317 -2.08 3.81e- 2
## 4 Firm_SizeMedium -0.111 0.0338 -3.28 1.11e- 3
## 5 NACE_AggregatedConstruction 0.0252 0.0377 0.667 5.05e- 1
## 6 NACE_AggregatedCommercial & services 0.0775 0.0369 2.10 3.62e- 2
## 7 NACE_AggregatedProcess / consumer manuf… -0.0229 0.0332 -0.688 4.92e- 1
tidy(model_dso)
## # A tibble: 7 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 105. 7.55 13.9 5.24e-36
## 2 InsuredInsured -15.4 6.72 -2.29 2.27e- 2
## 3 Firm_SizeSmall -31.8 7.79 -4.09 5.32e- 5
## 4 Firm_SizeMedium -28.3 8.32 -3.40 7.55e- 4
## 5 NACE_AggregatedConstruction -12.3 9.28 -1.32 1.86e- 1
## 6 NACE_AggregatedCommercial & services -2.95 9.07 -0.325 7.45e- 1
## 7 NACE_AggregatedProcess / consumer manuf… 6.74 8.18 0.824 4.11e- 1
# Model fit statistics for OLS models
glance(model_cash)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0540 0.0393 0.779 3.69 0.00141 6 -458. 933. 964.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_current)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0605 0.0460 2.03 4.17 0.000450 6 -836. 1688. 1720.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_quick)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0601 0.0456 1.60 4.14 0.000482 6 -741. 1499. 1531.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_nwc)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.101 0.0867 0.253 7.24 0.000000245 6 -13.8 43.6 75.4
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
glance(model_dso)
## # A tibble: 1 × 12
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0717 0.0573 62.2 4.99 0.0000605 6 -2188. 4393. 4425.
## # ℹ 3 more variables: deviance <dbl>, df.residual <int>, nobs <int>
# Logistic regression model for blocked-account incidence
model_blocked <- glm(
Accounts_Blocked_Num ~ Insured + Firm_Size + NACE_Aggregated,
data = mydata,
family = binomial
)
summary(model_blocked)
##
## Call:
## glm(formula = Accounts_Blocked_Num ~ Insured + Firm_Size + NACE_Aggregated,
## family = binomial, data = mydata)
##
## Coefficients:
## Estimate Std. Error
## (Intercept) -0.6405 0.3115
## InsuredInsured -1.3631 0.3533
## Firm_SizeSmall -1.4729 0.4212
## Firm_SizeMedium -0.7023 0.3973
## NACE_AggregatedConstruction 0.3659 0.4283
## NACE_AggregatedCommercial & services -0.0744 0.4354
## NACE_AggregatedProcess / consumer manufacturing & agro -0.3982 0.4910
## z value Pr(>|z|)
## (Intercept) -2.056 0.039776 *
## InsuredInsured -3.859 0.000114 ***
## Firm_SizeSmall -3.497 0.000471 ***
## Firm_SizeMedium -1.767 0.077154 .
## NACE_AggregatedConstruction 0.854 0.392870
## NACE_AggregatedCommercial & services -0.171 0.864312
## NACE_AggregatedProcess / consumer manufacturing & agro -0.811 0.417378
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 300.07 on 394 degrees of freedom
## Residual deviance: 258.62 on 388 degrees of freedom
## AIC: 272.62
##
## Number of Fisher Scoring iterations: 5
# Odds ratios for logistic regression
exp(coef(model_blocked))
## (Intercept)
## 0.5270421
## InsuredInsured
## 0.2558557
## Firm_SizeSmall
## 0.2292632
## Firm_SizeMedium
## 0.4954485
## NACE_AggregatedConstruction
## 1.4418728
## NACE_AggregatedCommercial & services
## 0.9282988
## NACE_AggregatedProcess / consumer manufacturing & agro
## 0.6715185
# Odds ratios with 95% confidence intervals
exp(cbind(
Odds_Ratio = coef(model_blocked),
confint.default(model_blocked)
))
## Odds_Ratio 2.5 %
## (Intercept) 0.5270421 0.2862147
## InsuredInsured 0.2558557 0.1280262
## Firm_SizeSmall 0.2292632 0.1004109
## Firm_SizeMedium 0.4954485 0.2273918
## NACE_AggregatedConstruction 1.4418728 0.6228230
## NACE_AggregatedCommercial & services 0.9282988 0.3954475
## NACE_AggregatedProcess / consumer manufacturing & agro 0.6715185 0.2565038
## 97.5 %
## (Intercept) 0.9705069
## InsuredInsured 0.5113183
## Firm_SizeSmall 0.5234654
## Firm_SizeMedium 1.0794988
## NACE_AggregatedConstruction 3.3380226
## NACE_AggregatedCommercial & services 2.1791478
## NACE_AggregatedProcess / consumer manufacturing & agro 1.7580132
# McFadden R-squared for logistic regression
mcfadden_r2 <- 1 - (model_blocked$deviance / model_blocked$null.deviance)
mcfadden_r2
## [1] 0.1381284
# Predicted probabilities from the logistic regression model
mydata$predicted_prob_blocked <- predict(model_blocked, type = "response")
# Check first few predicted probabilities
head(mydata$predicted_prob_blocked)
## 1 2 3 4 5 6
## 0.16278186 0.10780511 0.16278186 0.02998829 0.02033803 0.10780511
# ROC curve and AUC
library(pROC)
## Warning: package 'pROC' was built under R version 4.3.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_blocked <- roc(
response = mydata$Accounts_Blocked_Num,
predictor = mydata$predicted_prob_blocked,
levels = c(0, 1),
direction = "<"
)
# Area under the curve
auc_blocked <- auc(roc_blocked)
# Display AUC rounded to three decimals
round(auc_blocked, 3)
## [1] 0.756
# Plot ROC curve
plot(
roc_blocked,
main = "ROC curve for blocked-account incidence model"
)
# Optimal threshold based on Youden index
optimal_coords <- coords(
roc_blocked,
x = "best",
best.method = "youden",
ret = c("threshold", "sensitivity", "specificity"),
transpose = FALSE
)
optimal_coords
## threshold sensitivity specificity
## 1 0.1559851 0.62 0.8144928
# Save optimal threshold
threshold_best <- optimal_coords$threshold
threshold_best
## [1] 0.1559851
# Predicted class using optimal threshold
mydata$predicted_class_best <- ifelse(
mydata$predicted_prob_blocked >= threshold_best,
1,
0
)
# Confusion matrix using optimal threshold
conf_matrix_best <- table(
Actual = factor(mydata$Accounts_Blocked_Num, levels = c(0, 1)),
Predicted = factor(mydata$predicted_class_best, levels = c(0, 1))
)
conf_matrix_best
## Predicted
## Actual 0 1
## 0 281 64
## 1 19 31
# Extract values from confusion matrix
TN <- conf_matrix_best["0", "0"]
FP <- conf_matrix_best["0", "1"]
FN <- conf_matrix_best["1", "0"]
TP <- conf_matrix_best["1", "1"]
# Accuracy, sensitivity and specificity
accuracy_best <- (TP + TN) / sum(conf_matrix_best)
sensitivity_best <- TP / (TP + FN)
specificity_best <- TN / (TN + FP)
# Display rounded results
round(accuracy_best, 3)
## [1] 0.79
round(sensitivity_best, 3)
## [1] 0.62
round(specificity_best, 3)
## [1] 0.814
# Logistic regression model performance table
logit_performance <- data.frame(
Measure = c(
"McFadden R-squared",
"AUC",
"Optimal threshold",
"Accuracy",
"Sensitivity",
"Specificity"
),
Value = round(c(
mcfadden_r2,
as.numeric(auc_blocked),
threshold_best,
accuracy_best,
sensitivity_best,
specificity_best
), 3)
)
logit_performance
## Measure Value
## 1 McFadden R-squared 0.138
## 2 AUC 0.756
## 3 Optimal threshold 0.156
## 4 Accuracy 0.790
## 5 Sensitivity 0.620
## 6 Specificity 0.814