Link: https://rpubs.com/staszkiewicz/RID_Ins_Step_2

Initial checks and adding up variables to make the panel working

  1. I took the data from the step 1 and 2 as there in not changes
df2 <- readRDS("InterimData/DB_final.rds")

Now we are starting reshaping procedures, all status before 2015 will be inegrted to one variable Old_status

Create:

StatusBefore2015 = 1 if any status before 2015 indicates bankruptcy 0 otherwise

# Identify all Status columns before 2015
pre2015_cols <- grep("^Status_\\d{4}$", names(df2), value = TRUE)
pre2015_cols <- pre2015_cols[as.numeric(sub("Status_", "", pre2015_cols)) < 2015]

# Create binary indicator: 1 if any pre-2015 status contains "bankru", else 0
df2$StatusBefore2015 <- apply(df2[pre2015_cols], 1, function(x) {
  x <- tolower(as.character(x))
  x <- x[!is.na(x)]
  
  if (length(x) == 0) return(0)
  
  as.integer(any(grepl("bankru", x)))
})

Her we end up only with two obesravaion

table(df2$StatusBefore2015 )
## 
##     0     1 
## 35564     2

So I am going to clean up all the pre 2015 columns

# Drop the old pre-2015 status columns
df2 <- df2[, !(names(df2) %in% pre2015_cols)]

Now I am generating CoyIndex based on the DvdIDNumber and replacing it with existing CoyIndex

library(dplyr)
## 
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union
df2 <- df2 %>%
  mutate(company_id = as.integer(factor(BvdIdNumber))) %>%
  select(company_id, everything(), -CoyIndex)

Reshape

library(dplyr)
library(tidyr)

# -----------------------------
# 1. Create list of columns to reshape
# -----------------------------

# all columns ending with a 4-digit year
year_cols <- names(df2)[grepl("\\d{4}$", names(df2))]


# -----------------------------
# 2. Reshape wide -> long
# -----------------------------

df_long <- df2 %>%
  
  pivot_longer(
    cols = all_of(year_cols),
    names_to = c("variable", "year"),
    names_pattern = "^(.*?)(\\d{4})$",
    values_to = "value",
    values_transform = list(value = as.character)
  ) %>%
  
  mutate(year = as.integer(year)) %>%
  
  pivot_wider(
    names_from = variable,
    values_from = value
  ) %>%
  
  arrange(company_id, year)

# -----------------------------
# 3. Optional: convert financial columns back to numeric
# -----------------------------

df_long <- df_long %>%
  mutate(
    across(
      ends_with("Eur"),
      as.numeric
    )
  )

# -----------------------------
# Result:
# one row = one company-year
# -----------------------------

glimpse(df_long)
## Rows: 426,792
## Columns: 91
## $ company_id                                   <int> 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Source.Name                                  <chr> "11232-13104_EU_10_View_1…
## $ CompanyNameLatinAlphabet                     <chr> "NAME", "NAME", "NAME", "…
## $ Country                                      <chr> "COUNTRY", "COUNTRY", "CO…
## $ CountryIsoCode                               <chr> "COUNTRY_ISO_CODE", "COUN…
## $ Latitude                                     <chr> "LATITUDE", "LATITUDE", "…
## $ Longitude                                    <chr> "LONGITUDE", "LONGITUDE",…
## $ BvdIdNumber                                  <chr> "BVD_ID_NUMBER", "BVD_ID_…
## $ BvdAccountNumber                             <chr> "BVD_ACCOUNT_NUMBER", "BV…
## $ OrbisIdNumber                                <chr> "ORBISID", "ORBISID", "OR…
## $ NationalId                                   <chr> "NATIONAL_ID", "NATIONAL_…
## $ NationalIdLabel                              <chr> "NATIONAL_ID_LABEL", "NAT…
## $ TradeRegisterNumber                          <chr> "TRADE_REGISTER_NUMBER", …
## $ VatTaxNumber                                 <chr> "VAT_NUMBER", "VAT_NUMBER…
## $ EuropeanVatNumber                            <chr> "EUROPEAN_VAT_NUMBER", "E…
## $ TaxIdentificationNumberTin                   <chr> "TIN", "TIN", "TIN", "TIN…
## $ LeiLegalEntityIdentifier                     <chr> "LEI", "LEI", "LEI", "LEI…
## $ StatisticalNumber                            <chr> "STATISTICAL_NUMBER", "ST…
## $ TickerSymbol                                 <chr> "TICKER", "TICKER", "TICK…
## $ IsinNumber                                   <chr> "ISIN", "ISIN", "ISIN", "…
## $ NaceRev2MainSection                          <chr> "NACE2_MAIN_SECTION", "NA…
## $ SizeClassification                           <chr> "COMPANY_CATEGORY", "COMP…
## $ Status                                       <chr> "STATUS", "STATUS", "STAT…
## $ StatusDate                                   <chr> "STATUS_DATE", "STATUS_DA…
## $ StatusYear                                   <chr> "STATUS_DATE", "STATUS_DA…
## $ TypeOfEntity                                 <chr> "ENTITY_TYPE", "ENTITY_TY…
## $ WomanOwnedIndicatorInUs                      <chr> "WOMAN_OWNED_INDICATOR", …
## $ EthnicMinorityOwnedIndicatorInUs             <chr> "MINORITY_OWNED_INDICATOR…
## $ ConsolidationCode                            <chr> "CONSOLIDATION_CODE", "CO…
## $ FilingType                                   <chr> "FILING_TYPE", "FILING_TY…
## $ NumberOfAvailableYears                       <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2025_1   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2024_2   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2023_3   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2022_4   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2021_5   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2020_6   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2019_7   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2018_8   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2017_9   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2016_10  <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2015_11  <dbl> NA, NA, NA, NA, NA, NA, N…
## $ Status_STATUS_DATE                           <chr> "STATUS", "STATUS", "STAT…
## $ year                                         <int> 2015, 2016, 2017, 2018, 2…
## $ TotalAssetsEur                               <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur         <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NonCurrentAssetsEur                          <dbl> NA, NA, NA, NA, NA, NA, N…
## $ IntangibleAssetsEur                          <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TangibleFixedAssetsEur                       <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherNonCurrentAssetsEur                     <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CurrentAssetsEur                             <dbl> NA, NA, NA, NA, NA, NA, N…
## $ StockEur                                     <dbl> NA, NA, NA, NA, NA, NA, N…
## $ DebtorsEur                                   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherCurrentAssetsEur                        <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OfWhichCashCashEquivalentEur                 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ShareholdersFundsEur                         <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CapitalEur                                   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherShareholdersFundsEur                    <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NonCurrentLiabilitiesEur                     <dbl> NA, NA, NA, NA, NA, NA, N…
## $ LongTermDebtEur                              <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherNonCurrentLiabilitiesEur                <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OfWhichProvisionsEur                         <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CurrentLiabilitiesEur                        <dbl> NA, NA, NA, NA, NA, NA, N…
## $ LoansShortTermDebtEur                        <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CreditorsEur                                 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherCurrentLiabilitiesEur                   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OperatingRevenueTurnoverEur                  <dbl> NA, NA, NA, NA, NA, NA, N…
## $ SalesEur                                     <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CostsOfGoodsSoldEur                          <dbl> NA, NA, NA, NA, NA, NA, N…
## $ GrossProfitEur                               <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherOperatingExpenseIncomeEur               <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OperatingProfitLossEbitEur                   <dbl> NA, NA, NA, NA, NA, NA, N…
## $ FinancialProfitLossEur                       <dbl> NA, NA, NA, NA, NA, NA, N…
## $ FinancialRevenueEur                          <dbl> NA, NA, NA, NA, NA, NA, N…
## $ FinancialExpensesEur                         <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ProfitLossBeforeTaxPbtEur                    <dbl> NA, NA, NA, NA, NA, NA, N…
## $ IncomeTaxExpensesBenefitEur                  <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ProfitLossAfterTaxPatEur                     <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NetExtraordinaryRevenuesExpensesEur          <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ExtraordinaryRevenuesEur                     <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ExtraordinaryExpensesEur                     <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ProfitLossForThePeriodNetIncomeEur           <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NetCashFromOperatingActivitiesEur            <dbl> 15514, 15514, 15514, 1551…
## $ NetCashFromInvestingActivitiesEur            <dbl> 15518, 15518, 15518, 1551…
## $ NetCashFromFinancingActivitiesEur            <dbl> 15528, 15528, 15528, 1552…
## $ CashCashEquivalentsAtTheBeginningOfPeriodEur <dbl> 15531, 15531, 15531, 1553…
## $ CashCashEquivalentsAtTheEndOfPeriodEur       <dbl> 15532, 15532, 15532, 1553…
## $ AuditStatus                                  <chr> "AUDIT_STATUS", "AUDIT_ST…
## $ FiscalYear                                   <chr> NA, NA, NA, NA, NA, NA, N…
## $ Status_                                      <chr> NA, NA, NA, NA, NA, NA, N…
## $ StatusBefore                                 <chr> "0", NA, NA, NA, NA, NA, …

Descriptive statistics by years

library(dplyr)
library(tidyr)

# ----------------------------------------
# 1. Financial variables only
# ----------------------------------------

financial_cols <- names(df_long)[grepl("Eur$", names(df_long))]

# ensure numeric
df_long <- df_long %>%
  mutate(
    across(
      all_of(financial_cols),
      as.numeric
    )
  )

# ----------------------------------------
# 2. Create summary statistics
# ----------------------------------------

summary_long <- df_long %>%
  
  group_by(year) %>%
  
  summarise(
    
    across(
      all_of(financial_cols),
      
      list(
        mean = ~ mean(., na.rm = TRUE),
        min  = ~ min(., na.rm = TRUE),
        max  = ~ max(., na.rm = TRUE),
        sd   = ~ sd(., na.rm = TRUE)
      ),
      
      .names = "{.col}__{.fn}"
    ),
    
    .groups = "drop"
  ) %>%
  
  pivot_longer(
    cols = -year,
    names_to = c("variable", "statistic"),
    names_sep = "__",
    values_to = "value"
  ) %>%
  
   mutate(                                                    #zaokrąglę sobie
    across(
      where(is.numeric),
      ~ round(., 1)
    )
  ) %>%
  
  pivot_wider(
    names_from = year,
    values_from = value
  ) %>%
  
  arrange(variable, statistic)
## Warning: There were 88 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## ℹ In group 12: `year = 2026`.
## Caused by warning in `min()`:
## ! brak argumentów w min; zwracanie wartości Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 87 remaining warnings.
# ----------------------------------------
# Result structure:
#
# variable                     statistic   2015   2016 ...
# TotalAssetsEur               mean
# TotalAssetsEur               min
# TotalAssetsEur               max
# TotalAssetsEur               sd
# SalesEur                     mean
# ...
# ----------------------------------------

summary_long
## # A tibble: 172 × 14
##    variable  statistic   `2015`  `2016`  `2017`   `2018`  `2019`  `2020`  `2021`
##    <chr>     <chr>        <dbl>   <dbl>   <dbl>    <dbl>   <dbl>   <dbl>   <dbl>
##  1 CapitalE… max        1.40e10  4.35e9  4.59e9  1.03e10  4.49e9  4.16e9  4.17e9
##  2 CapitalE… mean       6.43e 6  5.68e6  5.90e6  6.20e 6  5.78e6  5.57e6  5.65e6
##  3 CapitalE… min       -1.84e 8 -1.84e8 -4.05e6 -2.02e 7 -7.45e6 -4.95e6 -4.77e6
##  4 CapitalE… sd         1.07e 8  5.84e7  6.19e7  8.48e 7  5.92e7  5.68e7  5.74e7
##  5 CashCash… max        1.46e 9  7.04e8  6.39e8  8.46e 8  6.39e8  8.16e8  9.11e8
##  6 CashCash… mean       2.80e 7  2.52e7  2.83e7  2.82e 7  2.72e7  2.52e7  3.35e7
##  7 CashCash… min       -4.48e 7 -4.08e7 -6.71e7 -5.88e 7 -5.99e7 -8.43e7 -5.91e7
##  8 CashCash… sd         1.07e 8  7.54e7  7.71e7  8.34e 7  7.37e7  7.24e7  9.33e7
##  9 CashCash… max        7.30e 8  6.05e8  8.73e8  6.33e 8  8.82e8  9.05e8  1.46e9
## 10 CashCash… mean       2.68e 7  2.83e7  2.98e7  2.78e 7  2.78e7  3.41e7  4.37e7
## # ℹ 162 more rows
## # ℹ 5 more variables: `2022` <dbl>, `2023` <dbl>, `2024` <dbl>, `2025` <dbl>,
## #   `2026` <dbl>

Conclusion Plausible including the pretension of the prepayments

Saving data for the wide panel without merging with audit analitics and polish data

library(writexl)
saveRDS(df2, "InterimData/WideFormatStep3.rds")  # this the file before conversion
saveRDS(df_long, "InterimData/LongFormatStep3.rds")

Cleaning up

Removing all object from the 2 Stage

rm(list=ls())