Link: https://rpubs.com/staszkiewicz/RID_Ins_Step_2
df2 <- readRDS("InterimData/DB_final.rds")
Now we are starting reshaping procedures, all status before 2015 will be inegrted to one variable Old_status
Create:
StatusBefore2015 = 1 if any status before 2015 indicates bankruptcy 0 otherwise
# Identify all Status columns before 2015
pre2015_cols <- grep("^Status_\\d{4}$", names(df2), value = TRUE)
pre2015_cols <- pre2015_cols[as.numeric(sub("Status_", "", pre2015_cols)) < 2015]
# Create binary indicator: 1 if any pre-2015 status contains "bankru", else 0
df2$StatusBefore2015 <- apply(df2[pre2015_cols], 1, function(x) {
x <- tolower(as.character(x))
x <- x[!is.na(x)]
if (length(x) == 0) return(0)
as.integer(any(grepl("bankru", x)))
})
Her we end up only with two obesravaion
table(df2$StatusBefore2015 )
##
## 0 1
## 35564 2
So I am going to clean up all the pre 2015 columns
# Drop the old pre-2015 status columns
df2 <- df2[, !(names(df2) %in% pre2015_cols)]
Now I am generating CoyIndex based on the DvdIDNumber and replacing it with existing CoyIndex
library(dplyr)
##
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
df2 <- df2 %>%
mutate(company_id = as.integer(factor(BvdIdNumber))) %>%
select(company_id, everything(), -CoyIndex)
library(dplyr)
library(tidyr)
# -----------------------------
# 1. Create list of columns to reshape
# -----------------------------
# all columns ending with a 4-digit year
year_cols <- names(df2)[grepl("\\d{4}$", names(df2))]
# -----------------------------
# 2. Reshape wide -> long
# -----------------------------
df_long <- df2 %>%
pivot_longer(
cols = all_of(year_cols),
names_to = c("variable", "year"),
names_pattern = "^(.*?)(\\d{4})$",
values_to = "value",
values_transform = list(value = as.character)
) %>%
mutate(year = as.integer(year)) %>%
pivot_wider(
names_from = variable,
values_from = value
) %>%
arrange(company_id, year)
# -----------------------------
# 3. Optional: convert financial columns back to numeric
# -----------------------------
df_long <- df_long %>%
mutate(
across(
ends_with("Eur"),
as.numeric
)
)
# -----------------------------
# Result:
# one row = one company-year
# -----------------------------
glimpse(df_long)
## Rows: 426,792
## Columns: 91
## $ company_id <int> 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Source.Name <chr> "11232-13104_EU_10_View_1…
## $ CompanyNameLatinAlphabet <chr> "NAME", "NAME", "NAME", "…
## $ Country <chr> "COUNTRY", "COUNTRY", "CO…
## $ CountryIsoCode <chr> "COUNTRY_ISO_CODE", "COUN…
## $ Latitude <chr> "LATITUDE", "LATITUDE", "…
## $ Longitude <chr> "LONGITUDE", "LONGITUDE",…
## $ BvdIdNumber <chr> "BVD_ID_NUMBER", "BVD_ID_…
## $ BvdAccountNumber <chr> "BVD_ACCOUNT_NUMBER", "BV…
## $ OrbisIdNumber <chr> "ORBISID", "ORBISID", "OR…
## $ NationalId <chr> "NATIONAL_ID", "NATIONAL_…
## $ NationalIdLabel <chr> "NATIONAL_ID_LABEL", "NAT…
## $ TradeRegisterNumber <chr> "TRADE_REGISTER_NUMBER", …
## $ VatTaxNumber <chr> "VAT_NUMBER", "VAT_NUMBER…
## $ EuropeanVatNumber <chr> "EUROPEAN_VAT_NUMBER", "E…
## $ TaxIdentificationNumberTin <chr> "TIN", "TIN", "TIN", "TIN…
## $ LeiLegalEntityIdentifier <chr> "LEI", "LEI", "LEI", "LEI…
## $ StatisticalNumber <chr> "STATISTICAL_NUMBER", "ST…
## $ TickerSymbol <chr> "TICKER", "TICKER", "TICK…
## $ IsinNumber <chr> "ISIN", "ISIN", "ISIN", "…
## $ NaceRev2MainSection <chr> "NACE2_MAIN_SECTION", "NA…
## $ SizeClassification <chr> "COMPANY_CATEGORY", "COMP…
## $ Status <chr> "STATUS", "STATUS", "STAT…
## $ StatusDate <chr> "STATUS_DATE", "STATUS_DA…
## $ StatusYear <chr> "STATUS_DATE", "STATUS_DA…
## $ TypeOfEntity <chr> "ENTITY_TYPE", "ENTITY_TY…
## $ WomanOwnedIndicatorInUs <chr> "WOMAN_OWNED_INDICATOR", …
## $ EthnicMinorityOwnedIndicatorInUs <chr> "MINORITY_OWNED_INDICATOR…
## $ ConsolidationCode <chr> "CONSOLIDATION_CODE", "CO…
## $ FilingType <chr> "FILING_TYPE", "FILING_TY…
## $ NumberOfAvailableYears <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2025_1 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2024_2 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2023_3 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2022_4 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2021_5 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2020_6 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2019_7 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2018_8 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2017_9 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2016_10 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur2015_11 <dbl> NA, NA, NA, NA, NA, NA, N…
## $ Status_STATUS_DATE <chr> "STATUS", "STATUS", "STAT…
## $ year <int> 2015, 2016, 2017, 2018, 2…
## $ TotalAssetsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TotalShareholdersFundsLiabilitiesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NonCurrentAssetsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ IntangibleAssetsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ TangibleFixedAssetsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherNonCurrentAssetsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CurrentAssetsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ StockEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ DebtorsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherCurrentAssetsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OfWhichCashCashEquivalentEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ShareholdersFundsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CapitalEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherShareholdersFundsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NonCurrentLiabilitiesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ LongTermDebtEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherNonCurrentLiabilitiesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OfWhichProvisionsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CurrentLiabilitiesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ LoansShortTermDebtEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CreditorsEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherCurrentLiabilitiesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OperatingRevenueTurnoverEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ SalesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ CostsOfGoodsSoldEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ GrossProfitEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OtherOperatingExpenseIncomeEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ OperatingProfitLossEbitEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ FinancialProfitLossEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ FinancialRevenueEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ FinancialExpensesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ProfitLossBeforeTaxPbtEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ IncomeTaxExpensesBenefitEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ProfitLossAfterTaxPatEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NetExtraordinaryRevenuesExpensesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ExtraordinaryRevenuesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ExtraordinaryExpensesEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ ProfitLossForThePeriodNetIncomeEur <dbl> NA, NA, NA, NA, NA, NA, N…
## $ NetCashFromOperatingActivitiesEur <dbl> 15514, 15514, 15514, 1551…
## $ NetCashFromInvestingActivitiesEur <dbl> 15518, 15518, 15518, 1551…
## $ NetCashFromFinancingActivitiesEur <dbl> 15528, 15528, 15528, 1552…
## $ CashCashEquivalentsAtTheBeginningOfPeriodEur <dbl> 15531, 15531, 15531, 1553…
## $ CashCashEquivalentsAtTheEndOfPeriodEur <dbl> 15532, 15532, 15532, 1553…
## $ AuditStatus <chr> "AUDIT_STATUS", "AUDIT_ST…
## $ FiscalYear <chr> NA, NA, NA, NA, NA, NA, N…
## $ Status_ <chr> NA, NA, NA, NA, NA, NA, N…
## $ StatusBefore <chr> "0", NA, NA, NA, NA, NA, …
library(dplyr)
library(tidyr)
# ----------------------------------------
# 1. Financial variables only
# ----------------------------------------
financial_cols <- names(df_long)[grepl("Eur$", names(df_long))]
# ensure numeric
df_long <- df_long %>%
mutate(
across(
all_of(financial_cols),
as.numeric
)
)
# ----------------------------------------
# 2. Create summary statistics
# ----------------------------------------
summary_long <- df_long %>%
group_by(year) %>%
summarise(
across(
all_of(financial_cols),
list(
mean = ~ mean(., na.rm = TRUE),
min = ~ min(., na.rm = TRUE),
max = ~ max(., na.rm = TRUE),
sd = ~ sd(., na.rm = TRUE)
),
.names = "{.col}__{.fn}"
),
.groups = "drop"
) %>%
pivot_longer(
cols = -year,
names_to = c("variable", "statistic"),
names_sep = "__",
values_to = "value"
) %>%
mutate( #zaokrąglę sobie
across(
where(is.numeric),
~ round(., 1)
)
) %>%
pivot_wider(
names_from = year,
values_from = value
) %>%
arrange(variable, statistic)
## Warning: There were 88 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `across(...)`.
## ℹ In group 12: `year = 2026`.
## Caused by warning in `min()`:
## ! brak argumentów w min; zwracanie wartości Inf
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 87 remaining warnings.
# ----------------------------------------
# Result structure:
#
# variable statistic 2015 2016 ...
# TotalAssetsEur mean
# TotalAssetsEur min
# TotalAssetsEur max
# TotalAssetsEur sd
# SalesEur mean
# ...
# ----------------------------------------
summary_long
## # A tibble: 172 × 14
## variable statistic `2015` `2016` `2017` `2018` `2019` `2020` `2021`
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CapitalE… max 1.40e10 4.35e9 4.59e9 1.03e10 4.49e9 4.16e9 4.17e9
## 2 CapitalE… mean 6.43e 6 5.68e6 5.90e6 6.20e 6 5.78e6 5.57e6 5.65e6
## 3 CapitalE… min -1.84e 8 -1.84e8 -4.05e6 -2.02e 7 -7.45e6 -4.95e6 -4.77e6
## 4 CapitalE… sd 1.07e 8 5.84e7 6.19e7 8.48e 7 5.92e7 5.68e7 5.74e7
## 5 CashCash… max 1.46e 9 7.04e8 6.39e8 8.46e 8 6.39e8 8.16e8 9.11e8
## 6 CashCash… mean 2.80e 7 2.52e7 2.83e7 2.82e 7 2.72e7 2.52e7 3.35e7
## 7 CashCash… min -4.48e 7 -4.08e7 -6.71e7 -5.88e 7 -5.99e7 -8.43e7 -5.91e7
## 8 CashCash… sd 1.07e 8 7.54e7 7.71e7 8.34e 7 7.37e7 7.24e7 9.33e7
## 9 CashCash… max 7.30e 8 6.05e8 8.73e8 6.33e 8 8.82e8 9.05e8 1.46e9
## 10 CashCash… mean 2.68e 7 2.83e7 2.98e7 2.78e 7 2.78e7 3.41e7 4.37e7
## # ℹ 162 more rows
## # ℹ 5 more variables: `2022` <dbl>, `2023` <dbl>, `2024` <dbl>, `2025` <dbl>,
## # `2026` <dbl>
Conclusion Plausible including the pretension of the prepayments
library(writexl)
saveRDS(df2, "InterimData/WideFormatStep3.rds") # this the file before conversion
saveRDS(df_long, "InterimData/LongFormatStep3.rds")
Removing all object from the 2 Stage
rm(list=ls())