http: https://rpubs.com/staszkiewicz/RID_Inso_Step4_Oribs_AA_KRS

libraries

library(dplyr)
## 
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
## 
##     intersect, setdiff, setequal, union

Part one (AA data = audit opionns and fees)

load the data from step 3

df4 <- readRDS("InterimData/LongFormatStep3.rds")

Load audit anylitics data for Europe At the begining we have 12 header of the data we will dorp it

df4 <- df4[-c(1:12), ]

Le us read audit analytics data from excel (allready timmed manualy for 10 conties between 2015-2025)

library(readxl)


AA <- read_excel("Audit_Analitics_Data.xlsx", col_types = "text",  na = c("n.a.", "n.s.", "NA", "N/A", "b.d.", ""))

Let us shorthe the audit data to Euro relevant

AA_short<-AA[,c(1:7,19:20,25:41,47:49,54,63:76)]

Now chaing the names to cammel to make easy operation later on with prefix AA to distnguis between the datavases

library(janitor)
## 
## Dołączanie pakietu: 'janitor'
## Następujące obiekty zostały zakryte z 'package:stats':
## 
##     chisq.test, fisher.test
colnames(AA_short) <- paste0(
  "AA",
  make_clean_names(colnames(AA_short), case = "small_camel")
)

I will drop the KIK code, as those are the European dat

AA_short <- AA_short[, -4]

Drop not nesessary sharholders funds colums from the df4

df4 <- df4[, -c(32:42)]

merging at minimum

# library(dplyr)
#
# #1. clean AA
#
AA_short_clean <- AA_short %>%
   filter(!is.na(AAisin), !is.na(AAyearEnded))
# # 2. clean df4
#
 df4_clean <- df4 %>%
  filter(!is.na(IsinNumber))
 
 AA_short_clean <- AA_short_clean %>%
  mutate(AAyearEnded = as.integer(AAyearEnded))
 
 df4_clean <- df4_clean %>%
  mutate(year = as.integer(year))
#
# #joni for AA_short
#
merged_df <- AA_short_clean %>%
  left_join(
     df4_clean,
    by = c(
      "AAisin" = "IsinNumber",
      "AAyearEnded" = "year"
    )
  )
#

In sum less than 50% has been matched

mean(!is.na(merged_df$TotalAssetsEur))
## [1] 0.5557164

Those sets only for the robustness testig, or chacking the models fits (main model estitmate withoug the audit data

robust model with the audit data information), but generation of the vairalbes should be replicated in the both models

# Cretion combined big df4 but without texposionn.

AA_short_unique <- AA_short %>%
  filter(!is.na(AAisin), !is.na(AAyearEnded)) %>%
  mutate(AAyearEnded = as.integer(AAyearEnded)) %>%
  group_by(AAisin, AAyearEnded) %>%
  summarise(across(everything(), first), .groups = "drop")

Check that short unique is working

AA_short_unique %>%
  count(AAisin, AAyearEnded) %>%
  filter(n > 1)
## # A tibble: 0 × 3
## # ℹ 3 variables: AAisin <chr>, AAyearEnded <int>, n <int>
#
#

Ok no doubles

Safe merga to the df4 datavase

# request numeric year

AA_short_unique <- AA_short_unique %>%
  mutate(AAyearEnded = as.integer(AAyearEnded))

# merge

df4_merged <- df4 %>%
  left_join(
    AA_short_unique,
    by = c(
      "IsinNumber" = "AAisin",
      "year" = "AAyearEnded"
    )
  )

Final check

nrow(df4_merged) == nrow(df4)
## [1] TRUE

Dimension correct.

Now I have merged the data of AA for the Orbis data

OK is done step 4 concluded part I

cleaning up from this stem

rm(AA, AA_short,AA_short_clean,AA_short_unique, df4_clean)

Let us write to rds the AA_robust_checki_data

library(writexl)
saveRDS(merged_df, "InterimData/AAandFiacials_Step3.rds")  # we save the file with the Audit Analitic opinion, and audit fee in small format 

Merging the KRD data on Polish bankruptcies

library(readxl)


KRS <- read_excel("KRS_All.xlsx", col_types = "text",  na = c("n.a.", "n.s.", "NA", "N/A", "b.d.", ""))
library(janitor)



colnames(KRS) <- paste0(
  "KRS",
  make_clean_names(colnames(KRS), case = "small_camel")
)

Check for the KRS duplicates

 sum(duplicated(KRS$KRSodpisNaglowekANumerKrs))
## [1] 0

Link the both databases with the NIP (tax number)

“KRSodpisDaneDzial1DanePodmiotuIdentyfikatoryNip” nad “TaxIdentificationNumberTin”

Remove from KRS repeated information

KRS2 <- KRS[!is.na(KRS$KRSodpisDaneDzial1DanePodmiotuIdentyfikatoryNip), ]

Merge

merged_df_all <- merge(
  df4,
  KRS2,
  by.x = "TaxIdentificationNumberTin",
  by.y = "KRSodpisDaneDzial1DanePodmiotuIdentyfikatoryNip",
  all.x = TRUE
)

Save the final version of the datset for the analysis:

library(writexl)
saveRDS(merged_df_all, "InterimData/LongOrbisAAKRSStep4.rds")  # this 
saveRDS(merged_df, "InterimData/LongOnlyAAFiacialAfterStep4.rds")

The final version of thee long format from Orbis after merging the Audit Analiytics and polsih KRS data is the object “merged_df_all” that is saved as “InterimData/LongOrbisAAKRSStep4.rds”