http: https://rpubs.com/staszkiewicz/RID_Inso_Step4_Oribs_AA_KRS
library(dplyr)
##
## Dołączanie pakietu: 'dplyr'
## Następujące obiekty zostały zakryte z 'package:stats':
##
## filter, lag
## Następujące obiekty zostały zakryte z 'package:base':
##
## intersect, setdiff, setequal, union
load the data from step 3
df4 <- readRDS("InterimData/LongFormatStep3.rds")
Load audit anylitics data for Europe At the begining we have 12 header of the data we will dorp it
df4 <- df4[-c(1:12), ]
Le us read audit analytics data from excel (allready timmed manualy for 10 conties between 2015-2025)
library(readxl)
AA <- read_excel("Audit_Analitics_Data.xlsx", col_types = "text", na = c("n.a.", "n.s.", "NA", "N/A", "b.d.", ""))
Let us shorthe the audit data to Euro relevant
AA_short<-AA[,c(1:7,19:20,25:41,47:49,54,63:76)]
Now chaing the names to cammel to make easy operation later on with prefix AA to distnguis between the datavases
library(janitor)
##
## Dołączanie pakietu: 'janitor'
## Następujące obiekty zostały zakryte z 'package:stats':
##
## chisq.test, fisher.test
colnames(AA_short) <- paste0(
"AA",
make_clean_names(colnames(AA_short), case = "small_camel")
)
I will drop the KIK code, as those are the European dat
AA_short <- AA_short[, -4]
Drop not nesessary sharholders funds colums from the df4
df4 <- df4[, -c(32:42)]
# library(dplyr)
#
# #1. clean AA
#
AA_short_clean <- AA_short %>%
filter(!is.na(AAisin), !is.na(AAyearEnded))
# # 2. clean df4
#
df4_clean <- df4 %>%
filter(!is.na(IsinNumber))
AA_short_clean <- AA_short_clean %>%
mutate(AAyearEnded = as.integer(AAyearEnded))
df4_clean <- df4_clean %>%
mutate(year = as.integer(year))
#
# #joni for AA_short
#
merged_df <- AA_short_clean %>%
left_join(
df4_clean,
by = c(
"AAisin" = "IsinNumber",
"AAyearEnded" = "year"
)
)
#
mean(!is.na(merged_df$TotalAssetsEur))
## [1] 0.5557164
AA_short_unique <- AA_short %>%
filter(!is.na(AAisin), !is.na(AAyearEnded)) %>%
mutate(AAyearEnded = as.integer(AAyearEnded)) %>%
group_by(AAisin, AAyearEnded) %>%
summarise(across(everything(), first), .groups = "drop")
AA_short_unique %>%
count(AAisin, AAyearEnded) %>%
filter(n > 1)
## # A tibble: 0 × 3
## # ℹ 3 variables: AAisin <chr>, AAyearEnded <int>, n <int>
#
#
Safe merga to the df4 datavase
# request numeric year
AA_short_unique <- AA_short_unique %>%
mutate(AAyearEnded = as.integer(AAyearEnded))
# merge
df4_merged <- df4 %>%
left_join(
AA_short_unique,
by = c(
"IsinNumber" = "AAisin",
"year" = "AAyearEnded"
)
)
nrow(df4_merged) == nrow(df4)
## [1] TRUE
Dimension correct.
Now I have merged the data of AA for the Orbis data
cleaning up from this stem
rm(AA, AA_short,AA_short_clean,AA_short_unique, df4_clean)
Let us write to rds the AA_robust_checki_data
library(writexl)
saveRDS(merged_df, "InterimData/AAandFiacials_Step3.rds") # we save the file with the Audit Analitic opinion, and audit fee in small format
library(readxl)
KRS <- read_excel("KRS_All.xlsx", col_types = "text", na = c("n.a.", "n.s.", "NA", "N/A", "b.d.", ""))
library(janitor)
colnames(KRS) <- paste0(
"KRS",
make_clean_names(colnames(KRS), case = "small_camel")
)
Check for the KRS duplicates
sum(duplicated(KRS$KRSodpisNaglowekANumerKrs))
## [1] 0
Link the both databases with the NIP (tax number)
“KRSodpisDaneDzial1DanePodmiotuIdentyfikatoryNip” nad “TaxIdentificationNumberTin”
Remove from KRS repeated information
KRS2 <- KRS[!is.na(KRS$KRSodpisDaneDzial1DanePodmiotuIdentyfikatoryNip), ]
Merge
merged_df_all <- merge(
df4,
KRS2,
by.x = "TaxIdentificationNumberTin",
by.y = "KRSodpisDaneDzial1DanePodmiotuIdentyfikatoryNip",
all.x = TRUE
)
Save the final version of the datset for the analysis:
library(writexl)
saveRDS(merged_df_all, "InterimData/LongOrbisAAKRSStep4.rds") # this
saveRDS(merged_df, "InterimData/LongOnlyAAFiacialAfterStep4.rds")
The final version of thee long format from Orbis after merging the Audit Analiytics and polsih KRS data is the object “merged_df_all” that is saved as “InterimData/LongOrbisAAKRSStep4.rds”