Load libraries
# use this to save rmarkdown::render("~/Nextcloud/Dropbox/global_logs/win/notebook.Rmd")
# Load required libraries
library(arrow)
library(fixest)
library(dplyr)
library(ggplot2)
library(modelsummary)
library(ggplot2)
library(tidyr)
library(gridExtra)
# Load the merged Compustat data
df <- read_parquet("compustat_merged_data.parquet")
# Matplotlib default colors (first 10)
matplotlib_colors <- c(
"#1f77b4", # blue
"#ff7f0e", # orange
"#2ca02c", # green
"#d62728", # red
"#9467bd", # purple
"#8c564b", # brown
"#e377c2", # pink
"#7f7f7f", # gray
"#bcbd22", # olive
"#17becf" # cyan
)
# Create leverage variables after loading data
df <- df %>%
mutate(
leverage = ifelse(leverage > 0, leverage, NA),
leverage_decile = ntile(leverage, 10) - 1
)
# Verify the results
cat("Leverage summary:\n")
Leverage summary:
summary(df$leverage)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
0.000 0.334 0.507 0.512 0.697 13.503 32052
cat("\nLeverage decile distribution:\n")
Leverage decile distribution:
table(df$leverage_decile, useNA = "ifany")
0 1 2 3 4 5 6 7 8 9 <NA>
65611 65611 65611 65611 65611 65611 65611 65610 65610 65610 32052
Basic data exploration
cat(paste(
"Dataset dimensions:", paste(dim(df), collapse=" x "),
"\nTime period:", paste(range(df$fyear, na.rm=TRUE), collapse=" - "),
"\nNumber of firms:", length(unique(df$gvkey)),
"\nCountries:", length(unique(df$country)), "\n"
))
Dataset dimensions: 688159 x 62
Time period: 1981 - 2025
Number of firms: 65533
Countries: 119
Check data distribution by source/country
table(df$country, useNA = "ifany")
ARE ARG AUS AUT AZE BEL BFA BGD BGR BHR BMU BRA BRB BWA CAN CHE CHL CHN CIV COL CYM CYP CZE DEU DNK ECU EGY ESP EST FIN FLK FRA FRO GAB GBR
880 948 25851 1014 19 1568 1 2379 771 277 452 4403 11 175 40 3029 1605 59166 64 532 584 837 171 9023 1912 27 1656 2166 318 2260 12 9595 6 19 19259
GGY GHA GIB GRC GRL GUF HKG HRV HUN IDN IMN IND IRL ISL ISR ITA JAM JEY JOR JPN KAZ KEN KGZ KHM KOR KWT LBN LBR LKA LTU LUX LVA MAC MAR MCO
172 173 57 3071 7 3 18003 1044 314 5843 117 49076 684 104 4998 4194 485 207 1682 53923 217 590 16 19 22709 1427 26 2 3253 456 431 297 136 523 27
MEX MKD MLT MNG MOZ MUS MWI MYS NAM NGA NLD NOR NZL OMN PAK PAN PER PHL PNG POL PRT PSE QAT ROU RUS SAU SDN SEN SGP SLB SRB SVK SVN SWE THA
1496 15 230 15 19 536 53 15344 73 1379 1711 3035 1852 1126 5192 33 1297 2775 78 7863 705 216 374 2298 2133 1980 15 9 9045 11 291 93 369 8851 9125
TTO TUN TUR TWN TZA UGA UKR USA VEN VGB VNM ZAF ZMB ZWE
205 366 3945 30246 99 61 233 238086 92 197 5290 3704 171 511
Remove observations with missing key variables for regression
df_reg <- df %>%
filter(
!is.na(mv_usd), !is.na(bv_usd), !is.na(ni_usd), !is.na(at_usd),
mv_usd > 0, at_usd > 0
) %>%
mutate(industry = substr(as.character(sich), 1, 2))
Basic descriptive statistics for key variables
summary(df[c("sich", "mv_usd", "bv_usd", "ni_usd", "at_usd")])
sich mv_usd bv_usd ni_usd at_usd
Min. : 100 Min. : 0.0 Min. : -139965.0 Min. : -99289.0 Min. :0.000e+00
1st Qu.:2836 1st Qu.: 22.0 1st Qu.: 14.7 1st Qu.: -1.0 1st Qu.:3.440e+01
Median :3674 Median : 103.3 Median : 69.9 Median : 2.9 Median :1.612e+02
Mean :4289 Mean : 2003.4 Mean : 788.0 Mean : 99.0 Mean :2.994e+03
3rd Qu.:5812 3rd Qu.: 564.7 3rd Qu.: 292.5 3rd Qu.: 22.2 3rd Qu.:7.324e+02
Max. :9998 Max. :3522211.1 Max. :13003448.8 Max. :11854605.8 Max. :1.365e+07
NA's :43311
summary(df[c("at_usd", "m2b", "roe", "roa")])
at_usd m2b roe roa
Min. :0.000e+00 Min. :-99.9653 Min. :-98.28070 Min. :-97.33333
1st Qu.:3.440e+01 1st Qu.: 0.7509 1st Qu.: -0.03534 1st Qu.: -0.02769
Median :1.612e+02 Median : 1.4403 Median : 0.06659 Median : 0.02312
Mean :2.994e+03 Mean : 2.4510 Mean : -0.00484 Mean : -0.09716
3rd Qu.:7.324e+02 3rd Qu.: 2.8006 3rd Qu.: 0.14949 3rd Qu.: 0.06560
Max. :1.365e+07 Max. : 99.9534 Max. : 99.82143 Max. : 85.26677
Model 1: Basic market cap on book value regression
# Model 1: Basic market cap on book value regression
model1 <- feols(ln_abs_mv_usd ~ ln_abs_bv_usd, data = df_reg, vcov = "hetero")
Model 2: Add net income and other
model2 <- feols(ln_abs_mv_usd ~ ln_abs_bv_0_usd + ln_abs_ni_usd + ln_abs_oth_usd, data = df_reg, vcov = "hetero")
Model 3: Add total assets and total liabilities
model3 <- feols(ln_abs_mv_usd ~ ln_abs_at_usd + ln_abs_lt_usd, data = df_reg, vcov = "hetero")
Model 4: Add year fixed effects
model4 <- feols(ln_abs_mv_usd ~ ln_abs_bv_usd | fyear, data = df_reg, vcov = "hetero")
Model 5: Add country fixed effects
model5 <- feols(ln_abs_mv_usd ~ ln_abs_bv_usd | fyear + country, data = df_reg, vcov = "hetero")
Model 6: Add industry fixed effects (if sich available)
model6 <- feols(ln_abs_mv_usd ~ ln_abs_bv_usd | fyear + country + industry, data = df_reg, vcov = "hetero")
Model 7: Firm fixed effects (panel regression)
model7 <- feols(ln_abs_mv_usd ~ ln_abs_bv_usd | gvkey, data = df_reg, vcov = "hetero")
model8 <- feols(ln_abs_mv_usd ~ ln_abs_bv_usd | fyear + country, data = df_reg, cluster = c("gvkey", "fyear"))
Create model list
models <- list("Basic" = model1,
"+ Income and Other" = model2,
"+ Asset Liabilities" = model3,
"Income + Year FE" = model4,
"+ Country FE" = model5,
"+ Industry FE" = model6,
"Firm FE" = model7,
"Cluster Firm Year " = model8)
Display regression results
etable(models, title = "Market Capitalization Regressions",
notes = "Dependent variable: Log(Market Cap USD). Standard errors are heteroskedasticity-robust.",
digits = 3)
Basic + Income and Ot.. + Asset Liabili.. Income + Year FE + Country FE + Industry FE Firm FE Cluster Firm Y..
Dependent Var.: ln_abs_mv_usd ln_abs_mv_usd ln_abs_mv_usd ln_abs_mv_usd ln_abs_mv_usd ln_abs_mv_usd ln_abs_mv_usd ln_abs_mv_usd
Constant 0.681*** (0.004) 1.53*** (0.005) 0.128*** (0.007)
ln_abs_bv_usd 0.947*** (0.0007) 0.948*** (0.0007) 0.918*** (0.0008) 0.931*** (0.0008) 0.691*** (0.002) 0.918*** (0.004)
ln_abs_bv_0_usd 0.594*** (0.002)
ln_abs_ni_usd 0.331*** (0.001)
ln_abs_oth_usd 0.028*** (0.0003)
ln_abs_at_usd 1.09*** (0.004)
ln_abs_lt_usd -0.220*** (0.003)
Fixed-Effects: ----------------- ----------------- ----------------- ----------------- ----------------- ----------------- ---------------- ----------------
fyear No No No Yes Yes Yes No Yes
country No No No No Yes Yes No Yes
industry No No No No No Yes No No
gvkey No No No No No No Yes No
_______________ _________________ _________________ _________________ _________________ _________________ _________________ ________________ ________________
S.E. type Heteroskeda.-rob. Heteroskeda.-rob. Heteroskeda.-rob. Heteroskeda.-rob. Heteroskeda.-rob. Heteroskeda.-rob. Heterosked.-rob. by: gvkey & fyear
Observations 688,159 688,159 687,627 688,158 688,157 644,846 683,319 688,157
R2 0.77862 0.77200 0.74348 0.78422 0.80440 0.80891 0.92023 0.80440
Within R2 -- -- -- 0.78007 0.75552 0.75016 0.36578 0.75552
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Define the slope coefficient names we want to sum
slope_vars <- c("ln_abs_bv_usd", "ln_abs_bv_0_usd", "ln_abs_ni_usd", "ln_abs_oth_usd", "ln_abs_at_usd", "ln_abs_lt_usd")
# Function to sum coefficients for a model
sum_coefficients <- function(model, var_names) {
coeffs <- coef(model)
# Get coefficients that exist in the model
existing_vars <- intersect(names(coeffs), var_names)
if(length(existing_vars) > 0) {
return(sum(coeffs[existing_vars]))
} else {
return(NA)
}
}
Calculate the sum of coefficients for each model
coeff_sums <- sapply(models, sum_coefficients, var_names = slope_vars)
# Create a nice summary table
coeff_summary <- data.frame(
Model = names(models),
Sum_of_Coefficients = round(coeff_sums, 4),
stringsAsFactors = FALSE
)
print(coeff_summary)
Visualization of coefficient sums
# Remove NA values for plotting
plot_data <- coeff_summary[!is.na(coeff_summary$Sum_of_Coefficients), ]
ggplot(plot_data, aes(x = reorder(Model, Sum_of_Coefficients), y = Sum_of_Coefficients)) +
geom_col(fill = "steelblue", alpha = 0.7) +
geom_text(aes(label = round(Sum_of_Coefficients, 3)),
hjust = -0.1, size = 3) +
coord_flip() +
labs(title = "Sum of Slope Coefficients by Model",
x = "Model",
y = "Sum of Coefficients") +
theme_minimal()
Detailed breakdown of coefficients by model
# Show which coefficients are present in each model
for(i in 1:length(models)) {
model_name <- names(models)[i]
model <- models[[i]]
coeffs <- coef(model)
# Find slope coefficients that exist in this model
existing_slopes <- intersect(names(coeffs), slope_vars)
cat("\n", model_name, ":\n")
if(length(existing_slopes) > 0) {
for(var in existing_slopes) {
cat(" ", var, ":", round(coeffs[var], 4), "\n")
}
cat(" Sum:", round(sum(coeffs[existing_slopes]), 4), "\n")
} else {
cat(" No slope coefficients found\n")
}
}
Basic :
ln_abs_bv_usd : 0.9466
Sum: 0.9466
+ Income and Other :
ln_abs_bv_0_usd : 0.5939
ln_abs_ni_usd : 0.3308
ln_abs_oth_usd : 0.0276
Sum: 0.9523
+ Asset Liabilities :
ln_abs_at_usd : 1.0857
ln_abs_lt_usd : -0.2202
Sum: 0.8656
Income + Year FE :
ln_abs_bv_usd : 0.9485
Sum: 0.9485
+ Country FE :
ln_abs_bv_usd : 0.918
Sum: 0.918
+ Industry FE :
ln_abs_bv_usd : 0.931
Sum: 0.931
Firm FE :
ln_abs_bv_usd : 0.6911
Sum: 0.6911
Cluster Firm Year :
ln_abs_bv_usd : 0.918
Sum: 0.918
Annual regression model
annual_regression <- function(df, regmodel, copy_to_clipboard = FALSE) {
# Get sorted years
years <- sort(unique(df$fyear[!is.na(df$fyear)]))
# Initialize list to store results
results_by_year <- list()
# Extract expected variable names from the formula
formula_vars <- all.vars(as.formula(regmodel))
expected_vars <- formula_vars[-1] # Remove dependent variable
expected_vars <- c("(Intercept)", expected_vars) # Add intercept
# Loop through each year
for (y in years) {
cat("Running regression for year", y, "...\n")
# Try to run regression for this year
tryCatch({
# Filter data for this year
df_year <- df[df$fyear == y & !is.na(df$fyear), ]
# Check minimum observations
if (nrow(df_year) < 5) {
cat("⚠️ Skipping year", y, "- insufficient observations (", nrow(df_year), ")\n")
next
}
# Run regression with clustering by gvkey
model <- feols(as.formula(regmodel),
data = df_year,
cluster = "gvkey")
# Extract coefficients
coeffs <- coef(model)
# Create a standardized data frame with all expected variables
tidy_row <- data.frame(fyear = y)
# Add each expected coefficient (NA if not present)
for (var in expected_vars) {
if (var %in% names(coeffs)) {
tidy_row[[var]] <- coeffs[var]
} else {
tidy_row[[var]] <- NA
}
}
# Store the result
results_by_year[[length(results_by_year) + 1]] <- tidy_row
}, error = function(e) {
cat("⚠️ Regression for year", y, "failed:", e$message, "\n")
})
}
# Combine results if any exist
if (length(results_by_year) > 0) {
# Bind all rows together (now they all have the same columns)
df_yearly_results <- do.call(rbind, results_by_year)
# Calculate sum of slopes (excluding intercept)
slope_cols <- setdiff(names(df_yearly_results), c("fyear", "(Intercept)"))
df_yearly_results$sum_slopes <- rowSums(df_yearly_results[slope_cols], na.rm = TRUE)
# Set fyear as row names and remove the column
rownames(df_yearly_results) <- df_yearly_results$fyear
df_yearly_results$fyear <- NULL
# Copy to clipboard if requested
if (copy_to_clipboard) {
write.table(df_yearly_results, "clipboard", sep = "\t", row.names = TRUE)
cat("Results copied to clipboard!\n")
}
return(df_yearly_results)
} else {
cat("❌ No valid regressions could be run.\n")
return(NULL)
}
}
Run annual regression model
regmodel <- "ln_abs_mv_usd ~ ln_abs_bv_0_usd + ln_abs_ni_usd + ln_abs_oth_usd"
# Run the annual regression
df_y <- annual_regression(df_reg %>% filter(fyear <= 2024), regmodel)
Running regression for year 1981 ...
Running regression for year 1982 ...
Running regression for year 1983 ...
Running regression for year 1984 ...
Running regression for year 1985 ...
Running regression for year 1986 ...
Running regression for year 1987 ...
Running regression for year 1988 ...
Running regression for year 1989 ...
Running regression for year 1990 ...
Running regression for year 1991 ...
Running regression for year 1992 ...
Running regression for year 1993 ...
Running regression for year 1994 ...
Running regression for year 1995 ...
Running regression for year 1996 ...
Running regression for year 1997 ...
Running regression for year 1998 ...
Running regression for year 1999 ...
Running regression for year 2000 ...
Running regression for year 2001 ...
Running regression for year 2002 ...
Running regression for year 2003 ...
Running regression for year 2004 ...
Running regression for year 2005 ...
Running regression for year 2006 ...
Running regression for year 2007 ...
Running regression for year 2008 ...
Running regression for year 2009 ...
Running regression for year 2010 ...
Running regression for year 2011 ...
Running regression for year 2012 ...
Running regression for year 2013 ...
Running regression for year 2014 ...
Running regression for year 2015 ...
Running regression for year 2016 ...
Running regression for year 2017 ...
Running regression for year 2018 ...
Running regression for year 2019 ...
Running regression for year 2020 ...
Running regression for year 2021 ...
Running regression for year 2022 ...
Running regression for year 2023 ...
Running regression for year 2024 ...
Display results
print(df_y)
Run annual regressions for each continent
continents <- c("Europe", "Asia", "Oceania", "Africa", "Latin America", "North America")
continent_results <- list()
for (cont in continents) {
cat("\n=== Running regressions for", cont, "===\n")
# Filter data for this continent
df_cont <- df_reg %>% filter(continent == cont, fyear <= 2024, fyr>0)
# Check if we have data for this continent
if (nrow(df_cont) > 0) {
continent_results[[cont]] <- annual_regression(df_cont, regmodel)
} else {
cat("No data available for", cont, "\n")
continent_results[[cont]] <- NULL
}
}
=== Running regressions for Europe ===
Running regression for year 2005 ...
Running regression for year 2006 ...
Running regression for year 2007 ...
Running regression for year 2008 ...
Running regression for year 2009 ...
Running regression for year 2010 ...
Running regression for year 2011 ...
Running regression for year 2012 ...
Running regression for year 2013 ...
Running regression for year 2014 ...
Running regression for year 2015 ...
Running regression for year 2016 ...
Running regression for year 2017 ...
Running regression for year 2018 ...
Running regression for year 2019 ...
Running regression for year 2020 ...
Running regression for year 2021 ...
Running regression for year 2022 ...
Running regression for year 2023 ...
Running regression for year 2024 ...
=== Running regressions for Asia ===
Running regression for year 2005 ...
Running regression for year 2006 ...
Running regression for year 2007 ...
Running regression for year 2008 ...
Running regression for year 2009 ...
Running regression for year 2010 ...
Running regression for year 2011 ...
Running regression for year 2012 ...
Running regression for year 2013 ...
Running regression for year 2014 ...
Running regression for year 2015 ...
Running regression for year 2016 ...
Running regression for year 2017 ...
Running regression for year 2018 ...
Running regression for year 2019 ...
Running regression for year 2020 ...
Running regression for year 2021 ...
Running regression for year 2022 ...
Running regression for year 2023 ...
Running regression for year 2024 ...
=== Running regressions for Oceania ===
Running regression for year 2005 ...
⚠️ Skipping year 2005 - insufficient observations ( 1 )
Running regression for year 2006 ...
Running regression for year 2007 ...
Running regression for year 2008 ...
Running regression for year 2009 ...
Running regression for year 2010 ...
Running regression for year 2011 ...
Running regression for year 2012 ...
Running regression for year 2013 ...
Running regression for year 2014 ...
Running regression for year 2015 ...
Running regression for year 2016 ...
Running regression for year 2017 ...
Running regression for year 2018 ...
Running regression for year 2019 ...
Running regression for year 2020 ...
Running regression for year 2021 ...
Running regression for year 2022 ...
Running regression for year 2023 ...
Running regression for year 2024 ...
=== Running regressions for Africa ===
Running regression for year 2005 ...
⚠️ Skipping year 2005 - insufficient observations ( 2 )
Running regression for year 2006 ...
Running regression for year 2007 ...
Running regression for year 2008 ...
Running regression for year 2009 ...
Running regression for year 2010 ...
Running regression for year 2011 ...
Running regression for year 2012 ...
Running regression for year 2013 ...
Running regression for year 2014 ...
Running regression for year 2015 ...
Running regression for year 2016 ...
Running regression for year 2017 ...
Running regression for year 2018 ...
Running regression for year 2019 ...
Running regression for year 2020 ...
Running regression for year 2021 ...
Running regression for year 2022 ...
Running regression for year 2023 ...
Running regression for year 2024 ...
=== Running regressions for Latin America ===
Running regression for year 2006 ...
Running regression for year 2007 ...
Running regression for year 2008 ...
Running regression for year 2009 ...
Running regression for year 2010 ...
Running regression for year 2011 ...
Running regression for year 2012 ...
Running regression for year 2013 ...
Running regression for year 2014 ...
Running regression for year 2015 ...
Running regression for year 2016 ...
Running regression for year 2017 ...
Running regression for year 2018 ...
Running regression for year 2019 ...
Running regression for year 2020 ...
Running regression for year 2021 ...
Running regression for year 2022 ...
Running regression for year 2023 ...
Running regression for year 2024 ...
=== Running regressions for North America ===
Running regression for year 1981 ...
Running regression for year 1982 ...
Running regression for year 1983 ...
Running regression for year 1984 ...
Running regression for year 1985 ...
Running regression for year 1986 ...
Running regression for year 1987 ...
Running regression for year 1988 ...
Running regression for year 1989 ...
Running regression for year 1990 ...
Running regression for year 1991 ...
Running regression for year 1992 ...
Running regression for year 1993 ...
Running regression for year 1994 ...
Running regression for year 1995 ...
Running regression for year 1996 ...
Running regression for year 1997 ...
Running regression for year 1998 ...
Running regression for year 1999 ...
Running regression for year 2000 ...
Running regression for year 2001 ...
Running regression for year 2002 ...
Running regression for year 2003 ...
Running regression for year 2004 ...
Running regression for year 2005 ...
Running regression for year 2006 ...
Running regression for year 2007 ...
Running regression for year 2008 ...
Running regression for year 2009 ...
Running regression for year 2010 ...
Running regression for year 2011 ...
Running regression for year 2012 ...
Running regression for year 2013 ...
Running regression for year 2014 ...
Running regression for year 2015 ...
Running regression for year 2016 ...
Running regression for year 2017 ...
Running regression for year 2018 ...
Running regression for year 2019 ...
Running regression for year 2020 ...
Running regression for year 2021 ...
Running regression for year 2022 ...
Running regression for year 2023 ...
Running regression for year 2024 ...
# Access results
df_eu_loop <- continent_results[["Europe"]]
df_as_loop <- continent_results[["Asia"]]
df_oc_loop <- continent_results[["Oceania"]]
df_af_loop <- continent_results[["Africa"]]
df_latam_loop <- continent_results[["Latin America"]]
df_usacan_loop <- continent_results[["North America"]]
Plot line function
plot_lines <- function(df_y, lyst, region) {
# Check if df_y is NULL or empty
if (is.null(df_y) || nrow(df_y) == 0) {
cat("No data available for", region, "\n")
return(NULL)
}
# Convert rownames to numeric years for plotting
df_y$fyear <- as.numeric(rownames(df_y))
# Select only the variables we want to plot
plot_data <- df_y[c("fyear", lyst)]
# Remove rows with all NA values in the variables to plot
plot_data <- plot_data[rowSums(is.na(plot_data[lyst])) < length(lyst), ]
if (nrow(plot_data) == 0) {
cat("No valid data to plot for", region, "\n")
return(NULL)
}
# Reshape data for ggplot (wide to long format)
plot_data_long <- plot_data %>%
pivot_longer(cols = all_of(lyst),
names_to = "variable",
values_to = "value") %>%
filter(!is.na(value))
# Determine x-axis breaks based on number of years
n_years <- length(unique(plot_data$fyear))
if (n_years > 15) {
x_breaks <- seq(min(plot_data$fyear), max(plot_data$fyear), by = 2)
} else {
x_breaks <- sort(unique(plot_data$fyear))
}
# Create the plot
p <- ggplot(plot_data_long, aes(x = fyear, y = value, color = variable)) +
geom_line(linewidth = 1, alpha = 0.8) +
geom_point(size = 1.25, alpha = 0.7) +
labs(title = paste("Coefficient Evolution -", region),
x = "Year",
y = "Coefficient Value",
color = "Variable: ") +
theme_minimal() +
theme(
plot.title = element_text(size = 14, hjust = 0.5),
panel.grid.major = element_line(linetype = "dashed", linewidth = 0.3),
panel.grid.minor = element_line(linetype = "dotted", linewidth = 0.2),
axis.text.x = element_text(angle = 45, hjust = 1, size = 9),
axis.text.y = element_text(size = 9),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 9),
plot.margin = margin(10, 20, 10, 10)
) +
scale_x_continuous(breaks = x_breaks, labels = x_breaks) +
scale_color_manual(
values = matplotlib_colors,
labels = c("ln_abs_bv_0_usd" = "BV_0",
"ln_abs_ni_usd" = "NI",
"ln_abs_oth_usd" = "Other",
"sum_slopes" = "SumSlopes")
)
return(p)
}
Plot coefficients for each continent
plot_variables <- c("ln_abs_bv_0_usd", "ln_abs_ni_usd", "ln_abs_oth_usd" , "sum_slopes")
df_eu_filtered <- df_eu_loop[as.numeric(rownames(df_eu_loop)) >= 2008, ]
plot_lines(df_y, plot_variables, region = 'Global')
plot_lines(df_y, plot_variables, region = 'Global')
plot_lines(df_eu_loop, plot_variables, region = 'Europe')
plot_lines(df_eu_filtered, plot_variables, region = 'Europe')
plot_lines(df_as_loop, plot_variables, region = 'Asia')
plot_lines(df_oc_loop, plot_variables, region = 'Oceania')
plot_lines(df_af_loop, plot_variables, region = 'Africa')
plot_lines(df_latam_loop, plot_variables, region = 'Latin America')
plot_lines(df_usacan_loop, plot_variables, region = 'North America')
NA
NA
NA
regression_by_leverage_decile <- function(df, regmodel, copy_to_clipboard = FALSE) {
# Drop rows with missing decile information
df_valid <- df %>% filter(!is.na(leverage_decile))
# Get unique deciles
deciles <- sort(unique(df_valid$leverage_decile))
# Initialize list to store results
results_by_decile <- list()
# Loop through each decile
for (decile in deciles) {
cat("Running regression for leverage decile:", decile, "\n")
# Filter data for this decile
df_sub <- df_valid %>% filter(leverage_decile == decile)
# Check minimum observations
if (nrow(df_sub) < 50) {
cat("Skipping decile", decile, "(only", nrow(df_sub), "observations)\n")
next
}
# Try to run regression
tryCatch({
# Run regression with clustering by gvkey
model <- feols(as.formula(regmodel),
data = df_sub,
cluster = "gvkey")
# Extract coefficients
coeffs <- coef(model)
# Create a data frame with coefficients
tidy_row <- data.frame(
leverage_decile = decile,
t(coeffs),
stringsAsFactors = FALSE
)
# Store the result
results_by_decile[[length(results_by_decile) + 1]] <- tidy_row
}, error = function(e) {
cat("Error in decile", decile, ":", e$message, "\n")
})
}
# Combine results if any exist
if (length(results_by_decile) > 0) {
# Bind all rows together
df_decile_results <- do.call(rbind, results_by_decile)
# Set leverage_decile as row names and remove the column
rownames(df_decile_results) <- df_decile_results$leverage_decile
df_decile_results$leverage_decile <- NULL
# Calculate sum of slopes (excluding intercept - handle different intercept names)
# Check what the actual intercept column name is
intercept_cols <- names(df_decile_results)[grepl("Intercept", names(df_decile_results), ignore.case = TRUE)]
slope_cols <- setdiff(names(df_decile_results), intercept_cols)
if (length(slope_cols) > 0) {
df_decile_results$sum_slopes <- rowSums(df_decile_results[slope_cols], na.rm = TRUE)
}
# Copy to clipboard if requested
if (copy_to_clipboard) {
# Add model and year information
df_copy <- df_decile_results
df_copy <- cbind(
Model = regmodel,
Year = paste(range(df_valid$fyear, na.rm = TRUE), collapse = "-"),
df_copy
)
write.table(df_copy, "clipboard", sep = "\t", row.names = TRUE)
cat("Results copied to clipboard!\n")
}
return(df_decile_results)
} else {
cat("❌ No valid regressions could be run.\n")
return(NULL)
}
}
df_l <- regression_by_leverage_decile(df_reg %>% filter(fyear <= 2024), regmodel)
Running regression for leverage decile: 0
Running regression for leverage decile: 1
Running regression for leverage decile: 2
Running regression for leverage decile: 3
Running regression for leverage decile: 4
Running regression for leverage decile: 5
Running regression for leverage decile: 6
Running regression for leverage decile: 7
Running regression for leverage decile: 8
Running regression for leverage decile: 9
Plotting function for leverage decile regression results
plot_leverage_deciles <- function(df_leverage, lyst, title_suffix = "") {
# Check if df_leverage is NULL or empty
if (is.null(df_leverage) || nrow(df_leverage) == 0) {
cat("No data available for leverage decile plot\n")
return(NULL)
}
# Convert rownames to numeric deciles for plotting
df_leverage$leverage_decile <- as.numeric(rownames(df_leverage))
# Select only the variables we want to plot
plot_data <- df_leverage[c("leverage_decile", lyst)]
# Remove rows with all NA values in the variables to plot
plot_data <- plot_data[rowSums(is.na(plot_data[lyst])) < length(lyst), ]
if (nrow(plot_data) == 0) {
cat("No valid data to plot for leverage deciles\n")
return(NULL)
}
# Reshape data for ggplot (wide to long format)
plot_data_long <- plot_data %>%
pivot_longer(cols = all_of(lyst),
names_to = "variable",
values_to = "value") %>%
filter(!is.na(value))
# Create the plot
p <- ggplot(plot_data_long, aes(x = leverage_decile, y = value, color = variable)) +
geom_line(linewidth = 1, alpha = 0.8) +
geom_point(size = 1.25, alpha = 0.7) +
labs(title = paste("Coefficient Evolution by Leverage Decile", title_suffix),
x = "Leverage Decile",
y = "Coefficient Value",
color = "Variable") +
theme_minimal() +
theme(
plot.title = element_text(size = 14, hjust = 0.5),
panel.grid.major = element_line(linetype = "dashed", linewidth = 0.3),
panel.grid.minor = element_line(linetype = "dotted", linewidth = 0.2),
axis.text.x = element_text(size = 9),
axis.text.y = element_text(size = 9),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 9),
plot.margin = margin(10, 20, 10, 10)
) +
scale_x_continuous(breaks = sort(unique(plot_data$leverage_decile)),
labels = sort(unique(plot_data$leverage_decile))) +
scale_color_manual(
values = matplotlib_colors,
labels = c("ln_abs_bv_0_usd" = "BV_0",
"ln_abs_ni_usd" = "NI",
"ln_abs_oth_usd" = "Other",
"sum_slopes" = "SumSlopes")
)
return(p)
}
# For your specified variables (if they exist in the leverage regression)
plot_leverage_deciles(df_l, plot_variables)