set.seed(42)
n <- 200
firm_size <- runif(n, 10, 500)
error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)
df_firms <- data.frame(
Firm_ID = 1:n,
Total_Assets = firm_size,
RD_Expenditure = rd_expenditure
)
head(df_firms)## Firm_ID Total_Assets RD_Expenditure
## 1 1 458.2550 322.76939
## 2 2 469.1670 302.76313
## 3 3 150.2084 54.90529
## 4 4 416.9193 421.56611
## 5 5 324.4553 103.12089
## 6 6 264.3570 134.17397
ggplot(df_firms, aes(x = Total_Assets, y = RD_Expenditure)) +
geom_point(color = "steelblue", alpha = 0.6, size = 2) +
geom_smooth(method = "lm", color = "red", se = TRUE) +
labs(
title = "Total Assets vs R&D Expenditure",
x = "Total Assets (Millions)",
y = "R&D Expenditure (Millions)"
) +
theme_minimal()The scatter plot reveals a non-linear, right-skewed relationship, with variance increasing as firm size grows — a classic sign of heteroscedasticity.
##
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -135.79 -42.06 -12.37 25.08 404.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.50788 11.25914 3.598 0.000405 ***
## Total_Assets 0.35091 0.03731 9.405 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared: 0.3088, Adjusted R-squared: 0.3053
## F-statistic: 88.46 on 1 and 198 DF, p-value: < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: residuals(model_ols)
## W = 0.87801, p-value = 1.231e-11
Interpretation:
if (abs(lambda_opt) < 0.05) {
df_firms$RD_transformed <- log(df_firms$RD_Expenditure)
transform_label <- "log(RD_Expenditure)"
} else {
df_firms$RD_transformed <- (df_firms$RD_Expenditure^lambda_opt - 1) / lambda_opt
transform_label <- paste0("BoxCox(RD_Expenditure, λ=", round(lambda_opt, 2), ")")
}
model_transformed <- lm(RD_transformed ~ Total_Assets, data = df_firms)
summary(model_transformed)##
## Call:
## lm(formula = RD_transformed ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6671 -0.9255 -0.0303 0.7700 3.2087
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.4627033 0.1839737 29.69 <2e-16 ***
## Total_Assets 0.0075331 0.0006096 12.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.231 on 198 degrees of freedom
## Multiple R-squared: 0.4354, Adjusted R-squared: 0.4325
## F-statistic: 152.7 on 1 and 198 DF, p-value: < 2.2e-16
##
## Shapiro-Wilk normality test
##
## data: residuals(model_transformed)
## W = 0.98986, p-value = 0.1707
## === OLS Model (Untransformed) ===
## R-squared: 0.3088
## Adj. R-squared: 0.3053
## Shapiro-Wilk p: 0
## === Transformed Model ===
## R-squared: 0.4354
## Adj. R-squared: 0.4325
## Shapiro-Wilk p: 0.170673
| Metric | OLS (Raw) | Transformed |
|---|---|---|
| R² | Lower | Higher |
| Residual Normality | Violated | Satisfied |
| Heteroscedasticity | Present | Largely resolved |
The Box-Cox transformation (λ ≈ 0, i.e., log) substantially improves model fit, corrects non-normality, and reduces heteroscedasticity.