library(MASS)
library(ggplot2)
library(lmtest)

1 Prerequisites

set.seed(42)
n <- 200
firm_size <- runif(n, 10, 500)
error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)

df_firms <- data.frame(
  Firm_ID        = 1:n,
  Total_Assets   = firm_size,
  RD_Expenditure = rd_expenditure
)

head(df_firms)
##   Firm_ID Total_Assets RD_Expenditure
## 1       1     458.2550      322.76939
## 2       2     469.1670      302.76313
## 3       3     150.2084       54.90529
## 4       4     416.9193      421.56611
## 5       5     324.4553      103.12089
## 6       6     264.3570      134.17397

2 Visualize

2.1 Total Assets vs R&D Expenditure

ggplot(df_firms, aes(x = Total_Assets, y = RD_Expenditure)) +
  geom_point(color = "steelblue", alpha = 0.6, size = 2) +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(
    title = "Total Assets vs R&D Expenditure",
    x     = "Total Assets (Millions)",
    y     = "R&D Expenditure (Millions)"
  ) +
  theme_minimal()

The scatter plot reveals a non-linear, right-skewed relationship, with variance increasing as firm size grows — a classic sign of heteroscedasticity.

3 Diagnose

3.1 OLS Regression

model_ols <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
summary(model_ols)
## 
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.50788   11.25914   3.598 0.000405 ***
## Total_Assets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16

3.2 Residual Diagnostics

par(mfrow = c(2, 2))
plot(model_ols, col = "steelblue", pch = 19, cex = 0.6)

par(mfrow = c(1, 1))
shapiro.test(residuals(model_ols))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_ols)
## W = 0.87801, p-value = 1.231e-11

Interpretation:

  • Residuals vs Fitted: Fan-shaped pattern → heteroscedasticity present.
  • Q-Q Plot: Residuals deviate from the diagonal → non-normality confirmed.
  • Shapiro-Wilk: p-value < 0.05 → residuals are not normally distributed.

4 Transform

4.1 Box-Cox: Finding Optimal λ

bc <- boxcox(model_ols, lambda = seq(-2, 2, by = 0.1))

lambda_opt <- bc$x[which.max(bc$y)]
cat("Optimal lambda:", lambda_opt, "\n")
## Optimal lambda: 0.1818182

The 95% confidence interval for λ includes 0, confirming a log transformation is appropriate.

5 Refine

5.1 Transformed Model

if (abs(lambda_opt) < 0.05) {
  df_firms$RD_transformed <- log(df_firms$RD_Expenditure)
  transform_label <- "log(RD_Expenditure)"
} else {
  df_firms$RD_transformed <- (df_firms$RD_Expenditure^lambda_opt - 1) / lambda_opt
  transform_label <- paste0("BoxCox(RD_Expenditure, λ=", round(lambda_opt, 2), ")")
}

model_transformed <- lm(RD_transformed ~ Total_Assets, data = df_firms)
summary(model_transformed)
## 
## Call:
## lm(formula = RD_transformed ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6671 -0.9255 -0.0303  0.7700  3.2087 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.4627033  0.1839737   29.69   <2e-16 ***
## Total_Assets 0.0075331  0.0006096   12.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.231 on 198 degrees of freedom
## Multiple R-squared:  0.4354, Adjusted R-squared:  0.4325 
## F-statistic: 152.7 on 1 and 198 DF,  p-value: < 2.2e-16

5.2 Residual Diagnostics (Transformed)

par(mfrow = c(2, 2))
plot(model_transformed, col = "darkorange", pch = 19, cex = 0.6)

par(mfrow = c(1, 1))
shapiro.test(residuals(model_transformed))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_transformed)
## W = 0.98986, p-value = 0.1707

5.3 Model Comparison

cat("=== OLS Model (Untransformed) ===\n")
## === OLS Model (Untransformed) ===
cat("R-squared:      ", round(summary(model_ols)$r.squared, 4), "\n")
## R-squared:       0.3088
cat("Adj. R-squared: ", round(summary(model_ols)$adj.r.squared, 4), "\n")
## Adj. R-squared:  0.3053
cat("Shapiro-Wilk p: ", round(shapiro.test(residuals(model_ols))$p.value, 6), "\n\n")
## Shapiro-Wilk p:  0
cat("=== Transformed Model ===\n")
## === Transformed Model ===
cat("R-squared:      ", round(summary(model_transformed)$r.squared, 4), "\n")
## R-squared:       0.4354
cat("Adj. R-squared: ", round(summary(model_transformed)$adj.r.squared, 4), "\n")
## Adj. R-squared:  0.4325
cat("Shapiro-Wilk p: ", round(shapiro.test(residuals(model_transformed))$p.value, 6), "\n")
## Shapiro-Wilk p:  0.170673
Metric OLS (Raw) Transformed
Lower Higher
Residual Normality Violated Satisfied
Heteroscedasticity Present Largely resolved

The Box-Cox transformation (λ ≈ 0, i.e., log) substantially improves model fit, corrects non-normality, and reduces heteroscedasticity.