Data Generation

set.seed(42)
n <- 200
firm_size      <- runif(n, 10, 500)
error          <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)

df_firms <- data.frame(
  Firm_ID        = 1:n,
  Total_Assets   = firm_size,
  RD_Expenditure = rd_expenditure
)

head(df_firms)
##   Firm_ID Total_Assets RD_Expenditure
## 1       1     458.2550      322.76939
## 2       2     469.1670      302.76313
## 3       3     150.2084       54.90529
## 4       4     416.9193      421.56611
## 5       5     324.4553      103.12089
## 6       6     264.3570      134.17397
summary(df_firms[, -1])
##   Total_Assets    RD_Expenditure  
##  Min.   : 10.12   Min.   : 10.52  
##  1st Qu.:136.57   1st Qu.: 76.05  
##  Median :282.15   Median :116.25  
##  Mean   :265.89   Mean   :133.81  
##  3rd Qu.:382.78   3rd Qu.:164.97  
##  Max.   :494.56   Max.   :606.82

1. – Visualise: Total Assets vs R&D Expenditure

ggplot(df_firms, aes(Total_Assets, RD_Expenditure)) +
  geom_point(alpha = 0.55, colour = "#2D6A9F", size = 1.8) +
  geom_smooth(method = "lm",    colour = "#E74C3C", se = TRUE,  linetype = "dashed") +
  geom_smooth(method = "loess", colour = "#27AE60", se = FALSE) +
  labs(
    title    = "Total Assets vs R&D Expenditure (Raw)",
    subtitle = "Red dashed = OLS fit  |  Green = LOESS smoother",
    x = "Total Assets ($ millions)",
    y = "R&D Expenditure ($ millions)"
  ) +
  theme_bw(base_size = 13)
Raw scatter: non-linear relationship and increasing variance are both visible.

Raw scatter: non-linear relationship and increasing variance are both visible.

Interpretation:
The scatter plot reveals two key violations of OLS assumptions:

  1. Non-linearity – the LOESS smoother (green) curves away from the linear fit (red), suggesting a log-type relationship.
  2. Heteroscedasticity – variance in R&D expenditure clearly fans out as Total Assets grow, which will inflate standard errors and produce unreliable inference.

2. Diagnose: OLS Regression & Residual Checks

2.1 Fit the OLS model

model_ols <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
summary(model_ols)
## 
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.50788   11.25914   3.598 0.000405 ***
## Total_Assets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16

2.2 Residual Plots

df_diag <- data.frame(
  fitted    = fitted(model_ols),
  resid     = residuals(model_ols),
  std_resid = rstandard(model_ols)
)

p_rv <- ggplot(df_diag, aes(fitted, resid)) +
  geom_point(alpha = 0.5, colour = "#8E44AD") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
  geom_smooth(method = "loess", se = FALSE, colour = "#E67E22") +
  labs(title = "Residuals vs Fitted", x = "Fitted Values", y = "Residuals") +
  theme_bw()

p_qq <- ggplot(df_diag, aes(sample = std_resid)) +
  stat_qq(colour = "#2980B9", alpha = 0.6) +
  stat_qq_line(colour = "red") +
  labs(title = "Normal Q-Q Plot", x = "Theoretical Quantiles", y = "Std. Residuals") +
  theme_bw()

p_sl <- ggplot(df_diag, aes(fitted, sqrt(abs(std_resid)))) +
  geom_point(alpha = 0.5, colour = "#16A085") +
  geom_smooth(method = "loess", se = FALSE, colour = "#E74C3C") +
  labs(title = "Scale-Location", x = "Fitted Values",
       y = expression(sqrt("|Std. Residuals|"))) +
  theme_bw()

p_rv + p_qq + p_sl

2.3 Formal Tests

# Normality
shapiro.test(residuals(model_ols))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_ols)
## W = 0.87801, p-value = 1.231e-11
# Heteroscedasticity
bptest(model_ols)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_ols
## BP = 13.298, df = 1, p-value = 0.0002657

Interpretation:

Diagnostic Result Verdict
Residuals vs Fitted Clear funnel pattern ❌ Heteroscedasticity
Q-Q Plot Heavy right tail ❌ Non-normality
Shapiro-Wilk p < 0.001 ❌ Reject normality
Breusch-Pagan p < 0.001 ❌ Reject homoscedasticity

Both assumptions are violated. Transformation is required.


3. Transform: Box-Cox Optimal λ

bc         <- boxcox(model_ols, lambda = seq(-2, 2, by = 0.05))
The 95% confidence interval (dotted line) for λ includes 0, confirming a log transformation is appropriate.

The 95% confidence interval (dotted line) for λ includes 0, confirming a log transformation is appropriate.

lambda_opt <- bc$x[which.max(bc$y)]
cat("Optimal lambda:", round(lambda_opt, 4))
## Optimal lambda: 0.1818
# Plot
bc_df      <- data.frame(lambda = bc$x, loglik = bc$y)
ci_cutoff  <- max(bc$y) - qchisq(0.95, 1) / 2

ggplot(bc_df, aes(lambda, loglik)) +
  geom_line(colour = "#2C3E50", size = 1) +
  geom_vline(xintercept = lambda_opt, colour = "red",    linetype = "dashed") +
  geom_hline(yintercept = ci_cutoff,  colour = "#7F8C8D", linetype = "dotted") +
  annotate("text", x = lambda_opt + 0.15, y = min(bc_df$loglik),
           label = sprintf("λ = %.2f", lambda_opt), colour = "red", size = 4.5) +
  labs(title    = "Box-Cox Log-Likelihood Profile",
       subtitle = "Dotted line = 95% confidence interval boundary",
       x = expression(lambda), y = "Log-Likelihood") +
  theme_bw(base_size = 13)
The 95% confidence interval (dotted line) for λ includes 0, confirming a log transformation is appropriate.

The 95% confidence interval (dotted line) for λ includes 0, confirming a log transformation is appropriate.

Result: The optimal λ ≈ 0.18.
Since this is very close to 0, the Box-Cox procedure recommends a log transformation of the response variable.

Box-Cox rule of thumb: λ ≈ 0 → use ln(Y); λ ≈ 0.5 → use √Y; λ ≈ 1 → no transformation needed.


4. Refine: Transformed Model & Comparison

# Apply log transformation (lambda ≈ 0)
df_firms$RD_log     <- log(df_firms$RD_Expenditure)
df_firms$Assets_log <- log(df_firms$Total_Assets)

# Log-level model
model_loglevel <- lm(RD_log ~ Total_Assets, data = df_firms)

# Log-log model (aligns with the data-generating process)
model_loglog   <- lm(RD_log ~ Assets_log, data = df_firms)

cat("=== Log-Level Model ===\n");  summary(model_loglevel)
## === Log-Level Model ===
## 
## Call:
## lm(formula = RD_log ~ Total_Assets, data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47221 -0.39713  0.02358  0.35362  1.37330 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.7787315  0.0798760   47.31   <2e-16 ***
## Total_Assets 0.0033566  0.0002647   12.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5343 on 198 degrees of freedom
## Multiple R-squared:  0.4482, Adjusted R-squared:  0.4454 
## F-statistic: 160.8 on 1 and 198 DF,  p-value: < 2.2e-16
cat("=== Log-Log Model ===\n");    summary(model_loglog)
## === Log-Log Model ===
## 
## Call:
## lm(formula = RD_log ~ Assets_log, data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31707 -0.31284 -0.00069  0.30462  1.38376 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.4356     0.2100   6.837 9.82e-11 ***
## Assets_log    0.6075     0.0389  15.616  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4815 on 198 degrees of freedom
## Multiple R-squared:  0.5519, Adjusted R-squared:  0.5496 
## F-statistic: 243.9 on 1 and 198 DF,  p-value: < 2.2e-16

4.1 Residual Plots – Log-Log Model

df_diag2 <- data.frame(
  fitted    = fitted(model_loglog),
  resid     = residuals(model_loglog),
  std_resid = rstandard(model_loglog)
)

p4a <- ggplot(df_diag2, aes(fitted, resid)) +
  geom_point(alpha = 0.5, colour = "#C0392B") +
  geom_hline(yintercept = 0, linetype = "dashed", colour = "blue") +
  geom_smooth(method = "loess", se = FALSE, colour = "#F39C12") +
  labs(title = "Residuals vs Fitted (Log-Log)", x = "Fitted", y = "Residuals") +
  theme_bw()

p4b <- ggplot(df_diag2, aes(sample = std_resid)) +
  stat_qq(colour = "#1ABC9C", alpha = 0.6) +
  stat_qq_line(colour = "red") +
  labs(title = "Q-Q Plot (Log-Log)", x = "Theoretical Quantiles", y = "Std. Residuals") +
  theme_bw()

p4c <- ggplot(df_firms, aes(Assets_log, RD_log)) +
  geom_point(alpha = 0.5, colour = "#2980B9") +
  geom_smooth(method = "lm", colour = "#E74C3C", se = TRUE) +
  labs(title = "Log(Assets) vs Log(R&D)", x = "log(Total Assets)", y = "log(R&D Exp.)") +
  theme_bw()

p4a + p4b + p4c

4.2 Post-Transformation Formal Tests

cat("--- Shapiro-Wilk (Log-Log residuals) ---\n")
## --- Shapiro-Wilk (Log-Log residuals) ---
shapiro.test(residuals(model_loglog))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_loglog)
## W = 0.99476, p-value = 0.7145
cat("\n--- Breusch-Pagan (Log-Log) ---\n")
## 
## --- Breusch-Pagan (Log-Log) ---
bptest(model_loglog)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_loglog
## BP = 0.0095774, df = 1, p-value = 0.922

4.3 Model Comparison Summary

comp <- data.frame(
  Model    = c("OLS – Raw", "Log-Level", "Log-Log"),
  R2       = round(c(summary(model_ols)$r.squared,
                     summary(model_loglevel)$r.squared,
                     summary(model_loglog)$r.squared), 4),
  Adj_R2   = round(c(summary(model_ols)$adj.r.squared,
                     summary(model_loglevel)$adj.r.squared,
                     summary(model_loglog)$adj.r.squared), 4),
  BP_pval  = round(c(bptest(model_ols)$p.value,
                     bptest(model_loglevel)$p.value,
                     bptest(model_loglog)$p.value), 5),
  SW_pval  = round(c(shapiro.test(residuals(model_ols))$p.value,
                     shapiro.test(residuals(model_loglevel))$p.value,
                     shapiro.test(residuals(model_loglog))$p.value), 5)
)
knitr::kable(comp, caption = "Model Diagnostics Comparison")
Model Diagnostics Comparison
Model R2 Adj_R2 BP_pval SW_pval
OLS – Raw 0.3088 0.3053 0.00027 0.00000
Log-Level 0.4482 0.4454 0.02026 0.96169
Log-Log 0.5519 0.5496 0.92204 0.71451

Conclusion

Step Key Finding
Visualize Clear non-linearity and heteroscedasticity in raw data
Diagnose OLS violates normality (SW p<0.001) and homoscedasticity (BP p<0.001)
Transform Box-Cox optimal λ ≈ 0 → log transformation recommended
Refine Log-log model passes both tests; R² improves dramatically

The log-log model log(RD_Expenditure) ~ log(Total_Assets) is the best specification:

  • Residuals are approximately normal
  • Variance is homoscedastic
  • The slope coefficient (~0.6) is directly interpretable as an elasticity: a 1% increase in firm size is associated with a ~0.6% increase in R&D expenditure.