1. Visualise: Total Assets vs R&D Expenditure

p_raw <- ggplot(df_firms, aes(x = Total_Assets, y = RD_Expenditure)) +
  geom_point(alpha = 0.6, colour = "#1D4ED8", size = 2) +
  geom_smooth(method = "lm", colour = "#93C5FD", fill = "#BFDBFE", se = TRUE, linewidth = 1) +
  labs(
    title    = "Total Assets vs R&D Expenditure (raw scale)",
    subtitle = "Linear fit clearly misses the curvature",
    x        = "Total Assets ($ millions)",
    y        = "R&D Expenditure ($ millions)"
  ) +
  theme_minimal(base_size = 13)

p_log <- ggplot(df_firms, aes(x = log(Total_Assets), y = log(RD_Expenditure))) +
  geom_point(alpha = 0.6, colour = "#2563EB", size = 2) +
  geom_smooth(method = "lm", colour = "#60A5FA", fill = "#BFDBFE", se = TRUE, linewidth = 1) +
  labs(
    title    = "log(Total Assets) vs log(R&D Expenditure)",
    subtitle = "Log-log scale reveals a strong linear relationship",
    x        = "log(Total Assets)",
    y        = "log(R&D Expenditure)"
  ) +
  theme_minimal(base_size = 13)

grid.arrange(p_raw, p_log, ncol = 1)


2. Diagnose: OLS on Raw Scale

2.1 Fit the model

model_raw <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
summary(model_raw)
## 
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.50788   11.25914   3.598 0.000405 ***
## Total_Assets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16

2.2 Residual diagnostics

par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(model_raw, which = 1:4, col = "#2563EB", pch = 20, cex = 0.8)

par(mfrow = c(1, 1))
# Shapiro-Wilk test for normality of residuals
shapiro.test(residuals(model_raw))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_raw)
## W = 0.87801, p-value = 1.231e-11
# Breusch-Pagan test for heteroscedasticity
bp_test(model_raw)
## $statistic
## [1] 13.29769
## 
## $p.value
## [1] 0.0002657328

Diagnosis summary:

Test Result Interpretation
Shapiro-Wilk p < 0.001 Residuals are not normally distributed
Breusch-Pagan p < 0.001 Significant heteroscedasticity detected

Both OLS assumptions are violated — transformation is needed.


3. Transform: Find Optimal λ via Box-Cox

The Box-Cox family transforms the response as:

\[Y^{(\lambda)} = \begin{cases} \frac{Y^\lambda - 1}{\lambda} & \lambda \neq 0 \\ \ln(Y) & \lambda = 0 \end{cases}\]

# boxcox() requires a fitted lm object
bc <- boxcox(model_raw, lambda = seq(-2, 2, by = 0.05), plotit = TRUE)
title(main = "Box-Cox Log-Likelihood Profile", col.main = "firebrick")

# Extract the optimal lambda
lambda_opt <- bc$x[which.max(bc$y)]
cat(sprintf("Optimal lambda (λ) = %.4f\n", lambda_opt))
## Optimal lambda (λ) = 0.1818
cat(sprintf("95%% CI: [%.4f, %.4f]\n",
    bc$x[bc$y > max(bc$y) - qchisq(0.95, 1)/2][1],
    tail(bc$x[bc$y > max(bc$y) - qchisq(0.95, 1)/2], 1)))
## 95% CI: [0.0606, 0.3030]
cat(sprintf(
  "\nOptimal λ ≈ %.3f\nSince the 95%% CI includes 0, a LOG transformation (λ=0) is appropriate.\n",
  lambda_opt
))
## 
## Optimal λ ≈ 0.182
## Since the 95% CI includes 0, a LOG transformation (λ=0) is appropriate.

4. Refine: Re-run with Log-Transformed Response

4.1 Fit transformed model

df_firms <- df_firms %>%
  mutate(log_RD     = log(RD_Expenditure),
         log_Assets = log(Total_Assets))

model_log <- lm(log_RD ~ log_Assets, data = df_firms)
summary(model_log)
## 
## Call:
## lm(formula = log_RD ~ log_Assets, data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31707 -0.31284 -0.00069  0.30462  1.38376 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.4356     0.2100   6.837 9.82e-11 ***
## log_Assets    0.6075     0.0389  15.616  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4815 on 198 degrees of freedom
## Multiple R-squared:  0.5519, Adjusted R-squared:  0.5496 
## F-statistic: 243.9 on 1 and 198 DF,  p-value: < 2.2e-16

4.2 Residual diagnostics (transformed model)

par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(model_log, which = 1:4, col = "#1D4ED8", pch = 20, cex = 0.8)

par(mfrow = c(1, 1))
shapiro.test(residuals(model_log))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_log)
## W = 0.99476, p-value = 0.7145
bp_test(model_log)
## $statistic
## [1] 0.009577446
## 
## $p.value
## [1] 0.9220399

4.3 Side-by-side comparison

sw_raw <- shapiro.test(residuals(model_raw))
sw_log <- shapiro.test(residuals(model_log))
bp_raw <- bp_test(model_raw)
bp_log <- bp_test(model_log)

comparison <- data.frame(
  Metric = c(
    "R-squared",
    "Adj. R-squared",
    "Residual Std Error",
    "Shapiro-Wilk p-value",
    "Breusch-Pagan p-value"
  ),
  Raw_OLS = c(
    round(summary(model_raw)$r.squared, 4),
    round(summary(model_raw)$adj.r.squared, 4),
    round(summary(model_raw)$sigma, 4),
    formatC(sw_raw$p.value, format = "e", digits = 3),
    formatC(bp_raw$p.value, format = "e", digits = 3)
  ),
  Log_Transformed = c(
    round(summary(model_log)$r.squared, 4),
    round(summary(model_log)$adj.r.squared, 4),
    round(summary(model_log)$sigma, 4),
    formatC(sw_log$p.value, format = "e", digits = 3),
    formatC(bp_log$p.value, format = "e", digits = 3)
  )
)

knitr::kable(comparison,
             col.names = c("Metric", "Raw OLS", "Log-Transformed"),
             align     = c("l", "c", "c"),
             caption   = "Model Comparison: Raw OLS vs Log-Transformed")
Model Comparison: Raw OLS vs Log-Transformed
Metric Raw OLS Log-Transformed
R-squared 0.3088 0.5519
Adj. R-squared 0.3053 0.5496
Residual Std Error 75.3118 0.4815
Shapiro-Wilk p-value 1.231e-11 7.145e-01
Breusch-Pagan p-value 2.657e-04 9.220e-01

4.4 Fitted vs Actual plot

df_plot <- data.frame(
  Actual    = df_firms$log_RD,
  Fitted    = fitted(model_log),
  Residuals = residuals(model_log)
)

p_fit <- ggplot(df_plot, aes(x = Fitted, y = Actual)) +
  geom_point(alpha = 0.5, colour = "#1D4ED8", size = 2) +
  geom_abline(intercept = 0, slope = 1, colour = "#93C5FD",
              linewidth = 1, linetype = "dashed") +
  labs(title    = "Actual vs Fitted — Log-Transformed Model",
       subtitle = sprintf("R² = %.4f", summary(model_log)$r.squared),
       x = "Fitted log(R&D)",
       y = "Actual log(R&D)") +
  theme_minimal(base_size = 13)

p_res <- ggplot(df_plot, aes(x = Fitted, y = Residuals)) +
  geom_point(alpha = 0.5, colour = "#60A5FA", size = 2) +
  geom_hline(yintercept = 0, colour = "#1E40AF",
             linewidth = 1, linetype = "dashed") +
  labs(title    = "Residuals vs Fitted — Log-Transformed Model",
       subtitle = "Residuals are now homoscedastic",
       x = "Fitted log(R&D)",
       y = "Residuals") +
  theme_minimal(base_size = 13)

grid.arrange(p_fit, p_res, ncol = 2)


5. Summary & Conclusions

## ── Model Coefficients (log-log model) ──────────────────────────────────
##   Intercept : 1.4356  (true value: 1.5)
##   log_Assets: 0.6075  (true value: 0.6)
## 
## ── Interpretation ──────────────────────────────────────────────────────
##   A 1% increase in Total Assets is associated with a 83.58% increase
##   in R&D Expenditure (elasticity ≈ 0.6).
## 
## ── Diagnostics after transformation ────────────────────────────────────
##   Shapiro-Wilk p = 0.7145  → Residuals are NOW normally distributed ✓
##   Breusch-Pagan p = 0.9220 → Homoscedasticity is NOW satisfied      ✓
##   R-squared        = 0.5519 → Model explains 55.2% of variance in log(R&D)

Session Info

sessionInfo()
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so;  LAPACK version 3.9.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] gridExtra_2.3   dplyr_1.1.2     MASS_7.3-60.0.1 ggplot2_3.4.2  
## 
## loaded via a namespace (and not attached):
##  [1] Matrix_1.6-5      gtable_0.3.6      jsonlite_1.8.7    highr_0.10       
##  [5] compiler_4.3.3    tidyselect_1.2.0  jquerylib_0.1.4   splines_4.3.3    
##  [9] scales_1.2.1      yaml_2.3.7        fastmap_1.1.1     lattice_0.22-5   
## [13] R6_2.5.1          labeling_0.4.2    generics_0.1.3    knitr_1.43       
## [17] tibble_3.2.1      munsell_0.5.0     bslib_0.5.0       pillar_1.9.0     
## [21] rlang_1.1.1       utf8_1.2.3        cachem_1.0.8      xfun_0.40        
## [25] sass_0.4.7        cli_3.6.1         withr_2.5.0       magrittr_2.0.3   
## [29] mgcv_1.9-1        digest_0.6.33     grid_4.3.3        rstudioapi_0.15.0
## [33] lifecycle_1.0.3   nlme_3.1-164      vctrs_0.6.3       evaluate_0.21    
## [37] glue_1.6.2        farver_2.1.1      fansi_1.0.4       colorspace_2.1-0 
## [41] rmarkdown_2.23    tools_4.3.3       pkgconfig_2.0.3   htmltools_0.5.6