p_raw <- ggplot(df_firms, aes(x = Total_Assets, y = RD_Expenditure)) +
geom_point(alpha = 0.6, colour = "#1D4ED8", size = 2) +
geom_smooth(method = "lm", colour = "#93C5FD", fill = "#BFDBFE", se = TRUE, linewidth = 1) +
labs(
title = "Total Assets vs R&D Expenditure (raw scale)",
subtitle = "Linear fit clearly misses the curvature",
x = "Total Assets ($ millions)",
y = "R&D Expenditure ($ millions)"
) +
theme_minimal(base_size = 13)
p_log <- ggplot(df_firms, aes(x = log(Total_Assets), y = log(RD_Expenditure))) +
geom_point(alpha = 0.6, colour = "#2563EB", size = 2) +
geom_smooth(method = "lm", colour = "#60A5FA", fill = "#BFDBFE", se = TRUE, linewidth = 1) +
labs(
title = "log(Total Assets) vs log(R&D Expenditure)",
subtitle = "Log-log scale reveals a strong linear relationship",
x = "log(Total Assets)",
y = "log(R&D Expenditure)"
) +
theme_minimal(base_size = 13)
grid.arrange(p_raw, p_log, ncol = 1)##
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -135.79 -42.06 -12.37 25.08 404.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.50788 11.25914 3.598 0.000405 ***
## Total_Assets 0.35091 0.03731 9.405 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared: 0.3088, Adjusted R-squared: 0.3053
## F-statistic: 88.46 on 1 and 198 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(model_raw, which = 1:4, col = "#2563EB", pch = 20, cex = 0.8)##
## Shapiro-Wilk normality test
##
## data: residuals(model_raw)
## W = 0.87801, p-value = 1.231e-11
## $statistic
## [1] 13.29769
##
## $p.value
## [1] 0.0002657328
Diagnosis summary:
| Test | Result | Interpretation |
|---|---|---|
| Shapiro-Wilk | p < 0.001 | Residuals are not normally distributed |
| Breusch-Pagan | p < 0.001 | Significant heteroscedasticity detected |
Both OLS assumptions are violated — transformation is needed.
The Box-Cox family transforms the response as:
\[Y^{(\lambda)} = \begin{cases} \frac{Y^\lambda - 1}{\lambda} & \lambda \neq 0 \\ \ln(Y) & \lambda = 0 \end{cases}\]
# boxcox() requires a fitted lm object
bc <- boxcox(model_raw, lambda = seq(-2, 2, by = 0.05), plotit = TRUE)
title(main = "Box-Cox Log-Likelihood Profile", col.main = "firebrick")# Extract the optimal lambda
lambda_opt <- bc$x[which.max(bc$y)]
cat(sprintf("Optimal lambda (λ) = %.4f\n", lambda_opt))## Optimal lambda (λ) = 0.1818
cat(sprintf("95%% CI: [%.4f, %.4f]\n",
bc$x[bc$y > max(bc$y) - qchisq(0.95, 1)/2][1],
tail(bc$x[bc$y > max(bc$y) - qchisq(0.95, 1)/2], 1)))## 95% CI: [0.0606, 0.3030]
cat(sprintf(
"\nOptimal λ ≈ %.3f\nSince the 95%% CI includes 0, a LOG transformation (λ=0) is appropriate.\n",
lambda_opt
))##
## Optimal λ ≈ 0.182
## Since the 95% CI includes 0, a LOG transformation (λ=0) is appropriate.
df_firms <- df_firms %>%
mutate(log_RD = log(RD_Expenditure),
log_Assets = log(Total_Assets))
model_log <- lm(log_RD ~ log_Assets, data = df_firms)
summary(model_log)##
## Call:
## lm(formula = log_RD ~ log_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.31707 -0.31284 -0.00069 0.30462 1.38376
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4356 0.2100 6.837 9.82e-11 ***
## log_Assets 0.6075 0.0389 15.616 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4815 on 198 degrees of freedom
## Multiple R-squared: 0.5519, Adjusted R-squared: 0.5496
## F-statistic: 243.9 on 1 and 198 DF, p-value: < 2.2e-16
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
plot(model_log, which = 1:4, col = "#1D4ED8", pch = 20, cex = 0.8)##
## Shapiro-Wilk normality test
##
## data: residuals(model_log)
## W = 0.99476, p-value = 0.7145
## $statistic
## [1] 0.009577446
##
## $p.value
## [1] 0.9220399
sw_raw <- shapiro.test(residuals(model_raw))
sw_log <- shapiro.test(residuals(model_log))
bp_raw <- bp_test(model_raw)
bp_log <- bp_test(model_log)
comparison <- data.frame(
Metric = c(
"R-squared",
"Adj. R-squared",
"Residual Std Error",
"Shapiro-Wilk p-value",
"Breusch-Pagan p-value"
),
Raw_OLS = c(
round(summary(model_raw)$r.squared, 4),
round(summary(model_raw)$adj.r.squared, 4),
round(summary(model_raw)$sigma, 4),
formatC(sw_raw$p.value, format = "e", digits = 3),
formatC(bp_raw$p.value, format = "e", digits = 3)
),
Log_Transformed = c(
round(summary(model_log)$r.squared, 4),
round(summary(model_log)$adj.r.squared, 4),
round(summary(model_log)$sigma, 4),
formatC(sw_log$p.value, format = "e", digits = 3),
formatC(bp_log$p.value, format = "e", digits = 3)
)
)
knitr::kable(comparison,
col.names = c("Metric", "Raw OLS", "Log-Transformed"),
align = c("l", "c", "c"),
caption = "Model Comparison: Raw OLS vs Log-Transformed")| Metric | Raw OLS | Log-Transformed |
|---|---|---|
| R-squared | 0.3088 | 0.5519 |
| Adj. R-squared | 0.3053 | 0.5496 |
| Residual Std Error | 75.3118 | 0.4815 |
| Shapiro-Wilk p-value | 1.231e-11 | 7.145e-01 |
| Breusch-Pagan p-value | 2.657e-04 | 9.220e-01 |
df_plot <- data.frame(
Actual = df_firms$log_RD,
Fitted = fitted(model_log),
Residuals = residuals(model_log)
)
p_fit <- ggplot(df_plot, aes(x = Fitted, y = Actual)) +
geom_point(alpha = 0.5, colour = "#1D4ED8", size = 2) +
geom_abline(intercept = 0, slope = 1, colour = "#93C5FD",
linewidth = 1, linetype = "dashed") +
labs(title = "Actual vs Fitted — Log-Transformed Model",
subtitle = sprintf("R² = %.4f", summary(model_log)$r.squared),
x = "Fitted log(R&D)",
y = "Actual log(R&D)") +
theme_minimal(base_size = 13)
p_res <- ggplot(df_plot, aes(x = Fitted, y = Residuals)) +
geom_point(alpha = 0.5, colour = "#60A5FA", size = 2) +
geom_hline(yintercept = 0, colour = "#1E40AF",
linewidth = 1, linetype = "dashed") +
labs(title = "Residuals vs Fitted — Log-Transformed Model",
subtitle = "Residuals are now homoscedastic",
x = "Fitted log(R&D)",
y = "Residuals") +
theme_minimal(base_size = 13)
grid.arrange(p_fit, p_res, ncol = 2)## ── Model Coefficients (log-log model) ──────────────────────────────────
## Intercept : 1.4356 (true value: 1.5)
## log_Assets: 0.6075 (true value: 0.6)
##
## ── Interpretation ──────────────────────────────────────────────────────
## A 1% increase in Total Assets is associated with a 83.58% increase
## in R&D Expenditure (elasticity ≈ 0.6).
##
## ── Diagnostics after transformation ────────────────────────────────────
## Shapiro-Wilk p = 0.7145 → Residuals are NOW normally distributed ✓
## Breusch-Pagan p = 0.9220 → Homoscedasticity is NOW satisfied ✓
## R-squared = 0.5519 → Model explains 55.2% of variance in log(R&D)
## R version 4.3.3 (2024-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.6 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.8.so; LAPACK version 3.9.0
##
## locale:
## [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8
## [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8
## [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C
## [10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C
##
## time zone: UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] gridExtra_2.3 dplyr_1.1.2 MASS_7.3-60.0.1 ggplot2_3.4.2
##
## loaded via a namespace (and not attached):
## [1] Matrix_1.6-5 gtable_0.3.6 jsonlite_1.8.7 highr_0.10
## [5] compiler_4.3.3 tidyselect_1.2.0 jquerylib_0.1.4 splines_4.3.3
## [9] scales_1.2.1 yaml_2.3.7 fastmap_1.1.1 lattice_0.22-5
## [13] R6_2.5.1 labeling_0.4.2 generics_0.1.3 knitr_1.43
## [17] tibble_3.2.1 munsell_0.5.0 bslib_0.5.0 pillar_1.9.0
## [21] rlang_1.1.1 utf8_1.2.3 cachem_1.0.8 xfun_0.40
## [25] sass_0.4.7 cli_3.6.1 withr_2.5.0 magrittr_2.0.3
## [29] mgcv_1.9-1 digest_0.6.33 grid_4.3.3 rstudioapi_0.15.0
## [33] lifecycle_1.0.3 nlme_3.1-164 vctrs_0.6.3 evaluate_0.21
## [37] glue_1.6.2 farver_2.1.1 fansi_1.0.4 colorspace_2.1-0
## [41] rmarkdown_2.23 tools_4.3.3 pkgconfig_2.0.3 htmltools_0.5.6