set.seed(42)
n <- 200
firm_size <- runif(n, 10, 500)
error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)
df_firms <- data.frame(
Firm_ID = 1:n,
Total_Assets = firm_size,
RD_Expenditure = rd_expenditure
)
head(df_firms)## Firm_ID Total_Assets RD_Expenditure
## 1 1 458.2550 322.76939
## 2 2 469.1670 302.76313
## 3 3 150.2084 54.90529
## 4 4 416.9193 421.56611
## 5 5 324.4553 103.12089
## 6 6 264.3570 134.17397
## Total_Assets RD_Expenditure
## Min. : 10.12 Min. : 10.52
## 1st Qu.:136.57 1st Qu.: 76.05
## Median :282.15 Median :116.25
## Mean :265.89 Mean :133.81
## 3rd Qu.:382.78 3rd Qu.:164.97
## Max. :494.56 Max. :606.82
ggplot(df_firms, aes(Total_Assets, RD_Expenditure)) +
geom_point(alpha = 0.55, colour = "#2D6A9F", size = 1.8) +
geom_smooth(method = "lm", colour = "#E74C3C", se = TRUE, linetype = "dashed") +
geom_smooth(method = "loess", colour = "#27AE60", se = FALSE) +
labs(
title = "Total Assets vs R&D Expenditure (Raw)",
subtitle = "Red dashed = OLS fit | Green = LOESS smoother",
x = "Total Assets ($ millions)",
y = "R&D Expenditure ($ millions)"
) +
theme_bw(base_size = 13)Raw scatter: non-linear relationship and increasing variance are both visible.
Interpretation:
The scatter plot reveals two key violations of OLS assumptions:
##
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -135.79 -42.06 -12.37 25.08 404.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.50788 11.25914 3.598 0.000405 ***
## Total_Assets 0.35091 0.03731 9.405 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared: 0.3088, Adjusted R-squared: 0.3053
## F-statistic: 88.46 on 1 and 198 DF, p-value: < 2.2e-16
df_diag <- data.frame(
fitted = fitted(model_ols),
resid = residuals(model_ols),
std_resid = rstandard(model_ols)
)
p_rv <- ggplot(df_diag, aes(fitted, resid)) +
geom_point(alpha = 0.5, colour = "#8E44AD") +
geom_hline(yintercept = 0, linetype = "dashed", colour = "red") +
geom_smooth(method = "loess", se = FALSE, colour = "#E67E22") +
labs(title = "Residuals vs Fitted", x = "Fitted Values", y = "Residuals") +
theme_bw()
p_qq <- ggplot(df_diag, aes(sample = std_resid)) +
stat_qq(colour = "#2980B9", alpha = 0.6) +
stat_qq_line(colour = "red") +
labs(title = "Normal Q-Q Plot", x = "Theoretical Quantiles", y = "Std. Residuals") +
theme_bw()
p_sl <- ggplot(df_diag, aes(fitted, sqrt(abs(std_resid)))) +
geom_point(alpha = 0.5, colour = "#16A085") +
geom_smooth(method = "loess", se = FALSE, colour = "#E74C3C") +
labs(title = "Scale-Location", x = "Fitted Values",
y = expression(sqrt("|Std. Residuals|"))) +
theme_bw()
p_rv + p_qq + p_sl##
## Shapiro-Wilk normality test
##
## data: residuals(model_ols)
## W = 0.87801, p-value = 1.231e-11
##
## studentized Breusch-Pagan test
##
## data: model_ols
## BP = 13.298, df = 1, p-value = 0.0002657
Interpretation:
| Diagnostic | Result | Verdict |
|---|---|---|
| Residuals vs Fitted | Clear funnel pattern | ❌ Heteroscedasticity |
| Q-Q Plot | Heavy right tail | ❌ Non-normality |
| Shapiro-Wilk | p < 0.001 | ❌ Reject normality |
| Breusch-Pagan | p < 0.001 | ❌ Reject homoscedasticity |
Both assumptions are violated. Transformation is required.
The 95% confidence interval (dotted line) for λ includes 0, confirming a log transformation is appropriate.
## Optimal lambda: 0.1818
# Plot
bc_df <- data.frame(lambda = bc$x, loglik = bc$y)
ci_cutoff <- max(bc$y) - qchisq(0.95, 1) / 2
ggplot(bc_df, aes(lambda, loglik)) +
geom_line(colour = "#2C3E50", size = 1) +
geom_vline(xintercept = lambda_opt, colour = "red", linetype = "dashed") +
geom_hline(yintercept = ci_cutoff, colour = "#7F8C8D", linetype = "dotted") +
annotate("text", x = lambda_opt + 0.15, y = min(bc_df$loglik),
label = sprintf("λ = %.2f", lambda_opt), colour = "red", size = 4.5) +
labs(title = "Box-Cox Log-Likelihood Profile",
subtitle = "Dotted line = 95% confidence interval boundary",
x = expression(lambda), y = "Log-Likelihood") +
theme_bw(base_size = 13)The 95% confidence interval (dotted line) for λ includes 0, confirming a log transformation is appropriate.
Result: The optimal λ ≈ 0.18.
Since this is very close to 0, the Box-Cox procedure
recommends a log transformation of the response
variable.
Box-Cox rule of thumb: λ ≈ 0 → use ln(Y); λ ≈ 0.5 → use √Y; λ ≈ 1 → no transformation needed.
# Apply log transformation (lambda ≈ 0)
df_firms$RD_log <- log(df_firms$RD_Expenditure)
df_firms$Assets_log <- log(df_firms$Total_Assets)
# Log-level model
model_loglevel <- lm(RD_log ~ Total_Assets, data = df_firms)
# Log-log model (aligns with the data-generating process)
model_loglog <- lm(RD_log ~ Assets_log, data = df_firms)
cat("=== Log-Level Model ===\n"); summary(model_loglevel)## === Log-Level Model ===
##
## Call:
## lm(formula = RD_log ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47221 -0.39713 0.02358 0.35362 1.37330
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7787315 0.0798760 47.31 <2e-16 ***
## Total_Assets 0.0033566 0.0002647 12.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5343 on 198 degrees of freedom
## Multiple R-squared: 0.4482, Adjusted R-squared: 0.4454
## F-statistic: 160.8 on 1 and 198 DF, p-value: < 2.2e-16
## === Log-Log Model ===
##
## Call:
## lm(formula = RD_log ~ Assets_log, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.31707 -0.31284 -0.00069 0.30462 1.38376
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.4356 0.2100 6.837 9.82e-11 ***
## Assets_log 0.6075 0.0389 15.616 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4815 on 198 degrees of freedom
## Multiple R-squared: 0.5519, Adjusted R-squared: 0.5496
## F-statistic: 243.9 on 1 and 198 DF, p-value: < 2.2e-16
df_diag2 <- data.frame(
fitted = fitted(model_loglog),
resid = residuals(model_loglog),
std_resid = rstandard(model_loglog)
)
p4a <- ggplot(df_diag2, aes(fitted, resid)) +
geom_point(alpha = 0.5, colour = "#C0392B") +
geom_hline(yintercept = 0, linetype = "dashed", colour = "blue") +
geom_smooth(method = "loess", se = FALSE, colour = "#F39C12") +
labs(title = "Residuals vs Fitted (Log-Log)", x = "Fitted", y = "Residuals") +
theme_bw()
p4b <- ggplot(df_diag2, aes(sample = std_resid)) +
stat_qq(colour = "#1ABC9C", alpha = 0.6) +
stat_qq_line(colour = "red") +
labs(title = "Q-Q Plot (Log-Log)", x = "Theoretical Quantiles", y = "Std. Residuals") +
theme_bw()
p4c <- ggplot(df_firms, aes(Assets_log, RD_log)) +
geom_point(alpha = 0.5, colour = "#2980B9") +
geom_smooth(method = "lm", colour = "#E74C3C", se = TRUE) +
labs(title = "Log(Assets) vs Log(R&D)", x = "log(Total Assets)", y = "log(R&D Exp.)") +
theme_bw()
p4a + p4b + p4c## --- Shapiro-Wilk (Log-Log residuals) ---
##
## Shapiro-Wilk normality test
##
## data: residuals(model_loglog)
## W = 0.99476, p-value = 0.7145
##
## --- Breusch-Pagan (Log-Log) ---
##
## studentized Breusch-Pagan test
##
## data: model_loglog
## BP = 0.0095774, df = 1, p-value = 0.922
comp <- data.frame(
Model = c("OLS – Raw", "Log-Level", "Log-Log"),
R2 = round(c(summary(model_ols)$r.squared,
summary(model_loglevel)$r.squared,
summary(model_loglog)$r.squared), 4),
Adj_R2 = round(c(summary(model_ols)$adj.r.squared,
summary(model_loglevel)$adj.r.squared,
summary(model_loglog)$adj.r.squared), 4),
BP_pval = round(c(bptest(model_ols)$p.value,
bptest(model_loglevel)$p.value,
bptest(model_loglog)$p.value), 5),
SW_pval = round(c(shapiro.test(residuals(model_ols))$p.value,
shapiro.test(residuals(model_loglevel))$p.value,
shapiro.test(residuals(model_loglog))$p.value), 5)
)
knitr::kable(comp, caption = "Model Diagnostics Comparison")| Model | R2 | Adj_R2 | BP_pval | SW_pval |
|---|---|---|---|---|
| OLS – Raw | 0.3088 | 0.3053 | 0.00027 | 0.00000 |
| Log-Level | 0.4482 | 0.4454 | 0.02026 | 0.96169 |
| Log-Log | 0.5519 | 0.5496 | 0.92204 | 0.71451 |
| Step | Key Finding |
|---|---|
| Visualize | Clear non-linearity and heteroscedasticity in raw data |
| Diagnose | OLS violates normality (SW p<0.001) and homoscedasticity (BP p<0.001) |
| Transform | Box-Cox optimal λ ≈ 0 → log transformation recommended |
| Refine | Log-log model passes both tests; R² improves dramatically |
The log-log model
log(RD_Expenditure) ~ log(Total_Assets) is the best
specification: