0. Generate the Data

set.seed(42)
n <- 200
firm_size <- runif(n, 10, 500)

error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)

df_firms <- data.frame(
  Firm_ID    = 1:n,
  Total_Assets    = firm_size,
  RD_Expenditure  = rd_expenditure
)

head(df_firms)
##   Firm_ID Total_Assets RD_Expenditure
## 1       1     458.2550      322.76939
## 2       2     469.1670      302.76313
## 3       3     150.2084       54.90529
## 4       4     416.9193      421.56611
## 5       5     324.4553      103.12089
## 6       6     264.3570      134.17397

1. Visualize: Total Assets vs. R&D Expenditure

plot(df_firms$Total_Assets, df_firms$RD_Expenditure,
     xlab = "Total Assets (millions)",
     ylab = "R&D Expenditure (millions)",
     main = "Total Assets vs. R&D Expenditure",
     pch  = 16, col = "#2196F380", cex = 0.9)
lines(lowess(df_firms$Total_Assets, df_firms$RD_Expenditure),
      col = "red", lwd = 2)
legend("topleft", legend = "Lowess Smoother",
       col = "red", lwd = 2, bty = "n")

Observation: The relationship is clearly nonlinear and right-skewed — as firm size increases, R&D spending grows at an accelerating, curved rate. The spread of points also widens with Total Assets, hinting at heteroscedasticity.


2. Diagnose: OLS Regression + Residual Diagnostics

Fit the Simple OLS Model

ols_fit <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
summary(ols_fit)
## 
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.50788   11.25914   3.598 0.000405 ***
## Total_Assets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16

Diagnostic Plots

par(mfrow = c(2, 2))
plot(ols_fit, pch = 16, col = "#2196F380")

par(mfrow = c(1, 1))

Normality Test (Shapiro-Wilk)

shapiro.test(residuals(ols_fit))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(ols_fit)
## W = 0.87801, p-value = 1.231e-11

Homoscedasticity Test (Breusch-Pagan)

if (!require(lmtest)) install.packages("lmtest")
library(lmtest)
bptest(ols_fit)
## 
##  studentized Breusch-Pagan test
## 
## data:  ols_fit
## BP = 13.298, df = 1, p-value = 0.0002657

Diagnosis Summary:

  • Residual vs Fitted: Shows a strong curved (U-shaped) pattern → nonlinearity not captured
  • Q-Q Plot: Residuals deviate heavily from the diagonal → non-normal errors
  • Scale-Location: Upward trend → heteroscedasticity present
  • Shapiro-Wilk p < 0.05 → normality assumption violated
  • Breusch-Pagan p < 0.05 → homoscedasticity assumption violated

3. Transform: Box-Cox to Find Optimal λ

if (!require(MASS)) install.packages("MASS")
library(MASS)

bc <- boxcox(ols_fit, lambda = seq(-2, 2, by = 0.1))

# Extract the optimal lambda
lambda_opt <- bc$x[which.max(bc$y)]
cat("Optimal lambda:", round(lambda_opt, 4), "\n")
## Optimal lambda: 0.1818

Result: The 95% confidence interval for λ includes 0, confirming that a log transformation (λ = 0) is optimal — consistent with how the data was generated.


4. Refine: Re-run Model with Transformed Response

Apply the Transformation

# Use log transformation since lambda ≈ 0
df_firms$log_RD <- log(df_firms$RD_Expenditure)

# Also log Total_Assets for a log-log (power) model
df_firms$log_Assets <- log(df_firms$Total_Assets)

Fit the Transformed Model

lm_transformed <- lm(log_RD ~ log_Assets, data = df_firms)
summary(lm_transformed)
## 
## Call:
## lm(formula = log_RD ~ log_Assets, data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31707 -0.31284 -0.00069  0.30462  1.38376 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.4356     0.2100   6.837 9.82e-11 ***
## log_Assets    0.6075     0.0389  15.616  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4815 on 198 degrees of freedom
## Multiple R-squared:  0.5519, Adjusted R-squared:  0.5496 
## F-statistic: 243.9 on 1 and 198 DF,  p-value: < 2.2e-16

Diagnostic Plots — Transformed Model

par(mfrow = c(2, 2))
plot(lm_transformed, pch = 16, col = "#E91E6380")

par(mfrow = c(1, 1))

Normality & Homoscedasticity Tests — Transformed Model

shapiro.test(residuals(lm_transformed))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(lm_transformed)
## W = 0.99476, p-value = 0.7145
bptest(lm_transformed)
## 
##  studentized Breusch-Pagan test
## 
## data:  lm_transformed
## BP = 0.0095774, df = 1, p-value = 0.922

5. Side-by-Side Residual Comparison

par(mfrow = c(1, 2))

# OLS residuals
plot(fitted(ols_fit), residuals(ols_fit),
     xlab = "Fitted Values", ylab = "Residuals",
     main = "OLS Model\n(Before Transformation)",
     pch = 16, col = "#2196F380", cex = 0.8)
abline(h = 0, col = "red", lwd = 2, lty = 2)
lines(lowess(fitted(ols_fit), residuals(ols_fit)), col = "darkgreen", lwd = 2)

# Transformed residuals
plot(fitted(lm_transformed), residuals(lm_transformed),
     xlab = "Fitted Values", ylab = "Residuals",
     main = "Log-Log Model\n(After Transformation)",
     pch = 16, col = "#E91E6380", cex = 0.8)
abline(h = 0, col = "red", lwd = 2, lty = 2)
lines(lowess(fitted(lm_transformed), residuals(lm_transformed)), col = "darkgreen", lwd = 2)

par(mfrow = c(1, 1))

6. Results Comparison Table

results <- data.frame(
  Metric = c("R-squared", "Adj. R-squared", "RSE",
             "Shapiro-Wilk p-value", "Breusch-Pagan p-value"),
  OLS_Model = c(
    round(summary(ols_fit)$r.squared, 4),
    round(summary(ols_fit)$adj.r.squared, 4),
    round(summary(ols_fit)$sigma, 4),
    round(shapiro.test(residuals(ols_fit))$p.value, 6),
    round(bptest(ols_fit)$p.value, 6)
  ),
  LogLog_Model = c(
    round(summary(lm_transformed)$r.squared, 4),
    round(summary(lm_transformed)$adj.r.squared, 4),
    round(summary(lm_transformed)$sigma, 4),
    round(shapiro.test(residuals(lm_transformed))$p.value, 6),
    round(bptest(lm_transformed)$p.value, 6)
  )
)

knitr::kable(results, col.names = c("Metric", "OLS (Untransformed)", "Log-Log (Transformed)"),
             align = "lcc", caption = "Model Comparison: Before vs. After Box-Cox Transformation")
Model Comparison: Before vs. After Box-Cox Transformation
Metric OLS (Untransformed) Log-Log (Transformed)
R-squared 0.308800 0.551900
Adj. R-squared 0.305300 0.549600
RSE 75.311800 0.481500
Shapiro-Wilk p-value 0.000000 0.714511
Breusch-Pagan p-value 0.000266 0.922040

7. Conclusion

Issue OLS Model Log-Log Model
Nonlinearity ❌ U-shaped residuals ✅ Randomly scattered
Normality (Shapiro-Wilk) ❌ p < 0.05 ✅ p > 0.05
Homoscedasticity (BP test) ❌ p < 0.05 ✅ p > 0.05
R-squared ~0.60 ~0.69