Step 1 – Visualise

set.seed(42)
n <- 200
firm_size <- runif(n, 10, 500)
error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)
df_firms <- data.frame(Firm_ID = 1:n, Total_Assets = firm_size, RD_Expenditure = rd_expenditure)

plot(df_firms$Total_Assets, df_firms$RD_Expenditure,
     xlab = "Total Assets ($ millions)", ylab = "R&D Expenditure",
     main = "Total Assets vs R&D Expenditure", pch = 16, col = "steelblue")
abline(lm(RD_Expenditure ~ Total_Assets, data = df_firms), col = "red", lwd = 2)

Step 2 – Diagnose

model_raw <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
par(mfrow = c(2, 2))
plot(model_raw)

shapiro.test(resid(model_raw))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model_raw)
## W = 0.87801, p-value = 1.231e-11

Step 3 – Box-Cox Transform

library(MASS)
bc <- boxcox(model_raw, lambda = seq(-3, 3, 0.1))

lambda_opt <- bc$x[which.max(bc$y)]
cat("Optimal lambda:", lambda_opt)
## Optimal lambda: 0.1515152

Step 4 – Refined Model

df_firms$RD_BC <- (df_firms$RD_Expenditure^lambda_opt - 1) / lambda_opt
model_bc <- lm(RD_BC ~ Total_Assets, data = df_firms)
par(mfrow = c(2, 2))
plot(model_bc)

summary(model_bc)
## 
## Call:
## lm(formula = RD_BC ~ Total_Assets, data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.38714 -0.80605 -0.01164  0.66607  2.74275 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.1231145  0.1597059   32.08   <2e-16 ***
## Total_Assets 0.0065758  0.0005292   12.43   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.068 on 198 degrees of freedom
## Multiple R-squared:  0.4381, Adjusted R-squared:  0.4353 
## F-statistic: 154.4 on 1 and 198 DF,  p-value: < 2.2e-16
shapiro.test(resid(model_bc))
## 
##  Shapiro-Wilk normality test
## 
## data:  resid(model_bc)
## W = 0.99125, p-value = 0.2699