Step 1 – Visualise
set.seed(42)
n <- 200
firm_size <- runif(n, 10, 500)
error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)
df_firms <- data.frame(Firm_ID = 1:n, Total_Assets = firm_size, RD_Expenditure = rd_expenditure)
plot(df_firms$Total_Assets, df_firms$RD_Expenditure,
xlab = "Total Assets ($ millions)", ylab = "R&D Expenditure",
main = "Total Assets vs R&D Expenditure", pch = 16, col = "steelblue")
abline(lm(RD_Expenditure ~ Total_Assets, data = df_firms), col = "red", lwd = 2)

Step 2 – Diagnose
model_raw <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
par(mfrow = c(2, 2))
plot(model_raw)

shapiro.test(resid(model_raw))
##
## Shapiro-Wilk normality test
##
## data: resid(model_raw)
## W = 0.87801, p-value = 1.231e-11
Step 4 – Refined Model
df_firms$RD_BC <- (df_firms$RD_Expenditure^lambda_opt - 1) / lambda_opt
model_bc <- lm(RD_BC ~ Total_Assets, data = df_firms)
par(mfrow = c(2, 2))
plot(model_bc)

summary(model_bc)
##
## Call:
## lm(formula = RD_BC ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.38714 -0.80605 -0.01164 0.66607 2.74275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.1231145 0.1597059 32.08 <2e-16 ***
## Total_Assets 0.0065758 0.0005292 12.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.068 on 198 degrees of freedom
## Multiple R-squared: 0.4381, Adjusted R-squared: 0.4353
## F-statistic: 154.4 on 1 and 198 DF, p-value: < 2.2e-16
shapiro.test(resid(model_bc))
##
## Shapiro-Wilk normality test
##
## data: resid(model_bc)
## W = 0.99125, p-value = 0.2699