Required packages

install.packages(c(“ggplot2”, “MASS”, “lmtest”, “car”))
library(ggplot2) # For visualization library(MASS) # For boxcox library(lmtest) # For Breusch-Pagan test library(car) # For additional diagnostics if needed

Set seed for reproducibility

set.seed(42)

Simulate 200 firms

n <- 200 firm_size <- runif(n, 10, 500) # Total Assets in millions

Generate R&D Expenditure with a non-linear relationship and heteroscedasticity

We assume the ‘true’ lambda is 0 (log-transformed relationship)

error <- rnorm(n, mean = 0, sd = 0.5) rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)

Create the dataframe

df_firms <- data.frame( Firm_ID = 1:n, Total_Assets = firm_size, RD_Expenditure = rd_expenditure )

Preview the data

head(df_firms)

ggplot(df_firms, aes(x = Total_Assets, y = RD_Expenditure)) + geom_point() + geom_smooth(method = “lm”, se = FALSE, color = “blue”) + # Add linear fit line labs(title = “Relationship between Total Assets and R&D Expenditure”, x = “Total Assets (millions)”, y = “R&D Expenditure”) + theme_bw()

Simple OLS

model <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)

Summary

summary(model)

Q-Q plot

qqPlot(model, main = “Q-Q Plot of Residuals”)

Shapiro-Wilk test

shapiro.test(residuals(model))

Residuals vs. Fitted

plot(model, which = 1, main = “Residuals vs. Fitted”)

Breusch-Pagan test

bptest(model)

Box-Cox on the model (add small constant if any zeros, but here all positive)

bc <- boxcox(model, lambda = seq(-2, 2, 1/10))

Optimal lambda

optimal_lambda <- bc\(x[which.max(bc\)y)] optimal_lambda

Transform response (using lambda; if lambda=0, use log)

if (optimal_lambda == 0) { df_firms\(RD_Transformed <- log(df_firms\)RD_Expenditure) } else { df_firms\(RD_Transformed <- (df_firms\)RD_Expenditure^optimal_lambda - 1) / optimal_lambda }

Re-run OLS

model_transformed <- lm(RD_Transformed ~ Total_Assets, data = df_firms)

Summary

summary(model_transformed)

Q-Q plot

qqPlot(model_transformed, main = “Q-Q Plot of Transformed Residuals”)

Shapiro-Wilk

shapiro.test(residuals(model_transformed))

Residuals vs. Fitted

plot(model_transformed, which = 1, main = “Transformed Residuals vs. Fitted”)

Breusch-Pagan

bptest(model_transformed)

“R&D Expenditure and Firm Size Analysis” output: html_document date: “2026-03-09”

2026-03-09

Required packages

Set seed for reproducibility

Simulate 200 firms

Generate R&D Expenditure with a non-linear relationship and heteroscedasticity

We assume the ‘true’ lambda is 0 (log-transformed relationship)

Create the dataframe

Preview the data

Simple OLS

Summary

Q-Q plot

Shapiro-Wilk test

Residuals vs. Fitted

Breusch-Pagan test

Box-Cox on the model (add small constant if any zeros, but here all positive)

Optimal lambda

Transform response (using lambda; if lambda=0, use log)

Re-run OLS

Summary

Q-Q plot

Shapiro-Wilk

Residuals vs. Fitted

Breusch-Pagan