Data Generation

# Set seed for reproducibility
set.seed(42)

# Simulate 200 firms
n <- 200
firm_size <- runif(n, 10, 500) # Total Assets in millions

# Generate R&D Expenditure 
error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)

# Create the dataframe
df_firms <- data.frame(
  Firm_ID = 1:n,
  Total_Assets = firm_size,
  RD_expenditure = rd_expenditure
)

Step 1: Visualize

plot(df_firms$Total_Assets, df_firms$RD_expenditure,
     main = "R&D Expenditure vs. Total Assets",
     xlab = "Total Assets (Millions)",
     ylab = "R&D Expenditure",
     pch = 16, col = "steelblue")

Step 2: Diagnose

# Diagnose with a simple OLS model
model_ols <- lm(RD_expenditure ~ Total_Assets, data = df_firms)
summary(model_ols)
## 
## Call:
## lm(formula = RD_expenditure ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.50788   11.25914   3.598 0.000405 ***
## Total_Assets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16
# Plot diagnostic charts (2x2 grid)
par(mfrow = c(2, 2))
plot(model_ols)

par(mfrow = c(1, 1)) # Reset layout

Step 3: Transform

library(MASS)

# Run Box-Cox on the OLS model
bc <- boxcox(model_ols, lambda = seq(-2, 2, by = 0.1))

# Extract the exact optimal lambda
optimal_lambda <- bc$x[which.max(bc$y)]
cat("The optimal lambda is:", optimal_lambda, "\n")
## The optimal lambda is: 0.1818182

Step 4: Refine

# Apply log transformation 
df_firms$Log_RD <- log(df_firms$RD_expenditure)

# Re-run the OLS model 
model_refined <- lm(Log_RD ~ Total_Assets, data = df_firms)
summary(model_refined)
## 
## Call:
## lm(formula = Log_RD ~ Total_Assets, data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47221 -0.39713  0.02358  0.35362  1.37330 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.7787315  0.0798760   47.31   <2e-16 ***
## Total_Assets 0.0033566  0.0002647   12.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5343 on 198 degrees of freedom
## Multiple R-squared:  0.4482, Adjusted R-squared:  0.4454 
## F-statistic: 160.8 on 1 and 198 DF,  p-value: < 2.2e-16
# Check the new residuals
par(mfrow = c(2, 2))
plot(model_refined)

par(mfrow = c(1, 1)) # Reset layout