Data Generation

set.seed(42)


n <- 200
firm_size <- runif(n, 10, 500) 

error <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)

# Create the dataframe
df_firms <- data.frame(
  Firm_ID = 1:n,
  Total_Assets = firm_size,
  RD_Expenditure = rd_expenditure
)

# Preview the data
head(df_firms)
##   Firm_ID Total_Assets RD_Expenditure
## 1       1     458.2550      322.76939
## 2       2     469.1670      302.76313
## 3       3     150.2084       54.90529
## 4       4     416.9193      421.56611
## 5       5     324.4553      103.12089
## 6       6     264.3570      134.17397

1. Visualize: Total Assets vs R&D Expenditure

library(ggplot2)

ggplot(df_firms, aes(x = Total_Assets, y = RD_Expenditure)) +
  geom_point(alpha = 0.6, color = "steelblue") +
  geom_smooth(method = "lm", color = "red", se = TRUE) +
  labs(
    title = "Relationship between Total Assets and R&D Expenditure",
    x = "Total Assets (millions)",
    y = "R&D Expenditure"
  ) +
  theme_minimal()


2. Diagnose: OLS Regression and Residual Diagnostics

# Simple OLS regression
model_ols <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
summary(model_ols)
## 
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.50788   11.25914   3.598 0.000405 ***
## Total_Assets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16
# Residual diagnostic plots
par(mfrow = c(2, 2))
plot(model_ols)

par(mfrow = c(1, 1))
# Shapiro-Wilk test for normality of residuals
shapiro.test(residuals(model_ols))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_ols)
## W = 0.87801, p-value = 1.231e-11
# Breusch-Pagan test for homoscedasticity
library(lmtest)
bptest(model_ols)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_ols
## BP = 13.298, df = 1, p-value = 0.0002657

Interpretation: If the Shapiro-Wilk p-value < 0.05, residuals are not normally distributed.
If the Breusch-Pagan p-value < 0.05, there is evidence of heteroscedasticity.


3. Transform: Box-Cox to Find Optimal Lambda

library(MASS)

# Apply Box-Cox transformation
bc <- boxcox(model_ols, lambda = seq(-2, 2, by = 0.1))

# Find optimal lambda
optimal_lambda <- bc$x[which.max(bc$y)]
cat("Optimal Lambda:", optimal_lambda, "\n")
## Optimal Lambda: 0.1818182

Note: A lambda near 0 suggests a log transformation is most appropriate, consistent with the true data-generating process.


4. Refine: Re-run Model with Transformed Variable

# Apply log transformation (lambda ≈ 0)
df_firms$log_RD <- log(df_firms$RD_Expenditure)

# Re-run regression with transformed response
model_log <- lm(log_RD ~ Total_Assets, data = df_firms)
summary(model_log)
## 
## Call:
## lm(formula = log_RD ~ Total_Assets, data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.47221 -0.39713  0.02358  0.35362  1.37330 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.7787315  0.0798760   47.31   <2e-16 ***
## Total_Assets 0.0033566  0.0002647   12.68   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5343 on 198 degrees of freedom
## Multiple R-squared:  0.4482, Adjusted R-squared:  0.4454 
## F-statistic: 160.8 on 1 and 198 DF,  p-value: < 2.2e-16
# Diagnostic plots for transformed model
par(mfrow = c(2, 2))
plot(model_log)

par(mfrow = c(1, 1))
# Compare normality
cat("=== OLS Model Residuals ===\n")
## === OLS Model Residuals ===
print(shapiro.test(residuals(model_ols)))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_ols)
## W = 0.87801, p-value = 1.231e-11
cat("\n=== Log-Transformed Model Residuals ===\n")
## 
## === Log-Transformed Model Residuals ===
print(shapiro.test(residuals(model_log)))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_log)
## W = 0.9969, p-value = 0.9617
# Compare homoscedasticity
cat("\n=== OLS Model Breusch-Pagan Test ===\n")
## 
## === OLS Model Breusch-Pagan Test ===
print(bptest(model_ols))
## 
##  studentized Breusch-Pagan test
## 
## data:  model_ols
## BP = 13.298, df = 1, p-value = 0.0002657
cat("\n=== Log-Transformed Model Breusch-Pagan Test ===\n")
## 
## === Log-Transformed Model Breusch-Pagan Test ===
print(bptest(model_log))
## 
##  studentized Breusch-Pagan test
## 
## data:  model_log
## BP = 5.3894, df = 1, p-value = 0.02026
cat("OLS Model R-squared:         ", summary(model_ols)$r.squared, "\n")
## OLS Model R-squared:          0.3088064
cat("Log-Transformed Model R-squared:", summary(model_log)$r.squared, "\n")
## Log-Transformed Model R-squared: 0.448185

Conclusion: The log-transformed model should show improved normality and reduced heteroscedasticity in the residuals compared to the original OLS model, as the true relationship is log-linear by construction.