if (!require("MASS"))    install.packages("MASS")
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("lmtest"))  install.packages("lmtest")

library(MASS)
library(ggplot2)
library(lmtest)
set.seed(42)
n <- 200
firm_size      <- runif(n, 10, 500)
error          <- rnorm(n, mean = 0, sd = 0.5)
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)

df_firms <- data.frame(
  Firm_ID        = 1:n,
  Total_Assets   = firm_size,
  RD_Expenditure = rd_expenditure
)

head(df_firms)
##   Firm_ID Total_Assets RD_Expenditure
## 1       1     458.2550      322.76939
## 2       2     469.1670      302.76313
## 3       3     150.2084       54.90529
## 4       4     416.9193      421.56611
## 5       5     324.4553      103.12089
## 6       6     264.3570      134.17397
ggplot(df_firms, aes(x = Total_Assets, y = RD_Expenditure)) +
  geom_point(alpha = 0.55, color = "#2E86AB") +
  geom_smooth(method = "lm",    se = TRUE,  color = "#E84855") +
  geom_smooth(method = "loess", se = FALSE, color = "#F4A261", linetype = "dashed") +
  labs(
    title = "Total Assets vs R&D Expenditure",
    x     = "Total Assets ($M)",
    y     = "R&D Expenditure ($M)"
  ) +
  theme_minimal()

model_ols <- lm(RD_Expenditure ~ Total_Assets, data = df_firms)
summary(model_ols)
## 
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  40.50788   11.25914   3.598 0.000405 ***
## Total_Assets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model_ols)

par(mfrow = c(1, 1))

shapiro.test(residuals(model_ols))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_ols)
## W = 0.87801, p-value = 1.231e-11
bptest(model_ols)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_ols
## BP = 13.298, df = 1, p-value = 0.0002657
bc <- boxcox(model_ols, lambda = seq(-2, 2, by = 0.05))

optimal_lambda <- bc$x[which.max(bc$y)]
cat("Optimal lambda:", round(optimal_lambda, 4), "\n")
## Optimal lambda: 0.1818
df_firms$RD_log <- log(df_firms$RD_Expenditure)

model_refined <- lm(RD_log ~ log(Total_Assets), data = df_firms)
summary(model_refined)
## 
## Call:
## lm(formula = RD_log ~ log(Total_Assets), data = df_firms)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.31707 -0.31284 -0.00069  0.30462  1.38376 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.4356     0.2100   6.837 9.82e-11 ***
## log(Total_Assets)   0.6075     0.0389  15.616  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4815 on 198 degrees of freedom
## Multiple R-squared:  0.5519, Adjusted R-squared:  0.5496 
## F-statistic: 243.9 on 1 and 198 DF,  p-value: < 2.2e-16
par(mfrow = c(2, 2))
plot(model_refined)

par(mfrow = c(1, 1))

shapiro.test(residuals(model_refined))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model_refined)
## W = 0.99476, p-value = 0.7145
bptest(model_refined)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_refined
## BP = 0.0095774, df = 1, p-value = 0.922