library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.2
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
set.seed(42)
n <- 200
firmsize <- runif(n, 10, 500)  # Total Assets in millions
error <- rnorm(n, mean = 0, sd = 0.5)
rdexpenditure <- exp(1.5 + 0.6 * log(firmsize) + error)
dffirms <- data.frame(
  FirmID = 1:n,
  TotalAssets = firmsize,
  RDExpenditure = rdexpenditure
)
head(dffirms)
##   FirmID TotalAssets RDExpenditure
## 1      1    458.2550     322.76939
## 2      2    469.1670     302.76313
## 3      3    150.2084      54.90529
## 4      4    416.9193     421.56611
## 5      5    324.4553     103.12089
## 6      6    264.3570     134.17397
ggplot(dffirms, aes(x = TotalAssets, y = RDExpenditure)) +
  geom_point(color = "blue") +
  labs(
    title = "Relationship between Firm Size and R&D Expenditure",
    x = "Total Assets (Million)",
    y = "R&D Expenditure"
  ) +
  theme_minimal()

model1 <- lm(RDExpenditure ~ TotalAssets, data = dffirms)
summary(model1)
## 
## Call:
## lm(formula = RDExpenditure ~ TotalAssets, data = dffirms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -135.79  -42.06  -12.37   25.08  404.97 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 40.50788   11.25914   3.598 0.000405 ***
## TotalAssets  0.35091    0.03731   9.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared:  0.3088, Adjusted R-squared:  0.3053 
## F-statistic: 88.46 on 1 and 198 DF,  p-value: < 2.2e-16
# Residual plots
par(mfrow=c(2,2))
plot(model1)

# Normality test
shapiro.test(residuals(model1))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model1)
## W = 0.87801, p-value = 1.231e-11
# Homoscedasticity test (Breusch-Pagan)
bptest(model1)
## 
##  studentized Breusch-Pagan test
## 
## data:  model1
## BP = 13.298, df = 1, p-value = 0.0002657
boxcox_result <- boxcox(model1, lambda = seq(-2, 2, 0.1))

# Extract optimal lambda
lambda_opt <- boxcox_result$x[which.max(boxcox_result$y)]
lambda_opt
## [1] 0.1818182
if(lambda_opt == 0){
  dffirms$RD_transformed <- log(dffirms$RDExpenditure)
} else {
  dffirms$RD_transformed <- (dffirms$RDExpenditure^lambda_opt - 1) / lambda_opt
}
model2 <- lm(RD_transformed ~ TotalAssets, data = dffirms)
summary(model2)
## 
## Call:
## lm(formula = RD_transformed ~ TotalAssets, data = dffirms)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6671 -0.9255 -0.0303  0.7700  3.2087 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 5.4627033  0.1839737   29.69   <2e-16 ***
## TotalAssets 0.0075331  0.0006096   12.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.231 on 198 degrees of freedom
## Multiple R-squared:  0.4354, Adjusted R-squared:  0.4325 
## F-statistic: 152.7 on 1 and 198 DF,  p-value: < 2.2e-16
# Diagnostics again
par(mfrow=c(2,2))
plot(model2)

bptest(model2)
## 
##  studentized Breusch-Pagan test
## 
## data:  model2
## BP = 0.0051706, df = 1, p-value = 0.9427
shapiro.test(residuals(model2))
## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(model2)
## W = 0.98986, p-value = 0.1707