# This chunk stays hidden. It just sets up the document.
# Echo the source of every chunk in the rendered output by default.
knitr::opts_chunk$set(echo = TRUE)
# Install MASS only when it is missing. requireNamespace() tests availability
# without attaching the package; attaching is left to library() below, which
# stops with an error if the package still cannot be loaded.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS")
}
library(MASS)
## Warning: package 'MASS' was built under R version 4.5.2
# This chunk will show up in your final document!
# Fix the RNG state so every run draws the same sample.
set.seed(42)
# Number of simulated firms.
n <- 200
# Total assets (in millions), drawn uniformly between 10 and 500.
firm_size <- runif(n, min = 10, max = 500)
# Noise on the log scale: normal with mean 0 and standard deviation 0.5.
error <- rnorm(n, mean = 0, sd = 0.5)
# R&D expenditure is generated from a log-linear relationship:
#   log(rd_expenditure) = 1.5 + 0.6 * log(firm_size) + error
rd_expenditure <- exp(1.5 + 0.6 * log(firm_size) + error)
# Collect the simulated variables into one data frame, one row per firm.
df_firms <- data.frame(
  Firm_ID = seq_len(n),
  Total_Assets = firm_size,
  RD_Expenditure = rd_expenditure
)
# Show the first six rows.
head(df_firms)
## Firm_ID Total_Assets RD_Expenditure
## 1 1 458.2550 322.76939
## 2 2 469.1670 302.76313
## 3 3 150.2084 54.90529
## 4 4 416.9193 421.56611
## 5 5 324.4553 103.12089
## 6 6 264.3570 134.17397
# 1. Visualize
# Scatter plot of the raw relationship, drawn before any model is fitted.
with(
  df_firms,
  plot(
    x = Total_Assets,
    y = RD_Expenditure,
    main = "R&D Expenditure vs. Total Assets",
    xlab = "Total Assets (Millions)",
    ylab = "R&D Expenditure",
    pch = 19,
    col = "steelblue"
  )
)

# 2. Diagnose
# Lay out the plotting device as a 2 x 2 grid for the lm diagnostic plots.
par(mfrow = c(2, 2))
# Baseline fit: untransformed R&D expenditure regressed on total assets.
model_initial <- lm(
  formula = RD_Expenditure ~ Total_Assets,
  data = df_firms
)
plot(model_initial)

# 3. Transform
# Return to a single-panel layout for the Box-Cox profile plot.
par(mfrow = c(1, 1))
# Profile the Box-Cox log-likelihood of the baseline model for lambda
# between -1 and 1.
bc <- MASS::boxcox(
  object = model_initial,
  lambda = seq(from = -1, to = 1, by = 0.1)
)

# The optimal lambda is the grid value with the highest log-likelihood.
optimal_lambda <- with(bc, x[which.max(y)])
cat(
  "The optimal lambda is approximately:",
  round(optimal_lambda, digits = 3),
  "\n"
)
## The optimal lambda is approximately: 0.172
# 4. Refine
# A Box-Cox lambda of 0 corresponds to the log transformation, so the
# response is modelled on the log scale here.
# NOTE(review): the simulated data are linear in log(firm_size), so a log-log
# specification may be worth comparing as well - confirm whether the
# untransformed predictor is intentional.
df_firms[["Log_RD"]] <- log(df_firms[["RD_Expenditure"]])
model_refined <- lm(
  formula = Log_RD ~ Total_Assets,
  data = df_firms
)
# Compare results
# The two responses are on different scales, so the residual standard errors
# below are not directly comparable.
summary(model_initial)
##
## Call:
## lm(formula = RD_Expenditure ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -135.79 -42.06 -12.37 25.08 404.97
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.50788 11.25914 3.598 0.000405 ***
## Total_Assets 0.35091 0.03731 9.405 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 75.31 on 198 degrees of freedom
## Multiple R-squared: 0.3088, Adjusted R-squared: 0.3053
## F-statistic: 88.46 on 1 and 198 DF, p-value: < 2.2e-16
summary(model_refined)
##
## Call:
## lm(formula = Log_RD ~ Total_Assets, data = df_firms)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.47221 -0.39713 0.02358 0.35362 1.37330
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7787315 0.0798760 47.31 <2e-16 ***
## Total_Assets 0.0033566 0.0002647 12.68 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5343 on 198 degrees of freedom
## Multiple R-squared: 0.4482, Adjusted R-squared: 0.4454
## F-statistic: 160.8 on 1 and 198 DF, p-value: < 2.2e-16
# Check diagnostics of the new model
# Draw the lm diagnostic plots in a 2 x 2 grid. par() returns the previous
# settings, which are restored afterwards so that later plots are not drawn
# into leftover panels of the grid.
old_par <- par(mfrow = c(2, 2))
plot(model_refined)
par(old_par)
