# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lmtest)     # For Breusch-Pagan/White's test
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)   # For robust standard errors

# Simulate a dataset similar to MEAP00
set.seed(123)  # For reproducibility
n <- 500  # Number of observations
data <- data.frame(
  math4 = rnorm(n, mean = 70, sd = 10),      # Simulated test scores
  lunch = rbinom(n, 1, 0.4),                # Binary variable: free/reduced lunch
  enroll = runif(n, min = 50, max = 1500),  # Number of enrolled students
  exppp = runif(n, min = 5000, max = 15000) # Expenditure per pupil
)

# (i) Estimate the model and compare standard errors
model_ols <- lm(math4 ~ lunch + log(enroll) + log(exppp), data = data)

# Print the usual standard errors
cat("OLS Model Summary:\n")
## OLS Model Summary:
print(summary(model_ols))
## 
## Call:
## lm(formula = math4 ~ lunch + log(enroll) + log(exppp), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.709  -6.232  -0.037   6.446  31.608 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  56.1178    13.1904   4.254 2.51e-05 ***
## lunch         0.2242     0.8993   0.249    0.803    
## log(enroll)   0.5850     0.5655   1.034    0.301    
## log(exppp)    1.1340     1.3785   0.823    0.411    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.74 on 496 degrees of freedom
## Multiple R-squared:  0.003581,   Adjusted R-squared:  -0.002445 
## F-statistic: 0.5942 on 3 and 496 DF,  p-value: 0.619
# Calculate robust standard errors using White's correction
robust_se <- coeftest(model_ols, vcov = vcovHC(model_ols, type = "HC1"))
cat("\nRobust Standard Errors:\n")
## 
## Robust Standard Errors:
print(robust_se)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 56.11781   13.69623  4.0973 4.882e-05 ***
## lunch        0.22424    0.90038  0.2490    0.8034    
## log(enroll)  0.58497    0.55936  1.0458    0.2962    
## log(exppp)   1.13395    1.42590  0.7953    0.4268    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (ii) Apply White's test for heteroskedasticity
white_test <- bptest(model_ols, ~ fitted(model_ols) + I(fitted(model_ols)^2), data = data)
cat("\nWhite's Test for Heteroskedasticity:\n")
## 
## White's Test for Heteroskedasticity:
print(white_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_ols
## BP = 1.4902, df = 2, p-value = 0.4747
# (iii) WLS Estimation
# Step 1: Calculate log of squared residuals from OLS
log_u2 <- log(resid(model_ols)^2)

# Step 2: Fit auxiliary regression of log(u^2) on predictors
aux_model <- lm(log_u2 ~ math4 + I(math4^2), data = data)

# Step 3: Obtain fitted values (g-hat) and exponentiate to get weights (h-hat)
data$h_hat <- exp(fitted(aux_model))

# Step 4: Perform WLS regression using weights
model_wls <- lm(math4 ~ lunch + log(enroll) + log(exppp), data = data, weights = 1/data$h_hat)

# Print WLS model summary
cat("\nWLS Model Summary:\n")
## 
## WLS Model Summary:
print(summary(model_wls))
## 
## Call:
## lm(formula = math4 ~ lunch + log(enroll) + log(exppp), data = data, 
##     weights = 1/data$h_hat)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.01720 -1.38088 -0.05323  1.36806  2.10045 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 71.85928    7.39208   9.721   <2e-16 ***
## lunch        0.92662    0.49634   1.867   0.0625 .  
## log(enroll) -0.04093    0.30980  -0.132   0.8949    
## log(exppp)  -0.18185    0.77204  -0.236   0.8139    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.393 on 496 degrees of freedom
## Multiple R-squared:  0.007107,   Adjusted R-squared:  0.001101 
## F-statistic: 1.183 on 3 and 496 DF,  p-value: 0.3155
# (iv) Compare WLS standard errors with robust standard errors
# Calculate robust standard errors for WLS
robust_se_wls <- coeftest(model_wls, vcov = vcovHC(model_wls, type = "HC1"))
cat("\nWLS Robust Standard Errors:\n")
## 
## WLS Robust Standard Errors:
print(robust_se_wls)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 71.85928    6.67950 10.7582  < 2e-16 ***
## lunch        0.92662    0.44418  2.0861  0.03748 *  
## log(enroll) -0.04093    0.27366 -0.1496  0.88117    
## log(exppp)  -0.18185    0.70256 -0.2588  0.79586    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (v) Compare precision of OLS and WLS for 'log(exppp)'
cat("\nStandard Errors for 'log(exppp)':\n")
## 
## Standard Errors for 'log(exppp)':
cat("OLS Standard Error:", summary(model_ols)$coefficients["log(exppp)", 2], "\n")
## OLS Standard Error: 1.378495
cat("WLS Standard Error:", summary(model_wls)$coefficients["log(exppp)", 2], "\n")
## WLS Standard Error: 0.7720417