# Load necessary libraries
install.packages("wooldridge")    # Install wooldridge package if not already installed
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("sandwich")     # Install sandwich package for robust SE
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("lmtest")       # Install lmtest package for heteroskedasticity tests
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(wooldridge)  # Access datasets from the wooldridge package
library(sandwich)    # For robust standard errors
library(lmtest)      # For heteroskedasticity tests
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# STEP 1: Load the MEAP00 dataset from the wooldridge package
data("meap00_1", package = "wooldridge")  # Load the dataset
## Warning in data("meap00_1", package = "wooldridge"): data set 'meap00_1' not
## found
MEAP00_1 <- meap00_01  # Assign it to a variable for clarity

# Inspect the dataset to verify it's loaded correctly
str(MEAP00_1)  # Check column names and structure
## 'data.frame':    1692 obs. of  9 variables:
##  $ dcode  : num  1010 3010 3010 3010 3020 3020 3020 3030 3030 3030 ...
##  $ bcode  : int  4937 790 1403 4056 922 2864 4851 881 2748 4469 ...
##  $ math4  : num  83.3 85.7 77.3 85.2 91.8 ...
##  $ read4  : num  77.8 60 59.1 67 67.7 ...
##  $ lunch  : num  40.6 12.8 17.1 23.2 25.6 ...
##  $ enroll : int  468 251 439 561 442 381 274 326 273 285 ...
##  $ exppp  : num  5871 4825 4359 4701 4312 ...
##  $ lenroll: num  6.15 5.53 6.08 6.33 6.09 ...
##  $ lexppp : num  8.68 8.48 8.38 8.46 8.37 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
summary(MEAP00_1)  # Basic summary statistics
##      dcode           bcode          math4            read4       
##  Min.   : 1010   Min.   :   1   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:33215   1st Qu.:1307   1st Qu.: 62.45   1st Qu.: 49.55  
##  Median :56010   Median :2680   Median : 76.90   Median : 63.30  
##  Mean   :53210   Mean   :3163   Mean   : 72.69   Mean   : 60.69  
##  3rd Qu.:78030   3rd Qu.:4551   3rd Qu.: 87.22   3rd Qu.: 74.12  
##  Max.   :83010   Max.   :8723   Max.   :100.00   Max.   :100.00  
##      lunch            enroll           exppp          lenroll     
##  Min.   :  0.00   Min.   :  62.0   Min.   : 1268   Min.   :4.127  
##  1st Qu.: 16.50   1st Qu.: 287.0   1st Qu.: 4595   1st Qu.:5.659  
##  Median : 34.89   Median : 376.5   Median : 5144   Median :5.931  
##  Mean   : 38.99   Mean   : 398.6   Mean   : 5264   Mean   :5.906  
##  3rd Qu.: 58.62   3rd Qu.: 480.0   3rd Qu.: 5817   3rd Qu.:6.174  
##  Max.   :100.00   Max.   :1496.0   Max.   :11958   Max.   :7.311  
##      lexppp     
##  Min.   :7.145  
##  1st Qu.:8.433  
##  Median :8.546  
##  Mean   :8.551  
##  3rd Qu.:8.668  
##  Max.   :9.389
# STEP 2: Estimate the model using OLS
model <- lm(math4 ~ lunch + log(enroll) + log(exppp), data = MEAP00_1)

# Output the summary of the OLS regression
print("OLS Regression Summary:")
## [1] "OLS Regression Summary:"
summary(model)
## 
## Call:
## lm(formula = math4 ~ lunch + log(enroll) + log(exppp), data = MEAP00_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -56.326  -8.758   0.914   9.164  51.416 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 91.93240   19.96170   4.605 4.42e-06 ***
## lunch       -0.44874    0.01464 -30.648  < 2e-16 ***
## log(enroll) -5.39915    0.94041  -5.741 1.11e-08 ***
## log(exppp)   3.52475    2.09785   1.680   0.0931 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.3 on 1688 degrees of freedom
## Multiple R-squared:  0.3729, Adjusted R-squared:  0.3718 
## F-statistic: 334.6 on 3 and 1688 DF,  p-value: < 2.2e-16
# STEP 3: Compare standard errors (usual vs. robust)
# Usual standard errors are in the summary above; now compute robust standard errors
robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
print("Robust Standard Errors:")
## [1] "Robust Standard Errors:"
print(robust_se)
## 
## t test of coefficients:
## 
##              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) 91.932404  23.087103   3.9820 7.125e-05 ***
## lunch       -0.448743   0.016584 -27.0583 < 2.2e-16 ***
## log(enroll) -5.399152   1.131175  -4.7730 1.971e-06 ***
## log(exppp)   3.524751   2.353737   1.4975    0.1344    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# STEP 4: Conduct the White test for heteroskedasticity
white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
print("White Test for Heteroskedasticity:")
## [1] "White Test for Heteroskedasticity:"
print(white_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 229.78, df = 2, p-value < 2.2e-16
# STEP 5: Perform WLS estimation
# Log of squared residuals
log_u2 <- log(residuals(model)^2)

# Fit auxiliary regression to predict heteroskedasticity
aux_model <- lm(log_u2 ~ fitted(model) + I(fitted(model)^2), data = MEAP00_1)

# Predict h_hat (variance function)
h_hat <- exp(predict(aux_model))

# Add h_hat as weights in WLS regression
MEAP00_1$weights <- 1 / h_hat
wls_model <- lm(math4 ~ lunch + log(enroll) + log(exppp), data = MEAP00_1, weights = MEAP00_1$weights)

# Output the summary of the WLS regression
print("WLS Regression Summary:")
## [1] "WLS Regression Summary:"
summary(wls_model)
## 
## Call:
## lm(formula = math4 ~ lunch + log(enroll) + log(exppp), data = MEAP00_1, 
##     weights = MEAP00_1$weights)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7122  -1.1914   0.1403   1.3741   3.9151 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 50.47815   16.51032   3.057 0.002268 ** 
## lunch       -0.44859    0.01461 -30.697  < 2e-16 ***
## log(enroll) -2.64728    0.83594  -3.167 0.001569 ** 
## log(exppp)   6.47420    1.68588   3.840 0.000127 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.9 on 1688 degrees of freedom
## Multiple R-squared:   0.36,  Adjusted R-squared:  0.3588 
## F-statistic: 316.5 on 3 and 1688 DF,  p-value: < 2.2e-16
# STEP 6: Compare OLS and WLS coefficients
ols_coefficients <- summary(model)$coefficients
wls_coefficients <- summary(wls_model)$coefficients

comparison <- data.frame(
  Coefficient = rownames(ols_coefficients),
  OLS_Estimate = ols_coefficients[, "Estimate"],
  WLS_Estimate = wls_coefficients[, "Estimate"],
  OLS_StdError = ols_coefficients[, "Std. Error"],
  WLS_StdError = wls_coefficients[, "Std. Error"]
)
print("Comparison of OLS and WLS Coefficients:")
## [1] "Comparison of OLS and WLS Coefficients:"
print(comparison)
##             Coefficient OLS_Estimate WLS_Estimate OLS_StdError WLS_StdError
## (Intercept) (Intercept)   91.9324043   50.4781522  19.96169801  16.51031594
## lunch             lunch   -0.4487434   -0.4485915   0.01464203   0.01461347
## log(enroll) log(enroll)   -5.3991517   -2.6472816   0.94041163   0.83593620
## log(exppp)   log(exppp)    3.5247506    6.4741999   2.09784606   1.68587676
# STEP 7: Robust standard errors for WLS
wls_robust_se <- coeftest(wls_model, vcov = vcovHC(wls_model, type = "HC1"))
print("Robust Standard Errors for WLS:")
## [1] "Robust Standard Errors for WLS:"
print(wls_robust_se)
## 
## t test of coefficients:
## 
##              Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept) 50.478152  18.925179   2.6672 0.0077206 ** 
## lunch       -0.448592   0.014254 -31.4711 < 2.2e-16 ***
## log(enroll) -2.647282   1.054608  -2.5102 0.0121590 *  
## log(exppp)   6.474200   1.812868   3.5712 0.0003652 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# STEP 8: Conclude whether OLS or WLS is more precise
precision_comparison <- data.frame(
  Variable = rownames(comparison),
  OLS_SE = comparison$OLS_StdError,
  WLS_SE = comparison$WLS_StdError
)
print("Precision Comparison (OLS vs. WLS Standard Errors):")
## [1] "Precision Comparison (OLS vs. WLS Standard Errors):"
print(precision_comparison)
##      Variable      OLS_SE      WLS_SE
## 1 (Intercept) 19.96169801 16.51031594
## 2       lunch  0.01464203  0.01461347
## 3 log(enroll)  0.94041163  0.83593620
## 4  log(exppp)  2.09784606  1.68587676