library(wooldridge)
data("sleep75", package = "wooldridge")
# Fit the regression model
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
# Summary of the regression model
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## I(age^2) 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
# (i) Evidence that men sleep more than women
# Extract the coefficient and p-value for "male"
male_coeff <- coef(summary(model))["male", ]
cat("Coefficient for male:", male_coeff[1], "\n")
## Coefficient for male: 87.75243
cat("p-value for male:", male_coeff[4], "\n")
## p-value for male: 0.01078518
if (male_coeff[4] < 0.05) {
cat("There is significant evidence that men sleep more than women.\n")
} else {
cat("There is no significant evidence that men sleep more than women.\n")
}
## There is significant evidence that men sleep more than women.
# (ii) Tradeoff between work and sleep
# Extract the coefficient and p-value for "totwrk"
totwrk_coeff <- coef(summary(model))["totwrk", ]
cat("Tradeoff estimate (totwrk coefficient):", totwrk_coeff[1], "\n")
## Tradeoff estimate (totwrk coefficient): -0.1634232
cat("p-value for totwrk:", totwrk_coeff[4], "\n")
## p-value for totwrk: 1.891272e-18
if (totwrk_coeff[4] < 0.05) {
cat("There is significant evidence of a tradeoff between work and sleep.\n")
} else {
cat("There is no significant evidence of a tradeoff between work and sleep.\n")
}
## There is significant evidence of a tradeoff between work and sleep.
# (iii) Testing if age has no effect on sleep
# Fit a restricted model without age and age^2
model_restricted <- lm(sleep ~ totwrk + educ + male, data = sleep75)
# Compare models using an F-test
anova_results <- anova(model_restricted, model)
print(anova_results)
## Analysis of Variance Table
##
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 702 122631662
## 2 700 122147777 2 483885 1.3865 0.2506
if (anova_results$`Pr(>F)`[2] < 0.05) {
cat("Age has a significant effect on sleep.\n")
} else {
cat("Age has no significant effect on sleep.\n")
}
## Age has no significant effect on sleep.
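# The same joint test can be run without fitting the restricted model by
# hand; a sketch (not run above) using car::linearHypothesis, which
# reproduces the F-statistic from the anova() comparison:
library(car)
linearHypothesis(model, c("age = 0", "I(age^2) = 0"))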
library(wooldridge)
data("gpa2", package= "wooldridge")
# Inspect the dataset
str(gpa2)
## 'data.frame': 4137 obs. of 12 variables:
## $ sat : int 920 1170 810 940 1180 980 880 980 1240 1230 ...
## $ tothrs : int 43 18 14 40 18 114 78 55 18 17 ...
## $ colgpa : num 2.04 4 1.78 2.42 2.61 ...
## $ athlete : int 1 0 1 0 0 0 0 0 0 0 ...
## $ verbmath: num 0.484 0.828 0.884 0.808 0.735 ...
## $ hsize : num 0.1 9.4 1.19 5.71 2.14 2.68 3.11 2.68 3.67 0.1 ...
## $ hsrank : int 4 191 42 252 86 41 161 101 161 3 ...
## $ hsperc : num 40 20.3 35.3 44.1 40.2 ...
## $ female : int 1 0 0 0 0 1 0 0 0 0 ...
## $ white : int 0 1 1 1 1 1 0 1 1 1 ...
## $ black : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hsizesq : num 0.01 88.36 1.42 32.6 4.58 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
head(gpa2)
## sat tothrs colgpa athlete verbmath hsize hsrank hsperc female white black
## 1 920 43 2.04 1 0.48387 0.10 4 40.00000 1 0 0
## 2 1170 18 4.00 0 0.82813 9.40 191 20.31915 0 1 0
## 3 810 14 1.78 1 0.88372 1.19 42 35.29412 0 1 0
## 4 940 40 2.42 0 0.80769 5.71 252 44.13310 0 1 0
## 5 1180 18 2.61 0 0.73529 2.14 86 40.18692 0 1 0
## 6 980 114 3.03 0 0.81481 2.68 41 15.29851 1 1 0
## hsizesq
## 1 0.0100
## 2 88.3600
## 3 1.4161
## 4 32.6041
## 5 4.5796
## 6 7.1824
# Fit the regression model
model <- lm(sat ~ hsize + I(hsize^2) + female + black + I(female * black), data = gpa2)
# Summary of the regression model
cat("\nRegression Model Summary:\n")
##
## Regression Model Summary:
summary(model)
##
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + I(female *
## black), data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## I(hsize^2) -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## I(female * black) 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
# (i) Test if hsize^2 should be included in the model
cat("\n(i) Testing if hsize^2 should be included in the model:\n")
##
## (i) Testing if hsize^2 should be included in the model:
hsize2_coeff <- coef(summary(model))["I(hsize^2)", ]
cat("Coefficient for hsize^2:", hsize2_coeff[1], "\n")
## Coefficient for hsize^2: -2.194828
cat("p-value for hsize^2:", hsize2_coeff[4], "\n")
## p-value for hsize^2: 3.198417e-05
if (hsize2_coeff[4] < 0.05) {
cat("There is strong evidence that hsize^2 should be included in the model.\n")
} else {
cat("There is no strong evidence that hsize^2 should be included in the model.\n")
}
## There is strong evidence that hsize^2 should be included in the model.
# Calculate the optimal high school size (hsize is measured in hundreds of
# students, so a value near 4.4 means roughly 440 students)
optimal_hsize <- -coef(model)["hsize"] / (2 * coef(model)["I(hsize^2)"])
cat("The optimal high school size (hsize) is:", optimal_hsize, "\n")
## The optimal high school size (hsize) is: 4.39603
# (ii) Difference in SAT scores between nonblack females and nonblack males
cat("\n(ii) Difference in SAT scores between nonblack females and nonblack males:\n")
##
## (ii) Difference in SAT scores between nonblack females and nonblack males:
female_coeff <- coef(summary(model))["female", ]
cat("Difference in SAT scores (female coefficient):", female_coeff[1], "\n")
## Difference in SAT scores (female coefficient): -45.09145
cat("p-value for female:", female_coeff[4], "\n")
## p-value for female: 1.656301e-25
if (female_coeff[4] < 0.05) {
cat("The difference is statistically significant.\n")
} else {
cat("The difference is not statistically significant.\n")
}
## The difference is statistically significant.
# (iii) Difference in SAT scores between nonblack males and black males
cat("\n(iii) Difference in SAT scores between nonblack males and black males:\n")
##
## (iii) Difference in SAT scores between nonblack males and black males:
black_coeff <- coef(summary(model))["black", ]
cat("Difference in SAT scores (black coefficient):", black_coeff[1], "\n")
## Difference in SAT scores (black coefficient): -169.8126
cat("p-value for black:", black_coeff[4], "\n")
## p-value for black: 7.140387e-40
if (black_coeff[4] < 0.05) {
cat("There is a statistically significant difference in SAT scores between nonblack males and black males.\n")
} else {
cat("There is no statistically significant difference in SAT scores between nonblack males and black males.\n")
}
## There is a statistically significant difference in SAT scores between nonblack males and black males.
# (iv) Difference in SAT scores between black females and nonblack females
cat("\n(iv) Difference in SAT scores between black females and nonblack females:\n")
##
## (iv) Difference in SAT scores between black females and nonblack females:
female_black_coeff <- coef(summary(model))["I(female * black)", ]
# Holding female = 1, the black vs nonblack gap is beta_black plus the
# interaction (the female coefficient is common to both groups and cancels)
total_diff <- black_coeff[1] + female_black_coeff[1]
cat("Difference in SAT scores (black females vs nonblack females):", total_diff, "\n")
## Difference in SAT scores (black females vs nonblack females): -107.5062
cat("p-value for female:black interaction:", female_black_coeff[4], "\n")
## p-value for female:black interaction: 0.0006048744
# Note: the interaction p-value tests whether the gender gap differs by
# race, not whether total_diff itself is zero (see the direct test below)
if (female_black_coeff[4] < 0.05) {
cat("The difference is statistically significant.\n")
} else {
cat("The difference is not statistically significant.\n")
}
## The difference is statistically significant.
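# Because total_diff combines two estimated coefficients, a direct test of
# H0: beta_black + beta_interaction = 0 needs their covariance; a minimal
# sketch using the coefficient vector and covariance matrix:
b <- coef(model)
V <- vcov(model)
diff_bf <- b["black"] + b["I(female * black)"]
se_bf <- sqrt(V["black", "black"] +
  V["I(female * black)", "I(female * black)"] +
  2 * V["black", "I(female * black)"])
t_bf <- diff_bf / se_bf
p_bf <- 2 * pt(-abs(t_bf), df = df.residual(model))
cat("Estimate:", diff_bf, "SE:", se_bf, "p-value:", p_bf, "\n")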
# (i) Add mothcoll and fathcoll to the model and report results:
library(wooldridge)
# Load the gpa1 dataset
data("gpa1", package= "wooldridge")
# Fit the regression model with mothcoll and fathcoll
model_c1_i <- lm(colGPA ~ PC + mothcoll + fathcoll, data = gpa1)
summary(model_c1_i)
##
## Call:
## lm(formula = colGPA ~ PC + mothcoll + fathcoll, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.97476 -0.26782 -0.01261 0.28739 0.82524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.963813 0.055760 53.153 <2e-16 ***
## PC 0.162145 0.064257 2.523 0.0128 *
## mothcoll 0.004012 0.065974 0.061 0.9516
## fathcoll 0.044787 0.066606 0.672 0.5024
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3661 on 137 degrees of freedom
## Multiple R-squared: 0.05366, Adjusted R-squared: 0.03294
## F-statistic: 2.589 on 3 and 137 DF, p-value: 0.05545
# Check if PC is still statistically significant
pc_coeff <- coef(summary(model_c1_i))["PC", ]
cat("Coefficient for PC:", pc_coeff[1], "\n")
## Coefficient for PC: 0.1621452
cat("p-value for PC:", pc_coeff[4], "\n")
## p-value for PC: 0.01276597
# (ii) Test for joint significance of mothcoll and fathcoll:
# Restricted model without mothcoll and fathcoll
model_c1_ii <- lm(colGPA ~ PC, data = gpa1)
# Perform an ANOVA test to check joint significance
anova_results <- anova(model_c1_ii, model_c1_i)
cat("Joint significance test for mothcoll and fathcoll:\n")
## Joint significance test for mothcoll and fathcoll:
print(anova_results)
## Analysis of Variance Table
##
## Model 1: colGPA ~ PC
## Model 2: colGPA ~ PC + mothcoll + fathcoll
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 139 18.436
## 2 137 18.365 2 0.071262 0.2658 0.767
# (iii) Add hsGPA^2 to the model and decide if it is needed
model_c1_iii <- lm(colGPA ~ PC + mothcoll + fathcoll + hsGPA + I(hsGPA^2), data = gpa1)
summary(model_c1_iii)
##
## Call:
## lm(formula = colGPA ~ PC + mothcoll + fathcoll + hsGPA + I(hsGPA^2),
## data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78684 -0.24699 0.00681 0.26674 0.73552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.28913 2.37073 2.231 0.0273 *
## PC 0.14007 0.05868 2.387 0.0184 *
## mothcoll 0.00346 0.05992 0.058 0.9540
## fathcoll 0.06698 0.06149 1.089 0.2780
## hsGPA -1.89964 1.42261 -1.335 0.1840
## I(hsGPA^2) 0.35401 0.21178 1.672 0.0969 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3316 on 135 degrees of freedom
## Multiple R-squared: 0.235, Adjusted R-squared: 0.2067
## F-statistic: 8.295 on 5 and 135 DF, p-value: 7.341e-07
# Check the significance of hsGPA^2
hsGPA2_coeff <- coef(summary(model_c1_iii))["I(hsGPA^2)", ]
cat("Coefficient for hsGPA^2:", hsGPA2_coeff[1], "\n")
## Coefficient for hsGPA^2: 0.3540145
cat("p-value for hsGPA^2:", hsGPA2_coeff[4], "\n")
## p-value for hsGPA^2: 0.09691523
# Load the wage2 dataset
data("wage2", package= "wooldridge")
# (i) Estimate the base model and assess the difference in salary between blacks and nonblacks
model_c2 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
cat("Model Summary for Base Model:\n")
## Model Summary for Base Model:
summary(model_c2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
# Extract difference in salary for blacks and nonblacks
black_coeff <- coef(summary(model_c2))["black", ]
cat("\nDifference in salary (log-wage) for blacks:\n")
##
## Difference in salary (log-wage) for blacks:
cat("Coefficient:", black_coeff[1], "\n")
## Coefficient: -0.1883499
cat("p-value:", black_coeff[4], "\n")
## p-value: 6.839315e-07
# (ii) Add exper^2 and tenure^2 to the model and test for joint insignificance
model_c2_extended <- lm(log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + married + black + south + urban, data = wage2)
cat("\nModel Summary for Extended Model (with exper^2 and tenure^2):\n")
##
## Model Summary for Extended Model (with exper^2 and tenure^2):
summary(model_c2_extended)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure +
## I(tenure^2) + married + black + south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## I(exper^2) -0.0001138 0.0005319 -0.214 0.830622
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## I(tenure^2) -0.0007964 0.0004710 -1.691 0.091188 .
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
# Perform ANOVA test for joint significance
anova_results <- anova(model_c2, model_c2_extended)
cat("\nJoint significance test for exper^2 and tenure^2:\n")
##
## Joint significance test for exper^2 and tenure^2:
print(anova_results)
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south +
## urban
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) +
## married + black + south + urban
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
# (iii) Extend the model to test if return to education depends on race
model_c2_race_education <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
cat("\nModel Summary for Race-Education Interaction Model:\n")
##
## Model Summary for Race-Education Interaction Model:
summary(model_c2_race_education)
##
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## black 0.094809 0.255399 0.371 0.710561
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
# Extract interaction term for educ:black
educ_black_coeff <- coef(summary(model_c2_race_education))["educ:black", ]
cat("\nInteraction term (educ:black):\n")
##
## Interaction term (educ:black):
cat("Coefficient:", educ_black_coeff[1], "\n")
## Coefficient: -0.02262361
cat("p-value:", educ_black_coeff[4], "\n")
## p-value: 0.2626026
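# The estimated return to education for black workers combines both
# coefficients: 0.067115 - 0.022624 = 0.044491, i.e. about 4.4% per year
# versus 6.7% for nonblack workers (a difference that, per the p-value
# above, is not statistically significant)
coef(model_c2_race_education)["educ"] + coef(model_c2_race_education)["educ:black"]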
# (iv) Allow wages to differ across groups (marital and racial status)
model_c2_marital_race <- lm(log(wage) ~ black * married + exper + tenure + educ + south + urban, data = wage2)
cat("\nModel Summary for Marital-Race Interaction Model:\n")
##
## Model Summary for Marital-Race Interaction Model:
summary(model_c2_marital_race)
##
## Call:
## lm(formula = log(wage) ~ black * married + exper + tenure + educ +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## black -0.240820 0.096023 -2.508 0.012314 *
## married 0.188915 0.042878 4.406 1.18e-05 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## black:married 0.061354 0.103275 0.594 0.552602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
# Extract interaction term for black:married
black_married_coeff <- coef(summary(model_c2_marital_race))["black:married", ]
cat("\nInteraction term (black:married):\n")
##
## Interaction term (black:married):
cat("Coefficient:", black_married_coeff[1], "\n")
## Coefficient: 0.0613537
cat("p-value:", black_married_coeff[4], "\n")
## p-value: 0.552602
# Simulation to demonstrate consequences of heteroskedasticity
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Set seed for reproducibility
set.seed(123)
# Function to generate heteroskedastic data
generate_hetero_data <- function(n = 1000) {
x <- rnorm(n)
# Error variance increases with x
e <- rnorm(n, 0, sd = abs(x))
y <- 1 + 2*x + e
return(data.frame(x = x, y = y))
}
# Simulate data
n_sims <- 1000
beta_ols <- matrix(NA, n_sims, 2)
f_stats <- numeric(n_sims)
for(i in 1:n_sims) {
data <- generate_hetero_data()
model <- lm(y ~ x, data = data)
# Store OLS estimates
beta_ols[i,] <- coef(model)
# Calculate F-statistic
f_stats[i] <- summary(model)$fstatistic[1]
}
# (i) Check consistency of OLS estimators
cat("(i) OLS Estimator Properties:\n")
## (i) OLS Estimator Properties:
cat("True parameters: beta0 = 1, beta1 = 2\n")
## True parameters: beta0 = 1, beta1 = 2
cat("Mean of OLS estimates:\n")
## Mean of OLS estimates:
cat("beta0:", mean(beta_ols[,1]), "\n")
## beta0: 1.00025
cat("beta1:", mean(beta_ols[,2]), "\n")
## beta1: 2.002806
cat("\nStandard deviation of OLS estimates:\n")
##
## Standard deviation of OLS estimates:
cat("beta0:", sd(beta_ols[,1]), "\n")
## beta0: 0.03049547
cat("beta1:", sd(beta_ols[,2]), "\n")
## beta1: 0.05499627
# (ii) F-statistic distribution
# Note: summary()$fstatistic tests H0: beta1 = 0, but the data were generated
# with beta1 = 2, so these statistics lie far in the right tail by construction
# Compare with theoretical F(1, 998) distribution
theoretical_quantiles <- qf(c(0.01, 0.05, 0.10, 0.90, 0.95, 0.99), 1, 998)
empirical_quantiles <- quantile(f_stats, probs = c(0.01, 0.05, 0.10, 0.90, 0.95, 0.99))
cat("\n(ii) F-statistic Distribution Comparison:\n")
##
## (ii) F-statistic Distribution Comparison:
cat("Quantiles comparison (Empirical vs Theoretical):\n")
## Quantiles comparison (Empirical vs Theoretical):
print(data.frame(
Probability = c(0.01, 0.05, 0.10, 0.90, 0.95, 0.99),
Empirical = empirical_quantiles,
Theoretical = theoretical_quantiles,
row.names = NULL
))
## Probability Empirical Theoretical
## 1 0.01 3282.263 0.0001571666
## 2 0.05 3449.068 0.0039341183
## 3 0.10 3577.323 0.0157988123
## 4 0.90 4513.646 2.7105732837
## 5 0.95 4671.961 3.8507933708
## 6 0.99 4894.430 6.6603458503
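# The huge empirical quantiles above mainly reflect that summary()'s F tests
# H0: beta1 = 0 while the data were generated with beta1 = 2. A sketch of the
# cleaner experiment (not run here): simulate under a true null so any
# distortion of the F test comes from heteroskedasticity alone
set.seed(456)
f_null <- replicate(1000, {
  x <- rnorm(1000)
  y <- 1 + rnorm(1000, 0, sd = abs(x)) # true slope is zero
  summary(lm(y ~ x))$fstatistic[1]
})
# A nominal 5% test rejects far more often than 5% under this heteroskedasticity
mean(f_null > qf(0.95, df1 = 1, df2 = 998))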
# (iii) BLUE property
# Compare the sampling variance of the OLS slope with that of a weighted
# estimator; the variance must be taken across replicated slope estimates,
# since var() of a single fitted coefficient is NA
generate_and_compare <- function() {
  data <- generate_hetero_data()
  # OLS estimator
  model_ols <- lm(y ~ x, data = data)
  # Alternative weighted estimator; since sd(e|x) = |x| here, the
  # GLS-efficient weights would be 1/x^2
  weights <- 1/abs(data$x) # Simple weighting scheme
  model_weighted <- lm(y ~ x, data = data, weights = weights)
  c(ols = coef(model_ols)[2], weighted = coef(model_weighted)[2])
}
# Compare slope variances across 100 replications
results <- replicate(100, generate_and_compare())
slope_vars <- apply(results, 1, var)
cat("\n(iii) BLUE Property Check:\n")
cat("Variance of OLS slope estimates:", slope_vars[1], "\n")
cat("Variance of weighted slope estimates:", slope_vars[2], "\n")
# Demonstrate with single example
data <- generate_hetero_data()
model <- lm(y ~ x, data = data)
# Compare regular and robust standard errors
regular_se <- coef(summary(model))[, "Std. Error"]
robust_se <- sqrt(diag(vcovHC(model, type = "HC1")))
cat("\nStandard Error Comparison:\n")
##
## Standard Error Comparison:
print(data.frame(
Regular = regular_se,
Robust = robust_se,
row.names = c("(Intercept)", "x")
))
## Regular Robust
## (Intercept) 0.03161957 0.03168073
## x 0.03079771 0.05158754
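# With sandwich and lmtest loaded, the robust covariance can be passed
# directly to coeftest() for robust inference, not just standard errors:
coeftest(model, vcov = vcovHC(model, type = "HC1"))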
# Load required packages
library(wooldridge)
library(sandwich)
library(lmtest)
# Load the smoke dataset
data("smoke")
# Create binary smoke variable (1 if person smokes, 0 otherwise)
smoke$smoker <- ifelse(smoke$cigs > 0, 1, 0)
# Fit the linear probability model as specified in the textbook
model <- lm(smoker ~ log(cigpric) + log(income) + educ + age + I(age^2) +
restaurn + white, data = smoke)
# (i) Compare standard errors
# Get both regular and robust standard errors
regular_se <- coef(summary(model))[, "Std. Error"]
robust_se <- sqrt(diag(vcovHC(model, type = "HC1")))
# Create comparison table
se_comparison <- data.frame(
Regular = regular_se,
Robust = robust_se,
Difference = robust_se - regular_se
)
cat("(i) Comparison of Standard Errors:\n")
## (i) Comparison of Standard Errors:
print(round(se_comparison, 6))
## Regular Robust Difference
## (Intercept) 0.854962 0.864079 0.009118
## log(cigpric) 0.204109 0.208034 0.003925
## log(income) 0.025724 0.025823 0.000099
## educ 0.005901 0.005642 -0.000259
## age 0.005666 0.005411 -0.000255
## I(age^2) 0.000062 0.000057 -0.000005
## restaurn 0.039443 0.037690 -0.001753
## white 0.051517 0.050690 -0.000827
# (ii) Effect of 4 years of education
base_case <- data.frame(
cigpric = mean(smoke$cigpric),
income = mean(smoke$income),
educ = 16,
age = mean(smoke$age),
restaurn = 0,
white = 0
)
prob_base <- predict(model, newdata = base_case)
base_case$educ <- 20
prob_new <- predict(model, newdata = base_case)
cat("\n(ii) Effect of 4 More Years of Education:\n")
##
## (ii) Effect of 4 More Years of Education:
cat("Base probability (16 years):", round(prob_base, 4), "\n")
## Base probability (16 years): 0.4068
cat("New probability (20 years):", round(prob_new, 4), "\n")
## New probability (20 years): 0.2913
cat("Change in probability:", round(prob_new - prob_base, 4), "\n")
## Change in probability: -0.1155
# (iii) Age effect turning point
coef_age <- coef(model)["age"]
coef_age2 <- coef(model)["I(age^2)"]
turning_point <- -coef_age/(2*coef_age2)
cat("\n(iii) Age Turning Point:\n")
##
## (iii) Age Turning Point:
cat("Age where smoking probability starts decreasing:", round(turning_point, 1), "years\n")
## Age where smoking probability starts decreasing: 38.1 years
# (iv) Restaurant smoking restriction effect
cat("\n(iv) Restaurant Smoking Restriction Effect:\n")
##
## (iv) Restaurant Smoking Restriction Effect:
cat("Coefficient:", round(coef(model)["restaurn"], 4), "\n")
## Coefficient: -0.1008
cat("Regular SE:", round(regular_se["restaurn"], 4), "\n")
## Regular SE: 0.0394
cat("Robust SE:", round(robust_se["restaurn"], 4), "\n")
## Robust SE: 0.0377
# (v) Predicted probability for person 206
person_206 <- data.frame(
cigpric = 67.44,
income = 6500,
educ = 16,
age = 77,
restaurn = 0,
white = 0
)
prob_206 <- predict(model, newdata = person_206)
cat("\n(v) Prediction for Person 206:\n")
##
## (v) Prediction for Person 206:
cat("Predicted probability of smoking:", round(prob_206, 4), "\n")
## Predicted probability of smoking: -0.004
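# Note: the linear probability model does not constrain fitted values to
# [0, 1], so a slightly negative "probability" like this one is a known LPM
# limitation rather than a coding error.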
# Print model summary statistics
cat("\nModel Summary Statistics:\n")
##
## Model Summary Statistics:
cat("Sample size:", nobs(model), "\n")
## Sample size: 807
cat("R-squared:", round(summary(model)$r.squared, 4), "\n")
## R-squared: 0.062
# Load required packages
library(wooldridge)
library(lmtest)
library(sandwich)
# Load VOTE1 dataset
data("vote1")
# (i) Estimate the initial model
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
# Get residuals
residuals <- resid(model)
# Regress residuals on independent variables
residual_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB),
data = vote1)
# Print R-squared of residual regression
cat("(i) R-squared from residual regression:\n")
## (i) R-squared from residual regression:
cat("R-squared:", summary(residual_model)$r.squared, "\n")
## R-squared: 1.198131e-32
cat("This should be zero or very close to zero due to OLS properties\n\n")
## This should be zero or very close to zero due to OLS properties
# (ii) Breusch-Pagan test
# Get squared residuals
residuals_squared <- residuals^2
# Regression for BP test
bp_model <- lm(residuals_squared ~ prtystrA + democA + log(expendA) + log(expendB),
data = vote1)
# Calculate BP test statistic
n <- nrow(vote1)
bp_stat <- n * summary(bp_model)$r.squared
bp_pvalue <- 1 - pchisq(bp_stat, df = 4) # df = number of regressors excluding intercept
cat("(ii) Breusch-Pagan Test:\n")
## (ii) Breusch-Pagan Test:
cat("BP test statistic:", bp_stat, "\n")
## BP test statistic: 9.093356
cat("P-value:", bp_pvalue, "\n")
## P-value: 0.05880792
# Alternative using built-in function
bp_test <- bptest(model)
cat("\nBreusch-Pagan test using bptest function:\n")
##
## Breusch-Pagan test using bptest function:
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 9.0934, df = 4, p-value = 0.05881
# (iii) White test
# Create squared terms and interactions
vote1$expendA_log <- log(vote1$expendA)
vote1$expendB_log <- log(vote1$expendB)
white_model <- lm(residuals_squared ~ prtystrA + democA + expendA_log + expendB_log +
I(prtystrA^2) + I(democA^2) + I(expendA_log^2) + I(expendB_log^2) +
I(prtystrA*democA) + I(prtystrA*expendA_log) + I(prtystrA*expendB_log) +
I(democA*expendA_log) + I(democA*expendB_log) + I(expendA_log*expendB_log),
data = vote1)
# Calculate White test statistic
# Caveat: democA is binary, so I(democA^2) duplicates democA and lm() returns
# NA for its coefficient; length(coef()) still counts that aliased term, so
# the df below overstates the regressor count by one (13 non-aliased terms)
white_stat <- n * summary(white_model)$r.squared
white_df <- length(coef(white_model)) - 1 # degrees of freedom (see caveat)
white_pvalue <- 1 - pchisq(white_stat, df = white_df)
cat("\n(iii) White Test:\n")
##
## (iii) White Test:
cat("White test statistic:", white_stat, "\n")
## White test statistic: 31.10152
cat("Degrees of freedom:", white_df, "\n")
## Degrees of freedom: 14
cat("P-value:", white_pvalue, "\n")
## P-value: 0.00536492
# Summary of original model with both regular and robust standard errors
cat("\nOriginal Model Summary with Regular and Robust SE:\n")
##
## Original Model Summary with Regular and Robust SE:
coeftest_regular <- coeftest(model)
coeftest_robust <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
# Create comparison table
se_comparison <- data.frame(
Estimate = coeftest_regular[,1],
"Regular SE" = coeftest_regular[,2],
"Robust SE" = coeftest_robust[,2],
"Regular p-value" = coeftest_regular[,4],
"Robust p-value" = coeftest_robust[,4]
)
print(round(se_comparison, 4))
## Estimate Regular.SE Robust.SE Regular.p.value Robust.p.value
## (Intercept) 37.6614 4.7360 4.4189 0.0000 0.0000
## prtystrA 0.2519 0.0713 0.0661 0.0005 0.0002
## democA 3.7929 1.4065 1.4522 0.0077 0.0098
## log(expendA) 5.7793 0.3918 0.5331 0.0000 0.0000
## log(expendB) -6.2378 0.3975 0.3562 0.0000 0.0000
# Load required packages
library(wooldridge)
library(sandwich)
library(lmtest)
library(car)
## Loading required package: carData
# Load FERTIL2 dataset
data("fertil2")
# Display available variables
cat("Available variables in FERTIL2:\n")
## Available variables in FERTIL2:
print(names(fertil2))
## [1] "mnthborn" "yearborn" "age" "electric" "radio" "tv"
## [7] "bicycle" "educ" "ceb" "agefbrth" "children" "knowmeth"
## [13] "usemeth" "monthfm" "yearfm" "agefm" "idlnchld" "heduc"
## [19] "agesq" "urban" "urb_educ" "spirit" "protest" "catholic"
## [25] "frsthalf" "educ0" "evermarr"
# (i) Initial model estimation
model1 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
# Compare regular and robust standard errors
regular_se <- coeftest(model1)
robust_se <- coeftest(model1, vcov = vcovHC(model1, type = "HC1"))
# Create comparison table
se_comparison <- data.frame(
Coefficient = regular_se[,1],
"Regular SE" = regular_se[,2],
"Robust SE" = robust_se[,2],
"Difference" = robust_se[,2] - regular_se[,2]
)
cat("\n(i) Comparison of Standard Errors:\n")
##
## (i) Comparison of Standard Errors:
print(round(se_comparison, 4))
## Coefficient Regular.SE Robust.SE Difference
## (Intercept) -4.2225 0.2402 0.2439 0.0037
## age 0.3409 0.0165 0.0192 0.0027
## I(age^2) -0.0027 0.0003 0.0004 0.0001
## educ -0.0752 0.0063 0.0063 0.0000
## electric -0.3100 0.0690 0.0639 -0.0051
## urban -0.2000 0.0465 0.0455 -0.0010
# Looking at the structure of religious variables
cat("\nReligious variables in the dataset:\n")
##
## Religious variables in the dataset:
print(head(fertil2[, grep("relig|cath|prot", names(fertil2), value = TRUE)]))
## protest catholic
## 1 0 0
## 2 0 0
## 3 0 0
## 4 0 0
## 5 1 0
## 6 0 0
# (ii) Add religious dummy variables and test joint significance
model2 <- lm(children ~ age + I(age^2) + educ + electric + urban + spirit + protest + catholic, data = fertil2)
# Non-robust F-test
non_robust_f_test <- linearHypothesis(model2, c("protest = 0", "catholic = 0", "spirit = 0"))
cat("Non-robust F-test p-value:\n")
## Non-robust F-test p-value:
print(non_robust_f_test)
##
## Linear hypothesis test:
## protest = 0
## catholic = 0
## spirit = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + spirit +
## protest + catholic
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4352 9176.4
## 2 4349 9162.5 3 13.88 2.1961 0.08641 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Robust F-test
robust_f_test <- linearHypothesis(model2, c("protest = 0", "catholic = 0", "spirit = 0"), vcov = vcovHC(model2, type = "HC1"))
cat("Robust F-test p-value:\n")
## Robust F-test p-value:
print(robust_f_test)
##
## Linear hypothesis test:
## protest = 0
## catholic = 0
## spirit = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + spirit +
## protest + catholic
##
## Note: Coefficient covariance matrix supplied.
##
## Res.Df Df F Pr(>F)
## 1 4352
## 2 4349 3 2.1559 0.0911 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iii) Test heteroskedasticity with fitted values
fitted_vals <- fitted(model2)
residuals <- residuals(model2)
residuals_sq <- residuals^2
# Auxiliary regression (residuals_sq and fitted_vals come from model2's
# estimation sample, so the data argument is effectively ignored here)
aux_model <- lm(residuals_sq ~ fitted_vals + I(fitted_vals^2), data = fertil2)
summary(aux_model)
##
## Call:
## lm(formula = residuals_sq ~ fitted_vals + I(fitted_vals^2), data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.873 -1.290 -0.302 0.357 47.684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3126 0.1114 2.807 0.00503 **
## fitted_vals -0.1489 0.1018 -1.462 0.14381
## I(fitted_vals^2) 0.2668 0.0196 13.607 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.632 on 4355 degrees of freedom
## Multiple R-squared: 0.2501, Adjusted R-squared: 0.2497
## F-statistic: 726.1 on 2 and 4355 DF, p-value: < 2.2e-16
# Joint significance of fitted values
hetero_test <- linearHypothesis(aux_model, c("fitted_vals = 0", "I(fitted_vals^2) = 0"))
cat("Joint significance of regressors (heteroskedasticity test):\n")
## Joint significance of regressors (heteroskedasticity test):
print(hetero_test)
##
## Linear hypothesis test:
## fitted_vals = 0
## I(fitted_vals^2) = 0
##
## Model 1: restricted model
## Model 2: residuals_sq ~ fitted_vals + I(fitted_vals^2)
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4357 76589
## 2 4355 57436 2 19153 726.11 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) Practical importance of heteroskedasticity
# The p-value sits in row 2 of the anova-style table (row 1, the restricted
# model, has NA entries)
hetero_p_value <- hetero_test$`Pr(>F)`[2]
if (hetero_p_value < 0.05) {
  cat("Heteroskedasticity is statistically significant.\n")
} else {
  cat("Heteroskedasticity is not statistically significant.\n")
}
## Heteroskedasticity is statistically significant.
# Load required libraries
library(lmtest) # for resettest
library(wooldridge) # for ceosal2 dataset
# Load the data
data(ceosal2)
# Create log transformed variables
ceosal2$logsalary <- log(ceosal2$salary)
ceosal2$logsales <- log(ceosal2$sales)
ceosal2$logmktval <- log(ceosal2$mktval)
# Fit the original model
model1 <- lm(logsalary ~ logsales + logmktval + profmarg + ceoten + comten,
data = ceosal2)
# Fit the model with squared terms
ceosal2$ceoten2 <- ceosal2$ceoten^2
ceosal2$comten2 <- ceosal2$comten^2
model2 <- lm(logsalary ~ logsales + logmktval + profmarg + ceoten + comten +
ceoten2 + comten2,
data = ceosal2)
# Print R-squared values
cat("R-squared for original model:", summary(model1)$r.squared, "\n")
## R-squared for original model: 0.3525374
cat("R-squared for model with squared terms:", summary(model2)$r.squared, "\n")
## R-squared for model with squared terms: 0.3744746
# Perform RESET test, here augmenting with powers of each regressor
# (type = "regressor"); the textbook's RESET instead uses powers of the
# fitted values, which is resettest()'s default type = "fitted"
reset_result <- resettest(model1, power = 2:3, type = "regressor")
print(reset_result)
##
## RESET test
##
## data: model1
## RESET = 3.6308, df1 = 10, df2 = 161, p-value = 0.0002213
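# For comparison, the fitted-value form of RESET (a sketch, not run here):
resettest(model1, power = 2:3, type = "fitted")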
# If you want to examine the coefficients
summary(model1)
##
## Call:
## lm(formula = logsalary ~ logsales + logmktval + profmarg + ceoten +
## comten, data = ceosal2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5436 -0.2796 -0.0164 0.2857 1.9879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571977 0.253466 18.038 < 2e-16 ***
## logsales 0.187787 0.040003 4.694 5.46e-06 ***
## logmktval 0.099872 0.049214 2.029 0.04397 *
## profmarg -0.002211 0.002105 -1.050 0.29514
## ceoten 0.017104 0.005540 3.087 0.00236 **
## comten -0.009238 0.003337 -2.768 0.00626 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared: 0.3525, Adjusted R-squared: 0.3336
## F-statistic: 18.62 on 5 and 171 DF, p-value: 9.488e-15
summary(model2)
##
## Call:
## lm(formula = logsalary ~ logsales + logmktval + profmarg + ceoten +
## comten + ceoten2 + comten2, data = ceosal2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47661 -0.26615 -0.03878 0.27787 1.89344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.424e+00 2.656e-01 16.655 < 2e-16 ***
## logsales 1.857e-01 3.980e-02 4.665 6.25e-06 ***
## logmktval 1.018e-01 4.872e-02 2.089 0.038228 *
## profmarg -2.575e-03 2.088e-03 -1.233 0.219137
## ceoten 4.772e-02 1.416e-02 3.371 0.000929 ***
## comten -6.063e-03 1.189e-02 -0.510 0.610815
## ceoten2 -1.119e-03 4.814e-04 -2.324 0.021329 *
## comten2 -5.389e-05 2.528e-04 -0.213 0.831474
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4892 on 169 degrees of freedom
## Multiple R-squared: 0.3745, Adjusted R-squared: 0.3486
## F-statistic: 14.45 on 7 and 169 DF, p-value: 1.136e-14
Whether the failure of colleges to report campus crimes can be treated as exogenous sample selection depends on whether the decision to report (or not report) crimes is correlated with factors that influence the dependent variable (number of campus crimes) or the explanatory variable (student enrollment).
Sample selection is exogenous if the selection mechanism is unrelated to the dependent and independent variables in the model. In that case the failure to report crimes would not bias estimates of the relationship between campus crimes and enrollment, because the missing observations occur effectively at random.
For example, if colleges fail to report campus crimes because of administrative oversight or a lack of resources, and these factors are unrelated to the number of campus crimes or to enrollment, then the sample selection can be considered exogenous.
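A minimal simulation sketch of this point (hypothetical numbers, not campus data): when observations are dropped independently of crime and enrollment, the OLS slope is unaffected apart from a loss of precision.
# Exogenous selection: each college "reports" with probability 0.7,
# independently of everything in the model
set.seed(1)
n_sim <- 5000
enroll <- rlnorm(n_sim, meanlog = 8) # enrollment
crime <- 5 + 0.01 * enroll + rnorm(n_sim, 0, 20) # true slope: 0.01
reported <- runif(n_sim) < 0.7
coef(lm(crime ~ enroll))["enroll"] # full sample: close to 0.01
coef(lm(crime ~ enroll, subset = reported))["enroll"] # reporters only: also close to 0.01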
Chapter 9.C4
data("infmrt")
# Add a dummy variable for District of Columbia (assuming DC is observation 51)
infmrt$dc <- ifelse(row.names(infmrt) == "51", 1, 0)
# (i) Estimate the model including the DC dummy variable
model_with_dc <- lm(log(infmort) ~ pcinc + physic + dc, data = infmrt)
summary(model_with_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic + dc, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43481 -0.10338 0.00447 0.09096 0.39050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.456e+00 8.688e-02 28.264 < 2e-16 ***
## pcinc -3.078e-05 6.499e-06 -4.737 7.33e-06 ***
## physic 1.495e-03 2.729e-04 5.478 3.33e-07 ***
## dc -7.940e-02 1.664e-01 -0.477 0.634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1629 on 98 degrees of freedom
## Multiple R-squared: 0.2537, Adjusted R-squared: 0.2309
## F-statistic: 11.1 on 3 and 98 DF, p-value: 2.444e-06
# (ii) Compare with the model without the DC dummy variable
model_without_dc <- lm(log(infmort) ~ pcinc + physic, data = infmrt)
summary(model_without_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43432 -0.10235 0.00626 0.09337 0.39064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.448e+00 8.502e-02 28.794 < 2e-16 ***
## pcinc -3.026e-05 6.378e-06 -4.743 7.07e-06 ***
## physic 1.487e-03 2.713e-04 5.480 3.24e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1623 on 99 degrees of freedom
## Multiple R-squared: 0.252, Adjusted R-squared: 0.2369
## F-statistic: 16.67 on 2 and 99 DF, p-value: 5.744e-07
# Compare both models
anova(model_without_dc, model_with_dc)
## Analysis of Variance Table
##
## Model 1: log(infmort) ~ pcinc + physic
## Model 2: log(infmort) ~ pcinc + physic + dc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 99 2.6064
## 2 98 2.6004 1 0.0060446 0.2278 0.6342
# Load required packages
library(wooldridge) # for RDCHEM dataset
library(quantreg) # for LAD regression
## Loading required package: SparseM
# Load the data
data(rdchem)
# Convert sales to billions
rdchem$sales_bil <- rdchem$sales / 1000 # assuming sales is in millions
# Create squared sales term
rdchem$sales_bil_sq <- rdchem$sales_bil^2
# Find the company with highest sales
max_sales_company <- which.max(rdchem$sales_bil)
# Create two datasets: one with and one without the largest firm
data_with_outlier <- rdchem
data_without_outlier <- rdchem[-max_sales_company,]
# (i) OLS Estimation
# With outlier
ols_with <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg,
data = data_with_outlier)
# Without outlier
ols_without <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg,
data = data_without_outlier)
# (ii) LAD Estimation
# With outlier
lad_with <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg,
data = data_with_outlier, tau = 0.5)
# Without outlier
lad_without <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg,
data = data_without_outlier, tau = 0.5)
# Print results
cat("\nLargest firm sales (billions):", max(rdchem$sales_bil))
##
## Largest firm sales (billions): 39.709
cat("\n\nOLS Results With Outlier:\n")
##
##
## OLS Results With Outlier:
print(summary(ols_with))
##
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## data = data_with_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_bil 0.316611 0.138854 2.280 0.03041 *
## sales_bil_sq -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
cat("\nOLS Results Without Outlier:\n")
##
## OLS Results Without Outlier:
print(summary(ols_without))
##
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## data = data_without_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0843 -1.1354 -0.5505 0.7570 5.7783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.98352 0.71763 2.764 0.0102 *
## sales_bil 0.36062 0.23887 1.510 0.1427
## sales_bil_sq -0.01025 0.01308 -0.784 0.4401
## profmarg 0.05528 0.04579 1.207 0.2378
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.805 on 27 degrees of freedom
## Multiple R-squared: 0.1912, Adjusted R-squared: 0.1013
## F-statistic: 2.128 on 3 and 27 DF, p-value: 0.1201
cat("\nLAD Results With Outlier:\n")
##
## LAD Results With Outlier:
print(summary(lad_with))
##
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## tau = 0.5, data = data_with_outlier)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_bil 0.26346 -0.13508 0.75753
## sales_bil_sq -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
cat("\nLAD Results Without Outlier:\n")
##
## LAD Results Without Outlier:
print(summary(lad_without))
##
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## tau = 0.5, data = data_without_outlier)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 2.61047 0.58936 2.81404
## sales_bil -0.22364 -0.23542 0.87607
## sales_bil_sq 0.01681 -0.03201 0.02883
## profmarg 0.07594 0.00578 0.16392
# Create comparison table of coefficients
coef_comparison <- data.frame(
OLS_with = coef(ols_with),
OLS_without = coef(ols_without),
LAD_with = coef(lad_with),
LAD_without = coef(lad_without)
)
# Calculate percentage changes in coefficients
calc_pct_change <- function(with_outlier, without_outlier) {
return((without_outlier - with_outlier)/with_outlier * 100)
}
pct_change <- data.frame(
OLS_pct_change = calc_pct_change(coef(ols_with), coef(ols_without)),
LAD_pct_change = calc_pct_change(coef(lad_with), coef(lad_without))
)
cat("\nCoefficient Comparison:\n")
##
## Coefficient Comparison:
print(round(coef_comparison, 4))
## OLS_with OLS_without LAD_with LAD_without
## (Intercept) 2.0590 1.9835 1.4043 2.6105
## sales_bil 0.3166 0.3606 0.2635 -0.2236
## sales_bil_sq -0.0074 -0.0103 -0.0060 0.0168
## profmarg 0.0533 0.0553 0.1140 0.0759
cat("\nPercentage Changes in Coefficients When Removing Outlier:\n")
##
## Percentage Changes in Coefficients When Removing Outlier:
print(round(pct_change, 2))
## OLS_pct_change LAD_pct_change
## (Intercept) -3.66 85.89
## sales_bil 13.90 -184.89
## sales_bil_sq 38.72 -380.12
## profmarg 3.67 -33.39
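# One way to see why the OLS estimates move so much when the largest firm is
# dropped is to check leverage directly; a base-R sketch (not run above):
h <- hatvalues(ols_with)
which(h > 3 * mean(h)) # flags high-leverage observations; the sales outlier dominates
round(max(h), 3)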
Agree. While not always strictly true, time series observations may in some cases exhibit little to no autocorrelation, allowing us to treat them as approximately independent. In datasets where time dependence is weak or negligible, the independence assumption can hold as a reasonable simplification.
Agree. Under the first three time series Gauss-Markov assumptions (linearity in parameters, no perfect collinearity, and strict exogeneity, i.e. a zero conditional mean of the errors given the regressors in all periods), the OLS estimator is unbiased even for time series data: these assumptions ensure that the expected value of the estimator equals the true parameter value.
Disagree. Trending variables can still be used in regression analysis if the trend is handled properly: detrend the data (remove the trend component), include a time trend as an explanatory variable, or take first differences (or apply another transformation) to make the variable stationary. Ignoring trends can produce misleading, even spurious, inference, but these standard methods handle them; see the sketch just below.
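A small sketch of the second remedy with simulated data: by the Frisch-Waugh theorem, including a time trend in the regression yields the same slope as first detrending both series.
# Including a trend vs regressing detrended y on detrended x
set.seed(42)
trend <- 1:100
x_t <- 0.5 * trend + rnorm(100)
y_t <- 2 * x_t + 0.3 * trend + rnorm(100)
b_trend <- coef(lm(y_t ~ x_t + trend))["x_t"]
b_detrend <- coef(lm(resid(lm(y_t ~ trend)) ~ resid(lm(x_t ~ trend))))[2]
all.equal(unname(b_trend), unname(b_detrend)) # TRUE by Frisch-Waugh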
Agree. Seasonality is typically not a concern with annual data, because aggregating over the year averages out within-year seasonal fluctuations. As a result, seasonal patterns are unlikely to appear in annual time series.
# Load required packages
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(seasonal)
library(car) # for linearHypothesis(), used in the seasonal-dummy test below
# Create simulated quarterly data (1990-2023)
set.seed(123)
n <- 136 # 34 years * 4 quarters
# Generate time index
time <- 1:n
quarters <- rep(1:4, length.out = n)
# Create dummy variables for quarters
Q2 <- as.numeric(quarters == 2)
Q3 <- as.numeric(quarters == 3)
Q4 <- as.numeric(quarters == 4)
# Generate trending variables with seasonal components
# Interest rates: declining trend with seasonal variation
interest_rates <- 10 - 0.02*time +
0.5*sin(2*pi*time/4) + # Seasonal component
rnorm(n, 0, 0.5) # Random noise
# Real per capita income: upward trend with seasonal variation
income <- 30000 + 100*time +
2000*sin(2*pi*time/4) + # Seasonal component
rnorm(n, 0, 1000) # Random noise
# Housing starts with trend, seasonality, and dependence on other variables
housing_starts <- 1000 - 100*interest_rates + 0.01*income -
  200*Q2 + 300*Q3 - 400*Q4 + # Seasonal effects
  50*time + # Time trend
  rnorm(n, 0, 200) # Random noise
# Create data frame
housing_data <- data.frame(
date = seq(as.Date("1990-01-01"), by = "quarter", length.out = n),
housing_starts = housing_starts,
interest_rates = interest_rates,
income = income,
quarter = quarters
)
# Estimate the model
model <- lm(housing_starts ~ time + Q2 + Q3 + Q4 +
interest_rates + income, data = housing_data)
# Print summary
cat("Model Summary:\n")
## Model Summary:
print(summary(model))
##
## Call:
## lm(formula = housing_starts ~ time + Q2 + Q3 + Q4 + interest_rates +
## income, data = housing_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -545.7 -130.7 4.2 122.6 535.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.061e+02 6.717e+02 0.753 0.45261
## time 5.142e+01 2.068e+00 24.864 < 2e-16 ***
## Q2 -1.791e+02 6.382e+01 -2.806 0.00579 **
## Q3 3.775e+02 9.342e+01 4.041 9.09e-05 ***
## Q4 -3.971e+02 6.624e+01 -5.995 1.91e-08 ***
## interest_rates -2.930e+01 3.871e+01 -0.757 0.45041
## income 3.598e-03 1.766e-02 0.204 0.83888
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 201 on 129 degrees of freedom
## Multiple R-squared: 0.9912, Adjusted R-squared: 0.9908
## F-statistic: 2414 on 6 and 129 DF, p-value: < 2.2e-16
# Test for seasonal effects
seasonal_test <- linearHypothesis(model, c("Q2=0", "Q3=0", "Q4=0"))
cat("\nTest for Joint Significance of Seasonal Dummies:\n")
##
## Test for Joint Significance of Seasonal Dummies:
print(seasonal_test)
##
## Linear hypothesis test:
## Q2 = 0
## Q3 = 0
## Q4 = 0
##
## Model 1: restricted model
## Model 2: housing_starts ~ time + Q2 + Q3 + Q4 + interest_rates + income
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 132 14793296
## 2 129 5214314 3 9578982 78.993 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot actual vs fitted values
plot(housing_data$date, housing_data$housing_starts,
type = "l", col = "blue",
xlab = "Time", ylab = "Housing Starts",
main = "Housing Starts: Actual vs Fitted Values")
lines(housing_data$date, fitted(model), col = "red")
legend("topleft",
legend = c("Actual", "Fitted"),
col = c("blue", "red"),
lty = 1)
# Plot seasonal pattern
seasonal_means <- aggregate(housing_starts ~ quarter,
data = housing_data, mean)
barplot(seasonal_means$housing_starts,
names.arg = c("Q1", "Q2", "Q3", "Q4"),
main = "Average Housing Starts by Quarter",
ylab = "Housing Starts")
# Diagnostic plots
par(mfrow = c(2,2))
# Residuals vs Time
plot(housing_data$date, residuals(model),
type = "l",
xlab = "Time",
ylab = "Residuals",
main = "Residuals vs Time")
abline(h = 0, col = "red", lty = 2)
# Residuals vs Fitted
plot(fitted(model), residuals(model),
xlab = "Fitted values",
ylab = "Residuals",
main = "Residuals vs Fitted")
abline(h = 0, col = "red", lty = 2)
# ACF of residuals
acf(residuals(model), main = "ACF of Residuals")
# PACF of residuals
pacf(residuals(model), main = "PACF of Residuals")
Chapter 10.C1
# Load the intdef dataset
data("intdef")
# Define a dummy variable for years after 1979
intdef$dummy <- ifelse(intdef$year > 1979, 1, 0)
# Model specification: Interest rate as a function of the dummy variable
model_policy_shift <- lm(i3 ~ dummy, data = intdef)
# Summary of the model
summary(model_policy_shift)
##
## Call:
## lm(formula = i3 ~ dummy, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1979 -1.9449 -0.4569 1.3616 7.8121
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.9259 0.4692 8.367 2.53e-11 ***
## dummy 2.2920 0.7167 3.198 0.00232 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.654 on 54 degrees of freedom
## Multiple R-squared: 0.1592, Adjusted R-squared: 0.1437
## F-statistic: 10.23 on 1 and 54 DF, p-value: 0.002317
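# A natural extension (a sketch, not the original run): control for
# inflation and the deficit, which the chapter's interest-rate equation
# includes, so the dummy captures the post-1979 shift net of those factors
model_policy_shift2 <- lm(i3 ~ inf + def + dummy, data = intdef)
summary(model_policy_shift2)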
# Load required packages
library(wooldridge) # for VOLAT dataset
library(lmtest) # for statistical tests
library(car) # for additional regression diagnostics
# Load the data
data(volat)
# (i) First look at correlations to get a sense of relationships
# (volat has missing values, so cor() needs complete cases or it returns NA)
cor_matrix <- cor(volat[c("rsp500", "pcip", "i3")], use = "complete.obs")
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(round(cor_matrix, 4))
# (ii) Estimate the OLS regression
model <- lm(rsp500 ~ pcip + i3, data = volat)
# Print detailed results
cat("\nRegression Results:\n")
##
## Regression Results:
print(summary(model))
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
# Calculate confidence intervals
cat("\n95% Confidence Intervals:\n")
##
## 95% Confidence Intervals:
print(confint(model))
## 2.5 % 97.5 %
## (Intercept) 12.4103555 25.2757570
## pcip -0.2177506 0.2905842
## i3 -2.4238093 -0.2995680
# Calculate some additional statistics
n <- nobs(model) # observations actually used (one row dropped for missingness)
rss <- sum(residuals(model)^2)
rmse <- sqrt(rss/model$df.residual)
# Print additional model diagnostics
cat("\nModel Diagnostics:")
##
## Model Diagnostics:
cat("\nNumber of observations:", n)
##
## Number of observations: 557
cat("\nRoot MSE:", round(rmse, 4))
##
## Root MSE: 40.1298
cat("\nF-statistic:", round(summary(model)$fstatistic[1], 4))
##
## F-statistic: 3.3338
cat("\nF-statistic p-value:", round(pf(summary(model)$fstatistic[1],
summary(model)$fstatistic[2],
summary(model)$fstatistic[3],
lower.tail = FALSE), 4))
##
## F-statistic p-value: 0.0364
# (iii) Check statistical significance
# Already included in the summary output, but we can highlight it
coef_summary <- summary(model)$coefficients
cat("\n\nStatistical Significance Tests:")
##
##
## Statistical Significance Tests:
cat("\nVariable Coefficient Std.Error t-value p-value")
##
## Variable Coefficient Std.Error t-value p-value
cat("\n----------------------------------------------------\n")
##
## ----------------------------------------------------
for(i in 1:nrow(coef_summary)) {
cat(sprintf("%-10s %10.4f %10.4f %8.4f %8.4f\n",
rownames(coef_summary)[i],
coef_summary[i,1],
coef_summary[i,2],
coef_summary[i,3],
coef_summary[i,4]))
}
## (Intercept) 18.8431 3.2749 5.7538 0.0000
## pcip 0.0364 0.1294 0.2814 0.7785
## i3 -1.3617 0.5407 -2.5183 0.0121
# (iv) Test for predictability
# Calculate R-squared and adjusted R-squared
cat("\nPredictability Measures:")
##
## Predictability Measures:
cat("\nR-squared:", round(summary(model)$r.squared, 4))
##
## R-squared: 0.0119
cat("\nAdjusted R-squared:", round(summary(model)$adj.r.squared, 4))
##
## Adjusted R-squared: 0.0083
# Optional: Add Breusch-Godfrey test for serial correlation
bg_test <- bgtest(model, order = 1)
cat("\n\nBreusch-Godfrey test for serial correlation:")
##
##
## Breusch-Godfrey test for serial correlation:
print(bg_test)
##
## Breusch-Godfrey test for serial correlation of order up to 1
##
## data: model
## LM test = 31.644, df = 1, p-value = 1.852e-08
# Load required packages
library(wooldridge) # for CONSUMP dataset
library(lmtest) # for statistical tests
library(car) # for linear hypothesis testing
# Load the data
data(consump)
# Calculate growth in consumption (gc) and growth in income (gy)
consump$gc <- c(NA, diff(log(consump$c)))
consump$gy <- c(NA, diff(log(consump$y)))
# Create lagged variables
consump$gc_1 <- c(NA, head(consump$gc, -1)) # gc(t-1)
consump$gy_1 <- c(NA, head(consump$gy, -1)) # gy(t-1)
consump$i3_1 <- c(NA, head(consump$i3, -1)) # i3(t-1)
consump$inf_1 <- c(NA, head(consump$inf, -1)) # inf(t-1)
# Remove NA values
consump_clean <- na.omit(consump)
# (i) Test PIH with simple regression
model1 <- lm(gc ~ gc_1, data = consump_clean)
cat("\nPart (i) - Basic PIH Test:")
##
## Part (i) - Basic PIH Test:
cat("\n------------------------")
##
## ------------------------
cat("\nNull Hypothesis: β₁ = 0 (Growth in consumption is unpredictable)")
##
## Null Hypothesis: β₁ = 0 (Growth in consumption is unpredictable)
cat("\nAlternative Hypothesis: β₁ ≠ 0 (Growth in consumption is predictable)\n")
##
## Alternative Hypothesis: β₁ ≠ 0 (Growth in consumption is predictable)
print(summary(model1))
##
## Call:
## lm(formula = gc ~ gc_1, data = consump_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0279582 -0.0063356 -0.0008016 0.0072396 0.0200893
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.011674 0.003882 3.007 0.00510 **
## gc_1 0.440607 0.158791 2.775 0.00915 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01176 on 32 degrees of freedom
## Multiple R-squared: 0.1939, Adjusted R-squared: 0.1688
## F-statistic: 7.699 on 1 and 32 DF, p-value: 0.009146
# (ii) Extended regression with additional variables
model2 <- lm(gc ~ gc_1 + gy_1 + i3_1 + inf_1, data = consump_clean)
cat("\nPart (ii) - Extended Model:")
##
## Part (ii) - Extended Model:
cat("\n------------------------\n")
##
## ------------------------
print(summary(model2))
##
## Call:
## lm(formula = gc ~ gc_1 + gy_1 + i3_1 + inf_1, data = consump_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.011978 -0.003976 -0.000105 0.005128 0.014391
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0114642 0.0054381 2.108 0.0438 *
## gc_1 0.0649123 0.1212864 0.535 0.5966
## gy_1 0.5112779 0.0849732 6.017 1.52e-06 ***
## i3_1 -0.0003691 0.0007396 -0.499 0.6215
## inf_1 -0.0002166 0.0006745 -0.321 0.7505
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007612 on 29 degrees of freedom
## Multiple R-squared: 0.6939, Adjusted R-squared: 0.6517
## F-statistic: 16.43 on 4 and 29 DF, p-value: 3.881e-07
# Individual significance tests
cat("\nIndividual Significance Tests (5% level):")
##
## Individual Significance Tests (5% level):
coef_summary <- summary(model2)$coefficients
sig_vars <- coef_summary[, "Pr(>|t|)"] < 0.05
cat("\nVariables significant at 5% level:")
##
## Variables significant at 5% level:
print(sig_vars)
## (Intercept) gc_1 gy_1 i3_1 inf_1
## TRUE FALSE TRUE FALSE FALSE
# (iii) Compare p-value for gc_1 between models
cat("\nPart (iii) - Comparison of gc_1 coefficient:")
##
## Part (iii) - Comparison of gc_1 coefficient:
cat("\n----------------------------------------")
##
## ----------------------------------------
cat("\np-value in simple model:", summary(model1)$coefficients["gc_1", "Pr(>|t|)"])
##
## p-value in simple model: 0.009146085
cat("\np-value in extended model:", summary(model2)$coefficients["gc_1", "Pr(>|t|)"])
##
## p-value in extended model: 0.5965932
# (iv) F-test for joint significance
f_test <- linearHypothesis(model2,
c("gc_1 = 0", "gy_1 = 0", "i3_1 = 0", "inf_1 = 0"))
cat("\n\nPart (iv) - Joint Significance Test:")
##
##
## Part (iv) - Joint Significance Test:
cat("\n--------------------------------\n")
##
## --------------------------------
print(f_test)
##
## Linear hypothesis test:
## gc_1 = 0
## gy_1 = 0
## i3_1 = 0
## inf_1 = 0
##
## Model 1: restricted model
## Model 2: gc ~ gc_1 + gy_1 + i3_1 + inf_1
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 0.0054896
## 2 29 0.0016804 4 0.0038092 16.434 3.881e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
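# Because the restriction sets every slope coefficient to zero, this F
# statistic coincides with the overall F statistic of model2 (16.43 on 4 and
# 29 df), which can be confirmed directly:
summary(model2)$fstatistic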
# Additional model diagnostics
cat("\nModel Diagnostics:")
##
## Model Diagnostics:
cat("\nR-squared (Model 1):", round(summary(model1)$r.squared, 4))
##
## R-squared (Model 1): 0.1939
cat("\nR-squared (Model 2):", round(summary(model2)$r.squared, 4))
##
## R-squared (Model 2): 0.6939
cat("\nAdjusted R-squared (Model 1):", round(summary(model1)$adj.r.squared, 4))
##
## Adjusted R-squared (Model 1): 0.1688
cat("\nAdjusted R-squared (Model 2):", round(summary(model2)$adj.r.squared, 4))
##
## Adjusted R-squared (Model 2): 0.6517
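# An equivalent nested-model check (sketch): test whether the three variables
# added in part (ii) jointly improve on the simple AR(1) model. Both models
# are fit on the same sample (consump_clean), so anova() applies directly.
anova(model1, model2)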
# Load required packages
library(wooldridge) # for MINWAGE dataset
library(lmtest) # for statistical tests
library(car) # for additional regression diagnostics
# Load the data
data(minwage)
# Create lagged variables
minwage$gwage232_1 <- c(NA, head(minwage$gwage232, -1)) # lagged wage growth
minwage$gemp232_1 <- c(NA, head(minwage$gemp232, -1)) # lagged employment growth
# Remove NA values for autocorrelation test
gwage232_clean <- na.omit(minwage$gwage232)
# (i) First order autocorrelation test
acf_test <- acf(gwage232_clean, plot = FALSE, lag.max = 1)
cat("\nPart (i) - First Order Autocorrelation:")
##
## Part (i) - First Order Autocorrelation:
cat("\n------------------------------------")
##
## ------------------------------------
cat("\nAutocorrelation coefficient:", round(acf_test$acf[2], 4))
##
## Autocorrelation coefficient: -0.0349
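# Cross-check (sketch): the lag-1 autocorrelation is approximately the sample
# correlation between the series and its own one-period lag (acf() uses a
# slightly different normalization, so the two can differ marginally).
cor(gwage232_clean[-1], head(gwage232_clean, -1))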
# Remove NA values for regression analysis
minwage_clean <- na.omit(minwage)
# Simple AR(1) model for comparison
ar1_model <- lm(gwage232 ~ gwage232_1, data = minwage_clean)
cat("\n\nAR(1) Model Results:\n")
##
##
## AR(1) Model Results:
print(summary(ar1_model))
##
## Call:
## lm(formula = gwage232 ~ gwage232_1, data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.025657 -0.003851 -0.001987 0.003913 0.071671
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0036781 0.0004079 9.017 <2e-16 ***
## gwage232_1 -0.0231642 0.0408427 -0.567 0.571
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00932 on 597 degrees of freedom
## Multiple R-squared: 0.0005385, Adjusted R-squared: -0.001136
## F-statistic: 0.3217 on 1 and 597 DF, p-value: 0.5708
# (ii) Dynamic model
model1 <- lm(gwage232 ~ gwage232_1 + gmwage + gcpi, data = minwage_clean)
cat("\nPart (ii) - Dynamic Model:")
##
## Part (ii) - Dynamic Model:
cat("\n----------------------\n")
##
## ----------------------
print(summary(model1))
##
## Call:
## lm(formula = gwage232 ~ gwage232_1 + gmwage + gcpi, data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044649 -0.004114 -0.001262 0.004481 0.041568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0023648 0.0004295 5.506 5.45e-08 ***
## gwage232_1 -0.0684816 0.0343986 -1.991 0.0470 *
## gmwage 0.1517511 0.0095115 15.955 < 2e-16 ***
## gcpi 0.2586795 0.0858602 3.013 0.0027 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007775 on 595 degrees of freedom
## Multiple R-squared: 0.3068, Adjusted R-squared: 0.3033
## F-statistic: 87.79 on 3 and 595 DF, p-value: < 2.2e-16
# (iii) Add lagged employment growth
model2 <- lm(gwage232 ~ gwage232_1 + gmwage + gcpi + gemp232_1,
data = minwage_clean)
cat("\nPart (iii) - Model with Lagged Employment:")
##
## Part (iii) - Model with Lagged Employment:
cat("\n-------------------------------------\n")
##
## -------------------------------------
print(summary(model2))
##
## Call:
## lm(formula = gwage232 ~ gwage232_1 + gmwage + gcpi + gemp232_1,
## data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043900 -0.004316 -0.000955 0.004255 0.042430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0023960 0.0004254 5.633 2.74e-08 ***
## gwage232_1 -0.0656875 0.0340720 -1.928 0.054344 .
## gmwage 0.1525470 0.0094213 16.192 < 2e-16 ***
## gcpi 0.2536899 0.0850342 2.983 0.002968 **
## gemp232_1 0.0606620 0.0169691 3.575 0.000379 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007699 on 594 degrees of freedom
## Multiple R-squared: 0.3214, Adjusted R-squared: 0.3169
## F-statistic: 70.34 on 4 and 594 DF, p-value: < 2.2e-16
# (iv) Model without lags for comparison
model3 <- lm(gwage232 ~ gmwage + gcpi, data = minwage_clean)
cat("\nPart (iv) - Comparison of gmwage coefficients:")
##
## Part (iv) - Comparison of gmwage coefficients:
cat("\n----------------------------------------")
##
## ----------------------------------------
cat("\nCoefficient in model without lags:",
round(coef(model3)["gmwage"], 4))
##
## Coefficient in model without lags: 0.1506
cat("\nCoefficient in full model:",
round(coef(model2)["gmwage"], 4))
##
## Coefficient in full model: 0.1525
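# The shift in the gmwage coefficient from adding the lags can be computed
# directly; it is on the order of 0.002, i.e. negligible.
gmwage_shift <- coef(model2)["gmwage"] - coef(model3)["gmwage"]
cat("\nShift in gmwage coefficient:", round(gmwage_shift, 4))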
# (v) Regression of gmwage on lagged variables
model4 <- lm(gmwage ~ gwage232_1 + gemp232_1, data = minwage_clean)
cat("\n\nPart (v) - Auxiliary Regression:")
##
##
## Part (v) - Auxiliary Regression:
cat("\n----------------------------\n")
##
## ----------------------------
print(summary(model4))
##
## Call:
## lm(formula = gmwage ~ gwage232_1 + gemp232_1, data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01987 -0.00511 -0.00385 -0.00290 0.62191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003487 0.001465 2.380 0.0176 *
## gwage232_1 0.212051 0.146727 1.445 0.1489
## gemp232_1 -0.042776 0.073749 -0.580 0.5621
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03347 on 596 degrees of freedom
## Multiple R-squared: 0.004117, Adjusted R-squared: 0.0007754
## F-statistic: 1.232 on 2 and 596 DF, p-value: 0.2924
# Additional diagnostics
cat("\nModel Comparisons - R-squared values:")
##
## Model Comparisons - R-squared values:
cat("\nModel without lags:", round(summary(model3)$r.squared, 4))
##
## Model without lags: 0.3022
cat("\nDynamic model:", round(summary(model1)$r.squared, 4))
##
## Dynamic model: 0.3068
cat("\nFull model with employment:", round(summary(model2)$r.squared, 4))
##
## Full model with employment: 0.3214
cat("\nAuxiliary regression:", round(summary(model4)$r.squared, 4))
##
## Auxiliary regression: 0.0041
# F-test for joint significance of lagged variables
f_test <- linearHypothesis(model2,
c("gwage232_1 = 0", "gemp232_1 = 0"))
cat("\n\nF-test for joint significance of lagged variables:\n")
##
##
## F-test for joint significance of lagged variables:
print(f_test)
##
## Linear hypothesis test:
## gwage232_1 = 0
## gemp232_1 = 0
##
## Model 1: restricted model
## Model 2: gwage232 ~ gwage232_1 + gmwage + gcpi + gemp232_1
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 596 0.036206
## 2 594 0.035209 2 0.00099708 8.4107 0.0002501 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
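# Optional diagnostic (sketch): because model2 contains a lagged dependent
# variable, the Durbin-Watson test is biased toward 2, so a Breusch-Godfrey
# test is the appropriate check for remaining serial correlation.
bgtest(model2, order = 1)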
# Load necessary libraries
library(wooldridge) # For the dataset
library(tseries) # For ARCH/GARCH modeling
library(lmtest) # For diagnostic tests
library(dplyr) # For data manipulation
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the NYSE dataset
data("nyse", package = "wooldridge")
# Part (i) - Estimate the OLS model and obtain squared OLS residuals
# Note: lag() here resolves to dplyr::lag (attached above), which shifts the
# vector one row; stats::lag would not work inside lm() on a plain vector.
ols_model <- lm(return ~ lag(return, 1), data = nyse) # AR(1): return_t on return_{t-1}
summary(ols_model)
##
## Call:
## lm(formula = return ~ lag(return, 1), data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.261 -1.302 0.098 1.316 8.065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17963 0.08074 2.225 0.0264 *
## lag(return, 1) 0.05890 0.03802 1.549 0.1218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.11 on 687 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.003481, Adjusted R-squared: 0.00203
## F-statistic: 2.399 on 1 and 687 DF, p-value: 0.1218
# Squared residuals
residuals_squared <- residuals(ols_model)^2
# Compute average, minimum, and maximum of squared residuals
avg_resid_sq <- mean(residuals_squared, na.rm = TRUE)
min_resid_sq <- min(residuals_squared, na.rm = TRUE)
max_resid_sq <- max(residuals_squared, na.rm = TRUE)
cat("Average Squared Residuals:", avg_resid_sq, "\n")
## Average Squared Residuals: 4.440839
cat("Minimum Squared Residuals:", min_resid_sq, "\n")
## Minimum Squared Residuals: 7.35465e-06
cat("Maximum Squared Residuals:", max_resid_sq, "\n")
## Maximum Squared Residuals: 232.8946
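# Consistency check (sketch): the average squared residual is RSS/n, while
# the squared residual standard error is RSS/(n - 2), so rescaling by
# n/(n - 2) should approximately reproduce sigma^2 (about 2.11^2 = 4.45 here).
avg_resid_sq * length(residuals_squared) / df.residual(ols_model)
summary(ols_model)$sigma^2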
# Part (ii) - Heteroscedasticity model
# Fit the model: Var(u_t | return_{t-1}) = δ0 + δ1*return_{t-1} + δ2*return_{t-1}^2
# Create lagged variables
nyse <- nyse %>%
mutate(lagged_return = lag(return, 1),
lagged_return_squared = lagged_return^2)
# Remove rows with NA values caused by lagging
nyse_clean <- nyse %>%
filter(!is.na(lagged_return) & !is.na(lagged_return_squared))
# Fit the heteroscedasticity model using the cleaned dataset
hetero_model <- lm(residuals_squared ~ lagged_return + lagged_return_squared, data = nyse_clean)
summary(hetero_model)
##
## Call:
## lm(formula = residuals_squared ~ lagged_return + lagged_return_squared,
## data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.459 -3.011 -1.975 0.676 221.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25734 0.44085 7.389 4.32e-13 ***
## lagged_return -0.78946 0.19569 -4.034 6.09e-05 ***
## lagged_return_squared 0.29666 0.03552 8.351 3.75e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared: 0.1303, Adjusted R-squared: 0.1278
## F-statistic: 51.4 on 2 and 686 DF, p-value: < 2.2e-16
# Extract coefficients and R-squared
coefficients <- coef(hetero_model)
r_squared <- summary(hetero_model)$r.squared
adj_r_squared <- summary(hetero_model)$adj.r.squared
cat("Coefficients:\n", coefficients, "\n")
## Coefficients:
## 3.257337 -0.7894616 0.2966573
cat("R-squared:", r_squared, "\n")
## R-squared: 0.1303208
cat("Adjusted R-squared:", adj_r_squared, "\n")
## Adjusted R-squared: 0.1277853
# Part (iii) - Sketch the conditional variance
# Plot the conditional variance as a function of the lagged return
plot(
nyse_clean$lagged_return,
fitted(hetero_model),
main = "Conditional Variance vs. Lagged Return",
xlab = "Lagged Return",
ylab = "Conditional Variance",
pch = 20,
col = "blue"
)
# Find the lagged return value for the smallest variance
min_variance_index <- which.min(fitted(hetero_model))
cat("Smallest Variance at Lagged Return:", nyse_clean$lagged_return[min_variance_index], "\n")
## Smallest Variance at Lagged Return: 1.321984
cat("Minimum Variance:", min(fitted(hetero_model), na.rm = TRUE), "\n")
## Minimum Variance: 2.732132
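# The fitted variance is quadratic in the lagged return, so the minimizing
# value can also be found analytically at return_{t-1} = -δ1/(2*δ2):
delta <- coef(hetero_model)
vertex <- unname(-delta["lagged_return"] / (2 * delta["lagged_return_squared"]))
cat("Analytic minimizer:", round(vertex, 4), "\n") # about 1.33, matching the grid minimum above
cat("Variance at the minimizer:", delta[1] + delta[2] * vertex + delta[3] * vertex^2, "\n")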
# Remove rows with NA values from the dataset to ensure compatibility with garch()
nyse_clean_arch <- nyse %>%
filter(!is.na(return))
# Part (v) - Fit ARCH(1) model
arch1_model <- garch(nyse_clean_arch$return, order = c(1, 0)) # ARCH(1)
##
## ***** ESTIMATION WITH ANALYTICAL GRADIENT *****
##
##
## I INITIAL X(I) D(I)
##
## 1 4.247682e+00 1.000e+00
## 2 5.000000e-02 1.000e+00
##
## IT NF F RELDF PRELDF RELDX STPPAR D*STEP NPRELDF
## 0 1 8.621e+02
## 1 5 8.621e+02 4.76e-06 9.48e-06 5.2e-04 4.0e+02 4.5e-03 1.91e-03
## 2 13 8.621e+02 1.32e-11 9.44e-11 1.9e-06 2.8e+00 2.3e-05 9.99e-09
## 3 16 8.621e+02 1.23e-12 4.00e-11 8.9e-07 7.0e+00 7.9e-06 1.00e-08
## 4 30 8.621e+02 1.32e-16 4.02e-16 9.7e-12 3.6e+06 8.2e-11 1.04e-08
## 5 31 8.621e+02 1.32e-16 8.04e-16 1.9e-11 1.9e+06 1.6e-10 1.13e-08
## 6 36 8.621e+02 -9.23e-16 3.74e-19 9.0e-15 4.0e+09 7.6e-14 1.13e-08
##
## ***** FALSE CONVERGENCE *****
##
## FUNCTION 8.621095e+02 RELDX 8.989e-15
## FUNC. EVALS 36 GRAD. EVALS 6
## PRELDF 3.742e-19 NPRELDF 1.134e-08
##
## I FINAL X(I) D(I) G(I)
##
## 1 4.248635e+00 1.000e+00 4.216e-03
## 2 5.441568e-02 1.000e+00 -1.398e-04
## Warning in garch(nyse_clean_arch$return, order = c(1, 0)): singular information
summary(arch1_model)
##
## Call:
## garch(x = nyse_clean_arch$return, order = c(1, 0))
##
## Model:
## GARCH(1,0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2283 -0.5195 0.1573 0.7268 3.9858
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## a0 4.24863 NA NA NA
## b1 0.05442 NA NA NA
##
## Diagnostic Tests:
## Jarque Bera Test
##
## data: Residuals
## X-squared = 785.48, df = 2, p-value < 2.2e-16
##
##
## Box-Ljung test
##
## data: Squared.Residuals
## X-squared = 87.848, df = 1, p-value < 2.2e-16
# Extract AIC for ARCH(1)
arch1_aic <- AIC(arch1_model)
cat("ARCH(1) AIC:", arch1_aic, "\n")
## ARCH(1) AIC: 2994.516
# Display model coefficients
cat("ARCH(1) model coefficients:\n", coef(arch1_model), "\n")
## ARCH(1) model coefficients:
## 4.248635 0.05441568
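# The singular-information warning means garch() could not deliver standard
# errors here. A simple alternative, and the approach used in Wooldridge's
# text: estimate the ARCH(1) equation by OLS, regressing the squared OLS
# residuals on their own one-period lag.
u2 <- residuals_squared
u2_1 <- c(NA, head(u2, -1))
arch_ols <- lm(u2 ~ u2_1)
summary(arch_ols)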