#####Chapter7-1
library(wooldridge)
data("sleep75")
#i
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## I(age^2) 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
coef_male <- summary(model)$coefficients["male", ]
coef_male
## Estimate Std. Error t value Pr(>|t|)
## 87.75242861 34.32616153 2.55642999 0.01078518
#The coefficient on male is 87.75: holding work time, education, and age fixed, men are estimated to sleep about 88 minutes more per week than women. The p-value (0.0108) indicates this difference is statistically significant at the 5% level.
#ii Coefficient and p-value for 'totwrk'
coef_totwrk <- summary(model)$coefficients["totwrk", ]
coef_totwrk
## Estimate Std. Error t value Pr(>|t|)
## -1.634232e-01 1.813210e-02 -9.012918e+00 1.891272e-18
#The coefficient for totwrk is -0.163, indicating that each additional minute worked reduces sleep by 0.163 minutes. The p-value (1.89e-18) confirms this is significant.
#iii Perform an F-test to jointly test age and age^2
age_joint_test <- car::linearHypothesis(model, c("age = 0", "I(age^2) = 0")) # store the test under its own name rather than overwriting the function name
age_joint_test
##
## Linear hypothesis test:
## age = 0
## I(age^2) = 0
##
## Model 1: restricted model
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 702 122631662
## 2 700 122147777 2 483885 1.3865 0.2506
# The F-test for the joint significance of age and age^2 gives a p-value of 0.2506, so we fail to reject the null hypothesis; there is no evidence that age (in this quadratic form) affects sleep once the other variables are controlled for.
#####Chapter7-3
library(wooldridge)
data("gpa2")
#(i) Fit the regression model
model_gpa2 <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)
summary(model_gpa2)
##
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black,
## data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## I(hsize^2) -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## female:black 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
coef_hsiz2 <- summary(model_gpa2)$coefficients["I(hsize^2)", ]
coef_hsiz2
## Estimate Std. Error t value Pr(>|t|)
## -2.194828e+00 5.271635e-01 -4.163468e+00 3.198417e-05
b1 <- coef(model_gpa2)["hsize"]
b2 <- coef(model_gpa2)["I(hsize^2)"]
optimal_hsize <- -b1 / (2 * b2)
optimal_hsize
## hsize
## 4.39603
# The quadratic term I(hsize^2) is highly significant (p = 3.20e-05) with a negative coefficient (-2.19), so the relationship between class size and SAT is an inverted U rather than linear. The implied optimal high school (graduating class) size is -b1/(2*b2) ≈ 4.40; since hsize is measured in hundreds, that is roughly 440 students.
#(ii) Coefficient and p-value for `female`
coef_female <- summary(model_gpa2)$coefficients["female", ]
coef_female
## Estimate Std. Error t value Pr(>|t|)
## -4.509145e+01 4.291063e+00 -1.050822e+01 1.656301e-25
#Holding hsize fixed, the estimated difference in SAT scores between nonblack females and nonblack males is -45.09, as indicated by the coefficient for female. The p-value for this coefficient is extremely low (<2e-16), meaning this difference is statistically significant.
#(iii) Coefficient and p-value for `black`
coef_black <- summary(model_gpa2)$coefficients["black", ]
coef_black
## Estimate Std. Error t value Pr(>|t|)
## -1.698126e+02 1.271315e+01 -1.335725e+01 7.140387e-40
#The estimated difference in SAT scores between black males and nonblack males, holding hsize and gender fixed, is -169.81 (the coefficient on black). Its t statistic is about -13.4 with a p-value of essentially zero (7.1e-40), so the gap is strongly statistically significant.
#(iv) Calculate the difference: female + black + female:black
library(car)
## Loading required package: carData
library(carData)
coef_female_black <- coef(model_gpa2)["female"] + coef(model_gpa2)["black"] + coef(model_gpa2)["female:black"]
coef_female_black
## female
## -152.5977
linearHypothesis(model_gpa2, c("female + black + female:black = 0"))
##
## Linear hypothesis test:
## female + black + female:black = 0
##
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4132 75969420
## 2 4131 73479121 1 2490298 140 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The combined coefficient female + black + female:black equals -152.60, which is the estimated SAT gap between black females and the base group, nonblack males; the F-test of female + black + female:black = 0 rejects decisively (p < 2.2e-16). The gap between black females and nonblack females is instead given by black + female:black = -169.81 + 62.31 ≈ -107.51, which can be tested the same way, as in the sketch below.
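# A possible sketch (not part of the original output) of the contrast between black females and
# nonblack females, using the same linearHypothesis() machinery as above:
coef(model_gpa2)["black"] + coef(model_gpa2)["female:black"] # estimated gap, about -107.5
linearHypothesis(model_gpa2, "black + female:black = 0")     # test of that single contrast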
#####Chapter7-C1
library(wooldridge)
data("gpa1")
#(i) Fit the base model (without mothcoll and fathcoll)
model_base <- lm(colGPA ~ hsGPA + ACT + PC, data = gpa1)
# Fit the extended model (with mothcoll and fathcoll)
model_extended <- lm(colGPA ~ hsGPA + ACT + PC + mothcoll + fathcoll, data = gpa1)
summary(model_base) # Base model
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7901 -0.2622 -0.0107 0.2334 0.7570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263520 0.333125 3.793 0.000223 ***
## hsGPA 0.447242 0.093647 4.776 4.54e-06 ***
## ACT 0.008659 0.010534 0.822 0.412513
## PC 0.157309 0.057287 2.746 0.006844 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.2023
## F-statistic: 12.83 on 3 and 137 DF, p-value: 1.932e-07
summary(model_extended) # Extended model
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC + mothcoll + fathcoll,
## data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## PC 0.151854 0.058716 2.586 0.010762 *
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
#After adding mothcoll and fathcoll, the estimated coefficient on PC ownership (0.1519) remains statistically significant (p = 0.0108), so PC ownership still has a significant estimated effect on college GPA. mothcoll and fathcoll themselves are individually insignificant (p-values of 0.9504 and 0.4963), suggesting their inclusion adds little to the model.
#(ii) Test for joint significance
linearHypothesis(model_extended, c("mothcoll = 0", "fathcoll = 0"))
##
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
##
## Model 1: restricted model
## Model 2: colGPA ~ hsGPA + ACT + PC + mothcoll + fathcoll
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 137 15.149
## 2 135 15.094 2 0.054685 0.2446 0.7834
#The p-value for the joint hypothesis test (mothcoll = 0 and fathcoll = 0) is 0.7834, which is much greater than the common significance level of 0.05. This indicates that mothcoll and fathcoll are jointly not significant in explaining college GPA.
#(iii) Fit the model with hsGPA^2
model_hsGPA2 <- lm(colGPA ~ hsGPA + I(hsGPA^2) + ACT + PC + mothcoll + fathcoll, data = gpa1)
summary(model_hsGPA2)
##
## Call:
## lm(formula = colGPA ~ hsGPA + I(hsGPA^2) + ACT + PC + mothcoll +
## fathcoll, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78998 -0.24327 -0.00648 0.26179 0.72231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.040328 2.443038 2.063 0.0410 *
## hsGPA -1.802520 1.443552 -1.249 0.2140
## I(hsGPA^2) 0.337341 0.215711 1.564 0.1202
## ACT 0.004786 0.010786 0.444 0.6580
## PC 0.140446 0.058858 2.386 0.0184 *
## mothcoll 0.003091 0.060110 0.051 0.9591
## fathcoll 0.062761 0.062401 1.006 0.3163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared: 0.2361, Adjusted R-squared: 0.2019
## F-statistic: 6.904 on 6 and 134 DF, p-value: 2.088e-06
coef_hsGPA2 <- summary(model_hsGPA2)$coefficients["I(hsGPA^2)", ]
coef_hsGPA2
## Estimate Std. Error t value Pr(>|t|)
## 0.3373405 0.2157105 1.5638577 0.1202096
#The addition of hsGPA² in the model results in a coefficient of 0.3373 with a p-value of 0.1202, which is not statistically significant at the 0.05 level. Given that hsGPA² is not significant, it suggests that this non-linear term is not needed to improve the model. Therefore, there is no strong evidence to support its inclusion.
#####Chapter7-C2
#(i) Estimate the log-linear wage model
library(wooldridge)
data("wage2")
model_wage <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model_wage)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
#The coefficient for black is -0.188350, which implies that, holding other factors constant, blacks earn approximately 18.8% less than nonblacks. This difference is statistically significant, with a p-value of 6.84e-07, indicating that the racial wage gap is unlikely due to random chance.
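# Because the dependent variable is log(wage), the coefficient is only an approximate percentage
# effect; a small check of the exact differential implied by the estimate, 100*(exp(b)-1):
100 * (exp(coef(model_wage)["black"]) - 1) # roughly -17.2%, a bit smaller in magnitude than 18.8%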
#(ii) Add quadratic terms for exper and tenure
model_wage_quad <- lm(log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + married + black + south + urban, data = wage2)
summary(model_wage_quad)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure +
## I(tenure^2) + married + black + south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## I(exper^2) -0.0001138 0.0005319 -0.214 0.830622
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## I(tenure^2) -0.0007964 0.0004710 -1.691 0.091188 .
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
linearHypothesis(model_wage_quad, c("I(exper^2) = 0", "I(tenure^2) = 0"))
##
## Linear hypothesis test:
## I(exper^2) = 0
## I(tenure^2) = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) +
## married + black + south + urban
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
#The joint test of the quadratic terms exper^2 and tenure^2 gives a p-value of 0.226, so they are jointly insignificant even at the 20% level. Thus we fail to reject the null hypothesis that the quadratic terms can be excluded from the model.
#(iii) Test interaction between race and return to education
model_interaction <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
summary(model_interaction)
##
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## black 0.094809 0.255399 0.371 0.710561
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
#The interaction term between education and race (educ:black) has a coefficient of -0.022624, and the associated p-value is 0.262603, which is above the 5% significance level. Therefore, it can be concluded that the return to education does not significantly depend on race in this model.
#(iv) Test wage differentials across marital and race groups
model_married_black <- lm(log(wage) ~ educ + exper + tenure + married * black + south + urban, data = wage2)
summary(model_married_black)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married * black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## married 0.188915 0.042878 4.406 1.18e-05 ***
## black -0.240820 0.096023 -2.508 0.012314 *
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## married:black 0.061354 0.103275 0.594 0.552602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
linearHypothesis(model_married_black, "married:black = 0")
##
## Linear hypothesis test:
## married:black = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + tenure + married * black + south +
## urban
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 926 123.77 1 0.047174 0.3529 0.5526
#The coefficient for the interaction term married:black is 0.061354, indicating a wage differential of about 6.1% between married blacks and married nonblacks. However, the p-value is 0.552602, which is quite high, meaning the wage differential is not statistically significant. Therefore, we cannot conclude that being married and black leads to a significant wage difference compared to married nonblacks.
#####Chapter8-1
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(car)
library(sandwich)
library(zoo)
set.seed(123)
n <- 100
x <- rnorm(n, mean = 50, sd = 10)
u <- rnorm(n, mean = 0, sd = x) # Heteroskedastic error
y <- 5 + 2 * x + u
model <- lm(y ~ x)
summary(model)
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.876 -35.522 -4.845 29.767 130.827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.1847 27.6782 0.910 0.36510
## x 1.4874 0.5353 2.779 0.00654 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.62 on 98 degrees of freedom
## Multiple R-squared: 0.07304, Adjusted R-squared: 0.06358
## F-statistic: 7.722 on 1 and 98 DF, p-value: 0.006541
bp_test <- bptest(model)
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1.2235, df = 1, p-value = 0.2687
white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
print(white_test)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1.243, df = 2, p-value = 0.5371
robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.18470 27.41571 0.9186 0.360548
## x 1.48743 0.54484 2.7300 0.007509 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- Regular OLS Summary ---\n")
##
## --- Regular OLS Summary ---
print(summary(model))
##
## Call:
## lm(formula = y ~ x)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.876 -35.522 -4.845 29.767 130.827
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.1847 27.6782 0.910 0.36510
## x 1.4874 0.5353 2.779 0.00654 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 48.62 on 98 degrees of freedom
## Multiple R-squared: 0.07304, Adjusted R-squared: 0.06358
## F-statistic: 7.722 on 1 and 98 DF, p-value: 0.006541
cat("\n--- Robust Standard Errors ---\n")
##
## --- Robust Standard Errors ---
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.18470 27.41571 0.9186 0.360548
## x 1.48743 0.54484 2.7300 0.007509 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consequences of heteroskedasticity: OLS remains unbiased and consistent, but it is no longer BLUE, the usual standard errors are invalid, and the usual t and F statistics no longer have their exact t and F distributions. In this particular simulation the BP and White tests fail to detect the heteroskedasticity that was built into the data, and the robust standard errors differ only modestly from the usual ones.
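# A minimal sketch of the efficient fix for this simulated design: the errors were generated with
# sd(u) = x, so Var(u|x) = x^2 is known and weighted least squares with weights 1/x^2 is the GLS
# estimator here (in real data the variance function would have to be estimated instead).
wls_model <- lm(y ~ x, weights = 1 / x^2)
summary(wls_model)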
#####Chapter8-5
# Input coefficients and data for the analysis
coefficients <- list(
intercept = 0.656,
log_cigpric = -0.069,
log_income = 0.012,
educ = -0.029,
age = 0.020,
age_sq = -0.00026,
restaurn = -0.101,
white = -0.026
)
# Standard errors for reference
standard_errors <- list(
usual = c(0.204, 0.026, 0.006, 0.006, 0.00006, 0.039, 0.052),
robust = c(0.207, 0.026, 0.006, 0.005, 0.00006, 0.038, 0.050)
)
#(i) Compare standard errors
cat("Comparison of Standard Errors:\n")
## Comparison of Standard Errors:
usual <- c(0.204, 0.026, 0.006, 0.006, 0.00006, 0.039, 0.052)
robust <- c(0.207, 0.026, 0.006, 0.005, 0.00006, 0.038, 0.050)
comparison <- data.frame(
Variable = c("log(cigpric)", "log(income)", "educ", "age", "age^2", "restaurn", "white"),
Usual_SE = usual,
Robust_SE = robust
)
print(comparison)
## Variable Usual_SE Robust_SE
## 1 log(cigpric) 0.20400 0.20700
## 2 log(income) 0.02600 0.02600
## 3 educ 0.00600 0.00600
## 4 age 0.00600 0.00500
## 5 age^2 0.00006 0.00006
## 6 restaurn 0.03900 0.03800
## 7 white 0.05200 0.05000
#(ii) Change in probability for 4 years of education
beta_educ <- coefficients$educ
delta_prob <- 4 * beta_educ
cat("Change in probability for 4 additional years of education:", delta_prob, "\n")
## Change in probability for 4 additional years of education: -0.116
#(iii) Calculate turning point for age
beta_age <- coefficients$age
beta_age_sq <- coefficients$age_sq
turning_point <- -beta_age / (2 * beta_age_sq)
cat("Turning point where age reduces probability:", turning_point, "\n")
## Turning point where age reduces probability: 38.46154
#(iv) Print coefficient interpretation
beta_restaurn <- coefficients$restaurn
cat("Effect of restaurant smoking restrictions:", beta_restaurn, "\n")
## Effect of restaurant smoking restrictions: -0.101
#(v) Define characteristics of person 206
log_cigpric <- log(67.44)
log_income <- log(6500)
educ <- 16
age <- 77
restaurn <- 0
white <- 0
# Predicted probability
predicted_prob <- coefficients$intercept +
coefficients$log_cigpric * log_cigpric +
coefficients$log_income * log_income +
coefficients$educ * educ +
coefficients$age * age +
coefficients$age_sq * (age^2) +
coefficients$restaurn * restaurn +
coefficients$white * white
cat("Predicted probability of smoking for person 206:", predicted_prob, "\n")
## Predicted probability of smoking for person 206: 0.005239246
#####Chapter8-C4
#(i)Regress residuals on independent variables
library(wooldridge)
data("vote1")
ols_model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
residuals <- resid(ols_model)
residuals_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(residuals_model)$r.squared
## [1] 1.198131e-32
#The R^2 from regressing the OLS residuals on the independent variables is essentially zero (1.2e-32, i.e., zero up to rounding error). This is expected: by construction, OLS residuals are orthogonal to (uncorrelated in sample with) every regressor, so this regression has nothing to explain.
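# A quick numerical check of that orthogonality (an illustrative sketch): X'u should be a vector of
# zeros up to machine precision, where X is the design matrix and u the residual vector.
crossprod(model.matrix(ols_model), resid(ols_model))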
#(ii) Breusch-Pagan Test for Heteroskedasticity
squared_residuals <- residuals^2
bp_model <- lm(squared_residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
bp_fstat <- summary(bp_model)$fstatistic
bp_pvalue <- pf(bp_fstat[1], bp_fstat[2], bp_fstat[3], lower.tail = FALSE)
cat("Breusch-Pagan F-statistic:", bp_fstat[1], "\n")
## Breusch-Pagan F-statistic: 2.330113
cat("p-value:", bp_pvalue, "\n")
## p-value: 0.05805751
#The p-value of 0.0581 is slightly greater than the typical threshold of 0.05. This suggests marginal evidence for heteroskedasticity, but the result is not strong enough to reject the null hypothesis of homoskedasticity at the 5% significance level. The evidence for heteroskedasticity is weak.
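# As a cross-check (a sketch assuming the lmtest package is available), the LM form of the
# Breusch-Pagan test can be obtained directly; its chi-square p-value will differ slightly from the
# F version computed by hand above.
library(lmtest)
bptest(ols_model)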
#(iii) Special Case of White's Test for Heteroskedasticity
vote1$prtystrA_sq <- vote1$prtystrA^2
vote1$democA_sq <- vote1$democA^2
vote1$log_expendA_sq <- log(vote1$expendA)^2
vote1$log_expendB_sq <- log(vote1$expendB)^2
vote1$interaction <- log(vote1$expendA) * log(vote1$expendB)
white_model <- lm(squared_residuals ~ prtystrA + democA + log(expendA) + log(expendB) +
prtystrA_sq + democA_sq + log_expendA_sq + log_expendB_sq + interaction, data = vote1)
white_fstat <- summary(white_model)$fstatistic
white_pvalue <- pf(white_fstat[1], white_fstat[2], white_fstat[3], lower.tail = FALSE)
cat("White's Test F-statistic:", white_fstat[1], "\n")
## White's Test F-statistic: 2.745312
cat("p-value:", white_pvalue, "\n")
## p-value: 0.00719407
#The p-value of 0.0072 is less than 0.05, so this version of the test (adding squares of the regressors and the expenditure interaction) finds strong evidence of heteroskedasticity. Note that the textbook's "special case" of the White test instead regresses the squared residuals on the fitted values and their square; a sketch of that version follows.
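# Sketch of the fitted-values special case of White's test: regress the squared residuals on the
# OLS fitted values and their square, then judge heteroskedasticity from the overall F statistic.
fitted_vals <- fitted(ols_model)
white_special <- lm(squared_residuals ~ fitted_vals + I(fitted_vals^2))
summary(white_special)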
#####Chapter8-C13
#(i) Estimate the Model
library(wooldridge)
data("fertil2")
model_1 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
summary(model_1)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
library(sandwich)
robust_se <- sqrt(diag(vcovHC(model_1)))
summary(model_1)$coefficients[, 2]
## (Intercept) age I(age^2) educ electric urban
## 0.2401887611 0.0165082295 0.0002717715 0.0062966083 0.0690044704 0.0465062118
robust_se
## (Intercept) age I(age^2) educ electric urban
## 0.2443961935 0.0192199445 0.0003513959 0.0063159137 0.0640737262 0.0455162364
#No, in this case, robust standard errors are not always bigger. They are slightly larger for some variables (e.g., age), but smaller for others (e.g., electric).
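# Note that vcovHC() defaults to the HC3 correction; HC1 is the robust standard error most texts
# report. A sketch of the full robust coefficient table (assuming lmtest is available):
library(lmtest)
coeftest(model_1, vcov = vcovHC(model_1, type = "HC1"))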
#(ii)Add Religious Dummy Variables and Test Joint Significance
model_2 <- lm(children ~ age + I(age^2) + educ + electric + urban + spirit + protest + catholic, data = fertil2)
summary(model_2)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban +
## spirit + protest + catholic, data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9514 -0.7041 -0.0101 0.7213 7.4435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.3147175 0.2433391 -17.731 < 2e-16 ***
## age 0.3418668 0.0165186 20.696 < 2e-16 ***
## I(age^2) -0.0027596 0.0002722 -10.139 < 2e-16 ***
## educ -0.0762130 0.0064608 -11.796 < 2e-16 ***
## electric -0.3057189 0.0690241 -4.429 9.69e-06 ***
## urban -0.2034131 0.0465864 -4.366 1.29e-05 ***
## spirit 0.1404603 0.0558052 2.517 0.0119 *
## protest 0.0753961 0.0652158 1.156 0.2477
## catholic 0.1173539 0.0834249 1.407 0.1596
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.451 on 4349 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.574, Adjusted R-squared: 0.5733
## F-statistic: 732.6 on 8 and 4349 DF, p-value: < 2.2e-16
robust_se_model_2 <- sqrt(diag(vcovHC(model_2)))
joint_test <- linearHypothesis(model_2, c("spirit = 0", "protest = 0", "catholic = 0"))
robust_joint_test <- linearHypothesis(model_2, c("spirit = 0", "protest = 0", "catholic = 0"), vcov = vcovHC(model_2))
cat("Nonrobust p-value:", joint_test$`Pr(>F)`[2], "\n")
## Nonrobust p-value: 0.08640607
cat("Robust p-value:", robust_joint_test$`Pr(>F)`[2], "\n")
## Robust p-value: 0.09170157
#Nonrobust p-value (0.0864) and robust p-value (0.0917): the religious dummies (spirit, protest, catholic) are not jointly significant at the 5% level, though they would be at the 10% level; the robust test leads to essentially the same conclusion.
#(iii)Obtain Fitted Values and Residuals, Regress Residuals on Fitted Values
fitted_values <- fitted(model_2)
residuals_values <- residuals(model_2)
model_3 <- lm(residuals_values^2 ~ fitted_values + I(fitted_values^2)) # I() is required: without it, ^2 acts as a formula operator and the squared term silently drops out
joint_test_residuals <- summary(model_3)$fstatistic
joint_pvalue_residuals <- pf(joint_test_residuals[1], joint_test_residuals[2], joint_test_residuals[3], lower.tail = FALSE)
cat("Joint significance test p-value:", joint_pvalue_residuals, "\n")
## Joint significance test p-value: 3.906415e-235
#The regression of the squared residuals on the fitted values (and their square) is overwhelmingly significant (p-value essentially zero), which is strong evidence of heteroskedasticity in the fertility equation.
#(iv) Heteroskedasticity does not bias the OLS coefficient estimates, but it invalidates the usual standard errors and makes OLS inefficient. Since the robust and usual standard errors are quite close here, the practical consequences for inference appear minor.
#####Chapter9-1
model_1 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)
model_2 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten + ceotensq + comtensq, data = ceosal2)
anova(model_1, model_2)
## Analysis of Variance Table
##
## Model 1: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten +
## comten
## Model 2: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten +
## comten + ceotensq + comtensq
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 171 41.856
## 2 169 40.438 2 1.4182 2.9634 0.05433 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The p-value of 0.05433 suggests that adding ceotensq and comtensq does not significantly improve the model at the 5% level. Therefore, there is no strong evidence of functional form misspecification, and the original model appears well-specified.
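# An alternative functional-form check (a sketch, assuming the lmtest package is available) is
# Ramsey's RESET test, which adds powers of the fitted values to the original salary equation:
library(lmtest)
resettest(model_1, power = 2:3, type = "fitted")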
#####Chapter9-5
#The failure of colleges to report campus crimes in 1992 can likely be viewed as endogenous sample selection rather than exogenous. If colleges with higher crime rates are less likely to report, this creates a bias because the decision to report is correlated with the crime rate. Therefore, non-reporting is not random, and the sample is not exogenously selected. This would lead to biased estimates of the relationship between student enrollment and campus crimes.
#####Chapter9-C4
library(wooldridge)
data("infmrt")
str(infmrt)
## 'data.frame': 102 obs. of 12 variables:
## $ year : int 1987 1990 1987 1990 1987 1990 1987 1990 1987 1990 ...
## $ infmort: num 8.3 6.2 7.8 7.1 8.5 ...
## $ afdcprt: int 52 62 11 21 20 25 234 282 42 52 ...
## $ popul : int 1186 1228 1056 1109 547 563 5856 6016 986 1003 ...
## $ pcinc : int 13996 17125 18083 21051 14267 17630 19131 22558 15683 18771 ...
## $ physic : int 173 178 186 200 244 253 322 337 244 254 ...
## $ afdcper: num 4.38 5.05 1.04 1.89 3.66 ...
## $ d90 : int 0 1 0 1 0 1 0 1 0 1 ...
## $ lpcinc : num 9.55 9.75 9.8 9.95 9.57 ...
## $ lphysic: num 5.15 5.18 5.23 5.3 5.5 ...
## $ DC : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lpopul : num 7.08 7.11 6.96 7.01 6.3 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
infmrt$dc <- ifelse(row.names(infmrt) == "51", 1, 0)
# (i) Estimate the model including the DC dummy variable
model_with_dc <- lm(log(infmort) ~ pcinc + physic + dc, data = infmrt)
summary(model_with_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic + dc, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43481 -0.10338 0.00447 0.09096 0.39050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.456e+00 8.688e-02 28.264 < 2e-16 ***
## pcinc -3.078e-05 6.499e-06 -4.737 7.33e-06 ***
## physic 1.495e-03 2.729e-04 5.478 3.33e-07 ***
## dc -7.940e-02 1.664e-01 -0.477 0.634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1629 on 98 degrees of freedom
## Multiple R-squared: 0.2537, Adjusted R-squared: 0.2309
## F-statistic: 11.1 on 3 and 98 DF, p-value: 2.444e-06
#Estimate the model without the DC dummy variable
model_without_dc <- lm(log(infmort) ~ pcinc + physic, data = infmrt)
summary(model_without_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43432 -0.10235 0.00626 0.09337 0.39064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.448e+00 8.502e-02 28.794 < 2e-16 ***
## pcinc -3.026e-05 6.378e-06 -4.743 7.07e-06 ***
## physic 1.487e-03 2.713e-04 5.480 3.24e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1623 on 99 degrees of freedom
## Multiple R-squared: 0.252, Adjusted R-squared: 0.2369
## F-statistic: 16.67 on 2 and 99 DF, p-value: 5.744e-07
#The coefficient for dc is -0.0794, indicating a slight decrease in the log of the infant mortality rate for Washington, DC. However, with a p-value of 0.634, it is not statistically significant, meaning we can't conclude DC significantly affects infant mortality.
# (ii) Compare the models using ANOVA
anova(model_without_dc, model_with_dc)
## Analysis of Variance Table
##
## Model 1: log(infmort) ~ pcinc + physic
## Model 2: log(infmort) ~ pcinc + physic + dc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 99 2.6064
## 2 98 2.6004 1 0.0060446 0.2278 0.6342
#The estimates for pcinc and physic are nearly the same between the two models. Including the dc dummy slightly increases the standard errors but does not significantly affect the other coefficients. The dc variable's insignificance suggests adding a dummy for a single observation doesn't improve the model.
#####Chapter9-C5
library(quantreg)
## Loading required package: SparseM
library(wooldridge)
data("rdchem")
#(i) Estimate OLS
ols_all <- lm(rdintens ~ lsales + profmarg, data = rdchem) # lsales = log(sales); logging sales limits the leverage of the very largest firms
summary(ols_all)
##
## Call:
## lm(formula = rdintens ~ lsales + profmarg, data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3016 -1.2707 -0.6895 0.8785 6.0369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47225 1.67606 0.282 0.780
## lsales 0.32135 0.21557 1.491 0.147
## profmarg 0.05004 0.04578 1.093 0.283
##
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared: 0.09847, Adjusted R-squared: 0.0363
## F-statistic: 1.584 on 2 and 29 DF, p-value: 0.2224
rdchem_no_large_firm <- subset(rdchem, sales < 40000) # Intended to drop the firm with sales of almost $40 billion; sales are in millions and the largest value is just under 40,000, so this filter in fact keeps all 32 firms
ols_no_large_firm <- lm(rdintens ~ lsales + profmarg, data = rdchem_no_large_firm) # Log sales
summary(ols_no_large_firm)
##
## Call:
## lm(formula = rdintens ~ lsales + profmarg, data = rdchem_no_large_firm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3016 -1.2707 -0.6895 0.8785 6.0369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47225 1.67606 0.282 0.780
## lsales 0.32135 0.21557 1.491 0.147
## profmarg 0.05004 0.04578 1.093 0.283
##
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared: 0.09847, Adjusted R-squared: 0.0363
## F-statistic: 1.584 on 2 and 29 DF, p-value: 0.2224
#The OLS estimates using all firms show no significant relationship between R&D intensity and the predictors: the coefficients on lsales and profmarg both have p-values above 0.1. The "no large firm" results are identical to the full-sample results because the sales < 40000 filter did not actually exclude any firm; see the sketch at the end of this section for a subset that genuinely drops the largest firm.
#(ii) Estimate LAD
lad_all <- rq(rdintens ~ lsales + profmarg, data = rdchem, tau = 0.5) # Use log(sales) for LAD
summary(lad_all)
##
## Call: rq(formula = rdintens ~ lsales + profmarg, tau = 0.5, data = rdchem)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 2.62121 -3.59242 3.64851
## lsales -0.05873 -0.15063 0.73912
## profmarg 0.08641 -0.00177 0.15854
lad_no_large_firm <- rq(rdintens ~ lsales + profmarg, data = rdchem_no_large_firm, tau = 0.5)
summary(lad_no_large_firm)
##
## Call: rq(formula = rdintens ~ lsales + profmarg, tau = 0.5, data = rdchem_no_large_firm)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 2.62121 -3.59242 3.64851
## lsales -0.05873 -0.15063 0.73912
## profmarg 0.08641 -0.00177 0.15854
#The LAD (median regression) estimates likewise show no significant relationship between R&D intensity and the predictors, and the "no large firm" LAD fit is identical to the full-sample fit for the same reason as above: the subset kept every firm.
#(iii) As estimated here, the OLS/LAD comparison is uninformative about resilience to outliers, because the influential firm was never actually removed; re-estimating both models with the largest firm dropped (as in the sketch below) is needed for that comparison. As a general matter, LAD is more resistant to outliers than OLS because it minimizes absolute rather than squared deviations.
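# Sketch of the intended robustness check: drop the largest firm explicitly and re-fit both
# estimators (the object names below are illustrative; their output is not shown here).
rdchem_drop_max <- subset(rdchem, sales < max(sales))
ols_drop_max <- lm(rdintens ~ lsales + profmarg, data = rdchem_drop_max)
lad_drop_max <- rq(rdintens ~ lsales + profmarg, data = rdchem_drop_max, tau = 0.5)
summary(ols_drop_max)
summary(lad_drop_max)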
#####Chapter10-1
#(i) Disagree. Time series data often exhibit autocorrelation, unlike cross-sectional data, where observations are typically independent.
#(ii) Agree. Unbiasedness of OLS in time series requires only the first three assumptions (linearity in parameters, no perfect collinearity, and strict exogeneity of the regressors); homoskedasticity and the absence of serial correlation are not needed for unbiasedness.
#(iii) Disagree. A trending variable can be used as the dependent variable provided the trend is accounted for, for example by including a time trend as a regressor or by detrending or differencing the series.
#(iv) Agree. Seasonality is usually not an issue in annual time series, as it typically refers to periodic fluctuations within a year.
#####Chapter10-5
# Assuming I have a data frame `data` with the variables: housing_starts, interest_rate, income, and a time variable
# Create quarterly dummy variables
#data$Q1 <- ifelse(format(data$Date, "%m") %in% c("01", "02", "03"), 1, 0)
#data$Q2 <- ifelse(format(data$Date, "%m") %in% c("04", "05", "06"), 1, 0)
#data$Q3 <- ifelse(format(data$Date, "%m") %in% c("07", "08", "09"), 1, 0)
# Create a time trend variable (e.g., starting from 1 for the first quarter)
#data$Trend <- seq_along(data$Date)
# Estimate the model
#model <- lm(housing_starts ~ interest_rate + income + Trend + Q1 + Q2 + Q3, data = data)
# View the summary of the model
#summary(model)
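# Since no housing-starts dataset is loaded here, the following self-contained sketch estimates the
# same specification on simulated quarterly data (all variable names and parameter values are
# hypothetical, chosen only to make the example run):
set.seed(42)
n_q <- 80                                      # 20 years of quarterly observations
trend <- seq_len(n_q)                          # linear time trend
quarter <- factor(rep(1:4, times = n_q / 4))   # quarter indicator; one quarter is the base category
interest_rate <- 5 + rnorm(n_q)
income <- 100 + 0.5 * trend + rnorm(n_q)
housing_starts <- 50 - 2 * interest_rate + 0.3 * income + 3 * (quarter == "2") + rnorm(n_q)
sim_data <- data.frame(housing_starts, interest_rate, income, trend, quarter)
model_sim <- lm(housing_starts ~ interest_rate + income + trend + quarter, data = sim_data)
summary(model_sim)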
#####Chapter10-C1
library(wooldridge)
data("intdef")
intdef$after_1979 <- ifelse(intdef$year > 1979, 1, 0)
model <- lm(i3 ~ inf + rec + out + after_1979, data = intdef)
summary(model)
##
## Call:
## lm(formula = i3 ~ inf + rec + out + after_1979, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2076 -1.2159 0.1902 0.8597 3.5701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -12.0348 3.5889 -3.353 0.00151 **
## inf 0.5324 0.0712 7.478 9.59e-10 ***
## rec 0.2528 0.1968 1.285 0.20473
## out 0.5172 0.1153 4.487 4.14e-05 ***
## after_1979 0.5748 0.5236 1.098 0.27747
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.531 on 51 degrees of freedom
## Multiple R-squared: 0.7358, Adjusted R-squared: 0.7151
## F-statistic: 35.51 on 4 and 51 DF, p-value: 3.586e-14
#The dummy for years after 1979 (after_1979) has a coefficient of 0.57 with a p-value of 0.277, so it is not statistically significant. Based on this specification, there is no evidence that the October 1979 change in Federal Reserve operating procedure shifted the level of the 3-month T-bill rate once inflation, receipts, and outlays are controlled for. A fuller check would also allow the slopes to change; see the sketch below.
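# Sketch of a Chow-type test that lets every slope shift after 1979, not just the intercept
# (uses car::linearHypothesis, already loaded above; output not shown):
model_break <- lm(i3 ~ (inf + rec + out) * after_1979, data = intdef)
linearHypothesis(model_break, c("after_1979 = 0", "inf:after_1979 = 0",
                                "rec:after_1979 = 0", "out:after_1979 = 0"))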
#####Chapter10-C9
#(i) β₁ (pcip): likely positive, as higher industrial production growth typically accompanies higher stock market returns. β₂ (i3): likely negative, since higher T-bill returns make equities relatively less attractive.
library(wooldridge)
data(volat)
model_c9 <- lm(rsp500 ~ pcip + i3, data = volat)
summary(model_c9)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
#(ii) The intercept, 18.84, is the predicted S&P 500 return when pcip and i3 are both zero. pcip (β₁): the coefficient (0.036) is very small and insignificant (p = 0.7785), so industrial production growth has little estimated influence on rsp500 in this model. i3 (β₂): the coefficient (-1.36) is statistically significant (p = 0.0121), implying that a one-point increase in the T-bill return is associated with a decrease of about 1.36 in rsp500, holding pcip fixed.
#(iii) pcip: Not statistically significant (p = 0.7785). Changes in industrial production do not significantly impact S&P 500 returns in this model. i3: Statistically significant (p = 0.0121). Changes in T-bill returns have a meaningful impact on S&P 500 returns.
#(iv) Given that only i3 is statistically significant, it suggests that T-bill returns have some predictability for the S&P 500 returns. However, the model has a very low R-squared value (0.01189), meaning that only about 1.2% of the variation in S&P 500 returns is explained by the model. This indicates that while there is some statistical significance, the model is not very effective in predicting returns based on these two variables alone.
#####Chapter11-C7
library(wooldridge)
library(dynlm)
library(car)
data("consump")
summary(consump)
## year i3 inf rdisp
## Min. :1959 Min. : 2.380 Min. : 0.700 Min. :1530
## 1st Qu.:1968 1st Qu.: 4.070 1st Qu.: 2.800 1st Qu.:2298
## Median :1977 Median : 5.510 Median : 4.100 Median :3105
## Mean :1977 Mean : 6.062 Mean : 4.638 Mean :3154
## 3rd Qu.:1986 3rd Qu.: 7.480 3rd Qu.: 5.800 3rd Qu.:4087
## Max. :1995 Max. :14.030 Max. :13.500 Max. :4946
##
## rnondc rserv pop y
## Min. : 606.3 Min. : 687.4 Min. :177830 Min. : 8604
## 1st Qu.: 816.9 1st Qu.:1059.6 1st Qu.:200706 1st Qu.:11451
## Median :1010.4 Median :1518.2 Median :220239 Median :14099
## Mean :1003.9 Mean :1557.0 Mean :220749 Mean :13940
## 3rd Qu.:1215.9 3rd Qu.:2041.4 3rd Qu.:240651 3rd Qu.:16983
## Max. :1421.9 Max. :2577.0 Max. :263034 Max. :18803
##
## rcons c r3 lc
## Min. :1294 Min. : 7275 Min. :-3.260 Min. :8.892
## 1st Qu.:1876 1st Qu.: 9349 1st Qu.: 0.450 1st Qu.:9.143
## Median :2529 Median :11481 Median : 1.380 Median :9.348
## Mean :2561 Mean :11329 Mean : 1.424 Mean :9.310
## 3rd Qu.:3257 3rd Qu.:13535 3rd Qu.: 2.590 3rd Qu.:9.513
## Max. :3999 Max. :15203 Max. : 5.430 Max. :9.629
##
## ly gc gy gc_1
## Min. :9.060 Min. :-0.009107 Min. :-0.016973 Min. :-0.009107
## 1st Qu.:9.346 1st Qu.: 0.012964 1st Qu.: 0.009617 1st Qu.: 0.013130
## Median :9.554 Median : 0.021865 Median : 0.021580 Median : 0.022531
## Mean :9.515 Mean : 0.020474 Mean : 0.021715 Mean : 0.020689
## 3rd Qu.:9.740 3rd Qu.: 0.029746 3rd Qu.: 0.031721 3rd Qu.: 0.029868
## Max. :9.842 Max. : 0.040209 Max. : 0.061985 Max. : 0.040209
## NA's :1 NA's :1 NA's :2
## gy_1 r3_1 lc_ly lc_ly_1
## Min. :-0.016973 Min. :-3.2600 Min. :-0.2377 Min. :-0.2377
## 1st Qu.: 0.009032 1st Qu.: 0.3425 1st Qu.:-0.2138 1st Qu.:-0.2139
## Median : 0.021446 Median : 1.3050 Median :-0.2089 Median :-0.2087
## Mean : 0.021610 Mean : 1.3881 Mean :-0.2048 Mean :-0.2046
## 3rd Qu.: 0.031736 3rd Qu.: 2.4100 3rd Qu.:-0.2000 3rd Qu.:-0.1993
## Max. : 0.061985 Max. : 5.4300 Max. :-0.1609 Max. :-0.1609
## NA's :2 NA's :1 NA's :1
## gc_2 gy_2 r3_2 lc_ly_2
## Min. :-0.009107 Min. :-0.016973 Min. :-3.260 Min. :-0.2377
## 1st Qu.: 0.013053 1st Qu.: 0.008448 1st Qu.: 0.235 1st Qu.:-0.2140
## Median : 0.022565 Median : 0.021580 Median : 1.230 Median :-0.2089
## Mean : 0.020850 Mean : 0.021838 Mean : 1.379 Mean :-0.2047
## 3rd Qu.: 0.029990 3rd Qu.: 0.031751 3rd Qu.: 2.470 3rd Qu.:-0.1993
## Max. : 0.040209 Max. : 0.061985 Max. : 5.430 Max. :-0.1609
## NA's :3 NA's :3 NA's :2 NA's :2
cor(consump[, c("gc", "gy", "i3", "inf")], use = "complete.obs")
## gc gy i3 inf
## gc 1.0000000 0.8238203 -0.3271522 -0.4270330
## gy 0.8238203 1.0000000 -0.1993171 -0.3735170
## i3 -0.3271522 -0.1993171 1.0000000 0.7491123
## inf -0.4270330 -0.3735170 0.7491123 1.0000000
apply(consump, 2, function(x) length(unique(x)))
## year i3 inf rdisp rnondc rserv pop y rcons c
## 37 37 30 37 37 37 37 37 37 37
## r3 lc ly gc gy gc_1 gy_1 r3_1 lc_ly lc_ly_1
## 36 37 37 37 37 36 36 36 37 37
## gc_2 gy_2 r3_2 lc_ly_2
## 35 35 35 36
nrow(consump)
## [1] 37
nrow(na.omit(consump))
## [1] 34
consump$gc <- scale(consump$gc)
consump$gy <- scale(consump$gy)
consump$i3 <- scale(consump$i3)
consump$inf <- scale(consump$inf)
# (i) Test the PIH by estimating the model
model1 <- dynlm(gc ~ L(gc, 1), data = consump)
summary(model1)
## Warning in summary.lm(model1): essentially perfect fit: summary may be
## unreliable
##
## Time series regression with "zoo" data:
## Start = 2, End = 37
##
## Call:
## dynlm(formula = gc ~ L(gc, 1), data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.688e-16 -6.850e-17 -3.871e-17 5.660e-18 1.315e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.643e-32 3.975e-17 0.00e+00 1
## L(gc, 1) 1.000e+00 4.032e-17 2.48e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.385e-16 on 34 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 6.152e+32 on 1 and 34 DF, p-value: < 2.2e-16
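# A sketch of the likely fix for the warning above: dynlm is designed for ts/zoo data, and with a
# plain data frame L(gc, 1) does not actually shift the series here (hence the perfect fit).
# Reloading the raw data under a new name and converting it to an annual ts (1959-1995) lets L()
# create genuine lags; output is not shown here.
consump_raw <- wooldridge::consump
consump_ts <- ts(consump_raw, start = 1959)
model1_ts <- dynlm(gc ~ L(gc, 1), data = consump_ts)
summary(model1_ts)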
# Note the "essentially perfect fit" warning: because consump is a plain data frame rather than a ts/zoo object, L(gc, 1) did not create a true lag, so gc was effectively regressed on itself (coefficient of exactly 1, R-squared of 1). This output therefore cannot be used to test the PIH; the model should be re-estimated on the time-series version of the data, as in the sketch above.
# (ii) Add new variables and test their significance
model2 <- dynlm(gc ~ L(gc, 1) + L(gy, 1) + L(i3, 1) + L(inf, 1), data = consump)
summary(model2)
## Warning in summary.lm(model2): essentially perfect fit: summary may be
## unreliable
##
## Time series regression with "zoo" data:
## Start = 2, End = 37
##
## Call:
## dynlm(formula = gc ~ L(gc, 1) + L(gy, 1) + L(i3, 1) + L(inf,
## 1), data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.464e-16 -9.577e-17 -2.519e-17 2.775e-17 1.198e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.501e-18 3.983e-17 6.300e-02 0.950
## L(gc, 1) 1.000e+00 7.449e-17 1.343e+16 <2e-16 ***
## L(gy, 1) 4.284e-17 7.323e-17 5.850e-01 0.563
## L(i3, 1) -2.030e-17 6.262e-17 -3.240e-01 0.748
## L(inf, 1) -5.549e-17 6.553e-17 -8.470e-01 0.404
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.388e-16 on 31 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.535e+32 on 4 and 31 DF, p-value: < 2.2e-16
vif(model2)
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
## L(gc, 1) L(gy, 1) L(i3, 1) L(inf, 1)
## 3.405900 3.292336 2.406735 2.587971
anova_result <- anova(model1, model2)
cat("F-statistic for joint significance:", anova_result$F[2], "\n")
## F-statistic for joint significance: 0.9759717
cat("P-value for joint significance:", anova_result$`Pr(>F)`[2], "\n")
## P-value for joint significance: 0.4166377
#None of the added lagged variables is individually significant, and the joint test does not reject their exclusion. The same degenerate-lag problem noted above applies to this model as well, however, so these estimates should be treated with caution until the data are converted to a time series object.
# (iii) Compare p-values for gc(t-1) in both models
p_value_model1 <- summary(model1)$coefficients[2, 4]
## Warning in summary.lm(model1): essentially perfect fit: summary may be
## unreliable
p_value_model2 <- summary(model2)$coefficients[2, 4]
## Warning in summary.lm(model2): essentially perfect fit: summary may be
## unreliable
cat("P-value for gc_{t-1} in Model 1:", p_value_model1, "\n")
## P-value for gc_{t-1} in Model 1: 0
cat("P-value for gc_{t-1} in Model 2:", p_value_model2, "\n")
## P-value for gc_{t-1} in Model 2: 0
#The reported p-value of 0 for gc_{t-1} in both models is an artifact of the degenerate fit (gc regressed on itself), not evidence about the PIH; the significance of lagged consumption growth should be judged from the re-estimated time-series version of the model.
# (iv)Reduced model with fewer predictors
model2_reduced <- dynlm(gc ~ L(gc, 1) + L(gy, 1), data = consump)
summary(model2_reduced)
## Warning in summary.lm(model2_reduced): essentially perfect fit: summary may be
## unreliable
##
## Time series regression with "zoo" data:
## Start = 2, End = 37
##
## Call:
## dynlm(formula = gc ~ L(gc, 1) + L(gy, 1), data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.527e-16 -6.157e-17 -4.485e-17 -6.420e-18 1.310e-15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.643e-32 4.014e-17 0.000e+00 1.000
## L(gc, 1) 1.000e+00 7.182e-17 1.392e+16 <2e-16 ***
## L(gy, 1) 4.212e-17 7.182e-17 5.860e-01 0.562
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.409e-16 on 33 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 3.016e+32 on 2 and 33 DF, p-value: < 2.2e-16
anova_result_reduced <- anova(model1, model2_reduced)
cat("F-statistic for joint significance in reduced model:", anova_result_reduced$F[2], "\n")
## F-statistic for joint significance in reduced model: 0.3439165
cat("P-value for joint significance in reduced model:", anova_result_reduced$`Pr(>F)`[2], "\n")
## P-value for joint significance in reduced model: 0.5615676
plot(residuals(model1), main = "Residuals of Model 1")

plot(residuals(model2), main = "Residuals of Model 2")

#In the reduced comparison the added lag of gy is not significant (F ≈ 0.34, p ≈ 0.56). Taken at face value the fits in this section would reject the PIH, but because of the lag problem noted above, a firm conclusion requires re-estimating all of these models on the ts-converted data.
#####Chapter11-C2
library(wooldridge)
data("minwage")
minwage_clean <- minwage[!is.na(minwage$gwage232) & !is.na(minwage$gmwage) & !is.na(minwage$gcpi), ]
# (i) Find the first-order autocorrelation in gwage232
acf(minwage_clean$gwage232, lag.max = 1, main = "ACF for first-order autocorrelation in gwage232")

#The first-order autocorrelation is low, indicating weak dependence in the gwage232 series. It suggests past wage growth doesn’t strongly affect current wage growth.
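# The numeric value of the first-order autocorrelation can also be read off directly rather than
# from the plot (a small sketch; element [2] is the lag-1 autocorrelation, element [1] is lag 0):
acf(minwage_clean$gwage232, lag.max = 1, plot = FALSE)$acf[2]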
# (ii) Estimate the dynamic model and analyze
minwage_clean$lag_gwage232 <- c(NA, minwage_clean$gwage232[-length(minwage_clean$gwage232)])
model_ols <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi, data = minwage_clean)
summary(model_ols)
##
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi, data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044642 -0.004134 -0.001312 0.004482 0.041612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0024003 0.0004308 5.572 3.79e-08 ***
## lag_gwage232 -0.0779092 0.0342851 -2.272 0.02341 *
## gmwage 0.1518459 0.0096485 15.738 < 2e-16 ***
## gcpi 0.2630876 0.0824457 3.191 0.00149 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007889 on 606 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2986, Adjusted R-squared: 0.2951
## F-statistic: 85.99 on 3 and 606 DF, p-value: < 2.2e-16
#The regression shows that an increase in the federal minimum wage (gmwage) leads to a contemporaneous increase in wage growth (gwage232), holding past wage growth and CPI constant. The effect is statistically significant.
# (iii) Add the lagged growth in employment (gemp232_t-1) to the model
minwage_clean$lag_gemp232 <- c(NA, minwage_clean$gemp232[-length(minwage_clean$gemp232)])
model_ols_with_employment <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi + lag_gemp232, data = minwage_clean)
summary(model_ols_with_employment)
##
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi + lag_gemp232,
## data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043842 -0.004378 -0.001034 0.004321 0.042548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.002451 0.000426 5.753 1.4e-08 ***
## lag_gwage232 -0.074546 0.033901 -2.199 0.028262 *
## gmwage 0.152707 0.009540 16.007 < 2e-16 ***
## gcpi 0.252296 0.081544 3.094 0.002066 **
## lag_gemp232 0.066131 0.016962 3.899 0.000108 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007798 on 605 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3158, Adjusted R-squared: 0.3112
## F-statistic: 69.8 on 4 and 605 DF, p-value: < 2.2e-16
#When I added lagged employment growth (gemp232_{t-1}), it was positive and statistically significant, showing that past employment growth positively affects current wage growth.
# (iv) Compare the effect of adding lag_gwage232 and lag_gemp232 on the gmwage coefficient
model_1_coeffs <- summary(model_ols)$coefficients[, 1]
model_2_coeffs <- summary(model_ols_with_employment)$coefficients[, 1]
model_comparison <- data.frame(
Coefficients = c("Intercept", "Lag(gwage232)", "gmwage", "gcpi", "Lag(gemp232)"),
Model_1 = c(model_1_coeffs["(Intercept)"], model_1_coeffs["lag_gwage232"], model_1_coeffs["gmwage"], model_1_coeffs["gcpi"], NA),
Model_2 = c(model_2_coeffs["(Intercept)"], model_2_coeffs["lag_gwage232"], model_2_coeffs["gmwage"], model_2_coeffs["gcpi"], model_2_coeffs["lag_gemp232"])
)
print(model_comparison)
## Coefficients Model_1 Model_2
## (Intercept) Intercept 0.002400259 0.002450634
## lag_gwage232 Lag(gwage232) -0.077909250 -0.074546055
## gmwage gmwage 0.151845901 0.152706968
## gcpi gcpi 0.263087554 0.252295742
## Lag(gemp232) NA 0.066130774
#Adding the lagged variables (gwage232_{t-1} and gemp232_{t-1}) didn’t change the gmwage coefficient much. It remained positive and significant, indicating the lagged variables don’t significantly alter the effect of minimum wage on wage growth.
# (v) Run the regression of gmwage on lag_gwage232 and lag_gemp232,
model_gmwage <- lm(gmwage ~ lag_gwage232 + lag_gemp232, data = minwage_clean)
r_squared <- summary(model_gmwage)$r.squared
print(paste("R-squared for gmwage model: ", round(r_squared, 4)))
## [1] "R-squared for gmwage model: 0.0039"
#The R-squared value from regressing gmwage on lagged gwage232 and gemp232 is very low (0.0039), suggesting these variables do not explain much of the variation in gmwage. This supports the result in part (iv) that adding lagged variables doesn’t affect gmwage much.
#####Chapter12-C11
library(wooldridge)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data("nyse")
nyse_lagged <- nyse %>%
arrange(t) %>%
mutate(
lag_return_1 = lag(return, 1), # Lagged return 1
lag_return_2 = lag(return, 2) # Lagged return 2
) %>%
filter(!is.na(lag_return_1) & !is.na(lag_return_2))
#(i) Estimate the OLS model
model_ols <- lm(return ~ lag_return_1 + lag_return_2, data = nyse_lagged)
nyse_lagged$squared_residuals <- residuals(model_ols)^2
mean_squared_residuals <- mean(nyse_lagged$squared_residuals)
min_squared_residuals <- min(nyse_lagged$squared_residuals)
max_squared_residuals <- max(nyse_lagged$squared_residuals)
cat("Mean Squared Residuals:", mean_squared_residuals, "\n")
## Mean Squared Residuals: 4.439886
cat("Min Squared Residuals:", min_squared_residuals, "\n")
## Min Squared Residuals: 9.293076e-05
cat("Max Squared Residuals:", max_squared_residuals, "\n")
## Max Squared Residuals: 233.9956
#The model was estimated using two lagged returns (lag_return_1 and lag_return_2), and squared residuals (errors from the model) were calculated. The squared residuals had the following statistics: Mean: 4.44; Minimum: 0.000093; Maximum: 233.99. These values indicate the magnitude of deviations between the predicted and actual returns.
#(ii) Estimate the heteroskedasticity model
model_hetero <- lm(squared_residuals ~ lag_return_1 + lag_return_2, data = nyse_lagged)
summary(model_hetero)
##
## Call:
## lm(formula = squared_residuals ~ lag_return_1 + lag_return_2,
## data = nyse_lagged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.867 -3.814 -1.979 0.953 224.439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7447 0.4282 11.080 < 2e-16 ***
## lag_return_1 -1.0596 0.2015 -5.260 1.93e-07 ***
## lag_return_2 -0.5179 0.2013 -2.573 0.0103 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.14 on 685 degrees of freedom
## Multiple R-squared: 0.04995, Adjusted R-squared: 0.04717
## F-statistic: 18.01 on 2 and 685 DF, p-value: 2.39e-08
#The squared residuals from the OLS regression were used as the dependent variable, with lagged returns (lag_return_1 and lag_return_2) as predictors. The estimated coefficients were: Intercept: 4.7447 (representing baseline variance when returns are 0); lag_return_1: -1.0596 (indicating a decrease in variance as past returns increase); lag_return_2: -0.5179 (a weaker effect compared to lag_return_1). The R-squared of the model was 0.04995, explaining about 5% of the variance, although the coefficients were statistically significant.
#(iii) Plot conditional variance (squared residuals) vs lagged return_1
plot(nyse_lagged$lag_return_1, nyse_lagged$squared_residuals,
main = "Conditional Variance vs Lagged Return",
xlab = "Lagged Return", ylab = "Squared Residuals (Conditional Variance)",
pch = 16, col = "blue")

#The plot of squared residuals against lag_return_1 shows that large squared residuals are concentrated at large (especially large negative) lagged returns. According to the fitted variance equation, the predicted conditional variance when both lagged returns are zero equals the intercept, about 4.74; because the slope coefficients are negative, the fitted variance is smaller after high past returns and larger after sharp past declines.
# (iv) Check for Negative Variance Estimates
predicted_variance <- predict(model_hetero)
negative_variances <- sum(predicted_variance < 0)
cat("Number of Negative Variances:", negative_variances, "\n")
## Number of Negative Variances: 12
#When predicting dynamic variance using the heteroskedasticity model, 12 negative variance estimates were found. Negative variance is not possible, suggesting that the model does not fully capture the volatility in the data.
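# One standard way to guarantee positive fitted variances (a sketch, not part of the original
# results): model the log of the squared residuals and exponentiate the predictions, as in
# feasible GLS variance estimation.
log_var_model <- lm(log(squared_residuals) ~ lag_return_1 + lag_return_2, data = nyse_lagged)
predicted_variance_exp <- exp(predict(log_var_model))
min(predicted_variance_exp)   # strictly positive by construction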
#(v) Compare with ARCH(1) Model
library(rugarch)
## Loading required package: parallel
##
## Attaching package: 'rugarch'
## The following object is masked from 'package:stats':
##
## sigma
spec_1 <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 0)),
mean.model = list(armaOrder = c(0, 0)))
arch_model_1 <- ugarchfit(spec_1, nyse_lagged$return)
cat("ARCH(1) Model AIC:", infocriteria(arch_model_1)[1], "\n")
## ARCH(1) Model AIC: 4.249404
cat("ARCH(1) Model BIC:", infocriteria(arch_model_1)[2], "\n")
## ARCH(1) Model BIC: 4.269173
#An ARCH(1) model, in which the conditional variance depends on the previous period's squared shock, was fit with rugarch; its AIC (4.249) and BIC (4.269) provide a baseline for comparison with the richer variance specification in part (vi). These likelihood-based criteria are not directly comparable to the OLS variance regression in part (ii), which is estimated on a different scale.
# (vi) Add Second Lag to ARCH Model and Compare
spec_2 <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(2, 0)),
mean.model = list(armaOrder = c(0, 0)))
arch_model_2 <- ugarchfit(spec_2, nyse_lagged$return)
cat("ARCH(2) Model AIC:", infocriteria(arch_model_2)[1], "\n")
## ARCH(2) Model AIC: 4.246008
cat("ARCH(2) Model BIC:", infocriteria(arch_model_2)[2], "\n")
## ARCH(2) Model BIC: 4.272367
if (infocriteria(arch_model_2)[1] < infocriteria(arch_model_1)[1]) {
cat("ARCH(2) fits better based on AIC\n")
} else {
cat("ARCH(1) fits better based on AIC\n")
}
## ARCH(2) fits better based on AIC
if (infocriteria(arch_model_2)[2] < infocriteria(arch_model_1)[2]) {
cat("ARCH(2) fits better based on BIC\n")
} else {
cat("ARCH(1) fits better based on BIC\n")
}
## ARCH(1) fits better based on BIC
cat("ARCH(2) Coefficients:\n")
## ARCH(2) Coefficients:
print(coef(arch_model_2))
## mu omega alpha1 alpha2
## 0.25399854 2.83334375 0.29147529 0.07244814
#A second lag was added to the ARCH model (ARCH(2)) to check whether it improves the fit. By AIC, ARCH(2) is slightly better than ARCH(1); by BIC, which penalizes the extra parameter more heavily, ARCH(1) is preferred, suggesting the additional lag does not clearly earn its place in the model.