## Chapter 7
# Exercise 1
# Load libraries
library(wooldridge)
library(broom)
library(car)
## Loading required package: carData
# Load the dataset
data("sleep75")
# Fit the regression model
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
# Summary of the model
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## I(age^2) 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
#(i)
# Check the coefficient for male
coefs <- summary(model)$coefficients # named coefs to avoid masking lmtest::coeftest()
coefs["male", ]
## Estimate Std. Error t value Pr(>|t|)
## 87.75242861 34.32616153 2.55642999 0.01078518
#(ii)
# Check the coefficient for totwrk
coeftest["totwrk", ]
## Estimate Std. Error t value Pr(>|t|)
## -1.634232e-01 1.813210e-02 -9.012918e+00 1.891272e-18
#(iii)
# Run an F-test for age and age^2 jointly
linearHypothesis(model, c("age = 0", "I(age^2) = 0"))
##
## Linear hypothesis test:
## age = 0
## I(age^2) = 0
##
## Model 1: restricted model
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 702 122631662
## 2 700 122147777 2 483885 1.3865 0.2506
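# As a cross-check on linearHypothesis, the F statistic can be computed by hand.
# (A minimal sketch; rss_r and rss_ur are copied from the table printed above.)
rss_r <- 122631662   # RSS of the restricted model (age terms dropped)
rss_ur <- 122147777  # RSS of the unrestricted model
q <- 2               # number of restrictions
df_ur <- 700         # residual df of the unrestricted model
F_stat <- ((rss_r - rss_ur) / q) / (rss_ur / df_ur)
F_stat                                    # matches the 1.3865 reported above
pf(F_stat, q, df_ur, lower.tail = FALSE)  # matches the p-value of 0.2506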
# Exercise 3
data("gpa2")
# Fit the regression model
model <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)
# Display summary
summary(model)
##
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black,
## data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## I(hsize^2) -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## female:black 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
#(i)
# Test if hsize^2 is significant
coefs <- summary(model)$coefficients
coefs["I(hsize^2)", ]
## Estimate Std. Error t value Pr(>|t|)
## -2.194828e+00 5.271635e-01 -4.163468e+00 3.198417e-05
# Calculate the optimal high school size (hsize is measured in hundreds of students)
optimal_hsize <- -coef(model)["hsize"] / (2 * coef(model)["I(hsize^2)"])
optimal_hsize
## hsize
## 4.39603
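# As a rough check on the vertex formula (a sketch, holding female = black = 0),
# predicted SAT traced over a grid of class sizes peaks near the same hsize:
hsize_grid <- seq(1, 10, by = 0.01)
sat_hat <- coef(model)["(Intercept)"] +
coef(model)["hsize"] * hsize_grid +
coef(model)["I(hsize^2)"] * hsize_grid^2
hsize_grid[which.max(sat_hat)]
# Since hsize is in hundreds, the optimum is roughly 440 students.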
#(ii)
# Coefficient for female
coeftest["female", ]
## Estimate Std. Error t value Pr(>|t|)
## -4.509145e+01 4.291063e+00 -1.050822e+01 1.656301e-25
#(iii)
# Coefficient for black
coeftest["black", ]
## Estimate Std. Error t value Pr(>|t|)
## -1.698126e+02 1.271315e+01 -1.335725e+01 7.140387e-40
linearHypothesis(model, "black = 0")
##
## Linear hypothesis test:
## black = 0
##
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4132 76652652
## 2 4131 73479121 1 3173531 178.42 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#(iv)
# Calculate difference
diff_black_female <- coef(model)["black"] + coef(model)["female:black"]
diff_black_female
## black
## -107.5063
linearHypothesis(model, c("black + female:black = 0"))
##
## Linear hypothesis test:
## black + female:black = 0
##
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4132 74700518
## 2 4131 73479121 1 1221397 68.667 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Exercise C1
data("gpa1")
#(i)
# Model with mothcoll and fathcoll
model1 <- lm(colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC, data = gpa1)
# Display summary
summary(model1)
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC,
## data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## PC 0.151854 0.058716 2.586 0.010762 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
#(ii)
# Joint significance test
linearHypothesis(model1, c("mothcoll = 0", "fathcoll = 0"))
##
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
##
## Model 1: restricted model
## Model 2: colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 137 15.149
## 2 135 15.094 2 0.054685 0.2446 0.7834
#(iii)
# Model with hsGPA squared
model2 <- lm(colGPA ~ hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll + PC, data = gpa1)
# Display summary
summary(model2)
##
## Call:
## lm(formula = colGPA ~ hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll +
## PC, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78998 -0.24327 -0.00648 0.26179 0.72231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.040328 2.443038 2.063 0.0410 *
## hsGPA -1.802520 1.443552 -1.249 0.2140
## I(hsGPA^2) 0.337341 0.215711 1.564 0.1202
## ACT 0.004786 0.010786 0.444 0.6580
## mothcoll 0.003091 0.060110 0.051 0.9591
## fathcoll 0.062761 0.062401 1.006 0.3163
## PC 0.140446 0.058858 2.386 0.0184 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared: 0.2361, Adjusted R-squared: 0.2019
## F-statistic: 6.904 on 6 and 134 DF, p-value: 2.088e-06
# Exercise C2
data("wage2")
#(i)
model1 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model1)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
#(ii)
model2 <- lm(log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + married + black + south + urban, data = wage2)
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure +
## I(tenure^2) + married + black + south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## I(exper^2) -0.0001138 0.0005319 -0.214 0.830622
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## I(tenure^2) -0.0007964 0.0004710 -1.691 0.091188 .
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
anova(model1, model2) # F-test for joint significance
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south +
## urban
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) +
## married + black + south + urban
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
#(iii)
model3 <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
summary(model3)
##
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## black 0.094809 0.255399 0.371 0.710561
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
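# The implied return to education differs by race; a small sketch using the
# estimates above (coefficient names as they appear in coef(model3)):
ret_nonblack <- coef(model3)["educ"]
ret_black <- coef(model3)["educ"] + coef(model3)["educ:black"]
c(nonblack = unname(ret_nonblack), black = unname(ret_black))
# Roughly 6.7% versus 4.4% per year of schooling, though the interaction
# term itself is not statistically significant.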
#(iv)
wage2$group <- interaction(wage2$married, wage2$black)
model4 <- lm(log(wage) ~ group + educ + exper + tenure + south + urban, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ group + educ + exper + tenure + south +
## urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## group1.0 0.188915 0.042878 4.406 1.18e-05 ***
## group0.1 -0.240820 0.096023 -2.508 0.012314 *
## group1.1 0.009448 0.056013 0.169 0.866083
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
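# Part (iv) asks for the wage differential between married blacks and married
# nonblacks; a sketch using the group dummies above (base group: single, nonblack):
diff_married <- coef(model4)["group1.1"] - coef(model4)["group1.0"]
diff_married  # about -0.18, i.e., married blacks earn roughly 18% less
linearHypothesis(model4, "group1.1 - group1.0 = 0")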
## Chapter 8
# Exercise 1
# Statements (ii) and (iii) are true; both are consequences of heteroskedasticity.
# Exercise 5
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the dataset
data(smoke)
# Create a binary smoking variable
smoke$smokes <- ifelse(smoke$cigs > 0, 1, 0)
# Estimate the model
model <- lm(smokes ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model)
##
## Call:
## lm(formula = smokes ~ log(cigpric) + log(income) + educ + age +
## I(age^2) + restaurn + white, data = smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.6715 -0.3958 -0.2724 0.5344 0.9340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.6560624 0.8549619 0.767 0.443095
## log(cigpric) -0.0689259 0.2041088 -0.338 0.735684
## log(income) 0.0121620 0.0257244 0.473 0.636498
## educ -0.0288802 0.0059008 -4.894 1.19e-06 ***
## age 0.0198158 0.0056660 3.497 0.000496 ***
## I(age^2) -0.0002598 0.0000617 -4.210 2.84e-05 ***
## restaurn -0.1007616 0.0394431 -2.555 0.010815 *
## white -0.0256801 0.0515172 -0.498 0.618286
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4734 on 799 degrees of freedom
## Multiple R-squared: 0.062, Adjusted R-squared: 0.05378
## F-statistic: 7.544 on 7 and 799 DF, p-value: 8.035e-09
# Robust standard errors
robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5606e-01 8.6408e-01 0.7593 0.4479200
## log(cigpric) -6.8926e-02 2.0803e-01 -0.3313 0.7404890
## log(income) 1.2162e-02 2.5823e-02 0.4710 0.6377879
## educ -2.8880e-02 5.6421e-03 -5.1187 3.857e-07 ***
## age 1.9816e-02 5.4115e-03 3.6618 0.0002669 ***
## I(age^2) -2.5979e-04 5.7133e-05 -4.5470 6.284e-06 ***
## restaurn -1.0076e-01 3.7690e-02 -2.6734 0.0076615 **
## white -2.5680e-02 5.0690e-02 -0.5066 0.6125706
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
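# A compact side-by-side comparison of the usual and robust standard errors
# (a sketch; the differences here are modest, as the output above suggests):
se_usual <- sqrt(diag(vcov(model)))
se_robust <- sqrt(diag(vcovHC(model, type = "HC1")))
round(cbind(se_usual, se_robust), 5)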
# (ii) Effect of 4 additional years of education
change_educ <- coef(model)["educ"] * 4
print(paste("Change in smoking probability with 4 more years of education:", change_educ))
## [1] "Change in smoking probability with 4 more years of education: -0.115520886961172"
# (iii) Age that reduces smoking probability
age_vertex <- -coef(model)["age"] / (2 * coef(model)["I(age^2)"])
print(paste("Age at which smoking probability starts decreasing:", age_vertex))
## [1] "Age at which smoking probability starts decreasing: 38.1386089151421"
# (iv) Interpretation of restaurn coefficient
restaurn_coef <- coef(model)["restaurn"]
print(paste("The coefficient on restaurn is:", restaurn_coef))
## [1] "The coefficient on restaurn is: -0.100761630797458"
print("Interpretation: Living in a state with restaurant smoking restrictions reduces the probability of smoking by approximately this value (in percentage points).")
## [1] "Interpretation: Living in a state with restaurant smoking restrictions reduces the probability of smoking by approximately this value (in percentage points)."
# (v) Predicted probability for individual
predicted <- coef(model)["(Intercept)"] +
coef(model)["log(cigpric)"] * log(67.44) +
coef(model)["log(income)"] * log(6500) +
coef(model)["educ"] * 16 +
coef(model)["age"] * 77 +
coef(model)["I(age^2)"] * (77^2) +
coef(model)["restaurn"] * 0 +
coef(model)["white"] * 0
print(paste("Predicted probability of smoking for individual 206:", predicted))
## [1] "Predicted probability of smoking for individual 206: -0.00396546766766326"
# Exercise C4
# Load the VOTE1 dataset
data("vote1")
#(i)
# Run the OLS regression
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
# Get residuals
vote1$residuals <- resid(model)
# Regress residuals on independent variables
resid_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
# Display R-squared (essentially zero by construction: OLS residuals are orthogonal to the regressors)
summary(resid_model)$r.squared
## [1] 1.198131e-32
#(ii)
# Perform Breusch-Pagan test
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 9.0934, df = 4, p-value = 0.05881
#(iii)
# Perform White test using auxiliary regression with squared terms
white_test <- bptest(model, ~ prtystrA + democA + log(expendA) + log(expendB) +
I(prtystrA^2) + I(democA^2) + I(log(expendA)^2) + I(log(expendB)^2),
data = vote1)
# Display test results
white_test
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 19.581, df = 7, p-value = 0.00655
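# An equivalent, more parsimonious form of the White test uses the fitted
# values (a sketch; this is the special case with yhat and yhat^2 as the
# auxiliary regressors):
bptest(model, ~ fitted(model) + I(fitted(model)^2))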
# Exercise C13
data("fertil2")
#(i)
# Estimate the model
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
# Summary with usual standard errors
summary(model)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
# Heteroskedasticity-robust standard errors
coeftest(model, vcov = vcovHC(model, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22251623 0.24385099 -17.3160 < 2.2e-16 ***
## age 0.34092552 0.01917466 17.7800 < 2.2e-16 ***
## I(age^2) -0.00274121 0.00035051 -7.8206 6.549e-15 ***
## educ -0.07523232 0.00630771 -11.9270 < 2.2e-16 ***
## electric -0.31004041 0.06394815 -4.8483 1.289e-06 ***
## urban -0.20003386 0.04547093 -4.3992 1.113e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#(ii)
# Extended model with religious dummies
model2 <- lm(children ~ age + I(age^2) + educ + electric + urban + catholic + protest, data = fertil2)
# Joint significance test
linearHypothesis(model2, c("catholic = 0", "protest = 0"), vcov = vcovHC(model2, type = "HC1"))
##
## Linear hypothesis test:
## catholic = 0
## protest = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + catholic +
## protest
##
## Note: Coefficient covariance matrix supplied.
##
## Res.Df Df F Pr(>F)
## 1 4352
## 2 4350 2 0.1394 0.8699
#(iii)
# Obtain fitted values and residuals
fitted_vals <- fitted(model2)
residuals_sq <- resid(model2)^2
# Regression of squared residuals on fitted values
het_test <- lm(residuals_sq ~ fitted_vals + I(fitted_vals^2))
# Test joint significance
summary(het_test)
##
## Call:
## lm(formula = residuals_sq ~ fitted_vals + I(fitted_vals^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.747 -1.278 -0.300 0.339 47.684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.31003 0.11168 2.776 0.00552 **
## fitted_vals -0.14880 0.10204 -1.458 0.14483
## I(fitted_vals^2) 0.26755 0.01965 13.614 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.642 on 4355 degrees of freedom
## Multiple R-squared: 0.2499, Adjusted R-squared: 0.2495
## F-statistic: 725.4 on 2 and 4355 DF, p-value: < 2.2e-16
linearHypothesis(het_test, c("fitted_vals = 0", "I(fitted_vals^2) = 0"))
##
## Linear hypothesis test:
## fitted_vals = 0
## I(fitted_vals^2) = 0
##
## Model 1: restricted model
## Model 2: residuals_sq ~ fitted_vals + I(fitted_vals^2)
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4357 77019
## 2 4355 57774 2 19246 725.37 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
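# The LM form of this special-case White test (a sketch): n times the
# R-squared of the auxiliary regression, compared to a chi-squared(2).
LM <- nobs(het_test) * summary(het_test)$r.squared
LM
pchisq(LM, df = 2, lower.tail = FALSE)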
#(iv)
# Yes. A significant portion of the error variance is systematically related to the fitted values, so heteroskedasticity is clearly present.
## Chapter 9
#Exercise 1
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Step 1: Load the CEOSAL2 dataset
data("ceosal2")
# Step 2: Estimate the base model
base_model <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)
summary(base_model)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + comten, data = ceosal2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5436 -0.2796 -0.0164 0.2857 1.9879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571977 0.253466 18.038 < 2e-16 ***
## log(sales) 0.187787 0.040003 4.694 5.46e-06 ***
## log(mktval) 0.099872 0.049214 2.029 0.04397 *
## profmarg -0.002211 0.002105 -1.050 0.29514
## ceoten 0.017104 0.005540 3.087 0.00236 **
## comten -0.009238 0.003337 -2.768 0.00626 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared: 0.3525, Adjusted R-squared: 0.3336
## F-statistic: 18.62 on 5 and 171 DF, p-value: 9.488e-15
# Step 3: Calculate and store the R-squared of the base model
R2_base <- summary(base_model)$r.squared
cat("R-squared for the base model: ", R2_base, "\n")
## R-squared for the base model: 0.3525374
# Step 4: Add ceoten^2 and comten^2 and estimate the updated model
CEOSAL2 <- ceosal2 %>%
mutate(ceoten_sq = ceoten^2, comten_sq = comten^2)
updated_model <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten + ceoten_sq + comten_sq, data = CEOSAL2)
summary(updated_model)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + comten + ceoten_sq + comten_sq, data = CEOSAL2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47661 -0.26615 -0.03878 0.27787 1.89344
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.424e+00 2.656e-01 16.655 < 2e-16 ***
## log(sales) 1.857e-01 3.980e-02 4.665 6.25e-06 ***
## log(mktval) 1.018e-01 4.872e-02 2.089 0.038228 *
## profmarg -2.575e-03 2.088e-03 -1.233 0.219137
## ceoten 4.772e-02 1.416e-02 3.371 0.000929 ***
## comten -6.063e-03 1.189e-02 -0.510 0.610815
## ceoten_sq -1.119e-03 4.814e-04 -2.324 0.021329 *
## comten_sq -5.389e-05 2.528e-04 -0.213 0.831474
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4892 on 169 degrees of freedom
## Multiple R-squared: 0.3745, Adjusted R-squared: 0.3486
## F-statistic: 14.45 on 7 and 169 DF, p-value: 1.136e-14
# Step 5: Extract R-squared for the updated model
R2_updated <- summary(updated_model)$r.squared
cat("R-squared for the updated model: ", R2_updated, "\n")
## R-squared for the updated model: 0.3744746
# Step 6: Compare R-squared values
if (R2_updated > R2_base) {
cat("The R-squared increased after adding ceoten^2 and comten^2, suggesting a possible functional form misspecification in the base model.\n")
} else {
cat("No evidence of functional form misspecification.\n")
}
## The R-squared increased after adding ceoten^2 and comten^2, suggesting a possible functional form misspecification in the base model.
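# Comparing R-squared values is only suggestive (R-squared never falls when
# regressors are added). A formal alternative is Ramsey's RESET, which tests
# powers of the fitted values (a sketch using lmtest::resettest):
resettest(base_model, power = 2:3, type = "fitted")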
# Exercise 5
data("crime4")
missing_crimes <- crime4 %>%
filter(is.na(crmrte))
cat("Number of colleges missing crime data: ", nrow(missing_crimes), "\n")
## Number of colleges missing crime data: 0
# Exercise C4
# Part (i): Reestimate the model including the DC dummy
data("infmrt", package = "wooldridge")
model_with_dc <- lm(infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
summary(model_with_dc)
##
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6894 -0.8973 -0.1412 0.7054 3.1705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.2125 6.7267 5.383 5.09e-07 ***
## lpcinc -2.3964 0.8866 -2.703 0.00812 **
## lphysic -1.5548 0.7734 -2.010 0.04718 *
## lpopul 0.5755 0.1365 4.215 5.60e-05 ***
## DC 13.9632 1.2466 11.201 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.255 on 97 degrees of freedom
## Multiple R-squared: 0.6432, Adjusted R-squared: 0.6285
## F-statistic: 43.72 on 4 and 97 DF, p-value: < 2.2e-16
# Extract and interpret the coefficient for DC
dc_coeff <- summary(model_with_dc)$coefficients["DC", ]
cat("Coefficient on DC:", dc_coeff[1], "\n")
## Coefficient on DC: 13.96317
cat("Standard error for DC:", dc_coeff[2], "\n")
## Standard error for DC: 1.246646
cat("P-value for DC:", dc_coeff[4], "\n")
## P-value for DC: 3.514299e-19
# Part (ii): Reestimate the model without the DC dummy
model_without_dc <- lm(infmort ~ lpcinc + lphysic + lpopul, data = infmrt)
summary(model_without_dc)
##
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0204 -1.1897 -0.1152 0.9083 8.1890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.22614 10.13465 3.574 0.000547 ***
## lpcinc -4.88415 1.29319 -3.777 0.000273 ***
## lphysic 4.02783 0.89101 4.521 1.73e-05 ***
## lpopul -0.05356 0.18748 -0.286 0.775712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.891 on 98 degrees of freedom
## Multiple R-squared: 0.1818, Adjusted R-squared: 0.1568
## F-statistic: 7.26 on 3 and 98 DF, p-value: 0.0001898
# Extract coefficients and standard errors from both models
coeff_without_dc <- coef(summary(model_without_dc))[, c(1, 2)] # Without DC
coeff_with_dc <- coef(summary(model_with_dc))[, c(1, 2)] # With DC
# Align the rows by variable names
comparison <- merge(
as.data.frame(coeff_without_dc, row.names = rownames(coeff_without_dc)),
as.data.frame(coeff_with_dc, row.names = rownames(coeff_with_dc)),
by = "row.names",
all = TRUE
)
# Rename columns for clarity
colnames(comparison) <- c("Variable", "Estimate (Without DC)", "Std. Error (Without DC)",
"Estimate (With DC)", "Std. Error (With DC)")
# Display the comparison table
print(comparison)
## Variable Estimate (Without DC) Std. Error (Without DC) Estimate (With DC)
## 1 (Intercept) 36.22614485 10.1346544 36.2125046
## 2 DC NA NA 13.9631727
## 3 lpcinc -4.88414573 1.2931923 -2.3963515
## 4 lphysic 4.02783276 0.8910099 -1.5547710
## 5 lpopul -0.05356253 0.1874791 0.5755155
## Std. Error (With DC)
## 1 6.7267094
## 2 1.2466463
## 3 0.8866076
## 4 0.7734138
## 5 0.1365241
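# Chapter 9's outlier theme suggests a quick diagnostic (a sketch): the
# largest studentized residual in the model without the DC dummy is extreme,
# consistent with DC acting as an influential outlier.
max(abs(rstudent(model_without_dc)))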
# Exercise C5
library(dplyr)
library(quantreg)
## Loading required package: SparseM
# Load the RDCHEM dataset
data("rdchem")
# Inspect the dataset
summary(rdchem)
## rd sales profits rdintens
## Min. : 1.70 Min. : 42.0 Min. : -4.30 Min. :1.027
## 1st Qu.: 10.85 1st Qu.: 507.8 1st Qu.: 42.12 1st Qu.:2.031
## Median : 42.60 Median : 1332.5 Median : 111.70 Median :2.641
## Mean : 153.68 Mean : 3797.0 Mean : 370.50 Mean :3.266
## 3rd Qu.: 78.85 3rd Qu.: 2856.6 3rd Qu.: 295.68 3rd Qu.:3.965
## Max. :1428.00 Max. :39709.0 Max. :4154.00 Max. :9.422
## profmarg salessq lsales lrd
## Min. :-3.219 Min. :1.764e+03 Min. : 3.738 Min. :0.5306
## 1st Qu.: 4.074 1st Qu.:2.579e+05 1st Qu.: 6.230 1st Qu.:2.3839
## Median : 8.760 Median :1.790e+06 Median : 7.191 Median :3.7498
## Mean : 9.823 Mean :7.020e+07 Mean : 7.165 Mean :3.6028
## 3rd Qu.:16.456 3rd Qu.:8.162e+06 3rd Qu.: 7.957 3rd Qu.:4.3631
## Max. :27.187 Max. :1.577e+09 Max. :10.589 Max. :7.2640
head(rdchem)
## rd sales profits rdintens profmarg salessq lsales lrd
## 1 430.6 4570.2 186.9 9.421906 4.089536 20886730.00 8.427312 6.0651798
## 2 59.0 2830.0 467.0 2.084806 16.501766 8008900.00 7.948032 4.0775375
## 3 23.5 596.8 107.4 3.937668 17.995979 356170.22 6.391582 3.1570003
## 4 3.5 133.6 -4.3 2.619760 -3.218563 17848.96 4.894850 1.2527629
## 5 1.7 42.0 8.0 4.047619 19.047619 1764.00 3.737670 0.5306283
## 6 8.4 390.0 47.3 2.153846 12.128205 152100.00 5.966147 2.1282318
# Convert sales to billions
rdchem <- rdchem %>%
mutate(sales_bil = sales / 1000,
sales_bil_sq = sales_bil^2)
#(i)
# OLS with all data
ols_full <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem)
summary(ols_full)
##
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_bil 0.316611 0.138854 2.280 0.03041 *
## sales_bil_sq -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
# Identify the firm with the largest sales
largest_firm <- rdchem %>% filter(sales_bil == max(sales_bil))
# Remove the firm with the largest sales
rdchem_no_outlier <- rdchem %>% filter(sales_bil != max(sales_bil))
# OLS without the largest firm
ols_no_outlier <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem_no_outlier)
summary(ols_no_outlier)
##
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## data = rdchem_no_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0843 -1.1354 -0.5505 0.7570 5.7783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.98352 0.71763 2.764 0.0102 *
## sales_bil 0.36062 0.23887 1.510 0.1427
## sales_bil_sq -0.01025 0.01308 -0.784 0.4401
## profmarg 0.05528 0.04579 1.207 0.2378
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.805 on 27 degrees of freedom
## Multiple R-squared: 0.1912, Adjusted R-squared: 0.1013
## F-statistic: 2.128 on 3 and 27 DF, p-value: 0.1201
#(ii)
# LAD with all data
lad_full <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem, tau = 0.5)
summary(lad_full)
##
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## tau = 0.5, data = rdchem)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_bil 0.26346 -0.13508 0.75753
## sales_bil_sq -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
# LAD without the largest firm
lad_no_outlier <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem_no_outlier, tau = 0.5)
summary(lad_no_outlier)
##
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg,
## tau = 0.5, data = rdchem_no_outlier)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 2.61047 0.58936 2.81404
## sales_bil -0.22364 -0.23542 0.87607
## sales_bil_sq 0.01681 -0.03201 0.02883
## profmarg 0.07594 0.00578 0.16392
#(iii)
cat("LAD is more resilient to outliers than OLS as it shows smaller variations after removing the largest firm.")
## LAD is more resilient to outliers than OLS as it shows smaller variations after removing the largest firm.
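# A small table of how each coefficient moves when the largest firm is
# dropped (a sketch supporting the comparison above; rq objects support coef()):
round(cbind(OLS_change = coef(ols_no_outlier) - coef(ols_full),
LAD_change = coef(lad_no_outlier) - coef(lad_full)), 4)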
## Chapter 10
# Exercise 1
# (i) Like cross-sectional observations, we can assume that most time series observations are independently distributed.
# Disagree.
# Time series data often exhibit autocorrelation, meaning current observations are correlated with past observations.
# Unlike cross-sectional data, the temporal order introduces dependence.
# (ii) The OLS estimator in a time series regression is unbiased under the first three Gauss-Markov assumptions.
# Agree.
# The first three time series Gauss-Markov assumptions (linear in parameters, no perfect collinearity, zero conditional mean of the errors)
# ensure that the OLS estimator remains unbiased in time series regressions.
# (iii) A trending variable cannot be used as the dependent variable in multiple regression analysis.
# Disagree.
# Trending variables can be used as dependent variables if the trend is explicitly modeled,
# such as including a time trend variable in the regression.
# (iv) Seasonality is not an issue when using annual time series observations.
# Agree.
# Seasonality is generally a concern in higher-frequency data (e.g., monthly or quarterly).
# Annual data averages out seasonal effects, reducing their impact.
# Exercise 5
# Specify a model that accounts for trends and seasonality:
cat("housing_model <- lm(housing_starts ~ time + interest_rate + income + Q1 + Q2 + Q3, data = housing_data)")
## housing_model <- lm(housing_starts ~ time + interest_rate + income + Q1 + Q2 + Q3, data = housing_data)
# Explanation:
# - time: Captures overall trend
# - interest_rate: Reflects interest rate effects
# - income: Reflects income effects
# - Q1, Q2, Q3: Seasonal dummies for the first three quarters (Q4 as the reference)
# - housing_starts: Dependent variable representing quarterly housing starts
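# Since housing_data is hypothetical, here is a runnable sketch on simulated
# quarterly data (all names and parameter values below are assumptions, chosen
# only to illustrate the trend-plus-seasonal-dummies specification):
set.seed(123)
n <- 80                                  # 20 years of quarterly data
time <- 1:n
quarter <- factor(rep(1:4, length.out = n))
interest_rate <- rnorm(n, mean = 6, sd = 1)
income <- 100 + 0.5 * time + rnorm(n)
housing_starts <- 50 + 0.3 * time - 2 * interest_rate + 0.4 * income +
5 * (quarter == "2") + rnorm(n)
housing_data <- data.frame(housing_starts, time, interest_rate, income, quarter)
# With quarter as a factor, R creates the seasonal dummies automatically
# (Q1 as the reference instead of the Q4 used above).
summary(lm(housing_starts ~ time + interest_rate + income + quarter, data = housing_data))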
# Exercise C1
# Load the dataset and create a dummy variable for years after 1979
data("intdef", package = "wooldridge")
intdef$after1979 <- ifelse(intdef$year > 1979, 1, 0)
# Fit the regression model
interest_model <- lm(i3 ~ inf + after1979, data = intdef)
# Display the summary of the model
summary(interest_model)
##
## Call:
## lm(formula = i3 ~ inf + after1979, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5906 -0.7579 0.1414 1.1752 3.8358
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.52833 0.44914 3.403 0.00128 **
## inf 0.62991 0.08151 7.728 3.05e-10 ***
## after1979 2.17781 0.49630 4.388 5.48e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.837 on 53 degrees of freedom
## Multiple R-squared: 0.6047, Adjusted R-squared: 0.5898
## F-statistic: 40.53 on 2 and 53 DF, p-value: 2.086e-11
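# An optional extension (a sketch, not part of the printed output above):
# allow the slope on inflation, not just the intercept, to shift after 1979.
interest_model2 <- lm(i3 ~ inf * after1979, data = intdef)
summary(interest_model2)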
# Exercise C9
# (i) Expected Signs:
# β1 (pcip): Positive. Higher industrial production growth often indicates economic growth, boosting stock returns.
# β2 (i3): Negative. Higher T-bill returns may lead investors to prefer risk-free assets over stocks.
# (ii) Estimate the equation by OLS
data("volat", package = "wooldridge")
volat_model <- lm(rsp500 ~ pcip + i3, data = volat)
summary(volat_model)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
# (iii) Statistical Significance
# pcip: Not statistically significant (p = 0.7785)
# i3: Statistically significant (p = 0.0121)
# Interpretation
# A one-unit increase in pcip has a very small and statistically insignificant effect on rsp500.
# A one-unit increase in i3 reduces rsp500 by approximately 1.36 points, and this relationship is statistically significant.
# (iv) Implication for Predictability
# i3 is significant, suggesting some predictability from short-term interest rates.
# pcip is insignificant, suggesting it does not add predictive power.
# Efficient Market Hypothesis (EMH)
# Low R-squared (0.01189) indicates most variation in rsp500 is unexplained,
# aligning with the EMH's assertion of largely unpredictable stock returns.
## Chapter 11
# Exercise C7
data("consump", package = "wooldridge")
# Part (i): Test the PIH
# Create the growth variable gct and lagged growth variable gct_lag
consump$gct <- c(NA, diff(log(consump$c)))
consump$gct_lag <- c(NA, head(consump$gct, -1))
# Fit the regression model for gct ~ gct_lag
model1 <- lm(gct ~ gct_lag, data = consump, na.action = na.omit)
summary(model1)
##
## Call:
## lm(formula = gct ~ gct_lag, data = consump, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027878 -0.005974 -0.001451 0.007142 0.020227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.011431 0.003778 3.026 0.00478 **
## gct_lag 0.446141 0.156047 2.859 0.00731 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01161 on 33 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1985, Adjusted R-squared: 0.1742
## F-statistic: 8.174 on 1 and 33 DF, p-value: 0.00731
# Part (ii): Add additional variables and test their significance
# Create additional variables
consump$gy <- c(NA, diff(log(consump$y))) # Growth rate of income
consump$gy_lag <- c(NA, head(consump$gy, -1)) # Lagged growth rate of income
consump$i3_lag <- c(NA, head(consump$i3, -1)) # Lagged T-bill rate
consump$inf_lag <- c(NA, head(consump$inf, -1)) # Lagged inflation rate
# Fit the regression model with additional variables
model2 <- lm(gct ~ gct_lag + gy_lag + i3_lag + inf_lag, data = consump, na.action = na.omit)
summary(model2)
##
## Call:
## lm(formula = gct ~ gct_lag + gy_lag + i3_lag + inf_lag, data = consump,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0249092 -0.0075869 0.0000855 0.0087233 0.0188618
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0225940 0.0070892 3.187 0.00335 **
## gct_lag 0.4335853 0.2896533 1.497 0.14486
## gy_lag -0.1079101 0.1946373 -0.554 0.58340
## i3_lag -0.0007467 0.0011107 -0.672 0.50654
## inf_lag -0.0008281 0.0010041 -0.825 0.41606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01134 on 30 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3038, Adjusted R-squared: 0.211
## F-statistic: 3.273 on 4 and 30 DF, p-value: 0.02431
# Part (iii): Compare the significance of gct_lag between model1 and model2
# The p-value for gct_lag is shown in the summaries of model1 and model2.
# Part (iv): Perform an F-test to check joint significance of added variables
anova_result <- anova(model1, model2)
anova_result
## Analysis of Variance Table
##
## Model 1: gct ~ gct_lag
## Model 2: gct ~ gct_lag + gy_lag + i3_lag + inf_lag
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 0.0044446
## 2 30 0.0038608 3 0.00058384 1.5122 0.2315
# Print conclusions
cat("\nPart (i) Conclusion:\n")
##
## Part (i) Conclusion:
cat("The coefficient for gct_lag in model1 is significant (p-value =", summary(model1)$coefficients[2, 4], ").\n")
## The coefficient for gct_lag in model1 is significant (p-value = 0.007310181 ).
cat("This suggests the PIH is not supported.\n")
## This suggests the PIH is not supported.
cat("\nPart (ii) Conclusion:\n")
##
## Part (ii) Conclusion:
cat("In model2, none of the additional variables (gy_lag, i3_lag, inf_lag) are individually significant (p-values > 0.05).\n")
## In model2, none of the additional variables (gy_lag, i3_lag, inf_lag) are individually significant (p-values > 0.05).
cat("\nPart (iii) Conclusion:\n")
##
## Part (iii) Conclusion:
cat("The p-value for gct_lag increases from model1 (p =", summary(model1)$coefficients[2, 4],
") to model2 (p =", summary(model2)$coefficients[2, 4], ").\n")
## The p-value for gct_lag increases from model1 (p = 0.007310181 ) to model2 (p = 0.1448643 ).
cat("This indicates that the significance of gct_lag decreases when additional variables are included.\n")
## This indicates that the significance of gct_lag decreases when additional variables are included.
cat("\nPart (iv) Conclusion:\n")
##
## Part (iv) Conclusion:
cat("The F-test for joint significance of the additional variables in model2 yields a p-value of", anova_result$`Pr(>F)`[2], ".\n")
## The F-test for joint significance of the additional variables in model2 yields a p-value of 0.2314828 .
cat("The additional variables are not jointly significant at the 5% level.\n")
## The additional variables are not jointly significant at the 5% level.
# Exercise C12
data("minwage", package = "wooldridge")
# Remove missing values from gwage232
gwage232_clean <- na.omit(minwage$gwage232)
# Calculate first-order autocorrelation
acf_result <- acf(gwage232_clean, plot = FALSE)
autocorrelation <- acf_result$acf[2] # First lag autocorrelation
cat("Part (i) First-order autocorrelation in gwage232:", autocorrelation, "\n")
## Part (i) First-order autocorrelation in gwage232: -0.03493135
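# A cross-check (a sketch): the lag-1 sample correlation computed directly.
# It differs slightly from acf()'s estimate because acf() centers both series
# with the full-sample mean and divides by n rather than n - 1.
cor(gwage232_clean[-1], head(gwage232_clean, -1))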
# Part (ii): Estimate the dynamic model
# Fit the dynamic model
model_dynamic <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
summary(model_dynamic)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044642 -0.004134 -0.001312 0.004482 0.041612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0024003 0.0004308 5.572 3.79e-08 ***
## lag(gwage232, 1) -0.0779092 0.0342851 -2.272 0.02341 *
## gmwage 0.1518459 0.0096485 15.738 < 2e-16 ***
## gcpi 0.2630876 0.0824457 3.191 0.00149 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007889 on 606 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.2986, Adjusted R-squared: 0.2951
## F-statistic: 85.99 on 3 and 606 DF, p-value: < 2.2e-16
# Part (iii): Add gemp232_lag to the model and test its significance
minwage$gemp232_lag <- c(NA, head(minwage$gemp232, -1)) # Lagged growth in employment
model_with_gemp <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + gemp232_lag, data = minwage, na.action = na.omit)
summary(model_with_gemp)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + gemp232_lag,
## data = minwage, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043842 -0.004378 -0.001034 0.004321 0.042548
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.002451 0.000426 5.753 1.4e-08 ***
## lag(gwage232, 1) -0.074546 0.033901 -2.199 0.028262 *
## gmwage 0.152707 0.009540 16.007 < 2e-16 ***
## gcpi 0.252296 0.081544 3.094 0.002066 **
## gemp232_lag 0.066131 0.016962 3.899 0.000108 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007798 on 605 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3158, Adjusted R-squared: 0.3112
## F-statistic: 69.8 on 4 and 605 DF, p-value: < 2.2e-16
# Part (iv): Compare the effect of adding lagged variables
# Extract coefficients for gmwage from the models
gmwage_coef_dynamic <- summary(model_dynamic)$coefficients["gmwage", "Estimate"]
gmwage_coef_with_gemp <- summary(model_with_gemp)$coefficients["gmwage", "Estimate"]
cat("Part (iv) Coefficient for gmwage in model_dynamic:", gmwage_coef_dynamic, "\n")
## Part (iv) Coefficient for gmwage in model_dynamic: 0.1518459
cat("Coefficient for gmwage in model_with_gemp:", gmwage_coef_with_gemp, "\n")
## Coefficient for gmwage in model_with_gemp: 0.152707
cat("Adding the lagged variables changes the effect of gmwage slightly.\n")
## Adding the lagged variables changes the effect of gmwage slightly.
# Part (v): Regression of gmwage on gwage232_lag and gemp232_lag
model_gmwage <- lm(gmwage ~ lag(gwage232, 1) + gemp232_lag, data = minwage, na.action = na.omit)
summary(model_gmwage)
##
## Call:
## lm(formula = gmwage ~ lag(gwage232, 1) + gemp232_lag, data = minwage,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01914 -0.00500 -0.00379 -0.00287 0.62208
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003433 0.001440 2.384 0.0174 *
## lag(gwage232, 1) 0.203167 0.143140 1.419 0.1563
## gemp232_lag -0.041706 0.072110 -0.578 0.5632
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03318 on 607 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.00392, Adjusted R-squared: 0.0006377
## F-statistic: 1.194 on 2 and 607 DF, p-value: 0.3036
# Report R-squared
rsquared <- summary(model_gmwage)$r.squared
cat("Part (v) R-squared from regression of gmwage:", rsquared, "\n")
## Part (v) R-squared from regression of gmwage: 0.003919682
cat("A higher R-squared suggests that gmwage is explained well by lagged variables, aligning with part (iv).\n")
## A higher R-squared suggests that gmwage is explained well by lagged variables, aligning with part (iv).
## Chapter 12
# Exercise C11
data("nyse", package = "wooldridge")
# Create the lagged variable for return
nyse$return_lag1 <- c(NA, head(nyse$return, -1))
# Fit the model (12.47)
model_1247 <- lm(return ~ return_lag1, data = nyse, na.action = na.omit)
summary(model_1247)
##
## Call:
## lm(formula = return ~ return_lag1, data = nyse, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.261 -1.302 0.098 1.316 8.065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17963 0.08074 2.225 0.0264 *
## return_lag1 0.05890 0.03802 1.549 0.1218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.11 on 687 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.003481, Adjusted R-squared: 0.00203
## F-statistic: 2.399 on 1 and 687 DF, p-value: 0.1218
# Exclude rows with missing values in the lagged variable (used in the regression)
nyse_clean <- nyse[!is.na(nyse$return_lag1), ]
# Add squared residuals to the cleaned dataset
nyse_clean$squared_resid <- residuals(model_1247)^2
# Calculate average, minimum, and maximum of squared residuals
avg_squared_resid <- mean(nyse_clean$squared_resid)
min_squared_resid <- min(nyse_clean$squared_resid)
max_squared_resid <- max(nyse_clean$squared_resid)
# Print the results
cat("Part (i) Average of squared residuals:", avg_squared_resid, "\n")
## Part (i) Average of squared residuals: 4.440839
cat("Minimum of squared residuals:", min_squared_resid, "\n")
## Minimum of squared residuals: 7.35465e-06
cat("Maximum of squared residuals:", max_squared_resid, "\n")
## Maximum of squared residuals: 232.8946
# Part (ii) Create the squared lagged variable in the cleaned dataset
nyse_clean$return_lag1_sq <- nyse_clean$return_lag1^2
# Estimate the heteroskedasticity model
hetero_model <- lm(squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
# Summarize the results
summary(hetero_model)
##
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.459 -3.011 -1.975 0.676 221.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25734 0.44085 7.389 4.32e-13 ***
## return_lag1 -0.78946 0.19569 -4.034 6.09e-05 ***
## return_lag1_sq 0.29666 0.03552 8.351 3.75e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared: 0.1303, Adjusted R-squared: 0.1278
## F-statistic: 51.4 on 2 and 686 DF, p-value: < 2.2e-16
# Report estimated coefficients, standard errors, R-squared, and adjusted R-squared
cat("Part (ii) Coefficients and diagnostics for the heteroskedasticity model:\n")
## Part (ii) Coefficients and diagnostics for the heteroskedasticity model:
print(summary(hetero_model))
##
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.459 -3.011 -1.975 0.676 221.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25734 0.44085 7.389 4.32e-13 ***
## return_lag1 -0.78946 0.19569 -4.034 6.09e-05 ***
## return_lag1_sq 0.29666 0.03552 8.351 3.75e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared: 0.1303, Adjusted R-squared: 0.1278
## F-statistic: 51.4 on 2 and 686 DF, p-value: < 2.2e-16
# Part (iii): Conditional variance as a function of lagged return
# Sketch conditional variance manually using the estimated coefficients
delta_0 <- coef(hetero_model)[1]
delta_1 <- coef(hetero_model)[2]
delta_2 <- coef(hetero_model)[3]
lagged_returns <- seq(min(nyse$return_lag1, na.rm = TRUE), max(nyse$return_lag1, na.rm = TRUE), length.out = 100)
conditional_variance <- delta_0 + delta_1 * lagged_returns + delta_2 * lagged_returns^2
# Plot the conditional variance
plot(lagged_returns, conditional_variance, type = "l", col = "blue",
main = "Conditional Variance as a Function of Lagged Return",
xlab = "Lagged Return", ylab = "Conditional Variance")
# Smallest variance and corresponding lagged return
min_var_index <- which.min(conditional_variance)
cat("Part (iii) Smallest variance:", conditional_variance[min_var_index], "\n")
## Part (iii) Smallest variance: 2.734254
cat("Lagged return corresponding to smallest variance:", lagged_returns[min_var_index], "\n")
## Lagged return corresponding to smallest variance: 1.245583
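# The grid minimum can be checked analytically (a sketch): the fitted
# quadratic is minimized at -delta_1 / (2 * delta_2).
vertex <- -delta_1 / (2 * delta_2)
unname(vertex)                                           # about 1.33
unname(delta_0 + delta_1 * vertex + delta_2 * vertex^2)  # about 2.73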
# Part (iv): Check for negative variance
any_negative_variance <- any(conditional_variance < 0)
cat("Part (iv) Does the model produce any negative variance?", any_negative_variance, "\n")
## Part (iv) Does the model produce any negative variance? FALSE
# Part (v): Compare model (ii) to ARCH(1)
# ARCH(1) regression
arch1_model <- lm(squared_resid ~ lag(squared_resid, 1), data = nyse_clean, na.action = na.omit)
summary(arch1_model)
##
## Call:
## lm(formula = squared_resid ~ lag(squared_resid, 1), data = nyse_clean,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.337 -3.292 -2.157 0.556 223.981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.94743 0.44023 6.695 4.49e-11 ***
## lag(squared_resid, 1) 0.33706 0.03595 9.377 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 686 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1136, Adjusted R-squared: 0.1123
## F-statistic: 87.92 on 1 and 686 DF, p-value: < 2.2e-16
# Compare R-squared
cat("Part (v) R-squared for heteroskedasticity model:", summary(hetero_model)$r.squared, "\n")
## Part (v) R-squared for heteroskedasticity model: 0.1303208
cat("R-squared for ARCH(1) model:", summary(arch1_model)$r.squared, "\n")
## R-squared for ARCH(1) model: 0.1136065
# Part (vi): ARCH(2) regression
# Create the second lag of squared residuals in the cleaned dataset
nyse_clean$squared_resid_lag2 <- c(NA, NA, head(nyse_clean$squared_resid, -2))
# Fit the ARCH(2) model
arch2_model <- lm(squared_resid ~ return_lag1 + return_lag1_sq + squared_resid_lag2, data = nyse_clean)
# Summarize the ARCH(2) model results
summary(arch2_model)
##
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq + squared_resid_lag2,
## data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.755 -3.039 -1.958 0.596 221.737
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.16429 0.45510 6.953 8.37e-12 ***
## return_lag1 -0.78460 0.19640 -3.995 7.17e-05 ***
## return_lag1_sq 0.28459 0.03810 7.469 2.47e-13 ***
## squared_resid_lag2 0.03492 0.03827 0.913 0.362
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.67 on 683 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1313, Adjusted R-squared: 0.1274
## F-statistic: 34.4 on 3 and 683 DF, p-value: < 2.2e-16
# Compare fit of ARCH(2) with previous models
cat("Part (vi) R-squared for ARCH(2) model:", summary(arch2_model)$r.squared, "\n")
## Part (vi) R-squared for ARCH(2) model: 0.1312591