Chapter 7 (1)

This document analyzes the sleep75 dataset from the wooldridge package to answer three questions:

library(wooldridge)
str(sleep75)
## 'data.frame':    706 obs. of  34 variables:
##  $ age     : int  32 31 44 30 64 41 35 47 32 30 ...
##  $ black   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ case    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ clerical: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ construc: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ educ    : int  12 14 17 12 14 12 12 13 17 15 ...
##  $ earns74 : num  0 9500 42500 42500 2500 ...
##  $ gdhlth  : int  0 1 1 1 1 1 1 1 1 1 ...
##  $ inlf    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ leis1   : int  3529 2140 4595 3211 4052 4812 4787 3544 4359 4211 ...
##  $ leis2   : int  3479 2140 4505 3211 4007 4797 4157 3469 4359 4061 ...
##  $ leis3   : int  3479 2140 4227 3211 4007 4797 4157 3439 4121 4061 ...
##  $ smsa    : int  0 0 1 0 0 0 0 1 0 1 ...
##  $ lhrwage : num  1.956 0.358 3.022 2.264 1.012 ...
##  $ lothinc : num  10.08 0 0 0 9.33 ...
##  $ male    : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ marr    : int  1 0 1 1 1 1 1 1 1 1 ...
##  $ prot    : int  1 1 0 1 1 1 1 1 0 0 ...
##  $ rlxall  : int  3163 2920 3038 3083 3493 4078 3810 3033 3606 3168 ...
##  $ selfe   : int  0 1 1 1 0 0 0 1 0 1 ...
##  $ sleep   : int  3113 2920 2670 3083 3448 4063 3180 2928 3368 3018 ...
##  $ slpnaps : int  3163 2920 2760 3083 3493 4078 3810 3003 3368 3168 ...
##  $ south   : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ spsepay : num  0 0 20000 5000 2400 0 12000 0 0 6000 ...
##  $ spwrk75 : int  0 0 1 1 1 0 1 0 0 1 ...
##  $ totwrk  : int  3438 5020 2815 3786 2580 1205 2113 3608 2353 2851 ...
##  $ union   : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ worknrm : int  3438 5020 2815 3786 2580 0 2113 3608 2353 2851 ...
##  $ workscnd: int  0 0 0 0 0 1205 0 0 0 0 ...
##  $ exper   : int  14 11 21 12 44 23 17 28 9 9 ...
##  $ yngkid  : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ yrsmarr : int  13 0 0 12 33 23 0 24 11 7 ...
##  $ hrwage  : num  7.07 1.43 20.53 9.62 2.75 ...
##  $ agesq   : int  1024 961 1936 900 4096 1681 1225 2209 1024 900 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(model)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## I(age^2)       0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16

1.Is there evidence that men sleep more than women?

coef_male <- summary(model)$coefficients["male", ]
cat("Coefficient for male:", coef_male["Estimate"], "\n")
## Coefficient for male: 87.75243
cat("P-value for male:", coef_male["Pr(>|t|)"], "\n")
## P-value for male: 0.01078518
if (coef_male["Pr(>|t|)"] < 0.05) {
  cat("Evidence shows that men sleep significantly more than women.\n")
} else {
  cat("No significant evidence that men sleep more than women.\n")
}
## Evidence shows that men sleep significantly more than women.

2. Is there a statistically significant tradeoff between working and sleeping?

coef_totwrk <- summary(model)$coefficients["totwrk", ]
cat("Coefficient for totwrk:", coef_totwrk["Estimate"], "\n")
## Coefficient for totwrk: -0.1634232
cat("P-value for totwrk:", coef_totwrk["Pr(>|t|)"], "\n")
## P-value for totwrk: 1.891272e-18
if (coef_totwrk["Pr(>|t|)"] < 0.05) {
  cat("There is a statistically significant tradeoff between working and sleeping.\n")
} else {
  cat("No significant tradeoff between working and sleeping.\n")
}
## There is a statistically significant tradeoff between working and sleeping.

3. Does age have a significant effect on sleep?

model_reduced <- lm(sleep ~ totwrk + educ + male, data = sleep75)


anova_test <- anova(model_reduced, model)
cat("ANOVA test results:\n")
## ANOVA test results:
print(anova_test)
## Analysis of Variance Table
## 
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1    702 122631662                           
## 2    700 122147777  2    483885 1.3865 0.2506
if (anova_test$`Pr(>F)`[2] < 0.05) {
  cat("Reject the null hypothesis: Age has a significant effect on sleeping.\n")
} else {
  cat("Fail to reject the null hypothesis: Age has no significant effect on sleeping.\n")
}
## Fail to reject the null hypothesis: Age has no significant effect on sleeping.

Chapter 7 (3)

library(wooldridge)
str(gpa2)
## 'data.frame':    4137 obs. of  12 variables:
##  $ sat     : int  920 1170 810 940 1180 980 880 980 1240 1230 ...
##  $ tothrs  : int  43 18 14 40 18 114 78 55 18 17 ...
##  $ colgpa  : num  2.04 4 1.78 2.42 2.61 ...
##  $ athlete : int  1 0 1 0 0 0 0 0 0 0 ...
##  $ verbmath: num  0.484 0.828 0.884 0.808 0.735 ...
##  $ hsize   : num  0.1 9.4 1.19 5.71 2.14 2.68 3.11 2.68 3.67 0.1 ...
##  $ hsrank  : int  4 191 42 252 86 41 161 101 161 3 ...
##  $ hsperc  : num  40 20.3 35.3 44.1 40.2 ...
##  $ female  : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ white   : int  0 1 1 1 1 1 0 1 1 1 ...
##  $ black   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hsizesq : num  0.01 88.36 1.42 32.6 4.58 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model <- lm(sat ~ hsize + I(hsize^2) + female + black + I(female * black), data = gpa2)
summary(model)
## 
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + I(female * 
##     black), data = gpa2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1028.0972     6.2902 163.445  < 2e-16 ***
## hsize               19.2971     3.8323   5.035 4.97e-07 ***
## I(hsize^2)          -2.1948     0.5272  -4.163 3.20e-05 ***
## female             -45.0915     4.2911 -10.508  < 2e-16 ***
## black             -169.8126    12.7131 -13.357  < 2e-16 ***
## I(female * black)   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16

Question (i): Is there strong evidence that hsize^2 should be included in the model?

coef_hsize2 <- summary(model)$coefficients["I(hsize^2)", ]
cat("Coefficient for hsize^2:", coef_hsize2["Estimate"], "\n")
## Coefficient for hsize^2: -2.194828
cat("P-value for hsize^2:", coef_hsize2["Pr(>|t|)"], "\n")
## P-value for hsize^2: 3.198417e-05
if (coef_hsize2["Pr(>|t|)"] < 0.05) {
  cat("There is strong evidence that hsize^2 should be included in the model.\n")
} else {
  cat("There is no strong evidence that hsize^2 should be included in the model.\n")
}
## There is strong evidence that hsize^2 should be included in the model.
optimal_hsize <- -coef(model)["hsize"] / (2 * coef(model)["I(hsize^2)"])
cat("Optimal high school size (in hundreds):", optimal_hsize, "\n")
## Optimal high school size (in hundreds): 4.39603

Question (ii): Estimated difference in SAT score between nonblack females and nonblack males

coef_female <- summary(model)$coefficients["female", ]
cat("Coefficient for female:", coef_female["Estimate"], "\n")
## Coefficient for female: -45.09145
cat("P-value for female:", coef_female["Pr(>|t|)"], "\n")
## P-value for female: 1.656301e-25

Statistical significance of the difference

if (coef_female["Pr(>|t|)"] < 0.05) {
  cat("The difference in SAT score between nonblack females and nonblack males is statistically significant.\n")
} else {
  cat("The difference in SAT score between nonblack females and nonblack males is not statistically significant.\n")
}
## The difference in SAT score between nonblack females and nonblack males is statistically significant.

Question (iii): Estimated difference in SAT score between nonblack males and black males

coef_black <- summary(model)$coefficients["black", ]
cat("Coefficient for black:", coef_black["Estimate"], "\n")
## Coefficient for black: -169.8126
cat("P-value for black:", coef_black["Pr(>|t|)"], "\n")
## P-value for black: 7.140387e-40
if (coef_black["Pr(>|t|)"] < 0.05) {
  cat("There is a statistically significant difference in SAT score between nonblack males and black males.\n")
} else {
  cat("There is no statistically significant difference in SAT score between nonblack males and black males.\n")
}
## There is a statistically significant difference in SAT score between nonblack males and black males.

Question (iv): Estimated difference in SAT score between black females and nonblack females

coef_female_black <- summary(model)$coefficients["I(female * black)", ]
diff_black_female <- coef_black["Estimate"] + coef_female_black["Estimate"]
cat("Estimated difference in SAT score between black females and nonblack females:", diff_black_female, "\n")
## Estimated difference in SAT score between black females and nonblack females: -107.5063

Test statistical significance

p_value_female_black <- coef_female_black["Pr(>|t|)"]
cat("P-value for female-black interaction term:", p_value_female_black, "\n")
## P-value for female-black interaction term: 0.0006048744
if (p_value_female_black < 0.05) {
  cat("The difference in SAT score between black females and nonblack females is statistically significant.\n")
} else {
  cat("The difference in SAT score between black females and nonblack females is not statistically significant.\n")
}
## The difference in SAT score between black females and nonblack females is statistically significant.

C1 Use the data in GPAl for this exercise.

(i)Add the variables mothcoll and fathcoll to the equation estimated in (7.6) and report the results in the usual form.

data(wage2)


model_wage <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model_wage)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16

(ii) Test for joint significance of mothcoll and fathcoll in the equation from part (i) and be sure to report the p-value.

wage2$exper_sq <- wage2$exper^2
wage2$tenure_sq <- wage2$tenure^2

model_wage_quad <- lm(log(wage) ~ educ + exper + exper_sq + tenure + tenure_sq + married + black + south + urban, data = wage2)
summary(model_wage_quad)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + exper_sq + tenure + tenure_sq + 
##     married + black + south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98236 -0.21972 -0.00036  0.24078  1.25127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3586756  0.1259143  42.558  < 2e-16 ***
## educ         0.0642761  0.0063115  10.184  < 2e-16 ***
## exper        0.0172146  0.0126138   1.365 0.172665    
## exper_sq    -0.0001138  0.0005319  -0.214 0.830622    
## tenure       0.0249291  0.0081297   3.066 0.002229 ** 
## tenure_sq   -0.0007964  0.0004710  -1.691 0.091188 .  
## married      0.1985470  0.0391103   5.077 4.65e-07 ***
## black       -0.1906636  0.0377011  -5.057 5.13e-07 ***
## south       -0.0912153  0.0262356  -3.477 0.000531 ***
## urban        0.1854241  0.0269585   6.878 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.2477 
## F-statistic: 35.17 on 9 and 925 DF,  p-value: < 2.2e-16
model_wage_restricted <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
anova(model_wage_restricted, model_wage_quad)
## Analysis of Variance Table
## 
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south + 
##     urban
## Model 2: log(wage) ~ educ + exper + exper_sq + tenure + tenure_sq + married + 
##     black + south + urban
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    925 123.42  2   0.39756 1.4898  0.226

(iii) Add hsGPA^2 to the model from part (i) and decide whether this generalization is needed.

model_wage_interaction <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
summary(model_wage_interaction)
## 
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## black        0.094809   0.255399   0.371 0.710561    
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16
wage2$married_black <- wage2$married * wage2$black
wage2$married_nonblack <- wage2$married * (1 - wage2$black)
wage2$single_black <- (1 - wage2$married) * wage2$black
wage2$single_nonblack <- (1 - wage2$married) * (1 - wage2$black)


model_wage_groups <- lm(log(wage) ~ married_black + married_nonblack + single_black + single_nonblack + educ + exper + tenure + south + urban, data = wage2)
summary(model_wage_groups)
## 
## Call:
## lm(formula = log(wage) ~ married_black + married_nonblack + single_black + 
##     single_nonblack + educ + exper + tenure + south + urban, 
##     data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98013 -0.21780  0.01057  0.24219  1.22889 
## 
## Coefficients: (1 not defined because of singularities)
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.403793   0.114122  47.351  < 2e-16 ***
## married_black     0.009448   0.056013   0.169 0.866083    
## married_nonblack  0.188915   0.042878   4.406 1.18e-05 ***
## single_black     -0.240820   0.096023  -2.508 0.012314 *  
## single_nonblack         NA         NA      NA       NA    
## educ              0.065475   0.006253  10.471  < 2e-16 ***
## exper             0.014146   0.003191   4.433 1.04e-05 ***
## tenure            0.011663   0.002458   4.745 2.41e-06 ***
## south            -0.091989   0.026321  -3.495 0.000497 ***
## urban             0.184350   0.026978   6.833 1.50e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared:  0.2528, Adjusted R-squared:  0.2464 
## F-statistic: 39.17 on 8 and 926 DF,  p-value: < 2.2e-16

C2 Use the data in WAGE2 for this exercise.

(i)Estimate the model and report the results in the usual form. Holding other factors fixed, what is the approximate difference in monthly salary between blacks and nonblacks? Is this difference statistically significant?

library(car)
## Loading required package: carData
model1 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)


summary(model1)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16
coef_black <- coef(summary(model1))["black", "Estimate"]
pval_black <- coef(summary(model1))["black", "Pr(>|t|)"]


cat("Difference in log(wage) between blacks and nonblacks (coefficient for black):", coef_black, "\n")
## Difference in log(wage) between blacks and nonblacks (coefficient for black): -0.1883499
cat("P-value for black:", pval_black, "\n")
## P-value for black: 6.839315e-07
if (pval_black < 0.05) {
  cat("The difference is statistically significant at the 5% level.\n")
} else {
  cat("The difference is NOT statistically significant at the 5% level.\n")
}
## The difference is statistically significant at the 5% level.

(ii)Add the variables exper? and tenure? to the equation and show that they are jointly insignificant at even the 20% level.

wage2$exper2 <- wage2$exper^2
wage2$tenure2 <- wage2$tenure^2


model2 <- lm(log(wage) ~ educ + exper + exper2 + tenure + tenure2 + married + black + south + urban, data = wage2)


test <- linearHypothesis(model2, c("exper2 = 0", "tenure2 = 0"))


cat("Joint significance test for exper^2 and tenure^2:\n")
## Joint significance test for exper^2 and tenure^2:
print(test)
## 
## Linear hypothesis test:
## exper2 = 0
## tenure2 = 0
## 
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + exper2 + tenure + tenure2 + married + 
##     black + south + urban
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    925 123.42  2   0.39756 1.4898  0.226
if (test$`Pr(>F)`[2] > 0.2) {
  cat("Exper^2 and tenure^2 are jointly insignificant at the 20% level.\n")
} else {
  cat("Exper^2 and tenure^2 are significant at the 20% level.\n")
}
## Exper^2 and tenure^2 are jointly insignificant at the 20% level.

(iii) Extend the original model to allow the return to education to depend on race and test whether the return to education does depend on race.

model3 <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)

# View the summary of the model
summary(model3)
## 
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## black        0.094809   0.255399   0.371 0.710561    
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16
# Test if the return to education depends on race (interaction term)
coef_interaction <- summary(model3)$coefficients["educ:black", ]
cat("Coefficient for educ:black interaction:", coef_interaction["Estimate"], "\n")
## Coefficient for educ:black interaction: -0.02262361
cat("P-value for educ:black interaction:", coef_interaction["Pr(>|t|)"], "\n")
## P-value for educ:black interaction: 0.2626026

(iv) Again, start with the original model, but now allow wages to differ across four groups of people: married and black, married and nonblack, single and black, and single and nonblack. What is the estimated wage differential between married blacks and married nonblacks?

wage2$married_black <- interaction(wage2$married, wage2$black)


model4 <- lm(log(wage) ~ married_black + educ + exper + tenure + south + urban, data = wage2)


summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ married_black + educ + exper + tenure + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98013 -0.21780  0.01057  0.24219  1.22889 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       5.403793   0.114122  47.351  < 2e-16 ***
## married_black1.0  0.188915   0.042878   4.406 1.18e-05 ***
## married_black0.1 -0.240820   0.096023  -2.508 0.012314 *  
## married_black1.1  0.009448   0.056013   0.169 0.866083    
## educ              0.065475   0.006253  10.471  < 2e-16 ***
## exper             0.014146   0.003191   4.433 1.04e-05 ***
## tenure            0.011663   0.002458   4.745 2.41e-06 ***
## south            -0.091989   0.026321  -3.495 0.000497 ***
## urban             0.184350   0.026978   6.833 1.50e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared:  0.2528, Adjusted R-squared:  0.2464 
## F-statistic: 39.17 on 8 and 926 DF,  p-value: < 2.2e-16
coef_married_black <- coef(summary(model4))["married_black1.1", "Estimate"]
coef_married_nonblack <- coef(summary(model4))["married_black1.0", "Estimate"]


wage_diff <- coef_married_black - coef_married_nonblack


cat("Estimated wage differential between married blacks and married nonblacks:", wage_diff, "\n")
## Estimated wage differential between married blacks and married nonblacks: -0.1794663

Chapter 8

1 Which of the following are consequences of heteroskedasticity?


(i)The OLS estimators, B, are inconsistent.


False. OLS estimators remain consistent even in the presence of heteroskedasticity.


(ii) The usual F statistic no longer has an F distribution.


True. The usual F-statistic does not follow the F-distribution under heteroskedasticity, as it assumes homoskedasticity.


(iii) The OLS estimators are no longer BLUE.


True. OLS estimators are no longer BLUE (Best Linear Unbiased Estimators) because heteroskedasticity violates the assumption of constant variance.


5 The variable smokes is a binary variable equal to one if a person smokes, and zero otherwise. Using the data in SMOKE, we estimate a linear probability model for smokes

The variable white equals one if the respondent is white, and zero otherwise; the other independent variables are defined in Example 8.7. Both the usual and heteroskedasticity-robust standard errors are reported.

(i)Are there any important differences between the two sets of standard errors?

library(wooldridge)
library(sandwich)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
model <- lm(cigs ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)


summary(model)
## 
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age + 
##     I(age^2) + restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.772  -9.330  -5.907   7.945  70.275 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.682419  24.220729  -0.111  0.91184    
## log(cigpric) -0.850907   5.782321  -0.147  0.88305    
## log(income)   0.869014   0.728763   1.192  0.23344    
## educ         -0.501753   0.167168  -3.001  0.00277 ** 
## age           0.774502   0.160516   4.825 1.68e-06 ***
## I(age^2)     -0.009069   0.001748  -5.188 2.70e-07 ***
## restaurn     -2.865621   1.117406  -2.565  0.01051 *  
## white        -0.559236   1.459461  -0.383  0.70169    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared:  0.05291,    Adjusted R-squared:  0.04461 
## F-statistic: 6.377 on 7 and 799 DF,  p-value: 2.588e-07
library(sandwich)
library(lmtest)


robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
cat("Robust vs. usual standard errors:\n")
## Robust vs. usual standard errors:
print(robust_se)
## 
## t test of coefficients:
## 
##                Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  -2.6824188 25.9019368 -0.1036  0.917544    
## log(cigpric) -0.8509074  6.0543955 -0.1405  0.888266    
## log(income)   0.8690140  0.5979719  1.4533  0.146542    
## educ         -0.5017532  0.1624097 -3.0894  0.002075 ** 
## age           0.7745022  0.1380317  5.6110 2.773e-08 ***
## I(age^2)     -0.0090686  0.0014589 -6.2159 8.214e-10 ***
## restaurn     -2.8656212  1.0172750 -2.8170  0.004968 ** 
## white        -0.5592364  1.3782830 -0.4057  0.685036    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(ii) Holding other factors fixed, if education increases by four years, what happens to the estimated probability of smoking?

coef_educ <- summary(model)$coefficients["educ", ]
effect_4_years <- coef_educ["Estimate"] * 4
cat("Effect of a 4-year increase in education on probability of smoking:", effect_4_years, "\n")
## Effect of a 4-year increase in education on probability of smoking: -2.007013

(iii)At what point does another year of age reduce the probability of smoking?

coef_age <- summary(model)$coefficients["age", "Estimate"]
coef_age2 <- summary(model)$coefficients["I(age^2)", "Estimate"]

age <- 77  #example
marginal_effect_age <- coef_age + 2 * coef_age2 * age
cat("Marginal effect of an additional year of age on smoking probability:", marginal_effect_age, "\n")
## Marginal effect of an additional year of age on smoking probability: -0.6220627

(iv) Interpret the coefficient on the binary variable restaurn (a dummy variable equal to one if the person lives in a state with restaurant smoking restrictions).

coef_restaurn <- summary(model)$coefficients["restaurn", ]
cat("Coefficient of restaurn:", coef_restaurn["Estimate"], "\n")
## Coefficient of restaurn: -2.865621
cat("Interpretation: Living in a state with restaurant smoking restrictions decreases the probability of smoking by", abs(coef_restaurn["Estimate"]), "on average.\n")
## Interpretation: Living in a state with restaurant smoking restrictions decreases the probability of smoking by 2.865621 on average.

(v) Person number 206 in the data set has the following characteristics: cigpric = 67.44, income = 6,500, educ = 16, age = 77, restaurn = 0, white = 0, and smokes = 0. Compute the predicted probability of smoking for this person and comment on the result.

person_206 <- data.frame(
  cigpric = 67.44,
  income = 6500,
  educ = 16,
  age = 77,
  restaurn = 0,
  white = 0
)


person_206$age2 <- person_206$age^2


prob_smoking <- predict(model, newdata = person_206)
cat("Predicted probability of smoking for person 206:", prob_smoking, "\n")
## Predicted probability of smoking for person 206: -0.7953678
cat("Comment: The predicted value is a probability, but linear probability models can sometimes predict values outside [0, 1]. Check if the prediction is reasonable.\n")
## Comment: The predicted value is a probability, but linear probability models can sometimes predict values outside [0, 1]. Check if the prediction is reasonable.

C4 Use VOTE1 for this exercise.


(i)Estimate a model with votaA as the dependent variable.

library(wooldridge)
library(lmtest) # For Breusch-Pagan and White tests


str(vote1)
## 'data.frame':    173 obs. of  10 variables:
##  $ state   : chr  "AL" "AK" "AZ" "AZ" ...
##  $ district: int  7 1 2 3 3 4 2 3 5 6 ...
##  $ democA  : int  1 0 1 0 0 1 0 1 1 1 ...
##  $ voteA   : int  68 62 73 69 75 69 59 71 76 73 ...
##  $ expendA : num  328.3 626.4 99.6 319.7 159.2 ...
##  $ expendB : num  8.74 402.48 3.07 26.28 60.05 ...
##  $ prtystrA: int  41 60 55 64 66 46 58 49 71 64 ...
##  $ lexpendA: num  5.79 6.44 4.6 5.77 5.07 ...
##  $ lexpendB: num  2.17 6 1.12 3.27 4.1 ...
##  $ shareA  : num  97.4 60.9 97 92.4 72.6 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

summary(model)
## 
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB), 
##     data = vote1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.576  -4.864  -1.146   4.903  24.566 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.66141    4.73604   7.952 2.56e-13 ***
## prtystrA      0.25192    0.07129   3.534  0.00053 ***
## democA        3.79294    1.40652   2.697  0.00772 ** 
## log(expendA)  5.77929    0.39182  14.750  < 2e-16 ***
## log(expendB) -6.23784    0.39746 -15.694  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared:  0.8012, Adjusted R-squared:  0.7964 
## F-statistic: 169.2 on 4 and 168 DF,  p-value: < 2.2e-16
residuals <- resid(model)

residuals_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(residuals_model)
## 
## Call:
## lm(formula = residuals ~ prtystrA + democA + log(expendA) + log(expendB), 
##     data = vote1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.576  -4.864  -1.146   4.903  24.566 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)   6.029e-15  4.736e+00       0        1
## prtystrA      3.001e-17  7.129e-02       0        1
## democA       -2.151e-15  1.407e+00       0        1
## log(expendA)  7.193e-17  3.918e-01       0        1
## log(expendB) -1.305e-15  3.975e-01       0        1
## 
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared:  8.097e-32,  Adjusted R-squared:  -0.02381 
## F-statistic: 3.401e-30 on 4 and 168 DF,  p-value: 1
cat("Explanation: The R-squared is 0 because the residuals are orthogonal to the fitted values of the independent variables by construction in OLS.\n")
## Explanation: The R-squared is 0 because the residuals are orthogonal to the fitted values of the independent variables by construction in OLS.

(ii) Now, compute the Breusch-Pagan test for heteroskedasticity. Use the F statistic version and report the p-value.

bp_test <- bptest(model)
cat("Breusch-Pagan test results:\n")
## Breusch-Pagan test results:
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 9.0934, df = 4, p-value = 0.05881

(iii) Compute the special case of the White test for heteroskedasticity, again using the F statistic form. How strong is the evidence for heteroskedasticity now?

white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2), data = vote1)
cat("White test results:\n")
## White test results:
print(white_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 5.49, df = 2, p-value = 0.06425

C13 Use the data in FERTIL2 to answer this question.

(i): Estimate the model

library(wooldridge)
library(lmtest)
library(sandwich)
library(car)


str(fertil2)
## 'data.frame':    4361 obs. of  27 variables:
##  $ mnthborn: int  5 1 7 11 5 8 7 9 12 9 ...
##  $ yearborn: int  64 56 58 45 45 52 51 70 53 39 ...
##  $ age     : int  24 32 30 42 43 36 37 18 34 49 ...
##  $ electric: int  1 1 1 1 1 1 1 1 0 1 ...
##  $ radio   : int  1 1 0 0 1 0 1 1 1 1 ...
##  $ tv      : int  1 1 0 1 1 0 1 1 0 0 ...
##  $ bicycle : int  1 1 0 0 1 0 1 1 0 0 ...
##  $ educ    : int  12 13 5 4 11 7 16 10 5 4 ...
##  $ ceb     : int  0 3 1 3 2 1 4 0 1 0 ...
##  $ agefbrth: int  NA 25 27 17 24 26 20 NA 19 NA ...
##  $ children: int  0 3 1 2 2 1 4 0 1 0 ...
##  $ knowmeth: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ usemeth : int  0 1 0 0 1 1 1 1 1 0 ...
##  $ monthfm : int  NA 11 6 1 3 11 5 NA 7 11 ...
##  $ yearfm  : int  NA 80 83 61 66 76 78 NA 72 61 ...
##  $ agefm   : int  NA 24 24 15 20 24 26 NA 18 22 ...
##  $ idlnchld: int  2 3 5 3 2 4 4 4 4 4 ...
##  $ heduc   : int  NA 12 7 11 14 9 17 NA 3 1 ...
##  $ agesq   : int  576 1024 900 1764 1849 1296 1369 324 1156 2401 ...
##  $ urban   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ urb_educ: int  12 13 5 4 11 7 16 10 5 4 ...
##  $ spirit  : int  0 0 1 0 0 0 0 0 0 1 ...
##  $ protest : int  0 0 0 0 1 0 0 0 1 0 ...
##  $ catholic: int  0 0 0 0 0 0 1 1 0 0 ...
##  $ frsthalf: int  1 1 0 0 1 0 0 0 0 0 ...
##  $ educ0   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ evermarr: int  0 1 1 1 1 1 1 0 1 1 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model1 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)


summary(model1)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16
robust_se <- coeftest(model1, vcov = vcovHC(model1, type = "HC1"))
cat("Heteroskedasticity-robust standard errors:\n")
## Heteroskedasticity-robust standard errors:
print(robust_se)
## 
## t test of coefficients:
## 
##                Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept) -4.22251623  0.24385099 -17.3160 < 2.2e-16 ***
## age          0.34092552  0.01917466  17.7800 < 2.2e-16 ***
## I(age^2)    -0.00274121  0.00035051  -7.8206 6.549e-15 ***
## educ        -0.07523232  0.00630771 -11.9270 < 2.2e-16 ***
## electric    -0.31004041  0.06394815  -4.8483 1.289e-06 ***
## urban       -0.20003386  0.04547093  -4.3992 1.113e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("Compare the robust and non-robust standard errors. Robust standard errors are generally larger when heteroskedasticity is present.\n")
## Compare the robust and non-robust standard errors. Robust standard errors are generally larger when heteroskedasticity is present.

(ii): Add religious dummy variables and test joint significance

model2 <- lm(children ~ age + I(age^2) + educ + electric + urban + catholic + protest + spirit, data = fertil2)


summary(model2)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban + 
##     catholic + protest + spirit, data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9514 -0.7041 -0.0101  0.7213  7.4435 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.3147175  0.2433391 -17.731  < 2e-16 ***
## age          0.3418668  0.0165186  20.696  < 2e-16 ***
## I(age^2)    -0.0027596  0.0002722 -10.139  < 2e-16 ***
## educ        -0.0762130  0.0064608 -11.796  < 2e-16 ***
## electric    -0.3057189  0.0690241  -4.429 9.69e-06 ***
## urban       -0.2034131  0.0465864  -4.366 1.29e-05 ***
## catholic     0.1173539  0.0834249   1.407   0.1596    
## protest      0.0753961  0.0652158   1.156   0.2477    
## spirit       0.1404603  0.0558052   2.517   0.0119 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.451 on 4349 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.574,  Adjusted R-squared:  0.5733 
## F-statistic: 732.6 on 8 and 4349 DF,  p-value: < 2.2e-16
joint_test <- linearHypothesis(model2, c("catholic = 0", "protest = 0", "spirit = 0"))
cat("Joint significance test for religious dummy variables:\n")
## Joint significance test for religious dummy variables:
print(joint_test)
## 
## Linear hypothesis test:
## catholic = 0
## protest = 0
## spirit = 0
## 
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + catholic + 
##     protest + spirit
## 
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   4352 9176.4                              
## 2   4349 9162.5  3     13.88 2.1961 0.08641 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("P-values from non-robust tests are in the joint significance test above.\n")
## P-values from non-robust tests are in the joint significance test above.
robust_joint_test <- linearHypothesis(model2, c("catholic = 0", "protest = 0", "spirit = 0"), vcov = vcovHC(model2, type = "HC1"))
cat("Robust joint significance test for religious dummy variables:\n")
## Robust joint significance test for religious dummy variables:
print(robust_joint_test)
## 
## Linear hypothesis test:
## catholic = 0
## protest = 0
## spirit = 0
## 
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + catholic + 
##     protest + spirit
## 
## Note: Coefficient covariance matrix supplied.
## 
##   Res.Df Df      F Pr(>F)  
## 1   4352                   
## 2   4349  3 2.1559 0.0911 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(iii): Test for heteroskedasticity using fitted values

fitted_values <- fitted(model2)
residuals_squared <- residuals(model2)^2


heteroskedasticity_model <- lm(residuals_squared ~ fitted_values + I(fitted_values^2))
summary(heteroskedasticity_model)
## 
## Call:
## lm(formula = residuals_squared ~ fitted_values + I(fitted_values^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.873 -1.290 -0.302  0.357 47.684 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          0.3126     0.1114   2.807  0.00503 ** 
## fitted_values       -0.1489     0.1018  -1.462  0.14381    
## I(fitted_values^2)   0.2668     0.0196  13.607  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.632 on 4355 degrees of freedom
## Multiple R-squared:  0.2501, Adjusted R-squared:  0.2497 
## F-statistic: 726.1 on 2 and 4355 DF,  p-value: < 2.2e-16
heteroskedasticity_test <- linearHypothesis(heteroskedasticity_model, c("fitted_values = 0", "I(fitted_values^2) = 0"))
cat("Heteroskedasticity test:\n")
## Heteroskedasticity test:
print(heteroskedasticity_test)
## 
## Linear hypothesis test:
## fitted_values = 0
## I(fitted_values^2) = 0
## 
## Model 1: restricted model
## Model 2: residuals_squared ~ fitted_values + I(fitted_values^2)
## 
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   4357 76589                                  
## 2   4355 57436  2     19153 726.11 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("Conclude that heteroskedasticity is present if the p-value is below 0.05.\n")
## Conclude that heteroskedasticity is present if the p-value is below 0.05.

Question (iv): Practical importance of heteroskedasticity

cat("Practical importance depends on the size of the residual variance relative to the model's predictions and whether robust standard errors significantly change inference.\n")
## Practical importance depends on the size of the residual variance relative to the model's predictions and whether robust standard errors significantly change inference.

Chapter 9

1.

# Load necessary library
library(wooldridge)

# View the structure of the CEOSAL2 dataset
str(ceosal2)
## 'data.frame':    177 obs. of  15 variables:
##  $ salary  : int  1161 600 379 651 497 1067 945 1261 503 1094 ...
##  $ age     : int  49 43 51 55 44 64 59 63 47 64 ...
##  $ college : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ grad    : int  1 1 1 0 1 1 0 1 1 1 ...
##  $ comten  : int  9 10 9 22 8 7 35 32 4 39 ...
##  $ ceoten  : int  2 10 3 22 6 7 10 8 4 5 ...
##  $ sales   : num  6200 283 169 1100 351 19000 536 4800 610 2900 ...
##  $ profits : int  966 48 40 -54 28 614 24 191 7 230 ...
##  $ mktval  : num  23200 1100 1100 1000 387 3900 623 2100 454 3900 ...
##  $ lsalary : num  7.06 6.4 5.94 6.48 6.21 ...
##  $ lsales  : num  8.73 5.65 5.13 7 5.86 ...
##  $ lmktval : num  10.05 7 7 6.91 5.96 ...
##  $ comtensq: int  81 100 81 484 64 49 1225 1024 16 1521 ...
##  $ ceotensq: int  4 100 9 484 36 49 100 64 16 25 ...
##  $ profmarg: num  15.58 16.96 23.67 -4.91 7.98 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Question: Functional form misspecification
# Step 1: Estimate the base model
model_base <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)

# Summary of the base model
summary(model_base)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + comten, data = ceosal2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5436 -0.2796 -0.0164  0.2857  1.9879 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.571977   0.253466  18.038  < 2e-16 ***
## log(sales)   0.187787   0.040003   4.694 5.46e-06 ***
## log(mktval)  0.099872   0.049214   2.029  0.04397 *  
## profmarg    -0.002211   0.002105  -1.050  0.29514    
## ceoten       0.017104   0.005540   3.087  0.00236 ** 
## comten      -0.009238   0.003337  -2.768  0.00626 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared:  0.3525, Adjusted R-squared:  0.3336 
## F-statistic: 18.62 on 5 and 171 DF,  p-value: 9.488e-15
# Extract R-squared for the base model
r2_base <- summary(model_base)$r.squared
cat("R-squared for the base model:", r2_base, "\n")
## R-squared for the base model: 0.3525374
# Step 2: Add ceoten^2 and comten^2 to the model
model_extended <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + I(ceoten^2) + comten + I(comten^2), data = ceosal2)

# Summary of the extended model
summary(model_extended)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + I(ceoten^2) + comten + I(comten^2), data = ceosal2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47661 -0.26615 -0.03878  0.27787  1.89344 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.424e+00  2.656e-01  16.655  < 2e-16 ***
## log(sales)   1.857e-01  3.980e-02   4.665 6.25e-06 ***
## log(mktval)  1.018e-01  4.872e-02   2.089 0.038228 *  
## profmarg    -2.575e-03  2.088e-03  -1.233 0.219137    
## ceoten       4.772e-02  1.416e-02   3.371 0.000929 ***
## I(ceoten^2) -1.119e-03  4.814e-04  -2.324 0.021329 *  
## comten      -6.063e-03  1.189e-02  -0.510 0.610815    
## I(comten^2) -5.389e-05  2.528e-04  -0.213 0.831474    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4892 on 169 degrees of freedom
## Multiple R-squared:  0.3745, Adjusted R-squared:  0.3486 
## F-statistic: 14.45 on 7 and 169 DF,  p-value: 1.136e-14
# Extract R-squared for the extended model
r2_extended <- summary(model_extended)$r.squared
cat("R-squared for the extended model:", r2_extended, "\n")
## R-squared for the extended model: 0.3744746
# Step 3: Test for functional form misspecification
# Calculate the difference in R-squared
diff_r2 <- r2_extended - r2_base
cat("Difference in R-squared:", diff_r2, "\n")
## Difference in R-squared: 0.02193721
# Perform an F-test to test the joint significance of ceoten^2 and comten^2
anova_test <- anova(model_base, model_extended)
cat("ANOVA test results:\n")
## ANOVA test results:
print(anova_test)
## Analysis of Variance Table
## 
## Model 1: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + 
##     comten
## Model 2: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + 
##     I(ceoten^2) + comten + I(comten^2)
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    171 41.856                              
## 2    169 40.438  2    1.4182 2.9634 0.05433 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Conclusion
cat("If the p-value from the F-test is less than 0.05, we reject the null hypothesis that ceoten^2 and comten^2 have no effect, indicating evidence of functional form misspecification.\n")
## If the p-value from the F-test is less than 0.05, we reject the null hypothesis that ceoten^2 and comten^2 have no effect, indicating evidence of functional form misspecification.

5

# Explanation of Exogenous Sample Selection

cat("Exogenous sample selection occurs when inclusion/exclusion from the sample\n")
## Exogenous sample selection occurs when inclusion/exclusion from the sample
cat("is unrelated to the dependent variable (e.g., campus crimes).\n\n")
## is unrelated to the dependent variable (e.g., campus crimes).
cat("- If colleges fail to report crimes due to random factors (e.g., lack of resources),\n")
## - If colleges fail to report crimes due to random factors (e.g., lack of resources),
cat("  the selection is exogenous, and regression results remain unbiased.\n\n")
##   the selection is exogenous, and regression results remain unbiased.
cat("- If failure to report is related to crime rates (e.g., high-crime colleges avoid reporting),\n")
## - If failure to report is related to crime rates (e.g., high-crime colleges avoid reporting),
cat("  the selection is endogenous, leading to biased results (sample selection bias).\n\n")
##   the selection is endogenous, leading to biased results (sample selection bias).
cat("Endogenous selection can be addressed with methods like the Heckman selection model.\n")
## Endogenous selection can be addressed with methods like the Heckman selection model.

C4

(i): Re-estimate equation (9.43) including DC

library(wooldridge)


data_1990 <- subset(infmrt, year == 1990)


str(data_1990)
## 'data.frame':    51 obs. of  12 variables:
##  $ year   : int  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ infmort: num  6.2 7.1 6.4 7 8.1 ...
##  $ afdcprt: int  62 21 25 282 52 135 1031 323 549 657 ...
##  $ popul  : int  1228 1109 563 6016 1003 3287 17990 7730 11882 10847 ...
##  $ pcinc  : int  17125 21051 17630 22558 18771 25528 22068 24977 18725 17422 ...
##  $ physic : int  178 200 253 337 254 305 315 246 235 196 ...
##  $ afdcper: num  5.05 1.89 4.44 4.69 5.18 ...
##  $ d90    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ lpcinc : num  9.75 9.95 9.78 10.02 9.84 ...
##  $ lphysic: num  5.18 5.3 5.53 5.82 5.54 ...
##  $ DC     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lpopul : num  7.11 7.01 6.33 8.7 6.91 ...
model_with_dc <- lm(infmort ~ log(pcinc) + afdcper + physic + d90 + DC, data = data_1990)


summary(model_with_dc)
## 
## Call:
## lm(formula = infmort ~ log(pcinc) + afdcper + physic + d90 + 
##     DC, data = data_1990)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4020 -0.7924 -0.1626  0.9818  2.8058 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.122458  16.646868   0.548   0.5863    
## log(pcinc)   0.058412   1.773035   0.033   0.9739    
## afdcper      0.377671   0.150912   2.503   0.0159 *  
## physic      -0.011217   0.005873  -1.910   0.0624 .  
## d90                NA         NA      NA       NA    
## DC          14.527523   2.352392   6.176 1.58e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.304 on 46 degrees of freedom
## Multiple R-squared:  0.662,  Adjusted R-squared:  0.6326 
## F-statistic: 22.52 on 4 and 46 DF,  p-value: 2.374e-10
# Interpret the coefficient for DC
coef_dc <- summary(model_with_dc)$coefficients["DC", ]
cat("Coefficient for DC:", coef_dc["Estimate"], "\n")
## Coefficient for DC: 14.52752
cat("Standard Error for DC:", coef_dc["Std. Error"], "\n")
## Standard Error for DC: 2.352392
cat("P-value for DC:", coef_dc["Pr(>|t|)"], "\n")
## P-value for DC: 1.578493e-07
cat("Interpretation: The coefficient for DC represents the difference in infant mortality\n")
## Interpretation: The coefficient for DC represents the difference in infant mortality
cat("relative to other states after controlling for other variables.\n")
## relative to other states after controlling for other variables.

Question (ii): Compare estimates with the original model (without DC)

model_without_dc <- lm(infmort ~ log(pcinc) + afdcper + physic + d90, data = data_1990)

summary(model_without_dc)
## 
## Call:
## lm(formula = infmort ~ log(pcinc) + afdcper + physic + d90, data = data_1990)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7333 -0.9933  0.0199  1.0997  4.4608 
## 
## Coefficients: (1 not defined because of singularities)
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 53.471078  20.094084   2.661  0.01063 * 
## log(pcinc)  -5.029289   2.100645  -2.394  0.02070 * 
## afdcper      0.377488   0.201916   1.870  0.06779 . 
## physic       0.016439   0.005085   3.233  0.00224 **
## d90                NA         NA      NA       NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.744 on 47 degrees of freedom
## Multiple R-squared:  0.3818, Adjusted R-squared:  0.3423 
## F-statistic: 9.674 on 3 and 47 DF,  p-value: 4.383e-05
cat("\nComparison of estimates and standard errors:\n")
## 
## Comparison of estimates and standard errors:
common_vars <- intersect(
  rownames(summary(model_without_dc)$coefficients),
  rownames(summary(model_with_dc)$coefficients)
)


comparison <- data.frame(
  Variable = common_vars,
  Estimate_Without_DC = summary(model_without_dc)$coefficients[common_vars, "Estimate"],
  Std_Error_Without_DC = summary(model_without_dc)$coefficients[common_vars, "Std. Error"],
  Estimate_With_DC = summary(model_with_dc)$coefficients[common_vars, "Estimate"],
  Std_Error_With_DC = summary(model_with_dc)$coefficients[common_vars, "Std. Error"]
)

# Print the comparison
print(comparison)
##                Variable Estimate_Without_DC Std_Error_Without_DC
## (Intercept) (Intercept)         53.47107751         20.094083512
## log(pcinc)   log(pcinc)         -5.02928918          2.100644629
## afdcper         afdcper          0.37748839          0.201916424
## physic           physic          0.01643928          0.005084557
##             Estimate_With_DC Std_Error_With_DC
## (Intercept)       9.12245833      16.646867718
## log(pcinc)        0.05841225       1.773035150
## afdcper           0.37767143       0.150911800
## physic           -0.01121737       0.005873413
cat("\nConclusion:\n")
## 
## Conclusion:
cat("Including a dummy variable for a single observation (DC) can significantly\n")
## Including a dummy variable for a single observation (DC) can significantly
cat("alter the coefficients and standard errors of the model. This happens because\n")
## alter the coefficients and standard errors of the model. This happens because
cat("the dummy absorbs variability specific to that single observation, which can\n")
## the dummy absorbs variability specific to that single observation, which can
cat("affect the estimates for other variables.\n")
## affect the estimates for other variables.

C5

(i) OLS estimation with and without the largest firm

library(wooldridge)
library(quantreg)  
## Loading required package: SparseM
str(rdchem)
## 'data.frame':    32 obs. of  8 variables:
##  $ rd      : num  430.6 59 23.5 3.5 1.7 ...
##  $ sales   : num  4570 2830 597 134 42 ...
##  $ profits : num  186.9 467 107.4 -4.3 8 ...
##  $ rdintens: num  9.42 2.08 3.94 2.62 4.05 ...
##  $ profmarg: num  4.09 16.5 18 -3.22 19.05 ...
##  $ salessq : num  20886730 8008900 356170 17849 1764 ...
##  $ lsales  : num  8.43 7.95 6.39 4.89 3.74 ...
##  $ lrd     : num  6.065 4.078 3.157 1.253 0.531 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
rdchem$sales_bil <- rdchem$sales / 1000



model_ols_all <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)


summary_ols_all <- summary(model_ols_all)
cat("OLS model with all firms:\n")
## OLS model with all firms:
print(summary_ols_all)
## 
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     2.058967   0.626269   3.288  0.00272 **
## sales_bil       0.316611   0.138854   2.280  0.03041 * 
## I(sales_bil^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg        0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107
rdchem_trimmed <- subset(rdchem, sales_bil < 40)

model_ols_trimmed <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_trimmed)


summary_ols_trimmed <- summary(model_ols_trimmed)
cat("\nOLS model without the largest firm:\n")
## 
## OLS model without the largest firm:
print(summary_ols_trimmed)
## 
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem_trimmed)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     2.058967   0.626269   3.288  0.00272 **
## sales_bil       0.316611   0.138854   2.280  0.03041 * 
## I(sales_bil^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg        0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107
cat("\nComparison of OLS estimates:\n")
## 
## Comparison of OLS estimates:
comparison_ols <- data.frame(
  Variable = rownames(summary_ols_all$coefficients),
  Estimate_All = summary_ols_all$coefficients[, "Estimate"],
  Estimate_Trimmed = summary_ols_trimmed$coefficients[, "Estimate"]
)
print(comparison_ols)
##                      Variable Estimate_All Estimate_Trimmed
## (Intercept)       (Intercept)  2.058966676      2.058966676
## sales_bil           sales_bil  0.316610977      0.316610977
## I(sales_bil^2) I(sales_bil^2) -0.007389624     -0.007389624
## profmarg             profmarg  0.053322172      0.053322172

(ii): LAD estimation with and without the largest firm

model_lad_all <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem, tau = 0.5)

summary_lad_all <- summary(model_lad_all)
cat("\nLAD model with all firms:\n")
## 
## LAD model with all firms:
print(summary_lad_all)
## 
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     tau = 0.5, data = rdchem)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                coefficients lower bd upper bd
## (Intercept)     1.40428      0.87031  2.66628
## sales_bil       0.26346     -0.13508  0.75753
## I(sales_bil^2) -0.00600     -0.01679  0.00344
## profmarg        0.11400      0.01376  0.16427
model_lad_trimmed <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_trimmed, tau = 0.5)

summary_lad_trimmed <- summary(model_lad_trimmed)
cat("\nLAD model without the largest firm:\n")
## 
## LAD model without the largest firm:
print(summary_lad_trimmed)
## 
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     tau = 0.5, data = rdchem_trimmed)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                coefficients lower bd upper bd
## (Intercept)     1.40428      0.87031  2.66628
## sales_bil       0.26346     -0.13508  0.75753
## I(sales_bil^2) -0.00600     -0.01679  0.00344
## profmarg        0.11400      0.01376  0.16427
cat("\nComparison of LAD estimates:\n")
## 
## Comparison of LAD estimates:
comparison_lad <- data.frame(
  Variable = rownames(summary_lad_all$coefficients),
  Estimate_All = summary_lad_all$coefficients[, 1],
  Estimate_Trimmed = summary_lad_trimmed$coefficients[, 1]
)
print(comparison_lad)
##                      Variable Estimate_All Estimate_Trimmed
## (Intercept)       (Intercept)  1.404279415      1.404279415
## sales_bil           sales_bil  0.263463789      0.263463789
## I(sales_bil^2) I(sales_bil^2) -0.006001127     -0.006001127
## profmarg             profmarg  0.114003661      0.114003661

(iii): Discuss resilience of OLS vs LAD to outliers

cat("\nConclusion:\n")
## 
## Conclusion:
cat("Compare the differences in estimates between models with and without the largest firm for OLS and LAD.\n")
## Compare the differences in estimates between models with and without the largest firm for OLS and LAD.
cat("If LAD estimates change less than OLS estimates, it demonstrates that LAD is more resilient to outliers.\n")
## If LAD estimates change less than OLS estimates, it demonstrates that LAD is more resilient to outliers.

Chapter 10

1

(i) Like cross-sectional observations, we can assume that most time series observations are independently distributed.

cat("Question (i):\n")
## Question (i):
cat("Disagree. Time series data are typically not independently distributed because\n")
## Disagree. Time series data are typically not independently distributed because
cat("observations are often correlated over time (e.g., due to trends, cycles, or autocorrelation).\n")
## observations are often correlated over time (e.g., due to trends, cycles, or autocorrelation).
cat("Independence is a common assumption in cross-sectional data but is rarely valid in time series data.\n\n")
## Independence is a common assumption in cross-sectional data but is rarely valid in time series data.

(ii) The OLS estimator in a time series regression is unbiased under the first three Gauss-Markov assumptions.

cat("Question (ii):\n")
## Question (ii):
cat("Agree. The first three Gauss-Markov assumptions are:\n")
## Agree. The first three Gauss-Markov assumptions are:
cat("1. Linearity in parameters.\n")
## 1. Linearity in parameters.
cat("2. No perfect multicollinearity.\n")
## 2. No perfect multicollinearity.
cat("3. Zero conditional mean of errors (no endogeneity).\n")
## 3. Zero conditional mean of errors (no endogeneity).
cat("Under these conditions, OLS remains unbiased in time series regression.\n\n")
## Under these conditions, OLS remains unbiased in time series regression.

(iv) Seasonality is not an issue when using annual time series observations.

cat("Question (iv):\n")
## Question (iv):
cat("Agree. Seasonality typically affects higher-frequency data such as monthly or quarterly observations.\n")
## Agree. Seasonality typically affects higher-frequency data such as monthly or quarterly observations.
cat("In annual time series data, seasonal effects are less relevant because they average out over the year.\n\n")
## In annual time series data, seasonal effects are less relevant because they average out over the year.

5

# Print the model using the cat function
cat("Model Specification:\n")
## Model Specification:
cat("housing_starts = β0 + β1 * time + β2 * Q1 + β3 * Q2 + β4 * Q3 + β5 * interest_rate + β6 * income + u\n")
## housing_starts = β0 + β1 * time + β2 * Q1 + β3 * Q2 + β4 * Q3 + β5 * interest_rate + β6 * income + u
cat("\nWhere:\n")
## 
## Where:
cat("- 'time' captures the trend in housing starts.\n")
## - 'time' captures the trend in housing starts.
cat("- 'Q1', 'Q2', 'Q3' are quarterly dummies to account for seasonality (Q4 is the reference category).\n")
## - 'Q1', 'Q2', 'Q3' are quarterly dummies to account for seasonality (Q4 is the reference category).
cat("- 'interest_rate' captures the relationship between borrowing costs and housing starts.\n")
## - 'interest_rate' captures the relationship between borrowing costs and housing starts.
cat("- 'income' captures the effect of real per capita income on housing starts.\n")
## - 'income' captures the effect of real per capita income on housing starts.

C1

library(wooldridge)

str(intdef)
## 'data.frame':    56 obs. of  13 variables:
##  $ year : int  1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 ...
##  $ i3   : num  1.04 1.1 1.22 1.55 1.77 ...
##  $ inf  : num  8.1 -1.2 1.3 7.9 1.9 ...
##  $ rec  : num  16.2 14.5 14.4 16.1 19 ...
##  $ out  : num  11.6 14.3 15.6 14.2 19.4 ...
##  $ def  : num  -4.6 -0.2 1.2 -1.9 0.4 ...
##  $ i3_1 : num  NA 1.04 1.1 1.22 1.55 ...
##  $ inf_1: num  NA 8.1 -1.2 1.3 7.9 ...
##  $ def_1: num  NA -4.6 -0.2 1.2 -1.9 ...
##  $ ci3  : num  NA 0.06 0.12 0.33 0.22 ...
##  $ cinf : num  NA -9.3 2.5 6.6 -6 ...
##  $ cdef : num  NA 4.4 1.4 -3.1 2.3 ...
##  $ y77  : int  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
intdef$after1979 <- ifelse(intdef$year > 1979, 1, 0)

model <- lm(i3 ~ inf + def + after1979, data = intdef)

summary(model)
## 
## Call:
## lm(formula = i3 ~ inf + def + after1979, data = intdef)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4674 -0.8407  0.2388  1.0148  3.9654 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.29623    0.42535   3.047  0.00362 ** 
## inf          0.60842    0.07625   7.979 1.37e-10 ***
## def          0.36266    0.12025   3.016  0.00396 ** 
## after1979    1.55877    0.50577   3.082  0.00329 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.711 on 52 degrees of freedom
## Multiple R-squared:  0.6635, Adjusted R-squared:  0.6441 
## F-statistic: 34.18 on 3 and 52 DF,  p-value: 2.408e-12
coef_after1979 <- summary(model)$coefficients["after1979", ]
cat("\nCoefficient for after1979:\n")
## 
## Coefficient for after1979:
cat("Estimate:", coef_after1979["Estimate"], "\n")
## Estimate: 1.558766
cat("Standard Error:", coef_after1979["Std. Error"], "\n")
## Standard Error: 0.5057717
cat("P-value:", coef_after1979["Pr(>|t|)"], "\n")
## P-value: 0.003285646
if (coef_after1979["Pr(>|t|)"] < 0.05) {
  cat("\nConclusion:\nThe coefficient for 'after1979' is statistically significant, indicating a shift\n")
  cat("in the interest rate equation after 1979. The Federal Reserve's policy change likely\n")
  cat("altered the relationship between interest rates and other variables.\n")
} else {
  cat("\nConclusion:\nThe coefficient for 'after1979' is not statistically significant, suggesting\n")
  cat("no substantial shift in the interest rate equation after 1979.\n")
}
## 
## Conclusion:
## The coefficient for 'after1979' is statistically significant, indicating a shift
## in the interest rate equation after 1979. The Federal Reserve's policy change likely
## altered the relationship between interest rates and other variables.

C9

(i): Consider the signs of β1 and β2

library(wooldridge)

str(volat)
## 'data.frame':    558 obs. of  17 variables:
##  $ date  : num  1947 1947 1947 1947 1947 ...
##  $ sp500 : num  15.2 15.8 15.2 14.6 14.3 ...
##  $ divyld: num  4.49 4.38 4.61 4.75 5.05 ...
##  $ i3    : num  0.38 0.38 0.38 0.38 0.38 ...
##  $ ip    : num  22.4 22.5 22.6 22.5 22.6 ...
##  $ pcsp  : num  NA 46.5 -48.6 -44.3 -21.4 ...
##  $ rsp500: num  NA 50.9 -44 -39.6 -16.3 ...
##  $ pcip  : num  NA 5.36 5.33 -5.31 5.33 ...
##  $ ci3   : num  NA 0 0 0 0 ...
##  $ ci3_1 : num  NA NA 0 0 0 ...
##  $ ci3_2 : num  NA NA NA 0 0 ...
##  $ pcip_1: num  NA NA 5.36 5.33 -5.31 ...
##  $ pcip_2: num  NA NA NA 5.36 5.33 ...
##  $ pcip_3: num  NA NA NA NA 5.36 ...
##  $ pcsp_1: num  NA NA 46.5 -48.6 -44.3 ...
##  $ pcsp_2: num  NA NA NA 46.5 -48.6 ...
##  $ pcsp_3: num  NA NA NA NA 46.5 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
cat("Question (i):\n")
## Question (i):
cat("β1 (pcip): Expected to have a positive sign, as increases in industrial production\n")
## β1 (pcip): Expected to have a positive sign, as increases in industrial production
cat("          often signal economic growth, which could positively impact stock returns.\n")
##           often signal economic growth, which could positively impact stock returns.
cat("β2 (i3): Expected to have a negative sign, as higher T-bill returns might make\n")
## β2 (i3): Expected to have a negative sign, as higher T-bill returns might make
cat("         risk-free investments more attractive, reducing stock returns.\n\n")
##          risk-free investments more attractive, reducing stock returns.

(ii): Estimate the equation by OLS

model <- lm(rsp500 ~ pcip + i3, data = volat)

summary_model <- summary(model)
cat("Question (ii):\n")
## Question (ii):
print(summary_model)
## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637
cat("\nInterpretation of Coefficients:\n")
## 
## Interpretation of Coefficients:
cat("β1 (pcip): Indicates the change in S&P 500 returns for a 1% change in industrial production.\n")
## β1 (pcip): Indicates the change in S&P 500 returns for a 1% change in industrial production.
cat("β2 (i3): Indicates the change in S&P 500 returns for a 1% change in the T-bill rate.\n")
## β2 (i3): Indicates the change in S&P 500 returns for a 1% change in the T-bill rate.

(iii): Identify statistically significant variables

cat("\nQuestion (iii):\n")
## 
## Question (iii):
significance <- summary_model$coefficients[, "Pr(>|t|)"]
significant_vars <- names(significance[significance < 0.05])
cat("Statistically significant variables (p < 0.05):", significant_vars, "\n")
## Statistically significant variables (p < 0.05): (Intercept) i3

(iv): Implications of significance for predictability

cat("\nQuestion (iv):\n")
## 
## Question (iv):
if ("pcip" %in% significant_vars || "i3" %in% significant_vars) {
  cat("The statistical significance of predictors suggests that returns on the S&P 500\n")
  cat("are predictable to some extent based on changes in industrial production (pcip)\n")
  cat("or T-bill returns (i3). However, this does not guarantee strong predictability,\n")
  cat("as financial markets are generally efficient.\n")
} else {
  cat("If no variables are statistically significant, it implies that the S&P 500 returns\n")
  cat("may not be predictable using these variables, consistent with market efficiency.\n")
}
## The statistical significance of predictors suggests that returns on the S&P 500
## are predictable to some extent based on changes in industrial production (pcip)
## or T-bill returns (i3). However, this does not guarantee strong predictability,
## as financial markets are generally efficient.

Chapter 11

C7

i): Test the PIH by estimating gc_t = β0 + β1 * gc_(t-1) + u_t

library(wooldridge)


str(consump)
## 'data.frame':    37 obs. of  24 variables:
##  $ year   : int  1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 ...
##  $ i3     : num  3.41 2.93 2.38 2.78 3.16 ...
##  $ inf    : num  0.7 1.7 1 1 1.3 ...
##  $ rdisp  : num  1530 1565 1616 1694 1756 ...
##  $ rnondc : num  606 615 627 646 660 ...
##  $ rserv  : num  687 717 746 783 819 ...
##  $ pop    : num  177830 180671 183691 186538 189242 ...
##  $ y      : num  8604 8664 8796 9080 9276 ...
##  $ rcons  : num  1294 1333 1373 1430 1479 ...
##  $ c      : num  7275 7377 7476 7665 7814 ...
##  $ r3     : num  2.71 1.23 1.38 1.78 1.86 ...
##  $ lc     : num  8.89 8.91 8.92 8.94 8.96 ...
##  $ ly     : num  9.06 9.07 9.08 9.11 9.14 ...
##  $ gc     : num  NA 0.0139 0.0133 0.0251 0.0192 ...
##  $ gy     : num  NA 0.00696 0.01511 0.03171 0.02145 ...
##  $ gc_1   : num  NA NA 0.0139 0.0133 0.0251 ...
##  $ gy_1   : num  NA NA 0.00696 0.01511 0.03171 ...
##  $ r3_1   : num  NA 2.71 1.23 1.38 1.78 ...
##  $ lc_ly  : num  -0.168 -0.161 -0.163 -0.169 -0.172 ...
##  $ lc_ly_1: num  NA -0.168 -0.161 -0.163 -0.169 ...
##  $ gc_2   : num  NA NA NA 0.0139 0.0133 ...
##  $ gy_2   : num  NA NA NA 0.00696 0.01511 ...
##  $ r3_2   : num  NA NA 2.71 1.23 1.38 ...
##  $ lc_ly_2: num  NA NA -0.168 -0.161 -0.163 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
model1 <- lm(gc ~ gc_1, data = consump)


summary_model1 <- summary(model1)
cat("Question (i):\n")
## Question (i):
print(summary_model1)
## 
## Call:
## lm(formula = gc ~ gc_1, data = consump)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027878 -0.005974 -0.001450  0.007142  0.020227 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 0.011431   0.003778   3.026  0.00478 **
## gc_1        0.446133   0.156047   2.859  0.00731 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01161 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1985, Adjusted R-squared:  0.1742 
## F-statistic: 8.174 on 1 and 33 DF,  p-value: 0.007311
cat("\nNull Hypothesis (H0): β1 = 0, indicating no relationship between gc_t and gc_(t-1).\n")
## 
## Null Hypothesis (H0): β1 = 0, indicating no relationship between gc_t and gc_(t-1).
cat("Alternative Hypothesis (H1): β1 ≠ 0, indicating a relationship exists.\n")
## Alternative Hypothesis (H1): β1 ≠ 0, indicating a relationship exists.
if (summary_model1$coefficients["gc_1", "Pr(>|t|)"] < 0.05) {
  cat("Conclusion: Reject H0 at the 5% level. There is evidence of a relationship.\n")
} else {
  cat("Conclusion: Fail to reject H0. The PIH is supported by the data.\n")
}
## Conclusion: Reject H0 at the 5% level. There is evidence of a relationship.

(ii): Add gy_(t-1), i3_(t-1), and inf_(t-1 to the regression

model2 <- lm(gc ~ gc_1 + gy_1 + i3 + inf, data = consump)

summary_model2 <- summary(model2)
cat("\nQuestion (ii):\n")
## 
## Question (ii):
print(summary_model2)
## 
## Call:
## lm(formula = gc ~ gc_1 + gy_1 + i3 + inf, data = consump)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0262462 -0.0076662  0.0009066  0.0063953  0.0174975 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.007e-02  5.839e-03   3.438  0.00174 **
## gc_1         5.398e-01  2.604e-01   2.073  0.04689 * 
## gy_1        -1.017e-01  1.792e-01  -0.567  0.57463   
## i3          -2.523e-05  1.028e-03  -0.025  0.98059   
## inf         -1.701e-03  8.825e-04  -1.928  0.06341 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01069 on 30 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3819, Adjusted R-squared:  0.2995 
## F-statistic: 4.635 on 4 and 30 DF,  p-value: 0.004938
cat("\nSignificance of new variables (gy_1, i3_1, inf_1):\n")
## 
## Significance of new variables (gy_1, i3_1, inf_1):
p_values <- summary_model2$coefficients[c("gy_1", "i3", "inf"), "Pr(>|t|)"]
print(p_values)
##       gy_1         i3        inf 
## 0.57463167 0.98058921 0.06340824
cat("\nJoint Significance of gy_1, i3_1, inf_1 (at 5% level):\n")
## 
## Joint Significance of gy_1, i3_1, inf_1 (at 5% level):
if (all(p_values < 0.05)) {
  cat("All new variables are individually significant.\n")
} else if (any(p_values < 0.05)) {
  cat("Some of the new variables are individually significant.\n")
} else {
  cat("None of the new variables are individually significant.\n")
}
## None of the new variables are individually significant.

(iii): P-value for gc_1 after including new variables

cat("\nQuestion (iii):\n")
## 
## Question (iii):
p_value_gc1 <- summary_model2$coefficients["gc_1", "Pr(>|t|)"]
cat("P-value for gc_1 in the extended model:", p_value_gc1, "\n")
## P-value for gc_1 in the extended model: 0.04689048
if (p_value_gc1 < 0.05) {
  cat("Conclusion: gc_1 is still significant after including the new variables.\n")
} else {
  cat("Conclusion: gc_1 is no longer significant, providing more support for the PIH.\n")
}
## Conclusion: gc_1 is still significant after including the new variables.

(iv): F-test for joint significance of all explanatory variables

anova_test <- anova(model1, model2)
cat("\nQuestion (iv): F-test for joint significance:\n")
## 
## Question (iv): F-test for joint significance:
print(anova_test)
## Analysis of Variance Table
## 
## Model 1: gc ~ gc_1
## Model 2: gc ~ gc_1 + gy_1 + i3 + inf
##   Res.Df       RSS Df Sum of Sq      F  Pr(>F)  
## 1     33 0.0044447                              
## 2     30 0.0034276  3 0.0010171 2.9675 0.04767 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\nF-statistic and associated p-value for the extended model:\n")
## 
## F-statistic and associated p-value for the extended model:
f_stat <- anova_test$`F`[2]
p_value_f <- anova_test$`Pr(>F)`[2]
cat("F-statistic:", f_stat, "\n")
## F-statistic: 2.967484
cat("P-value:", p_value_f, "\n")
## P-value: 0.04766985
if (p_value_f < 0.05) {
  cat("Conclusion: The explanatory variables are jointly significant, suggesting PIH is less supported.\n")
} else {
  cat("Conclusion: The explanatory variables are not jointly significant, supporting the PIH.\n")
}
## Conclusion: The explanatory variables are jointly significant, suggesting PIH is less supported.

##C12

(i): Find the first-order autocorrelation in gwage232

library(wooldridge)


str(minwage)
## 'data.frame':    612 obs. of  58 variables:
##  $ emp232   : num  271 272 269 263 258 ...
##  $ wage232  : num  0.86 0.86 0.86 0.86 0.87 ...
##  $ emp236   : num  53.1 54.8 52.9 48.3 47.6 ...
##  $ wage236  : num  0.99 1 0.98 0.91 0.93 ...
##  $ emp234   : num  93.4 94.6 95.4 95.2 96.3 ...
##  $ wage234  : num  0.92 0.92 0.91 0.92 0.92 ...
##  $ emp314   : num  254 257 257 254 246 ...
##  $ wage314  : num  0.99 0.98 0.99 0.99 0.99 ...
##  $ emp228   : num  185 185 182 180 174 ...
##  $ wage228  : num  0.91 0.94 0.98 0.98 0.98 ...
##  $ emp233   : num  346 358 358 327 303 ...
##  $ wage233  : num  1.55 1.57 1.54 1.42 1.38 ...
##  $ emp394   : num  79.5 79.5 79.6 78.2 76.2 ...
##  $ wage394  : num  1.03 1.04 1.05 1.05 1.06 ...
##  $ emp231   : num  144 146 147 146 146 ...
##  $ wage231  : num  1.27 1.27 1.27 1.25 1.25 ...
##  $ emp226   : num  90.6 90.7 90.6 89.4 88.6 ...
##  $ wage226  : num  1.07 1.08 1.1 1.13 1.12 ...
##  $ emp387   : num  41.9 42.4 42 41.7 41.6 ...
##  $ wage387  : num  1.07 1.07 1.08 1.09 1.1 ...
##  $ emp056   : num  560 544 572 582 578 ...
##  $ wage056  : num  0.94 0.94 0.96 0.98 0.97 ...
##  $ unem     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ cpi      : num  21.5 21.5 21.9 21.9 21.9 ...
##  $ minwage  : num  0.4 0.4 0.4 0.4 0.4 ...
##  $ lemp232  : num  5.6 5.61 5.59 5.57 5.55 ...
##  $ lwage232 : num  -0.151 -0.151 -0.151 -0.151 -0.139 ...
##  $ gemp232  : num  NA 0.00663 -0.01405 -0.02259 -0.01767 ...
##  $ gwage232 : num  NA 0 0 0 0.0116 ...
##  $ lminwage : num  -0.916 -0.916 -0.916 -0.916 -0.916 ...
##  $ gmwage   : num  NA 0 0 0 0 0 0 0 0 0 ...
##  $ gmwage_1 : num  NA NA 0 0 0 0 0 0 0 0 ...
##  $ gmwage_2 : num  NA NA NA 0 0 0 0 0 0 0 ...
##  $ gmwage_3 : num  NA NA NA NA 0 0 0 0 0 0 ...
##  $ gmwage_4 : num  NA NA NA NA NA 0 0 0 0 0 ...
##  $ gmwage_5 : num  NA NA NA NA NA NA 0 0 0 0 ...
##  $ gmwage_6 : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ gmwage_7 : num  NA NA NA NA NA NA NA NA 0 0 ...
##  $ gmwage_8 : num  NA NA NA NA NA NA NA NA NA 0 ...
##  $ gmwage_9 : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ gmwage_10: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ gmwage_11: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ gmwage_12: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ lemp236  : num  3.97 4 3.97 3.88 3.86 ...
##  $ gcpi     : num  NA 0 0.0184 0 0 ...
##  $ lcpi     : num  3.07 3.07 3.09 3.09 3.09 ...
##  $ lwage236 : num  -0.0101 0 -0.0202 -0.0943 -0.0726 ...
##  $ gemp236  : num  NA 0.0315 -0.0353 -0.091 -0.0146 ...
##  $ gwage236 : num  NA 0.0101 -0.0202 -0.0741 0.0217 ...
##  $ lemp234  : num  4.54 4.55 4.56 4.56 4.57 ...
##  $ lwage234 : num  -0.0834 -0.0834 -0.0943 -0.0834 -0.0834 ...
##  $ gemp234  : num  NA 0.01277 0.00842 -0.0021 0.01149 ...
##  $ gwage234 : num  NA 0 -0.0109 0.0109 0 ...
##  $ lemp314  : num  5.54 5.55 5.55 5.54 5.5 ...
##  $ lwage314 : num  -0.0101 -0.0202 -0.0101 -0.0101 -0.0101 ...
##  $ gemp314  : num  NA 0.01058 0.00117 -0.01214 -0.03203 ...
##  $ gwage314 : num  NA -0.0102 0.0102 0 0 ...
##  $ t        : int  1 2 3 4 5 6 7 8 9 10 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
acf_result <- acf(minwage$wage232, lag.max = 1, plot = FALSE)
first_order_ac <- acf_result$acf[2]
cat("Question (i):\n")
## Question (i):
cat("First-order autocorrelation in gwage232:", first_order_ac, "\n")
## First-order autocorrelation in gwage232: 0.9951486
if (abs(first_order_ac) > 0.5) {
  cat("The series appears to be weakly dependent, as the autocorrelation is significant.\n")
} else {
  cat("The series does not appear to be strongly dependent.\n")
}
## The series appears to be weakly dependent, as the autocorrelation is significant.

(ii): Estimate the dynamic model

model_dynamic <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)

cat("\nQuestion (ii):\n")
## 
## Question (ii):
summary_dynamic <- summary(model_dynamic)
print(summary_dynamic)
## 
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.138e-17 -1.537e-18 -3.990e-19  1.031e-18  3.063e-16 
## 
## Coefficients:
##                    Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)       3.369e-18  6.877e-19  4.899e+00 1.24e-06 ***
## lag(gwage232, 1)  1.000e+00  6.479e-17  1.544e+16  < 2e-16 ***
## gmwage           -4.748e-20  1.825e-17 -3.000e-03    0.998    
## gcpi              5.317e-16  1.321e-16  4.024e+00 6.46e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.264e-17 on 607 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.123e+32 on 3 and 607 DF,  p-value: < 2.2e-16
cat("Interpretation: The coefficient on gmwage indicates the contemporaneous impact of minimum wage growth\n")
## Interpretation: The coefficient on gmwage indicates the contemporaneous impact of minimum wage growth
cat("on wage growth in sector 232, controlling for past wage growth and CPI growth.\n")
## on wage growth in sector 232, controlling for past wage growth and CPI growth.

(iii): Add lagged growth in employment (gemp232_1)

model_extended <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232, 1), data = minwage)

cat("\nQuestion (iii):\n")
## 
## Question (iii):
summary_extended <- summary(model_extended)
print(summary_extended)
## 
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232, 
##     1), data = minwage)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -1.123e-17 -1.582e-18 -4.110e-19  8.800e-19  3.060e-16 
## 
## Coefficients:
##                    Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)       3.369e-18  6.880e-19  4.897e+00 1.25e-06 ***
## lag(gwage232, 1)  1.000e+00  6.483e-17  1.543e+16  < 2e-16 ***
## gmwage            1.042e-19  1.826e-17  6.000e-03    0.995    
## gcpi              5.319e-16  1.322e-16  4.024e+00 6.45e-05 ***
## lag(gemp232, 1)  -2.146e-17  2.747e-17 -7.810e-01    0.435    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.264e-17 on 606 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 8.416e+31 on 4 and 606 DF,  p-value: < 2.2e-16
p_value_gemp <- summary_extended$coefficients["lag(gemp232, 1)", "Pr(>|t|)"]
cat("\nP-value for lagged growth in employment (gemp232_1):", p_value_gemp, "\n")
## 
## P-value for lagged growth in employment (gemp232_1): 0.4350518
if (p_value_gemp < 0.05) {
  cat("Lagged growth in employment is statistically significant.\n")
} else {
  cat("Lagged growth in employment is not statistically significant.\n")
}
## Lagged growth in employment is not statistically significant.

(iv): Compare models with and without lagged variables

model_without_lagged <- lm(gwage232 ~ gmwage + gcpi, data = minwage)

cat("\nQuestion (iv):\n")
## 
## Question (iv):
summary_without_lagged <- summary(model_without_lagged)
comparison <- data.frame(
  Variable = rownames(summary_without_lagged$coefficients),
  Coefficient_Without_Lagged = summary_without_lagged$coefficients[, "Estimate"],
  Coefficient_With_Lagged = summary_extended$coefficients[rownames(summary_without_lagged$coefficients), "Estimate"]
)
print(comparison)
##                Variable Coefficient_Without_Lagged Coefficient_With_Lagged
## (Intercept) (Intercept)                0.002184222            3.368611e-18
## gmwage           gmwage                0.150571350            1.042475e-19
## gcpi               gcpi                0.243522252            5.318641e-16
cat("Adding lagged variables affects the coefficient on gmwage if the lagged variables\n")
## Adding lagged variables affects the coefficient on gmwage if the lagged variables
cat("are significant and explain part of the variation in gwage232.\n")
## are significant and explain part of the variation in gwage232.

(v): Regression of gmwage on lagged variables

model_rsq <- lm(gmwage ~ lag(gwage232, 1) + lag(gemp232, 1), data = minwage)

cat("\nQuestion (v):\n")
## 
## Question (v):
summary_rsq <- summary(model_rsq)
cat("R-squared of the regression of gmwage on lagged variables:", summary_rsq$r.squared, "\n")
## R-squared of the regression of gmwage on lagged variables: 0.2825375
cat("Interpretation: A high R-squared indicates that gmwage is predictable based on lagged variables,\n")
## Interpretation: A high R-squared indicates that gmwage is predictable based on lagged variables,
cat("which could affect its coefficient in the model.\n")
## which could affect its coefficient in the model.

Chapter 12

C11

(i): Estimate the model and obtain squared OLS residuals

library(wooldridge)
library(lmtest)   # For testing heteroskedasticity
library(tseries)  # For ARCH models
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(fGarch)   # For ARCH/GARCH estimation
## NOTE: Packages 'fBasics', 'timeDate', and 'timeSeries' are no longer
## attached to the search() path when 'fGarch' is attached.
## 
## If needed attach them yourself in your R script by e.g.,
##         require("timeSeries")
data("nyse")


model_ols <- lm(return ~ return_1, data = nyse)

summary(model_ols)
## 
## Call:
## lm(formula = return ~ return_1, data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.261  -1.302   0.098   1.316   8.065 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.17963    0.08074   2.225   0.0264 *
## return_1     0.05890    0.03802   1.549   0.1218  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.11 on 687 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.003481,   Adjusted R-squared:  0.00203 
## F-statistic: 2.399 on 1 and 687 DF,  p-value: 0.1218
residuals_ols <- residuals(model_ols)
squared_residuals <- residuals_ols^2


cat("Question (i):\n")
## Question (i):
cat("Average of squared residuals:", mean(squared_residuals), "\n")
## Average of squared residuals: 4.440839
cat("Minimum of squared residuals:", min(squared_residuals), "\n")
## Minimum of squared residuals: 7.35465e-06
cat("Maximum of squared residuals:", max(squared_residuals), "\n")
## Maximum of squared residuals: 232.8946
nyse_clean <- nyse[complete.cases(nyse$return, nyse$return_1), ]

# Add squared residuals to the cleaned dataset
nyse_clean$squared_residuals <- residuals(lm(return ~ return_1, data = nyse_clean))^2

(ii): Estimate the heteroskedasticity model using squared residuals

nyse_clean <- nyse[complete.cases(nyse$return_1, nyse$squared_residuals), ]

model_arch1 <- lm(squared_residuals ~ return_1 + I(return_1^2), data = nyse_clean)

cat("\nQuestion (ii):\n")
## 
## Question (ii):
summary_arch1 <- summary(model_arch1)
print(summary_arch1)
## 
## Call:
## lm(formula = squared_residuals ~ return_1 + I(return_1^2), data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.459  -3.011  -1.975   0.676 221.469 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    3.25734    0.44085   7.389 4.32e-13 ***
## return_1      -0.78946    0.19569  -4.034 6.09e-05 ***
## I(return_1^2)  0.29666    0.03552   8.351 3.75e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared:  0.1303, Adjusted R-squared:  0.1278 
## F-statistic:  51.4 on 2 and 686 DF,  p-value: < 2.2e-16

(iii): Sketch the conditional variance

nyse_clean <- nyse[complete.cases(nyse$return_1, nyse$squared_residuals), ]

cat("Length of return_1:", length(nyse_clean$return_1), "\n")
## Length of return_1: 689
cat("Length of squared_residuals:", length(nyse_clean$squared_residuals), "\n")
## Length of squared_residuals: 0
cat("\nQuestion (iii):\n")
## 
## Question (iii):
cat("The conditional variance is a function of return_1. Sketching requires plotting.\n")
## The conditional variance is a function of return_1. Sketching requires plotting.
plot(
  nyse_clean$return_1, 
  nyse_clean$squared_residuals,
  main = "Conditional Variance vs. Lagged Return",
  xlab = "Lagged Return (return_1)",
  ylab = "Squared Residuals",
  pch = 20
)

conditional_variance <- fitted(model_arch1)

cat("Length of conditional_variance:", length(conditional_variance), "\n")
## Length of conditional_variance: 689
cat("Minimum variance:", min(conditional_variance), "\n")
## Minimum variance: 2.732132
cat("Maximum variance:", max(conditional_variance), "\n")
## Maximum variance: 84.99515