Chapter 7

Question 1

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(wooldridge)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(dslabs)
library(ISLR2)
library(matlib)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)


data("sleep75")

sleep_model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(sleep_model)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## I(age^2)       0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16
# (i) Is there evidence that men sleep more than women?
coef_male <- coef(summary(sleep_model))["male", "Estimate"]
p_value_male <- coef(summary(sleep_model))["male", "Pr(>|t|)"]
cat("Coefficient for 'male':", coef_male, "\n")
## Coefficient for 'male': 87.75243
cat("p-value for 'male':", p_value_male, "\n")
## p-value for 'male': 0.01078518
if (p_value_male < 0.05) {
  cat("Conclusion: The coefficient on male is positive and statistically significant: men are estimated to sleep about 88 minutes more per week than women, other factors held fixed.\n")
} else {
  cat("Conclusion: There is no significant evidence of a difference in sleep between men and women.\n")
}
## Conclusion: The coefficient on male is positive and statistically significant: men are estimated to sleep about 88 minutes more per week than women, other factors held fixed.
# (ii) Tradeoff between working and sleeping
coef_totwrk <- coef(summary(sleep_model))["totwrk", "Estimate"]

cat("Coefficient for 'totwrk':", coef_totwrk, "\n")
## Coefficient for 'totwrk': -0.1634232
cat("Interpretation: Each additional minute of work reduces sleep by", abs(coef_totwrk), "minutes.\n")
## Interpretation: Each additional minute of work reduces sleep by 0.1634232 minutes.
# (iii) Test whether age has a significant effect on sleep
sleep_model_no_age <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova(sleep_model_no_age, sleep_model)
## Analysis of Variance Table
## 
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1    702 122631662                           
## 2    700 122147777  2    483885 1.3865 0.2506
  1. The F statistic is 1.39 with p-value 0.25, so age and age² are jointly insignificant: there is no strong evidence that age affects sleep once the other regressors are controlled for.

Question 3

data("gpa2")
sat_model <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)
summary(sat_model)
## 
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black, 
##     data = gpa2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1028.0972     6.2902 163.445  < 2e-16 ***
## hsize          19.2971     3.8323   5.035 4.97e-07 ***
## I(hsize^2)     -2.1948     0.5272  -4.163 3.20e-05 ***
## female        -45.0915     4.2911 -10.508  < 2e-16 ***
## black        -169.8126    12.7131 -13.357  < 2e-16 ***
## female:black   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16
# (i) Is hsize^2 important in the model?
beta1 <- coef(sat_model)["hsize"]
beta2 <- coef(sat_model)["I(hsize^2)"]
optimal_hsize <- -beta1 / (2 * beta2)
cat("Optimal high school size:", optimal_hsize, "\n")
## Optimal high school size: 4.39603
  1. The quadratic term is highly significant (t = -4.16), so hsize² belongs in the model; since hsize is measured in hundreds, the estimated optimal graduating-class size is about 440 students.
# (ii) SAT score difference between nonblack females and nonblack males
coef_female <- coef(summary(sat_model))["female", "Estimate"]
p_value_female <- coef(summary(sat_model))["female", "Pr(>|t|)"]
cat("Coefficient for 'female' (SAT score difference between nonblack females and nonblack males):", coef_female, "\n")
## Coefficient for 'female' (SAT score difference between nonblack females and nonblack males): -45.09145
cat("p-value for 'female':", p_value_female, "\n")
## p-value for 'female': 1.656301e-25
if (p_value_female < 0.05) {
  cat("Conclusion: Nonblack females are estimated to score about 45 points lower on the SAT than nonblack males, a highly significant difference.\n")
} else {
  cat("Conclusion: There is no significant evidence of a difference in SAT scores between nonblack females and nonblack males.\n")
}
## Conclusion: Nonblack females are estimated to score about 45 points lower on the SAT than nonblack males, a highly significant difference.
# (iii) SAT score difference between nonblack males and black males
linearHypothesis(sat_model, "black = 0")
## 
## Linear hypothesis test:
## black = 0
## 
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
## 
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4132 76652652                                  
## 2   4131 73479121  1   3173531 178.42 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) SAT score difference between black females and nonblack females
linearHypothesis(sat_model, "black + female:black = 0")
## 
## Linear hypothesis test:
## black  + female:black = 0
## 
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
## 
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4132 74700518                                  
## 2   4131 73479121  1   1221397 68.667 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. Black males are estimated to score about 170 points below nonblack males (part iii), and black females about 169.8 - 62.3 = 107.5 points below nonblack females (part iv); both gaps are highly significant.

C1

library(wooldridge)
data1 <- wooldridge::gpa1
head(data1)
##   age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1  21    0      0      1       0    0      0        1        0    3.0   3.0  21
## 2  21    0      0      1       0    0      0        1        0    3.4   3.2  24
## 3  20    0      1      0       0    0      0        1        0    3.0   3.6  26
## 4  19    1      0      0       0    1      1        1        0    3.5   3.5  27
## 5  20    0      1      0       0    0      0        1        0    3.6   3.9  28
## 6  20    0      0      1       0    1      1        1        0    3.0   3.4  25
##   job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1     0     1     1    0    0       0  0     0   1        1        0     0
## 2     0     1     1    0    0       0  0     0   1        0        1     1
## 3     1     0     0    0    1       0  0     0   1        1        0     1
## 4     1     0     0    0    1       0  0     0   0        1        0     0
## 5     0     1     0    1    0       0  0     0   1        1        1     0
## 6     0     0     0    0    1       0  0     0   1        1        0     0
##   skipped alcohol gradMI fathcoll mothcoll
## 1       2     1.0      1        0        0
## 2       0     1.0      1        1        1
## 3       0     1.0      1        1        1
## 4       0     0.0      0        0        0
## 5       0     1.5      1        1        0
## 6       0     0.0      0        1        0
data_C1 <- gpa1
model_C1 <- lm(colGPA~PC+hsGPA+ACT+mothcoll+fathcoll,data= data_C1)
summary(model_C1)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll, 
##     data = data_C1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78149 -0.25726 -0.02121  0.24691  0.74432 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.255554   0.335392   3.744 0.000268 ***
## PC           0.151854   0.058716   2.586 0.010762 *  
## hsGPA        0.450220   0.094280   4.775 4.61e-06 ***
## ACT          0.007724   0.010678   0.723 0.470688    
## mothcoll    -0.003758   0.060270  -0.062 0.950376    
## fathcoll     0.041800   0.061270   0.682 0.496265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.1934 
## F-statistic: 7.713 on 5 and 135 DF,  p-value: 2.083e-06
  1. Two variables are statistically significant: PC and hsGPA. Compared with the estimates in equation (7.6), the coefficient on PC falls only slightly, from 0.157 to 0.152, and it remains statistically significant, with t ≈ 2.59.
library(wooldridge)
data1 <- wooldridge::gpa1

# Perform the regression analysis
model_7C1_ur <- lm(colGPA ~ PC + hsGPA + ACT, data = data1)
summary(model_7C1_ur)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7901 -0.2622 -0.0107  0.2334  0.7570 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.263520   0.333125   3.793 0.000223 ***
## PC          0.157309   0.057287   2.746 0.006844 ** 
## hsGPA       0.447242   0.093647   4.776 4.54e-06 ***
## ACT         0.008659   0.010534   0.822 0.412513    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared:  0.2194, Adjusted R-squared:  0.2023 
## F-statistic: 12.83 on 3 and 137 DF,  p-value: 1.932e-07
# F test for the joint significance of mothcoll and fathcoll, using the R-squared form
F= ((0.2222-0.2194)/2)/((1-0.2222)/135)
F
## [1] 0.2429931
critical_value = qf(0.95, 2, 135)  # note: a third positional argument to qf() is ncp, not lower.tail
critical_value
## [1] 3.063204
model6<- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll , data = data1)
summary(model6)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll, 
##     data = data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78149 -0.25726 -0.02121  0.24691  0.74432 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.255554   0.335392   3.744 0.000268 ***
## PC           0.151854   0.058716   2.586 0.010762 *  
## hsGPA        0.450220   0.094280   4.775 4.61e-06 ***
## ACT          0.007724   0.010678   0.723 0.470688    
## mothcoll    -0.003758   0.060270  -0.062 0.950376    
## fathcoll     0.041800   0.061270   0.682 0.496265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.1934 
## F-statistic: 7.713 on 5 and 135 DF,  p-value: 2.083e-06
library(car)
hypotheses <- c("mothcoll=0", "fathcoll=0")
joint_test <- linearHypothesis(model6, hypotheses)
joint_test
## 
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
## 
## Model 1: restricted model
## Model 2: colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    137 15.149                           
## 2    135 15.094  2  0.054685 0.2446 0.7834
  1. The F test for the joint significance of mothcoll and fathcoll, with 2 and 135 degrees of freedom, gives F ≈ 0.24 with a p-value of about 0.78. The two variables are jointly insignificant, and adding them barely changes the other coefficient estimates.
model3 <- lm (colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll, data = gpa1)
summary (model3)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll + 
##     fathcoll, data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78998 -0.24327 -0.00648  0.26179  0.72231 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  5.040328   2.443038   2.063   0.0410 *
## PC           0.140446   0.058858   2.386   0.0184 *
## hsGPA       -1.802520   1.443552  -1.249   0.2140  
## I(hsGPA^2)   0.337341   0.215711   1.564   0.1202  
## ACT          0.004786   0.010786   0.444   0.6580  
## mothcoll     0.003091   0.060110   0.051   0.9591  
## fathcoll     0.062761   0.062401   1.006   0.3163  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared:  0.2361, Adjusted R-squared:  0.2019 
## F-statistic: 6.904 on 6 and 134 DF,  p-value: 2.088e-06
  1. When hsGPA² is added, its coefficient is about 0.337 with a t statistic of roughly 1.56, and the coefficient on hsGPA itself becomes about -1.803; neither term is individually significant. Taken at face value, the estimates imply a U-shaped relationship with a turning point at hsGPA* = 1.8025/(2 × 0.3373) ≈ 2.67, which makes interpretation awkward over most of the hsGPA range. As a robustness check the main result survives: the coefficient on PC falls to about 0.140 but remains statistically significant.
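The turning point can be verified directly from the fitted coefficients of model3:

# Turning point of the quadratic in hsGPA: -b_hsGPA / (2 * b_hsGPA^2)
b <- coef(model3)
unname(-b["hsGPA"] / (2 * b["I(hsGPA^2)"]))  # approximately 2.67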

C2

data3 <- wooldridge::wage2
head(data3)
##   wage hours  IQ KWW educ exper tenure age married black south urban sibs
## 1  769    40  93  35   12    11      2  31       1     0     0     1    1
## 2  808    50 119  41   18    11     16  37       1     0     0     1    1
## 3  825    40 108  46   14    11      9  33       1     0     0     1    1
## 4  650    40  96  32   12    13      7  32       1     0     0     1    4
## 5  562    40  74  27   11    14      5  34       1     0     0     1   10
## 6 1400    40 116  43   16    14      2  35       1     1     0     1    1
##   brthord meduc feduc    lwage
## 1       2     8     8 6.645091
## 2      NA    14    14 6.694562
## 3       2    14    14 6.715384
## 4       3    12    12 6.476973
## 5       6     6    11 6.331502
## 6       2     8    NA 7.244227
data("wage2")
model4 <- lm(log(wage) ~ educ +exper +tenure + married + black + south + urban, data = wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16
  1. The coefficient on black implies that, holding the other factors fixed, Black men earn about 18.8% less than non-Black men; the exact differential is 100·[exp(-0.188) - 1] ≈ -17.2%. With a t statistic of -5.00, the difference is highly statistically significant, strong evidence of a wage gap between the two groups.
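Because the dependent variable is log(wage), the exact percentage differential can be recovered from the fitted model:

# Exact percentage wage gap implied by the coefficient on black
100 * (exp(coef(model4)["black"]) - 1)  # about -17.2 percent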
model4 <- lm(log(wage) ~ educ +exper + I(exper^2) + tenure + I(tenure^2) + married + black + south + urban, data = wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure + 
##     I(tenure^2) + married + black + south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98236 -0.21972 -0.00036  0.24078  1.25127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3586756  0.1259143  42.558  < 2e-16 ***
## educ         0.0642761  0.0063115  10.184  < 2e-16 ***
## exper        0.0172146  0.0126138   1.365 0.172665    
## I(exper^2)  -0.0001138  0.0005319  -0.214 0.830622    
## tenure       0.0249291  0.0081297   3.066 0.002229 ** 
## I(tenure^2) -0.0007964  0.0004710  -1.691 0.091188 .  
## married      0.1985470  0.0391103   5.077 4.65e-07 ***
## black       -0.1906636  0.0377011  -5.057 5.13e-07 ***
## south       -0.0912153  0.0262356  -3.477 0.000531 ***
## urban        0.1854241  0.0269585   6.878 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.2477 
## F-statistic: 35.17 on 9 and 925 DF,  p-value: < 2.2e-16
linearHypothesis(model4, c("I(exper^2)=0", "I(tenure^2)=0" ))
## 
## Linear hypothesis test:
## I(exper^2) = 0
## I(tenure^2) = 0
## 
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + 
##     married + black + south + urban
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    925 123.42  2   0.39756 1.4898  0.226
# R-squared form of the F test for the two quadratic terms
F= ((0.255-0.2526)/2)/((1-0.255)/925)
F
## [1] 1.489933
1-pf(F,2,925)
## [1] 0.2259282
  1. The F-statistic for testing the joint significance of exper² and tenure², with 2 and 925 degrees of freedom, is approximately 1.49, yielding a p-value of around 0.226. Since the p-value exceeds 0.20, the quadratic terms are deemed collectively insignificant at the 20% significance level.
model4 <- lm(log(wage) ~ educ +exper +tenure + married + black + south + urban + I(black*educ), data = wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban + I(black * educ), data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.374817   0.114703  46.859  < 2e-16 ***
## educ             0.067115   0.006428  10.442  < 2e-16 ***
## exper            0.013826   0.003191   4.333 1.63e-05 ***
## tenure           0.011787   0.002453   4.805 1.80e-06 ***
## married          0.198908   0.039047   5.094 4.25e-07 ***
## black            0.094809   0.255399   0.371 0.710561    
## south           -0.089450   0.026277  -3.404 0.000692 ***
## urban            0.183852   0.026955   6.821 1.63e-11 ***
## I(black * educ) -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16
  1. When the interaction term black·educ is added to the equation, its coefficient is approximately -0.0226 with a standard error of 0.0202. The point estimates imply a return to education of roughly 6.7% per year for non-Black men and about 6.7 - 2.3 = 4.5% per year for Black men. That gap would be economically meaningful if it held in the population, but the t statistic is only about 1.12 in absolute value, so we cannot reject the null hypothesis that the return to education does not depend on race.
model4 <- lm(log(wage)~educ+exper+tenure+married+black+south+urban+educ*black,data= wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban + educ * black, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## black        0.094809   0.255399   0.371 0.710561    
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16
model_7C2_4 <- lm(log(wage)~educ+exper+tenure+married+black+south+urban+married*black,data= wage2)
summary(model_7C2_4)  # output omitted here; the coefficients used below are black = -0.2408 and married:black = 0.0614
  1. Married Black men have married = 1 and black = 1, while married non-Black men have married = 1 and black = 0. The estimated log-wage difference between the two groups is the coefficient on black plus the coefficient on married × black: -0.2408 + 0.0614 = -0.1794.

This says that married Black men are predicted to earn about 17.9% less in monthly earnings than comparable married non-Black men (more precisely, 100·[exp(-0.1794) - 1] ≈ -16.4%).
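Since the full summary is not reproduced above, a quick sketch of the combined effect from the fitted interaction model:

# Combined differential for married Black vs. married non-Black men, with the exact percent gap
b <- coef(model_7C2_4)
gap <- unname(b["black"] + b["married:black"])
c(log_points = gap, exact_percent = 100 * (exp(gap) - 1))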

Chapter 8

Question 1

data("smoke")

ols_model <- lm(cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(ols_model)
## 
## Call:
## lm(formula = cigs ~ cigpric + log(income) + educ + age + I(age^2) + 
##     restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.758  -9.336  -5.895   7.927  70.267 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.782521   8.852068  -0.653  0.51379    
## cigpric     -0.005724   0.101064  -0.057  0.95485    
## log(income)  0.865360   0.728766   1.187  0.23541    
## educ        -0.501908   0.167169  -3.002  0.00276 ** 
## age          0.774357   0.160516   4.824 1.68e-06 ***
## I(age^2)    -0.009068   0.001748  -5.187 2.71e-07 ***
## restaurn    -2.880176   1.116067  -2.581  0.01004 *  
## white       -0.553286   1.459490  -0.379  0.70472    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared:  0.05289,    Adjusted R-squared:  0.04459 
## F-statistic: 6.374 on 7 and 799 DF,  p-value: 2.609e-07
bptest(ols_model)
## 
##  studentized Breusch-Pagan test
## 
## data:  ols_model
## BP = 32.431, df = 7, p-value = 3.379e-05
  1. The OLS estimators, the bj, are inconsistent: False. Heteroskedasticity does not cause bias or inconsistency in the OLS estimators; only the usual variance formulas become invalid.
coeftest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -5.7825205  8.8831245 -0.6510  0.515262    
## cigpric     -0.0057240  0.1062869 -0.0539  0.957064    
## log(income)  0.8653600  0.5985769  1.4457  0.148655    
## educ        -0.5019084  0.1624052 -3.0905  0.002068 ** 
## age          0.7743569  0.1380708  5.6084 2.814e-08 ***
## I(age^2)    -0.0090678  0.0014596 -6.2127 8.375e-10 ***
## restaurn    -2.8801757  1.0155265 -2.8361  0.004682 ** 
## white       -0.5532861  1.3782955 -0.4014  0.688213    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. The usual F statistic no longer has an F distribution: True. In the presence of heteroskedasticity, the usual F-statistic can no longer be relied upon to follow an F-distribution.
waldtest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))
## Wald test
## 
## Model 1: cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + 
##     white
## Model 2: cigs ~ 1
##   Res.Df Df      F   Pr(>F)    
## 1    799                       
## 2    806 -7 9.3617 3.49e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. The OLS estimators are no longer BLUE: True. Heteroskedasticity violates the Gauss-Markov assumption of homoskedasticity (constant error variance), so OLS is no longer guaranteed to be the best linear unbiased estimator.
usual_se <- summary(ols_model)$coefficients[, "Std. Error"]
robust_se <- sqrt(diag(vcovHC(ols_model, type = "HC1")))

comparison <- data.frame(Variable = names(coef(ols_model)), Usual_SE = usual_se, Robust_SE = robust_se)
print(comparison)
##                Variable    Usual_SE   Robust_SE
## (Intercept) (Intercept) 8.852067639 8.883124494
## cigpric         cigpric 0.101063853 0.106286855
## log(income) log(income) 0.728765519 0.598576892
## educ               educ 0.167169257 0.162405199
## age                 age 0.160516185 0.138070842
## I(age^2)       I(age^2) 0.001748065 0.001459558
## restaurn       restaurn 1.116067336 1.015526517
## white             white 1.459489762 1.378295460

Question 5

data4 <- wooldridge::smoke
head(data4)
##   educ cigpric white age income cigs restaurn   lincome agesq lcigpric
## 1 16.0  60.506     1  46  20000    0        0  9.903487  2116 4.102743
## 2 16.0  57.883     1  40  30000    0        0 10.308952  1600 4.058424
## 3 12.0  57.664     1  58  30000    3        0 10.308952  3364 4.054633
## 4 13.5  57.883     1  30  20000    0        0  9.903487   900 4.058424
## 5 10.0  58.320     1  17  20000    0        0  9.903487   289 4.065945
## 6  6.0  59.340     1  86   6500    0        0  8.779557  7396 4.083283
model11 <- lm(cigs ~ log(cigpric) + log(income) + educ + age + agesq + restaurn + white , data = data4)
summary(model11)
## 
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age + 
##     agesq + restaurn + white, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.772  -9.330  -5.907   7.945  70.275 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.682419  24.220729  -0.111  0.91184    
## log(cigpric) -0.850907   5.782321  -0.147  0.88305    
## log(income)   0.869014   0.728763   1.192  0.23344    
## educ         -0.501753   0.167168  -3.001  0.00277 ** 
## age           0.774502   0.160516   4.825 1.68e-06 ***
## agesq        -0.009069   0.001748  -5.188 2.70e-07 ***
## restaurn     -2.865621   1.117406  -2.565  0.01051 *  
## white        -0.559236   1.459461  -0.383  0.70169    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared:  0.05291,    Adjusted R-squared:  0.04461 
## F-statistic: 6.377 on 7 and 799 DF,  p-value: 2.588e-07
# (i) The usual and heteroskedasticity-robust standard errors are similar overall; the largest
# difference is for log(income), whose robust SE (about 0.60) is smaller than the usual 0.73.
s_error <- coef(summary(model11))
s_error
##                  Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept)  -2.682418774 24.220728831 -0.1107489 9.118433e-01
## log(cigpric) -0.850907441  5.782321084 -0.1471567 8.830454e-01
## log(income)   0.869013971  0.728763480  1.1924499 2.334389e-01
## educ         -0.501753247  0.167167689 -3.0014966 2.770186e-03
## age           0.774502156  0.160515805  4.8250835 1.676279e-06
## agesq        -0.009068603  0.001748055 -5.1878253 2.699355e-07
## restaurn     -2.865621212  1.117405936 -2.5645301 1.051320e-02
## white        -0.559236403  1.459461034 -0.3831801 7.016882e-01
# (ii) Effect of four more years of education: in this equation, about 2 fewer cigarettes per day
# (in the text's LPM for smokes, the smoking probability falls by 4 x 0.029 = 0.116)
edu_4 <- coef(model11)["educ"] * 4
edu_4
##      educ 
## -2.007013
# (iii) Find the age at which predicted cigarette consumption starts to decrease
age <- -coef(model11)["age"] / (2 * coef(model11)["agesq"])
age
##     age 
## 42.7024
print(paste("The turning point for the quadratic is calculated as 0.020 / [2(0.00026)] ≈ 38.46, which is approximately 38 and a half years."))
## [1] "The turning point for the quadratic is calculated as 0.020 / [2(0.00026)] ≈ 38.46, which is approximately 38 and a half years."
# (iv) Interpret the coefficient on restaurn: living in a state with restaurant smoking
# restrictions is associated with about 2.87 fewer cigarettes per day, other factors fixed
coef_res <- coef(model11)["restaurn"]
coef_res
##  restaurn 
## -2.865621
# (v) Predict the probability of smoking for person 206
library(lmtest)
person_206 <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77, restaurn = 0, white = 0)
het_test <- bptest(model11)
het_test
## 
##  studentized Breusch-Pagan test
## 
## data:  model11
## BP = 32.377, df = 7, p-value = 3.458e-05
print(paste("The equation for smoking is: smokes = −0.656 + 0.069 × log(67.44) + 0.012 × log(6,500) − 0.029 × (16) + 0.020 × (77) + 0.00026 × (77²) − 0.0052."))
## [1] "The equation for smoking is: smokes = −0.656 + 0.069 × log(67.44) + 0.012 × log(6,500) − 0.029 × (16) + 0.020 × (77) + 0.00026 × (77²) − 0.0052."

C4

data("vote1")
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

# (i) Regress the OLS residuals on the regressors: by construction, this R^2 is identically zero
residual_model <- lm(resid(model) ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
r_squared <- summary(residual_model)$r.squared
print(paste("R^2 of residual regression:", r_squared))  # Expected R^2 ≈ 0
## [1] "R^2 of residual regression: 1.19813088272216e-32"
# (ii) Breusch-Pagan test for heteroskedasticity
bp_test <- bptest(model)
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 9.0934, df = 4, p-value = 0.05881
# (iii) Special case of the White test: squared residuals on fitted values and their squares
white_test <- bptest(model, ~ poly(fitted(model), 2), data = vote1)
print(white_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 5.49, df = 2, p-value = 0.06425
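Part (iii) asks for the F-statistic form of the special-case White test; it can be computed by regressing the squared residuals on the fitted values and their squares, a small sketch:

# F version of the special-case White test for the vote1 model
u2 <- resid(model)^2
fit <- fitted(model)
summary(lm(u2 ~ fit + I(fit^2)))$fstatistic  # F statistic with 2 and n - 3 df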

C13

library(sandwich)
library(lmtest)
data6 <- wooldridge::fertil2
head(data6)
##   mnthborn yearborn age electric radio tv bicycle educ ceb agefbrth children
## 1        5       64  24        1     1  1       1   12   0       NA        0
## 2        1       56  32        1     1  1       1   13   3       25        3
## 3        7       58  30        1     0  0       0    5   1       27        1
## 4       11       45  42        1     0  1       0    4   3       17        2
## 5        5       45  43        1     1  1       1   11   2       24        2
## 6        8       52  36        1     0  0       0    7   1       26        1
##   knowmeth usemeth monthfm yearfm agefm idlnchld heduc agesq urban urb_educ
## 1        1       0      NA     NA    NA        2    NA   576     1       12
## 2        1       1      11     80    24        3    12  1024     1       13
## 3        1       0       6     83    24        5     7   900     1        5
## 4        1       0       1     61    15        3    11  1764     1        4
## 5        1       1       3     66    20        2    14  1849     1       11
## 6        1       1      11     76    24        4     9  1296     1        7
##   spirit protest catholic frsthalf educ0 evermarr
## 1      0       0        0        1     0        0
## 2      0       0        0        1     0        1
## 3      1       0        0        0     0        1
## 4      0       0        0        0     0        1
## 5      0       1        0        1     0        1
## 6      0       0        0        0     0        1
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
robust_se <- sqrt(diag(vcovHC(model)))
summary_with_robust_se <- cbind(coef(model), "Robust SE" = robust_se)
summary(model)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16
print(summary_with_robust_se)
##                             Robust SE
## (Intercept) -4.222516228 0.2443961935
## age          0.340925520 0.0192199445
## I(age^2)    -0.002741209 0.0003513959
## educ        -0.075232323 0.0063159137
## electric    -0.310040409 0.0640737262
## urban       -0.200033857 0.0455162364
# Heteroskedasticity-robust SEs for model6 (the gpa1 regression from C1), revisited for comparison
coeftest(model6, vcov=vcovHC(model6, type = "HC0"))
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  1.2555544  0.3368805  3.7270  0.000284 ***
## PC           0.1518539  0.0596641  2.5451  0.012048 *  
## hsGPA        0.4502203  0.0962508  4.6776 6.962e-06 ***
## ACT          0.0077242  0.0103074  0.7494  0.454932    
## mothcoll    -0.0037579  0.0601271 -0.0625  0.950258    
## fathcoll     0.0417999  0.0578749  0.7222  0.471392    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# F test of the 3 religious-affiliation dummies from C13(ii): R-squared rises from 0.5734 to 0.574
F= ((0.574-0.5734)/3)/((1-0.574)/4349)
F
## [1] 2.041784
critical_value = qf(0.95, 3, 4349)  # again, a third positional argument would be ncp
critical_value
## [1] 2.606951
p_value = 1- pf(F,3,4349)
p_value
## [1] 0.1058349
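The R-squared values 0.574 and 0.5734 correspond to adding the three religious-affiliation dummies of part (ii) to the base fertil2 model; a direct computation, assuming those variable names in fertil2, would be:

# Unrestricted model with the religious dummies, plus usual and robust joint tests
model_relig <- lm(children ~ age + I(age^2) + educ + electric + urban +
                    spirit + protest + catholic, data = fertil2)
anova(model, model_relig)  # usual F test of the three exclusion restrictions
linearHypothesis(model_relig, c("spirit = 0", "protest = 0", "catholic = 0"),
                 vcov = vcovHC(model_relig, type = "HC0"))  # heteroskedasticity-robust version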
a <- coeftest(model, vcov = sandwich)
a
## 
## t test of coefficients:
## 
##                Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept) -4.22251623  0.24368307 -17.3279 < 2.2e-16 ***
## age          0.34092552  0.01916146  17.7923 < 2.2e-16 ***
## I(age^2)    -0.00274121  0.00035027  -7.8260 6.278e-15 ***
## educ        -0.07523232  0.00630336 -11.9353 < 2.2e-16 ***
## electric    -0.31004041  0.06390411  -4.8517 1.267e-06 ***
## urban       -0.20003386  0.04543962  -4.4022 1.097e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1065, df = 5, p-value < 2.2e-16
# (i) Some robust standard errors are smaller than the usual (non-robust) standard errors.
# Special-form White test for model6: regress squared residuals on fitted values and their squares
usqr=(summary(model6)$residual)^2
yhat=model6$fitted.values
yhatsqr=model6$fitted.values^2
model6WT=summary(lm(usqr~yhat+yhatsqr,data=fertil2))$r.squared
model6WT*4361  # LM statistic = n * R-squared
## [1] 143.6236
1-pchisq(model6WT*4361,2)  # chi-squared p-value with 2 df
## [1] 0
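The same special-form statistic for the fertil2 model itself:

# n * R-squared from regressing squared residuals of the fertil2 model on its fitted values
u2f <- resid(model)^2
fitf <- fitted(model)
LM_f <- summary(lm(u2f ~ fitf + I(fitf^2)))$r.squared * nobs(model)
c(LM = LM_f, p_value = 1 - pchisq(LM_f, df = 2))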
joint_test <- coeftest(model6, vcov = vcovHC)
print(joint_test[, "Pr(>|t|)"])
##  (Intercept)           PC        hsGPA          ACT     mothcoll     fathcoll 
## 5.903211e-04 1.682044e-02 1.977193e-05 4.846838e-01 9.528773e-01 4.948069e-01
fv <- fitted(model6)
head(fv,6)
##        1        2        3        4        5        6 
## 2.768423 2.919682 3.115218 3.039878 3.269490 3.021208
residuals <- resid(model6)
head(residuals,6)
##           1           2           3           4           5           6 
##  0.23157702  0.48031850 -0.11521801  0.46012183  0.33050949 -0.02120773
hetero_test <- lm(residuals^2 ~ fv)
summary(hetero_test)
## 
## Call:
## lm(formula = residuals^2 ~ fv)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13813 -0.07873 -0.04322  0.04132  0.51313 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.27680    0.18877  -1.466   0.1448  
## fv           0.12558    0.06165   2.037   0.0436 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.128 on 139 degrees of freedom
## Multiple R-squared:  0.02898,    Adjusted R-squared:  0.022 
## F-statistic: 4.149 on 1 and 139 DF,  p-value: 0.04357
print(paste("iii. Ho: There is no heteroskedasticity H1: There is heteroskedasticity."))
## [1] "iii. Ho: There is no heteroskedasticity H1: There is heteroskedasticity."
print(paste("As the p-value is below alpha at a 95% confidence level, there exists adequate statistical evidence to reject the null hypothesis, signifying the presence of heteroskedasticity in the model."))
## [1] "As the p-value is below alpha at a 95% confidence level, there exists adequate statistical evidence to reject the null hypothesis, signifying the presence of heteroskedasticity in the model."
# (iv) Is heteroskedasticity practically important?
print(paste("Not here: the robust and usual standard errors are very close, so none of the conclusions change when robust inference is used."))
## [1] "Not here: the robust and usual standard errors are very close, so none of the conclusions change when robust inference is used."

Chapter 9

Question 1

data7 <- wooldridge::ceosal2
head(data7)
##   salary age college grad comten ceoten sales profits mktval  lsalary   lsales
## 1   1161  49       1    1      9      2  6200     966  23200 7.057037 8.732305
## 2    600  43       1    1     10     10   283      48   1100 6.396930 5.645447
## 3    379  51       1    1      9      3   169      40   1100 5.937536 5.129899
## 4    651  55       1    0     22     22  1100     -54   1000 6.478509 7.003066
## 5    497  44       1    1      8      6   351      28    387 6.208590 5.860786
## 6   1067  64       1    1      7      7 19000     614   3900 6.972606 9.852194
##     lmktval comtensq ceotensq  profmarg
## 1 10.051908       81        4 15.580646
## 2  7.003066      100      100 16.961130
## 3  7.003066       81        9 23.668638
## 4  6.907755      484      484 -4.909091
## 5  5.958425       64       36  7.977208
## 6  8.268732       49       49  3.231579
model11 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = data7)
summary(model11)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + comten, data = data7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5436 -0.2796 -0.0164  0.2857  1.9879 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.571977   0.253466  18.038  < 2e-16 ***
## log(sales)   0.187787   0.040003   4.694 5.46e-06 ***
## log(mktval)  0.099872   0.049214   2.029  0.04397 *  
## profmarg    -0.002211   0.002105  -1.050  0.29514    
## ceoten       0.017104   0.005540   3.087  0.00236 ** 
## comten      -0.009238   0.003337  -2.768  0.00626 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared:  0.3525, Adjusted R-squared:  0.3336 
## F-statistic: 18.62 on 5 and 171 DF,  p-value: 9.488e-15
r_squared <- summary(model11)$r.squared
r_squared
## [1] 0.3525374
model12 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceotensq + comtensq, data = data7)
summary(model12)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceotensq + comtensq, data = data7)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47481 -0.25933 -0.00511  0.27010  2.07583 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.612e+00  2.524e-01  18.276  < 2e-16 ***
## log(sales)   1.805e-01  4.021e-02   4.489 1.31e-05 ***
## log(mktval)  1.018e-01  4.988e-02   2.040   0.0429 *  
## profmarg    -2.077e-03  2.135e-03  -0.973   0.3321    
## ceotensq     3.761e-04  1.916e-04   1.963   0.0512 .  
## comtensq    -1.788e-04  7.236e-05  -2.471   0.0144 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared:  0.3324, Adjusted R-squared:  0.3129 
## F-statistic: 17.03 on 5 and 171 DF,  p-value: 1.195e-13
r_squared <- summary(model12)$r.squared
r_squared
## [1] 0.3323998
  1. Replacing ceoten and comten with their squares lowers the R-squared from 0.3525 to 0.3324, so the specification with the level terms fits better; there is no evidence that only the quadratics belong in the model.

Question 5

data9 <- wooldridge::campus
head(data9)
##   enroll priv police crime   lcrime  lenroll  lpolice
## 1  21836    0     24   446 6.100319 9.991315 3.178054
## 2   6485    0     13     1 0.000000 8.777247 2.564949
## 3   2123    0      3     1 0.000000 7.660585 1.098612
## 4   8240    0     17   121 4.795791 9.016756 2.833213
## 5  19793    0     30   470 6.152733 9.893084 3.401197
## 6   3256    1      9    25 3.218876 8.088255 2.197225
b0 <- -6.63
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(data9)
t_stat <- (b1 - 1) / se_b1
df <- n - 2
critical_value <- qt(0.95, df)
cat("t statistic:", t_stat, "vs. one-sided 5% critical value:", round(critical_value, 3), "\n")
## t statistic: 2.454545 vs. one-sided 5% critical value: 1.661
if (t_stat > critical_value) {
  cat("Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n")
} else {
  cat("Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n")
}
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.

C4

data("infmrt")

# Add a dummy variable for District of Columbia (assuming DC is observation 51)
infmrt$dc <- ifelse(row.names(infmrt) == "51", 1, 0)

# (i) Estimate the model including the DC dummy variable
model_with_dc <- lm(log(infmort) ~ pcinc + physic + dc, data = infmrt)
summary(model_with_dc)
## 
## Call:
## lm(formula = log(infmort) ~ pcinc + physic + dc, data = infmrt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43481 -0.10338  0.00447  0.09096  0.39050 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.456e+00  8.688e-02  28.264  < 2e-16 ***
## pcinc       -3.078e-05  6.499e-06  -4.737 7.33e-06 ***
## physic       1.495e-03  2.729e-04   5.478 3.33e-07 ***
## dc          -7.940e-02  1.664e-01  -0.477    0.634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1629 on 98 degrees of freedom
## Multiple R-squared:  0.2537, Adjusted R-squared:  0.2309 
## F-statistic:  11.1 on 3 and 98 DF,  p-value: 2.444e-06
# (ii) Compare with the model without the DC dummy variable
model_without_dc <- lm(log(infmort) ~ pcinc + physic, data = infmrt)
summary(model_without_dc)
## 
## Call:
## lm(formula = log(infmort) ~ pcinc + physic, data = infmrt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43432 -0.10235  0.00626  0.09337  0.39064 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.448e+00  8.502e-02  28.794  < 2e-16 ***
## pcinc       -3.026e-05  6.378e-06  -4.743 7.07e-06 ***
## physic       1.487e-03  2.713e-04   5.480 3.24e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1623 on 99 degrees of freedom
## Multiple R-squared:  0.252,  Adjusted R-squared:  0.2369 
## F-statistic: 16.67 on 2 and 99 DF,  p-value: 5.744e-07
# Compare both models
anova(model_without_dc, model_with_dc)
## Analysis of Variance Table
## 
## Model 1: log(infmort) ~ pcinc + physic
## Model 2: log(infmort) ~ pcinc + physic + dc
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     99 2.6064                           
## 2     98 2.6004  1 0.0060446 0.2278 0.6342
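Note that infmrt stacks two years per state (the d90 dummy flags 1990), so DC appears twice; a sketch that flags both DC rows, under the assumption that DC is the 51st observation within each year:

# Hypothetical dummy covering DC in both years (rows 51 and 102 by assumption)
infmrt$dc_both <- as.integer(seq_len(nrow(infmrt)) %in% c(51, 102))
model_dc_both <- lm(log(infmort) ~ pcinc + physic + dc_both, data = infmrt)
summary(model_dc_both)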

C5

library(quantreg) 
## Loading required package: SparseM
data <- as.data.frame(rdchem)
data$sales_billion <- data$sales / 1e3

ols_full <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data)
summary(ols_full)
## 
## Call:
## lm(formula = rdintens ~ sales_billion + I(sales_billion^2) + 
##     profmarg, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         2.058967   0.626269   3.288  0.00272 **
## sales_billion       0.316611   0.138854   2.280  0.03041 * 
## I(sales_billion^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg            0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107
# (i) OLS regression intended to exclude the firm with the largest sales; note that
# sales_billion < 40 keeps all firms, since the largest has sales of about $39.7 billion
data_filtered <- data %>% filter(sales_billion < 40)
ols_filtered <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered)
summary(ols_filtered)
## 
## Call:
## lm(formula = rdintens ~ sales_billion + I(sales_billion^2) + 
##     profmarg, data = data_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         2.058967   0.626269   3.288  0.00272 **
## sales_billion       0.316611   0.138854   2.280  0.03041 * 
## I(sales_billion^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg            0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107
# (ii) LAD regression
lad_full <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data, tau = 0.5)
summary(lad_full)
## 
## Call: rq(formula = rdintens ~ sales_billion + I(sales_billion^2) + 
##     profmarg, tau = 0.5, data = data)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                    coefficients lower bd upper bd
## (Intercept)         1.40428      0.87031  2.66628
## sales_billion       0.26346     -0.13508  0.75753
## I(sales_billion^2) -0.00600     -0.01679  0.00344
## profmarg            0.11400      0.01376  0.16427
lad_filtered <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered, tau = 0.5)
summary(lad_filtered)
## 
## Call: rq(formula = rdintens ~ sales_billion + I(sales_billion^2) + 
##     profmarg, tau = 0.5, data = data_filtered)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                    coefficients lower bd upper bd
## (Intercept)         1.40428      0.87031  2.66628
## sales_billion       0.26346     -0.13508  0.75753
## I(sales_billion^2) -0.00600     -0.01679  0.00344
## profmarg            0.11400      0.01376  0.16427
# (iii) Compare OLS and LAD results
comparison <- data.frame(
  Method = c("OLS (Full)", "OLS (Filtered)", "LAD (Full)", "LAD (Filtered)"),
  Intercept = c(coef(ols_full)[1], coef(ols_filtered)[1], coef(lad_full)[1], coef(lad_filtered)[1]),
  Sales = c(coef(ols_full)[2], coef(ols_filtered)[2], coef(lad_full)[2], coef(lad_filtered)[2]),
  Sales_Squared = c(coef(ols_full)[3], coef(ols_filtered)[3], coef(lad_full)[3], coef(lad_filtered)[3]),
  Profmarg = c(coef(ols_full)[4], coef(ols_filtered)[4], coef(lad_full)[4], coef(lad_filtered)[4])
)
print(comparison)
##           Method Intercept     Sales Sales_Squared   Profmarg
## 1     OLS (Full)  2.058967 0.3166110  -0.007389624 0.05332217
## 2 OLS (Filtered)  2.058967 0.3166110  -0.007389624 0.05332217
## 3     LAD (Full)  1.404279 0.2634638  -0.006001127 0.11400366
## 4 LAD (Filtered)  1.404279 0.2634638  -0.006001127 0.11400366
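The filtered rows are identical to the full-sample rows because the largest firm in rdchem has sales of about $39.7 billion, so the condition sales_billion < 40 drops nothing. A sketch that actually excludes that firm (the threshold of 39 is an assumption chosen to drop only the largest observation):

# Drop the largest firm and re-estimate by OLS and LAD
data_trimmed <- subset(data, sales_billion < 39)
ols_trimmed <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_trimmed)
lad_trimmed <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg,
                  data = data_trimmed, tau = 0.5)
rbind(OLS = coef(ols_trimmed), LAD = coef(lad_trimmed))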

Chapter 10

Question 1

  1. I disagree: time series observations are typically not independently distributed, since most economic series are serially correlated over time. Independence is also not what unbiasedness requires; in the time series case, OLS is unbiased under linearity in parameters, no perfect collinearity, and strict exogeneity of the regressors, while assumptions such as homoskedasticity and no serial correlation are needed for the usual inference and for OLS to be efficient.
  2. I disagree: a trending variable can be used as the dependent variable, provided the trend is accounted for, for instance by including a time trend as a regressor or by differencing the data. Otherwise one risks a spurious regression driven by shared trends, as the short simulation after this list illustrates.
  3. I agree—seasonality usually refers to patterns that repeat within a year, such as quarterly or monthly data. When using annual time series data, seasonality is typically not a concern because the data is aggregated over the year, which helps smooth out seasonal fluctuations.
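A minimal simulated illustration of point 2 (all numbers are made up): two series that share only a deterministic trend appear related until the trend is controlled for.

# Spurious regression from a shared time trend
set.seed(1)
trend <- 1:100
y <- 0.05 * trend + rnorm(100)  # trending outcome
x <- 0.03 * trend + rnorm(100)  # trending regressor, otherwise unrelated to y
coef(summary(lm(y ~ x)))["x", ]          # x typically looks highly significant here
coef(summary(lm(y ~ x + trend)))["x", ]  # including the time trend removes the spurious effect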

Question 5

# Example code to create quarterly dummies and time trend

# Create quarterly dummies and time trend
#housing_data <- housing_data %>%
#  mutate(
#    quarter = as.factor(quarter),   
#    time_trend = 1:n()              
#  )

# Run the regression model
#housing_model <- lm(hs ~ interest_rate + income + time_trend + quarter, data = housing_data)
#summary(housing_model)

C1

library(dplyr)      

data("intdef")

intdef <- intdef %>%
  mutate(
    post_1979 = ifelse(year > 1979, 1, 0)  # Dummy variable: 1 if year > 1979, else 0
  )

# Run the regression including the dummy variable
policy_model <- lm(i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
summary(policy_model)
## 
## Call:
## lm(formula = i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5802 -0.8353  0.1936  0.7983  4.0447 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.39151    0.39028   3.565 0.000800 ***
## inf_1        0.56529    0.07144   7.913 1.99e-10 ***
## def_1        0.38269    0.11156   3.430 0.001203 ** 
## post_1979    1.77623    0.47268   3.758 0.000442 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.59 on 51 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.705,  Adjusted R-squared:  0.6876 
## F-statistic: 40.63 on 3 and 51 DF,  p-value: 1.479e-13
cat("\nConclusion:\n")
## 
## Conclusion:
cat("The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.\n")
## The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.

C9

data("volat")

# (i) Discuss expected signs of coefficients
cat("Expected signs:\n")
## Expected signs:
cat("β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.\n")
## β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.
cat("β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.\n\n")
## β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.
# (ii) Estimate the equation by OLS
model_c9 <- lm(rsp500 ~ pcip + i3, data = volat)
summary(model_c9)
## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637
cat("\nOLS Results:\n")
## 
## OLS Results:
cat("The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.\n\n")
## The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.
# (iii) Determine which variables are statistically significant

p_values <- summary(model_c9)$coefficients[, 4]
significant <- p_values < 0.05 

cat("P-values for each variable:\n")
## P-values for each variable:
print(p_values)
##  (Intercept)         pcip           i3 
## 1.444871e-08 7.784810e-01 1.207402e-02
cat("\nSignificance (TRUE = significant at 5% level):\n")
## 
## Significance (TRUE = significant at 5% level):
print(significant)
## (Intercept)        pcip          i3 
##        TRUE       FALSE        TRUE
# (iv) Interpret predictability
if (all(significant[-1] == FALSE)) {
  cat("\nThe variables pcip and i3 are NOT statistically significant.\n")
  cat("This suggests that the return on the S&P 500 is NOT predictable using these variables.\n")
} else {
  cat("\nAt least one variable is statistically significant.\n")
  cat("This suggests that the return on the S&P 500 shows SOME level of predictability.\n")
}
## 
## At least one variable is statistically significant.
## This suggests that the return on the S&P 500 shows SOME level of predictability.

Chapter 11

C7

data("consump")

# (i) Estimate g_ct = β0 + β1 * g_ct-1 + u_t and test the PIH
consump <- consump %>%
  mutate(g_ct = log(c) - lag(log(c)))  # g_ct = log(c_t) - log(c_t-1)

model_i <- lm(g_ct ~ lag(g_ct, 1), data = consump)
summary(model_i)
## 
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1), data = consump)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027878 -0.005974 -0.001451  0.007142  0.020227 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.011431   0.003778   3.026  0.00478 **
## lag(g_ct, 1) 0.446141   0.156047   2.859  0.00731 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01161 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1985, Adjusted R-squared:  0.1742 
## F-statistic: 8.174 on 1 and 33 DF,  p-value: 0.00731
# (ii) Add lagged income growth, interest rate, and inflation
consump <- consump %>%
  mutate(gy_lag = lag(gy),  
         i3_lag = lag(i3),  
         inf_lag = lag(inf))  

model_ii <- lm(g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag, data = consump)
summary(model_ii)
## 
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag, 
##     data = consump)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0249093 -0.0075869  0.0000855  0.0087234  0.0188617 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   0.0225940  0.0070892   3.187  0.00335 **
## lag(g_ct, 1)  0.4335832  0.2896525   1.497  0.14487   
## gy_lag       -0.1079087  0.1946371  -0.554  0.58341   
## i3_lag       -0.0007467  0.0011107  -0.672  0.50654   
## inf_lag      -0.0008281  0.0010041  -0.825  0.41606   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01134 on 30 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3038, Adjusted R-squared:  0.211 
## F-statistic: 3.273 on 4 and 30 DF,  p-value: 0.02431
# (iii) Compare p-value of lag(g_ct, 1) in model_i and model_ii
cat("\nPART (iii): P-values for lag(g_ct, 1)\n")
## 
## PART (iii): P-values for lag(g_ct, 1)
cat("Model (i) p-value for lag(g_ct, 1):", summary(model_i)$coefficients[2, 4], "\n")
## Model (i) p-value for lag(g_ct, 1): 0.007310181
cat("Model (ii) p-value for lag(g_ct, 1):", summary(model_ii)$coefficients[2, 4], "\n")
## Model (ii) p-value for lag(g_ct, 1): 0.1448651
# (iv) F test for joint significance of the added variables (gy_lag, i3_lag, inf_lag)
anova_test <- anova(model_i, model_ii)
cat("\nPART (iv): F-statistic for joint significance\n")
## 
## PART (iv): F-statistic for joint significance
print(anova_test)
## Analysis of Variance Table
## 
## Model 1: g_ct ~ lag(g_ct, 1)
## Model 2: g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag
##   Res.Df       RSS Df  Sum of Sq      F Pr(>F)
## 1     33 0.0044446                            
## 2     30 0.0038608  3 0.00058384 1.5122 0.2315
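With a p-value of about 0.23, the three added lags are jointly insignificant once lagged consumption growth is included. A sketch pulling the F statistic and p-value straight from the anova table:

# Sketch: extract the F statistic and p-value from the model comparison
cat("F statistic:", anova_test$F[2], "p-value:", anova_test$`Pr(>F)`[2], "\n")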

C12

data("minwage")

# Create a lagged variable for gwage232
minwage$lag_gwage232 <- lag(minwage$gwage232)

# (i) First-order autocorrelation in gwage232
filtered_data <- minwage[complete.cases(minwage$gwage232, minwage$lag_gwage232), ]

acf(filtered_data$gwage232, lag.max = 1)
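The plot shows the first-order autocorrelation graphically; a sketch printing the number itself (lag 0 occupies the first slot of the acf array, so the lag-1 value is the second):

# Sketch: numeric first-order autocorrelation of gwage232
rho1 <- acf(filtered_data$gwage232, lag.max = 1, plot = FALSE)$acf[2]
cat("First-order autocorrelation:", rho1, "\n")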

# (ii) Regress gwage232 on its lag, gmwage, and gcpi
model1 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
summary(model1)
## 
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044642 -0.004134 -0.001312  0.004482  0.041612 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.0024003  0.0004308   5.572 3.79e-08 ***
## lag_gwage232 -0.0779092  0.0342851  -2.272  0.02341 *  
## gmwage        0.1518459  0.0096485  15.738  < 2e-16 ***
## gcpi          0.2630876  0.0824457   3.191  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007889 on 606 degrees of freedom
## Multiple R-squared:  0.2986, Adjusted R-squared:  0.2951 
## F-statistic: 85.99 on 3 and 606 DF,  p-value: < 2.2e-16
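A natural follow-up is whether one lag of gwage232 soaks up the serial correlation; a hedged sketch (not part of the exercise) using the Breusch-Godfrey test from lmtest, which is loaded above:

# Sketch: test for remaining AR(1) serial correlation in model1's errors
bgtest(model1, order = 1)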
# (iii) Add lagged employment growth (gemp232) to the model
model2 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232), data = filtered_data)
summary(model2)
## 
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232), 
##     data = filtered_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043786 -0.004384 -0.001022  0.004325  0.042554 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.0024248  0.0004268   5.681 2.08e-08 ***
## lag_gwage232 -0.0756372  0.0339207  -2.230 0.026126 *  
## gmwage        0.1526838  0.0095403  16.004  < 2e-16 ***
## gcpi          0.2652171  0.0826041   3.211 0.001394 ** 
## lag(gemp232)  0.0662916  0.0169639   3.908 0.000104 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007798 on 604 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3167, Adjusted R-squared:  0.3122 
## F-statistic: 69.98 on 4 and 604 DF,  p-value: < 2.2e-16
# (iv) Compare the gmwage coefficient across models (ii) and (iii)
coef(model1)["gmwage"]
##    gmwage 
## 0.1518459
coef(model2)["gmwage"]
##    gmwage 
## 0.1526838
# (v) Regress gmwage on the lagged variables and report the R-squared
model3 <- lm(gmwage ~ lag(gwage232) + lag(gemp232), data = filtered_data)
summary(model3)$r.squared
## [1] 0.003908469
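The gmwage coefficient barely moves between (ii) and (iii), which fits the near-zero R-squared in (v): gmwage is essentially uncorrelated with the lagged variables, so omitting them hardly affects its coefficient. A sketch making the comparison explicit:

# Sketch: change in the gmwage coefficient set against the tiny R-squared
cat("Change in gmwage coefficient:",
    coef(model2)["gmwage"] - coef(model1)["gmwage"], "\n")
cat("R-squared from part (v):", summary(model3)$r.squared, "\n")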

Chapter 12

C11

data("nyse")

# (i) Estimate the OLS model in equation (12.47)
nyse <- nyse %>% mutate(return_lag1 = lag(return, 1))  
ols_model <- lm(return ~ return_lag1, data = nyse)

cat("OLS Model Summary:\n")
## OLS Model Summary:
summary(ols_model)
## 
## Call:
## lm(formula = return ~ return_lag1, data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.261  -1.302   0.098   1.316   8.065 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.17963    0.08074   2.225   0.0264 *
## return_lag1  0.05890    0.03802   1.549   0.1218  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.11 on 687 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.003481,   Adjusted R-squared:  0.00203 
## F-statistic: 2.399 on 1 and 687 DF,  p-value: 0.1218
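Since the squared-residual regressions below document heteroskedasticity, heteroskedasticity-robust standard errors are a sensible cross-check on the OLS t statistics; a hedged sketch using sandwich and lmtest, both loaded above:

# Sketch: heteroskedasticity-robust inference for the return equation
coeftest(ols_model, vcov = vcovHC(ols_model, type = "HC0"))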
# Keep only the rows lm() actually used, so the squared residuals line up
nyse <- nyse %>% filter(!is.na(return_lag1))
nyse$residuals_sq <- residuals(ols_model)^2

mean_resid_sq <- mean(nyse$residuals_sq)
min_resid_sq <- min(nyse$residuals_sq)
max_resid_sq <- max(nyse$residuals_sq)

cat("Mean of squared residuals:", mean_resid_sq, "\n")
## Mean of squared residuals: 4.440839
cat("Minimum of squared residuals:", min_resid_sq, "\n")
## Minimum of squared residuals: 7.35465e-06
cat("Maximum of squared residuals:", max_resid_sq, "\n")
## Maximum of squared residuals: 232.8946
# (ii) Estimate heteroskedasticity model using lagged return
hetero_model <- lm(residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)

cat("Heteroskedasticity Model Summary:\n")
## Heteroskedasticity Model Summary:
summary(hetero_model)
## 
## Call:
## lm(formula = residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.459  -3.011  -1.975   0.676 221.469 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.25734    0.44085   7.389 4.32e-13 ***
## return_lag1      -0.78946    0.19569  -4.034 6.09e-05 ***
## I(return_lag1^2)  0.29666    0.03552   8.351 3.75e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared:  0.1303, Adjusted R-squared:  0.1278 
## F-statistic:  51.4 on 2 and 686 DF,  p-value: < 2.2e-16
# (iii) Plot conditional variance as a function of lagged return
plot_data <- data.frame(return_lag1 = seq(min(nyse$return_lag1, na.rm = TRUE), 
                                          max(nyse$return_lag1, na.rm = TRUE), length.out = 100))
plot_data$conditional_variance <- predict(hetero_model, newdata = plot_data)
plot(plot_data$return_lag1, plot_data$conditional_variance, type = "l", 
     main = "Conditional Variance vs. Lagged Return",
     xlab = "Lagged Return", ylab = "Conditional Variance")

min_variance_index <- which.min(plot_data$conditional_variance)
cat("Smallest conditional variance:", plot_data$conditional_variance[min_variance_index], "\n")
## Smallest conditional variance: 2.734254
cat("Lagged return value for smallest variance:", plot_data$return_lag1[min_variance_index], "\n")
## Lagged return value for smallest variance: 1.245583
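The grid search only approximates the minimizer. Since the fitted variance is the quadratic h(z) = b0 + b1*z + b2*z^2, its vertex is available in closed form; a sketch:

# Sketch: exact minimizer z* = -b1 / (2 * b2) and the minimum variance h(z*)
b <- coef(hetero_model)
z_star <- -b[2] / (2 * b[3])
h_min <- b[1] + b[2] * z_star + b[3] * z_star^2
cat("Exact minimizer:", z_star, " minimum variance:", h_min, "\n")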
# (iv) Check for negative variance estimates
if (any(plot_data$conditional_variance < 0)) {
  cat("Negative variance estimates are present.\n")
} else {
  cat("No negative variance estimates.\n")
}
## No negative variance estimates.
# (v) Estimate an ARCH(1) model and compare its fit with the model from (ii)
nyse$residuals_sq_lag1 <- lag(nyse$residuals_sq, 1)
arch1_model <- lm(residuals_sq ~ residuals_sq_lag1, data = nyse)
summary(arch1_model)
## 
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1, data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.337  -3.292  -2.157   0.556 223.981 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.94743    0.44023   6.695 4.49e-11 ***
## residuals_sq_lag1  0.33706    0.03595   9.377  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 686 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1136, Adjusted R-squared:  0.1123 
## F-statistic: 87.92 on 1 and 686 DF,  p-value: < 2.2e-16
cat("ARCH(1) model R-squared:", summary(arch1_model)$r.squared, "\n")
## ARCH(1) model R-squared: 0.1136065
cat("Heteroskedasticity model R-squared:", summary(hetero_model)$r.squared, "\n")
## Heteroskedasticity model R-squared: 0.1303208
# (vi) Add a second lag to estimate an ARCH(2) model
nyse$residuals_sq_lag2 <- lag(nyse$residuals_sq, 2)
arch2_model <- lm(residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, data = nyse)
summary(arch2_model)
## 
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, 
##     data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.934  -3.298  -2.158   0.600 224.296 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.82950    0.45495   6.219 8.69e-10 ***
## residuals_sq_lag1  0.32284    0.03820   8.450  < 2e-16 ***
## residuals_sq_lag2  0.04179    0.03820   1.094    0.274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 684 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1151, Adjusted R-squared:  0.1125 
## F-statistic: 44.47 on 2 and 684 DF,  p-value: < 2.2e-16
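The second lag is individually insignificant (p of about 0.27) and the R-squared barely improves on ARCH(1), so the extra lag adds little. A sketch putting the two fits side by side (anova is avoided here because the two models are estimated on different samples):

# Sketch: compare ARCH(1) and ARCH(2) fits
cat("ARCH(1) R-squared:", summary(arch1_model)$r.squared, "\n")
cat("ARCH(2) R-squared:", summary(arch2_model)$r.squared, "\n")
cat("p-value on the second lag:",
    summary(arch2_model)$coefficients["residuals_sq_lag2", 4], "\n")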
cat("ARCH(2) model Summary:\n")
## ARCH(2) model Summary:
print(summary(arch2_model))
## 
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, 
##     data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.934  -3.298  -2.158   0.600 224.296 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.82950    0.45495   6.219 8.69e-10 ***
## residuals_sq_lag1  0.32284    0.03820   8.450  < 2e-16 ***
## residuals_sq_lag2  0.04179    0.03820   1.094    0.274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 684 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1151, Adjusted R-squared:  0.1125 
## F-statistic: 44.47 on 2 and 684 DF,  p-value: < 2.2e-16