#####Chapter7-1
library(wooldridge)
data("sleep75")
#i
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(model)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## I(age^2)       0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16
coef_male <- summary(model)$coefficients["male", ]
coef_male
##    Estimate  Std. Error     t value    Pr(>|t|) 
## 87.75242861 34.32616153  2.55642999  0.01078518
#The coefficient on male is 87.75, meaning that, holding work, education, and age fixed, men sleep about 87.75 more minutes per week than women. The p-value (0.0108) indicates the difference is statistically significant at the 5% level.

#ii Coefficient and p-value for 'totwrk'
coef_totwrk <- summary(model)$coefficients["totwrk", ]
coef_totwrk
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -1.634232e-01  1.813210e-02 -9.012918e+00  1.891272e-18
#The coefficient on totwrk is -0.163: each additional minute of work reduces predicted sleep by 0.163 minutes per week, i.e., roughly 10 minutes of sleep are lost per extra hour of work. The tiny p-value (1.89e-18) confirms this tradeoff is highly significant.

#iii Perform an F-test to jointly test age and age^2
ftest_age <- car::linearHypothesis(model, c("age = 0", "I(age^2) = 0"))  # avoid masking car::linearHypothesis
ftest_age
## 
## Linear hypothesis test:
## age = 0
## I(age^2) = 0
## 
## Model 1: restricted model
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
## 
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1    702 122631662                           
## 2    700 122147777  2    483885 1.3865 0.2506
# The F-test for age and age^2 yields a p-value of 0.2506, so we fail to reject the null hypothesis that both age terms are zero; there is no statistically significant evidence that age affects sleep in this quadratic form.
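# The reported F statistic can be reproduced from the restricted and unrestricted
# RSS printed above (a sketch, not run here):
# rss_r <- 122631662; rss_ur <- 122147777
# f_stat <- ((rss_r - rss_ur) / 2) / (rss_ur / 700)   # = 1.3865
# pf(f_stat, 2, 700, lower.tail = FALSE)              # = 0.2506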



#####Chapter7-3
library(wooldridge)
data("gpa2")
#(i) Fit the regression model
model_gpa2 <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)
summary(model_gpa2)
## 
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black, 
##     data = gpa2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1028.0972     6.2902 163.445  < 2e-16 ***
## hsize          19.2971     3.8323   5.035 4.97e-07 ***
## I(hsize^2)     -2.1948     0.5272  -4.163 3.20e-05 ***
## female        -45.0915     4.2911 -10.508  < 2e-16 ***
## black        -169.8126    12.7131 -13.357  < 2e-16 ***
## female:black   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16
coef_hsiz2 <- summary(model_gpa2)$coefficients["I(hsize^2)", ]
coef_hsiz2
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -2.194828e+00  5.271635e-01 -4.163468e+00  3.198417e-05
b1 <- coef(model_gpa2)["hsize"]
b2 <- coef(model_gpa2)["I(hsize^2)"]
optimal_hsize <- -b1 / (2 * b2)
optimal_hsize
##   hsize 
## 4.39603
# The quadratic term is clearly warranted: the coefficient on hsize^2 is -2.19 with a highly significant p-value (3.20e-05), so the relationship between graduating-class size and SAT score is an inverted U. Since hsize is measured in hundreds, the estimated optimal class size is -b1/(2*b2) ≈ 4.40 hundreds, i.e., about 440 students.

#(ii) Coefficient and p-value for `female`
coef_female <- summary(model_gpa2)$coefficients["female", ]
coef_female
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -4.509145e+01  4.291063e+00 -1.050822e+01  1.656301e-25
#Holding hsize fixed, the estimated difference in SAT scores between nonblack females and nonblack males is -45.09, as indicated by the coefficient for female. The p-value for this coefficient is extremely low (<2e-16), meaning this difference is statistically significant.

#(iii) Coefficient and p-value for `black`
coef_black <- summary(model_gpa2)$coefficients["black", ]
coef_black
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -1.698126e+02  1.271315e+01 -1.335725e+01  7.140387e-40
#The estimated difference in SAT scores between black males and nonblack males (holding hsize and gender fixed) is -169.81, the coefficient on black. Its t statistic (-13.36) gives a p-value far below any conventional level (<2e-16), so the difference is strongly statistically significant.

#(iv) Calculate the difference: female + black + female:black
library(car)
## Loading required package: carData
coef_female_black <- coef(model_gpa2)["female"] + coef(model_gpa2)["black"] + coef(model_gpa2)["female:black"]
coef_female_black
##    female 
## -152.5977
linearHypothesis(model_gpa2, c("female + black + female:black = 0"))
## 
## Linear hypothesis test:
## female  + black  + female:black = 0
## 
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
## 
##   Res.Df      RSS Df Sum of Sq   F    Pr(>F)    
## 1   4132 75969420                               
## 2   4131 73479121  1   2490298 140 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The quantity computed above, female + black + female:black = -152.60, is the estimated difference between black females and the base group of nonblack males, and the F-test shows it is strongly significant (p < 2.2e-16). The difference between black females and nonblack females, by contrast, holds female fixed and equals black + female:black (see the sketch below).
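# A sketch of the black-female vs. nonblack-female comparison (not run here):
# black + female:black = -169.81 + 62.31 ≈ -107.51
# linearHypothesis(model_gpa2, "black + female:black = 0")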



#####Chapter7-C1
library(wooldridge)
data("gpa1")
#(i) Fit the base model (without mothcoll and fathcoll)
model_base <- lm(colGPA ~ hsGPA + ACT + PC, data = gpa1)
# Fit the extended model (with mothcoll and fathcoll)
model_extended <- lm(colGPA ~ hsGPA + ACT + PC + mothcoll + fathcoll, data = gpa1)
summary(model_base)        # Base model
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC, data = gpa1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7901 -0.2622 -0.0107  0.2334  0.7570 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.263520   0.333125   3.793 0.000223 ***
## hsGPA       0.447242   0.093647   4.776 4.54e-06 ***
## ACT         0.008659   0.010534   0.822 0.412513    
## PC          0.157309   0.057287   2.746 0.006844 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared:  0.2194, Adjusted R-squared:  0.2023 
## F-statistic: 12.83 on 3 and 137 DF,  p-value: 1.932e-07
summary(model_extended)    # Extended model
## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + PC + mothcoll + fathcoll, 
##     data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78149 -0.25726 -0.02121  0.24691  0.74432 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.255554   0.335392   3.744 0.000268 ***
## hsGPA        0.450220   0.094280   4.775 4.61e-06 ***
## ACT          0.007724   0.010678   0.723 0.470688    
## PC           0.151854   0.058716   2.586 0.010762 *  
## mothcoll    -0.003758   0.060270  -0.062 0.950376    
## fathcoll     0.041800   0.061270   0.682 0.496265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.1934 
## F-statistic: 7.713 on 5 and 135 DF,  p-value: 2.083e-06
#After adding mothcoll and fathcoll, the estimated coefficient on PC ownership (0.1519) remains statistically significant (p = 0.0108), so PC ownership still matters for college GPA once the parental-education variables are included. mothcoll and fathcoll themselves are individually insignificant (p = 0.9504 and 0.4963), suggesting their inclusion adds little to the model.

#(ii) Test for joint significance
linearHypothesis(model_extended, c("mothcoll = 0", "fathcoll = 0"))
## 
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
## 
## Model 1: restricted model
## Model 2: colGPA ~ hsGPA + ACT + PC + mothcoll + fathcoll
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    137 15.149                           
## 2    135 15.094  2  0.054685 0.2446 0.7834
#The p-value for the joint hypothesis test (mothcoll = 0 and fathcoll = 0) is 0.7834, which is much greater than the common significance level of 0.05. This indicates that mothcoll and fathcoll are jointly not significant in explaining college GPA.

#(iii) Fit the model with hsGPA^2
model_hsGPA2 <- lm(colGPA ~ hsGPA + I(hsGPA^2) + ACT + PC + mothcoll + fathcoll, data = gpa1)
summary(model_hsGPA2)
## 
## Call:
## lm(formula = colGPA ~ hsGPA + I(hsGPA^2) + ACT + PC + mothcoll + 
##     fathcoll, data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78998 -0.24327 -0.00648  0.26179  0.72231 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  5.040328   2.443038   2.063   0.0410 *
## hsGPA       -1.802520   1.443552  -1.249   0.2140  
## I(hsGPA^2)   0.337341   0.215711   1.564   0.1202  
## ACT          0.004786   0.010786   0.444   0.6580  
## PC           0.140446   0.058858   2.386   0.0184 *
## mothcoll     0.003091   0.060110   0.051   0.9591  
## fathcoll     0.062761   0.062401   1.006   0.3163  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared:  0.2361, Adjusted R-squared:  0.2019 
## F-statistic: 6.904 on 6 and 134 DF,  p-value: 2.088e-06
coef_hsGPA2 <- summary(model_hsGPA2)$coefficients["I(hsGPA^2)", ]
coef_hsGPA2
##   Estimate Std. Error    t value   Pr(>|t|) 
##  0.3373405  0.2157105  1.5638577  0.1202096
#The coefficient on hsGPA^2 is 0.3373 with a p-value of 0.1202, not significant at the 5% level, so there is no strong evidence that the quadratic term is needed. Note also that adding hsGPA^2 makes hsGPA itself insignificant, a symptom of the strong collinearity between hsGPA and its square.
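# The fitted quadratic implies colGPA is minimized at hsGPA = 1.8025 / (2 * 0.3373)
# ≈ 2.67 (a sketch, not run here):
# -coef(model_hsGPA2)["hsGPA"] / (2 * coef(model_hsGPA2)["I(hsGPA^2)"])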



#####Chapter7-C2
#(i) Estimate the log-linear wage model
library(wooldridge)
data("wage2")
model_wage <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model_wage)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16
#The coefficient on black is -0.1884, implying that, holding the other factors fixed, black men earn approximately 18.8% less than nonblack men (as a log approximation; see the exact calculation below). The p-value of 6.84e-07 shows the racial wage gap is highly statistically significant.
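# Because the model is log-level, the exact percentage differential is
# 100 * (exp(beta) - 1); a sketch (not run here):
# 100 * (exp(coef(model_wage)["black"]) - 1)   # ≈ -17.2%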

#(ii) Add quadratic terms for exper and tenure
model_wage_quad <- lm(log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + married + black + south + urban, data = wage2)
summary(model_wage_quad)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure + 
##     I(tenure^2) + married + black + south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98236 -0.21972 -0.00036  0.24078  1.25127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3586756  0.1259143  42.558  < 2e-16 ***
## educ         0.0642761  0.0063115  10.184  < 2e-16 ***
## exper        0.0172146  0.0126138   1.365 0.172665    
## I(exper^2)  -0.0001138  0.0005319  -0.214 0.830622    
## tenure       0.0249291  0.0081297   3.066 0.002229 ** 
## I(tenure^2) -0.0007964  0.0004710  -1.691 0.091188 .  
## married      0.1985470  0.0391103   5.077 4.65e-07 ***
## black       -0.1906636  0.0377011  -5.057 5.13e-07 ***
## south       -0.0912153  0.0262356  -3.477 0.000531 ***
## urban        0.1854241  0.0269585   6.878 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.2477 
## F-statistic: 35.17 on 9 and 925 DF,  p-value: < 2.2e-16
linearHypothesis(model_wage_quad, c("I(exper^2) = 0", "I(tenure^2) = 0"))
## 
## Linear hypothesis test:
## I(exper^2) = 0
## I(tenure^2) = 0
## 
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + 
##     married + black + south + urban
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    925 123.42  2   0.39756 1.4898  0.226
#The joint test of the quadratic terms exper^2 and tenure^2 gives a p-value of 0.226, so they are jointly insignificant even at the 20% level. Thus we fail to reject the null hypothesis that the quadratic terms are jointly zero, and they can be dropped from the model.

#(iii) Test interaction between race and return to education
model_interaction <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
summary(model_interaction)
## 
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## black        0.094809   0.255399   0.371 0.710561    
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16
#The interaction educ:black has a coefficient of -0.0226 with a p-value of 0.263, well above the 5% level. Therefore we cannot conclude that the return to education depends on race.
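# The implied return to education is about 6.7% per year for nonblacks and
# educ + educ:black = 0.067115 - 0.022624 ≈ 0.0445 (about 4.5%) for blacks;
# a sketch (not run here):
# coef(model_interaction)["educ"] + coef(model_interaction)["educ:black"]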

#(iv) Test wage differentials across marital and race groups
model_married_black <- lm(log(wage) ~ educ + exper + tenure + married * black + south + urban, data = wage2)
summary(model_married_black)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married * black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98013 -0.21780  0.01057  0.24219  1.22889 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.403793   0.114122  47.351  < 2e-16 ***
## educ           0.065475   0.006253  10.471  < 2e-16 ***
## exper          0.014146   0.003191   4.433 1.04e-05 ***
## tenure         0.011663   0.002458   4.745 2.41e-06 ***
## married        0.188915   0.042878   4.406 1.18e-05 ***
## black         -0.240820   0.096023  -2.508 0.012314 *  
## south         -0.091989   0.026321  -3.495 0.000497 ***
## urban          0.184350   0.026978   6.833 1.50e-11 ***
## married:black  0.061354   0.103275   0.594 0.552602    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared:  0.2528, Adjusted R-squared:  0.2464 
## F-statistic: 39.17 on 8 and 926 DF,  p-value: < 2.2e-16
linearHypothesis(model_married_black, "married:black = 0")
## 
## Linear hypothesis test:
## married:black = 0
## 
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + tenure + married * black + south + 
##     urban
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    926 123.77  1  0.047174 0.3529 0.5526
#The coefficient on married:black (0.0614, p = 0.553) measures how the marriage premium differs by race, and it is not statistically significant. The wage differential between married blacks and married nonblacks is the sum black + married:black = -0.2408 + 0.0614 ≈ -0.179, i.e., married black men are estimated to earn roughly 18% less than married nonblack men, other factors fixed.
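# A sketch of the test for that married-black vs. married-nonblack differential
# (not run here):
# linearHypothesis(model_married_black, "black + married:black = 0")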



#####Chapter8-1
library(lmtest)  
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)     
library(sandwich) 
library(zoo)
set.seed(123)
n <- 100
x <- rnorm(n, mean = 50, sd = 10)
u <- rnorm(n, mean = 0, sd = x)  # Heteroskedastic error
y <- 5 + 2 * x + u
model <- lm(y ~ x)
summary(model)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -109.876  -35.522   -4.845   29.767  130.827 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  25.1847    27.6782   0.910  0.36510   
## x             1.4874     0.5353   2.779  0.00654 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.62 on 98 degrees of freedom
## Multiple R-squared:  0.07304,    Adjusted R-squared:  0.06358 
## F-statistic: 7.722 on 1 and 98 DF,  p-value: 0.006541
bp_test <- bptest(model)
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.2235, df = 1, p-value = 0.2687
white_test <- bptest(model, ~ fitted(model) + I(fitted(model)^2))
print(white_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1.243, df = 2, p-value = 0.5371
robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
print(robust_se)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 25.18470   27.41571  0.9186 0.360548   
## x            1.48743    0.54484  2.7300 0.007509 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("\n--- Regular OLS Summary ---\n")
## 
## --- Regular OLS Summary ---
print(summary(model))
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -109.876  -35.522   -4.845   29.767  130.827 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  25.1847    27.6782   0.910  0.36510   
## x             1.4874     0.5353   2.779  0.00654 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.62 on 98 degrees of freedom
## Multiple R-squared:  0.07304,    Adjusted R-squared:  0.06358 
## F-statistic: 7.722 on 1 and 98 DF,  p-value: 0.006541
cat("\n--- Robust Standard Errors ---\n")
## 
## --- Robust Standard Errors ---
print(robust_se)
## 
## t test of coefficients:
## 
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 25.18470   27.41571  0.9186 0.360548   
## x            1.48743    0.54484  2.7300 0.007509 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Consequences of heteroskedasticity: the OLS coefficient estimates remain unbiased and consistent, but OLS is no longer BLUE, and the usual standard errors, t statistics, and F statistics are invalid. Note that even though the errors here are heteroskedastic by construction (sd(u) = x), neither the BP test nor the White test rejects; with n = 100 and a limited range of x, these tests can have low power.



#####Chapter8-5
# Input coefficients and data for the analysis
coefficients <- list(
  intercept = 0.656,
  log_cigpric = -0.069,
  log_income = 0.012,
  educ = -0.029,
  age = 0.020,
  age_sq = -0.00026,
  restaurn = -0.101,
  white = -0.026
)
# Standard errors for reference 
standard_errors <- list(
  usual = c(0.204, 0.026, 0.006, 0.006, 0.00006, 0.039, 0.052),
  robust = c(0.207, 0.026, 0.006, 0.005, 0.00006, 0.038, 0.050)
)
#(i) Compare standard errors
cat("Comparison of Standard Errors:\n")
## Comparison of Standard Errors:
usual <- standard_errors$usual
robust <- standard_errors$robust
comparison <- data.frame(
  Variable = c("log(cigpric)", "log(income)", "educ", "age", "age^2", "restaurn", "white"),
  Usual_SE = usual,
  Robust_SE = robust
)
print(comparison)
##       Variable Usual_SE Robust_SE
## 1 log(cigpric)  0.20400   0.20700
## 2  log(income)  0.02600   0.02600
## 3         educ  0.00600   0.00600
## 4          age  0.00600   0.00500
## 5        age^2  0.00006   0.00006
## 6     restaurn  0.03900   0.03800
## 7        white  0.05200   0.05000
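#The usual and robust standard errors are nearly identical for every coefficient, so using the robust versions would not change any inference here.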
#(ii) Change in probability for 4 years of education
beta_educ <- coefficients$educ
delta_prob <- 4 * beta_educ
cat("Change in probability for 4 additional years of education:", delta_prob, "\n")
## Change in probability for 4 additional years of education: -0.116
#(iii) Calculate turning point for age
beta_age <- coefficients$age
beta_age_sq <- coefficients$age_sq
turning_point <- -beta_age / (2 * beta_age_sq)
cat("Turning point where age reduces probability:", turning_point, "\n")
## Turning point where age reduces probability: 38.46154
#(iv) Print coefficient interpretation
beta_restaurn <- coefficients$restaurn
cat("Effect of restaurant smoking restrictions:", beta_restaurn, "\n")
## Effect of restaurant smoking restrictions: -0.101
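#In the linear probability model, this means living in a state with restaurant smoking restrictions lowers the probability of smoking by about 10.1 percentage points, other factors fixed.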
#(v) Define characteristics of person 206
log_cigpric <- log(67.44)
log_income <- log(6500)
educ <- 16
age <- 77
restaurn <- 0
white <- 0
# Predicted probability
predicted_prob <- coefficients$intercept +
  coefficients$log_cigpric * log_cigpric +
  coefficients$log_income * log_income +
  coefficients$educ * educ +
  coefficients$age * age +
  coefficients$age_sq * (age^2) +
  coefficients$restaurn * restaurn +
  coefficients$white * white
cat("Predicted probability of smoking for person 206:", predicted_prob, "\n")
## Predicted probability of smoking for person 206: 0.005239246
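#The predicted probability of smoking for person 206 is essentially zero (about 0.5%), consistent with the negative education effect and an age well past the estimated turning point.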



#####Chapter8-C4
#(i)Regress residuals on independent variables
library(wooldridge)
data("vote1")
ols_model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
residuals <- resid(ols_model)
residuals_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(residuals_model)$r.squared
## [1] 1.198131e-32
#The R^2 from regressing the OLS residuals on the independent variables is essentially zero (1.2e-32, zero up to floating-point error). This is expected: OLS residuals are by construction orthogonal to the regressors, so the regressors can explain none of their variation.

#(ii) Breusch-Pagan Test for Heteroskedasticity
squared_residuals <- residuals^2
bp_model <- lm(squared_residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
bp_fstat <- summary(bp_model)$fstatistic
bp_pvalue <- pf(bp_fstat[1], bp_fstat[2], bp_fstat[3], lower.tail = FALSE)
cat("Breusch-Pagan F-statistic:", bp_fstat[1], "\n")
## Breusch-Pagan F-statistic: 2.330113
cat("p-value:", bp_pvalue, "\n")
## p-value: 0.05805751
#The p-value of 0.0581 is slightly greater than the typical threshold of 0.05. This suggests marginal evidence for heteroskedasticity, but the result is not strong enough to reject the null hypothesis of homoskedasticity at the 5% significance level. The evidence for heteroskedasticity is weak.
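# The LM form of the same test is available directly (a sketch, not run here):
# lmtest::bptest(ols_model)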

#(iii) White's Test for Heteroskedasticity (full version, with all squares and interactions)
vote1$prtystrA_sq <- vote1$prtystrA^2
vote1$democA_sq <- vote1$democA^2  # democA is binary, so its square duplicates democA and lm() returns NA for it
vote1$log_expendA_sq <- log(vote1$expendA)^2
vote1$log_expendB_sq <- log(vote1$expendB)^2
vote1$interaction <- log(vote1$expendA) * log(vote1$expendB)
white_model <- lm(squared_residuals ~ prtystrA + democA + log(expendA) + log(expendB) +
                  prtystrA_sq + democA_sq + log_expendA_sq + log_expendB_sq + interaction, data = vote1)
white_fstat <- summary(white_model)$fstatistic
white_pvalue <- pf(white_fstat[1], white_fstat[2], white_fstat[3], lower.tail = FALSE)
cat("White's Test F-statistic:", white_fstat[1], "\n")
## White's Test F-statistic: 2.745312
cat("p-value:", white_pvalue, "\n")
## p-value: 0.00719407
#The p-value of 0.0072 is less than 0.05, suggesting strong evidence of heteroskedasticity. This indicates that the model’s error variance is not constant across observations, and heteroskedasticity may be present.
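# For comparison, the "special case" of White's test uses only the fitted values
# and their squares (a sketch, not run here):
# y_hat <- fitted(ols_model)
# lmtest::bptest(ols_model, ~ y_hat + I(y_hat^2))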



#####Chapter8-C13
#(i) Estimate the Model
library(wooldridge)
data("fertil2")
model_1 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
summary(model_1)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16
library(sandwich)
robust_se <- sqrt(diag(vcovHC(model_1)))
summary(model_1)$coefficients[, 2]  
##  (Intercept)          age     I(age^2)         educ     electric        urban 
## 0.2401887611 0.0165082295 0.0002717715 0.0062966083 0.0690044704 0.0465062118
robust_se  
##  (Intercept)          age     I(age^2)         educ     electric        urban 
## 0.2443961935 0.0192199445 0.0003513959 0.0063159137 0.0640737262 0.0455162364
#No; robust standard errors are not always bigger. Here (vcovHC defaults to HC3) they are somewhat larger for some variables (age, age^2) and smaller for others (electric, urban).

#(ii)Add Religious Dummy Variables and Test Joint Significance
model_2 <- lm(children ~ age + I(age^2) + educ + electric + urban + spirit + protest + catholic, data = fertil2)
summary(model_2)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban + 
##     spirit + protest + catholic, data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9514 -0.7041 -0.0101  0.7213  7.4435 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.3147175  0.2433391 -17.731  < 2e-16 ***
## age          0.3418668  0.0165186  20.696  < 2e-16 ***
## I(age^2)    -0.0027596  0.0002722 -10.139  < 2e-16 ***
## educ        -0.0762130  0.0064608 -11.796  < 2e-16 ***
## electric    -0.3057189  0.0690241  -4.429 9.69e-06 ***
## urban       -0.2034131  0.0465864  -4.366 1.29e-05 ***
## spirit       0.1404603  0.0558052   2.517   0.0119 *  
## protest      0.0753961  0.0652158   1.156   0.2477    
## catholic     0.1173539  0.0834249   1.407   0.1596    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.451 on 4349 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.574,  Adjusted R-squared:  0.5733 
## F-statistic: 732.6 on 8 and 4349 DF,  p-value: < 2.2e-16
robust_se_model_2 <- sqrt(diag(vcovHC(model_2)))
joint_test <- linearHypothesis(model_2, c("spirit = 0", "protest = 0", "catholic = 0"))
robust_joint_test <- linearHypothesis(model_2, c("spirit = 0", "protest = 0", "catholic = 0"), vcov = vcovHC(model_2))
cat("Nonrobust p-value:", joint_test$`Pr(>F)`[2], "\n")
## Nonrobust p-value: 0.08640607
cat("Robust p-value:", robust_joint_test$`Pr(>F)`[2], "\n")
## Robust p-value: 0.09170157
#The nonrobust p-value (0.0864) and the robust p-value (0.0917) are very close. Both indicate that the religious dummies (spirit, protest, catholic) are jointly insignificant at the 5% level, though they are marginally significant at 10%.

#(iii)Obtain Fitted Values and Residuals, Regress Residuals on Fitted Values
fitted_values <- fitted(model_2)
residuals_values <- residuals(model_2)
# NOTE: on the right-hand side of an R formula, fitted_values^2 without I() reduces
# to fitted_values, so this regresses the squared residuals on the fitted values only
model_3 <- lm(residuals_values^2 ~ fitted_values^2)
joint_test_residuals <- summary(model_3)$fstatistic
joint_pvalue_residuals <- pf(joint_test_residuals[1], joint_test_residuals[2], joint_test_residuals[3], lower.tail = FALSE)
cat("Joint significance test p-value:", joint_pvalue_residuals, "\n")
## Joint significance test p-value: 3.906415e-235
#The regression of the squared residuals on the fitted values is overwhelmingly significant (p ≈ 3.9e-235), strong evidence of heteroskedasticity in this equation.
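# The White special case would add the squared fitted values explicitly
# (a sketch, not run here):
# model_white <- lm(residuals_values^2 ~ fitted_values + I(fitted_values^2))
# summary(model_white)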

#(iv) Heteroskedasticity does not bias the OLS coefficient estimates; it only makes OLS inefficient and invalidates the usual standard errors. Since the robust standard errors here are close to the usual ones, the practical consequences are minor.



#####Chapter9-1
library(wooldridge)
data("ceosal2")
model_1 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)
model_2 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten + ceotensq + comtensq, data = ceosal2)
anova(model_1, model_2)
## Analysis of Variance Table
## 
## Model 1: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + 
##     comten
## Model 2: log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + 
##     comten + ceotensq + comtensq
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1    171 41.856                              
## 2    169 40.438  2    1.4182 2.9634 0.05433 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The p-value of 0.05433 means the quadratics ceotensq and comtensq are not jointly significant at the 5% level, though they are borderline at 10%. There is thus only weak evidence of functional form misspecification, and the original model appears reasonably well specified.
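# Ramsey's RESET is another standard check for functional form (a sketch, not run here):
# lmtest::resettest(model_1)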



#####Chapter9-5
#The failure of colleges to report campus crimes in 1992 can likely be viewed as endogenous sample selection rather than exogenous. If colleges with higher crime rates are less likely to report, this creates a bias because the decision to report is correlated with the crime rate. Therefore, non-reporting is not random, and the sample is not exogenously selected. This would lead to biased estimates of the relationship between student enrollment and campus crimes.



#####Chapter9-C4
library(wooldridge)
data("infmrt")
str(infmrt)
## 'data.frame':    102 obs. of  12 variables:
##  $ year   : int  1987 1990 1987 1990 1987 1990 1987 1990 1987 1990 ...
##  $ infmort: num  8.3 6.2 7.8 7.1 8.5 ...
##  $ afdcprt: int  52 62 11 21 20 25 234 282 42 52 ...
##  $ popul  : int  1186 1228 1056 1109 547 563 5856 6016 986 1003 ...
##  $ pcinc  : int  13996 17125 18083 21051 14267 17630 19131 22558 15683 18771 ...
##  $ physic : int  173 178 186 200 244 253 322 337 244 254 ...
##  $ afdcper: num  4.38 5.05 1.04 1.89 3.66 ...
##  $ d90    : int  0 1 0 1 0 1 0 1 0 1 ...
##  $ lpcinc : num  9.55 9.75 9.8 9.95 9.57 ...
##  $ lphysic: num  5.15 5.18 5.23 5.3 5.5 ...
##  $ DC     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lpopul : num  7.08 7.11 6.96 7.01 6.3 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# NOTE: infmrt stacks 1987 and 1990 observations and already contains a DC
# indicator (the DC column, which flags both DC rows). Flagging row name "51"
# marks only a single observation, which may not be DC at all; infmrt$DC is the
# safer choice if the goal is a Washington, DC dummy.
infmrt$dc <- ifelse(row.names(infmrt) == "51", 1, 0)
# (i) Estimate the model including the DC dummy variable
model_with_dc <- lm(log(infmort) ~ pcinc + physic + dc, data = infmrt)
summary(model_with_dc)
## 
## Call:
## lm(formula = log(infmort) ~ pcinc + physic + dc, data = infmrt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43481 -0.10338  0.00447  0.09096  0.39050 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.456e+00  8.688e-02  28.264  < 2e-16 ***
## pcinc       -3.078e-05  6.499e-06  -4.737 7.33e-06 ***
## physic       1.495e-03  2.729e-04   5.478 3.33e-07 ***
## dc          -7.940e-02  1.664e-01  -0.477    0.634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1629 on 98 degrees of freedom
## Multiple R-squared:  0.2537, Adjusted R-squared:  0.2309 
## F-statistic:  11.1 on 3 and 98 DF,  p-value: 2.444e-06
#Estimate the model without the DC dummy variable
model_without_dc <- lm(log(infmort) ~ pcinc + physic, data = infmrt)
summary(model_without_dc)
## 
## Call:
## lm(formula = log(infmort) ~ pcinc + physic, data = infmrt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43432 -0.10235  0.00626  0.09337  0.39064 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.448e+00  8.502e-02  28.794  < 2e-16 ***
## pcinc       -3.026e-05  6.378e-06  -4.743 7.07e-06 ***
## physic       1.487e-03  2.713e-04   5.480 3.24e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1623 on 99 degrees of freedom
## Multiple R-squared:  0.252,  Adjusted R-squared:  0.2369 
## F-statistic: 16.67 on 2 and 99 DF,  p-value: 5.744e-07
#The coefficient on dc is -0.0794 with a p-value of 0.634, so the flagged observation does not significantly shift the intercept. Given the caveat above about how dc was constructed, this should not be read as evidence about Washington, DC itself.

# (ii) Compare the models using ANOVA
anova(model_without_dc, model_with_dc)
## Analysis of Variance Table
## 
## Model 1: log(infmort) ~ pcinc + physic
## Model 2: log(infmort) ~ pcinc + physic + dc
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     99 2.6064                           
## 2     98 2.6004  1 0.0060446 0.2278 0.6342
#The estimates for pcinc and physic are nearly identical across the two models. A dummy for a single observation effectively removes that observation from the fit, so it perfectly fits that point while leaving the remaining estimates essentially unchanged; here it adds nothing to the model.



#####Chapter9-C5
library(quantreg)
## Loading required package: SparseM
library(wooldridge)
data("rdchem")
#(i) Estimate OLS
ols_all <- lm(rdintens ~ lsales + profmarg, data = rdchem)  # lsales = log(sales), which damps the influence of very large firms
summary(ols_all)
## 
## Call:
## lm(formula = rdintens ~ lsales + profmarg, data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3016 -1.2707 -0.6895  0.8785  6.0369 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.47225    1.67606   0.282    0.780
## lsales       0.32135    0.21557   1.491    0.147
## profmarg     0.05004    0.04578   1.093    0.283
## 
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared:  0.09847,    Adjusted R-squared:  0.0363 
## F-statistic: 1.584 on 2 and 29 DF,  p-value: 0.2224
# NOTE: no firm in rdchem has sales of 40000 (million) or more (the largest is
# just under $40 billion), so this filter drops nothing and the "no large firm"
# results below are identical to the full-sample results.
rdchem_no_large_firm <- subset(rdchem, sales < 40000)  # sales are in millions
ols_no_large_firm <- lm(rdintens ~ lsales + profmarg, data = rdchem_no_large_firm)
summary(ols_no_large_firm)
## 
## Call:
## lm(formula = rdintens ~ lsales + profmarg, data = rdchem_no_large_firm)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3016 -1.2707 -0.6895  0.8785  6.0369 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.47225    1.67606   0.282    0.780
## lsales       0.32135    0.21557   1.491    0.147
## profmarg     0.05004    0.04578   1.093    0.283
## 
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared:  0.09847,    Adjusted R-squared:  0.0363 
## F-statistic: 1.584 on 2 and 29 DF,  p-value: 0.2224
#The OLS estimates with all 32 firms show no significant relationship between R&D intensity and the predictors: the coefficients on lsales and profmarg have p-values above 0.1. Because the sales < 40000 filter removed no observations (see the note above), the second set of estimates is identical to the first and tells us nothing yet about sensitivity to the large firm.

#(ii) Estimate LAD
lad_all <- rq(rdintens ~ lsales + profmarg, data = rdchem, tau = 0.5)  # Use log(sales) for LAD
summary(lad_all)
## 
## Call: rq(formula = rdintens ~ lsales + profmarg, tau = 0.5, data = rdchem)
## 
## tau: [1] 0.5
## 
## Coefficients:
##             coefficients lower bd upper bd
## (Intercept)  2.62121     -3.59242  3.64851
## lsales      -0.05873     -0.15063  0.73912
## profmarg     0.08641     -0.00177  0.15854
lad_no_large_firm <- rq(rdintens ~ lsales + profmarg, data = rdchem_no_large_firm, tau = 0.5)
summary(lad_no_large_firm)
## 
## Call: rq(formula = rdintens ~ lsales + profmarg, tau = 0.5, data = rdchem_no_large_firm)
## 
## tau: [1] 0.5
## 
## Coefficients:
##             coefficients lower bd upper bd
## (Intercept)  2.62121     -3.59242  3.64851
## lsales      -0.05873     -0.15063  0.73912
## profmarg     0.08641     -0.00177  0.15854
#The LAD (median regression) estimates likewise show no significant relationship between R&D intensity and the predictors. As with OLS, the "no large firm" estimates are identical to the full-sample ones because the filter did not actually drop the large firm.

#(iii) As run, the two samples are identical, so these results cannot tell us whether OLS or LAD is more resilient to outliers. The comparison requires actually excluding the largest firm (see the sketch below); as a general matter, LAD is more resistant to outliers in the dependent variable than OLS.
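# A sketch of the intended comparison, actually excluding the largest firm
# (not run here):
# rdchem_drop <- subset(rdchem, sales < max(sales))
# summary(lm(rdintens ~ lsales + profmarg, data = rdchem_drop))
# summary(rq(rdintens ~ lsales + profmarg, tau = 0.5, data = rdchem_drop))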



#####Chapter10-1
#(i) Disagree. Time series data often exhibit autocorrelation, unlike cross-sectional data, where observations are typically independent.
#(ii) Agree. Under the first three time series Gauss-Markov assumptions (linearity, no perfect collinearity, and strict exogeneity of the regressors), OLS is unbiased; neither homoskedasticity nor the absence of serial correlation is required for unbiasedness.
#(iii) Disagree. A trending variable can be used as the dependent variable provided the trend is modeled, e.g., by including a time trend as a regressor or by differencing/detrending.
#(iv) Agree. Seasonality is usually not an issue in annual time series, as it typically refers to periodic fluctuations within a year.



#####Chapter10-5
# Assuming I have a data frame `data` with the variables: housing_starts, interest_rate, income, and a time variable
# Create quarterly dummy variables
#data$Q1 <- ifelse(format(data$Date, "%m") %in% c("01", "02", "03"), 1, 0)
#data$Q2 <- ifelse(format(data$Date, "%m") %in% c("04", "05", "06"), 1, 0)
#data$Q3 <- ifelse(format(data$Date, "%m") %in% c("07", "08", "09"), 1, 0)
# Create a time trend variable (e.g., starting from 1 for the first quarter)
#data$Trend <- seq_along(data$Date)
# Estimate the model
#model <- lm(housing_starts ~ interest_rate + income + Trend + Q1 + Q2 + Q3, data = data)
# View the summary of the model
#summary(model)



#####Chapter10-C1
library(wooldridge)
data("intdef")  
intdef$after_1979 <- ifelse(intdef$year > 1979, 1, 0)
model <- lm(i3 ~ inf + rec + out + after_1979, data = intdef)
summary(model)
## 
## Call:
## lm(formula = i3 ~ inf + rec + out + after_1979, data = intdef)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2076 -1.2159  0.1902  0.8597  3.5701 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -12.0348     3.5889  -3.353  0.00151 ** 
## inf           0.5324     0.0712   7.478 9.59e-10 ***
## rec           0.2528     0.1968   1.285  0.20473    
## out           0.5172     0.1153   4.487 4.14e-05 ***
## after_1979    0.5748     0.5236   1.098  0.27747    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.531 on 51 degrees of freedom
## Multiple R-squared:  0.7358, Adjusted R-squared:  0.7151 
## F-statistic: 35.51 on 4 and 51 DF,  p-value: 3.586e-14
#The coefficient on the after_1979 dummy (0.575) is not statistically significant (p = 0.277), so there is no evidence of a shift in the intercept of the interest rate equation after the Federal Reserve's October 1979 policy change, holding inflation, receipts, and outlays fixed. Note that this specification only allows the intercept to shift; the policy change could still have altered the slopes.
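# A sketch allowing the inflation slope to shift after 1979 as well (not run here):
# model_shift <- lm(i3 ~ inf * after_1979 + rec + out, data = intdef)
# summary(model_shift)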



#####Chapter10-C9
#(i) β₁ (pcip): likely positive, since higher industrial production growth typically boosts stock returns. β₂ (i3): likely negative, since higher T-bill returns make equities relatively less attractive.
library(wooldridge)
data(volat)
model_c9 <- lm(rsp500 ~ pcip + i3, data = volat)
summary(model_c9)
## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637
#(ii) The intercept (18.84) is the predicted S&P 500 return when pcip and i3 are both zero. pcip (β₁): the coefficient (0.0364) is very small and insignificant (p = 0.7785), so industrial production growth has little explanatory power here. i3 (β₂): the coefficient (-1.362) is statistically significant (p = 0.0121), implying that a one-point increase in the T-bill return is associated with an S&P 500 return lower by about 1.36 percentage points.
#(iii) pcip: Not statistically significant (p = 0.7785). Changes in industrial production do not significantly impact S&P 500 returns in this model. i3: Statistically significant (p = 0.0121). Changes in T-bill returns have a meaningful impact on S&P 500 returns.
#(iv) Given that only i3 is statistically significant, it suggests that T-bill returns have some predictability for the S&P 500 returns. However, the model has a very low R-squared value (0.01189), meaning that only about 1.2% of the variation in S&P 500 returns is explained by the model. This indicates that while there is some statistical significance, the model is not very effective in predicting returns based on these two variables alone.



#####Chapter11-C7
library(wooldridge)
library(dynlm)
library(car)
data("consump")
summary(consump)
##       year            i3              inf             rdisp     
##  Min.   :1959   Min.   : 2.380   Min.   : 0.700   Min.   :1530  
##  1st Qu.:1968   1st Qu.: 4.070   1st Qu.: 2.800   1st Qu.:2298  
##  Median :1977   Median : 5.510   Median : 4.100   Median :3105  
##  Mean   :1977   Mean   : 6.062   Mean   : 4.638   Mean   :3154  
##  3rd Qu.:1986   3rd Qu.: 7.480   3rd Qu.: 5.800   3rd Qu.:4087  
##  Max.   :1995   Max.   :14.030   Max.   :13.500   Max.   :4946  
##                                                                 
##      rnondc           rserv             pop               y        
##  Min.   : 606.3   Min.   : 687.4   Min.   :177830   Min.   : 8604  
##  1st Qu.: 816.9   1st Qu.:1059.6   1st Qu.:200706   1st Qu.:11451  
##  Median :1010.4   Median :1518.2   Median :220239   Median :14099  
##  Mean   :1003.9   Mean   :1557.0   Mean   :220749   Mean   :13940  
##  3rd Qu.:1215.9   3rd Qu.:2041.4   3rd Qu.:240651   3rd Qu.:16983  
##  Max.   :1421.9   Max.   :2577.0   Max.   :263034   Max.   :18803  
##                                                                    
##      rcons            c               r3               lc       
##  Min.   :1294   Min.   : 7275   Min.   :-3.260   Min.   :8.892  
##  1st Qu.:1876   1st Qu.: 9349   1st Qu.: 0.450   1st Qu.:9.143  
##  Median :2529   Median :11481   Median : 1.380   Median :9.348  
##  Mean   :2561   Mean   :11329   Mean   : 1.424   Mean   :9.310  
##  3rd Qu.:3257   3rd Qu.:13535   3rd Qu.: 2.590   3rd Qu.:9.513  
##  Max.   :3999   Max.   :15203   Max.   : 5.430   Max.   :9.629  
##                                                                 
##        ly              gc                  gy                 gc_1          
##  Min.   :9.060   Min.   :-0.009107   Min.   :-0.016973   Min.   :-0.009107  
##  1st Qu.:9.346   1st Qu.: 0.012964   1st Qu.: 0.009617   1st Qu.: 0.013130  
##  Median :9.554   Median : 0.021865   Median : 0.021580   Median : 0.022531  
##  Mean   :9.515   Mean   : 0.020474   Mean   : 0.021715   Mean   : 0.020689  
##  3rd Qu.:9.740   3rd Qu.: 0.029746   3rd Qu.: 0.031721   3rd Qu.: 0.029868  
##  Max.   :9.842   Max.   : 0.040209   Max.   : 0.061985   Max.   : 0.040209  
##                  NA's   :1           NA's   :1           NA's   :2          
##       gy_1                r3_1             lc_ly            lc_ly_1       
##  Min.   :-0.016973   Min.   :-3.2600   Min.   :-0.2377   Min.   :-0.2377  
##  1st Qu.: 0.009032   1st Qu.: 0.3425   1st Qu.:-0.2138   1st Qu.:-0.2139  
##  Median : 0.021446   Median : 1.3050   Median :-0.2089   Median :-0.2087  
##  Mean   : 0.021610   Mean   : 1.3881   Mean   :-0.2048   Mean   :-0.2046  
##  3rd Qu.: 0.031736   3rd Qu.: 2.4100   3rd Qu.:-0.2000   3rd Qu.:-0.1993  
##  Max.   : 0.061985   Max.   : 5.4300   Max.   :-0.1609   Max.   :-0.1609  
##  NA's   :2           NA's   :1                           NA's   :1        
##       gc_2                gy_2                r3_2           lc_ly_2       
##  Min.   :-0.009107   Min.   :-0.016973   Min.   :-3.260   Min.   :-0.2377  
##  1st Qu.: 0.013053   1st Qu.: 0.008448   1st Qu.: 0.235   1st Qu.:-0.2140  
##  Median : 0.022565   Median : 0.021580   Median : 1.230   Median :-0.2089  
##  Mean   : 0.020850   Mean   : 0.021838   Mean   : 1.379   Mean   :-0.2047  
##  3rd Qu.: 0.029990   3rd Qu.: 0.031751   3rd Qu.: 2.470   3rd Qu.:-0.1993  
##  Max.   : 0.040209   Max.   : 0.061985   Max.   : 5.430   Max.   :-0.1609  
##  NA's   :3           NA's   :3           NA's   :2        NA's   :2
cor(consump[, c("gc", "gy", "i3", "inf")], use = "complete.obs")
##             gc         gy         i3        inf
## gc   1.0000000  0.8238203 -0.3271522 -0.4270330
## gy   0.8238203  1.0000000 -0.1993171 -0.3735170
## i3  -0.3271522 -0.1993171  1.0000000  0.7491123
## inf -0.4270330 -0.3735170  0.7491123  1.0000000
apply(consump, 2, function(x) length(unique(x)))
##    year      i3     inf   rdisp  rnondc   rserv     pop       y   rcons       c 
##      37      37      30      37      37      37      37      37      37      37 
##      r3      lc      ly      gc      gy    gc_1    gy_1    r3_1   lc_ly lc_ly_1 
##      36      37      37      37      37      36      36      36      37      37 
##    gc_2    gy_2    r3_2 lc_ly_2 
##      35      35      35      36
nrow(consump)
## [1] 37
nrow(na.omit(consump))
## [1] 34
# NOTE: scale() returns one-column matrices and consump is a plain data frame
# with no time index; as the "essentially perfect fit" warnings below show, this
# appears to break the lag construction in dynlm (L(gc, 1) ends up aligned with
# gc itself).
consump$gc <- scale(consump$gc)
consump$gy <- scale(consump$gy)
consump$i3 <- scale(consump$i3)
consump$inf <- scale(consump$inf)

# (i) Test the PIH by estimating the model
model1 <- dynlm(gc ~ L(gc, 1), data = consump)
summary(model1) 
## Warning in summary.lm(model1): essentially perfect fit: summary may be
## unreliable
## 
## Time series regression with "zoo" data:
## Start = 2, End = 37
## 
## Call:
## dynlm(formula = gc ~ L(gc, 1), data = consump)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.688e-16 -6.850e-17 -3.871e-17  5.660e-18  1.315e-15 
## 
## Coefficients:
##              Estimate Std. Error  t value Pr(>|t|)    
## (Intercept) 1.643e-32  3.975e-17 0.00e+00        1    
## L(gc, 1)    1.000e+00  4.032e-17 2.48e+16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.385e-16 on 34 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 6.152e+32 on 1 and 34 DF,  p-value: < 2.2e-16
# As estimated here, the regression is degenerate: the coefficient on L(gc, 1) is exactly 1 with R^2 = 1 and an "essentially perfect fit" warning, meaning the constructed lag coincided with gc itself. No conclusion about the PIH can be drawn from this fit; see the corrected sketch below. With a proper time-series setup, lagged consumption growth is typically found to be a significant predictor of current growth, which is evidence against the PIH (under which growth should be unpredictable).
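# A corrected sketch (not run here): pass dynlm() a genuine annual time series
# and skip the scaling, so that L(gc, 1) is a true one-year lag:
# data("consump")
# consump_ts <- ts(consump, start = 1959)
# summary(dynlm(gc ~ L(gc, 1), data = consump_ts))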

# (ii) Add new variables and test their significance
model2 <- dynlm(gc ~ L(gc, 1) + L(gy, 1) + L(i3, 1) + L(inf, 1), data = consump)
summary(model2) 
## Warning in summary.lm(model2): essentially perfect fit: summary may be
## unreliable
## 
## Time series regression with "zoo" data:
## Start = 2, End = 37
## 
## Call:
## dynlm(formula = gc ~ L(gc, 1) + L(gy, 1) + L(i3, 1) + L(inf, 
##     1), data = consump)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -3.464e-16 -9.577e-17 -2.519e-17  2.775e-17  1.198e-15 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  2.501e-18  3.983e-17  6.300e-02    0.950    
## L(gc, 1)     1.000e+00  7.449e-17  1.343e+16   <2e-16 ***
## L(gy, 1)     4.284e-17  7.323e-17  5.850e-01    0.563    
## L(i3, 1)    -2.030e-17  6.262e-17 -3.240e-01    0.748    
## L(inf, 1)   -5.549e-17  6.553e-17 -8.470e-01    0.404    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.388e-16 on 31 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.535e+32 on 4 and 31 DF,  p-value: < 2.2e-16
vif(model2) 
## Warning in summary.lm(object, ...): essentially perfect fit: summary may be
## unreliable
##  L(gc, 1)  L(gy, 1)  L(i3, 1) L(inf, 1) 
##  3.405900  3.292336  2.406735  2.587971
anova_result <- anova(model1, model2)
cat("F-statistic for joint significance:", anova_result$F[2], "\n")
## F-statistic for joint significance: 0.9759717
cat("P-value for joint significance:", anova_result$`Pr(>F)`[2], "\n")
## P-value for joint significance: 0.4166377
#Because the underlying fits are degenerate (see the warnings), these individual and joint tests should be read with caution. Even taken at face value, the added lags of gy, i3, and inf are individually insignificant at the 5% level and jointly insignificant (F = 0.98, p = 0.42), so they add essentially no predictive power beyond lagged consumption growth.

# (iii) Compare p-values for gc(t-1) in both models
p_value_model1 <- summary(model1)$coefficients[2, 4]  
## Warning in summary.lm(model1): essentially perfect fit: summary may be
## unreliable
p_value_model2 <- summary(model2)$coefficients[2, 4]  
## Warning in summary.lm(model2): essentially perfect fit: summary may be
## unreliable
cat("P-value for gc_{t-1} in Model 1:", p_value_model1, "\n")
## P-value for gc_{t-1} in Model 1: 0
cat("P-value for gc_{t-1} in Model 2:", p_value_model2, "\n")
## P-value for gc_{t-1} in Model 2: 0
#These p-values come from the degenerate fits above (the coefficient on the lag is identically 1), so they cannot be used to assess the PIH. In a correctly specified time-series regression, the significance of gc_{t-1} would be the relevant evidence on whether consumption growth is predictable.

# (iv)Reduced model with fewer predictors
model2_reduced <- dynlm(gc ~ L(gc, 1) + L(gy, 1), data = consump)
summary(model2_reduced)  
## Warning in summary.lm(model2_reduced): essentially perfect fit: summary may be
## unreliable
## 
## Time series regression with "zoo" data:
## Start = 2, End = 37
## 
## Call:
## dynlm(formula = gc ~ L(gc, 1) + L(gy, 1), data = consump)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.527e-16 -6.157e-17 -4.485e-17 -6.420e-18  1.310e-15 
## 
## Coefficients:
##              Estimate Std. Error   t value Pr(>|t|)    
## (Intercept) 1.643e-32  4.014e-17 0.000e+00    1.000    
## L(gc, 1)    1.000e+00  7.182e-17 1.392e+16   <2e-16 ***
## L(gy, 1)    4.212e-17  7.182e-17 5.860e-01    0.562    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.409e-16 on 33 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 3.016e+32 on 2 and 33 DF,  p-value: < 2.2e-16
anova_result_reduced <- anova(model1, model2_reduced)
cat("F-statistic for joint significance in reduced model:", anova_result_reduced$F[2], "\n")
## F-statistic for joint significance in reduced model: 0.3439165
cat("P-value for joint significance in reduced model:", anova_result_reduced$`Pr(>F)`[2], "\n")
## P-value for joint significance in reduced model: 0.5615676
plot(residuals(model1), main = "Residuals of Model 1")

plot(residuals(model2), main = "Residuals of Model 2")

#The F test comparing the two models (p = 0.56) shows that adding lagged income growth contributes nothing beyond lagged consumption growth. Given the degenerate fits noted above, however, these estimates as run can neither support nor refute the PIH; the analysis should be redone on a proper ts object.



#####Chapter11-C2
library(wooldridge)
data("minwage")
minwage_clean <- minwage[!is.na(minwage$gwage232) & !is.na(minwage$gmwage) & !is.na(minwage$gcpi), ]

# (i) Find the first-order autocorrelation in gwage232
acf(minwage_clean$gwage232, lag.max = 1, main = "ACF for first-order autocorrelation in gwage232")

#The first-order autocorrelation is low, indicating weak dependence in the gwage232 series. It suggests past wage growth doesn’t strongly affect current wage growth.
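# To print the value instead of plotting it (a sketch, not run here):
# acf(minwage_clean$gwage232, lag.max = 1, plot = FALSE)$acf[2]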

# (ii) Estimate the dynamic model and analyze
minwage_clean$lag_gwage232 <- c(NA, minwage_clean$gwage232[-length(minwage_clean$gwage232)])
model_ols <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi, data = minwage_clean)
summary(model_ols)  
## 
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi, data = minwage_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044642 -0.004134 -0.001312  0.004482  0.041612 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.0024003  0.0004308   5.572 3.79e-08 ***
## lag_gwage232 -0.0779092  0.0342851  -2.272  0.02341 *  
## gmwage        0.1518459  0.0096485  15.738  < 2e-16 ***
## gcpi          0.2630876  0.0824457   3.191  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007889 on 606 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2986, Adjusted R-squared:  0.2951 
## F-statistic: 85.99 on 3 and 606 DF,  p-value: < 2.2e-16
#The regression shows that an increase in the federal minimum wage (gmwage) leads to a contemporaneous increase in wage growth (gwage232), holding past wage growth and CPI constant. The effect is statistically significant.

# (iii) Add the lagged growth in employment (gemp232_t-1) to the model
minwage_clean$lag_gemp232 <- c(NA, minwage_clean$gemp232[-length(minwage_clean$gemp232)])
model_ols_with_employment <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi + lag_gemp232, data = minwage_clean)
summary(model_ols_with_employment)  
## 
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi + lag_gemp232, 
##     data = minwage_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043842 -0.004378 -0.001034  0.004321  0.042548 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.002451   0.000426   5.753  1.4e-08 ***
## lag_gwage232 -0.074546   0.033901  -2.199 0.028262 *  
## gmwage        0.152707   0.009540  16.007  < 2e-16 ***
## gcpi          0.252296   0.081544   3.094 0.002066 ** 
## lag_gemp232   0.066131   0.016962   3.899 0.000108 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007798 on 605 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3158, Adjusted R-squared:  0.3112 
## F-statistic:  69.8 on 4 and 605 DF,  p-value: < 2.2e-16
#Adding lagged employment growth (gemp232_{t-1}) yields a positive and statistically significant coefficient (0.066, p ≈ 0.0001), so past employment growth is associated with higher current wage growth.

# (iv) Compare the effect of adding lag_gwage232 and lag_gemp232 on the gmwage coefficient
model_1_coeffs <- summary(model_ols)$coefficients[, 1]
model_2_coeffs <- summary(model_ols_with_employment)$coefficients[, 1]
model_comparison <- data.frame(
  Coefficients = c("Intercept", "Lag(gwage232)", "gmwage", "gcpi", "Lag(gemp232)"),
  Model_1 = c(model_1_coeffs["(Intercept)"], model_1_coeffs["lag_gwage232"], model_1_coeffs["gmwage"], model_1_coeffs["gcpi"], NA),
  Model_2 = c(model_2_coeffs["(Intercept)"], model_2_coeffs["lag_gwage232"], model_2_coeffs["gmwage"], model_2_coeffs["gcpi"], model_2_coeffs["lag_gemp232"])
)
print(model_comparison)
##               Coefficients      Model_1      Model_2
## (Intercept)      Intercept  0.002400259  0.002450634
## lag_gwage232 Lag(gwage232) -0.077909250 -0.074546055
## gmwage              gmwage  0.151845901  0.152706968
## gcpi                  gcpi  0.263087554  0.252295742
##               Lag(gemp232)           NA  0.066130774
#Adding the lagged variables (gwage232_{t-1} and gemp232_{t-1}) barely moves the gmwage coefficient (0.1518 versus 0.1527); it remains positive and significant, so the lags do not materially alter the estimated effect of minimum wage growth on wage growth.
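
# Quick numeric check (sketch): the relative change in the gmwage
# coefficient across the two specifications is well under 1%.
(model_2_coeffs["gmwage"] - model_1_coeffs["gmwage"]) / model_1_coeffs["gmwage"]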

# (v) Regress gmwage on lag_gwage232 and lag_gemp232 and report the R-squared
model_gmwage <- lm(gmwage ~ lag_gwage232 + lag_gemp232, data = minwage_clean)
r_squared <- summary(model_gmwage)$r.squared
print(paste("R-squared for gmwage model: ", round(r_squared, 4)))
## [1] "R-squared for gmwage model:  0.0039"
#The R-squared from regressing gmwage on lagged gwage232 and gemp232 is very low (0.0039): these variables explain almost none of the variation in gmwage. This explains the result in part (iv): because gmwage is nearly uncorrelated with the added lags, including them barely changes its coefficient.
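
# The logic behind this check is omitted-variable algebra: a coefficient only
# moves when the added regressors are correlated with gmwage. A minimal
# Frisch-Waugh-Lovell sketch (assuming the fits above are still in scope):
# the gmwage coefficient equals the slope from regressing gwage232 on the
# part of gmwage left unexplained by the other regressors.
mf <- model.frame(model_ols_with_employment)   # exactly the rows used in the fit
gm_resid <- residuals(lm(gmwage ~ lag_gwage232 + gcpi + lag_gemp232, data = mf))
coef(lm(mf$gwage232 ~ gm_resid))["gm_resid"]   # matches the gmwage estimate above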



#####Chapter12-C11
library(wooldridge)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data("nyse")
nyse_lagged <- nyse %>%
  arrange(t) %>%
  mutate(
    lag_return_1 = lag(return, 1),  # Lagged return 1
    lag_return_2 = lag(return, 2)   # Lagged return 2
  ) %>%
  filter(!is.na(lag_return_1) & !is.na(lag_return_2))  

#(i) Estimate the OLS model
model_ols <- lm(return ~ lag_return_1 + lag_return_2, data = nyse_lagged)
nyse_lagged$squared_residuals <- residuals(model_ols)^2
mean_squared_residuals <- mean(nyse_lagged$squared_residuals)
min_squared_residuals <- min(nyse_lagged$squared_residuals)
max_squared_residuals <- max(nyse_lagged$squared_residuals)
cat("Mean Squared Residuals:", mean_squared_residuals, "\n")
## Mean Squared Residuals: 4.439886
cat("Min Squared Residuals:", min_squared_residuals, "\n")
## Min Squared Residuals: 9.293076e-05
cat("Max Squared Residuals:", max_squared_residuals, "\n")
## Max Squared Residuals: 233.9956
#The model was estimated using two lagged returns (lag_return_1 and lag_return_2), and the squared residuals were computed. Their summary statistics are: mean 4.44, minimum 0.000093, maximum 233.99. The wide range shows how much the squared prediction errors, a proxy for the conditional variance, fluctuate across observations.
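
# The same statistics in one call (sketch): summary() on the squared
# residuals reports the minimum, mean, and maximum along with the quartiles.
summary(nyse_lagged$squared_residuals)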

#(ii) Estimate the heteroskedasticity model 
model_hetero <- lm(squared_residuals ~ lag_return_1 + lag_return_2, data = nyse_lagged)
summary(model_hetero)
## 
## Call:
## lm(formula = squared_residuals ~ lag_return_1 + lag_return_2, 
##     data = nyse_lagged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
##  -9.867  -3.814  -1.979   0.953 224.439 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    4.7447     0.4282  11.080  < 2e-16 ***
## lag_return_1  -1.0596     0.2015  -5.260 1.93e-07 ***
## lag_return_2  -0.5179     0.2013  -2.573   0.0103 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.14 on 685 degrees of freedom
## Multiple R-squared:  0.04995,    Adjusted R-squared:  0.04717 
## F-statistic: 18.01 on 2 and 685 DF,  p-value: 2.39e-08
#The squared residuals from the OLS regression were regressed on the lagged returns (lag_return_1 and lag_return_2). The estimates are: intercept 4.7447 (the baseline variance when both lagged returns are zero); lag_return_1 -1.0596 (variance falls as the previous return rises); lag_return_2 -0.5179 (a weaker effect than lag_return_1). The R-squared is 0.04995, so the lagged returns explain only about 5% of the variation in the squared residuals, even though both coefficients are statistically significant.
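
# This regression also doubles as a Breusch-Pagan-style LM test of
# heteroskedasticity against the lagged returns (a sketch): LM = n * R^2,
# compared to a chi-squared with df equal to the number of regressors.
n <- nobs(model_hetero)
lm_stat <- n * summary(model_hetero)$r.squared
pchisq(lm_stat, df = 2, lower.tail = FALSE)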

#(iii) Plot conditional variance (squared residuals) vs lagged return_1
plot(nyse_lagged$lag_return_1, nyse_lagged$squared_residuals,
     main = "Conditional Variance vs Lagged Return", 
     xlab = "Lagged Return", ylab = "Squared Residuals (Conditional Variance)",
     pch = 16, col = "blue")
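
# Sketch: overlay the fitted variance from part (ii) as a function of
# lag_return_1, holding lag_return_2 at its sample mean (a display choice
# made here, not part of the original figure).
bh <- coef(model_hetero)
curve(bh[1] + bh[2] * x + bh[3] * mean(nyse_lagged$lag_return_2),
      col = "red", lwd = 2, add = TRUE)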

#The plot shows the squared residuals (conditional variance) against lag_return_1. The variance is smallest when lag_return_1 is close to zero; the minimum squared residual, 0.000093, occurs near lag_return_1 = 0.

# (iv) Check for Negative Variance Estimates
predicted_variance <- predict(model_hetero)
negative_variances <- sum(predicted_variance < 0)
cat("Number of Negative Variances:", negative_variances, "\n")
## Number of Negative Variances: 12
#The heteroskedasticity model produces 12 negative fitted variances. A variance cannot be negative, so the linear model for the squared residuals is misspecified as a variance model and does not fully capture the volatility in the data.
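
# One standard remedy (a sketch, not part of the original exercise): model
# log(u^2) instead, so the implied variance exp(fitted) is positive by
# construction.
model_log_var <- lm(log(squared_residuals) ~ lag_return_1 + lag_return_2,
                    data = nyse_lagged)
h_hat <- exp(fitted(model_log_var))  # strictly positive variance estimates
sum(h_hat <= 0)                      # 0 by construction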

#(v) Compare with ARCH(1) Model
library(rugarch)
## Loading required package: parallel
## 
## Attaching package: 'rugarch'
## The following object is masked from 'package:stats':
## 
##     sigma
spec_1 <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(1, 0)),
                     mean.model = list(armaOrder = c(0, 0)))
arch_model_1 <- ugarchfit(spec_1, nyse_lagged$return)
cat("ARCH(1) Model AIC:", infocriteria(arch_model_1)[1], "\n")
## ARCH(1) Model AIC: 4.249404
cat("ARCH(1) Model BIC:", infocriteria(arch_model_1)[2], "\n")
## ARCH(1) Model BIC: 4.269173
#An ARCH(1) model was fit, modeling the conditional variance as a function of the previous period's squared shock. Its AIC (4.249) and BIC (4.269) come from a likelihood for the returns themselves, so they are not directly comparable to the OLS fit on squared residuals in part (ii); they mainly serve as the baseline for the ARCH(2) comparison in part (vi).
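
# For comparison with rugarch, the textbook two-step ARCH(1) check (sketch):
# regress the squared residuals on their own first lag; a significant slope
# signals ARCH effects.
nyse_lagged$lag_sqres <- c(NA, head(nyse_lagged$squared_residuals, -1))
summary(lm(squared_residuals ~ lag_sqres, data = nyse_lagged))$coefficients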

# (vi) Add Second Lag to ARCH Model and Compare
spec_2 <- ugarchspec(variance.model = list(model = "sGARCH", garchOrder = c(2, 0)),
                     mean.model = list(armaOrder = c(0, 0)))
arch_model_2 <- ugarchfit(spec_2, nyse_lagged$return)
cat("ARCH(2) Model AIC:", infocriteria(arch_model_2)[1], "\n")
## ARCH(2) Model AIC: 4.246008
cat("ARCH(2) Model BIC:", infocriteria(arch_model_2)[2], "\n")
## ARCH(2) Model BIC: 4.272367
if (infocriteria(arch_model_2)[1] < infocriteria(arch_model_1)[1]) {
  cat("ARCH(2) fits better based on AIC\n")
} else {
  cat("ARCH(1) fits better based on AIC\n")
}
## ARCH(2) fits better based on AIC
if (infocriteria(arch_model_2)[2] < infocriteria(arch_model_1)[2]) {
  cat("ARCH(2) fits better based on BIC\n")
} else {
  cat("ARCH(1) fits better based on BIC\n")
}
## ARCH(1) fits better based on BIC
cat("ARCH(2) Coefficients:\n")
## ARCH(2) Coefficients:
print(coef(arch_model_2))
##         mu      omega     alpha1     alpha2 
## 0.25399854 2.83334375 0.29147529 0.07244814
#A second lag was added (ARCH(2)) to check whether it improves the fit. AIC: ARCH(2) is slightly better than ARCH(1) (4.246 versus 4.249). BIC: ARCH(1) is preferred (4.269 versus 4.272), so the additional lag does not justify the extra parameter once model complexity is penalized.