final

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

## Chapter 7
# Exercise 1
# Load libraries
library(wooldridge)
library(broom)
library(car)

## Loading required package: carData

# Load the dataset
data("sleep75")

# Fit the regression model
model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)

# Summary of the model
summary(model)

## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## I(age^2)       0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16

#(i)
# Check the coefficient for male
coeftest <- summary(model)$coefficients
coeftest["male", ]

##    Estimate  Std. Error     t value    Pr(>|t|) 
## 87.75242861 34.32616153  2.55642999  0.01078518

#(ii)
# Check the coefficient for totwrk
coeftest["totwrk", ]

##      Estimate    Std. Error       t value      Pr(>|t|) 
## -1.634232e-01  1.813210e-02 -9.012918e+00  1.891272e-18

#(iii)
# Run an F-test for age and age^2 jointly
linearHypothesis(model, c("age = 0", "I(age^2) = 0"))

## 
## Linear hypothesis test:
## age = 0
## I(age^2) = 0
## 
## Model 1: restricted model
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
## 
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1    702 122631662                           
## 2    700 122147777  2    483885 1.3865 0.2506

# Exercise 3
data("gpa2")
# Fit the regression model
model <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)

# Display summary
summary(model)

## 
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black, 
##     data = gpa2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1028.0972     6.2902 163.445  < 2e-16 ***
## hsize          19.2971     3.8323   5.035 4.97e-07 ***
## I(hsize^2)     -2.1948     0.5272  -4.163 3.20e-05 ***
## female        -45.0915     4.2911 -10.508  < 2e-16 ***
## black        -169.8126    12.7131 -13.357  < 2e-16 ***
## female:black   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16

#(i)
# Test if hsize^2 is significant
coeftest <- summary(model)$coefficients
coeftest["I(hsize^2)", ]

##      Estimate    Std. Error       t value      Pr(>|t|) 
## -2.194828e+00  5.271635e-01 -4.163468e+00  3.198417e-05

# Calculate optimal high school size
optimal_hsize <- -coef(model)["hsize"] / (2 * coef(model)["I(hsize^2)"])
optimal_hsize

##   hsize 
## 4.39603

#(ii)
# Coefficient for female
coeftest["female", ]

##      Estimate    Std. Error       t value      Pr(>|t|) 
## -4.509145e+01  4.291063e+00 -1.050822e+01  1.656301e-25

#(iii)
# Coefficient for black
coeftest["black", ]

##      Estimate    Std. Error       t value      Pr(>|t|) 
## -1.698126e+02  1.271315e+01 -1.335725e+01  7.140387e-40

linearHypothesis(model, "black = 0")

## 
## Linear hypothesis test:
## black = 0
## 
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
## 
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4132 76652652                                  
## 2   4131 73479121  1   3173531 178.42 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#(iv)
# Calculate difference
diff_black_female <- coef(model)["black"] + coef(model)["female:black"]
diff_black_female

##     black 
## -107.5063

linearHypothesis(model, c("black + female:black = 0"))

## 
## Linear hypothesis test:
## black  + female:black = 0
## 
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
## 
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4132 74700518                                  
## 2   4131 73479121  1   1221397 68.667 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Exercise C1
data("gpa1")
#(i)
# Model with mothcoll and fathcoll
model1 <- lm(colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC, data = gpa1)

# Display summary
summary(model1)

## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC, 
##     data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78149 -0.25726 -0.02121  0.24691  0.74432 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.255554   0.335392   3.744 0.000268 ***
## hsGPA        0.450220   0.094280   4.775 4.61e-06 ***
## ACT          0.007724   0.010678   0.723 0.470688    
## mothcoll    -0.003758   0.060270  -0.062 0.950376    
## fathcoll     0.041800   0.061270   0.682 0.496265    
## PC           0.151854   0.058716   2.586 0.010762 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.1934 
## F-statistic: 7.713 on 5 and 135 DF,  p-value: 2.083e-06

#(ii)
# Joint significance test
linearHypothesis(model1, c("mothcoll = 0", "fathcoll = 0"))

## 
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
## 
## Model 1: restricted model
## Model 2: colGPA ~ hsGPA + ACT + mothcoll + fathcoll + PC
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    137 15.149                           
## 2    135 15.094  2  0.054685 0.2446 0.7834

#(iii)
# Model with hsGPA squared
model2 <- lm(colGPA ~ hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll + PC, data = gpa1)

# Display summary
summary(model2)

## 
## Call:
## lm(formula = colGPA ~ hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll + 
##     PC, data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78998 -0.24327 -0.00648  0.26179  0.72231 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  5.040328   2.443038   2.063   0.0410 *
## hsGPA       -1.802520   1.443552  -1.249   0.2140  
## I(hsGPA^2)   0.337341   0.215711   1.564   0.1202  
## ACT          0.004786   0.010786   0.444   0.6580  
## mothcoll     0.003091   0.060110   0.051   0.9591  
## fathcoll     0.062761   0.062401   1.006   0.3163  
## PC           0.140446   0.058858   2.386   0.0184 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared:  0.2361, Adjusted R-squared:  0.2019 
## F-statistic: 6.904 on 6 and 134 DF,  p-value: 2.088e-06

# Exercise C2
data("wage2")
#(i)
model1 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model1)

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16

#(ii)
model2 <- lm(log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + married + black + south + urban, data = wage2)
summary(model2)

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure + 
##     I(tenure^2) + married + black + south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98236 -0.21972 -0.00036  0.24078  1.25127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3586756  0.1259143  42.558  < 2e-16 ***
## educ         0.0642761  0.0063115  10.184  < 2e-16 ***
## exper        0.0172146  0.0126138   1.365 0.172665    
## I(exper^2)  -0.0001138  0.0005319  -0.214 0.830622    
## tenure       0.0249291  0.0081297   3.066 0.002229 ** 
## I(tenure^2) -0.0007964  0.0004710  -1.691 0.091188 .  
## married      0.1985470  0.0391103   5.077 4.65e-07 ***
## black       -0.1906636  0.0377011  -5.057 5.13e-07 ***
## south       -0.0912153  0.0262356  -3.477 0.000531 ***
## urban        0.1854241  0.0269585   6.878 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.2477 
## F-statistic: 35.17 on 9 and 925 DF,  p-value: < 2.2e-16

anova(model1, model2) # F-test for joint significance

## Analysis of Variance Table
## 
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south + 
##     urban
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + 
##     married + black + south + urban
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    925 123.42  2   0.39756 1.4898  0.226

#(iii)
model3 <- lm(log(wage) ~ educ * black + exper + tenure + married + south + urban, data = wage2)
summary(model3)

## 
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## black        0.094809   0.255399   0.371 0.710561    
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16

#(iv)
wage2$group <- interaction(wage2$married, wage2$black)

model4 <- lm(log(wage) ~ group + educ + exper + tenure + south + urban, data = wage2)
summary(model4)

## 
## Call:
## lm(formula = log(wage) ~ group + educ + exper + tenure + south + 
##     urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98013 -0.21780  0.01057  0.24219  1.22889 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.403793   0.114122  47.351  < 2e-16 ***
## group1.0     0.188915   0.042878   4.406 1.18e-05 ***
## group0.1    -0.240820   0.096023  -2.508 0.012314 *  
## group1.1     0.009448   0.056013   0.169 0.866083    
## educ         0.065475   0.006253  10.471  < 2e-16 ***
## exper        0.014146   0.003191   4.433 1.04e-05 ***
## tenure       0.011663   0.002458   4.745 2.41e-06 ***
## south       -0.091989   0.026321  -3.495 0.000497 ***
## urban        0.184350   0.026978   6.833 1.50e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared:  0.2528, Adjusted R-squared:  0.2464 
## F-statistic: 39.17 on 8 and 926 DF,  p-value: < 2.2e-16

## Chapter 8

# Exercise 1
#Statements (ii) and (iii) are true and consequences of heteroskedasticity

# Exercise 5

library(sandwich)
library(lmtest)# Load the dataset

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

data(smoke)

# Create a binary smoking variable
smoke$smokes <- ifelse(smoke$cigs > 0, 1, 0)

# Estimate the model
model <- lm(smokes ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model)

## 
## Call:
## lm(formula = smokes ~ log(cigpric) + log(income) + educ + age + 
##     I(age^2) + restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6715 -0.3958 -0.2724  0.5344  0.9340 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.6560624  0.8549619   0.767 0.443095    
## log(cigpric) -0.0689259  0.2041088  -0.338 0.735684    
## log(income)   0.0121620  0.0257244   0.473 0.636498    
## educ         -0.0288802  0.0059008  -4.894 1.19e-06 ***
## age           0.0198158  0.0056660   3.497 0.000496 ***
## I(age^2)     -0.0002598  0.0000617  -4.210 2.84e-05 ***
## restaurn     -0.1007616  0.0394431  -2.555 0.010815 *  
## white        -0.0256801  0.0515172  -0.498 0.618286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4734 on 799 degrees of freedom
## Multiple R-squared:  0.062,  Adjusted R-squared:  0.05378 
## F-statistic: 7.544 on 7 and 799 DF,  p-value: 8.035e-09

# Robust standard errors
robust_se <- coeftest(model, vcov = vcovHC(model, type = "HC1"))
print(robust_se)

## 
## t test of coefficients:
## 
##                 Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)   6.5606e-01  8.6408e-01  0.7593 0.4479200    
## log(cigpric) -6.8926e-02  2.0803e-01 -0.3313 0.7404890    
## log(income)   1.2162e-02  2.5823e-02  0.4710 0.6377879    
## educ         -2.8880e-02  5.6421e-03 -5.1187 3.857e-07 ***
## age           1.9816e-02  5.4115e-03  3.6618 0.0002669 ***
## I(age^2)     -2.5979e-04  5.7133e-05 -4.5470 6.284e-06 ***
## restaurn     -1.0076e-01  3.7690e-02 -2.6734 0.0076615 ** 
## white        -2.5680e-02  5.0690e-02 -0.5066 0.6125706    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# (ii) Effect of 4 additional years of education
change_educ <- coef(model)["educ"] * 4
print(paste("Change in smoking probability with 4 more years of education:", change_educ))

## [1] "Change in smoking probability with 4 more years of education: -0.115520886961172"

# (iii) Age that reduces smoking probability
age_vertex <- -coef(model)["age"] / (2 * coef(model)["I(age^2)"])
print(paste("Age at which smoking probability starts decreasing:", age_vertex))

## [1] "Age at which smoking probability starts decreasing: 38.1386089151421"

# (iv) Interpretation of restaurn coefficient
restaurn_coef <- coef(model)["restaurn"]
print(paste("The coefficient on restaurn is:", restaurn_coef))

## [1] "The coefficient on restaurn is: -0.100761630797458"

print("Interpretation: Living in a state with restaurant smoking restrictions reduces the probability of smoking by approximately this value (in percentage points).")

## [1] "Interpretation: Living in a state with restaurant smoking restrictions reduces the probability of smoking by approximately this value (in percentage points)."

# (v) Predicted probability for individual
predicted <- coef(model)["(Intercept)"] +
  coef(model)["log(cigpric)"] * log(67.44) +
  coef(model)["log(income)"] * log(6500) +
  coef(model)["educ"] * 16 +
  coef(model)["age"] * 77 +
  coef(model)["I(age^2)"] * (77^2) +
  coef(model)["restaurn"] * 0 +
  coef(model)["white"] * 0

print(paste("Predicted probability of smoking for individual 206:", predicted))

## [1] "Predicted probability of smoking for individual 206: -0.00396546766766326"

# Exercise C4

# Load the VOTE1 dataset
data("vote1")

#(i)
# Run the OLS regression
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

# Get residuals
vote1$residuals <- resid(model)

# Regress residuals on independent variables
resid_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

# Display R-squared
summary(resid_model)$r.squared

## [1] 1.198131e-32

#(ii)
# Perform Breusch-Pagan test
bptest(model)

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 9.0934, df = 4, p-value = 0.05881

#(iii)
# Perform White test using auxiliary regression with squared terms
white_test <- bptest(model, ~ prtystrA + democA + log(expendA) + log(expendB) + 
                       I(prtystrA^2) + I(democA^2) + I(log(expendA)^2) + I(log(expendB)^2), 
                     data = vote1)

# Display test results
white_test

## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 19.581, df = 7, p-value = 0.00655

# Exercise C13

data("fertil2")

#(i)
# Estimate the model
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)

# Summary with usual standard errors
summary(model)

## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16

# Heteroskedasticity-robust standard errors
coeftest(model, vcov = vcovHC(model, type = "HC1"))

## 
## t test of coefficients:
## 
##                Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept) -4.22251623  0.24385099 -17.3160 < 2.2e-16 ***
## age          0.34092552  0.01917466  17.7800 < 2.2e-16 ***
## I(age^2)    -0.00274121  0.00035051  -7.8206 6.549e-15 ***
## educ        -0.07523232  0.00630771 -11.9270 < 2.2e-16 ***
## electric    -0.31004041  0.06394815  -4.8483 1.289e-06 ***
## urban       -0.20003386  0.04547093  -4.3992 1.113e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#(ii)
# Extended model with religious dummies
model2 <- lm(children ~ age + I(age^2) + educ + electric + urban + catholic + protest, data = fertil2)

# Joint significance test
linearHypothesis(model2, c("catholic = 0", "protest = 0"), vcov = vcovHC(model2, type = "HC1"))

## 
## Linear hypothesis test:
## catholic = 0
## protest = 0
## 
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + catholic + 
##     protest
## 
## Note: Coefficient covariance matrix supplied.
## 
##   Res.Df Df      F Pr(>F)
## 1   4352                 
## 2   4350  2 0.1394 0.8699

#(iii)
# Obtain fitted values and residuals
fitted_vals <- fitted(model2)
residuals_sq <- resid(model2)^2

# Regression of squared residuals on fitted values
het_test <- lm(residuals_sq ~ fitted_vals + I(fitted_vals^2))

# Test joint significance
summary(het_test)

## 
## Call:
## lm(formula = residuals_sq ~ fitted_vals + I(fitted_vals^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.747 -1.278 -0.300  0.339 47.684 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.31003    0.11168   2.776  0.00552 ** 
## fitted_vals      -0.14880    0.10204  -1.458  0.14483    
## I(fitted_vals^2)  0.26755    0.01965  13.614  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.642 on 4355 degrees of freedom
## Multiple R-squared:  0.2499, Adjusted R-squared:  0.2495 
## F-statistic: 725.4 on 2 and 4355 DF,  p-value: < 2.2e-16

linearHypothesis(het_test, c("fitted_vals = 0", "I(fitted_vals^2) = 0"))

## 
## Linear hypothesis test:
## fitted_vals = 0
## I(fitted_vals^2) = 0
## 
## Model 1: restricted model
## Model 2: residuals_sq ~ fitted_vals + I(fitted_vals^2)
## 
##   Res.Df   RSS Df Sum of Sq      F    Pr(>F)    
## 1   4357 77019                                  
## 2   4355 57774  2     19246 725.37 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#(iv)
#Yes, A significant portion of the error variance is systematically related to the fitted valuesA significant portion of the error variance is systematically related to the fitted values.

## Chapter 9

#Exercise 1
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Exercise 1
# Step 1: Load the CEOSAL2 dataset
data("ceosal2")

# Step 2: Estimate the base model
base_model <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = ceosal2)
summary(base_model)

## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + comten, data = ceosal2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5436 -0.2796 -0.0164  0.2857  1.9879 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.571977   0.253466  18.038  < 2e-16 ***
## log(sales)   0.187787   0.040003   4.694 5.46e-06 ***
## log(mktval)  0.099872   0.049214   2.029  0.04397 *  
## profmarg    -0.002211   0.002105  -1.050  0.29514    
## ceoten       0.017104   0.005540   3.087  0.00236 ** 
## comten      -0.009238   0.003337  -2.768  0.00626 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared:  0.3525, Adjusted R-squared:  0.3336 
## F-statistic: 18.62 on 5 and 171 DF,  p-value: 9.488e-15

# Extract R-squared for the base model
R2_base <- summary(base_model)$r.squared
cat("R-squared for the base model: ", R2_base, "\n")

## R-squared for the base model:  0.3525374

# Step 3: Add ceoten^2 and comten^2 and estimate the updated model
CEOSAL2 <- ceosal2 %>%
  mutate(ceoten_sq = ceoten^2, comten_sq = comten^2)

updated_model <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten + ceoten_sq + comten_sq, data = CEOSAL2)
summary(updated_model)

## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + comten + ceoten_sq + comten_sq, data = CEOSAL2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47661 -0.26615 -0.03878  0.27787  1.89344 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.424e+00  2.656e-01  16.655  < 2e-16 ***
## log(sales)   1.857e-01  3.980e-02   4.665 6.25e-06 ***
## log(mktval)  1.018e-01  4.872e-02   2.089 0.038228 *  
## profmarg    -2.575e-03  2.088e-03  -1.233 0.219137    
## ceoten       4.772e-02  1.416e-02   3.371 0.000929 ***
## comten      -6.063e-03  1.189e-02  -0.510 0.610815    
## ceoten_sq   -1.119e-03  4.814e-04  -2.324 0.021329 *  
## comten_sq   -5.389e-05  2.528e-04  -0.213 0.831474    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4892 on 169 degrees of freedom
## Multiple R-squared:  0.3745, Adjusted R-squared:  0.3486 
## F-statistic: 14.45 on 7 and 169 DF,  p-value: 1.136e-14

# Extract R-squared for the updated model
R2_updated <- summary(updated_model)$r.squared
cat("R-squared for the updated model: ", R2_updated, "\n")

## R-squared for the updated model:  0.3744746

# Step 4: Compare R-squared values
if (R2_updated > R2_base) {
  cat("The R-squared increased after adding ceoten^2 and comten^2, suggesting a possible functional form misspecification in the base model.\n")
} else {
  cat("No evidence of functional form misspecification.\n")
}

## The R-squared increased after adding ceoten^2 and comten^2, suggesting a possible functional form misspecification in the base model.

# Exercise 5
data("crime4")

missing_crimes <- crime4 %>%
  filter(is.na(crmrte)) 

cat("Number of colleges missing crime data: ", nrow(missing_crimes), "\n")

## Number of colleges missing crime data:  0

# Exercise C4
# Part (i): Reestimate the model including the DC dummy
model_with_dc <- lm(infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
summary(model_with_dc)

## 
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6894 -0.8973 -0.1412  0.7054  3.1705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.2125     6.7267   5.383 5.09e-07 ***
## lpcinc       -2.3964     0.8866  -2.703  0.00812 ** 
## lphysic      -1.5548     0.7734  -2.010  0.04718 *  
## lpopul        0.5755     0.1365   4.215 5.60e-05 ***
## DC           13.9632     1.2466  11.201  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.255 on 97 degrees of freedom
## Multiple R-squared:  0.6432, Adjusted R-squared:  0.6285 
## F-statistic: 43.72 on 4 and 97 DF,  p-value: < 2.2e-16

# Extract and interpret the coefficient for DC
dc_coeff <- summary(model_with_dc)$coefficients["DC", ]
cat("Coefficient on DC:", dc_coeff[1], "\n")

## Coefficient on DC: 13.96317

cat("Standard error for DC:", dc_coeff[2], "\n")

## Standard error for DC: 1.246646

cat("P-value for DC:", dc_coeff[4], "\n")

## P-value for DC: 3.514299e-19

# Part (ii): Reestimate the model without the DC dummy
model_without_dc <- lm(infmort ~ lpcinc + lphysic + lpopul, data = infmrt)
summary(model_without_dc)

## 
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul, data = infmrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0204 -1.1897 -0.1152  0.9083  8.1890 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.22614   10.13465   3.574 0.000547 ***
## lpcinc      -4.88415    1.29319  -3.777 0.000273 ***
## lphysic      4.02783    0.89101   4.521 1.73e-05 ***
## lpopul      -0.05356    0.18748  -0.286 0.775712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.891 on 98 degrees of freedom
## Multiple R-squared:  0.1818, Adjusted R-squared:  0.1568 
## F-statistic:  7.26 on 3 and 98 DF,  p-value: 0.0001898

# Extract coefficients and standard errors from both models
coeff_without_dc <- coef(summary(model_without_dc))[, c(1, 2)]  # Without DC
coeff_with_dc <- coef(summary(model_with_dc))[, c(1, 2)]        # With DC

# Align the rows by variable names
comparison <- merge(
  as.data.frame(coeff_without_dc, row.names = rownames(coeff_without_dc)),
  as.data.frame(coeff_with_dc, row.names = rownames(coeff_with_dc)),
  by = "row.names",
  all = TRUE
)

# Rename columns for clarity
colnames(comparison) <- c("Variable", "Estimate (Without DC)", "Std. Error (Without DC)", 
                          "Estimate (With DC)", "Std. Error (With DC)")

# Display the comparison table
print(comparison)

##      Variable Estimate (Without DC) Std. Error (Without DC) Estimate (With DC)
## 1 (Intercept)           36.22614485              10.1346544         36.2125046
## 2          DC                    NA                      NA         13.9631727
## 3      lpcinc           -4.88414573               1.2931923         -2.3963515
## 4     lphysic            4.02783276               0.8910099         -1.5547710
## 5      lpopul           -0.05356253               0.1874791          0.5755155
##   Std. Error (With DC)
## 1            6.7267094
## 2            1.2466463
## 3            0.8866076
## 4            0.7734138
## 5            0.1365241

# Exercise C5
library(dplyr)
library(quantreg)

## Loading required package: SparseM

# Load the RDCHEM dataset
data("rdchem")

# Inspect the dataset
summary(rdchem)

##        rd              sales            profits           rdintens    
##  Min.   :   1.70   Min.   :   42.0   Min.   :  -4.30   Min.   :1.027  
##  1st Qu.:  10.85   1st Qu.:  507.8   1st Qu.:  42.12   1st Qu.:2.031  
##  Median :  42.60   Median : 1332.5   Median : 111.70   Median :2.641  
##  Mean   : 153.68   Mean   : 3797.0   Mean   : 370.50   Mean   :3.266  
##  3rd Qu.:  78.85   3rd Qu.: 2856.6   3rd Qu.: 295.68   3rd Qu.:3.965  
##  Max.   :1428.00   Max.   :39709.0   Max.   :4154.00   Max.   :9.422  
##     profmarg         salessq              lsales            lrd        
##  Min.   :-3.219   Min.   :1.764e+03   Min.   : 3.738   Min.   :0.5306  
##  1st Qu.: 4.074   1st Qu.:2.579e+05   1st Qu.: 6.230   1st Qu.:2.3839  
##  Median : 8.760   Median :1.790e+06   Median : 7.191   Median :3.7498  
##  Mean   : 9.823   Mean   :7.020e+07   Mean   : 7.165   Mean   :3.6028  
##  3rd Qu.:16.456   3rd Qu.:8.162e+06   3rd Qu.: 7.957   3rd Qu.:4.3631  
##  Max.   :27.187   Max.   :1.577e+09   Max.   :10.589   Max.   :7.2640

head(rdchem)

##      rd  sales profits rdintens  profmarg     salessq   lsales       lrd
## 1 430.6 4570.2   186.9 9.421906  4.089536 20886730.00 8.427312 6.0651798
## 2  59.0 2830.0   467.0 2.084806 16.501766  8008900.00 7.948032 4.0775375
## 3  23.5  596.8   107.4 3.937668 17.995979   356170.22 6.391582 3.1570003
## 4   3.5  133.6    -4.3 2.619760 -3.218563    17848.96 4.894850 1.2527629
## 5   1.7   42.0     8.0 4.047619 19.047619     1764.00 3.737670 0.5306283
## 6   8.4  390.0    47.3 2.153846 12.128205   152100.00 5.966147 2.1282318

# Convert sales to billions
rdchem <- rdchem %>%
  mutate(sales_bil = sales / 1000,
         sales_bil_sq = sales_bil^2)
#(i)
# OLS with all data
ols_full <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem)
summary(ols_full)

## 
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   2.058967   0.626269   3.288  0.00272 **
## sales_bil     0.316611   0.138854   2.280  0.03041 * 
## sales_bil_sq -0.007390   0.003716  -1.989  0.05657 . 
## profmarg      0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107

# Identify the firm with the largest sales
largest_firm <- rdchem %>% filter(sales_bil == max(sales_bil))

# Remove the firm with the largest sales
rdchem_no_outlier <- rdchem %>% filter(sales_bil != max(sales_bil))

# OLS without the largest firm
ols_no_outlier <- lm(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem_no_outlier)
summary(ols_no_outlier)

## 
## Call:
## lm(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     data = rdchem_no_outlier)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0843 -1.1354 -0.5505  0.7570  5.7783 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)   1.98352    0.71763   2.764   0.0102 *
## sales_bil     0.36062    0.23887   1.510   0.1427  
## sales_bil_sq -0.01025    0.01308  -0.784   0.4401  
## profmarg      0.05528    0.04579   1.207   0.2378  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.805 on 27 degrees of freedom
## Multiple R-squared:  0.1912, Adjusted R-squared:  0.1013 
## F-statistic: 2.128 on 3 and 27 DF,  p-value: 0.1201

#(ii)
# LAD with all data
lad_full <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem, tau = 0.5)
summary(lad_full)

## 
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     tau = 0.5, data = rdchem)
## 
## tau: [1] 0.5
## 
## Coefficients:
##              coefficients lower bd upper bd
## (Intercept)   1.40428      0.87031  2.66628
## sales_bil     0.26346     -0.13508  0.75753
## sales_bil_sq -0.00600     -0.01679  0.00344
## profmarg      0.11400      0.01376  0.16427

# LAD without the largest firm
lad_no_outlier <- rq(rdintens ~ sales_bil + sales_bil_sq + profmarg, data = rdchem_no_outlier, tau = 0.5)
summary(lad_no_outlier)

## 
## Call: rq(formula = rdintens ~ sales_bil + sales_bil_sq + profmarg, 
##     tau = 0.5, data = rdchem_no_outlier)
## 
## tau: [1] 0.5
## 
## Coefficients:
##              coefficients lower bd upper bd
## (Intercept)   2.61047      0.58936  2.81404
## sales_bil    -0.22364     -0.23542  0.87607
## sales_bil_sq  0.01681     -0.03201  0.02883
## profmarg      0.07594      0.00578  0.16392

#(iii)
cat("LAD is more resilient to outliers than OLS as it shows smaller variations after removing the largest firm.")

## LAD is more resilient to outliers than OLS as it shows smaller variations after removing the largest firm.

## Chapter 10

# Exercise 1
# (i) Like cross-sectional observations, we can assume that most time series observations are independently distributed.
# Disagree.
# Time series data often exhibit autocorrelation, meaning current observations are correlated with past observations.
# Unlike cross-sectional data, the temporal order introduces dependence.

# (ii) The OLS estimator in a time series regression is unbiased under the first three Gauss-Markov assumptions.
# Agree.
# The first three Gauss-Markov assumptions (linearity, no multicollinearity, zero conditional mean of errors)
# ensure that the OLS estimator remains unbiased in time series regression.

# (iii) A trending variable cannot be used as the dependent variable in multiple regression analysis.
# Disagree.
# Trending variables can be used as dependent variables if the trend is explicitly modeled,
# such as including a time trend variable in the regression.

# (iv) Seasonality is not an issue when using annual time series observations.
# Agree.
# Seasonality is generally a concern in higher-frequency data (e.g., monthly or quarterly).
# Annual data averages out seasonal effects, reducing their impact.

# Exercise 5
# Specify a model that accounts for trends and seasonality:

cat("housing_model <- lm(housing_starts ~ time + interest_rate + income + Q1 + Q2 + Q3, data = housing_data)")

## housing_model <- lm(housing_starts ~ time + interest_rate + income + Q1 + Q2 + Q3, data = housing_data)

# Explanation:
# - time: Captures overall trend
# - interest_rate: Reflects interest rate effects
# - income: Reflects income effects
# - Q1, Q2, Q3: Seasonal dummies for the first three quarters (Q4 as the reference)
# - housing_starts: Dependent variable representing quarterly housing starts

# Exercise C1

# Create a dummy variable for years after 1979
intdef$after1979 <- ifelse(intdef$year > 1979, 1, 0)

# Fit the regression model
interest_model <- lm(i3 ~ inf + after1979, data = intdef)

# Display the summary of the model
summary(interest_model)

## 
## Call:
## lm(formula = i3 ~ inf + after1979, data = intdef)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5906 -0.7579  0.1414  1.1752  3.8358 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.52833    0.44914   3.403  0.00128 ** 
## inf          0.62991    0.08151   7.728 3.05e-10 ***
## after1979    2.17781    0.49630   4.388 5.48e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.837 on 53 degrees of freedom
## Multiple R-squared:  0.6047, Adjusted R-squared:  0.5898 
## F-statistic: 40.53 on 2 and 53 DF,  p-value: 2.086e-11

# Exercise C9
# (i) Expected Signs:
# β1 (pcip): Positive. Higher industrial production growth often indicates economic growth, boosting stock returns.
# β2 (i3): Negative. Higher T-bill returns may lead investors to prefer risk-free assets over stocks.

# (ii) Estimate the equation by OLS
volat_model <- lm(rsp500 ~ pcip + i3, data = volat)
summary(volat_model)

## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637

# (iii) Statistical Significance
# pcip: Not statistically significant (p = 0.7785)
# i3: Statistically significant (p = 0.0121)

# Interpretation
# A one-unit increase in pcip has a very small and statistically insignificant effect on rsp500.
# A one-unit increase in i3 reduces rsp500 by approximately 1.36 points, and this relationship is statistically significant.

# (iv) Implication for Predictability
# i3 is significant, suggesting some predictability from short-term interest rates.
# pcip is insignificant, suggesting it does not add predictive power.

# Efficient Market Hypothesis (EMH)
# Low R-squared (0.01189) indicates most variation in rsp500 is unexplained, 
# aligning with the EMH's assertion of largely unpredictable stock returns.

## Chapter 11
# Exercise C7

data("consump", package = "wooldridge")

# Part (i): Test the PIH
# Create the growth variable gct and lagged growth variable gct_lag
consump$gct <- c(NA, diff(log(consump$c)))
consump$gct_lag <- c(NA, head(consump$gct, -1))

# Fit the regression model for gct ~ gct_lag
model1 <- lm(gct ~ gct_lag, data = consump, na.action = na.omit)
summary(model1)

## 
## Call:
## lm(formula = gct ~ gct_lag, data = consump, na.action = na.omit)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027878 -0.005974 -0.001451  0.007142  0.020227 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 0.011431   0.003778   3.026  0.00478 **
## gct_lag     0.446141   0.156047   2.859  0.00731 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01161 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1985, Adjusted R-squared:  0.1742 
## F-statistic: 8.174 on 1 and 33 DF,  p-value: 0.00731

# Part (ii): Add additional variables and test their significance
# Create additional variables
consump$gy <- c(NA, diff(log(consump$y)))  # Growth rate of income
consump$gy_lag <- c(NA, head(consump$gy, -1))  # Lagged growth rate of income
consump$i3_lag <- c(NA, head(consump$i3, -1))  # Lagged T-bill rate
consump$inf_lag <- c(NA, head(consump$inf, -1))  # Lagged inflation rate

# Fit the regression model with additional variables
model2 <- lm(gct ~ gct_lag + gy_lag + i3_lag + inf_lag, data = consump, na.action = na.omit)
summary(model2)

## 
## Call:
## lm(formula = gct ~ gct_lag + gy_lag + i3_lag + inf_lag, data = consump, 
##     na.action = na.omit)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0249092 -0.0075869  0.0000855  0.0087233  0.0188618 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.0225940  0.0070892   3.187  0.00335 **
## gct_lag      0.4335853  0.2896533   1.497  0.14486   
## gy_lag      -0.1079101  0.1946373  -0.554  0.58340   
## i3_lag      -0.0007467  0.0011107  -0.672  0.50654   
## inf_lag     -0.0008281  0.0010041  -0.825  0.41606   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01134 on 30 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3038, Adjusted R-squared:  0.211 
## F-statistic: 3.273 on 4 and 30 DF,  p-value: 0.02431

# Part (iii): Compare the significance of gct_lag between model1 and model2
# The p-value for gct_lag is shown in the summaries of model1 and model2.

# Part (iv): Perform an F-test to check joint significance of added variables
anova_result <- anova(model1, model2)
anova_result

## Analysis of Variance Table
## 
## Model 1: gct ~ gct_lag
## Model 2: gct ~ gct_lag + gy_lag + i3_lag + inf_lag
##   Res.Df       RSS Df  Sum of Sq      F Pr(>F)
## 1     33 0.0044446                            
## 2     30 0.0038608  3 0.00058384 1.5122 0.2315

# Print conclusions
cat("\nPart (i) Conclusion:\n")

## 
## Part (i) Conclusion:

cat("The coefficient for gct_lag in model1 is significant (p-value =", summary(model1)$coefficients[2, 4], ").\n")

## The coefficient for gct_lag in model1 is significant (p-value = 0.007310181 ).

cat("This suggests the PIH is not supported.\n")

## This suggests the PIH is not supported.

cat("\nPart (ii) Conclusion:\n")

## 
## Part (ii) Conclusion:

cat("In model2, none of the additional variables (gy_lag, i3_lag, inf_lag) are individually significant (p-values > 0.05).\n")

## In model2, none of the additional variables (gy_lag, i3_lag, inf_lag) are individually significant (p-values > 0.05).

cat("\nPart (iii) Conclusion:\n")

## 
## Part (iii) Conclusion:

cat("The p-value for gct_lag increases from model1 (p =", summary(model1)$coefficients[2, 4], 
    ") to model2 (p =", summary(model2)$coefficients[2, 4], ").\n")

## The p-value for gct_lag increases from model1 (p = 0.007310181 ) to model2 (p = 0.1448643 ).

cat("This indicates that the significance of gct_lag decreases when additional variables are included.\n")

## This indicates that the significance of gct_lag decreases when additional variables are included.

cat("\nPart (iv) Conclusion:\n")

## 
## Part (iv) Conclusion:

cat("The F-test for joint significance of the additional variables in model2 yields a p-value of", anova_result$`Pr(>F)`[2], ".\n")

## The F-test for joint significance of the additional variables in model2 yields a p-value of 0.2314828 .

cat("The additional variables are not jointly significant at the 5% level.\n")

## The additional variables are not jointly significant at the 5% level.

# Exercise C12
data("minwage", package = "wooldridge")

# Remove missing values from gwage232
gwage232_clean <- na.omit(minwage$gwage232)

# Calculate first-order autocorrelation
acf_result <- acf(gwage232_clean, plot = FALSE)
autocorrelation <- acf_result$acf[2] # First lag autocorrelation
cat("Part (i) First-order autocorrelation in gwage232:", autocorrelation, "\n")

## Part (i) First-order autocorrelation in gwage232: -0.03493135

# Part (ii): Estimate the dynamic model
# Fit the dynamic model
model_dynamic <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
summary(model_dynamic)

## 
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = minwage)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044642 -0.004134 -0.001312  0.004482  0.041612 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.0024003  0.0004308   5.572 3.79e-08 ***
## lag(gwage232, 1) -0.0779092  0.0342851  -2.272  0.02341 *  
## gmwage            0.1518459  0.0096485  15.738  < 2e-16 ***
## gcpi              0.2630876  0.0824457   3.191  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007889 on 606 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.2986, Adjusted R-squared:  0.2951 
## F-statistic: 85.99 on 3 and 606 DF,  p-value: < 2.2e-16

# Part (iii): Add gemp232_lag to the model and test its significance
minwage$gemp232_lag <- c(NA, head(minwage$gemp232, -1)) # Lagged growth in employment
model_with_gemp <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + gemp232_lag, data = minwage, na.action = na.omit)
summary(model_with_gemp)# Part (iv): Compare the effect of adding lagged variables

## 
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + gemp232_lag, 
##     data = minwage, na.action = na.omit)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043842 -0.004378 -0.001034  0.004321  0.042548 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.002451   0.000426   5.753  1.4e-08 ***
## lag(gwage232, 1) -0.074546   0.033901  -2.199 0.028262 *  
## gmwage            0.152707   0.009540  16.007  < 2e-16 ***
## gcpi              0.252296   0.081544   3.094 0.002066 ** 
## gemp232_lag       0.066131   0.016962   3.899 0.000108 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007798 on 605 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3158, Adjusted R-squared:  0.3112 
## F-statistic:  69.8 on 4 and 605 DF,  p-value: < 2.2e-16

# Extract coefficients for gmwage from the models
gmwage_coef_dynamic <- summary(model_dynamic)$coefficients["gmwage", "Estimate"]
gmwage_coef_with_gemp <- summary(model_with_gemp)$coefficients["gmwage", "Estimate"]
cat("Part (iv) Coefficient for gmwage in model_dynamic:", gmwage_coef_dynamic, "\n")

## Part (iv) Coefficient for gmwage in model_dynamic: 0.1518459

cat("Coefficient for gmwage in model_with_gemp:", gmwage_coef_with_gemp, "\n")

## Coefficient for gmwage in model_with_gemp: 0.152707

cat("Adding the lagged variables changes the effect of gmwage slightly.\n")

## Adding the lagged variables changes the effect of gmwage slightly.

# Part (v): Regression of gmwage on gwage232_lag and gemp232_lag
model_gmwage <- lm(gmwage ~ lag(gwage232, 1) + gemp232_lag, data = minwage, na.action = na.omit)
summary(model_gmwage)

## 
## Call:
## lm(formula = gmwage ~ lag(gwage232, 1) + gemp232_lag, data = minwage, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.01914 -0.00500 -0.00379 -0.00287  0.62208 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)       0.003433   0.001440   2.384   0.0174 *
## lag(gwage232, 1)  0.203167   0.143140   1.419   0.1563  
## gemp232_lag      -0.041706   0.072110  -0.578   0.5632  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03318 on 607 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.00392,    Adjusted R-squared:  0.0006377 
## F-statistic: 1.194 on 2 and 607 DF,  p-value: 0.3036

# Report R-squared
rsquared <- summary(model_gmwage)$r.squared
cat("Part (v) R-squared from regression of gmwage:", rsquared, "\n")

## Part (v) R-squared from regression of gmwage: 0.003919682

cat("A higher R-squared suggests that gmwage is explained well by lagged variables, aligning with part (iv).\n")

## A higher R-squared suggests that gmwage is explained well by lagged variables, aligning with part (iv).

## Chapter 12

# Exercise C11
data("nyse", package = "wooldridge")

# Create the lagged variable for return
nyse$return_lag1 <- c(NA, head(nyse$return, -1))

# Fit the model (12.47)
model_1247 <- lm(return ~ return_lag1, data = nyse, na.action = na.omit)
summary(model_1247)

## 
## Call:
## lm(formula = return ~ return_lag1, data = nyse, na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.261  -1.302   0.098   1.316   8.065 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.17963    0.08074   2.225   0.0264 *
## return_lag1  0.05890    0.03802   1.549   0.1218  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.11 on 687 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.003481,   Adjusted R-squared:  0.00203 
## F-statistic: 2.399 on 1 and 687 DF,  p-value: 0.1218

# Exclude rows with missing values in the lagged variable (used in the regression)
nyse_clean <- nyse[!is.na(nyse$return_lag1), ]

# Add squared residuals to the cleaned dataset
nyse_clean$squared_resid <- residuals(model_1247)^2

# Calculate average, minimum, and maximum of squared residuals
avg_squared_resid <- mean(nyse_clean$squared_resid)
min_squared_resid <- min(nyse_clean$squared_resid)
max_squared_resid <- max(nyse_clean$squared_resid)

# Print the results
cat("Part (i) Average of squared residuals:", avg_squared_resid, "\n")

## Part (i) Average of squared residuals: 4.440839

cat("Minimum of squared residuals:", min_squared_resid, "\n")

## Minimum of squared residuals: 7.35465e-06

cat("Maximum of squared residuals:", max_squared_resid, "\n")

## Maximum of squared residuals: 232.8946

# Part (ii) Create the squared lagged variable in the cleaned dataset
nyse_clean$return_lag1_sq <- nyse_clean$return_lag1^2

# Estimate the heteroskedasticity model
hetero_model <- lm(squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)

# Summarize the results
summary(hetero_model)

## 
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.459  -3.011  -1.975   0.676 221.469 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.25734    0.44085   7.389 4.32e-13 ***
## return_lag1    -0.78946    0.19569  -4.034 6.09e-05 ***
## return_lag1_sq  0.29666    0.03552   8.351 3.75e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared:  0.1303, Adjusted R-squared:  0.1278 
## F-statistic:  51.4 on 2 and 686 DF,  p-value: < 2.2e-16

# Report estimated coefficients, standard errors, R-squared, and adjusted R-squared
cat("Part (ii) Coefficients and diagnostics for the heteroskedasticity model:\n")

## Part (ii) Coefficients and diagnostics for the heteroskedasticity model:

print(summary(hetero_model))

## 
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq, data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.459  -3.011  -1.975   0.676 221.469 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.25734    0.44085   7.389 4.32e-13 ***
## return_lag1    -0.78946    0.19569  -4.034 6.09e-05 ***
## return_lag1_sq  0.29666    0.03552   8.351 3.75e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared:  0.1303, Adjusted R-squared:  0.1278 
## F-statistic:  51.4 on 2 and 686 DF,  p-value: < 2.2e-16

# Part (iii): Conditional variance as a function of lagged return
# Sketch conditional variance manually using the estimated coefficients
delta_0 <- coef(hetero_model)[1]
delta_1 <- coef(hetero_model)[2]
delta_2 <- coef(hetero_model)[3]

lagged_returns <- seq(min(nyse$return_lag1, na.rm = TRUE), max(nyse$return_lag1, na.rm = TRUE), length.out = 100)
conditional_variance <- delta_0 + delta_1 * lagged_returns + delta_2 * lagged_returns^2

# Plot the conditional variance
plot(lagged_returns, conditional_variance, type = "l", col = "blue",
     main = "Conditional Variance as a Function of Lagged Return",
     xlab = "Lagged Return", ylab = "Conditional Variance")

# Smallest variance and corresponding lagged return
min_var_index <- which.min(conditional_variance)
cat("Part (iii) Smallest variance:", conditional_variance[min_var_index], "\n")

## Part (iii) Smallest variance: 2.734254

cat("Lagged return corresponding to smallest variance:", lagged_returns[min_var_index], "\n")

## Lagged return corresponding to smallest variance: 1.245583

# Part (iv): Check for negative variance
any_negative_variance <- any(conditional_variance < 0)
cat("Part (iv) Does the model produce any negative variance?", any_negative_variance, "\n")

## Part (iv) Does the model produce any negative variance? FALSE

# Part (v): Compare model (ii) to ARCH(1)
# ARCH(1) regression
arch1_model <- lm(squared_resid ~ lag(squared_resid, 1), data = nyse_clean, na.action = na.omit)
summary(arch1_model)

## 
## Call:
## lm(formula = squared_resid ~ lag(squared_resid, 1), data = nyse_clean, 
##     na.action = na.omit)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.337  -3.292  -2.157   0.556 223.981 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            2.94743    0.44023   6.695 4.49e-11 ***
## lag(squared_resid, 1)  0.33706    0.03595   9.377  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 686 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1136, Adjusted R-squared:  0.1123 
## F-statistic: 87.92 on 1 and 686 DF,  p-value: < 2.2e-16

# Compare R-squared
cat("Part (v) R-squared for heteroskedasticity model:", summary(hetero_model)$r.squared, "\n")

## Part (v) R-squared for heteroskedasticity model: 0.1303208

cat("R-squared for ARCH(1) model:", summary(arch1_model)$r.squared, "\n")

## R-squared for ARCH(1) model: 0.1136065

# Part (vi): ARCH(2) regression
# Create the second lag of squared residuals in the cleaned dataset
nyse_clean$squared_resid_lag2 <- c(NA, NA, head(nyse_clean$squared_resid, -2))

# Fit the ARCH(2) model
arch2_model <- lm(squared_resid ~ return_lag1 + return_lag1_sq + squared_resid_lag2, data = nyse_clean)

# Summarize the ARCH(2) model results
summary(arch2_model)

## 
## Call:
## lm(formula = squared_resid ~ return_lag1 + return_lag1_sq + squared_resid_lag2, 
##     data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.755  -3.039  -1.958   0.596 221.737 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.16429    0.45510   6.953 8.37e-12 ***
## return_lag1        -0.78460    0.19640  -3.995 7.17e-05 ***
## return_lag1_sq      0.28459    0.03810   7.469 2.47e-13 ***
## squared_resid_lag2  0.03492    0.03827   0.913    0.362    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.67 on 683 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1313, Adjusted R-squared:  0.1274 
## F-statistic:  34.4 on 3 and 683 DF,  p-value: < 2.2e-16

# Compare fit of ARCH(2) with previous models
cat("Part (vi) R-squared for ARCH(2) model:", summary(arch2_model)$r.squared, "\n")

## Part (vi) R-squared for ARCH(2) model: 0.1312591

final

112035150

2025-01-09

R Markdown

Including Plots