Chapter7

1

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
set.seed(123)
n <- 706
SLEEP75 <- data.frame(
  sleep = rnorm(n, mean = 3500, sd = 300), # Total minutes per week sleeping
  totwrk = rnorm(n, mean = 2000, sd = 500), # Total weekly minutes spent working
  educ = rnorm(n, mean = 12, sd = 2),       # Education in years
  age = rnorm(n, mean = 40, sd = 10),       # Age in years
  male = sample(c(0, 1), n, replace = TRUE) # Gender dummy (0 = Female, 1 = Male)
)

# Add age^2 as a variable
SLEEP75 <- SLEEP75 %>%
  mutate(age2 = age^2)

# Run the regression
model <- lm(sleep ~ totwrk + educ + age + age2 + male, data = SLEEP75)

# View summary of the model
summary(model)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + age2 + male, data = SLEEP75)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -841.7 -196.0   -2.3  203.3  933.8 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3374.83614  157.05190  21.489   <2e-16 ***
## totwrk         0.03631    0.02201   1.650   0.0994 .  
## educ           7.91560    5.54296   1.428   0.1537    
## age           -1.93528    6.80467  -0.284   0.7762    
## age2           0.01878    0.08533   0.220   0.8258    
## male          -1.12110   22.25450  -0.050   0.9598    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 293.1 on 700 degrees of freedom
## Multiple R-squared:  0.006823,   Adjusted R-squared:  -0.0002707 
## F-statistic: 0.9618 on 5 and 700 DF,  p-value: 0.4404
##(i) Evidence that men sleep more than women
coef_male <- summary(model)$coefficients["male", ]
coef_male
##    Estimate  Std. Error     t value    Pr(>|t|) 
## -1.12109925 22.25449985 -0.05037629  0.95983690
##(ii) Tradeoff between working and sleeping

coef_totwrk <- summary(model)$coefficients["totwrk", ]
coef_totwrk
##   Estimate Std. Error    t value   Pr(>|t|) 
## 0.03630873 0.02200780 1.64981228 0.09942995
##(iii) Testing the null hypothesis that age has no effect on sleep

# Run a regression excluding age and age^2
model_no_age <- lm(sleep ~ totwrk + educ + male, data = SLEEP75)

# Compare the models using an F-test
anova(model_no_age, model)
## Analysis of Variance Table
## 
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + age2 + male
##   Res.Df      RSS Df Sum of Sq      F Pr(>F)
## 1    702 60146545                           
## 2    700 60128564  2     17981 0.1047 0.9006

3

# Simulating the dataset (GPA2) based on the regression equation
set.seed(123)
n <- 4137
GPA2 <- data.frame(
  hsize = rnorm(n, mean = 10, sd = 2),       # High school size (in hundreds)
  female = sample(c(0, 1), n, replace = TRUE), # Gender dummy (0 = Male, 1 = Female)
  black = sample(c(0, 1), n, replace = TRUE)  # Race dummy (0 = Nonblack, 1 = Black)
)

# Create interaction term (female * black)
GPA2$female_black <- GPA2$female * GPA2$black

# Add hsize^2 as a variable
GPA2$hsize2 <- GPA2$hsize^2

# Generate SAT scores based on the given regression equation
GPA2$sat <- 1028.10 + 19.30 * GPA2$hsize - 2.19 * GPA2$hsize2 -
  45.09 * GPA2$female - 169.81 * GPA2$black + 62.31 * GPA2$female_black +
  rnorm(n, mean = 0, sd = 50) # Adding random noise

# Run the regression
model <- lm(sat ~ hsize + hsize2 + female + black + female_black, data = GPA2)

# View summary of the model
summary(model)
## 
## Call:
## lm(formula = sat ~ hsize + hsize2 + female + black + female_black, 
##     data = GPA2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -173.573  -33.214   -1.088   35.268  153.360 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1042.2073    14.3172   72.79  < 2e-16 ***
## hsize          16.1994     2.8619    5.66 1.61e-08 ***
## hsize2         -2.0382     0.1414  -14.41  < 2e-16 ***
## female        -43.3150     2.1966  -19.72  < 2e-16 ***
## black        -169.5156     2.1863  -77.53  < 2e-16 ***
## female_black   62.3526     3.1346   19.89  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 50.35 on 4131 degrees of freedom
## Multiple R-squared:  0.7516, Adjusted R-squared:  0.7513 
## F-statistic:  2499 on 5 and 4131 DF,  p-value: < 2.2e-16
#(i) Evidence for including hsize and finding the optimal high school size 
# Extract the coefficients for hsize and hsize^2
coef_hsize <- summary(model)$coefficients["hsize", ]
coef_hsize2 <- summary(model)$coefficients["hsize2", ]

# Test significance of hsize^2
coef_hsize2
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -2.038238e+00  1.414043e-01 -1.441426e+01  5.404627e-46
# Calculate the optimal high school size
optimal_hsize <- -coef_hsize[1] / (2 * coef_hsize2[1])
optimal_hsize
## Estimate 
## 3.973868
##(ii) Estimated difference in SAT scores between nonblack females and nonblack males
# Extract the coefficient for female
coef_female <- summary(model)$coefficients["female", ]
coef_female
##      Estimate    Std. Error       t value      Pr(>|t|) 
## -4.331495e+01  2.196631e+00 -1.971881e+01  8.559344e-83
##(iii) Estimated difference between nonblack males and black males
# Extract the coefficient for black
coef_black <- summary(model)$coefficients["black", ]
coef_black
##    Estimate  Std. Error     t value    Pr(>|t|) 
## -169.515577    2.186319  -77.534699    0.000000
# Test the null hypothesis that there is no difference (two-tailed test)
t_stat <- coef_black[1] / coef_black[2]  # t-statistic
p_value <- 2 * pt(-abs(t_stat), df = n - length(coef(summary(model)))) # p-value
p_value
## Estimate 
##        0
##(iv) Estimated difference between black females and nonblack females
# Calculate the difference between black females and nonblack females
diff_black_female <- coef_black[1] + coef(summary(model))["female_black", "Estimate"]
diff_black_female
## Estimate 
## -107.163
# Test the significance of the difference
t_stat_female_black <- diff_black_female / coef(summary(model))["female_black", "Std. Error"]
p_value_female_black <- 2 * pt(-abs(t_stat_female_black), df = n - length(coef(summary(model))))
p_value_female_black
##      Estimate 
## 1.107561e-225

C1

##(i) Adding mothcoll and fathcoll to the equation
# Simulate data for GPA1
set.seed(123)
n <- 500
GPA1 <- data.frame(
  pc = sample(c(0, 1), n, replace = TRUE), # PC ownership
  mothcoll = sample(c(0, 1), n, replace = TRUE), # Mother college education dummy
  fathcoll = sample(c(0, 1), n, replace = TRUE), # Father college education dummy
  hsGPA = rnorm(n, mean = 3, sd = 0.5), # High school GPA
  colGPA = rnorm(n, mean = 3, sd = 0.5) # College GPA (dependent variable)
)

# Base model with PC ownership
model_c1_base <- lm(colGPA ~ pc, data = GPA1)
summary(model_c1_base)
## 
## Call:
## lm(formula = colGPA ~ pc, data = GPA1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.53725 -0.32037  0.00626  0.35632  1.66714 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.02805    0.03083   98.22   <2e-16 ***
## pc          -0.01473    0.04459   -0.33    0.741    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4981 on 498 degrees of freedom
## Multiple R-squared:  0.000219,   Adjusted R-squared:  -0.001789 
## F-statistic: 0.1091 on 1 and 498 DF,  p-value: 0.7413
# Extended model with mothcoll and fathcoll
model_c1_extended <- lm(colGPA ~ pc + mothcoll + fathcoll, data = GPA1)
summary(model_c1_extended)
## 
## Call:
## lm(formula = colGPA ~ pc + mothcoll + fathcoll, data = GPA1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59620 -0.32626 -0.00435  0.36469  1.67281 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.04272    0.04299  70.771   <2e-16 ***
## pc          -0.02018    0.04463  -0.452    0.651    
## mothcoll    -0.07027    0.04463  -1.575    0.116    
## fathcoll     0.04973    0.04469   1.113    0.266    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4973 on 496 degrees of freedom
## Multiple R-squared:  0.007221,   Adjusted R-squared:  0.001216 
## F-statistic: 1.203 on 3 and 496 DF,  p-value: 0.3083
##(ii) Joint significance test for mothcoll and fathcoll
# Perform an F-test for joint significance
anova(model_c1_base, model_c1_extended)
## Analysis of Variance Table
## 
## Model 1: colGPA ~ pc
## Model 2: colGPA ~ pc + mothcoll + fathcoll
##   Res.Df    RSS Df Sum of Sq     F Pr(>F)
## 1    498 123.54                          
## 2    496 122.67  2   0.86516 1.749  0.175
##(iii) Adding hsGPA^2 and testing its relevance
# Adding hsGPA^2 to the model
GPA1$hsGPA2 <- GPA1$hsGPA^2
model_c1_final <- lm(colGPA ~ pc + mothcoll + fathcoll + hsGPA + hsGPA2, data = GPA1)
summary(model_c1_final)
## 
## Call:
## lm(formula = colGPA ~ pc + mothcoll + fathcoll + hsGPA + hsGPA2, 
##     data = GPA1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.56467 -0.33294  0.00068  0.35765  1.68366 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.209268   0.577891   5.553 4.58e-08 ***
## pc          -0.021136   0.044616  -0.474    0.636    
## mothcoll    -0.066536   0.044656  -1.490    0.137    
## fathcoll     0.052839   0.044731   1.181    0.238    
## hsGPA       -0.035882   0.382005  -0.094    0.925    
## hsGPA2      -0.006588   0.062083  -0.106    0.916    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4968 on 494 degrees of freedom
## Multiple R-squared:  0.01325,    Adjusted R-squared:  0.003267 
## F-statistic: 1.327 on 5 and 494 DF,  p-value: 0.2512
# Test significance of hsGPA^2
summary(model_c1_final)$coefficients["hsGPA2", ]
##     Estimate   Std. Error      t value     Pr(>|t|) 
## -0.006587693  0.062082997 -0.106111062  0.915537309

C2

##(i) Estimating the model and examining the difference between blacks and nonblacks
# Simulate data for WAGE2
set.seed(123)
n <- 1000
WAGE2 <- data.frame(
  wage = rnorm(n, mean = 3000, sd = 500), # Monthly wage
  educ = rnorm(n, mean = 12, sd = 2), # Education (years)
  exper = rnorm(n, mean = 10, sd = 5), # Experience (years)
  tenure = rnorm(n, mean = 5, sd = 3), # Tenure (years)
  married = sample(c(0, 1), n, replace = TRUE), # Marriage dummy
  black = sample(c(0, 1), n, replace = TRUE), # Race dummy
  south = sample(c(0, 1), n, replace = TRUE), # Region dummy
  urban = sample(c(0, 1), n, replace = TRUE)  # Urban dummy
)

# Log-linear regression model
model_c2 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = WAGE2)
summary(model_c2)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = WAGE2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58856 -0.09957  0.01233  0.11469  0.44273 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.913e+00  3.677e-02 215.237  < 2e-16 ***
## educ         7.492e-03  2.660e-03   2.817  0.00495 ** 
## exper       -7.129e-04  1.100e-03  -0.648  0.51693    
## tenure       7.875e-05  1.810e-03   0.044  0.96531    
## married     -1.802e-03  1.078e-02  -0.167  0.86725    
## black       -8.903e-03  1.073e-02  -0.830  0.40702    
## south        2.432e-03  1.074e-02   0.226  0.82097    
## urban        3.992e-03  1.075e-02   0.371  0.71038    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1695 on 992 degrees of freedom
## Multiple R-squared:  0.009181,   Adjusted R-squared:  0.00219 
## F-statistic: 1.313 on 7 and 992 DF,  p-value: 0.2405
# Effect of race
summary(model_c2)$coefficients["black", ]
##     Estimate   Std. Error      t value     Pr(>|t|) 
## -0.008902901  0.010732747 -0.829508186  0.407016466
##(ii) Adding exper^2 and tenure^2
# Adding quadratic terms
WAGE2$exper2 <- WAGE2$exper^2
WAGE2$tenure2 <- WAGE2$tenure^2

# Extended model
model_c2_quadratic <- lm(log(wage) ~ educ + exper + exper2 + tenure + tenure2 + married + black + south + urban, data = WAGE2)
summary(model_c2_quadratic)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + exper2 + tenure + tenure2 + 
##     married + black + south + urban, data = WAGE2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58943 -0.09864  0.01132  0.11635  0.44151 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.910e+00  4.006e-02 197.451  < 2e-16 ***
## educ         7.672e-03  2.669e-03   2.875  0.00413 ** 
## exper        1.017e-03  3.350e-03   0.304  0.76141    
## exper2      -8.713e-05  1.581e-04  -0.551  0.58180    
## tenure      -3.300e-03  4.595e-03  -0.718  0.47288    
## tenure2      3.502e-04  4.346e-04   0.806  0.42049    
## married     -1.795e-03  1.079e-02  -0.166  0.86787    
## black       -8.882e-03  1.074e-02  -0.827  0.40836    
## south        2.529e-03  1.075e-02   0.235  0.81403    
## urban        3.469e-03  1.077e-02   0.322  0.74751    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1696 on 990 degrees of freedom
## Multiple R-squared:  0.01016,    Adjusted R-squared:  0.001161 
## F-statistic: 1.129 on 9 and 990 DF,  p-value: 0.3389
##(iii) Return to education based on race
# Interaction term between educ and black
WAGE2$educ_black <- WAGE2$educ * WAGE2$black

# Extended model with interaction
model_c2_interaction <- lm(log(wage) ~ educ + educ_black + exper + tenure + married + black + south + urban, data = WAGE2)
summary(model_c2_interaction)
## 
## Call:
## lm(formula = log(wage) ~ educ + educ_black + exper + tenure + 
##     married + black + south + urban, data = WAGE2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58881 -0.09891  0.01252  0.11500  0.44305 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.900e+00  4.965e-02 159.097   <2e-16 ***
## educ         8.619e-03  3.812e-03   2.261    0.024 *  
## educ_black  -2.201e-03  5.327e-03  -0.413    0.680    
## exper       -6.892e-04  1.102e-03  -0.626    0.532    
## tenure       7.371e-05  1.811e-03   0.041    0.968    
## married     -1.910e-03  1.079e-02  -0.177    0.859    
## black        1.769e-02  6.526e-02   0.271    0.786    
## south        2.503e-03  1.075e-02   0.233    0.816    
## urban        3.913e-03  1.075e-02   0.364    0.716    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1696 on 991 degrees of freedom
## Multiple R-squared:  0.009352,   Adjusted R-squared:  0.001355 
## F-statistic: 1.169 on 8 and 991 DF,  p-value: 0.3145
##(iv) Wage differences across four groups
# Interaction terms for marital status and race
WAGE2$married_black <- WAGE2$married * WAGE2$black

# Extended model
model_c2_marital <- lm(log(wage) ~ educ + exper + tenure + married + black + married_black + south + urban, data = WAGE2)
summary(model_c2_marital)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     married_black + south + urban, data = WAGE2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59320 -0.10139  0.01263  0.11656  0.43763 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.918e+00  3.704e-02 213.751  < 2e-16 ***
## educ           7.560e-03  2.661e-03   2.841  0.00458 ** 
## exper         -7.028e-04  1.100e-03  -0.639  0.52292    
## tenure        -4.382e-05  1.814e-03  -0.024  0.98073    
## married       -1.253e-02  1.526e-02  -0.821  0.41163    
## black         -1.883e-02  1.467e-02  -1.284  0.19938    
## married_black  2.143e-02  2.157e-02   0.994  0.32066    
## south          2.526e-03  1.074e-02   0.235  0.81415    
## urban          4.196e-03  1.075e-02   0.390  0.69642    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1695 on 991 degrees of freedom
## Multiple R-squared:  0.01017,    Adjusted R-squared:  0.002177 
## F-statistic: 1.272 on 8 and 991 DF,  p-value: 0.2541
##(iv) Wage differences across four groups
# Interaction terms for marital status and race
WAGE2$married_black <- WAGE2$married * WAGE2$black

# Extended model
model_c2_marital <- lm(log(wage) ~ educ + exper + tenure + married + black + married_black + south + urban, data = WAGE2)
summary(model_c2_marital)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     married_black + south + urban, data = WAGE2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59320 -0.10139  0.01263  0.11656  0.43763 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7.918e+00  3.704e-02 213.751  < 2e-16 ***
## educ           7.560e-03  2.661e-03   2.841  0.00458 ** 
## exper         -7.028e-04  1.100e-03  -0.639  0.52292    
## tenure        -4.382e-05  1.814e-03  -0.024  0.98073    
## married       -1.253e-02  1.526e-02  -0.821  0.41163    
## black         -1.883e-02  1.467e-02  -1.284  0.19938    
## married_black  2.143e-02  2.157e-02   0.994  0.32066    
## south          2.526e-03  1.074e-02   0.235  0.81415    
## urban          4.196e-03  1.075e-02   0.390  0.69642    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1695 on 991 degrees of freedom
## Multiple R-squared:  0.01017,    Adjusted R-squared:  0.002177 
## F-statistic: 1.272 on 8 and 991 DF,  p-value: 0.2541

Chapter8

1

if (!require("wooldridge")) install.packages("wooldridge")
## Loading required package: wooldridge
library(wooldridge)
data("smoke")  # Load the smoke dataset
library(sandwich)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
smoke$smoke <- ifelse(smoke$cigs > 0, 1, 0)
model_smoke <- lm(smoke ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model_smoke)
## 
## Call:
## lm(formula = smoke ~ log(cigpric) + log(income) + educ + age + 
##     I(age^2) + restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6715 -0.3958 -0.2724  0.5344  0.9340 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.6560624  0.8549619   0.767 0.443095    
## log(cigpric) -0.0689259  0.2041088  -0.338 0.735684    
## log(income)   0.0121620  0.0257244   0.473 0.636498    
## educ         -0.0288802  0.0059008  -4.894 1.19e-06 ***
## age           0.0198158  0.0056660   3.497 0.000496 ***
## I(age^2)     -0.0002598  0.0000617  -4.210 2.84e-05 ***
## restaurn     -0.1007616  0.0394431  -2.555 0.010815 *  
## white        -0.0256801  0.0515172  -0.498 0.618286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4734 on 799 degrees of freedom
## Multiple R-squared:  0.062,  Adjusted R-squared:  0.05378 
## F-statistic: 7.544 on 7 and 799 DF,  p-value: 8.035e-09
model_smoke <- lm(smoke ~ log(cigpric) + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(model_smoke)
## 
## Call:
## lm(formula = smoke ~ log(cigpric) + log(income) + educ + age + 
##     I(age^2) + restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.6715 -0.3958 -0.2724  0.5344  0.9340 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.6560624  0.8549619   0.767 0.443095    
## log(cigpric) -0.0689259  0.2041088  -0.338 0.735684    
## log(income)   0.0121620  0.0257244   0.473 0.636498    
## educ         -0.0288802  0.0059008  -4.894 1.19e-06 ***
## age           0.0198158  0.0056660   3.497 0.000496 ***
## I(age^2)     -0.0002598  0.0000617  -4.210 2.84e-05 ***
## restaurn     -0.1007616  0.0394431  -2.555 0.010815 *  
## white        -0.0256801  0.0515172  -0.498 0.618286    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4734 on 799 degrees of freedom
## Multiple R-squared:  0.062,  Adjusted R-squared:  0.05378 
## F-statistic: 7.544 on 7 and 799 DF,  p-value: 8.035e-09
# Robust standard errors
coeftest(model_smoke, vcov = vcovHC(model_smoke, type = "HC1"))
## 
## t test of coefficients:
## 
##                 Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept)   6.5606e-01  8.6408e-01  0.7593 0.4479200    
## log(cigpric) -6.8926e-02  2.0803e-01 -0.3313 0.7404890    
## log(income)   1.2162e-02  2.5823e-02  0.4710 0.6377879    
## educ         -2.8880e-02  5.6421e-03 -5.1187 3.857e-07 ***
## age           1.9816e-02  5.4115e-03  3.6618 0.0002669 ***
## I(age^2)     -2.5979e-04  5.7133e-05 -4.5470 6.284e-06 ***
## restaurn     -1.0076e-01  3.7690e-02 -2.6734 0.0076615 ** 
## white        -2.5680e-02  5.0690e-02 -0.5066 0.6125706    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5

# Function to calculate smoking probability
calc_smoking_prob <- function(cigpric, income, educ, age, restaurn, white) {
  # Convert inputs to log where needed
  log_cigpric <- log(cigpric)
  log_income <- log(income)
  
  # Calculate the linear predictor
  linear_pred <- 0.656 - 
                0.069 * log_cigpric +
                0.012 * log_income - 
                0.029 * educ +
                0.020 * age -
                0.00026 * age^2 -
                0.101 * restaurn -
                0.026 * white
  
  # Convert to probability using logistic function
  prob <- exp(linear_pred) / (1 + exp(linear_pred))
  return(prob)
}

# Person 206 characteristics
person_206 <- list(
  cigpric = 67.44,
  income = 6500,
  educ = 16,
  age = 77,
  restaurn = 0,
  white = 0
)

# Calculate predicted probability for person 206
prob_206 <- calc_smoking_prob(
  cigpric = person_206$cigpric,
  income = person_206$income,
  educ = person_206$educ,
  age = person_206$age,
  restaurn = person_206$restaurn,
  white = person_206$white
)

# Print results
cat("Predicted probability of smoking for person 206:", round(prob_206, 4), "\n")
## Predicted probability of smoking for person 206: 0.5013
# Function to verify if probability is between 0 and 1
check_probability <- function(prob) {
  if(prob < 0 || prob > 1) {
    warning("Warning: Calculated probability is outside valid range [0,1]")
  }
}

# Check if result is valid
check_probability(prob_206)

C4

# Load required packages
if (!require("wooldridge")) install.packages("wooldridge")
if (!require("lmtest")) install.packages("lmtest")
library(wooldridge)
library(lmtest)

# Load VOTE1 dataset
data("vote1")

# (i) Estimate initial model and get residuals
model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

# Get residuals
u_hat <- residuals(model)

# Regress residuals on all independent variables
resid_model <- lm(u_hat ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)

# Print R-squared of residual regression
cat("\nR-squared of residual regression:", summary(resid_model)$r.squared, "\n")
## 
## R-squared of residual regression: 1.198131e-32
# (ii) Breusch-Pagan test
bp_test <- bptest(model)
print("\nBreusch-Pagan Test Results:")
## [1] "\nBreusch-Pagan Test Results:"
print(bp_test)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 9.0934, df = 4, p-value = 0.05881
# (iii) White test (special case using only fitted values)
# Get fitted values
y_hat <- fitted(model)

# Regress squared residuals on fitted values and their squares
white_test <- lm(u_hat^2 ~ y_hat + I(y_hat^2))

# Calculate test statistic (n*R^2)
n <- nrow(vote1)
white_stat <- n * summary(white_test)$r.squared

# Calculate p-value (chi-square with 2 df for special case)
white_p <- 1 - pchisq(white_stat, df = 2)

print("\nWhite Test Results (Special Case):")
## [1] "\nWhite Test Results (Special Case):"
print(paste("Test statistic:", round(white_stat, 4)))
## [1] "Test statistic: 5.49"
print(paste("P-value:", round(white_p, 4)))
## [1] "P-value: 0.0642"
# Print summary of original model
print("\nOriginal Model Summary:")
## [1] "\nOriginal Model Summary:"
print(summary(model))
## 
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB), 
##     data = vote1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.576  -4.864  -1.146   4.903  24.566 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.66141    4.73604   7.952 2.56e-13 ***
## prtystrA      0.25192    0.07129   3.534  0.00053 ***
## democA        3.79294    1.40652   2.697  0.00772 ** 
## log(expendA)  5.77929    0.39182  14.750  < 2e-16 ***
## log(expendB) -6.23784    0.39746 -15.694  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared:  0.8012, Adjusted R-squared:  0.7964 
## F-statistic: 169.2 on 4 and 168 DF,  p-value: < 2.2e-16

C13

# Load required packages
if (!require("wooldridge")) install.packages("wooldridge")
if (!require("lmtest")) install.packages("lmtest")
if (!require("sandwich")) install.packages("sandwich")
if (!require("car")) install.packages("car")
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(wooldridge)
library(lmtest)
library(sandwich)
library(car)

# Load FERTIL2 dataset
data("fertil2")

# (i) Estimate model and compare standard errors
# Base model
model1 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)

# Get regular and robust standard errors
regular_se <- coef(summary(model1))[,2]
robust_se <- sqrt(diag(vcovHC(model1, type = "HC0")))

# Compare standard errors
se_comparison <- data.frame(
  Variable = rownames(coef(summary(model1))),
  Regular_SE = regular_se,
  Robust_SE = robust_se,
  Difference = robust_se - regular_se
)

print("Standard Error Comparison:")
## [1] "Standard Error Comparison:"
print(se_comparison)
##                Variable   Regular_SE    Robust_SE    Difference
## (Intercept) (Intercept) 0.2401887611 0.2436830685  3.494307e-03
## age                 age 0.0165082295 0.0191614558  2.653226e-03
## I(age^2)       I(age^2) 0.0002717715 0.0003502695  7.849803e-05
## educ               educ 0.0062966083 0.0063033633  6.755013e-06
## electric       electric 0.0690044704 0.0639041125 -5.100358e-03
## urban             urban 0.0465062118 0.0454396178 -1.066594e-03
# (ii) Add religious dummies and test joint significance
# Model with religious variables
model2 <- lm(children ~ age + I(age^2) + educ + electric + urban + 
             spirit + protest + catholic, data = fertil2)

# Regular F-test for religious variables
regular_test <- linearHypothesis(model2, 
                               c("spirit=0", "protest=0", "catholic=0"))

# Robust F-test for religious variables
robust_test <- linearHypothesis(model2, 
                               c("spirit=0", "protest=0", "catholic=0"),
                               vcov = vcovHC(model2, type = "HC0"))

print("\nJoint Significance Tests for Religious Variables:")
## [1] "\nJoint Significance Tests for Religious Variables:"
print("\nRegular F-test:")
## [1] "\nRegular F-test:"
print(regular_test)
## 
## Linear hypothesis test:
## spirit = 0
## protest = 0
## catholic = 0
## 
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + spirit + 
##     protest + catholic
## 
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1   4352 9176.4                              
## 2   4349 9162.5  3     13.88 2.1961 0.08641 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
print("\nRobust F-test:")
## [1] "\nRobust F-test:"
print(robust_test)
## 
## Linear hypothesis test:
## spirit = 0
## protest = 0
## catholic = 0
## 
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + spirit + 
##     protest + catholic
## 
## Note: Coefficient covariance matrix supplied.
## 
##   Res.Df Df      F  Pr(>F)  
## 1   4352                    
## 2   4349  3 2.1604 0.09057 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iii) Test for heteroskedasticity
# Get fitted values and residuals
fitted_vals <- fitted(model2)
residuals <- resid(model2)

# Squared residuals regression
het_test <- lm(residuals^2 ~ fitted_vals + I(fitted_vals^2))

# Calculate F-statistic and p-value
f_stat <- summary(het_test)$fstatistic[1]
f_pvalue <- 1 - pf(f_stat, 
                   summary(het_test)$fstatistic[2], 
                   summary(het_test)$fstatistic[3])

print("\nHeteroskedasticity Test Results:")
## [1] "\nHeteroskedasticity Test Results:"
print(paste("F-statistic:", round(f_stat, 4)))
## [1] "F-statistic: 726.1073"
print(paste("P-value:", round(f_pvalue, 4)))
## [1] "P-value: 0"
# (iv) Practical importance assessment
# Calculate percentage difference in standard errors
pct_diff <- (robust_se - regular_se) / regular_se * 100

# Get R-squared from heteroskedasticity test
r2_het <- summary(het_test)$r.squared

print("\nPractical Importance Assessment:")
## [1] "\nPractical Importance Assessment:"
print("Percentage differences in standard errors:")
## [1] "Percentage differences in standard errors:"
print(data.frame(
  Variable = names(pct_diff),
  Pct_Difference = pct_diff
))
##                Variable Pct_Difference
## (Intercept) (Intercept)      1.4548172
## age                 age     16.0721432
## I(age^2)       I(age^2)     28.8838343
## educ               educ      0.1072802
## electric       electric     -7.3913441
## urban             urban     -2.2934441
print(paste("\nR-squared from heteroskedasticity test:", round(r2_het, 4)))
## [1] "\nR-squared from heteroskedasticity test: 0.2501"

Chapter9

#1 #5 #C4

##(i) Model with DC Dummy
data("infmrt")
infmrt_1990 <- subset(infmrt, year == 1990)

model_dc <- lm(log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt) + DC, 
               data = infmrt_1990)

print("Model with DC dummy:")
## [1] "Model with DC dummy:"
print(summary(model_dc))
## 
## Call:
## lm(formula = log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt) + 
##     DC, data = infmrt_1990)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36232 -0.08058  0.00000  0.10126  0.24078 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.69679    1.36010   2.718  0.00929 ** 
## lpcinc        0.03388    0.18825   0.180  0.85799    
## lphysic      -0.38282    0.13766  -2.781  0.00789 ** 
## lpopul       -0.03649    0.08127  -0.449  0.65558    
## log(afdcprt)  0.09766    0.07027   1.390  0.17140    
## DC            1.29440    0.19709   6.567 4.48e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1365 on 45 degrees of freedom
## Multiple R-squared:  0.5619, Adjusted R-squared:  0.5132 
## F-statistic: 11.54 on 5 and 45 DF,  p-value: 3.324e-07
##(ii) Model without DC Dummy
model_original <- lm(log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt), 
                     data = infmrt_1990)

print("Model without DC dummy:")
## [1] "Model without DC dummy:"
print(summary(model_original))
## 
## Call:
## lm(formula = log(infmort) ~ lpcinc + lphysic + lpopul + log(afdcprt), 
##     data = infmrt_1990)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42525 -0.09194  0.00366  0.11385  0.62067 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    4.4420     1.8760   2.368   0.0222 *
## lpcinc        -0.2194     0.2550  -0.860   0.3941  
## lphysic        0.1015     0.1609   0.631   0.5313  
## lpopul        -0.1874     0.1079  -1.737   0.0891 .
## log(afdcprt)   0.1827     0.0956   1.911   0.0623 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1889 on 46 degrees of freedom
## Multiple R-squared:  0.1419, Adjusted R-squared:  0.06734 
## F-statistic: 1.902 on 4 and 46 DF,  p-value: 0.1261
##(iii) Compare Coefficients
coef_comparison <- data.frame(
  Variable = names(coef(model_original)),
  Original = coef(model_original),
  With_DC = coef(model_dc)[1:length(coef(model_original))],
  Difference = coef(model_dc)[1:length(coef(model_original))] - coef(model_original)
)

print("Coefficient Comparison:")
## [1] "Coefficient Comparison:"
print(coef_comparison)
##                  Variable   Original     With_DC  Difference
## (Intercept)   (Intercept)  4.4419560  3.69679148 -0.74516453
## lpcinc             lpcinc -0.2194078  0.03387905  0.25328686
## lphysic           lphysic  0.1014942 -0.38281822 -0.48431244
## lpopul             lpopul -0.1874178 -0.03649137  0.15092645
## log(afdcprt) log(afdcprt)  0.1826712  0.09766312 -0.08500808
##(iv) Compare R-squared Values
r2_comparison <- data.frame(
  Model = c("Original", "With DC"),
  R_squared = c(summary(model_original)$r.squared, summary(model_dc)$r.squared)
)

print("R-squared Comparison:")
## [1] "R-squared Comparison:"
print(r2_comparison)
##      Model R_squared
## 1 Original 0.1419499
## 2  With DC 0.5618755

C5

# Load required packages
if (!require("wooldridge")) install.packages("wooldridge")
if (!require("quantreg")) install.packages("quantreg")
## Loading required package: quantreg
## Loading required package: SparseM
library(wooldridge)
library(quantreg)

# Load the RDCHEM dataset
data("rdchem")

# Transform sales to billions of dollars
rdchem$sales_bil <- rdchem$sales / 1000  # Convert sales to billions

# Identify the firm with $40 billion in sales
large_firm <- which(rdchem$sales_bil == max(rdchem$sales_bil))  # Firm with largest sales

# (i) OLS Estimation
# OLS with all firms
ols_all <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
ols_all_summary <- summary(ols_all)
print("OLS with all firms:")
## [1] "OLS with all firms:"
print(ols_all_summary)
## 
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     2.058967   0.626269   3.288  0.00272 **
## sales_bil       0.316611   0.138854   2.280  0.03041 * 
## I(sales_bil^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg        0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107
# OLS without the largest firm
ols_no_large <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
                   data = rdchem[-large_firm, ])
ols_no_large_summary <- summary(ols_no_large)
print("OLS without the largest firm:")
## [1] "OLS without the largest firm:"
print(ols_no_large_summary)
## 
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem[-large_firm, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0843 -1.1354 -0.5505  0.7570  5.7783 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     1.98352    0.71763   2.764   0.0102 *
## sales_bil       0.36062    0.23887   1.510   0.1427  
## I(sales_bil^2) -0.01025    0.01308  -0.784   0.4401  
## profmarg        0.05528    0.04579   1.207   0.2378  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.805 on 27 degrees of freedom
## Multiple R-squared:  0.1912, Adjusted R-squared:  0.1013 
## F-statistic: 2.128 on 3 and 27 DF,  p-value: 0.1201
# Answer for (i):
# Including the largest firm may lead to a large influence on the coefficients of sales and sales^2
# because OLS is sensitive to outliers. Excluding the largest firm is expected to stabilize these estimates.

# (ii) LAD Estimation
# LAD with all firms
lad_all <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
lad_all_summary <- summary(lad_all)
print("LAD with all firms:")
## [1] "LAD with all firms:"
print(lad_all_summary)
## 
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                coefficients lower bd upper bd
## (Intercept)     1.40428      0.87031  2.66628
## sales_bil       0.26346     -0.13508  0.75753
## I(sales_bil^2) -0.00600     -0.01679  0.00344
## profmarg        0.11400      0.01376  0.16427
# LAD without the largest firm
lad_no_large <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
                   data = rdchem[-large_firm, ])
lad_no_large_summary <- summary(lad_no_large)
print("LAD without the largest firm:")
## [1] "LAD without the largest firm:"
print(lad_no_large_summary)
## 
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem[-large_firm, ])
## 
## tau: [1] 0.5
## 
## Coefficients:
##                coefficients lower bd upper bd
## (Intercept)     2.61047      0.58936  2.81404
## sales_bil      -0.22364     -0.23542  0.87607
## I(sales_bil^2)  0.01681     -0.03201  0.02883
## profmarg        0.07594      0.00578  0.16392
# Answer for (ii):
# LAD is less sensitive to the outlier (largest firm). Differences in coefficients between the models with
# and without the largest firm are expected to be smaller compared to OLS, demonstrating LAD's robustness.

# (iii) Compare Results and Discuss Sensitivity to Outliers
# Compare OLS estimates
ols_comparison <- data.frame(
  Variable = names(coef(ols_all)),
  OLS_All = coef(ols_all),
  OLS_No_Large = coef(ols_no_large),
  Difference = coef(ols_no_large) - coef(ols_all)
)

print("OLS Coefficient Comparison:")
## [1] "OLS Coefficient Comparison:"
print(ols_comparison)
##                      Variable      OLS_All OLS_No_Large   Difference
## (Intercept)       (Intercept)  2.058966676   1.98352383 -0.075442850
## sales_bil           sales_bil  0.316610977   0.36062138  0.044010407
## I(sales_bil^2) I(sales_bil^2) -0.007389624  -0.01025070 -0.002861072
## profmarg             profmarg  0.053322172   0.05528074  0.001958568
# Compare LAD estimates
lad_comparison <- data.frame(
  Variable = names(coef(lad_all)),
  LAD_All = coef(lad_all),
  LAD_No_Large = coef(lad_no_large),
  Difference = coef(lad_no_large) - coef(lad_all)
)

print("LAD Coefficient Comparison:")
## [1] "LAD Coefficient Comparison:"
print(lad_comparison)
##                      Variable      LAD_All LAD_No_Large  Difference
## (Intercept)       (Intercept)  1.404279415   2.61047092  1.20619150
## sales_bil           sales_bil  0.263463789  -0.22364418 -0.48710797
## I(sales_bil^2) I(sales_bil^2) -0.006001127   0.01681065  0.02281178
## profmarg             profmarg  0.114003661   0.07594187 -0.03806179
# R-squared Comparison for OLS
r2_comparison <- data.frame(
  Model = c("OLS All", "OLS Without Large Firm"),
  R_squared = c(ols_all_summary$r.squared, ols_no_large_summary$r.squared)
)

print("R-squared Comparison (OLS):")
## [1] "R-squared Comparison (OLS):"
print(r2_comparison)
##                    Model R_squared
## 1                OLS All 0.1904787
## 2 OLS Without Large Firm 0.1912056
# Answer for (iii):
# OLS shows larger changes in coefficients and R-squared when excluding the largest firm,
# demonstrating its sensitivity to outliers.
# LAD exhibits smaller changes, confirming that it is more robust to outliers.
# Based on these findings, LAD is more resilient to outliers than OLS.

Chapter10

#1 #5

# Create the 'time' variable based on 't' (if not already created)
hseinv$time <- hseinv$t

# Create quarterly dummy variables based on the 'time' variable
hseinv$Q1 <- ifelse((hseinv$time %% 4) == 1, 1, 0)
hseinv$Q2 <- ifelse((hseinv$time %% 4) == 2, 1, 0)
hseinv$Q3 <- ifelse((hseinv$time %% 4) == 3, 1, 0)
hseinv$Q4 <- ifelse((hseinv$time %% 4) == 0, 1, 0)

# Model specification
model_housing <- lm(inv ~ price + linv + time + Q1 + Q2 + Q3 + Q4, data = hseinv)

# Summary of the model
summary(model_housing)
## 
## Call:
## lm(formula = inv ~ price + linv + time + Q1 + Q2 + Q3 + Q4, data = hseinv)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5199  -3055  -1446   1676  14248 
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1193475.8    66292.1 -18.003   <2e-16 ***
## price          24945.3    24204.9   1.031    0.310    
## linv          111177.9     5407.4  20.560   <2e-16 ***
## time            -251.3      174.5  -1.440    0.159    
## Q1               672.1     2092.5   0.321    0.750    
## Q2               599.0     2101.1   0.285    0.777    
## Q3               504.1     2142.7   0.235    0.815    
## Q4                  NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4779 on 35 degrees of freedom
## Multiple R-squared:  0.9801, Adjusted R-squared:  0.9767 
## F-statistic: 287.4 on 6 and 35 DF,  p-value: < 2.2e-16

C1

# Load the intdef dataset
data("intdef")

# Define a dummy variable for years after 1979
intdef$dummy <- ifelse(intdef$year > 1979, 1, 0)

# Model specification: Interest rate as a function of the dummy variable
model_policy_shift <- lm(i3 ~ dummy, data = intdef)

# Summary of the model
summary(model_policy_shift)
## 
## Call:
## lm(formula = i3 ~ dummy, data = intdef)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1979 -1.9449 -0.4569  1.3616  7.8121 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.9259     0.4692   8.367 2.53e-11 ***
## dummy         2.2920     0.7167   3.198  0.00232 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.654 on 54 degrees of freedom
## Multiple R-squared:  0.1592, Adjusted R-squared:  0.1437 
## F-statistic: 10.23 on 1 and 54 DF,  p-value: 0.002317
# Interpretation: Look at the coefficient of the dummy variable (impact after 1979).

C9

# Load the volat dataset
data("volat")

# Dependent variable: rsp500 (S&P 500 monthly returns)
# Explanatory variables: pcip (percentage change in industrial production) and i3 (return on 3-month T-bills)

# Model specification
model_rsp500 <- lm(rsp500 ~ pcip + i3, data = volat)

# Summary of the model
summary(model_rsp500)
## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637
# (iii) Statistical significance: Check p-values of pcip and i3.
# (iv) Predictability: Evaluate if the significant predictors imply stock returns are predictable.

Chapter11

C7

# Load required packages
library(wooldridge)
library(dplyr)

# Load the 'consump' dataset
data("consump", package = "wooldridge")

# (i) Test the PIH hypothesis: gc = β0 + β1 * gc_1 + ut
model1 <- lm(gc ~ gc_1, data = consump, na.action = na.omit)
cat("C7 - Model 1 Summary:\n")
## C7 - Model 1 Summary:
summary(model1)
## 
## Call:
## lm(formula = gc ~ gc_1, data = consump, na.action = na.omit)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027878 -0.005974 -0.001450  0.007142  0.020227 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept) 0.011431   0.003778   3.026  0.00478 **
## gc_1        0.446133   0.156047   2.859  0.00731 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01161 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1985, Adjusted R-squared:  0.1742 
## F-statistic: 8.174 on 1 and 33 DF,  p-value: 0.007311
# (ii) Add additional variables (gc_1, gy_1, i3, inf): 
# gc = β0 + β1 * gc_1 + β2 * gy_1 + β3 * i3 + β4 * inf + ut
model2 <- lm(gc ~ gc_1 + gy_1 + i3 + inf, data = consump, na.action = na.omit)
cat("\nC7 - Model 2 Summary:\n")
## 
## C7 - Model 2 Summary:
summary(model2)
## 
## Call:
## lm(formula = gc ~ gc_1 + gy_1 + i3 + inf, data = consump, na.action = na.omit)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0262462 -0.0076662  0.0009066  0.0063953  0.0174975 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.007e-02  5.839e-03   3.438  0.00174 **
## gc_1         5.398e-01  2.604e-01   2.073  0.04689 * 
## gy_1        -1.017e-01  1.792e-01  -0.567  0.57463   
## i3          -2.523e-05  1.028e-03  -0.025  0.98059   
## inf         -1.701e-03  8.825e-04  -1.928  0.06341 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01069 on 30 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3819, Adjusted R-squared:  0.2995 
## F-statistic: 4.635 on 4 and 30 DF,  p-value: 0.004938
# Perform joint significance test using ANOVA
cat("\nC7 - ANOVA to compare Model 1 and Model 2:\n")
## 
## C7 - ANOVA to compare Model 1 and Model 2:
anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: gc ~ gc_1
## Model 2: gc ~ gc_1 + gy_1 + i3 + inf
##   Res.Df       RSS Df Sum of Sq      F  Pr(>F)  
## 1     33 0.0044447                              
## 2     30 0.0034276  3 0.0010171 2.9675 0.04767 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iii) Check the p-value for gc_1 in the new model
cat("\nC7 - P-Value for gc_1 in Model 2:\n")
## 
## C7 - P-Value for gc_1 in Model 2:
summary(model2)$coefficients
##                  Estimate   Std. Error    t value    Pr(>|t|)
## (Intercept)  0.0200712603 0.0058386779  3.4376379 0.001741936
## gc_1         0.5397639068 0.2604192372  2.0726729 0.046890481
## gy_1        -0.1016586120 0.1791503385 -0.5674486 0.574631669
## i3          -0.0000252261 0.0010282141 -0.0245339 0.980589214
## inf         -0.0017011440 0.0008824782 -1.9276895 0.063408242
# (iv) F-statistic and p-value for joint significance in Model 2
cat("\nC7 - F-statistic and p-value for joint significance in Model 2:\n")
## 
## C7 - F-statistic and p-value for joint significance in Model 2:
anova(model2)
## Analysis of Variance Table
## 
## Response: gc
##           Df    Sum Sq    Mean Sq F value   Pr(>F)   
## gc_1       1 0.0011009 0.00110089  9.6356 0.004141 **
## gy_1       1 0.0000647 0.00006471  0.5664 0.457558   
## i3         1 0.0005279 0.00052785  4.6201 0.039787 * 
## inf        1 0.0004246 0.00042456  3.7160 0.063408 . 
## Residuals 30 0.0034276 0.00011425                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

C12

# Load required packages
library(wooldridge)
library(dplyr)

# Load the 'minwage' dataset
data("minwage", package = "wooldridge")

# Handle missing values
minwage_clean <- na.omit(minwage)  # Remove rows with missing values

# Create lagged variables (gwage232_1 and gemp232_1)
minwage_clean <- minwage_clean %>%
  mutate(
    gwage232_1 = lag(gwage232),  # Lagged wage growth
    gemp232_1 = lag(gemp232)    # Lagged employment growth
  )

# (i) Check for autocorrelation in gwage232
acf(minwage_clean$gwage232, main = "Autocorrelation of gwage232", na.action = na.pass)

# (ii) Estimate the dynamic model
# gwage232 = β0 + β1 * gwage232_1 + β2 * gmwage + β3 * gcpi + ut
model_dynamic <- lm(gwage232 ~ gwage232_1 + gmwage + gcpi, data = minwage_clean, na.action = na.omit)
cat("\nC12 - Dynamic Model Summary:\n")
## 
## C12 - Dynamic Model Summary:
summary(model_dynamic)
## 
## Call:
## lm(formula = gwage232 ~ gwage232_1 + gmwage + gcpi, data = minwage_clean, 
##     na.action = na.omit)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044651 -0.004120 -0.001272  0.004487  0.041568 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.002371   0.000433   5.475 6.45e-08 ***
## gwage232_1  -0.068681   0.034467  -1.993  0.04676 *  
## gmwage       0.151749   0.009519  15.941  < 2e-16 ***
## gcpi         0.257423   0.086571   2.974  0.00306 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007781 on 594 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3067, Adjusted R-squared:  0.3032 
## F-statistic: 87.58 on 3 and 594 DF,  p-value: < 2.2e-16
# (iii) Add lagged employment growth (gemp232_1) to the equation
# gwage232 = β0 + β1 * gwage232_1 + β2 * gmwage + β3 * gcpi + β4 * gemp232_1 + ut
model_dynamic_lagged <- lm(gwage232 ~ gwage232_1 + gmwage + gcpi + gemp232_1, data = minwage_clean, na.action = na.omit)
cat("\nC12 - Dynamic Model with Lagged Employment Summary:\n")
## 
## C12 - Dynamic Model with Lagged Employment Summary:
summary(model_dynamic_lagged)
## 
## Call:
## lm(formula = gwage232 ~ gwage232_1 + gmwage + gcpi + gemp232_1, 
##     data = minwage_clean, na.action = na.omit)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043902 -0.004319 -0.000965  0.004265  0.042429 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0023987  0.0004289   5.593 3.41e-08 ***
## gwage232_1  -0.0657751  0.0341411  -1.927 0.054512 .  
## gmwage       0.1525456  0.0094293  16.178  < 2e-16 ***
## gcpi         0.2531434  0.0857361   2.953 0.003276 ** 
## gemp232_1    0.0606449  0.0169864   3.570 0.000386 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007705 on 593 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3213, Adjusted R-squared:  0.3167 
## F-statistic: 70.17 on 4 and 593 DF,  p-value: < 2.2e-16
# (iv) Compare models to check the effect on gmwage coefficient
cat("\nC12 - Comparison of gmwage Coefficient:\n")
## 
## C12 - Comparison of gmwage Coefficient:
cat("Without lagged employment:\n")
## Without lagged employment:
summary(model_dynamic)$coefficients["gmwage", ]
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 1.517485e-01 9.519372e-03 1.594102e+01 6.890581e-48
cat("With lagged employment:\n")
## With lagged employment:
summary(model_dynamic_lagged)$coefficients["gmwage", ]
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 1.525456e-01 9.429267e-03 1.617789e+01 4.953489e-49
# (v) Run regression of gmwage on gwage232_1 and gemp232_1, and report R-squared
model_gmwage <- lm(gmwage ~ gwage232_1 + gemp232_1, data = minwage_clean, na.action = na.omit)
cat("\nC12 - Regression of gmwage Summary:\n")
## 
## C12 - Regression of gmwage Summary:
summary(model_gmwage)
## 
## Call:
## lm(formula = gmwage ~ gwage232_1 + gemp232_1, data = minwage_clean, 
##     na.action = na.omit)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.01984 -0.00511 -0.00385 -0.00290  0.62190 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.003491   0.001469   2.376   0.0178 *
## gwage232_1   0.211609   0.147152   1.438   0.1510  
## gemp232_1   -0.042848   0.073827  -0.580   0.5619  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0335 on 595 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.004094,   Adjusted R-squared:  0.0007463 
## F-statistic: 1.223 on 2 and 595 DF,  p-value: 0.2951
# Report R-squared
cat("\nC12 - R-squared for Regression of gmwage:\n")
## 
## C12 - R-squared for Regression of gmwage:
summary(model_gmwage)$r.squared
## [1] 0.00409388

Chapter12

C11

# Load required packages
library(wooldridge)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(dplyr)

# Load the 'nyse' dataset
data("nyse", package = "wooldridge")

# Remove rows with missing data
nyse_clean <- nyse %>% na.omit()

# Create the lagged variable returnt_1, and remove the first row with NA
nyse_clean <- nyse_clean %>%
  mutate(returnt_1 = lag(return, n = 1)) %>%
  na.omit()  # Remove the row with NA due to the lag

# Fit the OLS model
model_ols <- lm(return ~ returnt_1, data = nyse_clean)

# Obtain the squared residuals (ut^2)
nyse_clean <- nyse_clean %>%
  mutate(ut_sq = resid(model_ols)^2)

# Summary of the OLS model
cat("\nC11 (i) - OLS Model Summary:\n")
## 
## C11 (i) - OLS Model Summary:
summary(model_ols)
## 
## Call:
## lm(formula = return ~ returnt_1, data = nyse_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2633  -1.2997   0.1011   1.3203   8.0688 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.17862    0.08084   2.210   0.0275 *
## returnt_1    0.05806    0.03811   1.523   0.1281  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.112 on 686 degrees of freedom
## Multiple R-squared:  0.003372,   Adjusted R-squared:  0.001919 
## F-statistic: 2.321 on 1 and 686 DF,  p-value: 0.1281
# Compute average, minimum, and maximum of ut^2
cat("\nAverage of squared residuals:\n", mean(nyse_clean$ut_sq, na.rm = TRUE))
## 
## Average of squared residuals:
##  4.446345
cat("\nMinimum of squared residuals:\n", min(nyse_clean$ut_sq, na.rm = TRUE))
## 
## Minimum of squared residuals:
##  3.591354e-06
cat("\nMaximum of squared residuals:\n", max(nyse_clean$ut_sq, na.rm = TRUE))
## 
## Maximum of squared residuals:
##  232.9687
# (ii) Fit the model of heteroskedasticity:
# Var(ut | returnt-1) = δ0 + δ1 * returnt_1 + δ2 * returnt_1^2
nyse_clean <- nyse_clean %>%
  mutate(returnt_1_sq = returnt_1^2)

model_arch <- lm(ut_sq ~ returnt_1 + returnt_1_sq, data = nyse_clean)

cat("\nC11 (ii) - ARCH Model Summary:\n")
## 
## C11 (ii) - ARCH Model Summary:
summary(model_arch)
## 
## Call:
## lm(formula = ut_sq ~ returnt_1 + returnt_1_sq, data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.545  -3.021  -1.974   0.695 221.544 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.25771    0.44131   7.382 4.54e-13 ***
## returnt_1    -0.78556    0.19626  -4.003 6.95e-05 ***
## returnt_1_sq  0.29749    0.03558   8.361 3.46e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.67 on 685 degrees of freedom
## Multiple R-squared:  0.1305, Adjusted R-squared:  0.128 
## F-statistic: 51.41 on 2 and 685 DF,  p-value: < 2.2e-16
# Report coefficients, standard errors, R-squared, and adjusted R-squared
cat("\nCoefficients of ARCH model:\n")
## 
## Coefficients of ARCH model:
print(coef(model_arch))
##  (Intercept)    returnt_1 returnt_1_sq 
##    3.2577065   -0.7855573    0.2974907
cat("\nStandard Errors of ARCH model:\n")
## 
## Standard Errors of ARCH model:
print(summary(model_arch)$coefficients[, 2])
##  (Intercept)    returnt_1 returnt_1_sq 
##    0.4413140    0.1962611    0.0355787
cat("\nR-squared of ARCH model:\n", summary(model_arch)$r.squared)
## 
## R-squared of ARCH model:
##  0.1305106
cat("\nAdjusted R-squared of ARCH model:\n", summary(model_arch)$adj.r.squared)
## 
## Adjusted R-squared of ARCH model:
##  0.127972
# (iii) Conditional variance as a function of returnt-1
# Conditional variance: δ0 + δ1 * returnt-1 + δ2 * returnt-1^2
nyse_clean <- nyse_clean %>%
  mutate(conditional_variance = coef(model_arch)[1] +
                                coef(model_arch)[2] * returnt_1 +
                                coef(model_arch)[3] * returnt_1_sq)

# Find the smallest variance and the corresponding returnt-1
smallest_variance <- nyse_clean %>%
  filter(conditional_variance == min(conditional_variance, na.rm = TRUE))

cat("\nSmallest variance and corresponding returnt_1:\n")
## 
## Smallest variance and corresponding returnt_1:
print(smallest_variance)
##   price   return return_1   t price_1 price_2    cprice  cprice_1 returnt_1
## 1 56.83 1.572835 1.321984 168   55.95   55.22 0.8800011 0.7299995  1.321984
##      ut_sq returnt_1_sq conditional_variance
## 1 1.735706     1.747642             2.739119
# (iv) Check if the model produces negative variance
cat("\nNegative variances produced? ", any(nyse_clean$conditional_variance < 0, na.rm = TRUE))
## 
## Negative variances produced?  FALSE
# (v) Compare to ARCH(1) model
# Fit an ARCH(1) model using 'garch()' from the 'tseries' package
arch1_model <- garch(nyse_clean$return, order = c(0, 1), trace = FALSE)
cat("\nC11 (v) - ARCH(1) Model Summary:\n")
## 
## C11 (v) - ARCH(1) Model Summary:
summary(arch1_model)
## 
## Call:
## garch(x = nyse_clean$return, order = c(0, 1), trace = FALSE)
## 
## Model:
## GARCH(0,1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5451 -0.5314  0.1544  0.7527  3.3353 
## 
## Coefficient(s):
##     Estimate  Std. Error  t value Pr(>|t|)    
## a0    3.2615      0.2404   13.566  < 2e-16 ***
## a1    0.2623      0.0330    7.949 1.78e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Diagnostic Tests:
##  Jarque Bera Test
## 
## data:  Residuals
## X-squared = 76.247, df = 2, p-value < 2.2e-16
## 
## 
##  Box-Ljung test
## 
## data:  Squared.Residuals
## X-squared = 0.188, df = 1, p-value = 0.6646
# (vi) Add the second lag (ARCH(2) model)
arch2_model <- garch(nyse_clean$return, order = c(0, 2), trace = FALSE)
cat("\nC11 (vi) - ARCH(2) Model Summary:\n")
## 
## C11 (vi) - ARCH(2) Model Summary:
summary(arch2_model)
## 
## Call:
## garch(x = nyse_clean$return, order = c(0, 2), trace = FALSE)
## 
## Model:
## GARCH(0,2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.5890 -0.5387  0.1526  0.7527  3.3325 
## 
## Coefficient(s):
##     Estimate  Std. Error  t value Pr(>|t|)    
## a0   2.97841     0.28199   10.562  < 2e-16 ***
## a1   0.26885     0.03258    8.253 2.22e-16 ***
## a2   0.06260     0.05377    1.164    0.244    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Diagnostic Tests:
##  Jarque Bera Test
## 
## data:  Residuals
## X-squared = 80.464, df = 2, p-value < 2.2e-16
## 
## 
##  Box-Ljung test
## 
## data:  Squared.Residuals
## X-squared = 0.2925, df = 1, p-value = 0.5886
# Compare ARCH(1) and ARCH(2)
cat("\nComparison of ARCH(1) and ARCH(2) models:\n")
## 
## Comparison of ARCH(1) and ARCH(2) models:
cat("AIC of ARCH(1):", AIC(arch1_model), "\n")
## AIC of ARCH(1): 2929.782
cat("AIC of ARCH(2):", AIC(arch2_model), "\n")
## AIC of ARCH(2): 2921.784