library(wooldridge)
library(car)
## Loading required package: carData
data("sleep75")
sleep_model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(sleep_model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## I(age^2) 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
# (i) Is there evidence that men sleep more than women?
coef_male <- coef(summary(sleep_model))["male", "Estimate"]
p_value_male <- coef(summary(sleep_model))["male", "Pr(>|t|)"]
cat("Coefficient for 'male':", coef_male, "\n")
## Coefficient for 'male': 87.75243
cat("p-value for 'male':", p_value_male, "\n")
## p-value for 'male': 0.01078518
if (p_value_male < 0.05) {
cat("Conclusion: There is significant evidence that men sleep more (or less) than women.\n")
} else {
cat("Conclusion: There is no significant evidence that men sleep more (or less) than women.\n")
}
## Conclusion: There is significant evidence that men sleep more (or less) than women.
# (ii) Tradeoff between working and sleeping
coef_totwrk <- coef(summary(sleep_model))["totwrk", "Estimate"]
cat("Coefficient for 'totwrk':", coef_totwrk, "\n")
## Coefficient for 'totwrk': -0.1634232
cat("Interpretation: Each additional minute of work reduces sleep by", abs(coef_totwrk), "minutes.\n")
## Interpretation: Each additional minute of work reduces sleep by 0.1634232 minutes.
# (iii) Test whether age has a significant effect on sleep
sleep_model_no_age <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova(sleep_model_no_age, sleep_model)
## Analysis of Variance Table
##
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 702 122631662
## 2 700 122147777 2 483885 1.3865 0.2506
data("gpa2")
sat_model <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)
summary(sat_model)
##
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black,
## data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## I(hsize^2) -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## female:black 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
# (i) Is hsize^2 important in the model?
beta1 <- coef(sat_model)["hsize"]
beta2 <- coef(sat_model)["I(hsize^2)"]
optimal_hsize <- -beta1 / (2 * beta2)
cat("Optimal high school size:", optimal_hsize, "\n")
## Optimal high school size: 4.39603
# (ii) SAT score difference between nonblack females and nonblack males
coef_female <- coef(summary(sat_model))["female", "Estimate"]
p_value_female <- coef(summary(sat_model))["female", "Pr(>|t|)"]
cat("Coefficient for 'female' (SAT score difference between nonblack females and nonblack males):", coef_female, "\n")
## Coefficient for 'female' (SAT score difference between nonblack females and nonblack males): -45.09145
cat("p-value for 'female':", p_value_female, "\n")
## p-value for 'female': 1.656301e-25
if (p_value_female < 0.05) {
cat("Conclusion: There is significant evidence of a difference in SAT scores between nonblack females and nonblack males.\n")
} else {
cat("Conclusion: There is no significant evidence of a difference in SAT scores between nonblack females and nonblack males.\n")
}
## Conclusion: There is significant evidence of a difference in SAT scores between nonblack females and nonblack males.
# (iii) SAT score difference between nonblack males and black males
linearHypothesis(sat_model, "black = 0")
##
## Linear hypothesis test:
## black = 0
##
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4132 76652652
## 2 4131 73479121 1 3173531 178.42 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iv) SAT score difference between black females and nonblack females
linearHypothesis(sat_model, "black + female:black = 0")
##
## Linear hypothesis test:
## black + female:black = 0
##
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4132 74700518
## 2 4131 73479121 1 1221397 68.667 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
data("gpa1")
# (i) Add mothcoll and fathcoll to the model and report results
lm1 <- lm(colGPA ~ hsGPA + ACT + mothcoll + fathcoll, data = gpa1)
summary(lm1)
##
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + mothcoll + fathcoll, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81126 -0.24179 -0.03234 0.27047 0.84316
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.268408 0.342297 3.706 0.000306 ***
## hsGPA 0.456839 0.096196 4.749 5.12e-06 ***
## ACT 0.007929 0.010898 0.728 0.468119
## mothcoll 0.016244 0.061009 0.266 0.790441
## fathcoll 0.057430 0.062233 0.923 0.357739
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3413 on 136 degrees of freedom
## Multiple R-squared: 0.1837, Adjusted R-squared: 0.1597
## F-statistic: 7.65 on 4 and 136 DF, p-value: 1.371e-05
# (ii) Test for joint significance of mothcoll and fathcoll
lm_reduced <- lm(colGPA ~ hsGPA + ACT, data = gpa1)
anova(lm_reduced, lm1)
## Analysis of Variance Table
##
## Model 1: colGPA ~ hsGPA + ACT
## Model 2: colGPA ~ hsGPA + ACT + mothcoll + fathcoll
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 138 15.982
## 2 136 15.842 2 0.14061 0.6036 0.5483
# (iii) Add hsGPA^2 to the model and evaluate whether this generalization is needed
gpa1$hsGPA_sq <- gpa1$hsGPA^2
lm2 <- lm(colGPA ~ hsGPA + hsGPA_sq + ACT + mothcoll + fathcoll, data = gpa1)
summary(lm2)
##
## Call:
## lm(formula = colGPA ~ hsGPA + hsGPA_sq + ACT + mothcoll + fathcoll,
## data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.77587 -0.23343 -0.02708 0.27998 0.80816
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.767777 2.465721 2.339 0.0208 *
## hsGPA -2.222512 1.457477 -1.525 0.1296
## hsGPA_sq 0.401135 0.217737 1.842 0.0676 .
## ACT 0.004417 0.010971 0.403 0.6879
## mothcoll 0.022601 0.060577 0.373 0.7097
## fathcoll 0.080959 0.063001 1.285 0.2010
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3383 on 135 degrees of freedom
## Multiple R-squared: 0.2037, Adjusted R-squared: 0.1742
## F-statistic: 6.906 on 5 and 135 DF, p-value: 9.025e-06
data("wage2")
# (i) Estimate the log-linear wage model
model1 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model1)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
#(ii) Add exper^2 and tenure^2 to the model
wage2$exper_sq <- wage2$exper^2
wage2$tenure_sq <- wage2$tenure^2
model2 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + exper_sq + tenure_sq, data = wage2)
summary(model2)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + exper_sq + tenure_sq, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## exper_sq -0.0001138 0.0005319 -0.214 0.830622
## tenure_sq -0.0007964 0.0004710 -1.691 0.091188 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south +
## urban
## Model 2: log(wage) ~ educ + exper + tenure + married + black + south +
## urban + exper_sq + tenure_sq
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
#(iii) Allow the return to education to depend on race
model3 <- lm(log(wage) ~ educ * black + exper + tenure + married + black + south + urban, data = wage2)
summary(model3)
##
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married +
## black + south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## black 0.094809 0.255399 0.371 0.710561
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
#(iv)Allow wages to differ across four groups: married black, married nonblack, single black, and single nonblack
model4 <- lm(log(wage) ~ educ + exper + tenure + married * black + south + urban, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married * black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## married 0.188915 0.042878 4.406 1.18e-05 ***
## black -0.240820 0.096023 -2.508 0.012314 *
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## married:black 0.061354 0.103275 0.594 0.552602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
data("smoke")
ols_model <- lm(cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(ols_model)
##
## Call:
## lm(formula = cigs ~ cigpric + log(income) + educ + age + I(age^2) +
## restaurn + white, data = smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.758 -9.336 -5.895 7.927 70.267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.782521 8.852068 -0.653 0.51379
## cigpric -0.005724 0.101064 -0.057 0.95485
## log(income) 0.865360 0.728766 1.187 0.23541
## educ -0.501908 0.167169 -3.002 0.00276 **
## age 0.774357 0.160516 4.824 1.68e-06 ***
## I(age^2) -0.009068 0.001748 -5.187 2.71e-07 ***
## restaurn -2.880176 1.116067 -2.581 0.01004 *
## white -0.553286 1.459490 -0.379 0.70472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05289, Adjusted R-squared: 0.04459
## F-statistic: 6.374 on 7 and 799 DF, p-value: 2.609e-07
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
bptest(ols_model)
##
## studentized Breusch-Pagan test
##
## data: ols_model
## BP = 32.431, df = 7, p-value = 3.379e-05
# (i) The OLS estimators are consistent in the presence of heteroskedasticity.
coeftest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.7825205 8.8831245 -0.6510 0.515262
## cigpric -0.0057240 0.1062869 -0.0539 0.957064
## log(income) 0.8653600 0.5985769 1.4457 0.148655
## educ -0.5019084 0.1624052 -3.0905 0.002068 **
## age 0.7743569 0.1380708 5.6084 2.814e-08 ***
## I(age^2) -0.0090678 0.0014596 -6.2127 8.375e-10 ***
## restaurn -2.8801757 1.0155265 -2.8361 0.004682 **
## white -0.5532861 1.3782955 -0.4014 0.688213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (ii) The usual F-statistic may no longer have an F-distribution.
waldtest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))
## Wald test
##
## Model 1: cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn +
## white
## Model 2: cigs ~ 1
## Res.Df Df F Pr(>F)
## 1 799
## 2 806 -7 9.3617 3.49e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iii) OLS estimators remain unbiased, but they are no longer BLUE (Best Linear Unbiased Estimators) because the assumption of homoskedasticity is violated.
usual_se <- summary(ols_model)$coefficients[, "Std. Error"]
robust_se <- sqrt(diag(vcovHC(ols_model, type = "HC1")))
comparison <- data.frame(Variable = names(coef(ols_model)), Usual_SE = usual_se, Robust_SE = robust_se)
print(comparison)
## Variable Usual_SE Robust_SE
## (Intercept) (Intercept) 8.852067639 8.883124494
## cigpric cigpric 0.101063853 0.106286855
## log(income) log(income) 0.728765519 0.598576892
## educ educ 0.167169257 0.162405199
## age age 0.160516185 0.138070842
## I(age^2) I(age^2) 0.001748065 0.001459558
## restaurn restaurn 1.116067336 1.015526517
## white white 1.459489762 1.378295460
data("smoke")
lpm <- lm(cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(lpm)
##
## Call:
## lm(formula = cigs ~ cigpric + log(income) + educ + age + I(age^2) +
## restaurn + white, data = smoke)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.758 -9.336 -5.895 7.927 70.267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.782521 8.852068 -0.653 0.51379
## cigpric -0.005724 0.101064 -0.057 0.95485
## log(income) 0.865360 0.728766 1.187 0.23541
## educ -0.501908 0.167169 -3.002 0.00276 **
## age 0.774357 0.160516 4.824 1.68e-06 ***
## I(age^2) -0.009068 0.001748 -5.187 2.71e-07 ***
## restaurn -2.880176 1.116067 -2.581 0.01004 *
## white -0.553286 1.459490 -0.379 0.70472
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05289, Adjusted R-squared: 0.04459
## F-statistic: 6.374 on 7 and 799 DF, p-value: 2.609e-07
# (i) Compare usual standard errors with heteroskedasticity-robust standard errors
library(lmtest)
library(sandwich)
coeftest(lpm, vcov = vcovHC(lpm, type = "HC1"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.7825205 8.8831245 -0.6510 0.515262
## cigpric -0.0057240 0.1062869 -0.0539 0.957064
## log(income) 0.8653600 0.5985769 1.4457 0.148655
## educ -0.5019084 0.1624052 -3.0905 0.002068 **
## age 0.7743569 0.1380708 5.6084 2.814e-08 ***
## I(age^2) -0.0090678 0.0014596 -6.2127 8.375e-10 ***
## restaurn -2.8801757 1.0155265 -2.8361 0.004682 **
## white -0.5532861 1.3782955 -0.4014 0.688213
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (ii) Calculate the change in the probability of smoking if education increases by 4 years
delta_educ <- 4
coef_educ <- coef(lpm)["educ"]
change_in_prob <- delta_educ * coef_educ
cat("Change in probability of smoking for a 4-year increase in education:", change_in_prob, "\n")
## Change in probability of smoking for a 4-year increase in education: -2.007634
# (iii) Find the age at which the probability of smoking starts to decrease
age_turning_point <- -coef(lpm)["age"] / (2 * coef(lpm)["I(age^2)"])
cat("Turning point for age (years):", age_turning_point, "\n")
## Turning point for age (years): 42.69818
# (iv) Interpret the coefficient on restaurn
cat("The coefficient on restaurn is", coef(lpm)["restaurn"], ". This indicates the change in the probability of smoking if a person lives in a state with restaurant smoking restrictions.\n")
## The coefficient on restaurn is -2.880176 . This indicates the change in the probability of smoking if a person lives in a state with restaurant smoking restrictions.
# (v) Predict the probability of smoking for person 206
person_206 <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77, restaurn = 0, white = 0)
predicted_prob <- predict(lpm, newdata = person_206)
cat("Predicted probability of smoking for person 206:", predicted_prob, "\n")
## Predicted probability of smoking for person 206: -0.7390957
data("vote1")
# (i) Estimate the model with OLS
vote_model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(vote_model)
##
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB),
## data = vote1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.66141 4.73604 7.952 2.56e-13 ***
## prtystrA 0.25192 0.07129 3.534 0.00053 ***
## democA 3.79294 1.40652 2.697 0.00772 **
## log(expendA) 5.77929 0.39182 14.750 < 2e-16 ***
## log(expendB) -6.23784 0.39746 -15.694 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 0.8012, Adjusted R-squared: 0.7964
## F-statistic: 169.2 on 4 and 168 DF, p-value: < 2.2e-16
residuals_u <- resid(vote_model)
vote1$residuals_u <- residuals_u
residuals_model <- lm(residuals_u ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(residuals_model)
##
## Call:
## lm(formula = residuals_u ~ prtystrA + democA + log(expendA) +
## log(expendB), data = vote1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.363e-16 4.736e+00 0 1
## prtystrA -8.313e-18 7.129e-02 0 1
## democA 3.928e-17 1.407e+00 0 1
## log(expendA) 2.053e-16 3.918e-01 0 1
## log(expendB) 0.000e+00 3.975e-01 0 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 1.198e-32, Adjusted R-squared: -0.02381
## F-statistic: 5.032e-31 on 4 and 168 DF, p-value: 1
cat("R-squared is 0 because the residuals are orthogonal to the fitted values by definition.\n")
## R-squared is 0 because the residuals are orthogonal to the fitted values by definition.
# (ii) Breusch-Pagan test for heteroskedasticity
bp_test <- bptest(vote_model)
cat("Breusch-Pagan Test:\n")
## Breusch-Pagan Test:
print(bp_test)
##
## studentized Breusch-Pagan test
##
## data: vote_model
## BP = 9.0934, df = 4, p-value = 0.05881
# (iii) White test for heteroskedasticity (F-statistic version)
white_test <- bptest(vote_model, ~ prtystrA + democA + log(expendA) + log(expendB) + I(prtystrA^2) + I(democA^2) + I(log(expendA)^2) + I(log(expendB)^2), data = vote1)
cat("White Test (F-statistic version):\n")
## White Test (F-statistic version):
print(white_test)
##
## studentized Breusch-Pagan test
##
## data: vote_model
## BP = 19.581, df = 7, p-value = 0.00655
data("fertil2")
# (i) Estimate the model
children_model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
summary(children_model)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
robust_se <- coeftest(children_model, vcov = vcovHC(children_model, type = "HC1"))
cat("Heteroskedasticity-Robust Standard Errors:\n")
## Heteroskedasticity-Robust Standard Errors:
print(robust_se)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22251623 0.24385099 -17.3160 < 2.2e-16 ***
## age 0.34092552 0.01917466 17.7800 < 2.2e-16 ***
## I(age^2) -0.00274121 0.00035051 -7.8206 6.549e-15 ***
## educ -0.07523232 0.00630771 -11.9270 < 2.2e-16 ***
## electric -0.31004041 0.06394815 -4.8483 1.289e-06 ***
## urban -0.20003386 0.04547093 -4.3992 1.113e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
cat("Are the robust standard errors larger? Check manually from the output.\n")
## Are the robust standard errors larger? Check manually from the output.
# (ii) Add religious dummy variables
# Revised model using only available religion dummies
religion_model <- lm(children ~ age + I(age^2) + educ + electric + urban + protest + catholic, data = fertil2)
summary(religion_model)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban +
## protest + catholic, data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9035 -0.7162 -0.0054 0.7162 7.4447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2197793 0.2405453 -17.543 < 2e-16 ***
## age 0.3408098 0.0165234 20.626 < 2e-16 ***
## I(age^2) -0.0027390 0.0002722 -10.061 < 2e-16 ***
## educ -0.0752322 0.0064530 -11.659 < 2e-16 ***
## electric -0.3106840 0.0690382 -4.500 6.97e-06 ***
## urban -0.2012432 0.0466070 -4.318 1.61e-05 ***
## protest -0.0148806 0.0545003 -0.273 0.785
## catholic 0.0266610 0.0752882 0.354 0.723
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4350 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5727
## F-statistic: 835.3 on 7 and 4350 DF, p-value: < 2.2e-16
linearHypothesis(religion_model, c("protest = 0", "catholic = 0"))
##
## Linear hypothesis test:
## protest = 0
## catholic = 0
##
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + protest +
## catholic
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4352 9176.4
## 2 4350 9175.9 2 0.53328 0.1264 0.8813
# (iii) Fit residuals regression for heteroskedasticity
fitted_vals <- fitted(religion_model)
residuals_squared <- residuals(religion_model)^2
residuals_model <- lm(residuals_squared ~ fitted_vals + I(fitted_vals^2))
summary(residuals_model)
##
## Call:
## lm(formula = residuals_squared ~ fitted_vals + I(fitted_vals^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.747 -1.278 -0.300 0.339 47.684
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.31003 0.11168 2.776 0.00552 **
## fitted_vals -0.14880 0.10204 -1.458 0.14483
## I(fitted_vals^2) 0.26755 0.01965 13.614 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.642 on 4355 degrees of freedom
## Multiple R-squared: 0.2499, Adjusted R-squared: 0.2495
## F-statistic: 725.4 on 2 and 4355 DF, p-value: < 2.2e-16
# Test for heteroskedasticity
cat("Check joint significance in residuals regression. If F-test p-value < 0.05, heteroskedasticity is present.\n")
## Check joint significance in residuals regression. If F-test p-value < 0.05, heteroskedasticity is present.
# (iv) Interpretation of practical importance
cat("Is heteroskedasticity practically important? Compare the size of robust vs. nonrobust SEs to decide.\n")
## Is heteroskedasticity practically important? Compare the size of robust vs. nonrobust SEs to decide.
data("ceosal2")
# Original model
model_original <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg, data = ceosal2)
summary(model_original)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg,
## data = ceosal2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.26981 -0.30174 -0.01638 0.30474 1.91129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.620690 0.254344 18.167 < 2e-16 ***
## log(sales) 0.158483 0.039814 3.981 0.000101 ***
## log(mktval) 0.112261 0.050393 2.228 0.027190 *
## profmarg -0.002259 0.002165 -1.043 0.298346
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5102 on 173 degrees of freedom
## Multiple R-squared: 0.3035, Adjusted R-squared: 0.2914
## F-statistic: 25.13 on 3 and 173 DF, p-value: 1.522e-13
# Expanded model including ceoten^2 and comten^2
model_extended <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + I(ceoten^2) + I(comten^2), data = ceosal2)
summary(model_extended)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## I(ceoten^2) + I(comten^2), data = ceosal2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47481 -0.25933 -0.00511 0.27010 2.07583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.612e+00 2.524e-01 18.276 < 2e-16 ***
## log(sales) 1.805e-01 4.021e-02 4.489 1.31e-05 ***
## log(mktval) 1.018e-01 4.988e-02 2.040 0.0429 *
## profmarg -2.077e-03 2.135e-03 -0.973 0.3321
## I(ceoten^2) 3.761e-04 1.916e-04 1.963 0.0512 .
## I(comten^2) -1.788e-04 7.236e-05 -2.471 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared: 0.3324, Adjusted R-squared: 0.3129
## F-statistic: 17.03 on 5 and 171 DF, p-value: 1.195e-13
cat("R-squared for original model:", summary(model_original)$r.squared, "\n")
## R-squared for original model: 0.3034944
cat("R-squared for extended model:", summary(model_extended)$r.squared, "\n")
## R-squared for extended model: 0.3323998
# Interpretation: Compare R-squared values and significance of ceoten^2 and comten^2.
No, it is likely endogenous because unobserved factors (like crime severity, college size, or safety policies) influence both reporting likelihood and the number of crimes. Ignoring this can bias the estimates.
data("infmrt")
# Add a dummy variable for District of Columbia (assuming DC is observation 51)
infmrt$dc <- ifelse(row.names(infmrt) == "51", 1, 0)
# (i) Estimate the model including the DC dummy variable
model_with_dc <- lm(log(infmort) ~ pcinc + physic + dc, data = infmrt)
summary(model_with_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic + dc, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43481 -0.10338 0.00447 0.09096 0.39050
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.456e+00 8.688e-02 28.264 < 2e-16 ***
## pcinc -3.078e-05 6.499e-06 -4.737 7.33e-06 ***
## physic 1.495e-03 2.729e-04 5.478 3.33e-07 ***
## dc -7.940e-02 1.664e-01 -0.477 0.634
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1629 on 98 degrees of freedom
## Multiple R-squared: 0.2537, Adjusted R-squared: 0.2309
## F-statistic: 11.1 on 3 and 98 DF, p-value: 2.444e-06
# (ii) Compare with the model without the DC dummy variable
model_without_dc <- lm(log(infmort) ~ pcinc + physic, data = infmrt)
summary(model_without_dc)
##
## Call:
## lm(formula = log(infmort) ~ pcinc + physic, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.43432 -0.10235 0.00626 0.09337 0.39064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.448e+00 8.502e-02 28.794 < 2e-16 ***
## pcinc -3.026e-05 6.378e-06 -4.743 7.07e-06 ***
## physic 1.487e-03 2.713e-04 5.480 3.24e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1623 on 99 degrees of freedom
## Multiple R-squared: 0.252, Adjusted R-squared: 0.2369
## F-statistic: 16.67 on 2 and 99 DF, p-value: 5.744e-07
# Compare both models
anova(model_without_dc, model_with_dc)
## Analysis of Variance Table
##
## Model 1: log(infmort) ~ pcinc + physic
## Model 2: log(infmort) ~ pcinc + physic + dc
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 99 2.6064
## 2 98 2.6004 1 0.0060446 0.2278 0.6342
library(quantreg)
## Loading required package: SparseM
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
data("rdchem")
rdchem <- rdchem %>%
mutate(sales_bil = sales / 1000)
# (i) OLS regression
ols_model <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
summary(ols_model)
##
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_bil 0.316611 0.138854 2.280 0.03041 *
## I(sales_bil^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
summary(rdchem$sales_bil)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0420 0.5078 1.3325 3.7970 2.8566 39.7090
rdchem_filtered <- filter(rdchem, sales_bil < 40)
ols_model_filtered <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_filtered)
summary(ols_model_filtered)
##
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_bil 0.316611 0.138854 2.280 0.03041 *
## I(sales_bil^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
# (ii) LAD regression
lad_model <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
summary(lad_model)
##
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_bil 0.26346 -0.13508 0.75753
## I(sales_bil^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
lad_model_filtered <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_filtered)
summary(lad_model_filtered)
##
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg,
## data = rdchem_filtered)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_bil 0.26346 -0.13508 0.75753
## I(sales_bil^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
# (iii) Compare OLS and LAD results
cat("OLS vs LAD Coefficients (with and without outlier):\n")
## OLS vs LAD Coefficients (with and without outlier):
cat("\nOLS with outlier:\n")
##
## OLS with outlier:
print(coef(ols_model))
## (Intercept) sales_bil I(sales_bil^2) profmarg
## 2.058966676 0.316610977 -0.007389624 0.053322172
cat("\nOLS without outlier:\n")
##
## OLS without outlier:
print(coef(ols_model_filtered))
## (Intercept) sales_bil I(sales_bil^2) profmarg
## 2.058966676 0.316610977 -0.007389624 0.053322172
cat("\nLAD with outlier:\n")
##
## LAD with outlier:
print(coef(lad_model))
## (Intercept) sales_bil I(sales_bil^2) profmarg
## 1.404279415 0.263463789 -0.006001127 0.114003661
cat("\nLAD without outlier:\n")
##
## LAD without outlier:
print(coef(lad_model_filtered))
## (Intercept) sales_bil I(sales_bil^2) profmarg
## 1.404279415 0.263463789 -0.006001127 0.114003661
Disagree: In time series data, observations are often autocorrelated (i.e., a value at time t depends on values at previous times). Assuming independence is inappropriate unless autocorrelation tests confirm no significant autocorrelation.
Agree: If the Gauss-Markov assumptions (linearity, no perfect multicollinearity, and zero conditional mean) hold, OLS will produce unbiased estimates. However, time series models require an additional assumption: no autocorrelation in errors.
Disagree:A trending variable can be used, but it’s essential to account for the trend by including a time trend or differencing the variable to ensure stationarity. Otherwise, regression results might suffer from spurious correlation.
Agree: Seasonality typically affects monthly or quarterly data (e.g., retail sales, temperature), but it is generally not an issue in annual time series.
# Example code to create quarterly dummies and time trend
# Create quarterly dummies and time trend
#housing_data <- housing_data %>%
# mutate(
# quarter = as.factor(quarter),
# time_trend = 1:n()
# )
# Run the regression model
#housing_model <- lm(hs ~ interest_rate + income + time_trend + quarter, data = housing_data)
#summary(housing_model)
library(dplyr)
data("intdef")
intdef <- intdef %>%
mutate(
post_1979 = ifelse(year > 1979, 1, 0) # Dummy variable: 1 if year > 1979, else 0
)
# Run the regression including the dummy variable
policy_model <- lm(i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
summary(policy_model)
##
## Call:
## lm(formula = i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.5802 -0.8353 0.1936 0.7983 4.0447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.39151 0.39028 3.565 0.000800 ***
## inf_1 0.56529 0.07144 7.913 1.99e-10 ***
## def_1 0.38269 0.11156 3.430 0.001203 **
## post_1979 1.77623 0.47268 3.758 0.000442 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.59 on 51 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.705, Adjusted R-squared: 0.6876
## F-statistic: 40.63 on 3 and 51 DF, p-value: 1.479e-13
cat("\nConclusion:\n")
##
## Conclusion:
cat("The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.\n")
## The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.
data("volat")
# (i) Discuss expected signs of coefficients
cat("Expected signs:\n")
## Expected signs:
cat("β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.\n")
## β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.
cat("β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.\n\n")
## β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.
# (ii) Estimate the equation by OLS
model_c9 <- lm(rsp500 ~ pcip + i3, data = volat)
summary(model_c9)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
cat("\nOLS Results:\n")
##
## OLS Results:
cat("The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.\n\n")
## The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.
# (iii)Determine which variables are statistically significant
p_values <- summary(model_c9)$coefficients[, 4]
significant <- p_values < 0.05
cat("P-values for each variable:\n")
## P-values for each variable:
print(p_values)
## (Intercept) pcip i3
## 1.444871e-08 7.784810e-01 1.207402e-02
cat("\nSignificance (TRUE = significant at 5% level):\n")
##
## Significance (TRUE = significant at 5% level):
print(significant)
## (Intercept) pcip i3
## TRUE FALSE TRUE
# (iv)Interpret predictability
if (all(significant[-1] == FALSE)) {
cat("\nThe variables pcip and i3 are NOT statistically significant.\n")
cat("This suggests that the return on the S&P 500 is NOT predictable using these variables.\n")
} else {
cat("\nAt least one variable is statistically significant.\n")
cat("This suggests that the return on the S&P 500 shows SOME level of predictability.\n")
}
##
## At least one variable is statistically significant.
## This suggests that the return on the S&P 500 shows SOME level of predictability.
data("consump")
# (i) Estimate g_ct = β0 + β1 * g_ct-1 + u_t and test the PIH
consump <- consump %>%
mutate(g_ct = log(c) - lag(log(c))) # g_ct = log(c_t) - log(c_t-1)
model_i <- lm(g_ct ~ lag(g_ct, 1), data = consump)
summary(model_i)
##
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1), data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027878 -0.005974 -0.001451 0.007142 0.020227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.011431 0.003778 3.026 0.00478 **
## lag(g_ct, 1) 0.446141 0.156047 2.859 0.00731 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01161 on 33 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1985, Adjusted R-squared: 0.1742
## F-statistic: 8.174 on 1 and 33 DF, p-value: 0.00731
# (ii) Add lagged income growth, interest rate, and inflation
consump <- consump %>%
mutate(gy_lag = lag(gy),
i3_lag = lag(i3),
inf_lag = lag(inf))
model_ii <- lm(g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag, data = consump)
summary(model_ii)
##
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag,
## data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0249093 -0.0075869 0.0000855 0.0087234 0.0188617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0225940 0.0070892 3.187 0.00335 **
## lag(g_ct, 1) 0.4335832 0.2896525 1.497 0.14487
## gy_lag -0.1079087 0.1946371 -0.554 0.58341
## i3_lag -0.0007467 0.0011107 -0.672 0.50654
## inf_lag -0.0008281 0.0010041 -0.825 0.41606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01134 on 30 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3038, Adjusted R-squared: 0.211
## F-statistic: 3.273 on 4 and 30 DF, p-value: 0.02431
# (iii) Compare p-value of lag(g_ct, 1) in model_i and model_ii
cat("\nPART (iii): P-values for lag(g_ct, 1)\n")
##
## PART (iii): P-values for lag(g_ct, 1)
cat("Model (i) p-value for lag(g_ct, 1):", summary(model_i)$coefficients[2, 4], "\n")
## Model (i) p-value for lag(g_ct, 1): 0.007310181
cat("Model (ii) p-value for lag(g_ct, 1):", summary(model_ii)$coefficients[2, 4], "\n")
## Model (ii) p-value for lag(g_ct, 1): 0.1448651
# Compare if the p-value for lag(g_ct, 1) changes to determine support for PIH
# (iv) F-statistic for joint significance of all variables
anova_test <- anova(model_i, model_ii)
cat("\nPART (iv): F-statistic for joint significance\n")
##
## PART (iv): F-statistic for joint significance
print(anova_test)
## Analysis of Variance Table
##
## Model 1: g_ct ~ lag(g_ct, 1)
## Model 2: g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 0.0044446
## 2 30 0.0038608 3 0.00058384 1.5122 0.2315
data("minwage")
# Create a lagged variable for gwage232
minwage$lag_gwage232 <- lag(minwage$gwage232)
# (i)
filtered_data <- minwage[complete.cases(minwage$gwage232, minwage$lag_gwage232), ]
acf(filtered_data$gwage232, lag.max = 1)
# (ii)
model1 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
summary(model1)
##
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044642 -0.004134 -0.001312 0.004482 0.041612
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0024003 0.0004308 5.572 3.79e-08 ***
## lag_gwage232 -0.0779092 0.0342851 -2.272 0.02341 *
## gmwage 0.1518459 0.0096485 15.738 < 2e-16 ***
## gcpi 0.2630876 0.0824457 3.191 0.00149 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007889 on 606 degrees of freedom
## Multiple R-squared: 0.2986, Adjusted R-squared: 0.2951
## F-statistic: 85.99 on 3 and 606 DF, p-value: < 2.2e-16
# (iii)
model2 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232), data = filtered_data)
summary(model2)
##
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232),
## data = filtered_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043786 -0.004384 -0.001022 0.004325 0.042554
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0024248 0.0004268 5.681 2.08e-08 ***
## lag_gwage232 -0.0756372 0.0339207 -2.230 0.026126 *
## gmwage 0.1526838 0.0095403 16.004 < 2e-16 ***
## gcpi 0.2652171 0.0826041 3.211 0.001394 **
## lag(gemp232) 0.0662916 0.0169639 3.908 0.000104 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007798 on 604 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.3167, Adjusted R-squared: 0.3122
## F-statistic: 69.98 on 4 and 604 DF, p-value: < 2.2e-16
# (iv)
coef(model1)["gmwage"]
## gmwage
## 0.1518459
coef(model2)["gmwage"]
## gmwage
## 0.1526838
# (v)
model3 <- lm(gmwage ~ lag(gwage232) + lag(gemp232), data = filtered_data)
summary(model3)$r.squared
## [1] 0.003908469
data("nyse")
# (i) Estimate the OLS model in equation (12.47)
nyse <- nyse %>% mutate(return_lag1 = lag(return, 1))
ols_model <- lm(return ~ return_lag1, data = nyse)
cat("OLS Model Summary:\n")
## OLS Model Summary:
summary(ols_model)
##
## Call:
## lm(formula = return ~ return_lag1, data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.261 -1.302 0.098 1.316 8.065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17963 0.08074 2.225 0.0264 *
## return_lag1 0.05890 0.03802 1.549 0.1218
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.11 on 687 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.003481, Adjusted R-squared: 0.00203
## F-statistic: 2.399 on 1 and 687 DF, p-value: 0.1218
nyse <- nyse %>% filter(!is.na(return_lag1))
nyse$residuals_sq <- residuals(ols_model)^2
mean_resid_sq <- mean(nyse$residuals_sq)
min_resid_sq <- min(nyse$residuals_sq)
max_resid_sq <- max(nyse$residuals_sq)
cat("Mean of squared residuals:", mean_resid_sq, "\n")
## Mean of squared residuals: 4.440839
cat("Minimum of squared residuals:", min_resid_sq, "\n")
## Minimum of squared residuals: 7.35465e-06
cat("Maximum of squared residuals:", max_resid_sq, "\n")
## Maximum of squared residuals: 232.8946
# (ii) Estimate heteroskedasticity model using lagged return
hetero_model <- lm(residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)
cat("Heteroskedasticity Model Summary:\n")
## Heteroskedasticity Model Summary:
summary(hetero_model)
##
## Call:
## lm(formula = residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.459 -3.011 -1.975 0.676 221.469
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25734 0.44085 7.389 4.32e-13 ***
## return_lag1 -0.78946 0.19569 -4.034 6.09e-05 ***
## I(return_lag1^2) 0.29666 0.03552 8.351 3.75e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared: 0.1303, Adjusted R-squared: 0.1278
## F-statistic: 51.4 on 2 and 686 DF, p-value: < 2.2e-16
# (iii) Plot conditional variance as a function of lagged return
plot_data <- data.frame(return_lag1 = seq(min(nyse$return_lag1, na.rm = TRUE),
max(nyse$return_lag1, na.rm = TRUE), length.out = 100))
plot_data$conditional_variance <- predict(hetero_model, newdata = plot_data)
plot(plot_data$return_lag1, plot_data$conditional_variance, type = "l",
main = "Conditional Variance vs. Lagged Return",
xlab = "Lagged Return", ylab = "Conditional Variance")
min_variance_index <- which.min(plot_data$conditional_variance)
cat("Smallest conditional variance:", plot_data$conditional_variance[min_variance_index], "\n")
## Smallest conditional variance: 2.734254
cat("Lagged return value for smallest variance:", plot_data$return_lag1[min_variance_index], "\n")
## Lagged return value for smallest variance: 1.245583
# (iv) Check for negative variance estimates
if (any(plot_data$conditional_variance < 0)) {
cat("Negative variance estimates are present.\n")
} else {
cat("No negative variance estimates.\n")
}
## No negative variance estimates.
# (v) Compare ARCH(1) model fit
nyse$residuals_sq_lag1 <- lag(nyse$residuals_sq, 1)
arch1_model <- lm(residuals_sq ~ residuals_sq_lag1, data = nyse)
summary(arch1_model)
##
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1, data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.337 -3.292 -2.157 0.556 223.981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.94743 0.44023 6.695 4.49e-11 ***
## residuals_sq_lag1 0.33706 0.03595 9.377 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 686 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.1136, Adjusted R-squared: 0.1123
## F-statistic: 87.92 on 1 and 686 DF, p-value: < 2.2e-16
cat("ARCH(1) model R-squared:", summary(arch1_model)$r.squared, "\n")
## ARCH(1) model R-squared: 0.1136065
cat("Heteroskedasticity model R-squared:", summary(hetero_model)$r.squared, "\n")
## Heteroskedasticity model R-squared: 0.1303208
# (vi) Add second lag to ARCH(1) model for ARCH(2)
nyse$residuals_sq_lag2 <- lag(nyse$residuals_sq, 2)
arch2_model <- lm(residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, data = nyse)
summary(arch2_model)
##
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2,
## data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.934 -3.298 -2.158 0.600 224.296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.82950 0.45495 6.219 8.69e-10 ***
## residuals_sq_lag1 0.32284 0.03820 8.450 < 2e-16 ***
## residuals_sq_lag2 0.04179 0.03820 1.094 0.274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 684 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1151, Adjusted R-squared: 0.1125
## F-statistic: 44.47 on 2 and 684 DF, p-value: < 2.2e-16
cat("ARCH(2) model Summary:\n")
## ARCH(2) model Summary:
print(summary(arch2_model))
##
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2,
## data = nyse)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.934 -3.298 -2.158 0.600 224.296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.82950 0.45495 6.219 8.69e-10 ***
## residuals_sq_lag1 0.32284 0.03820 8.450 < 2e-16 ***
## residuals_sq_lag2 0.04179 0.03820 1.094 0.274
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.76 on 684 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1151, Adjusted R-squared: 0.1125
## F-statistic: 44.47 on 2 and 684 DF, p-value: < 2.2e-16