Final exam

CHAPTER 7 question 1

library(wooldridge)
library(car)

## Loading required package: carData

data("sleep75")

sleep_model <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(sleep_model)

## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## I(age^2)       0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16

# (i) Is there evidence that men sleep more than women?
coef_male <- coef(summary(sleep_model))["male", "Estimate"]
p_value_male <- coef(summary(sleep_model))["male", "Pr(>|t|)"]
cat("Coefficient for 'male':", coef_male, "\n")

## Coefficient for 'male': 87.75243

cat("p-value for 'male':", p_value_male, "\n")

## p-value for 'male': 0.01078518

if (p_value_male < 0.05) {
  cat("Conclusion: There is significant evidence that men sleep more (or less) than women.\n")
} else {
  cat("Conclusion: There is no significant evidence that men sleep more (or less) than women.\n")
}

## Conclusion: There is significant evidence that men sleep more (or less) than women.

# (ii) Tradeoff between working and sleeping
coef_totwrk <- coef(summary(sleep_model))["totwrk", "Estimate"]

cat("Coefficient for 'totwrk':", coef_totwrk, "\n")

## Coefficient for 'totwrk': -0.1634232

cat("Interpretation: Each additional minute of work reduces sleep by", abs(coef_totwrk), "minutes.\n")

## Interpretation: Each additional minute of work reduces sleep by 0.1634232 minutes.

# (iii) Test whether age has a significant effect on sleep
sleep_model_no_age <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova(sleep_model_no_age, sleep_model)

## Analysis of Variance Table
## 
## Model 1: sleep ~ totwrk + educ + male
## Model 2: sleep ~ totwrk + educ + age + I(age^2) + male
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1    702 122631662                           
## 2    700 122147777  2    483885 1.3865 0.2506

question 3

data("gpa2")
sat_model <- lm(sat ~ hsize + I(hsize^2) + female + black + female:black, data = gpa2)
summary(sat_model)

## 
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + female:black, 
##     data = gpa2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1028.0972     6.2902 163.445  < 2e-16 ***
## hsize          19.2971     3.8323   5.035 4.97e-07 ***
## I(hsize^2)     -2.1948     0.5272  -4.163 3.20e-05 ***
## female        -45.0915     4.2911 -10.508  < 2e-16 ***
## black        -169.8126    12.7131 -13.357  < 2e-16 ***
## female:black   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16

# (i) Is hsize^2 important in the model?
beta1 <- coef(sat_model)["hsize"]
beta2 <- coef(sat_model)["I(hsize^2)"]
optimal_hsize <- -beta1 / (2 * beta2)
cat("Optimal high school size:", optimal_hsize, "\n")

## Optimal high school size: 4.39603

# (ii) SAT score difference between nonblack females and nonblack males
coef_female <- coef(summary(sat_model))["female", "Estimate"]
p_value_female <- coef(summary(sat_model))["female", "Pr(>|t|)"]
cat("Coefficient for 'female' (SAT score difference between nonblack females and nonblack males):", coef_female, "\n")

## Coefficient for 'female' (SAT score difference between nonblack females and nonblack males): -45.09145

cat("p-value for 'female':", p_value_female, "\n")

## p-value for 'female': 1.656301e-25

if (p_value_female < 0.05) {
  cat("Conclusion: There is significant evidence of a difference in SAT scores between nonblack females and nonblack males.\n")
} else {
  cat("Conclusion: There is no significant evidence of a difference in SAT scores between nonblack females and nonblack males.\n")
}

## Conclusion: There is significant evidence of a difference in SAT scores between nonblack females and nonblack males.

# (iii) SAT score difference between nonblack males and black males
linearHypothesis(sat_model, "black = 0")

## 
## Linear hypothesis test:
## black = 0
## 
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
## 
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4132 76652652                                  
## 2   4131 73479121  1   3173531 178.42 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# (iv) SAT score difference between black females and nonblack females
linearHypothesis(sat_model, "black + female:black = 0")

## 
## Linear hypothesis test:
## black  + female:black = 0
## 
## Model 1: restricted model
## Model 2: sat ~ hsize + I(hsize^2) + female + black + female:black
## 
##   Res.Df      RSS Df Sum of Sq      F    Pr(>F)    
## 1   4132 74700518                                  
## 2   4131 73479121  1   1221397 68.667 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

question C1

data("gpa1")
# (i) Add mothcoll and fathcoll to the model and report results
lm1 <- lm(colGPA ~ hsGPA + ACT + mothcoll + fathcoll, data = gpa1)
summary(lm1)

## 
## Call:
## lm(formula = colGPA ~ hsGPA + ACT + mothcoll + fathcoll, data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.81126 -0.24179 -0.03234  0.27047  0.84316 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.268408   0.342297   3.706 0.000306 ***
## hsGPA       0.456839   0.096196   4.749 5.12e-06 ***
## ACT         0.007929   0.010898   0.728 0.468119    
## mothcoll    0.016244   0.061009   0.266 0.790441    
## fathcoll    0.057430   0.062233   0.923 0.357739    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3413 on 136 degrees of freedom
## Multiple R-squared:  0.1837, Adjusted R-squared:  0.1597 
## F-statistic:  7.65 on 4 and 136 DF,  p-value: 1.371e-05

# (ii) Test for joint significance of mothcoll and fathcoll
lm_reduced <- lm(colGPA ~ hsGPA + ACT, data = gpa1)
anova(lm_reduced, lm1)

## Analysis of Variance Table
## 
## Model 1: colGPA ~ hsGPA + ACT
## Model 2: colGPA ~ hsGPA + ACT + mothcoll + fathcoll
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    138 15.982                           
## 2    136 15.842  2   0.14061 0.6036 0.5483

# (iii) Add hsGPA^2 to the model and evaluate whether this generalization is needed
gpa1$hsGPA_sq <- gpa1$hsGPA^2
lm2 <- lm(colGPA ~ hsGPA + hsGPA_sq + ACT + mothcoll + fathcoll, data = gpa1)
summary(lm2)

## 
## Call:
## lm(formula = colGPA ~ hsGPA + hsGPA_sq + ACT + mothcoll + fathcoll, 
##     data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.77587 -0.23343 -0.02708  0.27998  0.80816 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  5.767777   2.465721   2.339   0.0208 *
## hsGPA       -2.222512   1.457477  -1.525   0.1296  
## hsGPA_sq     0.401135   0.217737   1.842   0.0676 .
## ACT          0.004417   0.010971   0.403   0.6879  
## mothcoll     0.022601   0.060577   0.373   0.7097  
## fathcoll     0.080959   0.063001   1.285   0.2010  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3383 on 135 degrees of freedom
## Multiple R-squared:  0.2037, Adjusted R-squared:  0.1742 
## F-statistic: 6.906 on 5 and 135 DF,  p-value: 9.025e-06

question C2

data("wage2")

# (i) Estimate the log-linear wage model
model1 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = wage2)
summary(model1)

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16

#(ii) Add exper^2 and tenure^2 to the model
wage2$exper_sq <- wage2$exper^2
wage2$tenure_sq <- wage2$tenure^2

model2 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + exper_sq + tenure_sq, data = wage2)
summary(model2)

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban + exper_sq + tenure_sq, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98236 -0.21972 -0.00036  0.24078  1.25127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3586756  0.1259143  42.558  < 2e-16 ***
## educ         0.0642761  0.0063115  10.184  < 2e-16 ***
## exper        0.0172146  0.0126138   1.365 0.172665    
## tenure       0.0249291  0.0081297   3.066 0.002229 ** 
## married      0.1985470  0.0391103   5.077 4.65e-07 ***
## black       -0.1906636  0.0377011  -5.057 5.13e-07 ***
## south       -0.0912153  0.0262356  -3.477 0.000531 ***
## urban        0.1854241  0.0269585   6.878 1.12e-11 ***
## exper_sq    -0.0001138  0.0005319  -0.214 0.830622    
## tenure_sq   -0.0007964  0.0004710  -1.691 0.091188 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.2477 
## F-statistic: 35.17 on 9 and 925 DF,  p-value: < 2.2e-16

anova(model1, model2)

## Analysis of Variance Table
## 
## Model 1: log(wage) ~ educ + exper + tenure + married + black + south + 
##     urban
## Model 2: log(wage) ~ educ + exper + tenure + married + black + south + 
##     urban + exper_sq + tenure_sq
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    925 123.42  2   0.39756 1.4898  0.226

#(iii) Allow the return to education to depend on race
model3 <- lm(log(wage) ~ educ * black + exper + tenure + married + black + south + urban, data = wage2)
summary(model3)

## 
## Call:
## lm(formula = log(wage) ~ educ * black + exper + tenure + married + 
##     black + south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## black        0.094809   0.255399   0.371 0.710561    
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16

#(iv)Allow wages to differ across four groups: married black, married nonblack, single black, and single nonblack
model4 <- lm(log(wage) ~ educ + exper + tenure + married * black + south + urban, data = wage2)
summary(model4)

## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married * black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98013 -0.21780  0.01057  0.24219  1.22889 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    5.403793   0.114122  47.351  < 2e-16 ***
## educ           0.065475   0.006253  10.471  < 2e-16 ***
## exper          0.014146   0.003191   4.433 1.04e-05 ***
## tenure         0.011663   0.002458   4.745 2.41e-06 ***
## married        0.188915   0.042878   4.406 1.18e-05 ***
## black         -0.240820   0.096023  -2.508 0.012314 *  
## south         -0.091989   0.026321  -3.495 0.000497 ***
## urban          0.184350   0.026978   6.833 1.50e-11 ***
## married:black  0.061354   0.103275   0.594 0.552602    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared:  0.2528, Adjusted R-squared:  0.2464 
## F-statistic: 39.17 on 8 and 926 DF,  p-value: < 2.2e-16

CHAPTER 8 question 1

data("smoke")

ols_model <- lm(cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(ols_model)

## 
## Call:
## lm(formula = cigs ~ cigpric + log(income) + educ + age + I(age^2) + 
##     restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.758  -9.336  -5.895   7.927  70.267 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.782521   8.852068  -0.653  0.51379    
## cigpric     -0.005724   0.101064  -0.057  0.95485    
## log(income)  0.865360   0.728766   1.187  0.23541    
## educ        -0.501908   0.167169  -3.002  0.00276 ** 
## age          0.774357   0.160516   4.824 1.68e-06 ***
## I(age^2)    -0.009068   0.001748  -5.187 2.71e-07 ***
## restaurn    -2.880176   1.116067  -2.581  0.01004 *  
## white       -0.553286   1.459490  -0.379  0.70472    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared:  0.05289,    Adjusted R-squared:  0.04459 
## F-statistic: 6.374 on 7 and 799 DF,  p-value: 2.609e-07

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(sandwich)
bptest(ols_model)

## 
##  studentized Breusch-Pagan test
## 
## data:  ols_model
## BP = 32.431, df = 7, p-value = 3.379e-05

# (i) The OLS estimators are consistent in the presence of heteroskedasticity.
coeftest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))

## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -5.7825205  8.8831245 -0.6510  0.515262    
## cigpric     -0.0057240  0.1062869 -0.0539  0.957064    
## log(income)  0.8653600  0.5985769  1.4457  0.148655    
## educ        -0.5019084  0.1624052 -3.0905  0.002068 ** 
## age          0.7743569  0.1380708  5.6084 2.814e-08 ***
## I(age^2)    -0.0090678  0.0014596 -6.2127 8.375e-10 ***
## restaurn    -2.8801757  1.0155265 -2.8361  0.004682 ** 
## white       -0.5532861  1.3782955 -0.4014  0.688213    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# (ii) The usual F-statistic may no longer have an F-distribution.
waldtest(ols_model, vcov = vcovHC(ols_model, type = "HC1"))

## Wald test
## 
## Model 1: cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + 
##     white
## Model 2: cigs ~ 1
##   Res.Df Df      F   Pr(>F)    
## 1    799                       
## 2    806 -7 9.3617 3.49e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# (iii) OLS estimators remain unbiased, but they are no longer BLUE (Best Linear Unbiased Estimators) because the assumption of homoskedasticity is violated.
usual_se <- summary(ols_model)$coefficients[, "Std. Error"]
robust_se <- sqrt(diag(vcovHC(ols_model, type = "HC1")))

comparison <- data.frame(Variable = names(coef(ols_model)), Usual_SE = usual_se, Robust_SE = robust_se)
print(comparison)

##                Variable    Usual_SE   Robust_SE
## (Intercept) (Intercept) 8.852067639 8.883124494
## cigpric         cigpric 0.101063853 0.106286855
## log(income) log(income) 0.728765519 0.598576892
## educ               educ 0.167169257 0.162405199
## age                 age 0.160516185 0.138070842
## I(age^2)       I(age^2) 0.001748065 0.001459558
## restaurn       restaurn 1.116067336 1.015526517
## white             white 1.459489762 1.378295460

question 5

data("smoke")

lpm <- lm(cigs ~ cigpric + log(income) + educ + age + I(age^2) + restaurn + white, data = smoke)
summary(lpm)

## 
## Call:
## lm(formula = cigs ~ cigpric + log(income) + educ + age + I(age^2) + 
##     restaurn + white, data = smoke)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.758  -9.336  -5.895   7.927  70.267 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5.782521   8.852068  -0.653  0.51379    
## cigpric     -0.005724   0.101064  -0.057  0.95485    
## log(income)  0.865360   0.728766   1.187  0.23541    
## educ        -0.501908   0.167169  -3.002  0.00276 ** 
## age          0.774357   0.160516   4.824 1.68e-06 ***
## I(age^2)    -0.009068   0.001748  -5.187 2.71e-07 ***
## restaurn    -2.880176   1.116067  -2.581  0.01004 *  
## white       -0.553286   1.459490  -0.379  0.70472    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared:  0.05289,    Adjusted R-squared:  0.04459 
## F-statistic: 6.374 on 7 and 799 DF,  p-value: 2.609e-07

# (i) Compare usual standard errors with heteroskedasticity-robust standard errors
library(lmtest)
library(sandwich)
coeftest(lpm, vcov = vcovHC(lpm, type = "HC1"))

## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) -5.7825205  8.8831245 -0.6510  0.515262    
## cigpric     -0.0057240  0.1062869 -0.0539  0.957064    
## log(income)  0.8653600  0.5985769  1.4457  0.148655    
## educ        -0.5019084  0.1624052 -3.0905  0.002068 ** 
## age          0.7743569  0.1380708  5.6084 2.814e-08 ***
## I(age^2)    -0.0090678  0.0014596 -6.2127 8.375e-10 ***
## restaurn    -2.8801757  1.0155265 -2.8361  0.004682 ** 
## white       -0.5532861  1.3782955 -0.4014  0.688213    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# (ii) Calculate the change in the probability of smoking if education increases by 4 years
delta_educ <- 4
coef_educ <- coef(lpm)["educ"]
change_in_prob <- delta_educ * coef_educ
cat("Change in probability of smoking for a 4-year increase in education:", change_in_prob, "\n")

## Change in probability of smoking for a 4-year increase in education: -2.007634

# (iii) Find the age at which the probability of smoking starts to decrease
age_turning_point <- -coef(lpm)["age"] / (2 * coef(lpm)["I(age^2)"])
cat("Turning point for age (years):", age_turning_point, "\n")

## Turning point for age (years): 42.69818

# (iv) Interpret the coefficient on restaurn
cat("The coefficient on restaurn is", coef(lpm)["restaurn"], ". This indicates the change in the probability of smoking if a person lives in a state with restaurant smoking restrictions.\n")

## The coefficient on restaurn is -2.880176 . This indicates the change in the probability of smoking if a person lives in a state with restaurant smoking restrictions.

# (v) Predict the probability of smoking for person 206
person_206 <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77, restaurn = 0, white = 0)
predicted_prob <- predict(lpm, newdata = person_206)
cat("Predicted probability of smoking for person 206:", predicted_prob, "\n")

## Predicted probability of smoking for person 206: -0.7390957

question C4

data("vote1")

# (i) Estimate the model with OLS
vote_model <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(vote_model)

## 
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB), 
##     data = vote1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.576  -4.864  -1.146   4.903  24.566 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.66141    4.73604   7.952 2.56e-13 ***
## prtystrA      0.25192    0.07129   3.534  0.00053 ***
## democA        3.79294    1.40652   2.697  0.00772 ** 
## log(expendA)  5.77929    0.39182  14.750  < 2e-16 ***
## log(expendB) -6.23784    0.39746 -15.694  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared:  0.8012, Adjusted R-squared:  0.7964 
## F-statistic: 169.2 on 4 and 168 DF,  p-value: < 2.2e-16

residuals_u <- resid(vote_model)
vote1$residuals_u <- residuals_u

residuals_model <- lm(residuals_u ~ prtystrA + democA + log(expendA) + log(expendB), data = vote1)
summary(residuals_model)

## 
## Call:
## lm(formula = residuals_u ~ prtystrA + democA + log(expendA) + 
##     log(expendB), data = vote1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.576  -4.864  -1.146   4.903  24.566 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)
## (Intercept)  -3.363e-16  4.736e+00       0        1
## prtystrA     -8.313e-18  7.129e-02       0        1
## democA        3.928e-17  1.407e+00       0        1
## log(expendA)  2.053e-16  3.918e-01       0        1
## log(expendB)  0.000e+00  3.975e-01       0        1
## 
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared:  1.198e-32,  Adjusted R-squared:  -0.02381 
## F-statistic: 5.032e-31 on 4 and 168 DF,  p-value: 1

cat("R-squared is 0 because the residuals are orthogonal to the fitted values by definition.\n")

## R-squared is 0 because the residuals are orthogonal to the fitted values by definition.

# (ii) Breusch-Pagan test for heteroskedasticity
bp_test <- bptest(vote_model)
cat("Breusch-Pagan Test:\n")

## Breusch-Pagan Test:

print(bp_test)

## 
##  studentized Breusch-Pagan test
## 
## data:  vote_model
## BP = 9.0934, df = 4, p-value = 0.05881

# (iii) White test for heteroskedasticity (F-statistic version)
white_test <- bptest(vote_model, ~ prtystrA + democA + log(expendA) + log(expendB) + I(prtystrA^2) + I(democA^2) + I(log(expendA)^2) + I(log(expendB)^2), data = vote1)
cat("White Test (F-statistic version):\n")

## White Test (F-statistic version):

print(white_test)

## 
##  studentized Breusch-Pagan test
## 
## data:  vote_model
## BP = 19.581, df = 7, p-value = 0.00655

question C13

data("fertil2")

# (i) Estimate the model
children_model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
summary(children_model)

## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16

robust_se <- coeftest(children_model, vcov = vcovHC(children_model, type = "HC1"))
cat("Heteroskedasticity-Robust Standard Errors:\n")

## Heteroskedasticity-Robust Standard Errors:

print(robust_se)

## 
## t test of coefficients:
## 
##                Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept) -4.22251623  0.24385099 -17.3160 < 2.2e-16 ***
## age          0.34092552  0.01917466  17.7800 < 2.2e-16 ***
## I(age^2)    -0.00274121  0.00035051  -7.8206 6.549e-15 ***
## educ        -0.07523232  0.00630771 -11.9270 < 2.2e-16 ***
## electric    -0.31004041  0.06394815  -4.8483 1.289e-06 ***
## urban       -0.20003386  0.04547093  -4.3992 1.113e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("Are the robust standard errors larger? Check manually from the output.\n")

## Are the robust standard errors larger? Check manually from the output.

# (ii) Add religious dummy variables
# Revised model using only available religion dummies
religion_model <- lm(children ~ age + I(age^2) + educ + electric + urban + protest + catholic, data = fertil2)
summary(religion_model)

## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban + 
##     protest + catholic, data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9035 -0.7162 -0.0054  0.7162  7.4447 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2197793  0.2405453 -17.543  < 2e-16 ***
## age          0.3408098  0.0165234  20.626  < 2e-16 ***
## I(age^2)    -0.0027390  0.0002722 -10.061  < 2e-16 ***
## educ        -0.0752322  0.0064530 -11.659  < 2e-16 ***
## electric    -0.3106840  0.0690382  -4.500 6.97e-06 ***
## urban       -0.2012432  0.0466070  -4.318 1.61e-05 ***
## protest     -0.0148806  0.0545003  -0.273    0.785    
## catholic     0.0266610  0.0752882   0.354    0.723    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4350 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5727 
## F-statistic: 835.3 on 7 and 4350 DF,  p-value: < 2.2e-16

linearHypothesis(religion_model, c("protest = 0", "catholic = 0"))

## 
## Linear hypothesis test:
## protest = 0
## catholic = 0
## 
## Model 1: restricted model
## Model 2: children ~ age + I(age^2) + educ + electric + urban + protest + 
##     catholic
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1   4352 9176.4                           
## 2   4350 9175.9  2   0.53328 0.1264 0.8813

# (iii) Fit residuals regression for heteroskedasticity
fitted_vals <- fitted(religion_model)
residuals_squared <- residuals(religion_model)^2
residuals_model <- lm(residuals_squared ~ fitted_vals + I(fitted_vals^2))
summary(residuals_model)

## 
## Call:
## lm(formula = residuals_squared ~ fitted_vals + I(fitted_vals^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.747 -1.278 -0.300  0.339 47.684 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.31003    0.11168   2.776  0.00552 ** 
## fitted_vals      -0.14880    0.10204  -1.458  0.14483    
## I(fitted_vals^2)  0.26755    0.01965  13.614  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.642 on 4355 degrees of freedom
## Multiple R-squared:  0.2499, Adjusted R-squared:  0.2495 
## F-statistic: 725.4 on 2 and 4355 DF,  p-value: < 2.2e-16

# Test for heteroskedasticity
cat("Check joint significance in residuals regression. If F-test p-value < 0.05, heteroskedasticity is present.\n")

## Check joint significance in residuals regression. If F-test p-value < 0.05, heteroskedasticity is present.

# (iv) Interpretation of practical importance
cat("Is heteroskedasticity practically important? Compare the size of robust vs. nonrobust SEs to decide.\n")

## Is heteroskedasticity practically important? Compare the size of robust vs. nonrobust SEs to decide.

CHAPTER 9 question 1

data("ceosal2")

# Original model
model_original <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg, data = ceosal2)
summary(model_original)

## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg, 
##     data = ceosal2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.26981 -0.30174 -0.01638  0.30474  1.91129 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.620690   0.254344  18.167  < 2e-16 ***
## log(sales)   0.158483   0.039814   3.981 0.000101 ***
## log(mktval)  0.112261   0.050393   2.228 0.027190 *  
## profmarg    -0.002259   0.002165  -1.043 0.298346    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5102 on 173 degrees of freedom
## Multiple R-squared:  0.3035, Adjusted R-squared:  0.2914 
## F-statistic: 25.13 on 3 and 173 DF,  p-value: 1.522e-13

# Expanded model including ceoten^2 and comten^2
model_extended <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + I(ceoten^2) + I(comten^2), data = ceosal2)
summary(model_extended)

## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     I(ceoten^2) + I(comten^2), data = ceosal2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47481 -0.25933 -0.00511  0.27010  2.07583 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.612e+00  2.524e-01  18.276  < 2e-16 ***
## log(sales)   1.805e-01  4.021e-02   4.489 1.31e-05 ***
## log(mktval)  1.018e-01  4.988e-02   2.040   0.0429 *  
## profmarg    -2.077e-03  2.135e-03  -0.973   0.3321    
## I(ceoten^2)  3.761e-04  1.916e-04   1.963   0.0512 .  
## I(comten^2) -1.788e-04  7.236e-05  -2.471   0.0144 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared:  0.3324, Adjusted R-squared:  0.3129 
## F-statistic: 17.03 on 5 and 171 DF,  p-value: 1.195e-13

cat("R-squared for original model:", summary(model_original)$r.squared, "\n")

## R-squared for original model: 0.3034944

cat("R-squared for extended model:", summary(model_extended)$r.squared, "\n")

## R-squared for extended model: 0.3323998

# Interpretation: Compare R-squared values and significance of ceoten^2 and comten^2.

question 5

No, it is likely endogenous because unobserved factors (like crime severity, college size, or safety policies) influence both reporting likelihood and the number of crimes. Ignoring this can bias the estimates.

question C4

data("infmrt")

# Add a dummy variable for District of Columbia (assuming DC is observation 51)
infmrt$dc <- ifelse(row.names(infmrt) == "51", 1, 0)

# (i) Estimate the model including the DC dummy variable
model_with_dc <- lm(log(infmort) ~ pcinc + physic + dc, data = infmrt)
summary(model_with_dc)

## 
## Call:
## lm(formula = log(infmort) ~ pcinc + physic + dc, data = infmrt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43481 -0.10338  0.00447  0.09096  0.39050 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.456e+00  8.688e-02  28.264  < 2e-16 ***
## pcinc       -3.078e-05  6.499e-06  -4.737 7.33e-06 ***
## physic       1.495e-03  2.729e-04   5.478 3.33e-07 ***
## dc          -7.940e-02  1.664e-01  -0.477    0.634    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1629 on 98 degrees of freedom
## Multiple R-squared:  0.2537, Adjusted R-squared:  0.2309 
## F-statistic:  11.1 on 3 and 98 DF,  p-value: 2.444e-06

# (ii) Compare with the model without the DC dummy variable
model_without_dc <- lm(log(infmort) ~ pcinc + physic, data = infmrt)
summary(model_without_dc)

## 
## Call:
## lm(formula = log(infmort) ~ pcinc + physic, data = infmrt)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43432 -0.10235  0.00626  0.09337  0.39064 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.448e+00  8.502e-02  28.794  < 2e-16 ***
## pcinc       -3.026e-05  6.378e-06  -4.743 7.07e-06 ***
## physic       1.487e-03  2.713e-04   5.480 3.24e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1623 on 99 degrees of freedom
## Multiple R-squared:  0.252,  Adjusted R-squared:  0.2369 
## F-statistic: 16.67 on 2 and 99 DF,  p-value: 5.744e-07

# Compare both models
anova(model_without_dc, model_with_dc)

## Analysis of Variance Table
## 
## Model 1: log(infmort) ~ pcinc + physic
## Model 2: log(infmort) ~ pcinc + physic + dc
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     99 2.6064                           
## 2     98 2.6004  1 0.0060446 0.2278 0.6342

question C5

library(quantreg)

## Loading required package: SparseM

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:car':
## 
##     recode

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2) 

data("rdchem")

rdchem <- rdchem %>%
  mutate(sales_bil = sales / 1000)  

# (i) OLS regression
ols_model <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
summary(ols_model)

## 
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     2.058967   0.626269   3.288  0.00272 **
## sales_bil       0.316611   0.138854   2.280  0.03041 * 
## I(sales_bil^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg        0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107

summary(rdchem$sales_bil)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0420  0.5078  1.3325  3.7970  2.8566 39.7090

rdchem_filtered <- filter(rdchem, sales_bil < 40)  

ols_model_filtered <- lm(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_filtered)
summary(ols_model_filtered)

## 
## Call:
## lm(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem_filtered)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)     2.058967   0.626269   3.288  0.00272 **
## sales_bil       0.316611   0.138854   2.280  0.03041 * 
## I(sales_bil^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg        0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107

# (ii) LAD regression
lad_model <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem)
summary(lad_model)

## 
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                coefficients lower bd upper bd
## (Intercept)     1.40428      0.87031  2.66628
## sales_bil       0.26346     -0.13508  0.75753
## I(sales_bil^2) -0.00600     -0.01679  0.00344
## profmarg        0.11400      0.01376  0.16427

lad_model_filtered <- rq(rdintens ~ sales_bil + I(sales_bil^2) + profmarg, data = rdchem_filtered)
summary(lad_model_filtered)

## 
## Call: rq(formula = rdintens ~ sales_bil + I(sales_bil^2) + profmarg, 
##     data = rdchem_filtered)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                coefficients lower bd upper bd
## (Intercept)     1.40428      0.87031  2.66628
## sales_bil       0.26346     -0.13508  0.75753
## I(sales_bil^2) -0.00600     -0.01679  0.00344
## profmarg        0.11400      0.01376  0.16427

# (iii) Compare OLS and LAD results
cat("OLS vs LAD Coefficients (with and without outlier):\n")

## OLS vs LAD Coefficients (with and without outlier):

cat("\nOLS with outlier:\n")

## 
## OLS with outlier:

print(coef(ols_model))

##    (Intercept)      sales_bil I(sales_bil^2)       profmarg 
##    2.058966676    0.316610977   -0.007389624    0.053322172

cat("\nOLS without outlier:\n")

## 
## OLS without outlier:

print(coef(ols_model_filtered))

##    (Intercept)      sales_bil I(sales_bil^2)       profmarg 
##    2.058966676    0.316610977   -0.007389624    0.053322172

cat("\nLAD with outlier:\n")

## 
## LAD with outlier:

print(coef(lad_model))

##    (Intercept)      sales_bil I(sales_bil^2)       profmarg 
##    1.404279415    0.263463789   -0.006001127    0.114003661

cat("\nLAD without outlier:\n")

## 
## LAD without outlier:

print(coef(lad_model_filtered))

##    (Intercept)      sales_bil I(sales_bil^2)       profmarg 
##    1.404279415    0.263463789   -0.006001127    0.114003661

CHAPTER 10 question 1

Disagree: In time series data, observations are often autocorrelated (i.e., a value at time t depends on values at previous times). Assuming independence is inappropriate unless autocorrelation tests confirm no significant autocorrelation.
Agree: If the Gauss-Markov assumptions (linearity, no perfect multicollinearity, and zero conditional mean) hold, OLS will produce unbiased estimates. However, time series models require an additional assumption: no autocorrelation in errors.
Disagree:A trending variable can be used, but it’s essential to account for the trend by including a time trend or differencing the variable to ensure stationarity. Otherwise, regression results might suffer from spurious correlation.
Agree: Seasonality typically affects monthly or quarterly data (e.g., retail sales, temperature), but it is generally not an issue in annual time series.

question 5

# Example code to create quarterly dummies and time trend

# Create quarterly dummies and time trend
#housing_data <- housing_data %>%
#  mutate(
#    quarter = as.factor(quarter),   
#    time_trend = 1:n()              
#  )

# Run the regression model
#housing_model <- lm(hs ~ interest_rate + income + time_trend + quarter, data = housing_data)
#summary(housing_model)

question C1

library(dplyr)      

data("intdef")

intdef <- intdef %>%
  mutate(
    post_1979 = ifelse(year > 1979, 1, 0)  # Dummy variable: 1 if year > 1979, else 0
  )

# Run the regression including the dummy variable
policy_model <- lm(i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
summary(policy_model)

## 
## Call:
## lm(formula = i3_1 ~ inf_1 + def_1 + post_1979, data = intdef)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.5802 -0.8353  0.1936  0.7983  4.0447 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.39151    0.39028   3.565 0.000800 ***
## inf_1        0.56529    0.07144   7.913 1.99e-10 ***
## def_1        0.38269    0.11156   3.430 0.001203 ** 
## post_1979    1.77623    0.47268   3.758 0.000442 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.59 on 51 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.705,  Adjusted R-squared:  0.6876 
## F-statistic: 40.63 on 3 and 51 DF,  p-value: 1.479e-13

cat("\nConclusion:\n")

## 
## Conclusion:

cat("The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.\n")

## The coefficient on post_1979 indicates whether there was a significant shift in the interest rate equation after 1979.

question C9

data("volat")

# (i) Discuss expected signs of coefficients
cat("Expected signs:\n")

## Expected signs:

cat("β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.\n")

## β1 (pcip): Positive. An increase in industrial production should generally lead to positive stock market returns.

cat("β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.\n\n")

## β2 (i3): Negative. An increase in short-term interest rates often reduces stock market returns as borrowing becomes more expensive.

# (ii) Estimate the equation by OLS
model_c9 <- lm(rsp500 ~ pcip + i3, data = volat)
summary(model_c9)

## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637

cat("\nOLS Results:\n")

## 
## OLS Results:

cat("The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.\n\n")

## The coefficients for pcip and i3, along with their standard errors, t-values, and p-values, are displayed in the summary above.

# (iii)Determine which variables are statistically significant

p_values <- summary(model_c9)$coefficients[, 4]
significant <- p_values < 0.05 

cat("P-values for each variable:\n")

## P-values for each variable:

print(p_values)

##  (Intercept)         pcip           i3 
## 1.444871e-08 7.784810e-01 1.207402e-02

cat("\nSignificance (TRUE = significant at 5% level):\n")

## 
## Significance (TRUE = significant at 5% level):

print(significant)

## (Intercept)        pcip          i3 
##        TRUE       FALSE        TRUE

# (iv)Interpret predictability
if (all(significant[-1] == FALSE)) {
  cat("\nThe variables pcip and i3 are NOT statistically significant.\n")
  cat("This suggests that the return on the S&P 500 is NOT predictable using these variables.\n")
} else {
  cat("\nAt least one variable is statistically significant.\n")
  cat("This suggests that the return on the S&P 500 shows SOME level of predictability.\n")
}

## 
## At least one variable is statistically significant.
## This suggests that the return on the S&P 500 shows SOME level of predictability.

CHAPTER 11 question C7

data("consump")

# (i) Estimate g_ct = β0 + β1 * g_ct-1 + u_t and test the PIH
consump <- consump %>%
  mutate(g_ct = log(c) - lag(log(c)))  # g_ct = log(c_t) - log(c_t-1)

model_i <- lm(g_ct ~ lag(g_ct, 1), data = consump)
summary(model_i)

## 
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1), data = consump)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027878 -0.005974 -0.001451  0.007142  0.020227 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.011431   0.003778   3.026  0.00478 **
## lag(g_ct, 1) 0.446141   0.156047   2.859  0.00731 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01161 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1985, Adjusted R-squared:  0.1742 
## F-statistic: 8.174 on 1 and 33 DF,  p-value: 0.00731

# (ii) Add lagged income growth, interest rate, and inflation
consump <- consump %>%
  mutate(gy_lag = lag(gy),  
         i3_lag = lag(i3),  
         inf_lag = lag(inf))  

model_ii <- lm(g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag, data = consump)
summary(model_ii)

## 
## Call:
## lm(formula = g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag, 
##     data = consump)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0249093 -0.0075869  0.0000855  0.0087234  0.0188617 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   0.0225940  0.0070892   3.187  0.00335 **
## lag(g_ct, 1)  0.4335832  0.2896525   1.497  0.14487   
## gy_lag       -0.1079087  0.1946371  -0.554  0.58341   
## i3_lag       -0.0007467  0.0011107  -0.672  0.50654   
## inf_lag      -0.0008281  0.0010041  -0.825  0.41606   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01134 on 30 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3038, Adjusted R-squared:  0.211 
## F-statistic: 3.273 on 4 and 30 DF,  p-value: 0.02431

# (iii) Compare p-value of lag(g_ct, 1) in model_i and model_ii
cat("\nPART (iii): P-values for lag(g_ct, 1)\n")

## 
## PART (iii): P-values for lag(g_ct, 1)

cat("Model (i) p-value for lag(g_ct, 1):", summary(model_i)$coefficients[2, 4], "\n")

## Model (i) p-value for lag(g_ct, 1): 0.007310181

cat("Model (ii) p-value for lag(g_ct, 1):", summary(model_ii)$coefficients[2, 4], "\n")

## Model (ii) p-value for lag(g_ct, 1): 0.1448651

# Compare if the p-value for lag(g_ct, 1) changes to determine support for PIH

# (iv) F-statistic for joint significance of all variables
anova_test <- anova(model_i, model_ii)
cat("\nPART (iv): F-statistic for joint significance\n")

## 
## PART (iv): F-statistic for joint significance

print(anova_test)

## Analysis of Variance Table
## 
## Model 1: g_ct ~ lag(g_ct, 1)
## Model 2: g_ct ~ lag(g_ct, 1) + gy_lag + i3_lag + inf_lag
##   Res.Df       RSS Df  Sum of Sq      F Pr(>F)
## 1     33 0.0044446                            
## 2     30 0.0038608  3 0.00058384 1.5122 0.2315

question C12

data("minwage")

# Create a lagged variable for gwage232
minwage$lag_gwage232 <- lag(minwage$gwage232)

# (i) 
filtered_data <- minwage[complete.cases(minwage$gwage232, minwage$lag_gwage232), ]

acf(filtered_data$gwage232, lag.max = 1)

# (ii) 
model1 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
summary(model1)

## 
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi, data = filtered_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044642 -0.004134 -0.001312  0.004482  0.041612 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.0024003  0.0004308   5.572 3.79e-08 ***
## lag_gwage232 -0.0779092  0.0342851  -2.272  0.02341 *  
## gmwage        0.1518459  0.0096485  15.738  < 2e-16 ***
## gcpi          0.2630876  0.0824457   3.191  0.00149 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007889 on 606 degrees of freedom
## Multiple R-squared:  0.2986, Adjusted R-squared:  0.2951 
## F-statistic: 85.99 on 3 and 606 DF,  p-value: < 2.2e-16

# (iii) 
model2 <- lm(gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232), data = filtered_data)
summary(model2)

## 
## Call:
## lm(formula = gwage232 ~ lag_gwage232 + gmwage + gcpi + lag(gemp232), 
##     data = filtered_data)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043786 -0.004384 -0.001022  0.004325  0.042554 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   0.0024248  0.0004268   5.681 2.08e-08 ***
## lag_gwage232 -0.0756372  0.0339207  -2.230 0.026126 *  
## gmwage        0.1526838  0.0095403  16.004  < 2e-16 ***
## gcpi          0.2652171  0.0826041   3.211 0.001394 ** 
## lag(gemp232)  0.0662916  0.0169639   3.908 0.000104 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007798 on 604 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3167, Adjusted R-squared:  0.3122 
## F-statistic: 69.98 on 4 and 604 DF,  p-value: < 2.2e-16

# (iv)
coef(model1)["gmwage"]

##    gmwage 
## 0.1518459

coef(model2)["gmwage"]

##    gmwage 
## 0.1526838

# (v)
model3 <- lm(gmwage ~ lag(gwage232) + lag(gemp232), data = filtered_data)
summary(model3)$r.squared

## [1] 0.003908469

CHAPTER 12 question C11

data("nyse")

# (i) Estimate the OLS model in equation (12.47)
nyse <- nyse %>% mutate(return_lag1 = lag(return, 1))  
ols_model <- lm(return ~ return_lag1, data = nyse)

cat("OLS Model Summary:\n")

## OLS Model Summary:

summary(ols_model)

## 
## Call:
## lm(formula = return ~ return_lag1, data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.261  -1.302   0.098   1.316   8.065 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.17963    0.08074   2.225   0.0264 *
## return_lag1  0.05890    0.03802   1.549   0.1218  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.11 on 687 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.003481,   Adjusted R-squared:  0.00203 
## F-statistic: 2.399 on 1 and 687 DF,  p-value: 0.1218

nyse <- nyse %>% filter(!is.na(return_lag1))  
nyse$residuals_sq <- residuals(ols_model)^2

mean_resid_sq <- mean(nyse$residuals_sq)
min_resid_sq <- min(nyse$residuals_sq)
max_resid_sq <- max(nyse$residuals_sq)

cat("Mean of squared residuals:", mean_resid_sq, "\n")

## Mean of squared residuals: 4.440839

cat("Minimum of squared residuals:", min_resid_sq, "\n")

## Minimum of squared residuals: 7.35465e-06

cat("Maximum of squared residuals:", max_resid_sq, "\n")

## Maximum of squared residuals: 232.8946

# (ii) Estimate heteroskedasticity model using lagged return
hetero_model <- lm(residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)

cat("Heteroskedasticity Model Summary:\n")

## Heteroskedasticity Model Summary:

summary(hetero_model)

## 
## Call:
## lm(formula = residuals_sq ~ return_lag1 + I(return_lag1^2), data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.459  -3.011  -1.975   0.676 221.469 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3.25734    0.44085   7.389 4.32e-13 ***
## return_lag1      -0.78946    0.19569  -4.034 6.09e-05 ***
## I(return_lag1^2)  0.29666    0.03552   8.351 3.75e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.66 on 686 degrees of freedom
## Multiple R-squared:  0.1303, Adjusted R-squared:  0.1278 
## F-statistic:  51.4 on 2 and 686 DF,  p-value: < 2.2e-16

# (iii) Plot conditional variance as a function of lagged return
plot_data <- data.frame(return_lag1 = seq(min(nyse$return_lag1, na.rm = TRUE), 
                                          max(nyse$return_lag1, na.rm = TRUE), length.out = 100))
plot_data$conditional_variance <- predict(hetero_model, newdata = plot_data)
plot(plot_data$return_lag1, plot_data$conditional_variance, type = "l", 
     main = "Conditional Variance vs. Lagged Return",
     xlab = "Lagged Return", ylab = "Conditional Variance")

min_variance_index <- which.min(plot_data$conditional_variance)
cat("Smallest conditional variance:", plot_data$conditional_variance[min_variance_index], "\n")

## Smallest conditional variance: 2.734254

cat("Lagged return value for smallest variance:", plot_data$return_lag1[min_variance_index], "\n")

## Lagged return value for smallest variance: 1.245583

# (iv) Check for negative variance estimates
if (any(plot_data$conditional_variance < 0)) {
  cat("Negative variance estimates are present.\n")
} else {
  cat("No negative variance estimates.\n")
}

## No negative variance estimates.

# (v) Compare ARCH(1) model fit
nyse$residuals_sq_lag1 <- lag(nyse$residuals_sq, 1)
arch1_model <- lm(residuals_sq ~ residuals_sq_lag1, data = nyse)
summary(arch1_model)

## 
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1, data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -23.337  -3.292  -2.157   0.556 223.981 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.94743    0.44023   6.695 4.49e-11 ***
## residuals_sq_lag1  0.33706    0.03595   9.377  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 686 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.1136, Adjusted R-squared:  0.1123 
## F-statistic: 87.92 on 1 and 686 DF,  p-value: < 2.2e-16

cat("ARCH(1) model R-squared:", summary(arch1_model)$r.squared, "\n")

## ARCH(1) model R-squared: 0.1136065

cat("Heteroskedasticity model R-squared:", summary(hetero_model)$r.squared, "\n")

## Heteroskedasticity model R-squared: 0.1303208

# (vi) Add second lag to ARCH(1) model for ARCH(2)
nyse$residuals_sq_lag2 <- lag(nyse$residuals_sq, 2)
arch2_model <- lm(residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, data = nyse)
summary(arch2_model)

## 
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, 
##     data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.934  -3.298  -2.158   0.600 224.296 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.82950    0.45495   6.219 8.69e-10 ***
## residuals_sq_lag1  0.32284    0.03820   8.450  < 2e-16 ***
## residuals_sq_lag2  0.04179    0.03820   1.094    0.274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 684 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1151, Adjusted R-squared:  0.1125 
## F-statistic: 44.47 on 2 and 684 DF,  p-value: < 2.2e-16

cat("ARCH(2) model Summary:\n")

## ARCH(2) model Summary:

print(summary(arch2_model))

## 
## Call:
## lm(formula = residuals_sq ~ residuals_sq_lag1 + residuals_sq_lag2, 
##     data = nyse)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -22.934  -3.298  -2.158   0.600 224.296 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        2.82950    0.45495   6.219 8.69e-10 ***
## residuals_sq_lag1  0.32284    0.03820   8.450  < 2e-16 ***
## residuals_sq_lag2  0.04179    0.03820   1.094    0.274    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.76 on 684 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1151, Adjusted R-squared:  0.1125 
## F-statistic: 44.47 on 2 and 684 DF,  p-value: < 2.2e-16

Final exam

Budkhand

2025-01-04

CHAPTER 7

question 1

question 3

question C1

question C2

CHAPTER 8

question 1

question 5

question C4

question C13

CHAPTER 9

question 1

question 5

question C4

question C5

CHAPTER 10

question 1

question 5

question C1

question C9

CHAPTER 11

question C7

question C12

CHAPTER 12

question C11