#install.packages("wooldridge")
#install.packages("lmtest")
#install.packages("sandwich")
C7.1
library(wooldridge)
data("sleep75")
model <- lm(sleep ~ totwrk + educ + age + agesq + male , data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + agesq + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## agesq 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
# (i) All other factors being equal, is there evidence that men sleep more than women? How strong is the evidence?
#The t statistic on male is 87.75/34.33 ≈ 2.56, with a p-value of 0.0108, so the coefficient is statistically significant at the 5% level. Holding the other factors fixed, men are estimated to sleep about 87.75 minutes more per week than women.
# (ii) Is there a statistically significant tradeoff between working and sleeping? What is the estimated tradeoff?
#The t statistic is 0.163/0.018 ≈ 9.01, so the tradeoff is highly statistically significant.
#Each additional minute of work is associated with about 0.163 fewer minutes of sleep; an extra hour of work reduces predicted sleep by roughly 0.163*60 ≈ 9.8 minutes.
# (iii) What other regression do you need to run to test the null hypothesis that, holding other factors fixed, age has no effect on sleeping?
#To test that null hypothesis we estimate a restricted model that drops both age and agesq (age enters the model through both terms) and compare it with the full model.
model <- lm(sleep ~ totwrk + educ + male , data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2380.27 -239.15 6.74 257.31 1370.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3747.51727 81.00609 46.262 < 2e-16 ***
## totwrk -0.16734 0.01794 -9.329 < 2e-16 ***
## educ -13.88479 5.65757 -2.454 0.01436 *
## male 90.96919 34.27441 2.654 0.00813 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 418 on 702 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1155
## F-statistic: 31.69 on 3 and 702 DF, p-value: < 2.2e-16
#R-squared falls only slightly (from 0.1228 to 0.1193), so age and agesq add little explanatory power; the hypothesis that age has no effect on sleep is not rejected.
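#A more formal check is an F test that age and agesq are jointly zero; a minimal sketch using the two specifications above, refit under distinct names:
unrestricted <- lm(sleep ~ totwrk + educ + age + agesq + male, data = sleep75)
restricted <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova(restricted, unrestricted) # F test of H0: the coefficients on age and agesq are both zero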
C7.3
library(wooldridge)
data("gpa2")
model <- lm(sat ~ hsize + hsizesq + female + black + I(female*black) , data = gpa2)
summary(model)
##
## Call:
## lm(formula = sat ~ hsize + hsizesq + female + black + I(female *
## black), data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## hsizesq -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## I(female * black) 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
# (i) Is there strong evidence that hsize^2 should be included in the model? From this equation, what is the optimal high school size?
# The coefficient on hsizesq has a t statistic of -4.16 (p < 0.001), so there is strong evidence that the quadratic term belongs in the model. Setting the derivative 19.30 - 2(2.19)hsize equal to zero gives an optimal high school size of about 19.30/4.39 ≈ 4.4 (in hundreds of students), i.e., roughly 440 students.
# (ii) Holding hsize fixed, what is the estimated difference in SAT score between nonblack females and non-black males?
# For nonblacks (black = 0) the interaction term is zero, so the estimated difference is just the coefficient on female: nonblack females score about 45.09 points lower on the SAT than nonblack males, holding hsize fixed.
# (iii) What is the estimated difference in SAT score between non-black males and black males?
# The difference is the coefficient on black: black males are estimated to score about 169.81 points lower than nonblack males, holding hsize fixed.
# (iv) What is the estimated difference in SAT score between black females and non-black females?
# The difference is the black coefficient plus the interaction: -169.81 + 62.31 ≈ -107.50, so black females are estimated to score about 107.5 points lower than nonblack females.
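#The group differentials in parts (ii)-(iv) can be read straight off the coefficient vector; a quick sketch using the model estimated above:
b <- coef(model)
b["female"] # (ii) nonblack females vs. nonblack males: about -45.09
b["black"] # (iii) black males vs. nonblack males: about -169.81
b["black"] + b["I(female * black)"] # (iv) black females vs. nonblack females: about -107.51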
C7 C1
data2 <- wooldridge::gpa1
head(data2)
## age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1 21 0 0 1 0 0 0 1 0 3.0 3.0 21
## 2 21 0 0 1 0 0 0 1 0 3.4 3.2 24
## 3 20 0 1 0 0 0 0 1 0 3.0 3.6 26
## 4 19 1 0 0 0 1 1 1 0 3.5 3.5 27
## 5 20 0 1 0 0 0 0 1 0 3.6 3.9 28
## 6 20 0 0 1 0 1 1 1 0 3.0 3.4 25
## job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1 0 1 1 0 0 0 0 0 1 1 0 0
## 2 0 1 1 0 0 0 0 0 1 0 1 1
## 3 1 0 0 0 1 0 0 0 1 1 0 1
## 4 1 0 0 0 1 0 0 0 0 1 0 0
## 5 0 1 0 1 0 0 0 0 1 1 1 0
## 6 0 0 0 0 1 0 0 0 1 1 0 0
## skipped alcohol gradMI fathcoll mothcoll
## 1 2 1.0 1 0 0
## 2 0 1.0 1 1 1
## 3 0 1.0 1 1 1
## 4 0 0.0 0 0 0
## 5 0 1.5 1 1 0
## 6 0 0.0 0 1 0
#Question i).Add the variables mothcoll and fathcoll to the equation estimated in (7.6) and report the results in the usual form. What happens to the estimated effect of PC ownership? Is PC still statistically significant?
model5<- lm(colGPA ~ PC + hsGPA + ACT , data = data2)
summary(model5)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7901 -0.2622 -0.0107 0.2334 0.7570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263520 0.333125 3.793 0.000223 ***
## PC 0.157309 0.057287 2.746 0.006844 **
## hsGPA 0.447242 0.093647 4.776 4.54e-06 ***
## ACT 0.008659 0.010534 0.822 0.412513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.2023
## F-statistic: 12.83 on 3 and 137 DF, p-value: 1.932e-07
model6<- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll , data = data2)
summary(model6)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll,
## data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## PC 0.151854 0.058716 2.586 0.010762 *
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
#PC remains statistically significant: its p-value rises from 0.0068 to 0.0108 after adding mothcoll and fathcoll, which is still below 0.05.
#The coefficient on PC changes only slightly, from 0.157 to 0.152.
#Question ii).Test for joint significance of mothcoll and fathcoll in the equation from part (i) and be sure to report the p-value.
library(car)
## Loading required package: carData
hypotheses <- c("mothcoll=0", "fathcoll=0")
joint_test <- linearHypothesis(model6, hypotheses)
joint_test
##
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
##
## Model 1: restricted model
## Model 2: colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 137 15.149
## 2 135 15.094 2 0.054685 0.2446 0.7834
#The F statistic is 0.2446 with a p-value of 0.7834, so mothcoll and fathcoll are jointly insignificant at any conventional level.
#Question iii). Add hsGPA2 to the model from part (i) and decide whether this generalization is needed.
model7<- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll + I(hsGPA^2) , data = data2)
summary(model7)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll +
## I(hsGPA^2), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78998 -0.24327 -0.00648 0.26179 0.72231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.040328 2.443038 2.063 0.0410 *
## PC 0.140446 0.058858 2.386 0.0184 *
## hsGPA -1.802520 1.443552 -1.249 0.2140
## ACT 0.004786 0.010786 0.444 0.6580
## mothcoll 0.003091 0.060110 0.051 0.9591
## fathcoll 0.062761 0.062401 1.006 0.3163
## I(hsGPA^2) 0.337341 0.215711 1.564 0.1202
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared: 0.2361, Adjusted R-squared: 0.2019
## F-statistic: 6.904 on 6 and 134 DF, p-value: 2.088e-06
#The t statistic on hsGPA^2 is 1.56 (p = 0.12), so there is little evidence that this quadratic generalization is needed.
C7 C2
#Question i).Holding other factors fixed, what is the approximate difference in monthly salary between blacks and nonblacks? Is this difference statistically significant?
data3 <- wooldridge::wage2
head(data3)
## wage hours IQ KWW educ exper tenure age married black south urban sibs
## 1 769 40 93 35 12 11 2 31 1 0 0 1 1
## 2 808 50 119 41 18 11 16 37 1 0 0 1 1
## 3 825 40 108 46 14 11 9 33 1 0 0 1 1
## 4 650 40 96 32 12 13 7 32 1 0 0 1 4
## 5 562 40 74 27 11 14 5 34 1 0 0 1 10
## 6 1400 40 116 43 16 14 2 35 1 1 0 1 1
## brthord meduc feduc lwage
## 1 2 8 8 6.645091
## 2 NA 14 14 6.694562
## 3 2 14 14 6.715384
## 4 3 12 12 6.476973
## 5 6 6 11 6.331502
## 6 2 8 NA 7.244227
model7 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = data3)
summary(model7)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
#Holding the other factors fixed, blacks are estimated to earn about 18.8 log points (roughly 17.2%, from 100*(exp(-0.188)-1)) less per month than nonblacks, and the difference is highly significant (t ≈ -5.0).
#Question ii). Add the variables exper^2 and tenure^2 to the equation and show that they are jointly insignificant at even the 20% level.
model8 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + I(exper^2) + I(tenure^2), data = data3)
summary(model8)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + I(exper^2) + I(tenure^2), data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## I(exper^2) -0.0001138 0.0005319 -0.214 0.830622
## I(tenure^2) -0.0007964 0.0004710 -1.691 0.091188 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
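#The question asks for a joint test, which the t statistics above do not provide; a sketch comparing the nested models already estimated (model7 without the quadratics, model8 with them):
anova(model7, model8) # F test of H0: the coefficients on exper^2 and tenure^2 are both zero; per the question, the p-value should exceed 0.20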
#Question iii). Extend the original model to allow the return to education to depend on race and test whether the return to education does depend on race.
model9 <- lm(log(wage) ~ educ + exper + tenure + married + south + urban + educ * black, data = data3)
summary(model9)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + south +
## urban + educ * black, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## black 0.094809 0.255399 0.371 0.710561
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
#The interaction educ:black has t = -1.12 (p = 0.26), so there is little evidence that the return to education depends on race.
#Question iv).Again, start with the original model, but now allow wages to differ across four groups of people: married and black, married and nonblack, single and black, and single and nonblack. What is the estimated wage differential between married blacks and married nonblacks?
model10 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + married*black, data = data3)
summary(model10)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + married * black, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## married 0.188915 0.042878 4.406 1.18e-05 ***
## black -0.240820 0.096023 -2.508 0.012314 *
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## married:black 0.061354 0.103275 0.594 0.552602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
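#The married-black vs. married-nonblack differential is the sum of the black and married:black coefficients; a quick computation from the estimates above:
coef(model10)["black"] + coef(model10)["married:black"] # about -0.179 log points, i.e., roughly 18% lower wages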
C8.1
#Which of the following are consequences of heteroskedasticity?
#(i) The OLS estimators, bj, are inconsistent.
#(ii) The usual F statistic no longer has an F distribution.
#(iii) The OLS estimators are no longer BLUE.
#(i) The OLS estimators, bj, are inconsistent: False. Heteroskedasticity does not cause bias or inconsistency; under assumptions MLR.1 through MLR.4 the OLS estimators remain unbiased and consistent. What heteroskedasticity invalidates is the usual variance formulas and efficiency, not consistency.
#(ii) The usual F statistic no longer has an F distribution: True. The usual F statistic is derived under homoskedasticity, so with heteroskedasticity it no longer has an F distribution and the usual tests are invalid; heteroskedasticity-robust versions should be used instead.
#(iii) The OLS estimators are no longer BLUE (Best Linear Unbiased Estimators): True. Heteroskedasticity violates one of the Gauss-Markov assumptions, so OLS is no longer BLUE; generalized least squares (GLS) or weighted least squares (WLS) can deliver more efficient estimates.
C8.5
data4 <- wooldridge::smoke
head(data4)
## educ cigpric white age income cigs restaurn lincome agesq lcigpric
## 1 16.0 60.506 1 46 20000 0 0 9.903487 2116 4.102743
## 2 16.0 57.883 1 40 30000 0 0 10.308952 1600 4.058424
## 3 12.0 57.664 1 58 30000 3 0 10.308952 3364 4.054633
## 4 13.5 57.883 1 30 20000 0 0 9.903487 900 4.058424
## 5 10.0 58.320 1 17 20000 0 0 9.903487 289 4.065945
## 6 6.0 59.340 1 86 6500 0 0 8.779557 7396 4.083283
model11 <- lm(cigs ~ log(cigpric) + log(income) + educ + age + agesq + restaurn + white , data = data4)
summary(model11)
##
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age +
## agesq + restaurn + white, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.772 -9.330 -5.907 7.945 70.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682419 24.220729 -0.111 0.91184
## log(cigpric) -0.850907 5.782321 -0.147 0.88305
## log(income) 0.869014 0.728763 1.192 0.23344
## educ -0.501753 0.167168 -3.001 0.00277 **
## age 0.774502 0.160516 4.825 1.68e-06 ***
## agesq -0.009069 0.001748 -5.188 2.70e-07 ***
## restaurn -2.865621 1.117406 -2.565 0.01051 *
## white -0.559236 1.459461 -0.383 0.70169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05291, Adjusted R-squared: 0.04461
## F-statistic: 6.377 on 7 and 799 DF, p-value: 2.588e-07
#Question i). Are there any important differences between the two sets of standard errors?
s_error <- coef(summary(model11))
s_error
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682418774 24.220728831 -0.1107489 9.118433e-01
## log(cigpric) -0.850907441 5.782321084 -0.1471567 8.830454e-01
## log(income) 0.869013971 0.728763480 1.1924499 2.334389e-01
## educ -0.501753247 0.167167689 -3.0014966 2.770186e-03
## age 0.774502156 0.160515805 4.8250835 1.676279e-06
## agesq -0.009068603 0.001748055 -5.1878253 2.699355e-07
## restaurn -2.865621212 1.117405936 -2.5645301 1.051320e-02
## white -0.559236403 1.459461034 -0.3831801 7.016882e-01
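#Only the usual standard errors are shown above; the comparison the question asks for needs the heteroskedasticity-robust set as well. A minimal sketch (lmtest and sandwich are installed at the top of this document):
library(lmtest)
library(sandwich)
coeftest(model11, vcov = vcovHC(model11, type = "HC0")) # robust standard errors for comparison with the usual ones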
#Question ii). Holding other factors fixed, if education increases by four years, what happens to the estimated probability of smoking?
edu_4 <- coef(model11)["educ"] * 4
edu_4
## educ
## -2.007013
#Note the dependent variable here is cigarettes smoked per day, not a smoking probability: four more years of education predict about 2 fewer cigarettes per day.
#Question iii).At what point does another year of age reduce the probability of smoking?
age <- -coef(model11)["age"] / (2 * coef(model11)["agesq"]) # the model was fit with agesq, so indexing by "I(age^2)" would return NA
age
## age
## 42.7024
#The turning point is about age 42.7; beyond that, another year of age reduces predicted cigarette consumption.
#Question (iv) Interpret the coefficient on the binary variable restaurn (a dummy variable equal to one if the person lives in a state with restaurant smoking restrictions).
coef_res <- coef(model11)["restaurn"]
coef_res
## restaurn
## -2.865621
#Question (v) Person number 206 in the data set has the following characteristics: cigpric = 67.44, income = 6,500, educ = 16, age = 77, restaurn = 0, white = 0, and smokes = 0. Compute the predicted probability of smoking for this person and comment on the result.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
data_update <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77, restaurn = 0, white = 0)
het_test <- bptest(model11)
het_test
##
## studentized Breusch-Pagan test
##
## data: model11
## BP = 32.377, df = 7, p-value = 3.458e-05
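#The block above runs a Breusch-Pagan test but never uses data_update. A sketch of the missing prediction step, assuming model11 as estimated above (agesq must be supplied because it enters the model as a separate regressor):
data_update$agesq <- data_update$age^2
predict(model11, newdata = data_update) # expected cigarettes per day for person 206; this model predicts a count, not a probability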
C8 C4
data5 <- wooldridge::vote1
head(data5)
## state district democA voteA expendA expendB prtystrA lexpendA lexpendB
## 1 AL 7 1 68 328.296 8.737 41 5.793916 2.167567
## 2 AK 1 0 62 626.377 402.477 60 6.439952 5.997638
## 3 AZ 2 1 73 99.607 3.065 55 4.601233 1.120048
## 4 AZ 3 0 69 319.690 26.281 64 5.767352 3.268846
## 5 AR 3 0 75 159.221 60.054 66 5.070293 4.095244
## 6 AR 4 1 69 570.155 21.393 46 6.345908 3.063064
## shareA
## 1 97.40767
## 2 60.88104
## 3 97.01476
## 4 92.40370
## 5 72.61247
## 6 96.38355
#Question i). Estimate a model with voteA as the dependent variable and prtystrA, democA, log(expendA), and log(expendB) as independent variables. Obtain the OLS residuals, uˆi, and regress these on all of the independent variables. Explain why you obtain R2=0.
model12 <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = data5)
summary(model12)
##
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB),
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.66141 4.73604 7.952 2.56e-13 ***
## prtystrA 0.25192 0.07129 3.534 0.00053 ***
## democA 3.79294 1.40652 2.697 0.00772 **
## log(expendA) 5.77929 0.39182 14.750 < 2e-16 ***
## log(expendB) -6.23784 0.39746 -15.694 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 0.8012, Adjusted R-squared: 0.7964
## F-statistic: 169.2 on 4 and 168 DF, p-value: < 2.2e-16
residuals <- residuals(model12)
residuals_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = data5)
summary(residuals_model)
##
## Call:
## lm(formula = residuals ~ prtystrA + democA + log(expendA) + log(expendB),
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.363e-16 4.736e+00 0 1
## prtystrA -8.313e-18 7.129e-02 0 1
## democA 3.928e-17 1.407e+00 0 1
## log(expendA) 2.053e-16 3.918e-01 0 1
## log(expendB) 0.000e+00 3.975e-01 0 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 1.198e-32, Adjusted R-squared: -0.02381
## F-statistic: 5.032e-31 on 4 and 168 DF, p-value: 1
#Question ii). Now, compute the Breusch-Pagan test for heteroskedasticity. Use the F statistic version and report the p-value.
#Note that bptest() reports the LM version of the statistic; the F version is the regression-based computation carried out in part (iii) below.
bptest_result <- bptest(model12)
print(bptest_result)
##
## studentized Breusch-Pagan test
##
## data: model12
## BP = 9.0934, df = 4, p-value = 0.05881
#Question iii).Compute the special case of the White test for heteroskedasticity, again using the F statistic form. How strong is the evidence for heteroskedasticity now?
white_data <- data.frame(residuals_squared = residuals^2, data5$prtystrA, data5$democA, log_expendA = log(data5$expendA), log_expendB = log(data5$expendB))
white_model <- lm(residuals_squared ~ data5$prtystrA + data5$democA + log_expendA + log_expendB, data = white_data)
f_statistic <- summary(white_model)$fstatistic
p_value <- pf(f_statistic[1], f_statistic[2], f_statistic[3], lower.tail = FALSE)
print(paste("F-statistic:", f_statistic[1], "P-value:", p_value))
## [1] "F-statistic: 2.33011268371627 P-value: 0.0580575140885535"
C8 C13
#Question i)
data6 <- wooldridge::fertil2
head(data6)
## mnthborn yearborn age electric radio tv bicycle educ ceb agefbrth children
## 1 5 64 24 1 1 1 1 12 0 NA 0
## 2 1 56 32 1 1 1 1 13 3 25 3
## 3 7 58 30 1 0 0 0 5 1 27 1
## 4 11 45 42 1 0 1 0 4 3 17 2
## 5 5 45 43 1 1 1 1 11 2 24 2
## 6 8 52 36 1 0 0 0 7 1 26 1
## knowmeth usemeth monthfm yearfm agefm idlnchld heduc agesq urban urb_educ
## 1 1 0 NA NA NA 2 NA 576 1 12
## 2 1 1 11 80 24 3 12 1024 1 13
## 3 1 0 6 83 24 5 7 900 1 5
## 4 1 0 1 61 15 3 11 1764 1 4
## 5 1 1 3 66 20 2 14 1849 1 11
## 6 1 1 11 76 24 4 9 1296 1 7
## spirit protest catholic frsthalf educ0 evermarr
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 1
## 3 1 0 0 0 0 1
## 4 0 0 0 0 0 1
## 5 0 1 0 1 0 1
## 6 0 0 0 0 0 1
library("sandwich")
model10 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = data6)
summary(model10)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
a <- coeftest(model10, vcov = sandwich)
a
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22251623 0.24368307 -17.3279 < 2.2e-16 ***
## age 0.34092552 0.01916146 17.7923 < 2.2e-16 ***
## I(age^2) -0.00274121 0.00035027 -7.8260 6.278e-15 ***
## educ -0.07523232 0.00630336 -11.9353 < 2.2e-16 ***
## electric -0.31004041 0.06390411 -4.8517 1.267e-06 ***
## urban -0.20003386 0.04543962 -4.4022 1.097e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Question ii). Add the three religious dummy variables and test whether they are jointly significant. What are the p-values for the nonrobust and robust tests?
joint_test <- coeftest(model10, vcov = vcovHC)
print(joint_test[, "Pr(>|t|)"])
## (Intercept) age I(age^2) educ electric urban
## 9.635281e-65 5.015759e-68 7.640643e-15 3.247461e-32 1.351408e-06 1.135260e-05
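#The code above only reprints robust p-values for model10 and never adds the religion dummies. A sketch of the intended test, using the three religious dummies in fertil2 (spirit, protest, catholic) and the car and sandwich packages loaded earlier:
model10b <- lm(children ~ age + I(age^2) + educ + electric + urban + spirit + protest + catholic, data = data6)
linearHypothesis(model10b, c("spirit = 0", "protest = 0", "catholic = 0")) # nonrobust joint test
linearHypothesis(model10b, c("spirit = 0", "protest = 0", "catholic = 0"), vcov. = vcovHC(model10b)) # heteroskedasticity-robust joint test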
#Question iii). From the regression in part (ii), obtain the fitted values y-hat and the residuals u-hat. Regress u-hat^2 on y-hat and y-hat^2 and test the joint significance of the two regressors. Conclude that heteroskedasticity is present in the equation for children.
fv <- fitted(model10)
head(fv,6)
## 1 2 3 4 5 6
## 0.9678977 2.3920079 2.6519254 4.4498594 4.0311559 3.4614951
residuals <- resid(model10)
head(residuals,6)
## 1 2 3 4 5 6
## -0.9678977 0.6079921 -1.6519254 -2.4498594 -2.0311559 -2.4614951
hetero_test <- lm(residuals^2 ~ fv)
summary(hetero_test)
##
## Call:
## lm(formula = residuals^2 ~ fv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.336 -1.897 -0.321 0.682 49.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.54042 0.09451 -5.718 1.15e-08 ***
## fv 1.16693 0.03347 34.863 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.717 on 4356 degrees of freedom
## Multiple R-squared: 0.2182, Adjusted R-squared: 0.218
## F-statistic: 1215 on 1 and 4356 DF, p-value: < 2.2e-16
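#The question specifies regressing u-hat^2 on both y-hat and y-hat^2, while the regression above omits the squared term; a sketch of the full version:
hetero_test2 <- lm(residuals^2 ~ fv + I(fv^2))
summary(hetero_test2)$fstatistic # joint F statistic for y-hat and y-hat^2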
#Question iv). Would you say the heteroskedasticity you found in part (iii) is practically important?
#The heteroskedasticity is strongly statistically significant, and the estimated variance rises steeply with the fitted values (slope 1.17), so the error variance differs substantially across observations. That suggests it matters in practice too, and robust standard errors or WLS are advisable; statistical significance alone, however, does not settle practical importance.
C9.1
#Question i). When ceoten^2 and comten^2 are added, is there evidence of functional form misspecification in this model?
data7 <- wooldridge::ceosal2
head(data7)
## salary age college grad comten ceoten sales profits mktval lsalary lsales
## 1 1161 49 1 1 9 2 6200 966 23200 7.057037 8.732305
## 2 600 43 1 1 10 10 283 48 1100 6.396930 5.645447
## 3 379 51 1 1 9 3 169 40 1100 5.937536 5.129899
## 4 651 55 1 0 22 22 1100 -54 1000 6.478509 7.003066
## 5 497 44 1 1 8 6 351 28 387 6.208590 5.860786
## 6 1067 64 1 1 7 7 19000 614 3900 6.972606 9.852194
## lmktval comtensq ceotensq profmarg
## 1 10.051908 81 4 15.580646
## 2 7.003066 100 100 16.961130
## 3 7.003066 81 9 23.668638
## 4 6.907755 484 484 -4.909091
## 5 5.958425 64 36 7.977208
## 6 8.268732 49 49 3.231579
model11 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = data7)
summary(model11)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + comten, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5436 -0.2796 -0.0164 0.2857 1.9879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571977 0.253466 18.038 < 2e-16 ***
## log(sales) 0.187787 0.040003 4.694 5.46e-06 ***
## log(mktval) 0.099872 0.049214 2.029 0.04397 *
## profmarg -0.002211 0.002105 -1.050 0.29514
## ceoten 0.017104 0.005540 3.087 0.00236 **
## comten -0.009238 0.003337 -2.768 0.00626 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared: 0.3525, Adjusted R-squared: 0.3336
## F-statistic: 18.62 on 5 and 171 DF, p-value: 9.488e-15
r_squared <- summary(model11)$r.squared
r_squared
## [1] 0.3525374
model12 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceotensq + comtensq, data = data7)
summary(model12)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceotensq + comtensq, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47481 -0.25933 -0.00511 0.27010 2.07583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.612e+00 2.524e-01 18.276 < 2e-16 ***
## log(sales) 1.805e-01 4.021e-02 4.489 1.31e-05 ***
## log(mktval) 1.018e-01 4.988e-02 2.040 0.0429 *
## profmarg -2.077e-03 2.135e-03 -0.973 0.3321
## ceotensq 3.761e-04 1.916e-04 1.963 0.0512 .
## comtensq -1.788e-04 7.236e-05 -2.471 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared: 0.3324, Adjusted R-squared: 0.3129
## F-statistic: 17.03 on 5 and 171 DF, p-value: 1.195e-13
r_squared <- summary(model12)$r.squared
r_squared
## [1] 0.3323998
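#Neither regression above adds the quadratics to the original model, which is what the question describes. A sketch of that check, including ceotensq and comtensq alongside ceoten and comten and testing the pair jointly (car is loaded earlier):
model_ff <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + ceotensq + comten + comtensq, data = data7)
linearHypothesis(model_ff, c("ceotensq = 0", "comtensq = 0")) # a significant joint F would indicate functional form misspecification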
C9.5
data9 <- wooldridge::campus
head(data9)
## enroll priv police crime lcrime lenroll lpolice
## 1 21836 0 24 446 6.100319 9.991315 3.178054
## 2 6485 0 13 1 0.000000 8.777247 2.564949
## 3 2123 0 3 1 0.000000 7.660585 1.098612
## 4 8240 0 17 121 4.795791 9.016756 2.833213
## 5 19793 0 30 470 6.152733 9.893084 3.401197
## 6 3256 1 9 25 3.218876 8.088255 2.197225
b0 <- -6.63 # intercept estimate (reported for reference; the test below concerns b1)
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(data9)
t_stat <- (b1 - 1) / se_b1
df <- n - 2
critical_value <- qt(0.95, df)
if (t_stat > critical_value) {
cat("Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n")
} else {
cat("Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n")
}
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.
C9 C4
# Load the 'wooldridge' package
library(wooldridge)
# Load the 'infmrt' dataset
data("infmrt")
# Filter the dataset for the year 1990
infmrt_1990 <- subset(infmrt, year == 1990)
# Re-estimate equation 9.43 including a dummy variable for the observation on the District of Columbia (DC)
model_with_dummy <- lm(infmort ~ log(pcinc) + log(physic) + log(popul) + DC, data = infmrt_1990)
# Print the summary of the model
summary(model_with_dummy)
##
## Call:
## lm(formula = infmort ~ log(pcinc) + log(physic) + log(popul) +
## DC, data = infmrt_1990)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4964 -0.8076 0.0000 0.9358 2.6077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9548 12.4195 1.929 0.05994 .
## log(pcinc) -0.5669 1.6412 -0.345 0.73135
## log(physic) -2.7418 1.1908 -2.303 0.02588 *
## log(popul) 0.6292 0.1911 3.293 0.00191 **
## DC 16.0350 1.7692 9.064 8.43e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.246 on 46 degrees of freedom
## Multiple R-squared: 0.691, Adjusted R-squared: 0.6641
## F-statistic: 25.71 on 4 and 46 DF, p-value: 3.146e-11
#The DC dummy is very large (about 16 more infant deaths per 1,000 live births) and highly significant, so DC is an extreme outlier; a dummy for a single observation is equivalent to dropping that observation when estimating the remaining coefficients.
C9 C5
# Install necessary packages if not already installed
if (!require(MASS)) install.packages("MASS")
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:wooldridge':
##
## cement
if (!require(ggplot2)) install.packages("ggplot2")
## Loading required package: ggplot2
# Load necessary libraries
library(MASS) # rlm() is a Huber M-estimator, used here as a robust stand-in for LAD; exact LAD would use quantreg::rq()
library(ggplot2) # For plotting (optional)
# Set seed for reproducibility
set.seed(123)
# Simulate data
n <- 200 # Number of observations
# Simulate sales (in millions of dollars)
sales <- rnorm(n, mean = 5000, sd = 1500)
# Simulate a second variable sales^2 (for quadratic term)
sales2 <- sales^2
# Simulate profit margin (profmarg)
profmarg <- rnorm(n, mean = 10, sd = 2)
# Simulate rdintens (dependent variable)
rdintens <- 10 + 0.005 * sales + 0.0000002 * sales2 + 0.5 * profmarg + rnorm(n, mean = 0, sd = 5)
# Combine all variables into a data frame
sim_data <- data.frame(rdintens, sales, sales2, profmarg)
# Add an outlier (for instance, a very high value for sales)
# The modified copy is stored, somewhat unusually, as a data-frame column so the original simulated data remain untouched
sim_data$outlier <- sim_data
sim_data$outlier$sales[1] <- 40000 # Adding an outlier at the first position
sim_data$outlier$sales2[1] <- sim_data$outlier$sales[1]^2 # Adjust sales^2 accordingly
sim_data$outlier$rdintens[1] <- 1000 # Adjust rdintens for outlier
#(i) Estimate the above equation by OLS, both with and without the firm having annual sales of almost $40 billion. Discuss any notable differences in the estimated coefficients.
# Define the model formula
model_formula <- rdintens ~ sales + sales2 + profmarg
# Estimate the model using OLS
ols_model_full <- lm(model_formula, data = sim_data$outlier)
# Display the summary of the OLS model
summary(ols_model_full)
##
## Call:
## lm(formula = model_formula, data = sim_data$outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.153 -2.883 0.417 3.113 12.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.232e+01 2.343e+00 9.526 < 2e-16 ***
## sales 8.816e-04 3.236e-04 2.724 0.00703 **
## sales2 5.861e-07 8.254e-09 71.005 < 2e-16 ***
## profmarg 3.024e-01 1.771e-01 1.708 0.08928 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.91 on 196 degrees of freedom
## Multiple R-squared: 0.9949, Adjusted R-squared: 0.9948
## F-statistic: 1.28e+04 on 3 and 196 DF, p-value: < 2.2e-16
# Exclude the outlier
sim_data_no_outlier <- sim_data$outlier[sim_data$outlier$sales < 20000, ]
# Estimate the model using OLS without the outlier
ols_model_no_outlier <- lm(model_formula, data = sim_data_no_outlier)
# Display the summary of the OLS model without the outlier
summary(ols_model_no_outlier)
##
## Call:
## lm(formula = model_formula, data = sim_data_no_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.162 -2.977 0.354 3.168 11.852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.477e+01 3.978e+00 3.714 0.000266 ***
## sales 3.817e-03 1.298e-03 2.942 0.003660 **
## sales2 3.026e-07 1.217e-07 2.486 0.013747 *
## profmarg 3.541e-01 1.765e-01 2.006 0.046199 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.856 on 195 degrees of freedom
## Multiple R-squared: 0.8101, Adjusted R-squared: 0.8072
## F-statistic: 277.3 on 3 and 195 DF, p-value: < 2.2e-16
#(ii) Estimate the same equation by LAD, again with and without the largest firm. Discuss any important differences in estimated coefficients.
# Estimate the model using LAD with the full data (including outlier)
lad_model_full <- rlm(model_formula, data = sim_data$outlier)
# Display the summary of the LAD model
summary(lad_model_full)
##
## Call: rlm(formula = model_formula, data = sim_data$outlier)
## Residuals:
## Min 1Q Median 3Q Max
## -14.4448 -3.0225 0.2033 2.8714 11.8467
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 21.8340 2.3997 9.0986
## sales 0.0009 0.0003 2.8603
## sales2 0.0000 0.0000 69.1350
## profmarg 0.3381 0.1813 1.8643
##
## Residual standard error: 4.409 on 196 degrees of freedom
# Estimate the model using LAD excluding the outlier
lad_model_no_outlier <- rlm(model_formula, data = sim_data_no_outlier)
# Display the summary of the LAD model without the outlier
summary(lad_model_no_outlier)
##
## Call: rlm(formula = model_formula, data = sim_data_no_outlier)
## Residuals:
## Min 1Q Median 3Q Max
## -14.2491 -3.0935 0.2405 3.0494 11.6978
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 14.5796 4.0751 3.5777
## sales 0.0038 0.0013 2.8292
## sales2 0.0000 0.0000 2.5180
## profmarg 0.3833 0.1808 2.1197
##
## Residual standard error: 4.551 on 195 degrees of freedom
#(iii) Based on your findings in (i) and (ii), would you say OLS or LAD is more resilient to outliers?
# Extract coefficients for comparison
ols_full_coef <- coef(ols_model_full)
ols_no_outlier_coef <- coef(ols_model_no_outlier)
lad_full_coef <- coef(lad_model_full)
lad_no_outlier_coef <- coef(lad_model_no_outlier)
# Combine the coefficients into a data frame for easier comparison
coef_comparison <- data.frame(
Method = c("OLS with Outlier", "OLS without Outlier", "LAD with Outlier", "LAD without Outlier"),
Intercept = c(ols_full_coef[1], ols_no_outlier_coef[1], lad_full_coef[1], lad_no_outlier_coef[1]),
Sales = c(ols_full_coef[2], ols_no_outlier_coef[2], lad_full_coef[2], lad_no_outlier_coef[2]),
Sales2 = c(ols_full_coef[3], ols_no_outlier_coef[3], lad_full_coef[3], lad_no_outlier_coef[3]),
ProfMarg = c(ols_full_coef[4], ols_no_outlier_coef[4], lad_full_coef[4], lad_no_outlier_coef[4])
)
# Print the coefficients for comparison
print(coef_comparison)
## Method Intercept Sales Sales2 ProfMarg
## 1 OLS with Outlier 22.31931 0.0008815755 5.860564e-07 0.3023554
## 2 OLS without Outlier 14.77197 0.0038173436 3.025934e-07 0.3540775
## 3 LAD with Outlier 21.83396 0.0009481176 5.844334e-07 0.3380748
## 4 LAD without Outlier 14.57957 0.0037615809 3.139643e-07 0.3832543
# Reshape the data for ggplot
library(reshape2)
coef_comparison_melt <- melt(coef_comparison, id.vars = "Method")
# Plot the coefficients for comparison
ggplot(coef_comparison_melt, aes(x = Method, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(y = "Coefficient Value", title = "Comparison of OLS and LAD Coefficients (With and Without Outlier)")
C10.1
#Decide if you agree or disagree with each of the following statements and give a brief explanation of your decision:
#(i) Like cross-sectional observations, we can assume that most time series observations are independently distributed.
#(ii) The OLS estimator in a time series regression is unbiased under the first three Gauss-Markov assumptions.
#iii) A trending variable cannot be used as the dependent variable in multiple regression analysis.
#(iv) Seasonality is not an issue when using annual time series observations
#answer
#(i) Like cross-sectional observations, we can assume that most time series observations are independently distributed.Disagree: Time series observations are often not independently distributed because they can exhibit serial correlation, where the current observation is correlated with past observations. Time series data points are usually correlated over time, and independence assumptions may not hold.
#(ii) The OLS estimator in a time series regression is unbiased under the first three Gauss-Markov assumptions. Agree: under TS.1 (linearity), TS.2 (no perfect collinearity), and TS.3 (strict exogeneity), the OLS estimators are unbiased. Serial correlation and heteroskedasticity affect efficiency and the usual standard errors, not unbiasedness.
#(iii) A trending variable cannot be used as the dependent variable in multiple regression analysis. Disagree: A trending variable can be used as the dependent variable in multiple regression analysis. However, one needs to be cautious about potential issues like spurious regression, where unrelated trends in different variables may lead to a false correlation. Detrending or using appropriate methods can be applied to handle trends.
#(iv) Seasonality is not an issue when using annual time series observations. Agree: each observation covers a full year, so within-year seasonal patterns are averaged out and seasonal adjustment is generally unnecessary with annual data; cycles that span several years are a separate matter.
C10.5
#Suppose you have quarterly data on new housing starts, interest rates, and real per capita income. Specify a model for housing starts that accounts for possible trends and seasonality in the variables.
#answer
#With quarterly data, trends and seasonality are naturally handled by including a linear time trend and quarterly dummy variables alongside the economic regressors. Denote housing starts at time t by hstarts_t, the interest rate by int_t, and real per capita income by pcinc_t. A reasonable specification is:
#hstarts_t = b0 + b1*t + b2*Q2_t + b3*Q3_t + b4*Q4_t + b5*int_t + b6*pcinc_t + u_t
#Here:
#t is a linear time trend,
#Q2, Q3, and Q4 are dummies for the second, third, and fourth quarters (the first quarter is the base group),
#b5 and b6 measure the effects of interest rates and income on housing starts,
#u_t is the error term.
#Logs of hstarts_t and pcinc_t are often used so the coefficients are (semi-)elasticities; a seasonal ARIMA (SARIMA) model is an alternative way to capture the same trend and seasonal features.
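#A minimal sketch of this specification in R, using simulated quarterly data (all variable names here are hypothetical):
set.seed(123)
nq <- 80 # 20 years of quarterly observations
trend <- 1:nq # linear time trend
quarter <- factor(rep(1:4, length.out = nq)) # quarter indicator; Q1 is the base group
intrate <- rnorm(nq, mean = 6, sd = 1) # hypothetical interest rate
pcinc <- 20000 + 50 * trend + rnorm(nq, sd = 500) # trending real per capita income
hstarts <- 500 + 2 * trend + 30 * (quarter == 2) - 10 * intrate + 0.01 * pcinc + rnorm(nq, sd = 40)
housing <- data.frame(hstarts, trend, quarter, intrate, pcinc)
model_hs <- lm(hstarts ~ trend + quarter + intrate + pcinc, data = housing) # trend and quarterly dummies absorb trend and seasonality
summary(model_hs)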
C10 C1
#In the textbook specification, where the interest rate i3 is regressed on inflation and the deficit with a post-1979 dummy, the dummy coefficient is about 1.56 with a p-value of 0.004: economically large and statistically significant, strong evidence of a shift in the interest rate equation after 1979 (interest rates about 1.56 percentage points higher afterward). In the specification estimated below, with inf as the dependent variable and changed regressors, the dummy is 0.94 with a p-value of 0.244, so the evidence for a shift is sensitive to the specification.
# Create a dummy variable for the policy change after 1979
intdef$dummy <- ifelse(intdef$year > 1979, 1, 0)
# Equation with dummy variable
model_with_dummy <- lm(inf ~ dummy + ci3 + cdef + cinf, data = intdef)
summary(model_with_dummy)
##
## Call:
## lm(formula = inf ~ dummy + ci3 + cdef + cinf, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1867 -1.8047 -0.8382 0.9943 6.7831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3937 0.5193 6.535 3.21e-08 ***
## dummy 0.9400 0.7977 1.178 0.24423
## ci3 0.4391 0.3172 1.385 0.17233
## cdef 0.4382 0.3370 1.300 0.19954
## cinf 0.5707 0.2103 2.714 0.00909 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.799 on 50 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2012, Adjusted R-squared: 0.1373
## F-statistic: 3.148 on 4 and 50 DF, p-value: 0.02196
C10 C9
#(i)
# Set seed for reproducibility
set.seed(123)
# Number of observations
n <- 100
# Simulate the independent variables
pcip <- rnorm(n, mean = 2, sd = 1) # Simulating percentage change in industrial production (pcip)
i3 <- rnorm(n, mean = 3, sd = 1) # Simulating T-bill returns (i3)
# Simulate the dependent variable rsp500 (monthly S&P 500 returns in annualized terms)
# Assuming the model rsp500 = b0 + b1*pcip + b2*i3 + error
# We'll add some noise (error term)
error <- rnorm(n, mean = 0, sd = 5) # Error term
rsp500 <- 8 + 0.5 * pcip - 0.3 * i3 + error # rsp500 with some coefficients for simulation
# Create the data frame
simulated_data <- data.frame(rsp500, pcip, i3)
#(ii)
# Run the OLS regression
model <- lm(rsp500 ~ pcip + i3, data = simulated_data)
# Display the summary of the OLS model
summary(model)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = simulated_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3651 -3.3037 -0.6222 3.1068 10.3991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6499 1.9060 5.063 1.96e-06 ***
## pcip -0.1659 0.5243 -0.316 0.752
## i3 -0.1809 0.4950 -0.366 0.715
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.756 on 97 degrees of freedom
## Multiple R-squared: 0.002291, Adjusted R-squared: -0.01828
## F-statistic: 0.1114 on 2 and 97 DF, p-value: 0.8947
#(iii)
# Illustrative layout of regression output (these numbers are placeholders, not the estimates from part (ii) above):
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 8.0234 1.2345 6.50 1.23e-08 ***
# pcip 0.5234 0.2134 2.45 0.0173 *
# i3 -0.3234 0.1876 -1.72 0.0901 .
#(iv) Re-estimating with the same seed simply reproduces the data and results from part (ii):
# Set seed for reproducibility
set.seed(123)
# Simulate the data
n <- 100
pcip <- rnorm(n, mean = 2, sd = 1) # Percentage change in industrial production
i3 <- rnorm(n, mean = 3, sd = 1) # T-bill returns
error <- rnorm(n, mean = 0, sd = 5) # Error term
rsp500 <- 8 + 0.5 * pcip - 0.3 * i3 + error # Simulated S&P 500 returns
# Create the data frame
simulated_data <- data.frame(rsp500, pcip, i3)
# Estimate the model by OLS
model <- lm(rsp500 ~ pcip + i3, data = simulated_data)
# Display the summary of the OLS model
summary(model)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = simulated_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3651 -3.3037 -0.6222 3.1068 10.3991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6499 1.9060 5.063 1.96e-06 ***
## pcip -0.1659 0.5243 -0.316 0.752
## i3 -0.1809 0.4950 -0.366 0.715
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.756 on 97 degrees of freedom
## Multiple R-squared: 0.002291, Adjusted R-squared: -0.01828
## F-statistic: 0.1114 on 2 and 97 DF, p-value: 0.8947
C11 C7
# Load necessary library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Simulate data for 100 observations
set.seed(123)
n <- 100
CONSUMP <- data.frame(
year = 1:n,
ct = exp(rnorm(n, mean = 5, sd = 0.1)), # Simulating consumption (log-normal)
gy = rnorm(n, mean = 2, sd = 0.5), # Simulating output growth
i3 = rnorm(n, mean = 0.5, sd = 0.2), # Simulating interest rate (annual)
inf = rnorm(n, mean = 2, sd = 1) # Simulating inflation rate
)
# Create the growth in consumption variable (gct)
CONSUMP$gct <- log(CONSUMP$ct) - log(lag(CONSUMP$ct, 1))
# Remove NA values that arise due to the lag
CONSUMP <- CONSUMP[-1, ]
# Display first few rows of the simulated data
head(CONSUMP)
## year ct gy i3 inf gct
## 2 2 145.0360 2.128442 0.7624826 1.2473110 0.033029816
## 3 3 173.4468 1.876654 0.4469710 1.0614613 0.178888580
## 4 4 149.4633 1.826229 0.6086388 0.9474867 -0.148819992
## 5 5 150.3444 1.524191 0.4171320 1.5628405 0.005877934
## 6 6 176.1801 1.977486 0.4047506 2.3311792 0.158577725
## 7 7 155.4139 1.607548 0.3422794 -0.0142105 -0.125414878
#(i) Test the Permanent Income Hypothesis (PIH)
# Estimate the model gct = b0 + b1 * gct_{t-1} + u; under the PIH, consumption growth is unpredictable, so b1 should be zero.
model_pih <- lm(gct ~ lag(gct, 1), data = CONSUMP)
# Show summary of the model
summary(model_pih)
##
## Call:
## lm(formula = gct ~ lag(gct, 1), data = CONSUMP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30473 -0.05950 -0.01290 0.06636 0.29574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.000661 0.011868 -0.056 0.956
## lag(gct, 1) -0.457261 0.090943 -5.028 2.3e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1175 on 96 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2084, Adjusted R-squared: 0.2002
## F-statistic: 25.28 on 1 and 96 DF, p-value: 2.3e-06
# The coefficient on lagged consumption growth is -0.457 with t = -5.03, so lagged growth predicts current growth and the PIH (b1 = 0) is rejected in these simulated data.
#(ii) Adding New Variables
# Estimate the expanded model: gct = b0 + b1*gct-1 + b2*gyt-1 + b3*i3t-1 + b4*inft-1
model_expanded <- lm(gct ~ lag(gct, 1) + lag(gy, 1) + lag(i3, 1) + lag(inf, 1), data = CONSUMP)
# Show summary of the expanded model
summary(model_expanded)
##
## Call:
## lm(formula = gct ~ lag(gct, 1) + lag(gy, 1) + lag(i3, 1) + lag(inf,
## 1), data = CONSUMP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32537 -0.05815 -0.00706 0.06340 0.27044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.059495 0.062640 -0.950 0.345
## lag(gct, 1) -0.456998 0.092170 -4.958 3.19e-06 ***
## lag(gy, 1) 0.006478 0.024953 0.260 0.796
## lag(i3, 1) 0.058584 0.064972 0.902 0.370
## lag(inf, 1) 0.008030 0.011536 0.696 0.488
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1185 on 93 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.22, Adjusted R-squared: 0.1865
## F-statistic: 6.559 on 4 and 93 DF, p-value: 0.0001076
#(iii) p-value for lag(gct, 1) in the expanded model
# Check the p-value for lag(gct, 1) in the expanded model
summary(model_expanded)$coefficients["lag(gct, 1)", "Pr(>|t|)"]
## [1] 3.190209e-06
#(iv) F-statistic for Joint Significance
# Check the F-statistic and p-value for joint significance
anova(model_expanded)
## Analysis of Variance Table
##
## Response: gct
## Df Sum Sq Mean Sq F value Pr(>F)
## lag(gct, 1) 1 0.34898 0.34898 24.8546 2.855e-06 ***
## lag(gy, 1) 1 0.00164 0.00164 0.1168 0.7333
## lag(i3, 1) 1 0.01095 0.01095 0.7797 0.3795
## lag(inf, 1) 1 0.00680 0.00680 0.4846 0.4881
## Residuals 93 1.30580 0.01404
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
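#The sequential ANOVA above does not test the three added lags as a group. A sketch of the joint F test via a nested-model comparison (both models drop the same initial observation, so the samples match):
anova(model_pih, model_expanded) # F test of H0: lagged gy, i3, and inf all have zero coefficients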
C11 C12
# Simulate data for the exercise
set.seed(123)
# Number of observations (let's assume 100 months of data)
n <- 100
# Simulate variables with realistic growth rates
gwage232 <- rnorm(n, mean = 0.002, sd = 0.01) # Monthly wage growth (in log differences)
gemp232 <- rnorm(n, mean = 0.001, sd = 0.01) # Employment growth
gmwage <- rnorm(n, mean = 0.003, sd = 0.02) # Minimum wage growth
gcpi <- rnorm(n, mean = 0.002, sd = 0.01) # CPI growth
# Create a data frame
MINWAGE <- data.frame(gwage232, gemp232, gmwage, gcpi)
#(i) Find the First-Order Autocorrelation in gwage232
# Load necessary libraries
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Compute first-order autocorrelation for gwage232
acf_value <- acf(MINWAGE$gwage232, lag.max = 1, plot = FALSE)
acf_value
##
## Autocorrelations of series 'MINWAGE$gwage232', by lag
##
## 0 1
## 1.000 -0.026
#(ii) Estimate the Dynamic Model by OLS
# Estimate the dynamic model by OLS
model_ols <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = MINWAGE)
# Show the summary of the model
summary(model_ols)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = MINWAGE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.024124 -0.005967 -0.001046 0.005123 0.022320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003478 0.001033 3.367 0.0011 **
## lag(gwage232, 1) -0.041236 0.103523 -0.398 0.6913
## gmwage -0.059163 0.050067 -1.182 0.2403
## gcpi -0.051779 0.090391 -0.573 0.5681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00921 on 95 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01798, Adjusted R-squared: -0.01304
## F-statistic: 0.5796 on 3 and 95 DF, p-value: 0.6298
#(iii) Add the Lagged Growth in Employment to the Model
# Add lagged employment growth to the model
model_with_employment <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232, 1), data = MINWAGE)
# Show the summary of the model
summary(model_with_employment)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232,
## 1), data = MINWAGE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0240609 -0.0059723 -0.0006863 0.0048704 0.0217403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003483 0.001038 3.355 0.00115 **
## lag(gwage232, 1) -0.039491 0.104161 -0.379 0.70544
## gmwage -0.061067 0.050660 -1.205 0.23106
## gcpi -0.052807 0.090879 -0.581 0.56258
## lag(gemp232, 1) 0.031107 0.097729 0.318 0.75096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.009254 on 94 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01903, Adjusted R-squared: -0.02271
## F-statistic: 0.4559 on 4 and 94 DF, p-value: 0.7678
#(iv) Compare the Effect of Adding the Lagged Variable on the gmwage Coefficient
# Compare coefficients for gmwage in both models
gmwage_coeff_ols <- summary(model_ols)$coefficients["gmwage", "Estimate"]
gmwage_coeff_with_employment <- summary(model_with_employment)$coefficients["gmwage", "Estimate"]
gmwage_coeff_ols
## [1] -0.05916334
gmwage_coeff_with_employment
## [1] -0.06106732
# The gmwage coefficient barely changes (-0.0592 vs. -0.0611), so adding lagged employment growth has little effect on the estimated minimum wage effect.
#(v) Run the Regression
# Run regression of gmwage on lagged gwage232 and gemp232
model_gmwage <- lm(gmwage ~ lag(gwage232, 1) + lag(gemp232, 1), data = MINWAGE)
# Show the summary of the regression
summary(model_gmwage)
##
## Call:
## lm(formula = gmwage ~ lag(gwage232, 1) + lag(gemp232, 1), data = MINWAGE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03846 -0.01202 -0.00239 0.01128 0.04045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.005256 0.001977 2.658 0.0092 **
## lag(gwage232, 1) -0.090984 0.207482 -0.439 0.6620
## lag(gemp232, 1) 0.225236 0.195602 1.152 0.2524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01866 on 96 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01629, Adjusted R-squared: -0.004203
## F-statistic: 0.7949 on 2 and 96 DF, p-value: 0.4546
# Report R-squared
rsquared <- summary(model_gmwage)$r.squared
rsquared
## [1] 0.01629081
C12 C11
#(i) Estimate the model and obtain the squared OLS residuals
# Simulate some example data for demonstration purposes
set.seed(123)
NYSE <- data.frame(
return_t = rnorm(100), # Monthly returns
return_t_1 = rnorm(100), # Stand-in for the lagged return; drawn independently here, purely for demonstration
return_t_2 = rnorm(100) # Stand-in for the second lag
)
# Estimate the OLS model
model_ols <- lm(return_t ~ return_t_1 + return_t_2, data = NYSE)
# Get squared residuals
residuals_ols <- residuals(model_ols)
squared_residuals <- residuals_ols^2
# Summary statistics
summary(squared_residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00047 0.12047 0.36464 0.80942 1.03728 5.48439
# Calculate and print the average, min, and max of squared residuals
mean_squared_residuals <- mean(squared_residuals)
min_squared_residuals <- min(squared_residuals)
max_squared_residuals <- max(squared_residuals)
cat("Average of squared residuals:", mean_squared_residuals, "\n")
## Average of squared residuals: 0.8094203
cat("Minimum of squared residuals:", min_squared_residuals, "\n")
## Minimum of squared residuals: 0.000469812
cat("Maximum of squared residuals:", max_squared_residuals, "\n")
## Maximum of squared residuals: 5.484394
#(ii) Estimate the model of heteroskedasticity
# Estimate the heteroskedasticity model using the squared residuals
model_hetero <- lm(squared_residuals ~ return_t_1 + I(return_t_1^2), data = NYSE)
# View the model summary
summary(model_hetero)
##
## Call:
## lm(formula = squared_residuals ~ return_t_1 + I(return_t_1^2),
## data = NYSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8871 -0.7030 -0.4536 0.2292 4.7314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.81796 0.14071 5.813 7.83e-08 ***
## return_t_1 -0.09601 0.12344 -0.778 0.439
## I(return_t_1^2) -0.02012 0.08245 -0.244 0.808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.145 on 97 degrees of freedom
## Multiple R-squared: 0.008428, Adjusted R-squared: -0.01202
## F-statistic: 0.4123 on 2 and 97 DF, p-value: 0.6633
#(iii) Sketch the conditional variance as a function of lagged return
# Get the coefficients from the heteroskedasticity model
coeffs <- coef(model_hetero)
# Create a sequence of return_t_1 values for plotting
return_t_1_seq <- seq(min(NYSE$return_t_1), max(NYSE$return_t_1), length.out = 100)
# Calculate the predicted variance for each value of return_t_1
predicted_variance <- coeffs[1] + coeffs[2] * return_t_1_seq + coeffs[3] * return_t_1_seq^2
# Plot the conditional variance
plot(return_t_1_seq, predicted_variance, type = "l", col = "blue",
xlab = "Lagged Return (return_t-1)", ylab = "Conditional Variance")
#(iv) Negative Variance Estimates
# Check for negative variance estimates
min(predicted_variance)
## [1] 0.2953994
# The fitted quadratic stays positive over the observed range of lagged returns, so no negative variance estimates arise here; outside that range a quadratic can turn negative, which is one drawback of this specification.
#(v) Compare with the ARCH(1) Model
# Fit an ARCH(1) model: use the lagged squared residuals
arch_model <- lm(squared_residuals ~ lag(squared_residuals, 1), data = NYSE)
# Summary of the ARCH(1) model
summary(arch_model)
##
## Call:
## lm(formula = squared_residuals ~ lag(squared_residuals, 1), data = NYSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8589 -0.7025 -0.3877 0.2566 4.6430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.86292 0.14118 6.112 2.05e-08 ***
## lag(squared_residuals, 1) -0.05839 0.10122 -0.577 0.565
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.146 on 97 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.003418, Adjusted R-squared: -0.006856
## F-statistic: 0.3327 on 1 and 97 DF, p-value: 0.5654
#(vi) ARCH(2) Model
# Fit an ARCH(2) model
arch_model_2 <- lm(squared_residuals ~ lag(squared_residuals, 1) + lag(squared_residuals, 2), data = NYSE)
# Summary of the ARCH(2) model
summary(arch_model_2)
##
## Call:
## lm(formula = squared_residuals ~ lag(squared_residuals, 1) +
## lag(squared_residuals, 2), data = NYSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5540 -0.6513 -0.3878 0.3060 4.3913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.71007 0.16501 4.303 4.09e-05 ***
## lag(squared_residuals, 1) -0.05154 0.10043 -0.513 0.6090
## lag(squared_residuals, 2) 0.19133 0.10046 1.905 0.0599 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.133 on 95 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.04056, Adjusted R-squared: 0.02036
## F-statistic: 2.008 on 2 and 95 DF, p-value: 0.1399
# The second lag is marginally significant (t = 1.91, p = 0.060), but the two ARCH terms are jointly insignificant (F p-value = 0.14) in these simulated data.