#install.packages("wooldridge")
#install.packages("lmtest")
#install.packages("sandwich")
C7.1
library(wooldridge)
data("sleep75")
model <- lm(sleep ~ totwrk + educ + age + agesq + male , data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + agesq + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## agesq 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
# (i) All other factors being equal, is there evidence that men sleep more than women? How strong is the evidence?
#The t statistic on male is 87.75/34.33 ≈ 2.56, with a p-value of 0.0108, so the coefficient is statistically significant at the 5% level. Holding the other factors fixed, men are estimated to sleep about 87.75 minutes more per week than women.
# (ii) Is there a statistically significant tradeoff between working and sleeping? What is the estimated tradeoff?
#The t statistic is 0.163/0.018 ≈ 9.01, so the tradeoff is highly statistically significant.
#Each additional minute of work is associated with about 0.163 fewer minutes of sleep; an extra hour of work reduces predicted sleep by roughly 0.163*60 ≈ 9.8 minutes.
# (iii) What other regression do you need to run to test the null hypothesis that, holding other factors fixed, age has no effect on sleeping?
#To test that null hypothesis we estimate a restricted model that drops both age and agesq (age enters the model through both terms) and compare it with the full model.
model <- lm(sleep ~ totwrk + educ + male , data = sleep75)
summary(model)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2380.27 -239.15 6.74 257.31 1370.63
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3747.51727 81.00609 46.262 < 2e-16 ***
## totwrk -0.16734 0.01794 -9.329 < 2e-16 ***
## educ -13.88479 5.65757 -2.454 0.01436 *
## male 90.96919 34.27441 2.654 0.00813 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 418 on 702 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1155
## F-statistic: 31.69 on 3 and 702 DF, p-value: < 2.2e-16
#R-squared falls only slightly (from 0.1228 to 0.1193), so age and agesq add little explanatory power; the hypothesis that age has no effect on sleep is not rejected.
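#A more formal check is an F test that age and agesq are jointly zero; a minimal sketch using the two specifications above, refit under distinct names:
unrestricted <- lm(sleep ~ totwrk + educ + age + agesq + male, data = sleep75)
restricted <- lm(sleep ~ totwrk + educ + male, data = sleep75)
anova(restricted, unrestricted) # F test of H0: the coefficients on age and agesq are both zero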
C7.3
library(wooldridge)
data("gpa2")
model <- lm(sat ~ hsize + hsizesq + female + black + I(female*black) , data = gpa2)
summary(model)
##
## Call:
## lm(formula = sat ~ hsize + hsizesq + female + black + I(female *
## black), data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## hsizesq -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## I(female * black) 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
# (i) Is there strong evidence that hsize^2 should be included in the model? From this equation, what is the optimal high school size?
# The coefficient on hsizesq has a t statistic of -4.16 (p < 0.001), so there is strong evidence that the quadratic term belongs in the model. Setting the derivative 19.30 - 2(2.19)hsize equal to zero gives an optimal high school size of about 19.30/4.39 ≈ 4.4 (in hundreds of students), i.e., roughly 440 students.
# (ii) Holding hsize fixed, what is the estimated difference in SAT score between nonblack females and non-black males?
# For nonblacks (black = 0) the interaction term is zero, so the estimated difference is just the coefficient on female: nonblack females score about 45.09 points lower on the SAT than nonblack males, holding hsize fixed.
# (iii) What is the estimated difference in SAT score between non-black males and black males?
# The difference is the coefficient on black: black males are estimated to score about 169.81 points lower than nonblack males, holding hsize fixed.
# (iv) What is the estimated difference in SAT score between black females and non-black females?
# The difference is the black coefficient plus the interaction: -169.81 + 62.31 ≈ -107.50, so black females are estimated to score about 107.5 points lower than nonblack females.
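#The group differentials in parts (ii)-(iv) can be read straight off the coefficient vector; a quick sketch using the model estimated above:
b <- coef(model)
b["female"] # (ii) nonblack females vs. nonblack males: about -45.09
b["black"] # (iii) black males vs. nonblack males: about -169.81
b["black"] + b["I(female * black)"] # (iv) black females vs. nonblack females: about -107.51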
C7 C1
data2 <- wooldridge::gpa1
head(data2)
## age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1 21 0 0 1 0 0 0 1 0 3.0 3.0 21
## 2 21 0 0 1 0 0 0 1 0 3.4 3.2 24
## 3 20 0 1 0 0 0 0 1 0 3.0 3.6 26
## 4 19 1 0 0 0 1 1 1 0 3.5 3.5 27
## 5 20 0 1 0 0 0 0 1 0 3.6 3.9 28
## 6 20 0 0 1 0 1 1 1 0 3.0 3.4 25
## job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1 0 1 1 0 0 0 0 0 1 1 0 0
## 2 0 1 1 0 0 0 0 0 1 0 1 1
## 3 1 0 0 0 1 0 0 0 1 1 0 1
## 4 1 0 0 0 1 0 0 0 0 1 0 0
## 5 0 1 0 1 0 0 0 0 1 1 1 0
## 6 0 0 0 0 1 0 0 0 1 1 0 0
## skipped alcohol gradMI fathcoll mothcoll
## 1 2 1.0 1 0 0
## 2 0 1.0 1 1 1
## 3 0 1.0 1 1 1
## 4 0 0.0 0 0 0
## 5 0 1.5 1 1 0
## 6 0 0.0 0 1 0
#Question i).Add the variables mothcoll and fathcoll to the equation estimated in (7.6) and report the results in the usual form. What happens to the estimated effect of PC ownership? Is PC still statistically significant?
model5<- lm(colGPA ~ PC + hsGPA + ACT , data = data2)
summary(model5)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7901 -0.2622 -0.0107 0.2334 0.7570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263520 0.333125 3.793 0.000223 ***
## PC 0.157309 0.057287 2.746 0.006844 **
## hsGPA 0.447242 0.093647 4.776 4.54e-06 ***
## ACT 0.008659 0.010534 0.822 0.412513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.2023
## F-statistic: 12.83 on 3 and 137 DF, p-value: 1.932e-07
model6<- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll , data = data2)
summary(model6)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll,
## data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## PC 0.151854 0.058716 2.586 0.010762 *
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
#PC remains statistically significant: its p-value rises from 0.0068 to 0.0108 after adding mothcoll and fathcoll, which is still below 0.05.
#The coefficient on PC changes only slightly, from 0.157 to 0.152.
#Question ii).Test for joint significance of mothcoll and fathcoll in the equation from part (i) and be sure to report the p-value.
library(car)
## Loading required package: carData
hypotheses <- c("mothcoll=0", "fathcoll=0")
joint_test <- linearHypothesis(model6, hypotheses)
joint_test
##
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
##
## Model 1: restricted model
## Model 2: colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 137 15.149
## 2 135 15.094 2 0.054685 0.2446 0.7834
#The F statistic is 0.2446 with a p-value of 0.7834, so mothcoll and fathcoll are jointly insignificant at any conventional level.
#Question iii). Add hsGPA2 to the model from part (i) and decide whether this generalization is needed.
model7<- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll + I(hsGPA^2) , data = data2)
summary(model7)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll +
## I(hsGPA^2), data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78998 -0.24327 -0.00648 0.26179 0.72231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.040328 2.443038 2.063 0.0410 *
## PC 0.140446 0.058858 2.386 0.0184 *
## hsGPA -1.802520 1.443552 -1.249 0.2140
## ACT 0.004786 0.010786 0.444 0.6580
## mothcoll 0.003091 0.060110 0.051 0.9591
## fathcoll 0.062761 0.062401 1.006 0.3163
## I(hsGPA^2) 0.337341 0.215711 1.564 0.1202
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared: 0.2361, Adjusted R-squared: 0.2019
## F-statistic: 6.904 on 6 and 134 DF, p-value: 2.088e-06
#The t statistic on hsGPA^2 is 1.56 (p = 0.12), so there is little evidence that this quadratic generalization is needed.
C7 C2
#Question i).Holding other factors fixed, what is the approximate difference in monthly salary between blacks and nonblacks? Is this difference statistically significant?
data3 <- wooldridge::wage2
head(data3)
## wage hours IQ KWW educ exper tenure age married black south urban sibs
## 1 769 40 93 35 12 11 2 31 1 0 0 1 1
## 2 808 50 119 41 18 11 16 37 1 0 0 1 1
## 3 825 40 108 46 14 11 9 33 1 0 0 1 1
## 4 650 40 96 32 12 13 7 32 1 0 0 1 4
## 5 562 40 74 27 11 14 5 34 1 0 0 1 10
## 6 1400 40 116 43 16 14 2 35 1 1 0 1 1
## brthord meduc feduc lwage
## 1 2 8 8 6.645091
## 2 NA 14 14 6.694562
## 3 2 14 14 6.715384
## 4 3 12 12 6.476973
## 5 6 6 11 6.331502
## 6 2 8 NA 7.244227
model7 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban, data = data3)
summary(model7)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
#Holding the other factors fixed, blacks are estimated to earn about 18.8 log points (roughly 17.2%, from 100*(exp(-0.188)-1)) less per month than nonblacks, and the difference is highly significant (t ≈ -5.0).
#Question ii). Add the variables exper^2 and tenure^2 to the equation and show that they are jointly insignificant at even the 20% level.
model8 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + I(exper^2) + I(tenure^2), data = data3)
summary(model8)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + I(exper^2) + I(tenure^2), data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## I(exper^2) -0.0001138 0.0005319 -0.214 0.830622
## I(tenure^2) -0.0007964 0.0004710 -1.691 0.091188 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
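#The question asks for a joint test, which the t statistics above do not provide; a sketch comparing the nested models already estimated (model7 without the quadratics, model8 with them):
anova(model7, model8) # F test of H0: the coefficients on exper^2 and tenure^2 are both zero; per the question, the p-value should exceed 0.20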
#Question iii). Extend the original model to allow the return to education to depend on race and test whether the return to education does depend on race.
model9 <- lm(log(wage) ~ educ + exper + tenure + married + south + urban + educ * black, data = data3)
summary(model9)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + south +
## urban + educ * black, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## black 0.094809 0.255399 0.371 0.710561
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
#The interaction educ:black has t = -1.12 (p = 0.26), so there is little evidence that the return to education depends on race.
#Question iv).Again, start with the original model, but now allow wages to differ across four groups of people: married and black, married and nonblack, single and black, and single and nonblack. What is the estimated wage differential between married blacks and married nonblacks?
model10 <- lm(log(wage) ~ educ + exper + tenure + married + black + south + urban + married*black, data = data3)
summary(model10)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + married * black, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98013 -0.21780 0.01057 0.24219 1.22889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.403793 0.114122 47.351 < 2e-16 ***
## educ 0.065475 0.006253 10.471 < 2e-16 ***
## exper 0.014146 0.003191 4.433 1.04e-05 ***
## tenure 0.011663 0.002458 4.745 2.41e-06 ***
## married 0.188915 0.042878 4.406 1.18e-05 ***
## black -0.240820 0.096023 -2.508 0.012314 *
## south -0.091989 0.026321 -3.495 0.000497 ***
## urban 0.184350 0.026978 6.833 1.50e-11 ***
## married:black 0.061354 0.103275 0.594 0.552602
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3656 on 926 degrees of freedom
## Multiple R-squared: 0.2528, Adjusted R-squared: 0.2464
## F-statistic: 39.17 on 8 and 926 DF, p-value: < 2.2e-16
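#The married-black vs. married-nonblack differential is the sum of the black and married:black coefficients; a quick computation from the estimates above:
coef(model10)["black"] + coef(model10)["married:black"] # about -0.179 log points, i.e., roughly 18% lower wages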
C8.1
#Which of the following are consequences of heteroskedasticity?
#(i) The OLS estimators, bj, are inconsistent.
#(ii) The usual F statistic no longer has an F distribution.
#(iii) The OLS estimators are no longer BLUE.
#(i) The OLS estimators, bj, are inconsistent: False. Heteroskedasticity does not cause bias or inconsistency; under assumptions MLR.1 through MLR.4 the OLS estimators remain unbiased and consistent. What heteroskedasticity invalidates is the usual variance formulas and efficiency, not consistency.
#(ii) The usual F statistic no longer has an F distribution: True. The usual F statistic is derived under homoskedasticity, so with heteroskedasticity it no longer has an F distribution and the usual tests are invalid; heteroskedasticity-robust versions should be used instead.
#(iii) The OLS estimators are no longer BLUE (Best Linear Unbiased Estimators): True. Heteroskedasticity violates one of the Gauss-Markov assumptions, so OLS is no longer BLUE; generalized least squares (GLS) or weighted least squares (WLS) can deliver more efficient estimates.
C8.5
data4 <- wooldridge::smoke
head(data4)
## educ cigpric white age income cigs restaurn lincome agesq lcigpric
## 1 16.0 60.506 1 46 20000 0 0 9.903487 2116 4.102743
## 2 16.0 57.883 1 40 30000 0 0 10.308952 1600 4.058424
## 3 12.0 57.664 1 58 30000 3 0 10.308952 3364 4.054633
## 4 13.5 57.883 1 30 20000 0 0 9.903487 900 4.058424
## 5 10.0 58.320 1 17 20000 0 0 9.903487 289 4.065945
## 6 6.0 59.340 1 86 6500 0 0 8.779557 7396 4.083283
model11 <- lm(cigs ~ log(cigpric) + log(income) + educ + age + agesq + restaurn + white , data = data4)
summary(model11)
##
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age +
## agesq + restaurn + white, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.772 -9.330 -5.907 7.945 70.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682419 24.220729 -0.111 0.91184
## log(cigpric) -0.850907 5.782321 -0.147 0.88305
## log(income) 0.869014 0.728763 1.192 0.23344
## educ -0.501753 0.167168 -3.001 0.00277 **
## age 0.774502 0.160516 4.825 1.68e-06 ***
## agesq -0.009069 0.001748 -5.188 2.70e-07 ***
## restaurn -2.865621 1.117406 -2.565 0.01051 *
## white -0.559236 1.459461 -0.383 0.70169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05291, Adjusted R-squared: 0.04461
## F-statistic: 6.377 on 7 and 799 DF, p-value: 2.588e-07
#Question i). Are there any important differences between the two sets of standard errors?
s_error <- coef(summary(model11))
s_error
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682418774 24.220728831 -0.1107489 9.118433e-01
## log(cigpric) -0.850907441 5.782321084 -0.1471567 8.830454e-01
## log(income) 0.869013971 0.728763480 1.1924499 2.334389e-01
## educ -0.501753247 0.167167689 -3.0014966 2.770186e-03
## age 0.774502156 0.160515805 4.8250835 1.676279e-06
## agesq -0.009068603 0.001748055 -5.1878253 2.699355e-07
## restaurn -2.865621212 1.117405936 -2.5645301 1.051320e-02
## white -0.559236403 1.459461034 -0.3831801 7.016882e-01
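#Only the usual standard errors are shown above; the comparison the question asks for needs the heteroskedasticity-robust set as well. A minimal sketch (lmtest and sandwich are installed at the top of this document):
library(lmtest)
library(sandwich)
coeftest(model11, vcov = vcovHC(model11, type = "HC0")) # robust standard errors for comparison with the usual ones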
#Question ii). Holding other factors fixed, if education increases by four years, what happens to the estimated probability of smoking?
edu_4 <- coef(model11)["educ"] * 4
edu_4
## educ
## -2.007013
#Note the dependent variable here is cigarettes smoked per day, not a smoking probability: four more years of education predict about 2 fewer cigarettes per day.
#Question iii).At what point does another year of age reduce the probability of smoking?
age <- -coef(model11)["age"] / (2 * coef(model11)["agesq"]) # the model was fit with agesq, so indexing by "I(age^2)" would return NA
age
## age
## 42.7024
#The turning point is about age 42.7; beyond that, another year of age reduces predicted cigarette consumption.
#Question (iv) Interpret the coefficient on the binary variable restaurn (a dummy variable equal to one if the person lives in a state with restaurant smoking restrictions).
coef_res <- coef(model11)["restaurn"]
coef_res
## restaurn
## -2.865621
#Question (v) Person number 206 in the data set has the following characteristics: cigpric = 67.44, income = 6,500, educ = 16, age = 77, restaurn = 0, white = 0, and smokes = 0. Compute the predicted probability of smoking for this person and comment on the result.
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
data_update <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77, restaurn = 0, white = 0)
het_test <- bptest(model11)
het_test
##
## studentized Breusch-Pagan test
##
## data: model11
## BP = 32.377, df = 7, p-value = 3.458e-05
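#The block above runs a Breusch-Pagan test but never uses data_update. A sketch of the missing prediction step, assuming model11 as estimated above (agesq must be supplied because it enters the model as a separate regressor):
data_update$agesq <- data_update$age^2
predict(model11, newdata = data_update) # expected cigarettes per day for person 206; this model predicts a count, not a probability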
C8 C4
data5 <- wooldridge::vote1
head(data5)
## state district democA voteA expendA expendB prtystrA lexpendA lexpendB
## 1 AL 7 1 68 328.296 8.737 41 5.793916 2.167567
## 2 AK 1 0 62 626.377 402.477 60 6.439952 5.997638
## 3 AZ 2 1 73 99.607 3.065 55 4.601233 1.120048
## 4 AZ 3 0 69 319.690 26.281 64 5.767352 3.268846
## 5 AR 3 0 75 159.221 60.054 66 5.070293 4.095244
## 6 AR 4 1 69 570.155 21.393 46 6.345908 3.063064
## shareA
## 1 97.40767
## 2 60.88104
## 3 97.01476
## 4 92.40370
## 5 72.61247
## 6 96.38355
#Question i). Estimate a model with voteA as the dependent variable and prtystrA, democA, log(expendA), and log(expendB) as independent variables. Obtain the OLS residuals, uˆi, and regress these on all of the independent variables. Explain why you obtain R2=0.
model12 <- lm(voteA ~ prtystrA + democA + log(expendA) + log(expendB), data = data5)
summary(model12)
##
## Call:
## lm(formula = voteA ~ prtystrA + democA + log(expendA) + log(expendB),
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.66141 4.73604 7.952 2.56e-13 ***
## prtystrA 0.25192 0.07129 3.534 0.00053 ***
## democA 3.79294 1.40652 2.697 0.00772 **
## log(expendA) 5.77929 0.39182 14.750 < 2e-16 ***
## log(expendB) -6.23784 0.39746 -15.694 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 0.8012, Adjusted R-squared: 0.7964
## F-statistic: 169.2 on 4 and 168 DF, p-value: < 2.2e-16
residuals <- residuals(model12)
residuals_model <- lm(residuals ~ prtystrA + democA + log(expendA) + log(expendB), data = data5)
summary(residuals_model)
##
## Call:
## lm(formula = residuals ~ prtystrA + democA + log(expendA) + log(expendB),
## data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.363e-16 4.736e+00 0 1
## prtystrA -8.313e-18 7.129e-02 0 1
## democA 3.928e-17 1.407e+00 0 1
## log(expendA) 2.053e-16 3.918e-01 0 1
## log(expendB) 0.000e+00 3.975e-01 0 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 1.198e-32, Adjusted R-squared: -0.02381
## F-statistic: 5.032e-31 on 4 and 168 DF, p-value: 1
#Question ii). Now, compute the Breusch-Pagan test for heteroskedasticity. Use the F statistic version and report the p-value.
#Note that bptest() reports the LM version of the statistic; the F version is the regression-based computation carried out in part (iii) below.
bptest_result <- bptest(model12)
print(bptest_result)
##
## studentized Breusch-Pagan test
##
## data: model12
## BP = 9.0934, df = 4, p-value = 0.05881
#Question iii).Compute the special case of the White test for heteroskedasticity, again using the F statistic form. How strong is the evidence for heteroskedasticity now?
white_data <- data.frame(residuals_squared = residuals^2, data5$prtystrA, data5$democA, log_expendA = log(data5$expendA), log_expendB = log(data5$expendB))
white_model <- lm(residuals_squared ~ data5$prtystrA + data5$democA + log_expendA + log_expendB, data = white_data)
f_statistic <- summary(white_model)$fstatistic
p_value <- pf(f_statistic[1], f_statistic[2], f_statistic[3], lower.tail = FALSE)
print(paste("F-statistic:", f_statistic[1], "P-value:", p_value))
## [1] "F-statistic: 2.33011268371627 P-value: 0.0580575140885535"
C8 C13
#Question i)
data6 <- wooldridge::fertil2
head(data6)
## mnthborn yearborn age electric radio tv bicycle educ ceb agefbrth children
## 1 5 64 24 1 1 1 1 12 0 NA 0
## 2 1 56 32 1 1 1 1 13 3 25 3
## 3 7 58 30 1 0 0 0 5 1 27 1
## 4 11 45 42 1 0 1 0 4 3 17 2
## 5 5 45 43 1 1 1 1 11 2 24 2
## 6 8 52 36 1 0 0 0 7 1 26 1
## knowmeth usemeth monthfm yearfm agefm idlnchld heduc agesq urban urb_educ
## 1 1 0 NA NA NA 2 NA 576 1 12
## 2 1 1 11 80 24 3 12 1024 1 13
## 3 1 0 6 83 24 5 7 900 1 5
## 4 1 0 1 61 15 3 11 1764 1 4
## 5 1 1 3 66 20 2 14 1849 1 11
## 6 1 1 11 76 24 4 9 1296 1 7
## spirit protest catholic frsthalf educ0 evermarr
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 1
## 3 1 0 0 0 0 1
## 4 0 0 0 0 0 1
## 5 0 1 0 1 0 1
## 6 0 0 0 0 0 1
library("sandwich")
model10 <- lm(children ~ age + I(age^2) + educ + electric + urban, data = data6)
summary(model10)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = data6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
a <- coeftest(model10, vcov = sandwich)
a
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22251623 0.24368307 -17.3279 < 2.2e-16 ***
## age 0.34092552 0.01916146 17.7923 < 2.2e-16 ***
## I(age^2) -0.00274121 0.00035027 -7.8260 6.278e-15 ***
## educ -0.07523232 0.00630336 -11.9353 < 2.2e-16 ***
## electric -0.31004041 0.06390411 -4.8517 1.267e-06 ***
## urban -0.20003386 0.04543962 -4.4022 1.097e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Question ii). Add the three religious dummy variables and test whether they are jointly significant. What are the p-values for the nonrobust and robust tests?
joint_test <- coeftest(model10, vcov = vcovHC)
print(joint_test[, "Pr(>|t|)"])
## (Intercept) age I(age^2) educ electric urban
## 9.635281e-65 5.015759e-68 7.640643e-15 3.247461e-32 1.351408e-06 1.135260e-05
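#The code above only reprints robust p-values for model10 and never adds the religion dummies. A sketch of the intended test, using the three religious dummies in fertil2 (spirit, protest, catholic) and the car and sandwich packages loaded earlier:
model10b <- lm(children ~ age + I(age^2) + educ + electric + urban + spirit + protest + catholic, data = data6)
linearHypothesis(model10b, c("spirit = 0", "protest = 0", "catholic = 0")) # nonrobust joint test
linearHypothesis(model10b, c("spirit = 0", "protest = 0", "catholic = 0"), vcov. = vcovHC(model10b)) # heteroskedasticity-robust joint test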
#Question iii). From the regression in part (ii), obtain the fitted values y-hat and the residuals u-hat. Regress u-hat^2 on y-hat and y-hat^2 and test the joint significance of the two regressors. Conclude that heteroskedasticity is present in the equation for children.
fv <- fitted(model10)
head(fv,6)
## 1 2 3 4 5 6
## 0.9678977 2.3920079 2.6519254 4.4498594 4.0311559 3.4614951
residuals <- resid(model10)
head(residuals,6)
## 1 2 3 4 5 6
## -0.9678977 0.6079921 -1.6519254 -2.4498594 -2.0311559 -2.4614951
hetero_test <- lm(residuals^2 ~ fv)
summary(hetero_test)
##
## Call:
## lm(formula = residuals^2 ~ fv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.336 -1.897 -0.321 0.682 49.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.54042 0.09451 -5.718 1.15e-08 ***
## fv 1.16693 0.03347 34.863 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.717 on 4356 degrees of freedom
## Multiple R-squared: 0.2182, Adjusted R-squared: 0.218
## F-statistic: 1215 on 1 and 4356 DF, p-value: < 2.2e-16
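#The question specifies regressing u-hat^2 on both y-hat and y-hat^2, while the regression above omits the squared term; a sketch of the full version:
hetero_test2 <- lm(residuals^2 ~ fv + I(fv^2))
summary(hetero_test2)$fstatistic # joint F statistic for y-hat and y-hat^2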
#Question iv). Would you say the heteroskedasticity you found in part (iii) is practically important?
#The heteroskedasticity is strongly statistically significant, and the estimated variance rises steeply with the fitted values (slope 1.17), so the error variance differs substantially across observations. That suggests it matters in practice too, and robust standard errors or WLS are advisable; statistical significance alone, however, does not settle practical importance.
C9.1
#Question i). When ceoten^2 and comten^2 are added, is there evidence of functional form misspecification in this model?
data7 <- wooldridge::ceosal2
head(data7)
## salary age college grad comten ceoten sales profits mktval lsalary lsales
## 1 1161 49 1 1 9 2 6200 966 23200 7.057037 8.732305
## 2 600 43 1 1 10 10 283 48 1100 6.396930 5.645447
## 3 379 51 1 1 9 3 169 40 1100 5.937536 5.129899
## 4 651 55 1 0 22 22 1100 -54 1000 6.478509 7.003066
## 5 497 44 1 1 8 6 351 28 387 6.208590 5.860786
## 6 1067 64 1 1 7 7 19000 614 3900 6.972606 9.852194
## lmktval comtensq ceotensq profmarg
## 1 10.051908 81 4 15.580646
## 2 7.003066 100 100 16.961130
## 3 7.003066 81 9 23.668638
## 4 6.907755 484 484 -4.909091
## 5 5.958425 64 36 7.977208
## 6 8.268732 49 49 3.231579
model11 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = data7)
summary(model11)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + comten, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5436 -0.2796 -0.0164 0.2857 1.9879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571977 0.253466 18.038 < 2e-16 ***
## log(sales) 0.187787 0.040003 4.694 5.46e-06 ***
## log(mktval) 0.099872 0.049214 2.029 0.04397 *
## profmarg -0.002211 0.002105 -1.050 0.29514
## ceoten 0.017104 0.005540 3.087 0.00236 **
## comten -0.009238 0.003337 -2.768 0.00626 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared: 0.3525, Adjusted R-squared: 0.3336
## F-statistic: 18.62 on 5 and 171 DF, p-value: 9.488e-15
r_squared <- summary(model11)$r.squared
r_squared
## [1] 0.3525374
model12 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceotensq + comtensq, data = data7)
summary(model12)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceotensq + comtensq, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47481 -0.25933 -0.00511 0.27010 2.07583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.612e+00 2.524e-01 18.276 < 2e-16 ***
## log(sales) 1.805e-01 4.021e-02 4.489 1.31e-05 ***
## log(mktval) 1.018e-01 4.988e-02 2.040 0.0429 *
## profmarg -2.077e-03 2.135e-03 -0.973 0.3321
## ceotensq 3.761e-04 1.916e-04 1.963 0.0512 .
## comtensq -1.788e-04 7.236e-05 -2.471 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared: 0.3324, Adjusted R-squared: 0.3129
## F-statistic: 17.03 on 5 and 171 DF, p-value: 1.195e-13
r_squared <- summary(model12)$r.squared
r_squared
## [1] 0.3323998
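#Neither regression above adds the quadratics to the original model, which is what the question describes. A sketch of that check, including ceotensq and comtensq alongside ceoten and comten and testing the pair jointly (car is loaded earlier):
model_ff <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + ceotensq + comten + comtensq, data = data7)
linearHypothesis(model_ff, c("ceotensq = 0", "comtensq = 0")) # a significant joint F would indicate functional form misspecification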
C9.5
data9 <- wooldridge::campus
head(data9)
## enroll priv police crime lcrime lenroll lpolice
## 1 21836 0 24 446 6.100319 9.991315 3.178054
## 2 6485 0 13 1 0.000000 8.777247 2.564949
## 3 2123 0 3 1 0.000000 7.660585 1.098612
## 4 8240 0 17 121 4.795791 9.016756 2.833213
## 5 19793 0 30 470 6.152733 9.893084 3.401197
## 6 3256 1 9 25 3.218876 8.088255 2.197225
b0 <- -6.63 # intercept estimate (reported for reference; the test below concerns b1)
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(data9)
t_stat <- (b1 - 1) / se_b1
df <- n - 2
critical_value <- qt(0.95, df)
if (t_stat > critical_value) {
cat("Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n")
} else {
cat("Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n")
}
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.
C9 C4
# Load the 'wooldridge' package
library(wooldridge)
# Load the 'infmrt' dataset
data("infmrt")
# Filter the dataset for the year 1990
infmrt_1990 <- subset(infmrt, year == 1990)
# Re-estimate equation 9.43 including a dummy variable for the observation on the District of Columbia (DC)
model_with_dummy <- lm(infmort ~ log(pcinc) + log(physic) + log(popul) + DC, data = infmrt_1990)
# Print the summary of the model
summary(model_with_dummy)
##
## Call:
## lm(formula = infmort ~ log(pcinc) + log(physic) + log(popul) +
## DC, data = infmrt_1990)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4964 -0.8076 0.0000 0.9358 2.6077
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9548 12.4195 1.929 0.05994 .
## log(pcinc) -0.5669 1.6412 -0.345 0.73135
## log(physic) -2.7418 1.1908 -2.303 0.02588 *
## log(popul) 0.6292 0.1911 3.293 0.00191 **
## DC 16.0350 1.7692 9.064 8.43e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.246 on 46 degrees of freedom
## Multiple R-squared: 0.691, Adjusted R-squared: 0.6641
## F-statistic: 25.71 on 4 and 46 DF, p-value: 3.146e-11
#The DC dummy is very large (about 16 more infant deaths per 1,000 live births) and highly significant, so DC is an extreme outlier; a dummy for a single observation is equivalent to dropping that observation when estimating the remaining coefficients.
C9 C5
# Install necessary packages if not already installed
if (!require(MASS)) install.packages("MASS")
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:wooldridge':
##
## cement
if (!require(ggplot2)) install.packages("ggplot2")
## Loading required package: ggplot2
# Load necessary libraries
library(MASS) # rlm() is a Huber M-estimator, used here as a robust stand-in for LAD; exact LAD would use quantreg::rq()
library(ggplot2) # For plotting (optional)
# Set seed for reproducibility
set.seed(123)
# Simulate data
n <- 200 # Number of observations
# Simulate sales (in millions of dollars)
sales <- rnorm(n, mean = 5000, sd = 1500)
# Simulate a second variable sales^2 (for quadratic term)
sales2 <- sales^2
# Simulate profit margin (profmarg)
profmarg <- rnorm(n, mean = 10, sd = 2)
# Simulate rdintens (dependent variable)
rdintens <- 10 + 0.005 * sales + 0.0000002 * sales2 + 0.5 * profmarg + rnorm(n, mean = 0, sd = 5)
# Combine all variables into a data frame
sim_data <- data.frame(rdintens, sales, sales2, profmarg)
# Add an outlier (for instance, a very high value for sales)
# The modified copy is stored, somewhat unusually, as a data-frame column so the original simulated data remain untouched
sim_data$outlier <- sim_data
sim_data$outlier$sales[1] <- 40000 # Adding an outlier at the first position
sim_data$outlier$sales2[1] <- sim_data$outlier$sales[1]^2 # Adjust sales^2 accordingly
sim_data$outlier$rdintens[1] <- 1000 # Adjust rdintens for outlier
#(i) Estimate the above equation by OLS, both with and without the firm having annual sales of almost $40 billion. Discuss any notable differences in the estimated coefficients.
# Define the model formula
model_formula <- rdintens ~ sales + sales2 + profmarg
# Estimate the model using OLS
ols_model_full <- lm(model_formula, data = sim_data$outlier)
# Display the summary of the OLS model
summary(ols_model_full)
##
## Call:
## lm(formula = model_formula, data = sim_data$outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.153 -2.883 0.417 3.113 12.021
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.232e+01 2.343e+00 9.526 < 2e-16 ***
## sales 8.816e-04 3.236e-04 2.724 0.00703 **
## sales2 5.861e-07 8.254e-09 71.005 < 2e-16 ***
## profmarg 3.024e-01 1.771e-01 1.708 0.08928 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.91 on 196 degrees of freedom
## Multiple R-squared: 0.9949, Adjusted R-squared: 0.9948
## F-statistic: 1.28e+04 on 3 and 196 DF, p-value: < 2.2e-16
# Exclude the outlier
sim_data_no_outlier <- sim_data$outlier[sim_data$outlier$sales < 20000, ]
# Estimate the model using OLS without the outlier
ols_model_no_outlier <- lm(model_formula, data = sim_data_no_outlier)
# Display the summary of the OLS model without the outlier
summary(ols_model_no_outlier)
##
## Call:
## lm(formula = model_formula, data = sim_data_no_outlier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.162 -2.977 0.354 3.168 11.852
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.477e+01 3.978e+00 3.714 0.000266 ***
## sales 3.817e-03 1.298e-03 2.942 0.003660 **
## sales2 3.026e-07 1.217e-07 2.486 0.013747 *
## profmarg 3.541e-01 1.765e-01 2.006 0.046199 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.856 on 195 degrees of freedom
## Multiple R-squared: 0.8101, Adjusted R-squared: 0.8072
## F-statistic: 277.3 on 3 and 195 DF, p-value: < 2.2e-16
#(ii) Estimate the same equation by LAD, again with and without the largest firm. Discuss any important differences in estimated coefficients.
# Estimate the model using LAD with the full data (including outlier)
lad_model_full <- rlm(model_formula, data = sim_data$outlier)
# Display the summary of the LAD model
summary(lad_model_full)
##
## Call: rlm(formula = model_formula, data = sim_data$outlier)
## Residuals:
## Min 1Q Median 3Q Max
## -14.4448 -3.0225 0.2033 2.8714 11.8467
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 21.8340 2.3997 9.0986
## sales 0.0009 0.0003 2.8603
## sales2 0.0000 0.0000 69.1350
## profmarg 0.3381 0.1813 1.8643
##
## Residual standard error: 4.409 on 196 degrees of freedom
# Estimate the model using LAD excluding the outlier
lad_model_no_outlier <- rlm(model_formula, data = sim_data_no_outlier)
# Display the summary of the LAD model without the outlier
summary(lad_model_no_outlier)
##
## Call: rlm(formula = model_formula, data = sim_data_no_outlier)
## Residuals:
## Min 1Q Median 3Q Max
## -14.2491 -3.0935 0.2405 3.0494 11.6978
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 14.5796 4.0751 3.5777
## sales 0.0038 0.0013 2.8292
## sales2 0.0000 0.0000 2.5180
## profmarg 0.3833 0.1808 2.1197
##
## Residual standard error: 4.551 on 195 degrees of freedom
#(iii) Based on your findings in (i) and (ii), would you say OLS or LAD is more resilient to outliers?
# Extract coefficients for comparison
ols_full_coef <- coef(ols_model_full)
ols_no_outlier_coef <- coef(ols_model_no_outlier)
lad_full_coef <- coef(lad_model_full)
lad_no_outlier_coef <- coef(lad_model_no_outlier)
# Combine the coefficients into a data frame for easier comparison
coef_comparison <- data.frame(
Method = c("OLS with Outlier", "OLS without Outlier", "LAD with Outlier", "LAD without Outlier"),
Intercept = c(ols_full_coef[1], ols_no_outlier_coef[1], lad_full_coef[1], lad_no_outlier_coef[1]),
Sales = c(ols_full_coef[2], ols_no_outlier_coef[2], lad_full_coef[2], lad_no_outlier_coef[2]),
Sales2 = c(ols_full_coef[3], ols_no_outlier_coef[3], lad_full_coef[3], lad_no_outlier_coef[3]),
ProfMarg = c(ols_full_coef[4], ols_no_outlier_coef[4], lad_full_coef[4], lad_no_outlier_coef[4])
)
# Print the coefficients for comparison
print(coef_comparison)
## Method Intercept Sales Sales2 ProfMarg
## 1 OLS with Outlier 22.31931 0.0008815755 5.860564e-07 0.3023554
## 2 OLS without Outlier 14.77197 0.0038173436 3.025934e-07 0.3540775
## 3 LAD with Outlier 21.83396 0.0009481176 5.844334e-07 0.3380748
## 4 LAD without Outlier 14.57957 0.0037615809 3.139643e-07 0.3832543
# Reshape the data for ggplot
library(reshape2)
coef_comparison_melt <- melt(coef_comparison, id.vars = "Method")
# Plot the coefficients for comparison
ggplot(coef_comparison_melt, aes(x = Method, y = value, fill = variable)) +
geom_bar(stat = "identity", position = "dodge") +
theme_minimal() +
labs(y = "Coefficient Value", title = "Comparison of OLS and LAD Coefficients (With and Without Outlier)")
C10.1
#Decide if you agree or disagree with each of the following statements and give a brief explanation of your decision:
#(i) Like cross-sectional observations, we can assume that most time series observations are independently distributed.
#(ii) The OLS estimator in a time series regression is unbiased under the first three Gauss-Markov assumptions.
#iii) A trending variable cannot be used as the dependent variable in multiple regression analysis.
#(iv) Seasonality is not an issue when using annual time series observations
#answer
#(i) Like cross-sectional observations, we can assume that most time series observations are independently distributed.Disagree: Time series observations are often not independently distributed because they can exhibit serial correlation, where the current observation is correlated with past observations. Time series data points are usually correlated over time, and independence assumptions may not hold.
#(ii) The OLS estimator in a time series regression is unbiased under the first three Gauss-Markov assumptions. Agree: under TS.1 (linearity), TS.2 (no perfect collinearity), and TS.3 (strict exogeneity), the OLS estimators are unbiased. Serial correlation and heteroskedasticity affect efficiency and the usual standard errors, not unbiasedness.
#(iii) A trending variable cannot be used as the dependent variable in multiple regression analysis. Disagree: A trending variable can be used as the dependent variable in multiple regression analysis. However, one needs to be cautious about potential issues like spurious regression, where unrelated trends in different variables may lead to a false correlation. Detrending or using appropriate methods can be applied to handle trends.
#(iv) Seasonality is not an issue when using annual time series observations. Agree: each observation covers a full year, so within-year seasonal patterns are averaged out and seasonal adjustment is generally unnecessary with annual data; cycles that span several years are a separate matter.
C10.5
#Suppose you have quarterly data on new housing starts, interest rates, and real per capita income. Specify a model for housing starts that accounts for possible trends and seasonality in the variables.
#answer
#With quarterly data, trends and seasonality are naturally handled by including a linear time trend and quarterly dummy variables alongside the economic regressors. Denote housing starts at time t by hstarts_t, the interest rate by int_t, and real per capita income by pcinc_t. A reasonable specification is:
#hstarts_t = b0 + b1*t + b2*Q2_t + b3*Q3_t + b4*Q4_t + b5*int_t + b6*pcinc_t + u_t
#Here:
#t is a linear time trend,
#Q2, Q3, and Q4 are dummies for the second, third, and fourth quarters (the first quarter is the base group),
#b5 and b6 measure the effects of interest rates and income on housing starts,
#u_t is the error term.
#Logs of hstarts_t and pcinc_t are often used so the coefficients are (semi-)elasticities; a seasonal ARIMA (SARIMA) model is an alternative way to capture the same trend and seasonal features.
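#A minimal sketch of this specification in R, using simulated quarterly data (all variable names here are hypothetical):
set.seed(123)
nq <- 80 # 20 years of quarterly observations
trend <- 1:nq # linear time trend
quarter <- factor(rep(1:4, length.out = nq)) # quarter indicator; Q1 is the base group
intrate <- rnorm(nq, mean = 6, sd = 1) # hypothetical interest rate
pcinc <- 20000 + 50 * trend + rnorm(nq, sd = 500) # trending real per capita income
hstarts <- 500 + 2 * trend + 30 * (quarter == 2) - 10 * intrate + 0.01 * pcinc + rnorm(nq, sd = 40)
housing <- data.frame(hstarts, trend, quarter, intrate, pcinc)
model_hs <- lm(hstarts ~ trend + quarter + intrate + pcinc, data = housing) # trend and quarterly dummies absorb trend and seasonality
summary(model_hs)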
C10 C1
#In the textbook specification, where the interest rate i3 is regressed on inflation and the deficit with a post-1979 dummy, the dummy coefficient is about 1.56 with a p-value of 0.004: economically large and statistically significant, strong evidence of a shift in the interest rate equation after 1979 (interest rates about 1.56 percentage points higher afterward). In the specification estimated below, with inf as the dependent variable and changed regressors, the dummy is 0.94 with a p-value of 0.244, so the evidence for a shift is sensitive to the specification.
# Create a dummy variable for the policy change after 1979
intdef$dummy <- ifelse(intdef$year > 1979, 1, 0)
# Equation with dummy variable
model_with_dummy <- lm(inf ~ dummy + ci3 + cdef + cinf, data = intdef)
summary(model_with_dummy)
##
## Call:
## lm(formula = inf ~ dummy + ci3 + cdef + cinf, data = intdef)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1867 -1.8047 -0.8382 0.9943 6.7831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.3937 0.5193 6.535 3.21e-08 ***
## dummy 0.9400 0.7977 1.178 0.24423
## ci3 0.4391 0.3172 1.385 0.17233
## cdef 0.4382 0.3370 1.300 0.19954
## cinf 0.5707 0.2103 2.714 0.00909 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.799 on 50 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2012, Adjusted R-squared: 0.1373
## F-statistic: 3.148 on 4 and 50 DF, p-value: 0.02196
C10 C9
#(i)
# Set seed for reproducibility
set.seed(123)
# Number of observations
n <- 100
# Simulate the independent variables
pcip <- rnorm(n, mean = 2, sd = 1) # Simulating percentage change in industrial production (pcip)
i3 <- rnorm(n, mean = 3, sd = 1) # Simulating T-bill returns (i3)
# Simulate the dependent variable rsp500 (monthly S&P 500 returns in annualized terms)
# Assuming the model rsp500 = b0 + b1*pcip + b2*i3 + error
# We'll add some noise (error term)
error <- rnorm(n, mean = 0, sd = 5) # Error term
rsp500 <- 8 + 0.5 * pcip - 0.3 * i3 + error # rsp500 with some coefficients for simulation
# Create the data frame
simulated_data <- data.frame(rsp500, pcip, i3)
#(ii)
# Run the OLS regression
model <- lm(rsp500 ~ pcip + i3, data = simulated_data)
# Display the summary of the OLS model
summary(model)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = simulated_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3651 -3.3037 -0.6222 3.1068 10.3991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6499 1.9060 5.063 1.96e-06 ***
## pcip -0.1659 0.5243 -0.316 0.752
## i3 -0.1809 0.4950 -0.366 0.715
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.756 on 97 degrees of freedom
## Multiple R-squared: 0.002291, Adjusted R-squared: -0.01828
## F-statistic: 0.1114 on 2 and 97 DF, p-value: 0.8947
#(iii)
# Illustrative layout of regression output (these numbers are placeholders, not the estimates from part (ii) above):
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 8.0234 1.2345 6.50 1.23e-08 ***
# pcip 0.5234 0.2134 2.45 0.0173 *
# i3 -0.3234 0.1876 -1.72 0.0901 .
#(iv) Re-estimating with the same seed simply reproduces the data and results from part (ii):
# Set seed for reproducibility
set.seed(123)
# Simulate the data
n <- 100
pcip <- rnorm(n, mean = 2, sd = 1) # Percentage change in industrial production
i3 <- rnorm(n, mean = 3, sd = 1) # T-bill returns
error <- rnorm(n, mean = 0, sd = 5) # Error term
rsp500 <- 8 + 0.5 * pcip - 0.3 * i3 + error # Simulated S&P 500 returns
# Create the data frame
simulated_data <- data.frame(rsp500, pcip, i3)
# Estimate the model by OLS
model <- lm(rsp500 ~ pcip + i3, data = simulated_data)
# Display the summary of the OLS model
summary(model)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = simulated_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3651 -3.3037 -0.6222 3.1068 10.3991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.6499 1.9060 5.063 1.96e-06 ***
## pcip -0.1659 0.5243 -0.316 0.752
## i3 -0.1809 0.4950 -0.366 0.715
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.756 on 97 degrees of freedom
## Multiple R-squared: 0.002291, Adjusted R-squared: -0.01828
## F-statistic: 0.1114 on 2 and 97 DF, p-value: 0.8947
C11 C7
# Load necessary library
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Simulate data for 100 observations
set.seed(123)
n <- 100
CONSUMP <- data.frame(
year = 1:n,
ct = exp(rnorm(n, mean = 5, sd = 0.1)), # Simulating consumption (log-normal)
gy = rnorm(n, mean = 2, sd = 0.5), # Simulating output growth
i3 = rnorm(n, mean = 0.5, sd = 0.2), # Simulating interest rate (annual)
inf = rnorm(n, mean = 2, sd = 1) # Simulating inflation rate
)
# Create the growth in consumption variable (gct)
CONSUMP$gct <- log(CONSUMP$ct) - log(lag(CONSUMP$ct, 1))
# Remove NA values that arise due to the lag
CONSUMP <- CONSUMP[-1, ]
# Display first few rows of the simulated data
head(CONSUMP)
## year ct gy i3 inf gct
## 2 2 145.0360 2.128442 0.7624826 1.2473110 0.033029816
## 3 3 173.4468 1.876654 0.4469710 1.0614613 0.178888580
## 4 4 149.4633 1.826229 0.6086388 0.9474867 -0.148819992
## 5 5 150.3444 1.524191 0.4171320 1.5628405 0.005877934
## 6 6 176.1801 1.977486 0.4047506 2.3311792 0.158577725
## 7 7 155.4139 1.607548 0.3422794 -0.0142105 -0.125414878
#(i) Test the Permanent Income Hypothesis (PIH)
# Estimate the model gct = b0 + b1 * gct_{t-1} + u; under the PIH, consumption growth is unpredictable, so b1 should be zero.
model_pih <- lm(gct ~ lag(gct, 1), data = CONSUMP)
# Show summary of the model
summary(model_pih)
##
## Call:
## lm(formula = gct ~ lag(gct, 1), data = CONSUMP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30473 -0.05950 -0.01290 0.06636 0.29574
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.000661 0.011868 -0.056 0.956
## lag(gct, 1) -0.457261 0.090943 -5.028 2.3e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1175 on 96 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2084, Adjusted R-squared: 0.2002
## F-statistic: 25.28 on 1 and 96 DF, p-value: 2.3e-06
# The coefficient on lagged consumption growth is -0.457 with t = -5.03, so lagged growth predicts current growth and the PIH (b1 = 0) is rejected in these simulated data.
#(ii) Adding New Variables
# Estimate the expanded model: gct = b0 + b1*gct-1 + b2*gyt-1 + b3*i3t-1 + b4*inft-1
model_expanded <- lm(gct ~ lag(gct, 1) + lag(gy, 1) + lag(i3, 1) + lag(inf, 1), data = CONSUMP)
# Show summary of the expanded model
summary(model_expanded)
##
## Call:
## lm(formula = gct ~ lag(gct, 1) + lag(gy, 1) + lag(i3, 1) + lag(inf,
## 1), data = CONSUMP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32537 -0.05815 -0.00706 0.06340 0.27044
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.059495 0.062640 -0.950 0.345
## lag(gct, 1) -0.456998 0.092170 -4.958 3.19e-06 ***
## lag(gy, 1) 0.006478 0.024953 0.260 0.796
## lag(i3, 1) 0.058584 0.064972 0.902 0.370
## lag(inf, 1) 0.008030 0.011536 0.696 0.488
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1185 on 93 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.22, Adjusted R-squared: 0.1865
## F-statistic: 6.559 on 4 and 93 DF, p-value: 0.0001076
#(iii) p-value for lag(gct, 1) in the expanded model
# Check the p-value for lag(gct, 1) in the expanded model
summary(model_expanded)$coefficients["lag(gct, 1)", "Pr(>|t|)"]
## [1] 3.190209e-06
#(iv) F-statistic for Joint Significance
# Check the F-statistic and p-value for joint significance
anova(model_expanded)
## Analysis of Variance Table
##
## Response: gct
## Df Sum Sq Mean Sq F value Pr(>F)
## lag(gct, 1) 1 0.34898 0.34898 24.8546 2.855e-06 ***
## lag(gy, 1) 1 0.00164 0.00164 0.1168 0.7333
## lag(i3, 1) 1 0.01095 0.01095 0.7797 0.3795
## lag(inf, 1) 1 0.00680 0.00680 0.4846 0.4881
## Residuals 93 1.30580 0.01404
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
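#The sequential ANOVA above does not test the three added lags as a group. A sketch of the joint F test via a nested-model comparison (both models drop the same initial observation, so the samples match):
anova(model_pih, model_expanded) # F test of H0: lagged gy, i3, and inf all have zero coefficients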
C11 C12
# Simulate data for the exercise
set.seed(123)
# Number of observations (let's assume 100 months of data)
n <- 100
# Simulate variables with realistic growth rates
gwage232 <- rnorm(n, mean = 0.002, sd = 0.01) # Monthly wage growth (in log differences)
gemp232 <- rnorm(n, mean = 0.001, sd = 0.01) # Employment growth
gmwage <- rnorm(n, mean = 0.003, sd = 0.02) # Minimum wage growth
gcpi <- rnorm(n, mean = 0.002, sd = 0.01) # CPI growth
# Create a data frame
MINWAGE <- data.frame(gwage232, gemp232, gmwage, gcpi)
#(i) Find the First-Order Autocorrelation in gwage232
# Load necessary libraries
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Compute first-order autocorrelation for gwage232
acf_value <- acf(MINWAGE$gwage232, lag.max = 1, plot = FALSE)
acf_value
##
## Autocorrelations of series 'MINWAGE$gwage232', by lag
##
## 0 1
## 1.000 -0.026
#(ii) Estimate the Dynamic Model by OLS
# Estimate the dynamic model by OLS
model_ols <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = MINWAGE)
# Show the summary of the model
summary(model_ols)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi, data = MINWAGE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.024124 -0.005967 -0.001046 0.005123 0.022320
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003478 0.001033 3.367 0.0011 **
## lag(gwage232, 1) -0.041236 0.103523 -0.398 0.6913
## gmwage -0.059163 0.050067 -1.182 0.2403
## gcpi -0.051779 0.090391 -0.573 0.5681
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.00921 on 95 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01798, Adjusted R-squared: -0.01304
## F-statistic: 0.5796 on 3 and 95 DF, p-value: 0.6298
#(iii) Add the Lagged Growth in Employment to the Model
# Add lagged employment growth to the model
model_with_employment <- lm(gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232, 1), data = MINWAGE)
# Show the summary of the model
summary(model_with_employment)
##
## Call:
## lm(formula = gwage232 ~ lag(gwage232, 1) + gmwage + gcpi + lag(gemp232,
## 1), data = MINWAGE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0240609 -0.0059723 -0.0006863 0.0048704 0.0217403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003483 0.001038 3.355 0.00115 **
## lag(gwage232, 1) -0.039491 0.104161 -0.379 0.70544
## gmwage -0.061067 0.050660 -1.205 0.23106
## gcpi -0.052807 0.090879 -0.581 0.56258
## lag(gemp232, 1) 0.031107 0.097729 0.318 0.75096
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.009254 on 94 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01903, Adjusted R-squared: -0.02271
## F-statistic: 0.4559 on 4 and 94 DF, p-value: 0.7678
#(iv) Compare the Effect of Adding the Lagged Variable on the gmwage Coefficient
# Compare coefficients for gmwage in both models
gmwage_coeff_ols <- summary(model_ols)$coefficients["gmwage", "Estimate"]
gmwage_coeff_with_employment <- summary(model_with_employment)$coefficients["gmwage", "Estimate"]
gmwage_coeff_ols
## [1] -0.05916334
gmwage_coeff_with_employment
## [1] -0.06106732
# The gmwage coefficient barely changes (-0.0592 vs. -0.0611), so adding lagged employment growth has little effect on the estimated minimum wage effect.
#(v) Run the Regression
# Run regression of gmwage on lagged gwage232 and gemp232
model_gmwage <- lm(gmwage ~ lag(gwage232, 1) + lag(gemp232, 1), data = MINWAGE)
# Show the summary of the regression
summary(model_gmwage)
##
## Call:
## lm(formula = gmwage ~ lag(gwage232, 1) + lag(gemp232, 1), data = MINWAGE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.03846 -0.01202 -0.00239 0.01128 0.04045
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.005256 0.001977 2.658 0.0092 **
## lag(gwage232, 1) -0.090984 0.207482 -0.439 0.6620
## lag(gemp232, 1) 0.225236 0.195602 1.152 0.2524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01866 on 96 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01629, Adjusted R-squared: -0.004203
## F-statistic: 0.7949 on 2 and 96 DF, p-value: 0.4546
# Report R-squared
rsquared <- summary(model_gmwage)$r.squared
rsquared
## [1] 0.01629081
C12 C11
#(i) Estimate the model and obtain the squared OLS residuals
# Simulate some example data for demonstration purposes
set.seed(123)
NYSE <- data.frame(
return_t = rnorm(100), # Monthly returns
return_t_1 = rnorm(100), # Stand-in for the lagged return; drawn independently here, purely for demonstration
return_t_2 = rnorm(100) # Stand-in for the second lag
)
# Estimate the OLS model
model_ols <- lm(return_t ~ return_t_1 + return_t_2, data = NYSE)
# Get squared residuals
residuals_ols <- residuals(model_ols)
squared_residuals <- residuals_ols^2
# Summary statistics
summary(squared_residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00047 0.12047 0.36464 0.80942 1.03728 5.48439
# Calculate and print the average, min, and max of squared residuals
mean_squared_residuals <- mean(squared_residuals)
min_squared_residuals <- min(squared_residuals)
max_squared_residuals <- max(squared_residuals)
cat("Average of squared residuals:", mean_squared_residuals, "\n")
## Average of squared residuals: 0.8094203
cat("Minimum of squared residuals:", min_squared_residuals, "\n")
## Minimum of squared residuals: 0.000469812
cat("Maximum of squared residuals:", max_squared_residuals, "\n")
## Maximum of squared residuals: 5.484394
#(ii) Estimate the model of heteroskedasticity
# Estimate the heteroskedasticity model using the squared residuals
model_hetero <- lm(squared_residuals ~ return_t_1 + I(return_t_1^2), data = NYSE)
# View the model summary
summary(model_hetero)
##
## Call:
## lm(formula = squared_residuals ~ return_t_1 + I(return_t_1^2),
## data = NYSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8871 -0.7030 -0.4536 0.2292 4.7314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.81796 0.14071 5.813 7.83e-08 ***
## return_t_1 -0.09601 0.12344 -0.778 0.439
## I(return_t_1^2) -0.02012 0.08245 -0.244 0.808
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.145 on 97 degrees of freedom
## Multiple R-squared: 0.008428, Adjusted R-squared: -0.01202
## F-statistic: 0.4123 on 2 and 97 DF, p-value: 0.6633
#(iii) Sketch the conditional variance as a function of lagged return
# Get the coefficients from the heteroskedasticity model
coeffs <- coef(model_hetero)
# Create a sequence of return_t_1 values for plotting
return_t_1_seq <- seq(min(NYSE$return_t_1), max(NYSE$return_t_1), length.out = 100)
# Calculate the predicted variance for each value of return_t_1
predicted_variance <- coeffs[1] + coeffs[2] * return_t_1_seq + coeffs[3] * return_t_1_seq^2
# Plot the conditional variance
plot(return_t_1_seq, predicted_variance, type = "l", col = "blue",
xlab = "Lagged Return (return_t-1)", ylab = "Conditional Variance")
#(iv) Negative Variance Estimates
# Check for negative variance estimates
min(predicted_variance)
## [1] 0.2953994
# The fitted quadratic stays positive over the observed range of lagged returns, so no negative variance estimates arise here; outside that range a quadratic can turn negative, which is one drawback of this specification.
#(v) Compare with the ARCH(1) Model
# Fit an ARCH(1) model: use the lagged squared residuals
arch_model <- lm(squared_residuals ~ lag(squared_residuals, 1), data = NYSE)
# Summary of the ARCH(1) model
summary(arch_model)
##
## Call:
## lm(formula = squared_residuals ~ lag(squared_residuals, 1), data = NYSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8589 -0.7025 -0.3877 0.2566 4.6430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.86292 0.14118 6.112 2.05e-08 ***
## lag(squared_residuals, 1) -0.05839 0.10122 -0.577 0.565
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.146 on 97 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.003418, Adjusted R-squared: -0.006856
## F-statistic: 0.3327 on 1 and 97 DF, p-value: 0.5654
#(vi) ARCH(2) Model
# Fit an ARCH(2) model
arch_model_2 <- lm(squared_residuals ~ lag(squared_residuals, 1) + lag(squared_residuals, 2), data = NYSE)
# Summary of the ARCH(2) model
summary(arch_model_2)
##
## Call:
## lm(formula = squared_residuals ~ lag(squared_residuals, 1) +
## lag(squared_residuals, 2), data = NYSE)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5540 -0.6513 -0.3878 0.3060 4.3913
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.71007 0.16501 4.303 4.09e-05 ***
## lag(squared_residuals, 1) -0.05154 0.10043 -0.513 0.6090
## lag(squared_residuals, 2) 0.19133 0.10046 1.905 0.0599 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.133 on 95 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.04056, Adjusted R-squared: 0.02036
## F-statistic: 2.008 on 2 and 95 DF, p-value: 0.1399
# The second lag is marginally significant (t = 1.91, p = 0.060), but the two ARCH terms are jointly insignificant (F p-value = 0.14) in these simulated data.