Chapter 7

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dslabs)
library(ISLR2)
library(matlib)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl.init' failed, running with 'rgl.useNULL = TRUE'.
library(wooldridge)
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(sandwich)
data <- wooldridge::sleep75
head(sleep75)
##   age black case clerical construc educ earns74 gdhlth inlf leis1 leis2 leis3
## 1  32     0    1        0        0   12       0      0    1  3529  3479  3479
## 2  31     0    2        0        0   14    9500      1    1  2140  2140  2140
## 3  44     0    3        0        0   17   42500      1    1  4595  4505  4227
## 4  30     0    4        0        0   12   42500      1    1  3211  3211  3211
## 5  64     0    5        0        0   14    2500      1    1  4052  4007  4007
## 6  41     0    6        0        0   12       0      1    1  4812  4797  4797
##   smsa  lhrwage   lothinc male marr prot rlxall selfe sleep slpnaps south
## 1    0 1.955861 10.075380    1    1    1   3163     0  3113    3163     0
## 2    0 0.357674  0.000000    1    0    1   2920     1  2920    2920     1
## 3    1 3.021887  0.000000    1    1    0   3038     1  2670    2760     0
## 4    0 2.263844  0.000000    0    1    1   3083     1  3083    3083     0
## 5    0 1.011601  9.328213    1    1    1   3493     0  3448    3493     0
## 6    0 2.957511 10.657280    1    1    1   4078     0  4063    4078     0
##   spsepay spwrk75 totwrk union worknrm workscnd exper yngkid yrsmarr    hrwage
## 1       0       0   3438     0    3438        0    14      0      13  7.070004
## 2       0       0   5020     0    5020        0    11      0       0  1.429999
## 3   20000       1   2815     0    2815        0    21      0       0 20.529997
## 4    5000       1   3786     0    3786        0    12      0      12  9.619998
## 5    2400       1   2580     0    2580        0    44      0      33  2.750000
## 6       0       0   1205     0       0     1205    23      0      23 19.249998
##   agesq
## 1  1024
## 2   961
## 3  1936
## 4   900
## 5  4096
## 6  1681
data(sleep75)
model1 <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(model1)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2378.00  -243.29     6.74   259.24  1350.19 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3840.83197  235.10870  16.336   <2e-16 ***
## totwrk        -0.16342    0.01813  -9.013   <2e-16 ***
## educ         -11.71332    5.86689  -1.997   0.0463 *  
## age           -8.69668   11.20746  -0.776   0.4380    
## I(age^2)       0.12844    0.13390   0.959   0.3378    
## male          87.75243   34.32616   2.556   0.0108 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared:  0.1228, Adjusted R-squared:  0.1165 
## F-statistic: 19.59 on 5 and 700 DF,  p-value: < 2.2e-16
  1. From the model, the coefficient on male is 87.75: holding the other factors fixed, men sleep about 87.75 more minutes per week than women, roughly an hour and a half. The t statistic on male, about 2.56, is just below the 1% two-sided critical value of 2.58, so the evidence of a gender difference in sleep is strong.
2*(1 - pt(87.75/34.33, 700))
## [1] 0.01079613

The t statistic for this variable is 87.75/34.33 ≈ 2.56 and the p-value is 0.011, so at the 5% significance level the evidence of a difference is strong.

2*(1 - pt(0.163/0.018, 700))
## [1] 0
  1. The t statistic for totwrk is about −9.0, and its p-value is essentially zero, so there is a statistically significant tradeoff between working and sleeping. The estimated tradeoff is 0.163: one more minute of work is associated with 0.163 fewer minutes of sleep, so an additional hour of work costs roughly 9.8 minutes of sleep (see the check below).
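The 9.8-minute figure is just the totwrk coefficient rescaled from minutes to hours of work; a quick check:

0.16342 * 60  # minutes of sleep lost per additional hour of weekly work
## [1] 9.8052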
model1 <- lm(sleep ~ totwrk + educ + male, data = sleep75)
summary(model1)
## 
## Call:
## lm(formula = sleep ~ totwrk + educ + male, data = sleep75)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2380.27  -239.15     6.74   257.31  1370.63 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 3747.51727   81.00609  46.262  < 2e-16 ***
## totwrk        -0.16734    0.01794  -9.329  < 2e-16 ***
## educ         -13.88479    5.65757  -2.454  0.01436 *  
## male          90.96919   34.27441   2.654  0.00813 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 418 on 702 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1155 
## F-statistic: 31.69 on 3 and 702 DF,  p-value: < 2.2e-16
2*(1 - pt(2.19/0.53, 4131))
## [1] 3.666238e-05
  1. The R² is nearly unchanged (0.1193 versus 0.1228) when we re-run the regression excluding both age and age². In a model containing both terms, age has no effect only if the coefficients on age and age² are jointly zero, so the two terms should be tested jointly (a sketch follows).
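That joint test can be carried out by comparing the restricted and unrestricted fits with an F test; a minimal sketch, re-fitting both models from above:

model_ur <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)  # unrestricted
model_r <- lm(sleep ~ totwrk + educ + male, data = sleep75)                    # restricted
anova(model_r, model_ur)  # joint F test of age and I(age^2); F should be about 1.4 given the R-squareds above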
2*(1 - pt(45.09/4.29, 4131))
## [1] 0
2*(1 - pt(169.81/12.71, 4131))
## [1] 0
  1. Nonblack males are represented by female = 0 and black = 0; black males by female = 0 and black = 1. The difference between the two groups is therefore the coefficient on black, −169.81: black males score about 169.81 points lower on the SAT than comparable nonblack males, on average. The p-value is essentially zero, so this estimated difference is statistically significant.

3

library(wooldridge)
data1 <- wooldridge::gpa2
head(data1)
##    sat tothrs colgpa athlete verbmath hsize hsrank   hsperc female white black
## 1  920     43   2.04       1  0.48387  0.10      4 40.00000      1     0     0
## 2 1170     18   4.00       0  0.82813  9.40    191 20.31915      0     1     0
## 3  810     14   1.78       1  0.88372  1.19     42 35.29412      0     1     0
## 4  940     40   2.42       0  0.80769  5.71    252 44.13310      0     1     0
## 5 1180     18   2.61       0  0.73529  2.14     86 40.18692      0     1     0
## 6  980    114   3.03       0  0.81481  2.68     41 15.29851      1     1     0
##   hsizesq
## 1  0.0100
## 2 88.3600
## 3  1.4161
## 4 32.6041
## 5  4.5796
## 6  7.1824
data("gpa2")
model2 <- lm(sat ~ hsize + I(hsize^2) + female + black + I(female*black), data = gpa2)
summary(model2)
## 
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + I(female * 
##     black), data = gpa2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1028.0972     6.2902 163.445  < 2e-16 ***
## hsize               19.2971     3.8323   5.035 4.97e-07 ***
## I(hsize^2)          -2.1948     0.5272  -4.163 3.20e-05 ***
## female             -45.0915     4.2911 -10.508  < 2e-16 ***
## black             -169.8126    12.7131 -13.357  < 2e-16 ***
## I(female * black)   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16
model4<- lm(sat ~ hsize + hsizesq + female + black + female*black , data = data1)
summary(model4)
## 
## Call:
## lm(formula = sat ~ hsize + hsizesq + female + black + female * 
##     black, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -570.45  -89.54   -5.24   85.41  479.13 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1028.0972     6.2902 163.445  < 2e-16 ***
## hsize          19.2971     3.8323   5.035 4.97e-07 ***
## hsizesq        -2.1948     0.5272  -4.163 3.20e-05 ***
## female        -45.0915     4.2911 -10.508  < 2e-16 ***
## black        -169.8126    12.7131 -13.357  < 2e-16 ***
## female:black   62.3064    18.1542   3.432 0.000605 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared:  0.08578,    Adjusted R-squared:  0.08468 
## F-statistic: 77.52 on 5 and 4131 DF,  p-value: < 2.2e-16
  1. All of the variables in the equation are significant, since every p-value is below 0.05; in particular, the absolute t statistic on hsize² exceeds four, strong evidence that it belongs in the equation. The turning point that maximizes the predicted SAT score, holding other factors fixed, is 19.30/(2 × 2.19) ≈ 4.41 (computed from the fitted coefficients in the sketch below); since hsize is measured in hundreds, the optimal graduating class size is estimated at roughly 440 students.
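The turning point can be read directly off the fitted coefficients; a minimal sketch using the model2 fit above:

b <- coef(model2)
-b["hsize"] / (2 * b["I(hsize^2)"])  # about 4.40 hundreds, i.e. roughly 440 students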

To confirm that hsizesq belongs in the model, we use a t test.

t = 2.19/0.53 ≈ 4.13, with 4131 residual degrees of freedom (as reported in the regression output). The implied two-sided p-value is about 0.000037, significant at any conventional level, so hsizesq is statistically significant and should be kept in the model.

2*(1 - pt(2.19/0.53, 4131))
## [1] 3.666238e-05
  1. For nonblack students (black = 0), the coefficient on female measures the nonblack female–male gap: nonblack females score about 45.09 points lower on the SAT than nonblack males. With a t statistic of roughly −10.51, this difference is highly statistically significant, helped by the very large sample size. (Among black students, the corresponding female effect is −45.09 + 62.31 = 17.22.)
t_stat <- 62.31/18.15
df <- 4131  # residual degrees of freedom (n - k - 1)
p_value <- 2 * (1 - pt(abs(t_stat), df))
p_value
## [1] 0.0006026815

The p-value equals 0.0006, so the estimated interaction (the difference in the female effect between black and nonblack students) is statistically significant.

  1. The difference is −169.81. Nonblack males are represented by female = 0 and black = 0, and black males by female = 0 and black = 1, so the gap between the groups is the coefficient on black: holding the other factors fixed, a black male's estimated SAT score is about 170 points lower than that of a comparable nonblack male. The t statistic exceeds 13 in absolute value, decisively rejecting the hypothesis of no ceteris paribus difference, again helped by the very large sample size.
2*(1 - pt(169.81/12.71, 4131))
## [1] 0
  1. Nonblack females are represented by female = 1 and black = 0, and black females by female = 1 and black = 1. The difference between the groups is the sum of the black and female·black coefficients: −169.81 + 62.31 = −107.5. On average, black females score about 107.5 points lower on the SAT than comparable nonblack females.

Because this difference involves two coefficients, it cannot be read off a single t statistic in the original regression. One option is to re-estimate the model with three dummy variables, using nonblack females as the base group, and take the t statistic on the black-female dummy; equivalently, the gap and its standard error can be computed from the coefficient covariance matrix, as in the computation below.

b <- coef(model4); V <- vcov(model4)
gap <- b["black"] + b["female:black"]          # -169.81 + 62.31 = -107.5
se_gap <- sqrt(V["black", "black"] + V["female:black", "female:black"] +
               2 * V["black", "female:black"])
t_stat2 <- gap / se_gap
p_value2 <- 2 * pt(-abs(t_stat2), df = df.residual(model4))
p_value2

C1

library(wooldridge)
data1 <- wooldridge::gpa1
head(data1)
##   age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1  21    0      0      1       0    0      0        1        0    3.0   3.0  21
## 2  21    0      0      1       0    0      0        1        0    3.4   3.2  24
## 3  20    0      1      0       0    0      0        1        0    3.0   3.6  26
## 4  19    1      0      0       0    1      1        1        0    3.5   3.5  27
## 5  20    0      1      0       0    0      0        1        0    3.6   3.9  28
## 6  20    0      0      1       0    1      1        1        0    3.0   3.4  25
##   job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1     0     1     1    0    0       0  0     0   1        1        0     0
## 2     0     1     1    0    0       0  0     0   1        0        1     1
## 3     1     0     0    0    1       0  0     0   1        1        0     1
## 4     1     0     0    0    1       0  0     0   0        1        0     0
## 5     0     1     0    1    0       0  0     0   1        1        1     0
## 6     0     0     0    0    1       0  0     0   1        1        0     0
##   skipped alcohol gradMI fathcoll mothcoll
## 1       2     1.0      1        0        0
## 2       0     1.0      1        1        1
## 3       0     1.0      1        1        1
## 4       0     0.0      0        0        0
## 5       0     1.5      1        1        0
## 6       0     0.0      0        1        0
data_C1 <- gpa1
model_C1 <- lm(colGPA~PC+hsGPA+ACT+mothcoll+fathcoll,data= data_C1)
summary(model_C1)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll, 
##     data = data_C1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78149 -0.25726 -0.02121  0.24691  0.74432 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.255554   0.335392   3.744 0.000268 ***
## PC           0.151854   0.058716   2.586 0.010762 *  
## hsGPA        0.450220   0.094280   4.775 4.61e-06 ***
## ACT          0.007724   0.010678   0.723 0.470688    
## mothcoll    -0.003758   0.060270  -0.062 0.950376    
## fathcoll     0.041800   0.061270   0.682 0.496265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.1934 
## F-statistic: 7.713 on 5 and 135 DF,  p-value: 2.083e-06
  1. From the model we see two significant variables, PC and hsGPA. Compared with the estimate in equation (7.6), the PC coefficient falls slightly, from 0.157 to 0.152, and it remains statistically significant: the effect of PC is essentially the same as in (7.6), with t ≈ 2.59.
library(wooldridge)
data1 <- wooldridge::gpa1

# Perform the regression analysis
model_7C1_ur <- lm(colGPA ~ PC + hsGPA + ACT, data = data1)
summary(model_7C1_ur)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT, data = data1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7901 -0.2622 -0.0107  0.2334  0.7570 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.263520   0.333125   3.793 0.000223 ***
## PC          0.157309   0.057287   2.746 0.006844 ** 
## hsGPA       0.447242   0.093647   4.776 4.54e-06 ***
## ACT         0.008659   0.010534   0.822 0.412513    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared:  0.2194, Adjusted R-squared:  0.2023 
## F-statistic: 12.83 on 3 and 137 DF,  p-value: 1.932e-07
F= ((0.2222-0.2194)/2)/((1-0.2222)/135)
F
## [1] 0.2429931
critical_value = qf(0.95, 2, 135)  # 5% critical value of the central F(2, 135)
critical_value
## [1] 3.063204
model6<- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll , data = data1)
summary(model6)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll, 
##     data = data1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78149 -0.25726 -0.02121  0.24691  0.74432 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.255554   0.335392   3.744 0.000268 ***
## PC           0.151854   0.058716   2.586 0.010762 *  
## hsGPA        0.450220   0.094280   4.775 4.61e-06 ***
## ACT          0.007724   0.010678   0.723 0.470688    
## mothcoll    -0.003758   0.060270  -0.062 0.950376    
## fathcoll     0.041800   0.061270   0.682 0.496265    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared:  0.2222, Adjusted R-squared:  0.1934 
## F-statistic: 7.713 on 5 and 135 DF,  p-value: 2.083e-06
library(car)
hypotheses <- c("mothcoll=0", "fathcoll=0")
joint_test <- linearHypothesis(model6, hypotheses)
joint_test
## 
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
## 
## Model 1: restricted model
## Model 2: colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    137 15.149                           
## 2    135 15.094  2  0.054685 0.2446 0.7834
  1. The F test for the joint significance of mothcoll and fathcoll, with 2 and 135 degrees of freedom, gives F ≈ 0.24 with p-value ≈ 0.78. These variables are jointly insignificant, and including them does not notably change the other coefficient estimates.
model3 <- lm (colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll, data = gpa1)
summary (model3)
## 
## Call:
## lm(formula = colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll + 
##     fathcoll, data = gpa1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.78998 -0.24327 -0.00648  0.26179  0.72231 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  5.040328   2.443038   2.063   0.0410 *
## PC           0.140446   0.058858   2.386   0.0184 *
## hsGPA       -1.802520   1.443552  -1.249   0.2140  
## I(hsGPA^2)   0.337341   0.215711   1.564   0.1202  
## ACT          0.004786   0.010786   0.444   0.6580  
## mothcoll     0.003091   0.060110   0.051   0.9591  
## fathcoll     0.062761   0.062401   1.006   0.3163  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared:  0.2361, Adjusted R-squared:  0.2019 
## F-statistic: 6.904 on 6 and 134 DF,  p-value: 2.088e-06
  1. When hsGPA² is added to the regression, its coefficient is about 0.337 with a t statistic of about 1.56, and the coefficient on hsGPA becomes roughly −1.803; the quadratic term is borderline significant. The implied quadratic in hsGPA is U-shaped, with a minimum around hsGPA* = 1.803/(2 × 0.337) ≈ 2.67, which is hard to interpret. Adding hsGPA² is a simple robustness check, and the key finding survives it: the coefficient on PC falls slightly to about 0.140 but remains statistically significant.

C2

data3 <- wooldridge::wage2
head(data3)
##   wage hours  IQ KWW educ exper tenure age married black south urban sibs
## 1  769    40  93  35   12    11      2  31       1     0     0     1    1
## 2  808    50 119  41   18    11     16  37       1     0     0     1    1
## 3  825    40 108  46   14    11      9  33       1     0     0     1    1
## 4  650    40  96  32   12    13      7  32       1     0     0     1    4
## 5  562    40  74  27   11    14      5  34       1     0     0     1   10
## 6 1400    40 116  43   16    14      2  35       1     1     0     1    1
##   brthord meduc feduc    lwage
## 1       2     8     8 6.645091
## 2      NA    14    14 6.694562
## 3       2    14    14 6.715384
## 4       3    12    12 6.476973
## 5       6     6    11 6.331502
## 6       2     8    NA 7.244227
data("wage2")
model4 <- lm(log(wage) ~ educ +exper +tenure + married + black + south + urban, data = wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98069 -0.21996  0.00707  0.24288  1.22822 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.395497   0.113225  47.653  < 2e-16 ***
## educ         0.065431   0.006250  10.468  < 2e-16 ***
## exper        0.014043   0.003185   4.409 1.16e-05 ***
## tenure       0.011747   0.002453   4.789 1.95e-06 ***
## married      0.199417   0.039050   5.107 3.98e-07 ***
## black       -0.188350   0.037667  -5.000 6.84e-07 ***
## south       -0.090904   0.026249  -3.463 0.000558 ***
## urban        0.183912   0.026958   6.822 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared:  0.2526, Adjusted R-squared:  0.2469 
## F-statistic: 44.75 on 7 and 927 DF,  p-value: < 2.2e-16
  1. The coefficient on black indicates that, holding the other factors fixed, black men earn roughly 18.8% less than nonblack men (the exact percentage differential is computed below). With a t statistic of about −5.0, this difference is highly statistically significant.
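The exact percentage differential implied by a log-wage coefficient b is 100·[exp(b) − 1]; a quick check using the fit above:

100 * (exp(coef(model4)["black"]) - 1)  # about -17.2%, vs. the -18.8% approximation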
model4 <- lm(log(wage) ~ educ +exper + I(exper^2) + tenure + I(tenure^2)
             + married + black + south + urban, data = wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure + 
##     I(tenure^2) + married + black + south + urban, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98236 -0.21972 -0.00036  0.24078  1.25127 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.3586756  0.1259143  42.558  < 2e-16 ***
## educ         0.0642761  0.0063115  10.184  < 2e-16 ***
## exper        0.0172146  0.0126138   1.365 0.172665    
## I(exper^2)  -0.0001138  0.0005319  -0.214 0.830622    
## tenure       0.0249291  0.0081297   3.066 0.002229 ** 
## I(tenure^2) -0.0007964  0.0004710  -1.691 0.091188 .  
## married      0.1985470  0.0391103   5.077 4.65e-07 ***
## black       -0.1906636  0.0377011  -5.057 5.13e-07 ***
## south       -0.0912153  0.0262356  -3.477 0.000531 ***
## urban        0.1854241  0.0269585   6.878 1.12e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared:  0.255,  Adjusted R-squared:  0.2477 
## F-statistic: 35.17 on 9 and 925 DF,  p-value: < 2.2e-16
linearHypothesis(model4, c("I(exper^2)=0", "I(tenure^2)=0" ))
## 
## Linear hypothesis test:
## I(exper^2) = 0
## I(tenure^2) = 0
## 
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) + 
##     married + black + south + urban
## 
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1    927 123.82                           
## 2    925 123.42  2   0.39756 1.4898  0.226
F= ((0.255-0.2526)/2)/((1-0.255)/925)
F
## [1] 1.489933
1-pf(F,2,925)
## [1] 0.2259282
  1. The F statistic evaluating the joint significance of exper^2 and tenure^2, with 2 and 925 degrees of freedom, equates to approximately 1.49, resulting in a p-value around 0.226. Since the p-value exceeds 0.20, these quadratic terms are collectively insignificant at the 20% significance level.
model4 <- lm(log(wage) ~ educ +exper +tenure + married 
             + black + south + urban + I(black*educ), data = wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban + I(black * educ), data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      5.374817   0.114703  46.859  < 2e-16 ***
## educ             0.067115   0.006428  10.442  < 2e-16 ***
## exper            0.013826   0.003191   4.333 1.63e-05 ***
## tenure           0.011787   0.002453   4.805 1.80e-06 ***
## married          0.198908   0.039047   5.094 4.25e-07 ***
## black            0.094809   0.255399   0.371 0.710561    
## south           -0.089450   0.026277  -3.404 0.000692 ***
## urban            0.183852   0.026955   6.821 1.63e-11 ***
## I(black * educ) -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16
  1. Adding the interaction black·educ to the equation from the first part, its coefficient is roughly −0.0226 (standard error ≈ 0.0202). The point estimate says the return to a year of education is about 2.3 percentage points lower for black men than for nonblack men (whose estimated return is about 6.7%). That would be a meaningful difference if it held in the population, but the t statistic is only about 1.12 in absolute value, not nearly enough to reject the null hypothesis that the return to education does not differ by race.
model4 <- lm(log(wage)~educ+exper+tenure+married+black+south+urban+educ*black,data= wage2)
summary(model4)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black + 
##     south + urban + educ * black, data = wage2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97782 -0.21832  0.00475  0.24136  1.23226 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.374817   0.114703  46.859  < 2e-16 ***
## educ         0.067115   0.006428  10.442  < 2e-16 ***
## exper        0.013826   0.003191   4.333 1.63e-05 ***
## tenure       0.011787   0.002453   4.805 1.80e-06 ***
## married      0.198908   0.039047   5.094 4.25e-07 ***
## black        0.094809   0.255399   0.371 0.710561    
## south       -0.089450   0.026277  -3.404 0.000692 ***
## urban        0.183852   0.026955   6.821 1.63e-11 ***
## educ:black  -0.022624   0.020183  -1.121 0.262603    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared:  0.2536, Adjusted R-squared:  0.2471 
## F-statistic: 39.32 on 8 and 926 DF,  p-value: < 2.2e-16
model_7C2_4 <- lm(log(wage)~educ+exper+tenure+married+black+south+urban+married*black,data= wage2)
summary(model_7C2_4)

[output omitted; the estimates used below are black ≈ -0.2408 and married:black ≈ 0.0614]
  1. Married black men are represented by married = 1 and black = 1, and married nonblack men by married = 1 and black = 0. The difference between the two groups is therefore the sum of the black and married·black coefficients: −0.2408 + 0.0614 = −0.1794. Married black men are thus estimated to earn about 17.9% less in monthly salary than comparable married nonblack men (a direct test of this two-coefficient gap is sketched below).
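As a cross-check, the two-coefficient gap can be tested directly with car's linearHypothesis, selecting black + married:black through a coefficient vector; a minimal sketch (coefficient order in model_7C2_4: intercept, educ, exper, tenure, married, black, south, urban, married:black):

linearHypothesis(model_7C2_4, c(0, 0, 0, 0, 0, 1, 0, 0, 1), rhs = 0)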

Chapter 8

  1. Statement (ii): the usual F statistic no longer has an F distribution.

  2. Statement (iii): the OLS estimators are no longer BLUE.

Statements (ii) and (iii) are the true consequences of heteroskedasticity.

5

data4 <- wooldridge::smoke
head(data4)
##   educ cigpric white age income cigs restaurn   lincome agesq lcigpric
## 1 16.0  60.506     1  46  20000    0        0  9.903487  2116 4.102743
## 2 16.0  57.883     1  40  30000    0        0 10.308952  1600 4.058424
## 3 12.0  57.664     1  58  30000    3        0 10.308952  3364 4.054633
## 4 13.5  57.883     1  30  20000    0        0  9.903487   900 4.058424
## 5 10.0  58.320     1  17  20000    0        0  9.903487   289 4.065945
## 6  6.0  59.340     1  86   6500    0        0  8.779557  7396 4.083283
model11 <- lm(cigs ~ log(cigpric) + log(income) + educ + age + agesq + restaurn + white , data = data4)
summary(model11)
## 
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age + 
##     agesq + restaurn + white, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.772  -9.330  -5.907   7.945  70.275 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.682419  24.220729  -0.111  0.91184    
## log(cigpric) -0.850907   5.782321  -0.147  0.88305    
## log(income)   0.869014   0.728763   1.192  0.23344    
## educ         -0.501753   0.167168  -3.001  0.00277 ** 
## age           0.774502   0.160516   4.825 1.68e-06 ***
## agesq        -0.009069   0.001748  -5.188 2.70e-07 ***
## restaurn     -2.865621   1.117406  -2.565  0.01051 *  
## white        -0.559236   1.459461  -0.383  0.70169    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared:  0.05291,    Adjusted R-squared:  0.04461 
## F-statistic: 6.377 on 7 and 799 DF,  p-value: 2.588e-07
  1. No. For every coefficient, the usual and the heteroskedasticity-robust standard errors are very similar in practical terms; there are no important differences between them (see the robust computation below).
s_error <- coef(summary(model11))
s_error
##                  Estimate   Std. Error    t value     Pr(>|t|)
## (Intercept)  -2.682418774 24.220728831 -0.1107489 9.118433e-01
## log(cigpric) -0.850907441  5.782321084 -0.1471567 8.830454e-01
## log(income)   0.869013971  0.728763480  1.1924499 2.334389e-01
## educ         -0.501753247  0.167167689 -3.0014966 2.770186e-03
## age           0.774502156  0.160515805  4.8250835 1.676279e-06
## agesq        -0.009068603  0.001748055 -5.1878253 2.699355e-07
## restaurn     -2.865621212  1.117405936 -2.5645301 1.051320e-02
## white        -0.559236403  1.459461034 -0.3831801 7.016882e-01
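To support that comparison, the robust standard errors themselves can be computed with the sandwich estimator; a minimal sketch (lmtest and sandwich are already loaded):

coeftest(model11, vcov = vcovHC(model11, type = "HC1"))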
  1. The effect is −.029(4) = −.116: four more years of education reduce the estimated probability of smoking by about .116, i.e., 11.6 percentage points.
age_turn <- -coef(model11)["age"] / (2 * coef(model11)["agesq"])  # the quadratic term is named 'agesq' in this fit
age_turn
##     age 
## 42.7024
  1. The turning point of the quadratic in the text's linear probability model is .020/[2(.00026)] ≈ 38.46, roughly 38 and a half years. (In the cigs equation estimated above, the corresponding turning point is about 42.7.)
coef_res <- coef(model11)["restaurn"]
coef_res
##  restaurn 
## -2.865621
  1. With other variables in the equation held constant, an individual in a state with restaurant smoking restrictions faces a reduction of .101 in the likelihood of smoking. This resembles the impact of obtaining four more years of education.
library(lmtest)
data_update <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77, restaurn = 0, white = 0)
het_test <- bptest(model11)
het_test
## 
##  studentized Breusch-Pagan test
## 
## data:  model11
## BP = 32.377, df = 7, p-value = 3.458e-05
  1. smokes-hat = −.656 + .069·log(67.44) + .012·log(6,500) − .029(16) + .020(77) − .00026(77²) ≈ −.0052. Hence, the estimated smoking probability for this individual is essentially zero. (In fact, this individual is not a smoker, so the equation predicts well for this particular observation.)

C4

data5 <- wooldridge::vote1
head(data5)
##   state district democA voteA expendA expendB prtystrA lexpendA lexpendB
## 1    AL        7      1    68 328.296   8.737       41 5.793916 2.167567
## 2    AK        1      0    62 626.377 402.477       60 6.439952 5.997638
## 3    AZ        2      1    73  99.607   3.065       55 4.601233 1.120048
## 4    AZ        3      0    69 319.690  26.281       64 5.767352 3.268846
## 5    AR        3      0    75 159.221  60.054       66 5.070293 4.095244
## 6    AR        4      1    69 570.155  21.393       46 6.345908 3.063064
##     shareA
## 1 97.40767
## 2 60.88104
## 3 97.01476
## 4 92.40370
## 5 72.61247
## 6 96.38355
data ("vote1")
model5 <- lm (voteA ~ prtystrA + democA + lexpendA + lexpendB, data = vote1)
summary(model5)
## 
## Call:
## lm(formula = voteA ~ prtystrA + democA + lexpendA + lexpendB, 
##     data = vote1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.576  -4.864  -1.146   4.903  24.566 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.66142    4.73604   7.952 2.56e-13 ***
## prtystrA     0.25192    0.07129   3.534  0.00053 ***
## democA       3.79294    1.40652   2.697  0.00772 ** 
## lexpendA     5.77929    0.39182  14.750  < 2e-16 ***
## lexpendB    -6.23784    0.39746 -15.694  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared:  0.8012, Adjusted R-squared:  0.7964 
## F-statistic: 169.2 on 4 and 168 DF,  p-value: < 2.2e-16
data5 <- data5 %>% mutate(residual_squared=(model5$residuals)^2)
head(data5)
##   state district democA voteA expendA expendB prtystrA lexpendA lexpendB
## 1    AL        7      1    68 328.296   8.737       41 5.793916 2.167567
## 2    AK        1      0    62 626.377 402.477       60 6.439952 5.997638
## 3    AZ        2      1    73  99.607   3.065       55 4.601233 1.120048
## 4    AZ        3      0    69 319.690  26.281       64 5.767352 3.268846
## 5    AR        3      0    75 159.221  60.054       66 5.070293 4.095244
## 6    AR        4      1    69 570.155  21.393       46 6.345908 3.063064
##     shareA residual_squared
## 1 97.40767        14.038468
## 2 60.88104        88.688062
## 3 97.01476         3.667331
## 4 92.40370         5.176383
## 5 72.61247       287.464399
## 6 96.38355         2.593858
  1. Estimated equation: voteA-hat = 37.66 + .252 prtystrA + 3.793 democA + 5.779 log(expendA) − 6.238 log(expendB), with standard errors (4.74), (.071), (1.407), (.392), (.397); n = 173, R² = .801.
model5_r<- lm(residual_squared~prtystrA+democA+log(expendA)+log(expendB),data= data5)
summary(model5_r)
## 
## Call:
## lm(formula = residual_squared ~ prtystrA + democA + log(expendA) + 
##     log(expendB), data = data5)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -94.88 -46.16 -23.51  15.84 508.32 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  113.9635    50.8150   2.243   0.0262 *
## prtystrA      -0.2993     0.7649  -0.391   0.6961  
## democA        15.6192    15.0912   1.035   0.3022  
## log(expendA) -10.3057     4.2040  -2.451   0.0153 *
## log(expendB)  -0.0514     4.2645  -0.012   0.9904  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.25 on 168 degrees of freedom
## Multiple R-squared:  0.05256,    Adjusted R-squared:   0.03 
## F-statistic:  2.33 on 4 and 168 DF,  p-value: 0.05806
bptest(model5)
## 
##  studentized Breusch-Pagan test
## 
## data:  model5
## BP = 9.0934, df = 4, p-value = 0.05881
  1. The F statistic for joint significance (with 4 and 168 df) is about 2.33 with p-value ≈ .058. So there is some evidence of heteroskedasticity, though not quite at the 5% level.
bptest (model5, ~ prtystrA*democA*lexpendA*lexpendB + I(prtystrA^2) + I(democA^2) + I(lexpendA^2) + I(lexpendB^2), data = vote1)
## 
##  studentized Breusch-Pagan test
## 
## data:  model5
## BP = 43.327, df = 18, p-value = 0.0007194
  1. The special form of the White test gives F ≈ 2.79 with 2 and 170 df and p-value ≈ .065. This is slightly less evidence of heteroskedasticity than the B-P test provides, but the conclusion is very similar (a sketch of this version of the test follows).
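The 2-and-170-df figures correspond to the special form of the White test, which uses the fitted values and their squares as the explanatory variables; a minimal sketch via bptest:

bptest(model5, ~ fitted(model5) + I(fitted(model5)^2))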

C13

library(sandwich)
library(lmtest)
data6 <- wooldridge::fertil2
head(data6)
##   mnthborn yearborn age electric radio tv bicycle educ ceb agefbrth children
## 1        5       64  24        1     1  1       1   12   0       NA        0
## 2        1       56  32        1     1  1       1   13   3       25        3
## 3        7       58  30        1     0  0       0    5   1       27        1
## 4       11       45  42        1     0  1       0    4   3       17        2
## 5        5       45  43        1     1  1       1   11   2       24        2
## 6        8       52  36        1     0  0       0    7   1       26        1
##   knowmeth usemeth monthfm yearfm agefm idlnchld heduc agesq urban urb_educ
## 1        1       0      NA     NA    NA        2    NA   576     1       12
## 2        1       1      11     80    24        3    12  1024     1       13
## 3        1       0       6     83    24        5     7   900     1        5
## 4        1       0       1     61    15        3    11  1764     1        4
## 5        1       1       3     66    20        2    14  1849     1       11
## 6        1       1      11     76    24        4     9  1296     1        7
##   spirit protest catholic frsthalf educ0 evermarr
## 1      0       0        0        1     0        0
## 2      0       0        0        1     0        1
## 3      1       0        0        0     0        1
## 4      0       0        0        0     0        1
## 5      0       1        0        1     0        1
## 6      0       0        0        0     0        1
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
robust_se <- sqrt(diag(vcovHC(model)))
summary_with_robust_se <- cbind(coef(model), "Robust SE" = robust_se)
summary(model)
## 
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban, 
##     data = fertil2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9012 -0.7136 -0.0039  0.7119  7.4318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -4.2225162  0.2401888 -17.580  < 2e-16 ***
## age          0.3409255  0.0165082  20.652  < 2e-16 ***
## I(age^2)    -0.0027412  0.0002718 -10.086  < 2e-16 ***
## educ        -0.0752323  0.0062966 -11.948  < 2e-16 ***
## electric    -0.3100404  0.0690045  -4.493 7.20e-06 ***
## urban       -0.2000339  0.0465062  -4.301 1.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.452 on 4352 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.5734, Adjusted R-squared:  0.5729 
## F-statistic:  1170 on 5 and 4352 DF,  p-value: < 2.2e-16
print(summary_with_robust_se)
##                             Robust SE
## (Intercept) -4.222516228 0.2443961935
## age          0.340925520 0.0192199445
## I(age^2)    -0.002741209 0.0003513959
## educ        -0.075232323 0.0063159137
## electric    -0.310040409 0.0640737262
## urban       -0.200033857 0.0455162364
coeftest(model6, vcov=vcovHC(model6, type = "HC0"))
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  1.2555544  0.3368805  3.7270  0.000284 ***
## PC           0.1518539  0.0596641  2.5451  0.012048 *  
## hsGPA        0.4502203  0.0962508  4.6776 6.962e-06 ***
## ACT          0.0077242  0.0103074  0.7494  0.454932    
## mothcoll    -0.0037579  0.0601271 -0.0625  0.950258    
## fathcoll     0.0417999  0.0578749  0.7222  0.471392    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
F= ((0.574-0.5734)/3)/((1-0.574)/4349)
F
## [1] 2.041784
critical_value = qf(0.95, 3, 4349)  # 5% critical value of the central F(3, 4349), about 2.61
p_value = 1- pf(F,3,4349)
p_value
## [1] 0.1058349
a <- coeftest(model, vcov = sandwich)
a
## 
## t test of coefficients:
## 
##                Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept) -4.22251623  0.24368307 -17.3279 < 2.2e-16 ***
## age          0.34092552  0.01916146  17.7923 < 2.2e-16 ***
## I(age^2)    -0.00274121  0.00035027  -7.8260 6.278e-15 ***
## educ        -0.07523232  0.00630336 -11.9353 < 2.2e-16 ***
## electric    -0.31004041  0.06390411  -4.8517 1.267e-06 ***
## urban       -0.20003386  0.04543962  -4.4022 1.097e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1065, df = 5, p-value < 2.2e-16
  1. Some robust standard errors are smaller than the non-robust standard errors (e.g., those on electric and urban).
usqr=(summary(model6)$residual)^2
yhat=model6$fitted.values
yhatsqr=model6$fitted.values^2
model6WT=summary(lm(usqr~yhat+yhatsqr,data=fertil2))$r.squared
model6WT*4361
## [1] 143.6236
1-pchisq(1093.183,2)
## [1] 0
joint_test <- coeftest(model6, vcov = vcovHC)
print(joint_test[, "Pr(>|t|)"])
##  (Intercept)           PC        hsGPA          ACT     mothcoll     fathcoll 
## 5.903211e-04 1.682044e-02 1.977193e-05 4.846838e-01 9.528773e-01 4.948069e-01
fv <- fitted(model6)
head(fv,6)
##        1        2        3        4        5        6 
## 2.768423 2.919682 3.115218 3.039878 3.269490 3.021208
residuals <- resid(model6)
head(residuals,6)
##           1           2           3           4           5           6 
##  0.23157702  0.48031850 -0.11521801  0.46012183  0.33050949 -0.02120773
hetero_test <- lm(residuals^2 ~ fv)
summary(hetero_test)
## 
## Call:
## lm(formula = residuals^2 ~ fv)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.13813 -0.07873 -0.04322  0.04132  0.51313 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -0.27680    0.18877  -1.466   0.1448  
## fv           0.12558    0.06165   2.037   0.0436 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.128 on 139 degrees of freedom
## Multiple R-squared:  0.02898,    Adjusted R-squared:  0.022 
## F-statistic: 4.149 on 1 and 139 DF,  p-value: 0.04357
  1. H0: there is no heteroskedasticity; H1: there is heteroskedasticity.

As the p-value is below α = 0.05, there is sufficient statistical evidence to reject the null hypothesis, signifying the presence of heteroskedasticity in the model.

iv. No, since the differences between the robust and non-robust standard errors are very small.

Chapter 9

1

From the F test, the p-value is about 0.054, so at the 5% significance level there is no significant evidence of functional form misspecification in the model. At any significance level above roughly 5.5%, however, the evidence would be significant.

data7 <- wooldridge::ceosal2
head(data7)
##   salary age college grad comten ceoten sales profits mktval  lsalary   lsales
## 1   1161  49       1    1      9      2  6200     966  23200 7.057037 8.732305
## 2    600  43       1    1     10     10   283      48   1100 6.396930 5.645447
## 3    379  51       1    1      9      3   169      40   1100 5.937536 5.129899
## 4    651  55       1    0     22     22  1100     -54   1000 6.478509 7.003066
## 5    497  44       1    1      8      6   351      28    387 6.208590 5.860786
## 6   1067  64       1    1      7      7 19000     614   3900 6.972606 9.852194
##     lmktval comtensq ceotensq  profmarg
## 1 10.051908       81        4 15.580646
## 2  7.003066      100      100 16.961130
## 3  7.003066       81        9 23.668638
## 4  6.907755      484      484 -4.909091
## 5  5.958425       64       36  7.977208
## 6  8.268732       49       49  3.231579
model11 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = data7)
summary(model11)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceoten + comten, data = data7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5436 -0.2796 -0.0164  0.2857  1.9879 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.571977   0.253466  18.038  < 2e-16 ***
## log(sales)   0.187787   0.040003   4.694 5.46e-06 ***
## log(mktval)  0.099872   0.049214   2.029  0.04397 *  
## profmarg    -0.002211   0.002105  -1.050  0.29514    
## ceoten       0.017104   0.005540   3.087  0.00236 ** 
## comten      -0.009238   0.003337  -2.768  0.00626 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared:  0.3525, Adjusted R-squared:  0.3336 
## F-statistic: 18.62 on 5 and 171 DF,  p-value: 9.488e-15
r_squared <- summary(model11)$r.squared
r_squared
## [1] 0.3525374
model12 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceotensq + comtensq, data = data7)
summary(model12)
## 
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg + 
##     ceotensq + comtensq, data = data7)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.47481 -0.25933 -0.00511  0.27010  2.07583 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.612e+00  2.524e-01  18.276  < 2e-16 ***
## log(sales)   1.805e-01  4.021e-02   4.489 1.31e-05 ***
## log(mktval)  1.018e-01  4.988e-02   2.040   0.0429 *  
## profmarg    -2.077e-03  2.135e-03  -0.973   0.3321    
## ceotensq     3.761e-04  1.916e-04   1.963   0.0512 .  
## comtensq    -1.788e-04  7.236e-05  -2.471   0.0144 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared:  0.3324, Adjusted R-squared:  0.3129 
## F-statistic: 17.03 on 5 and 171 DF,  p-value: 1.195e-13
r_squared <- summary(model12)$r.squared
r_squared
## [1] 0.3323998
F= ((0.375-0.353)/2)/((1-0.375)/169)
F
## [1] 2.9744
p_value = 1-pf(F,2,169)
p_value
## [1] 0.05375882

If β6 or β7, the population parameters on ceoten² and comten², is nonzero, the functional form is misspecified. To test this we run a joint significance test using the R-squared form of the F statistic: F = [(.375 − .353)/(1 − .375)]·[(177 − 8)/2] ≈ 2.97. With 2 numerator degrees of freedom and a large denominator, the 10% critical value is 2.30 and the 5% critical value is 3.00, so the p-value is slightly above .05: reasonable evidence of functional form misspecification. Whether this influences the estimated partial effects at different values of the explanatory variables is a separate question (a sketch of the textbook version of the test follows).
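The R-squareds in that formula come from the textbook versions of the models, in which the squared terms are added to, not substituted for, ceoten and comten; a minimal sketch of that unrestricted fit and the joint test (car is loaded above):

model_full <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg +
                   ceoten + comten + ceotensq + comtensq, data = data7)
linearHypothesis(model_full, c("ceotensq = 0", "comtensq = 0"))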

5

The sample selection could be endogenous here. If colleges with high crime rates are inclined to conceal crime statistics due to their influence on prospective students, the likelihood of being in the sample is inversely linked to the crime equation’s u. In essence, higher u, indicating more crime for a given school size, leads to a reduced probability of the school disclosing its crime data.

data9 <- wooldridge::campus
head(data9)
##   enroll priv police crime   lcrime  lenroll  lpolice
## 1  21836    0     24   446 6.100319 9.991315 3.178054
## 2   6485    0     13     1 0.000000 8.777247 2.564949
## 3   2123    0      3     1 0.000000 7.660585 1.098612
## 4   8240    0     17   121 4.795791 9.016756 2.833213
## 5  19793    0     30   470 6.152733 9.893084 3.401197
## 6   3256    1      9    25 3.218876 8.088255 2.197225
b0 <- -6.63
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(data9)
t_stat <- (b1 - 1) / se_b1
df <- n - 2
critical_value <- qt(0.95, df)
if (t_stat > critical_value) {
  cat("Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n")
} else {
  cat("Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n")
}
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.

In summary, whether the failure to report crimes constitutes exogenous sample selection depends on the underlying reasons for non-reporting. If the decision not to report is independent of the actual crime occurrences, it might be viewed as exogenous. Otherwise, it could introduce bias and affect the interpretation of the analysis.

C4

data ("infmrt")
model9 <- lm(infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt ) 
summary(model9)
## 
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6894 -0.8973 -0.1412  0.7054  3.1705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.2125     6.7267   5.383 5.09e-07 ***
## lpcinc       -2.3964     0.8866  -2.703  0.00812 ** 
## lphysic      -1.5548     0.7734  -2.010  0.04718 *  
## lpopul        0.5755     0.1365   4.215 5.60e-05 ***
## DC           13.9632     1.2466  11.201  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.255 on 97 degrees of freedom
## Multiple R-squared:  0.6432, Adjusted R-squared:  0.6285 
## F-statistic: 43.72 on 4 and 97 DF,  p-value: < 2.2e-16
model9 <- lm(infmort~log(pcinc)+log(physic)+log(popul), data=infmrt)
summary(model9)
## 
## Call:
## lm(formula = infmort ~ log(pcinc) + log(physic) + log(popul), 
##     data = infmrt)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0204 -1.1897 -0.1152  0.9083  8.1890 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.22614   10.13466   3.574 0.000547 ***
## log(pcinc)  -4.88415    1.29319  -3.777 0.000273 ***
## log(physic)  4.02783    0.89101   4.521 1.73e-05 ***
## log(popul)  -0.05356    0.18748  -0.286 0.775712    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.891 on 98 degrees of freedom
## Multiple R-squared:  0.1818, Adjusted R-squared:  0.1568 
## F-statistic:  7.26 on 3 and 98 DF,  p-value: 0.0001898
  1. The coefficient on DC implies that, compared with a state having the same per capita income, physicians per capita, and population, Washington D.C.'s predicted infant mortality rate is about 14 deaths per 1,000 live births higher. This is a substantial and notable "D.C. effect."

  2. Putting in a dummy for a single observation is equivalent to excluding that observation when estimating the regression: the coefficients and standard errors on the other variables are the same either way. The R-squared values differ notably, however, because the dummy predicts the infant mortality rate for D.C. perfectly; verifying that the residual for the D.C. observation is zero is a good check.

C5

library(wooldridge)
library(tidyverse)  
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ car::recode()   masks dplyr::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(quantreg)  
## Loading required package: SparseM
data <- as.data.frame(rdchem)
data$sales_billion <- data$sales / 1e3

ols_full <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data)
summary(ols_full)
## 
## Call:
## lm(formula = rdintens ~ sales_billion + I(sales_billion^2) + 
##     profmarg, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0371 -1.1238 -0.4547  0.7165  5.8522 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)   
## (Intercept)         2.058967   0.626269   3.288  0.00272 **
## sales_billion       0.316611   0.138854   2.280  0.03041 * 
## I(sales_billion^2) -0.007390   0.003716  -1.989  0.05657 . 
## profmarg            0.053322   0.044210   1.206  0.23787   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared:  0.1905, Adjusted R-squared:  0.1037 
## F-statistic: 2.196 on 3 and 28 DF,  p-value: 0.1107
# Exclude the outlying firm with annual sales of almost $40 billion (cutoff set below its roughly $39.7B sales)
data_filtered <- data %>% filter(sales_billion < 39)
ols_filtered <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered)
summary(ols_filtered)
[output omitted; with the outlier excluded, the OLS estimates change noticeably from the full-sample fit above]
#### PART (ii) - LAD Regression
# Full dataset
lad_full <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data, tau = 0.5)
summary(lad_full)
## 
## Call: rq(formula = rdintens ~ sales_billion + I(sales_billion^2) + 
##     profmarg, tau = 0.5, data = data)
## 
## tau: [1] 0.5
## 
## Coefficients:
##                    coefficients lower bd upper bd
## (Intercept)         1.40428      0.87031  2.66628
## sales_billion       0.26346     -0.13508  0.75753
## I(sales_billion^2) -0.00600     -0.01679  0.00344
## profmarg            0.11400      0.01376  0.16427
# Exclude firm with annual sales of almost $40 billion
lad_filtered <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered, tau = 0.5)
summary(lad_filtered)
[output omitted; the LAD estimates change much less than the OLS estimates when the outlier is dropped]
#### PART (iii) - Compare Resilience to Outliers
# Create a comparison table for OLS and LAD coefficients
comparison <- data.frame(
  Method = c("OLS (Full)", "OLS (Filtered)", "LAD (Full)", "LAD (Filtered)"),
  Intercept = c(coef(ols_full)[1], coef(ols_filtered)[1], coef(lad_full)[1], coef(lad_filtered)[1]),
  Sales = c(coef(ols_full)[2], coef(ols_filtered)[2], coef(lad_full)[2], coef(lad_filtered)[2]),
  Sales_Squared = c(coef(ols_full)[3], coef(ols_filtered)[3], coef(lad_full)[3], coef(lad_filtered)[3]),
  Profmarg = c(coef(ols_full)[4], coef(ols_filtered)[4], coef(lad_full)[4], coef(lad_filtered)[4])
)
print(comparison)
##           Method Intercept     Sales Sales_Squared   Profmarg
## 1     OLS (Full)  2.058967 0.3166110  -0.007389624 0.05332217
## 2 OLS (Filtered)  2.058967 0.3166110  -0.007389624 0.05332217
## 3     LAD (Full)  1.404279 0.2634638  -0.006001127 0.11400366
## 4 LAD (Filtered)  1.404279 0.2634638  -0.006001127 0.11400366
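Because the original cutoff kept the nearly-$40 billion firm in the sample, the Full and Filtered rows above coincide for both estimators. Once that firm is actually excluded, the standard finding for this exercise is that the OLS coefficients (especially on sales and its square) change substantially while the LAD coefficients move far less, which is the evidence that LAD is more resilient to outliers than OLS.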

Chapter 10

  1. Disagree: time series data often exhibit autocorrelation, meaning the observations are not independently distributed.

  2. Agree: the first three time series Gauss-Markov assumptions (linearity in parameters, no perfect collinearity, and strict exogeneity of the regressors) ensure that the OLS estimator is unbiased. The additional assumptions of homoscedasticity and no serial correlation are needed only for OLS to be BLUE and for the usual standard errors to be valid.

  3. Disagree: a trending variable can be used as the dependent variable, but care must be taken to account for the trend. For example, including a time trend among the regressors, or differencing the data, guards against spurious regression caused by non-stationarity.

  4. Agree: seasonality refers to patterns that repeat within a year (e.g., in quarterly or monthly data). With annual time series data, seasonality is generally not a concern because each observation aggregates over a full year, smoothing out seasonal fluctuations.

5

When modeling quarterly data on new housing starts, interest rates, and real per capita income, the specification should allow for both a time trend and seasonality in the variables. Let y(t) denote housing starts at time t, x(1,t) interest rates at time t, and x(2,t) real per capita income at time t. A linear model with a linear time trend and quarterly seasonal dummies is:

y(t) = b0 + b1*t + d1*Q2(t) + d2*Q3(t) + d3*Q4(t) + b2*x(1,t) + b3*x(2,t) + e(t)

Here b0 is the intercept; b1 captures the linear trend; Q2, Q3, and Q4 are dummies for the second, third, and fourth quarters (with the first quarter as the base season); b2 and b3 measure the effects of interest rates and real per capita income on housing starts; and e(t) is the error term. More elaborate specifications (e.g., a seasonal ARIMA model) are possible, but a regression with a trend and seasonal dummies is the standard starting point.
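A minimal sketch of estimating this specification with lm(), using simulated quarterly data since the exercise names no dataset (every variable name below is hypothetical):

# Simulate 30 years of quarterly observations
set.seed(1)
n <- 120
t <- 1:n                               # linear time trend
q <- factor(rep(1:4, length.out = n))  # quarter indicator, Q1 as base level
intrate <- rnorm(n, mean = 6, sd = 1)  # placeholder interest rate series
pcinc <- 100 + 0.5 * t + rnorm(n)      # placeholder income series with a trend
hstarts <- 50 + 0.2 * t + c(0, 3, 5, -2)[q] - 2 * intrate + 0.1 * pcinc + rnorm(n)

# Trend, quarterly dummies, and the two economic regressors
model_q5 <- lm(hstarts ~ t + q + intrate + pcinc)
summary(model_q5)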

C1

data_10C1 <- intdef %>% mutate(later1979 = ifelse(year > 1979, 1, 0))
head(data_10C1,5)
##   year   i3  inf  rec  out        def i3_1 inf_1      def_1        ci3 cinf
## 1 1948 1.04  8.1 16.2 11.6 -4.6000004   NA    NA         NA         NA   NA
## 2 1949 1.10 -1.2 14.5 14.3 -0.1999998 1.04   8.1 -4.6000004 0.06000006 -9.3
## 3 1950 1.22  1.3 14.4 15.6  1.2000008 1.10  -1.2 -0.1999998 0.12000000  2.5
## 4 1951 1.55  7.9 16.1 14.2 -1.9000006 1.22   1.3  1.2000008 0.32999992  6.6
## 5 1952 1.77  1.9 19.0 19.4  0.3999996 1.55   7.9 -1.9000006 0.22000003 -6.0
##        cdef y77 later1979
## 1        NA   0         0
## 2  4.400001   0         0
## 3  1.400001   0         0
## 4 -3.100001   0         0
## 5  2.300000   0         0
model_10C1 <- lm(i3~inf+def+later1979, data=data_10C1)
summary(model_10C1)
## 
## Call:
## lm(formula = i3 ~ inf + def + later1979, data = data_10C1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4674 -0.8407  0.2388  1.0148  3.9654 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.29623    0.42535   3.047  0.00362 ** 
## inf          0.60842    0.07625   7.979 1.37e-10 ***
## def          0.36266    0.12025   3.016  0.00396 ** 
## later1979    1.55877    0.50577   3.082  0.00329 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.711 on 52 degrees of freedom
## Multiple R-squared:  0.6635, Adjusted R-squared:  0.6441 
## F-statistic: 34.18 on 3 and 52 DF,  p-value: 2.408e-12
# Equivalent construction of the post-1979 dummy as an integer vector
data_10C1$later1979 <- as.integer(data_10C1$year > 1979)
data13 <- wooldridge::intdef
head(data13)
##   year   i3  inf  rec  out        def i3_1 inf_1      def_1        ci3 cinf
## 1 1948 1.04  8.1 16.2 11.6 -4.6000004   NA    NA         NA         NA   NA
## 2 1949 1.10 -1.2 14.5 14.3 -0.1999998 1.04   8.1 -4.6000004 0.06000006 -9.3
## 3 1950 1.22  1.3 14.4 15.6  1.2000008 1.10  -1.2 -0.1999998 0.12000000  2.5
## 4 1951 1.55  7.9 16.1 14.2 -1.9000006 1.22   1.3  1.2000008 0.32999992  6.6
## 5 1952 1.77  1.9 19.0 19.4  0.3999996 1.55   7.9 -1.9000006 0.22000003 -6.0
## 6 1953 1.93  0.8 18.7 20.4  1.6999989 1.77   1.9  0.3999996 0.15999997 -1.1
##        cdef y77
## 1        NA   0
## 2  4.400001   0
## 3  1.400001   0
## 4 -3.100001   0
## 5  2.300000   0
## 6  1.299999   0
data13$dummy <- as.integer(data13$year > 1979)
data13$dummy
##  [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Baseline model without a break dummy
model15 <- lm(i3 ~ inf + def, data = data13)
summary(model15)
## 
## Call:
## lm(formula = i3 ~ inf + def, data = data13)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.9948 -1.1694  0.1959  0.9602  4.7224 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.73327    0.43197   4.012  0.00019 ***
## inf          0.60587    0.08213   7.376 1.12e-09 ***
## def          0.51306    0.11838   4.334 6.57e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.843 on 53 degrees of freedom
## Multiple R-squared:  0.6021, Adjusted R-squared:  0.5871 
## F-statistic: 40.09 on 2 and 53 DF,  p-value: 2.483e-11
# Re-estimate including the dataset's built-in y77 dummy
model16 <- lm(i3 ~ inf + def + y77, data = data13)
summary(model16)
## 
## Call:
## lm(formula = i3 ~ inf + def + y77, data = data13)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4048 -0.9632  0.2192  0.8497  4.3447 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.40531    0.42239   3.327  0.00162 ** 
## inf          0.56884    0.07832   7.263 1.88e-09 ***
## def          0.36276    0.12337   2.940  0.00488 ** 
## y77          1.47773    0.52349   2.823  0.00673 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.733 on 52 degrees of freedom
## Multiple R-squared:  0.6549, Adjusted R-squared:  0.635 
## F-statistic:  32.9 on 3 and 52 DF,  p-value: 4.608e-12
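Since model15 is nested in model16, the contribution of the break dummy can also be assessed with a nested-model F-test (a suggested follow-up using the fits above; with a single added regressor it reproduces the t-test on y77):

anova(model15, model16)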

C9

# Load necessary library
library(wooldridge)

# Load the dataset 'volat'
data("volat")

# Display the structure of the dataset to verify variable names
str(volat)
## 'data.frame':    558 obs. of  17 variables:
##  $ date  : num  1947 1947 1947 1947 1947 ...
##  $ sp500 : num  15.2 15.8 15.2 14.6 14.3 ...
##  $ divyld: num  4.49 4.38 4.61 4.75 5.05 ...
##  $ i3    : num  0.38 0.38 0.38 0.38 0.38 ...
##  $ ip    : num  22.4 22.5 22.6 22.5 22.6 ...
##  $ pcsp  : num  NA 46.5 -48.6 -44.3 -21.4 ...
##  $ rsp500: num  NA 50.9 -44 -39.6 -16.3 ...
##  $ pcip  : num  NA 5.36 5.33 -5.31 5.33 ...
##  $ ci3   : num  NA 0 0 0 0 ...
##  $ ci3_1 : num  NA NA 0 0 0 ...
##  $ ci3_2 : num  NA NA NA 0 0 ...
##  $ pcip_1: num  NA NA 5.36 5.33 -5.31 ...
##  $ pcip_2: num  NA NA NA 5.36 5.33 ...
##  $ pcip_3: num  NA NA NA NA 5.36 ...
##  $ pcsp_1: num  NA NA 46.5 -48.6 -44.3 ...
##  $ pcsp_2: num  NA NA NA 46.5 -48.6 ...
##  $ pcsp_3: num  NA NA NA NA 46.5 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Check the first few rows to confirm the data
head(volat)
##      date sp500 divyld   i3   ip      pcsp    rsp500      pcip ci3 ci3_1 ci3_2
## 1 1947.01 15.21   4.49 0.38 22.4        NA        NA        NA  NA    NA    NA
## 2 1947.02 15.80   4.38 0.38 22.5  46.54833  50.92833  5.357163   0    NA    NA
## 3 1947.03 15.16   4.61 0.38 22.6 -48.60762 -43.99762  5.333354   0     0    NA
## 4 1947.04 14.60   4.75 0.38 22.5 -44.32714 -39.57714 -5.309754   0     0     0
## 5 1947.05 14.34   5.05 0.38 22.6 -21.36988 -16.31988  5.333354   0     0     0
## 6 1947.06 14.84   5.03 0.38 22.6  41.84100  46.87100  0.000000   0     0     0
##      pcip_1    pcip_2   pcip_3    pcsp_1    pcsp_2    pcsp_3
## 1        NA        NA       NA        NA        NA        NA
## 2        NA        NA       NA        NA        NA        NA
## 3  5.357163        NA       NA  46.54833        NA        NA
## 4  5.333354  5.357163       NA -48.60762  46.54833        NA
## 5 -5.309754  5.333354 5.357163 -44.32714 -48.60762  46.54833
## 6  5.333354 -5.309754 5.333354 -21.36988 -44.32714 -48.60762
# Estimate the equation rsp500_t = β0 + β1 * pcip_t + β2 * i3_t + u_t using OLS
model <- lm(rsp500 ~ pcip + i3, data = volat)

# Summarize the results of the model
summary(model)
## 
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.871  -22.580    2.103   25.524  138.137 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.84306    3.27488   5.754 1.44e-08 ***
## pcip         0.03642    0.12940   0.281   0.7785    
## i3          -1.36169    0.54072  -2.518   0.0121 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 40.13 on 554 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.01189,    Adjusted R-squared:  0.008325 
## F-statistic: 3.334 on 2 and 554 DF,  p-value: 0.03637
# Interpretation:
# (i) The estimated equation is rsp500_hat = 18.84 + 0.036*pcip - 1.36*i3.
# (ii) pcip is far from significant (p = 0.78); i3 is significant at the 5%
#      level (p = 0.012), with higher T-bill rates associated with lower returns.
# (iii) Although i3 is statistically significant, the R-squared is tiny (~0.012),
#       so rsp500 is at best weakly predictable from these variables.
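Because the errors in a monthly returns regression may be serially correlated, a natural robustness check (a suggested addition; the lmtest and sandwich packages were attached earlier) is to redo the inference with HAC standard errors:

library(lmtest)
library(sandwich)
# Newey-West (HAC) standard errors; lag = 4 is an illustrative choice
coeftest(model, vcov = NeweyWest(model, lag = 4))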

Chapter 11 C7

# Load the wooldridge package
library(wooldridge)

# Load the CONSUMP dataset
data("consump", package = "wooldridge")

# View the structure of the dataset
str(consump)
## 'data.frame':    37 obs. of  24 variables:
##  $ year   : int  1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 ...
##  $ i3     : num  3.41 2.93 2.38 2.78 3.16 ...
##  $ inf    : num  0.7 1.7 1 1 1.3 ...
##  $ rdisp  : num  1530 1565 1616 1694 1756 ...
##  $ rnondc : num  606 615 627 646 660 ...
##  $ rserv  : num  687 717 746 783 819 ...
##  $ pop    : num  177830 180671 183691 186538 189242 ...
##  $ y      : num  8604 8664 8796 9080 9276 ...
##  $ rcons  : num  1294 1333 1373 1430 1479 ...
##  $ c      : num  7275 7377 7476 7665 7814 ...
##  $ r3     : num  2.71 1.23 1.38 1.78 1.86 ...
##  $ lc     : num  8.89 8.91 8.92 8.94 8.96 ...
##  $ ly     : num  9.06 9.07 9.08 9.11 9.14 ...
##  $ gc     : num  NA 0.0139 0.0133 0.0251 0.0192 ...
##  $ gy     : num  NA 0.00696 0.01511 0.03171 0.02145 ...
##  $ gc_1   : num  NA NA 0.0139 0.0133 0.0251 ...
##  $ gy_1   : num  NA NA 0.00696 0.01511 0.03171 ...
##  $ r3_1   : num  NA 2.71 1.23 1.38 1.78 ...
##  $ lc_ly  : num  -0.168 -0.161 -0.163 -0.169 -0.172 ...
##  $ lc_ly_1: num  NA -0.168 -0.161 -0.163 -0.169 ...
##  $ gc_2   : num  NA NA NA 0.0139 0.0133 ...
##  $ gy_2   : num  NA NA NA 0.00696 0.01511 ...
##  $ r3_2   : num  NA NA 2.71 1.23 1.38 ...
##  $ lc_ly_2: num  NA NA -0.168 -0.161 -0.163 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Generate growth in consumption (gc) as specified
consump$gc <- log(consump$c) - log(dplyr::lag(consump$c, n = 1))

# PART (i) - Test the PIH hypothesis
# Under the PIH, consumption growth should be unpredictable, so in
# gc_t = β0 + β1 * gc_t-1 + u_t the null hypothesis is β1 = 0
model1 <- lm(gc ~ dplyr::lag(gc, n = 1), data = consump)
summary(model1)
## 
## Call:
## lm(formula = gc ~ dplyr::lag(gc, n = 1), data = consump)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027878 -0.005974 -0.001451  0.007142  0.020227 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)   
## (Intercept)           0.011431   0.003778   3.026  0.00478 **
## dplyr::lag(gc, n = 1) 0.446141   0.156047   2.859  0.00731 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01161 on 33 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.1985, Adjusted R-squared:  0.1742 
## F-statistic: 8.174 on 1 and 33 DF,  p-value: 0.00731
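With a coefficient of 0.446 on lagged consumption growth (p = 0.007), lagged growth is a statistically significant predictor, so this simple test rejects the PIH implication that consumption growth should be unpredictable.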
# PART (ii) - Add additional variables: gy_t-1, i3_t-1, and inf_t-1
# Include lagged variables of gy, i3, and inf
consump$gy_lag <- dplyr::lag(consump$gy, n = 1)
consump$i3_lag <- dplyr::lag(consump$i3, n = 1)
consump$inf_lag <- dplyr::lag(consump$inf, n = 1)

# Run regression with additional variables
model2 <- lm(gc ~ dplyr::lag(gc, n = 1) + gy_lag + i3_lag + inf_lag, data = consump)
summary(model2)
## 
## Call:
## lm(formula = gc ~ dplyr::lag(gc, n = 1) + gy_lag + i3_lag + inf_lag, 
##     data = consump)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0249093 -0.0075869  0.0000855  0.0087234  0.0188617 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)            0.0225940  0.0070892   3.187  0.00335 **
## dplyr::lag(gc, n = 1)  0.4335832  0.2896525   1.497  0.14487   
## gy_lag                -0.1079087  0.1946371  -0.554  0.58341   
## i3_lag                -0.0007467  0.0011107  -0.672  0.50654   
## inf_lag               -0.0008281  0.0010041  -0.825  0.41606   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01134 on 30 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.3038, Adjusted R-squared:  0.211 
## F-statistic: 3.273 on 4 and 30 DF,  p-value: 0.02431
# PART (iii) - Evaluate significance of lag(gc) in model2
# From the summary above, the p-value is about 0.145, so lagged consumption
# growth is no longer statistically significant once the other lags are added
p_value_lag_gc <- summary(model2)$coefficients["dplyr::lag(gc, n = 1)", "Pr(>|t|)"]

# PART (iv) - F-test for the joint significance of the three added lags
# (gy_lag, i3_lag, inf_lag); model1 is nested in model2, so use anova()
anova(model1, model2)
## Analysis of Variance Table
## 
## Model 1: gc ~ dplyr::lag(gc, n = 1)
## Model 2: gc ~ dplyr::lag(gc, n = 1) + gy_lag + i3_lag + inf_lag
##   Res.Df       RSS Df  Sum of Sq      F Pr(>F)
## 1     33 0.0044446                            
## 2     30 0.0038608  3 0.00058384 1.5122 0.2315
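With F = 1.51 and a p-value of 0.23, the three added lags are jointly insignificant at any conventional level, so beyond lagged consumption growth itself there is little evidence of additional predictability.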

C12

library(wooldridge)  
library(lmtest)      
library(sandwich)    
library(ggplot2)     
library(tseries)     
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
data("minwage") 

# Focus on the relevant variables
# gwage232: growth in wage (sector 232)
# gemp232: growth in employment (sector 232)
# gmwage: growth in federal minimum wage
# gcpi: growth in CPI

# Create lagged variables
minwage$gwage232_lag1 <- c(NA, minwage$gwage232[-nrow(minwage)])  # Lagged gwage232
minwage$gemp232_lag1 <- c(NA, minwage$gemp232[-nrow(minwage)])  # Lagged gemp232

# Remove rows with NA values (na.omit drops a row if any column is NA)
minwage_clean <- na.omit(minwage)

# (i) Calculate first-order autocorrelation in gwage232
acf_gwage <- acf(minwage_clean$gwage232, lag.max = 1, plot = FALSE)  # ACF
first_order_autocorrelation <- acf_gwage$acf[2]  # First lag

cat("First-order autocorrelation in gwage232:", first_order_autocorrelation, "\n")
## First-order autocorrelation in gwage232: -0.02425413
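The same quantity can be cross-checked directly (a suggested consistency check; small differences arise because acf() centers on the full-sample mean and uses a denominator of n):

x <- minwage_clean$gwage232
cor(x[-1], x[-length(x)])  # correlation of consecutive observations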
# (ii) Estimate the dynamic model
model_dynamic <- lm(gwage232 ~ gwage232_lag1 + gmwage + gcpi, data = minwage_clean)
summary(model_dynamic)
## 
## Call:
## lm(formula = gwage232 ~ gwage232_lag1 + gmwage + gcpi, data = minwage_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044649 -0.004114 -0.001262  0.004481  0.041568 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.0023648  0.0004295   5.506 5.45e-08 ***
## gwage232_lag1 -0.0684816  0.0343986  -1.991   0.0470 *  
## gmwage         0.1517511  0.0095115  15.955  < 2e-16 ***
## gcpi           0.2586795  0.0858602   3.013   0.0027 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007775 on 595 degrees of freedom
## Multiple R-squared:  0.3068, Adjusted R-squared:  0.3033 
## F-statistic: 87.79 on 3 and 595 DF,  p-value: < 2.2e-16
# Check if gmwage has a significant effect while controlling for other variables
cat("Effect of gmwage (contemporaneous effect):\n")
## Effect of gmwage (contemporaneous effect):
print(summary(model_dynamic)$coefficients["gmwage", ])
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 1.517511e-01 9.511457e-03 1.595456e+01 5.753131e-48
# (iii) Add the lagged growth in employment (gemp232_lag1)
model_with_lagged_employment <- lm(gwage232 ~ gwage232_lag1 + gmwage + gcpi + gemp232_lag1, data = minwage_clean)
summary(model_with_lagged_employment)
## 
## Call:
## lm(formula = gwage232 ~ gwage232_lag1 + gmwage + gcpi + gemp232_lag1, 
##     data = minwage_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.043900 -0.004316 -0.000955  0.004255  0.042430 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.0023960  0.0004254   5.633 2.74e-08 ***
## gwage232_lag1 -0.0656875  0.0340720  -1.928 0.054344 .  
## gmwage         0.1525470  0.0094213  16.192  < 2e-16 ***
## gcpi           0.2536899  0.0850342   2.983 0.002968 ** 
## gemp232_lag1   0.0606620  0.0169691   3.575 0.000379 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007699 on 594 degrees of freedom
## Multiple R-squared:  0.3214, Adjusted R-squared:  0.3169 
## F-statistic: 70.34 on 4 and 594 DF,  p-value: < 2.2e-16
# Check if the lagged growth in employment is statistically significant
cat("Effect of lagged growth in employment (gemp232_lag1):\n")
## Effect of lagged growth in employment (gemp232_lag1):
print(summary(model_with_lagged_employment)$coefficients["gemp232_lag1", ])
##     Estimate   Std. Error      t value     Pr(>|t|) 
## 0.0606619954 0.0169690920 3.5748521768 0.0003789121
# (iv) Compare the two models (with and without lagged variables)
model_without_lags <- lm(gwage232 ~ gmwage + gcpi, data = minwage_clean)
summary(model_without_lags)
## 
## Call:
## lm(formula = gwage232 ~ gmwage + gcpi, data = minwage_clean)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.044501 -0.004053 -0.001328  0.004503  0.041194 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.0021926  0.0004217   5.199 2.75e-07 ***
## gmwage      0.1506131  0.0095178  15.824  < 2e-16 ***
## gcpi        0.2374126  0.0854046   2.780  0.00561 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.007794 on 596 degrees of freedom
## Multiple R-squared:  0.3022, Adjusted R-squared:  0.2999 
## F-statistic: 129.1 on 2 and 596 DF,  p-value: < 2.2e-16
# Compare coefficients of gmwage across models
cat("Coefficient of gmwage in model without lags:", summary(model_without_lags)$coefficients["gmwage", 1], "\n")
## Coefficient of gmwage in model without lags: 0.1506131
cat("Coefficient of gmwage in model with lags:", summary(model_with_lagged_employment)$coefficients["gmwage", 1], "\n")
## Coefficient of gmwage in model with lags: 0.152547
# (v) Regress gmwage on gwage232_lag1 and gemp232_lag1
model_gmwage <- lm(gmwage ~ gwage232_lag1 + gemp232_lag1, data = minwage_clean)
summary(model_gmwage)
## 
## Call:
## lm(formula = gmwage ~ gwage232_lag1 + gemp232_lag1, data = minwage_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.01987 -0.00511 -0.00385 -0.00290  0.62191 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)  
## (Intercept)    0.003487   0.001465   2.380   0.0176 *
## gwage232_lag1  0.212051   0.146727   1.445   0.1489  
## gemp232_lag1  -0.042776   0.073749  -0.580   0.5621  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03347 on 596 degrees of freedom
## Multiple R-squared:  0.004117,   Adjusted R-squared:  0.0007754 
## F-statistic: 1.232 on 2 and 596 DF,  p-value: 0.2924
# Report R-squared of the regression
r_squared_gmwage <- summary(model_gmwage)$r.squared
cat("R-squared of gmwage regression:", r_squared_gmwage, "\n")
## R-squared of gmwage regression: 0.004117273
# Interpretation: the lagged variables explain almost none of the variation in
# gmwage (R-squared ≈ 0.004), i.e., gmwage is essentially uncorrelated with
# lagged sector wage and employment growth; this is why the gmwage coefficient
# barely moves when those lags are added to the wage equation.

Chapter 12 C11

# Load necessary libraries
library(wooldridge)  # For accessing Wooldridge datasets
library(lmtest)      # For model diagnostics
library(sandwich)    # For robust standard errors
library(tseries)     # For ARCH models
library(ggplot2)     # For plotting

# Load the NYSE dataset
data("nyse")  # The dataset is part of the Wooldridge package

# Add lagged returns for the analysis
nyse$return_lag1 <- c(NA, nyse$return[-nrow(nyse)])  # Lagged return_t-1
nyse$return_lag2 <- c(NA, nyse$return_lag1[-nrow(nyse)])  # Lagged return_t-2

# Remove rows with NA values (introduced by lags)
nyse_clean <- na.omit(nyse)

# (i) Estimate the model (12.47) and calculate squared residuals
model_12_47 <- lm(return ~ return_lag1, data = nyse_clean)
summary(model_12_47)
## 
## Call:
## lm(formula = return ~ return_lag1, data = nyse_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2633  -1.2997   0.1011   1.3203   8.0688 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  0.17862    0.08084   2.210   0.0275 *
## return_lag1  0.05806    0.03811   1.523   0.1281  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.112 on 686 degrees of freedom
## Multiple R-squared:  0.003372,   Adjusted R-squared:  0.001919 
## F-statistic: 2.321 on 1 and 686 DF,  p-value: 0.1281
# Calculate squared residuals
nyse_clean$residuals_squared <- residuals(model_12_47)^2

# Find average, minimum, and maximum of squared residuals
avg_res <- mean(nyse_clean$residuals_squared, na.rm = TRUE)
min_res <- min(nyse_clean$residuals_squared, na.rm = TRUE)
max_res <- max(nyse_clean$residuals_squared, na.rm = TRUE)

cat("Average of squared residuals:", avg_res, "\n")
## Average of squared residuals: 4.446345
cat("Minimum of squared residuals:", min_res, "\n")
## Minimum of squared residuals: 3.591354e-06
cat("Maximum of squared residuals:", max_res, "\n")
## Maximum of squared residuals: 232.9687
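# Added note: the average squared residual (about 4.45) estimates the
# unconditional error variance, and the gap between the minimum and maximum
# squared residuals already suggests substantial heteroskedasticity.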
# (ii) Estimate the heteroskedasticity model
nyse_clean$return_lag1_sq <- nyse_clean$return_lag1^2  # Squared lagged return
hetero_model <- lm(residuals_squared ~ return_lag1 + return_lag1_sq, data = nyse_clean)
summary(hetero_model)
## 
## Call:
## lm(formula = residuals_squared ~ return_lag1 + return_lag1_sq, 
##     data = nyse_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -16.545  -3.021  -1.974   0.695 221.544 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     3.25771    0.44131   7.382 4.54e-13 ***
## return_lag1    -0.78556    0.19626  -4.003 6.95e-05 ***
## return_lag1_sq  0.29749    0.03558   8.361 3.46e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.67 on 685 degrees of freedom
## Multiple R-squared:  0.1305, Adjusted R-squared:  0.128 
## F-statistic: 51.41 on 2 and 685 DF,  p-value: < 2.2e-16
# Extract coefficients, standard errors, R-squared, and adjusted R-squared
coefs <- coef(hetero_model)
std_errors <- sqrt(diag(vcov(hetero_model)))
r_squared <- summary(hetero_model)$r.squared
adj_r_squared <- summary(hetero_model)$adj.r.squared

cat("Coefficients:", coefs, "\n")
## Coefficients: 3.257706 -0.7855573 0.2974907
cat("Standard Errors:", std_errors, "\n")
## Standard Errors: 0.441314 0.1962611 0.0355787
cat("R-squared:", r_squared, "\n")
## R-squared: 0.1305106
cat("Adjusted R-squared:", adj_r_squared, "\n")
## Adjusted R-squared: 0.127972
# (iii) Sketch conditional variance as a function of lagged return
nyse_clean$conditional_variance <- predict(hetero_model, newdata = nyse_clean)

ggplot(nyse_clean, aes(x = return_lag1, y = conditional_variance)) +
  geom_line(color = "blue") +
  labs(title = "Conditional Variance vs Lagged Return",
       x = "Lagged Return (return_t-1)", y = "Conditional Variance") +
  theme_minimal()

# Find the lagged return at which the fitted variance is smallest; for the
# quadratic fit this is -b/(2c) = 0.78556 / (2 * 0.29749) ≈ 1.32
min_variance <- min(nyse_clean$conditional_variance, na.rm = TRUE)
return_at_min <- nyse_clean$return_lag1[which.min(nyse_clean$conditional_variance)]
cat("Minimum conditional variance:", min_variance, "\n")
## Minimum conditional variance: 2.739119
# (iv) Check for negative variance estimates
negative_variance <- sum(nyse_clean$conditional_variance < 0, na.rm = TRUE)
cat("Number of negative variance estimates:", negative_variance, "\n")
## Number of negative variance estimates: 0
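# Added note: no fitted variance is negative here (the minimum is about 2.74),
# but a quadratic variance model is not guaranteed to stay nonnegative,
# whereas an ARCH specification with nonnegative coefficients is.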
# (v) Compare heteroskedasticity model to ARCH(1)
# In tseries::garch, order = c(p, q) gives the GARCH order first and the ARCH
# order second, so a pure ARCH(1) is order = c(0, 1)
arch1_model <- garch(nyse_clean$return, order = c(0, 1))
## 
##  ***** ESTIMATION WITH ANALYTICAL GRADIENT ***** 
## 
## 
##      I     INITIAL X(I)        D(I)
## 
##      1     4.244486e+00     1.000e+00
##      2     5.000000e-02     1.000e+00
## 
##     IT   NF      F         RELDF    PRELDF    RELDX   STPPAR   D*STEP   NPRELDF
##      0    1  8.604e+02
##      1    4  8.604e+02  1.04e-05  3.38e-05  1.1e-03  2.9e+02  1.0e-02  4.94e-03
##      2    5  8.604e+02  1.86e-06  1.97e-06  1.0e-03  1.3e+00  1.0e-02  1.99e-06
##      3    6  8.604e+02  4.21e-09  1.02e-07  1.5e-03  0.0e+00  1.3e-02  1.02e-07
##      4    7  8.604e+02  2.86e-09  9.16e-08  7.7e-04  1.8e+00  6.7e-03  1.13e-06
##      5    8  8.604e+02  1.89e-09  4.61e-08  3.9e-04  1.9e+00  3.4e-03  1.67e-06
##      6   11  8.604e+02  1.01e-10  5.29e-09  4.5e-05  2.5e+01  4.0e-04  1.62e-06
##      7   14  8.604e+02  3.62e-11  5.04e-10  4.1e-06  3.9e+00  3.8e-05  1.61e-06
##      8   15  8.604e+02  9.74e-12  2.51e-10  2.1e-06  4.2e+02  1.9e-05  1.61e-06
##      9   16  8.604e+02  3.57e-12  1.26e-10  1.1e-06  5.6e+03  9.4e-06  1.61e-06
##     10   17  8.604e+02  1.69e-12  6.27e-11  5.3e-07  1.9e+04  4.7e-06  1.61e-06
##     11   18  8.604e+02  8.28e-13  3.14e-11  2.7e-07  4.8e+04  2.4e-06  1.60e-06
##     12   19  8.604e+02  4.09e-13  1.57e-11  1.3e-07  1.0e+05  1.2e-06  1.60e-06
##     13   20  8.604e+02  2.04e-13  7.84e-12  6.7e-08  2.2e+05  5.9e-07  1.60e-06
##     14   21  8.604e+02  1.01e-13  3.92e-12  3.3e-08  4.4e+05  2.9e-07  1.60e-06
##     15   22  8.604e+02  5.23e-14  1.96e-12  1.7e-08  8.9e+05  1.5e-07  1.60e-06
##     16   23  8.604e+02  2.43e-14  9.80e-13  8.4e-09  1.8e+06  7.3e-08  1.60e-06
##     17   24  8.604e+02  1.33e-14  4.90e-13  4.2e-09  3.6e+06  3.7e-08  1.60e-06
##     18   25  8.604e+02  6.47e-15  2.45e-13  2.1e-09  7.2e+06  1.8e-08  1.60e-06
##     19   26  8.604e+02  3.57e-15  1.23e-13  1.0e-09  1.4e+07  9.2e-09  1.60e-06
##     20   27  8.604e+02  5.29e-16  6.13e-14  5.2e-10  2.9e+07  4.6e-09  1.60e-06
##     21   28  8.604e+02  1.98e-15  3.06e-14  2.6e-10  5.8e+07  2.3e-09  1.60e-06
##     22   36  8.604e+02  0.00e+00  1.82e-18  1.6e-14  9.7e+11  1.4e-13  1.60e-06
## 
##  ***** FALSE CONVERGENCE *****
## 
##  FUNCTION     8.604278e+02   RELDX        1.557e-14
##  FUNC. EVALS      36         GRAD. EVALS      22
##  PRELDF       1.824e-18      NPRELDF      1.601e-06
## 
##      I      FINAL X(I)        D(I)          G(I)
## 
##      1    4.278928e+00     1.000e+00    -1.119e-02
##      2    4.975478e-02     1.000e+00     2.591e-03
summary(arch1_model)
## 
## Call:
## garch(x = nyse_clean$return, order = c(1, 0))
## 
## Model:
## GARCH(1,0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2203 -0.5202  0.1519  0.7279  3.9815 
## 
## Coefficient(s):
##     Estimate  Std. Error  t value Pr(>|t|)
## a0   4.27893    83.25229    0.051    0.959
## b1   0.04975    18.48916    0.003    0.998
## 
## Diagnostic Tests:
##  Jarque Bera Test
## 
## data:  Residuals
## X-squared = 776.88, df = 2, p-value < 2.2e-16
## 
## 
##  Box-Ljung test
## 
## data:  Squared.Residuals
## X-squared = 87.793, df = 1, p-value < 2.2e-16
# (vi) Extend to ARCH(2) by adding a second lag of the squared shock
arch2_model <- garch(nyse_clean$return, order = c(0, 2))
## 
##  ***** ESTIMATION WITH ANALYTICAL GRADIENT ***** 
## 
## 
##      I     INITIAL X(I)        D(I)
## 
##      1     4.021092e+00     1.000e+00
##      2     5.000000e-02     1.000e+00
##      3     5.000000e-02     1.000e+00
## 
##     IT   NF      F         RELDF    PRELDF    RELDX   STPPAR   D*STEP   NPRELDF
##      0    1  8.581e+02
##      1    5  8.581e+02  5.14e-06  1.02e-05  2.8e-04  8.7e+02  3.2e-03  4.45e-03
##      2   12  8.581e+02  6.59e-11  6.25e-10  3.8e-06  2.2e+00  4.7e-05  1.35e-07
##      3   15  8.581e+02  7.54e-11  3.87e-10  2.1e-06  3.6e+00  2.0e-05  1.35e-07
##      4   31  8.581e+02  1.85e-15  1.60e-14  1.1e-10  1.5e+06  9.0e-10  1.36e-07
##      5   39  8.581e+02 -1.59e-15  1.30e-18  9.1e-15  2.0e+10  7.3e-14  1.60e-07
## 
##  ***** FALSE CONVERGENCE *****
## 
##  FUNCTION     8.581133e+02   RELDX        9.055e-15
##  FUNC. EVALS      39         GRAD. EVALS       5
##  PRELDF       1.304e-18      NPRELDF      1.595e-07
## 
##      I      FINAL X(I)        D(I)          G(I)
## 
##      1    4.021524e+00     1.000e+00     1.530e-02
##      2    5.223246e-02     1.000e+00    -8.293e-04
##      3    5.222700e-02     1.000e+00    -1.780e-04
## Warning in garch(nyse_clean$return, order = c(2, 0)): singular information
summary(arch2_model)
## 
## Call:
## garch(x = nyse_clean$return, order = c(2, 0))
## 
## Model:
## GARCH(2,0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.2303 -0.5216  0.1484  0.7264  3.9869 
## 
## Coefficient(s):
##     Estimate  Std. Error  t value Pr(>|t|)
## a0   4.02152          NA       NA       NA
## b1   0.05223          NA       NA       NA
## b2   0.05223          NA       NA       NA
## 
## Diagnostic Tests:
##  Jarque Bera Test
## 
## data:  Residuals
## X-squared = 786.77, df = 2, p-value < 2.2e-16
## 
## 
##  Box-Ljung test
## 
## data:  Squared.Residuals
## X-squared = 88.025, df = 1, p-value < 2.2e-16
# Compare ARCH(1) and ARCH(2) models
arch1_aic <- AIC(arch1_model)
arch2_aic <- AIC(arch2_model)

cat("ARCH(1) AIC:", arch1_aic, "\n")
## ARCH(1) AIC: 2987.477
cat("ARCH(2) AIC:", arch2_aic, "\n")
## ARCH(2) AIC: 2983.01
if (arch2_aic < arch1_aic) {
  cat("ARCH(2) model fits better than ARCH(1).\n")
} else {
  cat("ARCH(1) model fits better than ARCH(2).\n")
}
## ARCH(2) model fits better than ARCH(1).