library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dslabs)
library(ISLR2)
library(matlib)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl.init' failed, running with 'rgl.useNULL = TRUE'.
library(wooldridge)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(sandwich)
data <- wooldridge::sleep75
head(sleep75)
## age black case clerical construc educ earns74 gdhlth inlf leis1 leis2 leis3
## 1 32 0 1 0 0 12 0 0 1 3529 3479 3479
## 2 31 0 2 0 0 14 9500 1 1 2140 2140 2140
## 3 44 0 3 0 0 17 42500 1 1 4595 4505 4227
## 4 30 0 4 0 0 12 42500 1 1 3211 3211 3211
## 5 64 0 5 0 0 14 2500 1 1 4052 4007 4007
## 6 41 0 6 0 0 12 0 1 1 4812 4797 4797
## smsa lhrwage lothinc male marr prot rlxall selfe sleep slpnaps south
## 1 0 1.955861 10.075380 1 1 1 3163 0 3113 3163 0
## 2 0 0.357674 0.000000 1 0 1 2920 1 2920 2920 1
## 3 1 3.021887 0.000000 1 1 0 3038 1 2670 2760 0
## 4 0 2.263844 0.000000 0 1 1 3083 1 3083 3083 0
## 5 0 1.011601 9.328213 1 1 1 3493 0 3448 3493 0
## 6 0 2.957511 10.657280 1 1 1 4078 0 4063 4078 0
## spsepay spwrk75 totwrk union worknrm workscnd exper yngkid yrsmarr hrwage
## 1 0 0 3438 0 3438 0 14 0 13 7.070004
## 2 0 0 5020 0 5020 0 11 0 0 1.429999
## 3 20000 1 2815 0 2815 0 21 0 0 20.529997
## 4 5000 1 3786 0 3786 0 12 0 12 9.619998
## 5 2400 1 2580 0 2580 0 44 0 33 2.750000
## 6 0 0 1205 0 0 1205 23 0 23 19.249998
## agesq
## 1 1024
## 2 961
## 3 1936
## 4 900
## 5 4096
## 6 1681
data(sleep75)
model1 <- lm(sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
summary(model1)
##
## Call:
## lm(formula = sleep ~ totwrk + educ + age + I(age^2) + male, data = sleep75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2378.00 -243.29 6.74 259.24 1350.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3840.83197 235.10870 16.336 <2e-16 ***
## totwrk -0.16342 0.01813 -9.013 <2e-16 ***
## educ -11.71332 5.86689 -1.997 0.0463 *
## age -8.69668 11.20746 -0.776 0.4380
## I(age^2) 0.12844 0.13390 0.959 0.3378
## male 87.75243 34.32616 2.556 0.0108 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.7 on 700 degrees of freedom
## Multiple R-squared: 0.1228, Adjusted R-squared: 0.1165
## F-statistic: 19.59 on 5 and 700 DF, p-value: < 2.2e-16
2*(1-pt(87.75/34.33,700,FALSE))
## [1] 0.01079613
model1 <- lm(sleep ~ totwrk + educ + male, data = sleep75)
summary(model1)
2*(1-pt(2.19/0.53,4131,FALSE))
2*(1-pt(45.09/4.29,4131,FALSE))
2*(1-pt(169.81/12.71,4131,FALSE))
library(wooldridge)
data1 <- wooldridge::gpa2
head(data1)
## sat tothrs colgpa athlete verbmath hsize hsrank hsperc female white black
## 1 920 43 2.04 1 0.48387 0.10 4 40.00000 1 0 0
## 2 1170 18 4.00 0 0.82813 9.40 191 20.31915 0 1 0
## 3 810 14 1.78 1 0.88372 1.19 42 35.29412 0 1 0
## 4 940 40 2.42 0 0.80769 5.71 252 44.13310 0 1 0
## 5 1180 18 2.61 0 0.73529 2.14 86 40.18692 0 1 0
## 6 980 114 3.03 0 0.81481 2.68 41 15.29851 1 1 0
## hsizesq
## 1 0.0100
## 2 88.3600
## 3 1.4161
## 4 32.6041
## 5 4.5796
## 6 7.1824
data("gpa2")
model2 <- lm(sat ~ hsize + I(hsize^2) + female + black + I(female*black), data = gpa2)
summary(model2)
##
## Call:
## lm(formula = sat ~ hsize + I(hsize^2) + female + black + I(female *
## black), data = gpa2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## I(hsize^2) -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## I(female * black) 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
model4<- lm(sat ~ hsize + hsizesq + female + black + female*black , data = data1)
summary(model4)
##
## Call:
## lm(formula = sat ~ hsize + hsizesq + female + black + female *
## black, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -570.45 -89.54 -5.24 85.41 479.13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1028.0972 6.2902 163.445 < 2e-16 ***
## hsize 19.2971 3.8323 5.035 4.97e-07 ***
## hsizesq -2.1948 0.5272 -4.163 3.20e-05 ***
## female -45.0915 4.2911 -10.508 < 2e-16 ***
## black -169.8126 12.7131 -13.357 < 2e-16 ***
## female:black 62.3064 18.1542 3.432 0.000605 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 133.4 on 4131 degrees of freedom
## Multiple R-squared: 0.08578, Adjusted R-squared: 0.08468
## F-statistic: 77.52 on 5 and 4131 DF, p-value: < 2.2e-16
2*(1-pt(2.19/0.53,4131,FALSE))
## [1] 3.666238e-05
t_stat <- 62.31/18.15
df <- 4131-1
p_value <- 2 * (1 - pt(abs(t_stat), df))
p_value
## [1] 0.0006026815
P value is equal to 0.0006. So that the estimated difference is statistically significant.
The SAT score difference between non-Black males (female = 0, black = 0) and Black males (female = 0, black = 1) is -169.81, which corresponds to the coefficient for the black variable. This indicates that, on average, Black males score 169.81 points lower on the SAT compared to non-Black males. When holding female = 0, the coefficient on black shows that a Black male’s predicted SAT score is about 170 points lower than that of a comparable non-Black male.
The t-statistic for this coefficient exceeds 13 in absolute value, providing strong evidence to reject the hypothesis that no difference exists between these groups under ceteris paribus conditions. This result is further reinforced by the large sample size, which increases the precision of the estimate.
2*(1-pt(169.81/12.71,4131,FALSE))
Non-Black females are identified by female =
1 and black = 0, while Black females are
represented by female = 1 and black =
1. The difference in SAT scores between these two groups is
calculated as the sum of the coefficients for the black
variable and the female × black interaction term:
169.81 - 62.31 = 107.5.
This indicates that, on average, non-Black females score 107.5 points higher on the SAT than Black females.
Since this difference is based on two variables (the black and female × black terms), a t-test cannot be directly applied to measure its statistical significance. Instead, to test this difference, we can introduce three dummy variables with non-Black females as the reference group. The t-statistic can then be obtained from the coefficient of the Black female dummy variable to evaluate the significance of the difference.
t_stat2 <- 45.09/6.29 df <- 4131-1 p_value2 <- 2 * (1 - pt(abs(t_stat2), df)) p_value2
library(wooldridge)
data1 <- wooldridge::gpa1
head(data1)
## age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1 21 0 0 1 0 0 0 1 0 3.0 3.0 21
## 2 21 0 0 1 0 0 0 1 0 3.4 3.2 24
## 3 20 0 1 0 0 0 0 1 0 3.0 3.6 26
## 4 19 1 0 0 0 1 1 1 0 3.5 3.5 27
## 5 20 0 1 0 0 0 0 1 0 3.6 3.9 28
## 6 20 0 0 1 0 1 1 1 0 3.0 3.4 25
## job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1 0 1 1 0 0 0 0 0 1 1 0 0
## 2 0 1 1 0 0 0 0 0 1 0 1 1
## 3 1 0 0 0 1 0 0 0 1 1 0 1
## 4 1 0 0 0 1 0 0 0 0 1 0 0
## 5 0 1 0 1 0 0 0 0 1 1 1 0
## 6 0 0 0 0 1 0 0 0 1 1 0 0
## skipped alcohol gradMI fathcoll mothcoll
## 1 2 1.0 1 0 0
## 2 0 1.0 1 1 1
## 3 0 1.0 1 1 1
## 4 0 0.0 0 0 0
## 5 0 1.5 1 1 0
## 6 0 0.0 0 1 0
library(wooldridge)
data1 <- wooldridge::gpa1
head(data1)
## age soph junior senior senior5 male campus business engineer colGPA hsGPA ACT
## 1 21 0 0 1 0 0 0 1 0 3.0 3.0 21
## 2 21 0 0 1 0 0 0 1 0 3.4 3.2 24
## 3 20 0 1 0 0 0 0 1 0 3.0 3.6 26
## 4 19 1 0 0 0 1 1 1 0 3.5 3.5 27
## 5 20 0 1 0 0 0 0 1 0 3.6 3.9 28
## 6 20 0 0 1 0 1 1 1 0 3.0 3.4 25
## job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1 0 1 1 0 0 0 0 0 1 1 0 0
## 2 0 1 1 0 0 0 0 0 1 0 1 1
## 3 1 0 0 0 1 0 0 0 1 1 0 1
## 4 1 0 0 0 1 0 0 0 0 1 0 0
## 5 0 1 0 1 0 0 0 0 1 1 1 0
## 6 0 0 0 0 1 0 0 0 1 1 0 0
## skipped alcohol gradMI fathcoll mothcoll
## 1 2 1.0 1 0 0
## 2 0 1.0 1 1 1
## 3 0 1.0 1 1 1
## 4 0 0.0 0 0 0
## 5 0 1.5 1 1 0
## 6 0 0.0 0 1 0
data_C1 <- gpa1
model_C1 <- lm(colGPA~PC+hsGPA+ACT+mothcoll+fathcoll,data= data_C1)
summary(model_C1)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll,
## data = data_C1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## PC 0.151854 0.058716 2.586 0.010762 *
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
library(wooldridge)
data1 <- wooldridge::gpa1
# Perform the regression analysis
model_7C1_ur <- lm(colGPA ~ PC + hsGPA + ACT, data = data1)
summary(model_7C1_ur)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7901 -0.2622 -0.0107 0.2334 0.7570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.263520 0.333125 3.793 0.000223 ***
## PC 0.157309 0.057287 2.746 0.006844 **
## hsGPA 0.447242 0.093647 4.776 4.54e-06 ***
## ACT 0.008659 0.010534 0.822 0.412513
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3325 on 137 degrees of freedom
## Multiple R-squared: 0.2194, Adjusted R-squared: 0.2023
## F-statistic: 12.83 on 3 and 137 DF, p-value: 1.932e-07
F= ((0.2222-0.2194)/2)/((1-0.2222)/135)
F
## [1] 0.2429931
critical_value = qf(0.95,2,135, TRUE)
critical_value
## [1] 4.423013
model6<- lm(colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll , data = data1)
summary(model6)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll,
## data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78149 -0.25726 -0.02121 0.24691 0.74432
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.255554 0.335392 3.744 0.000268 ***
## PC 0.151854 0.058716 2.586 0.010762 *
## hsGPA 0.450220 0.094280 4.775 4.61e-06 ***
## ACT 0.007724 0.010678 0.723 0.470688
## mothcoll -0.003758 0.060270 -0.062 0.950376
## fathcoll 0.041800 0.061270 0.682 0.496265
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3344 on 135 degrees of freedom
## Multiple R-squared: 0.2222, Adjusted R-squared: 0.1934
## F-statistic: 7.713 on 5 and 135 DF, p-value: 2.083e-06
library(car)
hypotheses <- c("mothcoll=0", "fathcoll=0")
joint_test <- linearHypothesis(model6, hypotheses)
joint_test
##
## Linear hypothesis test:
## mothcoll = 0
## fathcoll = 0
##
## Model 1: restricted model
## Model 2: colGPA ~ PC + hsGPA + ACT + mothcoll + fathcoll
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 137 15.149
## 2 135 15.094 2 0.054685 0.2446 0.7834
summary(joint_test)
## Res.Df RSS Df Sum of Sq F
## Min. :135.0 Min. :15.09 Min. :2 Min. :0.05469 Min. :0.2446
## 1st Qu.:135.5 1st Qu.:15.11 1st Qu.:2 1st Qu.:0.05469 1st Qu.:0.2446
## Median :136.0 Median :15.12 Median :2 Median :0.05469 Median :0.2446
## Mean :136.0 Mean :15.12 Mean :2 Mean :0.05469 Mean :0.2446
## 3rd Qu.:136.5 3rd Qu.:15.14 3rd Qu.:2 3rd Qu.:0.05469 3rd Qu.:0.2446
## Max. :137.0 Max. :15.15 Max. :2 Max. :0.05469 Max. :0.2446
## NA's :1 NA's :1 NA's :1
## Pr(>F)
## Min. :0.7834
## 1st Qu.:0.7834
## Median :0.7834
## Mean :0.7834
## 3rd Qu.:0.7834
## Max. :0.7834
## NA's :1
model3 <- lm (colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll + fathcoll, data = gpa1)
summary (model3)
##
## Call:
## lm(formula = colGPA ~ PC + hsGPA + I(hsGPA^2) + ACT + mothcoll +
## fathcoll, data = gpa1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.78998 -0.24327 -0.00648 0.26179 0.72231
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.040328 2.443038 2.063 0.0410 *
## PC 0.140446 0.058858 2.386 0.0184 *
## hsGPA -1.802520 1.443552 -1.249 0.2140
## I(hsGPA^2) 0.337341 0.215711 1.564 0.1202
## ACT 0.004786 0.010786 0.444 0.6580
## mothcoll 0.003091 0.060110 0.051 0.9591
## fathcoll 0.062761 0.062401 1.006 0.3163
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3326 on 134 degrees of freedom
## Multiple R-squared: 0.2361, Adjusted R-squared: 0.2019
## F-statistic: 6.904 on 6 and 134 DF, p-value: 2.088e-06
data3 <- wooldridge::wage2
head(data3)
## wage hours IQ KWW educ exper tenure age married black south urban sibs
## 1 769 40 93 35 12 11 2 31 1 0 0 1 1
## 2 808 50 119 41 18 11 16 37 1 0 0 1 1
## 3 825 40 108 46 14 11 9 33 1 0 0 1 1
## 4 650 40 96 32 12 13 7 32 1 0 0 1 4
## 5 562 40 74 27 11 14 5 34 1 0 0 1 10
## 6 1400 40 116 43 16 14 2 35 1 1 0 1 1
## brthord meduc feduc lwage
## 1 2 8 8 6.645091
## 2 NA 14 14 6.694562
## 3 2 14 14 6.715384
## 4 3 12 12 6.476973
## 5 6 6 11 6.331502
## 6 2 8 NA 7.244227
data("wage2")
model4 <- lm(log(wage) ~ educ +exper +tenure + married + black + south + urban, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98069 -0.21996 0.00707 0.24288 1.22822
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.395497 0.113225 47.653 < 2e-16 ***
## educ 0.065431 0.006250 10.468 < 2e-16 ***
## exper 0.014043 0.003185 4.409 1.16e-05 ***
## tenure 0.011747 0.002453 4.789 1.95e-06 ***
## married 0.199417 0.039050 5.107 3.98e-07 ***
## black -0.188350 0.037667 -5.000 6.84e-07 ***
## south -0.090904 0.026249 -3.463 0.000558 ***
## urban 0.183912 0.026958 6.822 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3655 on 927 degrees of freedom
## Multiple R-squared: 0.2526, Adjusted R-squared: 0.2469
## F-statistic: 44.75 on 7 and 927 DF, p-value: < 2.2e-16
model4 <- lm(log(wage) ~ educ +exper + I(exper^2) + tenure + I(tenure^2)
+ married + black + south + urban, data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + I(exper^2) + tenure +
## I(tenure^2) + married + black + south + urban, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98236 -0.21972 -0.00036 0.24078 1.25127
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.3586756 0.1259143 42.558 < 2e-16 ***
## educ 0.0642761 0.0063115 10.184 < 2e-16 ***
## exper 0.0172146 0.0126138 1.365 0.172665
## I(exper^2) -0.0001138 0.0005319 -0.214 0.830622
## tenure 0.0249291 0.0081297 3.066 0.002229 **
## I(tenure^2) -0.0007964 0.0004710 -1.691 0.091188 .
## married 0.1985470 0.0391103 5.077 4.65e-07 ***
## black -0.1906636 0.0377011 -5.057 5.13e-07 ***
## south -0.0912153 0.0262356 -3.477 0.000531 ***
## urban 0.1854241 0.0269585 6.878 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3653 on 925 degrees of freedom
## Multiple R-squared: 0.255, Adjusted R-squared: 0.2477
## F-statistic: 35.17 on 9 and 925 DF, p-value: < 2.2e-16
linearHypothesis(model4, c("I(exper^2)=0", "I(tenure^2)=0" ))
##
## Linear hypothesis test:
## I(exper^2) = 0
## I(tenure^2) = 0
##
## Model 1: restricted model
## Model 2: log(wage) ~ educ + exper + I(exper^2) + tenure + I(tenure^2) +
## married + black + south + urban
##
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 927 123.82
## 2 925 123.42 2 0.39756 1.4898 0.226
F= ((0.255-0.2526)/2)/((1-0.255)/925)
F
## [1] 1.489933
1-pf(F,2,925)
## [1] 0.2259282
model4 <- lm(log(wage) ~ educ +exper +tenure + married
+ black + south + urban + I(black*educ), data = wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + I(black * educ), data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## black 0.094809 0.255399 0.371 0.710561
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## I(black * educ) -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
model4 <- lm(log(wage)~educ+exper+tenure+married+black+south+urban+educ*black,data= wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + educ * black, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## black 0.094809 0.255399 0.371 0.710561
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
model_7C2_4 <- lm(log(wage)~educ+exper+tenure+married+black+south+urban+married*black,data= wage2)
summary(model4)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure + married + black +
## south + urban + educ * black, data = wage2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97782 -0.21832 0.00475 0.24136 1.23226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.374817 0.114703 46.859 < 2e-16 ***
## educ 0.067115 0.006428 10.442 < 2e-16 ***
## exper 0.013826 0.003191 4.333 1.63e-05 ***
## tenure 0.011787 0.002453 4.805 1.80e-06 ***
## married 0.198908 0.039047 5.094 4.25e-07 ***
## black 0.094809 0.255399 0.371 0.710561
## south -0.089450 0.026277 -3.404 0.000692 ***
## urban 0.183852 0.026955 6.821 1.63e-11 ***
## educ:black -0.022624 0.020183 -1.121 0.262603
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3654 on 926 degrees of freedom
## Multiple R-squared: 0.2536, Adjusted R-squared: 0.2471
## F-statistic: 39.32 on 8 and 926 DF, p-value: < 2.2e-16
Married Black individuals are represented by married = 1 and black = 1, while married non-Black individuals are represented by married = 1 and black = 0. The difference in monthly salary between these two groups is the sum of the coefficients for the black variable and the married × black interaction term: -0.2408 + 0.0614 = -0.1794.
This suggests that, on average, married Black individuals earn 17.94% less in monthly salary compared to married non-Black individuals.
The OLS estimators, bj, are inconsistent: False. Heteroskedasticity does not make the OLS estimators inconsistent. The estimators remain consistent, but they are no longer efficient. This means that OLS will still produce correct estimates of the coefficients in large samples, but they will not have the minimum variance among all unbiased estimators.
The usual F statistic no longer has an F distribution: True. In the presence of heteroskedasticity, the usual F-statistic can no longer be relied upon to follow an F-distribution. The distribution of the test statistic may be distorted, leading to incorrect conclusions in hypothesis testing. This is why robust standard errors are often used in such cases.
The OLS estimators are no longer BLUE: True. Heteroskedasticity violates one of the Gauss-Markov assumptions, specifically the assumption of homoscedasticity (constant variance of errors). As a result, OLS estimators are no longer the Best Linear Unbiased Estimators (BLUE). They remain unbiased, but they are no longer the most efficient estimators. Techniques like Generalized Least Squares (GLS) or using robust standard errors can provide more efficient estimates in the presence of heteroskedasticity.
data4 <- wooldridge::smoke
head(data4)
## educ cigpric white age income cigs restaurn lincome agesq lcigpric
## 1 16.0 60.506 1 46 20000 0 0 9.903487 2116 4.102743
## 2 16.0 57.883 1 40 30000 0 0 10.308952 1600 4.058424
## 3 12.0 57.664 1 58 30000 3 0 10.308952 3364 4.054633
## 4 13.5 57.883 1 30 20000 0 0 9.903487 900 4.058424
## 5 10.0 58.320 1 17 20000 0 0 9.903487 289 4.065945
## 6 6.0 59.340 1 86 6500 0 0 8.779557 7396 4.083283
model11 <- lm(cigs ~ log(cigpric) + log(income) + educ + age + agesq + restaurn + white , data = data4)
summary(model11)
##
## Call:
## lm(formula = cigs ~ log(cigpric) + log(income) + educ + age +
## agesq + restaurn + white, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.772 -9.330 -5.907 7.945 70.275
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682419 24.220729 -0.111 0.91184
## log(cigpric) -0.850907 5.782321 -0.147 0.88305
## log(income) 0.869014 0.728763 1.192 0.23344
## educ -0.501753 0.167168 -3.001 0.00277 **
## age 0.774502 0.160516 4.825 1.68e-06 ***
## agesq -0.009069 0.001748 -5.188 2.70e-07 ***
## restaurn -2.865621 1.117406 -2.565 0.01051 *
## white -0.559236 1.459461 -0.383 0.70169
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.41 on 799 degrees of freedom
## Multiple R-squared: 0.05291, Adjusted R-squared: 0.04461
## F-statistic: 6.377 on 7 and 799 DF, p-value: 2.588e-07
s_error <- coef(summary(model11))
s_error
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.682418774 24.220728831 -0.1107489 9.118433e-01
## log(cigpric) -0.850907441 5.782321084 -0.1471567 8.830454e-01
## log(income) 0.869013971 0.728763480 1.1924499 2.334389e-01
## educ -0.501753247 0.167167689 -3.0014966 2.770186e-03
## age 0.774502156 0.160515805 4.8250835 1.676279e-06
## agesq -0.009068603 0.001748055 -5.1878253 2.699355e-07
## restaurn -2.865621212 1.117405936 -2.5645301 1.051320e-02
## white -0.559236403 1.459461034 -0.3831801 7.016882e-01
age <- -coef(model11)["age"] / (2 * coef(model11)["I(age^2)"])
age
## age
## NA
coef_res <- coef(model11)["restaurn"]
coef_res
## restaurn
## -2.865621
library(lmtest)
data_update <- data.frame(cigpric = 67.44, income = 6500, educ = 16, age = 77, restaurn = 0, white = 0)
het_test <- bptest(model11)
het_test
##
## studentized Breusch-Pagan test
##
## data: model11
## BP = 32.377, df = 7, p-value = 3.458e-05
The equation for smoking is: smokes = −0.656 + 0.069 × log(67.44) + 0.012 × log(6,500) − 0.029 × (16) + 0.020 × (77) + 0.00026 × (77²) − 0.0052.
When plugging in the values, the estimated probability of smoking for this individual is very close to zero. This suggests that the individual is unlikely to smoke, and in fact, the person isn’t a smoker, which confirms that the equation’s prediction aligns well with this specific observation.
data5 <- wooldridge::vote1
head(data5)
## state district democA voteA expendA expendB prtystrA lexpendA lexpendB
## 1 AL 7 1 68 328.296 8.737 41 5.793916 2.167567
## 2 AK 1 0 62 626.377 402.477 60 6.439952 5.997638
## 3 AZ 2 1 73 99.607 3.065 55 4.601233 1.120048
## 4 AZ 3 0 69 319.690 26.281 64 5.767352 3.268846
## 5 AR 3 0 75 159.221 60.054 66 5.070293 4.095244
## 6 AR 4 1 69 570.155 21.393 46 6.345908 3.063064
## shareA
## 1 97.40767
## 2 60.88104
## 3 97.01476
## 4 92.40370
## 5 72.61247
## 6 96.38355
data ("vote1")
model5 <- lm (voteA ~ prtystrA + democA + lexpendA + lexpendB, data = vote1)
summary(model5)
##
## Call:
## lm(formula = voteA ~ prtystrA + democA + lexpendA + lexpendB,
## data = vote1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.576 -4.864 -1.146 4.903 24.566
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.66142 4.73604 7.952 2.56e-13 ***
## prtystrA 0.25192 0.07129 3.534 0.00053 ***
## democA 3.79294 1.40652 2.697 0.00772 **
## lexpendA 5.77929 0.39182 14.750 < 2e-16 ***
## lexpendB -6.23784 0.39746 -15.694 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.573 on 168 degrees of freedom
## Multiple R-squared: 0.8012, Adjusted R-squared: 0.7964
## F-statistic: 169.2 on 4 and 168 DF, p-value: < 2.2e-16
data5 <- data5 %>% mutate(residual_squared=(model5$residuals)^2)
head(data5)
## state district democA voteA expendA expendB prtystrA lexpendA lexpendB
## 1 AL 7 1 68 328.296 8.737 41 5.793916 2.167567
## 2 AK 1 0 62 626.377 402.477 60 6.439952 5.997638
## 3 AZ 2 1 73 99.607 3.065 55 4.601233 1.120048
## 4 AZ 3 0 69 319.690 26.281 64 5.767352 3.268846
## 5 AR 3 0 75 159.221 60.054 66 5.070293 4.095244
## 6 AR 4 1 69 570.155 21.393 46 6.345908 3.063064
## shareA residual_squared
## 1 97.40767 14.038468
## 2 60.88104 88.688062
## 3 97.01476 3.667331
## 4 92.40370 5.176383
## 5 72.61247 287.464399
## 6 96.38355 2.593858
model5_r<- lm(residual_squared~prtystrA+democA+log(expendA)+log(expendB),data= data5)
summary(model5_r)
##
## Call:
## lm(formula = residual_squared ~ prtystrA + democA + log(expendA) +
## log(expendB), data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -94.88 -46.16 -23.51 15.84 508.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 113.9635 50.8150 2.243 0.0262 *
## prtystrA -0.2993 0.7649 -0.391 0.6961
## democA 15.6192 15.0912 1.035 0.3022
## log(expendA) -10.3057 4.2040 -2.451 0.0153 *
## log(expendB) -0.0514 4.2645 -0.012 0.9904
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.25 on 168 degrees of freedom
## Multiple R-squared: 0.05256, Adjusted R-squared: 0.03
## F-statistic: 2.33 on 4 and 168 DF, p-value: 0.05806
bptest(model5)
##
## studentized Breusch-Pagan test
##
## data: model5
## BP = 9.0934, df = 4, p-value = 0.05881
bptest (model5, ~ prtystrA*democA*lexpendA*lexpendB + I(prtystrA^2) + I(democA^2) + I(lexpendA^2) + I(lexpendB^2), data = vote1)
##
## studentized Breusch-Pagan test
##
## data: model5
## BP = 43.327, df = 18, p-value = 0.0007194
library(sandwich)
library(lmtest)
data6 <- wooldridge::fertil2
head(data6)
## mnthborn yearborn age electric radio tv bicycle educ ceb agefbrth children
## 1 5 64 24 1 1 1 1 12 0 NA 0
## 2 1 56 32 1 1 1 1 13 3 25 3
## 3 7 58 30 1 0 0 0 5 1 27 1
## 4 11 45 42 1 0 1 0 4 3 17 2
## 5 5 45 43 1 1 1 1 11 2 24 2
## 6 8 52 36 1 0 0 0 7 1 26 1
## knowmeth usemeth monthfm yearfm agefm idlnchld heduc agesq urban urb_educ
## 1 1 0 NA NA NA 2 NA 576 1 12
## 2 1 1 11 80 24 3 12 1024 1 13
## 3 1 0 6 83 24 5 7 900 1 5
## 4 1 0 1 61 15 3 11 1764 1 4
## 5 1 1 3 66 20 2 14 1849 1 11
## 6 1 1 11 76 24 4 9 1296 1 7
## spirit protest catholic frsthalf educ0 evermarr
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 1
## 3 1 0 0 0 0 1
## 4 0 0 0 0 0 1
## 5 0 1 0 1 0 1
## 6 0 0 0 0 0 1
model <- lm(children ~ age + I(age^2) + educ + electric + urban, data = fertil2)
robust_se <- sqrt(diag(vcovHC(model)))
summary_with_robust_se <- cbind(coef(model), "Robust SE" = robust_se)
summary(model)
##
## Call:
## lm(formula = children ~ age + I(age^2) + educ + electric + urban,
## data = fertil2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9012 -0.7136 -0.0039 0.7119 7.4318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.2225162 0.2401888 -17.580 < 2e-16 ***
## age 0.3409255 0.0165082 20.652 < 2e-16 ***
## I(age^2) -0.0027412 0.0002718 -10.086 < 2e-16 ***
## educ -0.0752323 0.0062966 -11.948 < 2e-16 ***
## electric -0.3100404 0.0690045 -4.493 7.20e-06 ***
## urban -0.2000339 0.0465062 -4.301 1.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.452 on 4352 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.5734, Adjusted R-squared: 0.5729
## F-statistic: 1170 on 5 and 4352 DF, p-value: < 2.2e-16
print(summary_with_robust_se)
## Robust SE
## (Intercept) -4.222516228 0.2443961935
## age 0.340925520 0.0192199445
## I(age^2) -0.002741209 0.0003513959
## educ -0.075232323 0.0063159137
## electric -0.310040409 0.0640737262
## urban -0.200033857 0.0455162364
coeftest(model6, vcov=vcovHC(model6, type = "HC0"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.2555544 0.3368805 3.7270 0.000284 ***
## PC 0.1518539 0.0596641 2.5451 0.012048 *
## hsGPA 0.4502203 0.0962508 4.6776 6.962e-06 ***
## ACT 0.0077242 0.0103074 0.7494 0.454932
## mothcoll -0.0037579 0.0601271 -0.0625 0.950258
## fathcoll 0.0417999 0.0578749 0.7222 0.471392
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
F= ((0.574-0.5734)/3)/((1-0.574)/4349)
F
## [1] 2.041784
critical_value = qf(0.95,3,4349, TRUE)
critical_value
## [1] 3.403585
p_value = 1- pf(F,3,4349)
p_value
## [1] 0.1058349
a <- coeftest(model, vcov = sandwich)
a
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.22251623 0.24368307 -17.3279 < 2.2e-16 ***
## age 0.34092552 0.01916146 17.7923 < 2.2e-16 ***
## I(age^2) -0.00274121 0.00035027 -7.8260 6.278e-15 ***
## educ -0.07523232 0.00630336 -11.9353 < 2.2e-16 ***
## electric -0.31004041 0.06390411 -4.8517 1.267e-06 ***
## urban -0.20003386 0.04543962 -4.4022 1.097e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1065, df = 5, p-value < 2.2e-16
usqr=(summary(model6)$residual)^2
yhat=model6$fitted.values
yhatsqr=model6$fitted.values^2
model6WT=summary(lm(usqr~yhat+yhatsqr,data=fertil2))$r.squared
model6WT*4361
## [1] 143.6236
1-pchisq(1093.183,2)
## [1] 0
joint_test <- coeftest(model6, vcov = vcovHC)
print(joint_test[, "Pr(>|t|)"])
## (Intercept) PC hsGPA ACT mothcoll fathcoll
## 5.903211e-04 1.682044e-02 1.977193e-05 4.846838e-01 9.528773e-01 4.948069e-01
p_value = 1- pf(F,3,4349)
p_value
## [1] 0.1058349
head(summary_with_robust_se)
## Robust SE
## (Intercept) -4.222516228 0.2443961935
## age 0.340925520 0.0192199445
## I(age^2) -0.002741209 0.0003513959
## educ -0.075232323 0.0063159137
## electric -0.310040409 0.0640737262
## urban -0.200033857 0.0455162364
fv <- fitted(model6)
head(fv,6)
## 1 2 3 4 5 6
## 2.768423 2.919682 3.115218 3.039878 3.269490 3.021208
residuals <- resid(model6)
head(residuals,6)
## 1 2 3 4 5 6
## 0.23157702 0.48031850 -0.11521801 0.46012183 0.33050949 -0.02120773
hetero_test <- lm(residuals^2 ~ fv)
summary(hetero_test)
##
## Call:
## lm(formula = residuals^2 ~ fv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.13813 -0.07873 -0.04322 0.04132 0.51313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.27680 0.18877 -1.466 0.1448
## fv 0.12558 0.06165 2.037 0.0436 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.128 on 139 degrees of freedom
## Multiple R-squared: 0.02898, Adjusted R-squared: 0.022
## F-statistic: 4.149 on 1 and 139 DF, p-value: 0.04357
The hypotheses are:
H₀: There is no heteroskedasticity
H₁: There is heteroskedasticity
Since the p-value is below the significance level (α) at a 95% confidence level, we have sufficient statistical evidence to reject the null hypothesis, indicating that heteroskedasticity is present in the model.
No, since the difference between the robust and non robust standard errors was really small.
data7 <- wooldridge::ceosal2
head(data7)
## salary age college grad comten ceoten sales profits mktval lsalary lsales
## 1 1161 49 1 1 9 2 6200 966 23200 7.057037 8.732305
## 2 600 43 1 1 10 10 283 48 1100 6.396930 5.645447
## 3 379 51 1 1 9 3 169 40 1100 5.937536 5.129899
## 4 651 55 1 0 22 22 1100 -54 1000 6.478509 7.003066
## 5 497 44 1 1 8 6 351 28 387 6.208590 5.860786
## 6 1067 64 1 1 7 7 19000 614 3900 6.972606 9.852194
## lmktval comtensq ceotensq profmarg
## 1 10.051908 81 4 15.580646
## 2 7.003066 100 100 16.961130
## 3 7.003066 81 9 23.668638
## 4 6.907755 484 484 -4.909091
## 5 5.958425 64 36 7.977208
## 6 8.268732 49 49 3.231579
model11 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceoten + comten, data = data7)
summary(model11)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceoten + comten, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5436 -0.2796 -0.0164 0.2857 1.9879
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.571977 0.253466 18.038 < 2e-16 ***
## log(sales) 0.187787 0.040003 4.694 5.46e-06 ***
## log(mktval) 0.099872 0.049214 2.029 0.04397 *
## profmarg -0.002211 0.002105 -1.050 0.29514
## ceoten 0.017104 0.005540 3.087 0.00236 **
## comten -0.009238 0.003337 -2.768 0.00626 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4947 on 171 degrees of freedom
## Multiple R-squared: 0.3525, Adjusted R-squared: 0.3336
## F-statistic: 18.62 on 5 and 171 DF, p-value: 9.488e-15
r_squared <- summary(model11)$r.squared
r_squared
## [1] 0.3525374
model12 <- lm(log(salary) ~ log(sales) + log(mktval) + profmarg + ceotensq + comtensq, data = data7)
summary(model12)
##
## Call:
## lm(formula = log(salary) ~ log(sales) + log(mktval) + profmarg +
## ceotensq + comtensq, data = data7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.47481 -0.25933 -0.00511 0.27010 2.07583
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.612e+00 2.524e-01 18.276 < 2e-16 ***
## log(sales) 1.805e-01 4.021e-02 4.489 1.31e-05 ***
## log(mktval) 1.018e-01 4.988e-02 2.040 0.0429 *
## profmarg -2.077e-03 2.135e-03 -0.973 0.3321
## ceotensq 3.761e-04 1.916e-04 1.963 0.0512 .
## comtensq -1.788e-04 7.236e-05 -2.471 0.0144 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5024 on 171 degrees of freedom
## Multiple R-squared: 0.3324, Adjusted R-squared: 0.3129
## F-statistic: 17.03 on 5 and 171 DF, p-value: 1.195e-13
r_squared <- summary(model12)$r.squared
r_squared
## [1] 0.3323998
data9 <- wooldridge::campus
head(data9)
## enroll priv police crime lcrime lenroll lpolice
## 1 21836 0 24 446 6.100319 9.991315 3.178054
## 2 6485 0 13 1 0.000000 8.777247 2.564949
## 3 2123 0 3 1 0.000000 7.660585 1.098612
## 4 8240 0 17 121 4.795791 9.016756 2.833213
## 5 19793 0 30 470 6.152733 9.893084 3.401197
## 6 3256 1 9 25 3.218876 8.088255 2.197225
b0 <- -6.63
se_b0 <- 1.03
b1 <- 1.27
se_b1 <- 0.11
n <- nrow(data9)
t_stat <- (b1 - 1) / se_b1
df <- n - 2
critical_value <- qt(0.95, df)
if (t_stat > critical_value) {
cat("Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.\n")
} else {
cat("Fail to reject the null hypothesis H0: B1 = 1 at the 5% level.\n")
}
## Reject the null hypothesis H0: B1 = 1 in favor of H1: B1 > 1 at the 5% level.
data ("infmrt")
model9 <- lm(infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt )
summary(model9)
##
## Call:
## lm(formula = infmort ~ lpcinc + lphysic + lpopul + DC, data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6894 -0.8973 -0.1412 0.7054 3.1705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.2125 6.7267 5.383 5.09e-07 ***
## lpcinc -2.3964 0.8866 -2.703 0.00812 **
## lphysic -1.5548 0.7734 -2.010 0.04718 *
## lpopul 0.5755 0.1365 4.215 5.60e-05 ***
## DC 13.9632 1.2466 11.201 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.255 on 97 degrees of freedom
## Multiple R-squared: 0.6432, Adjusted R-squared: 0.6285
## F-statistic: 43.72 on 4 and 97 DF, p-value: < 2.2e-16
model9 <- lm(infmort~log(pcinc)+log(physic)+log(popul), data=infmrt)
summary(model9)
##
## Call:
## lm(formula = infmort ~ log(pcinc) + log(physic) + log(popul),
## data = infmrt)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0204 -1.1897 -0.1152 0.9083 8.1890
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.22614 10.13466 3.574 0.000547 ***
## log(pcinc) -4.88415 1.29319 -3.777 0.000273 ***
## log(physic) 4.02783 0.89101 4.521 1.73e-05 ***
## log(popul) -0.05356 0.18748 -0.286 0.775712
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.891 on 98 degrees of freedom
## Multiple R-squared: 0.1818, Adjusted R-squared: 0.1568
## F-statistic: 7.26 on 3 and 98 DF, p-value: 0.0001898
library(wooldridge)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ car::recode() masks dplyr::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(quantreg)
## Loading required package: SparseM
data <- as.data.frame(rdchem)
data$sales_billion <- data$sales / 1e3
ols_full <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data)
summary(ols_full)
##
## Call:
## lm(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_billion 0.316611 0.138854 2.280 0.03041 *
## I(sales_billion^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
data_filtered <- data %>% filter(sales_billion < 40)
ols_filtered <- lm(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered)
summary(ols_filtered)
##
## Call:
## lm(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, data = data_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0371 -1.1238 -0.4547 0.7165 5.8522
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.058967 0.626269 3.288 0.00272 **
## sales_billion 0.316611 0.138854 2.280 0.03041 *
## I(sales_billion^2) -0.007390 0.003716 -1.989 0.05657 .
## profmarg 0.053322 0.044210 1.206 0.23787
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.774 on 28 degrees of freedom
## Multiple R-squared: 0.1905, Adjusted R-squared: 0.1037
## F-statistic: 2.196 on 3 and 28 DF, p-value: 0.1107
lad_full <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data, tau = 0.5)
summary(lad_full)
##
## Call: rq(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, tau = 0.5, data = data)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_billion 0.26346 -0.13508 0.75753
## I(sales_billion^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
lad_filtered <- rq(rdintens ~ sales_billion + I(sales_billion^2) + profmarg, data = data_filtered, tau = 0.5)
summary(lad_filtered)
##
## Call: rq(formula = rdintens ~ sales_billion + I(sales_billion^2) +
## profmarg, tau = 0.5, data = data_filtered)
##
## tau: [1] 0.5
##
## Coefficients:
## coefficients lower bd upper bd
## (Intercept) 1.40428 0.87031 2.66628
## sales_billion 0.26346 -0.13508 0.75753
## I(sales_billion^2) -0.00600 -0.01679 0.00344
## profmarg 0.11400 0.01376 0.16427
comparison <- data.frame(
Method = c("OLS (Full)", "OLS (Filtered)", "LAD (Full)", "LAD (Filtered)"),
Intercept = c(coef(ols_full)[1], coef(ols_filtered)[1], coef(lad_full)[1], coef(lad_filtered)[1]),
Sales = c(coef(ols_full)[2], coef(ols_filtered)[2], coef(lad_full)[2], coef(lad_filtered)[2]),
Sales_Squared = c(coef(ols_full)[3], coef(ols_filtered)[3], coef(lad_full)[3], coef(lad_filtered)[3]),
Profmarg = c(coef(ols_full)[4], coef(ols_filtered)[4], coef(lad_full)[4], coef(lad_filtered)[4])
)
print(comparison)
## Method Intercept Sales Sales_Squared Profmarg
## 1 OLS (Full) 2.058967 0.3166110 -0.007389624 0.05332217
## 2 OLS (Filtered) 2.058967 0.3166110 -0.007389624 0.05332217
## 3 LAD (Full) 1.404279 0.2634638 -0.006001127 0.11400366
## 4 LAD (Filtered) 1.404279 0.2634638 -0.006001127 0.11400366
When modeling quarterly data on new housing starts, interest rates, and real per capita income, it is crucial to account for potential trends and seasonality in the variables. A common approach is to use a time series model, such as a SARIMA (Seasonal Autoregressive Integrated Moving Average) model. Here’s how you might specify such a model:
Let:
- y(t) = Housing starts at time t
- x(1,t) = Interest rates at time t
- x(2,t) = Real per capita income at time t
A simple SARIMA model with a linear trend, seasonality, and exogenous variables could be specified as:
y(t) = b0 + b1t + b2x(1,t) + b3x(2,t) + e(t)
Where:
- b0 is the intercept
- b1 represents the linear trend over time
- b2 and b3 represent the impact of interest rates and real per capita income on housing starts, respectively
- e(t) is the error term
data_10C1 <- intdef %>% mutate(later1979= ifelse(year>1979,1,0))
head(data_10C1,5)
## year i3 inf rec out def i3_1 inf_1 def_1 ci3 cinf
## 1 1948 1.04 8.1 16.2 11.6 -4.6000004 NA NA NA NA NA
## 2 1949 1.10 -1.2 14.5 14.3 -0.1999998 1.04 8.1 -4.6000004 0.06000006 -9.3
## 3 1950 1.22 1.3 14.4 15.6 1.2000008 1.10 -1.2 -0.1999998 0.12000000 2.5
## 4 1951 1.55 7.9 16.1 14.2 -1.9000006 1.22 1.3 1.2000008 0.32999992 6.6
## 5 1952 1.77 1.9 19.0 19.4 0.3999996 1.55 7.9 -1.9000006 0.22000003 -6.0
## cdef y77 later1979
## 1 NA 0 0
## 2 4.400001 0 0
## 3 1.400001 0 0
## 4 -3.100001 0 0
## 5 2.300000 0 0
model_10C1 <- lm(i3~inf+def+later1979, data=data_10C1)
summary(model_10C1)
##
## Call:
## lm(formula = i3 ~ inf + def + later1979, data = data_10C1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4674 -0.8407 0.2388 1.0148 3.9654
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.29623 0.42535 3.047 0.00362 **
## inf 0.60842 0.07625 7.979 1.37e-10 ***
## def 0.36266 0.12025 3.016 0.00396 **
## later1979 1.55877 0.50577 3.082 0.00329 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.711 on 52 degrees of freedom
## Multiple R-squared: 0.6635, Adjusted R-squared: 0.6441
## F-statistic: 34.18 on 3 and 52 DF, p-value: 2.408e-12
data_10C1 <- as.integer(data_10C1$year > 1979)
data13 <- wooldridge::intdef
head(data13)
## year i3 inf rec out def i3_1 inf_1 def_1 ci3 cinf
## 1 1948 1.04 8.1 16.2 11.6 -4.6000004 NA NA NA NA NA
## 2 1949 1.10 -1.2 14.5 14.3 -0.1999998 1.04 8.1 -4.6000004 0.06000006 -9.3
## 3 1950 1.22 1.3 14.4 15.6 1.2000008 1.10 -1.2 -0.1999998 0.12000000 2.5
## 4 1951 1.55 7.9 16.1 14.2 -1.9000006 1.22 1.3 1.2000008 0.32999992 6.6
## 5 1952 1.77 1.9 19.0 19.4 0.3999996 1.55 7.9 -1.9000006 0.22000003 -6.0
## 6 1953 1.93 0.8 18.7 20.4 1.6999989 1.77 1.9 0.3999996 0.15999997 -1.1
## cdef y77
## 1 NA 0
## 2 4.400001 0
## 3 1.400001 0
## 4 -3.100001 0
## 5 2.300000 0
## 6 1.299999 0
data13$dummy <- as.integer(data13$year > 1979)
data13$dummy
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1
## [39] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
model15 <- lm(i3 ~ inf + def , data = data13)
summary(model15)
##
## Call:
## lm(formula = i3 ~ inf + def, data = data13)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.9948 -1.1694 0.1959 0.9602 4.7224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.73327 0.43197 4.012 0.00019 ***
## inf 0.60587 0.08213 7.376 1.12e-09 ***
## def 0.51306 0.11838 4.334 6.57e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.843 on 53 degrees of freedom
## Multiple R-squared: 0.6021, Adjusted R-squared: 0.5871
## F-statistic: 40.09 on 2 and 53 DF, p-value: 2.483e-11
model16 <- lm(i3 ~ inf + def + y77 , data = data13)
summary(model16)
##
## Call:
## lm(formula = i3 ~ inf + def + y77, data = data13)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4048 -0.9632 0.2192 0.8497 4.3447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.40531 0.42239 3.327 0.00162 **
## inf 0.56884 0.07832 7.263 1.88e-09 ***
## def 0.36276 0.12337 2.940 0.00488 **
## y77 1.47773 0.52349 2.823 0.00673 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.733 on 52 degrees of freedom
## Multiple R-squared: 0.6549, Adjusted R-squared: 0.635
## F-statistic: 32.9 on 3 and 52 DF, p-value: 4.608e-12
# Load necessary library
library(wooldridge)
# Load the dataset 'volat'
data("volat")
# Display the structure of the dataset to verify variable names
str(volat)
## 'data.frame': 558 obs. of 17 variables:
## $ date : num 1947 1947 1947 1947 1947 ...
## $ sp500 : num 15.2 15.8 15.2 14.6 14.3 ...
## $ divyld: num 4.49 4.38 4.61 4.75 5.05 ...
## $ i3 : num 0.38 0.38 0.38 0.38 0.38 ...
## $ ip : num 22.4 22.5 22.6 22.5 22.6 ...
## $ pcsp : num NA 46.5 -48.6 -44.3 -21.4 ...
## $ rsp500: num NA 50.9 -44 -39.6 -16.3 ...
## $ pcip : num NA 5.36 5.33 -5.31 5.33 ...
## $ ci3 : num NA 0 0 0 0 ...
## $ ci3_1 : num NA NA 0 0 0 ...
## $ ci3_2 : num NA NA NA 0 0 ...
## $ pcip_1: num NA NA 5.36 5.33 -5.31 ...
## $ pcip_2: num NA NA NA 5.36 5.33 ...
## $ pcip_3: num NA NA NA NA 5.36 ...
## $ pcsp_1: num NA NA 46.5 -48.6 -44.3 ...
## $ pcsp_2: num NA NA NA 46.5 -48.6 ...
## $ pcsp_3: num NA NA NA NA 46.5 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Check the first few rows to confirm the data
head(volat)
## date sp500 divyld i3 ip pcsp rsp500 pcip ci3 ci3_1 ci3_2
## 1 1947.01 15.21 4.49 0.38 22.4 NA NA NA NA NA NA
## 2 1947.02 15.80 4.38 0.38 22.5 46.54833 50.92833 5.357163 0 NA NA
## 3 1947.03 15.16 4.61 0.38 22.6 -48.60762 -43.99762 5.333354 0 0 NA
## 4 1947.04 14.60 4.75 0.38 22.5 -44.32714 -39.57714 -5.309754 0 0 0
## 5 1947.05 14.34 5.05 0.38 22.6 -21.36988 -16.31988 5.333354 0 0 0
## 6 1947.06 14.84 5.03 0.38 22.6 41.84100 46.87100 0.000000 0 0 0
## pcip_1 pcip_2 pcip_3 pcsp_1 pcsp_2 pcsp_3
## 1 NA NA NA NA NA NA
## 2 NA NA NA NA NA NA
## 3 5.357163 NA NA 46.54833 NA NA
## 4 5.333354 5.357163 NA -48.60762 46.54833 NA
## 5 -5.309754 5.333354 5.357163 -44.32714 -48.60762 46.54833
## 6 5.333354 -5.309754 5.333354 -21.36988 -44.32714 -48.60762
# Estimate the equation rsp500_t = β0 + β1 * pcip_t + β2 * i3_t + u_t using OLS
model <- lm(rsp500 ~ pcip + i3, data = volat)
# Summarize the results of the model
summary(model)
##
## Call:
## lm(formula = rsp500 ~ pcip + i3, data = volat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.871 -22.580 2.103 25.524 138.137
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.84306 3.27488 5.754 1.44e-08 ***
## pcip 0.03642 0.12940 0.281 0.7785
## i3 -1.36169 0.54072 -2.518 0.0121 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40.13 on 554 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.01189, Adjusted R-squared: 0.008325
## F-statistic: 3.334 on 2 and 554 DF, p-value: 0.03637
Interpretation:
1. Look at the coefficients and their signs in the output.
2. Check the p-values for statistical significance of pcip and i3.
3. Use the results to discuss whether rsp500 is predictable based on the predictors.
# Load the wooldridge package
library(wooldridge)
# Load the CONSUMP dataset
data("consump", package = "wooldridge")
# View the structure of the dataset
str(consump)
## 'data.frame': 37 obs. of 24 variables:
## $ year : int 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 ...
## $ i3 : num 3.41 2.93 2.38 2.78 3.16 ...
## $ inf : num 0.7 1.7 1 1 1.3 ...
## $ rdisp : num 1530 1565 1616 1694 1756 ...
## $ rnondc : num 606 615 627 646 660 ...
## $ rserv : num 687 717 746 783 819 ...
## $ pop : num 177830 180671 183691 186538 189242 ...
## $ y : num 8604 8664 8796 9080 9276 ...
## $ rcons : num 1294 1333 1373 1430 1479 ...
## $ c : num 7275 7377 7476 7665 7814 ...
## $ r3 : num 2.71 1.23 1.38 1.78 1.86 ...
## $ lc : num 8.89 8.91 8.92 8.94 8.96 ...
## $ ly : num 9.06 9.07 9.08 9.11 9.14 ...
## $ gc : num NA 0.0139 0.0133 0.0251 0.0192 ...
## $ gy : num NA 0.00696 0.01511 0.03171 0.02145 ...
## $ gc_1 : num NA NA 0.0139 0.0133 0.0251 ...
## $ gy_1 : num NA NA 0.00696 0.01511 0.03171 ...
## $ r3_1 : num NA 2.71 1.23 1.38 1.78 ...
## $ lc_ly : num -0.168 -0.161 -0.163 -0.169 -0.172 ...
## $ lc_ly_1: num NA -0.168 -0.161 -0.163 -0.169 ...
## $ gc_2 : num NA NA NA 0.0139 0.0133 ...
## $ gy_2 : num NA NA NA 0.00696 0.01511 ...
## $ r3_2 : num NA NA 2.71 1.23 1.38 ...
## $ lc_ly_2: num NA NA -0.168 -0.161 -0.163 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# Generate growth in consumption (gc) as specified
consump$gc <- log(consump$c) - log(dplyr::lag(consump$c, n = 1))
# Run regression: gc_t = β0 + β1 * gc_t-1 + u_t
model1 <- lm(gc ~ dplyr::lag(gc, n = 1), data = consump)
summary(model1)
##
## Call:
## lm(formula = gc ~ dplyr::lag(gc, n = 1), data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027878 -0.005974 -0.001451 0.007142 0.020227
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.011431 0.003778 3.026 0.00478 **
## dplyr::lag(gc, n = 1) 0.446141 0.156047 2.859 0.00731 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01161 on 33 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.1985, Adjusted R-squared: 0.1742
## F-statistic: 8.174 on 1 and 33 DF, p-value: 0.00731
# Include lagged variables of gy, i3, and inf
consump$gy_lag <- dplyr::lag(consump$gy, n = 1)
consump$i3_lag <- dplyr::lag(consump$i3, n = 1)
consump$inf_lag <- dplyr::lag(consump$inf, n = 1)
# Run regression with additional variables
model2 <- lm(gc ~ dplyr::lag(gc, n = 1) + gy_lag + i3_lag + inf_lag, data = consump)
summary(model2)
##
## Call:
## lm(formula = gc ~ dplyr::lag(gc, n = 1) + gy_lag + i3_lag + inf_lag,
## data = consump)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0249093 -0.0075869 0.0000855 0.0087234 0.0188617
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0225940 0.0070892 3.187 0.00335 **
## dplyr::lag(gc, n = 1) 0.4335832 0.2896525 1.497 0.14487
## gy_lag -0.1079087 0.1946371 -0.554 0.58341
## i3_lag -0.0007467 0.0011107 -0.672 0.50654
## inf_lag -0.0008281 0.0010041 -0.825 0.41606
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01134 on 30 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3038, Adjusted R-squared: 0.211
## F-statistic: 3.273 on 4 and 30 DF, p-value: 0.02431
# Check p-value of lag(gc)
p_value_lag_gc <- summary(model2)$coefficients["dplyr::lag(gc, n = 1)", "Pr(>|t|)"]
# Perform an F-test for joint significance of all explanatory variables
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: gc ~ dplyr::lag(gc, n = 1)
## Model 2: gc ~ dplyr::lag(gc, n = 1) + gy_lag + i3_lag + inf_lag
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 33 0.0044446
## 2 30 0.0038608 3 0.00058384 1.5122 0.2315
library(wooldridge)
library(lmtest)
library(sandwich)
library(ggplot2)
library(tseries)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
data("minwage")
# Focus on the relevant variables
# gwage232: growth in wage (sector 232)
# gemp232: growth in employment (sector 232)
# gmwage: growth in federal minimum wage
# gcpi: growth in CPI
# Create lagged variables
minwage$gwage232_lag1 <- c(NA, minwage$gwage232[-nrow(minwage)]) # Lagged gwage232
minwage$gemp232_lag1 <- c(NA, minwage$gemp232[-nrow(minwage)]) # Lagged gemp232
# Remove rows with NA values
minwage_clean <- na.omit(minwage)
acf_gwage <- acf(minwage_clean$gwage232, lag.max = 1, plot = FALSE) # ACF
first_order_autocorrelation <- acf_gwage$acf[2] # First lag
cat("First-order autocorrelation in gwage232:", first_order_autocorrelation, "\n")
## First-order autocorrelation in gwage232: -0.02425413
model_dynamic <- lm(gwage232 ~ gwage232_lag1 + gmwage + gcpi, data = minwage_clean)
summary(model_dynamic)
##
## Call:
## lm(formula = gwage232 ~ gwage232_lag1 + gmwage + gcpi, data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044649 -0.004114 -0.001262 0.004481 0.041568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0023648 0.0004295 5.506 5.45e-08 ***
## gwage232_lag1 -0.0684816 0.0343986 -1.991 0.0470 *
## gmwage 0.1517511 0.0095115 15.955 < 2e-16 ***
## gcpi 0.2586795 0.0858602 3.013 0.0027 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007775 on 595 degrees of freedom
## Multiple R-squared: 0.3068, Adjusted R-squared: 0.3033
## F-statistic: 87.79 on 3 and 595 DF, p-value: < 2.2e-16
# Check if gmwage has a significant effect while controlling for other variables
cat("Effect of gmwage (contemporaneous effect):\n")
## Effect of gmwage (contemporaneous effect):
print(summary(model_dynamic)$coefficients["gmwage", ])
## Estimate Std. Error t value Pr(>|t|)
## 1.517511e-01 9.511457e-03 1.595456e+01 5.753131e-48
model_with_lagged_employment <- lm(gwage232 ~ gwage232_lag1 + gmwage + gcpi + gemp232_lag1, data = minwage_clean)
summary(model_with_lagged_employment)
##
## Call:
## lm(formula = gwage232 ~ gwage232_lag1 + gmwage + gcpi + gemp232_lag1,
## data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.043900 -0.004316 -0.000955 0.004255 0.042430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0023960 0.0004254 5.633 2.74e-08 ***
## gwage232_lag1 -0.0656875 0.0340720 -1.928 0.054344 .
## gmwage 0.1525470 0.0094213 16.192 < 2e-16 ***
## gcpi 0.2536899 0.0850342 2.983 0.002968 **
## gemp232_lag1 0.0606620 0.0169691 3.575 0.000379 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007699 on 594 degrees of freedom
## Multiple R-squared: 0.3214, Adjusted R-squared: 0.3169
## F-statistic: 70.34 on 4 and 594 DF, p-value: < 2.2e-16
# Check if the lagged growth in employment is statistically significant
cat("Effect of lagged growth in employment (gemp232_lag1):\n")
## Effect of lagged growth in employment (gemp232_lag1):
print(summary(model_with_lagged_employment)$coefficients["gemp232_lag1", ])
## Estimate Std. Error t value Pr(>|t|)
## 0.0606619954 0.0169690920 3.5748521768 0.0003789121
model_without_lags <- lm(gwage232 ~ gmwage + gcpi, data = minwage_clean)
summary(model_without_lags)
##
## Call:
## lm(formula = gwage232 ~ gmwage + gcpi, data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.044501 -0.004053 -0.001328 0.004503 0.041194
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0021926 0.0004217 5.199 2.75e-07 ***
## gmwage 0.1506131 0.0095178 15.824 < 2e-16 ***
## gcpi 0.2374126 0.0854046 2.780 0.00561 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007794 on 596 degrees of freedom
## Multiple R-squared: 0.3022, Adjusted R-squared: 0.2999
## F-statistic: 129.1 on 2 and 596 DF, p-value: < 2.2e-16
# Compare coefficients of gmwage across models
cat("Coefficient of gmwage in model without lags:", summary(model_without_lags)$coefficients["gmwage", 1], "\n")
## Coefficient of gmwage in model without lags: 0.1506131
cat("Coefficient of gmwage in model with lags:", summary(model_with_lagged_employment)$coefficients["gmwage", 1], "\n")
## Coefficient of gmwage in model with lags: 0.152547
model_gmwage <- lm(gmwage ~ gwage232_lag1 + gemp232_lag1, data = minwage_clean)
summary(model_gmwage)
##
## Call:
## lm(formula = gmwage ~ gwage232_lag1 + gemp232_lag1, data = minwage_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.01987 -0.00511 -0.00385 -0.00290 0.62191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.003487 0.001465 2.380 0.0176 *
## gwage232_lag1 0.212051 0.146727 1.445 0.1489
## gemp232_lag1 -0.042776 0.073749 -0.580 0.5621
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03347 on 596 degrees of freedom
## Multiple R-squared: 0.004117, Adjusted R-squared: 0.0007754
## F-statistic: 1.232 on 2 and 596 DF, p-value: 0.2924
# Report R-squared of the regression
r_squared_gmwage <- summary(model_gmwage)$r.squared
cat("R-squared of gmwage regression:", r_squared_gmwage, "\n")
## R-squared of gmwage regression: 0.004117273
# Interpret R-squared value
cat("Interpretation: R-squared indicates how well the lagged variables explain variation in gmwage.\n")
## Interpretation: R-squared indicates how well the lagged variables explain variation in gmwage.
# Load necessary libraries
library(wooldridge) # For accessing Wooldridge datasets
library(lmtest) # For model diagnostics
library(sandwich) # For robust standard errors
library(tseries) # For ARCH models
library(ggplot2) # For plotting
# Load the NYSE dataset
data("nyse") # The dataset is part of the Wooldridge package
# Add lagged returns for the analysis
nyse$return_lag1 <- c(NA, nyse$return[-nrow(nyse)]) # Lagged return_t-1
nyse$return_lag2 <- c(NA, nyse$return_lag1[-nrow(nyse)]) # Lagged return_t-2
# Remove rows with NA values (introduced by lags)
nyse_clean <- na.omit(nyse)
model_12_47 <- lm(return ~ return_lag1, data = nyse_clean)
summary(model_12_47)
##
## Call:
## lm(formula = return ~ return_lag1, data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2633 -1.2997 0.1011 1.3203 8.0688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.17862 0.08084 2.210 0.0275 *
## return_lag1 0.05806 0.03811 1.523 0.1281
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.112 on 686 degrees of freedom
## Multiple R-squared: 0.003372, Adjusted R-squared: 0.001919
## F-statistic: 2.321 on 1 and 686 DF, p-value: 0.1281
# Calculate squared residuals
nyse_clean$residuals_squared <- residuals(model_12_47)^2
# Find average, minimum, and maximum of squared residuals
avg_res <- mean(nyse_clean$residuals_squared, na.rm = TRUE)
min_res <- min(nyse_clean$residuals_squared, na.rm = TRUE)
max_res <- max(nyse_clean$residuals_squared, na.rm = TRUE)
cat("Average of squared residuals:", avg_res, "\n")
## Average of squared residuals: 4.446345
cat("Minimum of squared residuals:", min_res, "\n")
## Minimum of squared residuals: 3.591354e-06
cat("Maximum of squared residuals:", max_res, "\n")
## Maximum of squared residuals: 232.9687
nyse_clean$return_lag1_sq <- nyse_clean$return_lag1^2 # Squared lagged return
hetero_model <- lm(residuals_squared ~ return_lag1 + return_lag1_sq, data = nyse_clean)
summary(hetero_model)
##
## Call:
## lm(formula = residuals_squared ~ return_lag1 + return_lag1_sq,
## data = nyse_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.545 -3.021 -1.974 0.695 221.544
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.25771 0.44131 7.382 4.54e-13 ***
## return_lag1 -0.78556 0.19626 -4.003 6.95e-05 ***
## return_lag1_sq 0.29749 0.03558 8.361 3.46e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.67 on 685 degrees of freedom
## Multiple R-squared: 0.1305, Adjusted R-squared: 0.128
## F-statistic: 51.41 on 2 and 685 DF, p-value: < 2.2e-16
# Extract coefficients, standard errors, R-squared, and adjusted R-squared
coefs <- coef(hetero_model)
std_errors <- sqrt(diag(vcov(hetero_model)))
r_squared <- summary(hetero_model)$r.squared
adj_r_squared <- summary(hetero_model)$adj.r.squared
cat("Coefficients:", coefs, "\n")
## Coefficients: 3.257706 -0.7855573 0.2974907
cat("Standard Errors:", std_errors, "\n")
## Standard Errors: 0.441314 0.1962611 0.0355787
cat("R-squared:", r_squared, "\n")
## R-squared: 0.1305106
cat("Adjusted R-squared:", adj_r_squared, "\n")
## Adjusted R-squared: 0.127972
nyse_clean$conditional_variance <- predict(hetero_model, newdata = nyse_clean)
ggplot(nyse_clean, aes(x = return_lag1, y = conditional_variance)) +
geom_line(color = "blue") +
labs(title = "Conditional Variance vs Lagged Return",
x = "Lagged Return (return_t-1)", y = "Conditional Variance") +
theme_minimal()
# Find value of return where variance is smallest
min_variance <- min(nyse_clean$conditional_variance, na.rm = TRUE)
cat("Minimum conditional variance:", min_variance, "\n")
## Minimum conditional variance: 2.739119
negative_variance <- sum(nyse_clean$conditional_variance < 0, na.rm = TRUE)
cat("Number of negative variance estimates:", negative_variance, "\n")
## Number of negative variance estimates: 0
arch1_model <- garch(nyse_clean$return, order = c(1, 0))
##
## ***** ESTIMATION WITH ANALYTICAL GRADIENT *****
##
##
## I INITIAL X(I) D(I)
##
## 1 4.244486e+00 1.000e+00
## 2 5.000000e-02 1.000e+00
##
## IT NF F RELDF PRELDF RELDX STPPAR D*STEP NPRELDF
## 0 1 8.604e+02
## 1 4 8.604e+02 1.04e-05 3.38e-05 1.1e-03 2.9e+02 1.0e-02 4.94e-03
## 2 5 8.604e+02 1.86e-06 1.97e-06 1.0e-03 1.3e+00 1.0e-02 1.99e-06
## 3 6 8.604e+02 4.21e-09 1.02e-07 1.5e-03 0.0e+00 1.3e-02 1.02e-07
## 4 7 8.604e+02 2.86e-09 9.16e-08 7.7e-04 1.8e+00 6.7e-03 1.13e-06
## 5 8 8.604e+02 1.89e-09 4.61e-08 3.9e-04 1.9e+00 3.4e-03 1.67e-06
## 6 11 8.604e+02 1.01e-10 5.29e-09 4.5e-05 2.5e+01 4.0e-04 1.62e-06
## 7 14 8.604e+02 3.62e-11 5.04e-10 4.1e-06 3.9e+00 3.8e-05 1.61e-06
## 8 15 8.604e+02 9.74e-12 2.51e-10 2.1e-06 4.2e+02 1.9e-05 1.61e-06
## 9 16 8.604e+02 3.57e-12 1.26e-10 1.1e-06 5.6e+03 9.4e-06 1.61e-06
## 10 17 8.604e+02 1.69e-12 6.27e-11 5.3e-07 1.9e+04 4.7e-06 1.61e-06
## 11 18 8.604e+02 8.28e-13 3.14e-11 2.7e-07 4.8e+04 2.4e-06 1.60e-06
## 12 19 8.604e+02 4.09e-13 1.57e-11 1.3e-07 1.0e+05 1.2e-06 1.60e-06
## 13 20 8.604e+02 2.04e-13 7.84e-12 6.7e-08 2.2e+05 5.9e-07 1.60e-06
## 14 21 8.604e+02 1.01e-13 3.92e-12 3.3e-08 4.4e+05 2.9e-07 1.60e-06
## 15 22 8.604e+02 5.23e-14 1.96e-12 1.7e-08 8.9e+05 1.5e-07 1.60e-06
## 16 23 8.604e+02 2.43e-14 9.80e-13 8.4e-09 1.8e+06 7.3e-08 1.60e-06
## 17 24 8.604e+02 1.33e-14 4.90e-13 4.2e-09 3.6e+06 3.7e-08 1.60e-06
## 18 25 8.604e+02 6.47e-15 2.45e-13 2.1e-09 7.2e+06 1.8e-08 1.60e-06
## 19 26 8.604e+02 3.57e-15 1.23e-13 1.0e-09 1.4e+07 9.2e-09 1.60e-06
## 20 27 8.604e+02 5.29e-16 6.13e-14 5.2e-10 2.9e+07 4.6e-09 1.60e-06
## 21 28 8.604e+02 1.98e-15 3.06e-14 2.6e-10 5.8e+07 2.3e-09 1.60e-06
## 22 36 8.604e+02 0.00e+00 1.82e-18 1.6e-14 9.7e+11 1.4e-13 1.60e-06
##
## ***** FALSE CONVERGENCE *****
##
## FUNCTION 8.604278e+02 RELDX 1.557e-14
## FUNC. EVALS 36 GRAD. EVALS 22
## PRELDF 1.824e-18 NPRELDF 1.601e-06
##
## I FINAL X(I) D(I) G(I)
##
## 1 4.278928e+00 1.000e+00 -1.119e-02
## 2 4.975478e-02 1.000e+00 2.591e-03
summary(arch1_model)
##
## Call:
## garch(x = nyse_clean$return, order = c(1, 0))
##
## Model:
## GARCH(1,0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2203 -0.5202 0.1519 0.7279 3.9815
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## a0 4.27893 83.25229 0.051 0.959
## b1 0.04975 18.48916 0.003 0.998
##
## Diagnostic Tests:
## Jarque Bera Test
##
## data: Residuals
## X-squared = 776.88, df = 2, p-value < 2.2e-16
##
##
## Box-Ljung test
##
## data: Squared.Residuals
## X-squared = 87.793, df = 1, p-value < 2.2e-16
arch2_model <- garch(nyse_clean$return, order = c(2, 0))
##
## ***** ESTIMATION WITH ANALYTICAL GRADIENT *****
##
##
## I INITIAL X(I) D(I)
##
## 1 4.021092e+00 1.000e+00
## 2 5.000000e-02 1.000e+00
## 3 5.000000e-02 1.000e+00
##
## IT NF F RELDF PRELDF RELDX STPPAR D*STEP NPRELDF
## 0 1 8.581e+02
## 1 5 8.581e+02 5.14e-06 1.02e-05 2.8e-04 8.7e+02 3.2e-03 4.45e-03
## 2 12 8.581e+02 6.59e-11 6.25e-10 3.8e-06 2.2e+00 4.7e-05 1.35e-07
## 3 15 8.581e+02 7.54e-11 3.87e-10 2.1e-06 3.6e+00 2.0e-05 1.35e-07
## 4 31 8.581e+02 1.85e-15 1.60e-14 1.1e-10 1.5e+06 9.0e-10 1.36e-07
## 5 39 8.581e+02 -1.59e-15 1.30e-18 9.1e-15 2.0e+10 7.3e-14 1.60e-07
##
## ***** FALSE CONVERGENCE *****
##
## FUNCTION 8.581133e+02 RELDX 9.055e-15
## FUNC. EVALS 39 GRAD. EVALS 5
## PRELDF 1.304e-18 NPRELDF 1.595e-07
##
## I FINAL X(I) D(I) G(I)
##
## 1 4.021524e+00 1.000e+00 1.530e-02
## 2 5.223246e-02 1.000e+00 -8.293e-04
## 3 5.222700e-02 1.000e+00 -1.780e-04
## Warning in garch(nyse_clean$return, order = c(2, 0)): singular information
summary(arch2_model)
##
## Call:
## garch(x = nyse_clean$return, order = c(2, 0))
##
## Model:
## GARCH(2,0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.2303 -0.5216 0.1484 0.7264 3.9869
##
## Coefficient(s):
## Estimate Std. Error t value Pr(>|t|)
## a0 4.02152 NA NA NA
## b1 0.05223 NA NA NA
## b2 0.05223 NA NA NA
##
## Diagnostic Tests:
## Jarque Bera Test
##
## data: Residuals
## X-squared = 786.77, df = 2, p-value < 2.2e-16
##
##
## Box-Ljung test
##
## data: Squared.Residuals
## X-squared = 88.025, df = 1, p-value < 2.2e-16
# Compare ARCH(1) and ARCH(2) models
arch1_aic <- AIC(arch1_model)
arch2_aic <- AIC(arch2_model)
cat("ARCH(1) AIC:", arch1_aic, "\n")
## ARCH(1) AIC: 2987.477
cat("ARCH(2) AIC:", arch2_aic, "\n")
## ARCH(2) AIC: 2983.01
if (arch2_aic < arch1_aic) {
cat("ARCH(2) model fits better than ARCH(1).\n")
} else {
cat("ARCH(1) model fits better than ARCH(2).\n")
}
## ARCH(2) model fits better than ARCH(1).