\(~\)
\(~\)
# Import library and load the datas et
library(haven)
library(tidyverse)
library(ivreg)
dta <- read_dta("/Users/bastienpatras/Desktop/Sciences Po - Master in Economics/Econometrics/PBset 7/ee2002ext.dta")
\(~\)
\(~\)
# Extracting the 0.5th and 99.5th percentile of salfr
p0050 = quantile(dta$salfr, 0.05, na.rm = TRUE)
p9995 = quantile(dta$salfr, 0.95, na.rm = TRUE)
# Applying trimming
dta_trimmed <- dta %>%
na.omit() %>%
filter(salfr > p0050 | salfr < p9995 )
\(~\)
\(~\)
# Generate age squared from trimmed data set
dta_trimmed <- dta_trimmed %>%
mutate(age.sqrt = agd^2)
\(~\)
\(~\)
# Generate age squared from trimmed data set
dta_trimmed <- dta_trimmed %>%
filter(adfe != 0 & adfe != 99, s==1)
\(~\)
\(~\)
# Generate age squared from trimmed data set
dta_trimmed <- dta_trimmed %>%
mutate(lnw = log(salfr/(4*hh)),
lnh = log(hh))
\(~\)
Because we are using a log transformation, extremely low values of the hourly wage come out as -\(\infty\) when taking the log which gives NAs in output. (it allows us to get rid of insignificant values)
\(~\)
\(~\)
# Getting rid of extremely low values and creating lnh
dta_trimmed <- dta_trimmed %>%
mutate(lnh = log(hh)) %>%
filter(lnw >= -5.2983174)
# OLS estimation
model.1 <- lm(lnw ~ agd + age.sqrt + adfe + lnh, data = dta_trimmed)
# Output
summary(model.1)
##
## Call:
## lm(formula = lnw ~ agd + age.sqrt + adfe + lnh, data = dta_trimmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8175 -0.1805 0.0160 0.2257 3.6812
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.262e+00 1.200e-01 18.841 < 2e-16 ***
## agd 5.444e-02 3.868e-03 14.073 < 2e-16 ***
## age.sqrt -4.568e-04 4.866e-05 -9.388 < 2e-16 ***
## adfe 5.507e-02 1.616e-03 34.084 < 2e-16 ***
## lnh -1.527e-01 2.687e-02 -5.682 1.37e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.588 on 11246 degrees of freedom
## Multiple R-squared: 0.1519, Adjusted R-squared: 0.1516
## F-statistic: 503.7 on 4 and 11246 DF, p-value: < 2.2e-16
\(~\)
Because we created the variable lnw
using the variable lnh
we can expect endogenity.
\(~\)
\(~\)
# IV estimation
model.2 <- ivreg(lnw ~ agd + age.sqrt + adfe + lnh |
as.factor(enf3) + as.factor(enf6) +
as.factor(enf18) + as.factor(tymen90r),
data = dta_trimmed)
# Output
summary(model.2)
##
## Call:
## ivreg(formula = lnw ~ agd + age.sqrt + adfe + lnh | as.factor(enf3) +
## as.factor(enf6) + as.factor(enf18) + as.factor(tymen90r),
## data = dta_trimmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.31186 -0.20966 0.02289 0.24913 3.75295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.8474669 1.8658933 -1.526 0.127023
## agd 0.0568544 0.0167603 3.392 0.000696 ***
## age.sqrt -0.0004475 0.0001842 -2.429 0.015140 *
## adfe 0.0684822 0.0138755 4.935 8.11e-07 ***
## lnh 1.1631692 0.5815866 2.000 0.045525 *
##
## Diagnostic tests:
## df1 df2 statistic p-value
## Weak instruments (agd) 18 11232 54.048 < 2e-16 ***
## Weak instruments (age.sqrt) 18 11232 63.412 < 2e-16 ***
## Weak instruments (adfe) 18 11232 21.096 < 2e-16 ***
## Weak instruments (lnh) 18 11232 3.260 3.47e-06 ***
## Wu-Hausman 4 11242 4.034 0.00286 **
## Sargan 14 NA 21.425 0.09123 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6514 on 11246 degrees of freedom
## Multiple R-Squared: -0.04061, Adjusted R-squared: -0.04098
## Wald test: 19.51 on 4 and 11246 DF, p-value: 5.166e-16
\(~\)
\(~\)
# Estimating the first stage using enf3, enf6, enf18 and tymen90r as instruments
First.stage <- lm(lnh ~ as.factor(enf3) + as.factor(enf6) +
as.factor(enf18) + as.factor(tymen90r), data = dta_trimmed)
Fitted.First.stage <- fitted(First.stage)
# Estimating the second stage using enf3, enf6, enf18 and tymen90r as instruments
Second.stage <- lm(lnw ~ agd + age.sqrt + adfe + Fitted.First.stage,
data = dta_trimmed)
summary(Second.stage)
##
## Call:
## lm(formula = lnw ~ agd + age.sqrt + adfe + Fitted.First.stage,
## data = dta_trimmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8631 -0.1794 0.0143 0.2227 3.6796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.9903922 1.3619276 -2.196 0.028133 *
## agd 0.0500357 0.0039509 12.664 < 2e-16 ***
## age.sqrt -0.0004043 0.0000496 -8.151 3.99e-16 ***
## adfe 0.0547911 0.0016159 33.907 < 2e-16 ***
## Fitted.First.stage 1.3279227 0.3809569 3.486 0.000493 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5886 on 11246 degrees of freedom
## Multiple R-squared: 0.1504, Adjusted R-squared: 0.1501
## F-statistic: 497.8 on 4 and 11246 DF, p-value: < 2.2e-16
\(~\)
\(~\)
# Estimating the first stage using enf3, enf6, enf18 and tymen90r as instruments
First.stage <- lm(lnh ~ as.factor(enf3) + as.factor(enf6) +
as.factor(enf18) + as.factor(tymen90r), data = dta_trimmed)
Residuals.First.stage <- residuals(First.stage)
# Estimating the second stage using enf3, enf6, enf18 and tymen90r as instruments
Second.stage.1 <- lm(lnw ~ agd + age.sqrt + adfe + Residuals.First.stage,
data = dta_trimmed)
summary(Second.stage.1)
##
## Call:
## lm(formula = lnw ~ agd + age.sqrt + adfe + Residuals.First.stage,
## data = dta_trimmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8157 -0.1802 0.0158 0.2263 3.6804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.715e+00 7.936e-02 21.612 < 2e-16 ***
## agd 5.416e-02 3.864e-03 14.016 < 2e-16 ***
## age.sqrt -4.535e-04 4.861e-05 -9.329 < 2e-16 ***
## adfe 5.511e-02 1.616e-03 34.111 < 2e-16 ***
## Residuals.First.stage -1.598e-01 2.691e-02 -5.939 2.95e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5879 on 11246 degrees of freedom
## Multiple R-squared: 0.1522, Adjusted R-squared: 0.1519
## F-statistic: 504.5 on 4 and 11246 DF, p-value: < 2.2e-16
\(~\)
\(~\)
One can test \(H_0: \text{lnh}\) is exogenous, based on the comparison of \(\beta_{OLS}\) and \(\beta_{2SLS}\) :
\(~\)
\[\begin{cases} H_0 : \text{both estimators are consistant} \\ H_1 : x_i \text{ is endogenous, } \beta_{OLS} \text{ is inconsistent } \beta_{2SLS} \text{ but is consistent.} \end{cases}\]
\(~\)
Therefore \(\sqrt{N}\left(\hat{\beta}_{OLS} - \hat{\beta}_{2SLS}\right)\) should be small under \(H_0\) and high under \(H_1\) which is the same as controlling for \(\beta_{lnh}\) being statistically significant :
\(~\)
# Extracting OLS and 2SLS estimates
beta.2SLS.lnh <- summary(model.1)$coefficients[5]
beta.OLS.lnh <- summary(Second.stage.1)$coefficients[5]
# Computing the statistic
stat <- sqrt(11251)*(beta.OLS.lnh-beta.2SLS.lnh)
stat
## [1] -0.7601581
\(~\)
We can observe from the summary and when computing \(\sqrt{N}\left(\hat{\beta}_{OLS} - \hat{\beta}_{2SLS}\right)\) that the variable \(\text{lnh}\) is endogenous.
\(~\)
\(~\)
# Extracting the residuals of 2SLS estimation
Residuals.Second.stage.1 <- residuals(Second.stage.1)
# Sargan model
Sargan.model <- lm(Residuals.Second.stage.1 ~ as.factor(enf3) + as.factor(enf6) +
as.factor(enf18) + as.factor(tymen90r),
data = dta_trimmed)
summary(Sargan.model)
##
## Call:
## lm(formula = Residuals.Second.stage.1 ~ as.factor(enf3) + as.factor(enf6) +
## as.factor(enf18) + as.factor(tymen90r), data = dta_trimmed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8180 -0.1798 0.0143 0.2239 3.6582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.016149 0.015707 -1.028 0.303905
## as.factor(enf3)1 -0.026467 0.025473 -1.039 0.298827
## as.factor(enf3)2 0.066895 0.084557 0.791 0.428891
## as.factor(enf3)3 -0.093316 0.596364 -0.156 0.875661
## as.factor(enf6)1 0.022324 0.020690 1.079 0.280621
## as.factor(enf6)2 -0.020819 0.036955 -0.563 0.573195
## as.factor(enf6)3 0.110663 0.105131 1.053 0.292539
## as.factor(enf6)4 -0.077958 0.595352 -0.131 0.895821
## as.factor(enf18)1 0.014738 0.019327 0.763 0.445744
## as.factor(enf18)2 0.028352 0.020375 1.392 0.164098
## as.factor(enf18)3 0.036713 0.027382 1.341 0.180011
## as.factor(enf18)4 -0.041482 0.048051 -0.863 0.387994
## as.factor(enf18)5 -0.060966 0.102453 -0.595 0.551816
## as.factor(enf18)6 -0.421524 0.197902 -2.130 0.033196 *
## as.factor(enf18)7 0.111889 0.415648 0.269 0.787788
## as.factor(tymen90r)2 -0.153648 0.045516 -3.376 0.000739 ***
## as.factor(tymen90r)3 -0.043079 0.033756 -1.276 0.201926
## as.factor(tymen90r)4 0.038312 0.020063 1.910 0.056208 .
## as.factor(tymen90r)5 0.003719 0.021161 0.176 0.860504
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5873 on 11232 degrees of freedom
## Multiple R-squared: 0.003463, Adjusted R-squared: 0.001866
## F-statistic: 2.168 on 18 and 11232 DF, p-value: 0.00286
# Extract the parameters from OLS regression
N = Sargan.model$df.residual + 5
R2 = summary(Sargan.model)$r.squared
# Performing the test for alpha = 0.05
sargan.stat = (N*R2)/(1-R2)
sargan.stat > qchisq(0.95, 4-1)
## [1] TRUE
\(~\)
Therefore we can safely consider that we have correctly identified a valid