library(pacman)
## Warning: package 'pacman' was built under R version 4.4.2
p_load(causalweight, lmtest, sandwich, AER, ivmodel, haven, estimatr, tidyverse,
lubridate, usmap, gridExtra, stringr, readxl,
reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)

i

Paper: Autor, David, David Dorn, Gordon Hanson, and Kaveh Majlesi. 2020. “Importing Political Polarization? The Electoral Consequences of Rising Trade Exposure.” American Economic Review 110 (10): 3139–83.

Research question: Whether the exposure of local labor markets to increased foreign competition from China has contributed to rising political polarization in the United States since 2000?

Outcome variable: Political polarization, measured by voting patterns and election outcomes.

Treatment variable: the average change in Chinese import penetration in that CZ’s industries, weighted by the share of each industry k in the CZ’s initial employment.

Endogeneity argument: Realized US imports from China in may be correlated with industry import-demand shocks in the U.S..

iv: contemporaneous composition and growth of Chinese imports in eight other high-income markets.

DAC

#install.packages("ggdag")
library(ggdag)
## Warning: package 'ggdag' was built under R version 4.4.2
## 
## Attaching package: 'ggdag'
## The following object is masked from 'package:stats':
## 
##     filter
dag <- dagify(
  Polarization ~ TradeExposure,
  TradeExposure ~ ChineseImportsOtherCountries,
  labels = c(
    "Polarization" = "Political Polarization",
    "TradeExposure" = "U.S. Local Labor Market Exposure",
    "ChineseImportsOtherCountries" = "Chinese Import Exposure in Other Countries"
  )
)

ggdag(dag, text = FALSE, use_labels = "label") +
  theme_dag()

Relevance: Chinese import growth in the sample period is primarily driven by the increasing competitiveness of Chinese manufacturing that induce a supply shock to China’s global export. Cross-country and cross-industry imports by other developed countries highly correlate with the import pattern in the U.S..

Validity: The instrument is valid because Chinese import exposure in other countries is unlikely to be directly influenced by U.S. local political conditions, satisfying the exclusion restriction.

Main finding: rising trade exposure to Chinese imports increases political polarization, leading to a shift toward more extreme voting patterns in affected regions.

ii

Paper: Oh & Vukina, 2022

The authors backed the IV choice using the Bertrand pricing, which assumes prices in different markets are correlated due to common cost factors rather than common demand shock across regions. For the second part, the authors acknowledge that it is difficult to prove no global demand shocks across regions, but they control for holiday seasons and retail chains that should control for the differentiated demand patterns.

iii

install.packages("causalweight")
## Warning: package 'causalweight' is in use and will not be installed
library(causalweight)


data(JC)
table(JC$assignment)
## 
##    0    1 
## 3663 5577
table(JC$trainy1)
## 
##    0    1 
## 2666 6574
table(JC$assignment, JC$trainy1)
##    
##        0    1
##   0 1809 1854
##   1  857 4720
prop.table(table(JC$assignment, JC$trainy1))
##    
##              0          1
##   0 0.19577922 0.20064935
##   1 0.09274892 0.51082251
as.data.frame(table(JC$assignment, JC$trainy1))
##   Var1 Var2 Freq
## 1    0    0 1809
## 2    1    0  857
## 3    0    1 1854
## 4    1    1 4720
ITT <- mean(JC$earny4[JC$assignment == 1]) - mean(JC$earny4[JC$assignment == 0])
print(ITT)
## [1] 16.05513

On average, individuals assigned to JC earned $16.05 more per week, compared to the control.

ITT_model <- lm(earny4 ~ assignment, data = JC)
summary(ITT_model)
## 
## Call:
## lm(formula = earny4 ~ assignment, data = JC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -213.98 -164.65  -24.02   99.25 2211.98 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  197.926      3.212  61.620  < 2e-16 ***
## assignment    16.055      4.134   3.883 0.000104 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 194.4 on 9238 degrees of freedom
## Multiple R-squared:  0.00163,    Adjusted R-squared:  0.001522 
## F-statistic: 15.08 on 1 and 9238 DF,  p-value: 0.0001038
ITT_estimate <- coef(ITT_model)["assignment"]
SE <- summary(ITT_model)$coefficients["assignment", "Std. Error"]

cat("ITT Estimate:", ITT_estimate, "\n")
## ITT Estimate: 16.05513
cat("Standard Error:", SE, "\n")
## Standard Error: 4.134466
comp_treatment <- mean(JC$trainy1[JC$assignment == 1]) 
comp_control <- mean(JC$trainy1[JC$assignment == 0])   

comp_diff <- comp_treatment - comp_control

cat("Take-up Rate (Treatment Group):", comp_treatment, "\n")
## Take-up Rate (Treatment Group): 0.8463332
cat("Take-up Rate (Control Group):", comp_control, "\n")
## Take-up Rate (Control Group): 0.5061425
cat("Difference in Take-up Rates:", comp_diff, "\n")
## Difference in Take-up Rates: 0.3401906
takeup_model <- lm(trainy1 ~ assignment, data = JC)
summary(takeup_model)
## 
## Call:
## lm(formula = trainy1 ~ assignment, data = JC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8463 -0.5061  0.1537  0.1537  0.4939 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.506143   0.006964   72.68   <2e-16 ***
## assignment  0.340191   0.008963   37.95   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4215 on 9238 degrees of freedom
## Multiple R-squared:  0.1349, Adjusted R-squared:  0.1348 
## F-statistic:  1440 on 1 and 9238 DF,  p-value: < 2.2e-16
ITT_earnings <- mean(JC$earny4[JC$assignment == 1]) - mean(JC$earny4[JC$assignment == 0])
comp_treatment <- mean(JC$trainy1[JC$assignment == 1])  
comp_control <- mean(JC$trainy1[JC$assignment == 0])    
comp_difference <- comp_treatment - comp_control
LATE <- ITT_earnings / comp_difference

cat("ITT on Earnings:", ITT_earnings, "\n")
## ITT on Earnings: 16.05513
cat("Difference in Take-up Rates:", comp_difference, "\n")
## Difference in Take-up Rates: 0.3401906
cat("Estimated LATE:", LATE, "\n")
## Estimated LATE: 47.1945
LATE_model <- ivreg(earny4 ~ trainy1 | assignment, data = JC)

# Display regression results
summary(LATE_model)
## 
## Call:
## ivreg(formula = earny4 ~ trainy1 | assignment, data = JC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -221.23 -165.43  -22.55  100.01 2235.87 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  174.039      8.909  19.536  < 2e-16 ***
## trainy1       47.194     12.192   3.871 0.000109 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 195 on 9238 degrees of freedom
## Multiple R-Squared: -0.004797,   Adjusted R-squared: -0.004905 
## Wald test: 14.98 on 1 and 9238 DF,  p-value: 0.0001092
# Using "ivmodel"
install.packages("ivmodel")  
## Warning: package 'ivmodel' is in use and will not be installed
library(ivmodel)

# Estimate IV model
iv_result <- ivmodel(Y = JC$earny4, D = JC$trainy1, Z = JC$assignment)

# Display results
summary(iv_result)
## 
## Call:
## ivmodel(Y = JC$earny4, D = JC$trainy1, Z = JC$assignment)
## sample size: 9240
## _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
## 
## First Stage Regression Result:
## 
## F=1440.46, df1=1, df2=9238, p-value is < 2.22e-16
## R-squared=0.1348939,   Adjusted R-squared=0.1348003
## Residual standard error: 0.4214583 on 9239 degrees of freedom
## _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
## 
## Coefficients of k-Class Estimators:
## 
##              k Estimate Std. Error t value Pr(>|t|)    
## OLS     0.0000  14.2283     4.4649   3.187 0.001444 ** 
## Fuller  0.9999  47.1681    12.1881   3.870 0.000110 ***
## TSLS    1.0000  47.1945    12.1924   3.871 0.000109 ***
## LIML    1.0000  47.1945    12.1924   3.871 0.000109 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
## 
## Alternative tests for the treatment effect under H_0: beta=0.
## 
## Anderson-Rubin test (under F distribution):
## F=15.07956, df1=1, df2=9238, p-value is 0.00010379
## 95 percent confidence interval:
##  [23.3644072308337, 71.2284358822774]
## 
## Conditional Likelihood Ratio test (under Normal approximation):
## Test Stat=15.07956, p-value is 0.00010379
## 95 percent confidence interval:
##  [23.3644368880534, 71.2284057155105]

ITT measures the treatment effect of all observations assigned to treatment, regardless of compliance. The $16.05 estimate applies to those assigned to the treatment, includes both compliers and noncompliers. Non-complier’s would not reveive any treatment effects, therefore, this ITT measure is an underestimate of the effect.

Manual calculation and regression yield the same LATE estimates. LATE estimates the effect of treatment on compliers. For compliers, participating in Job Corps caused an average increase in weekly earnings of $47.19 in the fourth year.

iv

# Install and load the haven package
#install.packages("haven")
library(haven)

# Read the .dta file
Card1995 <- read_dta("Card1995.dta")

# View the data
head(Card1995)
## # A tibble: 6 × 52
##      id nearc2 nearc4 nearc4a nearc4b  ed76  ed66 age76 daded nodaded momed
##   <dbl>  <dbl>  <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1     2      0      0       0       0     7     5    29  9.94       1  10.2
## 2     3      0      0       0       0    12    11    27  8          0   8  
## 3     4      0      0       0       0    12    12    34 14          0  12  
## 4     5      1      1       1       0    11    11    27 11          0  12  
## 5     6      1      1       1       0    12    12    34  8          0   7  
## 6     7      1      1       1       0    12    11    26  9          0  12  
## # ℹ 41 more variables: nomomed <dbl>, weight <dbl>, momdad14 <dbl>,
## #   sinmom14 <dbl>, step14 <dbl>, reg661 <dbl>, reg662 <dbl>, reg663 <dbl>,
## #   reg664 <dbl>, reg665 <dbl>, reg666 <dbl>, reg667 <dbl>, reg668 <dbl>,
## #   reg669 <dbl>, south66 <dbl>, work76 <dbl>, work78 <dbl>, lwage76 <dbl>,
## #   lwage78 <dbl>, famed <dbl>, black <dbl>, smsa76r <dbl>, smsa78r <dbl>,
## #   reg76r <dbl>, reg78r <dbl>, reg80r <dbl>, smsa66r <dbl>, wage76 <dbl>,
## #   wage78 <dbl>, wage80 <dbl>, noint78 <dbl>, noint80 <dbl>, enroll76 <dbl>, …
Card1995_reg <- Card1995 %>%
  mutate(exp = age76 - ed76 - 6,
         exp_sq_100 = exp^2/100) 

head(Card1995_reg)
## # A tibble: 6 × 54
##      id nearc2 nearc4 nearc4a nearc4b  ed76  ed66 age76 daded nodaded momed
##   <dbl>  <dbl>  <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl> <dbl>
## 1     2      0      0       0       0     7     5    29  9.94       1  10.2
## 2     3      0      0       0       0    12    11    27  8          0   8  
## 3     4      0      0       0       0    12    12    34 14          0  12  
## 4     5      1      1       1       0    11    11    27 11          0  12  
## 5     6      1      1       1       0    12    12    34  8          0   7  
## 6     7      1      1       1       0    12    11    26  9          0  12  
## # ℹ 43 more variables: nomomed <dbl>, weight <dbl>, momdad14 <dbl>,
## #   sinmom14 <dbl>, step14 <dbl>, reg661 <dbl>, reg662 <dbl>, reg663 <dbl>,
## #   reg664 <dbl>, reg665 <dbl>, reg666 <dbl>, reg667 <dbl>, reg668 <dbl>,
## #   reg669 <dbl>, south66 <dbl>, work76 <dbl>, work78 <dbl>, lwage76 <dbl>,
## #   lwage78 <dbl>, famed <dbl>, black <dbl>, smsa76r <dbl>, smsa78r <dbl>,
## #   reg76r <dbl>, reg78r <dbl>, reg80r <dbl>, smsa66r <dbl>, wage76 <dbl>,
## #   wage78 <dbl>, wage80 <dbl>, noint78 <dbl>, noint80 <dbl>, enroll76 <dbl>, …
# OLS regression
ols_model <- lm(lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + smsa76r, data = Card1995_reg)

summary(ols_model)
## 
## Call:
## lm(formula = lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + 
##     smsa76r, data = Card1995_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.59297 -0.22315  0.01893  0.24223  1.33190 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.733664   0.067603  70.022  < 2e-16 ***
## ed76         0.074009   0.003505  21.113  < 2e-16 ***
## exp          0.083596   0.006648  12.575  < 2e-16 ***
## exp_sq_100  -0.224088   0.031784  -7.050 2.21e-12 ***
## black       -0.189632   0.017627 -10.758  < 2e-16 ***
## reg76r      -0.124862   0.015118  -8.259  < 2e-16 ***
## smsa76r      0.161423   0.015573  10.365  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3742 on 3003 degrees of freedom
##   (603 observations deleted due to missingness)
## Multiple R-squared:  0.2905, Adjusted R-squared:  0.2891 
## F-statistic: 204.9 on 6 and 3003 DF,  p-value: < 2.2e-16
# IV regression
iv_model_1 <- ivreg(lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + smsa76r |
                    nearc4 + exp + exp_sq_100 + black + reg76r + smsa76r ,
                  data = Card1995_reg)

summary(iv_model_1)
## 
## Call:
## ivreg(formula = lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + 
##     smsa76r | nearc4 + exp + exp_sq_100 + black + reg76r + smsa76r, 
##     data = Card1995_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.82125 -0.24065  0.02368  0.25469  1.43205 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.75278    0.82934   4.525 6.27e-06 ***
## ed76         0.13229    0.04923   2.687  0.00725 ** 
## exp          0.10750    0.02130   5.047 4.76e-07 ***
## exp_sq_100  -0.22841    0.03341  -6.836 9.84e-12 ***
## black       -0.13080    0.05287  -2.474  0.01342 *  
## reg76r      -0.10490    0.02307  -4.546 5.67e-06 ***
## smsa76r      0.13132    0.03013   4.359 1.35e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.391 on 3003 degrees of freedom
## Multiple R-Squared: 0.2252,  Adjusted R-squared: 0.2237 
## Wald test: 120.8 on 6 and 3003 DF,  p-value: < 2.2e-16
first_stage_1 <- lm(ed76 ~ nearc4 + exp + exp_sq_100 + black + reg76r + smsa76r, data = Card1995_reg)

summary(first_stage_1)
## 
## Call:
## lm(formula = ed76 ~ nearc4 + exp + exp_sq_100 + black + reg76r + 
##     smsa76r, data = Card1995_reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6389 -1.4325 -0.1028  1.3268  6.2332 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.53964    0.16286 101.559  < 2e-16 ***
## nearc4       0.30628    0.07666   3.995 6.59e-05 ***
## exp         -0.35881    0.03040 -11.805  < 2e-16 ***
## exp_sq_100  -0.21620    0.14590  -1.482    0.138    
## black       -1.03873    0.08358 -12.428  < 2e-16 ***
## reg76r      -0.32964    0.07385  -4.464 8.29e-06 ***
## smsa76r      0.39091    0.07788   5.019 5.44e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.982 on 3606 degrees of freedom
## Multiple R-squared:  0.4813, Adjusted R-squared:  0.4805 
## F-statistic: 557.7 on 6 and 3606 DF,  p-value: < 2.2e-16
reduced_form_1 <- lm(lwage76 ~ nearc4 + exp + exp_sq_100 + black + reg76r + smsa76r, data = Card1995_reg)
summary(reduced_form_1)
## 
## Call:
## lm(formula = lwage76 ~ nearc4 + exp + exp_sq_100 + black + reg76r + 
##     smsa76r, data = Card1995_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.56525 -0.24771  0.01465  0.27091  1.38743 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5.956604   0.036371 163.775  < 2e-16 ***
## nearc4       0.044624   0.017011   2.623  0.00876 ** 
## exp          0.053258   0.006948   7.666 2.38e-14 ***
## exp_sq_100  -0.218720   0.034021  -6.429 1.49e-10 ***
## black       -0.263903   0.018485 -14.277  < 2e-16 ***
## reg76r      -0.143458   0.016336  -8.782  < 2e-16 ***
## smsa76r      0.184752   0.017503  10.555  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4005 on 3003 degrees of freedom
##   (603 observations deleted due to missingness)
## Multiple R-squared:  0.1871, Adjusted R-squared:  0.1854 
## F-statistic: 115.2 on 6 and 3003 DF,  p-value: < 2.2e-16
#install.packages("modelsummary")
library(modelsummary)
## Warning: package 'modelsummary' was built under R version 4.4.3
models <- list(
  "OLS" = ols_model,
  "IV" = iv_model_1,
  "First-Stage" = first_stage_1,
  "Reduced Form" = reduced_form_1
)

coef_map <- c(
  "ed76" = "Years of Schooling (ed76)",
  "nearc4" = "Near 4-Year College (nearc4)",
  "exp" = "Experience (exp)",
  "exp_sq_100" = "Experience2/100 (exp_sq_100)",
  "black" = "Black (black)",
  "reg76r" = "South (reg76r)",
  "smsa76r" = "SMSA (smsa76r)"
)

# Generate summary table
modelsummary(
  models,
  stars = TRUE,
  title = "Regression Results: OLS, IV, First-Stage, and Reduced Form",
  fmt = 5,
  coef_map = coef_map
)
Regression Results: OLS, IV, First-Stage, and Reduced Form
OLS IV First-Stage Reduced Form
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Years of Schooling (ed76) 0.07401*** 0.13229**
(0.00351) (0.04923)
Near 4-Year College (nearc4) 0.30628*** 0.04462**
(0.07666) (0.01701)
Experience (exp) 0.08360*** 0.10750*** -0.35881*** 0.05326***
(0.00665) (0.02130) (0.03040) (0.00695)
Experience2/100 (exp_sq_100) -0.22409*** -0.22841*** -0.21620 -0.21872***
(0.03178) (0.03341) (0.14590) (0.03402)
Black (black) -0.18963*** -0.13080* -1.03873*** -0.26390***
(0.01763) (0.05287) (0.08358) (0.01848)
South (reg76r) -0.12486*** -0.10490*** -0.32964*** -0.14346***
(0.01512) (0.02307) (0.07385) (0.01634)
SMSA (smsa76r) 0.16142*** 0.13132*** 0.39091*** 0.18475***
(0.01557) (0.03013) (0.07788) (0.01750)
Num.Obs. 3010 3010 3613 3010
R2 0.291 0.225 0.481 0.187
R2 Adj. 0.289 0.224 0.480 0.185
AIC 2633.4 2898.4 15205.5 3043.1
BIC 2681.5 2946.5 15255.0 3091.2
Log.Lik. -1308.702 -7594.738 -1513.546
F 204.932 557.743 115.164
RMSE 0.37 0.39 1.98 0.40

OLS does not control for endogeneity and the treatment estimate is not reliable. Controlling for endogeneity via IV leads to a larger estimate at 0.13, indicating 1 extra year of education leads to about 13% increase in wage,but the negative estimate on experience is odd, suggestin remaining endogeneit issue.

iv usng both nearc4a nearc4b

# IV regression
iv_model <- ivreg(lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + smsa76r |
                    nearc4a +nearc4b + exp + exp_sq_100 + black + reg76r + smsa76r,
                  data = Card1995_reg)

summary(iv_model)
## 
## Call:
## ivreg(formula = lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + 
##     smsa76r | nearc4a + nearc4b + exp + exp_sq_100 + black + 
##     reg76r + smsa76r, data = Card1995_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.93985 -0.25152  0.01722  0.27365  1.48154 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.26801    0.68718   4.756 2.07e-06 ***
## ed76         0.16109    0.04077   3.951 7.96e-05 ***
## exp          0.11931    0.01818   6.564 6.16e-11 ***
## exp_sq_100  -0.23054    0.03503  -6.582 5.46e-11 ***
## black       -0.10173    0.04531  -2.245   0.0248 *  
## reg76r      -0.09504    0.02165  -4.389 1.18e-05 ***
## smsa76r      0.11645    0.02705   4.305 1.73e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4108 on 3003 degrees of freedom
## Multiple R-Squared: 0.1447,  Adjusted R-squared: 0.143 
## Wald test:   111 on 6 and 3003 DF,  p-value: < 2.2e-16
first_stage <- lm(ed76 ~ nearc4a + nearc4b + exp + exp_sq_100 + black + reg76r + smsa76r, data = Card1995_reg)

reduced_form <- lm(lwage76 ~ nearc4a + nearc4b + exp + exp_sq_100 + black + reg76r + smsa76r, data = Card1995_reg)
#install.packages("modelsummary")
library(modelsummary)
models <- list(
  "OLS" = ols_model,
  "IV" = iv_model,
  "First-Stage" = first_stage,
  "Reduced Form" = reduced_form
)

coef_map <- c(
               "ed76" = "Years of Schooling (ed76)",
               "nearc4a" = "Public College (nearc4a)",
               "nearc4b" = "Private College (nearc4b)",
               "exp" = "Experience (exp)",
               "exp_sq_100" = "Experience2/100 (exp_sq_100)",
               "black" = "Black (black)",
               "reg76r" = "South (reg76r)",
               "smsa76r" = "SMSA (smsa76r)"
             )

modelsummary(models,
             stars = TRUE, # Add significance stars
             title = "Regression Results: OLS, IV, First-Stage, and Reduced Form",
             fmt = 5,
             coef_map = coef_map
             )
Regression Results: OLS, IV, First-Stage, and Reduced Form
OLS IV First-Stage Reduced Form
+ p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001
Years of Schooling (ed76) 0.07401*** 0.16109***
(0.00351) (0.04077)
Public College (nearc4a) 0.39599*** 0.06401***
(0.08119) (0.01801)
Private College (nearc4b) 0.09648 -0.00011
(0.09929) (0.02191)
Experience (exp) 0.08360*** 0.11931*** -0.36039*** 0.05258***
(0.00665) (0.01818) (0.03036) (0.00694)
Experience2/100 (exp_sq_100) -0.22409*** -0.23054*** -0.20488 -0.21464***
(0.03178) (0.03503) (0.14573) (0.03399)
Black (black) -0.18963*** -0.10173* -1.03976*** -0.26394***
(0.01763) (0.04531) (0.08347) (0.01846)
South (reg76r) -0.12486*** -0.09504*** -0.30427*** -0.13838***
(0.01512) (0.02165) (0.07414) (0.01639)
SMSA (smsa76r) 0.16142*** 0.11645*** 0.38804*** 0.18389***
(0.01557) (0.02705) (0.07778) (0.01748)
Num.Obs. 3010 3010 3613 3010
R2 0.291 0.145 0.483 0.190
R2 Adj. 0.289 0.143 0.482 0.188
AIC 2633.4 3196.0 15196.5 3034.6
BIC 2681.5 3244.0 15252.2 3088.7
Log.Lik. -1308.702 -7589.231 -1508.323
F 204.932 480.965 100.513
RMSE 0.37 0.41 1.98 0.40

Residing near a public university is more relevant to education as shown in first stage results. Endogeneity-controlled regression shows a slightly higher estimate of the treatment effect compared to the single IV result, but the coefficient on experience is still negative.

Yes, it is, and so can be experience as both can be affected by unobserved factors such as ability.

# Create interaction terms
Card1995_reg$nearc4a_age76 <- Card1995_reg$nearc4a * Card1995_reg$age76
Card1995_reg$nearc4a_age76_sq <- Card1995_reg$nearc4a * (Card1995_reg$age76^2)
Card1995_reg$nearc4b_age76 <- Card1995_reg$nearc4b * Card1995_reg$age76
Card1995_reg$nearc4b_age76_sq <- Card1995_reg$nearc4b * (Card1995_reg$age76^2)
# Estimate the 2SLS model with interactions
iv_model_interactions <- ivreg(lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + smsa76r |
                                 nearc4a + nearc4b + nearc4a_age76 + nearc4a_age76_sq + nearc4b_age76 + nearc4b_age76_sq + black + reg76r + smsa76r,
                               data = Card1995_reg)

# Summarize the results
summary(iv_model_interactions)
## 
## Call:
## ivreg(formula = lwage76 ~ ed76 + exp + exp_sq_100 + black + reg76r + 
##     smsa76r | nearc4a + nearc4b + nearc4a_age76 + nearc4a_age76_sq + 
##     nearc4b_age76 + nearc4b_age76_sq + black + reg76r + smsa76r, 
##     data = Card1995_reg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.75022 -0.23674  0.02375  0.24883  1.32243 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.34516    0.29800  14.581  < 2e-16 ***
## ed76         0.11077    0.02548   4.347 1.43e-05 ***
## exp          0.05498    0.02308   2.383 0.017243 *  
## exp_sq_100  -0.05962    0.11830  -0.504 0.614337    
## black       -0.13894    0.04143  -3.353 0.000808 ***
## reg76r      -0.11094    0.01938  -5.723 1.15e-08 ***
## smsa76r      0.13284    0.02810   4.727 2.39e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3851 on 3003 degrees of freedom
## Multiple R-Squared: 0.2484,  Adjusted R-squared: 0.2469 
## Wald test: 156.8 on 6 and 3003 DF,  p-value: < 2.2e-16

All estimates are now in the reasonable direction and significant.

# Coefficient on ed76 from IV model
beta_iv <- coef(iv_model)["ed76"]
se_iv <- sqrt(vcov(iv_model)["ed76", "ed76"])

# Coefficient on ed76 from OLS model
beta_ols <- coef(ols_model)["ed76"]
se_ols <- sqrt(vcov(ols_model)["ed76", "ed76"])

diff_beta <- beta_iv-beta_ols
#sigmaless
hausman_statistic_sigmaless <- diff_beta^2 / se_ols^2

# Compute the p-value
p_value_sigmaless <- pchisq(hausman_statistic_sigmaless, df = 1, lower.tail = FALSE)

# Display the results
cat("Hausman Test Statistic (sigmaless):", hausman_statistic_sigmaless, "\n")
## Hausman Test Statistic (sigmaless): 617.1341
cat("p-value (sigmaless):", p_value_sigmaless, "\n")
## p-value (sigmaless): 3.141124e-136

P value suggest we can reject the null hypothesis, meaning ed76 is endogenous.