library(pacman)
## Warning: package 'pacman' was built under R version 4.4.2
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, cobalt, Matchit, Zelig, estimatr, cem, tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D, cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer, foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, kableExtra, snakecase, janitor)
## Warning: package 'Matchit' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: Perhaps you meant 'MatchIt' ?
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## Warning: 'BiocManager' not available.  Could not check Bioconductor.
## 
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Matchit'
## Warning: package 'Zelig' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## Warning: 'BiocManager' not available.  Could not check Bioconductor.
## 
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Zelig'
## Warning in p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, : Failed to install/load:
## Matchit, Zelig
data(lalonde)
attach(lalonde)
table(treat)
## treat
##   0   1 
## 429 185

i.

library(dplyr)
library(stargazer)

balance_table <- lalonde %>%
  select(-race) %>%
  group_by(treat) %>%
  summarise(across(everything(),
                   list(mean = ~ mean(. , na.rm = TRUE),
                        sd = ~ sd(. , na.rm = TRUE)), 
                   .names = "{.col}_{.fn}")) %>%
  pivot_longer(-treat, names_to = c("Variable", ".value"), names_sep = "_") %>%
  pivot_wider(names_from = treat, values_from = c(mean, sd), names_prefix = "Group_")
t_tests <- lalonde %>%
  select(-race) %>%
  summarise(across(-treat, ~ t.test(. ~ treat)$p.value, .names = "p_value_{.col}")) %>%
  pivot_longer(everything(), names_to = "Variable", values_to = "p_value", names_prefix = "p_value_")

# Combine summary statistics with t-test results
balance_table <- left_join(balance_table, t_tests, by = "Variable")


# Display the balance table
stargazer(balance_table, type = "text", summary = FALSE, title = "Balance Table")
## 
## Balance Table
## =======================================================================================================
##   Variable   mean_Group_0      mean_Group_1       sd_Group_0        sd_Group_1           p_value       
## -------------------------------------------------------------------------------------------------------
## 1   age     28.030303030303  25.8162162162162  10.7866530174644  7.15501927478618  0.00291427440260211 
## 2   educ   10.2354312354312  10.3459459459459  2.85523844087595  2.01065025640563   0.584797660188313  
## 3 married  0.512820512820513 0.189189189189189 0.500419186851279 0.392721679149236 1.46127767613592e-16
## 4 nodegree 0.596736596736597 0.708108108108108 0.491125522231798 0.455866577054302 0.00698222507160265 
## 5   re74   5619.23650638695  2095.57368864865  6788.75079645538  4886.62035315108  1.74783141336639e-12
## 6   re75   2466.48444312354  1532.05531378378  3291.99618331065   3219.2508699216  0.00115008478712992 
## 7   re78   6984.16974230769  6349.14353027027   7294.1617908684  7867.40221773469   0.349076655556669  
## -------------------------------------------------------------------------------------------------------

Data is not balanced, showing significant differences in key covariates, such as earnings.

ii

Let Y(1) and Y(0) be the potential outcomes under treated and control, respectively. ATE is defined as E[Y(1)-Y(0)]. If the conditional independence assumption holds, potential outcomes for both treated and control are independent of their treatment status assignment given X. Therefore, controlling for all X in the OLS regression is equivalent to taking treatment assginment as random.

lalonde <- lalonde %>%
  mutate(across(c(age, educ, married, nodegree, re74, re75), 
                ~ . - mean(.), 
                .names = "demeaned_{.col}"))
  
  
  
model <- lm(re78 ~ treat + age + race + educ + married + nodegree + re74 + re75 + 
              treat * demeaned_age + treat * demeaned_educ + 
              treat * demeaned_married + treat * demeaned_nodegree + 
              treat * demeaned_re74 + treat * demeaned_re75 + treat*race, data = lalonde)

# Print summary
summary(model)
## 
## Call:
## lm(formula = re78 ~ treat + age + race + educ + married + nodegree + 
##     re74 + re75 + treat * demeaned_age + treat * demeaned_educ + 
##     treat * demeaned_married + treat * demeaned_nodegree + treat * 
##     demeaned_re74 + treat * demeaned_re75 + treat * race, data = lalonde)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -14685  -4703  -1544   3742  53958 
## 
## Coefficients: (6 not defined because of singularities)
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               35.96622 2841.98257   0.013  0.98991    
## treat                   1114.02887 1012.54820   1.100  0.27168    
## age                      -20.26636   36.44845  -0.556  0.57840    
## racehispan              1663.63073 1172.15266   1.419  0.15634    
## racewhite               1167.54240  881.61270   1.324  0.18590    
## educ                     322.45790  181.68288   1.775  0.07643 .  
## married                  -52.68613  803.52341  -0.066  0.94774    
## nodegree                 480.37271 1012.76977   0.474  0.63545    
## re74                       0.37671    0.06558   5.744 1.48e-08 ***
## re75                       0.33976    0.12186   2.788  0.00547 ** 
## demeaned_age                    NA         NA      NA       NA    
## demeaned_educ                   NA         NA      NA       NA    
## demeaned_married                NA         NA      NA       NA    
## demeaned_nodegree               NA         NA      NA       NA    
## demeaned_re74                   NA         NA      NA       NA    
## demeaned_re75                   NA         NA      NA       NA    
## treat:demeaned_age       103.82817   83.55223   1.243  0.21448    
## treat:demeaned_educ      301.50306  392.61449   0.768  0.44283    
## treat:demeaned_married  1085.12641 1614.09954   0.672  0.50167    
## treat:demeaned_nodegree -799.38196 1886.14021  -0.424  0.67185    
## treat:demeaned_re74       -0.33724    0.15513  -2.174  0.03010 *  
## treat:demeaned_re75       -0.25104    0.24670  -1.018  0.30928    
## treat:racehispan        -219.29010 2480.50005  -0.088  0.92958    
## treat:racewhite          -27.52841 1954.53950  -0.014  0.98877    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6904 on 596 degrees of freedom
## Multiple R-squared:  0.1698, Adjusted R-squared:  0.1461 
## F-statistic: 7.168 on 17 and 596 DF,  p-value: 4.558e-16

The point estimate shows NWS does have positive effect on post-earnings, but the estimate is not significant.

iii

library(Matching)
library(cem)
unique(lalonde$race)
## [1] black  hispan white 
## Levels: black hispan white
lalonde <- lalonde %>%
  mutate(black = ifelse(race == "black", 1, 0),
    hisp = ifelse(race == "hispan", 1, 0)
  )
Y <- lalonde$re78  # Outcome variable (earnings in 1978)
Tr <- lalonde$treat  # Treatment variable
#X <- lalonde %>% 
#  select(age, educ, married, race, nodegree, re74, re75)  # Covariates

X <- lalonde %>% mutate(
  black = as.numeric(black),
  hisp = as.numeric(hisp)
) %>% select(age, educ, married, nodegree, black, hisp, re74, re75)  # Select only numeric variables
match_1to1 <- Match(Y = Y, Tr = Tr, X = X, M = 1)
summary(match_1to1)  # Estimated ATT and standard error
## 
## Estimate...  198.16 
## AI SE......  1280.7 
## T-stat.....  0.15474 
## p.val......  0.87703 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  207
match_exact <- Match(Y = Y, Tr = Tr, X = X, exact = TRUE)
summary(match_exact)
## 
## Estimate...  -1106.3 
## AI SE......  174.05 
## T-stat.....  -6.3564 
## p.val......  2.0653e-10 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  13 
## Matched number of observations  (unweighted).  24 
## 
## Number of obs dropped by 'exact' or 'caliper'  172
match_1to2 <- Match(Y = Y, Tr = Tr, X = X, M = 2)
summary(match_1to2)
## 
## Estimate...  768.62 
## AI SE......  1047.7 
## T-stat.....  0.73363 
## p.val......  0.46318 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  385
match_1to3 <- Match(Y = Y, Tr = Tr, X = X, M = 3)
summary(match_1to3)
## 
## Estimate...  1159.1 
## AI SE......  992.02 
## T-stat.....  1.1684 
## p.val......  0.24263 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  583
match_1to2_bias <- Match(Y = Y, Tr = Tr, X = X, M = 2, BiasAdjust = TRUE)
summary(match_1to2_bias)
## 
## Estimate...  762.27 
## AI SE......  1051.9 
## T-stat.....  0.72465 
## p.val......  0.46867 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  385
cem_match <- cem(
  treatment = "treat", 
  data = lalonde, 
  drop = "race",  # Drop the original categorical race variable
  keep.all = TRUE
)
## 
## Using 'treat'='1' as baseline group
att_cem <- att(cem_match, re78 ~ treat, data = lalonde)
print(att_cem)
## 
##            G0  G1
## All       429 185
## Matched    43  45
## Unmatched 386 140
## 
## Linear regression model on CEM matched data:
## 
## SATT point estimate: 629.570617 (p.value=0.532848)
## 95% conf. interval: [-1340.972695, 2600.113930]
print(att_cem)
## 
##            G0  G1
## All       429 185
## Matched    43  45
## Unmatched 386 140
## 
## Linear regression model on CEM matched data:
## 
## SATT point estimate: 629.570617 (p.value=0.532848)
## 95% conf. interval: [-1340.972695, 2600.113930]

Resutls are similar. Exact matching, once controlled for bias, yielded 762, close to CEM estimate.

iv

library(dagitty)
## Warning: package 'dagitty' was built under R version 4.4.2
library(ggdag)
## Warning: package 'ggdag' was built under R version 4.4.2
## 
## Attaching package: 'ggdag'
## The following object is masked from 'package:stats':
## 
##     filter
# Define the DAG
dag <- dagitty("
dag {
    U [unobserved]
    NSW -> re78
    age -> NSW
    educ -> NSW
    race -> NSW
    re74 -> NSW
    re75 -> NSW
    age -> re78
    educ -> re78
    race -> re78
    re74 -> re78
    re75 -> re78
    U -> NSW
    U -> re78
}")

# Plot the DAG
ggdag(dag) + theme_dag()

Treatment status is subject to selection bias, influenced by age, education, race, etc. These covariates also affect post NWS earnings. Unobserved factors can potentially affect both treat status and outcome.

library(ggplot2)

ggplot(lalonde, aes(x = age, y = re74)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "loess") + 
  ggtitle("Age vs. Earnings in 1974")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(lalonde, aes(x = age, y = re75)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "loess") + 
  ggtitle("Age vs. Earnings in 1975")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(lalonde, aes(x = age, y = re78)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "loess") + 
  ggtitle("Age vs. Earnings in 1978")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(lalonde, aes(x = age, fill = as.factor(treat))) + 
  geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
  ggtitle("Age Distribution: Participants vs. Non-Participants")

ggplot(lalonde, aes(x = educ, y = re74)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "loess") + 
  ggtitle("Education vs. Earnings in 1974")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(lalonde, aes(x = educ, y = re75)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "loess") + 
  ggtitle("Education vs. Earnings in 1975")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(lalonde, aes(x = educ, y = re78)) + 
  geom_point(alpha = 0.5) + 
  geom_smooth(method = "loess") + 
  ggtitle("Education vs. Earnings in 1978")
## `geom_smooth()` using formula = 'y ~ x'

library(Matching)

# Logit model for propensity score estimation
ps_model <- glm(treat ~ poly(age, 2) + poly(educ, 2) + race + re74 + re75,
                family = binomial(), data = lalonde)

# Store the propensity scores
lalonde$psfit <- fitted(ps_model)

# Display regression results
library(stargazer)
stargazer(ps_model, type = "text")
## 
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                              treat           
## ---------------------------------------------
## poly(age, 2)1               -4.989           
##                             (4.417)          
##                                              
## poly(age, 2)2             -26.030***         
##                             (4.625)          
##                                              
## poly(educ, 2)1               0.815           
##                             (3.528)          
##                                              
## poly(educ, 2)2            -14.658***         
##                             (4.330)          
##                                              
## racehispan                 -2.374***         
##                             (0.384)          
##                                              
## racewhite                  -3.292***         
##                             (0.301)          
##                                              
## re74                      -0.0001***         
##                            (0.00003)         
##                                              
## re75                       -0.00002          
##                            (0.00005)         
##                                              
## Constant                   0.775***          
##                             (0.174)          
##                                              
## ---------------------------------------------
## Observations                  614            
## Log Likelihood             -224.020          
## Akaike Inf. Crit.           466.040          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01
match_nn <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = lalonde$psfit, 
                  M = 3, BiasAdjust = T)

summary(match_nn)
## 
## Estimate...  1335.9 
## AI SE......  1138.5 
## T-stat.....  1.1734 
## p.val......  0.24063 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  623
library(boot)

boot_fn <- function(data, indices) {
  d <- data[indices, ]
  match_boot <- Match(Y = d$re78, Tr = d$treat, X = d$psfit, M = 3, BiasAdjust = T)
  return(match_boot$est)
}

set.seed(123)
boot_results <- boot(data = lalonde, statistic = boot_fn, R = 1000)
boot.ci(boot_results, type = "perc")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_results, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   (-1160,  3575 )  
## Calculations and Intervals on Original Scale
summary(lalonde$psfit[lalonde$treat == 1]) # Participants
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.03456 0.50732 0.66893 0.61782 0.83630 0.90621
summary(lalonde$psfit[lalonde$treat == 0]) # Non-Participants
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000476 0.0304701 0.0651742 0.1648084 0.1723068 0.8907984
ggplot(lalonde, aes(x = psfit, fill = as.factor(treat))) + 
  geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
  ggtitle("Histogram of Propensity Scores")

# Visual check
plot(density(lalonde$psfit[lalonde$treat == 1]), col = "red", main = "Propensity Score Overlap")
lines(density(lalonde$psfit[lalonde$treat == 0]), col = "blue")
legend("topright", legend = c("Participants", "Non-Participants"), col = c("red", "blue"), lty = 1)

library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.4.3
## 
## Attaching package: 'MatchIt'
## The following object is masked _by_ '.GlobalEnv':
## 
##     lalonde
## The following object is masked from 'package:cobalt':
## 
##     lalonde
# Nearest neighbor matching with common support
matchit_model <- matchit(treat ~ poly(age, 2) + poly(educ, 2) + race + re74 + re75,
                         method = "nearest", data = lalonde, discard = "both")

summary(matchit_model)
## 
## Call:
## matchit(formula = treat ~ poly(age, 2) + poly(educ, 2) + race + 
##     re74 + re75, data = lalonde, method = "nearest", discard = "both")
## 
## Summary of Balance for All Data:
##                  Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                0.6178        0.1648          1.7791     1.3626
## `poly(age, 2)`1        -0.0063        0.0027         -0.3094     0.4400
## `poly(age, 2)`2        -0.0113        0.0049         -0.5458     0.4641
## `poly(educ, 2)`1        0.0012       -0.0005          0.0550     0.4959
## `poly(educ, 2)`2       -0.0090        0.0039         -0.5579     0.2611
## raceblack               0.8432        0.2028          1.7615          .
## racehispan              0.0595        0.1422         -0.3498          .
## racewhite               0.0973        0.6550         -1.8819          .
## re74                 2095.5737     5619.2365         -0.7211     0.5181
## re75                 1532.0553     2466.4844         -0.2903     0.9563
##                  eCDF Mean eCDF Max
## distance            0.3991   0.6676
## `poly(age, 2)`1     0.0819   0.1577
## `poly(age, 2)`2     0.0801   0.1746
## `poly(educ, 2)`1    0.0339   0.1114
## `poly(educ, 2)`2    0.0659   0.2018
## raceblack           0.6404   0.6404
## racehispan          0.0827   0.0827
## racewhite           0.5577   0.5577
## re74                0.2248   0.4470
## re75                0.1342   0.2876
## 
## Summary of Balance for Matched Data:
##                  Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance                0.5949        0.3532          0.9491     1.0662
## `poly(age, 2)`1        -0.0078       -0.0048         -0.1013     0.9296
## `poly(age, 2)`2        -0.0085       -0.0107          0.0743     0.7790
## `poly(educ, 2)`1        0.0017       -0.0031          0.1541     0.7599
## `poly(educ, 2)`2       -0.0081       -0.0055         -0.1114     0.6182
## raceblack               0.8304        0.4678          0.9973          .
## racehispan              0.0643        0.1988         -0.5688          .
## racewhite               0.1053        0.3333         -0.7696          .
## re74                 2267.1411     3278.9991         -0.2071     1.3251
## re75                 1605.2408     2081.5170         -0.1479     1.1556
##                  eCDF Mean eCDF Max Std. Pair Dist.
## distance            0.1298   0.4327          0.9492
## `poly(age, 2)`1     0.0319   0.1287          1.1900
## `poly(age, 2)`2     0.0449   0.1520          1.3555
## `poly(educ, 2)`1    0.0239   0.0819          1.1140
## `poly(educ, 2)`2    0.0210   0.0936          1.0096
## raceblack           0.3626   0.3626          0.9973
## racehispan          0.1345   0.1345          1.1128
## racewhite           0.2281   0.2281          0.9669
## re74                0.1141   0.3918          0.7956
## re75                0.0772   0.1871          0.8317
## 
## Sample Sizes:
##           Control Treated
## All           429     185
## Matched       171     171
## Unmatched     132       0
## Discarded     126      14