library(pacman)
## Warning: package 'pacman' was built under R version 4.4.2
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, cobalt, Matchit, Zelig, estimatr, cem, tidyverse, lubridate, usmap, gridExtra, stringr, readxl, plot3D, cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer, foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, kableExtra, snakecase, janitor)
## Warning: package 'Matchit' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: Perhaps you meant 'MatchIt' ?
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## Warning: 'BiocManager' not available. Could not check Bioconductor.
##
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Matchit'
## Warning: package 'Zelig' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## Warning: 'BiocManager' not available. Could not check Bioconductor.
##
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Zelig'
## Warning in p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, : Failed to install/load:
## Matchit, Zelig
data(lalonde)
attach(lalonde)
table(treat)
## treat
## 0 1
## 429 185
library(dplyr)
library(stargazer)
balance_table <- lalonde %>%
select(-race) %>%
group_by(treat) %>%
summarise(across(everything(),
list(mean = ~ mean(. , na.rm = TRUE),
sd = ~ sd(. , na.rm = TRUE)),
.names = "{.col}_{.fn}")) %>%
pivot_longer(-treat, names_to = c("Variable", ".value"), names_sep = "_") %>%
pivot_wider(names_from = treat, values_from = c(mean, sd), names_prefix = "Group_")
t_tests <- lalonde %>%
select(-race) %>%
summarise(across(-treat, ~ t.test(. ~ treat)$p.value, .names = "p_value_{.col}")) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "p_value", names_prefix = "p_value_")
# Combine summary statistics with t-test results
balance_table <- left_join(balance_table, t_tests, by = "Variable")
# Display the balance table
stargazer(balance_table, type = "text", summary = FALSE, title = "Balance Table")
##
## Balance Table
## =======================================================================================================
## Variable mean_Group_0 mean_Group_1 sd_Group_0 sd_Group_1 p_value
## -------------------------------------------------------------------------------------------------------
## 1 age 28.030303030303 25.8162162162162 10.7866530174644 7.15501927478618 0.00291427440260211
## 2 educ 10.2354312354312 10.3459459459459 2.85523844087595 2.01065025640563 0.584797660188313
## 3 married 0.512820512820513 0.189189189189189 0.500419186851279 0.392721679149236 1.46127767613592e-16
## 4 nodegree 0.596736596736597 0.708108108108108 0.491125522231798 0.455866577054302 0.00698222507160265
## 5 re74 5619.23650638695 2095.57368864865 6788.75079645538 4886.62035315108 1.74783141336639e-12
## 6 re75 2466.48444312354 1532.05531378378 3291.99618331065 3219.2508699216 0.00115008478712992
## 7 re78 6984.16974230769 6349.14353027027 7294.1617908684 7867.40221773469 0.349076655556669
## -------------------------------------------------------------------------------------------------------
Data is not balanced, showing significant differences in key covariates, such as earnings.
Let Y(1) and Y(0) be the potential outcomes under treated and control, respectively. ATE is defined as E[Y(1)-Y(0)]. If the conditional independence assumption holds, potential outcomes for both treated and control are independent of their treatment status assignment given X. Therefore, controlling for all X in the OLS regression is equivalent to taking treatment assginment as random.
lalonde <- lalonde %>%
mutate(across(c(age, educ, married, nodegree, re74, re75),
~ . - mean(.),
.names = "demeaned_{.col}"))
model <- lm(re78 ~ treat + age + race + educ + married + nodegree + re74 + re75 +
treat * demeaned_age + treat * demeaned_educ +
treat * demeaned_married + treat * demeaned_nodegree +
treat * demeaned_re74 + treat * demeaned_re75 + treat*race, data = lalonde)
# Print summary
summary(model)
##
## Call:
## lm(formula = re78 ~ treat + age + race + educ + married + nodegree +
## re74 + re75 + treat * demeaned_age + treat * demeaned_educ +
## treat * demeaned_married + treat * demeaned_nodegree + treat *
## demeaned_re74 + treat * demeaned_re75 + treat * race, data = lalonde)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14685 -4703 -1544 3742 53958
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.96622 2841.98257 0.013 0.98991
## treat 1114.02887 1012.54820 1.100 0.27168
## age -20.26636 36.44845 -0.556 0.57840
## racehispan 1663.63073 1172.15266 1.419 0.15634
## racewhite 1167.54240 881.61270 1.324 0.18590
## educ 322.45790 181.68288 1.775 0.07643 .
## married -52.68613 803.52341 -0.066 0.94774
## nodegree 480.37271 1012.76977 0.474 0.63545
## re74 0.37671 0.06558 5.744 1.48e-08 ***
## re75 0.33976 0.12186 2.788 0.00547 **
## demeaned_age NA NA NA NA
## demeaned_educ NA NA NA NA
## demeaned_married NA NA NA NA
## demeaned_nodegree NA NA NA NA
## demeaned_re74 NA NA NA NA
## demeaned_re75 NA NA NA NA
## treat:demeaned_age 103.82817 83.55223 1.243 0.21448
## treat:demeaned_educ 301.50306 392.61449 0.768 0.44283
## treat:demeaned_married 1085.12641 1614.09954 0.672 0.50167
## treat:demeaned_nodegree -799.38196 1886.14021 -0.424 0.67185
## treat:demeaned_re74 -0.33724 0.15513 -2.174 0.03010 *
## treat:demeaned_re75 -0.25104 0.24670 -1.018 0.30928
## treat:racehispan -219.29010 2480.50005 -0.088 0.92958
## treat:racewhite -27.52841 1954.53950 -0.014 0.98877
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6904 on 596 degrees of freedom
## Multiple R-squared: 0.1698, Adjusted R-squared: 0.1461
## F-statistic: 7.168 on 17 and 596 DF, p-value: 4.558e-16
The point estimate shows NWS does have positive effect on post-earnings, but the estimate is not significant.
library(Matching)
library(cem)
unique(lalonde$race)
## [1] black hispan white
## Levels: black hispan white
lalonde <- lalonde %>%
mutate(black = ifelse(race == "black", 1, 0),
hisp = ifelse(race == "hispan", 1, 0)
)
Y <- lalonde$re78 # Outcome variable (earnings in 1978)
Tr <- lalonde$treat # Treatment variable
#X <- lalonde %>%
# select(age, educ, married, race, nodegree, re74, re75) # Covariates
X <- lalonde %>% mutate(
black = as.numeric(black),
hisp = as.numeric(hisp)
) %>% select(age, educ, married, nodegree, black, hisp, re74, re75) # Select only numeric variables
match_1to1 <- Match(Y = Y, Tr = Tr, X = X, M = 1)
summary(match_1to1) # Estimated ATT and standard error
##
## Estimate... 198.16
## AI SE...... 1280.7
## T-stat..... 0.15474
## p.val...... 0.87703
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 207
match_exact <- Match(Y = Y, Tr = Tr, X = X, exact = TRUE)
summary(match_exact)
##
## Estimate... -1106.3
## AI SE...... 174.05
## T-stat..... -6.3564
## p.val...... 2.0653e-10
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 13
## Matched number of observations (unweighted). 24
##
## Number of obs dropped by 'exact' or 'caliper' 172
match_1to2 <- Match(Y = Y, Tr = Tr, X = X, M = 2)
summary(match_1to2)
##
## Estimate... 768.62
## AI SE...... 1047.7
## T-stat..... 0.73363
## p.val...... 0.46318
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 385
match_1to3 <- Match(Y = Y, Tr = Tr, X = X, M = 3)
summary(match_1to3)
##
## Estimate... 1159.1
## AI SE...... 992.02
## T-stat..... 1.1684
## p.val...... 0.24263
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 583
match_1to2_bias <- Match(Y = Y, Tr = Tr, X = X, M = 2, BiasAdjust = TRUE)
summary(match_1to2_bias)
##
## Estimate... 762.27
## AI SE...... 1051.9
## T-stat..... 0.72465
## p.val...... 0.46867
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 385
cem_match <- cem(
treatment = "treat",
data = lalonde,
drop = "race", # Drop the original categorical race variable
keep.all = TRUE
)
##
## Using 'treat'='1' as baseline group
att_cem <- att(cem_match, re78 ~ treat, data = lalonde)
print(att_cem)
##
## G0 G1
## All 429 185
## Matched 43 45
## Unmatched 386 140
##
## Linear regression model on CEM matched data:
##
## SATT point estimate: 629.570617 (p.value=0.532848)
## 95% conf. interval: [-1340.972695, 2600.113930]
print(att_cem)
##
## G0 G1
## All 429 185
## Matched 43 45
## Unmatched 386 140
##
## Linear regression model on CEM matched data:
##
## SATT point estimate: 629.570617 (p.value=0.532848)
## 95% conf. interval: [-1340.972695, 2600.113930]
Resutls are similar. Exact matching, once controlled for bias, yielded 762, close to CEM estimate.
library(dagitty)
## Warning: package 'dagitty' was built under R version 4.4.2
library(ggdag)
## Warning: package 'ggdag' was built under R version 4.4.2
##
## Attaching package: 'ggdag'
## The following object is masked from 'package:stats':
##
## filter
# Define the DAG
dag <- dagitty("
dag {
U [unobserved]
NSW -> re78
age -> NSW
educ -> NSW
race -> NSW
re74 -> NSW
re75 -> NSW
age -> re78
educ -> re78
race -> re78
re74 -> re78
re75 -> re78
U -> NSW
U -> re78
}")
# Plot the DAG
ggdag(dag) + theme_dag()
Treatment status is subject to selection bias, influenced by age, education, race, etc. These covariates also affect post NWS earnings. Unobserved factors can potentially affect both treat status and outcome.
library(ggplot2)
ggplot(lalonde, aes(x = age, y = re74)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess") +
ggtitle("Age vs. Earnings in 1974")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = age, y = re75)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess") +
ggtitle("Age vs. Earnings in 1975")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = age, y = re78)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess") +
ggtitle("Age vs. Earnings in 1978")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = age, fill = as.factor(treat))) +
geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
ggtitle("Age Distribution: Participants vs. Non-Participants")
ggplot(lalonde, aes(x = educ, y = re74)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess") +
ggtitle("Education vs. Earnings in 1974")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = educ, y = re75)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess") +
ggtitle("Education vs. Earnings in 1975")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = educ, y = re78)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "loess") +
ggtitle("Education vs. Earnings in 1978")
## `geom_smooth()` using formula = 'y ~ x'
library(Matching)
# Logit model for propensity score estimation
ps_model <- glm(treat ~ poly(age, 2) + poly(educ, 2) + race + re74 + re75,
family = binomial(), data = lalonde)
# Store the propensity scores
lalonde$psfit <- fitted(ps_model)
# Display regression results
library(stargazer)
stargazer(ps_model, type = "text")
##
## =============================================
## Dependent variable:
## ---------------------------
## treat
## ---------------------------------------------
## poly(age, 2)1 -4.989
## (4.417)
##
## poly(age, 2)2 -26.030***
## (4.625)
##
## poly(educ, 2)1 0.815
## (3.528)
##
## poly(educ, 2)2 -14.658***
## (4.330)
##
## racehispan -2.374***
## (0.384)
##
## racewhite -3.292***
## (0.301)
##
## re74 -0.0001***
## (0.00003)
##
## re75 -0.00002
## (0.00005)
##
## Constant 0.775***
## (0.174)
##
## ---------------------------------------------
## Observations 614
## Log Likelihood -224.020
## Akaike Inf. Crit. 466.040
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
match_nn <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = lalonde$psfit,
M = 3, BiasAdjust = T)
summary(match_nn)
##
## Estimate... 1335.9
## AI SE...... 1138.5
## T-stat..... 1.1734
## p.val...... 0.24063
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 623
library(boot)
boot_fn <- function(data, indices) {
d <- data[indices, ]
match_boot <- Match(Y = d$re78, Tr = d$treat, X = d$psfit, M = 3, BiasAdjust = T)
return(match_boot$est)
}
set.seed(123)
boot_results <- boot(data = lalonde, statistic = boot_fn, R = 1000)
boot.ci(boot_results, type = "perc")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_results, type = "perc")
##
## Intervals :
## Level Percentile
## 95% (-1160, 3575 )
## Calculations and Intervals on Original Scale
summary(lalonde$psfit[lalonde$treat == 1]) # Participants
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03456 0.50732 0.66893 0.61782 0.83630 0.90621
summary(lalonde$psfit[lalonde$treat == 0]) # Non-Participants
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000476 0.0304701 0.0651742 0.1648084 0.1723068 0.8907984
ggplot(lalonde, aes(x = psfit, fill = as.factor(treat))) +
geom_histogram(position = "identity", alpha = 0.5, bins = 20) +
ggtitle("Histogram of Propensity Scores")
# Visual check
plot(density(lalonde$psfit[lalonde$treat == 1]), col = "red", main = "Propensity Score Overlap")
lines(density(lalonde$psfit[lalonde$treat == 0]), col = "blue")
legend("topright", legend = c("Participants", "Non-Participants"), col = c("red", "blue"), lty = 1)
library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.4.3
##
## Attaching package: 'MatchIt'
## The following object is masked _by_ '.GlobalEnv':
##
## lalonde
## The following object is masked from 'package:cobalt':
##
## lalonde
# Nearest neighbor matching with common support
matchit_model <- matchit(treat ~ poly(age, 2) + poly(educ, 2) + race + re74 + re75,
method = "nearest", data = lalonde, discard = "both")
summary(matchit_model)
##
## Call:
## matchit(formula = treat ~ poly(age, 2) + poly(educ, 2) + race +
## re74 + re75, data = lalonde, method = "nearest", discard = "both")
##
## Summary of Balance for All Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.6178 0.1648 1.7791 1.3626
## `poly(age, 2)`1 -0.0063 0.0027 -0.3094 0.4400
## `poly(age, 2)`2 -0.0113 0.0049 -0.5458 0.4641
## `poly(educ, 2)`1 0.0012 -0.0005 0.0550 0.4959
## `poly(educ, 2)`2 -0.0090 0.0039 -0.5579 0.2611
## raceblack 0.8432 0.2028 1.7615 .
## racehispan 0.0595 0.1422 -0.3498 .
## racewhite 0.0973 0.6550 -1.8819 .
## re74 2095.5737 5619.2365 -0.7211 0.5181
## re75 1532.0553 2466.4844 -0.2903 0.9563
## eCDF Mean eCDF Max
## distance 0.3991 0.6676
## `poly(age, 2)`1 0.0819 0.1577
## `poly(age, 2)`2 0.0801 0.1746
## `poly(educ, 2)`1 0.0339 0.1114
## `poly(educ, 2)`2 0.0659 0.2018
## raceblack 0.6404 0.6404
## racehispan 0.0827 0.0827
## racewhite 0.5577 0.5577
## re74 0.2248 0.4470
## re75 0.1342 0.2876
##
## Summary of Balance for Matched Data:
## Means Treated Means Control Std. Mean Diff. Var. Ratio
## distance 0.5949 0.3532 0.9491 1.0662
## `poly(age, 2)`1 -0.0078 -0.0048 -0.1013 0.9296
## `poly(age, 2)`2 -0.0085 -0.0107 0.0743 0.7790
## `poly(educ, 2)`1 0.0017 -0.0031 0.1541 0.7599
## `poly(educ, 2)`2 -0.0081 -0.0055 -0.1114 0.6182
## raceblack 0.8304 0.4678 0.9973 .
## racehispan 0.0643 0.1988 -0.5688 .
## racewhite 0.1053 0.3333 -0.7696 .
## re74 2267.1411 3278.9991 -0.2071 1.3251
## re75 1605.2408 2081.5170 -0.1479 1.1556
## eCDF Mean eCDF Max Std. Pair Dist.
## distance 0.1298 0.4327 0.9492
## `poly(age, 2)`1 0.0319 0.1287 1.1900
## `poly(age, 2)`2 0.0449 0.1520 1.3555
## `poly(educ, 2)`1 0.0239 0.0819 1.1140
## `poly(educ, 2)`2 0.0210 0.0936 1.0096
## raceblack 0.3626 0.3626 0.9973
## racehispan 0.1345 0.1345 1.1128
## racewhite 0.2281 0.2281 0.9669
## re74 0.1141 0.3918 0.7956
## re75 0.0772 0.1871 0.8317
##
## Sample Sizes:
## Control Treated
## All 429 185
## Matched 171 171
## Unmatched 132 0
## Discarded 126 14