library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot,
cobalt, MatchIt, Zelig, estimatr, cem, tidyverse,
lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
data(lalonde)
attach(lalonde)
table(treat)
## treat
## 0 1
## 429 185
# 0 429
# 1 185
?lalonde
## Help on topic 'lalonde' was found in the following packages:
##
## Package Library
## Matching /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
## cobalt /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
## MatchIt /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
##
##
## Using the first match ...
library(Matching)
library(dplyr)
library(broom)
lalonde <- lalonde %>%
mutate(
black = ifelse(race == "black", 1, 0),
white = ifelse(race == "white", 1, 0),
hisp = ifelse(race == "hispan", 1, 0)
)
summary_stats <- lalonde %>%
group_by(treat) %>%
summarise(
age_mean = mean(age, na.rm = TRUE),
age_sd = sd(age, na.rm = TRUE),
educ_mean = mean(educ, na.rm = TRUE),
educ_sd = sd(educ, na.rm = TRUE),
black_mean = mean(black, na.rm = TRUE),
black_sd = sd(black, na.rm = TRUE),
white_mean = mean(white, na.rm = TRUE),
white_sd = sd(white, na.rm = TRUE),
hisp_mean = mean(hisp, na.rm = TRUE),
hisp_sd = sd(hisp, na.rm = TRUE),
married_mean = mean(married, na.rm = TRUE),
married_sd = sd(married, na.rm = TRUE),
nodegree_mean = mean(nodegree, na.rm = TRUE),
nodegree_sd = sd(nodegree, na.rm = TRUE),
re74_mean = mean(re74, na.rm = TRUE),
re74_sd = sd(re74, na.rm = TRUE),
re75_mean = mean(re75, na.rm = TRUE),
re75_sd = sd(re75, na.rm = TRUE),
re78_mean = mean(re78, na.rm = TRUE),
re78_sd = sd(re78, na.rm = TRUE)
)
t_tests <- lalonde %>%
summarise(
age_t = tidy(t.test(age ~ treat))$p.value,
educ_t = tidy(t.test(educ ~ treat))$p.value,
black_t = tidy(t.test(black ~ treat))$p.value,
white_t = tidy(t.test(white ~ treat))$p.value,
hisp_t = tidy(t.test(hisp ~ treat))$p.value,
married_t = tidy(t.test(married ~ treat))$p.value,
nodegree_t = tidy(t.test(nodegree ~ treat))$p.value,
re74_t = tidy(t.test(re74 ~ treat))$p.value,
re75_t = tidy(t.test(re75 ~ treat))$p.value,
re78_t = tidy(t.test(re78 ~ treat))$p.value
)
summary_stats <- summary_stats %>%
bind_cols(t_tests)
print(summary_stats)
## # A tibble: 2 × 31
## treat age_mean age_sd educ_mean educ_sd black_mean black_sd white_mean
## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 28.0 10.8 10.2 2.86 0.203 0.403 0.655
## 2 1 25.8 7.16 10.3 2.01 0.843 0.365 0.0973
## # ℹ 23 more variables: white_sd <dbl>, hisp_mean <dbl>, hisp_sd <dbl>,
## # married_mean <dbl>, married_sd <dbl>, nodegree_mean <dbl>,
## # nodegree_sd <dbl>, re74_mean <dbl>, re74_sd <dbl>, re75_mean <dbl>,
## # re75_sd <dbl>, re78_mean <dbl>, re78_sd <dbl>, age_t <dbl>, educ_t <dbl>,
## # black_t <dbl>, white_t <dbl>, hisp_t <dbl>, married_t <dbl>,
## # nodegree_t <dbl>, re74_t <dbl>, re75_t <dbl>, re78_t <dbl>
On average, the treated group is relatively younger with much more black rather than white and hispanic participants. The average real income of the treated group is also significantly lower, while it rose to the same level with the untreated group after treatment. By looking at the summary stats, I hypothesize that the NSW training had a positive impact on helping disadvantaged group to improve their labor market outcome.
The conditional independence assumption states that the potential outcomes are independent of the treatment assignment so it ensures that the treatment effect is unbiased. The overlap condition states that for every observation there’s a positive probability receiving both treatment and control so it ensures that the treated and control groups are comparable. With these two conditions hold, the coefficient on treat is exactly the difference in the average outcomes between the treated and control groups.
library(Matching)
library(dplyr)
library(Jmisc)
lalonde <- lalonde %>%
mutate(
age_demean = demean(age),
educ_demean = demean(educ),
black_demean = demean(black),
hisp_demean = demean(hisp),
married_demean = demean(married),
nodegree_demean = demean(nodegree),
re74_demean = demean(re74),
re75_demean = demean(re75)
)
ate_model <- lm(re78 ~ treat + age_demean + educ_demean + black_demean + hisp_demean + married_demean + nodegree_demean + re74_demean + re75_demean, data = lalonde)
summary(ate_model)
##
## Call:
## lm(formula = re78 ~ treat + age_demean + educ_demean + black_demean +
## hisp_demean + married_demean + nodegree_demean + re74_demean +
## re75_demean, data = lalonde)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13595 -4894 -1662 3929 54570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.326e+03 3.661e+02 17.280 < 2e-16 ***
## treat 1.548e+03 7.813e+02 1.982 0.0480 *
## age_demean 1.298e+01 3.249e+01 0.399 0.6897
## educ_demean 4.039e+02 1.589e+02 2.542 0.0113 *
## black_demean -1.241e+03 7.688e+02 -1.614 0.1071
## hisp_demean 4.989e+02 9.419e+02 0.530 0.5966
## married_demean 4.066e+02 6.955e+02 0.585 0.5590
## nodegree_demean 2.598e+02 8.474e+02 0.307 0.7593
## re74_demean 2.964e-01 5.827e-02 5.086 4.89e-07 ***
## re75_demean 2.315e-01 1.046e-01 2.213 0.0273 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6948 on 604 degrees of freedom
## Multiple R-squared: 0.1478, Adjusted R-squared: 0.1351
## F-statistic: 11.64 on 9 and 604 DF, p-value: < 2.2e-16
On average, the NSW training program increased participants’ real income by $1548 in 1978. This result is statistically significant at 5%.
library(Matching)
library(dplyr)
library(cem)
library(tidyverse)
# one-to-one matching
covar <- cbind(lalonde$age, lalonde$educ, lalonde$black, lalonde$hisp, lalonde$married, lalonde$nodegree, lalonde$re74, lalonde$re75)
oto_match_model <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = covar, M = 1)
summary(oto_match_model)
##
## Estimate... 198.16
## AI SE...... 1280.7
## T-stat..... 0.15474
## p.val...... 0.87703
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 207
# exact matching
exact_match_model <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = covar, exact = T)
summary(exact_match_model)
##
## Estimate... -1106.3
## AI SE...... 174.05
## T-stat..... -6.3564
## p.val...... 2.0653e-10
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 13
## Matched number of observations (unweighted). 24
##
## Number of obs dropped by 'exact' or 'caliper' 172
## The one-to-one matching gives an ATT of 198.16, and all treated observations find a match. While the exact matching gives a way off estimate of -1106.3 with only 13 treated observations matched. The exact matching estimate is biased as the majority of treated observations cannot find a exact match from the control group so that the sample becomes very small.
# nearest neighbor matching (1:2)
ottwo_match_model <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = covar, M = 2)
summary(ottwo_match_model)
##
## Estimate... 768.62
## AI SE...... 1047.7
## T-stat..... 0.73363
## p.val...... 0.46318
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 385
# nearest neighbor matching (1:3)
otthree_match_model <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = covar, M = 3)
summary(otthree_match_model)
##
## Estimate... 1159.1
## AI SE...... 992.02
## T-stat..... 1.1684
## p.val...... 0.24263
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 583
# bias correction
unbiased_ottwo_match_model <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = covar, M = 2, BiasAdjust = T)
summary(unbiased_ottwo_match_model)
##
## Estimate... 762.27
## AI SE...... 1051.9
## T-stat..... 0.72465
## p.val...... 0.46867
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 385
unbiased_otthree_match_model <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = covar, M = 3, BiasAdjust = T)
summary(unbiased_otthree_match_model)
##
## Estimate... 1106.9
## AI SE...... 1002.6
## T-stat..... 1.1041
## p.val...... 0.26955
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 583
## Matching with bias correction produces slightly smaller estimates of ATT while the significance drops a little bit as well.
# coarsened exact matching
age_bins <- cut(lalonde$age, breaks = 6, include.lowest = T, include.highest = T, labels = F)
educ_bins <- cut(lalonde$educ, breaks = 6, include.lowest = T, include.highest = T, labels = F)
re74_bins <- cut(lalonde$re74, breaks = 6, include.lowest = T, include.highest = T, labels = F)
re75_bins <- cut(lalonde$re75, breaks = 6, include.lowest = T, include.highest = T, labels = F)
black_bins <- cut(lalonde$black, breaks = 2, include.lowest = T, include.highest = T, labels = F)
nodegree_bins <- cut(lalonde$nodegree, breaks = 2, include.lowest = T, include.highest = T, labels = F)
allbreaks <- list('age' = age_bins,
'educ' = educ_bins,
're74' = re74_bins,
're75' = re75_bins,
'black' = black_bins,
'nodegree' = nodegree_bins)
cem_model <- cem(treatment = "treat", data = lalonde, drop = "re78", cutpoints = allbreaks, keep.all = T)
##
## Using 'treat'='1' as baseline group
summary(cem_model)
## Length Class Mode
## call 3 -none- call
## strata 614 -none- numeric
## n.strata 1 -none- numeric
## vars 18 -none- character
## X 18 data.frame list
## drop 2 -none- character
## breaks 17 -none- list
## treatment 1 -none- character
## n 1 -none- numeric
## groups 614 factor numeric
## g.names 2 -none- character
## n.groups 1 -none- numeric
## group.idx 2 -none- list
## group.len 2 -none- numeric
## mstrata 614 -none- numeric
## mstrataID 26 -none- numeric
## matched 614 -none- logical
## baseline.group 1 -none- character
## tab 6 -none- numeric
## k2k 1 -none- logical
## w 614 -none- numeric
att(cem_model, re78 ~ treat, data = lalonde)
##
## G0 G1
## All 429 185
## Matched 78 68
## Unmatched 351 117
##
## Linear regression model on CEM matched data:
##
## SATT point estimate: 415.449186 (p.value=0.678605)
## 95% conf. interval: [-1545.639276, 2376.537649]
## Coarsened exact matching allows for more treated observations be able to find a match, but still more than half of them do not find a match. Compared to the exact matching estimate, this model gives a more reasonable estimate of ATT which is 415.45. However, the p-value is quite large.
library(dagitty)
library(ggdag)
##
## Attaching package: 'ggdag'
## The following object is masked from 'package:stats':
##
## filter
library(ggplot2)
library(dplyr)
library(stargazer)
library(boot)
library(kdensity)
library(MatchIt)
library(cobalt)
library(vtable)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:boot':
##
## aml
##
## Attaching package: 'survey'
## The following object is masked from 'package:dagitty':
##
## neighbours
## The following object is masked from 'package:graphics':
##
## dotchart
# DAG
dag <- dagitty("dag {
PS -> D
Education -> PS
Education -> Y
Age -> PS
Age -> Y
Income74 -> PS
Income74 -> Y
Income75 -> PS
Income75 -> Y
Black -> PS
Black -> Y
Hispanic -> PS
Hispanic -> Y
Married -> PS
Married -> Y
NoDegree -> PS
NoDegree -> Y
}")
ggdag(dag) +
theme_dag()
## All the control variables are related to the propensity score and outcome. By controlling for propensity score, we can close the back doors between D and control variables and estimate the treatment effect.
# descriptive graphs
ggplot(lalonde, aes(x = age, y = re74, color = factor(treat))) +
geom_point() +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Age vs Real Earnings in 1974", x = "Age", y = "Real Earnings in 1974") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = age, y = re75, color = factor(treat))) +
geom_point() +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Age vs Real Earnings in 1975", x = "Age", y = "Real Earnings in 1975") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = age, y = re78, color = factor(treat))) +
geom_point() +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Age vs Real Earnings in 1978", x = "Age", y = "Real Earnings in 1978") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = age, fill = factor(treat))) +
geom_histogram(position = "dodge", bins = 20) +
labs(title = "Distribution of Age between NSW Non-Participants and Participants", x = "Age", y = "Count") +
theme_minimal()
ggplot(lalonde, aes(x = educ, y = re74, color = factor(treat))) +
geom_point() +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Education vs Real Earnings in 1974", x = "Years of Schooling", y = "Real Earnings in 1974") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = educ, y = re75, color = factor(treat))) +
geom_point() +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Education vs Real Earnings in 1975", x = "Years of Schooling", y = "Real Earnings in 1975") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = educ, y = re78, color = factor(treat))) +
geom_point() +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Education vs Real Earnings in 1978", x = "Years of Schooling", y = "Real Earnings in 1978") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
ggplot(lalonde, aes(x = educ, fill = factor(treat))) +
geom_histogram(position = "dodge", bins = 20) +
labs(title = "Distribution of Years of Schooling between NSW Non-Participants and Participants", x = "Years of Schooling", y = "Count") +
theme_minimal()
## The relationship between age and real income is non-linear, but it flatters out after treatment. In 1974, the control group has higher average real income than the treated group at all ages. In 1975, the control group has higher average real income than the treated group at all ages except between 33-36. After treatment, in 1978, the two curves moves closer and overlaps for younger population. The relationship between education and real income has a kink at around 10-year, but it becomes a smooth positive linear curve after treatment. Similar to age, the control group has higher average real income than the treated group at all education levels in 1975, and this relationship changes in 1978. Overall, the NSW training group brought the average real income of the two groups closer or the same when controlling for age and education.
# propensity score matching
logit_model <- glm(treat ~ poly(age, 2) + poly(educ, 2) + black + hisp + married + nodegree + re74 + re75,
family = binomial, data = lalonde)
lalonde$psfit <- fitted(logit_model)
stargazer(logit_model, type = "text", title = "Logit Model of Probability of Participation",
covariate.labels = c("Age (Polynomial)", "Education (Polynomial)", "Black", "Hispanic", "Married", "No Degree", "Earnings 1974", "Earnings 1975"),
dep.var.labels = "Participation",
out = "fitted_propensity_score.txt")
##
## Logit Model of Probability of Participation
## ==================================================
## Dependent variable:
## ---------------------------
## Participation
## --------------------------------------------------
## Age (Polynomial) -0.372
## (4.724)
##
## Education (Polynomial) -30.716***
## (4.966)
##
## Black 2.917
## (5.105)
##
## Hispanic -14.247***
## (4.760)
##
## Married 3.043***
## (0.305)
##
## No Degree 0.685
## (0.454)
##
## Earnings 1974 -1.450***
## (0.326)
##
## Earnings 1975 0.433
## (0.415)
##
## re74 -0.0001***
## (0.00003)
##
## re75 0.00002
## (0.00005)
##
## Constant -2.262***
## (0.394)
##
## --------------------------------------------------
## Observations 614
## Log Likelihood -212.994
## Akaike Inf. Crit. 447.989
## ==================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
ps_match_model <- Match(Y = lalonde$re78, Tr = lalonde$treat, X = lalonde$psfit, M = 3, BiasAdjust = T)
summary(ps_match_model)
##
## Estimate... 1470.8
## AI SE...... 1131.3
## T-stat..... 1.3001
## p.val...... 0.19358
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 603
## The standard errors are calculated after matching so that the bias-adjustment option does not take into account the variability from the logit estimation.
# bootstrapping
ps_att <- function(lalonde, indices) {
d <- lalonde[indices, ]
boot_model <- Match(Y = d$re78, Tr = d$treat, X = d$psfit, M = 3, BiasAdjust = T)
return(boot_model$est)
}
set.seed(123)
boot_results <- boot(data = lalonde, statistic = ps_att, R = 1000)
boot_estimate <- mean(boot_results$t)
boot_se <- sd(boot_results$t)
print(paste("Bootstrap ATT:", boot_estimate))
## [1] "Bootstrap ATT: 1494.00286372011"
print(paste("Bootstrap Standard Error:", boot_se))
## [1] "Bootstrap Standard Error: 992.784955013952"
## With bootstrapping, the estimated ATT is 1494 (non-bootstrap estimate is 1470.8) and the standard error is smaller comparatively which increases the t stats by 0.2.
# quality of propensity score matching
summary_psfit <- lalonde %>%
group_by(treat) %>%
summarise(
mean_psfit = mean(psfit, na.rm = TRUE),
sd_psfit = sd(psfit, na.rm = TRUE),
min_psfit = min(psfit, na.rm = TRUE),
max_psfit = max(psfit, na.rm = TRUE),
median_psfit = median(psfit, na.rm = TRUE)
)
print(summary_psfit)
## # A tibble: 2 × 6
## treat mean_psfit sd_psfit min_psfit max_psfit median_psfit
## <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0 0.156 0.216 0.0000251 0.936 0.0590
## 2 1 0.639 0.260 0.0282 0.957 0.690
psfitdens1 <- kdensity(lalonde$psfit[lalonde$treat == 1])
plot(psfitdens1, main = "Density of Fitted Propensity Score for Participants", xlab = "Fitted Propensity Score", ylab = "Density")
psfitdens0 <- kdensity(lalonde$psfit[lalonde$treat == 0])
plot(psfitdens0, main = "Density of Fitted Propensity Score for Non-Participants", xlab = "Fitted Propensity Score", ylab = "Density")
ggplot(lalonde, aes(x = psfit, fill = as.factor(treat))) +
geom_histogram(position = "dodge", bins = 30, alpha = 0.5) +
labs(title = "Histogram of Fitted Propensity Score for Participants and Non-Participants",
x = "Fitted Propensity Score", y = "Count", fill = "Treatment Status") +
theme_minimal()
## The overlap condition is likely satisfied, but there are small groups of the treated group on the right side that don't have overlapping control groups.
common_support <- lalonde %>%
filter(psfit >= min(psfit[treat == 1]) & psfit <= max(psfit[treat == 0]))
common_support_model <- Match(Y = common_support$re78, Tr = common_support$treat, X = common_support$psfit, M = 3, BiasAdjust = T)
summary(common_support_model)
##
## Estimate... 1177.5
## AI SE...... 976.75
## T-stat..... 1.2056
## p.val...... 0.22798
##
## Original number of observations.............. 484
## Original number of treated obs............... 172
## Matched number of observations............... 172
## Matched number of observations (unweighted). 559
matchit_model <- matchit(treat ~ psfit, data = common_support, method = "nearest", ratio = 3)
matched_data <- match.data(matchit_model)
matchit_te <- matched_data %>%
summarise(
treat_mean = mean(re78[treat == 1], na.rm = TRUE),
control_mean = mean(re78[treat == 0], na.rm = TRUE),
matchit_te = treat_mean - control_mean
)
print(matchit_te)
## treat_mean control_mean matchit_te
## 1 6205.398 5961.125 244.2726
balance_table <- bal.tab(ps_match_model, data = lalonde, treat = "treat", covs = c("age", "educ", "re74", "re75", "black", "hisp"))
print(balance_table)
## Balance Measures
## Type Diff.Adj
## age Contin. -0.1113
## educ Contin. -0.0715
## re74 Contin. 0.0196
## re75 Contin. 0.0535
## black Binary -0.0041
## hisp Binary -0.0160
##
## Sample sizes
## Control Treated
## All 429. 185
## Matched (ESS) 44.53 185
## Matched (Unweighted) 158. 185
## Unmatched 271. 0
## The balance table shows good balance after matching.
# weighting method
lalonde$weights <- ifelse(lalonde$treat == 1, 1 / lalonde$psfit, 1 / (1 - lalonde$psfit))
design <- svydesign(ids = ~1, weights = ~weights, data = lalonde)
weight_te <- svyglm(re78 ~ treat, design = design)
summary(weight_te)
##
## Call:
## svyglm(formula = re78 ~ treat, design = design)
##
## Survey design:
## svydesign(ids = ~1, weights = ~weights, data = lalonde)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6268.8 392.9 15.96 <2e-16 ***
## treat 1192.7 1612.3 0.74 0.46
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 61384097)
##
## Number of Fisher Scoring iterations: 2