## What is the impact of employment training on earnings?

# Testing for imbalance between groups ----
# Pre-treatment covariates whose group-wise mean and variance are compared.
balance_vars <- c(
  "re74", "re75", "u74", "u75", "age", "education",
  "black", "married", "nodegree", "hispanic"
)

# summarise_at() is superseded; across() is the current dplyr idiom. Two
# across() calls keep every mean ahead of every variance, which is the column
# order summarise_at() produced with a list of functions.
check <- LL %>%
  group_by(treated) %>%
  summarise(
    across(all_of(balance_vars), mean, .names = "{.col}_mean"),
    across(all_of(balance_vars), var, .names = "{.col}_var")
  )

# Transpose so each statistic is a row and each treatment group is a column.
round(t(check), 3)
# imbalance() comes from the cem package. Every variable that should NOT be
# tested for balance between groups goes in `drop`: always the dependent
# variable and the treatment indicator, plus any others as needed.
imbalance_raw <- imbalance(
  group = LL$treated,
  data = LL,
  drop = c("re78", "treated")
)
imbalance_raw
# Match exact ----
# The matching call is written like a regression: treatment indicator on the
# left-hand side, matching covariates on the right.
match_exact <- matchit(
  treated ~ re74 + re75 + u74 + u75 + age + education +
    black + married + nodegree + hispanic,
  method = "exact",
  data = LL
)

# New data frame that holds only the matched cases.
data_exact <- match.data(match_exact)

summary(match_exact)
Call:
matchit(formula = treated ~ re74 + re75 + u74 + u75 + age + education +
black + married + nodegree + hispanic, data = LL, method = "exact")
Summary of Balance for All Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74 3570.9990 3672.4852 -0.0176 0.7837 0.0127
re75 3066.0982 3026.6827 0.0081 0.8784 0.0250
u74 0.4411 0.4612 -0.0405 . 0.0201
u75 0.3737 0.4188 -0.0932 . 0.0451
age 24.6263 24.4471 0.0268 1.0294 0.0127
education 10.3805 10.1882 0.1058 1.2610 0.0173
black 0.8013 0.8000 0.0034 . 0.0013
married 0.1684 0.1576 0.0286 . 0.0107
nodegree 0.7306 0.8141 -0.1882 . 0.0835
hispanic 0.0943 0.1129 -0.0639 . 0.0187
eCDF Max
re74 0.0316
re75 0.0491
u74 0.0201
u75 0.0451
age 0.0459
education 0.0835
black 0.0013
married 0.0107
nodegree 0.0835
hispanic 0.0187
Summary of Balance for Matched Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74 0.0000 0.0000 0 . 0
re75 0.0000 0.0000 0 . 0
u74 1.0000 1.0000 0 . 0
u75 1.0000 1.0000 0 . 0
age 23.5818 23.5818 0 0.9963 0
education 10.4364 10.4364 0 0.9963 0
black 1.0000 1.0000 0 . 0
married 0.0545 0.0545 0 . 0
nodegree 0.8000 0.8000 0 . 0
hispanic 0.0000 0.0000 0 . 0
eCDF Max Std. Pair Dist.
re74 0 0
re75 0 0
u74 0 0
u75 0 0
age 0 0
education 0 0
black 0 0
married 0 0
nodegree 0 0
hispanic 0 0
Sample Sizes:
Control Treated
All 425. 297
Matched (ESS) 45.9 55
Matched 74. 55
Unmatched 351. 242
Discarded 0. 0
# Imbalance on the exact-matched sample. With matched data, `weights` and
# `subclass` must also be listed in `drop`.
imbalance_exact <- imbalance(
  group = data_exact$treated,
  data = data_exact,
  drop = c("re78", "treated", "weights", "subclass")
)
imbalance_exact
# Match coarsened exact ----
# Same formula as the exact match; only the matching method changes.
match_cem <- matchit(
  treated ~ re74 + re75 + u74 + u75 + age + education +
    black + married + nodegree + hispanic,
  method = "cem",
  data = LL
)

# New data frame that holds only the matched cases.
data_cem <- match.data(match_cem)

summary(match_cem)
Call:
matchit(formula = treated ~ re74 + re75 + u74 + u75 + age + education +
black + married + nodegree + hispanic, data = LL, method = "cem")
Summary of Balance for All Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74 3570.9990 3672.4852 -0.0176 0.7837 0.0127
re75 3066.0982 3026.6827 0.0081 0.8784 0.0250
u74 0.4411 0.4612 -0.0405 . 0.0201
u75 0.3737 0.4188 -0.0932 . 0.0451
age 24.6263 24.4471 0.0268 1.0294 0.0127
education 10.3805 10.1882 0.1058 1.2610 0.0173
black 0.8013 0.8000 0.0034 . 0.0013
married 0.1684 0.1576 0.0286 . 0.0107
nodegree 0.7306 0.8141 -0.1882 . 0.0835
hispanic 0.0943 0.1129 -0.0639 . 0.0187
eCDF Max
re74 0.0316
re75 0.0491
u74 0.0201
u75 0.0451
age 0.0459
education 0.0835
black 0.0013
married 0.0107
nodegree 0.0835
hispanic 0.0187
Summary of Balance for Matched Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74 1240.3217 1179.7536 0.0105 1.1933 0.0108
re75 1129.0326 1066.6980 0.0128 1.3349 0.0134
u74 0.5500 0.5500 0.0000 . 0.0000
u75 0.5000 0.5000 0.0000 . 0.0000
age 22.7312 22.7144 0.0025 1.0417 0.0093
education 10.2875 10.3133 -0.0142 1.0196 0.0018
black 0.9188 0.9188 0.0000 . 0.0000
married 0.0562 0.0563 -0.0000 . 0.0000
nodegree 0.8000 0.8000 0.0000 . 0.0000
hispanic 0.0562 0.0562 0.0000 . 0.0000
eCDF Max Std. Pair Dist.
re74 0.0450 0.0809
re75 0.0412 0.1073
u74 0.0000 0.0000
u75 0.0000 0.0000
age 0.0510 0.1589
education 0.0258 0.1781
black 0.0000 0.0000
married 0.0000 0.0000
nodegree 0.0000 0.0000
hispanic 0.0000 0.0000
Sample Sizes:
Control Treated
All 425. 297
Matched (ESS) 129.31 160
Matched 213. 160
Unmatched 212. 137
Discarded 0. 0
# Imbalance on the coarsened-exact-matched sample. `weights` and `subclass`
# are added by the matching procedure, so they are dropped here and kept out
# of the L1 test.
imbalance_cem <- imbalance(
  group = data_cem$treated,
  data = data_cem,
  drop = c("re78", "treated", "weights", "subclass")
)
imbalance_cem
# Coarsened exact matching with user-chosen coarsening, supplied through the
# `cutpoints` argument. Continuous variables not listed there are coarsened
# automatically at points the procedure chooses.
match_cem2 <- matchit(
  treated ~ re74 + re75 + u74 + u75 + age + education +
    black + married + nodegree + hispanic,
  method = "cem",
  data = LL,
  cutpoints = list(
    # Custom age bins
    age = c(25, 35, 45, 55, 65),
    # Custom education bins
    education = c(8, 12, 16),
    # A single number asks for that many bins for re74.
    # NOTE(review): the earlier comment described these as quantile-based;
    # confirm against ?method_cem how the breaks are actually placed.
    re74 = 4,
    # Custom income cutoffs
    re75 = c(0, 5000, 10000, 20000)
  )
)

# New data frame that holds only the matched cases.
data_cem2 <- match.data(match_cem2)

summary(match_cem2)
Call:
matchit(formula = treated ~ re74 + re75 + u74 + u75 + age + education +
black + married + nodegree + hispanic, data = LL, method = "cem",
cutpoints = list(age = c(25, 35, 45, 55, 65), education = c(8,
12, 16), re74 = 4, re75 = c(0, 5000, 10000, 20000)))
Summary of Balance for All Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74 3570.9990 3672.4852 -0.0176 0.7837 0.0127
re75 3066.0982 3026.6827 0.0081 0.8784 0.0250
u74 0.4411 0.4612 -0.0405 . 0.0201
u75 0.3737 0.4188 -0.0932 . 0.0451
age 24.6263 24.4471 0.0268 1.0294 0.0127
education 10.3805 10.1882 0.1058 1.2610 0.0173
black 0.8013 0.8000 0.0034 . 0.0013
married 0.1684 0.1576 0.0286 . 0.0107
nodegree 0.7306 0.8141 -0.1882 . 0.0835
hispanic 0.0943 0.1129 -0.0639 . 0.0187
eCDF Max
re74 0.0316
re75 0.0491
u74 0.0201
u75 0.0451
age 0.0459
education 0.0835
black 0.0013
married 0.0107
nodegree 0.0835
hispanic 0.0187
Summary of Balance for Matched Data:
Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74 2307.4296 2387.2876 -0.0138 0.9598 0.0077
re75 2177.0038 2262.3725 -0.0175 0.9898 0.0121
u74 0.4573 0.4573 0.0000 . 0.0000
u75 0.3974 0.3974 0.0000 . 0.0000
age 23.5128 23.7448 -0.0347 0.9772 0.0108
education 10.3462 10.4002 -0.0297 1.0433 0.0045
black 0.8547 0.8547 -0.0000 . 0.0000
married 0.1068 0.1068 -0.0000 . 0.0000
nodegree 0.7778 0.7778 0.0000 . 0.0000
hispanic 0.0726 0.0726 0.0000 . 0.0000
eCDF Max Std. Pair Dist.
re74 0.0429 0.1705
re75 0.0383 0.1405
u74 0.0000 0.0000
u75 0.0000 0.0000
age 0.0441 0.3766
education 0.0239 0.5744
black 0.0000 0.0000
married 0.0000 0.0000
nodegree 0.0000 0.0000
hispanic 0.0000 0.0000
Sample Sizes:
Control Treated
All 425. 297
Matched (ESS) 259.28 234
Matched 337. 234
Unmatched 88. 63
Discarded 0. 0
# Imbalance on the sample matched with the custom cutpoints; the columns added
# by matching (`weights`, `subclass`) are dropped as before.
imbalance_cem2 <- imbalance(
  group = data_cem2$treated,
  data = data_cem2,
  drop = c("re78", "treated", "weights", "subclass")
)
imbalance_cem2
# Baseline comparison: t-test of the dependent variable by treatment status,
# with no control variables.
t.test(LL$re78 ~ LL$treated)
Welch Two Sample t-test
data: LL$re78 by LL$treated
t = -1.8154, df = 557.06, p-value = 0.06999
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
-1845.25058 72.64296
sample estimates:
mean in group 0 mean in group 1
5090.048 5976.352
# Linear regression on the raw (unmatched) data.
lm1 <- lm(
  re78 ~ treated + re74 + age + education + black + married + nodegree +
    hispanic,
  data = LL
)
summary(lm1)
Call:
lm(formula = re78 ~ treated + re74 + age + education + black +
married + nodegree + hispanic, data = LL)
Residuals:
Min 1Q Median 3Q Max
-9833 -4448 -1506 3095 55072
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.519e+03 2.609e+03 1.349 0.177869
treated 8.346e+02 4.678e+02 1.784 0.074840 .
re74 1.409e-01 3.771e-02 3.735 0.000203 ***
age 2.013e+01 3.616e+01 0.557 0.578001
education 1.969e+02 1.793e+02 1.098 0.272534
black -1.520e+03 7.990e+02 -1.903 0.057460 .
married 1.530e+02 6.476e+02 0.236 0.813333
nodegree -3.262e+02 7.443e+02 -0.438 0.661317
hispanic 1.216e+02 1.046e+03 0.116 0.907507
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 6150 on 713 degrees of freedom
Multiple R-squared: 0.04342, Adjusted R-squared: 0.03269
F-statistic: 4.046 on 8 and 713 DF, p-value: 0.0001017
# Same model summarised with three-digit rounding and variance inflation
# factors.
summ(
  lm1,
  digits = 3,
  vif = TRUE
)
Observations
722
Dependent variable
re78
Type
OLS linear regression
F(8,713)
4.046
R²
0.043
Adj. R²
0.033
Est.
S.E.
t val.
p
VIF
(Intercept)
3518.534
2608.878
1.349
0.178
NA
treated
834.584
467.802
1.784
0.075
1.012
re74
0.141
0.038
3.735
0.000
1.049
age
20.128
36.164
0.557
0.578
1.095
education
196.916
179.325
1.098
0.273
1.782
black
-1520.398
799.002
-1.903
0.057
1.946
married
152.972
647.595
0.236
0.813
1.087
nodegree
-326.224
744.328
-0.438
0.661
1.816
hispanic
121.566
1045.961
0.116
0.908
1.967
Standard errors: OLS
# Linear regression on the coarsened-exact-matched data, weighted by the
# `weights` column of data_cem.
lm2 <- lm(
  re78 ~ treated + re74 + age + education + black + married + nodegree +
    hispanic,
  data = data_cem,
  weights = weights
)
summary(lm2)
Call:
lm(formula = re78 ~ treated + re74 + age + education + black +
married + nodegree + hispanic, data = data_cem, weights = weights)
Weighted Residuals:
Min 1Q Median 3Q Max
-11957 -3827 -1536 3104 28294
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4928.2715 4007.0075 1.230 0.2195
treated 1129.0839 573.0143 1.970 0.0495 *
re74 0.1344 0.1360 0.988 0.3236
age -8.9863 60.0945 -0.150 0.8812
education 86.7092 281.5519 0.308 0.7583
black -1136.6153 1833.1963 -0.620 0.5356
married 205.2449 1324.6046 0.155 0.8769
nodegree -794.8211 978.1024 -0.813 0.4170
hispanic 2043.8536 2195.4665 0.931 0.3525
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5476 on 364 degrees of freedom
Multiple R-squared: 0.03473, Adjusted R-squared: 0.01351
F-statistic: 1.637 on 8 and 364 DF, p-value: 0.1129
# Compare results: coefficient plot of the raw-data and matched-data models.
plot_summs(lm1, lm2)
# Side-by-side text table of the two regressions.
stargazer(
  lm1, lm2,
  type = "text",
  digits = 3,
  dep.var.labels = c("Earnings Post Training"),
  column.labels = c("Full Data", "Matched Data")
)
# Entropy balancing ----
# Names of the treatment, covariate and outcome variables.
treatment_var <- "treated"
covariates_vars <- c(
  "re74", "re75", "u74", "u75", "age", "education",
  "black", "married", "nodegree", "hispanic"
)
dependent_var <- "re78"

# Treatment indicator and covariate columns handed to ebalance().
treatment <- LL$treated
covariates <- LL[, covariates_vars]

# Run entropy balancing.
e_bal <- ebalance(
  Treatment = treatment,
  X = covariates,
  max.iterations = 200,
  constraint.tolerance = 1
)
Converged within tolerance
# Attach the entropy-balancing weights to LL.
LL$eb_weight <- NA
# Treated units get weight 1.
LL$eb_weight[LL[[treatment_var]] == 1] <- 1
# Control units get the weights estimated by ebalance().
LL$eb_weight[LL[[treatment_var]] == 0] <- e_bal$w

# Analysis data: keep only rows that received a weight. The `eb_weight`
# column can now be used in the analysis.
eb_data <- LL %>%
  filter(!is.na(eb_weight))

# Check that the two groups now agree on the weighted mean and variance of age.
eb_data %>%
  group_by(treated) %>%
  summarise(
    age_weighted_mean = wtd.mean(age, weights = eb_weight),
    age_weighted_variance = wtd.var(age, weights = eb_weight)
  )
# Declare the entropy-balancing weights as a survey design. The calls below
# need a `design` object, and none is created in the code shown here.
design <- svydesign(ids = ~1, weights = ~eb_weight, data = eb_data)

# Weighted mean of re78 within each treatment group.
weighted_mean_diff <- svyby(
  ~re78,
  by = ~treated,
  design = design,
  FUN = svymean
)

# Display the results.
weighted_mean_diff

# Design-based t-test of the weighted difference in means. This is the call
# that produces the "Design-based t-test" output reported after this chunk.
svyttest(re78 ~ treated, design = design)
Design-based t-test
data: re78 ~ treated
t = 1.767, df = 720, p-value = 0.07765
alternative hypothesis: true difference in mean is not equal to 0
95 percent confidence interval:
-96.94675 1842.69176
sample estimates:
difference in mean
872.8725
# Adding variance moments to the balancing procedure ----
treatment <- LL$treated

# To balance on a variance as well as a mean, add the squared term of that
# variable to the covariates. Only age gets a squared term here.
# The covariates are taken from LL, the same data frame the treatment vector
# comes from, so the rows of the two inputs cannot drift out of alignment.
covariates <- LL %>%
  select(
    re74, re75, u74, u75, age, education,
    black, married, nodegree, hispanic
  ) %>%
  mutate(agesq = age^2)

# Arguments are named so the call does not depend on positional order.
a <- ebalance(
  Treatment = treatment,
  X = covariates,
  base.weight = NULL,
  norm.constant = NULL,
  coefs = NULL,
  max.iterations = 200,
  constraint.tolerance = 1,
  print.level = 0
)
Converged within tolerance
# Attach the second set of entropy-balancing weights to LL.
LL$eb_weight2 <- NA
# Treated units get weight 1.
LL$eb_weight2[LL[[treatment_var]] == 1] <- 1
# Control units get the weights estimated by ebalance().
LL$eb_weight2[LL[[treatment_var]] == 0] <- a$w

# Check that the two groups now agree on the weighted mean and variance of age.
LL %>%
  group_by(treated) %>%
  summarise(
    age_weighted_mean = wtd.mean(age, weights = eb_weight2),
    age_weighted_variance = wtd.var(age, weights = eb_weight2)
  )
# Build a survey design from the second set of weights (eb_weight2). Reusing
# the earlier `design` object would repeat the first analysis and ignore the
# variance-balanced weights.
design_eb2 <- svydesign(ids = ~1, weights = ~eb_weight2, data = LL)

# Weighted mean of re78 within each treatment group.
weighted_mean_diff <- svyby(
  ~re78,
  by = ~treated,
  design = design_eb2,
  FUN = svymean
)

# Display the results.
weighted_mean_diff

# Design-based t-test of the weighted difference in means.
svyttest(re78 ~ treated, design = design_eb2)
Design-based t-test
data: re78 ~ treated
t = 1.767, df = 720, p-value = 0.07765
alternative hypothesis: true difference in mean is not equal to 0
95 percent confidence interval:
-96.94675 1842.69176
sample estimates:
difference in mean
872.8725