Test Matching

Author

Jingyi Yang

library(cem) #Coarsened Exact Matching

Warning: package 'cem' was built under R version 4.4.3

Loading required package: tcltk

Loading required package: lattice


How to use CEM? Type vignette("cem")

library(MatchIt)  #Coarsened Exact Matching

Warning: package 'MatchIt' was built under R version 4.4.3

library(stargazer)  #Regression Output


Please cite as:

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

library(jtools)  #Regression Output

Warning: package 'jtools' was built under R version 4.4.3

library(WeightIt) #Entropy Balancing

Warning: package 'WeightIt' was built under R version 4.4.3

library(Hmisc) #General Modeling


Attaching package: 'Hmisc'

The following object is masked from 'package:jtools':

    %nin%

The following objects are masked from 'package:base':

    format.pval, units

library(ebal) #Entropy Balancing

##
## ebal Package: Implements Entropy Balancing.

## See http://www.stanford.edu/~jhain/ for additional information.

library(survey) #Applying EB weights

Loading required package: grid

Loading required package: Matrix

Loading required package: survival


Attaching package: 'survey'

The following object is masked from 'package:Hmisc':

    deff

The following object is masked from 'package:WeightIt':

    calibrate

The following object is masked from 'package:graphics':

    dotchart

library(tidyr)


Attaching package: 'tidyr'

The following objects are masked from 'package:Matrix':

    expand, pack, unpack

library(tidyverse)

Warning: package 'tidyverse' was built under R version 4.4.3

Warning: package 'ggplot2' was built under R version 4.4.3

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ purrr     1.0.2
✔ forcats   1.0.0     ✔ readr     2.1.5
✔ ggplot2   3.5.1     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1

── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ tidyr::expand()    masks Matrix::expand()
✖ dplyr::filter()    masks stats::filter()
✖ dplyr::lag()       masks stats::lag()
✖ tidyr::pack()      masks Matrix::pack()
✖ dplyr::src()       masks Hmisc::src()
✖ dplyr::summarize() masks Hmisc::summarize()
✖ tidyr::unpack()    masks Matrix::unpack()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the LL example dataset. Naming the package makes the source explicit
# instead of relying on whichever attached package happens to provide "LL".
data("LL", package = "cem")

# List the available columns.
names(LL)

 [1] "treated"   "age"       "education" "black"     "married"   "nodegree" 
 [7] "re74"      "re75"      "re78"      "hispanic"  "u74"       "u75"

## What is the impact of employment training on earnings?

# Testing for imbalance between groups: mean and variance of each
# pre-treatment covariate, computed separately for control (treated = 0) and
# treated (treated = 1) units on the raw data, before any matching.
balance_vars <- c(
  "re74", "re75", "u74", "u75", "age", "education",
  "black", "married", "nodegree", "hispanic"
)

# across() replaces the superseded summarise_at(). Two separate calls (all
# means first, then all variances) keep the output columns in the same order
# that summarise_at() produced.
check <- LL %>%
  group_by(treated) %>%
  summarise(
    across(all_of(balance_vars), mean, .names = "{.col}_mean"),
    across(all_of(balance_vars), var, .names = "{.col}_var"),
    .groups = "drop"
  )

# Transpose so each statistic is a row and each group is a column.
round(t(check), 3)

                       [,1]         [,2]
treated               0.000        1.000
re74_mean          3672.485     3570.999
re75_mean          3026.683     3066.098
u74_mean              0.461        0.441
u75_mean              0.419        0.374
age_mean             24.447       24.626
education_mean       10.188       10.380
black_mean            0.800        0.801
married_mean          0.158        0.168
nodegree_mean         0.814        0.731
hispanic_mean         0.113        0.094
re74_var       42530297.287 33329073.446
re75_var       27052998.539 23764541.913
u74_var               0.249        0.247
u75_var               0.244        0.235
age_var              43.432       44.708
education_var         2.620        3.304
black_var             0.160        0.160
married_var           0.133        0.140
nodegree_var          0.152        0.197
hispanic_var          0.100        0.086

# imbalance() comes from the cem package. Every column named in `drop` is
# left out of the balance test. Always drop the DV and the treatment
# variable; drop other variables as needed.
imbalance_raw <- imbalance(
  data = LL,
  group = LL$treated,
  drop = c("re78", "treated")
)
print(imbalance_raw)


Multivariate Imbalance Measure: L1=0.735
Percentage of local common support: LCS=12.4%

Univariate Imbalance Measures:

              statistic   type           L1 min 25%       50%      75%
age        1.792038e-01 (diff) 4.705882e-03   0   1   0.00000  -1.0000
education  1.922361e-01 (diff) 9.811844e-02   1   0   1.00000   1.0000
black      1.346801e-03 (diff) 1.346801e-03   0   0   0.00000   0.0000
married    1.070311e-02 (diff) 1.070311e-02   0   0   0.00000   0.0000
nodegree  -8.347792e-02 (diff) 8.347792e-02   0  -1   0.00000   0.0000
re74      -1.014862e+02 (diff) 5.551115e-17   0   0  69.73096 584.9160
re75       3.941545e+01 (diff) 5.551115e-17   0   0 294.18457 660.6865
hispanic  -1.866508e-02 (diff) 1.866508e-02   0   0   0.00000   0.0000
u74       -2.009903e-02 (diff) 2.009903e-02   0   0   0.00000   0.0000
u75       -4.508616e-02 (diff) 4.508616e-02   0   0   0.00000   0.0000
                 max
age          -6.0000
education     2.0000
black         0.0000
married       0.0000
nodegree      0.0000
re74      -2139.0195
re75        490.3945
hispanic      0.0000
u74           0.0000
u75           0.0000

# ---- Exact matching ----
# The matching call takes a formula, much like a regression: treatment on the
# left, matching covariates on the right.
match_exact <- matchit(
  treated ~ re74 + re75 + u74 + u75 + age + education + black + married +
    nodegree + hispanic,
  data = LL,
  method = "exact"
)

# New data frame holding only the matched cases.
data_exact <- match.data(match_exact)

summary(match_exact)


Call:
matchit(formula = treated ~ re74 + re75 + u74 + u75 + age + education + 
    black + married + nodegree + hispanic, data = LL, method = "exact")

Summary of Balance for All Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74          3570.9990     3672.4852         -0.0176     0.7837    0.0127
re75          3066.0982     3026.6827          0.0081     0.8784    0.0250
u74              0.4411        0.4612         -0.0405          .    0.0201
u75              0.3737        0.4188         -0.0932          .    0.0451
age             24.6263       24.4471          0.0268     1.0294    0.0127
education       10.3805       10.1882          0.1058     1.2610    0.0173
black            0.8013        0.8000          0.0034          .    0.0013
married          0.1684        0.1576          0.0286          .    0.0107
nodegree         0.7306        0.8141         -0.1882          .    0.0835
hispanic         0.0943        0.1129         -0.0639          .    0.0187
          eCDF Max
re74        0.0316
re75        0.0491
u74         0.0201
u75         0.0451
age         0.0459
education   0.0835
black       0.0013
married     0.0107
nodegree    0.0835
hispanic    0.0187

Summary of Balance for Matched Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74             0.0000        0.0000               0          .         0
re75             0.0000        0.0000               0          .         0
u74              1.0000        1.0000               0          .         0
u75              1.0000        1.0000               0          .         0
age             23.5818       23.5818               0     0.9963         0
education       10.4364       10.4364               0     0.9963         0
black            1.0000        1.0000               0          .         0
married          0.0545        0.0545               0          .         0
nodegree         0.8000        0.8000               0          .         0
hispanic         0.0000        0.0000               0          .         0
          eCDF Max Std. Pair Dist.
re74             0               0
re75             0               0
u74              0               0
u75              0               0
age              0               0
education        0               0
black            0               0
married          0               0
nodegree         0               0
hispanic         0               0

Sample Sizes:
              Control Treated
All             425.      297
Matched (ESS)    45.9      55
Matched          74.       55
Unmatched       351.      242
Discarded         0.        0

# With matched data, also drop the `weights` and `subclass` columns in
# addition to the DV and the treatment variable.
imbalance_exact <- imbalance(
  data = data_exact,
  group = data_exact$treated,
  drop = c("re78", "treated", "weights", "subclass")
)
print(imbalance_exact)


Multivariate Imbalance Measure: L1=0.202
Percentage of local common support: LCS=100.0%

Univariate Imbalance Measures:

            statistic   type         L1 min 25% 50% 75% max
age        1.08181818 (diff) 0.00000000   0   0   2   2   0
education  0.27420147 (diff) 0.00000000   0   1   1   0   0
black      0.00000000 (diff) 0.00000000   0   0   0   0   0
married    0.01400491 (diff) 0.01400491   0   0   0   0   0
nodegree  -0.10540541 (diff) 0.10540541   0   0   0   0   0
re74       0.00000000 (diff) 0.00000000   0   0   0   0   0
re75       0.00000000 (diff) 0.00000000   0   0   0   0   0
hispanic   0.00000000 (diff) 0.00000000   0   0   0   0   0
u74        0.00000000 (diff) 0.00000000   0   0   0   0   0
u75        0.00000000 (diff) 0.00000000   0   0   0   0   0

# ---- Coarsened exact matching ----
# Same formula interface as the exact match above; only the method changes.
match_cem <- matchit(
  treated ~ re74 + re75 + u74 + u75 + age + education + black + married +
    nodegree + hispanic,
  data = LL,
  method = "cem"
)

# New data frame holding only the matched cases.
data_cem <- match.data(match_cem)

summary(match_cem)


Call:
matchit(formula = treated ~ re74 + re75 + u74 + u75 + age + education + 
    black + married + nodegree + hispanic, data = LL, method = "cem")

Summary of Balance for All Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74          3570.9990     3672.4852         -0.0176     0.7837    0.0127
re75          3066.0982     3026.6827          0.0081     0.8784    0.0250
u74              0.4411        0.4612         -0.0405          .    0.0201
u75              0.3737        0.4188         -0.0932          .    0.0451
age             24.6263       24.4471          0.0268     1.0294    0.0127
education       10.3805       10.1882          0.1058     1.2610    0.0173
black            0.8013        0.8000          0.0034          .    0.0013
married          0.1684        0.1576          0.0286          .    0.0107
nodegree         0.7306        0.8141         -0.1882          .    0.0835
hispanic         0.0943        0.1129         -0.0639          .    0.0187
          eCDF Max
re74        0.0316
re75        0.0491
u74         0.0201
u75         0.0451
age         0.0459
education   0.0835
black       0.0013
married     0.0107
nodegree    0.0835
hispanic    0.0187

Summary of Balance for Matched Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74          1240.3217     1179.7536          0.0105     1.1933    0.0108
re75          1129.0326     1066.6980          0.0128     1.3349    0.0134
u74              0.5500        0.5500          0.0000          .    0.0000
u75              0.5000        0.5000          0.0000          .    0.0000
age             22.7312       22.7144          0.0025     1.0417    0.0093
education       10.2875       10.3133         -0.0142     1.0196    0.0018
black            0.9188        0.9188          0.0000          .    0.0000
married          0.0562        0.0563         -0.0000          .    0.0000
nodegree         0.8000        0.8000          0.0000          .    0.0000
hispanic         0.0562        0.0562          0.0000          .    0.0000
          eCDF Max Std. Pair Dist.
re74        0.0450          0.0809
re75        0.0412          0.1073
u74         0.0000          0.0000
u75         0.0000          0.0000
age         0.0510          0.1589
education   0.0258          0.1781
black       0.0000          0.0000
married     0.0000          0.0000
nodegree    0.0000          0.0000
hispanic    0.0000          0.0000

Sample Sizes:
              Control Treated
All            425.       297
Matched (ESS)  129.31     160
Matched        213.       160
Unmatched      212.       137
Discarded        0.         0

# With matched data, always drop `weights` and `subclass` here. The matching
# procedure adds them, so they should not be included in the L1 test.
imbalance_cem <- imbalance(
  data = data_cem,
  group = data_cem$treated,
  drop = c("re78", "treated", "weights", "subclass")
)
print(imbalance_cem)


Multivariate Imbalance Measure: L1=0.575
Percentage of local common support: LCS=27.5%

Univariate Imbalance Measures:

              statistic   type          L1 min 25% 50%        75%        max
age       -6.514378e-01 (diff) 0.000000000   0   0  -2   -1.00000     0.0000
education -1.231808e-01 (diff) 0.102816901   0   0   0    0.00000     0.0000
black     -1.734155e-02 (diff) 0.017341549   0   0   0    0.00000     0.0000
married   -9.301643e-03 (diff) 0.009301643   0   0   0    0.00000     0.0000
nodegree   8.732394e-02 (diff) 0.087323944   0   0   0    0.00000     0.0000
re74      -1.632072e+02 (diff) 0.000000000   0   0   0 -159.10046  -645.2949
re75      -9.254560e+01 (diff) 0.000000000   0   0   0  -82.70605 -1976.5752
hispanic   1.417254e-02 (diff) 0.014172535   0   0   0    0.00000     0.0000
u74        1.807512e-02 (diff) 0.018075117   0   0   0    0.00000     0.0000
u75        2.582160e-02 (diff) 0.025821596   0   0   1    0.00000     0.0000

#Can set your own coarsening points using the cutpoints argument of matchit().
#Otherwise the code will automatically coarsen continuous variables at points it chooses
match_cem2 <- matchit(
  treated ~ re74 + re75 + u74 + u75 + age + education + black + married + nodegree + hispanic, 
  method = "cem", 
  data = LL,
  cutpoints = list(
    age = c(25, 35, 45, 55, 65),           # Custom age bins
    education = c(8, 12, 16),              # Custom education bins
    # A single number is read as a number of bins, not as a cutpoint.
    # NOTE(review): MatchIt appears to space these bins evenly over the range
    # of the variable rather than at quantiles -- confirm in ?method_cem.
    re74 = 4,                              # 4 bins for re74
    re75 = c(0, 5000, 10000, 20000)        # Custom income cutoffs
  )
)

data_cem2 <- match.data(match_cem2) #Creates new dataframe that only includes the matched cases
summary(match_cem2)


Call:
matchit(formula = treated ~ re74 + re75 + u74 + u75 + age + education + 
    black + married + nodegree + hispanic, data = LL, method = "cem", 
    cutpoints = list(age = c(25, 35, 45, 55, 65), education = c(8, 
        12, 16), re74 = 4, re75 = c(0, 5000, 10000, 20000)))

Summary of Balance for All Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74          3570.9990     3672.4852         -0.0176     0.7837    0.0127
re75          3066.0982     3026.6827          0.0081     0.8784    0.0250
u74              0.4411        0.4612         -0.0405          .    0.0201
u75              0.3737        0.4188         -0.0932          .    0.0451
age             24.6263       24.4471          0.0268     1.0294    0.0127
education       10.3805       10.1882          0.1058     1.2610    0.0173
black            0.8013        0.8000          0.0034          .    0.0013
married          0.1684        0.1576          0.0286          .    0.0107
nodegree         0.7306        0.8141         -0.1882          .    0.0835
hispanic         0.0943        0.1129         -0.0639          .    0.0187
          eCDF Max
re74        0.0316
re75        0.0491
u74         0.0201
u75         0.0451
age         0.0459
education   0.0835
black       0.0013
married     0.0107
nodegree    0.0835
hispanic    0.0187

Summary of Balance for Matched Data:
          Means Treated Means Control Std. Mean Diff. Var. Ratio eCDF Mean
re74          2307.4296     2387.2876         -0.0138     0.9598    0.0077
re75          2177.0038     2262.3725         -0.0175     0.9898    0.0121
u74              0.4573        0.4573          0.0000          .    0.0000
u75              0.3974        0.3974          0.0000          .    0.0000
age             23.5128       23.7448         -0.0347     0.9772    0.0108
education       10.3462       10.4002         -0.0297     1.0433    0.0045
black            0.8547        0.8547         -0.0000          .    0.0000
married          0.1068        0.1068         -0.0000          .    0.0000
nodegree         0.7778        0.7778          0.0000          .    0.0000
hispanic         0.0726        0.0726          0.0000          .    0.0000
          eCDF Max Std. Pair Dist.
re74        0.0429          0.1705
re75        0.0383          0.1405
u74         0.0000          0.0000
u75         0.0000          0.0000
age         0.0441          0.3766
education   0.0239          0.5744
black       0.0000          0.0000
married     0.0000          0.0000
nodegree    0.0000          0.0000
hispanic    0.0000          0.0000

Sample Sizes:
              Control Treated
All            425.       297
Matched (ESS)  259.28     234
Matched        337.       234
Unmatched       88.        63
Discarded        0.         0

# Imbalance on the custom-cutpoint CEM sample; the columns added by matching
# (`weights`, `subclass`) are dropped along with the DV and treatment.
imbalance_cem2 <- imbalance(
  data = data_cem2,
  group = data_cem2$treated,
  drop = c("re78", "treated", "weights", "subclass")
)
print(imbalance_cem2)


Multivariate Imbalance Measure: L1=0.722
Percentage of local common support: LCS=13.9%

Univariate Imbalance Measures:

              statistic   type          L1 min 25%       50%       75%
age       -1.893784e-01 (diff) 0.000000000   0   0    0.0000   -1.0000
education -1.117325e-01 (diff) 0.073638692   1   0   -1.0000    0.0000
black     -9.003525e-03 (diff) 0.009003525   0   0    0.0000    0.0000
married   -2.375155e-02 (diff) 0.023751553   0   0    0.0000    0.0000
nodegree   6.198483e-02 (diff) 0.061984833   0   0    0.0000    0.0000
re74      -1.553611e+02 (diff) 0.000000000   0   0 -271.2529 -383.8750
re75      -2.514696e+02 (diff) 0.000000000   0   0 -186.9884 -556.1079
hispanic   7.469122e-03 (diff) 0.007469122   0   0    0.0000    0.0000
u74        3.531665e-02 (diff) 0.035316645   0   0    0.0000    0.0000
u75        5.063532e-02 (diff) 0.050635319   0   0    0.0000    0.0000
                max
age          0.0000
education   -1.0000
black        0.0000
married      0.0000
nodegree     0.0000
re74      2423.8496
re75       315.2617
hispanic     0.0000
u74          0.0000
u75          0.0000

# Two-sample t-test of the DV (re78) by treatment group on the raw data,
# with no control variables.
t.test(LL$re78 ~ LL$treated)


    Welch Two Sample t-test

data:  LL$re78 by LL$treated
t = -1.8154, df = 557.06, p-value = 0.06999
alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
95 percent confidence interval:
 -1845.25058    72.64296
sample estimates:
mean in group 0 mean in group 1 
       5090.048        5976.352

# Linear regression of post-training earnings on treatment and controls,
# estimated on the raw (unmatched) data.
lm1 <- lm(
  re78 ~ treated + re74 + age + education + black + married + nodegree +
    hispanic,
  data = LL
)
summary(lm1)


Call:
lm(formula = re78 ~ treated + re74 + age + education + black + 
    married + nodegree + hispanic, data = LL)

Residuals:
   Min     1Q Median     3Q    Max 
 -9833  -4448  -1506   3095  55072 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  3.519e+03  2.609e+03   1.349 0.177869    
treated      8.346e+02  4.678e+02   1.784 0.074840 .  
re74         1.409e-01  3.771e-02   3.735 0.000203 ***
age          2.013e+01  3.616e+01   0.557 0.578001    
education    1.969e+02  1.793e+02   1.098 0.272534    
black       -1.520e+03  7.990e+02  -1.903 0.057460 .  
married      1.530e+02  6.476e+02   0.236 0.813333    
nodegree    -3.262e+02  7.443e+02  -0.438 0.661317    
hispanic     1.216e+02  1.046e+03   0.116 0.907507    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6150 on 713 degrees of freedom
Multiple R-squared:  0.04342,   Adjusted R-squared:  0.03269 
F-statistic: 4.046 on 8 and 713 DF,  p-value: 0.0001017

# jtools summary of the raw-data model, including variance inflation factors.
summ(lm1, vif = TRUE, digits = 3)

Observations	722
Dependent variable	re78
Type	OLS linear regression

F(8,713)	4.046
R²	0.043
Adj. R²	0.033

	Est.	S.E.	t val.	p	VIF
(Intercept)	3518.534	2608.878	1.349	0.178	NA
treated	834.584	467.802	1.784	0.075	1.012
re74	0.141	0.038	3.735	0.000	1.049
age	20.128	36.164	0.557	0.578	1.095
education	196.916	179.325	1.098	0.273	1.782
black	-1520.398	799.002	-1.903	0.057	1.946
married	152.972	647.595	0.236	0.813	1.087
nodegree	-326.224	744.328	-0.438	0.661	1.816
hispanic	121.566	1045.961	0.116	0.908	1.967
Standard errors: OLS

# Same regression on the coarsened-exact-matched sample, weighted by the
# `weights` column that match.data() added to data_cem.
lm2 <- lm(
  re78 ~ treated + re74 + age + education + black + married + nodegree +
    hispanic,
  data = data_cem,
  weights = weights
)
summary(lm2)


Call:
lm(formula = re78 ~ treated + re74 + age + education + black + 
    married + nodegree + hispanic, data = data_cem, weights = weights)

Weighted Residuals:
   Min     1Q Median     3Q    Max 
-11957  -3827  -1536   3104  28294 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)  
(Intercept)  4928.2715  4007.0075   1.230   0.2195  
treated      1129.0839   573.0143   1.970   0.0495 *
re74            0.1344     0.1360   0.988   0.3236  
age            -8.9863    60.0945  -0.150   0.8812  
education      86.7092   281.5519   0.308   0.7583  
black       -1136.6153  1833.1963  -0.620   0.5356  
married       205.2449  1324.6046   0.155   0.8769  
nodegree     -794.8211   978.1024  -0.813   0.4170  
hispanic     2043.8536  2195.4665   0.931   0.3525  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5476 on 364 degrees of freedom
Multiple R-squared:  0.03473,   Adjusted R-squared:  0.01351 
F-statistic: 1.637 on 8 and 364 DF,  p-value: 0.1129

# Compare the raw-data and matched-data models: coefficient plot first, then
# a side-by-side text table.
plot_summs(lm1, lm2)

stargazer(
  lm1, lm2,
  type = "text",
  digits = 3,
  column.labels = c("Full Data", "Matched Data"),
  dep.var.labels = "Earnings Post Training"
)


===============================================================
                                Dependent variable:            
                    -------------------------------------------
                              Earnings Post Training           
                          Full Data            Matched Data    
                             (1)                   (2)         
---------------------------------------------------------------
treated                    834.584*            1,129.084**     
                          (467.802)             (573.014)      
                                                               
re74                       0.141***               0.134        
                           (0.038)               (0.136)       
                                                               
age                         20.128                -8.986       
                           (36.164)              (60.095)      
                                                               
education                  196.916                86.709       
                          (179.325)             (281.552)      
                                                               
black                    -1,520.398*            -1,136.615     
                          (799.002)            (1,833.196)     
                                                               
married                    152.972               205.245       
                          (647.595)            (1,324.605)     
                                                               
nodegree                   -326.224              -794.821      
                          (744.328)             (978.102)      
                                                               
hispanic                   121.566              2,043.854      
                         (1,045.961)           (2,195.466)     
                                                               
Constant                  3,518.534             4,928.272      
                         (2,608.878)           (4,007.007)     
                                                               
---------------------------------------------------------------
Observations                 722                   373         
R2                          0.043                 0.035        
Adjusted R2                 0.033                 0.014        
Residual Std. Error  6,149.889 (df = 713)  5,476.004 (df = 364)
F Statistic         4.046*** (df = 8; 713) 1.637 (df = 8; 364) 
===============================================================
Note:                               *p<0.1; **p<0.05; ***p<0.01

# ---- Entropy Balancing ----
# Column names for each role in the analysis.
treatment_var <- "treated"
dependent_var <- "re78"
covariates_vars <- c(
  "re74", "re75", "u74", "u75", "age",
  "education", "black", "married", "nodegree", "hispanic"
)

# Treatment indicator and covariate columns, both taken from LL.
treatment <- LL[["treated"]]
covariates <- LL[covariates_vars]

# Estimate entropy-balancing weights for the control units.
e_bal <- ebalance(
  Treatment = treatment,
  X = covariates,
  constraint.tolerance = 1,
  max.iterations = 200
)

Converged within tolerance

# Attach the entropy-balancing weights to LL. Treated units get a weight of
# 1; control units get the weights estimated by ebalance(), assigned in the
# order the control rows appear in LL.
is_treated <- LL[[treatment_var]] == 1
is_control <- LL[[treatment_var]] == 0
LL$eb_weight <- NA
LL$eb_weight[is_treated] <- 1
LL$eb_weight[is_control] <- e_bal$w

# Data for the regressions: keep only rows that were assigned a weight.
eb_data <- filter(LL, !is.na(eb_weight))

# eb_data now carries a weight column, `eb_weight`, for use in the analysis.

# Balance check: weighted mean and weighted variance of age in each group.
summarise(
  group_by(eb_data, treated),
  age_weighted_mean = wtd.mean(x = age, weights = eb_weight),
  age_weighted_variance = wtd.var(x = age, weights = eb_weight)
)

# A tibble: 2 × 3
  treated age_weighted_mean age_weighted_variance
    <dbl>             <dbl>                 <dbl>
1       0              24.6                  42.1
2       1              24.6                  44.7

# Weighted mean and weighted variance of every covariate, by treatment group,
# using the entropy-balancing weights.
check <- eb_data %>%
  group_by(treated) %>%
  summarise(
    across(
      .cols = c(
        re74, re75, u74, u75, age, education,
        black, married, nodegree, hispanic
      ),
      .fns = list(
        weighted_mean = \(x) wtd.mean(x, weights = eb_weight),
        weighted_variance = \(x) wtd.var(x, weights = eb_weight)
      ),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  )

# One row per statistic, one column per group, rounded to two decimals.
check_t <- round(t(check), digits = 2)

print(check_t)

                                   [,1]        [,2]
treated                            0.00        1.00
re74_weighted_mean              3571.00     3571.00
re74_weighted_variance      36756250.77 33329073.45
re75_weighted_mean              3066.10     3066.10
re75_weighted_variance      24963228.47 23764541.91
u74_weighted_mean                  0.44        0.44
u74_weighted_variance              0.25        0.25
u75_weighted_mean                  0.37        0.37
u75_weighted_variance              0.23        0.23
age_weighted_mean                 24.63       24.63
age_weighted_variance             42.13       44.71
education_weighted_mean           10.38       10.38
education_weighted_variance        2.87        3.30
black_weighted_mean                0.80        0.80
black_weighted_variance            0.16        0.16
married_weighted_mean              0.17        0.17
married_weighted_variance          0.14        0.14
nodegree_weighted_mean             0.73        0.73
nodegree_weighted_variance         0.20        0.20
hispanic_weighted_mean             0.09        0.09
hispanic_weighted_variance         0.09        0.09

# Same outcome model on the same rows: lm1 applies the entropy-balancing
# weights, lm2 is unweighted.
lm1 <- lm(
  re78 ~ treated + re74 + age + education + black + married + nodegree +
    hispanic,
  data = eb_data,
  weights = eb_weight
)
lm2 <- lm(
  re78 ~ treated + re74 + age + education + black + married + nodegree +
    hispanic,
  data = eb_data
)

stargazer(
  lm1, lm2,
  type = "text",
  digits = 3,
  column.labels = c("Entropy Balanced", "Raw Data"),
  dep.var.labels = "Earnings Post Training"
)


===========================================================
                                   Dependent variable:     
                               ----------------------------
                                  Earnings Post Training   
                               Entropy Balanced  Raw Data  
                                     (1)            (2)    
-----------------------------------------------------------
treated                            872.873*      834.584*  
                                  (467.301)      (467.802) 
                                                           
re74                               0.119***      0.141***  
                                   (0.040)        (0.038)  
                                                           
age                                 24.307        20.128   
                                   (37.380)      (36.164)  
                                                           
education                          260.117        196.916  
                                  (184.050)      (179.325) 
                                                           
black                            -1,572.579**   -1,520.398*
                                  (779.088)      (799.002) 
                                                           
married                            250.251        152.972  
                                  (653.145)      (647.595) 
                                                           
nodegree                           -136.860      -326.224  
                                  (734.700)      (744.328) 
                                                           
hispanic                           -56.692        121.566  
                                 (1,069.314)    (1,045.961)
                                                           
Constant                          2,701.981      3,518.534 
                                 (2,667.536)    (2,608.878)
                                                           
-----------------------------------------------------------
Observations                         722            722    
R2                                  0.037          0.043   
Adjusted R2                         0.026          0.033   
Residual Std. Error (df = 713)    5,694.561      6,149.889 
F Statistic (df = 8; 713)          3.414***      4.046***  
===========================================================
Note:                           *p<0.1; **p<0.05; ***p<0.01

# Survey design object built on eb_data: ids = ~1 (no cluster ids supplied),
# with eb_weight as the weights.
design <- svydesign(ids = ~1, weights = ~eb_weight, data = eb_data)

# Unbalanced mean differences: plain (unweighted) mean of re78 by group.
summarise(
  group_by(eb_data, treated),
  mean_re78 = mean(re78, na.rm = TRUE)
)

# A tibble: 2 × 2
  treated mean_re78
    <dbl>     <dbl>
1       0     5090.
2       1     5976.

# Weighted mean of re78, with its standard error, for each treatment group.
# Despite its name, the object holds the two group means, not their
# difference.
weighted_mean_diff <- svyby(
  formula = ~re78,
  by = ~treated,
  design = design,
  FUN = svymean
)

# Show the results.
print(weighted_mean_diff)

  treated     re78       se
0       0 5103.480 287.9739
1       1 5976.352 401.3605

# Design-based t-test of the difference in mean re78 between the groups.
ttest_result <- svyttest(formula = re78 ~ treated, design = design)
print(ttest_result)


    Design-based t-test

data:  re78 ~ treated
t = 1.767, df = 720, p-value = 0.07765
alternative hypothesis: true difference in mean is not equal to 0
95 percent confidence interval:
  -96.94675 1842.69176
sample estimates:
difference in mean 
          872.8725

### Adding Variance Moments to the Balancing Procedure
# Take the treatment indicator and the covariates from the same data frame
# (LL) so their rows are guaranteed to line up. The resulting weights are
# also written back to LL.
treatment <- LL$treated

# To bring variance into the balancing procedure, add the squared term of a
# variable to the covariate list. Only age is squared here, so only the
# variance of age is targeted.
covariates <- LL %>%
  select(
    re74, re75, u74, u75, age, education, black,
    married, nodegree, hispanic
  ) %>%
  mutate(agesq = age^2)

# The arguments left out here (base.weight, norm.constant, coefs) were
# previously passed as NULL.
a <- ebalance(
  Treatment = treatment,
  X = covariates,
  max.iterations = 200,
  constraint.tolerance = 1,
  print.level = 0
)

Converged within tolerance

# Store the new weights on LL as eb_weight2.
# Start from an all-NA column, then fill it by treatment status.
LL[["eb_weight2"]] <- NA
# Treated rows keep a unit weight
LL[["eb_weight2"]][LL[[treatment_var]] == 1] <- 1
# Control rows receive the entropy-balancing weights, in row order
LL[["eb_weight2"]][LL[[treatment_var]] == 0] <- a$w


## Balance check: weighted mean and variance of age by treatment group
summarise(
  group_by(LL, treated),
  age_weighted_mean = wtd.mean(age, weights = eb_weight2),
  age_weighted_variance = wtd.var(age, weights = eb_weight2)
)

# A tibble: 2 × 3
  treated age_weighted_mean age_weighted_variance
    <dbl>             <dbl>                 <dbl>
1       0              24.6                  44.7
2       1              24.6                  44.7

# Weighted mean and weighted variance of every balancing covariate, one row
# per treatment group; columns are named "<covariate>_<statistic>".
check <- LL %>%
  group_by(treated) %>%
  summarise(
    across(
      .cols = c(
        re74, re75, u74, u75, age, education,
        black, married, nodegree, hispanic
      ),
      .fns = list(
        weighted_mean = ~ wtd.mean(.x, weights = eb_weight2),
        weighted_variance = ~ wtd.var(.x, weights = eb_weight2)
      ),
      .names = "{.col}_{.fn}"
    ),
    .groups = "drop"
  )

# Transpose so each statistic is a row and each group a column, then round
check %>%
  t() %>%
  round(3)

                                    [,1]         [,2]
treated                            0.000        1.000
re74_weighted_mean              3570.999     3570.999
re74_weighted_variance      36957311.340 33329073.446
re75_weighted_mean              3066.098     3066.098
re75_weighted_variance      25111720.588 23764541.913
u74_weighted_mean                  0.441        0.441
u74_weighted_variance              0.247        0.247
u75_weighted_mean                  0.374        0.374
u75_weighted_variance              0.235        0.235
age_weighted_mean                 24.626       24.626
age_weighted_variance             44.708       44.708
education_weighted_mean           10.380       10.380
education_weighted_variance        2.884        3.304
black_weighted_mean                0.801        0.801
black_weighted_variance            0.160        0.160
married_weighted_mean              0.168        0.168
married_weighted_variance          0.140        0.140
nodegree_weighted_mean             0.731        0.731
nodegree_weighted_variance         0.197        0.197
hispanic_weighted_mean             0.094        0.094
hispanic_weighted_variance         0.086        0.086

# Wrap eb_data in a survey design (no clustering) weighted by eb_weight.
# NOTE(review): this reuses eb_data / eb_weight rather than the eb_weight2
# column just added to LL, so the estimates that follow are computed with the
# earlier weights -- confirm whether LL / eb_weight2 was intended here.
design <- svydesign(
  ids = ~1,
  weights = ~eb_weight,
  data = eb_data
)

### Unbalanced (unweighted) mean of re78 by treatment group
summarise(
  group_by(eb_data, treated),
  mean_re78 = mean(re78, na.rm = TRUE)
)

# A tibble: 2 × 2
  treated mean_re78
    <dbl>     <dbl>
1       0     5090.
2       1     5976.

# Weighted mean of re78 (with its standard error) for each treatment group,
# computed under the current survey design object.
weighted_mean_diff <- svyby(
  formula = ~re78,
  by = ~treated,
  design = design,
  FUN = svymean
)

# Show the per-group estimates
weighted_mean_diff

  treated     re78       se
0       0 5103.480 287.9739
1       1 5976.352 401.3605

# Design-based t-test of re78 on the treatment indicator
ttest_result <- svyttest(
  formula = re78 ~ treated,
  design = design
)
ttest_result


    Design-based t-test

data:  re78 ~ treated
t = 1.767, df = 720, p-value = 0.07765
alternative hypothesis: true difference in mean is not equal to 0
95 percent confidence interval:
  -96.94675 1842.69176
sample estimates:
difference in mean 
          872.8725

# Estimate regression models of re78 on treatment status

# lm3: weighted by eb_weight, treatment indicator only
lm3 <- lm(re78 ~ treated, data = eb_data, weights = eb_weight)

# lm4: weighted by eb_weight, treatment indicator plus covariates
lm4 <- lm(
  re78 ~ treated+re74+u74+age+education+black+married+nodegree+hispanic,
  data = eb_data,
  weights = eb_weight
)

# lm5: same specification as lm4, but without weights
lm5 <- lm(
  re78 ~ treated+re74+u74+age+education+black+married+nodegree+hispanic,
  data = eb_data
)

# Coefficient plot comparing the three models
plot_summs(lm3, lm4, lm5)

# Side-by-side regression table for the three models
export_summs(lm3, lm4, lm5)

	Model 1	Model 2	Model 3
(Intercept)	5103.48 ***	2222.89	3068.55
	(334.27)	(2686.42)	(2636.89)
treated	872.87	872.87	855.26
	(472.73)	(466.96)	(468.03)
re74		0.16 **	0.17 ***
		(0.05)	(0.05)
u74		834.34	654.91
		(581.86)	(564.18)
age		11.84	11.07
		(38.35)	(36.99)
education		281.73	217.63
		(184.53)	(180.17)
black		-1578.23 *	-1525.65
		(778.52)	(798.82)
married		221.77	134.53
		(652.96)	(647.63)
nodegree		-60.48	-262.78
		(736.09)	(746.15)
hispanic		46.09	200.39
		(1070.92)	(1047.91)
N	722	722	722
R2	0.00	0.04	0.05
*** p < 0.001; ** p < 0.01; * p < 0.05.