Setting Working Directory

setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 3")

#Libraries

# Load packages
library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, 
       cobalt, Matchit, Zelig, estimatr, cem, tidyverse, 
       lubridate, usmap, gridExtra, stringr, readxl, plot3D,  
       cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,  
       foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, 
       kableExtra, snakecase, janitor)

## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## Warning: package 'Matchit' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages

## Warning: Perhaps you meant 'MatchIt' ?

## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'

## Warning: 'BiocManager' not available.  Could not check Bioconductor.
## 
## Please use `install.packages('BiocManager')` and then retry.

## Warning in p_install(package, character.only = TRUE, ...):

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Matchit'

## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

## Warning: package 'Zelig' is not available for this version of R
## 
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages

## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
##   cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'

## Warning: 'BiocManager' not available.  Could not check Bioconductor.
## 
## Please use `install.packages('BiocManager')` and then retry.

## Warning in p_install(package, character.only = TRUE, ...):

## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Zelig'

## Warning in p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, : Failed to install/load:
## Matchit, Zelig

#Load Data

# Load data
data(lalonde)
attach(lalonde)
table(treat)

## treat
##   0   1 
## 429 185

#PROBLEM 1

Part i. Balance Table

The dataset does not have binary variables for black and hispanic so I create one.

# Create Binary Variable for Black and Hisp
lalonde <- lalonde %>%
  mutate(black = ifelse(race == "black", 1, 0),
         hispanic = ifelse(race == "hispan", 1, 0),
         white = ifelse(race == "white", 1, 0))

# Convert categorical variables to numeric (if needed)
#lalonde <- lalonde %>%  mutate(across(where(is.factor), as.numeric))

# Define the treatment indicator
treatment_var <- "treat" 

# Covariates, pre-treatment outcomes, and the outcome variable
covariates <- c("age", "educ", "black", "white", "hispanic", "married", "nodegree")  
pre_treatment_outcomes <- c("re74", "re75")  # Adding pre-treatment earnings
outcome_var <- "re78"  

# Compute summary statistics by treatment group
summary_stats <- lalonde %>%
  group_by(!!sym(treatment_var)) %>%
  dplyr::summarise(across(all_of(c(covariates, pre_treatment_outcomes, outcome_var)), 
                   list(mean = mean, sd = sd), na.rm = TRUE)) %>%
  pivot_longer(cols = -!!sym(treatment_var), 
               names_to = c("variable", ".value"), 
               names_sep = "_")

## Warning: There was 1 warning in `dplyr::summarise()`.
## ℹ In argument: `across(...)`.
## ℹ In group 1: `treat = 0`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

# Conduct t-tests for mean differences between treatment and control
t_tests <- lapply(c(covariates, pre_treatment_outcomes, outcome_var), function(var) {
  t_test_res <- t.test(lalonde[[var]] ~ lalonde[[treatment_var]], na.rm = TRUE)
  data.frame(variable = var, 
             mean_diff = diff(t_test_res$estimate), 
             t_stat = t_test_res$statistic, 
             p_value = t_test_res$p.value)
})

# Combine t-test results into a data frame
t_test_results <- do.call(rbind, t_tests)

# Merge summary statistics with t-test results
balance_table <- summary_stats %>%
  pivot_wider(names_from = treatment_var, values_from = c(mean, sd)) %>%
  rename(mean_control = mean_0, sd_control = sd_0, 
         mean_treated = mean_1, sd_treated = sd_1) %>%
  left_join(t_test_results, by = "variable")

## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(treatment_var)
## 
##   # Now:
##   data %>% select(all_of(treatment_var))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Round all numeric columns to 4 decimal places
balance_table <- balance_table %>%
  mutate(across(where(is.numeric), ~ round(.x, 4)))

# Display the balance table using stargazer
stargazer(balance_table, summary = FALSE, type = "text",
          title = "Balance Table: Summary Statistics and T-tests",
          digits = 4)

## 
## Balance Table: Summary Statistics and T-tests
## =======================================================================================
##    variable mean_control mean_treated sd_control sd_treated mean_diff   t_stat  p_value
## ---------------------------------------------------------------------------------------
## 1    age      28.0303      25.8162     10.7867     7.155     -2.2141    2.9911  0.0029 
## 2    educ     10.2354      10.3459      2.8552     2.0107     0.1105   -0.5468  0.5848 
## 3   black      0.2028       0.8432      0.4026     0.3646     0.6404   -19.3443    0   
## 4   white      0.655        0.0973      0.4759     0.2972    -0.5577   17.5898     0   
## 5  hispanic    0.1422       0.0595      0.3497     0.2371    -0.0827    3.4091   7e-04 
## 6  married     0.5128       0.1892      0.5004     0.3927    -0.3236    8.5961     0   
## 7  nodegree    0.5967       0.7081      0.4911     0.4559     0.1114   -2.7127   0.007 
## 8    re74    5619.2365    2095.5737   6788.7508  4886.6204  -3523.6628  7.2456     0   
## 9    re75    2466.4844    1532.0553   3291.9962  3219.2509  -934.4291   3.2776  0.0012 
## 10   re78    6984.1697    6349.1435   7294.1618  7867.4022  -635.0262   0.9377  0.3491 
## ---------------------------------------------------------------------------------------

Among the said covariates, age, race (black, hispanic, and white), married, and no degree have significant p-values, meaning they are significantly different from each other. Whereas, education is only covariate which is insignficant in both groups. Pretreatment Outcomes are significantly different as well. While, outcome variable - earnings in 1978 - are statistically insignificant in both groups

Part ii. Average Treatment Effect (ATE) using regression

Why Does the Treatment Coefficient Estimate ATE?

The potential outcomes framework assumes Conditional Independence (CIA), meaning that conditional on observed covariates X, treatment assignment is as good as random. Under overlap, for all values of X, there is a positive probability of receiving treatment (T=1) and control (T=0). In this case, the coefficient on treatment status in an OLS regression including covariates and interactions with de-meaned covariates estimates the ATE.

# Define treatment indicator
treatment_var <- "treat"

# Define covariates (including pre-treatment outcomes)
covariates <- c("age", "educ", "black", "hispanic", "married", "nodegree") #drop "white" for collinearity

# Demean control variables
lalonde_demeaned <- lalonde %>%
  mutate(across(all_of(covariates), ~ demean(.x)))  # Use Jmisc's demean function

# Run OLS regression including interaction terms
formula <- as.formula(paste(
  "re78 ~ treat +", 
  paste(covariates, collapse = " + "), "+",
  paste(paste0("treat * ", covariates), collapse = " + ")  # Proper interaction terms
))

# Run the regression
ate_model <- lm(formula, data = lalonde_demeaned)

# Display results
stargazer(ate_model, type = "text", title = "ATE Estimation via OLS", digits = 4)

## 
## ATE Estimation via OLS
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                re78            
## -----------------------------------------------
## treat                        863.0675          
##                            (1,040.7800)        
##                                                
## age                           48.2556          
##                              (36.4449)         
##                                                
## educ                        526.2022***        
##                             (188.2906)         
##                                                
## black                      -1,672.8070*        
##                             (923.8966)         
##                                                
## hispanic                     723.8747          
##                            (1,059.1580)        
##                                                
## married                    2,369.0140***       
##                             (786.5618)         
##                                                
## nodegree                     213.6237          
##                            (1,064.9220)        
##                                                
## treat:age                     34.0409          
##                              (86.7928)         
##                                                
## treat:educ                   102.7788          
##                             (411.7455)         
##                                                
## treat:black                  598.4264          
##                            (2,049.8750)        
##                                                
## treat:hispanic               -263.6663         
##                            (3,020.4450)        
##                                                
## treat:married               -1,028.0070        
##                            (1,621.1430)        
##                                                
## treat:nodegree               -591.5559         
##                            (1,947.8880)        
##                                                
## Constant                   6,404.8200***       
##                             (391.9867)         
##                                                
## -----------------------------------------------
## Observations                    614            
## R2                            0.0727           
## Adjusted R2                   0.0526           
## Residual Std. Error    7,271.7430 (df = 600)   
## F Statistic          3.6160*** (df = 13; 600)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

The treat variable is positive by a large magnitude, however, the results are not significant. Therefore, we cannot reject the null hypothesis for the treatment having any effect on the outcome - re78.

Part iii. ATT using Matching

Prepare Data

# Define treatment variable
treatment_var <- "treat"

# Define covariates (from part ii)
covariates <- c("age", "educ", "black", "hispanic", "married", "nodegree")

# Define outcome variable
outcome_var <- "re78"

# Extract required data
matching_data <- lalonde %>%
  select(all_of(c(treatment_var, outcome_var, covariates)))  # Keep only relevant variables

# Convert factors to numeric (if needed)
matching_data <- matching_data %>%
  mutate(across(where(is.factor), as.numeric))

One to One Matching

# Perform one-to-one nearest neighbor matching (default ATT estimation)
match_1to1 <- Match(
  Y = matching_data[[outcome_var]],
  Tr = matching_data[[treatment_var]],
  X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
  M = 1,  # One-to-one matching
  replace = FALSE
)

# Show estimated ATT and standard error
summary(match_1to1)

## 
## Estimate...  916.99 
## SE.........  675.19 
## T-stat.....  1.3581 
## p.val......  0.17442 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  185

Exact Matching

# Perform exact matching
exact_match <- Match(
  Y = matching_data[[outcome_var]],
  Tr = matching_data[[treatment_var]],
  X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
  exact = TRUE  # Exact matching
)

# Show estimated ATT and standard error
summary(exact_match)

## 
## Estimate...  426.62 
## AI SE......  298.66 
## T-stat.....  1.4285 
## p.val......  0.15316 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  54 
## Matched number of observations  (unweighted).  119 
## 
## Number of obs dropped by 'exact' or 'caliper'  131

The estimate here is smaller than one-to-one matching and number of matched observations is also lower here since exact matching finds exact matching whereas one-to-one matching allows for some flexibility.

Nearest Neighbour Matching (1:2)

# Perform nearest neighbor matching (1:2)
match_1to2 <- Match(
  Y = matching_data[[outcome_var]],
  Tr = matching_data[[treatment_var]],
  X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
  M = 2,  # Two matches per treated unit
  replace = FALSE
)

# Show estimated ATT and standard error
summary(match_1to2)

## 
## Estimate...  -241.06 
## SE.........  766.77 
## T-stat.....  -0.31438 
## p.val......  0.75323 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  370

Nearest Neighbour (1:3)

# Perform nearest neighbor matching (1:3)
match_1to3 <- Match(
  Y = matching_data[[outcome_var]],
  Tr = matching_data[[treatment_var]],
  X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
  M = 3,  # Three matches per treated unit
  replace = FALSE
)

# Show estimated ATT and standard error
summary(match_1to3)

## 
## Estimate...  -702.6 
## SE.........  850.88 
## T-stat.....  -0.82574 
## p.val......  0.40895 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  143 
## Matched number of observations  (unweighted).  429

More matches per treated unit but higher standard errors or less variance but more bias.

Bias Correction

1:2 Matching with Bias Correction

match_1to2_bc <- Match(
  Y = matching_data[[outcome_var]],
  Tr = matching_data[[treatment_var]],
  X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
  M = 2,
  replace = TRUE,
  BiasAdjust = TRUE  # Apply bias adjustment (Abadie & Imbens 2011)
)

summary(match_1to2_bc)

## 
## Estimate...  901.92 
## AI SE......  1127.2 
## T-stat.....  0.80016 
## p.val......  0.42362 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  502

1:3 Matching with Bias Correction

match_1to3_bc <- Match(
  Y = matching_data[[outcome_var]],
  Tr = matching_data[[treatment_var]],
  X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
  M = 3,
  replace = TRUE,
  BiasAdjust = TRUE  # Apply bias adjustment
)

summary(match_1to3_bc)

## 
## Estimate...  975.69 
## AI SE......  1038.1 
## T-stat.....  0.93991 
## p.val......  0.34726 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  701

Bias correction reduces systematic differences between treated and control units. Compared to unadjusted matching, it reduced bias but increased standard errors.

Coarsened Exact Matching (CEM)

# Perform Coarsened Exact Matching (CEM)
cem_match <- cem(
  treatment = treatment_var,
  data = matching_data,
  drop = outcome_var  # Do not coarsen the outcome variable
)

## 
## Using 'treat'='1' as baseline group

# Extract matched dataset
matched_data <- matching_data[cem_match$matched, ]

# Estimate ATT using regression on matched data
cem_ols <- lm(re78 ~ treat, data = matched_data)

# Display results
summary(cem_ols)

## 
## Call:
## lm(formula = re78 ~ treat, data = matched_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -5395  -5258  -1679   2660  28704 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  5363.12     507.61  10.565   <2e-16 ***
## treat          31.91     766.79   0.042    0.967    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6028 on 249 degrees of freedom
## Multiple R-squared:  6.956e-06,  Adjusted R-squared:  -0.004009 
## F-statistic: 0.001732 on 1 and 249 DF,  p-value: 0.9668

Summary of all estimates

att_results <- data.frame(
  Method = c("One-to-One", "Exact Matching", "1:2 NN", "1:3 NN", 
             "1:2 NN (Bias Adj.)", "1:3 NN (Bias Adj.)", "CEM"),
  ATT = c(
    ifelse(length(match_1to1$est) > 0, match_1to1$est, NA),
    ifelse(length(exact_match$est) > 0, exact_match$est, NA),
    ifelse(length(match_1to2$est) > 0, match_1to2$est, NA),
    ifelse(length(match_1to3$est) > 0, match_1to3$est, NA),
    ifelse(length(match_1to2_bc$est) > 0, match_1to2_bc$est, NA),
    ifelse(length(match_1to3_bc$est) > 0, match_1to3_bc$est, NA),
    ifelse(length(coef(cem_ols)["treat"]) > 0, coef(cem_ols)["treat"], NA)
  ),
  Std_Error = c(
    ifelse(length(match_1to1$se) > 0, match_1to1$se, NA),
    ifelse(length(exact_match$se) > 0, exact_match$se, NA),
    ifelse(length(match_1to2$se) > 0, match_1to2$se, NA),
    ifelse(length(match_1to3$se) > 0, match_1to3$se, NA),
    ifelse(length(match_1to2_bc$se) > 0, match_1to2_bc$se, NA),
    ifelse(length(match_1to3_bc$se) > 0, match_1to3_bc$se, NA),
    ifelse(length(summary(cem_ols)$coefficients["treat", "Std. Error"]) > 0, 
           summary(cem_ols)$coefficients["treat", "Std. Error"], NA)
  )
)

#Summary
stargazer(att_results, summary = FALSE, type = "text", 
          title = "ATT Estimates Using Different Matching Methods", digits = 4)

## 
## ATT Estimates Using Different Matching Methods
## =========================================
##         Method          ATT    Std_Error 
## -----------------------------------------
## 1     One-to-One     916.9887            
## 2   Exact Matching   426.6234   298.6576 
## 3       1:2 NN       -241.0576           
## 4       1:3 NN       -702.6038           
## 5 1:2 NN (Bias Adj.) 901.9222  1,127.1720
## 6 1:3 NN (Bias Adj.) 975.6856  1,038.0640
## 7        CEM          31.9130   766.7865 
## -----------------------------------------

Differences: - One-to-One Matching provides a simple nearest neighbor method but may suffer from poor matches. - Exact Matching ensures perfect balance but may discard many observations. - 1:2 and 1:3 Nearest Neighbor Matching: More neighbors = More observations matched, less variance. But weaker matches introduce bias. As we can see the estimates are of negative sign. - Bias Adjustment improves estimates by correcting residual bias but increased the standard errors *- Coarsened Exact Matching (CEM) provides a balance between exact matching and nearest neighbor methods

Part iv. Average Treatment Effect for the Treated (ATT) using propensity score matching.

###DAG

library(dagitty)

## Warning: package 'dagitty' was built under R version 4.4.3

# Define the DAG
dag <- dagitty("dag {
  D -> Y;
  Age -> D;
  Educ -> D;
  Race -> D;
  Age -> Married -> Y;
  PreEarnings -> D;
  Age -> Y;
  Educ -> Y;
  Race -> Y;
  Married -> Y;
  PreEarnings -> Y;
}")

# Plot the DAG
plot(graphLayout(dag))

Descriptive Graphs

Age and Real Earnings

# Relationship between age and earnings in 1974, 1975, 1978
ggplot(lalonde, aes(x = age, y = re74)) + geom_point() + geom_smooth() + ggtitle("Age vs. Earnings (1974)")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(lalonde, aes(x = age, y = re75)) + geom_point() + geom_smooth() + ggtitle("Age vs. Earnings (1975)")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(lalonde, aes(x = age, y = re78)) + geom_point() + geom_smooth() + ggtitle("Age vs. Earnings (1978)")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Age distribution between treated and control
ggplot(lalonde, aes(x = age, fill = as.factor(treat))) +
  geom_histogram(binwidth = 2, alpha = 0.5, position = "identity") +
  ggtitle("Age Distribution: NSW Participants vs. Non-Participants")

Education and Real Earnings

# Relationship between education and earnings in 1974, 1975, 1978
ggplot(lalonde, aes(x = educ, y = re74)) + geom_point() + geom_smooth() + ggtitle("Education vs. Earnings (1974)")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(lalonde, aes(x = educ, y = re75)) + geom_point() + geom_smooth() + ggtitle("Education vs. Earnings (1975)")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

ggplot(lalonde, aes(x = educ, y = re78)) + geom_point() + geom_smooth() + ggtitle("Education vs. Earnings (1978)")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Education distribution between treated and control
ggplot(lalonde, aes(x = educ, fill = as.factor(treat))) +
  geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
  ggtitle("Education Distribution: NSW Participants vs. Non-Participants")

Propensity Score Estimate

# Estimate the propensity score using logistic regression
ps_model <- glm(treat ~ poly(age, 2) + poly(educ, 2) + black + hispanic + married + nodegree + re74 + re75,
                data = lalonde, family = binomial)

# Extract fitted propensity scores
lalonde$psfit <- ps_model$fitted.values

# Display results
stargazer(ps_model, type = "text", title = "Propensity Score Model", digits = 4)

## 
## Propensity Score Model
## =============================================
##                       Dependent variable:    
##                   ---------------------------
##                              treat           
## ---------------------------------------------
## poly(age, 2)1               -0.3722          
##                            (4.7242)          
##                                              
## poly(age, 2)2             -30.7164***        
##                            (4.9661)          
##                                              
## poly(educ, 2)1              2.9173           
##                            (5.1049)          
##                                              
## poly(educ, 2)2            -14.2466***        
##                            (4.7597)          
##                                              
## black                      3.0426***         
##                            (0.3049)          
##                                              
## hispanic                    0.6855           
##                            (0.4537)          
##                                              
## married                   -1.4501***         
##                            (0.3263)          
##                                              
## nodegree                    0.4328           
##                            (0.4151)          
##                                              
## re74                      -0.0001***         
##                            (0.00003)         
##                                              
## re75                        0.00002          
##                            (0.00005)         
##                                              
## Constant                  -2.2623***         
##                            (0.3942)          
##                                              
## ---------------------------------------------
## Observations                  614            
## Log Likelihood             -212.9944         
## Akaike Inf. Crit.          447.9888          
## =============================================
## Note:             *p<0.1; **p<0.05; ***p<0.01

Propensity Score Matching (1:3 Matching with Bias Adjustment)

# Perform 1:3 Nearest Neighbor Matching with Bias Adjustment
match_psm_1to3 <- Match(
  Y = lalonde$re78,
  Tr = lalonde$treat,
  X = lalonde$psfit,
  M = 3,
  replace = TRUE,
  BiasAdjust = TRUE 
)

# Display ATT estimate and standard error
summary(match_psm_1to3)

## 
## Estimate...  1470.8 
## AI SE......  1131.3 
## T-stat.....  1.3001 
## p.val......  0.19358 
## 
## Original number of observations..............  614 
## Original number of treated obs...............  185 
## Matched number of observations...............  185 
## Matched number of observations  (unweighted).  603

Bootstrapping

# Function for bootstrapping
boot_psm <- function(data, indices) {
  sample_data <- data[indices, ]
  match_res <- Match(Y = sample_data$re78, Tr = sample_data$treat, X = sample_data$psfit, M = 3, BiasAdjust = TRUE)
  return(match_res$est)
}

# Apply bootstrap
boot_results <- boot(data = lalonde, statistic = boot_psm, R = 500)

# Display bootstrapped ATT estimate
boot.ci(boot_results, type = "perc")

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_results, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   (-299, 3368 )  
## Calculations and Intervals on Original Scale

Quality of PSM

Summary

# Compare propensity scores between treated and control groups
summary(lalonde$psfit[lalonde$treat == 1])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0282  0.5355  0.6902  0.6393  0.8476  0.9572

summary(lalonde$psfit[lalonde$treat == 0])

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000251 0.0256880 0.0590195 0.1555265 0.1587831 0.9361479

KDensity

# Compute density for treated (D == 1)
psfitdens1 <- kdensity(lalonde$psfit[lalonde$treat == 1])

# Compute density for control (D == 0)
psfitdens0 <- kdensity(lalonde$psfit[lalonde$treat == 0])

# Plot density for participants (treated)
plot(psfitdens1, main = "Density of Fitted Propensity Score",
     xlab = "Propensity Score", ylab = "Density", col = "blue", lwd = 2)

# Add density for non-participants (control)
lines(psfitdens0, col = "red", lwd = 2)

# Add legend
legend("topright", legend = c("Treated", "Control"), 
       col = c("blue", "red"), lwd = 2)

# alternative graph
ggplot(lalonde, aes(x = psfit, fill = as.factor(treat))) +
  geom_density(alpha = 0.5) +
  ggtitle("Density of Fitted Propensity Scores: Treated vs. Control")

#### Histogram

ggplot(lalonde, aes(x = psfit, fill = as.factor(treat))) +
  geom_histogram(binwidth = 0.05, alpha = 0.5, position = "identity") +
  ggtitle("Histogram of Propensity Scores: Treated vs. Control")

There is overlap but not that much, only at the tails. Thus, I believe that the overlap condition does not hold.

Common Support & Matching with MatchIt

library(MatchIt)

## Warning: package 'MatchIt' was built under R version 4.4.2

## 
## Attaching package: 'MatchIt'

## The following object is masked _by_ '.GlobalEnv':
## 
##     lalonde

## The following object is masked from 'package:cobalt':
## 
##     lalonde

# Imposing common support using MatchIt
matchit_psm <- matchit(treat ~ poly(age, 2) + poly(educ, 2) + black + hispanic + married + nodegree + re74 + re75,
                        data = lalonde, method = "nearest", ratio = 3)

## Warning: Not all treated units will get 3 matches.

# Extract matched data
matched_data <- match.data(matchit_psm)

# Run OLS on matched data
matchit_ols <- lm(re78 ~ treat, data = matched_data)

# Display results
summary(matchit_ols)

## 
## Call:
## lm(formula = re78 ~ treat, data = matched_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -6984  -6349  -2048   4100  53959 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6984.2      360.7  19.362   <2e-16 ***
## treat         -635.0      657.1  -0.966    0.334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7471 on 612 degrees of freedom
## Multiple R-squared:  0.001524,   Adjusted R-squared:  -0.0001079 
## F-statistic: 0.9338 on 1 and 612 DF,  p-value: 0.3342

The results do not make intuitive sense since the treatment is negative and not significant. Additional check may involve below:

# Assess covariate balance before and after matching
library(cobalt)

# Plot standardized mean differences before and after matching
bal.tab(matchit_psm, var.names = covariates)

## Balance Measures
##                     Type Diff.Adj
## distance        Distance   1.9128
## poly(age, 2)_1   Contin.  -0.2543
## poly(age, 2)_2   Contin.  -0.4805
## poly(educ, 2)_1  Contin.   0.0623
## poly(educ, 2)_2  Contin.  -0.5258
## black             Binary   0.6604
## hispanic          Binary  -0.0883
## married           Binary  -0.3288
## nodegree          Binary   0.1063
## re74             Contin.  -0.7197
## re75             Contin.  -0.3050
## 
## Sample sizes
##                      Control Treated
## All                   429.       185
## Matched (ESS)         414.01     185
## Matched (Unweighted)  429.       185

love.plot(matchit_psm, threshold = 0.1, abs = TRUE)

## Warning: Standardized mean differences and raw mean differences are present in the same plot. 
## Use the `stars` argument to distinguish between them and appropriately label the x-axis.

Matches between covariates do not seem to balanced as the values are far from zero, both for adjusted and unadjusted models. Other than education_2,1, hispanic race, and no degree, all covariates are unbalanced.

EEFE 530: PS 3

2024-06-13