setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 3")
#Libraries
# Load packages
library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot,
cobalt, Matchit, Zelig, estimatr, cem, tidyverse,
lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: package 'Matchit' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: Perhaps you meant 'MatchIt' ?
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## Warning: 'BiocManager' not available. Could not check Bioconductor.
##
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Matchit'
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Warning: package 'Zelig' is not available for this version of R
##
## A version of this package for your version of R might be available elsewhere,
## see the ideas at
## https://cran.r-project.org/doc/manuals/r-patched/R-admin.html#Installing-packages
## Warning: unable to access index for repository http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4:
## cannot open URL 'http://www.stats.ox.ac.uk/pub/RWin/bin/windows/contrib/4.4/PACKAGES'
## Warning: 'BiocManager' not available. Could not check Bioconductor.
##
## Please use `install.packages('BiocManager')` and then retry.
## Warning in p_install(package, character.only = TRUE, ...):
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'Zelig'
## Warning in p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, : Failed to install/load:
## Matchit, Zelig
#Load Data
# Load data
data(lalonde)
attach(lalonde)
table(treat)
## treat
## 0 1
## 429 185
#PROBLEM 1
The dataset does not have binary variables for black and hispanic so I create one.
# Create Binary Variable for Black and Hisp
lalonde <- lalonde %>%
mutate(black = ifelse(race == "black", 1, 0),
hispanic = ifelse(race == "hispan", 1, 0),
white = ifelse(race == "white", 1, 0))
# Convert categorical variables to numeric (if needed)
#lalonde <- lalonde %>% mutate(across(where(is.factor), as.numeric))
# Define the treatment indicator
treatment_var <- "treat"
# Covariates, pre-treatment outcomes, and the outcome variable
covariates <- c("age", "educ", "black", "white", "hispanic", "married", "nodegree")
pre_treatment_outcomes <- c("re74", "re75") # Adding pre-treatment earnings
outcome_var <- "re78"
# Compute summary statistics by treatment group
summary_stats <- lalonde %>%
group_by(!!sym(treatment_var)) %>%
dplyr::summarise(across(all_of(c(covariates, pre_treatment_outcomes, outcome_var)),
list(mean = mean, sd = sd), na.rm = TRUE)) %>%
pivot_longer(cols = -!!sym(treatment_var),
names_to = c("variable", ".value"),
names_sep = "_")
## Warning: There was 1 warning in `dplyr::summarise()`.
## ℹ In argument: `across(...)`.
## ℹ In group 1: `treat = 0`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
# Conduct t-tests for mean differences between treatment and control
t_tests <- lapply(c(covariates, pre_treatment_outcomes, outcome_var), function(var) {
t_test_res <- t.test(lalonde[[var]] ~ lalonde[[treatment_var]], na.rm = TRUE)
data.frame(variable = var,
mean_diff = diff(t_test_res$estimate),
t_stat = t_test_res$statistic,
p_value = t_test_res$p.value)
})
# Combine t-test results into a data frame
t_test_results <- do.call(rbind, t_tests)
# Merge summary statistics with t-test results
balance_table <- summary_stats %>%
pivot_wider(names_from = treatment_var, values_from = c(mean, sd)) %>%
rename(mean_control = mean_0, sd_control = sd_0,
mean_treated = mean_1, sd_treated = sd_1) %>%
left_join(t_test_results, by = "variable")
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(treatment_var)
##
## # Now:
## data %>% select(all_of(treatment_var))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Round all numeric columns to 4 decimal places
balance_table <- balance_table %>%
mutate(across(where(is.numeric), ~ round(.x, 4)))
# Display the balance table using stargazer
stargazer(balance_table, summary = FALSE, type = "text",
title = "Balance Table: Summary Statistics and T-tests",
digits = 4)
##
## Balance Table: Summary Statistics and T-tests
## =======================================================================================
## variable mean_control mean_treated sd_control sd_treated mean_diff t_stat p_value
## ---------------------------------------------------------------------------------------
## 1 age 28.0303 25.8162 10.7867 7.155 -2.2141 2.9911 0.0029
## 2 educ 10.2354 10.3459 2.8552 2.0107 0.1105 -0.5468 0.5848
## 3 black 0.2028 0.8432 0.4026 0.3646 0.6404 -19.3443 0
## 4 white 0.655 0.0973 0.4759 0.2972 -0.5577 17.5898 0
## 5 hispanic 0.1422 0.0595 0.3497 0.2371 -0.0827 3.4091 7e-04
## 6 married 0.5128 0.1892 0.5004 0.3927 -0.3236 8.5961 0
## 7 nodegree 0.5967 0.7081 0.4911 0.4559 0.1114 -2.7127 0.007
## 8 re74 5619.2365 2095.5737 6788.7508 4886.6204 -3523.6628 7.2456 0
## 9 re75 2466.4844 1532.0553 3291.9962 3219.2509 -934.4291 3.2776 0.0012
## 10 re78 6984.1697 6349.1435 7294.1618 7867.4022 -635.0262 0.9377 0.3491
## ---------------------------------------------------------------------------------------
Among the said covariates, age, race (black, hispanic, and white), married, and no degree have significant p-values, meaning they are significantly different from each other. Whereas, education is only covariate which is insignficant in both groups. Pretreatment Outcomes are significantly different as well. While, outcome variable - earnings in 1978 - are statistically insignificant in both groups
The potential outcomes framework assumes Conditional Independence (CIA), meaning that conditional on observed covariates X, treatment assignment is as good as random. Under overlap, for all values of X, there is a positive probability of receiving treatment (T=1) and control (T=0). In this case, the coefficient on treatment status in an OLS regression including covariates and interactions with de-meaned covariates estimates the ATE.
# Define treatment indicator
treatment_var <- "treat"
# Define covariates (including pre-treatment outcomes)
covariates <- c("age", "educ", "black", "hispanic", "married", "nodegree") #drop "white" for collinearity
# Demean control variables
lalonde_demeaned <- lalonde %>%
mutate(across(all_of(covariates), ~ demean(.x))) # Use Jmisc's demean function
# Run OLS regression including interaction terms
formula <- as.formula(paste(
"re78 ~ treat +",
paste(covariates, collapse = " + "), "+",
paste(paste0("treat * ", covariates), collapse = " + ") # Proper interaction terms
))
# Run the regression
ate_model <- lm(formula, data = lalonde_demeaned)
# Display results
stargazer(ate_model, type = "text", title = "ATE Estimation via OLS", digits = 4)
##
## ATE Estimation via OLS
## ===============================================
## Dependent variable:
## ---------------------------
## re78
## -----------------------------------------------
## treat 863.0675
## (1,040.7800)
##
## age 48.2556
## (36.4449)
##
## educ 526.2022***
## (188.2906)
##
## black -1,672.8070*
## (923.8966)
##
## hispanic 723.8747
## (1,059.1580)
##
## married 2,369.0140***
## (786.5618)
##
## nodegree 213.6237
## (1,064.9220)
##
## treat:age 34.0409
## (86.7928)
##
## treat:educ 102.7788
## (411.7455)
##
## treat:black 598.4264
## (2,049.8750)
##
## treat:hispanic -263.6663
## (3,020.4450)
##
## treat:married -1,028.0070
## (1,621.1430)
##
## treat:nodegree -591.5559
## (1,947.8880)
##
## Constant 6,404.8200***
## (391.9867)
##
## -----------------------------------------------
## Observations 614
## R2 0.0727
## Adjusted R2 0.0526
## Residual Std. Error 7,271.7430 (df = 600)
## F Statistic 3.6160*** (df = 13; 600)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
The treat variable is positive by a large magnitude, however, the results are not significant. Therefore, we cannot reject the null hypothesis for the treatment having any effect on the outcome - re78.
# Define treatment variable
treatment_var <- "treat"
# Define covariates (from part ii)
covariates <- c("age", "educ", "black", "hispanic", "married", "nodegree")
# Define outcome variable
outcome_var <- "re78"
# Extract required data
matching_data <- lalonde %>%
select(all_of(c(treatment_var, outcome_var, covariates))) # Keep only relevant variables
# Convert factors to numeric (if needed)
matching_data <- matching_data %>%
mutate(across(where(is.factor), as.numeric))
# Perform one-to-one nearest neighbor matching (default ATT estimation)
match_1to1 <- Match(
Y = matching_data[[outcome_var]],
Tr = matching_data[[treatment_var]],
X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
M = 1, # One-to-one matching
replace = FALSE
)
# Show estimated ATT and standard error
summary(match_1to1)
##
## Estimate... 916.99
## SE......... 675.19
## T-stat..... 1.3581
## p.val...... 0.17442
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 185
# Perform exact matching
exact_match <- Match(
Y = matching_data[[outcome_var]],
Tr = matching_data[[treatment_var]],
X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
exact = TRUE # Exact matching
)
# Show estimated ATT and standard error
summary(exact_match)
##
## Estimate... 426.62
## AI SE...... 298.66
## T-stat..... 1.4285
## p.val...... 0.15316
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 54
## Matched number of observations (unweighted). 119
##
## Number of obs dropped by 'exact' or 'caliper' 131
The estimate here is smaller than one-to-one matching and number of matched observations is also lower here since exact matching finds exact matching whereas one-to-one matching allows for some flexibility.
# Perform nearest neighbor matching (1:2)
match_1to2 <- Match(
Y = matching_data[[outcome_var]],
Tr = matching_data[[treatment_var]],
X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
M = 2, # Two matches per treated unit
replace = FALSE
)
# Show estimated ATT and standard error
summary(match_1to2)
##
## Estimate... -241.06
## SE......... 766.77
## T-stat..... -0.31438
## p.val...... 0.75323
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 370
# Perform nearest neighbor matching (1:3)
match_1to3 <- Match(
Y = matching_data[[outcome_var]],
Tr = matching_data[[treatment_var]],
X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
M = 3, # Three matches per treated unit
replace = FALSE
)
# Show estimated ATT and standard error
summary(match_1to3)
##
## Estimate... -702.6
## SE......... 850.88
## T-stat..... -0.82574
## p.val...... 0.40895
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 143
## Matched number of observations (unweighted). 429
More matches per treated unit but higher standard errors or less variance but more bias.
match_1to2_bc <- Match(
Y = matching_data[[outcome_var]],
Tr = matching_data[[treatment_var]],
X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
M = 2,
replace = TRUE,
BiasAdjust = TRUE # Apply bias adjustment (Abadie & Imbens 2011)
)
summary(match_1to2_bc)
##
## Estimate... 901.92
## AI SE...... 1127.2
## T-stat..... 0.80016
## p.val...... 0.42362
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 502
match_1to3_bc <- Match(
Y = matching_data[[outcome_var]],
Tr = matching_data[[treatment_var]],
X = matching_data %>% select(all_of(covariates)) %>% as.matrix(),
M = 3,
replace = TRUE,
BiasAdjust = TRUE # Apply bias adjustment
)
summary(match_1to3_bc)
##
## Estimate... 975.69
## AI SE...... 1038.1
## T-stat..... 0.93991
## p.val...... 0.34726
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 701
Bias correction reduces systematic differences between treated and control units. Compared to unadjusted matching, it reduced bias but increased standard errors.
# Perform Coarsened Exact Matching (CEM)
cem_match <- cem(
treatment = treatment_var,
data = matching_data,
drop = outcome_var # Do not coarsen the outcome variable
)
##
## Using 'treat'='1' as baseline group
# Extract matched dataset
matched_data <- matching_data[cem_match$matched, ]
# Estimate ATT using regression on matched data
cem_ols <- lm(re78 ~ treat, data = matched_data)
# Display results
summary(cem_ols)
##
## Call:
## lm(formula = re78 ~ treat, data = matched_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5395 -5258 -1679 2660 28704
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5363.12 507.61 10.565 <2e-16 ***
## treat 31.91 766.79 0.042 0.967
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6028 on 249 degrees of freedom
## Multiple R-squared: 6.956e-06, Adjusted R-squared: -0.004009
## F-statistic: 0.001732 on 1 and 249 DF, p-value: 0.9668
att_results <- data.frame(
Method = c("One-to-One", "Exact Matching", "1:2 NN", "1:3 NN",
"1:2 NN (Bias Adj.)", "1:3 NN (Bias Adj.)", "CEM"),
ATT = c(
ifelse(length(match_1to1$est) > 0, match_1to1$est, NA),
ifelse(length(exact_match$est) > 0, exact_match$est, NA),
ifelse(length(match_1to2$est) > 0, match_1to2$est, NA),
ifelse(length(match_1to3$est) > 0, match_1to3$est, NA),
ifelse(length(match_1to2_bc$est) > 0, match_1to2_bc$est, NA),
ifelse(length(match_1to3_bc$est) > 0, match_1to3_bc$est, NA),
ifelse(length(coef(cem_ols)["treat"]) > 0, coef(cem_ols)["treat"], NA)
),
Std_Error = c(
ifelse(length(match_1to1$se) > 0, match_1to1$se, NA),
ifelse(length(exact_match$se) > 0, exact_match$se, NA),
ifelse(length(match_1to2$se) > 0, match_1to2$se, NA),
ifelse(length(match_1to3$se) > 0, match_1to3$se, NA),
ifelse(length(match_1to2_bc$se) > 0, match_1to2_bc$se, NA),
ifelse(length(match_1to3_bc$se) > 0, match_1to3_bc$se, NA),
ifelse(length(summary(cem_ols)$coefficients["treat", "Std. Error"]) > 0,
summary(cem_ols)$coefficients["treat", "Std. Error"], NA)
)
)
#Summary
stargazer(att_results, summary = FALSE, type = "text",
title = "ATT Estimates Using Different Matching Methods", digits = 4)
##
## ATT Estimates Using Different Matching Methods
## =========================================
## Method ATT Std_Error
## -----------------------------------------
## 1 One-to-One 916.9887
## 2 Exact Matching 426.6234 298.6576
## 3 1:2 NN -241.0576
## 4 1:3 NN -702.6038
## 5 1:2 NN (Bias Adj.) 901.9222 1,127.1720
## 6 1:3 NN (Bias Adj.) 975.6856 1,038.0640
## 7 CEM 31.9130 766.7865
## -----------------------------------------
Differences: - One-to-One Matching provides a simple nearest neighbor method but may suffer from poor matches. - Exact Matching ensures perfect balance but may discard many observations. - 1:2 and 1:3 Nearest Neighbor Matching: More neighbors = More observations matched, less variance. But weaker matches introduce bias. As we can see the estimates are of negative sign. - Bias Adjustment improves estimates by correcting residual bias but increased the standard errors *- Coarsened Exact Matching (CEM) provides a balance between exact matching and nearest neighbor methods
###DAG
library(dagitty)
## Warning: package 'dagitty' was built under R version 4.4.3
# Define the DAG
dag <- dagitty("dag {
D -> Y;
Age -> D;
Educ -> D;
Race -> D;
Age -> Married -> Y;
PreEarnings -> D;
Age -> Y;
Educ -> Y;
Race -> Y;
Married -> Y;
PreEarnings -> Y;
}")
# Plot the DAG
plot(graphLayout(dag))
# Relationship between age and earnings in 1974, 1975, 1978
ggplot(lalonde, aes(x = age, y = re74)) + geom_point() + geom_smooth() + ggtitle("Age vs. Earnings (1974)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(lalonde, aes(x = age, y = re75)) + geom_point() + geom_smooth() + ggtitle("Age vs. Earnings (1975)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(lalonde, aes(x = age, y = re78)) + geom_point() + geom_smooth() + ggtitle("Age vs. Earnings (1978)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Age distribution between treated and control
ggplot(lalonde, aes(x = age, fill = as.factor(treat))) +
geom_histogram(binwidth = 2, alpha = 0.5, position = "identity") +
ggtitle("Age Distribution: NSW Participants vs. Non-Participants")
# Relationship between education and earnings in 1974, 1975, 1978
ggplot(lalonde, aes(x = educ, y = re74)) + geom_point() + geom_smooth() + ggtitle("Education vs. Earnings (1974)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(lalonde, aes(x = educ, y = re75)) + geom_point() + geom_smooth() + ggtitle("Education vs. Earnings (1975)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
ggplot(lalonde, aes(x = educ, y = re78)) + geom_point() + geom_smooth() + ggtitle("Education vs. Earnings (1978)")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
# Education distribution between treated and control
ggplot(lalonde, aes(x = educ, fill = as.factor(treat))) +
geom_histogram(binwidth = 1, alpha = 0.5, position = "identity") +
ggtitle("Education Distribution: NSW Participants vs. Non-Participants")
# Estimate the propensity score using logistic regression
ps_model <- glm(treat ~ poly(age, 2) + poly(educ, 2) + black + hispanic + married + nodegree + re74 + re75,
data = lalonde, family = binomial)
# Extract fitted propensity scores
lalonde$psfit <- ps_model$fitted.values
# Display results
stargazer(ps_model, type = "text", title = "Propensity Score Model", digits = 4)
##
## Propensity Score Model
## =============================================
## Dependent variable:
## ---------------------------
## treat
## ---------------------------------------------
## poly(age, 2)1 -0.3722
## (4.7242)
##
## poly(age, 2)2 -30.7164***
## (4.9661)
##
## poly(educ, 2)1 2.9173
## (5.1049)
##
## poly(educ, 2)2 -14.2466***
## (4.7597)
##
## black 3.0426***
## (0.3049)
##
## hispanic 0.6855
## (0.4537)
##
## married -1.4501***
## (0.3263)
##
## nodegree 0.4328
## (0.4151)
##
## re74 -0.0001***
## (0.00003)
##
## re75 0.00002
## (0.00005)
##
## Constant -2.2623***
## (0.3942)
##
## ---------------------------------------------
## Observations 614
## Log Likelihood -212.9944
## Akaike Inf. Crit. 447.9888
## =============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
# Perform 1:3 Nearest Neighbor Matching with Bias Adjustment
match_psm_1to3 <- Match(
Y = lalonde$re78,
Tr = lalonde$treat,
X = lalonde$psfit,
M = 3,
replace = TRUE,
BiasAdjust = TRUE
)
# Display ATT estimate and standard error
summary(match_psm_1to3)
##
## Estimate... 1470.8
## AI SE...... 1131.3
## T-stat..... 1.3001
## p.val...... 0.19358
##
## Original number of observations.............. 614
## Original number of treated obs............... 185
## Matched number of observations............... 185
## Matched number of observations (unweighted). 603
# Function for bootstrapping
boot_psm <- function(data, indices) {
sample_data <- data[indices, ]
match_res <- Match(Y = sample_data$re78, Tr = sample_data$treat, X = sample_data$psfit, M = 3, BiasAdjust = TRUE)
return(match_res$est)
}
# Apply bootstrap
boot_results <- boot(data = lalonde, statistic = boot_psm, R = 500)
# Display bootstrapped ATT estimate
boot.ci(boot_results, type = "perc")
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 500 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_results, type = "perc")
##
## Intervals :
## Level Percentile
## 95% (-299, 3368 )
## Calculations and Intervals on Original Scale
# Compare propensity scores between treated and control groups
summary(lalonde$psfit[lalonde$treat == 1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0282 0.5355 0.6902 0.6393 0.8476 0.9572
summary(lalonde$psfit[lalonde$treat == 0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000251 0.0256880 0.0590195 0.1555265 0.1587831 0.9361479
# Compute density for treated (D == 1)
psfitdens1 <- kdensity(lalonde$psfit[lalonde$treat == 1])
# Compute density for control (D == 0)
psfitdens0 <- kdensity(lalonde$psfit[lalonde$treat == 0])
# Plot density for participants (treated)
plot(psfitdens1, main = "Density of Fitted Propensity Score",
xlab = "Propensity Score", ylab = "Density", col = "blue", lwd = 2)
# Add density for non-participants (control)
lines(psfitdens0, col = "red", lwd = 2)
# Add legend
legend("topright", legend = c("Treated", "Control"),
col = c("blue", "red"), lwd = 2)
# alternative graph
ggplot(lalonde, aes(x = psfit, fill = as.factor(treat))) +
geom_density(alpha = 0.5) +
ggtitle("Density of Fitted Propensity Scores: Treated vs. Control")
#### Histogram
ggplot(lalonde, aes(x = psfit, fill = as.factor(treat))) +
geom_histogram(binwidth = 0.05, alpha = 0.5, position = "identity") +
ggtitle("Histogram of Propensity Scores: Treated vs. Control")
There is overlap but not that much, only at the tails. Thus, I believe that the overlap condition does not hold.
library(MatchIt)
## Warning: package 'MatchIt' was built under R version 4.4.2
##
## Attaching package: 'MatchIt'
## The following object is masked _by_ '.GlobalEnv':
##
## lalonde
## The following object is masked from 'package:cobalt':
##
## lalonde
# Imposing common support using MatchIt
matchit_psm <- matchit(treat ~ poly(age, 2) + poly(educ, 2) + black + hispanic + married + nodegree + re74 + re75,
data = lalonde, method = "nearest", ratio = 3)
## Warning: Not all treated units will get 3 matches.
# Extract matched data
matched_data <- match.data(matchit_psm)
# Run OLS on matched data
matchit_ols <- lm(re78 ~ treat, data = matched_data)
# Display results
summary(matchit_ols)
##
## Call:
## lm(formula = re78 ~ treat, data = matched_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6984 -6349 -2048 4100 53959
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6984.2 360.7 19.362 <2e-16 ***
## treat -635.0 657.1 -0.966 0.334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7471 on 612 degrees of freedom
## Multiple R-squared: 0.001524, Adjusted R-squared: -0.0001079
## F-statistic: 0.9338 on 1 and 612 DF, p-value: 0.3342
The results do not make intuitive sense since the treatment is negative and not significant. Additional check may involve below:
# Assess covariate balance before and after matching
library(cobalt)
# Plot standardized mean differences before and after matching
bal.tab(matchit_psm, var.names = covariates)
## Balance Measures
## Type Diff.Adj
## distance Distance 1.9128
## poly(age, 2)_1 Contin. -0.2543
## poly(age, 2)_2 Contin. -0.4805
## poly(educ, 2)_1 Contin. 0.0623
## poly(educ, 2)_2 Contin. -0.5258
## black Binary 0.6604
## hispanic Binary -0.0883
## married Binary -0.3288
## nodegree Binary 0.1063
## re74 Contin. -0.7197
## re75 Contin. -0.3050
##
## Sample sizes
## Control Treated
## All 429. 185
## Matched (ESS) 414.01 185
## Matched (Unweighted) 429. 185
love.plot(matchit_psm, threshold = 0.1, abs = TRUE)
## Warning: Standardized mean differences and raw mean differences are present in the same plot.
## Use the `stars` argument to distinguish between them and appropriately label the x-axis.
Matches between covariates do not seem to balanced as the values are far from zero, both for adjusted and unadjusted models. Other than education_2,1, hispanic race, and no degree, all covariates are unbalanced.