setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 5")
#Libraries
# Load packages
library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot,
cobalt, Matchit, Zelig, estimatr, cem, tidyverse,
lubridate, usmap, gridExtra, stringr, readxl, plot3D,
cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor)
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
library(wooldridge)
##
## Attaching package: 'wooldridge'
## The following object is masked from 'package:MASS':
##
## cement
library(fixest)
##
## Attaching package: 'fixest'
## The following object is masked from 'package:scales':
##
## pvalue
## The following object is masked from 'package:Jmisc':
##
## demean
library(multiwayvcov)
library(lmtest)
library(stargazer)
library(modelsummary)
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
library(causalweight)
## Loading required package: ranger
library(fixest)
# Load `kielmc` data
data(kielmc)
attach(kielmc)
as.data.frame(table(nearinc, y81))
## nearinc y81 Freq
## 1 0 0 123
## 2 1 0 56
## 3 0 1 102
## 4 1 1 40
summary(rprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26000 59000 82000 83721 100230 300000
length(unique(cbd)) # number of clusters
## [1] 34
#PROBLEM 1
# 2x2 Difference-in-Differences Model
did_model <- feols(rprice ~ nearinc * y81, data = kielmc)
# Cluster-robust standard errors at the CBD level
cl_vcov <- vcov(did_model, cluster = ~cbd)
# Display results
modelsummary(did_model, vcov = cl_vcov, stars = TRUE)
| (1) | |
|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |
| (Intercept) | 82517.228*** |
| (3221.924) | |
| nearinc | -18824.370* |
| (7796.294) | |
| y81 | 18790.286*** |
| (5154.864) | |
| nearinc × y81 | -11863.903+ |
| (6621.818) | |
| Num.Obs. | 321 |
| R2 | 0.174 |
| R2 Adj. | 0.166 |
| AIC | 7538.5 |
| BIC | 7553.5 |
| RMSE | 30053.88 |
| Std.Errors | Custom |
The results suggest that the treatment or a house being near to incinerator reduces the real house prices by approximately $82,500 in 1978 US Dollars.
# 2x2 Difference-in-Differences Model with Covariates
did_model1 <- feols(rprice ~ nearinc * y81 + rooms + baths + age + agesq + larea + lland,
data = kielmc)
# Cluster-robust standard errors at the CBD level
cl_vcov1 <- vcov(did_model1, cluster = ~cbd)
# 2x2 Difference-in-Differences Model with Covariates
did_model2 <- feols(rprice ~ nearinc * y81 + rooms + baths + age + area + land,
data = kielmc)
# Cluster-robust standard errors at the CBD level
cl_vcov2 <- vcov(did_model2, cluster = ~cbd)
# Create a comparative table with model estimates
modelsummary(list("Model 1" = did_model1, "Model 2" = did_model2),
vcov = list(cl_vcov1, cl_vcov2), # Apply cluster-robust SEs
stars = TRUE, # Display significance stars
output = "markdown") # Display in a clean table format
| Model 1 | Model 2 | |
|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | ||
| (Intercept) | -279726.300** | -15131.031 |
| (93485.448) | (11348.423) | |
| nearinc | 15554.707 | 4877.285 |
| (10224.753) | (7465.639) | |
| y81 | 16672.713*** | 13792.808*** |
| (3714.861) | (3839.912) | |
| rooms | 2978.963* | 4211.879* |
| (1401.381) | (1731.493) | |
| baths | 7458.220** | 11103.272*** |
| (2595.273) | (2504.100) | |
| age | -616.245*** | -190.563*** |
| (128.447) | (53.182) | |
| agesq | 3.053*** | |
| (0.776) | ||
| larea | 32815.339*** | |
| (8487.841) | ||
| lland | 7194.257 | |
| (4660.224) | ||
| nearinc × y81 | -17545.704** | -11692.428* |
| (5821.574) | (4884.688) | |
| area | 17.937** | |
| (5.653) | ||
| land | 0.118 | |
| (0.111) | ||
| Num.Obs. | 321 | 321 |
| R2 | 0.637 | 0.636 |
| R2 Adj. | 0.627 | 0.626 |
| AIC | 7286.3 | 7285.8 |
| BIC | 7324.1 | 7319.7 |
| RMSE | 19917.40 | 19961.94 |
| Std.Errors | Custom | Custom |
# 1. Compute IPW Weights
## Define covariates
covariates <- c("rooms", "baths", "age", "agesq", "larea", "lland")
## Estimate the propensity score model
#ps_model <- glm(nearinc ~ ., data = kielmc[, c("nearinc", covariates)], family = binomial)
## Compute predicted probabilities (propensity scores)
#kielmc$pscore <- predict(ps_model, type = "response")
## Compute inverse probability weights (IPW)
#kielmc$ipw <- 1 / kielmc$pscore
# 2. DID using IPW
## IPW-DiD model with cluster bootstrap standard errors
did_ipw <- didweight(kielmc$rprice, # Outcome variable
kielmc$nearinc, # Treatment indicator
kielmc$y81, # Time indicator
x = as.matrix(kielmc[, covariates]), # Covariates matrix
boot = 500, # Number of bootstrap repetitions
cluster = kielmc$cbd) # Cluster variable
# Cluster variable
# 3. Results
treatment_effect <- as.numeric(did_ipw$effect)
se <- as.numeric(did_ipw$se)
pval <- as.numeric(did_ipw$pvalue)
# Create a clean results table with rounded values
ipw_results <- data.frame(
Metric = c("Treatment Effect", "Standard Error", "P-Value"),
Value = c(
round(did_ipw$effect, 2),
round(did_ipw$se, 2),
round(did_ipw$pvalue, 4)
)
)
## Display results
kable(ipw_results, caption = "Inverse Probability Weighted (IPW) DiD Results")
| Metric | Value |
|---|---|
| Treatment Effect | -18380.3700 |
| Standard Error | 24248.5500 |
| P-Value | 0.4485 |
When I run a model with logged covariates (1), my results show that the treatment reduces real house price by USD 17,500. Whereas, when I use covariate values at level (2), the effect on house prices goes down by USD 11,000. However, when I use IPW on selected leveled covariates, the effect on housing prices reduces by approximately USD 18,380.
It is evident that the magnitude of the effect is going down by adding covariates. Thus, not adding covariates to the model will give biased estimates. However, the effect is lowest while using IPW. IPW helps in balancing the groups so that they are more comparable and ensures similar distribution of covariates. Thus, conditional on covariates, IPW gives the most reliable estimate.
Equation 1 is TWFE specification, where as equation 2 is standard DID. The first controls for county and time fixed effects, the latter controls for treatment-group status and time, but not the fixed effects. Former assumes homogenous treatment, while latter can assume for heterogenous treatment. The parallel trend assumption is held in the former case AFTER cpntrolling for county and time FE, whereas the latter assumes PT between the groups before treatment.
Given they both want an estimayte that is robust to treatment effect heterogenity, it would be best to use standard DID (equation 2). It will provide clearer interpretation and it will be easier to test PT.
# Friend's data
i <- c(1, 1, 2, 2, 3, 3, 4, 4)
t <- c(1, 2, 1, 2, 1, 2, 1, 2)
P_t <- c(0, 1, 0, 1, 0, 1, 0, 1)
D_it <- c(0, 0, 0, 0, 0, 1, 0, 1)
D_i <- c(0, 0, 0, 0, 1, 1, 1, 1)
Y_it <- c(6, 7, 10, 9, 8, 14, 8, 12)
df <- data.frame(i=i, t=t, P_t=P_t, D_it=D_it, D_i=D_i, Y_it=Y_it)
df
## i t P_t D_it D_i Y_it
## 1 1 1 0 0 0 6
## 2 1 2 1 0 0 7
## 3 2 1 0 0 0 10
## 4 2 2 1 0 0 9
## 5 3 1 0 0 1 8
## 6 3 2 1 1 1 14
## 7 4 1 0 0 1 8
## 8 4 2 1 1 1 12
#Equation 1
twfe_model <- feols(Y_it ~ D_it | i + t, data = df, cluster = ~i)
#Equation 2
did_model <- feols(Y_it ~ D_i + P_t + D_i:P_t, data = df, cluster = ~i)
#Equation 3
rsa_model <- feols(Y_it ~ D_i:P_t | i + t, data = df, cluster = ~i)
#Compare Results
modelsummary(
list("TWFE" = twfe_model,
"Classic DiD" = did_model,
"Roth et al. (2023)" = rsa_model),
stars = TRUE,
gof_omit = "IC|R2|Log|F",
output = "markdown"
)
| TWFE | Classic DiD | Roth et al. (2023) | |
|---|---|---|---|
| + p < 0.1, * p < 0.05, ** p < 0.01, *** p < 0.001 | |||
| D_it | 5.000* | ||
| (1.366) | |||
| (Intercept) | 8.000* | ||
| (2.160) | |||
| D_i | 0.000 | ||
| (2.160) | |||
| P_t | 0.000 | ||
| (1.080) | |||
| D_i × P_t | 5.000* | 5.000* | |
| (1.528) | (1.366) | ||
| Num.Obs. | 8 | 8 | 8 |
| RMSE | 0.50 | 1.22 | 0.50 |
| Std.Errors | by: i | by: i | by: i |
The estimates from equation 1 and 3 are identifical because given the data set up, it is still two periods therefore TWFE and treatment is defined identically as well. Also, in 2 periods and 2 groups, TWFE = DID. Hence, proving that point. Thus, all estimates are the same with slightly different standard errors.
# Load `mpdta` data
library(did)
## Warning: package 'did' was built under R version 4.4.2
data(mpdta)
table(mpdta$treat, mpdta$year)
##
## 2003 2004 2005 2006 2007
## 0 309 309 309 309 309
## 1 191 191 191 191 191
#table(JC$trainy2)
table(JC$assignment, JC$trainy1)
##
## 0 1
## 0 1809 1854
## 1 857 4720
length(unique(mpdta$first.treat)) # cohorts or groups (incl. never-treated)
## [1] 4
table(mpdta$first.treat, mpdta$year)
##
## 2003 2004 2005 2006 2007
## 0 309 309 309 309 309
## 2004 20 20 20 20 20
## 2006 40 40 40 40 40
## 2007 131 131 131 131 131
summary(mpdta$first.treat[mpdta$treat==0])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 0 0 0
summary(mpdta$first.treat[mpdta$treat==1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2004 2006 2007 2006 2007 2007
summary(mpdta$lemp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.099 4.868 5.697 5.773 6.684 10.443
mpdta <- mpdta %>%
dplyr::rename(fips = countyreal) # Ensure 'fips' column exists
mpdta$treat <- as.factor(mpdta$treat) # Convert treatment variable to categorical
# Map 1: Treatment status
plot_treat <- plot_usmap(data = mpdta, values = "treat", regions = "counties", include = mpdta$fips) +
scale_fill_manual(values = c("gray90", "darkred"), name = "Treatment Status") +
labs(title = "County-Level Map: Treatment Status") +
theme_minimal()
# Map 2: First Treatment Year
mpdta$first.treat <- as.factor(mpdta$first.treat)
plot_first_treat <- plot_usmap(data = mpdta, values = "first.treat", regions = "counties") +
scale_fill_viridis_d(name = "First Treatment Year") +
labs(title = "County-Level Map: First Year of Treatment") +
theme_minimal()
# Map Together
library(gridExtra)
grid.arrange(plot_treat, plot_first_treat, ncol = 1) # ncol = 1 stacks them vertically
# Ensure first.treat is numeric
mpdta$first.treat <- as.numeric(as.character(mpdta$first.treat))
# Filter only treated counties (exclude never-treated counties `first.treat == 0`)
mpdta_treated <- mpdta %>%
filter(first.treat %in% c(2004, 2005, 2006))
# Compute mean log employment for each county by first treatment year
lemp_summary <- mpdta_treated %>%
group_by(year, first.treat) %>%
dplyr::summarise(mean_lemp = mean(lemp, na.rm = TRUE)) %>%
ungroup()
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Print summary to check
print(lemp_summary)
## # A tibble: 10 × 3
## year first.treat mean_lemp
## <int> <dbl> <dbl>
## 1 2003 2004 6.18
## 2 2003 2006 6.57
## 3 2004 2004 6.11
## 4 2004 2006 6.52
## 5 2005 2004 6.06
## 6 2005 2006 6.53
## 7 2006 2004 6.03
## 8 2006 2006 6.56
## 9 2007 2004 6.09
## 10 2007 2006 6.54
# Create a list to store plots
plot_list <- list()
# Loop over each first.treat year (2004, 2006, 2007)
for (ft in c(2004, 2006, 2007)) {
# Filter data for counties first treated in year `ft`
df_ft <- filter(mpdta, first.treat == ft)
# Generate the county-level map
p <- plot_usmap(data = df_ft, values = "lemp", regions = "counties") +
scale_fill_viridis_c(name = "Log Employment") +
labs(title = paste("Log Employment (lemp) for First Treatment Year:", ft)) +
theme_minimal()
# Store plot in list
plot_list[[as.character(ft)]] <- p
}
# Display maps in a grid (1 column, 3 rows)
grid.arrange(grobs = plot_list, ncol = 1)
# Getting data again
data(mpdta)
# Estimate ATT using `att_gt`
att_results <- att_gt(yname = "lemp", # Outcome variable
tname = "year", # Time variable
idname = "countyreal", # County identifier
gname = "first.treat", # First treatment year (cohort)
xformla = ~lpop, # Covariate (log population)
data = mpdta,
est_method = "reg", # Outcome regression estimation
control_group = "nevertreated", # Never-treated as control group
clustervars = "countyreal") # Cluster SEs at county level
#Print Results
summary(att_results)
##
## Call:
## att_gt(yname = "lemp", tname = "year", idname = "countyreal",
## gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "nevertreated",
## clustervars = "countyreal", est_method = "reg")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
## Group-Time Average Treatment Effects:
## Group Time ATT(g,t) Std. Error [95% Simult. Conf. Band]
## 2004 2004 -0.0149 0.0225 -0.0742 0.0443
## 2004 2005 -0.0770 0.0295 -0.1547 0.0007
## 2004 2006 -0.1411 0.0399 -0.2462 -0.0360 *
## 2004 2007 -0.1075 0.0327 -0.1937 -0.0214 *
## 2006 2004 -0.0021 0.0243 -0.0660 0.0618
## 2006 2005 -0.0070 0.0191 -0.0574 0.0435
## 2006 2006 0.0008 0.0203 -0.0526 0.0542
## 2006 2007 -0.0415 0.0200 -0.0942 0.0111
## 2007 2004 0.0264 0.0147 -0.0122 0.0650
## 2007 2005 -0.0048 0.0161 -0.0473 0.0377
## 2007 2006 -0.0285 0.0191 -0.0788 0.0218
## 2007 2007 -0.0288 0.0168 -0.0731 0.0155
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## P-value for pre-test of parallel trends assumption: 0.23116
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Outcome Regression
The results suggest that the group that was treated in 2004, their outcomes in time 2006 and 2007 were negative and significant. Suggesting that group treated in 2004 saw a decline in the log of employment by 3.5% in year 2006 and 0.95% in 2007.
# Estimate ATT using inverse probability weighting (IPW)
att_ipw <- att_gt(yname = "lemp", # Outcome variable
tname = "year", # Time variable
idname = "countyreal", # County identifier (for clustering)
gname = "first.treat", # First treatment year (cohort)
xformla = ~ lpop, # Covariate (log population)
data = mpdta,
est_method = "ipw", # Inverse Probability Weighting (IPW)
control_group = "nevertreated", # Never-treated as control
clustervars = "countyreal") # Cluster SEs at county level
#Print Results
summary(att_ipw)
##
## Call:
## att_gt(yname = "lemp", tname = "year", idname = "countyreal",
## gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "nevertreated",
## clustervars = "countyreal", est_method = "ipw")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
## Group-Time Average Treatment Effects:
## Group Time ATT(g,t) Std. Error [95% Simult. Conf. Band]
## 2004 2004 -0.0145 0.0222 -0.0723 0.0432
## 2004 2005 -0.0764 0.0304 -0.1556 0.0027
## 2004 2006 -0.1405 0.0387 -0.2412 -0.0397 *
## 2004 2007 -0.1069 0.0349 -0.1978 -0.0161 *
## 2006 2004 -0.0009 0.0228 -0.0602 0.0585
## 2006 2005 -0.0064 0.0182 -0.0537 0.0409
## 2006 2006 0.0012 0.0192 -0.0488 0.0513
## 2006 2007 -0.0413 0.0191 -0.0910 0.0083
## 2007 2004 0.0266 0.0140 -0.0100 0.0631
## 2007 2005 -0.0047 0.0158 -0.0457 0.0363
## 2007 2006 -0.0283 0.0199 -0.0801 0.0234
## 2007 2007 -0.0289 0.0180 -0.0758 0.0180
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## P-value for pre-test of parallel trends assumption: 0.23604
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Inverse Probability Weighting
The results are similar for the year 2006 for group treated in 2004, 3.98%, slightly higher though. However, for the year 2007, the decline in log employment increases to 2% as compared barely 1% above. However, the estimates are still similar.
# Estimate ATT using the doubly robust (DR) estimator
att_dr <- att_gt(yname = "lemp", # Outcome variable (log employment)
tname = "year", # Time variable
idname = "countyreal", # County identifier (for clustering)
gname = "first.treat", # First treatment year (cohort)
xformla = ~ lpop, # Covariate (log population)
data = mpdta,
est_method = "dr", # Doubly robust (DR) estimation
control_group = "nevertreated", # Never-treated as control
clustervars = "countyreal") # Cluster SEs at county level
#Print Results
summary(att_dr)
##
## Call:
## att_gt(yname = "lemp", tname = "year", idname = "countyreal",
## gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "nevertreated",
## clustervars = "countyreal", est_method = "dr")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
## Group-Time Average Treatment Effects:
## Group Time ATT(g,t) Std. Error [95% Simult. Conf. Band]
## 2004 2004 -0.0145 0.0230 -0.0755 0.0464
## 2004 2005 -0.0764 0.0315 -0.1598 0.0069
## 2004 2006 -0.1404 0.0379 -0.2408 -0.0401 *
## 2004 2007 -0.1069 0.0332 -0.1947 -0.0191 *
## 2006 2004 -0.0005 0.0233 -0.0621 0.0611
## 2006 2005 -0.0062 0.0185 -0.0551 0.0427
## 2006 2006 0.0010 0.0200 -0.0518 0.0537
## 2006 2007 -0.0413 0.0192 -0.0921 0.0095
## 2007 2004 0.0267 0.0142 -0.0109 0.0644
## 2007 2005 -0.0046 0.0154 -0.0452 0.0360
## 2007 2006 -0.0284 0.0188 -0.0782 0.0213
## 2007 2007 -0.0288 0.0175 -0.0751 0.0175
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## P-value for pre-test of parallel trends assumption: 0.23267
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
The decline in log employment for the group 2004 in year 2006 and 2007 is highest in this model with decline being 4.16% in 2006 and 1.57% in 2007. This is higher than the estimates above.
# Group Time ATT Plot
ggdid(att_dr)
Yes, as we look at the first plot for group 2004, the estimates for 2006 and 2007 are negative and significant.
att_agg_dr <- aggte(att_dr, type = "group")
att_agg_dr
##
## Call:
## aggte(MP = att_dr, type = "group")
##
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna. "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015>
##
##
## Overall summary of ATT's based on group/cohort aggregation:
## ATT Std. Error [ 95% Conf. Int.]
## -0.0328 0.0111 -0.0546 -0.011 *
##
##
## Group Effects:
## Group Estimate Std. Error [95% Simult. Conf. Band]
## 2004 -0.0846 0.0283 -0.1464 -0.0228 *
## 2006 -0.0202 0.0185 -0.0606 0.0202
## 2007 -0.0288 0.0169 -0.0656 0.0080
## ---
## Signif. codes: `*' confidence band does not cover 0
##
## Control Group: Never Treated, Anticipation Periods: 0
## Estimation Method: Doubly Robust
ATT estimates for the groups suggest that for the group treated in 2004, they saw a decline in the log of employment by 8.46% whereas other groups did not have a significant decline post treatment.
model_fixest <- feols(lemp ~ sunab(first.treat, year) + lpop,
data = mpdta, cluster = ~countyreal)
#Summary
summary(model_fixest)
## OLS estimation, Dep. Var.: lemp
## Observations: 2,500
## Standard-errors: Clustered (countyreal)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.150178 0.071836 29.931880 < 2.2e-16 ***
## year::-4 -0.089159 0.050040 -1.781748 7.5398e-02 .
## year::-3 -0.018323 0.045764 -0.400370 6.8906e-01
## year::-2 -0.023724 0.043255 -0.548463 5.8362e-01
## year::0 0.002722 0.041935 0.064904 9.4828e-01
## year::1 0.227935 0.050508 4.512888 7.9844e-06 ***
## year::2 0.075223 0.074440 1.010517 3.1274e-01
## year::3 0.133907 0.064962 2.061319 3.9790e-02 *
## lpop 1.093460 0.016876 64.792639 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.540817 Adj. R2: 0.870793
According to this, the estimates are positive significant post treatment in year 1 and year 3. The treatment increased employment in 22.7% in year 1 and dials down up until year 3. This is in contrast with above results which show that treatment has negative effect on employment.
library(did2s)
## Warning: package 'did2s' was built under R version 4.4.3
## did2s (v1.0.2). For more information on the methodology, visit <https://www.kylebutts.github.io/did2s>
##
## To cite did2s in publications use:
##
## Butts, Kyle (2021). did2s: Two-Stage Difference-in-Differences
## Following Gardner (2021). R package version 1.0.2.
##
## A BibTeX entry for LaTeX users is
##
## @Manual{,
## title = {did2s: Two-Stage Difference-in-Differences Following Gardner (2021)},
## author = {Kyle Butts},
## year = {2021},
## url = {https://github.com/kylebutts/did2s/},
## }
# Run Model
model_did2s <- did2s(
mpdta,
yname = "lemp",
first_stage = ~ lpop | year,
second_stage = ~ treat * first.treat,
treatment = "treat",
cluster_var = "countyreal"
)
## Running Two-stage Difference-in-Differences
## - first stage formula `~ lpop | year`
## - second stage formula `~ treat * first.treat`
## - The indicator variable that denotes when treatment is on is `treat`
## - Standard errors will be clustered by `countyreal`
# Summary
summary(model_did2s)
## OLS estimation, Dep. Var.: lemp
## Observations: 2,500
## Standard-errors: Custom
## Estimate Std. Error t value Pr(>|t|)
## treat 239.106955 57.691193 4.14460 3.5174e-05 ***
## first.treat -0.119164 0.028758 -4.14374 3.5305e-05 ***
## ... 1 variable was removed because of collinearity (treat:first.treat)
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.544414 Adj. R2: 0.015059
The estimate = -0.1192 for “first.treat” suggests that log employment is 11.9% lower in treated units after treatment, compared to what it would have been in absence of treatment.
library(didimputation)
## Warning: package 'didimputation' was built under R version 4.4.3
##
## Attaching package: 'didimputation'
## The following objects are masked from 'package:did2s':
##
## df_het, df_hom
# Run Model
did_imp <- did_imputation(
yname = "lemp", # outcome
gname = "first.treat", # first treatment year (0 = never-treated)
tname = "year", # time variable
idname = "countyreal", # unit identifier
data = mpdta, # dataframe
cluster = "countyreal" # cluster SEs
)
# Summary
summary(did_imp)
## lhs term estimate std.error
## Length:1 Length:1 Min. :-0.04771 Min. :0.01322
## Class :character Class :character 1st Qu.:-0.04771 1st Qu.:0.01322
## Mode :character Mode :character Median :-0.04771 Median :0.01322
## Mean :-0.04771 Mean :0.01322
## 3rd Qu.:-0.04771 3rd Qu.:0.01322
## Max. :-0.04771 Max. :0.01322
## conf.low conf.high
## Min. :-0.07363 Min. :-0.02179
## 1st Qu.:-0.07363 1st Qu.:-0.02179
## Median :-0.07363 Median :-0.02179
## Mean :-0.07363 Mean :-0.02179
## 3rd Qu.:-0.07363 3rd Qu.:-0.02179
## Max. :-0.07363 Max. :-0.02179
The estimated ATT is -0.0477, which implies a 4.77% reduction in the log teen employment for treated counties after treatment, relative to their counterfactual.