Setting Working Directory

setwd("C:/Users/Lenovo/OneDrive - The Pennsylvania State University/Penn State/Coursework/Sem IV/EEFE 530/Problem Sets/Problem Set 5")

#Libraries

# Load packages
library(pacman)
p_load(Matching, Jmisc, lmtest, sandwich, kdensity, haven, boot, 
       cobalt, Matchit, Zelig, estimatr, cem, tidyverse, 
       lubridate, usmap, gridExtra, stringr, readxl, plot3D,  
       cowplot, reshape2, scales, broom, data.table, ggplot2, stargazer,  
       foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont, 
       kableExtra, snakecase, janitor)

## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## Installing package into 'C:/Users/Lenovo/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)

library(wooldridge)

## 
## Attaching package: 'wooldridge'

## The following object is masked from 'package:MASS':
## 
##     cement

library(fixest)

## 
## Attaching package: 'fixest'

## The following object is masked from 'package:scales':
## 
##     pvalue

## The following object is masked from 'package:Jmisc':
## 
##     demean

library(multiwayvcov)
library(lmtest)
library(stargazer)
library(modelsummary)

## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)

library(causalweight)

## Loading required package: ranger

library(fixest)

Load Data

# Load `kielmc` data
data(kielmc)
attach(kielmc)

as.data.frame(table(nearinc, y81))

##   nearinc y81 Freq
## 1       0   0  123
## 2       1   0   56
## 3       0   1  102
## 4       1   1   40

summary(rprice)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26000   59000   82000   83721  100230  300000

length(unique(cbd)) # number of clusters

## [1] 34

#PROBLEM 1

Part i . Without Covariates

# 2x2 Difference-in-Differences Model
did_model <- feols(rprice ~ nearinc * y81, data = kielmc)

# Cluster-robust standard errors at the CBD level
cl_vcov <- vcov(did_model, cluster = ~cbd)

# Display results
modelsummary(did_model, vcov = cl_vcov, stars = TRUE)

	(1)
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	82517.228***
	(3221.924)
nearinc	-18824.370*
	(7796.294)
y81	18790.286***
	(5154.864)
nearinc × y81	-11863.903+
	(6621.818)
Num.Obs.	321
R2	0.174
R2 Adj.	0.166
AIC	7538.5
BIC	7553.5
RMSE	30053.88
Std.Errors	Custom

The results suggest that the treatment or a house being near to incinerator reduces the real house prices by approximately $82,500 in 1978 US Dollars.

Part ii. With Covariates

(1)

# 2x2 Difference-in-Differences Model with Covariates
did_model1 <- feols(rprice ~ nearinc * y81 + rooms + baths + age + agesq + larea + lland, 
                       data = kielmc)

# Cluster-robust standard errors at the CBD level
cl_vcov1 <- vcov(did_model1, cluster = ~cbd)

(2)

# 2x2 Difference-in-Differences Model with Covariates
did_model2 <- feols(rprice ~ nearinc * y81 + rooms + baths + age + area + land, 
                       data = kielmc)

# Cluster-robust standard errors at the CBD level
cl_vcov2 <- vcov(did_model2, cluster = ~cbd)

Estimation Results

# Create a comparative table with model estimates
modelsummary(list("Model 1" = did_model1, "Model 2" = did_model2),
             vcov = list(cl_vcov1, cl_vcov2),  # Apply cluster-robust SEs
             stars = TRUE,  # Display significance stars
             output = "markdown")  # Display in a clean table format

	Model 1	Model 2
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
(Intercept)	-279726.300**	-15131.031
	(93485.448)	(11348.423)
nearinc	15554.707	4877.285
	(10224.753)	(7465.639)
y81	16672.713***	13792.808***
	(3714.861)	(3839.912)
rooms	2978.963*	4211.879*
	(1401.381)	(1731.493)
baths	7458.220**	11103.272***
	(2595.273)	(2504.100)
age	-616.245***	-190.563***
	(128.447)	(53.182)
agesq	3.053***
	(0.776)
larea	32815.339***
	(8487.841)
lland	7194.257
	(4660.224)
nearinc × y81	-17545.704**	-11692.428*
	(5821.574)	(4884.688)
area		17.937**
		(5.653)
land		0.118
		(0.111)
Num.Obs.	321	321
R2	0.637	0.636
R2 Adj.	0.627	0.626
AIC	7286.3	7285.8
BIC	7324.1	7319.7
RMSE	19917.40	19961.94
Std.Errors	Custom	Custom

(3)

# 1. Compute IPW Weights

## Define covariates
covariates <- c("rooms", "baths", "age", "agesq", "larea", "lland")

## Estimate the propensity score model
#ps_model <- glm(nearinc ~ ., data = kielmc[, c("nearinc", covariates)], family = binomial)

## Compute predicted probabilities (propensity scores)
#kielmc$pscore <- predict(ps_model, type = "response")

## Compute inverse probability weights (IPW)
#kielmc$ipw <- 1 / kielmc$pscore

# 2. DID using IPW
## IPW-DiD model with cluster bootstrap standard errors
did_ipw <- didweight(kielmc$rprice,     # Outcome variable
                     kielmc$nearinc,    # Treatment indicator
                     kielmc$y81,        # Time indicator
                     x = as.matrix(kielmc[, covariates]),  # Covariates matrix
                     boot = 500,            # Number of bootstrap repetitions
                     cluster = kielmc$cbd)  # Cluster variable
            # Cluster variable

# 3. Results
treatment_effect <- as.numeric(did_ipw$effect)
se <- as.numeric(did_ipw$se)
pval <- as.numeric(did_ipw$pvalue)

# Create a clean results table with rounded values
ipw_results <- data.frame(
  Metric = c("Treatment Effect", "Standard Error", "P-Value"),
  Value = c(
    round(did_ipw$effect, 2),
    round(did_ipw$se, 2),
    round(did_ipw$pvalue, 4)
  )
)


## Display results 
kable(ipw_results, caption = "Inverse Probability Weighted (IPW) DiD Results")

Inverse Probability Weighted (IPW) DiD Results
Metric	Value
Treatment Effect	-18380.3700
Standard Error	24248.5500
P-Value	0.4485

(4) Interpretation

When I run a model with logged covariates (1), my results show that the treatment reduces real house price by USD 17,500. Whereas, when I use covariate values at level (2), the effect on house prices goes down by USD 11,000. However, when I use IPW on selected leveled covariates, the effect on housing prices reduces by approximately USD 18,380.

(5) Preferred Estimation

It is evident that the magnitude of the effect is going down by adding covariates. Thus, not adding covariates to the model will give biased estimates. However, the effect is lowest while using IPW. IPW helps in balancing the groups so that they are more comparable and ensures similar distribution of covariates. Thus, conditional on covariates, IPW gives the most reliable estimate.

Part iii. TWFE

(1)

Equation 1 is TWFE specification, where as equation 2 is standard DID. The first controls for county and time fixed effects, the latter controls for treatment-group status and time, but not the fixed effects. Former assumes homogenous treatment, while latter can assume for heterogenous treatment. The parallel trend assumption is held in the former case AFTER cpntrolling for county and time FE, whereas the latter assumes PT between the groups before treatment.

Given they both want an estimayte that is robust to treatment effect heterogenity, it would be best to use standard DID (equation 2). It will provide clearer interpretation and it will be easier to test PT.

(2) Compare Results via three models

# Friend's data
i <- c(1, 1, 2, 2, 3, 3, 4, 4)
t <- c(1, 2, 1, 2, 1, 2, 1, 2)
P_t <- c(0, 1, 0, 1, 0, 1, 0, 1)
D_it <- c(0, 0, 0, 0, 0, 1, 0, 1)
D_i <- c(0, 0, 0, 0, 1, 1, 1, 1)
Y_it <- c(6, 7, 10, 9, 8, 14, 8, 12)

df <- data.frame(i=i, t=t, P_t=P_t, D_it=D_it, D_i=D_i, Y_it=Y_it)
df

##   i t P_t D_it D_i Y_it
## 1 1 1   0    0   0    6
## 2 1 2   1    0   0    7
## 3 2 1   0    0   0   10
## 4 2 2   1    0   0    9
## 5 3 1   0    0   1    8
## 6 3 2   1    1   1   14
## 7 4 1   0    0   1    8
## 8 4 2   1    1   1   12

#Equation 1
twfe_model <- feols(Y_it ~ D_it | i + t, data = df, cluster = ~i)

#Equation 2
did_model <- feols(Y_it ~ D_i + P_t + D_i:P_t, data = df, cluster = ~i)

#Equation 3
rsa_model <- feols(Y_it ~ D_i:P_t | i + t, data = df, cluster = ~i)

#Compare Results
modelsummary(
  list("TWFE" = twfe_model, 
       "Classic DiD" = did_model, 
       "Roth et al. (2023)" = rsa_model),
  stars = TRUE, 
  gof_omit = "IC|R2|Log|F",
  output = "markdown"
)

	TWFE	Classic DiD	Roth et al. (2023)
+ p < 0.1, * p < 0.05, p < 0.01, * p < 0.001
D_it	5.000*
	(1.366)
(Intercept)		8.000*
		(2.160)
D_i		0.000
		(2.160)
P_t		0.000
		(1.080)
D_i × P_t		5.000*	5.000*
		(1.528)	(1.366)
Num.Obs.	8	8	8
RMSE	0.50	1.22	0.50
Std.Errors	by: i	by: i	by: i

The estimates from equation 1 and 3 are identifical because given the data set up, it is still two periods therefore TWFE and treatment is defined identically as well. Also, in 2 periods and 2 groups, TWFE = DID. Hence, proving that point. Thus, all estimates are the same with slightly different standard errors.

PROBLEM 2

# Load `mpdta` data
library(did)

## Warning: package 'did' was built under R version 4.4.2

data(mpdta)
table(mpdta$treat, mpdta$year)

##    
##     2003 2004 2005 2006 2007
##   0  309  309  309  309  309
##   1  191  191  191  191  191

#table(JC$trainy2)
table(JC$assignment, JC$trainy1)

##    
##        0    1
##   0 1809 1854
##   1  857 4720

length(unique(mpdta$first.treat)) # cohorts or groups (incl. never-treated)

## [1] 4

table(mpdta$first.treat, mpdta$year)

##       
##        2003 2004 2005 2006 2007
##   0     309  309  309  309  309
##   2004   20   20   20   20   20
##   2006   40   40   40   40   40
##   2007  131  131  131  131  131

summary(mpdta$first.treat[mpdta$treat==0])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0       0       0       0       0

summary(mpdta$first.treat[mpdta$treat==1])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2004    2006    2007    2006    2007    2007

summary(mpdta$lemp)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.099   4.868   5.697   5.773   6.684  10.443

Part i. Descriptive County-level Maps

(1) Maps

Map 1

mpdta <- mpdta %>%
  dplyr::rename(fips = countyreal)  # Ensure 'fips' column exists

mpdta$treat <- as.factor(mpdta$treat)  # Convert treatment variable to categorical

# Map 1:  Treatment status
plot_treat <- plot_usmap(data = mpdta, values = "treat", regions = "counties", include = mpdta$fips) +
  scale_fill_manual(values = c("gray90", "darkred"), name = "Treatment Status") +
  labs(title = "County-Level Map: Treatment Status") +
  theme_minimal()


# Map 2: First Treatment Year
mpdta$first.treat <- as.factor(mpdta$first.treat)

plot_first_treat <- plot_usmap(data = mpdta, values = "first.treat", regions = "counties") +
  scale_fill_viridis_d(name = "First Treatment Year") +  
  labs(title = "County-Level Map: First Year of Treatment") +
  theme_minimal()


# Map Together
library(gridExtra)
grid.arrange(plot_treat, plot_first_treat, ncol = 1)  # ncol = 1 stacks them vertically

(2) Means of Y - Maps

# Ensure first.treat is numeric
mpdta$first.treat <- as.numeric(as.character(mpdta$first.treat))

# Filter only treated counties (exclude never-treated counties `first.treat == 0`)
mpdta_treated <- mpdta %>%
  filter(first.treat %in% c(2004, 2005, 2006))

# Compute mean log employment for each county by first treatment year
lemp_summary <- mpdta_treated %>%
  group_by(year, first.treat) %>%
  dplyr::summarise(mean_lemp = mean(lemp, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Print summary to check
print(lemp_summary)

## # A tibble: 10 × 3
##     year first.treat mean_lemp
##    <int>       <dbl>     <dbl>
##  1  2003        2004      6.18
##  2  2003        2006      6.57
##  3  2004        2004      6.11
##  4  2004        2006      6.52
##  5  2005        2004      6.06
##  6  2005        2006      6.53
##  7  2006        2004      6.03
##  8  2006        2006      6.56
##  9  2007        2004      6.09
## 10  2007        2006      6.54

# Create a list to store plots
plot_list <- list()

# Loop over each first.treat year (2004, 2006, 2007)
for (ft in c(2004, 2006, 2007)) {
  
  # Filter data for counties first treated in year `ft`
  df_ft <- filter(mpdta, first.treat == ft)
  
  # Generate the county-level map
  p <- plot_usmap(data = df_ft, values = "lemp", regions = "counties") +
    scale_fill_viridis_c(name = "Log Employment") +
    labs(title = paste("Log Employment (lemp) for First Treatment Year:", ft)) +
    theme_minimal()
  
  # Store plot in list
  plot_list[[as.character(ft)]] <- p
}

# Display maps in a grid (1 column, 3 rows)
grid.arrange(grobs = plot_list, ncol = 1)

Part ii. Cohort-year-specific average treatment effects on the treated

(1) Staggered DID

# Getting data again
data(mpdta)

# Estimate ATT using `att_gt`
att_results <- att_gt(yname = "lemp",            # Outcome variable
                      tname = "year",           # Time variable
                      idname = "countyreal",    # County identifier
                      gname = "first.treat",    # First treatment year (cohort)
                      xformla = ~lpop,         # Covariate (log population)
                      data = mpdta,           
                      est_method = "reg",       # Outcome regression estimation
                      control_group = "nevertreated",  # Never-treated as control group
                      clustervars = "countyreal")  # Cluster SEs at county level
#Print Results
summary(att_results)

## 
## Call:
## att_gt(yname = "lemp", tname = "year", idname = "countyreal", 
##     gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "nevertreated", 
##     clustervars = "countyreal", est_method = "reg")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## Group-Time Average Treatment Effects:
##  Group Time ATT(g,t) Std. Error [95% Simult.  Conf. Band]  
##   2004 2004  -0.0149     0.0225       -0.0742      0.0443  
##   2004 2005  -0.0770     0.0295       -0.1547      0.0007  
##   2004 2006  -0.1411     0.0399       -0.2462     -0.0360 *
##   2004 2007  -0.1075     0.0327       -0.1937     -0.0214 *
##   2006 2004  -0.0021     0.0243       -0.0660      0.0618  
##   2006 2005  -0.0070     0.0191       -0.0574      0.0435  
##   2006 2006   0.0008     0.0203       -0.0526      0.0542  
##   2006 2007  -0.0415     0.0200       -0.0942      0.0111  
##   2007 2004   0.0264     0.0147       -0.0122      0.0650  
##   2007 2005  -0.0048     0.0161       -0.0473      0.0377  
##   2007 2006  -0.0285     0.0191       -0.0788      0.0218  
##   2007 2007  -0.0288     0.0168       -0.0731      0.0155  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## P-value for pre-test of parallel trends assumption:  0.23116
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Outcome Regression

The results suggest that the group that was treated in 2004, their outcomes in time 2006 and 2007 were negative and significant. Suggesting that group treated in 2004 saw a decline in the log of employment by 3.5% in year 2006 and 0.95% in 2007.

(2) Staggered DID with IPW

# Estimate ATT using inverse probability weighting (IPW)
att_ipw <- att_gt(yname = "lemp",          # Outcome variable
                  tname = "year",          # Time variable
                  idname = "countyreal",   # County identifier (for clustering)
                  gname = "first.treat",   # First treatment year (cohort)
                  xformla = ~ lpop,        # Covariate (log population)
                  data = mpdta,          
                  est_method = "ipw",      # Inverse Probability Weighting (IPW)
                  control_group = "nevertreated", # Never-treated as control
                  clustervars = "countyreal") # Cluster SEs at county level

#Print Results
summary(att_ipw)

## 
## Call:
## att_gt(yname = "lemp", tname = "year", idname = "countyreal", 
##     gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "nevertreated", 
##     clustervars = "countyreal", est_method = "ipw")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## Group-Time Average Treatment Effects:
##  Group Time ATT(g,t) Std. Error [95% Simult.  Conf. Band]  
##   2004 2004  -0.0145     0.0222       -0.0723      0.0432  
##   2004 2005  -0.0764     0.0304       -0.1556      0.0027  
##   2004 2006  -0.1405     0.0387       -0.2412     -0.0397 *
##   2004 2007  -0.1069     0.0349       -0.1978     -0.0161 *
##   2006 2004  -0.0009     0.0228       -0.0602      0.0585  
##   2006 2005  -0.0064     0.0182       -0.0537      0.0409  
##   2006 2006   0.0012     0.0192       -0.0488      0.0513  
##   2006 2007  -0.0413     0.0191       -0.0910      0.0083  
##   2007 2004   0.0266     0.0140       -0.0100      0.0631  
##   2007 2005  -0.0047     0.0158       -0.0457      0.0363  
##   2007 2006  -0.0283     0.0199       -0.0801      0.0234  
##   2007 2007  -0.0289     0.0180       -0.0758      0.0180  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## P-value for pre-test of parallel trends assumption:  0.23604
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Inverse Probability Weighting

The results are similar for the year 2006 for group treated in 2004, 3.98%, slightly higher though. However, for the year 2007, the decline in log employment increases to 2% as compared barely 1% above. However, the estimates are still similar.

(3) DR estimation

# Estimate ATT using the doubly robust (DR) estimator
att_dr <- att_gt(yname = "lemp",           # Outcome variable (log employment)
                 tname = "year",           # Time variable
                 idname = "countyreal",    # County identifier (for clustering)
                 gname = "first.treat",    # First treatment year (cohort)
                 xformla = ~ lpop,         # Covariate (log population)
                 data = mpdta,           
                 est_method = "dr",        # Doubly robust (DR) estimation
                 control_group = "nevertreated",  # Never-treated as control
                 clustervars = "countyreal") # Cluster SEs at county level

#Print Results
summary(att_dr)

## 
## Call:
## att_gt(yname = "lemp", tname = "year", idname = "countyreal", 
##     gname = "first.treat", xformla = ~lpop, data = mpdta, control_group = "nevertreated", 
##     clustervars = "countyreal", est_method = "dr")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## Group-Time Average Treatment Effects:
##  Group Time ATT(g,t) Std. Error [95% Simult.  Conf. Band]  
##   2004 2004  -0.0145     0.0230       -0.0755      0.0464  
##   2004 2005  -0.0764     0.0315       -0.1598      0.0069  
##   2004 2006  -0.1404     0.0379       -0.2408     -0.0401 *
##   2004 2007  -0.1069     0.0332       -0.1947     -0.0191 *
##   2006 2004  -0.0005     0.0233       -0.0621      0.0611  
##   2006 2005  -0.0062     0.0185       -0.0551      0.0427  
##   2006 2006   0.0010     0.0200       -0.0518      0.0537  
##   2006 2007  -0.0413     0.0192       -0.0921      0.0095  
##   2007 2004   0.0267     0.0142       -0.0109      0.0644  
##   2007 2005  -0.0046     0.0154       -0.0452      0.0360  
##   2007 2006  -0.0284     0.0188       -0.0782      0.0213  
##   2007 2007  -0.0288     0.0175       -0.0751      0.0175  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## P-value for pre-test of parallel trends assumption:  0.23267
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust

The decline in log employment for the group 2004 in year 2006 and 2007 is highest in this model with decline being 4.16% in 2006 and 1.57% in 2007. This is higher than the estimates above.

(4) GGDID

# Group Time ATT Plot
ggdid(att_dr)

Yes, as we look at the first plot for group 2004, the estimates for 2006 and 2007 are negative and significant.

Part iii. Cohort-specific and aggregate average treatment effects on the treated

(1) Aggregate at group level

att_agg_dr <- aggte(att_dr, type = "group")
att_agg_dr

## 
## Call:
## aggte(MP = att_dr, type = "group")
## 
## Reference: Callaway, Brantly and Pedro H.C. Sant'Anna.  "Difference-in-Differences with Multiple Time Periods." Journal of Econometrics, Vol. 225, No. 2, pp. 200-230, 2021. <https://doi.org/10.1016/j.jeconom.2020.12.001>, <https://arxiv.org/abs/1803.09015> 
## 
## 
## Overall summary of ATT's based on group/cohort aggregation:  
##      ATT    Std. Error     [ 95%  Conf. Int.]  
##  -0.0328        0.0111    -0.0546      -0.011 *
## 
## 
## Group Effects:
##  Group Estimate Std. Error [95% Simult.  Conf. Band]  
##   2004  -0.0846     0.0283       -0.1464     -0.0228 *
##   2006  -0.0202     0.0185       -0.0606      0.0202  
##   2007  -0.0288     0.0169       -0.0656      0.0080  
## ---
## Signif. codes: `*' confidence band does not cover 0
## 
## Control Group:  Never Treated,  Anticipation Periods:  0
## Estimation Method:  Doubly Robust

ATT estimates for the groups suggest that for the group treated in 2004, they saw a decline in the log of employment by 8.46% whereas other groups did not have a significant decline post treatment.

(2)

(a) Method 1: Sun and Abraham 2020

model_fixest <- feols(lemp ~ sunab(first.treat, year) + lpop,
                      data = mpdta, cluster = ~countyreal)

#Summary
summary(model_fixest)

## OLS estimation, Dep. Var.: lemp
## Observations: 2,500
## Standard-errors: Clustered (countyreal) 
##              Estimate Std. Error   t value   Pr(>|t|)    
## (Intercept)  2.150178   0.071836 29.931880  < 2.2e-16 ***
## year::-4    -0.089159   0.050040 -1.781748 7.5398e-02 .  
## year::-3    -0.018323   0.045764 -0.400370 6.8906e-01    
## year::-2    -0.023724   0.043255 -0.548463 5.8362e-01    
## year::0      0.002722   0.041935  0.064904 9.4828e-01    
## year::1      0.227935   0.050508  4.512888 7.9844e-06 ***
## year::2      0.075223   0.074440  1.010517 3.1274e-01    
## year::3      0.133907   0.064962  2.061319 3.9790e-02 *  
## lpop         1.093460   0.016876 64.792639  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.540817   Adj. R2: 0.870793

According to this, the estimates are positive significant post treatment in year 1 and year 3. The treatment increased employment in 22.7% in year 1 and dials down up until year 3. This is in contrast with above results which show that treatment has negative effect on employment.

(b) Method 2: Gardner (2021)

library(did2s)

## Warning: package 'did2s' was built under R version 4.4.3

## did2s (v1.0.2). For more information on the methodology, visit <https://www.kylebutts.github.io/did2s>
## 
## To cite did2s in publications use:
## 
##   Butts, Kyle (2021).  did2s: Two-Stage Difference-in-Differences
##   Following Gardner (2021). R package version 1.0.2.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {did2s: Two-Stage Difference-in-Differences Following Gardner (2021)},
##     author = {Kyle Butts},
##     year = {2021},
##     url = {https://github.com/kylebutts/did2s/},
##   }

# Run Model
model_did2s <- did2s(
  mpdta,
  yname = "lemp",
  first_stage = ~ lpop | year,
  second_stage = ~ treat * first.treat,
  treatment = "treat",
  cluster_var = "countyreal"
)

## Running Two-stage Difference-in-Differences
##  - first stage formula `~ lpop | year`
##  - second stage formula `~ treat * first.treat`
##  - The indicator variable that denotes when treatment is on is `treat`
##  - Standard errors will be clustered by `countyreal`

# Summary
summary(model_did2s)

## OLS estimation, Dep. Var.: lemp
## Observations: 2,500
## Standard-errors: Custom 
##               Estimate Std. Error  t value   Pr(>|t|)    
## treat       239.106955  57.691193  4.14460 3.5174e-05 ***
## first.treat  -0.119164   0.028758 -4.14374 3.5305e-05 ***
## ... 1 variable was removed because of collinearity (treat:first.treat)
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## RMSE: 0.544414   Adj. R2: 0.015059

The estimate = -0.1192 for “first.treat” suggests that log employment is 11.9% lower in treated units after treatment, compared to what it would have been in absence of treatment.

(c) Method 3: Borusyak, Jaravel, and Spiess (2021)

library(didimputation)

## Warning: package 'didimputation' was built under R version 4.4.3

## 
## Attaching package: 'didimputation'

## The following objects are masked from 'package:did2s':
## 
##     df_het, df_hom

# Run Model
did_imp <- did_imputation(
  yname   = "lemp",            # outcome
  gname   = "first.treat",     # first treatment year (0 = never-treated)
  tname   = "year",            # time variable
  idname  = "countyreal",      # unit identifier
  data    = mpdta,            # dataframe
  cluster = "countyreal"       # cluster SEs
)

# Summary
summary(did_imp)

##      lhs                term              estimate          std.error      
##  Length:1           Length:1           Min.   :-0.04771   Min.   :0.01322  
##  Class :character   Class :character   1st Qu.:-0.04771   1st Qu.:0.01322  
##  Mode  :character   Mode  :character   Median :-0.04771   Median :0.01322  
##                                        Mean   :-0.04771   Mean   :0.01322  
##                                        3rd Qu.:-0.04771   3rd Qu.:0.01322  
##                                        Max.   :-0.04771   Max.   :0.01322  
##     conf.low          conf.high       
##  Min.   :-0.07363   Min.   :-0.02179  
##  1st Qu.:-0.07363   1st Qu.:-0.02179  
##  Median :-0.07363   Median :-0.02179  
##  Mean   :-0.07363   Mean   :-0.02179  
##  3rd Qu.:-0.07363   3rd Qu.:-0.02179  
##  Max.   :-0.07363   Max.   :-0.02179

The estimated ATT is -0.0477, which implies a 4.77% reduction in the log teen employment for treated counties after treatment, relative to their counterfactual.

EEFE 530: PS 5

2025-03-24