library(pacman)
p_load(devtools, synthdid, did, DRDID, gtrendsR, glmnet, causalweight, nnls, grf,
rdrobust, rdd, RDHonest, tidyverse, lubridate, usmap, gridExtra, stringr, readxl,
reshape2, scales, broom, data.table, ggplot2, stargazer, modelsummary,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor, tableone, lfe, fixest, multcomp, etable)
library(devtools)
library(synthdid)
library(MCPanel)
library(rngtools)
library(future)
library(doFuture)
library(future.batchtools)
mc_estimate = function(Y, N0, T0) {
N1=nrow(Y)-N0
T1=ncol(Y)-T0
W <- outer(c(rep(0,N0),rep(1,N1)),c(rep(0,T0),rep(1,T1)))
mc_pred <- mcnnm_cv(Y, 1-W, num_lam_L = 20)
mc_fit <- mc_pred$L + outer(mc_pred$u, mc_pred$v, '+')
mc_est <- sum(W*(Y-mc_fit))/sum(W)
mc_est
}
mc_placebo_se = function(Y, N0, T0, replications=200) {
N1 = nrow(Y) - N0
theta = function(ind) { mc_estimate(Y[ind,], length(ind)-N1, T0) }
sqrt((replications-1)/replications) * sd(replicate(replications, theta(sample(1:N0))))
}
difp_estimate = function(Y, N0, T0) {
synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)), eta.omega=1e-6)
}
sc_estimate_reg = function(Y, N0, T0) {
sc_estimate(Y, N0, T0, eta.omega=((nrow(Y)-N0)*(ncol(Y)-T0))^(1/4))
}
difp_estimate_reg = function(Y, N0, T0) {
synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)))
}
estimators = list(did=did_estimate,
sc=sc_estimate,
sdid=synthdid_estimate,
difp=difp_estimate,
mc = mc_estimate,
sc_reg = sc_estimate_reg,
difp_reg = difp_estimate_reg)
caprop99 <- read.csv("/Users/gracegao/Desktop/EEFE530/PS6/synthdid-sdid-paper/data/california_prop99.csv", sep = ";")
setup = panel.matrices(caprop99)
estimates = lapply(estimators, function(estimator) { estimator(setup$Y, setup$N0, setup$T0) } )
standard.errors = mapply(function(estimate, name) {
set.seed(12345)
if(name == 'mc') { mc_placebo_se(setup$Y, setup$N0, setup$T0) }
else { sqrt(vcov(estimate, method='placebo')) }
}, estimates, names(estimators))
# Table 1
california.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california.table) = c('estimate', 'standard error')
colnames(california.table) = toupper(names(estimators))
round(california.table, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -27.3 -19.6 -15.6 -11.1 -20.2 -21.7 -16.1
## standard error 17.7 9.9 8.4 9.5 11.5 10.0 9.2
# Figure 1
synthdid_plot(estimates[1:3], facet.vertical=FALSE,
control.name='control', treated.name='california',
lambda.comparable=TRUE, se.method = 'none',
trajectory.linetype = 1, line.width=.75, effect.curvature=-.4,
trajectory.alpha=.7, effect.alpha=.7,
diagram.alpha=1, onset.alpha=.7) +
theme(legend.position=c(.26,.07), legend.direction='horizontal',
legend.key=element_blank(), legend.background=element_blank(),
strip.background=element_blank(), strip.text.x = element_blank())
synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
Overall, the Proposition 99 led to a drop in cigarette consumption in
California.
Under SDID, there are more variations in sizes of weights among observed states, and there are no particularly high influenced states. The estimated effect of SDID is slightly higher, with all states distributed more closely around the solid line (the estimated effect).
# Restricted donor pool
caprop99_restricted_states <- caprop99[!(caprop99$State %in% c('Nevada', 'New Hampshire')), ]
setup_restricted_states = panel.matrices(caprop99_restricted_states)
estimates_restricted_states = lapply(estimators, function(estimator) { estimator(setup_restricted_states$Y, setup_restricted_states$N0, setup_restricted_states$T0) } )
standard.errors_restricted_states = mapply(function(estimate, name) {
set.seed(12345)
if(name == 'mc') { mc_placebo_se(setup_restricted_states$Y, setup_restricted_states$N0, setup_restricted_states$T0) }
else { sqrt(vcov(estimate, method='placebo')) }
}, estimates_restricted_states, names(estimators))
table2 = rbind(unlist(estimates_restricted_states), unlist(standard.errors_restricted_states))
rownames(table2) = c('estimate', 'standard error')
colnames(table2) = toupper(names(estimators))
round(table2, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -30.1 -21.7 -19.5 -16.7 -24.0 -23.4 -20.6
## standard error 13.5 14.7 10.0 11.4 10.4 14.6 11.1
synthdid_plot(estimates_restricted_states[1:3], facet.vertical=FALSE,
control.name='control', treated.name='california',
lambda.comparable=TRUE, se.method='none',
trajectory.linetype=1, line.width=.75, effect.curvature=-.4,
trajectory.alpha=.7, effect.alpha=.7,
diagram.alpha=1, onset.alpha=.7) +
theme(legend.position=c(.26,.07), legend.direction='horizontal',
legend.key=element_blank(), legend.background=element_blank(),
strip.background=element_blank(), strip.text.x = element_blank())
synthdid_units_plot(rev(estimates_restricted_states[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
# Restricted pre-treatment periods
caprop99_restricted_pre <- caprop99[!(caprop99$Year %in% 1970:1974), ]
setup_restricted_pre = panel.matrices(caprop99_restricted_pre)
estimates_restricted_pre = lapply(estimators, function(estimator) { estimator(setup_restricted_pre$Y, setup_restricted_pre$N0, setup_restricted_pre$T0) } )
standard.errors_restricted_pre = mapply(function(estimate, name) {
set.seed(12345)
if(name == 'mc') { mc_placebo_se(setup_restricted_pre$Y, setup_restricted_pre$N0, setup_restricted_pre$T0) }
else { sqrt(vcov(estimate, method='placebo')) }
}, estimates_restricted_pre, names(estimators))
table3 = rbind(unlist(estimates_restricted_pre), unlist(standard.errors_restricted_pre))
rownames(table3) = c('estimate', 'standard error')
colnames(table3) = toupper(names(estimators))
round(table3, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -23.7 -20.3 -17.9 -19.6 -20.8 -23.1 -18.4
## standard error 15.0 9.5 9.1 9.8 10.7 9.4 9.8
synthdid_plot(estimates_restricted_pre[1:3], facet.vertical=FALSE,
control.name='control', treated.name='california',
lambda.comparable=TRUE, se.method='none',
trajectory.linetype=1, line.width=.75, effect.curvature=-.4,
trajectory.alpha=.7, effect.alpha=.7,
diagram.alpha=1, onset.alpha=.7) +
theme(legend.position=c(.26,.07), legend.direction='horizontal',
legend.key=element_blank(), legend.background=element_blank(),
strip.background=element_blank(), strip.text.x = element_blank())
synthdid_units_plot(rev(estimates_restricted_pre[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
# Restricted post-treatment periods
caprop99_restricted_post <- caprop99[!(caprop99$Year %in% 1996:2000), ]
setup_restricted_post = panel.matrices(caprop99_restricted_post)
estimates_restricted_post = lapply(estimators, function(estimator) { estimator(setup_restricted_post$Y, setup_restricted_post$N0, setup_restricted_post$T0) } )
standard.errors_restricted_post = mapply(function(estimate, name) {
set.seed(12345)
if(name == 'mc') { mc_placebo_se(setup_restricted_post$Y, setup_restricted_post$N0, setup_restricted_post$T0) }
else { sqrt(vcov(estimate, method='placebo')) }
}, estimates_restricted_post, names(estimators))
table4 = rbind(unlist(estimates_restricted_post), unlist(standard.errors_restricted_post))
rownames(table4) = c('estimate', 'standard error')
colnames(table4) = toupper(names(estimators))
round(table4, digits=1)
## DID SC SDID DIFP MC SC_REG DIFP_REG
## estimate -22.2 -15.3 -9.8 -8.0 -14.2 -17.1 -10.8
## standard error 18.1 8.5 7.5 9.2 11.0 8.6 9.2
synthdid_plot(estimates_restricted_post[1:3], facet.vertical=FALSE,
control.name='control', treated.name='california',
lambda.comparable=TRUE, se.method='none',
trajectory.linetype=1, line.width=.75, effect.curvature=-.4,
trajectory.alpha=.7, effect.alpha=.7,
diagram.alpha=1, onset.alpha=.7) +
theme(legend.position=c(.26,.07), legend.direction='horizontal',
legend.key=element_blank(), legend.background=element_blank(),
strip.background=element_blank(), strip.text.x = element_blank())
synthdid_units_plot(rev(estimates_restricted_post[1:3]), se.method='none') +
theme(legend.background=element_blank(), legend.title = element_blank(),
legend.direction='horizontal', legend.position=c(.17,.07),
strip.background=element_blank(), strip.text.x = element_blank())
After removing New Hampshire and Nevada, the estimates show slightly
larger magnitudes overall. But the estimates are also less precise as
all the standard errors are bigger too.
After removing periods 1970-1974, we now have less information on pre-treatment periods, and have to make stronger parallel trend assumption. The DID and SDID estimates become less precise and robust.
After removing periods 1996-2000, now we have much shorter follow-up period for observing long-term effects of Proposition 99. All the estimates are smaller in magnitude, as we can’t fully capture the long-term impacts.
Yes. It’s about the bias variance trade off. Ridge regression introduces a regularization penalty that shrinks the estimated coefficients toward zero. Because of this shrinkage, the ridge estimator is biased for the true coefficient vector.
# a
housingdata <- readRDS("testdata20250121.RDS")
housingdata <- housingdata %>%
mutate(sale_date = ymd(sale_date),
year = year(sale_date),
month = month(sale_date),
day = day(sale_date))
housingdata <- housingdata %>%
mutate(log_sale_amount = log(sale_amount))
housingdata <- housingdata %>%
mutate(log_land_square_footage = log(land_square_footage))
train <- housingdata %>%
filter(!(abbr %in% c('NY', 'CO', 'CA')))
test <- housingdata %>%
filter(abbr %in% c('NY', 'CO', 'CA'))
# b
x_train <- model.matrix(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + log_land_square_footage + AC_presence, data = train)[,-1]
y_train <- train$log_sale_amount
set.seed(12345)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 5)
plot(lasso_cv)
# c
best_lambda <- lasso_cv$lambda.min
best_lambda
## [1] 0.007830992
lasso_coef <- coef(lasso_cv, s = "lambda.min")
lasso_coef
## 8 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 5.724545138
## year_built 0.002635312
## bedrooms_all_buildings 0.023204745
## number_of_bathrooms 0.240398223
## number_of_fireplaces .
## stories_number 0.165510321
## log_land_square_footage 0.044282235
## AC_presence 0.011759949
## Marginal WTP for AC = 1.18%
# d
x_test <- model.matrix(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + log_land_square_footage + AC_presence, data = test)[,-1]
predicted_log_price <- predict(lasso_cv, newx = x_test, s = "lambda.min")
test$predicted_log_price <- as.vector(predicted_log_price)
test %>%
group_by(AC_presence) %>%
summarize(avg_predicted_log_price = mean(predicted_log_price)) %>%
pivot_wider(names_from = AC_presence, values_from = avg_predicted_log_price) %>%
mutate(predicted_difference = `1` - `0`)
## # A tibble: 1 × 3
## `0` `1` predicted_difference
## <dbl> <dbl> <dbl>
## 1 12.2 12.3 0.0887
# e
test %>%
group_by(AC_presence) %>%
summarize(avg_observed_log_price = mean(log_sale_amount)) %>%
pivot_wider(names_from = AC_presence, values_from = avg_observed_log_price) %>%
mutate(observed_difference = `1` - `0`)
## # A tibble: 1 × 3
## `0` `1` observed_difference
## <dbl> <dbl> <dbl>
## 1 13.1 12.9 -0.191
# f
lm_ny <- lm(log_sale_amount ~ AC_presence, data = filter(test, abbr == "NY"))
summary(lm_ny)
##
## Call:
## lm(formula = log_sale_amount ~ AC_presence, data = filter(test,
## abbr == "NY"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.48078 -0.24719 0.01334 0.20850 2.96211
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.81718 0.03149 407.1 <2e-16 ***
## AC_presence 0.37802 0.03437 11.0 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3765 on 886 degrees of freedom
## Multiple R-squared: 0.1201, Adjusted R-squared: 0.1191
## F-statistic: 120.9 on 1 and 886 DF, p-value: < 2.2e-16
lm_co <- lm(log_sale_amount ~ AC_presence, data = filter(test, abbr == "CO"))
summary(lm_co)
##
## Call:
## lm(formula = log_sale_amount ~ AC_presence, data = filter(test,
## abbr == "CO"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.5447 -0.3303 0.0003 0.3115 5.4629
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.759313 0.001461 8734.67 <2e-16 ***
## AC_presence 0.118032 0.002510 47.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5987 on 254006 degrees of freedom
## Multiple R-squared: 0.008628, Adjusted R-squared: 0.008624
## F-statistic: 2211 on 1 and 254006 DF, p-value: < 2.2e-16
lm_ca <- lm(log_sale_amount ~ AC_presence, data = filter(test, abbr == "CA"))
summary(lm_ca)
##
## Call:
## lm(formula = log_sale_amount ~ AC_presence, data = filter(test,
## abbr == "CA"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9856 -0.5062 -0.0099 0.4720 6.5373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.200252 0.001283 10287.9 <2e-16 ***
## AC_presence -0.326734 0.001823 -179.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7961 on 763145 degrees of freedom
## Multiple R-squared: 0.0404, Adjusted R-squared: 0.0404
## F-statistic: 3.213e+04 on 1 and 763145 DF, p-value: < 2.2e-16
The three states show different effects of AC on price, the magnitude varies (which are much larger than the LASSO estimates) and the sign varies too. This difference reflects the variation in climate, housing market preferences, or the prevalence of AC. The LASSO estimate is a regularized average marginal effect, giving a more conservative result. It is biased downward, with a trade off for smaller variance.
The claim is false. Partialling out a subset of covariates and then regressing residuals on the remaining ones leads to biased estimates when the omitted covariates are correlated with the included ones. If X1 and X2 are correlated, the residuals contain information from the omitted variable. Therefore, the recovered betas are not the same as the betas in equation 1.
ca_test <- test %>% filter(abbr == "CA")
Y <- ca_test$log_sale_amount
D <- ca_test$AC_presence
X <- ca_test[, c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
"number_of_fireplaces", "stories_number", "log_land_square_footage")]
set.seed(12345)
dml_ac <- treatDML(
y = Y,
d = D,
x = X,
MLmethod = "lasso",
k = 3
)
dml_ac$effect
## [1] -0.2959389
dml_ac$se
## [1] 0.001891109
dml_ac$pval
## [1] 0
The results can be improved by increasing the number of cross-fitting folds and adding more relevant covariates.