library(pacman)
p_load(devtools, synthdid, did, DRDID, gtrendsR, glmnet, causalweight, nnls, grf,
rdrobust, rdd, RDHonest, tidyverse, lubridate, usmap, gridExtra, stringr, readxl,
reshape2, scales, broom, data.table, ggplot2, stargazer, modelsummary,
foreign, ggthemes, ggforce, ggridges, latex2exp, viridis, extrafont,
kableExtra, snakecase, janitor, tableone, lfe, fixest, multcomp, etable)
library(devtools)
library(synthdid)
library(MCPanel)
library(rngtools)
library(future)
library(doFuture)
library(future.batchtools)

Problem 1

i. Partial replication

mc_estimate = function(Y, N0, T0) {
    N1=nrow(Y)-N0
    T1=ncol(Y)-T0
    W <- outer(c(rep(0,N0),rep(1,N1)),c(rep(0,T0),rep(1,T1)))
    mc_pred <- mcnnm_cv(Y, 1-W, num_lam_L = 20)
    mc_fit  <- mc_pred$L + outer(mc_pred$u, mc_pred$v, '+')
    mc_est <- sum(W*(Y-mc_fit))/sum(W)
    mc_est
}
mc_placebo_se = function(Y, N0, T0, replications=200) {
    N1 = nrow(Y) - N0
    theta = function(ind) { mc_estimate(Y[ind,], length(ind)-N1, T0) }
    sqrt((replications-1)/replications) * sd(replicate(replications, theta(sample(1:N0))))
}                

difp_estimate = function(Y, N0, T0) {
    synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)), eta.omega=1e-6)
}

sc_estimate_reg = function(Y, N0, T0) {
    sc_estimate(Y, N0, T0, eta.omega=((nrow(Y)-N0)*(ncol(Y)-T0))^(1/4))
}
difp_estimate_reg = function(Y, N0, T0) {
    synthdid_estimate(Y, N0, T0, weights=list(lambda=rep(1/T0, T0)))
}

estimators = list(did=did_estimate,
                  sc=sc_estimate,
                  sdid=synthdid_estimate,
                  difp=difp_estimate,
                  mc = mc_estimate,
                  sc_reg = sc_estimate_reg,
                  difp_reg = difp_estimate_reg)

caprop99 <- read.csv("/Users/gracegao/Desktop/EEFE530/PS6/synthdid-sdid-paper/data/california_prop99.csv", sep = ";")
setup = panel.matrices(caprop99)

estimates = lapply(estimators, function(estimator) { estimator(setup$Y, setup$N0, setup$T0) } )
standard.errors = mapply(function(estimate, name) {
  set.seed(12345)
  if(name == 'mc') { mc_placebo_se(setup$Y, setup$N0, setup$T0) }
  else {             sqrt(vcov(estimate, method='placebo'))     }
}, estimates, names(estimators))

# Table 1
california.table = rbind(unlist(estimates), unlist(standard.errors))
rownames(california.table) = c('estimate', 'standard error')
colnames(california.table) = toupper(names(estimators))

round(california.table, digits=1)
##                  DID    SC  SDID  DIFP    MC SC_REG DIFP_REG
## estimate       -27.3 -19.6 -15.6 -11.1 -20.2  -21.7    -16.1
## standard error  17.7   9.9   8.4   9.5  11.5   10.0      9.2
# Figure 1
synthdid_plot(estimates[1:3], facet.vertical=FALSE,
              control.name='control', treated.name='california',
              lambda.comparable=TRUE, se.method = 'none',
              trajectory.linetype = 1, line.width=.75, effect.curvature=-.4,
              trajectory.alpha=.7, effect.alpha=.7,
              diagram.alpha=1, onset.alpha=.7) +
    theme(legend.position=c(.26,.07), legend.direction='horizontal',
          legend.key=element_blank(), legend.background=element_blank(),
          strip.background=element_blank(), strip.text.x = element_blank())

synthdid_units_plot(rev(estimates[1:3]), se.method='none') +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

Overall, the Proposition 99 led to a drop in cigarette consumption in California.

Under SDID, there are more variations in sizes of weights among observed states, and there are no particularly high influenced states. The estimated effect of SDID is slightly higher, with all states distributed more closely around the solid line (the estimated effect).

ii. Extensions

# Restricted donor pool
caprop99_restricted_states <- caprop99[!(caprop99$State %in% c('Nevada', 'New Hampshire')), ]

setup_restricted_states = panel.matrices(caprop99_restricted_states)

estimates_restricted_states = lapply(estimators, function(estimator) { estimator(setup_restricted_states$Y, setup_restricted_states$N0, setup_restricted_states$T0) } )

standard.errors_restricted_states = mapply(function(estimate, name) {
  set.seed(12345)
  if(name == 'mc') { mc_placebo_se(setup_restricted_states$Y, setup_restricted_states$N0, setup_restricted_states$T0) }
  else {             sqrt(vcov(estimate, method='placebo')) }
}, estimates_restricted_states, names(estimators))

table2 = rbind(unlist(estimates_restricted_states), unlist(standard.errors_restricted_states))
rownames(table2) = c('estimate', 'standard error')
colnames(table2) = toupper(names(estimators))

round(table2, digits=1)
##                  DID    SC  SDID  DIFP    MC SC_REG DIFP_REG
## estimate       -30.1 -21.7 -19.5 -16.7 -24.0  -23.4    -20.6
## standard error  13.5  14.7  10.0  11.4  10.4   14.6     11.1
synthdid_plot(estimates_restricted_states[1:3], facet.vertical=FALSE,
              control.name='control', treated.name='california',
              lambda.comparable=TRUE, se.method='none',
              trajectory.linetype=1, line.width=.75, effect.curvature=-.4,
              trajectory.alpha=.7, effect.alpha=.7,
              diagram.alpha=1, onset.alpha=.7) +
    theme(legend.position=c(.26,.07), legend.direction='horizontal',
          legend.key=element_blank(), legend.background=element_blank(),
          strip.background=element_blank(), strip.text.x = element_blank())

synthdid_units_plot(rev(estimates_restricted_states[1:3]), se.method='none') +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

# Restricted pre-treatment periods
caprop99_restricted_pre <- caprop99[!(caprop99$Year %in% 1970:1974), ]

setup_restricted_pre = panel.matrices(caprop99_restricted_pre)

estimates_restricted_pre = lapply(estimators, function(estimator) { estimator(setup_restricted_pre$Y, setup_restricted_pre$N0, setup_restricted_pre$T0) } )

standard.errors_restricted_pre = mapply(function(estimate, name) {
  set.seed(12345)
  if(name == 'mc') { mc_placebo_se(setup_restricted_pre$Y, setup_restricted_pre$N0, setup_restricted_pre$T0) }
  else {             sqrt(vcov(estimate, method='placebo')) }
}, estimates_restricted_pre, names(estimators))

table3 = rbind(unlist(estimates_restricted_pre), unlist(standard.errors_restricted_pre))
rownames(table3) = c('estimate', 'standard error')
colnames(table3) = toupper(names(estimators))

round(table3, digits=1)
##                  DID    SC  SDID  DIFP    MC SC_REG DIFP_REG
## estimate       -23.7 -20.3 -17.9 -19.6 -20.8  -23.1    -18.4
## standard error  15.0   9.5   9.1   9.8  10.7    9.4      9.8
synthdid_plot(estimates_restricted_pre[1:3], facet.vertical=FALSE,
              control.name='control', treated.name='california',
              lambda.comparable=TRUE, se.method='none',
              trajectory.linetype=1, line.width=.75, effect.curvature=-.4,
              trajectory.alpha=.7, effect.alpha=.7,
              diagram.alpha=1, onset.alpha=.7) +
    theme(legend.position=c(.26,.07), legend.direction='horizontal',
          legend.key=element_blank(), legend.background=element_blank(),
          strip.background=element_blank(), strip.text.x = element_blank())

synthdid_units_plot(rev(estimates_restricted_pre[1:3]), se.method='none') +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

# Restricted post-treatment periods
caprop99_restricted_post <- caprop99[!(caprop99$Year %in% 1996:2000), ]

setup_restricted_post = panel.matrices(caprop99_restricted_post)

estimates_restricted_post = lapply(estimators, function(estimator) { estimator(setup_restricted_post$Y, setup_restricted_post$N0, setup_restricted_post$T0) } )

standard.errors_restricted_post = mapply(function(estimate, name) {
  set.seed(12345)
  if(name == 'mc') { mc_placebo_se(setup_restricted_post$Y, setup_restricted_post$N0, setup_restricted_post$T0) }
  else {             sqrt(vcov(estimate, method='placebo')) }
}, estimates_restricted_post, names(estimators))

table4 = rbind(unlist(estimates_restricted_post), unlist(standard.errors_restricted_post))
rownames(table4) = c('estimate', 'standard error')
colnames(table4) = toupper(names(estimators))

round(table4, digits=1)
##                  DID    SC SDID DIFP    MC SC_REG DIFP_REG
## estimate       -22.2 -15.3 -9.8 -8.0 -14.2  -17.1    -10.8
## standard error  18.1   8.5  7.5  9.2  11.0    8.6      9.2
synthdid_plot(estimates_restricted_post[1:3], facet.vertical=FALSE,
              control.name='control', treated.name='california',
              lambda.comparable=TRUE, se.method='none',
              trajectory.linetype=1, line.width=.75, effect.curvature=-.4,
              trajectory.alpha=.7, effect.alpha=.7,
              diagram.alpha=1, onset.alpha=.7) +
    theme(legend.position=c(.26,.07), legend.direction='horizontal',
          legend.key=element_blank(), legend.background=element_blank(),
          strip.background=element_blank(), strip.text.x = element_blank())

synthdid_units_plot(rev(estimates_restricted_post[1:3]), se.method='none') +
    theme(legend.background=element_blank(), legend.title = element_blank(),
          legend.direction='horizontal', legend.position=c(.17,.07),
          strip.background=element_blank(), strip.text.x = element_blank())

After removing New Hampshire and Nevada, the estimates show slightly larger magnitudes overall. But the estimates are also less precise as all the standard errors are bigger too.

After removing periods 1970-1974, we now have less information on pre-treatment periods, and have to make stronger parallel trend assumption. The DID and SDID estimates become less precise and robust.

After removing periods 1996-2000, now we have much shorter follow-up period for observing long-term effects of Proposition 99. All the estimates are smaller in magnitude, as we can’t fully capture the long-term impacts.

Problem 3

i. Ridge regression

Yes. It’s about the bias variance trade off. Ridge regression introduces a regularization penalty that shrinks the estimated coefficients toward zero. Because of this shrinkage, the ridge estimator is biased for the true coefficient vector.

ii. LASSO

# a
housingdata <- readRDS("testdata20250121.RDS")

housingdata <- housingdata %>%
  mutate(sale_date = ymd(sale_date),
         year = year(sale_date),
         month = month(sale_date),
         day = day(sale_date))
housingdata <- housingdata %>%
  mutate(log_sale_amount = log(sale_amount))
housingdata <- housingdata %>%
  mutate(log_land_square_footage = log(land_square_footage))

train <- housingdata %>%
  filter(!(abbr %in% c('NY', 'CO', 'CA')))
test <- housingdata %>%
  filter(abbr %in% c('NY', 'CO', 'CA'))

# b
x_train <- model.matrix(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + log_land_square_footage + AC_presence, data = train)[,-1]
y_train <- train$log_sale_amount

set.seed(12345)
lasso_cv <- cv.glmnet(x_train, y_train, alpha = 1, nfolds = 5)

plot(lasso_cv)

# c
best_lambda <- lasso_cv$lambda.min
best_lambda
## [1] 0.007830992
lasso_coef <- coef(lasso_cv, s = "lambda.min")
lasso_coef
## 8 x 1 sparse Matrix of class "dgCMatrix"
##                                  s1
## (Intercept)             5.724545138
## year_built              0.002635312
## bedrooms_all_buildings  0.023204745
## number_of_bathrooms     0.240398223
## number_of_fireplaces    .          
## stories_number          0.165510321
## log_land_square_footage 0.044282235
## AC_presence             0.011759949
## Marginal WTP for AC = 1.18%

# d
x_test <- model.matrix(log_sale_amount ~ year_built + bedrooms_all_buildings + number_of_bathrooms + number_of_fireplaces + stories_number + log_land_square_footage + AC_presence, data = test)[,-1]

predicted_log_price <- predict(lasso_cv, newx = x_test, s = "lambda.min")
test$predicted_log_price <- as.vector(predicted_log_price)

test %>%
  group_by(AC_presence) %>%
  summarize(avg_predicted_log_price = mean(predicted_log_price)) %>%
  pivot_wider(names_from = AC_presence, values_from = avg_predicted_log_price) %>%
  mutate(predicted_difference = `1` - `0`)
## # A tibble: 1 × 3
##     `0`   `1` predicted_difference
##   <dbl> <dbl>                <dbl>
## 1  12.2  12.3               0.0887
# e
test %>%
  group_by(AC_presence) %>%
  summarize(avg_observed_log_price = mean(log_sale_amount)) %>%
  pivot_wider(names_from = AC_presence, values_from = avg_observed_log_price) %>%
  mutate(observed_difference = `1` - `0`)
## # A tibble: 1 × 3
##     `0`   `1` observed_difference
##   <dbl> <dbl>               <dbl>
## 1  13.1  12.9              -0.191
# f
lm_ny <- lm(log_sale_amount ~ AC_presence, data = filter(test, abbr == "NY"))
summary(lm_ny)
## 
## Call:
## lm(formula = log_sale_amount ~ AC_presence, data = filter(test, 
##     abbr == "NY"))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.48078 -0.24719  0.01334  0.20850  2.96211 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.81718    0.03149   407.1   <2e-16 ***
## AC_presence  0.37802    0.03437    11.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3765 on 886 degrees of freedom
## Multiple R-squared:  0.1201, Adjusted R-squared:  0.1191 
## F-statistic: 120.9 on 1 and 886 DF,  p-value: < 2.2e-16
lm_co <- lm(log_sale_amount ~ AC_presence, data = filter(test, abbr == "CO"))
summary(lm_co)
## 
## Call:
## lm(formula = log_sale_amount ~ AC_presence, data = filter(test, 
##     abbr == "CO"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5447 -0.3303  0.0003  0.3115  5.4629 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 12.759313   0.001461 8734.67   <2e-16 ***
## AC_presence  0.118032   0.002510   47.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5987 on 254006 degrees of freedom
## Multiple R-squared:  0.008628,   Adjusted R-squared:  0.008624 
## F-statistic:  2211 on 1 and 254006 DF,  p-value: < 2.2e-16
lm_ca <- lm(log_sale_amount ~ AC_presence, data = filter(test, abbr == "CA"))
summary(lm_ca)
## 
## Call:
## lm(formula = log_sale_amount ~ AC_presence, data = filter(test, 
##     abbr == "CA"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9856 -0.5062 -0.0099  0.4720  6.5373 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.200252   0.001283 10287.9   <2e-16 ***
## AC_presence -0.326734   0.001823  -179.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7961 on 763145 degrees of freedom
## Multiple R-squared:  0.0404, Adjusted R-squared:  0.0404 
## F-statistic: 3.213e+04 on 1 and 763145 DF,  p-value: < 2.2e-16

The three states show different effects of AC on price, the magnitude varies (which are much larger than the LASSO estimates) and the sign varies too. This difference reflects the variation in climate, housing market preferences, or the prevalence of AC. The LASSO estimate is a regularized average marginal effect, giving a more conservative result. It is biased downward, with a trade off for smaller variance.

iii. Partialling out

The claim is false. Partialling out a subset of covariates and then regressing residuals on the remaining ones leads to biased estimates when the omitted covariates are correlated with the included ones. If X1 and X2 are correlated, the residuals contain information from the omitted variable. Therefore, the recovered betas are not the same as the betas in equation 1.

iv. Debiased machine learning

ca_test <- test %>% filter(abbr == "CA")

Y <- ca_test$log_sale_amount
D <- ca_test$AC_presence
X <- ca_test[, c("year_built", "bedrooms_all_buildings", "number_of_bathrooms",
                 "number_of_fireplaces", "stories_number", "log_land_square_footage")]

set.seed(12345)
dml_ac <- treatDML(
  y = Y,
  d = D,
  x = X,
  MLmethod = "lasso", 
  k = 3 
)

dml_ac$effect   
## [1] -0.2959389
dml_ac$se   
## [1] 0.001891109
dml_ac$pval
## [1] 0

The results can be improved by increasing the number of cross-fitting folds and adding more relevant covariates.